diff options
author | João Valverde <j@v6e.pt> | 2023-02-06 04:27:05 +0000 |
---|---|---|
committer | João Valverde <j@v6e.pt> | 2023-02-06 15:03:49 +0000 |
commit | 9feb85ce4db95ad86efe5c00fbbaee230e289f09 (patch) | |
tree | 5bf20b26646bebe02a7733f5e367271f07569ff8 /wsutil | |
parent | 9b797e97a2107cd0d5364634d517390e7d6910af (diff) |
Move get_utf_8_string() to wsutil
Diffstat (limited to 'wsutil')
-rw-r--r-- | wsutil/unicode-utils.c | 187 | ||||
-rw-r--r-- | wsutil/unicode-utils.h | 10 |
2 files changed, 197 insertions, 0 deletions
diff --git a/wsutil/unicode-utils.c b/wsutil/unicode-utils.c index 764c0b3c03..ef2404d502 100644 --- a/wsutil/unicode-utils.c +++ b/wsutil/unicode-utils.c @@ -31,6 +31,193 @@ int ws_utf8_seqlen[256] = { 4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, /* 0xf0...0xff */ }; +/* Given a pointer and a length, validates a string of bytes as UTF-8. + * Returns the number of valid bytes, and a pointer immediately past + * the checked region. + * + * Differs from Glib's g_utf8_validate_len in that null bytes are + * considered valid UTF-8, and that maximal subparts are replaced as + * a unit. (I.e., given a sequence of 2 or 3 bytes which are a + * truncated version of a 3 or 4 byte UTF-8 character, but the next + * byte does not continue the character, the set of 2 or 3 bytes + * are replaced with one REPLACEMENT CHARACTER.) + */ +static inline size_t +utf_8_validate(const guint8 *start, ssize_t length, const guint8 **end) +{ + const guint8 *ptr = start; + guint8 ch; + size_t unichar_len, valid_bytes = 0; + + while (length > 0) { + + ch = *ptr; + + if (ch < 0x80) { + valid_bytes++; + ptr++; + length--; + continue; + } + + ch = *ptr; + + if (ch < 0xc2 || ch > 0xf4) { + ptr++; + length--; + *end = ptr; + return valid_bytes; + } + + if (ch < 0xe0) { /* 110xxxxx, 2 byte char */ + unichar_len = 2; + } else if (ch < 0xf0) { /* 1110xxxx, 3 byte char */ + unichar_len = 3; + ptr++; + length--; + if (length < 1) { + *end = ptr; + return valid_bytes; + } + switch (ch) { + case 0xe0: + if (*ptr < 0xa0 || *ptr > 0xbf) { + *end = ptr; + return valid_bytes; + } + break; + case 0xed: + if (*ptr < 0x80 || *ptr > 0x9f) { + *end = ptr; + return valid_bytes; + } + break; + default: + if (*ptr < 0x80 || *ptr > 0xbf) { + *end = ptr; + return valid_bytes; + } + } + } else { /* 11110xxx, 4 byte char - > 0xf4 excluded above */ + unichar_len = 4; + ptr++; + length--; + if (length < 1) { + *end = ptr; + return valid_bytes; + } + switch (ch) { + case 0xf0: + if (*ptr < 0x90 || *ptr > 0xbf) { + *end = ptr; + return 
valid_bytes; + } + break; + case 0xf4: + if (*ptr < 0x80 || *ptr > 0x8f) { + *end = ptr; + return valid_bytes; + } + break; + default: + if (*ptr < 0x80 || *ptr > 0xbf) { + *end = ptr; + return valid_bytes; + } + } + ptr++; + length--; + if (length < 1) { + *end = ptr; + return valid_bytes; + } + if (*ptr < 0x80 || *ptr > 0xbf) { + *end = ptr; + return valid_bytes; + } + } + + ptr++; + length--; + if (length < 1) { + *end = ptr; + return valid_bytes; + } + if (*ptr < 0x80 || *ptr > 0xbf) { + *end = ptr; + return valid_bytes; + } else { + ptr++; + length--; + valid_bytes += unichar_len; + } + + } + *end = ptr; + return valid_bytes; +} + +/* + * Given a wmem scope, a pointer, and a length, treat the string of bytes + * referred to by the pointer and length as a UTF-8 string, and return a + * pointer to a UTF-8 string, allocated using the wmem scope, with all + * ill-formed sequences replaced with the Unicode REPLACEMENT CHARACTER + * according to the recommended "best practices" given in the Unicode + * Standard and specified by W3C/WHATWG. + * + * Note that in conformance with the Unicode Standard, this treats three + * byte sequences corresponding to UTF-16 surrogate halves (paired or unpaired) + * and two byte overlong encodings of 7-bit ASCII characters as invalid and + * substitutes REPLACEMENT CHARACTER for them. Explicit support for nonstandard + * derivative encoding formats (e.g. CESU-8, Java Modified UTF-8, WTF-8) could + * be added later. + * + * Compared with g_utf8_make_valid(), this function does not consider + * internal NUL bytes as invalid and replace them with replacement characters. + * It also replaces maximal subparts as a unit; i.e., a sequence of 2 or 3 + * bytes which are a truncated version of a valid 3 or 4 byte character (but + * the next byte does not continue the character) are replaced with a single + * REPLACEMENT CHARACTER, whereas the Glib function replaces each byte of the + * sequence with its own (3 octet) REPLACEMENT CHARACTER. 
+ * + * XXX: length should probably be a size_t instead of a gint in all + * these encoding functions + * XXX: the buffer returned can be of different length than the input, + * and can have internal NULs as well (so that strlen doesn't give its + * length). As with the other encoding functions, we should return the + * length of the output buffer (or a wmem_strbuf_t directly) and an + * indication of whether there was an invalid character (i.e. + * REPLACEMENT CHARACTER was used.) + */ +guint8 * +ws_utf8_make_valid(wmem_allocator_t *scope, const guint8 *ptr, ssize_t length) +{ + wmem_strbuf_t *str; + + str = wmem_strbuf_new_sized(scope, length+1); + + /* See the Unicode Standard conformance chapter at + * https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf especially + * Table 3-7 "Well-Formed UTF-8 Byte Sequences" and + * U+FFFD Substitution of Maximal Subparts. */ + + while (length > 0) { + const guint8 *prev = ptr; + size_t valid_bytes = utf_8_validate(prev, length, &ptr); + + if (valid_bytes) { + wmem_strbuf_append_len(str, prev, valid_bytes); + } + length -= ptr - prev; + prev += valid_bytes; + if (ptr - prev) { + wmem_strbuf_append_unichar_repl(str); + } + } + + return (guint8 *) wmem_strbuf_finalize(str); +} + #ifdef _WIN32 #include <strsafe.h> diff --git a/wsutil/unicode-utils.h b/wsutil/unicode-utils.h index cacb606b4d..2045152441 100644 --- a/wsutil/unicode-utils.h +++ b/wsutil/unicode-utils.h @@ -59,6 +59,16 @@ int ws_utf8_seqlen[256]; */ #define ws_utf8_char_len(ch) (ws_utf8_seqlen[(ch)]) +/* + * Given a wmem scope, a pointer, and a length, treat the string of bytes + * referred to by the pointer and length as a UTF-8 string, and return a + * pointer to a UTF-8 string, allocated using the wmem scope, with all + * ill-formed sequences replaced with the Unicode REPLACEMENT CHARACTER + * according to the recommended "best practices" given in the Unicode + * Standard and specified by W3C/WHATWG. 
+ */ +WS_DLL_PUBLIC guint8 * +ws_utf8_make_valid(wmem_allocator_t *scope, const guint8 *ptr, ssize_t length); #ifdef _WIN32 |