diff options
author | John Thacker <johnthacker@gmail.com> | 2020-09-17 15:27:26 -0400 |
---|---|---|
committer | Wireshark GitLab Utility <gerald+gitlab-utility@wireshark.org> | 2020-10-15 21:48:28 +0000 |
commit | 91b792c6dca364889e3601ca808ef95bbc65c9de (patch) | |
tree | c310eef77e958fe3685e29d6553d4a7825242c5c | |
parent | 8b622bffc80ec0fbafe37864205dfb63ec33f3ca (diff) |
Replace ill-formed UTF-8 byte sequences with replacement character
Implement the Unicode Standard "best practices" for replacing ill-formed
sequences with the Unicode REPLACEMENT CHARACTER. Add wmem_strbuf_append_len
for appending strings with embedded null characters. Clarify why
wmem_strbuf_grow() doesn't always ensure that there's enough room for
a new string, and short-circuit some tests there. Related to #14948
-rw-r--r-- | debian/libwireshark0.symbols | 2 | ||||
-rw-r--r-- | epan/charsets.c | 121 | ||||
-rw-r--r-- | epan/charsets.h | 11 | ||||
-rw-r--r-- | epan/tvbuff.c | 27 | ||||
-rw-r--r-- | epan/wmem/wmem_strbuf.c | 33 | ||||
-rw-r--r-- | epan/wmem/wmem_strbuf.h | 7 | ||||
-rw-r--r-- | epan/wmem/wmem_test.c | 4 |
7 files changed, 181 insertions, 24 deletions
diff --git a/debian/libwireshark0.symbols b/debian/libwireshark0.symbols index ee09f66aca..afc5fcfb28 100644 --- a/debian/libwireshark0.symbols +++ b/debian/libwireshark0.symbols @@ -856,6 +856,7 @@ libwireshark.so.0 libwireshark0 #MINVER# get_udp_stream_count@Base 1.12.0~rc1 get_unichar2_string@Base 1.12.0~rc1 get_utf_16_string@Base 1.12.0~rc1 + get_utf_8_string@Base 3.3.1 get_vlan_hash_table@Base 2.1.0 get_wka_hashtable@Base 1.12.0~rc1 get_quic_connections_count@Base 3.1.1 @@ -2051,6 +2052,7 @@ libwireshark.so.0 libwireshark0 #MINVER# wmem_str_hash@Base 1.12.0~rc1 wmem_strbuf_append@Base 1.9.1 wmem_strbuf_append_c@Base 1.12.0~rc1 + wmem_strbuf_append_len@Base 3.3.1 wmem_strbuf_append_printf@Base 1.9.1 wmem_strbuf_append_unichar@Base 1.12.0~rc1 wmem_strbuf_append_vprintf@Base 3.1.1 diff --git a/epan/charsets.c b/epan/charsets.c index bdab2ab4b5..cbc9b712a9 100644 --- a/epan/charsets.c +++ b/epan/charsets.c @@ -85,6 +85,127 @@ get_ascii_string(wmem_allocator_t *scope, const guint8 *ptr, gint length) } /* + * Given a wmem scope, a pointer, and a length, treat the string of bytes + * referred to by the pointer and length as a UTF-8 string, and return a + * pointer to a UTF-8 string, allocated using the wmem scope, with all + * ill-formed sequences replaced with the Unicode REPLACEMENT CHARACTER + * according to the recommended "best practices" given in the Unicode + * Standard and specified by W3C/WHATWG. + */ +guint8 * +get_utf_8_string(wmem_allocator_t *scope, const guint8 *ptr, gint length) +{ + wmem_strbuf_t *str; + guint8 ch; + const guint8 *prev; + gsize unichar_len; + + str = wmem_strbuf_sized_new(scope, length+1, 0); + + /* See the Unicode Standard conformance chapter at + * https://www.unicode.org/versions/Unicode13.0.0/ch03.pdf especially + * Table 3-7 "Well-Formed UTF-8 Byte Sequences" and + * U+FFFD Substitution of Maximal Subparts. */ + while (length > 0) { + ch = *ptr; + unichar_len = 1; + + if (ch < 0x80) { + wmem_strbuf_append_c(str, ch); + } else if (ch < 0xc2 || ch > 0xf4) { + wmem_strbuf_append_unichar(str, UNREPL); + } else { + prev = ptr; + if (ch < 0xe0) { /* 110xxxxx, 2 byte char */ + unichar_len = 2; + } else if (ch < 0xf0) { /* 1110xxxx, 3 byte char */ + unichar_len = 3; + ptr++; + length--; + if (length < 1) { + wmem_strbuf_append_unichar(str, UNREPL); + continue; + } + switch (ch) { + case 0xe0: + if (*ptr < 0xa0 || *ptr > 0xbf) { + wmem_strbuf_append_unichar(str, UNREPL); + continue; + } + break; + case 0xed: + if (*ptr < 0x80 || *ptr > 0x9f) { + wmem_strbuf_append_unichar(str, UNREPL); + continue; + } + break; + default: + if (*ptr < 0x80 || *ptr > 0xbf) { + wmem_strbuf_append_unichar(str, UNREPL); + continue; + } + } + } else { /* 11110xxx, 4 byte char - > 0xf4 excluded above */ + unichar_len = 4; + ptr++; + length--; + if (length < 1) { + wmem_strbuf_append_unichar(str, UNREPL); + continue; + } + switch (ch) { + case 0xf0: + if (*ptr < 0x90 || *ptr > 0xbf) { + wmem_strbuf_append_unichar(str, UNREPL); + continue; + } + break; + case 0xf4: + if (*ptr < 0x80 || *ptr > 0x8f) { + wmem_strbuf_append_unichar(str, UNREPL); + continue; + } + break; + default: + if (*ptr < 0x80 || *ptr > 0xbf) { + wmem_strbuf_append_unichar(str, UNREPL); + continue; + } + } + ptr++; + length--; + if (length < 1) { + wmem_strbuf_append_unichar(str, UNREPL); + continue; + } + if (*ptr < 0x80 || *ptr > 0xbf) { + wmem_strbuf_append_unichar(str, UNREPL); + continue; + } + } + + ptr++; + length--; + if (length < 1) { + wmem_strbuf_append_unichar(str, UNREPL); + continue; + } + if (*ptr < 0x80 || *ptr > 0xbf) { + wmem_strbuf_append_unichar(str, UNREPL); + continue; + } else { + wmem_strbuf_append_len(str, prev, unichar_len); + } + } + + ptr++; + length--; + } + + return (guint8 *) wmem_strbuf_finalize(str); +} + +/* * ISO 646 "Basic code table". */ const gunichar2 charset_table_iso_646_basic[0x80] = { diff --git a/epan/charsets.h b/epan/charsets.h index 4f824c5aef..9c842758b6 100644 --- a/epan/charsets.h +++ b/epan/charsets.h @@ -77,6 +77,17 @@ WS_DLL_PUBLIC guint8 * get_ascii_string(wmem_allocator_t *scope, const guint8 *ptr, gint length); /* + * Given a wmem scope, a pointer, and a length, treat the string of bytes + * referred to by the pointer and length as a UTF-8 string, and return a + * pointer to a UTF-8 string, allocated using the wmem scope, with all + * ill-formed sequences replaced with the Unicode REPLACEMENT CHARACTER + * according to the recommended "best practices" given in the Unicode + * Standard and specified by W3C/WHATWG. + */ +WS_DLL_PUBLIC guint8 * +get_utf_8_string(wmem_allocator_t *scope, const guint8 *ptr, gint length); + +/* * Given a wmem scope, a pointer, a length, and a translation table, * treat the string of bytes referred to by the pointer and length as a * string encoded using one octet per character, with octets with the diff --git a/epan/tvbuff.c b/epan/tvbuff.c index 5ef52e054b..3fff345d5a 100644 --- a/epan/tvbuff.c +++ b/epan/tvbuff.c @@ -2541,20 +2541,18 @@ tvb_get_iso_646_string(wmem_allocator_t *scope, tvbuff_t *tvb, gint offset, gint /* * Given a wmem scope, a tvbuff, an offset, and a length, treat the string * of bytes referred to by the tvbuff, the offset. and the length as a UTF-8 - * string, and return a pointer to that string, allocated using the wmem scope. - * - * XXX - should map invalid UTF-8 sequences to UNREPL. + * string, and return a pointer to a UTF-8 string, allocated using the wmem + * scope, with all ill-formed sequences replaced with the Unicode REPLACEMENT + * CHARACTER according to the recommended "best practices" given in the Unicode + * Standard and specified by W3C/WHATWG. */ static guint8 * tvb_get_utf_8_string(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, const gint length) { - guint8 *strbuf; + const guint8 *ptr; - tvb_ensure_bytes_exist(tvb, offset, length); /* make sure length = -1 fails */ - strbuf = (guint8 *)wmem_alloc(scope, length + 1); - tvb_memcpy(tvb, strbuf, offset, length); - strbuf[length] = '\0'; - return strbuf; + ptr = ensure_contiguous(tvb, offset, length); + return get_utf_8_string(scope, ptr, length); } /* @@ -2562,8 +2560,7 @@ tvb_get_utf_8_string(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, * of bytes referred to by the tvbuff, the offset, and the length as a * raw string, and return a pointer to that string, allocated using the * wmem scope. This means a null is appended at the end, but no replacement - * checking is done otherwise. Currently tvb_get_utf_8_string() does not - * replace either, but it might in the future. + * checking is done otherwise, unlike tvb_get_utf_8_string(). * * Also, this one allows a length of -1 to mean get all, but does not * allow a negative offset. @@ -3087,14 +3084,14 @@ static guint8 * tvb_get_utf_8_stringz(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, gint *lengthp) { guint size; - guint8 *strptr; + const guint8 *ptr; size = tvb_strsize(tvb, offset); - strptr = (guint8 *)wmem_alloc(scope, size); - tvb_memcpy(tvb, strptr, offset, size); + ptr = ensure_contiguous(tvb, offset, size); + /* XXX, conversion between signed/unsigned integer */ if (lengthp) *lengthp = size; - return strptr; + return get_utf_8_string(scope, ptr, size); } static guint8 * diff --git a/epan/wmem/wmem_strbuf.c b/epan/wmem/wmem_strbuf.c index f56099af6a..fa0fdf2916 100644 --- a/epan/wmem/wmem_strbuf.c +++ b/epan/wmem/wmem_strbuf.c @@ -93,6 +93,8 @@ wmem_strbuf_new(wmem_allocator_t *allocator, const gchar *str) return strbuf; } +/* grows the allocated size of the wmem_strbuf_t. If max_len is set, then + * not guaranteed to grow by the full amount to_add */ static inline void wmem_strbuf_grow(wmem_strbuf_t *strbuf, const gsize to_add) { @@ -141,15 +143,30 @@ wmem_strbuf_append(wmem_strbuf_t *strbuf, const gchar *str) wmem_strbuf_grow(strbuf, append_len); - /* - * XXX - hasn't wmem_strbuf_grow() ensure there's enough room for - * all of str? - */ - g_strlcpy(&strbuf->str[strbuf->len], str, WMEM_STRBUF_RAW_ROOM(strbuf)); + g_strlcpy(&strbuf->str[strbuf->len], str, strbuf->max_len ? WMEM_STRBUF_RAW_ROOM(strbuf) : append_len+1); strbuf->len = MIN(strbuf->len + append_len, strbuf->alloc_len - 1); } +void +wmem_strbuf_append_len(wmem_strbuf_t *strbuf, const gchar *str, gsize append_len) +{ + + if (!append_len || !str || str[0] == '\0') { + return; + } + + wmem_strbuf_grow(strbuf, append_len); + + if (strbuf->max_len) { + append_len = MIN(append_len, WMEM_STRBUF_ROOM(strbuf)); + } + + memcpy(&strbuf->str[strbuf->len], str, append_len); + strbuf->len += append_len; + strbuf->str[strbuf->len] = '\0'; +} + #ifndef _WIN32 void wmem_strbuf_append_vprintf(wmem_strbuf_t *strbuf, const gchar *fmt, va_list ap) @@ -220,8 +237,7 @@ wmem_strbuf_append_c(wmem_strbuf_t *strbuf, const gchar c) { wmem_strbuf_grow(strbuf, 1); - /* XXX - hasn't wmem_strbuf_grow() ensure this to be true? */ - if (WMEM_STRBUF_ROOM(strbuf) >= 1) { + if (!strbuf->max_len || WMEM_STRBUF_ROOM(strbuf) >= 1) { strbuf->str[strbuf->len] = c; strbuf->len++; strbuf->str[strbuf->len] = '\0'; @@ -238,8 +254,7 @@ wmem_strbuf_append_unichar(wmem_strbuf_t *strbuf, const gunichar c) wmem_strbuf_grow(strbuf, charlen); - /* XXX - hasn't wmem_strbuf_grow() ensure this to be true? */ - if (WMEM_STRBUF_ROOM(strbuf) >= charlen) { + if (!strbuf->max_len || WMEM_STRBUF_ROOM(strbuf) >= charlen) { memcpy(&strbuf->str[strbuf->len], buf, charlen); strbuf->len += charlen; strbuf->str[strbuf->len] = '\0'; diff --git a/epan/wmem/wmem_strbuf.h b/epan/wmem/wmem_strbuf.h index 855c02606a..d2f4b3bb2e 100644 --- a/epan/wmem/wmem_strbuf.h +++ b/epan/wmem/wmem_strbuf.h @@ -52,6 +52,13 @@ WS_DLL_PUBLIC void wmem_strbuf_append(wmem_strbuf_t *strbuf, const gchar *str); +/* Appends up to append_len bytes (as allowed by strbuf->max_len) from + * str. Ensures that strbuf is null terminated afterwards but will copy + * embedded nulls. */ +WS_DLL_PUBLIC +void +wmem_strbuf_append_len(wmem_strbuf_t *strbuf, const gchar *str, gsize append_len); + WS_DLL_PUBLIC void wmem_strbuf_append_printf(wmem_strbuf_t *strbuf, const gchar *format, ...) diff --git a/epan/wmem/wmem_test.c b/epan/wmem/wmem_test.c index c25e095829..9dab52979c 100644 --- a/epan/wmem/wmem_test.c +++ b/epan/wmem/wmem_test.c @@ -1119,6 +1119,10 @@ wmem_test_strbuf(void) g_assert_cmpstr(wmem_strbuf_get_str(strbuf), ==, "TES"); g_assert(wmem_strbuf_get_len(strbuf) == 3); + wmem_strbuf_append_len(strbuf, "TFUZZ1234", 5); + g_assert_cmpstr(wmem_strbuf_get_str(strbuf), ==, "TESTFUZZ"); + g_assert(wmem_strbuf_get_len(strbuf) == 8); + strbuf = wmem_strbuf_sized_new(allocator, 10, 10); g_assert(strbuf); g_assert_cmpstr(wmem_strbuf_get_str(strbuf), ==, ""); |