Replace ill-formed UTF-8 byte sequences with replacement character

Implement the Unicode Standard "best practices" for replacing ill-formed sequences with the Unicode REPLACEMENT CHARACTER. Add wmem_strbuf_append_len for appending strings with embedded null characters. Clarify why wmem_strbuf_grow() doesn't always ensure that there's enough room for a new string, and short-circuit some tests there. Related to #14948
author: John Thacker <johnthacker@gmail.com> 2020-09-17 15:27:26 -0400
committer: Wireshark GitLab Utility <gerald+gitlab-utility@wireshark.org> 2020-10-15 21:48:28 +0000
commit: 91b792c6dca364889e3601ca808ef95bbc65c9de (patch)
tree: c310eef77e958fe3685e29d6553d4a7825242c5c
parent: 8b622bffc80ec0fbafe37864205dfb63ec33f3ca (diff)
7 files changed, 181 insertions, 24 deletions
diff --git a/debian/libwireshark0.symbols b/debian/libwireshark0.symbols
index ee09f66aca..afc5fcfb28 100644
--- a/debian/libwireshark0.symbols
+++ b/debian/libwireshark0.symbols
@@ -856,6 +856,7 @@ libwireshark.so.0 libwireshark0 #MINVER#
  get_udp_stream_count@Base 1.12.0~rc1
  get_unichar2_string@Base 1.12.0~rc1
  get_utf_16_string@Base 1.12.0~rc1
+ get_utf_8_string@Base 3.3.1
  get_vlan_hash_table@Base 2.1.0
  get_wka_hashtable@Base 1.12.0~rc1
  get_quic_connections_count@Base 3.1.1
@@ -2051,6 +2052,7 @@ libwireshark.so.0 libwireshark0 #MINVER#
  wmem_str_hash@Base 1.12.0~rc1
  wmem_strbuf_append@Base 1.9.1
  wmem_strbuf_append_c@Base 1.12.0~rc1
+ wmem_strbuf_append_len@Base 3.3.1
  wmem_strbuf_append_printf@Base 1.9.1
  wmem_strbuf_append_unichar@Base 1.12.0~rc1
  wmem_strbuf_append_vprintf@Base 3.1.1
diff --git a/epan/charsets.c b/epan/charsets.c
index bdab2ab4b5..cbc9b712a9 100644
--- a/epan/charsets.c
+++ b/epan/charsets.c
@@ -85,6 +85,127 @@ get_ascii_string(wmem_allocator_t *scope, const guint8 *ptr, gint length)
 }
 
 /*
+ * Given a wmem scope, a pointer, and a length, treat the string of bytes
+ * referred to by the pointer and length as a UTF-8 string, and return a
+ * pointer to a UTF-8 string, allocated using the wmem scope, with all
+ * ill-formed sequences replaced with the Unicode REPLACEMENT CHARACTER
+ * according to the recommended "best practices" given in the Unicode
+ * Standard and specified by W3C/WHATWG.
+ */
+guint8 *
+get_utf_8_string(wmem_allocator_t *scope, const guint8 *ptr, gint length)
+{
+    wmem_strbuf_t *str;
+    guint8 ch;
+    const guint8 *prev;
+    gsize unichar_len;
+
+    str = wmem_strbuf_sized_new(scope, length+1, 0);
+
+    /* See the Unicode Standard conformance chapter at
+     * https://www.unicode.org/versions/Unicode13.0.0/ch03.pdf especially
+     * Table 3-7 "Well-Formed UTF-8 Byte Sequences" and
+     * U+FFFD Substitution of Maximal Subparts. */
+    while (length > 0) {
+        ch = *ptr;
+        unichar_len = 1;
+
+        if (ch < 0x80) {
+            wmem_strbuf_append_c(str, ch);
+        } else if (ch < 0xc2 || ch > 0xf4) {
+            wmem_strbuf_append_unichar(str, UNREPL);
+        } else {
+            prev = ptr;
+            if (ch < 0xe0) { /* 110xxxxx, 2 byte char */
+                unichar_len = 2;
+            } else if (ch < 0xf0) { /* 1110xxxx, 3 byte char */
+                unichar_len = 3;
+                ptr++;
+                length--;
+                if (length < 1) {
+                    wmem_strbuf_append_unichar(str, UNREPL);
+                    continue;
+                }
+                switch (ch) {
+                    case 0xe0:
+                        if (*ptr < 0xa0 || *ptr > 0xbf) {
+                            wmem_strbuf_append_unichar(str, UNREPL);
+                            continue;
+                        }
+                        break;
+                    case 0xed:
+                        if (*ptr < 0x80 || *ptr > 0x9f) {
+                            wmem_strbuf_append_unichar(str, UNREPL);
+                            continue;
+                        }
+                        break;
+                    default:
+                        if (*ptr < 0x80 || *ptr > 0xbf) {
+                            wmem_strbuf_append_unichar(str, UNREPL);
+                            continue;
+                        }
+                }
+            } else { /* 11110xxx, 4 byte char - > 0xf4 excluded above */
+                unichar_len = 4;
+                ptr++;
+                length--;
+                if (length < 1) {
+                    wmem_strbuf_append_unichar(str, UNREPL);
+                    continue;
+                }
+                switch (ch) {
+                    case 0xf0:
+                        if (*ptr < 0x90 || *ptr > 0xbf) {
+                            wmem_strbuf_append_unichar(str, UNREPL);
+                            continue;
+                        }
+                        break;
+                    case 0xf4:
+                        if (*ptr < 0x80 || *ptr > 0x8f) {
+                            wmem_strbuf_append_unichar(str, UNREPL);
+                            continue;
+                        }
+                        break;
+                    default:
+                        if (*ptr < 0x80 || *ptr > 0xbf) {
+                            wmem_strbuf_append_unichar(str, UNREPL);
+                            continue;
+                        }
+                }
+                ptr++;
+                length--;
+                if (length < 1) {
+                    wmem_strbuf_append_unichar(str, UNREPL);
+                    continue;
+                }
+                if (*ptr < 0x80 || *ptr > 0xbf) {
+                    wmem_strbuf_append_unichar(str, UNREPL);
+                    continue;
+                }
+            }
+
+            ptr++;
+            length--;
+            if (length < 1) {
+                wmem_strbuf_append_unichar(str, UNREPL);
+                continue;
+            }
+            if (*ptr < 0x80 || *ptr > 0xbf) {
+                wmem_strbuf_append_unichar(str, UNREPL);
+                continue;
+            } else {
+                wmem_strbuf_append_len(str, prev, unichar_len);
+            }
+        }
+
+        ptr++;
+        length--;
+    }
+
+    return (guint8 *) wmem_strbuf_finalize(str);
+}
+
+/*
  * ISO 646 "Basic code table".
  */
 const gunichar2 charset_table_iso_646_basic[0x80] = {
diff --git a/epan/charsets.h b/epan/charsets.h
index 4f824c5aef..9c842758b6 100644
--- a/epan/charsets.h
+++ b/epan/charsets.h
@@ -77,6 +77,17 @@ WS_DLL_PUBLIC guint8 *
 get_ascii_string(wmem_allocator_t *scope, const guint8 *ptr, gint length);
 
 /*
+ * Given a wmem scope, a pointer, and a length, treat the string of bytes
+ * referred to by the pointer and length as a UTF-8 string, and return a
+ * pointer to a UTF-8 string, allocated using the wmem scope, with all
+ * ill-formed sequences replaced with the Unicode REPLACEMENT CHARACTER
+ * according to the recommended "best practices" given in the Unicode
+ * Standard and specified by W3C/WHATWG.
+ */
+WS_DLL_PUBLIC guint8 *
+get_utf_8_string(wmem_allocator_t *scope, const guint8 *ptr, gint length);
+
+/*
  * Given a wmem scope, a pointer, a length, and a translation table,
  * treat the string of bytes referred to by the pointer and length as a
  * string encoded using one octet per character, with octets with the
diff --git a/epan/tvbuff.c b/epan/tvbuff.c
index 5ef52e054b..3fff345d5a 100644
--- a/epan/tvbuff.c
+++ b/epan/tvbuff.c
@@ -2541,20 +2541,18 @@ tvb_get_iso_646_string(wmem_allocator_t *scope, tvbuff_t *tvb, gint offset, gint
 /*
  * Given a wmem scope, a tvbuff, an offset, and a length, treat the string
  * of bytes referred to by the tvbuff, the offset. and the length as a UTF-8
- * string, and return a pointer to that string, allocated using the wmem scope.
- *
- * XXX - should map invalid UTF-8 sequences to UNREPL.
+ * string, and return a pointer to a UTF-8 string, allocated using the wmem
+ * scope, with all ill-formed sequences replaced with the Unicode REPLACEMENT
+ * CHARACTER according to the recommended "best practices" given in the Unicode
+ * Standard and specified by W3C/WHATWG.
  */
 static guint8 *
 tvb_get_utf_8_string(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, const gint length)
 {
-	guint8 *strbuf;
+	const guint8  *ptr;
 
-	tvb_ensure_bytes_exist(tvb, offset, length); /* make sure length = -1 fails */
-	strbuf = (guint8 *)wmem_alloc(scope, length + 1);
-	tvb_memcpy(tvb, strbuf, offset, length);
-	strbuf[length] = '\0';
-	return strbuf;
+	ptr = ensure_contiguous(tvb, offset, length);
+	return get_utf_8_string(scope, ptr, length);
 }
 
 /*
@@ -2562,8 +2560,7 @@ tvb_get_utf_8_string(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset,
  * of bytes referred to by the tvbuff, the offset, and the length as a
  * raw string, and return a pointer to that string, allocated using the
  * wmem scope. This means a null is appended at the end, but no replacement
- * checking is done otherwise. Currently tvb_get_utf_8_string() does not
- * replace either, but it might in the future.
+ * checking is done otherwise, unlike tvb_get_utf_8_string().
  *
  * Also, this one allows a length of -1 to mean get all, but does not
  * allow a negative offset.
@@ -3087,14 +3084,14 @@ static guint8 *
 tvb_get_utf_8_stringz(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, gint *lengthp)
 {
 	guint   size;
-	guint8 *strptr;
+	const guint8  *ptr;
 
 	size   = tvb_strsize(tvb, offset);
-	strptr = (guint8 *)wmem_alloc(scope, size);
-	tvb_memcpy(tvb, strptr, offset, size);
+	ptr = ensure_contiguous(tvb, offset, size);
+	/* XXX, conversion between signed/unsigned integer */
 	if (lengthp)
 		*lengthp = size;
-	return strptr;
+	return get_utf_8_string(scope, ptr, size);
 }
 
 static guint8 *
diff --git a/epan/wmem/wmem_strbuf.c b/epan/wmem/wmem_strbuf.c
index f56099af6a..fa0fdf2916 100644
--- a/epan/wmem/wmem_strbuf.c
+++ b/epan/wmem/wmem_strbuf.c
@@ -93,6 +93,8 @@ wmem_strbuf_new(wmem_allocator_t *allocator, const gchar *str)
     return strbuf;
 }
 
+/* grows the allocated size of the wmem_strbuf_t. If max_len is set, then
+ * not guaranteed to grow by the full amount to_add */
 static inline void
 wmem_strbuf_grow(wmem_strbuf_t *strbuf, const gsize to_add)
 {
@@ -141,15 +143,30 @@ wmem_strbuf_append(wmem_strbuf_t *strbuf, const gchar *str)
 
     wmem_strbuf_grow(strbuf, append_len);
 
-    /*
-     * XXX - hasn't wmem_strbuf_grow() ensure there's enough room for
-     * all of str?
-     */
-    g_strlcpy(&strbuf->str[strbuf->len], str, WMEM_STRBUF_RAW_ROOM(strbuf));
+    g_strlcpy(&strbuf->str[strbuf->len], str, strbuf->max_len ? WMEM_STRBUF_RAW_ROOM(strbuf) : append_len+1);
 
     strbuf->len = MIN(strbuf->len + append_len, strbuf->alloc_len - 1);
 }
 
+void
+wmem_strbuf_append_len(wmem_strbuf_t *strbuf, const gchar *str, gsize append_len)
+{
+
+    if (!append_len || !str || str[0] == '\0') {
+        return;
+    }
+
+    wmem_strbuf_grow(strbuf, append_len);
+
+    if (strbuf->max_len) {
+        append_len = MIN(append_len, WMEM_STRBUF_ROOM(strbuf));
+    }
+
+    memcpy(&strbuf->str[strbuf->len], str, append_len);
+    strbuf->len += append_len;
+    strbuf->str[strbuf->len] = '\0';
+}
+
 #ifndef _WIN32
 void
 wmem_strbuf_append_vprintf(wmem_strbuf_t *strbuf, const gchar *fmt, va_list ap)
@@ -220,8 +237,7 @@ wmem_strbuf_append_c(wmem_strbuf_t *strbuf, const gchar c)
 {
     wmem_strbuf_grow(strbuf, 1);
 
-    /* XXX - hasn't wmem_strbuf_grow() ensure this to be true? */
-    if (WMEM_STRBUF_ROOM(strbuf) >= 1) {
+    if (!strbuf->max_len || WMEM_STRBUF_ROOM(strbuf) >= 1) {
         strbuf->str[strbuf->len] = c;
         strbuf->len++;
         strbuf->str[strbuf->len] = '\0';
@@ -238,8 +254,7 @@ wmem_strbuf_append_unichar(wmem_strbuf_t *strbuf, const gunichar c)
 
     wmem_strbuf_grow(strbuf, charlen);
 
-    /* XXX - hasn't wmem_strbuf_grow() ensure this to be true? */
-    if (WMEM_STRBUF_ROOM(strbuf) >= charlen) {
+    if (!strbuf->max_len || WMEM_STRBUF_ROOM(strbuf) >= charlen) {
         memcpy(&strbuf->str[strbuf->len], buf, charlen);
         strbuf->len += charlen;
         strbuf->str[strbuf->len] = '\0';
diff --git a/epan/wmem/wmem_strbuf.h b/epan/wmem/wmem_strbuf.h
index 855c02606a..d2f4b3bb2e 100644
--- a/epan/wmem/wmem_strbuf.h
+++ b/epan/wmem/wmem_strbuf.h
@@ -52,6 +52,13 @@ WS_DLL_PUBLIC
 void
 wmem_strbuf_append(wmem_strbuf_t *strbuf, const gchar *str);
 
+/* Appends up to append_len bytes (as allowed by strbuf->max_len) from
+ * str. Ensures that strbuf is null terminated afterwards but will copy
+ * embedded nulls. */
+WS_DLL_PUBLIC
+void
+wmem_strbuf_append_len(wmem_strbuf_t *strbuf, const gchar *str, gsize append_len);
+
 WS_DLL_PUBLIC
 void
 wmem_strbuf_append_printf(wmem_strbuf_t *strbuf, const gchar *format, ...)
diff --git a/epan/wmem/wmem_test.c b/epan/wmem/wmem_test.c
index c25e095829..9dab52979c 100644
--- a/epan/wmem/wmem_test.c
+++ b/epan/wmem/wmem_test.c
@@ -1119,6 +1119,10 @@ wmem_test_strbuf(void)
     g_assert_cmpstr(wmem_strbuf_get_str(strbuf), ==, "TES");
     g_assert(wmem_strbuf_get_len(strbuf) == 3);
 
+    wmem_strbuf_append_len(strbuf, "TFUZZ1234", 5);
+    g_assert_cmpstr(wmem_strbuf_get_str(strbuf), ==, "TESTFUZZ");
+    g_assert(wmem_strbuf_get_len(strbuf) == 8);
+
     strbuf = wmem_strbuf_sized_new(allocator, 10, 10);
     g_assert(strbuf);
     g_assert_cmpstr(wmem_strbuf_get_str(strbuf), ==, "");
author	John Thacker <johnthacker@gmail.com>	2020-09-17 15:27:26 -0400
committer	Wireshark GitLab Utility <gerald+gitlab-utility@wireshark.org>	2020-10-15 21:48:28 +0000
commit	91b792c6dca364889e3601ca808ef95bbc65c9de (patch)
tree	c310eef77e958fe3685e29d6553d4a7825242c5c
parent	8b622bffc80ec0fbafe37864205dfb63ec33f3ca (diff)