aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--debian/libwireshark0.symbols2
-rw-r--r--epan/charsets.c121
-rw-r--r--epan/charsets.h11
-rw-r--r--epan/tvbuff.c27
-rw-r--r--epan/wmem/wmem_strbuf.c33
-rw-r--r--epan/wmem/wmem_strbuf.h7
-rw-r--r--epan/wmem/wmem_test.c4
7 files changed, 181 insertions, 24 deletions
diff --git a/debian/libwireshark0.symbols b/debian/libwireshark0.symbols
index ee09f66aca..afc5fcfb28 100644
--- a/debian/libwireshark0.symbols
+++ b/debian/libwireshark0.symbols
@@ -856,6 +856,7 @@ libwireshark.so.0 libwireshark0 #MINVER#
get_udp_stream_count@Base 1.12.0~rc1
get_unichar2_string@Base 1.12.0~rc1
get_utf_16_string@Base 1.12.0~rc1
+ get_utf_8_string@Base 3.3.1
get_vlan_hash_table@Base 2.1.0
get_wka_hashtable@Base 1.12.0~rc1
get_quic_connections_count@Base 3.1.1
@@ -2051,6 +2052,7 @@ libwireshark.so.0 libwireshark0 #MINVER#
wmem_str_hash@Base 1.12.0~rc1
wmem_strbuf_append@Base 1.9.1
wmem_strbuf_append_c@Base 1.12.0~rc1
+ wmem_strbuf_append_len@Base 3.3.1
wmem_strbuf_append_printf@Base 1.9.1
wmem_strbuf_append_unichar@Base 1.12.0~rc1
wmem_strbuf_append_vprintf@Base 3.1.1
diff --git a/epan/charsets.c b/epan/charsets.c
index bdab2ab4b5..cbc9b712a9 100644
--- a/epan/charsets.c
+++ b/epan/charsets.c
@@ -85,6 +85,127 @@ get_ascii_string(wmem_allocator_t *scope, const guint8 *ptr, gint length)
}
/*
+ * Given a wmem scope, a pointer, and a length, treat the string of bytes
+ * referred to by the pointer and length as a UTF-8 string, and return a
+ * pointer to a UTF-8 string, allocated using the wmem scope, with all
+ * ill-formed sequences replaced with the Unicode REPLACEMENT CHARACTER
+ * according to the recommended "best practices" given in the Unicode
+ * Standard and specified by W3C/WHATWG.
+ */
+guint8 *
+get_utf_8_string(wmem_allocator_t *scope, const guint8 *ptr, gint length)
+{
+ wmem_strbuf_t *str;
+ guint8 ch;
+ const guint8 *prev;
+ gsize unichar_len;
+
+ str = wmem_strbuf_sized_new(scope, length+1, 0);
+
+ /* See the Unicode Standard conformance chapter at
+ * https://www.unicode.org/versions/Unicode13.0.0/ch03.pdf especially
+ * Table 3-7 "Well-Formed UTF-8 Byte Sequences" and
+ * U+FFFD Substitution of Maximal Subparts. */
+ while (length > 0) {
+ ch = *ptr;
+ unichar_len = 1;
+
+ if (ch < 0x80) {
+ wmem_strbuf_append_c(str, ch);
+ } else if (ch < 0xc2 || ch > 0xf4) {
+ wmem_strbuf_append_unichar(str, UNREPL);
+ } else {
+ prev = ptr;
+ if (ch < 0xe0) { /* 110xxxxx, 2 byte char */
+ unichar_len = 2;
+ } else if (ch < 0xf0) { /* 1110xxxx, 3 byte char */
+ unichar_len = 3;
+ ptr++;
+ length--;
+ if (length < 1) {
+ wmem_strbuf_append_unichar(str, UNREPL);
+ continue;
+ }
+ switch (ch) {
+ case 0xe0:
+ if (*ptr < 0xa0 || *ptr > 0xbf) {
+ wmem_strbuf_append_unichar(str, UNREPL);
+ continue;
+ }
+ break;
+ case 0xed:
+ if (*ptr < 0x80 || *ptr > 0x9f) {
+ wmem_strbuf_append_unichar(str, UNREPL);
+ continue;
+ }
+ break;
+ default:
+ if (*ptr < 0x80 || *ptr > 0xbf) {
+ wmem_strbuf_append_unichar(str, UNREPL);
+ continue;
+ }
+ }
+ } else { /* 11110xxx, 4 byte char - > 0xf4 excluded above */
+ unichar_len = 4;
+ ptr++;
+ length--;
+ if (length < 1) {
+ wmem_strbuf_append_unichar(str, UNREPL);
+ continue;
+ }
+ switch (ch) {
+ case 0xf0:
+ if (*ptr < 0x90 || *ptr > 0xbf) {
+ wmem_strbuf_append_unichar(str, UNREPL);
+ continue;
+ }
+ break;
+ case 0xf4:
+ if (*ptr < 0x80 || *ptr > 0x8f) {
+ wmem_strbuf_append_unichar(str, UNREPL);
+ continue;
+ }
+ break;
+ default:
+ if (*ptr < 0x80 || *ptr > 0xbf) {
+ wmem_strbuf_append_unichar(str, UNREPL);
+ continue;
+ }
+ }
+ ptr++;
+ length--;
+ if (length < 1) {
+ wmem_strbuf_append_unichar(str, UNREPL);
+ continue;
+ }
+ if (*ptr < 0x80 || *ptr > 0xbf) {
+ wmem_strbuf_append_unichar(str, UNREPL);
+ continue;
+ }
+ }
+
+ ptr++;
+ length--;
+ if (length < 1) {
+ wmem_strbuf_append_unichar(str, UNREPL);
+ continue;
+ }
+ if (*ptr < 0x80 || *ptr > 0xbf) {
+ wmem_strbuf_append_unichar(str, UNREPL);
+ continue;
+ } else {
+ wmem_strbuf_append_len(str, prev, unichar_len);
+ }
+ }
+
+ ptr++;
+ length--;
+ }
+
+ return (guint8 *) wmem_strbuf_finalize(str);
+}
+
+/*
* ISO 646 "Basic code table".
*/
const gunichar2 charset_table_iso_646_basic[0x80] = {
diff --git a/epan/charsets.h b/epan/charsets.h
index 4f824c5aef..9c842758b6 100644
--- a/epan/charsets.h
+++ b/epan/charsets.h
@@ -77,6 +77,17 @@ WS_DLL_PUBLIC guint8 *
get_ascii_string(wmem_allocator_t *scope, const guint8 *ptr, gint length);
/*
+ * Given a wmem scope, a pointer, and a length, treat the string of bytes
+ * referred to by the pointer and length as a UTF-8 string, and return a
+ * pointer to a UTF-8 string, allocated using the wmem scope, with all
+ * ill-formed sequences replaced with the Unicode REPLACEMENT CHARACTER
+ * according to the recommended "best practices" given in the Unicode
+ * Standard and specified by W3C/WHATWG.
+ */
+WS_DLL_PUBLIC guint8 *
+get_utf_8_string(wmem_allocator_t *scope, const guint8 *ptr, gint length);
+
+/*
* Given a wmem scope, a pointer, a length, and a translation table,
* treat the string of bytes referred to by the pointer and length as a
* string encoded using one octet per character, with octets with the
diff --git a/epan/tvbuff.c b/epan/tvbuff.c
index 5ef52e054b..3fff345d5a 100644
--- a/epan/tvbuff.c
+++ b/epan/tvbuff.c
@@ -2541,20 +2541,18 @@ tvb_get_iso_646_string(wmem_allocator_t *scope, tvbuff_t *tvb, gint offset, gint
/*
* Given a wmem scope, a tvbuff, an offset, and a length, treat the string
* of bytes referred to by the tvbuff, the offset. and the length as a UTF-8
- * string, and return a pointer to that string, allocated using the wmem scope.
- *
- * XXX - should map invalid UTF-8 sequences to UNREPL.
+ * string, and return a pointer to a UTF-8 string, allocated using the wmem
+ * scope, with all ill-formed sequences replaced with the Unicode REPLACEMENT
+ * CHARACTER according to the recommended "best practices" given in the Unicode
+ * Standard and specified by W3C/WHATWG.
*/
static guint8 *
tvb_get_utf_8_string(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, const gint length)
{
- guint8 *strbuf;
+ const guint8 *ptr;
- tvb_ensure_bytes_exist(tvb, offset, length); /* make sure length = -1 fails */
- strbuf = (guint8 *)wmem_alloc(scope, length + 1);
- tvb_memcpy(tvb, strbuf, offset, length);
- strbuf[length] = '\0';
- return strbuf;
+ ptr = ensure_contiguous(tvb, offset, length);
+ return get_utf_8_string(scope, ptr, length);
}
/*
@@ -2562,8 +2560,7 @@ tvb_get_utf_8_string(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset,
* of bytes referred to by the tvbuff, the offset, and the length as a
* raw string, and return a pointer to that string, allocated using the
* wmem scope. This means a null is appended at the end, but no replacement
- * checking is done otherwise. Currently tvb_get_utf_8_string() does not
- * replace either, but it might in the future.
+ * checking is done otherwise, unlike tvb_get_utf_8_string().
*
* Also, this one allows a length of -1 to mean get all, but does not
* allow a negative offset.
@@ -3087,14 +3084,14 @@ static guint8 *
tvb_get_utf_8_stringz(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, gint *lengthp)
{
guint size;
- guint8 *strptr;
+ const guint8 *ptr;
size = tvb_strsize(tvb, offset);
- strptr = (guint8 *)wmem_alloc(scope, size);
- tvb_memcpy(tvb, strptr, offset, size);
+ ptr = ensure_contiguous(tvb, offset, size);
+ /* XXX, conversion between signed/unsigned integer */
if (lengthp)
*lengthp = size;
- return strptr;
+ return get_utf_8_string(scope, ptr, size);
}
static guint8 *
diff --git a/epan/wmem/wmem_strbuf.c b/epan/wmem/wmem_strbuf.c
index f56099af6a..fa0fdf2916 100644
--- a/epan/wmem/wmem_strbuf.c
+++ b/epan/wmem/wmem_strbuf.c
@@ -93,6 +93,8 @@ wmem_strbuf_new(wmem_allocator_t *allocator, const gchar *str)
return strbuf;
}
+/* grows the allocated size of the wmem_strbuf_t. If max_len is set, then
+ * not guaranteed to grow by the full amount to_add */
static inline void
wmem_strbuf_grow(wmem_strbuf_t *strbuf, const gsize to_add)
{
@@ -141,15 +143,30 @@ wmem_strbuf_append(wmem_strbuf_t *strbuf, const gchar *str)
wmem_strbuf_grow(strbuf, append_len);
- /*
- * XXX - hasn't wmem_strbuf_grow() ensure there's enough room for
- * all of str?
- */
- g_strlcpy(&strbuf->str[strbuf->len], str, WMEM_STRBUF_RAW_ROOM(strbuf));
+ g_strlcpy(&strbuf->str[strbuf->len], str, strbuf->max_len ? WMEM_STRBUF_RAW_ROOM(strbuf) : append_len+1);
strbuf->len = MIN(strbuf->len + append_len, strbuf->alloc_len - 1);
}
+void
+wmem_strbuf_append_len(wmem_strbuf_t *strbuf, const gchar *str, gsize append_len)
+{
+
+ if (!append_len || !str || str[0] == '\0') {
+ return;
+ }
+
+ wmem_strbuf_grow(strbuf, append_len);
+
+ if (strbuf->max_len) {
+ append_len = MIN(append_len, WMEM_STRBUF_ROOM(strbuf));
+ }
+
+ memcpy(&strbuf->str[strbuf->len], str, append_len);
+ strbuf->len += append_len;
+ strbuf->str[strbuf->len] = '\0';
+}
+
#ifndef _WIN32
void
wmem_strbuf_append_vprintf(wmem_strbuf_t *strbuf, const gchar *fmt, va_list ap)
@@ -220,8 +237,7 @@ wmem_strbuf_append_c(wmem_strbuf_t *strbuf, const gchar c)
{
wmem_strbuf_grow(strbuf, 1);
- /* XXX - hasn't wmem_strbuf_grow() ensure this to be true? */
- if (WMEM_STRBUF_ROOM(strbuf) >= 1) {
+ if (!strbuf->max_len || WMEM_STRBUF_ROOM(strbuf) >= 1) {
strbuf->str[strbuf->len] = c;
strbuf->len++;
strbuf->str[strbuf->len] = '\0';
@@ -238,8 +254,7 @@ wmem_strbuf_append_unichar(wmem_strbuf_t *strbuf, const gunichar c)
wmem_strbuf_grow(strbuf, charlen);
- /* XXX - hasn't wmem_strbuf_grow() ensure this to be true? */
- if (WMEM_STRBUF_ROOM(strbuf) >= charlen) {
+ if (!strbuf->max_len || WMEM_STRBUF_ROOM(strbuf) >= charlen) {
memcpy(&strbuf->str[strbuf->len], buf, charlen);
strbuf->len += charlen;
strbuf->str[strbuf->len] = '\0';
diff --git a/epan/wmem/wmem_strbuf.h b/epan/wmem/wmem_strbuf.h
index 855c02606a..d2f4b3bb2e 100644
--- a/epan/wmem/wmem_strbuf.h
+++ b/epan/wmem/wmem_strbuf.h
@@ -52,6 +52,13 @@ WS_DLL_PUBLIC
void
wmem_strbuf_append(wmem_strbuf_t *strbuf, const gchar *str);
+/* Appends up to append_len bytes (as allowed by strbuf->max_len) from
+ * str. Ensures that strbuf is null terminated afterwards but will copy
+ * embedded nulls. */
+WS_DLL_PUBLIC
+void
+wmem_strbuf_append_len(wmem_strbuf_t *strbuf, const gchar *str, gsize append_len);
+
WS_DLL_PUBLIC
void
wmem_strbuf_append_printf(wmem_strbuf_t *strbuf, const gchar *format, ...)
diff --git a/epan/wmem/wmem_test.c b/epan/wmem/wmem_test.c
index c25e095829..9dab52979c 100644
--- a/epan/wmem/wmem_test.c
+++ b/epan/wmem/wmem_test.c
@@ -1119,6 +1119,10 @@ wmem_test_strbuf(void)
g_assert_cmpstr(wmem_strbuf_get_str(strbuf), ==, "TES");
g_assert(wmem_strbuf_get_len(strbuf) == 3);
+ wmem_strbuf_append_len(strbuf, "TFUZZ1234", 5);
+ g_assert_cmpstr(wmem_strbuf_get_str(strbuf), ==, "TESTFUZZ");
+ g_assert(wmem_strbuf_get_len(strbuf) == 8);
+
strbuf = wmem_strbuf_sized_new(allocator, 10, 10);
g_assert(strbuf);
g_assert_cmpstr(wmem_strbuf_get_str(strbuf), ==, "");