about | summary | refs | log | tree | commit | diff | stats
path: root/epan/tvbuff.c
diff options
context:
space:
mode:
author: John Thacker <johnthacker@gmail.com> 2020-09-17 15:27:26 -0400
committer: Wireshark GitLab Utility <gerald+gitlab-utility@wireshark.org> 2020-10-15 21:48:28 +0000
commit: 91b792c6dca364889e3601ca808ef95bbc65c9de (patch)
tree: c310eef77e958fe3685e29d6553d4a7825242c5c /epan/tvbuff.c
parent: 8b622bffc80ec0fbafe37864205dfb63ec33f3ca (diff)
Replace ill-formed UTF-8 byte sequences with replacement character
Implement the Unicode Standard "best practices" for replacing ill-formed sequences with the Unicode REPLACEMENT CHARACTER. Add wmem_strbuf_append_len for appending strings with embedded null characters. Clarify why wmem_strbuf_grow() doesn't always ensure that there's enough room for a new string, and short-circuit some tests there. Related to #14948
Diffstat (limited to 'epan/tvbuff.c')
-rw-r--r-- epan/tvbuff.c 27
1 files changed, 12 insertions, 15 deletions
diff --git a/epan/tvbuff.c b/epan/tvbuff.c
index 5ef52e054b..3fff345d5a 100644
--- a/epan/tvbuff.c
+++ b/epan/tvbuff.c
@@ -2541,20 +2541,18 @@ tvb_get_iso_646_string(wmem_allocator_t *scope, tvbuff_t *tvb, gint offset, gint
/*
* Given a wmem scope, a tvbuff, an offset, and a length, treat the string
* of bytes referred to by the tvbuff, the offset, and the length as a UTF-8
- * string, and return a pointer to that string, allocated using the wmem scope.
- *
- * XXX - should map invalid UTF-8 sequences to UNREPL.
+ * string, and return a pointer to a UTF-8 string, allocated using the wmem
+ * scope, with all ill-formed sequences replaced with the Unicode REPLACEMENT
+ * CHARACTER according to the recommended "best practices" given in the Unicode
+ * Standard and specified by W3C/WHATWG.
*/
static guint8 *
tvb_get_utf_8_string(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, const gint length)
{
- guint8 *strbuf;
+ const guint8 *ptr;
- tvb_ensure_bytes_exist(tvb, offset, length); /* make sure length = -1 fails */
- strbuf = (guint8 *)wmem_alloc(scope, length + 1);
- tvb_memcpy(tvb, strbuf, offset, length);
- strbuf[length] = '\0';
- return strbuf;
+ ptr = ensure_contiguous(tvb, offset, length);
+ return get_utf_8_string(scope, ptr, length);
}
/*
@@ -2562,8 +2560,7 @@ tvb_get_utf_8_string(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset,
* of bytes referred to by the tvbuff, the offset, and the length as a
* raw string, and return a pointer to that string, allocated using the
* wmem scope. This means a null is appended at the end, but no replacement
- * checking is done otherwise. Currently tvb_get_utf_8_string() does not
- * replace either, but it might in the future.
+ * checking is done otherwise, unlike tvb_get_utf_8_string().
*
* Also, this one allows a length of -1 to mean get all, but does not
* allow a negative offset.
@@ -3087,14 +3084,14 @@ static guint8 *
tvb_get_utf_8_stringz(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, gint *lengthp)
{
guint size;
- guint8 *strptr;
+ const guint8 *ptr;
size = tvb_strsize(tvb, offset);
- strptr = (guint8 *)wmem_alloc(scope, size);
- tvb_memcpy(tvb, strptr, offset, size);
+ ptr = ensure_contiguous(tvb, offset, size);
+ /* XXX, conversion between signed/unsigned integer */
if (lengthp)
*lengthp = size;
- return strptr;
+ return get_utf_8_string(scope, ptr, size);
}
static guint8 *