diff options
author | John Thacker <johnthacker@gmail.com> | 2022-02-03 08:28:11 -0500 |
---|---|---|
committer | A Wireshark GitLab Utility <gerald+gitlab-utility@wireshark.org> | 2022-02-06 00:32:13 +0000 |
commit | 25d0c88251b7649cd58ae791252ec74d6ab6e3de (patch) | |
tree | 18319432dcff185e9ce8325336267799067276c5 /wsutil | |
parent | ebe22f7b7b6238d2cbe35889c89ffc3c485ea696 (diff) |
epan: Add BASE_SHOW_UTF_8_PRINTABLE
Add BASE_SHOW_UTF_8_PRINTABLE and related function tvb_utf_8_isprint
for supporting fields of bytes that are "maybe UTF-8" (default or
SHOULD be UTF-8 but could be something else, with no encoding indicator),
such as SSID fields in IEEE 802.11 (See #16208), certain OctetString
fields in Diameter or PFCP, and other places where
BASE_SHOW_ASCII_PRINTABLE is currently used. Fix #5307
Diffstat (limited to 'wsutil')
-rw-r--r-- | wsutil/str_util.c | 16 | ||||
-rw-r--r-- | wsutil/str_util.h | 26 |
2 files changed, 34 insertions, 8 deletions
diff --git a/wsutil/str_util.c b/wsutil/str_util.c index bb0ab91a0a..d17f9a6d60 100644 --- a/wsutil/str_util.c +++ b/wsutil/str_util.c @@ -273,18 +273,24 @@ isprint_string(const gchar *str) /* Check if an entire UTF-8 string is printable. */ gboolean -isprint_utf8_string(const gchar *str, guint length) +isprint_utf8_string(const gchar *str, const guint length) { - const char *c; + const gchar *strend = str + length; - if (!g_utf8_validate (str, length, NULL)) { + if (!g_utf8_validate_len(str, length, NULL)) { return FALSE; } - for (c = str; *c; c = g_utf8_next_char(c)) { - if (!g_unichar_isprint(g_utf8_get_char(c))) { + while (str < strend) { + /* This returns false for G_UNICODE_CONTROL | G_UNICODE_FORMAT | + * G_UNICODE_UNASSIGNED | G_UNICODE_SURROGATE + * XXX: Could it be ok to have certain format characters, e.g. + * U+00AD SOFT HYPHEN? If so, format_text() should be changed too. + */ + if (!g_unichar_isprint(g_utf8_get_char(str))) { return FALSE; } + str = g_utf8_next_char(str); } return TRUE; diff --git a/wsutil/str_util.h b/wsutil/str_util.h index 3029753c26..be6ce12485 100644 --- a/wsutil/str_util.h +++ b/wsutil/str_util.h @@ -114,14 +114,34 @@ gchar *ascii_strup_inplace(gchar *str); WS_DLL_PUBLIC gboolean isprint_string(const gchar *str); -/** Check if an entire UTF-8 string consists of printable characters +/** Given a not-necessarily-null-terminated string, expected to be in + * UTF-8 but possibly containing invalid sequences (as it may have come + * from packet data), and the length of the string, determine if the + * string is valid UTF-8 consisting entirely of printable characters. 
+ * + * This means that it: + * + * does not contain an illegal UTF-8 sequence (including overlong encodings, + * the sequences reserved for UTF-16 surrogate halves, and the values for + * code points above U+10FFFF that are no longer in Unicode) + * + * does not contain a non-printable Unicode character such as control + * characters (including internal NULL bytes) + * + * does not end in a partial sequence that could begin a valid character; + * + * does not start with a partial sequence that could end a valid character; + * + * and thus guarantees that the result of format_text() would be the same as + * that of wmem_strndup() with the same parameters. * * @param str The string to be checked * @param length The number of bytes to validate - * @return TRUE if the entire string is printable, otherwise FALSE + * @return TRUE if the entire string is valid and printable UTF-8, + * otherwise FALSE */ WS_DLL_PUBLIC -gboolean isprint_utf8_string(const gchar *str, guint length); +gboolean isprint_utf8_string(const gchar *str, const guint length); /** Check if an entire string consists of digits * |