aboutsummaryrefslogtreecommitdiffstats
path: root/wsutil
diff options
context:
space:
mode:
author John Thacker <johnthacker@gmail.com> 2022-02-03 08:28:11 -0500
committer A Wireshark GitLab Utility <gerald+gitlab-utility@wireshark.org> 2022-02-06 00:32:13 +0000
commit 25d0c88251b7649cd58ae791252ec74d6ab6e3de (patch)
tree 18319432dcff185e9ce8325336267799067276c5 /wsutil
parent ebe22f7b7b6238d2cbe35889c89ffc3c485ea696 (diff)
epan: Add BASE_SHOW_UTF_8_PRINTABLE
Add BASE_SHOW_UTF_8_PRINTABLE and related function tvb_utf_8_isprint for supporting fields of bytes that are "maybe UTF-8" (default or SHOULD be UTF-8 but could be something else, with no encoding indicator), such as SSID fields in IEEE 802.11 (See #16208), certain OctetString fields in Diameter or PFCP, and other places where BASE_SHOW_ASCII_PRINTABLE is currently used. Fix #5307
Diffstat (limited to 'wsutil')
-rw-r--r-- wsutil/str_util.c 16
-rw-r--r-- wsutil/str_util.h 26
2 files changed, 34 insertions, 8 deletions
diff --git a/wsutil/str_util.c b/wsutil/str_util.c
index bb0ab91a0a..d17f9a6d60 100644
--- a/wsutil/str_util.c
+++ b/wsutil/str_util.c
@@ -273,18 +273,24 @@ isprint_string(const gchar *str)
/* Check if an entire UTF-8 string is printable. */
gboolean
-isprint_utf8_string(const gchar *str, guint length)
+isprint_utf8_string(const gchar *str, const guint length)
{
- const char *c;
+ const gchar *strend = str + length;
- if (!g_utf8_validate (str, length, NULL)) {
+ if (!g_utf8_validate_len(str, length, NULL)) {
return FALSE;
}
- for (c = str; *c; c = g_utf8_next_char(c)) {
- if (!g_unichar_isprint(g_utf8_get_char(c))) {
+ while (str < strend) {
+ /* This returns false for G_UNICODE_CONTROL | G_UNICODE_FORMAT |
+ * G_UNICODE_UNASSIGNED | G_UNICODE_SURROGATE
+ * XXX: Could it be ok to have certain format characters, e.g.
+ * U+00AD SOFT HYPHEN? If so, format_text() should be changed too.
+ */
+ if (!g_unichar_isprint(g_utf8_get_char(str))) {
return FALSE;
}
+ str = g_utf8_next_char(str);
}
return TRUE;
diff --git a/wsutil/str_util.h b/wsutil/str_util.h
index 3029753c26..be6ce12485 100644
--- a/wsutil/str_util.h
+++ b/wsutil/str_util.h
@@ -114,14 +114,34 @@ gchar *ascii_strup_inplace(gchar *str);
WS_DLL_PUBLIC
gboolean isprint_string(const gchar *str);
-/** Check if an entire UTF-8 string consists of printable characters
+/** Given a not-necessarily-null-terminated string, expected to be in
+ * UTF-8 but possibly containing invalid sequences (as it may have come
+ * from packet data), and the length of the string, determine if the
+ * string is valid UTF-8 consisting entirely of printable characters.
+ *
+ * This means that it:
+ *
+ * does not contain an illegal UTF-8 sequence (including overlong encodings,
+ * the sequences reserved for UTF-16 surrogate halves, and the values for
+ * code points above U+10FFFF that are no longer in Unicode);
+ *
+ * does not contain a non-printable Unicode character such as control
+ * characters (including internal NUL bytes);
+ *
+ * does not end in a partial sequence that could begin a valid character;
+ *
+ * does not start with a partial sequence that could end a valid character;
+ *
+ * and thus guarantees that the result of format_text() would be the same as
+ * that of wmem_strndup() with the same parameters.
*
* @param str The string to be checked
* @param length The number of bytes to validate
- * @return TRUE if the entire string is printable, otherwise FALSE
+ * @return TRUE if the entire string is valid and printable UTF-8,
+ * otherwise FALSE
*/
WS_DLL_PUBLIC
-gboolean isprint_utf8_string(const gchar *str, guint length);
+gboolean isprint_utf8_string(const gchar *str, const guint length);
/** Check if an entire string consists of digits
*