Add some UTF-8 debug checks with a compile time flag

Some older dissectors that predate Unicode and parse text protocols are prone to generate invalid UTF-8 strings. This is a bug and can have safety implications. For example, passing invalid UTF-8 to proto_tree_add_string() is a common bug. There are safeguards in format_text() but this should not be relied on as a general solution to the problem. For one, as the name implies, it is only used with representation of a field value, which is not the same as the value itself of an FT_STRING field. Issue #18317 shows another reason why. For now this compile flag only enables extra checks for string ftypes, which covers a subset of proto.h APIs including proto_tree_append_string(). Later it should be extended to other interfaces. This is also not expected to be disabled for release builds because there are still many dissectors that do not correctly handle strings. More work is needed to 1) identify them and 2) fix them. Ping #18317
author: João Valverde <j@v6e.pt> 2022-09-26 22:39:01 +0100
committer: João Valverde <j@v6e.pt> 2022-09-27 17:04:44 +0000
commit: 6d06d4e46bc9bf5c133ca4b039be329ec1fe5067 (patch)
tree: 60b307a6a6f832f22958dbc69e98bf89dfe28fb4 /epan/ftypes/ftype-string.c
parent: 7b53fd127ef42a39a9d5a71732eab13c5b42dc33 (diff)
1 files changed, 22 insertions, 5 deletions
diff --git a/epan/ftypes/ftype-string.c b/epan/ftypes/ftype-string.c
index c2789a2422..017a71fb94 100644
--- a/epan/ftypes/ftype-string.c
+++ b/epan/ftypes/ftype-string.c
@@ -15,6 +15,23 @@
 #include <strutil.h>
 #include <wsutil/ws_assert.h>
 
+
+#ifdef WS_DEBUG_UTF_8
+static inline void
+string_validate_utf8(fvalue_t *fv)
+{
+	if (wmem_strbuf_sanitize_utf8(fv->value.strbuf)) {
+		ws_warning("String fvalues must use a valid UTF-8 encoding."
+				" This string has been sanitized to look like this: %s",
+				wmem_strbuf_get_str(fv->value.strbuf));
+	}
+}
+#define CHECK_UTF_8(fv) string_validate_utf8(fv)
+#else /* !WS_DEBUG_UTF_8 */
+#define CHECK_UTF_8(fv)  (void)(fv)
+#endif /* WS_DEBUG_UTF_8 */
+
+
 static void
 string_fvalue_new(fvalue_t *fv)
 {
@@ -42,6 +59,7 @@ string_fvalue_set_strbuf(fvalue_t *fv, wmem_strbuf_t *value)
 	string_fvalue_free(fv);
 
 	fv->value.strbuf = value;
+	CHECK_UTF_8(fv);
 }
 
 static char *
@@ -73,20 +91,19 @@ val_from_string(fvalue_t *fv, const char *s, size_t len, gchar **err_msg _U_)
 		fv->value.strbuf = wmem_strbuf_new_len(NULL, s, len);
 	else
 		fv->value.strbuf = wmem_strbuf_new(NULL, s);
+
+	CHECK_UTF_8(fv);
 	return TRUE;
 }
 
 static gboolean
-val_from_literal(fvalue_t *fv, const char *s, gboolean allow_partial_value _U_, gchar **err_msg _U_)
+val_from_literal(fvalue_t *fv, const char *s, gboolean allow_partial_value _U_, gchar **err_msg)
 {
 	/* Just turn it into a string */
 	/* XXX Should probably be a syntax error instead. It's more user-friendly to ask the
 	 * user to be explicit about the meaning of an unquoted literal than them trying to figure out
 	 * why a valid filter expression is giving wrong results. */
-	string_fvalue_free(fv);
-
-	fv->value.strbuf = wmem_strbuf_new(NULL, s);
-	return TRUE;
+	return val_from_string(fv, s, 0, err_msg);
 }
 
 static gboolean
author	João Valverde <j@v6e.pt>	2022-09-26 22:39:01 +0100
committer	João Valverde <j@v6e.pt>	2022-09-27 17:04:44 +0000
commit	6d06d4e46bc9bf5c133ca4b039be329ec1fe5067 (patch)
tree	60b307a6a6f832f22958dbc69e98bf89dfe28fb4 /epan/ftypes/ftype-string.c
parent	7b53fd127ef42a39a9d5a71732eab13c5b42dc33 (diff)