dfilter: Require double-quoted strings with "matches"

Matches is a special case that looks on the RHS and tries to convert every unparsed value to a string, regardless of the LHS type. This is not how types work in the display filter. Require double-quotes to avoid ambiguity, because matches doesn't follow normal Wireshark display filter type rules. It doesn't need nor benefit from the flexibility provided by unparsed strings in the syntax. For matches the RHS is always a literal strings except if the RHS is also a field name, then it complains of an incompatible type. This is confusing. No type can be compatible because no type rules are ever considered. Every unparsed value is a text string except if it happens to coincide with a field name it also requires double-quoting or it throws a syntax error, just to be difficult. We could remove this odd quirk but requiring double-quotes for regular expressions is a better, more elegant fix. Before: Filter: tcp matches "udp" Constants: 00000 PUT_PCRE udp -> reg#1 Instructions: 00000 READ_TREE tcp -> reg#0 00001 IF-FALSE-GOTO 3 00002 ANY_MATCHES reg#0 matches reg#1 00003 RETURN Filter: tcp matches udp Constants: 00000 PUT_PCRE udp -> reg#1 Instructions: 00000 READ_TREE tcp -> reg#0 00001 IF-FALSE-GOTO 3 00002 ANY_MATCHES reg#0 matches reg#1 00003 RETURN Filter: tcp matches udp.srcport dftest: tcp and udp.srcport are not of compatible types. Filter: tcp matches udp.srcportt Constants: 00000 PUT_PCRE udp.srcportt -> reg#1 Instructions: 00000 READ_TREE tcp -> reg#0 00001 IF-FALSE-GOTO 3 00002 ANY_MATCHES reg#0 matches reg#1 00003 RETURN After: Filter: tcp matches "udp" Constants: 00000 PUT_PCRE udp -> reg#1 Instructions: 00000 READ_TREE tcp -> reg#0 00001 IF-FALSE-GOTO 3 00002 ANY_MATCHES reg#0 matches reg#1 00003 RETURN Filter: tcp matches udp dftest: "udp" was unexpected in this context. Filter: tcp matches udp.srcport dftest: "udp.srcport" was unexpected in this context. Filter: tcp matches udp.srcportt dftest: "udp.srcportt" was unexpected in this context. The error message could still be improved.
author: João Valverde <j@v6e.pt> 2021-10-09 16:40:08 +0100
committer: Wireshark GitLab Utility <gerald+gitlab-utility@wireshark.org> 2021-10-17 22:53:36 +0000
commit: a975d478badf9e3cf1e5faa894197eb27b26fc41 (patch)
tree: 3af83c1fefaa1440ba96c868dbffe54c79f885c5
parent: 4e5e8066044f830407678c60b6b8b1c5aa21873b (diff)
8 files changed, 105 insertions, 117 deletions
diff --git a/docbook/release-notes.adoc b/docbook/release-notes.adoc
index 7253ac654c..124565f7ab 100644
--- a/docbook/release-notes.adoc
+++ b/docbook/release-notes.adoc
@@ -108,6 +108,8 @@ The following features are new (or have been significantly updated) since versio
    match a valid byte array specification. Now this is a syntax error. Use double-quotes to match literal strings.
 ** For string comparisons literal byte arrays are always interpreted as unquoted literal strings. This avoids unexpected results with embedded NUL bytes.
    For example "http.user_agent contains aa:bb" tries to match "aa:bb". Avoid this usage, always use double-quotes: http.user_agent contains "\xaa\xbb".
+** Regular expressions (using "matches" or "~") must be specified using character strings. It is a syntax error to omit the double-quotes around
+   the regular expression. Before the syntax rules of an unquoted regex string could be difficult to predict.
 
 // === Removed Features and Support
 
diff --git a/epan/dfilter/dfilter-int.h b/epan/dfilter/dfilter-int.h
index 9e6cef482c..44d1a448ae 100644
--- a/epan/dfilter/dfilter-int.h
+++ b/epan/dfilter/dfilter-int.h
@@ -86,6 +86,9 @@ DfilterTrace(FILE *TraceFILE, char *zTracePrompt);
 stnode_t *
 dfilter_new_function(dfwork_t *dfw, const char *name);
 
+stnode_t *
+dfilter_new_regex(dfwork_t *dfw, const char *patt);
+
 gboolean
 dfilter_str_to_gint32(dfwork_t *dfw, const char *s, gint32* pint);
 
diff --git a/epan/dfilter/dfilter.c b/epan/dfilter/dfilter.c
index 5cbf570ce2..9dd34f1163 100644
--- a/epan/dfilter/dfilter.c
+++ b/epan/dfilter/dfilter.c
@@ -78,6 +78,42 @@ dfilter_new_function(dfwork_t *dfw, const char *name)
 	return stnode_new(STTYPE_FUNCTION, def, name);
 }
 
+/* Gets a GRegex from a string, and sets the error message on failure. */
+stnode_t *
+dfilter_new_regex(dfwork_t *dfw, const char *patt)
+{
+	GError *regex_error = NULL;
+	GRegex *pcre;
+
+	ws_debug("Compile regex pattern: %s", patt);
+
+	/*
+	 * As a string is not guaranteed to contain valid UTF-8,
+	 * we have to disable support for UTF-8 patterns and treat
+	 * every pattern and subject as raw bytes.
+	 *
+	 * Should support for UTF-8 patterns be necessary, then we
+	 * should compile a pattern without G_REGEX_RAW. Additionally,
+	 * we MUST use g_utf8_validate() before calling g_regex_match_full()
+	 * or risk crashes.
+	 */
+	GRegexCompileFlags cflags = G_REGEX_CASELESS | G_REGEX_OPTIMIZE | G_REGEX_RAW;
+
+	pcre = g_regex_new(
+			patt,			/* pattern */
+			cflags,			/* Compile options */
+			0,			/* Match options */
+			&regex_error);		/* Compile / study errors */
+
+	if (regex_error) {
+		dfilter_parse_fail(dfw, "%s", regex_error->message);
+		g_error_free(regex_error);
+		pcre = NULL;
+	}
+
+	return stnode_new(STTYPE_PCRE, pcre, patt);
+}
+
 gboolean
 dfilter_str_to_gint32(dfwork_t *dfw, const char *s, gint32* pint)
 {
diff --git a/epan/dfilter/grammar.lemon b/epan/dfilter/grammar.lemon
index d3dc79ea9c..b48d76da9c 100644
--- a/epan/dfilter/grammar.lemon
+++ b/epan/dfilter/grammar.lemon
@@ -302,10 +302,13 @@ relation_test(T) ::= entity(E) rel_binop(O) relation_test(R).
 }
 
 /* "matches" does not chain with other relational tests. */ 
-relation_test(T) ::= entity(E) TEST_MATCHES entity(F).
+relation_test(T) ::= entity(E) TEST_MATCHES STRING(S).
 {
+	stnode_t *R = dfilter_new_regex(dfw, stnode_token_value(S));
+	stnode_free(S);
+
 	T = stnode_new(STTYPE_TEST, NULL, NULL);
-	sttype_test_set2(T, TEST_OP_MATCHES, E, F);
+	sttype_test_set2(T, TEST_OP_MATCHES, E, R);
 }
 
 relation_test(T) ::= entity(E) TEST_IN LBRACE set_node_list(L) RBRACE.
diff --git a/epan/dfilter/semcheck.c b/epan/dfilter/semcheck.c
index be9a5ec145..bef49649d1 100644
--- a/epan/dfilter/semcheck.c
+++ b/epan/dfilter/semcheck.c
@@ -424,50 +424,6 @@ is_bytes_type(enum ftenum type)
 	return FALSE;
 }
 
-/* Gets a GRegex from a string, and sets the error message on failure. */
-WS_RETNONNULL
-static GRegex*
-dfilter_g_regex_from_string(dfwork_t *dfw, stnode_t *st)
-{
-	GError *regex_error = NULL;
-	GRegexCompileFlags cflags = (GRegexCompileFlags)(G_REGEX_CASELESS | G_REGEX_OPTIMIZE);
-	GRegex *pcre;
-	const char *s = stnode_data(st);
-
-	/*
-	 * As FT_BYTES and FT_PROTOCOL contain arbitrary binary data
-	 * and FT_STRING is not guaranteed to contain valid UTF-8,
-	 * we have to disable support for UTF-8 patterns and treat
-	 * every pattern and subject as raw bytes.
-	 *
-	 * Should support for UTF-8 patterns be necessary, then we
-	 * should compile a pattern without G_REGEX_RAW. Additionally,
-	 * we MUST use g_utf8_validate() before calling g_regex_match_full()
-	 * or risk crashes.
-	 */
-	cflags = (GRegexCompileFlags)(cflags | G_REGEX_RAW);
-
-	ws_debug("Compile regex pattern: %s", s);
-
-	pcre = g_regex_new(
-			s,			/* pattern */
-			cflags,			/* Compile options */
-			(GRegexMatchFlags)0,	/* Match options */
-			&regex_error		/* Compile / study errors */
-			);
-
-	if (regex_error) {
-		if (dfw->error_message == NULL)
-			dfw->error_message = g_strdup(regex_error->message);
-		g_error_free(regex_error);
-		if (pcre) {
-			g_regex_unref(pcre);
-		}
-		THROW(TypeError);
-	}
-	return pcre;
-}
-
 /* Check the semantics of an existence test. */
 static void
 check_exists(dfwork_t *dfw, stnode_t *st_arg1)
@@ -738,7 +694,6 @@ check_relation_LHS_FIELD(dfwork_t *dfw, const char *relation_string,
 	df_func_def_t		*funcdef;
 	ftenum_t		ftype1, ftype2;
 	fvalue_t		*fvalue;
-	GRegex			*pcre;
 
 	type2 = stnode_type_id(st_arg2);
 
@@ -777,34 +732,28 @@ check_relation_LHS_FIELD(dfwork_t *dfw, const char *relation_string,
 	}
 	else if (type2 == STTYPE_STRING || type2 == STTYPE_UNPARSED ||
 	         type2 == STTYPE_CHARCONST) {
-		if (strcmp(relation_string, "matches") == 0) {
-			/* Convert to a GRegex */
-			pcre = dfilter_g_regex_from_string(dfw, st_arg2);
-			stnode_replace(st_arg2, STTYPE_PCRE, pcre);
-		} else {
-			/* Skip incompatible fields */
-			while (hfinfo1->same_name_prev_id != -1 &&
-					((type2 == STTYPE_STRING && ftype1 != FT_STRING && ftype1!= FT_STRINGZ) ||
-					(type2 != STTYPE_STRING && (ftype1 == FT_STRING || ftype1== FT_STRINGZ)))) {
-				hfinfo1 = proto_registrar_get_nth(hfinfo1->same_name_prev_id);
-				ftype1 = hfinfo1->type;
-			}
+		/* Skip incompatible fields */
+		while (hfinfo1->same_name_prev_id != -1 &&
+				((type2 == STTYPE_STRING && ftype1 != FT_STRING && ftype1!= FT_STRINGZ) ||
+				(type2 != STTYPE_STRING && (ftype1 == FT_STRING || ftype1== FT_STRINGZ)))) {
+			hfinfo1 = proto_registrar_get_nth(hfinfo1->same_name_prev_id);
+			ftype1 = hfinfo1->type;
+		}
 
-			if (type2 == STTYPE_STRING) {
-				fvalue = dfilter_fvalue_from_string(dfw, ftype1, st_arg2, hfinfo1);
-			}
-			else if (type2 == STTYPE_CHARCONST &&
-			    strcmp(relation_string, "contains") == 0) {
-				/* The RHS should be the same type as the LHS,
-				 * but a character is just a one-byte byte
-				 * string. */
-				fvalue = dfilter_fvalue_from_charconst_string(dfw, ftype1, st_arg2, allow_partial_value);
-			}
-			else {
-				fvalue = dfilter_fvalue_from_unparsed(dfw, ftype1, st_arg2, allow_partial_value, hfinfo1);
-			}
-			stnode_replace(st_arg2, STTYPE_FVALUE, fvalue);
+		if (type2 == STTYPE_STRING) {
+			fvalue = dfilter_fvalue_from_string(dfw, ftype1, st_arg2, hfinfo1);
 		}
+		else if (type2 == STTYPE_CHARCONST &&
+		    strcmp(relation_string, "contains") == 0) {
+			/* The RHS should be the same type as the LHS,
+			 * but a character is just a one-byte byte
+			 * string. */
+			fvalue = dfilter_fvalue_from_charconst_string(dfw, ftype1, st_arg2, allow_partial_value);
+		}
+		else {
+			fvalue = dfilter_fvalue_from_unparsed(dfw, ftype1, st_arg2, allow_partial_value, hfinfo1);
+		}
+		stnode_replace(st_arg2, STTYPE_FVALUE, fvalue);
 	}
 	else if (type2 == STTYPE_RANGE) {
 		check_drange_sanity(dfw, st_arg2);
@@ -881,6 +830,9 @@ check_relation_LHS_FIELD(dfwork_t *dfw, const char *relation_string,
 			nodelist = g_slist_next(nodelist);
 		}
 	}
+	else if (type2 == STTYPE_PCRE) {
+		ws_assert_streq(relation_string, "matches");
+	}
 	else {
 		ws_assert_not_reached();
 	}
@@ -1032,7 +984,6 @@ check_relation_LHS_RANGE(dfwork_t *dfw, const char *relation_string,
 	header_field_info	*hfinfo2;
 	ftenum_t		ftype2;
 	fvalue_t		*fvalue;
-	GRegex			*pcre;
 
 	ws_debug("5 check_relation_LHS_RANGE(%s)", relation_string);
 
@@ -1059,38 +1010,20 @@ check_relation_LHS_RANGE(dfwork_t *dfw, const char *relation_string,
 	}
 	else if (type2 == STTYPE_STRING) {
 		ws_debug("5 check_relation_LHS_RANGE(type2 = STTYPE_STRING)");
-		if (strcmp(relation_string, "matches") == 0) {
-			/* Convert to a GRegex * */
-			pcre = dfilter_g_regex_from_string(dfw, st_arg2);
-			stnode_replace(st_arg2, STTYPE_PCRE, pcre);
-		} else {
-			fvalue = dfilter_fvalue_from_string(dfw, FT_BYTES, st_arg2, NULL);
-			stnode_replace(st_arg2, STTYPE_FVALUE, fvalue);
-		}
+		fvalue = dfilter_fvalue_from_string(dfw, FT_BYTES, st_arg2, NULL);
+		stnode_replace(st_arg2, STTYPE_FVALUE, fvalue);
 	}
 	else if (type2 == STTYPE_UNPARSED) {
 		ws_debug("5 check_relation_LHS_RANGE(type2 = STTYPE_UNPARSED)");
-		if (strcmp(relation_string, "matches") == 0) {
-			/* Convert to a GRegex */
-			pcre = dfilter_g_regex_from_string(dfw, st_arg2);
-			stnode_replace(st_arg2, STTYPE_PCRE, pcre);
-		} else {
-			fvalue = dfilter_fvalue_from_unparsed(dfw, FT_BYTES, st_arg2, allow_partial_value, NULL);
-			stnode_replace(st_arg2, STTYPE_FVALUE, fvalue);
-		}
+		fvalue = dfilter_fvalue_from_unparsed(dfw, FT_BYTES, st_arg2, allow_partial_value, NULL);
+		stnode_replace(st_arg2, STTYPE_FVALUE, fvalue);
 	}
 	else if (type2 == STTYPE_CHARCONST) {
 		ws_debug("5 check_relation_LHS_RANGE(type2 = STTYPE_CHARCONST)");
-		if (strcmp(relation_string, "matches") == 0) {
-			/* Convert to a GRegex */
-			pcre = dfilter_g_regex_from_string(dfw, st_arg2);
-			stnode_replace(st_arg2, STTYPE_PCRE, pcre);
-		} else {
-			/* The RHS should be FT_BYTES, but a character is just a
-			 * one-byte byte string. */
-			fvalue = dfilter_fvalue_from_charconst_string(dfw, FT_BYTES, st_arg2, allow_partial_value);
-			stnode_replace(st_arg2, STTYPE_FVALUE, fvalue);
-		}
+		/* The RHS should be FT_BYTES, but a character is just a
+		 * one-byte byte string. */
+		fvalue = dfilter_fvalue_from_charconst_string(dfw, FT_BYTES, st_arg2, allow_partial_value);
+		stnode_replace(st_arg2, STTYPE_FVALUE, fvalue);
 	}
 	else if (type2 == STTYPE_RANGE) {
 		ws_debug("5 check_relation_LHS_RANGE(type2 = STTYPE_RANGE)");
@@ -1118,6 +1051,9 @@ check_relation_LHS_RANGE(dfwork_t *dfw, const char *relation_string,
 		dfilter_fail(dfw, "Only a field may be tested for membership in a set.");
 		THROW(TypeError);
 	}
+	else if (type2 == STTYPE_PCRE) {
+		ws_assert_streq(relation_string, "matches");
+	}
 	else {
 		ws_assert_not_reached();
 	}
@@ -1154,7 +1090,6 @@ check_relation_LHS_FUNCTION(dfwork_t *dfw, const char *relation_string,
 	header_field_info	*hfinfo2;
 	ftenum_t		ftype1, ftype2;
 	fvalue_t		*fvalue;
-	GRegex			*pcre;
 	df_func_def_t		*funcdef;
 	df_func_def_t		*funcdef2;
 	/* GSList          *params; */
@@ -1192,24 +1127,12 @@ check_relation_LHS_FUNCTION(dfwork_t *dfw, const char *relation_string,
 		}
 	}
 	else if (type2 == STTYPE_STRING) {
-		if (strcmp(relation_string, "matches") == 0) {
-			/* Convert to a GRegex */
-			pcre = dfilter_g_regex_from_string(dfw, st_arg2);
-			stnode_replace(st_arg2, STTYPE_PCRE, pcre);
-		} else {
-			fvalue = dfilter_fvalue_from_string(dfw, ftype1, st_arg2, NULL);
-			stnode_replace(st_arg2, STTYPE_FVALUE, fvalue);
-		}
+		fvalue = dfilter_fvalue_from_string(dfw, ftype1, st_arg2, NULL);
+		stnode_replace(st_arg2, STTYPE_FVALUE, fvalue);
 	}
 	else if (type2 == STTYPE_UNPARSED || type2 == STTYPE_CHARCONST) {
-		if (strcmp(relation_string, "matches") == 0) {
-			/* Convert to a GRegex */
-			pcre = dfilter_g_regex_from_string(dfw, st_arg2);
-			stnode_replace(st_arg2, STTYPE_PCRE, pcre);
-		} else {
-			fvalue = dfilter_fvalue_from_unparsed(dfw, ftype1, st_arg2, allow_partial_value, NULL);
-			stnode_replace(st_arg2, STTYPE_FVALUE, fvalue);
-		}
+		fvalue = dfilter_fvalue_from_unparsed(dfw, ftype1, st_arg2, allow_partial_value, NULL);
+		stnode_replace(st_arg2, STTYPE_FVALUE, fvalue);
 	}
 	else if (type2 == STTYPE_RANGE) {
 		check_drange_sanity(dfw, st_arg2);
@@ -1249,6 +1172,9 @@ check_relation_LHS_FUNCTION(dfwork_t *dfw, const char *relation_string,
 		dfilter_fail(dfw, "Only a field may be tested for membership in a set.");
 		THROW(TypeError);
 	}
+	else if (type2 == STTYPE_PCRE) {
+		ws_assert_streq(relation_string, "matches");
+	}
 	else {
 		ws_assert_not_reached();
 	}
diff --git a/epan/dfilter/syntax-tree.c b/epan/dfilter/syntax-tree.c
index 6638cc09dd..1bd88f88fd 100644
--- a/epan/dfilter/syntax-tree.c
+++ b/epan/dfilter/syntax-tree.c
@@ -119,6 +119,8 @@ _node_init(stnode_t *node, sttype_id_t type_id, gpointer data)
 		node->data = NULL;
 	}
 	else {
+		/* Creating an initialized node with a NULL pointer is
+		 * allowed and needs to be safe. The parser relies on that. */
 		type = sttype_lookup(type_id);
 		ws_assert(type);
 		node->type = type;
diff --git a/test/suite_dfilter/group_syntax.py b/test/suite_dfilter/group_syntax.py
index 7e1cfdacfd..23b39052ac 100644
--- a/test/suite_dfilter/group_syntax.py
+++ b/test/suite_dfilter/group_syntax.py
@@ -30,3 +30,15 @@ class case_syntax(unittest.TestCase):
     def test_value_string_1(self, checkDFilterSucceed):
         dfilter = 'eth.fcs.status=="Bad"'
         checkDFilterSucceed(dfilter)
+
+    def test_matches_1(self, checkDFilterSucceed):
+        dfilter = 'http.request.method matches "^HEAD"'
+        checkDFilterSucceed(dfilter)
+
+    def test_matches_2(self, checkDFilterFail):
+        dfilter = 'http.request.method matches HEAD'
+        checkDFilterFail(dfilter, '"HEAD" was unexpected in this context')
+
+    def test_matches_3(self, checkDFilterFail):
+        dfilter = 'http.request.method matches "^HEAD" matches "^POST"'
+        checkDFilterFail(dfilter, '"matches" was unexpected in this context.')
diff --git a/wsutil/ws_assert.h b/wsutil/ws_assert.h
index e2727b589c..90e4647279 100644
--- a/wsutil/ws_assert.h
+++ b/wsutil/ws_assert.h
@@ -13,6 +13,7 @@
 #include <ws_symbol_export.h>
 #include <ws_attributes.h>
 #include <stdbool.h>
+#include <string.h>
 
 #ifdef WS_LOG_DOMAIN
 #define _ASSERT_DOMAIN WS_LOG_DOMAIN
@@ -58,6 +59,9 @@ void ws_assert_failed(const char *file, int line, const char *function,
 #define ws_assert(expr) ws_abort_if_fail(expr)
 #endif
 
+#define ws_assert_streq(s1, s2) \
+        ws_assert((s1) && (s2) && strcmp((s1), (s2)) == 0)
+
 /*
  * We don't want to disable ws_assert_not_reached() with WS_DISABLE_ASSERT.
  * That would blast compiler warnings everywhere for no benefit, not
author	João Valverde <j@v6e.pt>	2021-10-09 16:40:08 +0100
committer	Wireshark GitLab Utility <gerald+gitlab-utility@wireshark.org>	2021-10-17 22:53:36 +0000
commit	a975d478badf9e3cf1e5faa894197eb27b26fc41 (patch)
tree	3af83c1fefaa1440ba96c868dbffe54c79f885c5
parent	4e5e8066044f830407678c60b6b8b1c5aa21873b (diff)