dfilter: Require double-quoted strings with "matches"

Matches is a special case that looks at the RHS and tries to convert every unparsed value to a string, regardless of the LHS type. This is not how types work in the display filter. Require double-quotes to avoid ambiguity, because matches doesn't follow normal Wireshark display filter type rules. It neither needs nor benefits from the flexibility provided by unparsed strings in the syntax. For matches the RHS is always a literal string, except if the RHS is also a field name, in which case it complains of an incompatible type. This is confusing. No type can be compatible because no type rules are ever considered. Every unparsed value is a text string, except that if it happens to coincide with a field name it also requires double-quoting or it throws a syntax error, just to be difficult. We could remove this odd quirk, but requiring double-quotes for regular expressions is a better, more elegant fix. Before: Filter: tcp matches "udp" Constants: 00000 PUT_PCRE udp -> reg#1 Instructions: 00000 READ_TREE tcp -> reg#0 00001 IF-FALSE-GOTO 3 00002 ANY_MATCHES reg#0 matches reg#1 00003 RETURN Filter: tcp matches udp Constants: 00000 PUT_PCRE udp -> reg#1 Instructions: 00000 READ_TREE tcp -> reg#0 00001 IF-FALSE-GOTO 3 00002 ANY_MATCHES reg#0 matches reg#1 00003 RETURN Filter: tcp matches udp.srcport dftest: tcp and udp.srcport are not of compatible types. Filter: tcp matches udp.srcportt Constants: 00000 PUT_PCRE udp.srcportt -> reg#1 Instructions: 00000 READ_TREE tcp -> reg#0 00001 IF-FALSE-GOTO 3 00002 ANY_MATCHES reg#0 matches reg#1 00003 RETURN After: Filter: tcp matches "udp" Constants: 00000 PUT_PCRE udp -> reg#1 Instructions: 00000 READ_TREE tcp -> reg#0 00001 IF-FALSE-GOTO 3 00002 ANY_MATCHES reg#0 matches reg#1 00003 RETURN Filter: tcp matches udp dftest: "udp" was unexpected in this context. Filter: tcp matches udp.srcport dftest: "udp.srcport" was unexpected in this context. Filter: tcp matches udp.srcportt dftest: "udp.srcportt" was unexpected in this context. 
The error message could still be improved.
author: João Valverde <j@v6e.pt> 2021-10-09 16:40:08 +0100
committer: Wireshark GitLab Utility <gerald+gitlab-utility@wireshark.org> 2021-10-17 22:53:36 +0000
commit: a975d478badf9e3cf1e5faa894197eb27b26fc41 (patch)
tree: 3af83c1fefaa1440ba96c868dbffe54c79f885c5 /epan/dfilter
parent: 4e5e8066044f830407678c60b6b8b1c5aa21873b (diff)
5 files changed, 87 insertions, 117 deletions
diff --git a/epan/dfilter/dfilter-int.h b/epan/dfilter/dfilter-int.h
index 9e6cef482c..44d1a448ae 100644
--- a/epan/dfilter/dfilter-int.h
+++ b/epan/dfilter/dfilter-int.h
@@ -86,6 +86,9 @@ DfilterTrace(FILE *TraceFILE, char *zTracePrompt);
 stnode_t *
 dfilter_new_function(dfwork_t *dfw, const char *name);
 
+stnode_t *
+dfilter_new_regex(dfwork_t *dfw, const char *patt);
+
 gboolean
 dfilter_str_to_gint32(dfwork_t *dfw, const char *s, gint32* pint);
 
diff --git a/epan/dfilter/dfilter.c b/epan/dfilter/dfilter.c
index 5cbf570ce2..9dd34f1163 100644
--- a/epan/dfilter/dfilter.c
+++ b/epan/dfilter/dfilter.c
@@ -78,6 +78,42 @@ dfilter_new_function(dfwork_t *dfw, const char *name)
 	return stnode_new(STTYPE_FUNCTION, def, name);
 }
 
+/* Compiles patt into a GRegex and returns it wrapped in a new STTYPE_PCRE node; on a compile error the message is reported via dfilter_parse_fail() and the node carries a NULL GRegex. */
+stnode_t *
+dfilter_new_regex(dfwork_t *dfw, const char *patt)
+{
+	GError *regex_error = NULL;
+	GRegex *pcre;
+
+	ws_debug("Compile regex pattern: %s", patt);
+
+	/*
+	 * As a string is not guaranteed to contain valid UTF-8,
+	 * we have to disable support for UTF-8 patterns and treat
+	 * every pattern and subject as raw bytes.
+	 *
+	 * Should support for UTF-8 patterns be necessary, then we
+	 * should compile a pattern without G_REGEX_RAW. Additionally,
+	 * we MUST use g_utf8_validate() before calling g_regex_match_full()
+	 * or risk crashes.
+	 */
+	GRegexCompileFlags cflags = G_REGEX_CASELESS | G_REGEX_OPTIMIZE | G_REGEX_RAW;
+
+	pcre = g_regex_new(
+			patt,			/* pattern */
+			cflags,			/* Compile options */
+			0,			/* Match options */
+			&regex_error);		/* Compile / study errors */
+
+	if (regex_error) {
+		dfilter_parse_fail(dfw, "%s", regex_error->message);
+		g_error_free(regex_error);
+		pcre = NULL;
+	}
+
+	return stnode_new(STTYPE_PCRE, pcre, patt);
+}
+
 gboolean
 dfilter_str_to_gint32(dfwork_t *dfw, const char *s, gint32* pint)
 {
diff --git a/epan/dfilter/grammar.lemon b/epan/dfilter/grammar.lemon
index d3dc79ea9c..b48d76da9c 100644
--- a/epan/dfilter/grammar.lemon
+++ b/epan/dfilter/grammar.lemon
@@ -302,10 +302,13 @@ relation_test(T) ::= entity(E) rel_binop(O) relation_test(R).
 }
 
 /* "matches" does not chain with other relational tests. */ 
-relation_test(T) ::= entity(E) TEST_MATCHES entity(F).
+relation_test(T) ::= entity(E) TEST_MATCHES STRING(S).
 {
+	stnode_t *R = dfilter_new_regex(dfw, stnode_token_value(S));
+	stnode_free(S);
+
 	T = stnode_new(STTYPE_TEST, NULL, NULL);
-	sttype_test_set2(T, TEST_OP_MATCHES, E, F);
+	sttype_test_set2(T, TEST_OP_MATCHES, E, R);
 }
 
 relation_test(T) ::= entity(E) TEST_IN LBRACE set_node_list(L) RBRACE.
diff --git a/epan/dfilter/semcheck.c b/epan/dfilter/semcheck.c
index be9a5ec145..bef49649d1 100644
--- a/epan/dfilter/semcheck.c
+++ b/epan/dfilter/semcheck.c
@@ -424,50 +424,6 @@ is_bytes_type(enum ftenum type)
 	return FALSE;
 }
 
-/* Gets a GRegex from a string, and sets the error message on failure. */
-WS_RETNONNULL
-static GRegex*
-dfilter_g_regex_from_string(dfwork_t *dfw, stnode_t *st)
-{
-	GError *regex_error = NULL;
-	GRegexCompileFlags cflags = (GRegexCompileFlags)(G_REGEX_CASELESS | G_REGEX_OPTIMIZE);
-	GRegex *pcre;
-	const char *s = stnode_data(st);
-
-	/*
-	 * As FT_BYTES and FT_PROTOCOL contain arbitrary binary data
-	 * and FT_STRING is not guaranteed to contain valid UTF-8,
-	 * we have to disable support for UTF-8 patterns and treat
-	 * every pattern and subject as raw bytes.
-	 *
-	 * Should support for UTF-8 patterns be necessary, then we
-	 * should compile a pattern without G_REGEX_RAW. Additionally,
-	 * we MUST use g_utf8_validate() before calling g_regex_match_full()
-	 * or risk crashes.
-	 */
-	cflags = (GRegexCompileFlags)(cflags | G_REGEX_RAW);
-
-	ws_debug("Compile regex pattern: %s", s);
-
-	pcre = g_regex_new(
-			s,			/* pattern */
-			cflags,			/* Compile options */
-			(GRegexMatchFlags)0,	/* Match options */
-			&regex_error		/* Compile / study errors */
-			);
-
-	if (regex_error) {
-		if (dfw->error_message == NULL)
-			dfw->error_message = g_strdup(regex_error->message);
-		g_error_free(regex_error);
-		if (pcre) {
-			g_regex_unref(pcre);
-		}
-		THROW(TypeError);
-	}
-	return pcre;
-}
-
 /* Check the semantics of an existence test. */
 static void
 check_exists(dfwork_t *dfw, stnode_t *st_arg1)
@@ -738,7 +694,6 @@ check_relation_LHS_FIELD(dfwork_t *dfw, const char *relation_string,
 	df_func_def_t		*funcdef;
 	ftenum_t		ftype1, ftype2;
 	fvalue_t		*fvalue;
-	GRegex			*pcre;
 
 	type2 = stnode_type_id(st_arg2);
 
@@ -777,34 +732,28 @@ check_relation_LHS_FIELD(dfwork_t *dfw, const char *relation_string,
 	}
 	else if (type2 == STTYPE_STRING || type2 == STTYPE_UNPARSED ||
 	         type2 == STTYPE_CHARCONST) {
-		if (strcmp(relation_string, "matches") == 0) {
-			/* Convert to a GRegex */
-			pcre = dfilter_g_regex_from_string(dfw, st_arg2);
-			stnode_replace(st_arg2, STTYPE_PCRE, pcre);
-		} else {
-			/* Skip incompatible fields */
-			while (hfinfo1->same_name_prev_id != -1 &&
-					((type2 == STTYPE_STRING && ftype1 != FT_STRING && ftype1!= FT_STRINGZ) ||
-					(type2 != STTYPE_STRING && (ftype1 == FT_STRING || ftype1== FT_STRINGZ)))) {
-				hfinfo1 = proto_registrar_get_nth(hfinfo1->same_name_prev_id);
-				ftype1 = hfinfo1->type;
-			}
+		/* Skip incompatible fields */
+		while (hfinfo1->same_name_prev_id != -1 &&
+				((type2 == STTYPE_STRING && ftype1 != FT_STRING && ftype1!= FT_STRINGZ) ||
+				(type2 != STTYPE_STRING && (ftype1 == FT_STRING || ftype1== FT_STRINGZ)))) {
+			hfinfo1 = proto_registrar_get_nth(hfinfo1->same_name_prev_id);
+			ftype1 = hfinfo1->type;
+		}
 
-			if (type2 == STTYPE_STRING) {
-				fvalue = dfilter_fvalue_from_string(dfw, ftype1, st_arg2, hfinfo1);
-			}
-			else if (type2 == STTYPE_CHARCONST &&
-			    strcmp(relation_string, "contains") == 0) {
-				/* The RHS should be the same type as the LHS,
-				 * but a character is just a one-byte byte
-				 * string. */
-				fvalue = dfilter_fvalue_from_charconst_string(dfw, ftype1, st_arg2, allow_partial_value);
-			}
-			else {
-				fvalue = dfilter_fvalue_from_unparsed(dfw, ftype1, st_arg2, allow_partial_value, hfinfo1);
-			}
-			stnode_replace(st_arg2, STTYPE_FVALUE, fvalue);
+		if (type2 == STTYPE_STRING) {
+			fvalue = dfilter_fvalue_from_string(dfw, ftype1, st_arg2, hfinfo1);
 		}
+		else if (type2 == STTYPE_CHARCONST &&
+		    strcmp(relation_string, "contains") == 0) {
+			/* The RHS should be the same type as the LHS,
+			 * but a character is just a one-byte byte
+			 * string. */
+			fvalue = dfilter_fvalue_from_charconst_string(dfw, ftype1, st_arg2, allow_partial_value);
+		}
+		else {
+			fvalue = dfilter_fvalue_from_unparsed(dfw, ftype1, st_arg2, allow_partial_value, hfinfo1);
+		}
+		stnode_replace(st_arg2, STTYPE_FVALUE, fvalue);
 	}
 	else if (type2 == STTYPE_RANGE) {
 		check_drange_sanity(dfw, st_arg2);
@@ -881,6 +830,9 @@ check_relation_LHS_FIELD(dfwork_t *dfw, const char *relation_string,
 			nodelist = g_slist_next(nodelist);
 		}
 	}
+	else if (type2 == STTYPE_PCRE) {
+		ws_assert_streq(relation_string, "matches");
+	}
 	else {
 		ws_assert_not_reached();
 	}
@@ -1032,7 +984,6 @@ check_relation_LHS_RANGE(dfwork_t *dfw, const char *relation_string,
 	header_field_info	*hfinfo2;
 	ftenum_t		ftype2;
 	fvalue_t		*fvalue;
-	GRegex			*pcre;
 
 	ws_debug("5 check_relation_LHS_RANGE(%s)", relation_string);
 
@@ -1059,38 +1010,20 @@ check_relation_LHS_RANGE(dfwork_t *dfw, const char *relation_string,
 	}
 	else if (type2 == STTYPE_STRING) {
 		ws_debug("5 check_relation_LHS_RANGE(type2 = STTYPE_STRING)");
-		if (strcmp(relation_string, "matches") == 0) {
-			/* Convert to a GRegex * */
-			pcre = dfilter_g_regex_from_string(dfw, st_arg2);
-			stnode_replace(st_arg2, STTYPE_PCRE, pcre);
-		} else {
-			fvalue = dfilter_fvalue_from_string(dfw, FT_BYTES, st_arg2, NULL);
-			stnode_replace(st_arg2, STTYPE_FVALUE, fvalue);
-		}
+		fvalue = dfilter_fvalue_from_string(dfw, FT_BYTES, st_arg2, NULL);
+		stnode_replace(st_arg2, STTYPE_FVALUE, fvalue);
 	}
 	else if (type2 == STTYPE_UNPARSED) {
 		ws_debug("5 check_relation_LHS_RANGE(type2 = STTYPE_UNPARSED)");
-		if (strcmp(relation_string, "matches") == 0) {
-			/* Convert to a GRegex */
-			pcre = dfilter_g_regex_from_string(dfw, st_arg2);
-			stnode_replace(st_arg2, STTYPE_PCRE, pcre);
-		} else {
-			fvalue = dfilter_fvalue_from_unparsed(dfw, FT_BYTES, st_arg2, allow_partial_value, NULL);
-			stnode_replace(st_arg2, STTYPE_FVALUE, fvalue);
-		}
+		fvalue = dfilter_fvalue_from_unparsed(dfw, FT_BYTES, st_arg2, allow_partial_value, NULL);
+		stnode_replace(st_arg2, STTYPE_FVALUE, fvalue);
 	}
 	else if (type2 == STTYPE_CHARCONST) {
 		ws_debug("5 check_relation_LHS_RANGE(type2 = STTYPE_CHARCONST)");
-		if (strcmp(relation_string, "matches") == 0) {
-			/* Convert to a GRegex */
-			pcre = dfilter_g_regex_from_string(dfw, st_arg2);
-			stnode_replace(st_arg2, STTYPE_PCRE, pcre);
-		} else {
-			/* The RHS should be FT_BYTES, but a character is just a
-			 * one-byte byte string. */
-			fvalue = dfilter_fvalue_from_charconst_string(dfw, FT_BYTES, st_arg2, allow_partial_value);
-			stnode_replace(st_arg2, STTYPE_FVALUE, fvalue);
-		}
+		/* The RHS should be FT_BYTES, but a character is just a
+		 * one-byte byte string. */
+		fvalue = dfilter_fvalue_from_charconst_string(dfw, FT_BYTES, st_arg2, allow_partial_value);
+		stnode_replace(st_arg2, STTYPE_FVALUE, fvalue);
 	}
 	else if (type2 == STTYPE_RANGE) {
 		ws_debug("5 check_relation_LHS_RANGE(type2 = STTYPE_RANGE)");
@@ -1118,6 +1051,9 @@ check_relation_LHS_RANGE(dfwork_t *dfw, const char *relation_string,
 		dfilter_fail(dfw, "Only a field may be tested for membership in a set.");
 		THROW(TypeError);
 	}
+	else if (type2 == STTYPE_PCRE) {
+		ws_assert_streq(relation_string, "matches");
+	}
 	else {
 		ws_assert_not_reached();
 	}
@@ -1154,7 +1090,6 @@ check_relation_LHS_FUNCTION(dfwork_t *dfw, const char *relation_string,
 	header_field_info	*hfinfo2;
 	ftenum_t		ftype1, ftype2;
 	fvalue_t		*fvalue;
-	GRegex			*pcre;
 	df_func_def_t		*funcdef;
 	df_func_def_t		*funcdef2;
 	/* GSList          *params; */
@@ -1192,24 +1127,12 @@ check_relation_LHS_FUNCTION(dfwork_t *dfw, const char *relation_string,
 		}
 	}
 	else if (type2 == STTYPE_STRING) {
-		if (strcmp(relation_string, "matches") == 0) {
-			/* Convert to a GRegex */
-			pcre = dfilter_g_regex_from_string(dfw, st_arg2);
-			stnode_replace(st_arg2, STTYPE_PCRE, pcre);
-		} else {
-			fvalue = dfilter_fvalue_from_string(dfw, ftype1, st_arg2, NULL);
-			stnode_replace(st_arg2, STTYPE_FVALUE, fvalue);
-		}
+		fvalue = dfilter_fvalue_from_string(dfw, ftype1, st_arg2, NULL);
+		stnode_replace(st_arg2, STTYPE_FVALUE, fvalue);
 	}
 	else if (type2 == STTYPE_UNPARSED || type2 == STTYPE_CHARCONST) {
-		if (strcmp(relation_string, "matches") == 0) {
-			/* Convert to a GRegex */
-			pcre = dfilter_g_regex_from_string(dfw, st_arg2);
-			stnode_replace(st_arg2, STTYPE_PCRE, pcre);
-		} else {
-			fvalue = dfilter_fvalue_from_unparsed(dfw, ftype1, st_arg2, allow_partial_value, NULL);
-			stnode_replace(st_arg2, STTYPE_FVALUE, fvalue);
-		}
+		fvalue = dfilter_fvalue_from_unparsed(dfw, ftype1, st_arg2, allow_partial_value, NULL);
+		stnode_replace(st_arg2, STTYPE_FVALUE, fvalue);
 	}
 	else if (type2 == STTYPE_RANGE) {
 		check_drange_sanity(dfw, st_arg2);
@@ -1249,6 +1172,9 @@ check_relation_LHS_FUNCTION(dfwork_t *dfw, const char *relation_string,
 		dfilter_fail(dfw, "Only a field may be tested for membership in a set.");
 		THROW(TypeError);
 	}
+	else if (type2 == STTYPE_PCRE) {
+		ws_assert_streq(relation_string, "matches");
+	}
 	else {
 		ws_assert_not_reached();
 	}
diff --git a/epan/dfilter/syntax-tree.c b/epan/dfilter/syntax-tree.c
index 6638cc09dd..1bd88f88fd 100644
--- a/epan/dfilter/syntax-tree.c
+++ b/epan/dfilter/syntax-tree.c
@@ -119,6 +119,8 @@ _node_init(stnode_t *node, sttype_id_t type_id, gpointer data)
 		node->data = NULL;
 	}
 	else {
+		/* Creating an initialized node with a NULL pointer is
+		 * allowed and needs to be safe. The parser relies on that. */
 		type = sttype_lookup(type_id);
 		ws_assert(type);
 		node->type = type;
author	João Valverde <j@v6e.pt>	2021-10-09 16:40:08 +0100
committer	Wireshark GitLab Utility <gerald+gitlab-utility@wireshark.org>	2021-10-17 22:53:36 +0000
commit	a975d478badf9e3cf1e5faa894197eb27b26fc41 (patch)
tree	3af83c1fefaa1440ba96c868dbffe54c79f885c5 /epan/dfilter
parent	4e5e8066044f830407678c60b6b8b1c5aa21873b (diff)