dfilter: Optimize some scanner patterns

Clean up flex code. Optimize some patterns to avoid lookups for field matches for values that are not legal field names. Improve warning and add some comments.
author: João Valverde <j@v6e.pt> 2022-12-29 06:04:00 +0000
committer: João Valverde <j@v6e.pt> 2023-01-07 21:15:25 +0000
commit: 1861679e8132a26ff1eb891e349856fdcea1f8be (patch)
tree: d1eaaa8be2265158a9cc276ddaefecff122a0f70
parent: 7641ba7416cdb0232ecef7c5aa3d39a711d1766c (diff)
5 files changed, 208 insertions, 67 deletions
diff --git a/epan/dfilter/scanner.l b/epan/dfilter/scanner.l
index 1943f2acc4..04c57de19a 100644
--- a/epan/dfilter/scanner.l
+++ b/epan/dfilter/scanner.l
@@ -80,13 +80,14 @@ WS_WARN_UNUSED static int set_lval_simple(df_scanner_state_t *state, int token,
 #define test(token)	(update_location(yyextra, yytext), set_lval_simple(yyextra, token, yytext, STTYPE_TEST))
 #define math(token)	(update_location(yyextra, yytext), set_lval_simple(yyextra, token, yytext, STTYPE_ARITHMETIC))
 
-WS_WARN_UNUSED static int set_lval_literal(df_scanner_state_t *state, const char *token_value);
-WS_WARN_UNUSED static int set_lval_unparsed(df_scanner_state_t *state, const char *token_value);
+WS_WARN_UNUSED static int set_lval_literal(df_scanner_state_t *state,  const char *value, const char *token_value);
+WS_WARN_UNUSED static int set_lval_identifier(df_scanner_state_t *state,  const char *value, const char *token_value);
+WS_WARN_UNUSED static int set_lval_constant(df_scanner_state_t *state,  const char *value, const char *token_value);
+WS_WARN_UNUSED static int set_lval_unparsed(df_scanner_state_t *state, const char *value, const char *token_value);
+
+WS_WARN_UNUSED static int set_lval_field(df_scanner_state_t *state, const header_field_info *hfinfo, const char *token_value);
 WS_WARN_UNUSED static int set_lval_quoted_string(df_scanner_state_t *state, GString *quoted_string);
 WS_WARN_UNUSED static int set_lval_charconst(df_scanner_state_t *state, GString *quoted_string);
-WS_WARN_UNUSED static int set_lval_field(df_scanner_state_t *state, const char *token_value, const header_field_info *hfinfo);
-WS_WARN_UNUSED static int set_lval_identifier(df_scanner_state_t *state, const char *token_value);
-WS_WARN_UNUSED static int set_lval_constant(df_scanner_state_t *state, const char *token_value);
 
 static gboolean append_escaped_char(df_scanner_state_t *state, GString *str, char c);
 static gboolean append_universal_character_name(df_scanner_state_t *state, GString *str, const char *ucn);
@@ -103,27 +104,43 @@ static void update_string_loc(df_scanner_state_t *state, const char *text);
 
 %}
 
-Identifier	[[:alnum:]_][[:alnum:]_-]*(\.[[:alnum:]_-]+)*
+FunctionIdentifier	[[:alpha:]_][[:alnum:]_]*
+
+/*
+ * Cannot start with '-'. Some protocol names can contain '-', for example "mac-lte".
+ * Note that some protocol names start with a number, for example "9p".
+ * Some protocol names contain dots, e.g. "_ws.expert".
+ * A protocol or protocol field name cannot contain DOTDOT anywhere.
+ */
+VarIdentifier		[[:alnum:]_][[:alnum:]_-]*
+ProtoFieldIdentifier	{VarIdentifier}(\.{VarIdentifier})*
+
+hex2			[[:xdigit:]]{2}
+ColonMacAddress		{hex2}:{hex2}:{hex2}:{hex2}:{hex2}:{hex2}
+HyphenMacAddress 	{hex2}-{hex2}-{hex2}-{hex2}-{hex2}-{hex2}
+DotMacAddress		{hex2}\.{hex2}\.{hex2}\.{hex2}\.{hex2}\.{hex2}
 
-hex2		[[:xdigit:]]{2}
-MacAddress	{hex2}:{hex2}:{hex2}:{hex2}:{hex2}:{hex2}|{hex2}-{hex2}-{hex2}-{hex2}-{hex2}-{hex2}|{hex2}\.{hex2}\.{hex2}\.{hex2}\.{hex2}\.{hex2}
+hex4			[[:xdigit:]]{4}
+DotQuadMacAddress 	{hex4}\.{hex4}\.{hex4}
 
-hex4		[[:xdigit:]]{4}
-QuadMacAddress	{hex4}\.{hex4}\.{hex4}
+ColonBytes		({hex2}:)|({hex2}(:{hex2})+)
+HyphenBytes		{hex2}(-{hex2})+
+DotBytes		{hex2}(\.{hex2})+
 
-DecOctet	[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5]
-IPv4Address	{DecOctet}\.{DecOctet}\.{DecOctet}\.{DecOctet}
+DecOctet		[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5]
+IPv4Address		{DecOctet}\.{DecOctet}\.{DecOctet}\.{DecOctet}
 
-h16		[0-9A-Fa-f]{1,4}
-ls32		{h16}:{h16}|{IPv4Address}
-IPv6Address	({h16}:){6}{ls32}|::({h16}:){5}{ls32}|({h16})?::({h16}:){4}{ls32}|(({h16}:){0,1}{h16})?::({h16}:){3}{ls32}|(({h16}:){0,2}{h16})?::({h16}:){2}{ls32}|(({h16}:){0,3}{h16})?::{h16}:{ls32}|(({h16}:){0,4}{h16})?::{ls32}|(({h16}:){0,5}{h16})?::{h16}|(({h16}:){0,6}{h16})?::
+h16			[0-9A-Fa-f]{1,4}
+ls32			{h16}:{h16}|{IPv4Address}
+IPv6Address		({h16}:){6}{ls32}|::({h16}:){5}{ls32}|({h16})?::({h16}:){4}{ls32}|(({h16}:){0,1}{h16})?::({h16}:){3}{ls32}|(({h16}:){0,2}{h16})?::({h16}:){2}{ls32}|(({h16}:){0,3}{h16})?::{h16}:{ls32}|(({h16}:){0,4}{h16})?::{ls32}|(({h16}:){0,5}{h16})?::{h16}|(({h16}:){0,6}{h16})?::
 
-V4CidrPrefix	\/[[:digit:]]{1,2}
-V6CidrPrefix	\/[[:digit:]]{1,3}
+V4CidrPrefix		\/[[:digit:]]{1,2}
+V6CidrPrefix		\/[[:digit:]]{1,3}
 
-ColonBytes	({hex2}:)|({hex2}(:{hex2})+)
-DotBytes	{hex2}(\.{hex2})+
-HyphenBytes	{hex2}(-{hex2})+
+/* Catch all valid semantic values. Cannot contain DOT DOT or start with MINUS. */
+StartAlphabet		[[:alnum:]_:]
+Alphabet		[[:alnum:]_:/-]
+LiteralValue		{StartAlphabet}{Alphabet}*(\.{Alphabet}+)*
 
 %x RANGE
 %x LAYER
@@ -403,57 +420,120 @@ HyphenBytes	{hex2}(-{hex2})+
 	g_string_append(yyextra->quoted_string, yytext);
 }
 
+	/* NOTE: None of the patterns below can match ".." anywhere in the token string. */
 
-	/* None of the patterns below can match ".." anywhere in the token string. */
+	/* MAC address. */
 
-{MacAddress}|{QuadMacAddress}		{
+{ColonMacAddress}|{HyphenMacAddress}	{
 	/* MAC Address. */
 	update_location(yyextra, yytext);
-	return set_lval_unparsed(yyextra, yytext);
+	return set_lval_literal(yyextra, yytext, yytext);
+}
+
+{DotMacAddress}|{DotQuadMacAddress}	{
+	/* MAC Address, can also be a field. */
+	update_location(yyextra, yytext);
+	return set_lval_unparsed(yyextra, yytext, yytext);
 }
 
+	/* IP address. */
+
 {IPv4Address}{V4CidrPrefix}?		{
 	/* IPv4 with or without prefix. */
 	update_location(yyextra, yytext);
-	return set_lval_unparsed(yyextra, yytext);
+	return set_lval_literal(yyextra, yytext, yytext);
 }
 
 {IPv6Address}{V6CidrPrefix}?		{
 	/* IPv6 with or without prefix. */
 	update_location(yyextra, yytext);
-	return set_lval_unparsed(yyextra, yytext);
+	return set_lval_literal(yyextra, yytext, yytext);
+}
+
+	/* Integer or bytes */
+
+0[bBoOxX][[:xdigit:]]+	{
+	/* Binary/octal/hex integer. */
+	update_location(yyextra, yytext);
+	return set_lval_literal(yyextra, yytext, yytext);
+}
+
+:[[:xdigit:]]+	{
+	/* Numeric prefixed with ':'. */
+	update_location(yyextra, yytext);
+	return set_lval_literal(yyextra, yytext + 1, yytext);
+}
+
+[[:xdigit:]]+	{
+	/* Numeric or field. */
+	update_location(yyextra, yytext);
+	return set_lval_unparsed(yyextra, yytext, yytext);
+}
+
+	/* Floating point. */
+
+[[:digit:]]+\.[[:digit:]]+([eE][+-]?[[:digit:]]+)?	{
+	/* Decimal float with optional exponent. */
+	/* Significand cannot have any side omitted. */
+	update_location(yyextra, yytext);
+	return set_lval_unparsed(yyextra, yytext, yytext);
+}
+
+0[xX][[:xdigit:]]+\.[[:xdigit:]]+([pP][+-]?[[:digit:]]+)?	{
+	/* Hexadecimal float with optional exponent. Can't be a field because
+	 * a field cannot begin with 0x. */
+	/* Significand cannot have any side omitted. */
+	update_location(yyextra, yytext);
+	return set_lval_literal(yyextra,  yytext, yytext);
 }
 
-:?({ColonBytes}|{DotBytes}|{HyphenBytes})	{
+	/* Bytes. */
+
+:?{ColonBytes}	{
 	/* Bytes. */
 	update_location(yyextra, yytext);
 	if (yytext[0] == ':')
-		return set_lval_literal(yyextra, yytext); /* Keep leading colon. */
-	return set_lval_unparsed(yyextra, yytext);
+		return set_lval_literal(yyextra, yytext + 1, yytext);
+	return set_lval_literal(yyextra, yytext, yytext);
 }
 
-:[[:xdigit:]]+		{
-	/* Numeric. */
+:?{HyphenBytes}	{
+	/* Bytes. */
 	update_location(yyextra, yytext);
-	return set_lval_literal(yyextra, yytext); /* Keep leading colon. */
+	if (yytext[0] == ':')
+		return set_lval_literal(yyextra, yytext + 1, yytext);
+	return set_lval_literal(yyextra, yytext, yytext);
 }
 
-{Identifier}			{
-	/* Identifier (field or function) or constant (literal). */
+:?{DotBytes}	{
+	/* DotBytes, can be a field without ':' prefix. */
+	update_location(yyextra, yytext);
+	if (yytext[0] == ':')
+		return set_lval_literal(yyextra, yytext + 1, yytext);
+	return set_lval_unparsed(yyextra, yytext, yytext);
+}
+
+	/* Identifier (protocol/field/function name). */
+
+	/* This must come before ProtoFieldIdentifier to match function names. */
+{FunctionIdentifier}	{
+	/* Identifier (field or function) or constant (bytes without separator). */
+	/* We use CONSTANT instead of LITERAL because the difference is significant
+	 * in the syntactical grammar. */
 	update_location(yyextra, yytext);
 	header_field_info *hfinfo = dfilter_resolve_unparsed(yyextra->dfw, yytext);
 	if (hfinfo != NULL) {
-		return set_lval_identifier(yyextra, yytext);
+		return set_lval_identifier(yyextra, yytext, yytext);
 	}
         df_func_def_t *def = df_func_lookup(yytext);
 	if (def != NULL) {
-		return set_lval_identifier(yyextra, yytext);
+		return set_lval_identifier(yyextra, yytext, yytext);
 	}
-	return set_lval_constant(yyextra, yytext);
+	return set_lval_constant(yyextra, yytext, yytext);
 }
 
-\.{Identifier}			{
-	/* Field. */
+\.{ProtoFieldIdentifier}	{
+	/* Identifier, prefixed with a '.'. */
 	update_location(yyextra, yytext);
 	const char *name = yytext + 1;
 	header_field_info *hfinfo = dfilter_resolve_unparsed(yyextra->dfw, name);
@@ -461,7 +541,21 @@ HyphenBytes	{hex2}(-{hex2})+
 		FAIL("\"%s\" is not a valid protocol or protocol field.", name);
 		return SCAN_FAILED;
 	}
-	return set_lval_field(yyextra, yytext, hfinfo);
+	return set_lval_field(yyextra, hfinfo, yytext);
+}
+
+{ProtoFieldIdentifier}	{
+	/* Catch-all for protocol values. Can also be a literal. */
+	update_location(yyextra, yytext);
+	return set_lval_unparsed(yyextra, yytext, yytext);
+}
+
+{LiteralValue}	{
+	/* Catch-all for semantic values. */
+	update_location(yyextra, yytext);
+	/* We use literal here because identifiers (using unparsed) should have
+	 * matched one of the previous rules. */
+	return set_lval_literal(yyextra, yytext, yytext);
 }
 
 . {
@@ -474,7 +568,6 @@ HyphenBytes	{hex2}(-{hex2})+
 	return SCAN_FAILED;
 }
 
-
 %%
 
 /*
@@ -511,34 +604,46 @@ set_lval_simple(df_scanner_state_t *state, int token, const char *token_value, s
 }
 
 static int
-set_lval_literal(df_scanner_state_t *state, const char *token_value)
+set_lval_literal(df_scanner_state_t *state, const char *value, const char *token_value)
 {
-	state->df_lval = stnode_new(STTYPE_LITERAL, g_strdup(token_value), g_strdup(token_value), state->location);
+	state->df_lval = stnode_new(STTYPE_LITERAL, g_strdup(value), g_strdup(token_value), state->location);
 	return TOKEN_LITERAL;
 }
 
 static int
-set_lval_identifier(df_scanner_state_t *state, const char *token_value)
+set_lval_identifier(df_scanner_state_t *state, const char *value, const char *token_value)
 {
-	state->df_lval = stnode_new(STTYPE_LITERAL, g_strdup(token_value), g_strdup(token_value), state->location);
+	state->df_lval = stnode_new(STTYPE_LITERAL, g_strdup(value), g_strdup(token_value), state->location);
 	return TOKEN_IDENTIFIER;
 }
 
 static int
-set_lval_constant(df_scanner_state_t *state, const char *token_value)
+set_lval_constant(df_scanner_state_t *state, const char *value, const char *token_value)
 {
-	state->df_lval = stnode_new(STTYPE_LITERAL, g_strdup(token_value), g_strdup(token_value), state->location);
+	state->df_lval = stnode_new(STTYPE_LITERAL, g_strdup(value), g_strdup(token_value), state->location);
 	return TOKEN_CONSTANT;
 }
 
 static int
-set_lval_unparsed(df_scanner_state_t *state, const char *token_value)
+set_lval_unparsed(df_scanner_state_t *state, const char *value, const char *token_value)
 {
-	const header_field_info *hfinfo = dfilter_resolve_unparsed(state->dfw, token_value);
+	int token;
+	const header_field_info *hfinfo = dfilter_resolve_unparsed(state->dfw, value);
 	if (hfinfo != NULL) {
-		return set_lval_field(state, token_value, hfinfo);
+		token = set_lval_field(state, hfinfo, token_value);
     	}
-	return set_lval_literal(state, token_value);
+	else {
+		token = set_lval_literal(state, value, token_value);
+	}
+	stnode_set_flags(state->df_lval, STFLAG_UNPARSED);
+	return token;
+}
+
+static int
+set_lval_field(df_scanner_state_t *state, const header_field_info *hfinfo, const char *token_value)
+{
+	state->df_lval = stnode_new(STTYPE_FIELD, (gpointer)hfinfo, g_strdup(token_value), state->location);
+	return TOKEN_FIELD;
 }
 
 static int
@@ -567,13 +672,6 @@ set_lval_charconst(df_scanner_state_t *state, GString *quoted_string)
 	return TOKEN_CHARCONST;
 }
 
-static int
-set_lval_field(df_scanner_state_t *state, const char *token_value, const header_field_info *hfinfo)
-{
-	state->df_lval = stnode_new(STTYPE_FIELD, (gpointer)hfinfo, g_strdup(token_value), state->location);
-	return TOKEN_FIELD;
-}
-
 static gboolean
 append_escaped_char(df_scanner_state_t *state, GString *str, char c)
 {
diff --git a/epan/dfilter/semcheck.c b/epan/dfilter/semcheck.c
index a769ac4ed5..f116050501 100644
--- a/epan/dfilter/semcheck.c
+++ b/epan/dfilter/semcheck.c
@@ -1149,18 +1149,18 @@ check_relation(dfwork_t *dfw, stnode_op_t st_op,
 }
 
 static void
-check_relation_contains_RHS_FIELD(dfwork_t *dfw, stnode_t *st_node _U_,
+check_warning_contains_RHS_FIELD(dfwork_t *dfw, stnode_t *st_node _U_,
 		stnode_t *st_arg1 _U_, stnode_t *st_arg2)
 {
 	const char *token = stnode_token(st_arg2);
-	if (token[0] == '.' || token[0] == ':')
-		return;
-
 	header_field_info *hfinfo = sttype_field_hfinfo(st_arg2);
-	fvalue_t *fvalue = fvalue_from_literal(FT_BYTES, hfinfo->abbrev, FALSE, NULL);
+	fvalue_t *fvalue = fvalue_from_literal(FT_BYTES, token, TRUE, NULL);
 	if (fvalue != NULL) {
-		add_compile_warning(dfw, "Interpreting \"%s\" as \"%s\". Consider writing :%s or .%s",
-					hfinfo->abbrev, hfinfo->name, hfinfo->abbrev, hfinfo->abbrev);
+		char *repr = fvalue_to_string_repr(dfw->dfw_scope, fvalue, FTREPR_DFILTER, 0);
+		add_compile_warning(dfw, "Interpreting \"%s\" as %s instead of %s. "
+					"Consider writing \"%s\" or \".%s\" to remove this warning",
+					token, hfinfo->name, ftype_pretty_name(FT_BYTES),
+					repr, hfinfo->abbrev);
 		fvalue_free(fvalue);
 	}
 }
@@ -1171,8 +1171,8 @@ check_relation_contains(dfwork_t *dfw, stnode_t *st_node,
 {
 	LOG_NODE(st_node);
 
-	if (stnode_type_id(st_arg2) == STTYPE_FIELD) {
-		check_relation_contains_RHS_FIELD(dfw, st_node, st_arg1, st_arg2);
+	if (stnode_type_id(st_arg2) == STTYPE_FIELD && stnode_get_flags(st_arg2, STFLAG_UNPARSED)) {
+		check_warning_contains_RHS_FIELD(dfw, st_node, st_arg1, st_arg2);
 	}
 
 	switch (stnode_type_id(st_arg1)) {
diff --git a/epan/dfilter/syntax-tree.c b/epan/dfilter/syntax-tree.c
index c0f1207e95..6220817899 100644
--- a/epan/dfilter/syntax-tree.c
+++ b/epan/dfilter/syntax-tree.c
@@ -99,6 +99,7 @@ stnode_clear(stnode_t *node)
 	node->repr_token = NULL;
 	node->location.col_start = -1;
 	node->location.col_len = 0;
+	node->flags = 0;
 }
 
 void
@@ -113,6 +114,7 @@ stnode_init(stnode_t *node, sttype_id_t type_id, gpointer data, char *token, df_
 	node->repr_debug = NULL;
 	node->repr_token = token;
 	node->location = loc;
+	node->flags = 0;
 
 	if (type_id == STTYPE_UNINITIALIZED) {
 		node->type = NULL;
@@ -138,8 +140,10 @@ stnode_replace(stnode_t *node, sttype_id_t type_id, gpointer data)
 {
 	char *token = g_strdup(node->repr_token);
 	df_loc_t loc = node->location;
+	uint16_t flags = node->flags;
 	stnode_clear(node);
 	stnode_init(node, type_id, data, token, loc);
+	node->flags = flags;
 }
 
 stnode_t*
@@ -174,6 +178,7 @@ stnode_dup(const stnode_t *node)
 	new->repr_debug = NULL;
 	new->repr_token = g_strdup(node->repr_token);
 	new->location = node->location;
+	new->flags = node->flags;
 
 	new->type = node->type;
 	if (node->type == NULL)
@@ -256,6 +261,18 @@ stnode_set_location(stnode_t *node, df_loc_t loc)
 	node->location = loc;
 }
 
+gboolean
+stnode_get_flags(stnode_t *node, uint16_t flags)
+{
+	return node->flags & flags;
+}
+
+void
+stnode_set_flags(stnode_t *node, uint16_t flags)
+{
+	node->flags |= flags;
+}
+
 /* Finds the first and last location from a set and creates
  * a new location from start of first (col_start) to end of
  * last (col_start + col_len). Sets the result to dst. */
diff --git a/epan/dfilter/syntax-tree.h b/epan/dfilter/syntax-tree.h
index ef3dabcc17..3b97fe0bbb 100644
--- a/epan/dfilter/syntax-tree.h
+++ b/epan/dfilter/syntax-tree.h
@@ -54,6 +54,10 @@ typedef struct {
 	STTypeToStrFunc		func_tostr;
 } sttype_t;
 
+
+/* Lexical value is ambiguous (can be a protocol field or a literal). */
+#define STFLAG_UNPARSED		(1 << 0)
+
 /** Node (type instance) information */
 typedef struct {
 	uint32_t	magic;
@@ -63,6 +67,7 @@ typedef struct {
 	char 		*repr_display;
 	char 		*repr_debug;
 	df_loc_t	location;
+	uint16_t	flags;
 } stnode_t;
 
 typedef enum {
@@ -159,6 +164,12 @@ stnode_location(stnode_t *node);
 void
 stnode_set_location(stnode_t *node, df_loc_t loc);
 
+gboolean
+stnode_get_flags(stnode_t *node, uint16_t flags);
+
+void
+stnode_set_flags(stnode_t *node, uint16_t flags);
+
 void
 stnode_merge_location(stnode_t *dst, stnode_t *n1, stnode_t *n2);
 
diff --git a/test/suite_dfilter/group_syntax.py b/test/suite_dfilter/group_syntax.py
index a2494ae84f..77762fb8ea 100644
--- a/test/suite_dfilter/group_syntax.py
+++ b/test/suite_dfilter/group_syntax.py
@@ -193,11 +193,26 @@ class case_equality(unittest.TestCase):
         dfilter = 'frame[37] == fc:'
         checkDFilterCount(dfilter, 1)
 
-    def test_rhs_literal_bias_4(self, checkDFilterCount):
+    def test_rhs_bias_4(self, checkDFilterCount):
         # Protocol "Fibre Channel" on the RHS
         dfilter = 'frame[37] == .fc'
         checkDFilterCount(dfilter, 0)
 
+    def test_rhs_bias_5(self, checkDFilterSucceed):
+        # Protocol "Fibre Channel" on the RHS (with warning)
+        dfilter = 'frame contains fc'
+        checkDFilterSucceed(dfilter, 'Interpreting "fc" as Fibre Channel')
+
+    def test_rhs_bias_6(self, checkDFilterSucceed):
+        # Protocol "Fibre Channel" on the RHS (without warning)
+        dfilter = 'frame contains .fc'
+        checkDFilterSucceed(dfilter)
+
+    def test_rhs_bias_7(self, checkDFilterSucceed):
+        # Byte 0xFC on the RHS
+        dfilter = 'frame contains fc:'
+        checkDFilterSucceed(dfilter)
+
 @fixtures.uses_fixtures
 class case_bitwise(unittest.TestCase):
     trace_file = "http.pcap"
author	João Valverde <j@v6e.pt>	2022-12-29 06:04:00 +0000
committer	João Valverde <j@v6e.pt>	2023-01-07 21:15:25 +0000
commit	1861679e8132a26ff1eb891e349856fdcea1f8be (patch)
tree	d1eaaa8be2265158a9cc276ddaefecff122a0f70
parent	7641ba7416cdb0232ecef7c5aa3d39a711d1766c (diff)