diff options
author | João Valverde <j@v6e.pt> | 2022-12-29 06:04:00 +0000 |
---|---|---|
committer | João Valverde <j@v6e.pt> | 2023-01-07 21:15:25 +0000 |
commit | 1861679e8132a26ff1eb891e349856fdcea1f8be (patch) | |
tree | d1eaaa8be2265158a9cc276ddaefecff122a0f70 | |
parent | 7641ba7416cdb0232ecef7c5aa3d39a711d1766c (diff) |
dfilter: Optimize some scanner patterns
Clean up the flex code. Optimize some patterns to avoid field lookups
for values that are not legal field names.
Improve the warning and add some comments.
-rw-r--r-- | epan/dfilter/scanner.l | 212 | ||||
-rw-r--r-- | epan/dfilter/semcheck.c | 18 | ||||
-rw-r--r-- | epan/dfilter/syntax-tree.c | 17 | ||||
-rw-r--r-- | epan/dfilter/syntax-tree.h | 11 | ||||
-rw-r--r-- | test/suite_dfilter/group_syntax.py | 17 |
5 files changed, 208 insertions, 67 deletions
diff --git a/epan/dfilter/scanner.l b/epan/dfilter/scanner.l index 1943f2acc4..04c57de19a 100644 --- a/epan/dfilter/scanner.l +++ b/epan/dfilter/scanner.l @@ -80,13 +80,14 @@ WS_WARN_UNUSED static int set_lval_simple(df_scanner_state_t *state, int token, #define test(token) (update_location(yyextra, yytext), set_lval_simple(yyextra, token, yytext, STTYPE_TEST)) #define math(token) (update_location(yyextra, yytext), set_lval_simple(yyextra, token, yytext, STTYPE_ARITHMETIC)) -WS_WARN_UNUSED static int set_lval_literal(df_scanner_state_t *state, const char *token_value); -WS_WARN_UNUSED static int set_lval_unparsed(df_scanner_state_t *state, const char *token_value); +WS_WARN_UNUSED static int set_lval_literal(df_scanner_state_t *state, const char *value, const char *token_value); +WS_WARN_UNUSED static int set_lval_identifier(df_scanner_state_t *state, const char *value, const char *token_value); +WS_WARN_UNUSED static int set_lval_constant(df_scanner_state_t *state, const char *value, const char *token_value); +WS_WARN_UNUSED static int set_lval_unparsed(df_scanner_state_t *state, const char *value, const char *token_value); + +WS_WARN_UNUSED static int set_lval_field(df_scanner_state_t *state, const header_field_info *hfinfo, const char *token_value); WS_WARN_UNUSED static int set_lval_quoted_string(df_scanner_state_t *state, GString *quoted_string); WS_WARN_UNUSED static int set_lval_charconst(df_scanner_state_t *state, GString *quoted_string); -WS_WARN_UNUSED static int set_lval_field(df_scanner_state_t *state, const char *token_value, const header_field_info *hfinfo); -WS_WARN_UNUSED static int set_lval_identifier(df_scanner_state_t *state, const char *token_value); -WS_WARN_UNUSED static int set_lval_constant(df_scanner_state_t *state, const char *token_value); static gboolean append_escaped_char(df_scanner_state_t *state, GString *str, char c); static gboolean append_universal_character_name(df_scanner_state_t *state, GString *str, const char *ucn); @@ 
-103,27 +104,43 @@ static void update_string_loc(df_scanner_state_t *state, const char *text); %} -Identifier [[:alnum:]_][[:alnum:]_-]*(\.[[:alnum:]_-]+)* +FunctionIdentifier [[:alpha:]_][[:alnum:]_]* + +/* + * Cannot start with '-'. * Some protocol name can contain '-', for example "mac-lte". + * Note that some protocol names start with a number, for example "9p". + * Some protocol names contain dots, e.g: _ws.expert + * Protocol or protocol field cannot contain DOTDOT anywhere. + */ +VarIdentifier [[:alnum:]_][[:alnum:]_-]* +ProtoFieldIdentifier {VarIdentifier}(\.{VarIdentifier})* + +hex2 [[:xdigit:]]{2} +ColonMacAddress {hex2}:{hex2}:{hex2}:{hex2}:{hex2}:{hex2} +HyphenMacAddress {hex2}-{hex2}-{hex2}-{hex2}-{hex2}-{hex2} +DotMacAddress {hex2}\.{hex2}\.{hex2}\.{hex2}\.{hex2}\.{hex2} -hex2 [[:xdigit:]]{2} -MacAddress {hex2}:{hex2}:{hex2}:{hex2}:{hex2}:{hex2}|{hex2}-{hex2}-{hex2}-{hex2}-{hex2}-{hex2}|{hex2}\.{hex2}\.{hex2}\.{hex2}\.{hex2}\.{hex2} +hex4 [[:xdigit:]]{4} +DotQuadMacAddress {hex4}\.{hex4}\.{hex4} -hex4 [[:xdigit:]]{4} -QuadMacAddress {hex4}\.{hex4}\.{hex4} +ColonBytes ({hex2}:)|({hex2}(:{hex2})+) +HyphenBytes {hex2}(-{hex2})+ +DotBytes {hex2}(\.{hex2})+ -DecOctet [0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5] -IPv4Address {DecOctet}\.{DecOctet}\.{DecOctet}\.{DecOctet} +DecOctet [0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5] +IPv4Address {DecOctet}\.{DecOctet}\.{DecOctet}\.{DecOctet} -h16 [0-9A-Fa-f]{1,4} -ls32 {h16}:{h16}|{IPv4Address} -IPv6Address ({h16}:){6}{ls32}|::({h16}:){5}{ls32}|({h16})?::({h16}:){4}{ls32}|(({h16}:){0,1}{h16})?::({h16}:){3}{ls32}|(({h16}:){0,2}{h16})?::({h16}:){2}{ls32}|(({h16}:){0,3}{h16})?::{h16}:{ls32}|(({h16}:){0,4}{h16})?::{ls32}|(({h16}:){0,5}{h16})?::{h16}|(({h16}:){0,6}{h16})?:: +h16 [0-9A-Fa-f]{1,4} +ls32 {h16}:{h16}|{IPv4Address} +IPv6Address 
({h16}:){6}{ls32}|::({h16}:){5}{ls32}|({h16})?::({h16}:){4}{ls32}|(({h16}:){0,1}{h16})?::({h16}:){3}{ls32}|(({h16}:){0,2}{h16})?::({h16}:){2}{ls32}|(({h16}:){0,3}{h16})?::{h16}:{ls32}|(({h16}:){0,4}{h16})?::{ls32}|(({h16}:){0,5}{h16})?::{h16}|(({h16}:){0,6}{h16})?:: -V4CidrPrefix \/[[:digit:]]{1,2} -V6CidrPrefix \/[[:digit:]]{1,3} +V4CidrPrefix \/[[:digit:]]{1,2} +V6CidrPrefix \/[[:digit:]]{1,3} -ColonBytes ({hex2}:)|({hex2}(:{hex2})+) -DotBytes {hex2}(\.{hex2})+ -HyphenBytes {hex2}(-{hex2})+ +/* Catch all valid semantic values. Cannot contain DOT DOT or start with MINUS. */ +StartAlphabet [[:alnum:]_:] +Alphabet [[:alnum:]_:/-] +LiteralValue {StartAlphabet}{Alphabet}*(\.{Alphabet}+)* %x RANGE %x LAYER @@ -403,57 +420,120 @@ HyphenBytes {hex2}(-{hex2})+ g_string_append(yyextra->quoted_string, yytext); } + /* NOTE: None of the patterns below can match ".." anywhere in the token string. */ - /* None of the patterns below can match ".." anywhere in the token string. */ + /* MAC address. */ -{MacAddress}|{QuadMacAddress} { +{ColonMacAddress}|{HyphenMacAddress} { /* MAC Address. */ update_location(yyextra, yytext); - return set_lval_unparsed(yyextra, yytext); + return set_lval_literal(yyextra, yytext, yytext); +} + +{DotMacAddress}|{DotQuadMacAddress} { + /* MAC Address, can also be a field. */ + update_location(yyextra, yytext); + return set_lval_unparsed(yyextra, yytext, yytext); } + /* IP address. */ + {IPv4Address}{V4CidrPrefix}? { /* IPv4 with or without prefix. */ update_location(yyextra, yytext); - return set_lval_unparsed(yyextra, yytext); + return set_lval_literal(yyextra, yytext, yytext); } {IPv6Address}{V6CidrPrefix}? { /* IPv6 with or without prefix. */ update_location(yyextra, yytext); - return set_lval_unparsed(yyextra, yytext); + return set_lval_literal(yyextra, yytext, yytext); +} + + /* Integer or bytes */ + +0[bBoOxX][[:xdigit:]]+ { + /* Binary/octal/hex integer. 
*/ + update_location(yyextra, yytext); + return set_lval_literal(yyextra, yytext, yytext); +} + +:[[:xdigit:]]+ { + /* Numeric prefixed with ':'. */ + update_location(yyextra, yytext); + return set_lval_literal(yyextra, yytext + 1, yytext); +} + +[[:xdigit:]]+ { + /* Numeric or field. */ + update_location(yyextra, yytext); + return set_lval_unparsed(yyextra, yytext, yytext); +} + + /* Floating point. */ + +[[:digit:]]+\.[[:digit:]]+([eE][+-]?[[:digit:]]+)? { + /* Decimal float with optional exponent. */ + /* Significand cannot have any side omitted. */ + update_location(yyextra, yytext); + return set_lval_unparsed(yyextra, yytext, yytext); +} + +0[xX][[:xdigit:]]+\.[[:xdigit:]]+([pP][+-]?[[:digit:]]+)? { + /* Hexadecimal float with optional exponent. Can't be a field because + * field cannot beging with 0x. */ + /* Significand cannot have any side omitted. */ + update_location(yyextra, yytext); + return set_lval_literal(yyextra, yytext, yytext); } -:?({ColonBytes}|{DotBytes}|{HyphenBytes}) { + /* Bytes. */ + +:?{ColonBytes} { /* Bytes. */ update_location(yyextra, yytext); if (yytext[0] == ':') - return set_lval_literal(yyextra, yytext); /* Keep leading colon. */ - return set_lval_unparsed(yyextra, yytext); + return set_lval_literal(yyextra, yytext + 1, yytext); + return set_lval_literal(yyextra, yytext, yytext); } -:[[:xdigit:]]+ { - /* Numeric. */ +:?{HyphenBytes} { + /* Bytes. */ update_location(yyextra, yytext); - return set_lval_literal(yyextra, yytext); /* Keep leading colon. */ + if (yytext[0] == ':') + return set_lval_literal(yyextra, yytext + 1, yytext); + return set_lval_literal(yyextra, yytext, yytext); } -{Identifier} { - /* Identifier (field or function) or constant (literal). */ +:?{DotBytes} { + /* DotBytes, can be a field without ':' prefix. 
*/ + update_location(yyextra, yytext); + if (yytext[0] == ':') + return set_lval_literal(yyextra, yytext + 1, yytext); + return set_lval_unparsed(yyextra, yytext, yytext); +} + + /* Identifier (protocol/field/function name). */ + + /* This must come before FieldIdentifier to match function names. */ +{FunctionIdentifier} { + /* Identifier (field or function) or constant (bytes without separator). */ + /* We use CONSTANT instead of LITERAL because the difference is significant + * in the syntactical grammar. */ update_location(yyextra, yytext); header_field_info *hfinfo = dfilter_resolve_unparsed(yyextra->dfw, yytext); if (hfinfo != NULL) { - return set_lval_identifier(yyextra, yytext); + return set_lval_identifier(yyextra, yytext, yytext); } df_func_def_t *def = df_func_lookup(yytext); if (def != NULL) { - return set_lval_identifier(yyextra, yytext); + return set_lval_identifier(yyextra, yytext, yytext); } - return set_lval_constant(yyextra, yytext); + return set_lval_constant(yyextra, yytext, yytext); } -\.{Identifier} { - /* Field. */ +\.{ProtoFieldIdentifier} { + /* Identifier, prefixed with a '.'. */ update_location(yyextra, yytext); const char *name = yytext + 1; header_field_info *hfinfo = dfilter_resolve_unparsed(yyextra->dfw, name); @@ -461,7 +541,21 @@ HyphenBytes {hex2}(-{hex2})+ FAIL("\"%s\" is not a valid protocol or protocol field.", name); return SCAN_FAILED; } - return set_lval_field(yyextra, yytext, hfinfo); + return set_lval_field(yyextra, hfinfo, yytext); +} + +{ProtoFieldIdentifier} { + /* Catch-all for protocol values. Can also be a literal. */ + update_location(yyextra, yytext); + return set_lval_unparsed(yyextra, yytext, yytext); +} + +{LiteralValue} { + /* Catch-all for semantic values. */ + update_location(yyextra, yytext); + /* We use literal here because identifiers (using unparsed) should have + * matched one of the previous rules. */ + return set_lval_literal(yyextra, yytext, yytext); } . 
{ @@ -474,7 +568,6 @@ HyphenBytes {hex2}(-{hex2})+ return SCAN_FAILED; } - %% /* @@ -511,34 +604,46 @@ set_lval_simple(df_scanner_state_t *state, int token, const char *token_value, s } static int -set_lval_literal(df_scanner_state_t *state, const char *token_value) +set_lval_literal(df_scanner_state_t *state, const char *value, const char *token_value) { - state->df_lval = stnode_new(STTYPE_LITERAL, g_strdup(token_value), g_strdup(token_value), state->location); + state->df_lval = stnode_new(STTYPE_LITERAL, g_strdup(value), g_strdup(token_value), state->location); return TOKEN_LITERAL; } static int -set_lval_identifier(df_scanner_state_t *state, const char *token_value) +set_lval_identifier(df_scanner_state_t *state, const char *value, const char *token_value) { - state->df_lval = stnode_new(STTYPE_LITERAL, g_strdup(token_value), g_strdup(token_value), state->location); + state->df_lval = stnode_new(STTYPE_LITERAL, g_strdup(value), g_strdup(token_value), state->location); return TOKEN_IDENTIFIER; } static int -set_lval_constant(df_scanner_state_t *state, const char *token_value) +set_lval_constant(df_scanner_state_t *state, const char *value, const char *token_value) { - state->df_lval = stnode_new(STTYPE_LITERAL, g_strdup(token_value), g_strdup(token_value), state->location); + state->df_lval = stnode_new(STTYPE_LITERAL, g_strdup(value), g_strdup(token_value), state->location); return TOKEN_CONSTANT; } static int -set_lval_unparsed(df_scanner_state_t *state, const char *token_value) +set_lval_unparsed(df_scanner_state_t *state, const char *value, const char *token_value) { - const header_field_info *hfinfo = dfilter_resolve_unparsed(state->dfw, token_value); + int token; + const header_field_info *hfinfo = dfilter_resolve_unparsed(state->dfw, value); if (hfinfo != NULL) { - return set_lval_field(state, token_value, hfinfo); + token = set_lval_field(state, hfinfo, token_value); } - return set_lval_literal(state, token_value); + else { + token = 
set_lval_literal(state, value, token_value); + } + stnode_set_flags(state->df_lval, STFLAG_UNPARSED); + return token; +} + +static int +set_lval_field(df_scanner_state_t *state, const header_field_info *hfinfo, const char *token_value) +{ + state->df_lval = stnode_new(STTYPE_FIELD, (gpointer)hfinfo, g_strdup(token_value), state->location); + return TOKEN_FIELD; } static int @@ -567,13 +672,6 @@ set_lval_charconst(df_scanner_state_t *state, GString *quoted_string) return TOKEN_CHARCONST; } -static int -set_lval_field(df_scanner_state_t *state, const char *token_value, const header_field_info *hfinfo) -{ - state->df_lval = stnode_new(STTYPE_FIELD, (gpointer)hfinfo, g_strdup(token_value), state->location); - return TOKEN_FIELD; -} - static gboolean append_escaped_char(df_scanner_state_t *state, GString *str, char c) { diff --git a/epan/dfilter/semcheck.c b/epan/dfilter/semcheck.c index a769ac4ed5..f116050501 100644 --- a/epan/dfilter/semcheck.c +++ b/epan/dfilter/semcheck.c @@ -1149,18 +1149,18 @@ check_relation(dfwork_t *dfw, stnode_op_t st_op, } static void -check_relation_contains_RHS_FIELD(dfwork_t *dfw, stnode_t *st_node _U_, +check_warning_contains_RHS_FIELD(dfwork_t *dfw, stnode_t *st_node _U_, stnode_t *st_arg1 _U_, stnode_t *st_arg2) { const char *token = stnode_token(st_arg2); - if (token[0] == '.' || token[0] == ':') - return; - header_field_info *hfinfo = sttype_field_hfinfo(st_arg2); - fvalue_t *fvalue = fvalue_from_literal(FT_BYTES, hfinfo->abbrev, FALSE, NULL); + fvalue_t *fvalue = fvalue_from_literal(FT_BYTES, token, TRUE, NULL); if (fvalue != NULL) { - add_compile_warning(dfw, "Interpreting \"%s\" as \"%s\". Consider writing :%s or .%s", - hfinfo->abbrev, hfinfo->name, hfinfo->abbrev, hfinfo->abbrev); + char *repr = fvalue_to_string_repr(dfw->dfw_scope, fvalue, FTREPR_DFILTER, 0); + add_compile_warning(dfw, "Interpreting \"%s\" as %s instead of %s. 
" + "Consider writing \"%s\" or \".%s\" to remove this warning", + token, hfinfo->name, ftype_pretty_name(FT_BYTES), + repr, hfinfo->abbrev); fvalue_free(fvalue); } } @@ -1171,8 +1171,8 @@ check_relation_contains(dfwork_t *dfw, stnode_t *st_node, { LOG_NODE(st_node); - if (stnode_type_id(st_arg2) == STTYPE_FIELD) { - check_relation_contains_RHS_FIELD(dfw, st_node, st_arg1, st_arg2); + if (stnode_type_id(st_arg2) == STTYPE_FIELD && stnode_get_flags(st_arg2, STFLAG_UNPARSED)) { + check_warning_contains_RHS_FIELD(dfw, st_node, st_arg1, st_arg2); } switch (stnode_type_id(st_arg1)) { diff --git a/epan/dfilter/syntax-tree.c b/epan/dfilter/syntax-tree.c index c0f1207e95..6220817899 100644 --- a/epan/dfilter/syntax-tree.c +++ b/epan/dfilter/syntax-tree.c @@ -99,6 +99,7 @@ stnode_clear(stnode_t *node) node->repr_token = NULL; node->location.col_start = -1; node->location.col_len = 0; + node->flags = 0; } void @@ -113,6 +114,7 @@ stnode_init(stnode_t *node, sttype_id_t type_id, gpointer data, char *token, df_ node->repr_debug = NULL; node->repr_token = token; node->location = loc; + node->flags = 0; if (type_id == STTYPE_UNINITIALIZED) { node->type = NULL; @@ -138,8 +140,10 @@ stnode_replace(stnode_t *node, sttype_id_t type_id, gpointer data) { char *token = g_strdup(node->repr_token); df_loc_t loc = node->location; + uint16_t flags = node->flags; stnode_clear(node); stnode_init(node, type_id, data, token, loc); + node->flags = flags; } stnode_t* @@ -174,6 +178,7 @@ stnode_dup(const stnode_t *node) new->repr_debug = NULL; new->repr_token = g_strdup(node->repr_token); new->location = node->location; + new->flags = node->flags; new->type = node->type; if (node->type == NULL) @@ -256,6 +261,18 @@ stnode_set_location(stnode_t *node, df_loc_t loc) node->location = loc; } +gboolean +stnode_get_flags(stnode_t *node, uint16_t flags) +{ + return node->flags & flags; +} + +void +stnode_set_flags(stnode_t *node, uint16_t flags) +{ + node->flags |= flags; +} + /* Finds the first and 
last location from a set and creates * a new location from start of first (col_start) to end of * last (col_start + col_len). Sets the result to dst. */ diff --git a/epan/dfilter/syntax-tree.h b/epan/dfilter/syntax-tree.h index ef3dabcc17..3b97fe0bbb 100644 --- a/epan/dfilter/syntax-tree.h +++ b/epan/dfilter/syntax-tree.h @@ -54,6 +54,10 @@ typedef struct { STTypeToStrFunc func_tostr; } sttype_t; + +/* Lexical value is ambiguous (can be a protocol field or a literal). */ +#define STFLAG_UNPARSED (1 << 0) + /** Node (type instance) information */ typedef struct { uint32_t magic; @@ -63,6 +67,7 @@ typedef struct { char *repr_display; char *repr_debug; df_loc_t location; + uint16_t flags; } stnode_t; typedef enum { @@ -159,6 +164,12 @@ stnode_location(stnode_t *node); void stnode_set_location(stnode_t *node, df_loc_t loc); +gboolean +stnode_get_flags(stnode_t *node, uint16_t flags); + +void +stnode_set_flags(stnode_t *node, uint16_t flags); + void stnode_merge_location(stnode_t *dst, stnode_t *n1, stnode_t *n2); diff --git a/test/suite_dfilter/group_syntax.py b/test/suite_dfilter/group_syntax.py index a2494ae84f..77762fb8ea 100644 --- a/test/suite_dfilter/group_syntax.py +++ b/test/suite_dfilter/group_syntax.py @@ -193,11 +193,26 @@ class case_equality(unittest.TestCase): dfilter = 'frame[37] == fc:' checkDFilterCount(dfilter, 1) - def test_rhs_literal_bias_4(self, checkDFilterCount): + def test_rhs_bias_4(self, checkDFilterCount): # Protocol "Fibre Channel" on the RHS dfilter = 'frame[37] == .fc' checkDFilterCount(dfilter, 0) + def test_rhs_bias_5(self, checkDFilterSucceed): + # Protocol "Fibre Channel" on the RHS (with warning) + dfilter = 'frame contains fc' + checkDFilterSucceed(dfilter, 'Interpreting "fc" as Fibre Channel') + + def test_rhs_bias_6(self, checkDFilterSucceed): + # Protocol "Fibre Channel" on the RHS (without warning) + dfilter = 'frame contains .fc' + checkDFilterSucceed(dfilter) + + def test_rhs_bias_7(self, checkDFilterSucceed): + # Byte 0xFC on 
the RHS + dfilter = 'frame contains fc:' + checkDFilterSucceed(dfilter) + @fixtures.uses_fixtures class case_bitwise(unittest.TestCase): trace_file = "http.pcap" |