diff options
author | Peter Wu <peter@lekensteyn.nl> | 2019-02-08 17:20:37 +0100 |
---|---|---|
committer | Anders Broman <a.broman58@gmail.com> | 2019-02-11 05:08:53 +0000 |
commit | 0ca65a66f425c8beaa1af3deb3b84c2b16cffb55 (patch) | |
tree | 4015973a2b1cd2758be093183711f7fd9b733489 /epan/ftypes | |
parent | f2dc64e9b8d6cbd3dd5f7cda03596abd1c0ceea7 (diff) |
Fix crash when using the "matches" operator on non-UTF-8 data
GRegex is a thin wrapper around PCRE. Inputs (patterns and subjects) are
assumed to be UTF-8 by default (unless G_REGEX_RAW is set). If the
subject is not valid UTF-8, normally pcre_exec will immediately return a
failure. However, as GLib sets PCRE_NO_UTF8_CHECK when G_REGEX_RAW is
not given, pcre_exec() will skip the safety check and crash instead.
Fix this by always assuming raw byte patterns. Regression risk: patterns
such as `ö.ï` will no longer match `öñï` since `ñ` is a multi-byte
sequence. Patterns such as `(GET|POST) /` remain functional though.
Bug: 14905
Change-Id: I6450bb83f565d377f82a5dbb01690c5f49acd96f
Reviewed-on: https://code.wireshark.org/review/31935
Petri-Dish: Peter Wu <peter@lekensteyn.nl>
Tested-by: Petri Dish Buildbot
Reviewed-by: Anders Broman <a.broman58@gmail.com>
Diffstat (limited to 'epan/ftypes')
-rw-r--r-- | epan/ftypes/ftype-bytes.c | 24 | ||||
-rw-r--r-- | epan/ftypes/ftype-pcre.c | 38 |
2 files changed, 10 insertions, 52 deletions
diff --git a/epan/ftypes/ftype-bytes.c b/epan/ftypes/ftype-bytes.c index c1d57f0bbd..9bfc37b637 100644 --- a/epan/ftypes/ftype-bytes.c +++ b/epan/ftypes/ftype-bytes.c @@ -665,30 +665,6 @@ cmp_matches(const fvalue_t *fv_a, const fvalue_t *fv_b) if (! regex) { return FALSE; } - /* - * XXX - do we want G_REGEX_RAW or not? - * - * If we're matching against a string, we don't want it (and - * we want the string value encoded in UTF-8 - and, if it can't - * be converted to UTF-8, because it's in a character encoding - * that doesn't map every possible byte sequence to Unicode (and - * that includes strings that are supposed to be in UTF-8 but - * that contain invalid UTF-8 sequences!), treat the match as - * failing. - * - * If we're matching against binary data, and matching a binary - * pattern (e.g. "0xfa, 3 or more 0xff, and 0x37, in order"), - * we'd want G_REGEX_RAW. If we're matching a text pattern, - * it's not clear *what* the right thing to do is - if they're - * matching against a pattern containing non-ASCII characters, - * they might want it to match in whatever encoding the binary - * data is, but Wireshark might not have a clue what that - * encoding is. In addition, it's not clear how to tell - * whether a pattern is "binary" or not, short of having - * a different (non-PCRE) syntax for binary patterns. - * - * So we don't use G_REGEX_RAW for now. - */ return g_regex_match_full( regex, /* Compiled PCRE */ (char *)a->data, /* The data to check for the pattern... 
*/ diff --git a/epan/ftypes/ftype-pcre.c b/epan/ftypes/ftype-pcre.c index 5c9ad9f97b..ac854c73a2 100644 --- a/epan/ftypes/ftype-pcre.c +++ b/epan/ftypes/ftype-pcre.c @@ -33,28 +33,6 @@ gregex_fvalue_free(fvalue_t *fv) } } -/* Determines whether pattern needs to match raw byte sequences */ -static gboolean -raw_flag_needed(const gchar *pattern) -{ - gboolean found = FALSE; - const gchar *s = pattern; - size_t i, len; - - /* find any character whose hex value is two letters */ - len = strlen(s); - for (i = 0; i < len; i++) { - /* Upper and lower-nibble must be >= 0xA */ - if ((guchar)(s[i] & 0xF0) >= 0xA0 && - (guchar)(s[i] & 0x0F) >= 0x0A) - { - found = TRUE; - break; - } - } - return found; -} - /* Generate a FT_PCRE from a parsed string pattern. * On failure, if err_msg is non-null, set *err_msg to point to a * g_malloc()ed error message. */ @@ -64,12 +42,16 @@ val_from_string(fvalue_t *fv, const char *pattern, gchar **err_msg) GError *regex_error = NULL; GRegexCompileFlags cflags = (GRegexCompileFlags)(G_REGEX_CASELESS | G_REGEX_OPTIMIZE); - /* Set RAW flag only if pattern requires matching raw byte - sequences. Otherwise, omit it so that GRegex treats its - input as UTF8-encoded string. */ - if (raw_flag_needed(pattern)) { - cflags = (GRegexCompileFlags)(cflags | G_REGEX_RAW); - } + /* + * As FT_BYTES and FT_PROTOCOL contain arbitrary binary data and FT_STRING + * is not guaranteed to contain valid UTF-8, we have to disable support for + * UTF-8 patterns and treat every pattern and subject as raw bytes. + * + * Should support for UTF-8 patterns be necessary, then we should compile a + * pattern without G_REGEX_RAW. Additionally, we MUST use g_utf8_validate() + * before calling g_regex_match_full() or risk crashes. + */ + cflags = (GRegexCompileFlags)(cflags | G_REGEX_RAW); /* Free up the old value, if we have one */ gregex_fvalue_free(fv); |