Fix crash when using the "matches" operator on non-UTF-8 data

GRegex is a thin wrapper around PCRE. Inputs (patterns and subjects) are assumed to be UTF-8 by default (unless G_REGEX_RAW is set). If the subject is not valid UTF-8, normally pcre_exec will immediately return a failure. However, as GLib sets PCRE_NO_UTF8_CHECK when G_REGEX_RAW is not given, pcre_exec() will skip the safety check and crash instead. Fix this by always assuming raw byte patterns. Regression risk: patterns such as `ö.ï` will no longer match `öñï` since `ñ` is a multi-byte sequence. Patterns such as `(GET|POST) /` remain functional though. Bug: 14905 Change-Id: I6450bb83f565d377f82a5dbb01690c5f49acd96f Reviewed-on: https://code.wireshark.org/review/31935 Petri-Dish: Peter Wu <peter@lekensteyn.nl> Tested-by: Petri Dish Buildbot Reviewed-by: Anders Broman <a.broman58@gmail.com>
author: Peter Wu <peter@lekensteyn.nl> 2019-02-08 17:20:37 +0100
committer: Anders Broman <a.broman58@gmail.com> 2019-02-11 05:08:53 +0000
commit: 0ca65a66f425c8beaa1af3deb3b84c2b16cffb55 (patch)
tree: 4015973a2b1cd2758be093183711f7fd9b733489 /epan/ftypes
parent: f2dc64e9b8d6cbd3dd5f7cda03596abd1c0ceea7 (diff)
2 files changed, 10 insertions, 52 deletions
diff --git a/epan/ftypes/ftype-bytes.c b/epan/ftypes/ftype-bytes.c
index c1d57f0bbd..9bfc37b637 100644
--- a/epan/ftypes/ftype-bytes.c
+++ b/epan/ftypes/ftype-bytes.c
@@ -665,30 +665,6 @@ cmp_matches(const fvalue_t *fv_a, const fvalue_t *fv_b)
 	if (! regex) {
 		return FALSE;
 	}
-	/*
-	 * XXX - do we want G_REGEX_RAW or not?
-	 *
-	 * If we're matching against a string, we don't want it (and
-	 * we want the string value encoded in UTF-8 - and, if it can't
-	 * be converted to UTF-8, because it's in a character encoding
-	 * that doesn't map every possible byte sequence to Unicode (and
-	 * that includes strings that are supposed to be in UTF-8 but
-	 * that contain invalid UTF-8 sequences!), treat the match as
-	 * failing.
-	 *
-	 * If we're matching against binary data, and matching a binary
-	 * pattern (e.g. "0xfa, 3 or more 0xff, and 0x37, in order"),
-	 * we'd want G_REGEX_RAW. If we're matching a text pattern,
-	 * it's not clear *what* the right thing to do is - if they're
-	 * matching against a pattern containing non-ASCII characters,
-	 * they might want it to match in whatever encoding the binary
-	 * data is, but Wireshark might not have a clue what that
-	 * encoding is.  In addition, it's not clear how to tell
-	 * whether a pattern is "binary" or not, short of having
-	 * a different (non-PCRE) syntax for binary patterns.
-	 *
-	 * So we don't use G_REGEX_RAW for now.
-	 */
 	return g_regex_match_full(
 		regex,			/* Compiled PCRE */
 		(char *)a->data,	/* The data to check for the pattern... */
diff --git a/epan/ftypes/ftype-pcre.c b/epan/ftypes/ftype-pcre.c
index 5c9ad9f97b..ac854c73a2 100644
--- a/epan/ftypes/ftype-pcre.c
+++ b/epan/ftypes/ftype-pcre.c
@@ -33,28 +33,6 @@ gregex_fvalue_free(fvalue_t *fv)
     }
 }
 
-/* Determines whether pattern needs to match raw byte sequences */
-static gboolean
-raw_flag_needed(const gchar *pattern)
-{
-    gboolean found = FALSE;
-    const gchar *s = pattern;
-    size_t i, len;
-
-    /* find any character whose hex value is two letters */
-    len = strlen(s);
-    for (i = 0; i < len; i++) {
-        /* Upper and lower-nibble must be >= 0xA */
-        if ((guchar)(s[i] & 0xF0) >= 0xA0 &&
-            (guchar)(s[i] & 0x0F) >= 0x0A)
-        {
-            found = TRUE;
-            break;
-        }
-    }
-    return found;
-}
-
 /* Generate a FT_PCRE from a parsed string pattern.
  * On failure, if err_msg is non-null, set *err_msg to point to a
  * g_malloc()ed error message. */
@@ -64,12 +42,16 @@ val_from_string(fvalue_t *fv, const char *pattern, gchar **err_msg)
     GError *regex_error = NULL;
     GRegexCompileFlags cflags = (GRegexCompileFlags)(G_REGEX_CASELESS | G_REGEX_OPTIMIZE);
 
-    /* Set RAW flag only if pattern requires matching raw byte
-       sequences. Otherwise, omit it so that GRegex treats its
-       input as UTF8-encoded string. */
-    if (raw_flag_needed(pattern)) {
-        cflags = (GRegexCompileFlags)(cflags | G_REGEX_RAW);
-    }
+    /*
+     * As FT_BYTES and FT_PROTOCOL contain arbitrary binary data and FT_STRING
+     * is not guaranteed to contain valid UTF-8, we have to disable support for
+     * UTF-8 patterns and treat every pattern and subject as raw bytes.
+     *
+     * Should support for UTF-8 patterns be necessary, then we should compile a
+     * pattern without G_REGEX_RAW. Additionally, we MUST use g_utf8_validate()
+     * before calling g_regex_match_full() or risk crashes.
+     */
+    cflags = (GRegexCompileFlags)(cflags | G_REGEX_RAW);
 
     /* Free up the old value, if we have one */
     gregex_fvalue_free(fv);
author	Peter Wu <peter@lekensteyn.nl>	2019-02-08 17:20:37 +0100
committer	Anders Broman <a.broman58@gmail.com>	2019-02-11 05:08:53 +0000
commit	0ca65a66f425c8beaa1af3deb3b84c2b16cffb55 (patch)
tree	4015973a2b1cd2758be093183711f7fd9b733489 /epan/ftypes
parent	f2dc64e9b8d6cbd3dd5f7cda03596abd1c0ceea7 (diff)