aboutsummaryrefslogtreecommitdiffstats
path: root/file.c
diff options
context:
space:
mode:
authorJohn Thacker <johnthacker@gmail.com>2022-02-08 09:04:55 -0500
committerJohn Thacker <johnthacker@gmail.com>2022-02-17 12:06:57 +0000
commit9308f760a6f65ff14bfa6e2c321609ed2ff183cd (patch)
treeca31fde3ae53a6738c278b31df5c15e5d033194c /file.c
parent4633621c3e8526669fc6dcc10e045a5f85cee7ca (diff)
file: Optimize Find Packet
Split the match functions in twain, one for case-sensitive and one for case-insensitive, so we can use memchr to search for the first byte in the case-sensitive version and ws_mempbrk for the case-insensitive version. They are highly optimized on most systems and considerably faster on large files. Also fix a few issues regarding wide strings, such as false positives and the length to highlight when matching. Fix #12908
Diffstat (limited to 'file.c')
-rw-r--r--file.c429
1 files changed, 343 insertions, 86 deletions
diff --git a/file.c b/file.c
index c26a0a359d..fc0f949517 100644
--- a/file.c
+++ b/file.c
@@ -92,10 +92,16 @@ static match_result match_summary_line(capture_file *cf, frame_data *fdata,
wtap_rec *, Buffer *, void *criterion);
static match_result match_narrow_and_wide(capture_file *cf, frame_data *fdata,
wtap_rec *, Buffer *, void *criterion);
+static match_result match_narrow_and_wide_case(capture_file *cf, frame_data *fdata,
+ wtap_rec *, Buffer *, void *criterion);
static match_result match_narrow(capture_file *cf, frame_data *fdata,
wtap_rec *, Buffer *, void *criterion);
+static match_result match_narrow_case(capture_file *cf, frame_data *fdata,
+ wtap_rec *, Buffer *, void *criterion);
static match_result match_wide(capture_file *cf, frame_data *fdata,
wtap_rec *, Buffer *, void *criterion);
+static match_result match_wide_case(capture_file *cf, frame_data *fdata,
+ wtap_rec *, Buffer *, void *criterion);
static match_result match_binary(capture_file *cf, frame_data *fdata,
wtap_rec *, Buffer *, void *criterion);
static match_result match_regex(capture_file *cf, frame_data *fdata,
@@ -3250,24 +3256,32 @@ match_summary_line(capture_file *cf, frame_data *fdata,
typedef struct {
const guint8 *data;
size_t data_len;
+ ws_mempbrk_pattern *pattern;
} cbs_t; /* "Counted byte string" */
/*
* The current match_* routines only support ASCII case insensitivity and don't
- * convert UTF-8 inputs to UTF-16 for matching.
+ * convert UTF-8 inputs to UTF-16 for matching. The UTF-16 support just
+ * interleaves with \0 bytes, which works for 7 bit ASCII.
*
* We could modify them to use the GLib Unicode routines or the International
* Components for Unicode library but it's not apparent that we could do so
* without consuming a lot more CPU and memory or that searching would be
* significantly better.
+ *
+ * XXX: We could test the search string to see if it's all ASCII, and if not
+ * use Unicode aware routines for case insensitive searches or any UTF-16
+ * search.
*/
gboolean
cf_find_packet_data(capture_file *cf, const guint8 *string, size_t string_size,
search_direction dir)
{
- cbs_t info;
+ cbs_t info;
+ guint8 needles[3];
+ ws_mempbrk_pattern pattern;
info.data = string;
info.data_len = string_size;
@@ -3278,20 +3292,44 @@ cf_find_packet_data(capture_file *cf, const guint8 *string, size_t string_size,
return find_packet(cf, match_regex, NULL, dir);
} else if (cf->string) {
/* String search - what type of string? */
- switch (cf->scs_type) {
+ if (cf->case_type) {
+ needles[0] = string[0];
+ needles[1] = g_ascii_tolower(needles[0]);
+ needles[2] = '\0';
+ ws_mempbrk_compile(&pattern, needles);
+ info.pattern = &pattern;
+ switch (cf->scs_type) {
+
+ case SCS_NARROW_AND_WIDE:
+ return find_packet(cf, match_narrow_and_wide_case, &info, dir);
+
+ case SCS_NARROW:
+ return find_packet(cf, match_narrow_case, &info, dir);
+
+ case SCS_WIDE:
+ return find_packet(cf, match_wide_case, &info, dir);
+
+ default:
+ ws_assert_not_reached();
+ return FALSE;
+ }
- case SCS_NARROW_AND_WIDE:
- return find_packet(cf, match_narrow_and_wide, &info, dir);
+ } else {
+ switch (cf->scs_type) {
- case SCS_NARROW:
- return find_packet(cf, match_narrow, &info, dir);
+ case SCS_NARROW_AND_WIDE:
+ return find_packet(cf, match_narrow_and_wide, &info, dir);
- case SCS_WIDE:
- return find_packet(cf, match_wide, &info, dir);
+ case SCS_NARROW:
+ return find_packet(cf, match_narrow, &info, dir);
- default:
- ws_assert_not_reached();
- return FALSE;
+ case SCS_WIDE:
+ return find_packet(cf, match_wide, &info, dir);
+
+ default:
+ ws_assert_not_reached();
+ return FALSE;
+ }
}
} else
return find_packet(cf, match_binary, &info, dir);
@@ -3306,7 +3344,7 @@ match_narrow_and_wide(capture_file *cf, frame_data *fdata,
size_t textlen = info->data_len;
match_result result;
guint32 buf_len;
- guint8 *pd;
+ guint8 *pd, *buf_start, *buf_end;
guint32 i;
guint8 c_char;
size_t c_match = 0;
@@ -3319,31 +3357,129 @@ match_narrow_and_wide(capture_file *cf, frame_data *fdata,
result = MR_NOTMATCHED;
buf_len = fdata->cap_len;
- pd = ws_buffer_start_ptr(buf);
- i = 0;
- while (i < buf_len) {
- c_char = pd[i];
- if (cf->case_type)
- c_char = g_ascii_toupper(c_char);
- if (c_char != '\0') {
+ buf_start = ws_buffer_start_ptr(buf);
+ buf_end = buf_start + buf_len;
+ for (pd = buf_start; pd < buf_end; pd++) {
+ pd = (guint8 *)memchr(pd, ascii_text[0], buf_end - pd);
+ if (pd == NULL) break;
+ /* Try narrow match at this start location */
+ c_match = 0;
+ for (i = 0; i < buf_end - pd; i++) {
+ c_char = pd[i];
if (c_char == ascii_text[c_match]) {
- c_match += 1;
+ c_match++;
if (c_match == textlen) {
result = MR_MATCHED;
- cf->search_pos = i; /* Save the position of the last character
- for highlighting the field. */
- cf->search_len = (guint32)textlen;
- break;
+ cf->search_pos = i + (guint32)(pd - buf_start);
+ /* Save the position of the last character
+ for highlighting the field. */
+ cf->search_len = i + 1;
+ goto done;
}
+ } else {
+ break;
}
- else {
- ws_assert(i>=c_match);
- i -= (guint32)c_match;
- c_match = 0;
+ }
+
+ /* Now try wide match at the same start location. */
+ c_match = 0;
+ for (i = 0; i < buf_end - pd; i++) {
+ c_char = pd[i];
+ if (c_char == ascii_text[c_match]) {
+ c_match++;
+ if (c_match == textlen) {
+ result = MR_MATCHED;
+ cf->search_pos = i + (guint32)(pd - buf_start);
+ /* Save the position of the last character
+ for highlighting the field. */
+ cf->search_len = i + 1;
+ goto done;
+ }
+ i++;
+ if (pd + i >= buf_end || pd[i] != '\0') break;
+ } else {
+ break;
}
}
- i += 1;
}
+
+done:
+ return result;
+}
+
+/* Case insensitive match */
+static match_result
+match_narrow_and_wide_case(capture_file *cf, frame_data *fdata,
+ wtap_rec *rec, Buffer *buf, void *criterion)
+{
+ cbs_t *info = (cbs_t *)criterion;
+ const guint8 *ascii_text = info->data;
+ size_t textlen = info->data_len;
+ ws_mempbrk_pattern *pattern = info->pattern;
+ match_result result;
+ guint32 buf_len;
+ guint8 *pd, *buf_start, *buf_end;
+ guint32 i;
+ guint8 c_char;
+ size_t c_match = 0;
+
+ /* Load the frame's data. */
+ if (!cf_read_record(cf, fdata, rec, buf)) {
+ /* Attempt to get the packet failed. */
+ return MR_ERROR;
+ }
+
+ ws_assert(pattern != NULL);
+
+ result = MR_NOTMATCHED;
+ buf_len = fdata->cap_len;
+ buf_start = ws_buffer_start_ptr(buf);
+ buf_end = buf_start + buf_len;
+ for (pd = buf_start; pd < buf_end; pd++) {
+ pd = (guint8 *)ws_mempbrk_exec(pd, buf_end - pd, pattern, &c_char);
+ if (pd == NULL) break;
+ /* Try narrow match at this start location */
+ c_match = 0;
+ for (i = 0; i < buf_end - pd; i++) {
+ c_char = g_ascii_toupper(pd[i]);
+ if (c_char == ascii_text[c_match]) {
+ c_match++;
+ if (c_match == textlen) {
+ result = MR_MATCHED;
+ cf->search_pos = i + (guint32)(pd - buf_start);
+ /* Save the position of the last character
+ for highlighting the field. */
+ cf->search_len = i + 1;
+ goto done;
+ }
+ } else {
+ break;
+ }
+ }
+
+ /* Now try wide match at the same start location. */
+ c_match = 0;
+ for (i = 0; i < buf_end - pd; i++) {
+ c_char = g_ascii_toupper(pd[i]);
+ if (c_char == ascii_text[c_match]) {
+ c_match++;
+ if (c_match == textlen) {
+ result = MR_MATCHED;
+ cf->search_pos = i + (guint32)(pd - buf_start);
+ /* Save the position of the last character
+ for highlighting the field. */
+ cf->search_len = i + 1;
+ goto done;
+ }
+ i++;
+ if (pd + i >= buf_end || pd[i] != '\0') break;
+ } else {
+ break;
+ }
+ }
+ }
+
+done:
return result;
}
@@ -3356,7 +3492,7 @@ match_narrow(capture_file *cf, frame_data *fdata,
size_t textlen = info->data_len;
match_result result;
guint32 buf_len;
- guint8 *pd;
+ guint8 *pd, *buf_start, *buf_end;
guint32 i;
guint8 c_char;
size_t c_match = 0;
@@ -3369,30 +3505,85 @@ match_narrow(capture_file *cf, frame_data *fdata,
result = MR_NOTMATCHED;
buf_len = fdata->cap_len;
- pd = ws_buffer_start_ptr(buf);
- i = 0;
- while (i < buf_len) {
- c_char = pd[i];
- if (cf->case_type)
- c_char = g_ascii_toupper(c_char);
- if (c_char == ascii_text[c_match]) {
- c_match += 1;
- if (c_match == textlen) {
- result = MR_MATCHED;
- cf->search_pos = i; /* Save the position of the last character
- for highlighting the field. */
- cf->search_len = (guint32)textlen;
+ buf_start = ws_buffer_start_ptr(buf);
+ buf_end = buf_start + buf_len;
+ for (pd = buf_start; pd < buf_end; pd++) {
+ pd = (guint8 *)memchr(pd, ascii_text[0], buf_end - pd);
+ if (pd == NULL) break;
+ c_match = 0;
+ for (i = 0; i < buf_end - pd; i++) {
+ c_char = pd[i];
+ if (c_char == ascii_text[c_match]) {
+ c_match++;
+ if (c_match == textlen) {
+ result = MR_MATCHED;
+ cf->search_pos = i + (guint32)(pd - buf_start);
+ /* Save the position of the last character
+ for highlighting the field. */
+ cf->search_len = i + 1;
+ goto done;
+ }
+ } else {
break;
}
}
- else {
- ws_assert(i>=c_match);
- i -= (guint32)c_match;
- c_match = 0;
+ }
+
+done:
+ return result;
+}
+
+/* Case insensitive match */
+static match_result
+match_narrow_case(capture_file *cf, frame_data *fdata,
+ wtap_rec *rec, Buffer *buf, void *criterion)
+{
+ cbs_t *info = (cbs_t *)criterion;
+ const guint8 *ascii_text = info->data;
+ size_t textlen = info->data_len;
+ ws_mempbrk_pattern *pattern = info->pattern;
+ match_result result;
+ guint32 buf_len;
+ guint8 *pd, *buf_start, *buf_end;
+ guint32 i;
+ guint8 c_char;
+ size_t c_match = 0;
+
+ /* Load the frame's data. */
+ if (!cf_read_record(cf, fdata, rec, buf)) {
+ /* Attempt to get the packet failed. */
+ return MR_ERROR;
+ }
+
+ ws_assert(pattern != NULL);
+
+ result = MR_NOTMATCHED;
+ buf_len = fdata->cap_len;
+ buf_start = ws_buffer_start_ptr(buf);
+ buf_end = buf_start + buf_len;
+ for (pd = buf_start; pd < buf_end; pd++) {
+ pd = (guint8 *)ws_mempbrk_exec(pd, buf_end - pd, pattern, &c_char);
+ if (pd == NULL) break;
+ c_match = 0;
+ for (i = 0; i < buf_end - pd; i++) {
+ c_char = g_ascii_toupper(pd[i]);
+ if (c_char == ascii_text[c_match]) {
+ c_match++;
+ if (c_match == textlen) {
+ result = MR_MATCHED;
+ cf->search_pos = i + (guint32)(pd - buf_start);
+ /* Save the position of the last character
+ for highlighting the field. */
+ cf->search_len = i + 1;
+ goto done;
+ }
+ } else {
+ break;
+ }
}
- i += 1;
}
+done:
return result;
}
@@ -3405,7 +3596,7 @@ match_wide(capture_file *cf, frame_data *fdata,
size_t textlen = info->data_len;
match_result result;
guint32 buf_len;
- guint8 *pd;
+ guint8 *pd, *buf_start, *buf_end;
guint32 i;
guint8 c_char;
size_t c_match = 0;
@@ -3418,30 +3609,89 @@ match_wide(capture_file *cf, frame_data *fdata,
result = MR_NOTMATCHED;
buf_len = fdata->cap_len;
- pd = ws_buffer_start_ptr(buf);
- i = 0;
- while (i < buf_len) {
- c_char = pd[i];
- if (cf->case_type)
- c_char = g_ascii_toupper(c_char);
- if (c_char == ascii_text[c_match]) {
- c_match += 1;
- if (c_match == textlen) {
- result = MR_MATCHED;
- cf->search_pos = i; /* Save the position of the last character
- for highlighting the field. */
- cf->search_len = (guint32)textlen;
+ buf_start = ws_buffer_start_ptr(buf);
+ buf_end = buf_start + buf_len;
+ for (pd = buf_start; pd < buf_end; pd++) {
+ pd = (guint8 *)memchr(pd, ascii_text[0], buf_end - pd);
+ if (pd == NULL) break;
+ c_match = 0;
+ for (i = 0; i < buf_end - pd; i++) {
+ c_char = pd[i];
+ if (c_char == ascii_text[c_match]) {
+ c_match++;
+ if (c_match == textlen) {
+ result = MR_MATCHED;
+ cf->search_pos = i + (guint32)(pd - buf_start);
+ /* Save the position of the last character
+ for highlighting the field. */
+ cf->search_len = i + 1;
+ goto done;
+ }
+ i++;
+ if (pd + i >= buf_end || pd[i] != '\0') break;
+ } else {
break;
}
- i += 1;
}
- else {
- ws_assert(i>=(c_match*2));
- i -= (guint32)c_match*2;
- c_match = 0;
+ }
+
+done:
+ return result;
+}
+
+/* Case insensitive match */
+static match_result
+match_wide_case(capture_file *cf, frame_data *fdata,
+ wtap_rec *rec, Buffer *buf, void *criterion)
+{
+ cbs_t *info = (cbs_t *)criterion;
+ const guint8 *ascii_text = info->data;
+ size_t textlen = info->data_len;
+ ws_mempbrk_pattern *pattern = info->pattern;
+ match_result result;
+ guint32 buf_len;
+ guint8 *pd, *buf_start, *buf_end;
+ guint32 i;
+ guint8 c_char;
+ size_t c_match = 0;
+
+ /* Load the frame's data. */
+ if (!cf_read_record(cf, fdata, rec, buf)) {
+ /* Attempt to get the packet failed. */
+ return MR_ERROR;
+ }
+
+ ws_assert(pattern != NULL);
+
+ result = MR_NOTMATCHED;
+ buf_len = fdata->cap_len;
+ buf_start = ws_buffer_start_ptr(buf);
+ buf_end = buf_start + buf_len;
+ for (pd = buf_start; pd < buf_end; pd++) {
+ pd = (guint8 *)ws_mempbrk_exec(pd, buf_end - pd, pattern, &c_char);
+ if (pd == NULL) break;
+ c_match = 0;
+ for (i = 0; i < buf_end - pd; i++) {
+ c_char = g_ascii_toupper(pd[i]);
+ if (c_char == ascii_text[c_match]) {
+ c_match++;
+ if (c_match == textlen) {
+ result = MR_MATCHED;
+ cf->search_pos = i + (guint32)(pd - buf_start);
+ /* Save the position of the last character
+ for highlighting the field. */
+ cf->search_len = i + 1;
+ goto done;
+ }
+ i++;
+ if (pd + i >= buf_end || pd[i] != '\0') break;
+ } else {
+ break;
+ }
}
- i += 1;
}
+
+done:
return result;
}
@@ -3454,7 +3704,7 @@ match_binary(capture_file *cf, frame_data *fdata,
size_t datalen = info->data_len;
match_result result;
guint32 buf_len;
- guint8 *pd;
+ guint8 *pd, *buf_start, *buf_end;
guint32 i;
size_t c_match = 0;
@@ -3466,26 +3716,33 @@ match_binary(capture_file *cf, frame_data *fdata,
result = MR_NOTMATCHED;
buf_len = fdata->cap_len;
- pd = ws_buffer_start_ptr(buf);
- i = 0;
- while (i < buf_len) {
- if (pd[i] == binary_data[c_match]) {
- c_match += 1;
- if (c_match == datalen) {
- result = MR_MATCHED;
- cf->search_pos = i; /* Save the position of the last character
- for highlighting the field. */
- cf->search_len = (guint32)datalen;
+ buf_start = ws_buffer_start_ptr(buf);
+ buf_end = buf_start + buf_len;
+ /* Not clear if using memcmp() is faster. memmem() on systems that
+ * have it should be faster, though.
+ */
+ for (pd = buf_start; pd < buf_end; pd++) {
+ pd = (guint8 *)memchr(pd, binary_data[0], buf_end - pd);
+ if (pd == NULL) break;
+ c_match = 0;
+ for (i = 0; i < buf_end - pd; i++) {
+ if (pd[i] == binary_data[c_match]) {
+ c_match++;
+ if (c_match == datalen) {
+ result = MR_MATCHED;
+ cf->search_pos = i + (guint32)(pd - buf_start);
+ /* Save the position of the last character
+ for highlighting the field. */
+ cf->search_len = i + 1;
+ goto done;
+ }
+ } else {
break;
}
}
- else {
- ws_assert(i>=c_match);
- i -= (guint32)c_match;
- c_match = 0;
- }
- i += 1;
}
+
+done:
return result;
}