aboutsummaryrefslogtreecommitdiffstats
path: root/wsutil/ws_mempbrk_sse42.c
diff options
context:
space:
mode:
authorJakub Zawadzki <darkjames@darkjames.pl>2014-05-22 23:04:40 +0200
committerAnders Broman <a.broman58@gmail.com>2014-06-09 12:02:27 +0000
commitfcb710baec3caa30c2cb7c444bddbe087fc86574 (patch)
tree1ff3f1c4d9b5dca0794162a1000bede1b3a128e6 /wsutil/ws_mempbrk_sse42.c
parent66695661992beb054eff219dd73a23559220c867 (diff)
Add sse4.2 optimized function ws_mempbrk_sse42()
In text protocols, like SIP, a lot of time is spent in guint8_pbrk(). Assume that the text is not binary (no NULs), and use the SSE4.2 pcmpistri instruction. Also move & rename guint8_pbrk() from tvbuff.c as _ws_mempbrk. HAVE_SSE42 must be defined to use _ws_mempbrk_sse42(); it is only activated for Windows currently. Change-Id: Ic853d84805bdb6492c4f45d2bcc79a973fd9804e Reviewed-on: https://code.wireshark.org/review/1730 Reviewed-by: Anders Broman <a.broman58@gmail.com>
Diffstat (limited to 'wsutil/ws_mempbrk_sse42.c')
-rw-r--r--wsutil/ws_mempbrk_sse42.c188
1 files changed, 188 insertions, 0 deletions
diff --git a/wsutil/ws_mempbrk_sse42.c b/wsutil/ws_mempbrk_sse42.c
new file mode 100644
index 0000000000..97587cc2d3
--- /dev/null
+++ b/wsutil/ws_mempbrk_sse42.c
@@ -0,0 +1,188 @@
+/* strcspn with SSE4.2 intrinsics
+ Copyright (C) 2009-2014 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include "config.h"
+
+#include <glib.h>
+
+#ifdef WIN32
+ #include <tmmintrin.h>
+ #include <stdint.h>
+#endif
+
+#include <nmmintrin.h>
+#include <string.h>
+
+extern const guint8 *_ws_mempbrk(const guint8* haystack, size_t haystacklen, const guint8 *needles); /* Scalar routine that _ws_mempbrk_sse42() falls back to; no definition is visible in this file.
+   NOTE(review): declared here with guint8 pointers, but _ws_mempbrk_sse42() passes const char pointers
+   and returns the result as const char * without a cast - confirm the intended signature. */
+const char *_ws_mempbrk_sse42(const char* haystack, size_t haystacklen, const char *needles); /* Prototype for the SSE4.2 variant defined at the end of this file. */
+
+/* Helper for variable shifts of SSE registers.
+ Copyright (C) 2010 Free Software Foundation, Inc.
+
+ Shuffle-control table used by __m128i_shift_right(): the 16 identity
+ indices 0..15 followed by fifteen -1 entries. A 16-byte window read at
+ table + offset therefore holds the indices offset..15 followed by
+ `offset` bytes of -1. With _mm_shuffle_epi8() a control byte whose top
+ bit is set (such as -1) produces a zero output byte, so such a window
+ selects source bytes offset..15 and zero-fills the remainder.
+ The 31 entries are exactly enough for windows starting at offsets
+ 0 through 15.
+ */
+
+static const int8_t ___m128i_shift_right[31] =
+ {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
+ };
+
+static inline __m128i
+__m128i_shift_right (__m128i value, unsigned long int offset)
+{
+ /* Return VALUE shifted right by OFFSET bytes: VALUE is shuffled with
+    the 16 control bytes found at ___m128i_shift_right + OFFSET, so
+    result byte i is VALUE byte i + OFFSET and the vacated top OFFSET
+    bytes come out as zero (their control bytes are -1).
+    OFFSET must be in 0..15; a larger value would make the 16-byte load
+    run past the end of the 31-entry table. The callers visible here
+    pass an address masked with 15.
+    _mm_loadu_si128() works with unaligned data, cast safe */
+ return _mm_shuffle_epi8 (value,
+ _mm_loadu_si128 ((__m128i *) (void *) (___m128i_shift_right + offset)));
+}
+
+/* We use 0x2:
+ _SIDD_SBYTE_OPS
+ | _SIDD_CMP_EQUAL_ANY
+ | _SIDD_POSITIVE_POLARITY
+ | _SIDD_LEAST_SIGNIFICANT
+ on pcmpistri to compare xmm/mem128
+
+ 0 1 2 3 4 5 6 7 8 9 A B C D E F
+ X X X X X X X X X X X X X X X X
+
+ against xmm
+
+ 0 1 2 3 4 5 6 7 8 9 A B C D E F
+ A A A A A A A A A A A A A A A A
+
+ to find out if the first 16byte data element has any byte A and
+ the offset of the first byte. There are 3 cases:
+
+ 1. The first 16byte data element has the byte A at the offset X.
+ 2. The first 16byte data element has EOS and doesn't have the byte A.
+ 3. The first 16byte data element is valid and doesn't have the byte A.
+
+ Here is the table of ECX, CFlag, ZFlag and SFlag for the 3 cases:
+
+ 1 X 1 0/1 0
+ 2 16 0 1 0
+ 3 16 0 0 0
+
+ We exit from the loop for cases 1 and 2 with jbe which branches
+ when either CFlag or ZFlag is 1. If CFlag == 1, ECX has the offset
+ X for case 1.
+
+ The C code below does not branch with jbe: it reads ECX through
+ _mm_cmpistri() and the two flags through _mm_cmpistrc() and
+ _mm_cmpistrz().
+
+ _ws_mempbrk_sse42 (S, SLEN, A): search the SLEN bytes at S for the
+ first byte that is a member of the NUL-terminated set A.
+
+ When the SSE4.2 path finds a match, the return value is a pointer to
+ that byte inside S. In every other case the result is whatever
+ _ws_mempbrk() returns for the part of S handed over to it; its
+ no-match value is not visible in this file.
+
+ The SSE4.2 path gives way to _ws_mempbrk() when
+ - A is longer than 16 bytes (it no longer fits one register),
+ - a NUL byte shows up in S before any match (pcmpistri treats NUL
+   as end of string), or
+ - fewer than 16 bytes of S remain.
+
+ NOTE(review): SLEN is never validated here. A comment in the body
+ states that SLEN is bigger than 16; with an unaligned S and a smaller
+ SLEN the size_t subtractions below would wrap - confirm that callers
+ guarantee this. */
+
+const char *
+_ws_mempbrk_sse42(const char *s, size_t slen, const char *a)
+{
+ const char *aligned; /* 16-byte aligned cursor: first into A, later into S */
+ __m128i mask; /* the needle set A packed into one register */
+ int offset; /* misalignment (0..15) of A, later of S */
+
+ offset = (int) ((size_t) a & 15);
+ aligned = (const char *) ((size_t) a & -16L); /* A rounded down to a 16-byte boundary */
+ if (offset != 0)
+ {
+ int length; /* needle bytes counted so far */
+
+ /* Load masks. The aligned block containing A is loaded and shifted
+    right by OFFSET bytes, so A[0] lands in byte 0 and the top OFFSET
+    bytes are zero. */
+ /* cast safe - _mm_load_si128() it's 16B aligned */
+ mask = __m128i_shift_right(_mm_load_si128 ((__m128i *) (void *) aligned), offset);
+
+ /* Find where the NULL terminator is. Comparing a register with
+    itself under control 0x3a is used as a strlen within the register.
+    A result of 16 - offset means the only zero bytes seen are the
+    ones the shift filled in. */
+ length = _mm_cmpistri (mask, mask, 0x3a);
+ if (length == 16 - offset)
+ {
+ /* There is no NULL terminator. */
+ __m128i mask1 = _mm_load_si128 ((__m128i *) (void *) (aligned + 16)); /* next aligned block of A */
+ int index = _mm_cmpistri (mask1, mask1, 0x3a); /* needle bytes found in that second block */
+ length += index; /* total needle bytes counted over both blocks */
+
+ /* Don't use SSE4.2 if the length of A > 16. */
+ if (length > 16)
+ return _ws_mempbrk(s, slen, a);
+
+ if (index != 0)
+ {
+ /* Combine mask0 and mask1. We could play games with
+ palignr, but frankly this data should be in L1 now
+ so do the merge via an unaligned load.
+ (When index is 0 the second block starts with the
+ terminator, so the shifted first block already holds
+ all of A and is kept as the mask.) */
+ mask = _mm_loadu_si128 ((__m128i *) (void *) a);
+ }
+ }
+ }
+ else
+ {
+ int length; /* needle bytes in the single aligned block */
+
+ /* A is aligned. (cast safe) */
+ mask = _mm_load_si128 ((__m128i *) (void *) a);
+
+ /* Find where the NULL terminator is. */
+ length = _mm_cmpistri (mask, mask, 0x3a);
+ if (length == 16)
+ {
+ /* There is no NULL terminator. Don't use SSE4.2 if the length
+ of A > 16. A set of exactly 16 bytes still fits the register,
+ so the 17th byte decides. */
+ if (a[16] != 0)
+ return _ws_mempbrk(s, slen, a);
+ }
+ }
+
+ offset = (int) ((size_t) s & 15); /* same alignment split, now for the haystack S */
+ aligned = (const char *) ((size_t) s & -16L);
+ if (offset != 0)
+ {
+ /* Check partial string. cast safe it's 16B aligned.
+    The aligned block containing S is loaded and shifted so that S[0]
+    is byte 0; only the first 16 - offset bytes come from S. */
+ __m128i value = __m128i_shift_right (_mm_load_si128 ((__m128i *) (void *) aligned), offset);
+
+ int length = _mm_cmpistri (mask, value, 0x2); /* offset of the first byte of value that is in the set */
+ /* No need to check ZFlag since ZFlag is always 1 (the shifted
+    value is zero-filled at the top, so it always contains a NUL). */
+ int cflag = _mm_cmpistrc (mask, value, 0x2); /* nonzero when such a byte exists */
+ int index = _mm_cmpistri (value, value, 0x3a); /* position of the first zero byte in value */
+
+ if (cflag)
+ return s + length;
+ /* Find where the NULL terminator is. An index below 16 - offset is
+    a real NUL taken from S rather than padding from the shift. */
+ if (index < 16 - offset)
+ {
+ /* found NUL @ 'index', need to switch to slower mempbrk.
+    The bytes before the NUL were covered by the cflag test above,
+    so the scalar search resumes right after the NUL. */
+ return _ws_mempbrk(s + index + 1, slen - index - 1, a); /* slen is bigger than 16 & index < 16 so no underflow here */
+ }
+ aligned += 16; /* first whole aligned block after S */
+ slen -= (16 - offset); /* bytes consumed by the partial block */
+ }
+ else
+ aligned = s; /* S is already 16-byte aligned */
+
+ while (slen >= 16) /* one full aligned block per iteration */
+ {
+ __m128i value = _mm_load_si128 ((__m128i *) (void *) aligned);
+ int index = _mm_cmpistri (mask, value, 0x2); /* offset of the first match within this block */
+ int cflag = _mm_cmpistrc (mask, value, 0x2); /* nonzero when a match exists */
+ int zflag = _mm_cmpistrz (mask, value, 0x2); /* nonzero when this block contains a NUL */
+
+ if (cflag)
+ return aligned + index;
+ if (zflag)
+ {
+ /* found NUL, need to switch to slower mempbrk.
+    The scalar search restarts at the beginning of this block and
+    covers all remaining bytes. */
+ return _ws_mempbrk(aligned, slen, a);
+ }
+ aligned += 16;
+ slen -= 16;
+ }
+
+ /* XXX, use mempbrk_slow here?
+    Fewer than 16 bytes are left; they are searched by the scalar
+    routine. */
+ return _ws_mempbrk(aligned, slen, a);
+}