diff options
author | Jakub Zawadzki <darkjames@darkjames.pl> | 2014-05-22 23:04:40 +0200 |
---|---|---|
committer | Anders Broman <a.broman58@gmail.com> | 2014-06-09 12:02:27 +0000 |
commit | fcb710baec3caa30c2cb7c444bddbe087fc86574 (patch) | |
tree | 1ff3f1c4d9b5dca0794162a1000bede1b3a128e6 /wsutil/ws_mempbrk_sse42.c | |
parent | 66695661992beb054eff219dd73a23559220c867 (diff) |
Add sse4.2 optimized function ws_mempbrk_sse42()
In text protocols, like SIP, a lot of time is spent in guint8_pbrk().
Assume that the text is not binary (no NULs), and use the SSE4.2 pcmpistri
instruction.
Also move guint8_pbrk() out of tvbuff.c and rename it to _ws_mempbrk().
HAVE_SSE42 must be defined to use _ws_mempbrk_sse42(); this is currently only
activated for Windows.
Change-Id: Ic853d84805bdb6492c4f45d2bcc79a973fd9804e
Reviewed-on: https://code.wireshark.org/review/1730
Reviewed-by: Anders Broman <a.broman58@gmail.com>
Diffstat (limited to 'wsutil/ws_mempbrk_sse42.c')
-rw-r--r-- | wsutil/ws_mempbrk_sse42.c | 188 |
1 files changed, 188 insertions, 0 deletions
diff --git a/wsutil/ws_mempbrk_sse42.c b/wsutil/ws_mempbrk_sse42.c new file mode 100644 index 0000000000..97587cc2d3 --- /dev/null +++ b/wsutil/ws_mempbrk_sse42.c @@ -0,0 +1,188 @@ +/* strcspn with SSE4.2 intrinsics + Copyright (C) 2009-2014 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include "config.h" + +#include <glib.h> + +#ifdef WIN32 + #include <tmmintrin.h> + #include <stdint.h> +#endif + +#include <nmmintrin.h> +#include <string.h> + +extern const guint8 *_ws_mempbrk(const guint8* haystack, size_t haystacklen, const guint8 *needles); +const char *_ws_mempbrk_sse42(const char* haystack, size_t haystacklen, const char *needles); + +/* Helper for variable shifts of SSE registers. + Copyright (C) 2010 Free Software Foundation, Inc. 
+ */ + +static const int8_t ___m128i_shift_right[31] = + { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 + }; + +static inline __m128i +__m128i_shift_right (__m128i value, unsigned long int offset) +{ + /* _mm_loadu_si128() works with unaligned data, cast safe */ + return _mm_shuffle_epi8 (value, + _mm_loadu_si128 ((__m128i *) (void *) (___m128i_shift_right + offset))); +} + +/* We use 0x2: + _SIDD_SBYTE_OPS + | _SIDD_CMP_EQUAL_ANY + | _SIDD_POSITIVE_POLARITY + | _SIDD_LEAST_SIGNIFICANT + on pcmpistri to compare xmm/mem128 + + 0 1 2 3 4 5 6 7 8 9 A B C D E F + X X X X X X X X X X X X X X X X + + against xmm + + 0 1 2 3 4 5 6 7 8 9 A B C D E F + A A A A A A A A A A A A A A A A + + to find out if the first 16byte data element has any byte A and + the offset of the first byte. There are 3 cases: + + 1. The first 16byte data element has the byte A at the offset X. + 2. The first 16byte data element has EOS and doesn't have the byte A. + 3. The first 16byte data element is valid and doesn't have the byte A. + + Here is the table of ECX, CFlag, ZFlag and SFlag for 2 cases: + + 1 X 1 0/1 0 + 2 16 0 1 0 + 3 16 0 0 0 + + We exit from the loop for cases 1 and 2 with jbe which branches + when either CFlag or ZFlag is 1. If CFlag == 1, ECX has the offset + X for case 1. */ + +const char * +_ws_mempbrk_sse42(const char *s, size_t slen, const char *a) +{ + const char *aligned; + __m128i mask; + int offset; + + offset = (int) ((size_t) a & 15); + aligned = (const char *) ((size_t) a & -16L); + if (offset != 0) + { + int length; + + /* Load masks. */ + /* cast safe - _mm_load_si128() it's 16B aligned */ + mask = __m128i_shift_right(_mm_load_si128 ((__m128i *) (void *) aligned), offset); + + /* Find where the NULL terminator is. */ + length = _mm_cmpistri (mask, mask, 0x3a); + if (length == 16 - offset) + { + /* There is no NULL terminator. 
*/ + __m128i mask1 = _mm_load_si128 ((__m128i *) (void *) (aligned + 16)); + int index = _mm_cmpistri (mask1, mask1, 0x3a); + length += index; + + /* Don't use SSE4.2 if the length of A > 16. */ + if (length > 16) + return _ws_mempbrk(s, slen, a); + + if (index != 0) + { + /* Combine mask0 and mask1. We could play games with + palignr, but frankly this data should be in L1 now + so do the merge via an unaligned load. */ + mask = _mm_loadu_si128 ((__m128i *) (void *) a); + } + } + } + else + { + int length; + + /* A is aligned. (cast safe) */ + mask = _mm_load_si128 ((__m128i *) (void *) a); + + /* Find where the NULL terminator is. */ + length = _mm_cmpistri (mask, mask, 0x3a); + if (length == 16) + { + /* There is no NULL terminator. Don't use SSE4.2 if the length + of A > 16. */ + if (a[16] != 0) + return _ws_mempbrk(s, slen, a); + } + } + + offset = (int) ((size_t) s & 15); + aligned = (const char *) ((size_t) s & -16L); + if (offset != 0) + { + /* Check partial string. cast safe it's 16B aligned */ + __m128i value = __m128i_shift_right (_mm_load_si128 ((__m128i *) (void *) aligned), offset); + + int length = _mm_cmpistri (mask, value, 0x2); + /* No need to check ZFlag since ZFlag is always 1. */ + int cflag = _mm_cmpistrc (mask, value, 0x2); + int index = _mm_cmpistri (value, value, 0x3a); + + if (cflag) + return s + length; + /* Find where the NULL terminator is. 
*/ + if (index < 16 - offset) + { + /* fond NUL @ 'index', need to switch to slower mempbrk */ + return _ws_mempbrk(s + index + 1, slen - index - 1, a); /* slen is bigger than 16 & index < 16 so no undeflow here */ + } + aligned += 16; + slen -= (16 - offset); + } + else + aligned = s; + + while (slen >= 16) + { + __m128i value = _mm_load_si128 ((__m128i *) (void *) aligned); + int index = _mm_cmpistri (mask, value, 0x2); + int cflag = _mm_cmpistrc (mask, value, 0x2); + int zflag = _mm_cmpistrz (mask, value, 0x2); + + if (cflag) + return aligned + index; + if (zflag) + { + /* found NUL, need to switch to slower mempbrk */ + return _ws_mempbrk(aligned, slen, a); + } + aligned += 16; + slen -= 16; + } + + /* XXX, use mempbrk_slow here? */ + return _ws_mempbrk(aligned, slen, a); +} |