diff options
author | Philipp Maier <pmaier@sysmocom.de> | 2017-03-20 12:08:42 +0100 |
---|---|---|
committer | Tom Tsou <tom@tsou.cc> | 2017-05-19 17:21:45 +0000 |
commit | e8ae9fcf387540f1b210f5ece372d0fd070b6249 (patch) | |
tree | a2cea15903fda9795776f35d1cee575b558d0ebc /Transceiver52M/x86/convert.c | |
parent | f5bf33b287d5c4ce13c0ecac91c8d7f128a24eb5 (diff) |
buildenv: Split up SSE3 and SSE4.1 code
Currently we find SSE3 and SSE4.1 code mixed together along with
generic code in one file. This introduces the risk that the
compiler accidentally mixes SSE4.1 instructions into an SSE3 code path, or
even worse into a generic code path.
This commit splits the SSE3 and SSE4.1 code into separate files
and compiles them with the matching target options.
Change-Id: I846e190e92f1258cd412d1b2d79b539e204e04b3
Diffstat (limited to 'Transceiver52M/x86/convert.c')
-rw-r--r-- | Transceiver52M/x86/convert.c | 136 |
1 files changed, 2 insertions, 134 deletions
diff --git a/Transceiver52M/x86/convert.c b/Transceiver52M/x86/convert.c index 3f76b65..db98050 100644 --- a/Transceiver52M/x86/convert.c +++ b/Transceiver52M/x86/convert.c @@ -20,6 +20,8 @@ #include <malloc.h> #include <string.h> #include "convert.h" +#include "convert_sse_3.h" +#include "convert_sse_4_1.h" #ifdef HAVE_CONFIG_H #include "config.h" @@ -36,140 +38,6 @@ struct convert_cpu_context { static struct convert_cpu_context c; -#ifdef HAVE_SSE3 -#include <xmmintrin.h> -#include <emmintrin.h> - -#ifdef HAVE_SSE4_1 -#include <smmintrin.h> - -/* 16*N 16-bit signed integer converted to single precision floats */ -static void _sse_convert_si16_ps_16n(float *restrict out, - const short *restrict in, - int len) -{ - __m128i m0, m1, m2, m3, m4, m5; - __m128 m6, m7, m8, m9; - - for (int i = 0; i < len / 16; i++) { - /* Load (unaligned) packed floats */ - m0 = _mm_loadu_si128((__m128i *) &in[16 * i + 0]); - m1 = _mm_loadu_si128((__m128i *) &in[16 * i + 8]); - - /* Unpack */ - m2 = _mm_cvtepi16_epi32(m0); - m4 = _mm_cvtepi16_epi32(m1); - m0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1, 0, 3, 2)); - m1 = _mm_shuffle_epi32(m1, _MM_SHUFFLE(1, 0, 3, 2)); - m3 = _mm_cvtepi16_epi32(m0); - m5 = _mm_cvtepi16_epi32(m1); - - /* Convert */ - m6 = _mm_cvtepi32_ps(m2); - m7 = _mm_cvtepi32_ps(m3); - m8 = _mm_cvtepi32_ps(m4); - m9 = _mm_cvtepi32_ps(m5); - - /* Store */ - _mm_storeu_ps(&out[16 * i + 0], m6); - _mm_storeu_ps(&out[16 * i + 4], m7); - _mm_storeu_ps(&out[16 * i + 8], m8); - _mm_storeu_ps(&out[16 * i + 12], m9); - } -} - -/* 16*N 16-bit signed integer conversion with remainder */ -static void _sse_convert_si16_ps(float *restrict out, - const short *restrict in, - int len) -{ - int start = len / 16 * 16; - - _sse_convert_si16_ps_16n(out, in, len); - - for (int i = 0; i < len % 16; i++) - out[start + i] = in[start + i]; -} -#endif /* HAVE_SSE4_1 */ - -/* 8*N single precision floats scaled and converted to 16-bit signed integer */ -static void _sse_convert_scale_ps_si16_8n(short 
*restrict out, - const float *restrict in, - float scale, int len) -{ - __m128 m0, m1, m2; - __m128i m4, m5; - - for (int i = 0; i < len / 8; i++) { - /* Load (unaligned) packed floats */ - m0 = _mm_loadu_ps(&in[8 * i + 0]); - m1 = _mm_loadu_ps(&in[8 * i + 4]); - m2 = _mm_load1_ps(&scale); - - /* Scale */ - m0 = _mm_mul_ps(m0, m2); - m1 = _mm_mul_ps(m1, m2); - - /* Convert */ - m4 = _mm_cvtps_epi32(m0); - m5 = _mm_cvtps_epi32(m1); - - /* Pack and store */ - m5 = _mm_packs_epi32(m4, m5); - _mm_storeu_si128((__m128i *) &out[8 * i], m5); - } -} - -/* 8*N single precision floats scaled and converted with remainder */ -static void _sse_convert_scale_ps_si16(short *restrict out, - const float *restrict in, - float scale, int len) -{ - int start = len / 8 * 8; - - _sse_convert_scale_ps_si16_8n(out, in, scale, len); - - for (int i = 0; i < len % 8; i++) - out[start + i] = in[start + i] * scale; -} - -/* 16*N single precision floats scaled and converted to 16-bit signed integer */ -static void _sse_convert_scale_ps_si16_16n(short *restrict out, - const float *restrict in, - float scale, int len) -{ - __m128 m0, m1, m2, m3, m4; - __m128i m5, m6, m7, m8; - - for (int i = 0; i < len / 16; i++) { - /* Load (unaligned) packed floats */ - m0 = _mm_loadu_ps(&in[16 * i + 0]); - m1 = _mm_loadu_ps(&in[16 * i + 4]); - m2 = _mm_loadu_ps(&in[16 * i + 8]); - m3 = _mm_loadu_ps(&in[16 * i + 12]); - m4 = _mm_load1_ps(&scale); - - /* Scale */ - m0 = _mm_mul_ps(m0, m4); - m1 = _mm_mul_ps(m1, m4); - m2 = _mm_mul_ps(m2, m4); - m3 = _mm_mul_ps(m3, m4); - - /* Convert */ - m5 = _mm_cvtps_epi32(m0); - m6 = _mm_cvtps_epi32(m1); - m7 = _mm_cvtps_epi32(m2); - m8 = _mm_cvtps_epi32(m3); - - /* Pack and store */ - m5 = _mm_packs_epi32(m5, m6); - m7 = _mm_packs_epi32(m7, m8); - _mm_storeu_si128((__m128i *) &out[16 * i + 0], m5); - _mm_storeu_si128((__m128i *) &out[16 * i + 8], m7); - } -} -#endif - void convert_init(void) { c.convert_scale_ps_si16_16n = base_convert_float_short; |