diff options
Diffstat (limited to 'Transceiver52M/x86/convert.c')
-rw-r--r-- | Transceiver52M/x86/convert.c | 136 |
1 files changed, 2 insertions, 134 deletions
diff --git a/Transceiver52M/x86/convert.c b/Transceiver52M/x86/convert.c index 3f76b65..db98050 100644 --- a/Transceiver52M/x86/convert.c +++ b/Transceiver52M/x86/convert.c @@ -20,6 +20,8 @@ #include <malloc.h> #include <string.h> #include "convert.h" +#include "convert_sse_3.h" +#include "convert_sse_4_1.h" #ifdef HAVE_CONFIG_H #include "config.h" @@ -36,140 +38,6 @@ struct convert_cpu_context { static struct convert_cpu_context c; -#ifdef HAVE_SSE3 -#include <xmmintrin.h> -#include <emmintrin.h> - -#ifdef HAVE_SSE4_1 -#include <smmintrin.h> - -/* 16*N 16-bit signed integer converted to single precision floats */ -static void _sse_convert_si16_ps_16n(float *restrict out, - const short *restrict in, - int len) -{ - __m128i m0, m1, m2, m3, m4, m5; - __m128 m6, m7, m8, m9; - - for (int i = 0; i < len / 16; i++) { - /* Load (unaligned) packed floats */ - m0 = _mm_loadu_si128((__m128i *) &in[16 * i + 0]); - m1 = _mm_loadu_si128((__m128i *) &in[16 * i + 8]); - - /* Unpack */ - m2 = _mm_cvtepi16_epi32(m0); - m4 = _mm_cvtepi16_epi32(m1); - m0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1, 0, 3, 2)); - m1 = _mm_shuffle_epi32(m1, _MM_SHUFFLE(1, 0, 3, 2)); - m3 = _mm_cvtepi16_epi32(m0); - m5 = _mm_cvtepi16_epi32(m1); - - /* Convert */ - m6 = _mm_cvtepi32_ps(m2); - m7 = _mm_cvtepi32_ps(m3); - m8 = _mm_cvtepi32_ps(m4); - m9 = _mm_cvtepi32_ps(m5); - - /* Store */ - _mm_storeu_ps(&out[16 * i + 0], m6); - _mm_storeu_ps(&out[16 * i + 4], m7); - _mm_storeu_ps(&out[16 * i + 8], m8); - _mm_storeu_ps(&out[16 * i + 12], m9); - } -} - -/* 16*N 16-bit signed integer conversion with remainder */ -static void _sse_convert_si16_ps(float *restrict out, - const short *restrict in, - int len) -{ - int start = len / 16 * 16; - - _sse_convert_si16_ps_16n(out, in, len); - - for (int i = 0; i < len % 16; i++) - out[start + i] = in[start + i]; -} -#endif /* HAVE_SSE4_1 */ - -/* 8*N single precision floats scaled and converted to 16-bit signed integer */ -static void _sse_convert_scale_ps_si16_8n(short *restrict out, - const float *restrict in, - float scale, int len) -{ - __m128 m0, m1, m2; - __m128i m4, m5; - - for (int i = 0; i < len / 8; i++) { - /* Load (unaligned) packed floats */ - m0 = _mm_loadu_ps(&in[8 * i + 0]); - m1 = _mm_loadu_ps(&in[8 * i + 4]); - m2 = _mm_load1_ps(&scale); - - /* Scale */ - m0 = _mm_mul_ps(m0, m2); - m1 = _mm_mul_ps(m1, m2); - - /* Convert */ - m4 = _mm_cvtps_epi32(m0); - m5 = _mm_cvtps_epi32(m1); - - /* Pack and store */ - m5 = _mm_packs_epi32(m4, m5); - _mm_storeu_si128((__m128i *) &out[8 * i], m5); - } -} - -/* 8*N single precision floats scaled and converted with remainder */ -static void _sse_convert_scale_ps_si16(short *restrict out, - const float *restrict in, - float scale, int len) -{ - int start = len / 8 * 8; - - _sse_convert_scale_ps_si16_8n(out, in, scale, len); - - for (int i = 0; i < len % 8; i++) - out[start + i] = in[start + i] * scale; -} - -/* 16*N single precision floats scaled and converted to 16-bit signed integer */ -static void _sse_convert_scale_ps_si16_16n(short *restrict out, - const float *restrict in, - float scale, int len) -{ - __m128 m0, m1, m2, m3, m4; - __m128i m5, m6, m7, m8; - - for (int i = 0; i < len / 16; i++) { - /* Load (unaligned) packed floats */ - m0 = _mm_loadu_ps(&in[16 * i + 0]); - m1 = _mm_loadu_ps(&in[16 * i + 4]); - m2 = _mm_loadu_ps(&in[16 * i + 8]); - m3 = _mm_loadu_ps(&in[16 * i + 12]); - m4 = _mm_load1_ps(&scale); - - /* Scale */ - m0 = _mm_mul_ps(m0, m4); - m1 = _mm_mul_ps(m1, m4); - m2 = _mm_mul_ps(m2, m4); - m3 = _mm_mul_ps(m3, m4); - - /* Convert */ - m5 = _mm_cvtps_epi32(m0); - m6 = _mm_cvtps_epi32(m1); - m7 = _mm_cvtps_epi32(m2); - m8 = _mm_cvtps_epi32(m3); - - /* Pack and store */ - m5 = _mm_packs_epi32(m5, m6); - m7 = _mm_packs_epi32(m7, m8); - _mm_storeu_si128((__m128i *) &out[16 * i + 0], m5); - _mm_storeu_si128((__m128i *) &out[16 * i + 8], m7); - } -} -#endif - void convert_init(void) { c.convert_scale_ps_si16_16n = base_convert_float_short; |