aboutsummaryrefslogtreecommitdiffstats
path: root/Transceiver52M/x86/convert.c
diff options
context:
space:
mode:
authorPhilipp Maier <pmaier@sysmocom.de>2017-03-20 12:08:42 +0100
committerTom Tsou <tom@tsou.cc>2017-05-19 17:21:45 +0000
commite8ae9fcf387540f1b210f5ece372d0fd070b6249 (patch)
treea2cea15903fda9795776f35d1cee575b558d0ebc /Transceiver52M/x86/convert.c
parentf5bf33b287d5c4ce13c0ecac91c8d7f128a24eb5 (diff)
buildenv: Split up SSE3 and SSE4.1 code
Currently we find SSE3 and SSE4.1 code mixed together along with generic code in one file. This introduces the risk that the compiler accidentally mixes SSE4.1 instructions into an SSE3, or even worse into a generic code path. This commit splits the SSE3 and SSE4.1 code into separate files and compiles them with the matching target options. Change-Id: I846e190e92f1258cd412d1b2d79b539e204e04b3
Diffstat (limited to 'Transceiver52M/x86/convert.c')
-rw-r--r--Transceiver52M/x86/convert.c136
1 files changed, 2 insertions, 134 deletions
diff --git a/Transceiver52M/x86/convert.c b/Transceiver52M/x86/convert.c
index 3f76b65..db98050 100644
--- a/Transceiver52M/x86/convert.c
+++ b/Transceiver52M/x86/convert.c
@@ -20,6 +20,8 @@
#include <malloc.h>
#include <string.h>
#include "convert.h"
+#include "convert_sse_3.h"
+#include "convert_sse_4_1.h"
#ifdef HAVE_CONFIG_H
#include "config.h"
@@ -36,140 +38,6 @@ struct convert_cpu_context {
static struct convert_cpu_context c;
-#ifdef HAVE_SSE3
-#include <xmmintrin.h>
-#include <emmintrin.h>
-
-#ifdef HAVE_SSE4_1
-#include <smmintrin.h>
-
-/* 16*N 16-bit signed integer converted to single precision floats */
-static void _sse_convert_si16_ps_16n(float *restrict out,
- const short *restrict in,
- int len)
-{
- __m128i m0, m1, m2, m3, m4, m5;
- __m128 m6, m7, m8, m9;
-
- for (int i = 0; i < len / 16; i++) {
- /* Load (unaligned) packed floats */
- m0 = _mm_loadu_si128((__m128i *) &in[16 * i + 0]);
- m1 = _mm_loadu_si128((__m128i *) &in[16 * i + 8]);
-
- /* Unpack */
- m2 = _mm_cvtepi16_epi32(m0);
- m4 = _mm_cvtepi16_epi32(m1);
- m0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1, 0, 3, 2));
- m1 = _mm_shuffle_epi32(m1, _MM_SHUFFLE(1, 0, 3, 2));
- m3 = _mm_cvtepi16_epi32(m0);
- m5 = _mm_cvtepi16_epi32(m1);
-
- /* Convert */
- m6 = _mm_cvtepi32_ps(m2);
- m7 = _mm_cvtepi32_ps(m3);
- m8 = _mm_cvtepi32_ps(m4);
- m9 = _mm_cvtepi32_ps(m5);
-
- /* Store */
- _mm_storeu_ps(&out[16 * i + 0], m6);
- _mm_storeu_ps(&out[16 * i + 4], m7);
- _mm_storeu_ps(&out[16 * i + 8], m8);
- _mm_storeu_ps(&out[16 * i + 12], m9);
- }
-}
-
-/* 16*N 16-bit signed integer conversion with remainder */
-static void _sse_convert_si16_ps(float *restrict out,
- const short *restrict in,
- int len)
-{
- int start = len / 16 * 16;
-
- _sse_convert_si16_ps_16n(out, in, len);
-
- for (int i = 0; i < len % 16; i++)
- out[start + i] = in[start + i];
-}
-#endif /* HAVE_SSE4_1 */
-
-/* 8*N single precision floats scaled and converted to 16-bit signed integer */
-static void _sse_convert_scale_ps_si16_8n(short *restrict out,
- const float *restrict in,
- float scale, int len)
-{
- __m128 m0, m1, m2;
- __m128i m4, m5;
-
- for (int i = 0; i < len / 8; i++) {
- /* Load (unaligned) packed floats */
- m0 = _mm_loadu_ps(&in[8 * i + 0]);
- m1 = _mm_loadu_ps(&in[8 * i + 4]);
- m2 = _mm_load1_ps(&scale);
-
- /* Scale */
- m0 = _mm_mul_ps(m0, m2);
- m1 = _mm_mul_ps(m1, m2);
-
- /* Convert */
- m4 = _mm_cvtps_epi32(m0);
- m5 = _mm_cvtps_epi32(m1);
-
- /* Pack and store */
- m5 = _mm_packs_epi32(m4, m5);
- _mm_storeu_si128((__m128i *) &out[8 * i], m5);
- }
-}
-
-/* 8*N single precision floats scaled and converted with remainder */
-static void _sse_convert_scale_ps_si16(short *restrict out,
- const float *restrict in,
- float scale, int len)
-{
- int start = len / 8 * 8;
-
- _sse_convert_scale_ps_si16_8n(out, in, scale, len);
-
- for (int i = 0; i < len % 8; i++)
- out[start + i] = in[start + i] * scale;
-}
-
-/* 16*N single precision floats scaled and converted to 16-bit signed integer */
-static void _sse_convert_scale_ps_si16_16n(short *restrict out,
- const float *restrict in,
- float scale, int len)
-{
- __m128 m0, m1, m2, m3, m4;
- __m128i m5, m6, m7, m8;
-
- for (int i = 0; i < len / 16; i++) {
- /* Load (unaligned) packed floats */
- m0 = _mm_loadu_ps(&in[16 * i + 0]);
- m1 = _mm_loadu_ps(&in[16 * i + 4]);
- m2 = _mm_loadu_ps(&in[16 * i + 8]);
- m3 = _mm_loadu_ps(&in[16 * i + 12]);
- m4 = _mm_load1_ps(&scale);
-
- /* Scale */
- m0 = _mm_mul_ps(m0, m4);
- m1 = _mm_mul_ps(m1, m4);
- m2 = _mm_mul_ps(m2, m4);
- m3 = _mm_mul_ps(m3, m4);
-
- /* Convert */
- m5 = _mm_cvtps_epi32(m0);
- m6 = _mm_cvtps_epi32(m1);
- m7 = _mm_cvtps_epi32(m2);
- m8 = _mm_cvtps_epi32(m3);
-
- /* Pack and store */
- m5 = _mm_packs_epi32(m5, m6);
- m7 = _mm_packs_epi32(m7, m8);
- _mm_storeu_si128((__m128i *) &out[16 * i + 0], m5);
- _mm_storeu_si128((__m128i *) &out[16 * i + 8], m7);
- }
-}
-#endif
-
void convert_init(void)
{
c.convert_scale_ps_si16_16n = base_convert_float_short;