aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Philipp Maier <pmaier@sysmocom.de> 2017-03-15 12:39:25 +0100
committer Tom Tsou <tom@tsou.cc> 2017-05-02 17:17:57 +0000
commit 131f82bfacc1b677a87e66746208e53fba40ff41 (patch)
tree 8a4845c8b9f1fc3aa52bd354251da0f0dd9d45ee
parent 78b5627fa1c911713a776e4aa1cb2d8f3a04bd8f (diff)
cosmetic: Make parameter lists uniform
The non-SSE implementation and the SSE implementation of the convert and convolve functions have different parameter lists. This makes it difficult to use function pointers to select the right function depending on the SSE level and CPU. This commit makes the parameter lists uniform in preparation for planned runtime CPU detection support.

Change-Id: Ice063b89791537c4b591751f12f5ef5c413a2d27
-rw-r--r-- Transceiver52M/x86/convert.c 22
-rw-r--r-- Transceiver52M/x86/convolve.c 231
2 files changed, 143 insertions, 110 deletions
diff --git a/Transceiver52M/x86/convert.c b/Transceiver52M/x86/convert.c
index eafe7b2..862a2e7 100644
--- a/Transceiver52M/x86/convert.c
+++ b/Transceiver52M/x86/convert.c
@@ -176,26 +176,34 @@ static void convert_si16_ps(float *out, const short *in, int len)
void convert_float_short(short *out, const float *in, float scale, int len)
{
+ void (*conv_func)(short *, const float *, float, int);
+
#ifdef HAVE_SSE3
if (!(len % 16))
- _sse_convert_scale_ps_si16_16n(out, in, scale, len);
+ conv_func = _sse_convert_scale_ps_si16_16n;
else if (!(len % 8))
- _sse_convert_scale_ps_si16_8n(out, in, scale, len);
+ conv_func = _sse_convert_scale_ps_si16_8n;
else
- _sse_convert_scale_ps_si16(out, in, scale, len);
+ conv_func = _sse_convert_scale_ps_si16;
#else
- convert_scale_ps_si16(out, in, scale, len);
+ conv_func = convert_scale_ps_si16;
#endif
+
+ conv_func(out, in, scale, len);
}
void convert_short_float(float *out, const short *in, int len)
{
+ void (*conv_func) (float *, const short *, int);
+
#ifdef HAVE_SSE4_1
if (!(len % 16))
- _sse_convert_si16_ps_16n(out, in, len);
+ conv_func = _sse_convert_si16_ps_16n;
else
- _sse_convert_si16_ps(out, in, len);
+ conv_func = _sse_convert_si16_ps;
#else
- convert_si16_ps(out, in, len);
+ conv_func = convert_si16_ps;
#endif
+
+ conv_func(out, in, len);
}
diff --git a/Transceiver52M/x86/convolve.c b/Transceiver52M/x86/convolve.c
index 04923bc..e2a1dea 100644
--- a/Transceiver52M/x86/convolve.c
+++ b/Transceiver52M/x86/convolve.c
@@ -47,13 +47,21 @@ int bounds_check(int x_len, int h_len, int y_len,
#include <pmmintrin.h>
/* 4-tap SSE complex-real convolution */
-static void sse_conv_real4(const float *restrict x,
- const float *restrict h,
- float *restrict y,
- int len)
+static void sse_conv_real4(const float *x, int x_len,
+ const float *h, int h_len,
+ float *y, int y_len,
+ int start, int len,
+ int step, int offset)
{
+ /* NOTE: The parameter list of this function has to match the parameter
+ * list of _base_convolve_real() in convolve_base.c. This specific
+ * implementation ignores some of the parameters of
+ * _base_convolve_real(), which are: x_len, y_len, offset, step. */
+
__m128 m0, m1, m2, m3, m4, m5, m6, m7;
+ const float *_x = &x[2 * (-(h_len - 1) + start)];
+
/* Load (aligned) filter taps */
m0 = _mm_load_ps(&h[0]);
m1 = _mm_load_ps(&h[4]);
@@ -61,8 +69,8 @@ static void sse_conv_real4(const float *restrict x,
for (int i = 0; i < len; i++) {
/* Load (unaligned) input data */
- m0 = _mm_loadu_ps(&x[2 * i + 0]);
- m1 = _mm_loadu_ps(&x[2 * i + 4]);
+ m0 = _mm_loadu_ps(&_x[2 * i + 0]);
+ m1 = _mm_loadu_ps(&_x[2 * i + 4]);
m2 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
m3 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
@@ -81,13 +89,18 @@ static void sse_conv_real4(const float *restrict x,
}
/* 8-tap SSE complex-real convolution */
-static void sse_conv_real8(const float *restrict x,
- const float *restrict h,
- float *restrict y,
- int len)
+static void sse_conv_real8(const float *x, int x_len,
+ const float *h, int h_len,
+ float *y, int y_len,
+ int start, int len,
+ int step, int offset)
{
+ /* See NOTE in sse_conv_real4() */
+
__m128 m0, m1, m2, m3, m4, m5, m6, m7, m8, m9;
+ const float *_x = &x[2 * (-(h_len - 1) + start)];
+
/* Load (aligned) filter taps */
m0 = _mm_load_ps(&h[0]);
m1 = _mm_load_ps(&h[4]);
@@ -99,10 +112,10 @@ static void sse_conv_real8(const float *restrict x,
for (int i = 0; i < len; i++) {
/* Load (unaligned) input data */
- m0 = _mm_loadu_ps(&x[2 * i + 0]);
- m1 = _mm_loadu_ps(&x[2 * i + 4]);
- m2 = _mm_loadu_ps(&x[2 * i + 8]);
- m3 = _mm_loadu_ps(&x[2 * i + 12]);
+ m0 = _mm_loadu_ps(&_x[2 * i + 0]);
+ m1 = _mm_loadu_ps(&_x[2 * i + 4]);
+ m2 = _mm_loadu_ps(&_x[2 * i + 8]);
+ m3 = _mm_loadu_ps(&_x[2 * i + 12]);
m6 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
m7 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
@@ -128,14 +141,19 @@ static void sse_conv_real8(const float *restrict x,
}
/* 12-tap SSE complex-real convolution */
-static void sse_conv_real12(const float *restrict x,
- const float *restrict h,
- float *restrict y,
- int len)
+static void sse_conv_real12(const float *x, int x_len,
+ const float *h, int h_len,
+ float *y, int y_len,
+ int start, int len,
+ int step, int offset)
{
+ /* See NOTE in sse_conv_real4() */
+
__m128 m0, m1, m2, m3, m4, m5, m6, m7;
__m128 m8, m9, m10, m11, m12, m13, m14;
+ const float *_x = &x[2 * (-(h_len - 1) + start)];
+
/* Load (aligned) filter taps */
m0 = _mm_load_ps(&h[0]);
m1 = _mm_load_ps(&h[4]);
@@ -150,18 +168,18 @@ static void sse_conv_real12(const float *restrict x,
for (int i = 0; i < len; i++) {
/* Load (unaligned) input data */
- m0 = _mm_loadu_ps(&x[2 * i + 0]);
- m1 = _mm_loadu_ps(&x[2 * i + 4]);
- m2 = _mm_loadu_ps(&x[2 * i + 8]);
- m3 = _mm_loadu_ps(&x[2 * i + 12]);
+ m0 = _mm_loadu_ps(&_x[2 * i + 0]);
+ m1 = _mm_loadu_ps(&_x[2 * i + 4]);
+ m2 = _mm_loadu_ps(&_x[2 * i + 8]);
+ m3 = _mm_loadu_ps(&_x[2 * i + 12]);
m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
m5 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
m6 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
m7 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(1, 3, 1, 3));
- m0 = _mm_loadu_ps(&x[2 * i + 16]);
- m1 = _mm_loadu_ps(&x[2 * i + 20]);
+ m0 = _mm_loadu_ps(&_x[2 * i + 16]);
+ m1 = _mm_loadu_ps(&_x[2 * i + 20]);
m8 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
m9 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
@@ -190,14 +208,19 @@ static void sse_conv_real12(const float *restrict x,
}
/* 16-tap SSE complex-real convolution */
-static void sse_conv_real16(const float *restrict x,
- const float *restrict h,
- float *restrict y,
- int len)
+static void sse_conv_real16(const float *x, int x_len,
+ const float *h, int h_len,
+ float *y, int y_len,
+ int start, int len,
+ int step, int offset)
{
+ /* See NOTE in sse_conv_real4() */
+
__m128 m0, m1, m2, m3, m4, m5, m6, m7;
__m128 m8, m9, m10, m11, m12, m13, m14, m15;
+ const float *_x = &x[2 * (-(h_len - 1) + start)];
+
/* Load (aligned) filter taps */
m0 = _mm_load_ps(&h[0]);
m1 = _mm_load_ps(&h[4]);
@@ -216,20 +239,20 @@ static void sse_conv_real16(const float *restrict x,
for (int i = 0; i < len; i++) {
/* Load (unaligned) input data */
- m0 = _mm_loadu_ps(&x[2 * i + 0]);
- m1 = _mm_loadu_ps(&x[2 * i + 4]);
- m2 = _mm_loadu_ps(&x[2 * i + 8]);
- m3 = _mm_loadu_ps(&x[2 * i + 12]);
+ m0 = _mm_loadu_ps(&_x[2 * i + 0]);
+ m1 = _mm_loadu_ps(&_x[2 * i + 4]);
+ m2 = _mm_loadu_ps(&_x[2 * i + 8]);
+ m3 = _mm_loadu_ps(&_x[2 * i + 12]);
m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
m5 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
m6 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
m7 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(1, 3, 1, 3));
- m0 = _mm_loadu_ps(&x[2 * i + 16]);
- m1 = _mm_loadu_ps(&x[2 * i + 20]);
- m2 = _mm_loadu_ps(&x[2 * i + 24]);
- m3 = _mm_loadu_ps(&x[2 * i + 28]);
+ m0 = _mm_loadu_ps(&_x[2 * i + 16]);
+ m1 = _mm_loadu_ps(&_x[2 * i + 20]);
+ m2 = _mm_loadu_ps(&_x[2 * i + 24]);
+ m3 = _mm_loadu_ps(&_x[2 * i + 28]);
m8 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
m9 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
@@ -265,14 +288,19 @@ static void sse_conv_real16(const float *restrict x,
}
/* 20-tap SSE complex-real convolution */
-static void sse_conv_real20(const float *restrict x,
- const float *restrict h,
- float *restrict y,
- int len)
+static void sse_conv_real20(const float *x, int x_len,
+ const float *h, int h_len,
+ float *y, int y_len,
+ int start, int len,
+ int step, int offset)
{
+ /* See NOTE in sse_conv_real4() */
+
__m128 m0, m1, m2, m3, m4, m5, m6, m7;
__m128 m8, m9, m11, m12, m13, m14, m15;
+ const float *_x = &x[2 * (-(h_len - 1) + start)];
+
/* Load (aligned) filter taps */
m0 = _mm_load_ps(&h[0]);
m1 = _mm_load_ps(&h[4]);
@@ -293,12 +321,12 @@ static void sse_conv_real20(const float *restrict x,
for (int i = 0; i < len; i++) {
/* Multiply-accumulate first 12 taps */
- m0 = _mm_loadu_ps(&x[2 * i + 0]);
- m1 = _mm_loadu_ps(&x[2 * i + 4]);
- m2 = _mm_loadu_ps(&x[2 * i + 8]);
- m3 = _mm_loadu_ps(&x[2 * i + 12]);
- m4 = _mm_loadu_ps(&x[2 * i + 16]);
- m5 = _mm_loadu_ps(&x[2 * i + 20]);
+ m0 = _mm_loadu_ps(&_x[2 * i + 0]);
+ m1 = _mm_loadu_ps(&_x[2 * i + 4]);
+ m2 = _mm_loadu_ps(&_x[2 * i + 8]);
+ m3 = _mm_loadu_ps(&_x[2 * i + 12]);
+ m4 = _mm_loadu_ps(&_x[2 * i + 16]);
+ m5 = _mm_loadu_ps(&_x[2 * i + 20]);
m6 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
m7 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
@@ -320,10 +348,10 @@ static void sse_conv_real20(const float *restrict x,
m9 = _mm_add_ps(m1, m7);
/* Multiply-accumulate last 8 taps */
- m0 = _mm_loadu_ps(&x[2 * i + 24]);
- m1 = _mm_loadu_ps(&x[2 * i + 28]);
- m2 = _mm_loadu_ps(&x[2 * i + 32]);
- m3 = _mm_loadu_ps(&x[2 * i + 36]);
+ m0 = _mm_loadu_ps(&_x[2 * i + 24]);
+ m1 = _mm_loadu_ps(&_x[2 * i + 28]);
+ m2 = _mm_loadu_ps(&_x[2 * i + 32]);
+ m3 = _mm_loadu_ps(&_x[2 * i + 36]);
m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
m5 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
@@ -351,13 +379,18 @@ static void sse_conv_real20(const float *restrict x,
}
/* 4*N-tap SSE complex-real convolution */
-static void sse_conv_real4n(const float *x,
- const float *h,
- float *y,
- int h_len, int len)
+static void sse_conv_real4n(const float *x, int x_len,
+ const float *h, int h_len,
+ float *y, int y_len,
+ int start, int len,
+ int step, int offset)
{
+ /* See NOTE in sse_conv_real4() */
+
__m128 m0, m1, m2, m4, m5, m6, m7;
+ const float *_x = &x[2 * (-(h_len - 1) + start)];
+
for (int i = 0; i < len; i++) {
/* Zero */
m6 = _mm_setzero_ps();
@@ -370,8 +403,8 @@ static void sse_conv_real4n(const float *x,
m2 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
/* Load (unaligned) input data */
- m0 = _mm_loadu_ps(&x[2 * i + 8 * n + 0]);
- m1 = _mm_loadu_ps(&x[2 * i + 8 * n + 4]);
+ m0 = _mm_loadu_ps(&_x[2 * i + 8 * n + 0]);
+ m1 = _mm_loadu_ps(&_x[2 * i + 8 * n + 4]);
m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
m5 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
@@ -394,13 +427,21 @@ static void sse_conv_real4n(const float *x,
}
/* 4*N-tap SSE complex-complex convolution */
-static void sse_conv_cmplx_4n(const float *x,
- const float *h,
- float *y,
- int h_len, int len)
+static void sse_conv_cmplx_4n(const float *x, int x_len,
+ const float *h, int h_len,
+ float *y, int y_len,
+ int start, int len,
+ int step, int offset)
{
+ /* NOTE: The parameter list of this function has to match the parameter
+ * list of _base_convolve_complex() in convolve_base.c. This specific
+ * implementation ignores some of the parameters of
+ * _base_convolve_complex(), which are: x_len, y_len, offset, step. */
+
__m128 m0, m1, m2, m3, m4, m5, m6, m7;
+ const float *_x = &x[2 * (-(h_len - 1) + start)];
+
for (int i = 0; i < len; i++) {
/* Zero */
m6 = _mm_setzero_ps();
@@ -414,8 +455,8 @@ static void sse_conv_cmplx_4n(const float *x,
m3 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
/* Load (unaligned) input data */
- m0 = _mm_loadu_ps(&x[2 * i + 8 * n + 0]);
- m1 = _mm_loadu_ps(&x[2 * i + 8 * n + 4]);
+ m0 = _mm_loadu_ps(&_x[2 * i + 8 * n + 0]);
+ m1 = _mm_loadu_ps(&_x[2 * i + 8 * n + 4]);
m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
m5 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
@@ -445,14 +486,19 @@ static void sse_conv_cmplx_4n(const float *x,
}
/* 8*N-tap SSE complex-complex convolution */
-static void sse_conv_cmplx_8n(const float *x,
- const float *h,
- float *y,
- int h_len, int len)
+static void sse_conv_cmplx_8n(const float *x, int x_len,
+ const float *h, int h_len,
+ float *y, int y_len,
+ int start, int len,
+ int step, int offset)
{
+ /* See NOTE in sse_conv_cmplx_4n() */
+
__m128 m0, m1, m2, m3, m4, m5, m6, m7;
__m128 m8, m9, m10, m11, m12, m13, m14, m15;
+ const float *_x = &x[2 * (-(h_len - 1) + start)];
+
for (int i = 0; i < len; i++) {
/* Zero */
m12 = _mm_setzero_ps();
@@ -473,10 +519,10 @@ static void sse_conv_cmplx_8n(const float *x,
m7 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(1, 3, 1, 3));
/* Load (unaligned) input data */
- m0 = _mm_loadu_ps(&x[2 * i + 16 * n + 0]);
- m1 = _mm_loadu_ps(&x[2 * i + 16 * n + 4]);
- m2 = _mm_loadu_ps(&x[2 * i + 16 * n + 8]);
- m3 = _mm_loadu_ps(&x[2 * i + 16 * n + 12]);
+ m0 = _mm_loadu_ps(&_x[2 * i + 16 * n + 0]);
+ m1 = _mm_loadu_ps(&_x[2 * i + 16 * n + 4]);
+ m2 = _mm_loadu_ps(&_x[2 * i + 16 * n + 8]);
+ m3 = _mm_loadu_ps(&_x[2 * i + 16 * n + 12]);
m8 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
m9 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
@@ -522,14 +568,10 @@ static void sse_conv_cmplx_8n(const float *x,
/* API: Aligned complex-real */
int convolve_real(const float *x, int x_len,
const float *h, int h_len,
- float *y, int y_len,
- int start, int len,
- int step, int offset)
+ float *y, int y_len, int start, int len, int step, int offset)
{
- void (*conv_func)(const float *, const float *,
- float *, int) = NULL;
- void (*conv_func_n)(const float *, const float *,
- float *, int, int) = NULL;
+ void (*conv_func) (const float *, int, const float *, int, float *, int,
+ int, int, int, int) = (void *)_base_convolve_real;
if (bounds_check(x_len, h_len, y_len, start, len, step) < 0)
return -1;
@@ -556,22 +598,12 @@ int convolve_real(const float *x, int x_len,
break;
default:
if (!(h_len % 4))
- conv_func_n = sse_conv_real4n;
+ conv_func = sse_conv_real4n;
}
}
#endif
- if (conv_func) {
- conv_func(&x[2 * (-(h_len - 1) + start)],
- h, y, len);
- } else if (conv_func_n) {
- conv_func_n(&x[2 * (-(h_len - 1) + start)],
- h, y, h_len, len);
- } else {
- _base_convolve_real(x, x_len,
- h, h_len,
- y, y_len,
- start, len, step, offset);
- }
+
+ conv_func(x, x_len, h, h_len, y, y_len, start, len, step, offset);
return len;
}
@@ -580,11 +612,11 @@ int convolve_real(const float *x, int x_len,
int convolve_complex(const float *x, int x_len,
const float *h, int h_len,
float *y, int y_len,
- int start, int len,
- int step, int offset)
+ int start, int len, int step, int offset)
{
- void (*conv_func)(const float *, const float *,
- float *, int, int) = NULL;
+ void (*conv_func) (const float *, int, const float *, int, float *, int,
+ int, int, int, int) =
+ (void *)_base_convolve_complex;
if (bounds_check(x_len, h_len, y_len, start, len, step) < 0)
return -1;
@@ -599,15 +631,8 @@ int convolve_complex(const float *x, int x_len,
conv_func = sse_conv_cmplx_4n;
}
#endif
- if (conv_func) {
- conv_func(&x[2 * (-(h_len - 1) + start)],
- h, y, h_len, len);
- } else {
- _base_convolve_complex(x, x_len,
- h, h_len,
- y, y_len,
- start, len, step, offset);
- }
+
+ conv_func(x, x_len, h, h_len, y, y_len, start, len, step, offset);
return len;
}