aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r-- Transceiver52M/x86/convert.c 22
-rw-r--r-- Transceiver52M/x86/convolve.c 231
2 files changed, 143 insertions, 110 deletions
diff --git a/Transceiver52M/x86/convert.c b/Transceiver52M/x86/convert.c
index eafe7b2..862a2e7 100644
--- a/Transceiver52M/x86/convert.c
+++ b/Transceiver52M/x86/convert.c
@@ -176,26 +176,34 @@ static void convert_si16_ps(float *out, const short *in, int len)
void convert_float_short(short *out, const float *in, float scale, int len)
{
+ void (*conv_func)(short *, const float *, float, int);
+
#ifdef HAVE_SSE3
if (!(len % 16))
- _sse_convert_scale_ps_si16_16n(out, in, scale, len);
+ conv_func = _sse_convert_scale_ps_si16_16n;
else if (!(len % 8))
- _sse_convert_scale_ps_si16_8n(out, in, scale, len);
+ conv_func = _sse_convert_scale_ps_si16_8n;
else
- _sse_convert_scale_ps_si16(out, in, scale, len);
+ conv_func = _sse_convert_scale_ps_si16;
#else
- convert_scale_ps_si16(out, in, scale, len);
+ conv_func = convert_scale_ps_si16;
#endif
+
+ conv_func(out, in, scale, len);
}
void convert_short_float(float *out, const short *in, int len)
{
+ void (*conv_func) (float *, const short *, int);
+
#ifdef HAVE_SSE4_1
if (!(len % 16))
- _sse_convert_si16_ps_16n(out, in, len);
+ conv_func = _sse_convert_si16_ps_16n;
else
- _sse_convert_si16_ps(out, in, len);
+ conv_func = _sse_convert_si16_ps;
#else
- convert_si16_ps(out, in, len);
+ conv_func = convert_si16_ps;
#endif
+
+ conv_func(out, in, len);
}
diff --git a/Transceiver52M/x86/convolve.c b/Transceiver52M/x86/convolve.c
index 04923bc..e2a1dea 100644
--- a/Transceiver52M/x86/convolve.c
+++ b/Transceiver52M/x86/convolve.c
@@ -47,13 +47,21 @@ int bounds_check(int x_len, int h_len, int y_len,
#include <pmmintrin.h>
/* 4-tap SSE complex-real convolution */
-static void sse_conv_real4(const float *restrict x,
- const float *restrict h,
- float *restrict y,
- int len)
+static void sse_conv_real4(const float *x, int x_len,
+ const float *h, int h_len,
+ float *y, int y_len,
+ int start, int len,
+ int step, int offset)
{
+ /* NOTE: The parameter list of this function has to match the parameter
+ * list of _base_convolve_real() in convolve_base.c. This specific
+ * implementation ignores some of the parameters of
+ * _base_convolve_real(), which are: x_len, y_len, offset, step. */
+
__m128 m0, m1, m2, m3, m4, m5, m6, m7;
+ const float *_x = &x[2 * (-(h_len - 1) + start)];
+
/* Load (aligned) filter taps */
m0 = _mm_load_ps(&h[0]);
m1 = _mm_load_ps(&h[4]);
@@ -61,8 +69,8 @@ static void sse_conv_real4(const float *restrict x,
for (int i = 0; i < len; i++) {
/* Load (unaligned) input data */
- m0 = _mm_loadu_ps(&x[2 * i + 0]);
- m1 = _mm_loadu_ps(&x[2 * i + 4]);
+ m0 = _mm_loadu_ps(&_x[2 * i + 0]);
+ m1 = _mm_loadu_ps(&_x[2 * i + 4]);
m2 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
m3 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
@@ -81,13 +89,18 @@ static void sse_conv_real4(const float *restrict x,
}
/* 8-tap SSE complex-real convolution */
-static void sse_conv_real8(const float *restrict x,
- const float *restrict h,
- float *restrict y,
- int len)
+static void sse_conv_real8(const float *x, int x_len,
+ const float *h, int h_len,
+ float *y, int y_len,
+ int start, int len,
+ int step, int offset)
{
+ /* See NOTE in sse_conv_real4() */
+
__m128 m0, m1, m2, m3, m4, m5, m6, m7, m8, m9;
+ const float *_x = &x[2 * (-(h_len - 1) + start)];
+
/* Load (aligned) filter taps */
m0 = _mm_load_ps(&h[0]);
m1 = _mm_load_ps(&h[4]);
@@ -99,10 +112,10 @@ static void sse_conv_real8(const float *restrict x,
for (int i = 0; i < len; i++) {
/* Load (unaligned) input data */
- m0 = _mm_loadu_ps(&x[2 * i + 0]);
- m1 = _mm_loadu_ps(&x[2 * i + 4]);
- m2 = _mm_loadu_ps(&x[2 * i + 8]);
- m3 = _mm_loadu_ps(&x[2 * i + 12]);
+ m0 = _mm_loadu_ps(&_x[2 * i + 0]);
+ m1 = _mm_loadu_ps(&_x[2 * i + 4]);
+ m2 = _mm_loadu_ps(&_x[2 * i + 8]);
+ m3 = _mm_loadu_ps(&_x[2 * i + 12]);
m6 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
m7 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
@@ -128,14 +141,19 @@ static void sse_conv_real8(const float *restrict x,
}
/* 12-tap SSE complex-real convolution */
-static void sse_conv_real12(const float *restrict x,
- const float *restrict h,
- float *restrict y,
- int len)
+static void sse_conv_real12(const float *x, int x_len,
+ const float *h, int h_len,
+ float *y, int y_len,
+ int start, int len,
+ int step, int offset)
{
+ /* See NOTE in sse_conv_real4() */
+
__m128 m0, m1, m2, m3, m4, m5, m6, m7;
__m128 m8, m9, m10, m11, m12, m13, m14;
+ const float *_x = &x[2 * (-(h_len - 1) + start)];
+
/* Load (aligned) filter taps */
m0 = _mm_load_ps(&h[0]);
m1 = _mm_load_ps(&h[4]);
@@ -150,18 +168,18 @@ static void sse_conv_real12(const float *restrict x,
for (int i = 0; i < len; i++) {
/* Load (unaligned) input data */
- m0 = _mm_loadu_ps(&x[2 * i + 0]);
- m1 = _mm_loadu_ps(&x[2 * i + 4]);
- m2 = _mm_loadu_ps(&x[2 * i + 8]);
- m3 = _mm_loadu_ps(&x[2 * i + 12]);
+ m0 = _mm_loadu_ps(&_x[2 * i + 0]);
+ m1 = _mm_loadu_ps(&_x[2 * i + 4]);
+ m2 = _mm_loadu_ps(&_x[2 * i + 8]);
+ m3 = _mm_loadu_ps(&_x[2 * i + 12]);
m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
m5 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
m6 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
m7 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(1, 3, 1, 3));
- m0 = _mm_loadu_ps(&x[2 * i + 16]);
- m1 = _mm_loadu_ps(&x[2 * i + 20]);
+ m0 = _mm_loadu_ps(&_x[2 * i + 16]);
+ m1 = _mm_loadu_ps(&_x[2 * i + 20]);
m8 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
m9 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
@@ -190,14 +208,19 @@ static void sse_conv_real12(const float *restrict x,
}
/* 16-tap SSE complex-real convolution */
-static void sse_conv_real16(const float *restrict x,
- const float *restrict h,
- float *restrict y,
- int len)
+static void sse_conv_real16(const float *x, int x_len,
+ const float *h, int h_len,
+ float *y, int y_len,
+ int start, int len,
+ int step, int offset)
{
+ /* See NOTE in sse_conv_real4() */
+
__m128 m0, m1, m2, m3, m4, m5, m6, m7;
__m128 m8, m9, m10, m11, m12, m13, m14, m15;
+ const float *_x = &x[2 * (-(h_len - 1) + start)];
+
/* Load (aligned) filter taps */
m0 = _mm_load_ps(&h[0]);
m1 = _mm_load_ps(&h[4]);
@@ -216,20 +239,20 @@ static void sse_conv_real16(const float *restrict x,
for (int i = 0; i < len; i++) {
/* Load (unaligned) input data */
- m0 = _mm_loadu_ps(&x[2 * i + 0]);
- m1 = _mm_loadu_ps(&x[2 * i + 4]);
- m2 = _mm_loadu_ps(&x[2 * i + 8]);
- m3 = _mm_loadu_ps(&x[2 * i + 12]);
+ m0 = _mm_loadu_ps(&_x[2 * i + 0]);
+ m1 = _mm_loadu_ps(&_x[2 * i + 4]);
+ m2 = _mm_loadu_ps(&_x[2 * i + 8]);
+ m3 = _mm_loadu_ps(&_x[2 * i + 12]);
m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
m5 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
m6 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
m7 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(1, 3, 1, 3));
- m0 = _mm_loadu_ps(&x[2 * i + 16]);
- m1 = _mm_loadu_ps(&x[2 * i + 20]);
- m2 = _mm_loadu_ps(&x[2 * i + 24]);
- m3 = _mm_loadu_ps(&x[2 * i + 28]);
+ m0 = _mm_loadu_ps(&_x[2 * i + 16]);
+ m1 = _mm_loadu_ps(&_x[2 * i + 20]);
+ m2 = _mm_loadu_ps(&_x[2 * i + 24]);
+ m3 = _mm_loadu_ps(&_x[2 * i + 28]);
m8 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
m9 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
@@ -265,14 +288,19 @@ static void sse_conv_real16(const float *restrict x,
}
/* 20-tap SSE complex-real convolution */
-static void sse_conv_real20(const float *restrict x,
- const float *restrict h,
- float *restrict y,
- int len)
+static void sse_conv_real20(const float *x, int x_len,
+ const float *h, int h_len,
+ float *y, int y_len,
+ int start, int len,
+ int step, int offset)
{
+ /* See NOTE in sse_conv_real4() */
+
__m128 m0, m1, m2, m3, m4, m5, m6, m7;
__m128 m8, m9, m11, m12, m13, m14, m15;
+ const float *_x = &x[2 * (-(h_len - 1) + start)];
+
/* Load (aligned) filter taps */
m0 = _mm_load_ps(&h[0]);
m1 = _mm_load_ps(&h[4]);
@@ -293,12 +321,12 @@ static void sse_conv_real20(const float *restrict x,
for (int i = 0; i < len; i++) {
/* Multiply-accumulate first 12 taps */
- m0 = _mm_loadu_ps(&x[2 * i + 0]);
- m1 = _mm_loadu_ps(&x[2 * i + 4]);
- m2 = _mm_loadu_ps(&x[2 * i + 8]);
- m3 = _mm_loadu_ps(&x[2 * i + 12]);
- m4 = _mm_loadu_ps(&x[2 * i + 16]);
- m5 = _mm_loadu_ps(&x[2 * i + 20]);
+ m0 = _mm_loadu_ps(&_x[2 * i + 0]);
+ m1 = _mm_loadu_ps(&_x[2 * i + 4]);
+ m2 = _mm_loadu_ps(&_x[2 * i + 8]);
+ m3 = _mm_loadu_ps(&_x[2 * i + 12]);
+ m4 = _mm_loadu_ps(&_x[2 * i + 16]);
+ m5 = _mm_loadu_ps(&_x[2 * i + 20]);
m6 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
m7 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
@@ -320,10 +348,10 @@ static void sse_conv_real20(const float *restrict x,
m9 = _mm_add_ps(m1, m7);
/* Multiply-accumulate last 8 taps */
- m0 = _mm_loadu_ps(&x[2 * i + 24]);
- m1 = _mm_loadu_ps(&x[2 * i + 28]);
- m2 = _mm_loadu_ps(&x[2 * i + 32]);
- m3 = _mm_loadu_ps(&x[2 * i + 36]);
+ m0 = _mm_loadu_ps(&_x[2 * i + 24]);
+ m1 = _mm_loadu_ps(&_x[2 * i + 28]);
+ m2 = _mm_loadu_ps(&_x[2 * i + 32]);
+ m3 = _mm_loadu_ps(&_x[2 * i + 36]);
m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
m5 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
@@ -351,13 +379,18 @@ static void sse_conv_real20(const float *restrict x,
}
/* 4*N-tap SSE complex-real convolution */
-static void sse_conv_real4n(const float *x,
- const float *h,
- float *y,
- int h_len, int len)
+static void sse_conv_real4n(const float *x, int x_len,
+ const float *h, int h_len,
+ float *y, int y_len,
+ int start, int len,
+ int step, int offset)
{
+ /* See NOTE in sse_conv_real4() */
+
__m128 m0, m1, m2, m4, m5, m6, m7;
+ const float *_x = &x[2 * (-(h_len - 1) + start)];
+
for (int i = 0; i < len; i++) {
/* Zero */
m6 = _mm_setzero_ps();
@@ -370,8 +403,8 @@ static void sse_conv_real4n(const float *x,
m2 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
/* Load (unaligned) input data */
- m0 = _mm_loadu_ps(&x[2 * i + 8 * n + 0]);
- m1 = _mm_loadu_ps(&x[2 * i + 8 * n + 4]);
+ m0 = _mm_loadu_ps(&_x[2 * i + 8 * n + 0]);
+ m1 = _mm_loadu_ps(&_x[2 * i + 8 * n + 4]);
m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
m5 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
@@ -394,13 +427,21 @@ static void sse_conv_real4n(const float *x,
}
/* 4*N-tap SSE complex-complex convolution */
-static void sse_conv_cmplx_4n(const float *x,
- const float *h,
- float *y,
- int h_len, int len)
+static void sse_conv_cmplx_4n(const float *x, int x_len,
+ const float *h, int h_len,
+ float *y, int y_len,
+ int start, int len,
+ int step, int offset)
{
+ /* NOTE: The parameter list of this function has to match the parameter
+ * list of _base_convolve_complex() in convolve_base.c. This specific
+ * implementation ignores some of the parameters of
+ * _base_convolve_complex(), which are: x_len, y_len, offset, step. */
+
__m128 m0, m1, m2, m3, m4, m5, m6, m7;
+ const float *_x = &x[2 * (-(h_len - 1) + start)];
+
for (int i = 0; i < len; i++) {
/* Zero */
m6 = _mm_setzero_ps();
@@ -414,8 +455,8 @@ static void sse_conv_cmplx_4n(const float *x,
m3 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
/* Load (unaligned) input data */
- m0 = _mm_loadu_ps(&x[2 * i + 8 * n + 0]);
- m1 = _mm_loadu_ps(&x[2 * i + 8 * n + 4]);
+ m0 = _mm_loadu_ps(&_x[2 * i + 8 * n + 0]);
+ m1 = _mm_loadu_ps(&_x[2 * i + 8 * n + 4]);
m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
m5 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
@@ -445,14 +486,19 @@ static void sse_conv_cmplx_4n(const float *x,
}
/* 8*N-tap SSE complex-complex convolution */
-static void sse_conv_cmplx_8n(const float *x,
- const float *h,
- float *y,
- int h_len, int len)
+static void sse_conv_cmplx_8n(const float *x, int x_len,
+ const float *h, int h_len,
+ float *y, int y_len,
+ int start, int len,
+ int step, int offset)
{
+ /* See NOTE in sse_conv_cmplx_4n() */
+
__m128 m0, m1, m2, m3, m4, m5, m6, m7;
__m128 m8, m9, m10, m11, m12, m13, m14, m15;
+ const float *_x = &x[2 * (-(h_len - 1) + start)];
+
for (int i = 0; i < len; i++) {
/* Zero */
m12 = _mm_setzero_ps();
@@ -473,10 +519,10 @@ static void sse_conv_cmplx_8n(const float *x,
m7 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(1, 3, 1, 3));
/* Load (unaligned) input data */
- m0 = _mm_loadu_ps(&x[2 * i + 16 * n + 0]);
- m1 = _mm_loadu_ps(&x[2 * i + 16 * n + 4]);
- m2 = _mm_loadu_ps(&x[2 * i + 16 * n + 8]);
- m3 = _mm_loadu_ps(&x[2 * i + 16 * n + 12]);
+ m0 = _mm_loadu_ps(&_x[2 * i + 16 * n + 0]);
+ m1 = _mm_loadu_ps(&_x[2 * i + 16 * n + 4]);
+ m2 = _mm_loadu_ps(&_x[2 * i + 16 * n + 8]);
+ m3 = _mm_loadu_ps(&_x[2 * i + 16 * n + 12]);
m8 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
m9 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
@@ -522,14 +568,10 @@ static void sse_conv_cmplx_8n(const float *x,
/* API: Aligned complex-real */
int convolve_real(const float *x, int x_len,
const float *h, int h_len,
- float *y, int y_len,
- int start, int len,
- int step, int offset)
+ float *y, int y_len, int start, int len, int step, int offset)
{
- void (*conv_func)(const float *, const float *,
- float *, int) = NULL;
- void (*conv_func_n)(const float *, const float *,
- float *, int, int) = NULL;
+ void (*conv_func) (const float *, int, const float *, int, float *, int,
+ int, int, int, int) = (void *)_base_convolve_real;
if (bounds_check(x_len, h_len, y_len, start, len, step) < 0)
return -1;
@@ -556,22 +598,12 @@ int convolve_real(const float *x, int x_len,
break;
default:
if (!(h_len % 4))
- conv_func_n = sse_conv_real4n;
+ conv_func = sse_conv_real4n;
}
}
#endif
- if (conv_func) {
- conv_func(&x[2 * (-(h_len - 1) + start)],
- h, y, len);
- } else if (conv_func_n) {
- conv_func_n(&x[2 * (-(h_len - 1) + start)],
- h, y, h_len, len);
- } else {
- _base_convolve_real(x, x_len,
- h, h_len,
- y, y_len,
- start, len, step, offset);
- }
+
+ conv_func(x, x_len, h, h_len, y, y_len, start, len, step, offset);
return len;
}
@@ -580,11 +612,11 @@ int convolve_real(const float *x, int x_len,
int convolve_complex(const float *x, int x_len,
const float *h, int h_len,
float *y, int y_len,
- int start, int len,
- int step, int offset)
+ int start, int len, int step, int offset)
{
- void (*conv_func)(const float *, const float *,
- float *, int, int) = NULL;
+ void (*conv_func) (const float *, int, const float *, int, float *, int,
+ int, int, int, int) =
+ (void *)_base_convolve_complex;
if (bounds_check(x_len, h_len, y_len, start, len, step) < 0)
return -1;
@@ -599,15 +631,8 @@ int convolve_complex(const float *x, int x_len,
conv_func = sse_conv_cmplx_4n;
}
#endif
- if (conv_func) {
- conv_func(&x[2 * (-(h_len - 1) + start)],
- h, y, h_len, len);
- } else {
- _base_convolve_complex(x, x_len,
- h, h_len,
- y, y_len,
- start, len, step, offset);
- }
+
+ conv_func(x, x_len, h, h_len, y, y_len, start, len, step, offset);
return len;
}