diff options
-rw-r--r-- | Transceiver52M/arm/convolve.c | 7 | ||||
-rw-r--r-- | Transceiver52M/common/convert.h | 1 | ||||
-rw-r--r-- | Transceiver52M/common/convolve.h | 2 | ||||
-rw-r--r-- | Transceiver52M/osmo-trx.cpp | 8 | ||||
-rw-r--r-- | Transceiver52M/x86/convert.c | 67 | ||||
-rw-r--r-- | Transceiver52M/x86/convolve.c | 106 |
6 files changed, 142 insertions, 49 deletions
diff --git a/Transceiver52M/arm/convolve.c b/Transceiver52M/arm/convolve.c index 2b42090..912d0c2 100644 --- a/Transceiver52M/arm/convolve.c +++ b/Transceiver52M/arm/convolve.c @@ -58,6 +58,13 @@ static void neon_conv_cmplx_4n(float *x, float *h, float *y, int h_len, int len) } #endif +/* API: Initialize convolve module */ +void convolve_init(void) +{ + /* Stub */ + return; +} + /* API: Aligned complex-real */ int convolve_real(float *x, int x_len, float *h, int h_len, diff --git a/Transceiver52M/common/convert.h b/Transceiver52M/common/convert.h index 4827c28..1d3a180 100644 --- a/Transceiver52M/common/convert.h +++ b/Transceiver52M/common/convert.h @@ -3,5 +3,6 @@ void convert_float_short(short *out, const float *in, float scale, int len); void convert_short_float(float *out, const short *in, int len); +void convert_init(void); #endif /* _CONVERT_H_ */ diff --git a/Transceiver52M/common/convolve.h b/Transceiver52M/common/convolve.h index 08bda0c..43db577 100644 --- a/Transceiver52M/common/convolve.h +++ b/Transceiver52M/common/convolve.h @@ -27,4 +27,6 @@ int base_convolve_complex(const float *x, int x_len, int start, int len, int step, int offset); +void convolve_init(void); + #endif /* _CONVOLVE_H_ */ diff --git a/Transceiver52M/osmo-trx.cpp b/Transceiver52M/osmo-trx.cpp index b07ffe8..2d35a60 100644 --- a/Transceiver52M/osmo-trx.cpp +++ b/Transceiver52M/osmo-trx.cpp @@ -32,6 +32,11 @@ #include <Logger.h> #include <Configuration.h> +extern "C" { +#include "convolve.h" +#include "convert.h" +} + /* Samples-per-symbol for downlink path * 4 - Uses precision modulator (more computation, less distortion) * 1 - Uses minimized modulator (less computation, more distortion) @@ -422,6 +427,9 @@ int main(int argc, char *argv[]) RadioDevice::InterfaceType iface = RadioDevice::NORMAL; struct trx_config config; + convolve_init(); + convert_init(); + handle_options(argc, argv, &config); setup_signal_handlers(); diff --git a/Transceiver52M/x86/convert.c 
b/Transceiver52M/x86/convert.c index 862a2e7..db1c0fc 100644 --- a/Transceiver52M/x86/convert.c +++ b/Transceiver52M/x86/convert.c @@ -25,6 +25,17 @@ #include "config.h" #endif +/* Architecture-dependent function pointers */ +struct convert_cpu_context { + void (*convert_si16_ps_16n) (float *, const short *, int); + void (*convert_si16_ps) (float *, const short *, int); + void (*convert_scale_ps_si16_16n)(short *, const float *, float, int); + void (*convert_scale_ps_si16_8n)(short *, const float *, float, int); + void (*convert_scale_ps_si16)(short *, const float *, float, int); +}; + +static struct convert_cpu_context c; + #ifdef HAVE_SSE3 #include <xmmintrin.h> #include <emmintrin.h> @@ -157,53 +168,61 @@ static void _sse_convert_scale_ps_si16_16n(short *restrict out, _mm_storeu_si128((__m128i *) &out[16 * i + 8], m7); } } -#else /* HAVE_SSE3 */ +#endif + +__attribute__((optimize("no-tree-vectorize"))) static void convert_scale_ps_si16(short *out, const float *in, float scale, int len) { for (int i = 0; i < len; i++) out[i] = in[i] * scale; } -#endif -#ifndef HAVE_SSE4_1 +__attribute__((optimize("no-tree-vectorize"))) static void convert_si16_ps(float *out, const short *in, int len) { for (int i = 0; i < len; i++) out[i] = in[i]; } -#endif -void convert_float_short(short *out, const float *in, float scale, int len) +void convert_init(void) { - void (*conv_func)(short *, const float *, float, int); + c.convert_scale_ps_si16_16n = convert_scale_ps_si16; + c.convert_scale_ps_si16_8n = convert_scale_ps_si16; + c.convert_scale_ps_si16 = convert_scale_ps_si16; + c.convert_si16_ps_16n = convert_si16_ps; + c.convert_si16_ps = convert_si16_ps; + +#ifdef HAVE_SSE4_1 + if (__builtin_cpu_supports("sse4.1")) { + c.convert_si16_ps_16n = &_sse_convert_si16_ps_16n; + c.convert_si16_ps = &_sse_convert_si16_ps; + } +#endif #ifdef HAVE_SSE3 + if (__builtin_cpu_supports("sse3")) { + c.convert_scale_ps_si16_16n = _sse_convert_scale_ps_si16_16n; + c.convert_scale_ps_si16_8n = 
_sse_convert_scale_ps_si16_8n; + c.convert_scale_ps_si16 = _sse_convert_scale_ps_si16; + } +#endif +} + +void convert_float_short(short *out, const float *in, float scale, int len) +{ if (!(len % 16)) - conv_func = _sse_convert_scale_ps_si16_16n; + c.convert_scale_ps_si16_16n(out, in, scale, len); else if (!(len % 8)) - conv_func = _sse_convert_scale_ps_si16_8n; + c.convert_scale_ps_si16_8n(out, in, scale, len); else - conv_func = _sse_convert_scale_ps_si16; -#else - conv_func = convert_scale_ps_si16; -#endif - - conv_func(out, in, scale, len); + c.convert_scale_ps_si16(out, in, scale, len); } void convert_short_float(float *out, const short *in, int len) { - void (*conv_func) (float *, const short *, int); - -#ifdef HAVE_SSE4_1 if (!(len % 16)) - conv_func = _sse_convert_si16_ps_16n; + c.convert_si16_ps_16n(out, in, len); else - conv_func = _sse_convert_si16_ps; -#else - conv_func = convert_si16_ps; -#endif - - conv_func(out, in, len); + c.convert_si16_ps(out, in, len); } diff --git a/Transceiver52M/x86/convolve.c b/Transceiver52M/x86/convolve.c index e2a1dea..2f3b293 100644 --- a/Transceiver52M/x86/convolve.c +++ b/Transceiver52M/x86/convolve.c @@ -26,6 +26,31 @@ #include "config.h" #endif +/* Architecture-dependent function pointers */ +struct convolve_cpu_context { + void (*conv_cmplx_4n) (const float *, int, const float *, int, float *, + int, int, int, int, int); + void (*conv_cmplx_8n) (const float *, int, const float *, int, float *, + int, int, int, int, int); + void (*conv_cmplx) (const float *, int, const float *, int, float *, + int, int, int, int, int); + void (*conv_real4) (const float *, int, const float *, int, float *, + int, int, int, int, int); + void (*conv_real8) (const float *, int, const float *, int, float *, + int, int, int, int, int); + void (*conv_real12) (const float *, int, const float *, int, float *, + int, int, int, int, int); + void (*conv_real16) (const float *, int, const float *, int, float *, + int, int, int, int, int); + void 
(*conv_real20) (const float *, int, const float *, int, float *, + int, int, int, int, int); + void (*conv_real4n) (const float *, int, const float *, int, float *, + int, int, int, int, int); + void (*conv_real) (const float *, int, const float *, int, float *, int, + int, int, int, int); +}; +static struct convolve_cpu_context c; + /* Forward declarations from base implementation */ int _base_convolve_real(const float *x, int x_len, const float *h, int h_len, @@ -565,45 +590,77 @@ static void sse_conv_cmplx_8n(const float *x, int x_len, } #endif +/* API: Initialize convolve module */ +void convolve_init(void) +{ + c.conv_cmplx_4n = (void *)_base_convolve_complex; + c.conv_cmplx_8n = (void *)_base_convolve_complex; + c.conv_cmplx = (void *)_base_convolve_complex; + c.conv_real4 = (void *)_base_convolve_real; + c.conv_real8 = (void *)_base_convolve_real; + c.conv_real12 = (void *)_base_convolve_real; + c.conv_real16 = (void *)_base_convolve_real; + c.conv_real20 = (void *)_base_convolve_real; + c.conv_real4n = (void *)_base_convolve_real; + c.conv_real = (void *)_base_convolve_real; + +#ifdef HAVE_SSE3 + if (__builtin_cpu_supports("sse3")) { + c.conv_cmplx_4n = sse_conv_cmplx_4n; + c.conv_cmplx_8n = sse_conv_cmplx_8n; + c.conv_real4 = sse_conv_real4; + c.conv_real8 = sse_conv_real8; + c.conv_real12 = sse_conv_real12; + c.conv_real16 = sse_conv_real16; + c.conv_real20 = sse_conv_real20; + c.conv_real4n = sse_conv_real4n; + } +#endif +} + /* API: Aligned complex-real */ int convolve_real(const float *x, int x_len, const float *h, int h_len, float *y, int y_len, int start, int len, int step, int offset) { - void (*conv_func) (const float *, int, const float *, int, float *, int, - int, int, int, int) = (void *)_base_convolve_real; - if (bounds_check(x_len, h_len, y_len, start, len, step) < 0) return -1; memset(y, 0, len * 2 * sizeof(float)); -#ifdef HAVE_SSE3 if (step <= 4) { switch (h_len) { case 4: - conv_func = sse_conv_real4; + c.conv_real4(x, x_len, h, h_len, y, 
y_len, start, len, + step, offset); break; case 8: - conv_func = sse_conv_real8; + c.conv_real8(x, x_len, h, h_len, y, y_len, start, len, + step, offset); break; case 12: - conv_func = sse_conv_real12; + c.conv_real12(x, x_len, h, h_len, y, y_len, start, len, + step, offset); break; case 16: - conv_func = sse_conv_real16; + c.conv_real16(x, x_len, h, h_len, y, y_len, start, len, + step, offset); break; case 20: - conv_func = sse_conv_real20; + c.conv_real20(x, x_len, h, h_len, y, y_len, start, len, + step, offset); break; default: if (!(h_len % 4)) - conv_func = sse_conv_real4n; + c.conv_real4n(x, x_len, h, h_len, y, y_len, + start, len, step, offset); + else + c.conv_real(x, x_len, h, h_len, y, y_len, start, + len, step, offset); } - } -#endif - - conv_func(x, x_len, h, h_len, y, y_len, start, len, step, offset); + } else + c.conv_real(x, x_len, h, h_len, y, y_len, start, len, step, + offset); return len; } @@ -614,25 +671,24 @@ int convolve_complex(const float *x, int x_len, float *y, int y_len, int start, int len, int step, int offset) { - void (*conv_func) (const float *, int, const float *, int, float *, int, - int, int, int, int) = - (void *)_base_convolve_complex; - if (bounds_check(x_len, h_len, y_len, start, len, step) < 0) return -1; memset(y, 0, len * 2 * sizeof(float)); -#ifdef HAVE_SSE3 if (step <= 4) { if (!(h_len % 8)) - conv_func = sse_conv_cmplx_8n; + c.conv_cmplx_8n(x, x_len, h, h_len, y, y_len, start, + len, step, offset); else if (!(h_len % 4)) - conv_func = sse_conv_cmplx_4n; - } -#endif - - conv_func(x, x_len, h, h_len, y, y_len, start, len, step, offset); + c.conv_cmplx_4n(x, x_len, h, h_len, y, y_len, start, + len, step, offset); + else + c.conv_cmplx(x, x_len, h, h_len, y, y_len, start, len, + step, offset); + } else + c.conv_cmplx(x, x_len, h, h_len, y, y_len, start, len, step, + offset); return len; } |