aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Transceiver52M/Makefile.am9
-rw-r--r--Transceiver52M/arm/Makefile.am21
-rw-r--r--Transceiver52M/arm/convert.c96
-rw-r--r--Transceiver52M/arm/convert_neon.S51
-rw-r--r--Transceiver52M/arm/convolve.c139
-rw-r--r--Transceiver52M/arm/convolve_neon.S277
-rw-r--r--Transceiver52M/arm/scale.c56
-rw-r--r--Transceiver52M/arm/scale_neon.S50
-rw-r--r--Transceiver52M/common/scale.h6
-rw-r--r--Transceiver52M/sigProcLib.cpp17
-rw-r--r--Transceiver52M/x86/Makefile.am2
-rw-r--r--configure.ac24
12 files changed, 743 insertions, 5 deletions
diff --git a/Transceiver52M/Makefile.am b/Transceiver52M/Makefile.am
index c1cfb1b..981447f 100644
--- a/Transceiver52M/Makefile.am
+++ b/Transceiver52M/Makefile.am
@@ -24,9 +24,13 @@ include $(top_srcdir)/Makefile.common
AM_CPPFLAGS = $(STD_DEFINES_AND_INCLUDES) -I./common
AM_CXXFLAGS = -ldl -lpthread
-SUBDIRS = x86
+SUBDIRS = arm x86
+if ARCH_ARM
+ARCH_LA = arm/libarch.la
+else
ARCH_LA = x86/libarch.la
+endif
if USRP1
AM_CPPFLAGS += $(USRP_CFLAGS)
@@ -72,7 +76,8 @@ noinst_HEADERS = \
USRPDevice.h \
Resampler.h \
common/convolve.h \
- common/convert.h
+ common/convert.h \
+ common/scale.h
transceiver_SOURCES = runTransceiver.cpp
transceiver_LDADD = \
diff --git a/Transceiver52M/arm/Makefile.am b/Transceiver52M/arm/Makefile.am
new file mode 100644
index 0000000..6d34daa
--- /dev/null
+++ b/Transceiver52M/arm/Makefile.am
@@ -0,0 +1,21 @@
+if ARCH_ARM
+if ARCH_ARM_A15
+ARCH_FLAGS = -mfpu=neon-vfpv4
+else
+ARCH_FLAGS = -mfpu=neon
+endif
+
+AM_CFLAGS = -Wall $(ARCH_FLAGS) -std=gnu99 -I../common
+AM_CCASFLAGS = $(ARCH_FLAGS)
+
+noinst_LTLIBRARIES = libarch.la
+
+libarch_la_SOURCES = \
+ ../common/convolve_base.c \
+ convert.c \
+ convert_neon.S \
+ convolve.c \
+ convolve_neon.S \
+ scale.c \
+ scale_neon.S
+endif
diff --git a/Transceiver52M/arm/convert.c b/Transceiver52M/arm/convert.c
new file mode 100644
index 0000000..a32eb55
--- /dev/null
+++ b/Transceiver52M/arm/convert.c
@@ -0,0 +1,96 @@
+/*
+ * NEON type conversions
+ * Copyright (C) 2012, 2013 Thomas Tsou <tom@tsou.cc>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <malloc.h>
+#include <string.h>
+#include "convert.h"
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+void neon_convert_ps_si16_4n(short *, float *, float *, int);
+void neon_convert_si16_ps_4n(float *, short *, int);
+
+#ifndef HAVE_NEON
+static void convert_si16_ps(float *out, short *in, int len)
+{ /*
+   * Portable fallback, compiled only when HAVE_NEON is not defined:
+   * converts the first `len` 16-bit samples of `in` to float in `out`.
+   * The loop body never runs for a non-positive `len`.
+   */
+ for (int i = 0; i < len; i++)
+ out[i] = in[i]; /* implicit short -> float conversion */
+}
+
+static void convert_ps_si16(short *out, float *in, float scale, int len)
+{ /*
+   * Portable fallback, compiled only when HAVE_NEON is not defined:
+   * multiplies each of the first `len` floats of `in` by `scale` and
+   * stores the product as a short in `out`.
+   */
+ for (int i = 0; i < len; i++)
+ out[i] = in[i] * scale; /* implicit float -> short conversion, no clamping here */
+}
+#else
+/*
+ * 16-bit signed integer to float conversion for any length.
+ *
+ * The leading len / 4 blocks of four samples are handed to the NEON kernel
+ * neon_convert_si16_ps_4n(); the trailing len % 4 samples are converted one
+ * at a time in C.
+ *
+ * The kernel processes one block before it tests its block counter, so it
+ * must never be entered with a count of zero. That case (len < 4) is
+ * filtered out here, as is a non-positive len.
+ */
+static void neon_convert_si16_ps(float *restrict out,
+				 short *restrict in,
+				 int len)
+{
+	if (len <= 0)
+		return;
+
+	int blocks = len / 4;
+	int start = blocks * 4;
+
+	if (blocks > 0)
+		neon_convert_si16_ps_4n(out, in, blocks);
+
+	/* Remainder: samples start .. len - 1 */
+	for (int i = start; i < len; i++)
+		out[i] = (float) in[i];
+}
+
+/*
+ * Scaled float to 16-bit signed integer conversion for any length.
+ *
+ * `scale` points to four floats (the kernel loads it as one vector); the
+ * scalar remainder uses the first of them.
+ *
+ * The leading len / 4 blocks of four samples are handed to the NEON kernel
+ * neon_convert_ps_si16_4n(); the trailing len % 4 samples are converted one
+ * at a time in C.
+ *
+ * The kernel processes one block before it tests its block counter, so it
+ * must never be entered with a count of zero. That case (len < 4) is
+ * filtered out here, as is a non-positive len.
+ */
+static void neon_convert_ps_si16(short *restrict out,
+				 float *restrict in,
+				 float *restrict scale,
+				 int len)
+{
+	if (len <= 0)
+		return;
+
+	int blocks = len / 4;
+	int start = blocks * 4;
+
+	if (blocks > 0)
+		neon_convert_ps_si16_4n(out, in, scale, blocks);
+
+	/* Remainder: samples start .. len - 1 */
+	for (int i = start; i < len; i++)
+		out[i] = (short) (in[i] * (*scale));
+}
+#endif
+
+/*
+ * API: convert `len` floats from `in` to 16-bit signed integers in `out`,
+ * multiplying each sample by `scale` first.
+ *
+ * With HAVE_NEON the work is done by the NEON kernel (directly when `len`
+ * is a multiple of four, otherwise through the remainder-aware wrapper);
+ * without it a plain C loop is used.
+ *
+ * A non-positive `len` is a no-op. The check is required for the NEON
+ * build: len == 0 is a multiple of four, and the kernel processes one block
+ * before testing its counter, so a block count of zero must never reach it.
+ */
+void convert_float_short(short *out, float *in, float scale, int len)
+{
+	if (len <= 0)
+		return;
+
+#ifdef HAVE_NEON
+	/* The kernel loads the scale factor as a four-lane vector */
+	float q[4] = { scale, scale, scale, scale };
+
+	if (len % 4)
+		neon_convert_ps_si16(out, in, q, len);
+	else
+		neon_convert_ps_si16_4n(out, in, q, len >> 2);
+#else
+	convert_ps_si16(out, in, scale, len);
+#endif
+}
+
+/*
+ * API: convert `len` 16-bit signed integers from `in` to floats in `out`.
+ *
+ * With HAVE_NEON the work is done by the NEON kernel (directly when `len`
+ * is a multiple of four, otherwise through the remainder-aware wrapper);
+ * without it a plain C loop is used.
+ *
+ * A non-positive `len` is a no-op. The check is required for the NEON
+ * build: len == 0 is a multiple of four, and the kernel processes one block
+ * before testing its counter, so a block count of zero must never reach it.
+ */
+void convert_short_float(float *out, short *in, int len)
+{
+	if (len <= 0)
+		return;
+
+#ifdef HAVE_NEON
+	if (len % 4)
+		neon_convert_si16_ps(out, in, len);
+	else
+		neon_convert_si16_ps_4n(out, in, len >> 2);
+#else
+	convert_si16_ps(out, in, len);
+#endif
+}
diff --git a/Transceiver52M/arm/convert_neon.S b/Transceiver52M/arm/convert_neon.S
new file mode 100644
index 0000000..842ed9f
--- /dev/null
+++ b/Transceiver52M/arm/convert_neon.S
@@ -0,0 +1,51 @@
+/*
+ * NEON type conversions
+ * Copyright (C) 2012, 2013 Thomas Tsou <tom@tsou.cc>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+ .syntax unified
+ .text
+ .align 2
+ .global neon_convert_ps_si16_4n
+ .type neon_convert_ps_si16_4n, %function
+neon_convert_ps_si16_4n: /* C prototype in convert.c: (short *, float *, float *, int)
+ * As used below: r0 = output shorts, r1 = input floats, r2 = pointer to
+ * four scale factors, r3 = number of four-sample blocks.
+ * Each pass scales four floats and stores them as four 16-bit integers.
+ * The counter is tested only after a block has been processed, so a block
+ * count of zero is not handled here.
+ */
+ vld1.32 {q1}, [r2] /* q1 = four scale factors */
+.loop_fltint:
+ vld1.64 {d0-d1}, [r1]! /* q0 = next four input floats, advance r1 */
+ vmul.f32 q0, q1 /* apply the scale */
+ vcvt.s32.f32 q2, q0 /* float -> signed 32-bit integer */
+ vqmovn.s32 d0, q2 /* saturating narrow to signed 16-bit */
+ vst1.64 {d0}, [r0]! /* store four shorts, advance r0 */
+ subs r3, #1
+ bne .loop_fltint /* loop until the block counter reaches zero */
+ bx lr
+ .size neon_convert_ps_si16_4n, .-neon_convert_ps_si16_4n
+ .text
+ .align 2
+ .global neon_convert_si16_ps_4n
+ .type neon_convert_si16_ps_4n, %function
+neon_convert_si16_ps_4n: /* C prototype in convert.c: (float *, short *, int)
+ * As used below: r0 = output floats, r1 = input shorts, r2 = number of
+ * four-sample blocks. The counter is tested only after a block has been
+ * processed, so a block count of zero is not handled here.
+ */
+.loop_intflt:
+ vld1.64 {d0}, [r1]! /* d0 = next four 16-bit samples, advance r1 */
+ vmovl.s16 q1, d0 /* sign-extend to four 32-bit integers */
+ vcvt.f32.s32 q0, q1 /* signed 32-bit integer -> float */
+ vst1.64 {q0}, [r0]! /* store four floats, advance r0 */
+ subs r2, #1
+ bne .loop_intflt /* loop until the block counter reaches zero */
+ bx lr
+ .size neon_convert_si16_ps_4n, .-neon_convert_si16_ps_4n
+ .section .note.GNU-stack,"",%progbits
diff --git a/Transceiver52M/arm/convolve.c b/Transceiver52M/arm/convolve.c
new file mode 100644
index 0000000..2b42090
--- /dev/null
+++ b/Transceiver52M/arm/convolve.c
@@ -0,0 +1,139 @@
+/*
+ * NEON Convolution
+ * Copyright (C) 2012, 2013 Thomas Tsou <tom@tsou.cc>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <malloc.h>
+#include <string.h>
+#include <stdio.h>
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+/* Forward declarations from base implementation */
+int _base_convolve_real(float *x, int x_len,
+ float *h, int h_len,
+ float *y, int y_len,
+ int start, int len,
+ int step, int offset);
+
+int _base_convolve_complex(float *x, int x_len,
+ float *h, int h_len,
+ float *y, int y_len,
+ int start, int len,
+ int step, int offset);
+
+int bounds_check(int x_len, int h_len, int y_len,
+ int start, int len, int step);
+
+#ifdef HAVE_NEON
+/* Calls into NEON assembler */
+void neon_conv_real4(float *x, float *h, float *y, int len);
+void neon_conv_real8(float *x, float *h, float *y, int len);
+void neon_conv_real12(float *x, float *h, float *y, int len);
+void neon_conv_real16(float *x, float *h, float *y, int len);
+void neon_conv_real20(float *x, float *h, float *y, int len);
+void mac_cx_neon4(float *x, float *h, float *y, int len);
+
+/* Complex-complex convolution
+ *
+ * For each of the `len` output samples, runs mac_cx_neon4() over the input
+ * window that starts at complex sample i, using h_len / 4 blocks of four
+ * taps. The kernel adds its result to the value already stored in y[i], so
+ * `y` has to be initialised by the caller (convolve_complex() zeroes it).
+ * `h_len` is expected to be a non-zero multiple of four; the caller checks
+ * the multiple-of-four part.
+ */
+static void neon_conv_cmplx_4n(float *x, float *h, float *y, int h_len, int len)
+{
+ for (int i = 0; i < len; i++)
+ mac_cx_neon4(&x[2 * i], h, &y[2 * i], h_len >> 2);
+}
+#endif
+
+/* API: Aligned complex-real
+ *
+ * Convolves interleaved complex input `x` with the taps in `h` and writes
+ * `len` complex outputs to `y`.
+ *
+ * Returns -1 when bounds_check() rejects the arguments, otherwise `len`.
+ * The return value of _base_convolve_real() is not propagated.
+ *
+ * With HAVE_NEON, a dedicated kernel is used when step <= 4 and h_len is
+ * one of 4, 8, 12, 16 or 20; every other case goes to the generic base
+ * implementation. Note that the NEON kernels are called without `step` and
+ * `offset`.
+ * NOTE(review): confirm that callers only rely on step == 1 / offset == 0
+ * semantics when the NEON path is taken.
+ */
+int convolve_real(float *x, int x_len,
+ float *h, int h_len,
+ float *y, int y_len,
+ int start, int len,
+ int step, int offset)
+{
+ void (*conv_func)(float *, float *, float *, int) = NULL;
+
+ if (bounds_check(x_len, h_len, y_len, start, len, step) < 0)
+ return -1;
+
+ memset(y, 0, len * 2 * sizeof(float)); /* two floats per complex output */
+
+#ifdef HAVE_NEON
+ if (step <= 4) {
+ switch (h_len) { /* no default: unsupported lengths keep conv_func NULL */
+ case 4:
+ conv_func = neon_conv_real4;
+ break;
+ case 8:
+ conv_func = neon_conv_real8;
+ break;
+ case 12:
+ conv_func = neon_conv_real12;
+ break;
+ case 16:
+ conv_func = neon_conv_real16;
+ break;
+ case 20:
+ conv_func = neon_conv_real20;
+ break;
+ }
+ }
+#endif
+ if (conv_func) {
+ conv_func(&x[2 * (-(h_len - 1) + start)], /* window begins h_len - 1 samples before `start` */
+ h, y, len);
+ } else {
+ _base_convolve_real(x, x_len,
+ h, h_len,
+ y, y_len,
+ start, len, step, offset);
+ }
+
+ return len;
+}
+
+
+/* API: Aligned complex-complex
+ *
+ * Convolves interleaved complex input `x` with the complex taps in `h` and
+ * writes `len` complex outputs to `y`.
+ *
+ * Returns -1 when bounds_check() rejects the arguments, otherwise `len`.
+ * The return value of _base_convolve_complex() is not propagated.
+ *
+ * With HAVE_NEON, the NEON path is used when step <= 4 and h_len is a
+ * multiple of four; every other case goes to the generic base
+ * implementation. Note that the NEON path is called without `step` and
+ * `offset`.
+ * NOTE(review): confirm that callers only rely on step == 1 / offset == 0
+ * semantics when the NEON path is taken.
+ */
+int convolve_complex(float *x, int x_len,
+ float *h, int h_len,
+ float *y, int y_len,
+ int start, int len,
+ int step, int offset)
+{
+ void (*conv_func)(float *, float *, float *, int, int) = NULL;
+
+ if (bounds_check(x_len, h_len, y_len, start, len, step) < 0)
+ return -1;
+
+ memset(y, 0, len * 2 * sizeof(float)); /* required: the NEON kernel accumulates into y */
+
+#ifdef HAVE_NEON
+ if (step <= 4 && !(h_len % 4))
+ conv_func = neon_conv_cmplx_4n;
+#endif
+ if (conv_func) {
+ conv_func(&x[2 * (-(h_len - 1) + start)], /* window begins h_len - 1 samples before `start` */
+ h, y, h_len, len);
+ } else {
+ _base_convolve_complex(x, x_len,
+ h, h_len,
+ y, y_len,
+ start, len, step, offset);
+ }
+
+ return len;
+}
diff --git a/Transceiver52M/arm/convolve_neon.S b/Transceiver52M/arm/convolve_neon.S
new file mode 100644
index 0000000..637d150
--- /dev/null
+++ b/Transceiver52M/arm/convolve_neon.S
@@ -0,0 +1,277 @@
+/*
+ * NEON Convolution
+ * Copyright (C) 2012, 2013 Thomas Tsou <tom@tsou.cc>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+ .syntax unified
+ .text
+ .align 2
+ .global neon_conv_real4
+ .type neon_conv_real4, %function
+neon_conv_real4: /* C prototype in convolve.c: (float *x, float *h, float *y, int len)
+ * 4-tap kernel. As used below: r0 = input, r1 = taps, r2 = output,
+ * r3 = number of outputs. Input and taps are read de-interleaved in pairs
+ * of floats; only the first float of each tap pair is used. One pair of
+ * floats is written per pass. r3 is tested only after a pass, so a count
+ * of zero is not handled here.
+ */
+ push {r4, lr}
+ vpush {q4-q7} /* q4-q7 are used as temporaries below */
+ vld2.32 {q0-q1}, [r1] /* q0 = first floats of 4 tap pairs, q1 = second floats (unused) */
+ ldr r4, =8 /* input stride: 8 bytes = one pair of floats */
+.neon_conv_loop4:
+ vld2.32 {q2-q3}, [r0], r4 /* q2 = first floats, q3 = second floats of 4 input pairs */
+ vmul.f32 q4, q2, q0
+ vmul.f32 q5, q3, q0
+ vpadd.f32 d12, d8, d9 /* pairwise sums of q4 */
+ vpadd.f32 d13, d10, d11 /* pairwise sums of q5 */
+ vpadd.f32 d14, d12, d13 /* d14 = (sum of q4 lanes, sum of q5 lanes) */
+ vst1.64 {d14}, [r2]! /* store one output pair, advance r2 */
+ subs r3, r3, #1
+ bne .neon_conv_loop4
+ vpop {q4-q7}
+ pop {r4, pc}
+ .size neon_conv_real4, .-neon_conv_real4
+ .align 2
+ .p2align 4,,15
+ .global neon_conv_real8
+ .type neon_conv_real8, %function
+neon_conv_real8: /* C prototype in convolve.c: (float *x, float *h, float *y, int len)
+ * 8-tap kernel. As used below: r0 = input, r1 = taps, r2 = output,
+ * r3 = number of outputs. Two windows of four input pairs (r0 and r0 + 32
+ * bytes) are multiplied by the two tap blocks; only the first float of each
+ * tap pair is used. r3 is tested only after a pass, so a count of zero is
+ * not handled here.
+ */
+ push {r4-r5, lr}
+ vpush {q4-q7} /* q4-q7 hold input data below */
+ vld2.32 {q0-q1}, [r1]! /* taps 0-3: q0 used, q1 unused */
+ vld2.32 {q2-q3}, [r1] /* taps 4-7: q2 used, q3 unused */
+ add r4, r0, #32 /* second input window, four pairs further on */
+ ldr r5, =8 /* input stride: 8 bytes = one pair of floats */
+.neon_conv_loop8:
+ vld2.32 {q4-q5}, [r0], r5
+ vld2.32 {q6-q7}, [r4], r5
+ vmul.f32 q8, q4, q0
+ vmul.f32 q9, q5, q0
+ vmul.f32 q10, q6, q2
+ vmul.f32 q11, q7, q2
+
+ vadd.f32 q12, q8, q10 /* products of the first floats */
+ vadd.f32 q13, q9, q11 /* products of the second floats */
+
+ vpadd.f32 d22, d24, d25
+ vpadd.f32 d23, d26, d27
+ vpadd.f32 d24, d22, d23 /* d24 = (sum of q12 lanes, sum of q13 lanes) */
+ vst1.64 {d24}, [r2]! /* store one output pair, advance r2 */
+ subs r3, r3, #1
+ bne .neon_conv_loop8
+ vpop {q4-q7}
+ pop {r4-r5, pc}
+ .size neon_conv_real8, .-neon_conv_real8
+ .align 2
+ .global neon_conv_real12
+ .type neon_conv_real12, %function
+neon_conv_real12: /* C prototype in convolve.c: (float *x, float *h, float *y, int len)
+ * 12-tap kernel. As used below: r0 = input, r1 = taps, r2 = output,
+ * r3 = number of outputs. Three windows of four input pairs (r0, r0 + 32
+ * and r0 + 64 bytes) are multiplied by the three tap blocks; only the first
+ * float of each tap pair is used. r3 is tested only after a pass, so a
+ * count of zero is not handled here.
+ *
+ * q1 and q3 serve as the per-pass accumulators. They must be (re)started
+ * with a plain multiply on every pass: before the loop they hold the unused
+ * half of the taps, and after a pass they hold the reduced sums of the
+ * previous output.
+ */
+ push {r4-r6, lr}
+ vpush {q4-q7} /* q4-q7 are overwritten below */
+ vld2.32 {q0-q1}, [r1]! /* taps 0-3: q0 used */
+ vld2.32 {q2-q3}, [r1]! /* taps 4-7: q2 used */
+ vld2.32 {q4-q5}, [r1]! /* taps 8-11: q4 used */
+ add r4, r0, #32 /* second input window */
+ add r5, r0, #64 /* third input window */
+ ldr r6, =8 /* input stride: 8 bytes = one pair of floats */
+.neon_conv_loop12:
+ vld2.32 {q6-q7}, [r0], r6
+ vld2.32 {q8-q9}, [r4], r6
+ vld2.32 {q10-q11}, [r5], r6
+#ifdef HAVE_NEON_FMA
+ vmul.f32 q1, q6, q0 /* start the accumulators afresh (was vfma: stale q1) */
+ vmul.f32 q3, q7, q0 /* start the accumulators afresh (was vfma: stale q3) */
+ vfma.f32 q1, q8, q2
+ vfma.f32 q3, q9, q2
+ vfma.f32 q1, q10, q4
+ vfma.f32 q3, q11, q4
+#else
+ vmul.f32 q12, q6, q0
+ vmul.f32 q13, q7, q0
+ vmul.f32 q14, q8, q2
+ vmul.f32 q15, q9, q2
+ vmul.f32 q1, q10, q4
+ vmul.f32 q3, q11, q4
+
+ vadd.f32 q5, q12, q14
+ vadd.f32 q6, q13, q15
+ vadd.f32 q1, q5, q1
+ vadd.f32 q3, q6, q3
+#endif
+ vpadd.f32 d2, d2, d3
+ vpadd.f32 d3, d6, d7
+ vpadd.f32 d6, d2, d3 /* d6 = (sum of q1 lanes, sum of q3 lanes) */
+ vst1.64 {d6}, [r2]! /* store one output pair, advance r2 */
+ subs r3, r3, #1
+ bne .neon_conv_loop12
+ vpop {q4-q7}
+ pop {r4-r6, pc}
+ .size neon_conv_real12, .-neon_conv_real12
+ .align 2
+ .global neon_conv_real16
+ .type neon_conv_real16, %function
+neon_conv_real16: /* C prototype in convolve.c: (float *x, float *h, float *y, int len)
+ * 16-tap kernel. As used below: r0 = input, r1 = taps, r2 = output,
+ * r3 = number of outputs. Four windows of four input pairs (r0 + 0, 32, 64
+ * and 96 bytes) are multiplied by the four tap blocks; only the first float
+ * of each tap pair (q0, q2, q4, q6) is used. q1 and q3 are the per-pass
+ * accumulators. r3 is tested only after a pass, so a count of zero is not
+ * handled here.
+ */
+ push {r4-r7, lr}
+ vpush {q4-q7} /* q4-q7 are overwritten below */
+ vld2.32 {q0-q1}, [r1]!
+ vld2.32 {q2-q3}, [r1]!
+ vld2.32 {q4-q5}, [r1]!
+ vld2.32 {q6-q7}, [r1]
+ add r4, r0, #32
+ add r5, r0, #64
+ add r6, r0, #96
+ ldr r7, =8 /* input stride: 8 bytes = one pair of floats */
+.neon_conv_loop16:
+ vld2.32 {q8-q9}, [r0], r7
+ vld2.32 {q10-q11}, [r4], r7
+ vld2.32 {q12-q13}, [r5], r7
+ vld2.32 {q14-q15}, [r6], r7
+#ifdef HAVE_NEON_FMA
+ vmul.f32 q1, q8, q0 /* plain multiply restarts the accumulators each pass */
+ vmul.f32 q3, q9, q0
+ vfma.f32 q1, q10, q2
+ vfma.f32 q3, q11, q2
+ vfma.f32 q1, q12, q4
+ vfma.f32 q3, q13, q4
+ vfma.f32 q1, q14, q6
+ vfma.f32 q3, q15, q6
+#else
+ vmul.f32 q1, q8, q0
+ vmul.f32 q3, q9, q0
+ vmul.f32 q5, q10, q2 /* q5 and q7 (unused tap halves) are reused as temporaries */
+ vmul.f32 q7, q11, q2
+ vmul.f32 q8, q12, q4
+ vmul.f32 q9, q13, q4
+ vmul.f32 q10, q14, q6
+ vmul.f32 q11, q15, q6
+
+ vadd.f32 q1, q1, q5
+ vadd.f32 q3, q3, q7
+ vadd.f32 q5, q8, q10
+ vadd.f32 q7, q9, q11
+ vadd.f32 q1, q1, q5
+ vadd.f32 q3, q3, q7
+#endif
+ vpadd.f32 d2, d2, d3
+ vpadd.f32 d3, d6, d7
+ vpadd.f32 d6, d2, d3 /* d6 = (sum of q1 lanes, sum of q3 lanes) */
+ vst1.64 {d6}, [r2]! /* store one output pair, advance r2 */
+ subs r3, r3, #1
+ bne .neon_conv_loop16
+ vpop {q4-q7}
+ pop {r4-r7, pc}
+ .size neon_conv_real16, .-neon_conv_real16
+ .align 2
+ .global neon_conv_real20
+ .type neon_conv_real20, %function
+neon_conv_real20: /* C prototype in convolve.c: (float *x, float *h, float *y, int len)
+ * 20-tap kernel. As used below: r0 = input, r1 = taps, r2 = output,
+ * r3 = number of outputs. Five windows of four input pairs (r0 + 0, 32, 64,
+ * 96 and 128 bytes) are multiplied by the five tap blocks; only the first
+ * float of each tap pair (q0, q2, q4, q6, q8) is used. The last two windows
+ * are loaded mid-pass because all sixteen q registers are in use. q1 and q3
+ * end up holding the per-pass sums. r3 is tested only after a pass, so a
+ * count of zero is not handled here.
+ */
+ push {r4-r8, lr}
+ vpush {q4-q7} /* q4-q7 are overwritten below */
+ vld2.32 {q0-q1}, [r1]!
+ vld2.32 {q2-q3}, [r1]!
+ vld2.32 {q4-q5}, [r1]!
+ vld2.32 {q6-q7}, [r1]!
+ vld2.32 {q8-q9}, [r1]
+ add r4, r0, #32
+ add r5, r0, #64
+ add r6, r0, #96
+ add r7, r0, #128
+ ldr r8, =8 /* input stride: 8 bytes = one pair of floats */
+.neon_conv_loop20:
+ vld2.32 {q10-q11}, [r0], r8
+ vld2.32 {q12-q13}, [r4], r8
+ vld2.32 {q14-q15}, [r5], r8
+#ifdef HAVE_NEON_FMA
+ vmul.f32 q1, q10, q0 /* plain multiply restarts the accumulator each pass */
+ vfma.f32 q1, q12, q2
+ vfma.f32 q1, q14, q4
+ vmul.f32 q3, q11, q0 /* plain multiply restarts the accumulator each pass */
+ vfma.f32 q3, q13, q2
+ vfma.f32 q3, q15, q4
+
+ vld2.32 {q12-q13}, [r6], r8 /* fourth window */
+ vld2.32 {q14-q15}, [r7], r8 /* fifth window */
+
+ vfma.f32 q1, q12, q6
+ vfma.f32 q3, q13, q6
+ vfma.f32 q1, q14, q8
+ vfma.f32 q3, q15, q8
+#else
+ vmul.f32 q1, q10, q0
+ vmul.f32 q3, q12, q2
+ vmul.f32 q5, q14, q4
+ vmul.f32 q7, q11, q0
+ vmul.f32 q9, q13, q2
+ vmul.f32 q10, q15, q4
+ vadd.f32 q1, q1, q3
+ vadd.f32 q3, q7, q9
+ vadd.f32 q9, q1, q5 /* partial sum over the first three windows (first floats) */
+ vadd.f32 q10, q3, q10 /* partial sum over the first three windows (second floats) */
+
+ vld2.32 {q12-q13}, [r6], r8 /* fourth window */
+ vld2.32 {q14-q15}, [r7], r8 /* fifth window */
+
+ vmul.f32 q1, q12, q6
+ vmul.f32 q3, q13, q6
+ vmul.f32 q5, q14, q8
+ vmul.f32 q7, q15, q8
+ vadd.f32 q12, q1, q9
+ vadd.f32 q14, q3, q10
+ vadd.f32 q1, q12, q5
+ vadd.f32 q3, q14, q7
+#endif
+ vpadd.f32 d2, d2, d3
+ vpadd.f32 d3, d6, d7
+ vpadd.f32 d6, d2, d3 /* d6 = (sum of q1 lanes, sum of q3 lanes) */
+ vst1.64 {d6}, [r2]! /* store one output pair, advance r2 */
+ subs r3, r3, #1
+ bne .neon_conv_loop20
+ vpop {q4-q7}
+ pop {r4-r8, pc}
+ .size neon_conv_real20, .-neon_conv_real20
+ .align 2
+ .global mac_cx_neon4
+ .type mac_cx_neon4, %function
+mac_cx_neon4: /* C prototype in convolve.c: (float *x, float *h, float *y, int len)
+ * Complex multiply-accumulate. As used below: r0 = input, r1 = taps,
+ * r2 = one output pair, r3 = number of four-pair blocks. Each pass
+ * multiplies four (first, second) input pairs by four tap pairs as complex
+ * numbers and accumulates them; at the end the lane sums are added to the
+ * pair already stored at r2. r3 is tested only after a pass, so a block
+ * count of zero is not handled here.
+ */
+ push {r4, lr}
+ ldr r4, =32 /* input stride: 32 bytes = four pairs of floats */
+ veor q14, q14 /* clear the two accumulators */
+ veor q15, q15
+.neon_conv_loop_mac4:
+ vld2.32 {q0-q1}, [r0], r4 /* q0 = first floats, q1 = second floats of the input */
+ vld2.32 {q2-q3}, [r1]! /* q2 = first floats, q3 = second floats of the taps */
+
+ vmul.f32 q10, q0, q2
+ vmul.f32 q11, q1, q3
+ vmul.f32 q12, q0, q3
+ vmul.f32 q13, q2, q1
+ vsub.f32 q8, q10, q11 /* first component of the complex product */
+ vadd.f32 q9, q12, q13 /* second component of the complex product */
+
+ vadd.f32 q14, q8
+ vadd.f32 q15, q9
+ subs r3, #1
+ bne .neon_conv_loop_mac4
+
+ vld1.64 d0, [r2] /* current output pair */
+ vpadd.f32 d28, d28, d29
+ vpadd.f32 d30, d30, d31
+ vpadd.f32 d1, d28, d30 /* d1 = (sum of q14 lanes, sum of q15 lanes) */
+ vadd.f32 d1, d0 /* add to what was already in the output */
+ vst1.64 d1, [r2]
+ pop {r4, pc}
+ .size mac_cx_neon4, .-mac_cx_neon4
+ .section .note.GNU-stack,"",%progbits
diff --git a/Transceiver52M/arm/scale.c b/Transceiver52M/arm/scale.c
new file mode 100644
index 0000000..2de13ff
--- /dev/null
+++ b/Transceiver52M/arm/scale.c
@@ -0,0 +1,56 @@
+/*
+ * NEON scaling
+ * Copyright (C) 2012,2013 Thomas Tsou <tom@tsou.cc>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <malloc.h>
+#include <string.h>
+#include <scale.h>
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+void neon_scale_4n(float *, float *, float *, int);
+
+static void scale_ps(float *out, float *in, float *scale, int len)
+{ /*
+   * Scalar complex scaling: multiplies each of the `len` interleaved
+   * (even index, odd index) pairs of `in` by the complex constant
+   * (scale[0], scale[1]) and writes the products to `out`.
+   * Both inputs of a pair are read into locals before either output is
+   * written, so `out` may be the same buffer as `in`.
+   */
+ float ai, aq, bi, bq;
+
+ bi = scale[0];
+ bq = scale[1];
+
+ for (int i = 0; i < len; i++) {
+ ai = in[2 * i + 0];
+ aq = in[2 * i + 1];
+
+ out[2 * i + 0] = ai * bi - aq * bq; /* first component of (ai, aq) * (bi, bq) */
+ out[2 * i + 1] = ai * bq + aq * bi; /* second component of (ai, aq) * (bi, bq) */
+ }
+}
+
+/*
+ * API: multiply `len` interleaved complex samples of `in` by the complex
+ * constant (scale[0], scale[1]) and write the products to `out`.
+ *
+ * With HAVE_NEON the NEON kernel is used when `len` is a multiple of four;
+ * any other length, and every build without HAVE_NEON, uses the scalar
+ * loop. Note the kernel's argument order: (in, scale, out, blocks).
+ *
+ * A non-positive `len` is a no-op. The check is required for the NEON
+ * build: len == 0 is a multiple of four, and the kernel processes one block
+ * before testing its counter, so a block count of zero must never reach it.
+ */
+void scale_complex(float *out, float *in, float *scale, int len)
+{
+	if (len <= 0)
+		return;
+
+#ifdef HAVE_NEON
+	if (len % 4)
+		scale_ps(out, in, scale, len);
+	else
+		neon_scale_4n(in, scale, out, len >> 2);
+#else
+	scale_ps(out, in, scale, len);
+#endif
+}
diff --git a/Transceiver52M/arm/scale_neon.S b/Transceiver52M/arm/scale_neon.S
new file mode 100644
index 0000000..a66fbe5
--- /dev/null
+++ b/Transceiver52M/arm/scale_neon.S
@@ -0,0 +1,50 @@
+/*
+ * ARM NEON Scaling
+ * Copyright (C) 2013 Thomas Tsou <tom@tsou.cc>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+ .syntax unified
+ .text
+ .align 2
+ .global neon_scale_4n
+ .type neon_scale_4n, %function
+neon_scale_4n: /* C prototype in scale.c: (float *, float *, float *, int)
+ * As used below: r0 = input, r1 = pointer to the two-float scale constant,
+ * r2 = output, r3 = number of four-pair blocks. Each pass multiplies four
+ * interleaved (first, second) input pairs by the constant as complex
+ * numbers and stores four interleaved result pairs. r3 is tested only after
+ * a pass, so a block count of zero is not handled here.
+ */
+ push {r4, lr}
+ ldr r4, =32 /* input stride: 32 bytes = four pairs of floats */
+
+ vld1.64 d0, [r1] /* s0 = scale[0], s1 = scale[1] */
+ vmov.32 s4, s1 /* keep scale[1] in s4 */
+ vmov.32 s1, s0
+ vmov.64 d1, d0 /* q0 = scale[0] in all four lanes */
+ vmov.32 s5, s4
+ vmov.64 d3, d2 /* q1 = scale[1] in all four lanes */
+.loop_mul_const:
+ vld2.32 {q2-q3}, [r0], r4 /* q2 = first floats, q3 = second floats of the input */
+
+ vmul.f32 q8, q0, q2
+ vmul.f32 q9, q1, q3
+ vmul.f32 q10, q0, q3
+ vmul.f32 q11, q1, q2
+ vsub.f32 q8, q8, q9 /* first component of the complex product */
+ vadd.f32 q9, q10, q11 /* second component of the complex product */
+
+ vst2.32 {q8-q9}, [r2]! /* re-interleave and store four pairs, advance r2 */
+ subs r3, #1
+ bne .loop_mul_const
+ pop {r4, pc}
+ .size neon_scale_4n, .-neon_scale_4n
+ .section .note.GNU-stack,"",%progbits
diff --git a/Transceiver52M/common/scale.h b/Transceiver52M/common/scale.h
new file mode 100644
index 0000000..da867e7
--- /dev/null
+++ b/Transceiver52M/common/scale.h
@@ -0,0 +1,6 @@
+#ifndef _SCALE_H_
+#define _SCALE_H_
+
+void scale_complex(float *out, float *in, float *scale, int len);
+
+#endif /* _SCALE_H_ */
diff --git a/Transceiver52M/sigProcLib.cpp b/Transceiver52M/sigProcLib.cpp
index 595efa3..5a1ab77 100644
--- a/Transceiver52M/sigProcLib.cpp
+++ b/Transceiver52M/sigProcLib.cpp
@@ -22,15 +22,20 @@
*/
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
#include "sigProcLib.h"
#include "GSMCommon.h"
-using namespace GSM;
-
extern "C" {
#include "convolve.h"
+#include "scale.h"
}
+using namespace GSM;
+
#define TABLESIZE 1024
/** Lookup tables for trigonometric approximation */
@@ -958,6 +963,13 @@ complex peakDetect(const signalVector &rxBurst,
void scaleVector(signalVector &x,
complex scale)
{
+#ifdef HAVE_NEON
+ int len = x.size();
+
+ scale_complex((float *) x.begin(),
+ (float *) x.begin(),
+ (float *) &scale, len);
+#else
signalVector::iterator xP = x.begin();
signalVector::iterator xPEnd = x.end();
if (!x.isRealOnly()) {
@@ -972,6 +984,7 @@ void scaleVector(signalVector &x,
xP++;
}
}
+#endif
}
/** in-place conjugation */
diff --git a/Transceiver52M/x86/Makefile.am b/Transceiver52M/x86/Makefile.am
index 0621b17..e880351 100644
--- a/Transceiver52M/x86/Makefile.am
+++ b/Transceiver52M/x86/Makefile.am
@@ -1,3 +1,4 @@
+if !ARCH_ARM
AM_CFLAGS = -Wall -std=gnu99 -march=native -I../common
noinst_LTLIBRARIES = libarch.la
@@ -6,3 +7,4 @@ libarch_la_SOURCES = \
../common/convolve_base.c \
convert.c \
convolve.c
+endif
diff --git a/configure.ac b/configure.ac
index 848f887..ae4ea3b 100644
--- a/configure.ac
+++ b/configure.ac
@@ -68,6 +68,25 @@ AC_ARG_WITH(singledb, [
[enable single daughterboard use on USRP1])
])
+AC_ARG_WITH(neon, [
+ AS_HELP_STRING([--with-neon],
+ [enable ARM NEON support])
+])
+
+AC_ARG_WITH(neon-vfpv4, [
+ AS_HELP_STRING([--with-neon-vfpv4],
+ [enable ARM NEON FMA support])
+])
+
+AS_IF([test "x$with_neon" = "xyes"], [
+ AC_DEFINE(HAVE_NEON, 1, Support ARM NEON)
+])
+
+AS_IF([test "x$with_neon_vfpv4" = "xyes"], [
+ AC_DEFINE(HAVE_NEON, 1, Support ARM NEON)
+ AC_DEFINE(HAVE_NEON_FMA, 1, Support ARM NEON with FMA)
+])
+
AS_IF([test "x$with_usrp1" = "xyes"], [
PKG_CHECK_MODULES(USRP, usrp >= 3.3)
])
@@ -85,6 +104,8 @@ AS_IF([test "x$with_singledb" = "xyes"], [
AX_EXT
AM_CONDITIONAL(USRP1, [test "x$with_usrp1" = "xyes"])
+AM_CONDITIONAL(ARCH_ARM, [test "x$with_neon" = "xyes" || test "x$with_neon_vfpv4" = "xyes"])
+AM_CONDITIONAL(ARCH_ARM_A15, [test "x$with_neon_vfpv4" = "xyes"])
PKG_CHECK_MODULES(LIBUSB, libusb-1.0)
@@ -94,8 +115,9 @@ AC_CONFIG_FILES([\
CommonLibs/Makefile \
GSM/Makefile \
Transceiver52M/Makefile \
+ Transceiver52M/arm/Makefile \
Transceiver52M/x86/Makefile \
sqlite3/Makefile \
-])
+])
AC_OUTPUT