aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorHarald Welte <laforge@gnumonks.org>2015-12-27 19:18:14 +0100
committerHarald Welte <laforge@gnumonks.org>2015-12-27 19:18:14 +0100
commit1ea1d362fd3f239840170d1746f735e5e5ad1fdb (patch)
treeb15604c3e633d1d1f49b9f9d328e684a376592f2
parent7a04624563ac72b7237e39d6fcee5dab96f4d948 (diff)
WIP on adding MMX support for some of the HR codec primitives
-rw-r--r--configure.ac1
-rw-r--r--libgsmhr/Makefile.am6
-rw-r--r--libgsmhr/asm.s49
-rw-r--r--libgsmhr/mmx.c71
4 files changed, 124 insertions, 3 deletions
diff --git a/configure.ac b/configure.ac
index 1208025..70232c3 100644
--- a/configure.ac
+++ b/configure.ac
@@ -45,6 +45,7 @@ AC_SUBST(SYMBOL_VISIBILITY)
# Checks for programs.
AC_PROG_CC
+AM_PROG_AS
AC_PROG_LIBTOOL
# Checks for libraries.
diff --git a/libgsmhr/Makefile.am b/libgsmhr/Makefile.am
index 07390f9..a3a5a83 100644
--- a/libgsmhr/Makefile.am
+++ b/libgsmhr/Makefile.am
@@ -4,7 +4,7 @@ AM_CFLAGS = -fPIC -Wall ${SYMBOL_VISIBILITY}
LIBVERSION=0:0:0
REFSRC_PATH=refsrc
-REFSRC_SRC=refsrc/dtx.c refsrc/globdefs.c refsrc/host.c refsrc/mathhalf.c refsrc/sp_enc.c refsrc/sp_rom.c refsrc/vad.c refsrc/err_conc.c refsrc/homing.c refsrc/mathdp31.c refsrc/sp_dec.c refsrc/sp_frm.c refsrc/sp_sfrm.c
+REFSRC_SRC=refsrc/dtx.c refsrc/globdefs.c refsrc/host.c refsrc/mathhalf.c refsrc/sp_enc.c refsrc/sp_rom.c refsrc/vad.c refsrc/err_conc.c refsrc/homing.c refsrc/mathdp31.c refsrc/sp_dec.c refsrc/sp_frm.c refsrc/sp_sfrm.c asm.s mmx.c
${REFSRC_PATH}/.downloaded:
./fetch_sources.py "${REFSRC_PATH}"
@@ -19,5 +19,5 @@ ${REFSRC_PATH}/dtx.c: ${REFSRC_PATH}/.downloaded
lib_LTLIBRARIES = libgsmhr.la
libgsmhr_la_SOURCES = $(REFSRC_SRC) libgsmhr.c
-clean-local:
- -rm -rf ${REFSRC_PATH}
+#clean-local:
+# -rm -rf ${REFSRC_PATH}
diff --git a/libgsmhr/asm.s b/libgsmhr/asm.s
new file mode 100644
index 0000000..d1d491e
--- /dev/null
+++ b/libgsmhr/asm.s
@@ -0,0 +1,49 @@
+
+.globl sat_adds32b
+.type sat_adds32b,@function
+sat_adds32b:			# int32_t sat_adds32b(int32_t a, int32_t b) per the prototype in mmx.c: a in %edi, b in %esi, result in %eax
+ mov %edi, %eax		# eax = a
+ shr $0x1f, %edi		# edi = sign bit of a (0 or 1)
+ add $0x7fffffff, %edi	# edi = 0x7fffffff if a >= 0, 0x80000000 if a < 0 (the clamp value)
+ add %esi, %eax		# eax = a + b; OF is set on signed overflow
+ cmovo %edi, %eax		# on overflow, return the clamp value instead of the wrapped sum
+ retq
+.size sat_adds32b, .-sat_adds32b
+
+.globl sat_subs32b
+.type sat_subs32b,@function
+sat_subs32b:			# computes a - b with a in %edi, b in %esi, result in %eax; presumably int32_t sat_subs32b(int32_t a, int32_t b) -- no prototype visible, confirm
+ mov %edi, %eax		# eax = a
+ shr $0x1f, %edi		# edi = sign bit of a (0 or 1)
+ add $0x7fffffff, %edi	# edi = 0x7fffffff if a >= 0, 0x80000000 if a < 0 (the clamp value)
+ sub %esi, %eax		# eax = a - b; OF is set on signed overflow
+ cmovo %edi, %eax		# on overflow, return the clamp value instead of the wrapped difference
+ retq
+.size sat_subs32b, .-sat_subs32b
+
+.globl sat_divs32b
+.type sat_divs32b,@function
+sat_divs32b:			# computes a / b with a in %edi, b in %esi, quotient in %eax; presumably int32_t sat_divs32b(int32_t a, int32_t b) -- no prototype visible, confirm
+ mov %edi, %eax		# eax = a (the dividend)
+ lea 0x1(%rsi), %edx		# edx = b + 1, which is zero only when b == -1
+ add $0x80000000, %edi	# edi = a + 0x80000000, which is zero only when a == 0x80000000
+ or %edx, %edi		# edi == 0 only for the overflowing case a == 0x80000000, b == -1
+ cdq				# sign-extend eax into edx:eax for idiv (the temporary in edx is no longer needed)
+ neg %edi			# CF = 1 if edi was non-zero, CF = 0 in the overflowing case
+ sbb $-1, %eax		# eax = eax + 1 - CF: the dividend is bumped to 0x80000001 only in the overflowing case
+ idiv %esi			# eax = quotient; the bumped dividend divided by -1 yields 0x7fffffff instead of faulting
+ retq				# NOTE(review): b == 0 is not guarded; idiv raises a divide error in that case
+.size sat_divs32b, .-sat_divs32b
+
+.globl sat_muls32b
+.type sat_muls32b,@function
+sat_muls32b:			# computes a * b with a in %edi, b in %esi, result in %eax; presumably int32_t sat_muls32b(int32_t a, int32_t b) -- no prototype visible, confirm
+ mov %edi, %eax		# eax = a
+ xor %esi, %edi		# sign bit of edi = 1 when a and b have different signs
+ shr $0x1f, %edi		# edi = 0 if the signs match, 1 if they differ
+ add $0x7fffffff, %edi	# edi = 0x7fffffff for a positive product, 0x80000000 for a negative one (the clamp value)
+ imul %esi			# edx:eax = eax * esi; CF/OF are set when the product does not fit in 32 bits
+ cmovc %edi, %eax		# on overflow, return the clamp value instead of the truncated product
+ retq
+.size sat_muls32b, .-sat_muls32b
+
diff --git a/libgsmhr/mmx.c b/libgsmhr/mmx.c
new file mode 100644
index 0000000..8e4fad2
--- /dev/null
+++ b/libgsmhr/mmx.c
@@ -0,0 +1,71 @@
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+
+typedef int16_t v4hi __attribute__ ((vector_size (8))); /* 8-byte vector holding four int16_t lanes */
+typedef int32_t v2si __attribute__ ((vector_size (8))); /* 8-byte vector holding two int32_t lanes */
+
+union v4_s16 { /* the same 8 bytes viewed as a v4hi vector or as four int16_t elements */
+ v4hi v;
+ int16_t s16[4];
+};
+
+union v2_s32 { /* the same 8 bytes viewed as a v2si vector or as two int32_t elements */
+ v2si v;
+ int32_t s32[2];
+};
+
+extern int32_t sat_adds32b(int32_t a, int32_t b); /* saturating 32-bit add, implemented in asm.s */
+
+/* Multiply-accumulate of 8 16-bit values, without saturation.
+ * Returns the sum of the products _x[i] * _y[i] for i = 0..7, shifted
+ * left by one bit. None of the additions nor the final shift is guarded
+ * against overflow.
+ * NOTE(review): _x and _y are cast to 8-byte vector unions, so this
+ * assumes each points to 8 readable int16_t values that are suitably
+ * aligned for a vector access -- confirm against the callers.
+ * NOTE(review): no EMMS is issued after the MMX builtins; check whether
+ * callers rely on x87 floating point afterwards. */
+int32_t mmx_mac_unsat(int16_t *_x, int16_t *_y)
+{
+ register union v4_s16 *x = (union v4_s16 *)_x;
+ union v4_s16 *y = (union v4_s16 *)_y;
+ union v2_s32 im1, im2, r; /* r is not used in this function */
+
+ /* first group of four samples:
+ * im1.s32[0] = x->s16[0] * y->s16[0] + x->s16[1] * y->s16[1]
+ * im1.s32[1] = x->s16[2] * y->s16[2] + x->s16[3] * y->s16[3] */
+ im1.v = __builtin_ia32_pmaddwd(x->v, y->v);
+ /* NOTE(review): this was annotated "results are saturated", but the
+ * Intel SDM describes PMADDWD as wrapping (to 0x80000000 when all four
+ * input words of a pair are 0x8000), not saturating -- confirm */
+
+ /* second group of four samples, i.e. (x+1)->s16[0..3], written below
+ * with the flat indices 4..7:
+ * im2.s32[0] = x->s16[4] * y->s16[4] + x->s16[5] * y->s16[5]
+ * im2.s32[1] = x->s16[6] * y->s16[6]+ x->s16[7] * y->s16[7] */
+ im2.v = __builtin_ia32_pmaddwd((x+1)->v, (y+1)->v);
+ /* NOTE(review): same remark as above, PMADDWD wraps rather than
+ * saturates -- confirm */
+
+ /* lane-wise sum of the two partial results:
+ * im1.s32[0] = im1.s32[0] + im2.s32[0]
+ * im1.s32[1] = im1.s32[1] + im2.s32[1] */
+ /* FIXME: overflow in addition could happen */
+ im1.v = __builtin_ia32_paddd(im1.v, im2.v);
+
+ /* FIXME: overflow in addition could happen; the << 1 is not guarded
+ * against overflow either */
+ return (im1.s32[0] + im1.s32[1]) << 1;
+}
+
+/* Multiply-accumulate of 8 16-bit values, with saturating additions.
+ * Returns the sum of the products _x[i] * _y[i] for i = 0..7, shifted
+ * left by one bit. The three 32-bit additions go through sat_adds32b();
+ * the final shift is a plain C shift and is not saturated.
+ * NOTE(review): _x and _y are cast to 8-byte vector unions, so this
+ * assumes each points to 8 readable int16_t values that are suitably
+ * aligned for a vector access -- confirm against the callers.
+ * NOTE(review): no EMMS is issued after the MMX builtins; check whether
+ * callers rely on x87 floating point afterwards. */
+int32_t mmx_mac_sat(int16_t *_x, int16_t *_y)
+{
+ register union v4_s16 *x = (union v4_s16 *)_x;
+ union v4_s16 *y = (union v4_s16 *)_y;
+ union v2_s32 im1, im2, r; /* r is not used in this function */
+
+ /* first group of four samples:
+ * im1.s32[0] = x->s16[0] * y->s16[0] + x->s16[1] * y->s16[1]
+ * im1.s32[1] = x->s16[2] * y->s16[2] + x->s16[3] * y->s16[3] */
+ im1.v = __builtin_ia32_pmaddwd(x->v, y->v);
+ /* NOTE(review): this was annotated "results are saturated", but the
+ * Intel SDM describes PMADDWD as wrapping (to 0x80000000 when all four
+ * input words of a pair are 0x8000), not saturating -- confirm */
+
+ /* second group of four samples, i.e. (x+1)->s16[0..3], written below
+ * with the flat indices 4..7:
+ * im2.s32[0] = x->s16[4] * y->s16[4] + x->s16[5] * y->s16[5]
+ * im2.s32[1] = x->s16[6] * y->s16[6]+ x->s16[7] * y->s16[7] */
+ im2.v = __builtin_ia32_pmaddwd((x+1)->v, (y+1)->v);
+ /* NOTE(review): same remark as above, PMADDWD wraps rather than
+ * saturates -- confirm */
+
+ /* lane-wise saturating sum of the two partial results:
+ * im1.s32[0] = im1.s32[0] + im2.s32[0]
+ * im1.s32[1] = im1.s32[1] + im2.s32[1] */
+ im1.s32[0] = sat_adds32b(im1.s32[0], im2.s32[0]);
+ im1.s32[1] = sat_adds32b(im1.s32[1], im2.s32[1]);
+
+ /* FIXME: the addition itself is saturated by sat_adds32b(), but the
+ * final << 1 is not and can still overflow */
+ return sat_adds32b(im1.s32[0], im1.s32[1]) << 1;
+}