diff options
author | Harald Welte <laforge@gnumonks.org> | 2015-12-27 19:18:14 +0100 |
---|---|---|
committer | Harald Welte <laforge@gnumonks.org> | 2015-12-27 19:18:14 +0100 |
commit | 1ea1d362fd3f239840170d1746f735e5e5ad1fdb (patch) | |
tree | b15604c3e633d1d1f49b9f9d328e684a376592f2 | |
parent | 7a04624563ac72b7237e39d6fcee5dab96f4d948 (diff) |
WIP on adding MMX support for some of the HR codec primitives
-rw-r--r-- | configure.ac | 1 | ||||
-rw-r--r-- | libgsmhr/Makefile.am | 6 | ||||
-rw-r--r-- | libgsmhr/asm.s | 49 | ||||
-rw-r--r-- | libgsmhr/mmx.c | 71 |
4 files changed, 124 insertions, 3 deletions
diff --git a/configure.ac b/configure.ac
index 1208025..70232c3 100644
--- a/configure.ac
+++ b/configure.ac
@@ -45,6 +45,7 @@ AC_SUBST(SYMBOL_VISIBILITY)
 
 # Checks for programs.
 AC_PROG_CC
+AM_PROG_AS
 AC_PROG_LIBTOOL
 
 # Checks for libraries.
diff --git a/libgsmhr/Makefile.am b/libgsmhr/Makefile.am
index 07390f9..a3a5a83 100644
--- a/libgsmhr/Makefile.am
+++ b/libgsmhr/Makefile.am
@@ -4,7 +4,7 @@ AM_CFLAGS = -fPIC -Wall ${SYMBOL_VISIBILITY}
 LIBVERSION=0:0:0
 
 REFSRC_PATH=refsrc
-REFSRC_SRC=refsrc/dtx.c refsrc/globdefs.c refsrc/host.c refsrc/mathhalf.c refsrc/sp_enc.c refsrc/sp_rom.c refsrc/vad.c refsrc/err_conc.c refsrc/homing.c refsrc/mathdp31.c refsrc/sp_dec.c refsrc/sp_frm.c refsrc/sp_sfrm.c
+REFSRC_SRC=refsrc/dtx.c refsrc/globdefs.c refsrc/host.c refsrc/mathhalf.c refsrc/sp_enc.c refsrc/sp_rom.c refsrc/vad.c refsrc/err_conc.c refsrc/homing.c refsrc/mathdp31.c refsrc/sp_dec.c refsrc/sp_frm.c refsrc/sp_sfrm.c asm.s mmx.c
 
 ${REFSRC_PATH}/.downloaded:
 	./fetch_sources.py "${REFSRC_PATH}"
@@ -19,5 +19,5 @@ ${REFSRC_PATH}/dtx.c: ${REFSRC_PATH}/.downloaded
 lib_LTLIBRARIES = libgsmhr.la
 libgsmhr_la_SOURCES = $(REFSRC_SRC) libgsmhr.c
 
-clean-local:
-	-rm -rf ${REFSRC_PATH}
+#clean-local:
+#	-rm -rf ${REFSRC_PATH}
diff --git a/libgsmhr/asm.s b/libgsmhr/asm.s
new file mode 100644
index 0000000..d1d491e
--- /dev/null
+++ b/libgsmhr/asm.s
@@ -0,0 +1,49 @@
+
+.globl sat_adds32b
+.type sat_adds32b,@function
+sat_adds32b:
+	mov %edi, %eax
+	shr $0x1f, %edi
+	add $0x7fffffff, %edi
+	add %esi, %eax
+	cmovo %edi, %eax
+	retq
+.size sat_adds32b, .-sat_adds32b
+
+.globl sat_subs32b
+.type sat_subs32b,@function
+sat_subs32b:
+	mov %edi, %eax
+	shr $0x1f, %edi
+	add $0x7fffffff, %edi
+	sub %esi, %eax
+	cmovo %edi, %eax
+	retq
+.size sat_subs32b, .-sat_subs32b
+
+.globl sat_divs32b
+.type sat_divs32b,@function
+sat_divs32b:
+	mov %edi, %eax
+	lea 0x1(%rsi), %edx
+	add $0x80000000, %edi
+	or %edx, %edi
+	cdq
+	neg %edi
+	sbb $-1, %eax
+	idiv %esi
+	retq
+.size sat_divs32b, .-sat_divs32b
+
+.globl sat_muls32b
+.type sat_muls32b,@function
+sat_muls32b:
+	mov %edi, %eax
+	xor %esi, %edi
+	shr $0x1f, %edi
+	add $0x7fffffff, %edi
+	imul %esi
+	cmovc %edi, %eax
+	retq
+.size sat_muls32b, .-sat_muls32b
+
diff --git a/libgsmhr/mmx.c b/libgsmhr/mmx.c
new file mode 100644
index 0000000..8e4fad2
--- /dev/null
+++ b/libgsmhr/mmx.c
@@ -0,0 +1,71 @@
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+
+typedef int16_t v4hi __attribute__ ((vector_size (8)));
+typedef int32_t v2si __attribute__ ((vector_size (8)));
+
+union v4_s16 {
+	v4hi v;
+	int16_t s16[4];
+};
+
+union v2_s32 {
+	v2si v;
+	int32_t s32[2];
+};
+
+extern int32_t sat_adds32b(int32_t a, int32_t b);
+
+/* multiply-accumulate of 8 16-bit values */
+int32_t mmx_mac_unsat(int16_t *_x, int16_t *_y)
+{
+	register union v4_s16 *x = (union v4_s16 *)_x;
+	union v4_s16 *y = (union v4_s16 *)_y;
+	union v2_s32 im1, im2, r;
+
+	/* im1.s32[0] = x->s16[0] * y->s16[0] + x->s16[1] * y->s16[1]
+	 * im1.s32[1] = x->s16[2] * y->s16[2] + x->s16[3] * y->s16[3] */
+	im1.v = __builtin_ia32_pmaddwd(x->v, y->v);
+	/* results are saturated */
+
+	/* im2.s32[0] = x->s16[4] * y->s16[4] + x->s16[5] * y->s16[5]
+	 * im2.s32[1] = x->s16[6] * y->s16[6]+ x->s16[7] * y->s16[7] */
+	im2.v = __builtin_ia32_pmaddwd((x+1)->v, (y+1)->v);
+	/* results are saturated */
+
+	/* im1.s32[0] = im1.s32[0] + im2.s32[0]
+	 * im1.s32[1] = im1.s32[1] + im2.s32[1] */
+	/* FIXME: overflow in addition could happen */
+	im1.v = __builtin_ia32_paddd(im1.v, im2.v);
+
+	/* FIXME: overflow in addition could happen */
+	return (im1.s32[0] + im1.s32[1]) << 1;
+}
+
+/* multiply-accumulate of 8 16-bit values */
+int32_t mmx_mac_sat(int16_t *_x, int16_t *_y)
+{
+	register union v4_s16 *x = (union v4_s16 *)_x;
+	union v4_s16 *y = (union v4_s16 *)_y;
+	union v2_s32 im1, im2, r;
+
+	/* im1.s32[0] = x->s16[0] * y->s16[0] + x->s16[1] * y->s16[1]
+	 * im1.s32[1] = x->s16[2] * y->s16[2] + x->s16[3] * y->s16[3] */
+	im1.v = __builtin_ia32_pmaddwd(x->v, y->v);
+	/* results are saturated */
+
+	/* im2.s32[0] = x->s16[4] * y->s16[4] + x->s16[5] * y->s16[5]
+	 * im2.s32[1] = x->s16[6] * y->s16[6]+ x->s16[7] * y->s16[7] */
+	im2.v = __builtin_ia32_pmaddwd((x+1)->v, (y+1)->v);
+	/* results are saturated */
+
+	/* im1.s32[0] = im1.s32[0] + im2.s32[0]
+	 * im1.s32[1] = im1.s32[1] + im2.s32[1] */
+	im1.s32[0] = sat_adds32b(im1.s32[0], im2.s32[0]);
+	im1.s32[1] = sat_adds32b(im1.s32[1], im2.s32[1]);
+
+	/* FIXME: overflow in addition could happen */
+	return sat_adds32b(im1.s32[0], im1.s32[1]) << 1;
+}
|