From 12caf496aa2649559c364753c15a027251c95e0d Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Sun, 27 Dec 2015 19:23:18 +0100 Subject: WIP: add laforge-mmx.diff to illustrate MMX related changes I tried --- libgsmhr/laforge-mmx.diff | 167 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 167 insertions(+) create mode 100644 libgsmhr/laforge-mmx.diff diff --git a/libgsmhr/laforge-mmx.diff b/libgsmhr/laforge-mmx.diff new file mode 100644 index 0000000..a846c25 --- /dev/null +++ b/libgsmhr/laforge-mmx.diff @@ -0,0 +1,167 @@ +Only in refsrc: .downloaded +Only in refsrc: .sp_frm.c.swp +diff -u refsrc.orig/sp_frm.c refsrc/sp_frm.c +--- refsrc.orig/sp_frm.c 2015-12-27 19:22:13.966296058 +0100 ++++ refsrc/sp_frm.c 2014-05-13 22:43:56.786205819 +0200 +@@ -60,6 +60,13 @@ + * + **************************************************************************/ + ++#include ++ ++#include ++//#define HAVE_MMX ++extern int32_t mmx_mac_unsat(int16_t *_x, int16_t *_y); ++extern int32_t mmx_mac_sat(int16_t *_x, int16_t *_y); ++ + /*_________________________________________________________________________ + | | + | Include Files | +@@ -384,6 +391,7 @@ + /* get a vector */ + /*--------------*/ + ++ // 16, 32 or 64 iterations + getNextVec(pswRc); + + /* clear the limiter flag */ +@@ -432,6 +440,8 @@ + for (iCnt = 0; iCnt < quantList.iNum; iCnt++) + { + ++ // 4 * 16, 32 or 64 iterations ++ + /* get a vector */ + /*--------------*/ + +@@ -931,6 +941,7 @@ + + for (i = 0; i <= bound; i++) + { ++ // 3-4 iterations + L_sum = L_mac(L_ROUND, pswVOld[i], pswQntRc[j]); + L_sum = L_mac(L_sum, pswVOld[-i], pswQntRc[j]); + L_sum = L_mac(L_sum, pswPOld[i], pswQntRcSqd[j]); +@@ -949,6 +960,7 @@ + + for (i = -bound; i < 0; i++) + { ++ // 3-4 iterations + L_sum = L_msu(L_ROUND, pswVOld[i + 1], SW_MIN); + L_sum = L_mac(L_sum, pswQntRcSqd[j], pswVOld[-i - 1]); + L_sum = L_mac(L_sum, pswQntRc[j], pswPOld[-i - 1]); +@@ -958,6 +970,7 @@ + + for (i = 0; i <= bound; i++) + { ++ // 3-4 iterations + 
L_sum = L_msu(L_ROUND, pswVOld[i + 1], SW_MIN); + L_sum = L_mac(L_sum, pswQntRcSqd[j], pswVOld[-i - 1]); + L_sum = L_mac(L_sum, pswQntRc[j], pswPOld[i + 1]); +@@ -4536,6 +4549,11 @@ + + pswScaledWSpeech = pswScaledWSpeechBuffer + LSMAX; + ++#if 0 ++ printf("G_FRAME_LEN=%d\n", G_FRAME_LEN); ++ printf("LSMIN=%d\n", LSMIN); ++ printf("LSMAX=%d\n", LSMAX); ++#endif + /*_________________________________________________________________________ + | | + | Executable Code | +@@ -4633,11 +4651,20 @@ + /*---------------------------------------------------------------------*/ + + L_G = 0; ++#ifndef HAVE_MMX ++ // 40 iterations (MMX: 5) + for (i = -LSMAX; i < -LSMAX + S_LEN; i++) + L_G = L_mac(L_G, pswScaledWSpeech[i], pswScaledWSpeech[i]); ++#else ++ for (i = -LSMAX; i < -LSMAX + S_LEN; i+=8) ++ L_G += mmx_mac_unsat(&pswScaledWSpeech[i], &pswScaledWSpeech[i]); ++#endif + + pswGFrame[G_FRAME_LEN - 1] = extract_h(L_G); + ++//#ifndef HAVE_MMX ++#if 1 ++ // 248 iterations (MMX: 31) + for (i = -LSMAX; i < G_FRAME_LEN - LSMAX - 1; i++) + { + +@@ -4646,6 +4673,13 @@ + pswScaledWSpeech[i + S_LEN]); + pswGFrame[G_FRAME_LEN - LSMAX - 2 - i] = extract_h(L_G); + } ++#else ++ for (i = -LSMAX; i < G_FRAME_LEN - LSMAX - 1; i+= 8) { ++ L_G -= mmx_mac_sat(&pswScaledWSpeech[i], &pswScaledWSpeech[i]); ++ L_G += mmx_mac_sat(&pswScaledWSpeech[i + S_LEN], ++ &pswScaledWSpeech[i + S_LEN]); ++ } ++#endif + + ppswGSfrm[0] = pswGFrame + 3 * S_LEN; + ppswGSfrm[1] = pswGFrame + 2 * S_LEN; +@@ -4661,8 +4695,14 @@ + pswSfrmEng[2] = pswGFrame[G_FRAME_LEN - 1 - LSMAX - 2 * S_LEN]; + + L_WSfrmEng = 0; ++#ifndef HAVE_MMX ++ // 40 iterations (MMX: 5) + for (i = F_LEN - S_LEN; i < F_LEN; i++) + L_WSfrmEng = L_mac(L_WSfrmEng, pswScaledWSpeech[i], pswScaledWSpeech[i]); ++#else ++ for (i = F_LEN - S_LEN; i < F_LEN; i+= 8) ++ L_WSfrmEng += mmx_mac_unsat(&pswScaledWSpeech[i], &pswScaledWSpeech[i]); ++#endif + + pswSfrmEng[3] = extract_h(L_WSfrmEng); + +@@ -4671,19 +4711,26 @@ + /* as in the G buffer.) 
*/ + /*------------------------------------------------------------*/ + ++ // 4 iterations + for (i = 0; i < N_SUB; i++) + { + ++ // 127 iterations + for (j = LSMIN; j <= LSMAX; j++) + { +- + L_C = 0; ++#ifndef HAVE_MMX ++ // 4*127*40 iterations (MMX: 4*127*5) + for (k = 0; k < S_LEN; k++) + { +- + L_C = L_mac(L_C, pswScaledWSpeech[i * S_LEN + k], + pswScaledWSpeech[i * S_LEN - j + k]); + } ++#else ++ for (k = 0; k < S_LEN; k+= 8) ++ L_C += mmx_mac_unsat(&pswScaledWSpeech[i*S_LEN + k], ++ &pswScaledWSpeech[i*S_LEN - j + k]); ++#endif + + pswCFrame[i * CG_TERMS + j - LSMIN] = extract_h(L_C); + } +@@ -4750,6 +4797,7 @@ + + L_Voicing = 0; + for (i = 0; i < N_SUB; i++) ++ // 4 Iterations + L_Voicing = L_mac(L_Voicing, pswSfrmEng[i], UV_SCALE0); + + L_Voicing = L_add(L_Voicing, L_deposit_h(swBestPG)); +@@ -5085,6 +5133,7 @@ + siLowestSoFar = 2; + for (i = 0; i < N_SUB; i++) + { ++ // 4 iterations + + /* Check this subframe against highest voicing threshold */ + /*-------------------------------------------------------*/ -- cgit v1.2.3