summaryrefslogtreecommitdiffstats
path: root/libgsmhr/mmx.c
blob: 8e4fad26dbf80e0a30d73af55756b9042617bd50 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>

/* GCC vector-extension types, each 8 bytes wide (vector_size is given in
 * bytes): v4hi holds four int16_t lanes, v2si holds two int32_t lanes. */
typedef int16_t v4hi __attribute__ ((vector_size (8)));
typedef int32_t v2si __attribute__ ((vector_size (8)));

/* Overlays a v4hi vector with an array of its four 16-bit lanes, so the same
 * 8 bytes can be handed to a vector builtin (v) or read lane by lane (s16). */
union v4_s16 {
	v4hi	v;
	int16_t	s16[4];
};

/* Overlays a v2si vector with an array of its two 32-bit lanes, so a vector
 * result (v) can be read back as individual int32_t values (s32). */
union v2_s32 {
	v2si	v;
	int32_t s32[2];
};

/* Defined outside this file; mmx_mac_sat() uses it to add pairs of 32-bit
 * values.  Presumably a saturating add, going by the name -- confirm against
 * its definition. */
extern int32_t sat_adds32b(int32_t a, int32_t b);

/* multiply-accumulate of 8 16-bit values */
int32_t mmx_mac_unsat(int16_t *_x, int16_t *_y)
{
	register union v4_s16 *x = (union v4_s16 *)_x;
	union v4_s16 *y = (union v4_s16 *)_y;
	union v2_s32 im1, im2, r;

	/* im1.s32[0] = x->s16[0] * y->s16[0] + x->s16[1] * y->s16[1]
	 * im1.s32[1] = x->s16[2] * y->s16[2] + x->s16[3] * y->s16[3] */
	im1.v = __builtin_ia32_pmaddwd(x->v, y->v);
	/* results are saturated */

	/* im2.s32[0] = x->s16[4] * y->s16[4] + x->s16[5] * y->s16[5]
	 * im2.s32[1] = x->s16[6] * y->s16[6]+ x->s16[7] * y->s16[7] */
	im2.v = __builtin_ia32_pmaddwd((x+1)->v, (y+1)->v);
	/* results are saturated */

	/* im1.s32[0] = im1.s32[0] + im2.s32[0]
	 * im1.s32[1] = im1.s32[1] + im2.s32[1] */
	/* FIXME: overflow in addition could happen */
	im1.v = __builtin_ia32_paddd(im1.v, im2.v);

	/* FIXME: overflow in addition could happen */
	return (im1.s32[0] + im1.s32[1]) << 1;
}

/* multiply-accumulate of 8 16-bit values */
int32_t mmx_mac_sat(int16_t *_x, int16_t *_y)
{
	register union v4_s16 *x = (union v4_s16 *)_x;
	union v4_s16 *y = (union v4_s16 *)_y;
	union v2_s32 im1, im2, r;

	/* im1.s32[0] = x->s16[0] * y->s16[0] + x->s16[1] * y->s16[1]
	 * im1.s32[1] = x->s16[2] * y->s16[2] + x->s16[3] * y->s16[3] */
	im1.v = __builtin_ia32_pmaddwd(x->v, y->v);
	/* results are saturated */

	/* im2.s32[0] = x->s16[4] * y->s16[4] + x->s16[5] * y->s16[5]
	 * im2.s32[1] = x->s16[6] * y->s16[6]+ x->s16[7] * y->s16[7] */
	im2.v = __builtin_ia32_pmaddwd((x+1)->v, (y+1)->v);
	/* results are saturated */

	/* im1.s32[0] = im1.s32[0] + im2.s32[0]
	 * im1.s32[1] = im1.s32[1] + im2.s32[1] */
	im1.s32[0] = sat_adds32b(im1.s32[0], im2.s32[0]);
	im1.s32[1] = sat_adds32b(im1.s32[1], im2.s32[1]);

	/* FIXME: overflow in addition could happen */
	return sat_adds32b(im1.s32[0], im1.s32[1]) << 1;
}