POWERPC_PERF_DECLARE(altivec_gmc1_num, GMC1_PERF_COND);
    const DECLARE_ALIGNED_16(unsigned short, rounder_a[8]) =
        {rounder, rounder, rounder, rounder,
         rounder, rounder, rounder, rounder};
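    // rounder is splatted across all 8 lanes; this array presumably feeds
    // rounderV, the initial addend of the vec_mladd chain below.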
    const DECLARE_ALIGNED_16(unsigned short, ABCD[8]) =
        {
            (16-x16)*(16-y16), /* A */
            (   x16)*(16-y16), /* B */
            (16-x16)*(   y16), /* C */
            (   x16)*(   y16), /* D */
            0, 0, 0, 0         /* padding */
        };
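    // A, B, C and D are the bilinear weights of the four source pixels
    // around the sub-pel position (x16/16, y16/16); they always sum to
    // 16*16 = 256, which is why the accumulated result is shifted right
    // by 8 (i.e. divided by 256) further down.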
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vcsr8 = (const vector unsigned short)vec_splat_u16(8);
    register vector unsigned char dstv, dstv2, src_0, src_1, srcvA, srcvB, srcvC, srcvD;
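    // vczero doubles as the zero source for vec_mergeh (zero-extension)
    // and as the zero filler for vec_pack; vcsr8 holds the per-lane shift
    // count for the final >> 8 normalization.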

    src_1 = vec_ld(16, src);
    srcvA = vec_perm(src_0, src_1, vec_lvsl(0, src));
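    // classic AltiVec unaligned load: vec_ld only reads from 16-byte
    // aligned addresses, so two adjacent aligned loads are combined with
    // a vec_perm whose control vector, vec_lvsl(0, src), rotates the
    // bytes we actually want down to the start of the register.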
    if (src_really_odd != 0x0000000F) {
        // if src & 0xF == 0xF, then (src+1) is properly aligned
        // on the second vector.
        srcvB = vec_perm(src_0, src_1, vec_lvsl(1, src));
    } else {
        // (src+1) is already 16-byte aligned: the second load is the data.
        srcvB = src_1;
    }
    srcvA = vec_mergeh(vczero, srcvA);
    srcvB = vec_mergeh(vczero, srcvB);
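    // vec_mergeh with vczero interleaves a zero byte above each of the
    // 8 low bytes, zero-extending them to unsigned shorts for vec_mladd.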

    dst_odd = (unsigned long)dst & 0x0000000F;
    src_really_odd = (((unsigned long)src) + stride) & 0x0000000F;

    dstv = vec_ld(0, dst);
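    // dst is loaded up-front because the 16-byte store at the bottom may
    // only rewrite 8 of those bytes: the other 8 must be put back intact.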
    // we'll be able to pick up our 9 char elements
    // at src + stride from those 32 bytes,
    // then reuse the resulting 2 vectors srcvC and srcvD
    // as the next srcvA and srcvB
    src_0 = vec_ld(stride + 0, src);
    src_1 = vec_ld(stride + 16, src);
    srcvC = vec_perm(src_0, src_1, vec_lvsl(stride + 0, src));
    if (src_really_odd != 0x0000000F) {
        // if src & 0xF == 0xF, then (src+1) is properly aligned
        // on the second vector.
        srcvD = vec_perm(src_0, src_1, vec_lvsl(stride + 1, src));
    } else {
        // (src+1) is already 16-byte aligned: the second load is the data.
        srcvD = src_1;
    }
    srcvC = vec_mergeh(vczero, srcvC);
    srcvD = vec_mergeh(vczero, srcvD);
    // OK, now we (finally) do the math :-)
    // these four instructions replace 32 int muls & 32 int adds.
    // isn't AltiVec nice?
    tempA = vec_mladd((vector unsigned short)srcvA, Av, rounderV);
    tempB = vec_mladd((vector unsigned short)srcvB, Bv, tempA);
    tempC = vec_mladd((vector unsigned short)srcvC, Cv, tempB);
    tempD = vec_mladd((vector unsigned short)srcvD, Dv, tempC);
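    // vec_mladd(a, b, c) computes a*b + c in each 16-bit lane, so the
    // chain above evaluates, per pixel (scalar sketch of the same thing):
    //   dst[x] = (A*src[x]        + B*src[x+1]
    //           + C*src[x+stride] + D*src[x+stride+1] + rounder) >> 8;
    // Since the weights sum to 256 and samples are at most 255, the sum
    // stays within 16 bits for the small rounder values used here.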
    tempD = vec_sr(tempD, vcsr8);
    dstv2 = vec_pack(tempD, (vector unsigned short)vczero);
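    // vec_pack keeps the low byte of each 16-bit lane; after the >> 8
    // above every value fits in 8 bits, so the narrowing is lossless.
    // The second half of dstv2 is zero padding from vczero.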
    if (dst_odd) {
        // dst starts in the upper half of its aligned 16 bytes: keep the
        // first 8 bytes of dstv, append our 8 freshly computed bytes.
        dstv2 = vec_perm(dstv, dstv2, vcprm(0,1,s0,s1));
    } else {
        // dst starts at the aligned address: our 8 bytes come first and
        // the untouched upper half of dstv is preserved.
        dstv2 = vec_perm(dstv, dstv2, vcprm(s0,s1,2,3));
    }
    vec_st(dstv2, 0, dst);
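    // vec_st always stores 16 bytes at an aligned address, which is why
    // dstv was merged back in above: the neighbouring 8 bytes get written
    // back unchanged.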
POWERPC_PERF_STOP_COUNT(altivec_gmc1_num, GMC1_PERF_COND);