/*
 * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
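
/* H.264 chroma motion compensation is a bilinear interpolation of the four
 * samples around the fractional position (x, y), with weights
 * A = (8-x)*(8-y), B = x*(8-y), C = (8-x)*y, D = x*y and rounding
 * (sum + 32) >> 6. The function below computes eight pixels per iteration,
 * with the four weights splatted across vA..vD. */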

/* this code assumes that stride % 16 == 0 */
void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) {
    POWERPC_PERF_DECLARE(PREFIX_h264_chroma_mc8_num, 1);
    signed int ABCD[4] __attribute__((aligned(16))) =
                        {((8 - x) * (8 - y)),
                         ((    x) * (8 - y)),
                         ((8 - x) * (    y)),
                         ((    x) * (    y))};
    register int i;
    vector unsigned char fperm;
    const vector signed int vABCD = vec_ld(0, ABCD);
    const vector signed short vA = vec_splat((vector signed short)vABCD, 1);
    const vector signed short vB = vec_splat((vector signed short)vABCD, 3);
    const vector signed short vC = vec_splat((vector signed short)vABCD, 5);
    const vector signed short vD = vec_splat((vector signed short)vABCD, 7);
    const vector signed int vzero = vec_splat_s32(0);
    const vector signed short v32ss = vec_sl(vec_splat_s16(1), vec_splat_u16(5));
    const vector unsigned short v6us = vec_splat_u16(6);
    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
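
    /* Each row needs 9 consecutive bytes starting at src. If src % 16 <= 7
     * they all come from one aligned load; otherwise (loadSecond) a second
     * load is required. reallyBadAlign flags src % 16 == 15, where
     * vec_lvsl(1, src) wraps to {0..15} and vec_perm would select from the
     * wrong vector, so the second vector is used directly instead. */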

    vector unsigned char vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1;
    vector unsigned char vsrc0uc, vsrc1uc;
    vector signed short vsrc0ssH, vsrc1ssH;
    vector unsigned char vsrcCuc, vsrc2uc, vsrc3uc;
    vector signed short vsrc2ssH, vsrc3ssH, psum;
    vector unsigned char vdst, ppsum, vfdst, fsum;

    POWERPC_PERF_START_COUNT(PREFIX_h264_chroma_mc8_num, 1);

    if (((unsigned long)dst) % 16 == 0) {
        fperm = (vector unsigned char)AVV(0x10, 0x11, 0x12, 0x13,
                                          0x14, 0x15, 0x16, 0x17,
                                          0x08, 0x09, 0x0A, 0x0B,
                                          0x0C, 0x0D, 0x0E, 0x0F);
    } else {
        fperm = (vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03,
                                          0x04, 0x05, 0x06, 0x07,
                                          0x18, 0x19, 0x1A, 0x1B,
                                          0x1C, 0x1D, 0x1E, 0x1F);
    }

    vsrcAuc = vec_ld(0, src);

    if (loadSecond)
        vsrcBuc = vec_ld(16, src);
    vsrcperm0 = vec_lvsl(0, src);
    vsrcperm1 = vec_lvsl(1, src);

    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
    if (reallyBadAlign)
        vsrc1uc = vsrcBuc;
    else
        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);

    vsrc0ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
                                               (vector unsigned char)vsrc0uc);
    vsrc1ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
                                               (vector unsigned char)vsrc1uc);
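
    /* The two loops below differ only in the source fetch: the first covers
     * the single-load case, the second loads two vectors per row and merges
     * them, special-casing reallyBadAlign as above. */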

    if (!loadSecond) { // -> !reallyBadAlign
        for (i = 0 ; i < h ; i++) {
            vsrcCuc = vec_ld(stride + 0, src);

            vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
            vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

            vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
                                                       (vector unsigned char)vsrc2uc);
            vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
                                                       (vector unsigned char)vsrc3uc);

            psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
            psum = vec_mladd(vB, vsrc1ssH, psum);
            psum = vec_mladd(vC, vsrc2ssH, psum);
            psum = vec_mladd(vD, vsrc3ssH, psum);
            psum = vec_add(v32ss, psum);
            psum = vec_sra(psum, v6us);

            vdst = vec_ld(0, dst);
            ppsum = (vector unsigned char)vec_packsu(psum, psum);
            vfdst = vec_perm(vdst, ppsum, fperm);

            OP_U8_ALTIVEC(fsum, vfdst, vdst);

            vec_st(fsum, 0, dst);

            vsrc0ssH = vsrc2ssH;
            vsrc1ssH = vsrc3ssH;

            dst += stride;
            src += stride;
        }
    } else {
        vector unsigned char vsrcDuc;
        for (i = 0 ; i < h ; i++) {
            vsrcCuc = vec_ld(stride + 0, src);
            vsrcDuc = vec_ld(stride + 16, src);

            vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
            if (reallyBadAlign)
                vsrc3uc = vsrcDuc;
            else
                vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

            vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
                                                       (vector unsigned char)vsrc2uc);
            vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
                                                       (vector unsigned char)vsrc3uc);

            psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
            psum = vec_mladd(vB, vsrc1ssH, psum);
            psum = vec_mladd(vC, vsrc2ssH, psum);
            psum = vec_mladd(vD, vsrc3ssH, psum);
            psum = vec_add(v32ss, psum);
            /* psum is non-negative and fits in 15 bits here, so the
             * unsigned shift and modulo pack match the vec_sra/vec_packsu
             * of the aligned loop */
            psum = vec_sr(psum, v6us);

            vdst = vec_ld(0, dst);
            ppsum = (vector unsigned char)vec_pack(psum, psum);
            vfdst = vec_perm(vdst, ppsum, fperm);

            OP_U8_ALTIVEC(fsum, vfdst, vdst);

            vec_st(fsum, 0, dst);

            vsrc0ssH = vsrc2ssH;
            vsrc1ssH = vsrc3ssH;

            dst += stride;
            src += stride;
        }
    }
    POWERPC_PERF_STOP_COUNT(PREFIX_h264_chroma_mc8_num, 1);
}
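
/* For reference only: a scalar sketch of the operation the loops above
 * vectorize. This is not part of the original file; chroma_mc8_c is a
 * hypothetical name, and the plain assignment assumes the `put' variant
 * of OP_U8_ALTIVEC:
 *
 *   static void chroma_mc8_c(uint8_t *dst, uint8_t *src, int stride,
 *                            int h, int x, int y)
 *   {
 *       const int A = (8 - x) * (8 - y), B = x * (8 - y),
 *                 C = (8 - x) * y,       D = x * y;
 *       int i, j;
 *       for (i = 0; i < h; i++) {
 *           for (j = 0; j < 8; j++)
 *               dst[j] = (A * src[j]          + B * src[j + 1] +
 *                         C * src[j + stride] + D * src[j + stride + 1] +
 *                         32) >> 6;
 *           dst += stride;
 *           src += stride;
 *       }
 *   }
 */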

/* this code assumes stride % 16 == 0 */
static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
    POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num, 1);

    register int i;
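
    /* The H.264 luma half-pel filter is the 6-tap (1, -5, 20, 20, -5, 1):
     *   dst[i] = clip((src[i-2] + src[i+3]
     *                  + 20 * (src[i] + src[i+1])
     *                  - 5  * (src[i-1] + src[i+2]) + 16) >> 5)
     * The loop below evaluates it on 16 pixels per iteration, split into a
     * high (A) and low (B) half of eight 16-bit lanes each; the saturating
     * pack supplies the final clip to 0..255. */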

    const vector signed int vzero = vec_splat_s32(0);
    const vector unsigned char permM2 = vec_lvsl(-2, src);
    const vector unsigned char permM1 = vec_lvsl(-1, src);
    const vector unsigned char permP0 = vec_lvsl(+0, src);
    const vector unsigned char permP1 = vec_lvsl(+1, src);
    const vector unsigned char permP2 = vec_lvsl(+2, src);
    const vector unsigned char permP3 = vec_lvsl(+3, src);
    const vector signed short v5ss = vec_splat_s16(5);
    const vector unsigned short v5us = vec_splat_u16(5);
    /* splat immediates are limited to [-16,15], so 20 and 16 are built
     * as 5 << 2 and 1 << 4 */
    const vector signed short v20ss = vec_sl(vec_splat_s16(5), vec_splat_u16(2));
    const vector signed short v16ss = vec_sl(vec_splat_s16(1), vec_splat_u16(4));
    const vector unsigned char dstperm = vec_lvsr(0, dst);
    const vector unsigned char neg1 =
        (const vector unsigned char) vec_splat_s8(-1);

    const vector unsigned char dstmask =
        vec_perm((const vector unsigned char)vzero, neg1, dstperm);
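
    /* dst is not necessarily 16-byte aligned and AltiVec has no unaligned
     * store, so the result is rotated into place with vec_lvsr/vec_perm and
     * merged with vec_sel into the two aligned vectors that span it, using
     * dstmask (zero bytes below the misalignment point, 0xFF above). */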

    vector unsigned char srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;

    register int align = ((((unsigned long)src) - 2) % 16);

    vector signed short srcP0A, srcP0B, srcP1A, srcP1B,
                        srcP2A, srcP2B, srcP3A, srcP3B,
                        srcM1A, srcM1B, srcM2A, srcM2B,
                        sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
                        pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
                        psumA, psumB, sumA, sumB;

    vector unsigned char sum, dst1, dst2, vdst, fsum,
                         rsum, fdst1, fdst2;

    POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);

    for (i = 0 ; i < 16 ; i ++) {
        vector unsigned char srcR1 = vec_ld(-2, src);
        vector unsigned char srcR2 = vec_ld(14, src);

        /* extract the six rows shifted by -2..+3; for align >= 11 one of
         * them starts exactly at srcR2 or needs a third aligned load */
        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vector unsigned char srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vector unsigned char srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vector unsigned char srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vector unsigned char srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }
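
        /* srcM2..srcP3 now hold the same 16 source bytes shifted by -2..+3;
         * everything below is pure 16-bit filter arithmetic on their
         * zero-extended halves. */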

        srcP0A = (vector signed short)
            vec_mergeh((vector unsigned char)vzero, srcP0);
        srcP0B = (vector signed short)
            vec_mergel((vector unsigned char)vzero, srcP0);
        srcP1A = (vector signed short)
            vec_mergeh((vector unsigned char)vzero, srcP1);
        srcP1B = (vector signed short)
            vec_mergel((vector unsigned char)vzero, srcP1);

        srcP2A = (vector signed short)
            vec_mergeh((vector unsigned char)vzero, srcP2);
        srcP2B = (vector signed short)
            vec_mergel((vector unsigned char)vzero, srcP2);
        srcP3A = (vector signed short)
            vec_mergeh((vector unsigned char)vzero, srcP3);
        srcP3B = (vector signed short)
            vec_mergel((vector unsigned char)vzero, srcP3);

        srcM1A = (vector signed short)
            vec_mergeh((vector unsigned char)vzero, srcM1);
        srcM1B = (vector signed short)
            vec_mergel((vector unsigned char)vzero, srcM1);
        srcM2A = (vector signed short)
            vec_mergeh((vector unsigned char)vzero, srcM2);
        srcM2B = (vector signed short)
            vec_mergel((vector unsigned char)vzero, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero);
        pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        dst1 = vec_ld(0, dst);
        dst2 = vec_ld(16, dst);
        vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst));

        OP_U8_ALTIVEC(fsum, sum, vdst);

        rsum = vec_perm(fsum, fsum, dstperm);
        fdst1 = vec_sel(dst1, rsum, dstmask);
        fdst2 = vec_sel(rsum, dst2, dstmask);

        vec_st(fdst1, 0, dst);
        vec_st(fdst2, 16, dst);

        src += srcStride;
        dst += dstStride;
    }
    POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);
}

/* this code assumes stride % 16 == 0 */
static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
    POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_v_lowpass_num, 1);

    register int i;

    const vector signed int vzero = vec_splat_s32(0);
    const vector unsigned char perm = vec_lvsl(0, src);
    const vector signed short v20ss = vec_sl(vec_splat_s16(5), vec_splat_u16(2));
    const vector unsigned short v5us = vec_splat_u16(5);
    const vector signed short v5ss = vec_splat_s16(5);
    const vector signed short v16ss = vec_sl(vec_splat_s16(1), vec_splat_u16(4));
    const vector unsigned char dstperm = vec_lvsr(0, dst);
    const vector unsigned char neg1 = (const vector unsigned char)vec_splat_s8(-1);
    const vector unsigned char dstmask = vec_perm((const vector unsigned char)vzero, neg1, dstperm);

    uint8_t *srcbis = src - (srcStride * 2);
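
    /* Vertical filter: each output row needs source rows -2..+3. Five rows
     * are loaded and widened once up front; the loop then loads only row +3
     * and renames the previous five (see the copies after sum3B below). */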

    const vector unsigned char srcM2a = vec_ld(0, srcbis);
    const vector unsigned char srcM2b = vec_ld(16, srcbis);
    const vector unsigned char srcM2 = vec_perm(srcM2a, srcM2b, perm);
    // srcbis += srcStride;
    const vector unsigned char srcM1a = vec_ld(0, srcbis += srcStride);
    const vector unsigned char srcM1b = vec_ld(16, srcbis);
    const vector unsigned char srcM1 = vec_perm(srcM1a, srcM1b, perm);
    // srcbis += srcStride;
    const vector unsigned char srcP0a = vec_ld(0, srcbis += srcStride);
    const vector unsigned char srcP0b = vec_ld(16, srcbis);
    const vector unsigned char srcP0 = vec_perm(srcP0a, srcP0b, perm);
    // srcbis += srcStride;
    const vector unsigned char srcP1a = vec_ld(0, srcbis += srcStride);
    const vector unsigned char srcP1b = vec_ld(16, srcbis);
    const vector unsigned char srcP1 = vec_perm(srcP1a, srcP1b, perm);
    // srcbis += srcStride;
    const vector unsigned char srcP2a = vec_ld(0, srcbis += srcStride);
    const vector unsigned char srcP2b = vec_ld(16, srcbis);
    const vector unsigned char srcP2 = vec_perm(srcP2a, srcP2b, perm);
    // srcbis += srcStride;

    vector signed short srcM2ssA = (vector signed short)
        vec_mergeh((vector unsigned char)vzero, srcM2);
    vector signed short srcM2ssB = (vector signed short)
        vec_mergel((vector unsigned char)vzero, srcM2);
    vector signed short srcM1ssA = (vector signed short)
        vec_mergeh((vector unsigned char)vzero, srcM1);
    vector signed short srcM1ssB = (vector signed short)
        vec_mergel((vector unsigned char)vzero, srcM1);
    vector signed short srcP0ssA = (vector signed short)
        vec_mergeh((vector unsigned char)vzero, srcP0);
    vector signed short srcP0ssB = (vector signed short)
        vec_mergel((vector unsigned char)vzero, srcP0);
    vector signed short srcP1ssA = (vector signed short)
        vec_mergeh((vector unsigned char)vzero, srcP1);
    vector signed short srcP1ssB = (vector signed short)
        vec_mergel((vector unsigned char)vzero, srcP1);
    vector signed short srcP2ssA = (vector signed short)
        vec_mergeh((vector unsigned char)vzero, srcP2);
    vector signed short srcP2ssB = (vector signed short)
        vec_mergel((vector unsigned char)vzero, srcP2);

    vector signed short pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
                        psumA, psumB, sumA, sumB,
                        srcP3ssA, srcP3ssB,
                        sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;

    vector unsigned char sum, dst1, dst2, vdst, fsum, rsum, fdst1, fdst2,
                         srcP3a, srcP3b, srcP3;

    POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);

    for (i = 0 ; i < 16 ; i++) {
        srcP3a = vec_ld(0, srcbis += srcStride);
        srcP3b = vec_ld(16, srcbis);
        srcP3 = vec_perm(srcP3a, srcP3b, perm);
        srcP3ssA = (vector signed short)
            vec_mergeh((vector unsigned char)vzero, srcP3);
        srcP3ssB = (vector signed short)
            vec_mergel((vector unsigned char)vzero, srcP3);
        // srcbis += srcStride;

        sum1A = vec_adds(srcP0ssA, srcP1ssA);
        sum1B = vec_adds(srcP0ssB, srcP1ssB);
        sum2A = vec_adds(srcM1ssA, srcP2ssA);
        sum2B = vec_adds(srcM1ssB, srcP2ssB);
        sum3A = vec_adds(srcM2ssA, srcP3ssA);
        sum3B = vec_adds(srcM2ssB, srcP3ssB);

        /* slide the six-row window down one row for the next iteration */
        srcM2ssA = srcM1ssA;
        srcM2ssB = srcM1ssB;
        srcM1ssA = srcP0ssA;
        srcM1ssB = srcP0ssB;
        srcP0ssA = srcP1ssA;
        srcP0ssB = srcP1ssB;
        srcP1ssA = srcP2ssA;
        srcP1ssB = srcP2ssB;
        srcP2ssA = srcP3ssA;
        srcP2ssB = srcP3ssB;

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero);
        pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        dst1 = vec_ld(0, dst);
        dst2 = vec_ld(16, dst);
        vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst));

        OP_U8_ALTIVEC(fsum, sum, vdst);

        rsum = vec_perm(fsum, fsum, dstperm);
        fdst1 = vec_sel(dst1, rsum, dstmask);
        fdst2 = vec_sel(rsum, dst2, dstmask);

        vec_st(fdst1, 0, dst);
        vec_st(fdst2, 16, dst);

        dst += dstStride;
    }
    POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);
}

/* this code assumes stride % 16 == 0 *and* tmp is properly aligned */
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) {
    POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1);

    register int i;

    const vector signed int vzero = vec_splat_s32(0);
    const vector unsigned char permM2 = vec_lvsl(-2, src);
    const vector unsigned char permM1 = vec_lvsl(-1, src);
    const vector unsigned char permP0 = vec_lvsl(+0, src);
    const vector unsigned char permP1 = vec_lvsl(+1, src);
    const vector unsigned char permP2 = vec_lvsl(+2, src);
    const vector unsigned char permP3 = vec_lvsl(+3, src);
    const vector signed short v20ss = vec_sl(vec_splat_s16(5), vec_splat_u16(2));
    const vector unsigned int v10ui = vec_splat_u32(10);
    const vector signed short v5ss = vec_splat_s16(5);
    const vector signed short v1ss = vec_splat_s16(1);
    const vector signed int v512si = vec_sl(vec_splat_s32(1), vec_splat_u32(9));
    const vector unsigned int v16ui = vec_sl(vec_splat_u32(1), vec_splat_u32(4));
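
    /* This is the 2D (hv) case, done in two passes: the first loop runs the
     * horizontal 6-tap filter over 16+5 rows and stores the raw 16-bit
     * intermediates in tmp; the second filters tmp vertically and applies
     * the combined rounding (v + 512) >> 10, hence v512si and v10ui. */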

    register int align = ((((unsigned long)src) - 2) % 16);

    const vector unsigned char neg1 = (const vector unsigned char)
        vec_splat_s8(-1);

    vector signed short srcP0A, srcP0B, srcP1A, srcP1B,
                        srcP2A, srcP2B, srcP3A, srcP3B,
                        srcM1A, srcM1B, srcM2A, srcM2B,
                        sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
                        pp1A, pp1B, pp2A, pp2B, psumA, psumB;

    const vector unsigned char dstperm = vec_lvsr(0, dst);

    const vector unsigned char dstmask = vec_perm((const vector unsigned char)vzero, neg1, dstperm);

    const vector unsigned char mperm = (const vector unsigned char)
        AVV(0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
            0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F);
    int16_t *tmpbis = tmp;

    vector signed short tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
                        tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
                        tmpP2ssA, tmpP2ssB;

    vector signed int pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
                      pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
                      pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
                      ssumAe, ssumAo, ssumBe, ssumBo;
    vector unsigned char fsum, sumv, sum, dst1, dst2, vdst,
                         rsum, fdst1, fdst2;
    vector signed short ssume, ssumo;

    POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
    src -= (2 * srcStride);
    for (i = 0 ; i < 21 ; i ++) {
        vector unsigned char srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
        vector unsigned char srcR1 = vec_ld(-2, src);
        vector unsigned char srcR2 = vec_ld(14, src);

        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vector unsigned char srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vector unsigned char srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vector unsigned char srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vector unsigned char srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }

        srcP0A = (vector signed short)
            vec_mergeh((vector unsigned char)vzero, srcP0);
        srcP0B = (vector signed short)
            vec_mergel((vector unsigned char)vzero, srcP0);
        srcP1A = (vector signed short)
            vec_mergeh((vector unsigned char)vzero, srcP1);
        srcP1B = (vector signed short)
            vec_mergel((vector unsigned char)vzero, srcP1);

        srcP2A = (vector signed short)
            vec_mergeh((vector unsigned char)vzero, srcP2);
        srcP2B = (vector signed short)
            vec_mergel((vector unsigned char)vzero, srcP2);
        srcP3A = (vector signed short)
            vec_mergeh((vector unsigned char)vzero, srcP3);
        srcP3B = (vector signed short)
            vec_mergel((vector unsigned char)vzero, srcP3);

        srcM1A = (vector signed short)
            vec_mergeh((vector unsigned char)vzero, srcM1);
        srcM1B = (vector signed short)
            vec_mergel((vector unsigned char)vzero, srcM1);
        srcM2A = (vector signed short)
            vec_mergeh((vector unsigned char)vzero, srcM2);
        srcM2B = (vector signed short)
            vec_mergel((vector unsigned char)vzero, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, sum3A);
        pp1B = vec_mladd(sum1B, v20ss, sum3B);

        pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero);
        pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero);

        psumA = vec_sub(pp1A, pp2A);
        psumB = vec_sub(pp1B, pp2B);

        vec_st(psumA, 0, tmp);
        vec_st(psumB, 16, tmp);

        src += srcStride;
        tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
    }

    tmpM2ssA = vec_ld(0, tmpbis);
    tmpM2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpM1ssA = vec_ld(0, tmpbis);
    tmpM1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP0ssA = vec_ld(0, tmpbis);
    tmpP0ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP1ssA = vec_ld(0, tmpbis);
    tmpP1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP2ssA = vec_ld(0, tmpbis);
    tmpP2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;

    for (i = 0 ; i < 16 ; i++) {
        const vector signed short tmpP3ssA = vec_ld(0, tmpbis);
        const vector signed short tmpP3ssB = vec_ld(16, tmpbis);

        const vector signed short sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
        const vector signed short sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
        const vector signed short sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
        const vector signed short sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
        const vector signed short sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
        const vector signed short sum3B = vec_adds(tmpM2ssB, tmpP3ssB);

        tmpbis += tmpStride;

        /* slide the six-row window of intermediates down one row */
        tmpM2ssA = tmpM1ssA;
        tmpM2ssB = tmpM1ssB;
        tmpM1ssA = tmpP0ssA;
        tmpM1ssB = tmpP0ssB;
        tmpP0ssA = tmpP1ssA;
        tmpP0ssB = tmpP1ssB;
        tmpP1ssA = tmpP2ssA;
        tmpP1ssB = tmpP2ssB;
        tmpP2ssA = tmpP3ssA;
        tmpP2ssB = tmpP3ssB;
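
        /* Unlike the first pass, products such as 20 * (tmpP0 + tmpP1) no
         * longer fit in 16 bits, so the arithmetic is widened: vec_mule /
         * vec_mulo yield the 32-bit products of the even- and odd-indexed
         * shorts, and the two streams are re-interleaved by mperm after
         * packing. For sum3 the even-lane "multiply by 1" is done as a
         * vec_sra by 16: each 32-bit word holds an even-indexed short in
         * its high half, and the shift extracts it sign-extended. */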
        pp1Ae = vec_mule(sum1A, v20ss);
        pp1Ao = vec_mulo(sum1A, v20ss);
        pp1Be = vec_mule(sum1B, v20ss);
        pp1Bo = vec_mulo(sum1B, v20ss);

        pp2Ae = vec_mule(sum2A, v5ss);
        pp2Ao = vec_mulo(sum2A, v5ss);
        pp2Be = vec_mule(sum2B, v5ss);
        pp2Bo = vec_mulo(sum2B, v5ss);

        pp3Ae = vec_sra((vector signed int)sum3A, v16ui);
        pp3Ao = vec_mulo(sum3A, v1ss);
        pp3Be = vec_sra((vector signed int)sum3B, v16ui);
        pp3Bo = vec_mulo(sum3B, v1ss);

        pp1cAe = vec_add(pp1Ae, v512si);
        pp1cAo = vec_add(pp1Ao, v512si);
        pp1cBe = vec_add(pp1Be, v512si);
        pp1cBo = vec_add(pp1Bo, v512si);

        pp32Ae = vec_sub(pp3Ae, pp2Ae);
        pp32Ao = vec_sub(pp3Ao, pp2Ao);
        pp32Be = vec_sub(pp3Be, pp2Be);
        pp32Bo = vec_sub(pp3Bo, pp2Bo);

        sumAe = vec_add(pp1cAe, pp32Ae);
        sumAo = vec_add(pp1cAo, pp32Ao);
        sumBe = vec_add(pp1cBe, pp32Be);
        sumBo = vec_add(pp1cBo, pp32Bo);

        ssumAe = vec_sra(sumAe, v10ui);
        ssumAo = vec_sra(sumAo, v10ui);
        ssumBe = vec_sra(sumBe, v10ui);
        ssumBo = vec_sra(sumBo, v10ui);

        ssume = vec_packs(ssumAe, ssumBe);
        ssumo = vec_packs(ssumAo, ssumBo);

        sumv = vec_packsu(ssume, ssumo);
        sum = vec_perm(sumv, sumv, mperm);

        dst1 = vec_ld(0, dst);
        dst2 = vec_ld(16, dst);
        vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst));

        OP_U8_ALTIVEC(fsum, sum, vdst);

        rsum = vec_perm(fsum, fsum, dstperm);
        fdst1 = vec_sel(dst1, rsum, dstmask);
        fdst2 = vec_sel(rsum, dst2, dstmask);

        vec_st(fdst1, 0, dst);
        vec_st(fdst2, 16, dst);

        dst += dstStride;
    }
    POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
}