/*
 * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "libavutil/mem.h"

#ifdef DEBUG
#include <assert.h>
#define ASSERT_ALIGNED(ptr) assert(!((unsigned long)ptr&0x0000000F));
#else
#define ASSERT_ALIGNED(ptr) ;
#endif
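/* Template note: this file is intended to be #included with OP_U8_ALTIVEC and
 * the PREFIX_* function-name macros defined by the including file, so that each
 * function below is instantiated separately for the put and avg variants (hence
 * the #ifdef PREFIX_* guards). */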
/* this code assumes that stride % 16 == 0 */
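/* CHROMA_MC8_ALTIVEC_CORE computes one row of the 2-D bilinear chroma
 * interpolation; in scalar form, per pixel:
 *     dst[i] = (A*src[i] + B*src[i+1] + C*src[i+stride] + D*src[i+stride+1]
 *               + bias) >> 6
 * with A = (8-x)*(8-y), B = x*(8-y), C = (8-x)*y, D = x*y.  BIAS1/BIAS2 select
 * the rounding term: +32 for H.264, 32 - 4 = 28 for the VC-1 no-rounding
 * variant below. */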
#define CHROMA_MC8_ALTIVEC_CORE(BIAS1, BIAS2) \
        vsrc2ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc2uc);\
        vsrc3ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc3uc);\
\
        psum = vec_mladd(vA, vsrc0ssH, BIAS1);\
        psum = vec_mladd(vB, vsrc1ssH, psum);\
        psum = vec_mladd(vC, vsrc2ssH, psum);\
        psum = vec_mladd(vD, vsrc3ssH, psum);\
        psum = BIAS2(psum);\
        psum = vec_sr(psum, v6us);\
\
        vdst = vec_ld(0, dst);\
        ppsum = (vec_u8)vec_pack(psum, psum);\
        vfdst = vec_perm(vdst, ppsum, fperm);\
\
        OP_U8_ALTIVEC(fsum, vfdst, vdst);\
\
        vec_st(fsum, 0, dst);\
\
        vsrc0ssH = vsrc2ssH;\
        vsrc1ssH = vsrc3ssH;\
\
        dst += stride;\
        src += stride;
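/* Simplified core for the 1-D cases (x == 0 or y == 0): only two taps
 * contribute (A and whichever of B or C is nonzero), so vE = vB + vC collapses
 * to the single remaining weight and two vec_mladds with the +32 rounding and
 * >> 6 are enough. */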
#define CHROMA_MC8_ALTIVEC_CORE_SIMPLE \
\
        vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);\
        vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);\
\
        psum = vec_mladd(vA, vsrc0ssH, v32ss);\
        psum = vec_mladd(vE, vsrc1ssH, psum);\
        psum = vec_sr(psum, v6us);\
\
        vdst = vec_ld(0, dst);\
        ppsum = (vec_u8)vec_pack(psum, psum);\
        vfdst = vec_perm(vdst, ppsum, fperm);\
\
        OP_U8_ALTIVEC(fsum, vfdst, vdst);\
\
        vec_st(fsum, 0, dst);\
\
        dst += stride;\
        src += stride;
#define noop(a) a
#define add28(a) vec_add(v28ss, a)
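/* Rounding helpers passed as BIAS2 to CHROMA_MC8_ALTIVEC_CORE: noop keeps the
 * H.264 rounding already supplied through BIAS1, add28 adds the 28 (= 32 - 4)
 * bias used by the VC-1 no-rounding mode. */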
#ifdef PREFIX_h264_chroma_mc8_altivec
static void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
                                           int stride, int h, int x, int y) {
    DECLARE_ALIGNED(16, signed int, ABCD)[4] =
                        {((8 - x) * (8 - y)),
                         ((    x) * (8 - y)),
                         ((8 - x) * (    y)),
                         ((    x) * (    y))};
    register int i;
    vec_u8 fperm;
    const vec_s32 vABCD = vec_ld(0, ABCD);
    const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
    const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
    const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
    const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
    const vec_s16 v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5));
    const vec_u16 v6us = vec_splat_u16(6);
    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
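    /* Each row needs 9 consecutive source bytes.  loadSecond is set when those
     * bytes may span two 16-byte blocks, so a second vec_ld is required;
     * reallyBadAlign flags the worst case src % 16 == 15, where the +1-shifted
     * row needs special handling. */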
    vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1;
    vec_u8 vsrc0uc, vsrc1uc;
    vec_s16 vsrc0ssH, vsrc1ssH;
    vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
    vec_s16 vsrc2ssH, vsrc3ssH, psum;
    vec_u8 vdst, ppsum, vfdst, fsum;

    if (((unsigned long)dst) % 16 == 0) {
        fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
                         0x14, 0x15, 0x16, 0x17,
                         0x08, 0x09, 0x0A, 0x0B,
                         0x0C, 0x0D, 0x0E, 0x0F};
    } else {
        fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
                         0x04, 0x05, 0x06, 0x07,
                         0x18, 0x19, 0x1A, 0x1B,
                         0x1C, 0x1D, 0x1E, 0x1F};
    }
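    /* Since stride % 16 == 0, an 8-byte block of dst starts either at offset 0
     * or at offset 8 within its 16-byte line; fperm merges the 8 result bytes
     * into the matching half of the destination vector loaded with vec_ld. */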
    vsrcAuc = vec_ld(0, src);
    vsrcBuc = vec_ld(16, src);
    vsrcperm0 = vec_lvsl(0, src);
    vsrcperm1 = vec_lvsl(1, src);

    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
    vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);

    vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);
    vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);

    if (!loadSecond) {// -> !reallyBadAlign
        for (i = 0 ; i < h ; i++) {
            vsrcCuc = vec_ld(stride + 0, src);
            vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
            vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

            CHROMA_MC8_ALTIVEC_CORE(v32ss, noop)

        for (i = 0 ; i < h ; i++) {
            vsrcCuc = vec_ld(stride + 0, src);
            vsrcDuc = vec_ld(stride + 16, src);
            vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
            vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

            CHROMA_MC8_ALTIVEC_CORE(v32ss, noop)
        const vec_s16 vE = vec_add(vB, vC);
        if (ABCD[2]) { // x == 0 B == 0
            if (!loadSecond) {// -> !reallyBadAlign
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(stride + 0, src);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE

                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(stride + 0, src);
                    vsrcDuc = vec_ld(stride + 15, src);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
        } else { // y == 0 C == 0
            if (!loadSecond) {// -> !reallyBadAlign
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(0, src);
                    vsrc0uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE

                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(0, src);
                    vsrcDuc = vec_ld(15, src);
                    vsrc0uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
/* this code assumes that stride % 16 == 0 */
#ifdef PREFIX_no_rnd_vc1_chroma_mc8_altivec
static void PREFIX_no_rnd_vc1_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) {
    DECLARE_ALIGNED(16, signed int, ABCD)[4] =
                        {((8 - x) * (8 - y)),
                         ((    x) * (8 - y)),
                         ((8 - x) * (    y)),
                         ((    x) * (    y))};
    register int i;
    vec_u8 fperm;
    const vec_s32 vABCD = vec_ld(0, ABCD);
    const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
    const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
    const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
    const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
    const vec_s16 v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4));
    const vec_u16 v6us = vec_splat_u16(6);
    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
    vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1;
    vec_u8 vsrc0uc, vsrc1uc;
    vec_s16 vsrc0ssH, vsrc1ssH;
    vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
    vec_s16 vsrc2ssH, vsrc3ssH, psum;
    vec_u8 vdst, ppsum, vfdst, fsum;

    if (((unsigned long)dst) % 16 == 0) {
        fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
                         0x14, 0x15, 0x16, 0x17,
                         0x08, 0x09, 0x0A, 0x0B,
                         0x0C, 0x0D, 0x0E, 0x0F};
    } else {
        fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
                         0x04, 0x05, 0x06, 0x07,
                         0x18, 0x19, 0x1A, 0x1B,
                         0x1C, 0x1D, 0x1E, 0x1F};
    }
    vsrcAuc = vec_ld(0, src);
    vsrcBuc = vec_ld(16, src);
    vsrcperm0 = vec_lvsl(0, src);
    vsrcperm1 = vec_lvsl(1, src);

    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
    vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);

    vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc0uc);
    vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc1uc);

    if (!loadSecond) {// -> !reallyBadAlign
        for (i = 0 ; i < h ; i++) {
            vsrcCuc = vec_ld(stride + 0, src);
            vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
            vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

            CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28)

        for (i = 0 ; i < h ; i++) {
            vsrcCuc = vec_ld(stride + 0, src);
            vsrcDuc = vec_ld(stride + 16, src);
            vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
            vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

            CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28)
#undef CHROMA_MC8_ALTIVEC_CORE

/* this code assumes stride % 16 == 0 */
#ifdef PREFIX_h264_qpel16_h_lowpass_altivec
static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));
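    /* Six-tap H.264 half-pel luma filter, in scalar form:
     *     dst[i] = clip_uint8((src[i-2] - 5*src[i-1] + 20*src[i] + 20*src[i+1]
     *                          - 5*src[i+2] + src[i+3] + 16) >> 5)
     * v20ss/v5ss hold the tap weights, v16ss the rounding term and v5us the
     * final shift; vec_packsu performs the clip to unsigned 8 bits. */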
    vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB;
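    /* Each iteration gathers the six shifted copies of the current row
     * (srcM2 .. srcP3) from one or more aligned vec_ld loads, using the
     * vec_lvsl-based permutes above; which loads are needed depends on the
     * alignment of src - 2 (the align variable). */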
    for (i = 0 ; i < 16 ; i ++) {
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);

        srcM2 = vec_perm(srcR1, srcR2, permM2);
        srcM1 = vec_perm(srcR1, srcR2, permM1);
        srcP0 = vec_perm(srcR1, srcR2, permP0);
        srcP1 = vec_perm(srcR1, srcR2, permP1);
        srcP2 = vec_perm(srcR1, srcR2, permP2);
        srcP3 = vec_perm(srcR1, srcR2, permP3);

        srcM2 = vec_perm(srcR1, srcR2, permM2);
        srcM1 = vec_perm(srcR1, srcR2, permM1);
        srcP0 = vec_perm(srcR1, srcR2, permP0);
        srcP1 = vec_perm(srcR1, srcR2, permP1);
        srcP2 = vec_perm(srcR1, srcR2, permP2);

        vec_u8 srcR3 = vec_ld(30, src);
        srcM2 = vec_perm(srcR1, srcR2, permM2);
        srcM1 = vec_perm(srcR1, srcR2, permM1);
        srcP0 = vec_perm(srcR1, srcR2, permP0);
        srcP1 = vec_perm(srcR1, srcR2, permP1);
        srcP3 = vec_perm(srcR2, srcR3, permP3);

        vec_u8 srcR3 = vec_ld(30, src);
        srcM2 = vec_perm(srcR1, srcR2, permM2);
        srcM1 = vec_perm(srcR1, srcR2, permM1);
        srcP0 = vec_perm(srcR1, srcR2, permP0);
        srcP2 = vec_perm(srcR2, srcR3, permP2);
        srcP3 = vec_perm(srcR2, srcR3, permP3);

        vec_u8 srcR3 = vec_ld(30, src);
        srcM2 = vec_perm(srcR1, srcR2, permM2);
        srcM1 = vec_perm(srcR1, srcR2, permM1);
        srcP1 = vec_perm(srcR2, srcR3, permP1);
        srcP2 = vec_perm(srcR2, srcR3, permP2);
        srcP3 = vec_perm(srcR2, srcR3, permP3);

        vec_u8 srcR3 = vec_ld(30, src);
        srcM2 = vec_perm(srcR1, srcR2, permM2);
        srcP0 = vec_perm(srcR2, srcR3, permP0);
        srcP1 = vec_perm(srcR2, srcR3, permP1);
        srcP2 = vec_perm(srcR2, srcR3, permP2);
        srcP3 = vec_perm(srcR2, srcR3, permP3);
        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);
/* this code assumes stride % 16 == 0 */
#ifdef PREFIX_h264_qpel16_v_lowpass_altivec
static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
    const vec_u8 perm = vec_lvsl(0, src);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));
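    /* Same 1/-5/20/20/-5/1 six-tap filter as the horizontal pass, applied
     * vertically: the six input rows srcM2 .. srcP3 are kept as pre-widened
     * 16-bit vectors and combined with vec_mladd, the +16 rounding term and
     * a >> 5 shift. */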
    uint8_t *srcbis = src - (srcStride * 2);

    const vec_u8 srcM2a = vec_ld(0, srcbis);
    const vec_u8 srcM2b = vec_ld(16, srcbis);
    const vec_u8 srcM2 = vec_perm(srcM2a, srcM2b, perm);
    //srcbis += srcStride;
    const vec_u8 srcM1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcM1b = vec_ld(16, srcbis);
    const vec_u8 srcM1 = vec_perm(srcM1a, srcM1b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP0a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP0b = vec_ld(16, srcbis);
    const vec_u8 srcP0 = vec_perm(srcP0a, srcP0b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP1b = vec_ld(16, srcbis);
    const vec_u8 srcP1 = vec_perm(srcP1a, srcP1b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP2a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP2b = vec_ld(16, srcbis);
    const vec_u8 srcP2 = vec_perm(srcP2a, srcP2b, perm);
    //srcbis += srcStride;
    vec_s16 srcM2ssA = (vec_s16) vec_mergeh(zero_u8v, srcM2);
    vec_s16 srcM2ssB = (vec_s16) vec_mergel(zero_u8v, srcM2);
    vec_s16 srcM1ssA = (vec_s16) vec_mergeh(zero_u8v, srcM1);
    vec_s16 srcM1ssB = (vec_s16) vec_mergel(zero_u8v, srcM1);
    vec_s16 srcP0ssA = (vec_s16) vec_mergeh(zero_u8v, srcP0);
    vec_s16 srcP0ssB = (vec_s16) vec_mergel(zero_u8v, srcP0);
    vec_s16 srcP1ssA = (vec_s16) vec_mergeh(zero_u8v, srcP1);
    vec_s16 srcP1ssB = (vec_s16) vec_mergel(zero_u8v, srcP1);
    vec_s16 srcP2ssA = (vec_s16) vec_mergeh(zero_u8v, srcP2);
    vec_s16 srcP2ssB = (vec_s16) vec_mergel(zero_u8v, srcP2);

    vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB,
            srcP3ssA, srcP3ssB,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;

    vec_u8 sum, fsum, srcP3a, srcP3b, srcP3;
    for (i = 0 ; i < 16 ; i++) {
        srcP3a = vec_ld(0, srcbis += srcStride);
        srcP3b = vec_ld(16, srcbis);
        srcP3 = vec_perm(srcP3a, srcP3b, perm);
        srcP3ssA = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3ssB = (vec_s16) vec_mergel(zero_u8v, srcP3);
        //srcbis += srcStride;

        sum1A = vec_adds(srcP0ssA, srcP1ssA);
        sum1B = vec_adds(srcP0ssB, srcP1ssB);
        sum2A = vec_adds(srcM1ssA, srcP2ssA);
        sum2B = vec_adds(srcM1ssB, srcP2ssB);
        sum3A = vec_adds(srcM2ssA, srcP3ssA);
        sum3B = vec_adds(srcM2ssB, srcP3ssB);

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);
/* this code assumes stride % 16 == 0 *and* tmp is properly aligned */
#ifdef PREFIX_h264_qpel16_hv_lowpass_altivec
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) {
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u32 v10ui = vec_splat_u32(10);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v1ss = vec_splat_s16(1);
    const vec_s32 v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
    const vec_u32 v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));
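    /* Two-pass half-pel filter: the first loop applies the horizontal six-tap
     * filter without rounding or shifting and stores the intermediate 16-bit
     * values to tmp; the second loop filters tmp vertically in 32-bit
     * precision, adds the 512 rounding term (v512si) and shifts right by 10
     * (v10ui) before packing back to bytes. */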
    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, psumA, psumB;

    const vec_u8 mperm = (const vec_u8)
        {0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
         0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F};
    int16_t *tmpbis = tmp;

    vec_s16 tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
            tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
            tmpP2ssA, tmpP2ssB;

    vec_s32 pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
            pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
            pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
            ssumAe, ssumAo, ssumBe, ssumBo;
    vec_u8 fsum, sumv, sum;
    vec_s16 ssume, ssumo;
    src -= (2 * srcStride);
    for (i = 0 ; i < 21 ; i ++) {
        vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);

        srcM2 = vec_perm(srcR1, srcR2, permM2);
        srcM1 = vec_perm(srcR1, srcR2, permM1);
        srcP0 = vec_perm(srcR1, srcR2, permP0);
        srcP1 = vec_perm(srcR1, srcR2, permP1);
        srcP2 = vec_perm(srcR1, srcR2, permP2);
        srcP3 = vec_perm(srcR1, srcR2, permP3);

        srcM2 = vec_perm(srcR1, srcR2, permM2);
        srcM1 = vec_perm(srcR1, srcR2, permM1);
        srcP0 = vec_perm(srcR1, srcR2, permP0);
        srcP1 = vec_perm(srcR1, srcR2, permP1);
        srcP2 = vec_perm(srcR1, srcR2, permP2);

        vec_u8 srcR3 = vec_ld(30, src);
        srcM2 = vec_perm(srcR1, srcR2, permM2);
        srcM1 = vec_perm(srcR1, srcR2, permM1);
        srcP0 = vec_perm(srcR1, srcR2, permP0);
        srcP1 = vec_perm(srcR1, srcR2, permP1);
        srcP3 = vec_perm(srcR2, srcR3, permP3);

        vec_u8 srcR3 = vec_ld(30, src);
        srcM2 = vec_perm(srcR1, srcR2, permM2);
        srcM1 = vec_perm(srcR1, srcR2, permM1);
        srcP0 = vec_perm(srcR1, srcR2, permP0);
        srcP2 = vec_perm(srcR2, srcR3, permP2);
        srcP3 = vec_perm(srcR2, srcR3, permP3);

        vec_u8 srcR3 = vec_ld(30, src);
        srcM2 = vec_perm(srcR1, srcR2, permM2);
        srcM1 = vec_perm(srcR1, srcR2, permM1);
        srcP1 = vec_perm(srcR2, srcR3, permP1);
        srcP2 = vec_perm(srcR2, srcR3, permP2);
        srcP3 = vec_perm(srcR2, srcR3, permP3);

        vec_u8 srcR3 = vec_ld(30, src);
        srcM2 = vec_perm(srcR1, srcR2, permM2);
        srcP0 = vec_perm(srcR2, srcR3, permP0);
        srcP1 = vec_perm(srcR2, srcR3, permP1);
        srcP2 = vec_perm(srcR2, srcR3, permP2);
        srcP3 = vec_perm(srcR2, srcR3, permP3);
        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, sum3A);
        pp1B = vec_mladd(sum1B, v20ss, sum3B);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        psumA = vec_sub(pp1A, pp2A);
        psumB = vec_sub(pp1B, pp2B);

        vec_st(psumA, 0, tmp);
        vec_st(psumB, 16, tmp);

        tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
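    /* Second pass: walk tmp again through tmpbis, keeping the six most recent
     * 16-bit rows (tmpM2 .. tmpP2, plus tmpP3 loaded inside the loop) live in
     * registers for the vertical filter. */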
    tmpM2ssA = vec_ld(0, tmpbis);
    tmpM2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpM1ssA = vec_ld(0, tmpbis);
    tmpM1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP0ssA = vec_ld(0, tmpbis);
    tmpP0ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP1ssA = vec_ld(0, tmpbis);
    tmpP1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP2ssA = vec_ld(0, tmpbis);
    tmpP2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    for (i = 0 ; i < 16 ; i++) {
        const vec_s16 tmpP3ssA = vec_ld(0, tmpbis);
        const vec_s16 tmpP3ssB = vec_ld(16, tmpbis);
        tmpbis += tmpStride;

        const vec_s16 sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
        const vec_s16 sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
        const vec_s16 sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
        const vec_s16 sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
        const vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
        const vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB);
        pp1Ae = vec_mule(sum1A, v20ss);
        pp1Ao = vec_mulo(sum1A, v20ss);
        pp1Be = vec_mule(sum1B, v20ss);
        pp1Bo = vec_mulo(sum1B, v20ss);

        pp2Ae = vec_mule(sum2A, v5ss);
        pp2Ao = vec_mulo(sum2A, v5ss);
        pp2Be = vec_mule(sum2B, v5ss);
        pp2Bo = vec_mulo(sum2B, v5ss);

        pp3Ae = vec_sra((vec_s32)sum3A, v16ui);
        pp3Ao = vec_mulo(sum3A, v1ss);
        pp3Be = vec_sra((vec_s32)sum3B, v16ui);
        pp3Bo = vec_mulo(sum3B, v1ss);
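        /* sum3 only needs weight 1: the even 16-bit elements are widened by an
         * arithmetic >> 16 on the 32-bit view, which on big-endian AltiVec
         * matches vec_mule(sum3, 1), while the odd elements use vec_mulo with
         * v1ss. */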
        pp1cAe = vec_add(pp1Ae, v512si);
        pp1cAo = vec_add(pp1Ao, v512si);
        pp1cBe = vec_add(pp1Be, v512si);
        pp1cBo = vec_add(pp1Bo, v512si);

        pp32Ae = vec_sub(pp3Ae, pp2Ae);
        pp32Ao = vec_sub(pp3Ao, pp2Ao);
        pp32Be = vec_sub(pp3Be, pp2Be);
        pp32Bo = vec_sub(pp3Bo, pp2Bo);

        sumAe = vec_add(pp1cAe, pp32Ae);
        sumAo = vec_add(pp1cAo, pp32Ao);
        sumBe = vec_add(pp1cBe, pp32Be);
        sumBo = vec_add(pp1cBo, pp32Bo);

        ssumAe = vec_sra(sumAe, v10ui);
        ssumAo = vec_sra(sumAo, v10ui);
        ssumBe = vec_sra(sumBe, v10ui);
        ssumBo = vec_sra(sumBo, v10ui);

        ssume = vec_packs(ssumAe, ssumBe);
        ssumo = vec_packs(ssumAo, ssumBo);

        sumv = vec_packsu(ssume, ssumo);
        sum = vec_perm(sumv, sumv, mperm);

        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);