vector unsigned char perm1, perm2, *pix1v, *pix2v;
vector unsigned char t1, t2, t3, t4, t5;
vector unsigned int sad;
vector signed int sumdiffs;

sad = (vector unsigned int)vec_splat_u32(0);

for (i = 0; i < h; i++) {
    /* Read potentially unaligned pixels into t1 and t2 */
    perm1 = vec_lvsl(0, pix1);
    pix1v = (vector unsigned char *) pix1;
    perm2 = vec_lvsl(0, pix2);
    pix2v = (vector unsigned char *) pix2;
    t1 = vec_perm(pix1v[0], pix1v[1], perm1);
    t2 = vec_perm(pix2v[0], pix2v[1], perm2);
    /* Calculate a sum of abs differences vector */
    t3 = vec_max(t1, t2);
    t4 = vec_min(t1, t2);
    t5 = vec_sub(t3, t4);

    /* Add each 4 pixel group together and put 4 results into sad */
    sad = vec_sum4s(t5, sad);

    pix1 += line_size;
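/* Two stand-alone sketches of the building blocks above (helper names
   are ours, not FFmpeg's). First, the lvsl/vec_perm idiom: vec_ld
   silently rounds the address down to a 16-byte boundary, so an
   unaligned load is two aligned loads stitched together by a permute
   vector derived from the misalignment. Loading the second half at
   offset 15 avoids touching the next 16-byte line when the pointer is
   already aligned. */
static inline vector unsigned char load_unaligned(const unsigned char *p)
{
    vector unsigned char lo = vec_ld(0, p);    /* line containing p[0]        */
    vector unsigned char hi = vec_ld(15, p);   /* line containing p[15]       */
    return vec_perm(lo, hi, vec_lvsl(0, p));   /* shift the window into place */
}

/* Second, what the max/min/sub plus vec_sum4s sequence computes: for
   unsigned bytes, |a - b| == max(a,b) - min(a,b) with no overflow, and
   vec_sum4s keeps four running 32-bit partial sums. Scalar reference: */
static int sad16_ref(const unsigned char *pix1, const unsigned char *pix2,
                     int line_size, int h)
{
    int i, j, s = 0;
    for (i = 0; i < h; i++) {
        for (j = 0; j < 16; j++) {
            int a = pix1[j], b = pix2[j];
            s += a > b ? a - b : b - a;  /* == max(a,b) - min(a,b) */
        }
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}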
permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0);

for (i = 0; i < h; i++) {
    /* Read potentially unaligned pixels into t1 and t2.
       Since we're reading 16 pixels, and actually only want 8,
       mask out the last 8 pixels. The 0s don't change the sum. */
    perm1 = vec_lvsl(0, pix1);
    pix1v = (vector unsigned char *) pix1;
    perm2 = vec_lvsl(0, pix2);
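/* The excerpt cuts off before the masking itself; in the full 8-pixel
   routine each reassembled vector is ANDed with permclear so that only
   the first 8 lanes survive, roughly (a sketch, not verbatim source): */
t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);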
vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
vector unsigned char t1, t2, t3, t4, t5;
vector unsigned int sum;
vector signed int sumsqr;

sum = (vector unsigned int)vec_splat_u32(0);

permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0);

for (i = 0; i < h; i++) {
    /* Read potentially unaligned pixels into t1 and t2.
       Since we're reading 16 pixels, and actually only want 8,
       mask out the last 8 pixels. The 0s don't change the sum. */
    perm1 = vec_lvsl(0, pix1);
    pix1v = (vector unsigned char *) pix1;
    perm2 = vec_lvsl(0, pix2);
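/* The listing stops before the squaring; in the full sse8 routine the
   masked absolute differences are squared and accumulated with a
   multiply-sum, roughly (a sketch, not verbatim source): */
t5  = vec_sub(vec_max(t1, t2), vec_min(t1, t2));  /* |pix1[j] - pix2[j]|        */
sum = vec_msum(t5, t5, sum);                      /* four 32-bit sums of diff^2 */
/* vec_msum on unsigned chars multiplies byte pairs and folds each group
   of four products into one 32-bit lane, so nothing overflows en route. */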
const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
vector unsigned char perm, *pixv, t1;
vector unsigned int sad;
vector signed int sumdiffs;
int s __attribute__((aligned(16)));

sad = (vector unsigned int)vec_splat_u32(0);

for (i = 0; i < 16; i++) {
    /* Read the potentially unaligned 16 pixels into t1 */
    perm = vec_lvsl(0, pix);
    pixv = (vector unsigned char *) pix;
    t1 = vec_perm(pixv[0], pixv[1], perm);

    /* Add each 4 pixel group together and put 4 results into sad */
    sad = vec_sum4s(t1, sad);

    pix += line_size;
}

/* Sum up the four partial sums, and put the result into s */
sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
sumdiffs = vec_splat(sumdiffs, 3);
vec_ste(sumdiffs, 0, &s);
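/* Why lane 3 and why the splat (a note on the idiom): vec_sums folds
   all four 32-bit lanes of 'sad' (plus the zero vector) into the last
   lane; vec_splat then broadcasts lane 3 to every lane, so vec_ste,
   which stores whichever lane the target address selects (lane 0 here,
   since 's' is 16-byte aligned), always writes the full total. The
   whole routine reduces to this scalar reference: */
static int pix_sum_ref(const unsigned char *pix, int line_size)
{
    int i, j, s = 0;
    for (i = 0; i < 16; i++) {       /* 16 rows of the macroblock */
        for (j = 0; j < 16; j++)     /* 16 pixels per row         */
            s += pix[j];
        pix += line_size;
    }
    return s;
}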
register vector unsigned char blockv, pixelsv, pixelsv1, pixelsv2;

POWERPC_PERF_START_COUNT(altivec_avg_pixels8_num, 1);

for (i = 0; i < h; i++) {
    /* block is 8-byte aligned, so we're either in the
       left block (16-byte aligned) or in the right block (not) */
    int rightside = ((unsigned long)block & 0x0000000F);

    blockv = vec_ld(0, block);
    pixelsv1 = vec_ld(0, (unsigned char*)pixels);
    pixelsv2 = vec_ld(16, (unsigned char*)pixels);
    pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels));

    if (rightside)
        pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1));
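/* vcprm(0,1,s0,s1) is FFmpeg's permute-constant helper: take 32-bit
   words 0 and 1 from the first operand (the untouched half of the
   block) and words 0 and 1 of the second operand (the s prefix), so
   the loaded pixels land in the right half only. The rest of the
   routine (not shown) averages with vec_avg, which rounds upwards;
   as a scalar sketch, the whole function amounts to: */
static void avg_pixels8_ref(unsigned char *block, const unsigned char *pixels,
                            int line_size, int h)
{
    int i, j;
    for (i = 0; i < h; i++) {
        for (j = 0; j < 8; j++)                          /* only 8 of the 16 loaded */
            block[j] = (block[j] + pixels[j] + 1) >> 1;  /* vec_avg rounds up       */
        block  += line_size;
        pixels += line_size;
    }
}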
/* last two rows of the perm3 constant: perm3 swaps the two 8-byte halves */
                                0x00, 0x01, 0x02, 0x03,
                                0x04, 0x05, 0x06, 0x07);
#define ONEITERBUTTERFLY(i, res) \
    { \
    register vector unsigned char src1, src2, srcO; \
    register vector unsigned char dst1, dst2, dstO; \
    src1 = vec_ld(stride * i, src); \
    if ((((stride * i) + (unsigned long)src) & 0x0000000F) > 8) \
        src2 = vec_ld((stride * i) + 16, src); \
    srcO = vec_perm(src1, src2, vec_lvsl(stride * i, src)); \
    dst1 = vec_ld(stride * i, dst); \
    if ((((stride * i) + (unsigned long)dst) & 0x0000000F) > 8) \
        dst2 = vec_ld((stride * i) + 16, dst); \
    dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \
    /* promote the unsigned chars to signed shorts */ \
    /* we're in the 8x8 function, we only care about the first 8 */ \
    register vector signed short srcV = \
        (vector signed short)vec_mergeh((vector signed char)vzero, (vector signed char)srcO); \
    register vector signed short dstV = \
        (vector signed short)vec_mergeh((vector signed char)vzero, (vector signed char)dstO); \
    /* subtractions inside the first butterfly */ \
    register vector signed short but0 = vec_sub(srcV, dstV); \
    register vector signed short op1 = vec_perm(but0, but0, perm1); \
    register vector signed short but1 = vec_mladd(but0, vprod1, op1); \
    register vector signed short op2 = vec_perm(but1, but1, perm2); \
    register vector signed short but2 = vec_mladd(but1, vprod2, op2); \
    register vector signed short op3 = vec_perm(but2, but2, perm3); \
    res = vec_mladd(but2, vprod3, op3); \
    }
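/* A note on the guarded loads above: each row of the 8x8 transform
   only consumes 8 source bytes, so when (addr & 15) <= 8 the row sits
   entirely inside the first aligned 16-byte line and the second vec_ld
   can be skipped. src2/dst2 are then left uninitialized, but the first
   8 bytes of the vec_perm result, the only ones vec_mergeh consumes,
   never select from them. The 16x8 variant further down loads
   unconditionally, since it consumes all 16 bytes. */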
ONEITERBUTTERFLY(0, temp0);
ONEITERBUTTERFLY(1, temp1);
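/* How a vec_perm/vec_mladd pair implements one butterfly stage (a
   sketch of the reasoning): perm1 swaps each pair of 16-bit lanes and
   vprod1 holds alternating {+1, -1, ...}, so
   but1 = but0 * vprod1 + swap(but0) gives, for each lane pair (a, b):
       even lane:  a * (+1) + b  =  a + b
       odd  lane:  b * (-1) + a  =  a - b
   i.e. sum and difference computed in place, with no shuffling of the
   results afterwards. perm2/vprod2 and perm3/vprod3 repeat the trick
   at pair and quad granularity. Scalar model of the first stage: */
static void butterfly_stage_ref(short v[8])
{
    int k;
    for (k = 0; k < 8; k += 2) {
        short a = v[k], b = v[k + 1];
        v[k]     = a + b;  /* even lane: sum        */
        v[k + 1] = a - b;  /* odd lane:  difference */
    }
}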
/* last two rows of this function's perm3 constant, as above */
                                0x00, 0x01, 0x02, 0x03,
                                0x04, 0x05, 0x06, 0x07);
#define ONEITERBUTTERFLY(i, res1, res2) \
    { \
    register vector unsigned char src1 asm ("v22"), src2 asm ("v23"); \
    register vector unsigned char dst1 asm ("v24"), dst2 asm ("v25"); \
    src1 = vec_ld(stride * i, src); \
    src2 = vec_ld((stride * i) + 16, src); \
    register vector unsigned char srcO asm ("v22") = vec_perm(src1, src2, vec_lvsl(stride * i, src)); \
    dst1 = vec_ld(stride * i, dst); \
    dst2 = vec_ld((stride * i) + 16, dst); \
    register vector unsigned char dstO asm ("v23") = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \
    /* promote the unsigned chars to signed shorts */ \
    register vector signed short srcV asm ("v24") = \
        (vector signed short)vec_mergeh((vector signed char)vzero, (vector signed char)srcO); \
    register vector signed short dstV asm ("v25") = \
        (vector signed short)vec_mergeh((vector signed char)vzero, (vector signed char)dstO); \
    register vector signed short srcW asm ("v26") = \
        (vector signed short)vec_mergel((vector signed char)vzero, (vector signed char)srcO); \
    register vector signed short dstW asm ("v27") = \
        (vector signed short)vec_mergel((vector signed char)vzero, (vector signed char)dstO); \
    /* subtractions inside the first butterfly */ \
    register vector signed short but0 asm ("v28") = vec_sub(srcV, dstV); \
    register vector signed short but0S asm ("v29") = vec_sub(srcW, dstW); \
    register vector signed short op1 asm ("v30") = vec_perm(but0, but0, perm1); \
    /* intermediate stages, reconstructed here from the register rotation: */ \
    register vector signed short but1 asm ("v22") = vec_mladd(but0, vprod1, op1); \
    register vector signed short op1S asm ("v23") = vec_perm(but0S, but0S, perm1); \
    register vector signed short but1S asm ("v24") = vec_mladd(but0S, vprod1, op1S); \
    register vector signed short op2 asm ("v25") = vec_perm(but1, but1, perm2); \
    register vector signed short but2 asm ("v26") = vec_mladd(but1, vprod2, op2); \
    register vector signed short op2S asm ("v27") = vec_perm(but1S, but1S, perm2); \
    register vector signed short but2S asm ("v28") = vec_mladd(but1S, vprod2, op2S); \
    register vector signed short op3 asm ("v29") = vec_perm(but2, but2, perm3); \
    res1 = vec_mladd(but2, vprod3, op3); \
    register vector signed short op3S asm ("v30") = vec_perm(but2S, but2S, perm3); \
    res2 = vec_mladd(but2S, vprod3, op3S); \
    }

ONEITERBUTTERFLY(0, temp0, temp0S);
ONEITERBUTTERFLY(1, temp1, temp1S);
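/* Two things distinguish this 16x8 variant from the 8x8 macro above.
   First, vec_mergeh/vec_mergel with the zero vector zero-extend both
   halves of the 16 loaded pixels, so each expansion processes a full
   16-pixel row; a scalar model of the promotion (helper name is ours): */
static void promote_ref(const unsigned char srcO[16],
                        short srcV[8], short srcW[8])
{
    int j;
    for (j = 0; j < 8; j++) {
        srcV[j] = srcO[j];      /* vec_mergeh(vzero, srcO): first 8 bytes */
        srcW[j] = srcO[j + 8];  /* vec_mergel(vzero, srcO): last 8 bytes  */
    }
}
/* Second, the asm("v22")-style bindings pin each temporary to a fixed
   AltiVec register and recycle names as soon as a value dies; with
   roughly 30 live vector values per expansion, this keeps GCC from
   spilling to the stack. */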