/* sumsq.c, this file is part of the
 * AltiVec optimized library for MJPEG tools MPEG-1/2 Video Encoder
 * Copyright (C) 2002  James Klicman <james@klicman.org>
 *
 * This library is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
24
#include "altivec_motion.h"
25
#include "vectorize.h"
26
#include "../mjpeg_logging.h"
28
/* #define AMBER_ENABLE */
32
/* include last to ensure AltiVec type semantics, especially for bool. */
/*
 * Preconditions assumed by the kernels below (item "a" and the comment
 * delimiters were lost in extraction -- recover from upstream source):
 * b) blk2 is always vector aligned
 * c) rowstride is a multiple of 16
 * d) h is either 8 or 16
 */
#define SUMSQ_ARGS blk1, blk2, rowstride, hx, hy, h
/*
 * Scalar reference for sumsq_00 (no half-pel interpolation); fragment,
 * the tail of this comment was lost in extraction:
 *
 *   for (j = 0; j < h; j++) {
 *     for (i = 0; i < 16; i++) {
 *       d = blk[i] - ref[i];
 *       ...
 */
static int sumsq_00(SUMSQ_PDECL) /* {{{ */
65
unsigned char *pblk1, *pblk2;
66
vector unsigned char blk1A, blk2A, blk1B, blk2B;
67
vector unsigned char blk1A0, blk1B0;
68
vector unsigned char blk1A1, blk1B1;
69
vector unsigned char minA, minB;
70
vector unsigned char maxA, maxB;
71
vector unsigned char difA, difB;
72
vector unsigned int sum;
73
vector signed int zero;
74
vector unsigned char perm;
88
zero = vec_splat_s32(0);
89
sum = vec_splat_u32(0);
92
if (VECTOR_ALIGNED(pblk1)) {
94
blk1A = vec_ld(0, pblk1);
96
blk1B = vec_ld(0, pblk1);
98
blk2A = vec_ld(0, pblk2);
100
blk2B = vec_ld(0, pblk2);
103
maxA = vec_max(blk1A, blk2A);
104
minA = vec_min(blk1A, blk2A);
107
blk1A = vec_ld(0, pblk1);
109
blk2A = vec_ld(0, pblk2);
111
difA = vec_sub(maxA, minA);
112
sum = vec_msum(difA, difA, sum);
114
maxB = vec_max(blk1B, blk2B);
115
minB = vec_min(blk1B, blk2B);
118
blk1B = vec_ld(0, pblk1);
120
blk2B = vec_ld(0, pblk2);
122
difB = vec_sub(maxB, minB);
123
sum = vec_msum(difB, difB, sum);
128
perm = vec_lvsl(0, pblk1);
130
blk1A0 = vec_ld(0, pblk1);
131
blk1A1 = vec_ld(16, pblk1);
133
blk1B0 = vec_ld(0, pblk1);
134
blk1B1 = vec_ld(16, pblk1);
136
blk2A = vec_ld(0, pblk2);
138
blk2B = vec_ld(0, pblk2);
141
blk1A = vec_perm(blk1A0, blk1A1, perm);
144
blk1A0 = vec_ld(0, pblk1);
145
blk1A1 = vec_ld(16, pblk1);
147
maxA = vec_max(blk1A, blk2A);
148
minA = vec_min(blk1A, blk2A);
151
blk2A = vec_ld(0, pblk2);
153
difA = vec_sub(maxA, minA);
154
sum = vec_msum(difA, difA, sum);
157
blk1B = vec_perm(blk1B0, blk1B1, perm);
159
blk1B0 = vec_ld(0, pblk1);
160
blk1B1 = vec_ld(16, pblk1);
162
maxB = vec_max(blk1B, blk2B);
163
minB = vec_min(blk1B, blk2B);
166
blk2B = vec_ld(0, pblk2);
168
difB = vec_sub(maxB, minB);
169
sum = vec_msum(difB, difB, sum);
172
blk1A = vec_perm(blk1A0, blk1A1, perm);
173
blk1B = vec_perm(blk1B0, blk1B1, perm);
177
maxA = vec_max(blk1A, blk2A);
178
minA = vec_min(blk1A, blk2A);
179
difA = vec_sub(maxA, minA);
180
sum = vec_msum(difA, difA, sum);
182
maxB = vec_max(blk1B, blk2B);
183
minB = vec_min(blk1B, blk2B);
184
difB = vec_sub(maxB, minB);
185
sum = vec_msum(difB, difB, sum);
187
vo.v = vec_sums(vs32(sum), zero);
/*
 * Scalar reference for sumsq_10 (horizontal half-pel); fragment, the
 * tail of this comment was lost in extraction:
 *
 *   for (j = 0; j < h; j++) {
 *     for (i = 0; i < 16; i++) {
 *       d = ((int)(p1[i]+p1[i+1]+1)>>1) - p2[i];
 *       ...
 */
static int sumsq_10(SUMSQ_PDECL) /* {{{ */
208
unsigned char *pB, *pR;
209
vector unsigned char l0, l1, l2, l3, lR, lB0, lB1, perm0, perm1;
210
vector unsigned short b0H, b0L, b1H, b1L;
211
vector unsigned short bH, bL;
212
vector unsigned char max, min, dif;
213
vector unsigned int sum;
214
vector unsigned char zero;
215
vector unsigned short one;
224
#define ISAD() /* {{{ */ \
225
/* pB[i] + pB[i+1] */ \
226
bH = vec_add(b0H, b1H); \
227
bL = vec_add(b0L, b1L); \
229
/* (pB[i]+pB[i+1]) + 1 */ \
230
bH = vec_add(bH, one); \
231
bL = vec_add(bL, one); \
233
/* (pB[i]+pB[i+1]+1) >> 1 */ \
234
bH = vec_sra(bH, one); \
235
bL = vec_sra(bL, one); \
237
/* d = abs( ((pB[i]+pB[i+1]+1)>>1) - pR[i] ) */ \
238
bH = vu16(vec_packsu(bH, bL)); \
239
min = vec_min(vu8(bH), lR); \
240
max = vec_max(vu8(bH), lR); \
241
dif = vec_sub(max, min); \
244
sum = vec_msum(dif, dif, sum); \
259
/* initialize constants */
260
zero = vec_splat_u8(0);
261
one = vec_splat_u16(1);
263
sum = vec_splat_u32(0);
266
perm0 = vec_lvsl(0, pB);
267
perm1 = vec_splat_u8(1);
268
perm1 = vec_add(perm0, perm1);
272
do { /* while (--i) */
274
lB0 = vec_perm(l0, l1, perm0);
275
lB1 = vec_perm(l0, l1, perm1);
281
/* (unsigned short[]) pB[0-7] */
282
b0H = vu16(vec_mergeh(zero, lB0));
284
/* (unsigned short[]) pB[8-15] */
285
b0L = vu16(vec_mergel(zero, lB0));
287
/* (unsigned short[]) pB[1-8] */
288
b1H = vu16(vec_mergeh(zero, lB1));
290
/* (unsigned short[]) pB[9-16] */
291
b1L = vu16(vec_mergel(zero, lB1));
298
lB0 = vec_perm(l2, l3, perm0);
299
lB1 = vec_perm(l2, l3, perm1);
305
/* (unsigned short[]) pB[0-7] */
306
b0H = vu16(vec_mergeh(zero, lB0));
308
/* (unsigned short[]) pB[8-15] */
309
b0L = vu16(vec_mergel(zero, lB0));
311
/* (unsigned short[]) pB[1-8] */
312
b1H = vu16(vec_mergeh(zero, lB1));
314
/* (unsigned short[]) pB[9-16] */
315
b1L = vu16(vec_mergel(zero, lB1));
324
lB0 = vec_perm(l0, l1, perm0);
325
lB1 = vec_perm(l0, l1, perm1);
327
/* (unsigned short[]) pB[0-7] */
328
b0H = vu16(vec_mergeh(zero, lB0));
330
/* (unsigned short[]) pB[8-15] */
331
b0L = vu16(vec_mergel(zero, lB0));
333
/* (unsigned short[]) pB[1-8] */
334
b1H = vu16(vec_mergeh(zero, lB1));
336
/* (unsigned short[]) pB[9-16] */
337
b1L = vu16(vec_mergel(zero, lB1));
344
lB0 = vec_perm(l2, l3, perm0);
345
lB1 = vec_perm(l2, l3, perm1);
347
/* (unsigned short[]) pB[0-7] */
348
b0H = vu16(vec_mergeh(zero, lB0));
350
/* (unsigned short[]) pB[8-15] */
351
b0L = vu16(vec_mergel(zero, lB0));
353
/* (unsigned short[]) pB[1-8] */
354
b1H = vu16(vec_mergeh(zero, lB1));
356
/* (unsigned short[]) pB[9-16] */
357
b1L = vu16(vec_mergel(zero, lB1));
361
vo.v = vec_sums(vs32(sum), vs32(zero));
/*
 * Scalar reference for sumsq_01 (vertical half-pel, s = rowstride);
 * fragment, the tail of this comment was lost in extraction:
 *
 *   for (j = 0; j < h; j++) {
 *     for (i = 0; i < 16; i++) {
 *       d = ((int)(p1[i]+p1[i+s]+1)>>1) - p2[i];
 *       ...
 */
static int sumsq_01(SUMSQ_PDECL) /* {{{ */
383
unsigned char *pB, *pR;
384
vector unsigned char l0, l1, lR, lB0, lB1, perm;
385
vector unsigned short b0H, b0L, b1H, b1L;
386
vector unsigned short bH, bL;
387
vector unsigned char max, min, dif;
388
vector unsigned int sum;
389
vector unsigned char zero;
390
vector unsigned short one;
400
#define ISAD() /* {{{ */ \
401
/* pB[i] + pB[i+s] */ \
402
bH = vec_add(b0H, b1H); \
403
bL = vec_add(b0L, b1L); \
405
/* (pB[i]+pB[i+s]) + 1 */ \
406
bH = vec_add(bH, one); \
407
bL = vec_add(bL, one); \
409
/* (pB[i]+pB[i+s]+1) >> 1 */ \
410
bH = vec_sra(bH, one); \
411
bL = vec_sra(bL, one); \
413
/* d = abs( ((pB[i]+pB[i+s]+1)>>1) - pR[i] ) */ \
414
bH = vu16(vec_packsu(bH, bL)); \
415
min = vec_min(vu8(bH), lR); \
416
max = vec_max(vu8(bH), lR); \
417
dif = vec_sub(max, min); \
420
sum = vec_msum(dif, dif, sum); \
426
/* initialize constants */
427
zero = vec_splat_u8(0);
428
one = vec_splat_u16(1);
430
sum = vec_splat_u32(0);
436
if (VECTOR_ALIGNED(pB)) {
438
/* lB0 = vec_ld(0, pB); */
446
/* (unsigned short[]) pB[0-7] */
447
b0H = vu16(vec_mergeh(zero, lB0));
449
/* (unsigned short[]) pB[8-15] */
450
b0L = vu16(vec_mergel(zero, lB0));
452
/* (unsigned short[]) pB+s[0-7] */
453
b1H = vu16(vec_mergeh(zero, lB1));
455
/* (unsigned short[]) pB+s[8-15] */
456
b1L = vu16(vec_mergel(zero, lB1));
458
lB0 = vec_sld(l0, l0, 0);
460
do { /* while (--i) */
466
/* start loading next lR */
470
/* (unsigned short[]) pB[0-7] */
471
b0H = vu16(vec_mergeh(zero, lB0));
473
/* (unsigned short[]) pB[8-15] */
474
b0L = vu16(vec_mergel(zero, lB0));
481
/* start loading next lR */
485
/* (unsigned short[]) pB[0-7] */
486
b1H = vu16(vec_mergeh(zero, lB1));
488
/* (unsigned short[]) pB[8-15] */
489
b1L = vu16(vec_mergel(zero, lB1));
501
perm = vec_lvsl(0, pB);
503
/* lB0 = vec_ld(0, pB); */
512
lB0 = vec_perm(lB0, l0, perm);
513
lB1 = vec_perm(lB1, l1, perm);
515
/* (unsigned short[]) pB[0-7] */
516
b0H = vu16(vec_mergeh(zero, lB0));
518
/* (unsigned short[]) pB[8-15] */
519
b0L = vu16(vec_mergel(zero, lB0));
521
/* (unsigned short[]) pB+s[0-7] */
522
b1H = vu16(vec_mergeh(zero, lB1));
524
/* (unsigned short[]) pB+s[8-15] */
525
b1L = vu16(vec_mergel(zero, lB1));
531
do { /* while (--i) */
535
lB0 = vec_perm(l0, l1, perm);
541
/* (unsigned short[]) pB[0-7] */
542
b0H = vu16(vec_mergeh(zero, lB0));
544
/* (unsigned short[]) pB[8-15] */
545
b0L = vu16(vec_mergel(zero, lB0));
547
/* start loading next lR */
553
lB1 = vec_perm(l0, l1, perm);
559
/* (unsigned short[]) pB[0-7] */
560
b1H = vu16(vec_mergeh(zero, lB1));
562
/* (unsigned short[]) pB[8-15] */
563
b1L = vu16(vec_mergel(zero, lB1));
566
/* start loading next lR */
577
lB0 = vec_perm(l0, l1, perm);
581
/* (unsigned short[]) pB[0-7] */
582
b0H = vu16(vec_mergeh(zero, lB0));
584
/* (unsigned short[]) pB[8-15] */
585
b0L = vu16(vec_mergel(zero, lB0));
589
vo.v = vec_sums(vs32(sum), vs32(zero));
/*
 * Scalar reference for sumsq_11 (horizontal + vertical half-pel,
 * s = rowstride); fragment, the tail of this comment was lost in
 * extraction:
 *
 *   for (j = 0; j < h; j++) {
 *     for (i = 0; i < 16; i++) {
 *       d = ((int)(pB[i]+pB[i+1]+pB[i+s]+pB[i+s+1]+2)>>2) - pR[i];
 *       ...
 */
static int sumsq_11(SUMSQ_PDECL) /* {{{ */
610
unsigned char *pB, *pR;
611
vector unsigned char l0, l1, l2, l3, lR, lB0, lB1, lB2, lB3, perm, perm1;
612
vector unsigned short b0H, b0L, b1H, b1L, b2H, b2L, b3H, b3L;
613
vector unsigned short bH, bL;
614
vector unsigned char zero;
615
vector unsigned short two;
616
vector unsigned char max, min, dif;
617
vector unsigned int sum;
630
/* start loading first blocks */
637
/* initialize constants */
638
zero = vec_splat_u8(0);
639
two = vec_splat_u16(2);
641
sum = vec_splat_u32(0);
644
perm = vec_lvsl(0, blk1);
645
perm1 = vec_splat_u8(1);
646
perm1 = vec_add(perm, perm1);
648
/* permute 1st set of loaded blocks */
649
lB0 = vec_perm(l0, l1, perm);
650
lB1 = vec_perm(l0, l1, perm1);
652
/* start loading 3rd set */
657
/* permute 2nd set of loaded blocks */
658
lB2 = vec_perm(l2, l3, perm);
659
lB3 = vec_perm(l2, l3, perm1);
661
/* start loading lR */
664
/* (unsigned short[]) pB[0-7] */
665
b0H = vu16(vec_mergeh(zero, lB0));
667
/* (unsigned short[]) pB[8-15] */
668
b0L = vu16(vec_mergel(zero, lB0));
670
/* (unsigned short[]) pB[1-8] */
671
b1H = vu16(vec_mergeh(zero, lB1));
673
/* (unsigned short[]) pB[9-16] */
674
b1L = vu16(vec_mergel(zero, lB1));
676
/* (unsigned short[]) pB+s[0-7] */
677
b2H = vu16(vec_mergeh(zero, lB2));
679
/* (unsigned short[]) pB+s[8-15] */
680
b2L = vu16(vec_mergel(zero, lB2));
682
/* (unsigned short[]) pB+s[1-8] */
683
b3H = vu16(vec_mergeh(zero, lB3));
685
/* (unsigned short[]) pB+s[9-16] */
686
b3L = vu16(vec_mergel(zero, lB3));
688
#define ISUMSQ(b0H,b0L,b1H,b1L,b2H,b2L,b3H,b3L) /* {{{ */ \
689
/* pB[i] + pB[i+1] */ \
690
bH = vec_add(b0H, b1H); \
691
bL = vec_add(b0L, b1L); \
693
/* (pB[i]+pB[i+1]) + pB[i+s] */ \
694
bH = vec_add(bH, b2H); \
695
bL = vec_add(bL, b2L); \
697
/* (pB[i]+pB[i+1]+pB[i+s]) + pB[i+s+1] */ \
698
bH = vec_add(bH, b3H); \
699
bL = vec_add(bL, b3L); \
701
/* (pB[i]+pB[i+1]+pB[i+s]+pB[i+s+1]) + 2 */ \
702
bH = vec_add(bH, two); \
703
bL = vec_add(bL, two); \
705
/* (pB[i]+pB[i+1]+pB[i+s]+pB[i+s+1]+2) >> 2 */ \
706
bH = vec_sra(bH, two); \
707
bL = vec_sra(bL, two); \
709
/* absolute value is used increase parallelism, x16 instead of x8 */ \
710
/* d = abs( ((int)(pB[i]+pB[i+1]+pB[i+s]+pB[i+s+1]+2)>>2) - pR[i] ) */ \
711
bH = vu16(vec_packsu(bH, bL)); \
712
min = vec_min(vu8(bH), lR); \
713
max = vec_max(vu8(bH), lR); \
714
dif = vec_sub(max, min); \
716
/* sum += d * d; */ \
717
sum = vec_msum(dif, dif, sum); \
723
ISUMSQ(b0H,b0L,b1H,b1L,b2H,b2L,b3H,b3L);
726
/* start loading next lR */
730
/* perm loaded set */
731
lB0 = vec_perm(l0, l1, perm);
732
lB1 = vec_perm(l0, l1, perm1);
734
/* start loading next set */
740
/* (unsigned short[]) pB[0-7] */
741
b0H = vu16(vec_mergeh(zero, lB0));
743
/* (unsigned short[]) pB[8-15] */
744
b0L = vu16(vec_mergel(zero, lB0));
746
/* (unsigned short[]) pB[1-8] */
747
b1H = vu16(vec_mergeh(zero, lB1));
749
/* (unsigned short[]) pB[9-16] */
750
b1L = vu16(vec_mergel(zero, lB1));
752
ISUMSQ(b2H,b2L,b3H,b3L,b0H,b0L,b1H,b1L);
755
/* start loading next lR */
759
/* perm loaded set */
760
lB2 = vec_perm(l0, l1, perm);
761
lB3 = vec_perm(l0, l1, perm1);
763
/* start loading next set */
769
/* (unsigned short[]) pB+s[0-7] */
770
b2H = vu16(vec_mergeh(zero, lB2));
772
/* (unsigned short[]) pB+s[8-15] */
773
b2L = vu16(vec_mergel(zero, lB2));
775
/* (unsigned short[]) pB+s[1-8] */
776
b3H = vu16(vec_mergeh(zero, lB3));
778
/* (unsigned short[]) pB+s[9-16] */
779
b3L = vu16(vec_mergel(zero, lB3));
782
ISUMSQ(b0H,b0L,b1H,b1L,b2H,b2L,b3H,b3L);
787
lB0 = vec_perm(l0, l1, perm);
788
lB1 = vec_perm(l0, l1, perm1);
790
/* (unsigned short[]) pB[0-7] */
791
b0H = vu16(vec_mergeh(zero, lB0));
793
/* (unsigned short[]) pB[8-15] */
794
b0L = vu16(vec_mergel(zero, lB0));
796
/* (unsigned short[]) pB[1-8] */
797
b1H = vu16(vec_mergeh(zero, lB1));
799
/* (unsigned short[]) pB[9-16] */
800
b1L = vu16(vec_mergel(zero, lB1));
802
ISUMSQ(b2H,b2L,b3H,b3L,b0H,b0L,b1H,b1L);
804
vo.v = vec_sums(vs32(sum), vs32(zero));
811
int sumsq_altivec(SUMSQ_PDECL)
815
#ifdef ALTIVEC_VERIFY
816
if (NOT_VECTOR_ALIGNED(blk2))
817
mjpeg_error_exit1("sumsq: blk2 %% 16 != 0, (%d)", blk2);
819
if (NOT_VECTOR_ALIGNED(rowstride))
820
mjpeg_error_exit1("sumsq: rowstride %% 16 != 0, (%d)", rowstride);
822
if (h != 8 && h != 16)
823
mjpeg_error_exit1("sumsq: h != [8|16], (%d)", h);
830
sumsq = sumsq_00(SUMSQ_ARGS);
832
sumsq = sumsq_01(SUMSQ_ARGS);
835
sumsq = sumsq_10(SUMSQ_ARGS);
837
sumsq = sumsq_11(SUMSQ_ARGS);
845
#if ALTIVEC_TEST_FUNCTION(sumsq)
846
ALTIVEC_TEST(sumsq, int, (SUMSQ_PDECL),
847
"blk1=0x%x, blk2=0x%x, rowstride=%d, hx=%d, hy=%d, h=%d",
850
/* vim:set foldmethod=marker foldlevel=0: */