1
/* build_sub22_mests.c, this file is part of the
2
* AltiVec optimized library for MJPEG tools MPEG-1/2 Video Encoder
3
* Copyright (C) 2002 James Klicman <james@klicman.org>
5
* This library is free software; you can redistribute it and/or modify
6
* it under the terms of the GNU General Public License as published by
7
* the Free Software Foundation; either version 2 of the License, or
8
* (at your option) any later version.
10
* This program is distributed in the hope that it will be useful,
11
* but WITHOUT ANY WARRANTY; without even the implied warranty of
12
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
* GNU General Public License for more details.
15
* You should have received a copy of the GNU General Public License
16
* along with this program; if not, write to the Free Software
17
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24
#include "altivec_motion.h"
25
#include "vectorize.h"
27
#include "../mjpeg_logging.h"
29
/* #define AMBER_ENABLE */
33
/* include last to ensure AltiVec type semantics, especially for bool. */
39
extern int sub_mean_reduction_ppc(int len, me_result_set *set, int reduction);
44
* Get SAD for 2*2 subsampled macroblocks:
45
* (0,0) (+2,0) (0,+2) (+2,+2) pixel-space coordinates
46
* (0,0) (+1,0) (0,+1) (+1,+1) 2*2 subsampled coordinates
49
* blk(+2, 0) (blk += 1)
50
* blk( 0, +2) (blk += rowstride-1)
51
* blk(+2, +2) (blk += 1)
53
* Iterate through all rows 2 at a time, calculating all 4 sads as we go.
55
* Hints regarding input:
56
* a) blk may be vector aligned, mostly not aligned
57
* b) ref is about 50% vector aligned and 50% 8 byte aligned
58
* c) rowstride is always a multiple of 16
61
* NOTES: Since ref is always 8 byte aligned and we are only interested in
62
* the first 8 bytes, the data can always be retrieved with one vec_ld.
63
* This "one vec_ld" optimization is also attempted for blk.
65
* The permutation vectors only need to be calculated once since
66
* rowstride is always a multiple of 16.
69
#define BUILD_SUB22_MESTS_PDECL /* {{{ */ \
70
me_result_set *sub44set, \
71
me_result_set *sub22set, \
72
int i0, int j0, int ihigh, int jhigh, \
74
uint8_t *s22org, uint8_t *s22blk, \
75
int rowstride, int h, \
79
#define BUILD_SUB22_MESTS_ARGS /* {{{ */ \
81
i0, j0, ihigh, jhigh, \
88
/* int build_sub22_mests_altivec(BUILD_SUB22_MESTS_PDECL) {{{ */
89
#if defined(ALTIVEC_VERIFY) && ALTIVEC_TEST_FUNCTION(build_sub22_mests)
90
#define VERIFY_BUILD_SUB22_MESTS
92
static void verify_sads(uint8_t *blk1, uint8_t *blk2, int stride, int h,
93
int *sads, int count);
95
static int _build_sub22_mests_altivec(BUILD_SUB22_MESTS_PDECL, int verify);
96
int build_sub22_mests_altivec(BUILD_SUB22_MESTS_PDECL)
98
return _build_sub22_mests_altivec(BUILD_SUB22_MESTS_ARGS, 0 /* no verify */);
101
static int _build_sub22_mests_altivec(BUILD_SUB22_MESTS_PDECL, int verify)
103
int build_sub22_mests_altivec(BUILD_SUB22_MESTS_PDECL)
110
me_result_s *sub44mests;
114
vector unsigned int zero;
115
vector unsigned char lvsl;
116
vector unsigned char perm2;
117
vector unsigned char align8x2;
118
vector unsigned int sads;
119
vector signed char xy22,
121
vector unsigned char xint,
123
vector unsigned int vthreshold;
124
unsigned int threshold;
125
int stride1, stride2, stride1_16, stride2_16;
127
vector unsigned char _align16;
130
unsigned int threshold;
133
me_result_s mests[4];
136
DataStreamControl dsc;
142
#ifdef ALTIVEC_VERIFY /* {{{ */
143
if (((unsigned long)s22blk & 0x7) != 0)
144
mjpeg_error_exit1("build_sub22_mests: s22blk %% 8 != 0, (0x%X)", s22blk);
146
if (NOT_VECTOR_ALIGNED(rowstride))
147
mjpeg_error_exit1("build_sub22_mests: rowstride %% 16 != 0, (%d)",
150
if (h != 4 && h != 8)
151
mjpeg_error_exit1("build_sub22_mests: h != [4|8], (%d)", h);
154
if (NOT_VECTOR_ALIGNED(cres))
155
mjpeg_warn("build_sub22_mests: cres %% 16 != 0, (0x%X)",cres);
163
if (len < 1) { /* sub44set->len is sometimes zero. we can */
164
sub22set->len = 0; /* save a lot of effort if we stop short. */
169
dsc.control = DATA_STREAM_CONTROL(1,0,0);
171
dsc.block.stride = rowstride;
173
vec_dst(s22blk, dsc.control, 0);
175
/* increase size to 2 and increment count */
176
dsc.control += DATA_STREAM_CONTROL(1,1,0);
179
sub44mests = sub44set->mests;
180
cres = sub22set->mests;
181
cres--; /* decrement cres so all stores can be done with stwu */
184
/* execute instructions that are not dependent on pack_bits */
185
zero = vec_splat_u32(0); /* initialize to zero */
186
/* lvsl = 0x(00,01,02,03,04,05,06,07,08,09,0A,0B,0C,0D,0E,0F) {{{ */
187
lvsl = vec_lvsl(0, (unsigned char*) 0);
190
/* 8*8 or 8*4 calculated in 8*2 chunks */
191
/* align8x2 = 0x(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17) {{{ */
192
align8x2 = vec_sld(lvsl, lvsl, 8);
193
perm2 = vec_lvsr(0, (unsigned char*)0);
194
align8x2 = vec_sld(align8x2, perm2, 8);
197
mres.weight = 0; /* weight must be zero */
198
mres.x = ihigh - i0; /* x <= (ihigh - i0) */
199
mres.y = jhigh - j0; /* y <= (jhigh - j0) */
200
vio.init.xylim = mres;
201
threshold = 6 * null_ctl_sad / (reduction << 2);
202
vio.init.threshold = threshold;
203
xy22 = (vector signed char)VCONST(0,0,0,0, 0,0,2,0, 0,0,0,2, 0,0,2,2);
204
xint = vu8(vec_splat_u32(0xf));
205
xint = vec_add(xint, lvsl);
206
yint = vu8(vec_splat_u32(1));
207
yint = vec_add(yint, xint);
209
perm2 = vec_lvsl(0, s22blk);
210
perm2 = vec_splat(perm2, 0);
211
perm2 = vec_add(perm2, align8x2);
214
stride2 = rowstride + rowstride;
215
stride1_16 = stride1 + 16;
216
stride2_16 = stride2 + 16;
220
vthreshold = vec_ld(0, (unsigned int*) &vio.init);
221
xylim = vs8(vec_splat(vu32(vthreshold), 0)); /* vio.init.xylim */
222
vthreshold = vu32(vec_splat(vu32(vthreshold), 1)); /* vio.init.threshold */
224
do { /* while (--len) */
230
s22orgblk = s22org + ((y+j0)>>1)*rowstride + ((x+i0)>>1);
232
vec_dst(s22orgblk, dsc.control, 1);
234
mres.weight = 0; /* weight must be zero */
238
/* calculate SADs for 2*2 subsampled macroblocks: {{{ */
240
vector unsigned int sad20, sad02, sad22;
241
vector unsigned char max, min, dif;
242
vector unsigned char perm1;
243
vector unsigned char align8x2_0, align8x2_2;
244
vector unsigned char ld0, ld1, ld3;
245
vector unsigned char v8x1a, v8x1b;
246
vector unsigned char vblk8x2;
247
vector unsigned char vref8x2;
248
uint8_t *pblk, *pref;
258
perm1 = vec_lvsl(0, pblk); /* initialize permute vector */
260
if (((unsigned long)pblk & 0xf) < 8) {
262
v8x1a = vec_ld(0, pblk);
263
/* pblk += rowstride; */
264
v8x1b = vec_ld(stride1, pblk);
266
vref8x2 = vec_ld(0, pref);
267
/* pref += rowstride; */
268
ld3 = vec_ld(stride1, pref);
270
align8x2_0 = vec_splat(perm1, 0);
271
align8x2_0 = vec_add(align8x2_0, align8x2);
272
align8x2_2 = vec_splat(perm1, 1);
273
align8x2_2 = vec_add(align8x2_2, align8x2);
275
vref8x2 = vec_perm(vref8x2, ld3, perm2);
278
do { /* while (--i) */
280
/* pblk += rowstride; */
282
ld0 = vec_ld(0, pblk);
284
/* calculate (0,0) */
285
vblk8x2 = vec_perm(v8x1a, v8x1b, align8x2_0);
286
max = vec_max(vblk8x2, vref8x2);
287
min = vec_min(vblk8x2, vref8x2);
288
dif = vec_sub(max, min);
289
sads = vec_sum4s(dif, sads);
291
/* calculate (2,0) */
292
vblk8x2 = vec_perm(v8x1a, v8x1b, align8x2_2);
293
max = vec_max(vblk8x2, vref8x2);
294
min = vec_min(vblk8x2, vref8x2);
295
dif = vec_sub(max, min);
296
sad20 = vec_sum4s(dif, sad20);
298
/* load into v8x1a, then v8x1b will be the top row */
299
v8x1a = vec_sld(ld0, ld0, 0); /* v8x1a = ld0; */
301
/* pblk += rowstride; */
302
ld0 = vec_ld(stride1, pblk);
304
/* calculate (0,2) */
305
vblk8x2 = vec_perm(v8x1b, v8x1a, align8x2_0);
306
max = vec_max(vblk8x2, vref8x2);
307
min = vec_min(vblk8x2, vref8x2);
308
dif = vec_sub(max, min);
309
sad02 = vec_sum4s(dif, sad02);
311
/* calculate (2,2) */
312
vblk8x2 = vec_perm(v8x1b, v8x1a, align8x2_2);
313
max = vec_max(vblk8x2, vref8x2);
314
min = vec_min(vblk8x2, vref8x2);
316
/* pref += rowstride; */
318
vref8x2 = vec_ld(0, pref);
319
/* pref += rowstride; */
320
ld3 = vec_ld(stride1, pref);
322
dif = vec_sub(max, min);
323
sad22 = vec_sum4s(dif, sad22);
325
v8x1b = vec_sld(ld0, ld0, 0); /* v8x1b = ld0; */
327
vref8x2 = vec_perm(vref8x2, ld3, perm2);
331
/* pblk += rowstride; */
333
ld0 = vec_ld(0, pblk);
335
/* calculate (0,0) */
336
vblk8x2 = vec_perm(v8x1a, v8x1b, align8x2_0);
337
max = vec_max(vblk8x2, vref8x2);
338
min = vec_min(vblk8x2, vref8x2);
339
dif = vec_sub(max, min);
340
sads = vec_sum4s(dif, sads);
342
/* calculate (2,0) */
343
vblk8x2 = vec_perm(v8x1a, v8x1b, align8x2_2);
344
/* load into v8x1a, then v8x1b will be the top row */
345
v8x1a = vec_sld(ld0, ld0, 0); /* v8x1a = ld0; */
349
v8x1a = vec_ld(0, pblk);
350
ld0 = vec_ld(16, pblk);
352
/* pblk += rowstride; */
353
v8x1b = vec_ld(stride1, pblk);
354
ld1 = vec_ld(stride1_16, pblk);
356
/* align8x2_0 = align8x2 */
357
align8x2_0 = vec_sld(align8x2, align8x2, 0);
358
align8x2_2 = vec_splat_u8(1);
359
align8x2_2 = vec_add(align8x2, align8x2_2 /* (1) */ );
361
vref8x2 = vec_ld(0, pref);
362
/* pref += rowstride; */
363
ld3 = vec_ld(stride1, pref);
365
v8x1a = vec_perm(v8x1a, ld0, perm1);
366
v8x1b = vec_perm(v8x1b, ld1, perm1);
368
vref8x2 = vec_perm(vref8x2, ld3, perm2);
371
do { /* while (--i) */
373
/* pblk += rowstride; */
375
ld0 = vec_ld(0, pblk);
376
ld1 = vec_ld(16, pblk);
378
/* calculate (0,0) */
379
vblk8x2 = vec_perm(v8x1a, v8x1b, align8x2_0);
380
max = vec_max(vblk8x2, vref8x2);
381
min = vec_min(vblk8x2, vref8x2);
382
dif = vec_sub(max, min);
383
sads = vec_sum4s(dif, sads);
385
/* calculate (2,0) */
386
vblk8x2 = vec_perm(v8x1a, v8x1b, align8x2_2);
387
max = vec_max(vblk8x2, vref8x2);
388
min = vec_min(vblk8x2, vref8x2);
389
dif = vec_sub(max, min);
390
sad20 = vec_sum4s(dif, sad20);
392
/* load into v8x1a, then v8x1b will be the top row */
393
v8x1a = vec_perm(ld0, ld1, perm1);
395
/* pblk += rowstride; */
396
ld0 = vec_ld(stride1, pblk);
397
ld1 = vec_ld(stride1_16, pblk);
399
/* calculate (0,2) */
400
vblk8x2 = vec_perm(v8x1b, v8x1a, align8x2_0);
401
max = vec_max(vblk8x2, vref8x2);
402
min = vec_min(vblk8x2, vref8x2);
403
dif = vec_sub(max, min);
404
sad02 = vec_sum4s(dif, sad02);
406
/* calculate (2,2) */
407
vblk8x2 = vec_perm(v8x1b, v8x1a, align8x2_2);
408
max = vec_max(vblk8x2, vref8x2);
409
min = vec_min(vblk8x2, vref8x2);
411
/* pref += rowstride; */
413
vref8x2 = vec_ld(0, pref);
414
/* pref += rowstride; */
415
ld3 = vec_ld(stride1, pref);
417
dif = vec_sub(max, min);
418
sad22 = vec_sum4s(dif, sad22);
420
v8x1b = vec_perm(ld0, ld1, perm1);
422
vref8x2 = vec_perm(vref8x2, ld3, perm2);
426
/* pblk += rowstride; */
428
ld0 = vec_ld(0, pblk);
429
ld1 = vec_ld(16, pblk);
431
/* calculate (0,0) */
432
vblk8x2 = vec_perm(v8x1a, v8x1b, align8x2_0);
433
max = vec_max(vblk8x2, vref8x2);
434
min = vec_min(vblk8x2, vref8x2);
435
dif = vec_sub(max, min);
436
sads = vec_sum4s(dif, sads);
438
/* calculate (2,0) */
439
vblk8x2 = vec_perm(v8x1a, v8x1b, align8x2_2);
440
/* load into v8x1a, then v8x1b will be the top row */
441
v8x1a = vec_perm(ld0, ld1, perm1);
445
/* calculate (2,0) */
446
max = vec_max(vblk8x2, vref8x2);
447
min = vec_min(vblk8x2, vref8x2);
448
dif = vec_sub(max, min);
449
sad20 = vec_sum4s(dif, sad20);
451
/* calculate (0,2) */
452
vblk8x2 = vec_perm(v8x1b, v8x1a, align8x2_0);
453
max = vec_max(vblk8x2, vref8x2);
454
min = vec_min(vblk8x2, vref8x2);
455
dif = vec_sub(max, min);
456
sad02 = vec_sum4s(dif, sad02);
458
/* calculate (2,2) */
459
vblk8x2 = vec_perm(v8x1b, v8x1a, align8x2_2);
460
max = vec_max(vblk8x2, vref8x2);
461
min = vec_min(vblk8x2, vref8x2);
462
dif = vec_sub(max, min);
463
sad22 = vec_sum4s(dif, sad22);
465
/* calculate final sums {{{ */
466
sads = vu32(vec_sums(vs32(sads), vs32(zero)));
467
sad20 = vu32(vec_sums(vs32(sad20), vs32(zero)));
468
sad02 = vu32(vec_sums(vs32(sad02), vs32(zero)));
469
sad22 = vu32(vec_sums(vs32(sad22), vs32(zero)));
472
/* sads = {sads, sad20, sad02, sad22} {{{ */
473
sads = vu32(vec_mergel(vu32(sads), vu32(sad02)));
474
sad20 = vu32(vec_mergel(vu32(sad20), vu32(sad22)));
475
sads = vu32(vec_mergel(vu32(sads), vu32(sad20)));
479
#ifdef VERIFY_BUILD_SUB22_MESTS /* {{{ */
481
verify_sads(s22orgblk, s22blk, rowstride, h, (int*)&sads, 4);
484
/* add penalty, clip xy, arrange into me_result_s ... {{{ */
486
vector signed char xy;
488
xy = vec_ld(0, (signed char*) &vio.xy);
489
xy = vs8(vec_splat(vu32(xy), 0)); /* splat vio.xy */
490
xy = vs8(vec_add(xy, xy22)); /* adjust xy values for elements 1-3 */
492
/* add distance penalty {{{ */
493
/* penalty = (max(abs(x),abs(y))<<3) */
495
vector signed char xyabs;
496
vector unsigned int xxxx, yyyy;
497
vector unsigned int xymax, penalty;
499
/* (abs(x),abs(y)) */
500
xyabs = vec_subs(vs8(zero), xy);
501
xyabs = vec_max(xyabs, xy);
503
/* xxxx = (x, x, x, x), yyyy = (y, y, y, y)
504
* (0,0,x,y, 0,0,x,y, 0,0,x,y, 0,0,x,y) |/- permute vector -\|
505
* (0,0,0,x, 0,0,0,x, 0,0,0,x, 0,0,0,x) |lvsl+(0x0000000F,...)|
506
* (0,0,0,y, 0,0,0,y, 0,0,0,y, 0,0,0,y) |lvsl+(0x00000010,...)|
508
xxxx = vu32(vec_perm(vs8(zero), xyabs, xint));
509
yyyy = vu32(vec_perm(vs8(zero), xyabs, yint));
511
/* penalty = max(abs(x),abs(y)) << 3 */
512
xymax = vec_max(xxxx, yyyy);
513
penalty = vec_splat_u32(3);
514
penalty = vec_sl(xymax, penalty /* (3,...) */ );
516
sads = vec_add(sads, penalty);
519
/* mask sads x <= (ihigh - i0) && y <= (jhigh - j0) {{{ */
520
/* the first cmpgt (s8) will flag any x and/or y coordinates... {{{
521
* as out of bounds. the second cmpgt (u32) will complete the
522
* mask if the x or y flag for that result is set.
526
* [0 0 < <] [0 0 < <] [0 0 > <] [0 0 < >]
527
* vb8(xymask) = vec_cmpgt(vs8(xy), xylim)
528
* [0 0 0 0] [0 0 0 0] [0 0 1 0] [0 0 0 1]
529
* vb32(xymask) = vec_cmpgt(vu32(xymask), vu32(zero))
530
* [0 0 0 0] [0 0 0 0] [1 1 1 1] [1 1 1 1]
532
* Legend: 0=0x00 (<)=(xy[n] <= xylim[n])
533
* 1=0xff (>)=(xy[n] > xylim[n])
537
vector bool int xymask;
539
xymask = vb32(vec_cmpgt(xy, xylim));
540
xymask = vec_cmpgt(vu32(xymask), zero);
542
/* add (saturated) xymask to sads thereby forcing
543
* masked values above the threshold.
545
sads = vec_adds(sads, vu32(xymask));
548
/* arrange sad and xy into me_result_s form and store {{{ */
550
vector unsigned int mests;
551
/* mests = ( sad, xy, sad, xy, sad, xy, sad, xy ) {{{
553
* ( 0, sad, 0, sad, 0, sad, 0, sad )
554
* ( sad, sad, sad, sad, sad, sad, sad, sad )
556
* ( 0, xy, 0, xy, 0, xy, 0, xy )
557
* ( xy, xy, xy, xy, xy, xy, xy, xy )
559
* ( sad, xy, sad, xy, sad, xy, sad, xy )
561
xy = vs8(vec_pack(vu32(xy), vu32(xy)));
562
mests = vu32(vec_pack(vu32(sads), vu32(sads)));
563
mests = vu32(vec_mergeh(vu16(mests), vu16(xy)));
565
vec_st(mests, 0, (unsigned int*)&vio.mests);
569
if (vec_any_lt(sads, vthreshold)) {
570
me_result_s m0, m1, m2, m3;
571
unsigned int w0, w1, w2, w3;
594
cres++; /* increment to account for earlier decrement */
595
len = cres - sub22set->mests;
601
if ((len | reduction) > 0)
602
len = sub_mean_reduction_ppc(len, sub22set, reduction);
605
#if ALTIVEC_TEST_FUNCTION(sub_mean_reduction)
606
ALTIVEC_TEST_SUFFIX(sub_mean_reduction)(sub22set, reduction, &min_weight);
608
ALTIVEC_SUFFIX(sub_mean_reduction)(sub22set, reduction, &min_weight);
610
return sub22set->len;
614
#if ALTIVEC_TEST_FUNCTION(build_sub22_mests) /* {{{ */
616
#define BUILD_SUB22_MESTS_PFMT \
617
"sub44set=0x%X, sub22set=0x%X, i0=%d, j0=%d, ihigh=%d, jhigh=%d, " \
618
"null_ctl_sad=%d, s22org=0x%X, s22blk=0x%X, rowstride=%d, h=%d, " \
621
# ifdef ALTIVEC_VERIFY
622
int build_sub22_mests_altivec_verify(BUILD_SUB22_MESTS_PDECL)
625
unsigned long checksum1, checksum2;
627
len1 = _build_sub22_mests_altivec(BUILD_SUB22_MESTS_ARGS, 1 /*verify*/);
628
for (checksum1 = i = 0; i < len1; i++) {
629
checksum1 += sub22set->mests[i].weight;
630
checksum1 += abs(sub22set->mests[i].x);
631
checksum1 += abs(sub22set->mests[i].y);
634
len2 = ALTIVEC_TEST_WITH(build_sub22_mests)(BUILD_SUB22_MESTS_ARGS);
635
for (checksum2 = i = 0; i < len2; i++) {
636
checksum2 += sub22set->mests[i].weight;
637
checksum2 += abs(sub22set->mests[i].x);
638
checksum2 += abs(sub22set->mests[i].y);
641
if (len1 != len2 || checksum1 != checksum2) {
642
mjpeg_debug("build_sub22_mests(" BUILD_SUB22_MESTS_PFMT ")",
643
BUILD_SUB22_MESTS_ARGS);
644
mjpeg_debug("build_sub22_mests: sub44set->len=%d", sub44set->len);
645
mjpeg_debug("build_sub22_mests: checksums differ %d[%d] != %d[%d]",
646
checksum1, len1, checksum2, len2);
650
mjpeg_info("build_sub22_mests(" BUILD_SUB22_MESTS_PFMT ")",
651
BUILD_SUB22_MESTS_ARGS);
652
mjpeg_info("build_sub22_mests: sub44set->len=%d", sub44set->len);
653
mjpeg_info("build_sub22_mests: checksum %d[%d]",
661
static void verify_sads(uint8_t *blk1, uint8_t *blk2, int stride, int h,
662
int *sads, int count)
668
for (i = 0; i < count; i++) {
670
/* s = sad_sub22(pblk, blk2, stride, h); {{{ */
671
#if ALTIVEC_TEST_FUNCTION(sad_sub22)
672
s = ALTIVEC_TEST_WITH(sad_sub22)(pblk, blk2, stride, h);
674
s = sad_sub22(pblk, blk2, stride, h);
677
mjpeg_debug("build_sub22_mests: sads[%d]=%d != %d"
678
"=sad_sub22(blk1=0x%X(0x%X), blk2=0x%X, "
680
i, s2, s, pblk, blk1, blk2, stride, h);
692
#undef BENCHMARK_EPILOG
693
#define BENCHMARK_EPILOG \
694
mjpeg_info("build_sub22_mests: sub44set->len=%d", sub44set->len); \
695
mjpeg_info("build_sub22_mests: sub22set->len=%d", sub22set->len);
697
ALTIVEC_TEST(build_sub22_mests, int, (BUILD_SUB22_MESTS_PDECL),
698
BUILD_SUB22_MESTS_PFMT, BUILD_SUB22_MESTS_ARGS);
701
/* vim:set foldmethod=marker foldlevel=0: */