/* bsumsq_sub22.c, this file is part of the
 * AltiVec optimized library for MJPEG tools MPEG-1/2 Video Encoder
 * Copyright (C) 2002 James Klicman <james@klicman.org>
 *
 * This library is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */
#include "altivec_motion.h"
#include "vectorize.h"
#include "../mjpeg_logging.h"
/* #define AMBER_ENABLE */

/* include last to ensure AltiVec type semantics, especially for bool. */

/*
 * Total squared difference between bidirection prediction of (8*h)
 * blocks of 2*2 subsampled pels.
 *
 * Iterate through all rows 2 at a time.
 *
 * Hints regarding input:
 * b) ref is about 50% vector aligned and 50% 8 byte aligned
 * c) rowstride is always a multiple of 16
 *
 * Reference scalar implementation (abridged; some lines lost in
 * extraction — TODO restore from upstream mjpegtools source):
 *
 * for (j = 0; j < h; j++) {
 *     for (i = 0; i < 8; i++) {
 *         d = ((p1f[i]+p1b[i]+1)>>1) - p2[i];
 *         ...
 *     }
 * }
 */
59
#define BSUMSQ_SUB22_PDECL \
66
#define BSUMSQ_SUB22_ARGS blk1f, blk1b, blk2, rowstride, h
68
int bsumsq_sub22_altivec(BSUMSQ_SUB22_PDECL)
72
unsigned char *pB, *pF, *pR;
73
vector unsigned char align8x2;
74
vector unsigned char permB, permF, permR;
75
vector unsigned char lB0, lB1, lB2, lB3;
76
vector unsigned char lF0, lF1, lF2, lF3;
77
vector unsigned char lR0, lR1;
78
vector unsigned char B, F, R;
79
vector unsigned short bH, bL, fH, fL;
80
vector unsigned char min;
81
vector unsigned char max;
82
vector unsigned char dif;
83
vector unsigned int sum;
84
vector unsigned char zero;
85
vector unsigned short one;
95
if (((unsigned long)blk2 % 8) != 0)
96
mjpeg_error_exit1("bsumsq_sub22: blk2 %% 8 != 0, (0x%X)", blk2);
98
if (NOT_VECTOR_ALIGNED(rowstride))
99
mjpeg_error_exit1("bsumsq_sub22: rowstride %% 16 != 0, (%d)",
102
if (h != 4 && h != 8)
103
mjpeg_error_exit1("bsumsq_sub22: h != [4|8], (%d)", h);
106
/* 8*h blocks calculated in 8*2 chunks */
107
/* align8x2 = 0x( 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17 ) {{{ */
108
align8x2 = vec_lvsl(0, (unsigned char*)0);
109
align8x2 = vec_sld(align8x2, align8x2, 8);
110
permB = vec_lvsr(0, (unsigned char*)0);
111
align8x2 = vec_sld(align8x2, permB, 8);
117
zero = vec_splat_u8(0);
118
one = vec_splat_u16(1);
120
sum = vec_splat_u32(0);
123
permR = vec_lvsl(0, pR);
124
permR = vec_splat(permR, 0);
125
permR = vec_add(permR, align8x2);
127
lt8B = (((unsigned long)blk1b & 0xf) <= 8);
128
lt8F = (((unsigned long)blk1f & 0xf) <= 8);
134
permB = vec_lvsl(0, pB);
135
permB = vec_splat(permB, 0);
136
permB = vec_add(permB, align8x2);
138
permF = vec_lvsl(0, pF);
139
permF = vec_splat(permF, 0);
140
permF = vec_add(permF, align8x2);
154
B = vec_perm(lB0, lB1, permB);
155
F = vec_perm(lF0, lF1, permF);
156
R = vec_perm(lR0, lR1, permR);
175
/* (unsigned short[]) pB[0-7] */
176
bH = vu16(vec_mergeh(zero, B));
178
/* (unsigned short[]) pF[0-7] */
179
fH = vu16(vec_mergeh(zero, F));
182
bH = vec_add(bH, fH);
184
/* (unsigned short[]) pB[8-15] */
185
bL = vu16(vec_mergel(zero, B));
187
/* (unsigned short[]) pF[8-15] */
188
fL = vu16(vec_mergel(zero, F));
191
bL = vec_add(bL, fL);
193
/* (pB[i]+pF[i]) + 1 */
194
bH = vec_add(bH, one);
195
bL = vec_add(bL, one);
197
/* (pB[i]+pF[i]+1) >> 1 */
198
bH = vec_sra(bH, one);
199
bL = vec_sra(bL, one);
201
/* d = abs( ((pB[i]+pF[i]+1)>>1) - pR[i] ) */
202
bH = vu16(vec_packsu(bH, bL));
203
min = vec_min(vu8(bH), R);
204
max = vec_max(vu8(bH), R);
205
dif = vec_sub(max, min);
208
sum = vec_msum(dif, dif, sum);
211
B = vec_perm(lB0, lB1, permB);
212
F = vec_perm(lF0, lF1, permF);
213
R = vec_perm(lR0, lR1, permR);
218
else if (lt8B || lt8F)
228
permB = vec_lvsl(0, pB);
230
permF = vec_lvsl(0, pF);
231
permF = vec_splat(permF, 0);
232
permF = vec_add(permF, align8x2);
235
lB1 = vec_ld(16, pB);
238
lB3 = vec_ld(16, pB);
249
lB0 = vec_perm(lB0, lB1, permB);
250
lB2 = vec_perm(lB2, lB3, permB);
251
B = vec_perm(lB0, lB2, align8x2);
253
F = vec_perm(lF0, lF1, permF);
254
R = vec_perm(lR0, lR1, permR);
259
lB1 = vec_ld(16, pB);
262
lB3 = vec_ld(16, pB);
275
/* (unsigned short[]) pB[0-7] */
276
bH = vu16(vec_mergeh(zero, B));
278
/* (unsigned short[]) pF[0-7] */
279
fH = vu16(vec_mergeh(zero, F));
282
bH = vec_add(bH, fH);
284
/* (unsigned short[]) pB[8-15] */
285
bL = vu16(vec_mergel(zero, B));
287
/* (unsigned short[]) pF[8-15] */
288
fL = vu16(vec_mergel(zero, F));
291
bL = vec_add(bL, fL);
293
/* (pB[i]+pF[i]) + 1 */
294
bH = vec_add(bH, one);
295
bL = vec_add(bL, one);
297
/* (pB[i]+pF[i]+1) >> 1 */
298
bH = vec_sra(bH, one);
299
bL = vec_sra(bL, one);
301
/* d = abs( ((pB[i]+pF[i]+1)>>1) - pR[i] ) */
302
bH = vu16(vec_packsu(bH, bL));
303
min = vec_min(vu8(bH), R);
304
max = vec_max(vu8(bH), R);
305
dif = vec_sub(max, min);
308
sum = vec_msum(dif, dif, sum);
310
lB0 = vec_perm(lB0, lB1, permB);
311
lB2 = vec_perm(lB2, lB3, permB);
312
B = vec_perm(lB0, lB2, align8x2);
314
F = vec_perm(lF0, lF1, permF);
315
R = vec_perm(lR0, lR1, permR);
323
permB = vec_lvsl(0, pB);
324
permF = vec_lvsl(0, pF);
327
lB1 = vec_ld(16, pB);
330
lB3 = vec_ld(16, pB);
333
lF1 = vec_ld(16, pF);
336
lF3 = vec_ld(16, pF);
342
lB0 = vec_perm(lB0, lB1, permB);
343
lB2 = vec_perm(lB2, lB3, permB);
344
B = vec_perm(lB0, lB2, align8x2);
346
lF0 = vec_perm(lF0, lF1, permF);
347
lF2 = vec_perm(lF2, lF3, permF);
348
F = vec_perm(lF0, lF2, align8x2);
350
R = vec_perm(lR0, lR1, permR);
355
lB1 = vec_ld(16, pB);
358
lB3 = vec_ld(16, pB);
362
lF1 = vec_ld(16, pF);
365
lF3 = vec_ld(16, pF);
372
/* (unsigned short[]) pB[0-7] */
373
bH = vu16(vec_mergeh(zero, B));
375
/* (unsigned short[]) pF[0-7] */
376
fH = vu16(vec_mergeh(zero, F));
379
bH = vec_add(bH, fH);
381
/* (unsigned short[]) pB[8-15] */
382
bL = vu16(vec_mergel(zero, B));
384
/* (unsigned short[]) pF[8-15] */
385
fL = vu16(vec_mergel(zero, F));
388
bL = vec_add(bL, fL);
390
/* (pB[i]+pF[i]) + 1 */
391
bH = vec_add(bH, one);
392
bL = vec_add(bL, one);
394
/* (pB[i]+pF[i]+1) >> 1 */
395
bH = vec_sra(bH, one);
396
bL = vec_sra(bL, one);
398
/* d = abs( ((pB[i]+pF[i]+1)>>1) - pR[i] ) */
399
bH = vu16(vec_packsu(bH, bL));
400
min = vec_min(vu8(bH), R);
401
max = vec_max(vu8(bH), R);
402
dif = vec_sub(max, min);
405
sum = vec_msum(dif, dif, sum);
407
lB0 = vec_perm(lB0, lB1, permB);
408
lB2 = vec_perm(lB2, lB3, permB);
409
B = vec_perm(lB0, lB2, align8x2);
411
lF0 = vec_perm(lF0, lF1, permF);
412
lF2 = vec_perm(lF2, lF3, permF);
413
F = vec_perm(lF0, lF2, align8x2);
415
R = vec_perm(lR0, lR1, permR);
419
/* (unsigned short[]) pB[0-7] */
420
bH = vu16(vec_mergeh(zero, B));
422
/* (unsigned short[]) pF[0-7] */
423
fH = vu16(vec_mergeh(zero, F));
426
bH = vec_add(bH, fH);
428
/* (unsigned short[]) pB[8-15] */
429
bL = vu16(vec_mergel(zero, B));
431
/* (unsigned short[]) pF[8-15] */
432
fL = vu16(vec_mergel(zero, F));
435
bL = vec_add(bL, fL);
437
/* (pB[i]+pF[i]) + 1 */
438
bH = vec_add(bH, one);
439
bL = vec_add(bL, one);
441
/* (pB[i]+pF[i]+1) >> 1 */
442
bH = vec_sra(bH, one);
443
bL = vec_sra(bL, one);
445
/* d = abs( ((pB[i]+pF[i]+1)>>1) - pR[i] ) */
446
bH = vu16(vec_packsu(bH, bL));
447
min = vec_min(vu8(bH), R);
448
max = vec_max(vu8(bH), R);
449
dif = vec_sub(max, min);
452
sum = vec_msum(dif, dif, sum);
454
vo.v = vec_sums(vs32(sum), vs32(zero));
461
#if ALTIVEC_TEST_FUNCTION(bsumsq_sub22)
462
ALTIVEC_TEST(bsumsq_sub22, int, (BSUMSQ_SUB22_PDECL),
463
"blk1f=0x%X, blk1b=0x%X, blk2=0x%X, rowstride=%d, h=%d",
466
/* vim:set foldmethod=marker foldlevel=0: */