/* sumsq_sub22.c, this file is part of the
 * AltiVec optimized library for MJPEG tools MPEG-1/2 Video Encoder
 * Copyright (C) 2002  James Klicman <james@klicman.org>
 *
 * This library is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
#include "altivec_motion.h"
25
#include "vectorize.h"
26
#include "../mjpeg_logging.h"
28
/* #define AMBER_ENABLE */
32
/* include last to ensure AltiVec type semantics, especially for bool. */
/*
 * Total squared difference between bidirectional prediction of (8*h)
 * blocks of 2*2 subsampled pels.
 *
 * Iterate through all rows 2 at a time.
 *
 * Hints regarding input:
 *   b) blk2 is about 50% vector aligned and 50% 8 byte aligned
 *   c) rowstride is always a multiple of 16
 *
 * for (j = 0; j < h; j++) {
 *     for (i = 0; i < 8; i++) {
 */
#define SUMSQ_SUB22_PDECL \
65
#define SUMSQ_SUB22_ARGS blk1, blk2, rowstride, h
67
/*
 * NOTE(review): this chunk appears to be an extraction-garbled view of the
 * original file — bare line numbers are interleaved with the code and are
 * non-contiguous, so many statements (the opening brace, pB/pR setup, the
 * row loops, min/max computation, the vo union and the return) are missing
 * from view. Comments below describe only what the visible lines establish;
 * verify against the complete upstream sumsq_sub22.c before editing.
 */
int sumsq_sub22_altivec(SUMSQ_SUB22_PDECL)
70
/* pB/pR: byte pointers walked over the two pel blocks (blk1/blk2);
 * their initialization is not visible in this chunk. */
unsigned char *pB, *pR;
71
/* Permute map that packs two 8-byte rows into one 16-byte vector. */
vector unsigned char align8x2;
72
vector unsigned char lB0, lB1, lB2, lB3, lR0, lR1;
73
vector unsigned char B, R;
74
vector unsigned char min;
75
vector unsigned char max;
76
vector unsigned char dif;
77
/* Running sum of squared differences (4 partial 32-bit accumulators). */
vector unsigned int sum;
78
vector signed int zero;
79
vector unsigned char perm1, perm2;
89
/* Input-contract checks; each failure aborts via mjpeg_error_exit1. */
if (((unsigned long)blk2 % 8) != 0)
90
mjpeg_error_exit1("sumsq_sub22: blk2 %% 8 != 0, (0x%X)", blk2);
92
if (NOT_VECTOR_ALIGNED(rowstride))
93
mjpeg_error_exit1("sumsq_sub22: rowstride %% 16 != 0, (%d)", rowstride);
96
/* The h check's `if` condition is missing from this view. */
mjpeg_error_exit1("sumsq_sub22: h != [4|8], (%d)", h);
99
/* 8*h blocks calculated in 8*2 chunks */
100
/* align8x2 = 0x( 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17 ) {{{ */
101
align8x2 = vec_lvsl(0, (unsigned char*)0);
102
align8x2 = vec_sld(align8x2, align8x2, 8);
103
perm1 = vec_lvsr(0, (unsigned char*)0);
104
align8x2 = vec_sld(align8x2, perm1, 8);
112
zero = vec_splat_s32(0);
113
sum = vec_splat_u32(0);
115
/* Build misalignment-compensating permute vectors for each pointer. */
perm1 = vec_lvsl(0, pB);
117
perm2 = vec_lvsl(0, pR);
118
perm2 = vec_splat(perm2, 0);
119
perm2 = vec_add(perm2, align8x2);
122
/* Branch on pB's offset within its 16-byte line; rows that straddle a
 * line boundary need the extra vec_ld/vec_perm path further below. */
if (((unsigned long)pB & 0xf) <= 8) {
124
perm1 = vec_splat(perm1, 0);
125
perm1 = vec_add(perm1, align8x2);
135
/* Gather two 8-pel rows of each block into single vectors. */
B = vec_perm(lB0, lB1, perm1);
136
R = vec_perm(lR0, lR1, perm2);
152
/* |B - R| via max/min (computed in lines not visible here), then
 * accumulate the sum of squares with a multiply-sum. */
dif = vec_sub(max, min);
153
sum = vec_msum(dif, dif, sum);
156
B = vec_perm(lB0, lB1, perm1);
158
R = vec_perm(lR0, lR1, perm2);
164
lB1 = vec_ld(16, pB);
167
lB3 = vec_ld(16, pB);
173
/* Two-stage permute: fix each row's alignment, then pack the two
 * aligned 8-byte rows into one vector with align8x2. */
lB0 = vec_perm(lB0, lB1, perm1);
174
lB2 = vec_perm(lB2, lB3, perm1);
175
B = vec_perm(lB0, lB2, align8x2);
177
R = vec_perm(lR0, lR1, perm2);
183
lB1 = vec_ld(16, pB);
186
lB3 = vec_ld(16, pB);
196
dif = vec_sub(max, min);
197
sum = vec_msum(dif, dif, sum);
200
lB0 = vec_perm(lB0, lB1, perm1);
201
lB2 = vec_perm(lB2, lB3, perm1);
202
B = vec_perm(lB0, lB2, align8x2);
204
R = vec_perm(lR0, lR1, perm2);
210
dif = vec_sub(max, min);
211
sum = vec_msum(dif, dif, sum);
214
/* Horizontal reduction of the 4 partial sums into one 32-bit total;
 * vo is presumably a vector/scalar union declared in a missing line. */
vo.v = vec_sums(vs32(sum), zero);
222
#if ALTIVEC_TEST_FUNCTION(sumsq_sub22)
223
ALTIVEC_TEST(sumsq_sub22, int, (SUMSQ_SUB22_PDECL),
224
"blk1=0x%X, blk2=0x%X, rowstride=%d, h=%d",
227
/* vim:set foldmethod=marker foldlevel=0: */