/* sad_11.c, this file is part of the
 * AltiVec optimized library for MJPEG tools MPEG-1/2 Video Encoder
 * Copyright (C) 2002  James Klicman <james@klicman.org>
 *
 * This library is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */
24
#include "altivec_motion.h"
25
#include "vectorize.h"
26
#include "../mjpeg_logging.h"
28
/* #define AMBER_ENABLE */
32
/* include last to ensure AltiVec type semantics, especially for bool. */
38
* SAD with horizontal and vertical half pel interpolation
41
* a) blk2 is always vector aligned
42
* b) rowstride is a multiple of 16
43
* c) h is either 8 or 16
46
* for (j = 0; j < h; j++) {
47
* for (i = 0; i < 16; i++)
48
* sum += abs( ((int)(pB[i]+pB[i+1]+pB[i+s]+pB[i+s+1]+2)>>2) - pR[i] );
54
#define SAD_11_PDECL /* {{{ */ \
60
#define SAD_11_ARGS blk1, blk2, rowstride, h
61
#define SAD_11_PFMT "blk1=0x%X, blk2=0x%X, rowstride=%d, h=%d"
63
int sad_11_altivec(SAD_11_PDECL)
66
unsigned char *pB, *pR;
67
vector unsigned char l0, l1, l2, l3, lR, lB0, lB1, lB2, lB3, perm, perm1;
68
vector unsigned short b0H, b0L, b1H, b1L, b2H, b2L, b3H, b3L;
69
vector unsigned short bH, bL;
70
vector unsigned char max, min, dif;
71
vector unsigned int sum;
72
vector unsigned char zero;
73
vector unsigned short two;
83
if (NOT_VECTOR_ALIGNED(blk2))
84
mjpeg_error_exit1("sad_11: blk2 %% 16 != 0, (%d)", blk2);
86
if (NOT_VECTOR_ALIGNED(rowstride))
87
mjpeg_error_exit1("sad_11: rowstride %% 16 != 0, (%d)", rowstride);
89
if (h != 8 && h != 16)
90
mjpeg_error_exit1("sad_11: h != [8|16], (%d)", h);
98
/* start loading first blocks */
105
/* initialize constants */
106
zero = vec_splat_u8(0);
107
two = vec_splat_u16(2);
109
sum = vec_splat_u32(0);
111
perm = vec_lvsl(0, blk1);
112
perm1 = vec_splat_u8(1);
113
perm1 = vec_add(perm, perm1);
115
/* permute 1st set of loaded blocks */
116
lB0 = vec_perm(l0, l1, perm);
117
lB1 = vec_perm(l0, l1, perm1);
119
/* start loading 3rd set */
124
/* permute 2nd set of loaded blocks */
125
lB2 = vec_perm(l2, l3, perm);
126
lB3 = vec_perm(l2, l3, perm1);
128
/* start loading lR */
131
/* (unsigned short[]) pB[0-7] */
132
b0H = vu16(vec_mergeh(zero, lB0));
134
/* (unsigned short[]) pB[8-15] */
135
b0L = vu16(vec_mergel(zero, lB0));
137
/* (unsigned short[]) pB[1-8] */
138
b1H = vu16(vec_mergeh(zero, lB1));
140
/* (unsigned short[]) pB[9-16] */
141
b1L = vu16(vec_mergel(zero, lB1));
143
/* (unsigned short[]) pB+s[0-7] */
144
b2H = vu16(vec_mergeh(zero, lB2));
146
/* (unsigned short[]) pB+s[8-15] */
147
b2L = vu16(vec_mergel(zero, lB2));
149
/* (unsigned short[]) pB+s[1-8] */
150
b3H = vu16(vec_mergeh(zero, lB3));
152
/* (unsigned short[]) pB+s[9-16] */
153
b3L = vu16(vec_mergel(zero, lB3));
156
* TODO: some of vec_add()'s might be consolidated since they
157
* calculate the same values multiple times.
159
#define ISAD(b0H,b0L,b1H,b1L,b2H,b2L,b3H,b3L) /* {{{ */ \
160
/* pB[i] + pB[i+1] */ \
161
bH = vec_add(b0H, b1H); \
162
bL = vec_add(b0L, b1L); \
164
/* (pB[i]+pB[i+1]) + pB[i+s] */ \
165
bH = vec_add(bH, b2H); \
166
bL = vec_add(bL, b2L); \
168
/* (pB[i]+pB[i+1]+pB[i+s]) + pB[i+s+1] */ \
169
bH = vec_add(bH, b3H); \
170
bL = vec_add(bL, b3L); \
172
/* (pB[i]+pB[i+1]+pB[i+s]+pB[i+s+1]) + 2 */ \
173
bH = vec_add(bH, two); \
174
bL = vec_add(bL, two); \
176
/* (pB[i]+pB[i+1]+pB[i+s]+pB[i+s+1]+2) >> 2 */ \
177
bH = vec_sra(bH, two); \
178
bL = vec_sra(bL, two); \
180
/* abs( ((pB[i]+pB[i+1]+pB[i+s]+pB[i+s+1]+2)>>2) - pR[i] )*/ \
181
bH = vu16(vec_packsu(bH, bL)); \
182
min = vec_min(vu8(bH), lR); \
183
max = vec_max(vu8(bH), lR); \
184
dif = vec_sub(max, min); \
186
/* d += abs(((pB[i]+pB[i+1]+pB[i+s]+pB[i+s+1]+2)>>2)-pR[i]) */ \
187
sum = vec_sum4s(dif, sum); \
193
ISAD(b0H,b0L,b1H,b1L,b2H,b2L,b3H,b3L);
196
/* start loading next lR */
200
/* perm loaded set */
201
lB0 = vec_perm(l0, l1, perm);
202
lB1 = vec_perm(l0, l1, perm1);
204
/* start loading next set */
210
/* (unsigned short[]) pB[0-7] */
211
b0H = vu16(vec_mergeh(zero, lB0));
213
/* (unsigned short[]) pB[8-15] */
214
b0L = vu16(vec_mergel(zero, lB0));
216
/* (unsigned short[]) pB[1-8] */
217
b1H = vu16(vec_mergeh(zero, lB1));
219
/* (unsigned short[]) pB[9-16] */
220
b1L = vu16(vec_mergel(zero, lB1));
222
ISAD(b2H,b2L,b3H,b3L,b0H,b0L,b1H,b1L);
224
/* start loading next lR */
228
/* perm loaded set */
229
lB2 = vec_perm(l0, l1, perm);
230
lB3 = vec_perm(l0, l1, perm1);
232
/* start loading next set */
237
/* (unsigned short[]) pB+s[0-7] */
238
b2H = vu16(vec_mergeh(zero, lB2));
240
/* (unsigned short[]) pB+s[8-15] */
241
b2L = vu16(vec_mergel(zero, lB2));
243
/* (unsigned short[]) pB+s[1-8] */
244
b3H = vu16(vec_mergeh(zero, lB3));
246
/* (unsigned short[]) pB+s[9-16] */
247
b3L = vu16(vec_mergel(zero, lB3));
250
ISAD(b0H,b0L,b1H,b1L,b2H,b2L,b3H,b3L);
255
lB0 = vec_perm(l0, l1, perm);
256
lB1 = vec_perm(l0, l1, perm1);
258
/* (unsigned short[]) pB[0-7] */
259
b0H = vu16(vec_mergeh(zero, lB0));
261
/* (unsigned short[]) pB[8-15] */
262
b0L = vu16(vec_mergel(zero, lB0));
264
/* (unsigned short[]) pB[1-8] */
265
b1H = vu16(vec_mergeh(zero, lB1));
267
/* (unsigned short[]) pB[9-16] */
268
b1L = vu16(vec_mergel(zero, lB1));
270
ISAD(b2H,b2L,b3H,b3L,b0H,b0L,b1H,b1L);
272
vo.v = vec_sums(vs32(sum), vs32(zero));
281
#if ALTIVEC_TEST_FUNCTION(sad_11) /* {{{ */
282
ALTIVEC_TEST(sad_11, int, (SAD_11_PDECL), SAD_11_PFMT, SAD_11_ARGS);
284
/* vim:set foldmethod=marker foldlevel=0: */