 static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
-    DECLARE_ALIGNED_16(uint8_t, half[SIZE*SIZE]);\
+    DECLARE_ALIGNED(16, uint8_t, half)[SIZE*SIZE];\
     put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(half, src, SIZE, stride);\
     OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src+stride, half, stride, stride, SIZE);\
 }\
 \
 static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
-    DECLARE_ALIGNED_16(uint8_t, halfH[SIZE*SIZE]);\
-    DECLARE_ALIGNED_16(uint8_t, halfV[SIZE*SIZE]);\
+    DECLARE_ALIGNED(16, uint8_t, halfH)[SIZE*SIZE];\
+    DECLARE_ALIGNED(16, uint8_t, halfV)[SIZE*SIZE];\
     put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, SIZE, stride);\
     put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, SIZE, stride);\
     OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE);\
 }\
 \
 static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
-    DECLARE_ALIGNED_16(uint8_t, halfH[SIZE*SIZE]);\
-    DECLARE_ALIGNED_16(uint8_t, halfV[SIZE*SIZE]);\
+    DECLARE_ALIGNED(16, uint8_t, halfH)[SIZE*SIZE];\
+    DECLARE_ALIGNED(16, uint8_t, halfV)[SIZE*SIZE];\
     put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, SIZE, stride);\
     put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, SIZE, stride);\
     OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE);\
 }\
 \
 static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
-    DECLARE_ALIGNED_16(uint8_t, halfH[SIZE*SIZE]);\
-    DECLARE_ALIGNED_16(uint8_t, halfV[SIZE*SIZE]);\
+    DECLARE_ALIGNED(16, uint8_t, halfH)[SIZE*SIZE];\
+    DECLARE_ALIGNED(16, uint8_t, halfV)[SIZE*SIZE];\
     put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + stride, SIZE, stride);\
     put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, SIZE, stride);\
     OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE);\
 }\
 \
 static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
-    DECLARE_ALIGNED_16(uint8_t, halfH[SIZE*SIZE]);\
-    DECLARE_ALIGNED_16(uint8_t, halfV[SIZE*SIZE]);\
+    DECLARE_ALIGNED(16, uint8_t, halfH)[SIZE*SIZE];\
+    DECLARE_ALIGNED(16, uint8_t, halfV)[SIZE*SIZE];\
     put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + stride, SIZE, stride);\
     put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, SIZE, stride);\
     OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE);\
 }\
 \
 static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
-    DECLARE_ALIGNED_16(int16_t, tmp[SIZE*(SIZE+8)]);\
+    DECLARE_ALIGNED(16, int16_t, tmp)[SIZE*(SIZE+8)];\
     OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(dst, tmp, src, stride, SIZE, stride);\
 }\
 \
 static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
-    DECLARE_ALIGNED_16(uint8_t, halfH[SIZE*SIZE]);\
-    DECLARE_ALIGNED_16(uint8_t, halfHV[SIZE*SIZE]);\
-    DECLARE_ALIGNED_16(int16_t, tmp[SIZE*(SIZE+8)]);\
+    DECLARE_ALIGNED(16, uint8_t, halfH)[SIZE*SIZE];\
+    DECLARE_ALIGNED(16, uint8_t, halfHV)[SIZE*SIZE];\
+    DECLARE_ALIGNED(16, int16_t, tmp)[SIZE*(SIZE+8)];\
     put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, SIZE, stride);\
     put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\
     OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfHV, stride, SIZE, SIZE);\
 }\
 \
 static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
-    DECLARE_ALIGNED_16(uint8_t, halfH[SIZE*SIZE]);\
-    DECLARE_ALIGNED_16(uint8_t, halfHV[SIZE*SIZE]);\
-    DECLARE_ALIGNED_16(int16_t, tmp[SIZE*(SIZE+8)]);\
+    DECLARE_ALIGNED(16, uint8_t, halfH)[SIZE*SIZE];\
+    DECLARE_ALIGNED(16, uint8_t, halfHV)[SIZE*SIZE];\
+    DECLARE_ALIGNED(16, int16_t, tmp)[SIZE*(SIZE+8)];\
     put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + stride, SIZE, stride);\
     put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\
     OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfHV, stride, SIZE, SIZE);\
 }\
 \
 static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
-    DECLARE_ALIGNED_16(uint8_t, halfV[SIZE*SIZE]);\
-    DECLARE_ALIGNED_16(uint8_t, halfHV[SIZE*SIZE]);\
-    DECLARE_ALIGNED_16(int16_t, tmp[SIZE*(SIZE+8)]);\
+    DECLARE_ALIGNED(16, uint8_t, halfV)[SIZE*SIZE];\
+    DECLARE_ALIGNED(16, uint8_t, halfHV)[SIZE*SIZE];\
+    DECLARE_ALIGNED(16, int16_t, tmp)[SIZE*(SIZE+8)];\
     put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, SIZE, stride);\
     put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\
     OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfV, halfHV, stride, SIZE, SIZE);\
 }\
 \
 static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
-    DECLARE_ALIGNED_16(uint8_t, halfV[SIZE*SIZE]);\
-    DECLARE_ALIGNED_16(uint8_t, halfHV[SIZE*SIZE]);\
-    DECLARE_ALIGNED_16(int16_t, tmp[SIZE*(SIZE+8)]);\
+    DECLARE_ALIGNED(16, uint8_t, halfV)[SIZE*SIZE];\
+    DECLARE_ALIGNED(16, uint8_t, halfHV)[SIZE*SIZE];\
+    DECLARE_ALIGNED(16, int16_t, tmp)[SIZE*(SIZE+8)];\
     put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, SIZE, stride);\
     put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\
     OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfV, halfHV, stride, SIZE, SIZE);\
 }\
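The change in every one of these macro bodies is the same mechanical edit: the array specifier moves out of the macro argument, so DECLARE_ALIGNED() always receives a plain identifier. As a rough sketch of why both spellings declare the same object, assuming a GCC-style compiler (the real FFmpeg macro also covers other compilers and is not reproduced here):

/* Sketch only: GCC-style definition assumed, not FFmpeg's full macro. */
#define DECLARE_ALIGNED(n, t, v) t __attribute__((aligned(n))) v

/* Old spelling, array specifier inside the macro argument:
 *     DECLARE_ALIGNED_16(uint8_t, half[SIZE*SIZE]);
 * New spelling, array specifier outside the invocation:
 *     DECLARE_ALIGNED(16, uint8_t, half)[SIZE*SIZE];
 * Under the definition above, both expand to:
 *     uint8_t __attribute__((aligned(16))) half[SIZE*SIZE];
 */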
 /* this code assumes that stride % 16 == 0 */
 void put_no_rnd_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) {
-    DECLARE_ALIGNED_16(signed int, ABCD[4]) =
+    DECLARE_ALIGNED(16, signed int, ABCD)[4] =
                         {((8 - x) * (8 - y)),
                          ((    x) * (8 - y)),
                          ((8 - x) * (    y)),
                          ((    x) * (    y))};
     register int i;
     vec_u8 fperm;
     const vec_s32 vABCD = vec_ld(0, ABCD);
     const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
     const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
     const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
     const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
     LOAD_ZERO;
     const vec_s16 v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4));
     const vec_u16 v6us = vec_splat_u16(6);
     register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
     register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;

     vec_u8 vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1;
     vec_u8 vsrc0uc, vsrc1uc;
     vec_s16 vsrc0ssH, vsrc1ssH;
     vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
     vec_s16 vsrc2ssH, vsrc3ssH, psum;
     vec_u8 vdst, ppsum, fsum;

     if (((unsigned long)dst) % 16 == 0) {
         fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
                          0x14, 0x15, 0x16, 0x17,
                          0x08, 0x09, 0x0A, 0x0B,
                          0x0C, 0x0D, 0x0E, 0x0F};
     } else {
         fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
                          0x04, 0x05, 0x06, 0x07,
                          0x18, 0x19, 0x1A, 0x1B,
                          0x1C, 0x1D, 0x1E, 0x1F};
     }

     vsrcAuc = vec_ld(0, src);

     if (loadSecond)
         vsrcBuc = vec_ld(16, src);
     vsrcperm0 = vec_lvsl(0, src);
     vsrcperm1 = vec_lvsl(1, src);

     vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
     if (reallyBadAlign)
         vsrc1uc = vsrcBuc;
     else
         vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);

     vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc0uc);
     vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc1uc);

     if (!loadSecond) {// -> !reallyBadAlign
         for (i = 0 ; i < h ; i++) {
             vsrcCuc = vec_ld(stride + 0, src);

             vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
             vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

             vsrc2ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc2uc);
             vsrc3ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc3uc);

             psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
             psum = vec_mladd(vB, vsrc1ssH, psum);
             psum = vec_mladd(vC, vsrc2ssH, psum);
             psum = vec_mladd(vD, vsrc3ssH, psum);
             psum = vec_add(v28ss, psum);
             psum = vec_sra(psum, v6us);

             vdst = vec_ld(0, dst);
             ppsum = (vec_u8)vec_packsu(psum, psum);
             fsum = vec_perm(vdst, ppsum, fperm);

             vec_st(fsum, 0, dst);

             vsrc0ssH = vsrc2ssH;
             vsrc1ssH = vsrc3ssH;

             dst += stride;
             src += stride;
         }
     } else {
         vec_u8 vsrcDuc;
         for (i = 0 ; i < h ; i++) {
             vsrcCuc = vec_ld(stride + 0, src);
             vsrcDuc = vec_ld(stride + 16, src);

             vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
             if (reallyBadAlign)
                 vsrc3uc = vsrcDuc;
             else
                 vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

             vsrc2ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc2uc);
             vsrc3ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc3uc);

             psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
             psum = vec_mladd(vB, vsrc1ssH, psum);
             psum = vec_mladd(vC, vsrc2ssH, psum);
             psum = vec_mladd(vD, vsrc3ssH, psum);
             psum = vec_add(v28ss, psum);
             psum = vec_sr(psum, v6us);

             vdst = vec_ld(0, dst);
             ppsum = (vec_u8)vec_pack(psum, psum);
             fsum = vec_perm(vdst, ppsum, fperm);

             vec_st(fsum, 0, dst);

             vsrc0ssH = vsrc2ssH;
             vsrc1ssH = vsrc3ssH;

             dst += stride;
             src += stride;
         }
     }
 }
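The loadSecond/reallyBadAlign flags only handle 9-byte source rows that straddle a 16-byte boundary; the arithmetic itself is H.264's bilinear chroma interpolation. Each output pixel is a weighted average of a 2x2 source neighbourhood with weights A = (8-x)(8-y), B = x(8-y), C = (8-x)y and D = xy (the ABCD table), scaled back by a shift of 6; v28ss = (1<<5) - 4 = 28 is the bias of this no-rounding variant. A hypothetical scalar equivalent, not part of the patch:

#include <stdint.h>

/* Hypothetical scalar reference for the filter above; x and y are the
 * 1/8-pel fractional offsets, 0..7. */
static void no_rnd_chroma_mc8_ref(uint8_t *dst, const uint8_t *src,
                                  int stride, int h, int x, int y)
{
    const int A = (8 - x) * (8 - y);
    const int B = (    x) * (8 - y);
    const int C = (8 - x) * (    y);
    const int D = (    x) * (    y);
    int i, j;

    for (i = 0; i < h; i++) {
        for (j = 0; j < 8; j++)
            /* weighted 2x2 average with the same bias (28) and shift (6)
             * as the AltiVec code; A+B+C+D == 64 */
            dst[j] = (A * src[j]          + B * src[j + 1] +
                      C * src[j + stride] + D * src[j + stride + 1] + 28) >> 6;
        dst += stride;
        src += stride;
    }
}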
 static inline void put_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1,
                                             const uint8_t * src2, int dst_stride,
                                             int src_stride1, int h)
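put_pixels16_l2_altivec blends two sources into dst, which is how the mcXY macros above combine two half-sample planes into a quarter-sample prediction (the AltiVec version fixes the second source's stride at the block width, hence the missing src_stride2 parameter). A hypothetical scalar equivalent of that blend, assuming the usual (a+b+1)>>1 rounding:

#include <stdint.h>

/* Hypothetical scalar sketch of the l2 blend: dst = rounded average of
 * src1 and src2, with independent strides for generality. */
static void put_pixels_l2_ref(uint8_t *dst, const uint8_t *src1,
                              const uint8_t *src2, int dst_stride,
                              int src_stride1, int src_stride2,
                              int w, int h)
{
    int i, j;

    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j++)
            dst[j] = (src1[j] + src2[j] + 1) >> 1;  /* rounded average */
        dst  += dst_stride;
        src1 += src_stride1;
        src2 += src_stride2;
    }
}

OPNAME selects the store policy: the put variants write this average directly, while the avg variants additionally average it with the existing contents of dst.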