3
* Copyright (c) 2000, 2001 Fabrice Bellard.
5
* This library is free software; you can redistribute it and/or
6
* modify it under the terms of the GNU Lesser General Public
7
* License as published by the Free Software Foundation; either
8
* version 2 of the License, or (at your option) any later version.
10
* This library is distributed in the hope that it will be useful,
11
* but WITHOUT ANY WARRANTY; without even the implied warranty of
12
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
* Lesser General Public License for more details.
15
* You should have received a copy of the GNU Lesser General Public
16
* License along with this library; if not, write to the Free Software
17
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19
* gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
29
#include "mpegvideo.h"
30
#include "simple_idct.h"
33
uint8_t cropTbl[256 + 2 * MAX_NEG_CROP];
34
uint32_t squareTbl[512];
/* Classic JPEG/MPEG zig-zag coefficient scan order (row-major index of the
 * n-th coefficient visited). Restored missing closing brace. */
const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
47
/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
48
uint16_t __align8 inv_zigzag_direct16[64];
/* Alternate (horizontal) coefficient scan order used by interlaced MPEG
 * coding. Restored missing closing brace. */
const uint8_t ff_alternate_horizontal_scan[64] = {
    0,   1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
/* Alternate (vertical) coefficient scan order used by interlaced MPEG
 * coding. Restored missing closing brace. */
const uint8_t ff_alternate_vertical_scan[64] = {
    0,   8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
/* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255
 * (fixed-point reciprocals, round-up form 2^32/b; entry 0 unused,
 * entry 1 saturated). Restored missing closing brace. */
const uint32_t inverse[256]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};
/* Input permutation for the simple_idct_mmx (restored missing closing
 * brace). */
static const uint8_t simple_mmx_permutation[64]={
	0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
	0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
	0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
	0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
	0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
	0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
	0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
	0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
/**
 * Sum all 256 pixels of a 16x16 block.
 * @param pix       top-left pixel of the block
 * @param line_size byte stride between rows
 * @return sum of the 256 pixel values
 * (Interior of the loop restored; the unrolled inner loop handles 8
 *  pixels per iteration, two iterations per row.)
 */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int s, i, j;

    s = 0;
    for (i = 0; i < 16; i++) {
	for (j = 0; j < 16; j += 8) {
	    s += pix[0];
	    s += pix[1];
	    s += pix[2];
	    s += pix[3];
	    s += pix[4];
	    s += pix[5];
	    s += pix[6];
	    s += pix[7];
	    pix += 8;
	}
	pix += line_size - 16;
    }
    return s;
}
142
static int pix_norm1_c(uint8_t * pix, int line_size)
145
uint32_t *sq = squareTbl + 256;
148
for (i = 0; i < 16; i++) {
149
for (j = 0; j < 16; j += 8) {
160
#if LONG_MAX > 2147483647
161
register uint64_t x=*(uint64_t*)pix;
163
s += sq[(x>>8)&0xff];
164
s += sq[(x>>16)&0xff];
165
s += sq[(x>>24)&0xff];
166
s += sq[(x>>32)&0xff];
167
s += sq[(x>>40)&0xff];
168
s += sq[(x>>48)&0xff];
169
s += sq[(x>>56)&0xff];
171
register uint32_t x=*(uint32_t*)pix;
173
s += sq[(x>>8)&0xff];
174
s += sq[(x>>16)&0xff];
175
s += sq[(x>>24)&0xff];
176
x=*(uint32_t*)(pix+4);
178
s += sq[(x>>8)&0xff];
179
s += sq[(x>>16)&0xff];
180
s += sq[(x>>24)&0xff];
185
pix += line_size - 16;
/**
 * Byte-swap w 32-bit words from src into dst (may alias).
 * Main loop is unrolled 8x; the tail loop handles the remaining w%8 words.
 */
static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
    int i;

    for(i=0; i+8<=w; i+=8){
        dst[i+0]= bswap_32(src[i+0]);
        dst[i+1]= bswap_32(src[i+1]);
        dst[i+2]= bswap_32(src[i+2]);
        dst[i+3]= bswap_32(src[i+3]);
        dst[i+4]= bswap_32(src[i+4]);
        dst[i+5]= bswap_32(src[i+5]);
        dst[i+6]= bswap_32(src[i+6]);
        dst[i+7]= bswap_32(src[i+7]);
    }
    for( ; i<w; i++){
        dst[i+0]= bswap_32(src[i+0]);
    }
}
208
static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size)
211
uint32_t *sq = squareTbl + 256;
214
for (i = 0; i < 8; i++) {
215
s += sq[pix1[0] - pix2[0]];
216
s += sq[pix1[1] - pix2[1]];
217
s += sq[pix1[2] - pix2[2]];
218
s += sq[pix1[3] - pix2[3]];
219
s += sq[pix1[4] - pix2[4]];
220
s += sq[pix1[5] - pix2[5]];
221
s += sq[pix1[6] - pix2[6]];
222
s += sq[pix1[7] - pix2[7]];
229
static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size)
232
uint32_t *sq = squareTbl + 256;
235
for (i = 0; i < 16; i++) {
236
s += sq[pix1[ 0] - pix2[ 0]];
237
s += sq[pix1[ 1] - pix2[ 1]];
238
s += sq[pix1[ 2] - pix2[ 2]];
239
s += sq[pix1[ 3] - pix2[ 3]];
240
s += sq[pix1[ 4] - pix2[ 4]];
241
s += sq[pix1[ 5] - pix2[ 5]];
242
s += sq[pix1[ 6] - pix2[ 6]];
243
s += sq[pix1[ 7] - pix2[ 7]];
244
s += sq[pix1[ 8] - pix2[ 8]];
245
s += sq[pix1[ 9] - pix2[ 9]];
246
s += sq[pix1[10] - pix2[10]];
247
s += sq[pix1[11] - pix2[11]];
248
s += sq[pix1[12] - pix2[12]];
249
s += sq[pix1[13] - pix2[13]];
250
s += sq[pix1[14] - pix2[14]];
251
s += sq[pix1[15] - pix2[15]];
259
static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
263
/* read the pixels */
265
block[0] = pixels[0];
266
block[1] = pixels[1];
267
block[2] = pixels[2];
268
block[3] = pixels[3];
269
block[4] = pixels[4];
270
block[5] = pixels[5];
271
block[6] = pixels[6];
272
block[7] = pixels[7];
278
static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
279
const uint8_t *s2, int stride){
282
/* read the pixels */
284
block[0] = s1[0] - s2[0];
285
block[1] = s1[1] - s2[1];
286
block[2] = s1[2] - s2[2];
287
block[3] = s1[3] - s2[3];
288
block[4] = s1[4] - s2[4];
289
block[5] = s1[5] - s2[5];
290
block[6] = s1[6] - s2[6];
291
block[7] = s1[7] - s2[7];
299
static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
303
uint8_t *cm = cropTbl + MAX_NEG_CROP;
305
/* read the pixels */
307
pixels[0] = cm[block[0]];
308
pixels[1] = cm[block[1]];
309
pixels[2] = cm[block[2]];
310
pixels[3] = cm[block[3]];
311
pixels[4] = cm[block[4]];
312
pixels[5] = cm[block[5]];
313
pixels[6] = cm[block[6]];
314
pixels[7] = cm[block[7]];
321
static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
325
uint8_t *cm = cropTbl + MAX_NEG_CROP;
327
/* read the pixels */
329
pixels[0] = cm[pixels[0] + block[0]];
330
pixels[1] = cm[pixels[1] + block[1]];
331
pixels[2] = cm[pixels[2] + block[2]];
332
pixels[3] = cm[pixels[3] + block[3]];
333
pixels[4] = cm[pixels[4] + block[4]];
334
pixels[5] = cm[pixels[5] + block[5]];
335
pixels[6] = cm[pixels[6] + block[6]];
336
pixels[7] = cm[pixels[7] + block[7]];
343
#define PIXOP2(OPNAME, OP) \
344
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
348
OP(*((uint64_t*)block), LD64(pixels));\
354
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
358
const uint64_t a= LD64(pixels );\
359
const uint64_t b= LD64(pixels+1);\
360
OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
366
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
370
const uint64_t a= LD64(pixels );\
371
const uint64_t b= LD64(pixels+1);\
372
OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
378
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
382
const uint64_t a= LD64(pixels );\
383
const uint64_t b= LD64(pixels+line_size);\
384
OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
390
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
394
const uint64_t a= LD64(pixels );\
395
const uint64_t b= LD64(pixels+line_size);\
396
OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
402
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
405
const uint64_t a= LD64(pixels );\
406
const uint64_t b= LD64(pixels+1);\
407
uint64_t l0= (a&0x0303030303030303ULL)\
408
+ (b&0x0303030303030303ULL)\
409
+ 0x0202020202020202ULL;\
410
uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
411
+ ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
415
for(i=0; i<h; i+=2){\
416
uint64_t a= LD64(pixels );\
417
uint64_t b= LD64(pixels+1);\
418
l1= (a&0x0303030303030303ULL)\
419
+ (b&0x0303030303030303ULL);\
420
h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
421
+ ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
422
OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
427
l0= (a&0x0303030303030303ULL)\
428
+ (b&0x0303030303030303ULL)\
429
+ 0x0202020202020202ULL;\
430
h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
431
+ ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
432
OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
438
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
441
const uint64_t a= LD64(pixels );\
442
const uint64_t b= LD64(pixels+1);\
443
uint64_t l0= (a&0x0303030303030303ULL)\
444
+ (b&0x0303030303030303ULL)\
445
+ 0x0101010101010101ULL;\
446
uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
447
+ ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
451
for(i=0; i<h; i+=2){\
452
uint64_t a= LD64(pixels );\
453
uint64_t b= LD64(pixels+1);\
454
l1= (a&0x0303030303030303ULL)\
455
+ (b&0x0303030303030303ULL);\
456
h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
457
+ ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
458
OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
463
l0= (a&0x0303030303030303ULL)\
464
+ (b&0x0303030303030303ULL)\
465
+ 0x0101010101010101ULL;\
466
h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
467
+ ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
468
OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
474
CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels_c , 8)\
475
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
476
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
477
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
478
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
479
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
480
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
482
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
483
#else // 64 bit variant
485
#define PIXOP2(OPNAME, OP) \
486
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
489
OP(*((uint16_t*)(block )), LD16(pixels ));\
494
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
497
OP(*((uint32_t*)(block )), LD32(pixels ));\
502
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
505
OP(*((uint32_t*)(block )), LD32(pixels ));\
506
OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
511
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
512
OPNAME ## _pixels8_c(block, pixels, line_size, h);\
515
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
516
int src_stride1, int src_stride2, int h){\
520
a= LD32(&src1[i*src_stride1 ]);\
521
b= LD32(&src2[i*src_stride2 ]);\
522
OP(*((uint32_t*)&dst[i*dst_stride ]), no_rnd_avg32(a, b));\
523
a= LD32(&src1[i*src_stride1+4]);\
524
b= LD32(&src2[i*src_stride2+4]);\
525
OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
529
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
530
int src_stride1, int src_stride2, int h){\
534
a= LD32(&src1[i*src_stride1 ]);\
535
b= LD32(&src2[i*src_stride2 ]);\
536
OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
537
a= LD32(&src1[i*src_stride1+4]);\
538
b= LD32(&src2[i*src_stride2+4]);\
539
OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
543
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
544
int src_stride1, int src_stride2, int h){\
548
a= LD32(&src1[i*src_stride1 ]);\
549
b= LD32(&src2[i*src_stride2 ]);\
550
OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
554
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
555
int src_stride1, int src_stride2, int h){\
559
a= LD16(&src1[i*src_stride1 ]);\
560
b= LD16(&src2[i*src_stride2 ]);\
561
OP(*((uint16_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
565
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
566
int src_stride1, int src_stride2, int h){\
567
OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
568
OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
571
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
572
int src_stride1, int src_stride2, int h){\
573
OPNAME ## _no_rnd_pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
574
OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
577
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
578
OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
581
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
582
OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
585
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
586
OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
589
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
590
OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
593
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
594
int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
597
uint32_t a, b, c, d, l0, l1, h0, h1;\
598
a= LD32(&src1[i*src_stride1]);\
599
b= LD32(&src2[i*src_stride2]);\
600
c= LD32(&src3[i*src_stride3]);\
601
d= LD32(&src4[i*src_stride4]);\
602
l0= (a&0x03030303UL)\
605
h0= ((a&0xFCFCFCFCUL)>>2)\
606
+ ((b&0xFCFCFCFCUL)>>2);\
607
l1= (c&0x03030303UL)\
609
h1= ((c&0xFCFCFCFCUL)>>2)\
610
+ ((d&0xFCFCFCFCUL)>>2);\
611
OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
612
a= LD32(&src1[i*src_stride1+4]);\
613
b= LD32(&src2[i*src_stride2+4]);\
614
c= LD32(&src3[i*src_stride3+4]);\
615
d= LD32(&src4[i*src_stride4+4]);\
616
l0= (a&0x03030303UL)\
619
h0= ((a&0xFCFCFCFCUL)>>2)\
620
+ ((b&0xFCFCFCFCUL)>>2);\
621
l1= (c&0x03030303UL)\
623
h1= ((c&0xFCFCFCFCUL)>>2)\
624
+ ((d&0xFCFCFCFCUL)>>2);\
625
OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
629
static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
630
OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
633
static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
634
OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
637
static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
638
OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
641
static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
642
OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
645
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
646
int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
649
uint32_t a, b, c, d, l0, l1, h0, h1;\
650
a= LD32(&src1[i*src_stride1]);\
651
b= LD32(&src2[i*src_stride2]);\
652
c= LD32(&src3[i*src_stride3]);\
653
d= LD32(&src4[i*src_stride4]);\
654
l0= (a&0x03030303UL)\
657
h0= ((a&0xFCFCFCFCUL)>>2)\
658
+ ((b&0xFCFCFCFCUL)>>2);\
659
l1= (c&0x03030303UL)\
661
h1= ((c&0xFCFCFCFCUL)>>2)\
662
+ ((d&0xFCFCFCFCUL)>>2);\
663
OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
664
a= LD32(&src1[i*src_stride1+4]);\
665
b= LD32(&src2[i*src_stride2+4]);\
666
c= LD32(&src3[i*src_stride3+4]);\
667
d= LD32(&src4[i*src_stride4+4]);\
668
l0= (a&0x03030303UL)\
671
h0= ((a&0xFCFCFCFCUL)>>2)\
672
+ ((b&0xFCFCFCFCUL)>>2);\
673
l1= (c&0x03030303UL)\
675
h1= ((c&0xFCFCFCFCUL)>>2)\
676
+ ((d&0xFCFCFCFCUL)>>2);\
677
OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
680
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
681
int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
682
OPNAME ## _pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
683
OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
685
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
686
int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
687
OPNAME ## _no_rnd_pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
688
OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
691
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
693
int i, a0, b0, a1, b1;\
700
for(i=0; i<h; i+=2){\
706
block[0]= (a1+a0)>>2; /* FIXME non put */\
707
block[1]= (b1+b0)>>2;\
717
block[0]= (a1+a0)>>2;\
718
block[1]= (b1+b0)>>2;\
724
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
727
const uint32_t a= LD32(pixels );\
728
const uint32_t b= LD32(pixels+1);\
729
uint32_t l0= (a&0x03030303UL)\
732
uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
733
+ ((b&0xFCFCFCFCUL)>>2);\
737
for(i=0; i<h; i+=2){\
738
uint32_t a= LD32(pixels );\
739
uint32_t b= LD32(pixels+1);\
740
l1= (a&0x03030303UL)\
742
h1= ((a&0xFCFCFCFCUL)>>2)\
743
+ ((b&0xFCFCFCFCUL)>>2);\
744
OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
749
l0= (a&0x03030303UL)\
752
h0= ((a&0xFCFCFCFCUL)>>2)\
753
+ ((b&0xFCFCFCFCUL)>>2);\
754
OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
760
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
765
const uint32_t a= LD32(pixels );\
766
const uint32_t b= LD32(pixels+1);\
767
uint32_t l0= (a&0x03030303UL)\
770
uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
771
+ ((b&0xFCFCFCFCUL)>>2);\
775
for(i=0; i<h; i+=2){\
776
uint32_t a= LD32(pixels );\
777
uint32_t b= LD32(pixels+1);\
778
l1= (a&0x03030303UL)\
780
h1= ((a&0xFCFCFCFCUL)>>2)\
781
+ ((b&0xFCFCFCFCUL)>>2);\
782
OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
787
l0= (a&0x03030303UL)\
790
h0= ((a&0xFCFCFCFCUL)>>2)\
791
+ ((b&0xFCFCFCFCUL)>>2);\
792
OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
796
pixels+=4-line_size*(h+1);\
797
block +=4-line_size*h;\
801
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
806
const uint32_t a= LD32(pixels );\
807
const uint32_t b= LD32(pixels+1);\
808
uint32_t l0= (a&0x03030303UL)\
811
uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
812
+ ((b&0xFCFCFCFCUL)>>2);\
816
for(i=0; i<h; i+=2){\
817
uint32_t a= LD32(pixels );\
818
uint32_t b= LD32(pixels+1);\
819
l1= (a&0x03030303UL)\
821
h1= ((a&0xFCFCFCFCUL)>>2)\
822
+ ((b&0xFCFCFCFCUL)>>2);\
823
OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
828
l0= (a&0x03030303UL)\
831
h0= ((a&0xFCFCFCFCUL)>>2)\
832
+ ((b&0xFCFCFCFCUL)>>2);\
833
OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
837
pixels+=4-line_size*(h+1);\
838
block +=4-line_size*h;\
842
CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels8_c , 8)\
843
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
844
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
845
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
846
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c , OPNAME ## _pixels8_c , 8)\
847
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
848
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
849
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
/* Word-wide average / plain store used to instantiate PIXOP2. */
#define op_avg(a, b) a = rnd_avg32(a, b)
#define op_put(a, b) a = b

/* Scalar rounding averages; arguments fully parenthesized so callers may
 * pass compound expressions safely (macro-hygiene fix). */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
/**
 * 8-wide bilinear interpolation for GMC with 1/16-pel accuracy.
 * @param x16,y16 fractional position in sixteenths (0..16); the four
 *                corner weights A..D sum to 256
 * @param rounder added before the >>8 normalization
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A=(16-x16)*(16-y16);
    const int B=(   x16)*(16-y16);
    const int C=(16-x16)*(   y16);
    const int D=(   x16)*(   y16);
    int i;

    for(i=0; i<h; i++)
    {
        dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
        dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
        dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
        dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
        dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
        dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
        dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
        dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
        dst+= stride;
        src+= stride;
    }
}
887
static void gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
888
int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
891
const int s= 1<<shift;
901
for(x=0; x<8; x++){ //XXX FIXME optimize
902
int src_x, src_y, frac_x, frac_y, index;
911
if((unsigned)src_x < width){
912
if((unsigned)src_y < height){
913
index= src_x + src_y*stride;
914
dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
915
+ src[index +1]* frac_x )*(s-frac_y)
916
+ ( src[index+stride ]*(s-frac_x)
917
+ src[index+stride+1]* frac_x )* frac_y
920
index= src_x + clip(src_y, 0, height)*stride;
921
dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
922
+ src[index +1]* frac_x )*s
926
if((unsigned)src_y < height){
927
index= clip(src_x, 0, width) + src_y*stride;
928
dst[y*stride + x]= ( ( src[index ]*(s-frac_y)
929
+ src[index+stride ]* frac_y )*s
932
index= clip(src_x, 0, width) + clip(src_y, 0, height)*stride;
933
dst[y*stride + x]= src[index ];
/* Dispatch the thirdpel "copy" case to the width-specific put routine;
 * widths other than 2/4/8/16 are silently ignored. */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: put_pixels2_c (dst, src, stride, height); break;
    case 4: put_pixels4_c (dst, src, stride, height); break;
    case 8: put_pixels8_c (dst, src, stride, height); break;
    case 16:put_pixels16_c(dst, src, stride, height); break;
    }
}
/* Thirdpel horizontal interpolation, weights 2:1 (683/2048 ~ 1/3). */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;

    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
/* Thirdpel horizontal interpolation, weights 1:2 (2/3 position). */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;

    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
/* Thirdpel vertical interpolation, weights 2:1. */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;

    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
/* Thirdpel diagonal interpolation (1/3,1/3); weights sum to 12,
 * 2731/32768 ~ 1/12. */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;

    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
/* Thirdpel diagonal interpolation (1/3,2/3). */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;

    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
/* Thirdpel vertical interpolation, weights 1:2. */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;

    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
/* Thirdpel diagonal interpolation (2/3,1/3). */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;

    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
/* Thirdpel diagonal interpolation (2/3,2/3). */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;

    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
/* Dispatch the thirdpel "copy + average" case to the width-specific avg
 * routine; widths other than 2/4/8/16 are silently ignored. */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: avg_pixels2_c (dst, src, stride, height); break;
    case 4: avg_pixels4_c (dst, src, stride, height); break;
    case 8: avg_pixels8_c (dst, src, stride, height); break;
    case 16:avg_pixels16_c(dst, src, stride, height); break;
    }
}
/* Averaging variant of mc10: rounding average of dst with the thirdpel
 * horizontal (2:1) interpolation. */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;

    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/* Averaging variant of mc20 (horizontal 1:2 interpolation). */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;

    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/* Averaging variant of mc01 (vertical 2:1 interpolation). */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;

    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/* Averaging variant of mc11 (diagonal 1/3,1/3 interpolation). */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;

    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/* Averaging variant of mc12 (diagonal 1/3,2/3 interpolation). */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;

    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/* Averaging variant of mc02 (vertical 1:2 interpolation). */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;

    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/* Averaging variant of mc21 (diagonal 2/3,1/3 interpolation). */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;

    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/* Averaging variant of mc22 (diagonal 2/3,2/3 interpolation). */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;

    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/* Generate width-specialized thirdpel wrappers (put_tpel_pixelsN_mcXY_c).
 * BUG FIX: the original expansion read
 *     void put_tpel_pixels_mc00_c(dst, src, stride, width, height);
 * which C parses as a K&R-style function *declaration* (identifier list),
 * not a call, so every generated wrapper was a no-op. The stray "void"
 * is removed so the wrappers actually forward the call. */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
1160
/*
 * H.264 chroma motion compensation templates (2-, 4- and 8-pixel-wide
 * rows).  A..D are the standard bilinear weights derived from the 1/8-pel
 * fractional offsets (x, y); OP stores the 6-bit-scaled sum (put) or
 * averages it into dst (avg).
 */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
        OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
        OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
        OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
        OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
        dst+= stride;\
        src+= stride;\
    }\
}
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
1224
#define op_put(a, b) a = (((b) + 32)>>6)
1226
H264_CHROMA_MC(put_ , op_put)
1227
H264_CHROMA_MC(avg_ , op_avg)
1231
/* Copy a 4-wide, h-high block one 32-bit word per row.  LD32/ST32 are
 * project macros — presumably unaligned-safe 32-bit load/store; confirm
 * in the common header. */
static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        dst+=dstStride;
        src+=srcStride;
    }
}
/* Copy an 8-wide, h-high block as two 32-bit words per row. */
static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        ST32(dst+4 , LD32(src+4 ));
        dst+=dstStride;
        src+=srcStride;
    }
}
/* Copy a 16-wide, h-high block as four 32-bit words per row. */
static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        ST32(dst+4 , LD32(src+4 ));
        ST32(dst+8 , LD32(src+8 ));
        ST32(dst+12, LD32(src+12));
        dst+=dstStride;
        src+=srcStride;
    }
}
/* Copy a 17-wide, h-high block: four 32-bit words plus the odd 17th byte
 * per row (17 columns are needed as qpel edge context for 16-wide MC). */
static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        ST32(dst+4 , LD32(src+4 ));
        ST32(dst+8 , LD32(src+8 ));
        ST32(dst+12, LD32(src+12));
        dst[16]= src[16];
        dst+=dstStride;
        src+=srcStride;
    }
}
/* Copy a 9-wide, h-high block: two 32-bit words plus the odd 9th byte
 * per row (9 columns are needed as qpel edge context for 8-wide MC). */
static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        ST32(dst+4 , LD32(src+4 ));
        dst[8]= src[8];
        dst+=dstStride;
        src+=srcStride;
    }
}
/*
 * MPEG-4 quarter-pel motion compensation templates.  One expansion of
 * QPEL_MC() generates the 8-tap-ish (20,-6,3,-1) horizontal and vertical
 * lowpass filters plus the full set of 8x8 and 16x16 mcXY functions for
 * one operation (put/avg, rounding or no-rounding via OP and RND).
 * The ff_*_old_c variants keep the older exact half-pel combination for
 * reference.  `r` appears unused in this C expansion — presumably kept
 * to mirror an asm template signature; confirm before removing.
 * NOTE(review): this block was reconstructed from a garbled extraction
 * (declarations, loop scaffolding and closing braces were missing).
 */
#define QPEL_MC(r, OPNAME, RND, OP) \
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    const int w=8;\
    int i;\
\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
\
    for(i=0; i<h; i++)\
    {\
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    const int w=16;\
    int i;\
\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        const int src9= src[9*srcStride];\
        const int src10= src[10*srcStride];\
        const int src11= src[11*srcStride];\
        const int src12= src[12*srcStride];\
        const int src13= src[13*srcStride];\
        const int src14= src[14*srcStride];\
        const int src15= src[15*srcStride];\
        const int src16= src[16*srcStride];\
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
        OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
        OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels8_c(dst, src, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    copy_block9(full, src, 16, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
}\
\
static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels16_c(dst, src, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    copy_block17(full, src, 24, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
}\
\
static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1781
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
1782
#define op_put(a, b) a = cm[((b) + 16)>>5]
1783
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
1785
QPEL_MC(0, put_ , _ , op_put)
1786
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1787
QPEL_MC(0, avg_ , _ , op_avg)
1788
//QPEL_MC(1, avg_no_rnd , _ , op_avg)
1790
#undef op_avg_no_rnd
1792
#undef op_put_no_rnd
1795
#define H264_LOWPASS(OPNAME, OP, OP2) \
1796
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1798
uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1802
OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
1803
OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
1804
OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
1805
OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
1811
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1813
uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1817
const int srcB= src[-2*srcStride];\
1818
const int srcA= src[-1*srcStride];\
1819
const int src0= src[0 *srcStride];\
1820
const int src1= src[1 *srcStride];\
1821
const int src2= src[2 *srcStride];\
1822
const int src3= src[3 *srcStride];\
1823
const int src4= src[4 *srcStride];\
1824
const int src5= src[5 *srcStride];\
1825
const int src6= src[6 *srcStride];\
1826
OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
1827
OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
1828
OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
1829
OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
1835
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
1838
uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1840
src -= 2*srcStride;\
1841
for(i=0; i<h+5; i++)\
1843
tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
1844
tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
1845
tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
1846
tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
1850
tmp -= tmpStride*(h+5-2);\
1853
const int tmpB= tmp[-2*tmpStride];\
1854
const int tmpA= tmp[-1*tmpStride];\
1855
const int tmp0= tmp[0 *tmpStride];\
1856
const int tmp1= tmp[1 *tmpStride];\
1857
const int tmp2= tmp[2 *tmpStride];\
1858
const int tmp3= tmp[3 *tmpStride];\
1859
const int tmp4= tmp[4 *tmpStride];\
1860
const int tmp5= tmp[5 *tmpStride];\
1861
const int tmp6= tmp[6 *tmpStride];\
1862
OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
1863
OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
1864
OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
1865
OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
1871
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1873
uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1877
OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
1878
OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
1879
OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
1880
OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
1881
OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
1882
OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
1883
OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
1884
OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
1890
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1892
uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1896
const int srcB= src[-2*srcStride];\
1897
const int srcA= src[-1*srcStride];\
1898
const int src0= src[0 *srcStride];\
1899
const int src1= src[1 *srcStride];\
1900
const int src2= src[2 *srcStride];\
1901
const int src3= src[3 *srcStride];\
1902
const int src4= src[4 *srcStride];\
1903
const int src5= src[5 *srcStride];\
1904
const int src6= src[6 *srcStride];\
1905
const int src7= src[7 *srcStride];\
1906
const int src8= src[8 *srcStride];\
1907
const int src9= src[9 *srcStride];\
1908
const int src10=src[10*srcStride];\
1909
OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
1910
OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
1911
OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
1912
OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
1913
OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
1914
OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
1915
OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
1916
OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
1922
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
1925
uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1927
src -= 2*srcStride;\
1928
for(i=0; i<h+5; i++)\
1930
tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
1931
tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
1932
tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
1933
tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
1934
tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
1935
tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
1936
tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
1937
tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
1941
tmp -= tmpStride*(h+5-2);\
1944
const int tmpB= tmp[-2*tmpStride];\
1945
const int tmpA= tmp[-1*tmpStride];\
1946
const int tmp0= tmp[0 *tmpStride];\
1947
const int tmp1= tmp[1 *tmpStride];\
1948
const int tmp2= tmp[2 *tmpStride];\
1949
const int tmp3= tmp[3 *tmpStride];\
1950
const int tmp4= tmp[4 *tmpStride];\
1951
const int tmp5= tmp[5 *tmpStride];\
1952
const int tmp6= tmp[6 *tmpStride];\
1953
const int tmp7= tmp[7 *tmpStride];\
1954
const int tmp8= tmp[8 *tmpStride];\
1955
const int tmp9= tmp[9 *tmpStride];\
1956
const int tmp10=tmp[10*tmpStride];\
1957
OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
1958
OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
1959
OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
1960
OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
1961
OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
1962
OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
1963
OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
1964
OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
1970
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1971
OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
1972
OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
1973
src += 8*srcStride;\
1974
dst += 8*dstStride;\
1975
OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
1976
OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
1979
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1980
OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
1981
OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
1982
src += 8*srcStride;\
1983
dst += 8*dstStride;\
1984
OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
1985
OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
1988
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
1989
OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
1990
OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
1991
src += 8*srcStride;\
1992
tmp += 8*tmpStride;\
1993
dst += 8*dstStride;\
1994
OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
1995
OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
1998
#define H264_MC(OPNAME, SIZE) \
1999
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2000
OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
2003
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2004
uint8_t half[SIZE*SIZE];\
2005
put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2006
OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
2009
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2010
OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
2013
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2014
uint8_t half[SIZE*SIZE];\
2015
put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2016
OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
2019
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2020
uint8_t full[SIZE*(SIZE+5)];\
2021
uint8_t * const full_mid= full + SIZE*2;\
2022
uint8_t half[SIZE*SIZE];\
2023
copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2024
put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2025
OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
2028
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2029
uint8_t full[SIZE*(SIZE+5)];\
2030
uint8_t * const full_mid= full + SIZE*2;\
2031
copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2032
OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
2035
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2036
uint8_t full[SIZE*(SIZE+5)];\
2037
uint8_t * const full_mid= full + SIZE*2;\
2038
uint8_t half[SIZE*SIZE];\
2039
copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2040
put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2041
OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
2044
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2045
uint8_t full[SIZE*(SIZE+5)];\
2046
uint8_t * const full_mid= full + SIZE*2;\
2047
uint8_t halfH[SIZE*SIZE];\
2048
uint8_t halfV[SIZE*SIZE];\
2049
put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2050
copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2051
put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2052
OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2055
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2056
uint8_t full[SIZE*(SIZE+5)];\
2057
uint8_t * const full_mid= full + SIZE*2;\
2058
uint8_t halfH[SIZE*SIZE];\
2059
uint8_t halfV[SIZE*SIZE];\
2060
put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2061
copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2062
put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2063
OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2066
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2067
uint8_t full[SIZE*(SIZE+5)];\
2068
uint8_t * const full_mid= full + SIZE*2;\
2069
uint8_t halfH[SIZE*SIZE];\
2070
uint8_t halfV[SIZE*SIZE];\
2071
put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2072
copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2073
put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2074
OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2077
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2078
uint8_t full[SIZE*(SIZE+5)];\
2079
uint8_t * const full_mid= full + SIZE*2;\
2080
uint8_t halfH[SIZE*SIZE];\
2081
uint8_t halfV[SIZE*SIZE];\
2082
put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2083
copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2084
put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2085
OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2088
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2089
int16_t tmp[SIZE*(SIZE+5)];\
2090
OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
2093
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2094
int16_t tmp[SIZE*(SIZE+5)];\
2095
uint8_t halfH[SIZE*SIZE];\
2096
uint8_t halfHV[SIZE*SIZE];\
2097
put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2098
put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2099
OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2102
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2103
int16_t tmp[SIZE*(SIZE+5)];\
2104
uint8_t halfH[SIZE*SIZE];\
2105
uint8_t halfHV[SIZE*SIZE];\
2106
put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2107
put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2108
OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2111
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2112
uint8_t full[SIZE*(SIZE+5)];\
2113
uint8_t * const full_mid= full + SIZE*2;\
2114
int16_t tmp[SIZE*(SIZE+5)];\
2115
uint8_t halfV[SIZE*SIZE];\
2116
uint8_t halfHV[SIZE*SIZE];\
2117
copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2118
put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2119
put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2120
OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2123
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2124
uint8_t full[SIZE*(SIZE+5)];\
2125
uint8_t * const full_mid= full + SIZE*2;\
2126
int16_t tmp[SIZE*(SIZE+5)];\
2127
uint8_t halfV[SIZE*SIZE];\
2128
uint8_t halfHV[SIZE*SIZE];\
2129
copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2130
put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2131
put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2132
OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2135
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2136
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2137
#define op_put(a, b) a = cm[((b) + 16)>>5]
2138
#define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2139
#define op2_put(a, b) a = cm[((b) + 512)>>10]
2141
H264_LOWPASS(put_ , op_put, op2_put)
2142
H264_LOWPASS(avg_ , op_avg, op2_avg)
2156
static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2157
uint8_t *cm = cropTbl + MAX_NEG_CROP;
2161
dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2162
dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2163
dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2164
dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2165
dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2166
dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2167
dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2168
dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2174
static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2175
uint8_t *cm = cropTbl + MAX_NEG_CROP;
2179
const int src_1= src[ -srcStride];
2180
const int src0 = src[0 ];
2181
const int src1 = src[ srcStride];
2182
const int src2 = src[2*srcStride];
2183
const int src3 = src[3*srcStride];
2184
const int src4 = src[4*srcStride];
2185
const int src5 = src[5*srcStride];
2186
const int src6 = src[6*srcStride];
2187
const int src7 = src[7*srcStride];
2188
const int src8 = src[8*srcStride];
2189
const int src9 = src[9*srcStride];
2190
dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2191
dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
2192
dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
2193
dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
2194
dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
2195
dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
2196
dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
2197
dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
2203
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
2204
put_pixels8_c(dst, src, stride, 8);
2207
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
2209
wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2210
put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
2213
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
2214
wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
2217
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
2219
wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2220
put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
2223
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
2224
wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
2227
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
2231
wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2232
wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
2233
wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2234
put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2236
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
2240
wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2241
wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
2242
wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2243
put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2245
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
2247
wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2248
wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
2252
static inline int pix_abs16x16_c(uint8_t *pix1, uint8_t *pix2, int line_size)
2258
s += abs(pix1[0] - pix2[0]);
2259
s += abs(pix1[1] - pix2[1]);
2260
s += abs(pix1[2] - pix2[2]);
2261
s += abs(pix1[3] - pix2[3]);
2262
s += abs(pix1[4] - pix2[4]);
2263
s += abs(pix1[5] - pix2[5]);
2264
s += abs(pix1[6] - pix2[6]);
2265
s += abs(pix1[7] - pix2[7]);
2266
s += abs(pix1[8] - pix2[8]);
2267
s += abs(pix1[9] - pix2[9]);
2268
s += abs(pix1[10] - pix2[10]);
2269
s += abs(pix1[11] - pix2[11]);
2270
s += abs(pix1[12] - pix2[12]);
2271
s += abs(pix1[13] - pix2[13]);
2272
s += abs(pix1[14] - pix2[14]);
2273
s += abs(pix1[15] - pix2[15]);
2280
static int pix_abs16x16_x2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
2286
s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
2287
s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
2288
s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
2289
s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
2290
s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
2291
s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
2292
s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
2293
s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
2294
s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
2295
s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
2296
s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
2297
s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
2298
s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
2299
s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
2300
s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
2301
s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
2308
static int pix_abs16x16_y2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
2311
uint8_t *pix3 = pix2 + line_size;
2315
s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
2316
s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
2317
s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
2318
s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
2319
s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
2320
s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
2321
s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
2322
s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
2323
s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
2324
s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
2325
s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
2326
s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
2327
s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
2328
s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
2329
s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
2330
s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
2338
static int pix_abs16x16_xy2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
2341
uint8_t *pix3 = pix2 + line_size;
2345
s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
2346
s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
2347
s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
2348
s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
2349
s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
2350
s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
2351
s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
2352
s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
2353
s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
2354
s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
2355
s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
2356
s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
2357
s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
2358
s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
2359
s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
2360
s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
2368
static inline int pix_abs8x8_c(uint8_t *pix1, uint8_t *pix2, int line_size)
2374
s += abs(pix1[0] - pix2[0]);
2375
s += abs(pix1[1] - pix2[1]);
2376
s += abs(pix1[2] - pix2[2]);
2377
s += abs(pix1[3] - pix2[3]);
2378
s += abs(pix1[4] - pix2[4]);
2379
s += abs(pix1[5] - pix2[5]);
2380
s += abs(pix1[6] - pix2[6]);
2381
s += abs(pix1[7] - pix2[7]);
2388
static int pix_abs8x8_x2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
2394
s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
2395
s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
2396
s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
2397
s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
2398
s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
2399
s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
2400
s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
2401
s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
2408
static int pix_abs8x8_y2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
2411
uint8_t *pix3 = pix2 + line_size;
2415
s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
2416
s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
2417
s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
2418
s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
2419
s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
2420
s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
2421
s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
2422
s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
2430
static int pix_abs8x8_xy2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
2433
uint8_t *pix3 = pix2 + line_size;
2437
s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
2438
s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
2439
s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
2440
s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
2441
s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
2442
s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
2443
s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
2444
s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
2452
static int sad16x16_c(void *s, uint8_t *a, uint8_t *b, int stride){
2453
return pix_abs16x16_c(a,b,stride);
2456
static int sad8x8_c(void *s, uint8_t *a, uint8_t *b, int stride){
2457
return pix_abs8x8_c(a,b,stride);
2461
* permutes an 8x8 block.
2462
* @param block the block which will be permuted according to the given permutation vector
2463
* @param permutation the permutation vector
2464
* @param last the last non zero coefficient in scantable order, used to speed the permutation up
2465
* @param scantable the used scantable, this is only used to speed the permutation up, the block is not
2466
* (inverse) permutated to scantable order!
2468
void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
2474
//if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
2476
for(i=0; i<=last; i++){
2477
const int j= scantable[i];
2482
for(i=0; i<=last; i++){
2483
const int j= scantable[i];
2484
const int perm_j= permutation[j];
2485
block[perm_j]= temp[j];
2490
* memset(blocks, 0, sizeof(DCTELEM)*6*64)
2492
static void clear_blocks_c(DCTELEM *blocks)
2494
memset(blocks, 0, sizeof(DCTELEM)*6*64);
2497
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
2499
for(i=0; i+7<w; i+=8){
2500
dst[i+0] += src[i+0];
2501
dst[i+1] += src[i+1];
2502
dst[i+2] += src[i+2];
2503
dst[i+3] += src[i+3];
2504
dst[i+4] += src[i+4];
2505
dst[i+5] += src[i+5];
2506
dst[i+6] += src[i+6];
2507
dst[i+7] += src[i+7];
2510
dst[i+0] += src[i+0];
2513
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
2515
for(i=0; i+7<w; i+=8){
2516
dst[i+0] = src1[i+0]-src2[i+0];
2517
dst[i+1] = src1[i+1]-src2[i+1];
2518
dst[i+2] = src1[i+2]-src2[i+2];
2519
dst[i+3] = src1[i+3]-src2[i+3];
2520
dst[i+4] = src1[i+4]-src2[i+4];
2521
dst[i+5] = src1[i+5]-src2[i+5];
2522
dst[i+6] = src1[i+6]-src2[i+6];
2523
dst[i+7] = src1[i+7]-src2[i+7];
2526
dst[i+0] = src1[i+0]-src2[i+0];
2529
#define BUTTERFLY2(o1,o2,i1,i2) \
2533
#define BUTTERFLY1(x,y) \
2542
#define BUTTERFLYA(x,y) (ABS((x)+(y)) + ABS((x)-(y)))
2544
static int hadamard8_diff_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride){
2550
//FIXME try pointer walks
2551
BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
2552
BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
2553
BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
2554
BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
2556
BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
2557
BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
2558
BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
2559
BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
2561
BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
2562
BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
2563
BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
2564
BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
2568
BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
2569
BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
2570
BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
2571
BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
2573
BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
2574
BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
2575
BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
2576
BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
2579
BUTTERFLYA(temp[8*0+i], temp[8*4+i])
2580
+BUTTERFLYA(temp[8*1+i], temp[8*5+i])
2581
+BUTTERFLYA(temp[8*2+i], temp[8*6+i])
2582
+BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
2588
printf("MAX:%d\n", maxi);
2594
static int hadamard8_abs_c(uint8_t *src, int stride, int mean){
2598
//FIXME OOOPS ignore 0 term instead of mean mess
2600
//FIXME try pointer walks
2601
BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-mean,src[stride*i+1]-mean);
2602
BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-mean,src[stride*i+3]-mean);
2603
BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-mean,src[stride*i+5]-mean);
2604
BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-mean,src[stride*i+7]-mean);
2606
BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
2607
BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
2608
BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
2609
BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
2611
BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
2612
BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
2613
BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
2614
BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
2618
BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
2619
BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
2620
BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
2621
BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
2623
BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
2624
BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
2625
BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
2626
BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
2629
BUTTERFLYA(temp[8*0+i], temp[8*4+i])
2630
+BUTTERFLYA(temp[8*1+i], temp[8*5+i])
2631
+BUTTERFLYA(temp[8*2+i], temp[8*6+i])
2632
+BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
2638
static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride){
2639
MpegEncContext * const s= (MpegEncContext *)c;
2640
uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
2641
DCTELEM * const temp= (DCTELEM*)aligned_temp;
2644
s->dsp.diff_pixels(temp, src1, src2, stride);
2653
void simple_idct(DCTELEM *block); //FIXME
2655
static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride){
2656
MpegEncContext * const s= (MpegEncContext *)c;
2657
uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64*2/8];
2658
DCTELEM * const temp= (DCTELEM*)aligned_temp;
2659
DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
2664
s->dsp.diff_pixels(temp, src1, src2, stride);
2666
memcpy(bak, temp, 64*sizeof(DCTELEM));
2668
s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2669
s->dct_unquantize(s, temp, 0, s->qscale);
2670
simple_idct(temp); //FIXME
2673
sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
2678
static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride){
2679
MpegEncContext * const s= (MpegEncContext *)c;
2680
const uint8_t *scantable= s->intra_scantable.permutated;
2681
uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
2682
uint64_t __align8 aligned_bak[stride];
2683
DCTELEM * const temp= (DCTELEM*)aligned_temp;
2684
uint8_t * const bak= (uint8_t*)aligned_bak;
2685
int i, last, run, bits, level, distoration, start_i;
2686
const int esc_length= s->ac_esc_length;
2688
uint8_t * last_length;
2691
((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
2692
((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
2695
s->dsp.diff_pixels(temp, src1, src2, stride);
2697
s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2703
length = s->intra_ac_vlc_length;
2704
last_length= s->intra_ac_vlc_last_length;
2705
bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2708
length = s->inter_ac_vlc_length;
2709
last_length= s->inter_ac_vlc_last_length;
2714
for(i=start_i; i<last; i++){
2715
int j= scantable[i];
2720
if((level&(~127)) == 0){
2721
bits+= length[UNI_AC_ENC_INDEX(run, level)];
2730
level= temp[i] + 64;
2734
if((level&(~127)) == 0){
2735
bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
2742
s->dct_unquantize(s, temp, 0, s->qscale);
2745
s->dsp.idct_add(bak, stride, temp);
2747
distoration= s->dsp.sse[1](NULL, bak, src1, stride);
2749
return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);
2752
static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride){
2753
MpegEncContext * const s= (MpegEncContext *)c;
2754
const uint8_t *scantable= s->intra_scantable.permutated;
2755
uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
2756
DCTELEM * const temp= (DCTELEM*)aligned_temp;
2757
int i, last, run, bits, level, start_i;
2758
const int esc_length= s->ac_esc_length;
2760
uint8_t * last_length;
2762
s->dsp.diff_pixels(temp, src1, src2, stride);
2764
s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2770
length = s->intra_ac_vlc_length;
2771
last_length= s->intra_ac_vlc_last_length;
2772
bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2775
length = s->inter_ac_vlc_length;
2776
last_length= s->inter_ac_vlc_last_length;
2781
for(i=start_i; i<last; i++){
2782
int j= scantable[i];
2787
if((level&(~127)) == 0){
2788
bits+= length[UNI_AC_ENC_INDEX(run, level)];
2797
level= temp[i] + 64;
2801
if((level&(~127)) == 0){
2802
bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
2811
WARPER88_1616(hadamard8_diff_c, hadamard8_diff16_c)
2812
WARPER88_1616(dct_sad8x8_c, dct_sad16x16_c)
2813
WARPER88_1616(quant_psnr8x8_c, quant_psnr16x16_c)
2814
WARPER88_1616(rd8x8_c, rd16x16_c)
2815
WARPER88_1616(bit8x8_c, bit16x16_c)
2817
/* XXX: those functions should be suppressed ASAP when all IDCTs are
2819
static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
2822
put_pixels_clamped_c(block, dest, line_size);
2824
static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
2827
add_pixels_clamped_c(block, dest, line_size);
2830
/* init static data */
2831
void dsputil_static_init(void)
2835
for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
2836
for(i=0;i<MAX_NEG_CROP;i++) {
2838
cropTbl[i + MAX_NEG_CROP + 256] = 255;
2841
for(i=0;i<512;i++) {
2842
squareTbl[i] = (i - 256) * (i - 256);
2845
for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
2849
void dsputil_init(DSPContext* c, AVCodecContext *avctx)
2853
#ifdef CONFIG_ENCODERS
2854
if(avctx->dct_algo==FF_DCT_FASTINT)
2855
c->fdct = fdct_ifast;
2857
c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
2858
#endif //CONFIG_ENCODERS
2860
if(avctx->idct_algo==FF_IDCT_INT){
2861
c->idct_put= ff_jref_idct_put;
2862
c->idct_add= ff_jref_idct_add;
2863
c->idct = j_rev_dct;
2864
c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
2865
}else{ //accurate/default
2866
c->idct_put= simple_idct_put;
2867
c->idct_add= simple_idct_add;
2868
c->idct = simple_idct;
2869
c->idct_permutation_type= FF_NO_IDCT_PERM;
2872
c->get_pixels = get_pixels_c;
2873
c->diff_pixels = diff_pixels_c;
2874
c->put_pixels_clamped = put_pixels_clamped_c;
2875
c->add_pixels_clamped = add_pixels_clamped_c;
2878
c->clear_blocks = clear_blocks_c;
c->pix_sum = pix_sum_c;
c->pix_norm1 = pix_norm1_c;
/* TODO [0] 16 [1] 8 */
c->pix_abs16x16     = pix_abs16x16_c;
c->pix_abs16x16_x2  = pix_abs16x16_x2_c;
c->pix_abs16x16_y2  = pix_abs16x16_y2_c;
c->pix_abs16x16_xy2 = pix_abs16x16_xy2_c;
c->pix_abs8x8     = pix_abs8x8_c;
c->pix_abs8x8_x2  = pix_abs8x8_x2_c;
c->pix_abs8x8_y2  = pix_abs8x8_y2_c;
c->pix_abs8x8_xy2 = pix_abs8x8_xy2_c;

#define dspfunc(PFX, IDX, NUM) \
c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c; \
c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c; \
c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c; \
c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c

dspfunc(put, 0, 16);
dspfunc(put_no_rnd, 0, 16);
/* NOTE(review): dspfunc(put, 1, 8); appears to have been lost in extraction here — confirm against upstream */
dspfunc(put_no_rnd, 1, 8);

dspfunc(avg, 0, 16);
dspfunc(avg_no_rnd, 0, 16);
/* NOTE(review): dspfunc(avg, 1, 8); appears to have been lost in extraction here — confirm against upstream */
dspfunc(avg_no_rnd, 1, 8);

c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;

c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;

/* NOTE(review): an #undef dspfunc was likely dropped by extraction before this redefinition — confirm against upstream */
#define dspfunc(PFX, IDX, NUM) \
c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c

dspfunc(put_qpel, 0, 16);
dspfunc(put_no_rnd_qpel, 0, 16);

dspfunc(avg_qpel, 0, 16);
/* dspfunc(avg_no_rnd_qpel, 0, 16); */

dspfunc(put_qpel, 1, 8);
dspfunc(put_no_rnd_qpel, 1, 8);

dspfunc(avg_qpel, 1, 8);
/* dspfunc(avg_no_rnd_qpel, 1, 8); */

dspfunc(put_h264_qpel, 0, 16);
dspfunc(put_h264_qpel, 1, 8);
dspfunc(put_h264_qpel, 2, 4);
dspfunc(avg_h264_qpel, 0, 16);
dspfunc(avg_h264_qpel, 1, 8);
dspfunc(avg_h264_qpel, 2, 4);

c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;

c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;

c->hadamard8_diff[0]= hadamard8_diff16_c;
c->hadamard8_diff[1]= hadamard8_diff_c;
c->hadamard8_abs = hadamard8_abs_c;

c->dct_sad[0]= dct_sad16x16_c;
c->dct_sad[1]= dct_sad8x8_c;

c->sad[0]= sad16x16_c;
c->sad[1]= sad8x8_c;

c->quant_psnr[0]= quant_psnr16x16_c;
c->quant_psnr[1]= quant_psnr8x8_c;

c->rd[0]= rd16x16_c;
/* NOTE(review): c->rd[1] assignment appears to have been lost in extraction here — confirm against upstream */
c->bit[0]= bit16x16_c;
c->bit[1]= bit8x8_c;

c->add_bytes= add_bytes_c;
c->diff_bytes= diff_bytes_c;
c->bswap_buf= bswap_buf;

/* NOTE(review): the per-architecture init calls below were almost certainly guarded by
 * #ifdef HAVE_MMX / ARCH_* style conditionals that were dropped by extraction — confirm
 * against upstream; calling all of them unconditionally cannot be correct on one build. */
dsputil_init_mmx(c, avctx);
dsputil_init_armv4l(c, avctx);
dsputil_init_mlib(c, avctx);
dsputil_init_alpha(c, avctx);
dsputil_init_ppc(c, avctx);
dsputil_init_mmi(c, avctx);
dsputil_init_sh4(c,avctx);

/* NOTE(review): the for(i=0; i<64; i++) loop headers and the break; statements of this
 * switch appear to have been lost in extraction (junk line numbers 3036-3038, 3040, 3042,
 * 3044, 3046, 3048, 3050-3051 are missing) — reconstruct from upstream before compiling. */
switch(c->idct_permutation_type){
case FF_NO_IDCT_PERM:
c->idct_permutation[i]= i;
case FF_LIBMPEG2_IDCT_PERM:
c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
case FF_SIMPLE_IDCT_PERM:
c->idct_permutation[i]= simple_mmx_permutation[i];
case FF_TRANSPOSE_IDCT_PERM:
c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
fprintf(stderr, "Internal error, IDCT permutation not set\n");