85
85
static int cpu_flags;
87
struct algo algos[] = {
88
{"REF-DBL", 0, ff_ref_fdct, ff_ref_fdct, NO_PERM},
89
{"FAAN", 0, ff_faandct, ff_ref_fdct, FAAN_SCALE},
90
{"FAANI", 1, ff_faanidct, ff_ref_idct, NO_PERM},
91
{"IJG-AAN-INT", 0, fdct_ifast, ff_ref_fdct, SCALE_PERM},
92
{"IJG-LLM-INT", 0, ff_jpeg_fdct_islow, ff_ref_fdct, NO_PERM},
93
{"REF-DBL", 1, ff_ref_idct, ff_ref_idct, NO_PERM},
94
{"INT", 1, j_rev_dct, ff_ref_idct, MMX_PERM},
95
{"SIMPLE-C", 1, ff_simple_idct, ff_ref_idct, NO_PERM},
98
{"MMX", 0, ff_fdct_mmx, ff_ref_fdct, NO_PERM, AV_CPU_FLAG_MMX},
100
{"MMX2", 0, ff_fdct_mmx2, ff_ref_fdct, NO_PERM, AV_CPU_FLAG_MMX2},
101
{"SSE2", 0, ff_fdct_sse2, ff_ref_fdct, NO_PERM, AV_CPU_FLAG_SSE2},
87
static const struct algo fdct_tab[] = {
88
{ "REF-DBL", ff_ref_fdct, NO_PERM },
89
{ "FAAN", ff_faandct, FAAN_SCALE },
90
{ "IJG-AAN-INT", fdct_ifast, SCALE_PERM },
91
{ "IJG-LLM-INT", ff_jpeg_fdct_islow_8, NO_PERM },
94
{ "MMX", ff_fdct_mmx, NO_PERM, AV_CPU_FLAG_MMX },
95
{ "MMX2", ff_fdct_mmx2, NO_PERM, AV_CPU_FLAG_MMX2 },
96
{ "SSE2", ff_fdct_sse2, NO_PERM, AV_CPU_FLAG_SSE2 },
100
{ "altivecfdct", fdct_altivec, NO_PERM, AV_CPU_FLAG_ALTIVEC },
104
{ "BFINfdct", ff_bfin_fdct, NO_PERM },
110
static const struct algo idct_tab[] = {
111
{ "FAANI", ff_faanidct, NO_PERM },
112
{ "REF-DBL", ff_ref_idct, NO_PERM },
113
{ "INT", j_rev_dct, MMX_PERM },
114
{ "SIMPLE-C", ff_simple_idct_8, NO_PERM },
105
{"LIBMPEG2-MMX", 1, ff_mmx_idct, ff_ref_idct, MMX_PERM, AV_CPU_FLAG_MMX},
106
{"LIBMPEG2-MMX2", 1, ff_mmxext_idct, ff_ref_idct, MMX_PERM, AV_CPU_FLAG_MMX2},
108
{"SIMPLE-MMX", 1, ff_simple_idct_mmx, ff_ref_idct, MMX_SIMPLE_PERM, AV_CPU_FLAG_MMX},
109
{"XVID-MMX", 1, ff_idct_xvid_mmx, ff_ref_idct, NO_PERM, AV_CPU_FLAG_MMX},
110
{"XVID-MMX2", 1, ff_idct_xvid_mmx2, ff_ref_idct, NO_PERM, AV_CPU_FLAG_MMX2},
111
{"XVID-SSE2", 1, ff_idct_xvid_sse2, ff_ref_idct, SSE2_PERM, AV_CPU_FLAG_SSE2},
115
{"altivecfdct", 0, fdct_altivec, ff_ref_fdct, NO_PERM, AV_CPU_FLAG_ALTIVEC},
118
{ "LIBMPEG2-MMX", ff_mmx_idct, MMX_PERM, AV_CPU_FLAG_MMX, 1 },
119
{ "LIBMPEG2-MMX2", ff_mmxext_idct, MMX_PERM, AV_CPU_FLAG_MMX2, 1 },
121
{ "SIMPLE-MMX", ff_simple_idct_mmx, MMX_SIMPLE_PERM, AV_CPU_FLAG_MMX },
122
{ "XVID-MMX", ff_idct_xvid_mmx, NO_PERM, AV_CPU_FLAG_MMX, 1 },
123
{ "XVID-MMX2", ff_idct_xvid_mmx2, NO_PERM, AV_CPU_FLAG_MMX2, 1 },
124
{ "XVID-SSE2", ff_idct_xvid_sse2, SSE2_PERM, AV_CPU_FLAG_SSE2, 1 },
119
{"BFINfdct", 0, ff_bfin_fdct, ff_ref_fdct, NO_PERM},
120
{"BFINidct", 1, ff_bfin_idct, ff_ref_idct, NO_PERM},
128
{ "BFINidct", ff_bfin_idct, NO_PERM },
124
{"SIMPLE-ARM", 1, ff_simple_idct_arm, ff_ref_idct, NO_PERM },
125
{"INT-ARM", 1, ff_j_rev_dct_arm, ff_ref_idct, MMX_PERM },
132
{ "SIMPLE-ARM", ff_simple_idct_arm, NO_PERM },
133
{ "INT-ARM", ff_j_rev_dct_arm, MMX_PERM },
127
{"SIMPLE-ARMV5TE", 1, ff_simple_idct_armv5te, ff_ref_idct, NO_PERM },
136
{ "SIMPLE-ARMV5TE", ff_simple_idct_armv5te,NO_PERM },
130
{"SIMPLE-ARMV6", 1, ff_simple_idct_armv6, ff_ref_idct, MMX_PERM },
139
{ "SIMPLE-ARMV6", ff_simple_idct_armv6, MMX_PERM },
133
{"SIMPLE-NEON", 1, ff_simple_idct_neon, ff_ref_idct, PARTTRANS_PERM },
142
{ "SIMPLE-NEON", ff_simple_idct_neon, PARTTRANS_PERM },
135
#endif /* ARCH_ARM */
138
{"SIMPLE-ALPHA", 1, ff_simple_idct_axp, ff_ref_idct, NO_PERM },
146
{ "SIMPLE-ALPHA", ff_simple_idct_axp, NO_PERM },
144
152
#define AANSCALE_BITS 12
146
uint8_t cropTbl[256 + 2 * MAX_NEG_CROP];
148
154
static int64_t gettime(void)
150
156
struct timeval tv;
151
gettimeofday(&tv,NULL);
157
gettimeofday(&tv, NULL);
152
158
return (int64_t)tv.tv_sec * 1000000 + tv.tv_usec;
158
164
static short idct_mmx_perm[64];
160
static short idct_simple_mmx_perm[64]={
161
0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
162
0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
163
0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
164
0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
165
0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
166
0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
167
0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
168
0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
166
static short idct_simple_mmx_perm[64] = {
167
0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
168
0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
169
0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
170
0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
171
0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
172
0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
173
0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
174
0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
171
static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
177
static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
173
179
static void idct_mmx_init(void)
196
static void dct_error(const char *name, int is_idct,
197
void (*fdct_func)(DCTELEM *block),
198
void (*fdct_ref)(DCTELEM *block), int form, int test)
200
static void init_block(DCTELEM block[64], int test, int is_idct, AVLFG *prng)
204
memset(block, 0, 64 * sizeof(*block));
208
for (i = 0; i < 64; i++)
209
block[i] = (av_lfg_get(prng) % 512) - 256;
212
for (i = 0; i < 64; i++)
217
j = av_lfg_get(prng) % 10 + 1;
218
for (i = 0; i < j; i++)
219
block[av_lfg_get(prng) % 64] = av_lfg_get(prng) % 512 - 256;
222
block[ 0] = av_lfg_get(prng) % 4096 - 2048;
223
block[63] = (block[0] & 1) ^ 1;
228
static void permute(DCTELEM dst[64], const DCTELEM src[64], int perm)
232
if (perm == MMX_PERM) {
233
for (i = 0; i < 64; i++)
234
dst[idct_mmx_perm[i]] = src[i];
235
} else if (perm == MMX_SIMPLE_PERM) {
236
for (i = 0; i < 64; i++)
237
dst[idct_simple_mmx_perm[i]] = src[i];
238
} else if (perm == SSE2_PERM) {
239
for (i = 0; i < 64; i++)
240
dst[(i & 0x38) | idct_sse2_row_perm[i & 7]] = src[i];
241
} else if (perm == PARTTRANS_PERM) {
242
for (i = 0; i < 64; i++)
243
dst[(i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3)] = src[i];
245
for (i = 0; i < 64; i++)
250
static int dct_error(const struct algo *dct, int test, int is_idct, int speed)
252
void (*ref)(DCTELEM *block) = is_idct ? ff_ref_idct : ff_ref_fdct;
200
253
int it, i, scale;
202
int64_t err2, ti, ti1, it1;
203
int64_t sysErr[64], sysErrMax=0;
205
int blockSumErrMax=0, blockSumErr;
255
int64_t err2, ti, ti1, it1, err_sum = 0;
256
int64_t sysErr[64], sysErrMax = 0;
258
int blockSumErrMax = 0, blockSumErr;
208
263
av_lfg_init(&prng, 1);
212
for(i=0; i<64; i++) sysErr[i]=0;
213
for(it=0;it<NB_ITS;it++) {
219
block1[i] = (av_lfg_get(&prng) % 512) -256;
228
int num = av_lfg_get(&prng) % 10 + 1;
230
block1[av_lfg_get(&prng) % 64] = av_lfg_get(&prng) % 512 -256;
233
block1[0] = av_lfg_get(&prng) % 4096 - 2048;
234
block1[63]= (block1[0]&1)^1;
238
#if 0 // simulate mismatch control
243
if((sum&1)==0) block1[63]^=1;
248
block_org[i]= block1[i];
250
if (form == MMX_PERM) {
252
block[idct_mmx_perm[i]] = block1[i];
253
} else if (form == MMX_SIMPLE_PERM) {
255
block[idct_simple_mmx_perm[i]] = block1[i];
257
} else if (form == SSE2_PERM) {
259
block[(i&0x38) | idct_sse2_row_perm[i&7]] = block1[i];
260
} else if (form == PARTTRANS_PERM) {
262
block[(i&0x24) | ((i&3)<<3) | ((i>>3)&3)] = block1[i];
267
#if 0 // simulate mismatch control for tested IDCT but not the ref
272
if((sum&1)==0) block[63]^=1;
267
for (i = 0; i < 64; i++)
269
for (it = 0; it < NB_ITS; it++) {
270
init_block(block1, test, is_idct, &prng);
271
permute(block, block1, dct->format);
279
if (form == SCALE_PERM) {
280
for(i=0; i<64; i++) {
281
scale = 8*(1 << (AANSCALE_BITS + 11)) / ff_aanscales[i];
282
block[i] = (block[i] * scale /*+ (1<<(AANSCALE_BITS-1))*/) >> AANSCALE_BITS;
276
if (dct->format == SCALE_PERM) {
277
for (i = 0; i < 64; i++) {
278
scale = 8 * (1 << (AANSCALE_BITS + 11)) / ff_aanscales[i];
279
block[i] = (block[i] * scale) >> AANSCALE_BITS;
290
v = abs(block[i] - block1[i]);
286
for (i = 0; i < 64; i++) {
287
int err = block[i] - block1[i];
294
293
sysErr[i] += block[i] - block1[i];
295
294
blockSumErr += v;
296
if( abs(block[i])>maxout) maxout=abs(block[i]);
295
if (abs(block[i]) > maxout)
296
maxout = abs(block[i]);
298
if(blockSumErrMax < blockSumErr) blockSumErrMax= blockSumErr;
299
#if 0 // print different matrix pairs
298
if (blockSumErrMax < blockSumErr)
299
blockSumErrMax = blockSumErr;
301
for (i = 0; i < 64; i++)
302
sysErrMax = FFMAX(sysErrMax, FFABS(sysErr[i]));
304
for (i = 0; i < 64; i++) {
303
if((i&7)==0) printf("\n");
304
printf("%4d ", block_org[i]);
307
if((i&7)==0) printf("\n");
308
printf("%4d ", block[i] - block1[i]);
313
for(i=0; i<64; i++) sysErrMax= FFMAX(sysErrMax, FFABS(sysErr[i]));
316
if(i%8==0) printf("\n");
317
printf("%7d ", (int)sysErr[i]);
307
printf("%7d ", (int) sysErr[i]);
321
printf("%s %s: err_inf=%d err2=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n",
322
is_idct ? "IDCT" : "DCT",
323
name, err_inf, (double)err2 / NB_ITS / 64.0, (double)sysErrMax / NB_ITS, maxout, blockSumErrMax);
311
omse = (double) err2 / NB_ITS / 64;
312
ome = (double) err_sum / NB_ITS / 64;
314
spec_err = is_idct && (err_inf > 1 || omse > 0.02 || fabs(ome) > 0.0015);
316
printf("%s %s: ppe=%d omse=%0.8f ome=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n",
317
is_idct ? "IDCT" : "DCT", dct->name, err_inf,
318
omse, ome, (double) sysErrMax / NB_ITS,
319
maxout, blockSumErrMax);
321
if (spec_err && !dct->nonspec)
331
block1[i] = av_lfg_get(&prng) % 512 -256;
341
block1[0] = av_lfg_get(&prng) % 512 -256;
342
block1[1] = av_lfg_get(&prng) % 512 -256;
343
block1[2] = av_lfg_get(&prng) % 512 -256;
344
block1[3] = av_lfg_get(&prng) % 512 -256;
348
if (form == MMX_PERM) {
350
block[idct_mmx_perm[i]] = block1[i];
351
} else if(form == MMX_SIMPLE_PERM) {
353
block[idct_simple_mmx_perm[i]] = block1[i];
328
init_block(block, test, is_idct, &prng);
329
permute(block1, block, dct->format);
362
for(it=0;it<NB_ITS_SPEED;it++) {
365
// memcpy(block, block1, sizeof(DCTELEM) * 64);
366
// do not memcpy especially not fastmemcpy because it does movntq !!!
334
for (it = 0; it < NB_ITS_SPEED; it++) {
335
memcpy(block, block1, sizeof(block));
369
338
it1 += NB_ITS_SPEED;
370
339
ti1 = gettime() - ti;
371
340
} while (ti1 < 1000000);
374
printf("%s %s: %0.1f kdct/s\n",
375
is_idct ? "IDCT" : "DCT",
376
name, (double)it1 * 1000.0 / (double)ti1);
343
printf("%s %s: %0.1f kdct/s\n", is_idct ? "IDCT" : "DCT", dct->name,
344
(double) it1 * 1000.0 / (double) ti1);
379
349
DECLARE_ALIGNED(8, static uint8_t, img_dest)[64];
414
384
s = 0.5 * sqrt(2.0);
417
block1[8*(2*i)+j] = (block[8*(2*i)+j] + block[8*(2*i+1)+j]) * s;
418
block1[8*(2*i+1)+j] = (block[8*(2*i)+j] - block[8*(2*i+1)+j]) * s;
385
for (i = 0; i < 4; i++) {
386
for (j = 0; j < 8; j++) {
387
block1[8 * (2 * i) + j] =
388
(block[8 * (2 * i) + j] + block[8 * (2 * i + 1) + j]) * s;
389
block1[8 * (2 * i + 1) + j] =
390
(block[8 * (2 * i) + j] - block[8 * (2 * i + 1) + j]) * s;
422
394
/* idct8 on lines */
395
for (i = 0; i < 8; i++) {
396
for (j = 0; j < 8; j++) {
427
sum += c8[k][j] * block1[8*i+k];
398
for (k = 0; k < 8; k++)
399
sum += c8[k][j] * block1[8 * i + k];
400
block2[8 * i + j] = sum;
405
for (i = 0; i < 8; i++) {
406
for (j = 0; j < 4; j++) {
438
sum += c4[k][j] * block2[8*(2*k)+i];
439
block3[8*(2*j)+i] = sum;
409
for (k = 0; k < 4; k++)
410
sum += c4[k][j] * block2[8 * (2 * k) + i];
411
block3[8 * (2 * j) + i] = sum;
444
sum += c4[k][j] * block2[8*(2*k+1)+i];
445
block3[8*(2*j+1)+i] = sum;
415
for (k = 0; k < 4; k++)
416
sum += c4[k][j] * block2[8 * (2 * k + 1) + i];
417
block3[8 * (2 * j + 1) + i] = sum;
449
421
/* clamp and store the result */
457
dest[i * linesize + j] = (int)rint(v);
422
for (i = 0; i < 8; i++) {
423
for (j = 0; j < 8; j++) {
424
v = block3[8 * i + j];
426
else if (v > 255) v = 255;
427
dest[i * linesize + j] = (int) rint(v);
462
432
static void idct248_error(const char *name,
463
void (*idct248_put)(uint8_t *dest, int line_size, int16_t *block))
433
void (*idct248_put)(uint8_t *dest, int line_size,
465
437
int it, i, it1, ti, ti1, err_max, v;
469
440
av_lfg_init(&prng, 1);
525
493
" 1 -> test with random sparse matrixes\n"
526
494
" 2 -> do 3. test from mpeg4 std\n"
527
495
"-i test IDCT implementations\n"
528
"-4 test IDCT248 implementations\n");
496
"-4 test IDCT248 implementations\n"
531
500
int main(int argc, char **argv)
533
502
int test_idct = 0, test_248_dct = 0;
536
508
cpu_flags = av_get_cpu_flags();
538
510
ff_ref_dct_init();
541
for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
542
for(i=0;i<MAX_NEG_CROP;i++) {
544
cropTbl[i + MAX_NEG_CROP + 256] = 255;
548
c = getopt(argc, argv, "ih4");
514
c = getopt(argc, argv, "ih4t");
556
522
test_248_dct = 1;
565
if(optind <argc) test= atoi(argv[optind]);
535
test = atoi(argv[optind]);
567
printf("ffmpeg DCT/IDCT test\n");
537
printf("Libav DCT/IDCT test\n");
569
539
if (test_248_dct) {
570
idct248_error("SIMPLE-C", ff_simple_idct248_put);
540
idct248_error("SIMPLE-C", ff_simple_idct248_put, speed);
572
for (i=0;algos[i].name;i++)
573
if (algos[i].is_idct == test_idct && !(~cpu_flags & algos[i].mm_support)) {
574
dct_error (algos[i].name, algos[i].is_idct, algos[i].func, algos[i].ref, algos[i].format, test);
542
const struct algo *algos = test_idct ? idct_tab : fdct_tab;
543
for (i = 0; algos[i].name; i++)
544
if (!(~cpu_flags & algos[i].mm_support)) {
545
err |= dct_error(&algos[i], test, test_idct, speed);