~ubuntu-branches/ubuntu/wily/x264/wily-proposed

Viewing changes to tools/checkasm.c

  • Committer: Package Import Robot
  • Author(s): Reinhard Tartler
  • Date: 2012-01-19 07:28:39 UTC
  • mfrom: (12.1.11 experimental)
  • Revision ID: package-import@ubuntu.com-20120119072839-0vj6g40ky09d9nru
Tags: 2:0.120.2127+gitf33c8cb-2ubuntu1
* Merge from Debian, remaining changes:
  - build against libgpac-dev to enable .mp4 output

--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -55,7 +55,7 @@
 #define BENCH_RUNS 100  // tradeoff between accuracy and speed
 #define BENCH_ALIGNS 16 // number of stack+heap data alignments (another accuracy vs speed tradeoff)
 #define MAX_FUNCS 1000  // just has to be big enough to hold all the existing functions
-#define MAX_CPUS 10     // number of different combinations of cpu flags
+#define MAX_CPUS 30     // number of different combinations of cpu flags
 
 typedef struct
 {
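
MAX_CPUS triples because this changeset enlarges the benchmark matrix: FMA4, XOP, and the extra Cache64/FastShuffle permutations added below each occupy one slot per function. A rough sketch of the bookkeeping this limit guards, with hypothetical type and field names (bench_entry_t and friends are illustrative, not the file's actual declarations):

    #include <stdint.h>
    #include <stdio.h>

    #define MAX_CPUS 30 /* one slot per benchmarked cpu-flag combination */

    /* Hypothetical record: each function keeps one timing entry per
     * cpu-flag combination it was benchmarked under. */
    typedef struct
    {
        uint32_t cpu;    /* flag combination this timing belongs to */
        uint64_t cycles; /* accumulated cycle count */
        uint32_t den;    /* number of accumulated measurements */
    } bench_entry_t;

    typedef struct
    {
        const char *name;
        bench_entry_t vers[MAX_CPUS]; /* overflows if combinations exceed MAX_CPUS */
    } bench_func_t;

    int main( void )
    {
        bench_func_t f = { "sad_4x4", {{ 0 }} };
        printf( "%s: up to %d flag combinations\n", f.name, MAX_CPUS );
        return 0;
    }
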
@@ -164,13 +164,14 @@
             if( k < j )
                 continue;
             printf( "%s_%s%s: %"PRId64"\n", benchs[i].name,
+                    b->cpu&X264_CPU_FMA4 ? "fma4" :
+                    b->cpu&X264_CPU_XOP ? "xop" :
                     b->cpu&X264_CPU_AVX ? "avx" :
                     b->cpu&X264_CPU_SSE4 ? "sse4" :
-                    b->cpu&X264_CPU_SHUFFLE_IS_FAST ? "fastshuffle" :
                     b->cpu&X264_CPU_SSSE3 ? "ssse3" :
                     b->cpu&X264_CPU_SSE3 ? "sse3" :
                     /* print sse2slow only if there's also a sse2fast version of the same func */
-                    b->cpu&X264_CPU_SSE2_IS_SLOW && j<MAX_CPUS && b[1].cpu&X264_CPU_SSE2_IS_FAST && !(b[1].cpu&X264_CPU_SSE3) ? "sse2slow" :
+                    b->cpu&X264_CPU_SSE2_IS_SLOW && j<MAX_CPUS-1 && b[1].cpu&X264_CPU_SSE2_IS_FAST && !(b[1].cpu&X264_CPU_SSE3) ? "sse2slow" :
                     b->cpu&X264_CPU_SSE2 ? "sse2" :
                     b->cpu&X264_CPU_MMX ? "mmx" :
                     b->cpu&X264_CPU_ALTIVEC ? "altivec" :
@@ -178,6 +179,7 @@
                     b->cpu&X264_CPU_ARMV6 ? "armv6" : "c",
                     b->cpu&X264_CPU_CACHELINE_32 ? "_c32" :
                     b->cpu&X264_CPU_CACHELINE_64 ? "_c64" :
+                    b->cpu&X264_CPU_SHUFFLE_IS_FAST && !(b->cpu&X264_CPU_SSE4) ? "_fastshuffle" :
                     b->cpu&X264_CPU_SSE_MISALIGN ? "_misalign" :
                     b->cpu&X264_CPU_LZCNT ? "_lzcnt" :
                     b->cpu&X264_CPU_FAST_NEON_MRC ? "_fast_mrc" :
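
Two changes land in this label chain. FMA4 and XOP are tested ahead of AVX so each benchmark line is named after the most specific instruction set it ran with, and the standalone fastshuffle label moves to a suffix below. Separately, the sse2slow guard changes from j<MAX_CPUS to j<MAX_CPUS-1: the label peeks at the next entry, b[1], so the loop index must stop one slot early or the final iteration reads past the array. A self-contained illustration of the corrected bound (entry_t and the flag value are stand-ins):

    #include <stdio.h>

    #define MAX_CPUS 30
    #define CPU_SSE2_IS_FAST 0x1

    typedef struct { unsigned cpu; } entry_t;

    /* Peeking at vers[j+1] is only safe while j < MAX_CPUS-1; the old
     * bound j < MAX_CPUS allowed an out-of-bounds read on the last slot. */
    static int next_is_sse2fast( const entry_t *vers, int j )
    {
        return j < MAX_CPUS-1 && (vers[j+1].cpu & CPU_SSE2_IS_FAST);
    }

    int main( void )
    {
        entry_t vers[MAX_CPUS] = {{ 0 }};
        printf( "%d\n", next_is_sse2fast( vers, MAX_CPUS-1 ) ); /* 0, no overread */
        return 0;
    }
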
@@ -469,26 +471,108 @@
         ALIGNED_ARRAY_64( uint16_t, bitcosts,[17] ); \
         for( int i=0; i<17; i++ ) \
             bitcosts[i] = 9*(i!=8); \
+        memcpy( pbuf3, pbuf2, 20*FDEC_STRIDE*sizeof(pixel) ); \
+        memcpy( pbuf4, pbuf2, 20*FDEC_STRIDE*sizeof(pixel) ); \
         for( int i=0; i<32; i++ ) \
         { \
             pixel *fenc = pbuf1+48+i*12; \
-            pixel *fdec = pbuf3+48+i*12; \
+            pixel *fdec1 = pbuf3+48+i*12; \
+            pixel *fdec2 = pbuf4+48+i*12; \
             int pred_mode = i%9; \
             int res_c = INT_MAX; \
             for( int j=0; j<9; j++ ) \
             { \
-                predict_4x4[j]( fdec ); \
-                int cost = pixel_c.cmp[PIXEL_4x4]( fenc, FENC_STRIDE, fdec, FDEC_STRIDE ) + 9*(j!=pred_mode); \
+                predict_4x4[j]( fdec1 ); \
+                int cost = pixel_c.cmp[PIXEL_4x4]( fenc, FENC_STRIDE, fdec1, FDEC_STRIDE ) + 9*(j!=pred_mode); \
                 if( cost < (uint16_t)res_c ) \
                     res_c = cost + (j<<16); \
             } \
-            int res_a = call_a( pixel_asm.name, fenc, fdec, bitcosts+8-pred_mode ); \
+            predict_4x4[res_c>>16]( fdec1 ); \
+            int res_a = call_a( pixel_asm.name, fenc, fdec2, bitcosts+8-pred_mode ); \
             if( res_c != res_a ) \
             { \
                 ok = 0; \
                 fprintf( stderr, #name": %d,%d != %d,%d [FAILED]\n", res_c>>16, res_c&0xffff, res_a>>16, res_a&0xffff ); \
                 break; \
             } \
+            if( memcmp(fdec1, fdec2, 4*FDEC_STRIDE*sizeof(pixel)) ) \
+            { \
+                ok = 0; \
+                fprintf( stderr, #name" [FAILED]\n" ); \
+                for( int j=0; j<16; j++ ) \
+                    fprintf( stderr, "%02x ", fdec1[(j&3)+(j>>2)*FDEC_STRIDE] ); \
+                fprintf( stderr, "\n" ); \
+                for( int j=0; j<16; j++ ) \
+                    fprintf( stderr, "%02x ", fdec2[(j&3)+(j>>2)*FDEC_STRIDE] ); \
+                fprintf( stderr, "\n" ); \
+                break; \
+            } \
+        } \
+    }
+
+#define TEST_INTRA8_X9( name, cmp ) \
+    if( pixel_asm.name && pixel_asm.name != pixel_ref.name ) \
+    { \
+        set_func_name( #name ); \
+        used_asm = 1; \
+        ALIGNED_ARRAY_64( uint16_t, bitcosts,[17] ); \
+        ALIGNED_ARRAY_16( uint16_t, satds_c,[16] ); \
+        ALIGNED_ARRAY_16( uint16_t, satds_a,[16] ); \
+        memset( satds_c, 0, 16 * sizeof(*satds_c) ); \
+        memset( satds_a, 0, 16 * sizeof(*satds_a) ); \
+        for( int i=0; i<17; i++ ) \
+            bitcosts[i] = 9*(i!=8); \
+        for( int i=0; i<32; i++ ) \
+        { \
+            pixel *fenc = pbuf1+48+i*12; \
+            pixel *fdec1 = pbuf3+48+i*12; \
+            pixel *fdec2 = pbuf4+48+i*12; \
+            int pred_mode = i%9; \
+            int res_c = INT_MAX; \
+            predict_8x8_filter( fdec1, edge, ALL_NEIGHBORS, ALL_NEIGHBORS ); \
+            for( int j=0; j<9; j++ ) \
+            { \
+                predict_8x8[j]( fdec1, edge ); \
+                satds_c[j] = pixel_c.cmp[PIXEL_8x8]( fenc, FENC_STRIDE, fdec1, FDEC_STRIDE ) + 9*(j!=pred_mode); \
+                if( satds_c[j] < (uint16_t)res_c ) \
+                    res_c = satds_c[j] + (j<<16); \
+            } \
+            predict_8x8[res_c>>16]( fdec1, edge ); \
+            int res_a = call_a( pixel_asm.name, fenc, fdec2, edge, bitcosts+8-pred_mode, satds_a ); \
+            if( res_c != res_a || memcmp(satds_c, satds_a, sizeof(satds_c)) ) \
+            { \
+                ok = 0; \
+                fprintf( stderr, #name": %d,%d != %d,%d [FAILED]\n", res_c>>16, res_c&0xffff, res_a>>16, res_a&0xffff ); \
+                for( int j = 0; j < 9; j++ ) \
+                    fprintf( stderr, "%5d ", satds_c[j]); \
+                fprintf( stderr, "\n" ); \
+                for( int j = 0; j < 9; j++ ) \
+                    fprintf( stderr, "%5d ", satds_a[j]); \
+                fprintf( stderr, "\n" ); \
+                break; \
+            } \
+            for( int j=0; j<8; j++ ) \
+                if( memcmp(fdec1+j*FDEC_STRIDE, fdec2+j*FDEC_STRIDE, 8*sizeof(pixel)) ) \
+                    ok = 0; \
+            if( !ok ) \
+            { \
+                fprintf( stderr, #name" [FAILED]\n" ); \
+                for( int j=0; j<8; j++ ) \
+                { \
+                    for( int k=0; k<8; k++ ) \
+                        fprintf( stderr, "%02x ", fdec1[k+j*FDEC_STRIDE] ); \
+                    fprintf( stderr, "\n" ); \
+                } \
+                fprintf( stderr, "\n" ); \
+                for( int j=0; j<8; j++ ) \
+                { \
+                    for( int k=0; k<8; k++ ) \
+                        fprintf( stderr, "%02x ", fdec2[k+j*FDEC_STRIDE] ); \
+                    fprintf( stderr, "\n" ); \
+                } \
+                fprintf( stderr, "\n" ); \
+                break; \
+            } \
         } \
     }
 
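The rewritten TEST_INTRA_X9 and the new TEST_INTRA8_X9 above keep two prediction buffers: the C reference writes into fdec1, the assembly under test into the identically initialized fdec2, and fdec1 is re-predicted with the winning mode before the final memcmp so both buffers should hold the same winning prediction. That lets the test catch an implementation that returns the right mode and cost but leaves wrong pixels behind. The pattern reduced to a skeleton (names here are illustrative, not checkasm's):

    #include <stdio.h>
    #include <string.h>

    typedef int (*pred_fn)( unsigned char *dst );

    /* Two-buffer pattern: identical inputs, then compare both the return
     * value and every output byte the function wrote. */
    static int check( pred_fn ref, pred_fn test )
    {
        unsigned char buf1[64], buf2[64];
        memset( buf1, 0x80, sizeof(buf1) );
        memcpy( buf2, buf1, sizeof(buf2) );
        int res_c = ref( buf1 );
        int res_a = test( buf2 );
        if( res_c != res_a )
            return fprintf( stderr, "return values differ\n" ), 0;
        if( memcmp( buf1, buf2, sizeof(buf1) ) )
            return fprintf( stderr, "output pixels differ\n" ), 0;
        return 1;
    }

    static int fill_dc( unsigned char *dst ) { dst[0] = 42; return 42; }

    int main( void )
    {
        printf( "%s\n", check( fill_dc, fill_dc ) ? "ok" : "FAILED" );
        return 0;
    }
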
@@ -509,9 +593,11 @@
     report( "intra sad_x3 :" );
     ok = 1; used_asm = 0;
     TEST_INTRA_X9( intra_satd_x9_4x4, satd );
+    TEST_INTRA8_X9( intra_sa8d_x9_8x8, sa8d );
     report( "intra satd_x9 :" );
     ok = 1; used_asm = 0;
     TEST_INTRA_X9( intra_sad_x9_4x4, sad );
+    TEST_INTRA8_X9( intra_sad_x9_8x8, sad );
     report( "intra sad_x9 :" );
 
     ok = 1; used_asm = 0;
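
Both macros pass bitcosts+8-pred_mode to the assembly. Since bitcosts[i] = 9*(i!=8), shifting the base pointer re-centres the table on the predicted mode: the candidate j == pred_mode lands on the zero entry and every other mode is charged a fixed cost of 9, matching the +9*(j!=pred_mode) term in the C loop. A standalone check of that indexing:

    #include <stdint.h>
    #include <stdio.h>

    int main( void )
    {
        /* Same table as the macros build: only the centre entry is free. */
        uint16_t bitcosts[17];
        for( int i = 0; i < 17; i++ )
            bitcosts[i] = 9*(i!=8);
        /* Re-centre on the predicted mode: bitcosts[8-pred_mode + j] is 0
         * exactly when j == pred_mode, and 9 for the other eight modes. */
        int pred_mode = 5;
        const uint16_t *cost = bitcosts + 8 - pred_mode;
        for( int j = 0; j < 9; j++ )
            printf( "mode %d: +%d\n", j, cost[j] );
        return 0;
    }
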
@@ -565,7 +651,8 @@
         {
             ALIGNED_16( uint16_t sums[72] );
             ALIGNED_16( int dc[4] );
-            int16_t mvs_a[32], mvs_c[32];
+            ALIGNED_16( int16_t mvs_a[32] );
+            ALIGNED_16( int16_t mvs_c[32] );
             int mvn_a, mvn_c;
             int thresh = rand() & 0x3fff;
             set_func_name( "esa_ads" );
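
The motion-vector result arrays gain explicit 16-byte alignment, presumably so SIMD implementations of the ADS routines can write candidate mvs with aligned stores. x264's real ALIGNED_16 goes through its own portability layer; a minimal GCC/Clang-style approximation:

    #include <stdint.h>
    #include <stdio.h>

    /* Approximation of an ALIGNED_16 declaration macro; MSVC would use
     * __declspec(align(16)) instead. */
    #define ALIGNED_16( var ) var __attribute__((aligned(16)))

    int main( void )
    {
        ALIGNED_16( int16_t mvs_a[32] );
        printf( "16-byte aligned: %d\n", (int)((uintptr_t)mvs_a % 16 == 0) );
        return 0;
    }
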
@@ -1365,8 +1452,12 @@
             call_a( mc_a.mbtree_propagate_cost, dsta, prop, intra, inter, qscale, &fps_factor, 100 );
             // I don't care about exact rounding, this is just how close the floating-point implementation happens to be
             x264_emms();
-            for( int j = 0; j < 100; j++ )
+            for( int j = 0; j < 100 && ok; j++ )
+            {
                 ok &= abs( dstc[j]-dsta[j] ) <= 1 || fabs( (double)dstc[j]/dsta[j]-1 ) < 1e-4;
+                if( !ok )
+                    fprintf( stderr, "mbtree_propagate FAILED: %f !~= %f\n", (double)dstc[j], (double)dsta[j] );
+            }
         }
         report( "mbtree propagate :" );
     }
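
The comparison accepts either an absolute error of at most 1 or a relative error below 1e-4, the usual way to check a fixed-point result against a floating-point reference across both small and large magnitudes; the loop now also stops at the first failure and prints the offending pair. The predicate in isolation:

    #include <math.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Accept c ~= a when within 1 absolutely or within 0.01% relatively. */
    static int close_enough( int c, int a )
    {
        return abs( c - a ) <= 1 || fabs( (double)c / a - 1 ) < 1e-4;
    }

    int main( void )
    {
        printf( "%d\n", close_enough( 5, 6 ) );           /* 1: absolute slack */
        printf( "%d\n", close_enough( 100000, 100009 ) ); /* 1: relative slack */
        printf( "%d\n", close_enough( 100, 120 ) );       /* 0: outside both */
        return 0;
    }
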
@@ -2205,6 +2296,9 @@
 {
     *cpu_ref = *cpu_new;
     *cpu_new |= flags;
+#if BROKEN_STACK_ALIGNMENT
+    *cpu_new |= X264_CPU_STACK_MOD4;
+#endif
     if( *cpu_new & X264_CPU_SSE2_IS_FAST )
         *cpu_new &= ~X264_CPU_SSE2_IS_SLOW;
     if( !quiet )
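
This hunk patches the body of add_flags, which rolls the cpu-flag state forward: the previous combination becomes the reference and the new bits are ORed into the candidate, so every newly enabled implementation is checked against the best code path already validated. Under BROKEN_STACK_ALIGNMENT it now also forces X264_CPU_STACK_MOD4, exercising functions with a stack guaranteed only 4-byte alignment. A reduced model of the ref/new contract (flag values and output are placeholders):

    #include <stdio.h>

    #define CPU_MMX  0x1 /* placeholder flag values */
    #define CPU_SSE2 0x2

    /* Model of add_flags: the previous combination becomes the reference,
     * the candidate gains the new bits. */
    static void add_flags( unsigned *cpu_ref, unsigned *cpu_new, unsigned flags, const char *name )
    {
        *cpu_ref = *cpu_new;
        *cpu_new |= flags;
        printf( "%s: ref=%#x new=%#x\n", name, *cpu_ref, *cpu_new );
    }

    int main( void )
    {
        unsigned cpu0 = 0, cpu1 = 0;
        add_flags( &cpu0, &cpu1, CPU_MMX,  "MMX"  ); /* ref=0,   new=MMX      */
        add_flags( &cpu0, &cpu1, CPU_SSE2, "SSE2" ); /* ref=MMX, new=MMX|SSE2 */
        return 0;
    }
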
@@ -2239,6 +2333,7 @@
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE | X264_CPU_SSE2 | X264_CPU_SSE2_IS_SLOW, "SSE2Slow" );
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE2_IS_FAST, "SSE2Fast" );
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSE2Fast Cache64" );
+        cpu1 &= ~X264_CPU_CACHELINE_64;
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_SHUFFLE_IS_FAST, "SSE2 FastShuffle" );
         cpu1 &= ~X264_CPU_SHUFFLE_IS_FAST;
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "SSE2 SlowCTZ" );
@@ -2248,23 +2343,24 @@
     }
     if( x264_cpu_detect() & X264_CPU_SSE_MISALIGN )
     {
-        cpu1 &= ~X264_CPU_CACHELINE_64;
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE_MISALIGN, "SSE_Misalign" );
         cpu1 &= ~X264_CPU_SSE_MISALIGN;
     }
     if( x264_cpu_detect() & X264_CPU_LZCNT )
     {
-        cpu1 &= ~X264_CPU_CACHELINE_64;
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "SSE_LZCNT" );
         cpu1 &= ~X264_CPU_LZCNT;
     }
     if( x264_cpu_detect() & X264_CPU_SSE3 )
+    {
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE3 | X264_CPU_CACHELINE_64, "SSE3" );
+        cpu1 &= ~X264_CPU_CACHELINE_64;
+    }
     if( x264_cpu_detect() & X264_CPU_SSSE3 )
     {
-        cpu1 &= ~X264_CPU_CACHELINE_64;
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSSE3, "SSSE3" );
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSSE3 Cache64" );
+        cpu1 &= ~X264_CPU_CACHELINE_64;
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_SHUFFLE_IS_FAST, "SSSE3 FastShuffle" );
         cpu1 &= ~X264_CPU_SHUFFLE_IS_FAST;
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "SSSE3 SlowCTZ" );
@@ -2273,12 +2369,13 @@
         cpu1 &= ~X264_CPU_SLOW_ATOM;
     }
     if( x264_cpu_detect() & X264_CPU_SSE4 )
-    {
-        cpu1 &= ~X264_CPU_CACHELINE_64;
-        ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE4, "SSE4" );
-    }
+        ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE4 | X264_CPU_SHUFFLE_IS_FAST, "SSE4" );
     if( x264_cpu_detect() & X264_CPU_AVX )
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_AVX, "AVX" );
+    if( x264_cpu_detect() & X264_CPU_XOP )
+        ret |= add_flags( &cpu0, &cpu1, X264_CPU_XOP, "XOP" );
+    if( x264_cpu_detect() & X264_CPU_FMA4 )
+        ret |= add_flags( &cpu0, &cpu1, X264_CPU_FMA4, "FMA4" );
 #elif ARCH_PPC
     if( x264_cpu_detect() & X264_CPU_ALTIVEC )
     {
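
The x86 cascade above is also reorganized around flag hygiene: X264_CPU_CACHELINE_64 is now cleared immediately after the combination that tests it instead of each later block clearing it defensively on entry, XOP and FMA4 slot in after AVX, and SSE4 is benchmarked together with SHUFFLE_IS_FAST. A toy illustration of the set-test-clear discipline (run_tests and the flag values are placeholders):

    #include <stdio.h>

    #define CPU_SSE2         0x1 /* placeholder flag values */
    #define CPU_CACHELINE_64 0x2

    static void run_tests( unsigned cpu ) { printf( "testing %#x\n", cpu ); }

    int main( void )
    {
        unsigned cpu = CPU_SSE2;
        run_tests( cpu );
        cpu |= CPU_CACHELINE_64;  /* enable the modifier for one combination */
        run_tests( cpu );
        cpu &= ~CPU_CACHELINE_64; /* clear it here, not in the next block */
        run_tests( cpu );
        return 0;
    }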