57
57
if( !ok ) ret = -1; \
60
#define BENCH_RUNS 100 // tradeoff between accuracy and speed
61
#define BENCH_ALIGNS 16 // number of stack+heap data alignments to test, stepped in 32-byte increments (another accuracy vs speed tradeoff)
60
#define BENCH_RUNS 2000 // tradeoff between accuracy and speed
62
61
#define MAX_FUNCS 1000 // just has to be big enough to hold all the existing functions
63
62
#define MAX_CPUS 30 // number of different combinations of cpu flags
179
178
printf( "%s_%s%s: %"PRId64"\n", benchs[i].name,
180
b->cpu&X264_CPU_AVX512 ? "avx512" :
181
181
b->cpu&X264_CPU_AVX2 ? "avx2" :
182
182
b->cpu&X264_CPU_BMI2 ? "bmi2" :
183
183
b->cpu&X264_CPU_BMI1 ? "bmi1" :
2602
2602
x264_quant_init( &h, cpu_new, &h.quantf );
2603
2603
h.quantf.coeff_last[DCT_CHROMA_DC] = h.quantf.coeff_last4;
2605
/* Reset cabac state to avoid buffer overruns in do_bench() with large BENCH_RUNS values.
 * GET_CB( i ) calls x264_cabac_encode_init() on cb[i] with bitstream[i] and
 * bitstream[i]+0xfff0, zeroes cb[i].f8_bits_encoded, and (via the comma operator)
 * evaluates to &cb[i], so it can be passed directly as a call argument.
 * Note: the macro argument is expanded several times, so it should have no side effects. */
2606
#define GET_CB( i ) (\
2607
x264_cabac_encode_init( &cb[i], bitstream[i], bitstream[i]+0xfff0 ),\
2608
cb[i].f8_bits_encoded = 0, &cb[i] )
2605
2610
#define CABAC_RESIDUAL(name, start, end, rd)\
2607
2612
if( bs_a.name##_internal && (bs_a.name##_internal != bs_ref.name##_internal || (cpu_new&X264_CPU_SSE2_IS_SLOW)) )\
2637
2642
x264_cabac_t cb[2];\
2638
2643
x264_cabac_context_init( &h, &cb[0], SLICE_TYPE_P, 26, 0 );\
2639
2644
x264_cabac_context_init( &h, &cb[1], SLICE_TYPE_P, 26, 0 );\
2640
x264_cabac_encode_init( &cb[0], bitstream[0], bitstream[0]+0xfff0 );\
2641
x264_cabac_encode_init( &cb[1], bitstream[1], bitstream[1]+0xfff0 );\
2642
cb[0].f8_bits_encoded = 0;\
2643
cb[1].f8_bits_encoded = 0;\
2644
2645
if( !rd ) memcpy( bitstream[1], bitstream[0], 0x400 );\
2645
call_c1( x264_##name##_c, &h, &cb[0], ctx_block_cat, dct[0]+ac );\
2646
call_a1( bs_a.name##_internal, dct[1]+ac, i, ctx_block_cat, &cb[1] );\
2646
call_c1( x264_##name##_c, &h, GET_CB( 0 ), ctx_block_cat, dct[0]+ac );\
2647
call_a1( bs_a.name##_internal, dct[1]+ac, i, ctx_block_cat, GET_CB( 1 ) );\
2647
2648
ok = cb[0].f8_bits_encoded == cb[1].f8_bits_encoded && !memcmp(cb[0].state, cb[1].state, 1024);\
2648
2649
if( !rd ) ok |= !memcmp( bitstream[1], bitstream[0], 0x400 ) && !memcmp( &cb[1], &cb[0], offsetof(x264_cabac_t, p_start) );\
2657
2658
if( (j&15) == 0 )\
2659
call_c2( x264_##name##_c, &h, &cb[0], ctx_block_cat, dct[0]+ac );\
2660
call_a2( bs_a.name##_internal, dct[1]+ac, i, ctx_block_cat, &cb[1] );\
2660
call_c2( x264_##name##_c, &h, GET_CB( 0 ), ctx_block_cat, dct[0]+ac );\
2661
call_a2( bs_a.name##_internal, dct[1]+ac, i, ctx_block_cat, GET_CB( 1 ) );\
2794
2795
ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_32, "MMX Cache32" );
2795
2796
cpu1 &= ~X264_CPU_CACHELINE_32;
2797
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "MMX SlowCTZ" );
2798
cpu1 &= ~X264_CPU_SLOW_CTZ;
2800
2799
if( cpu_detect & X264_CPU_SSE )
2801
2800
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE, "SSE" );
2807
2806
cpu1 &= ~X264_CPU_CACHELINE_64;
2808
2807
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_SHUFFLE, "SSE2 SlowShuffle" );
2809
2808
cpu1 &= ~X264_CPU_SLOW_SHUFFLE;
2810
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "SSE2 SlowCTZ" );
2811
cpu1 &= ~X264_CPU_SLOW_CTZ;
2813
2810
if( cpu_detect & X264_CPU_LZCNT )
2827
2824
cpu1 &= ~X264_CPU_CACHELINE_64;
2828
2825
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_SHUFFLE, "SSSE3 SlowShuffle" );
2829
2826
cpu1 &= ~X264_CPU_SLOW_SHUFFLE;
2830
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "SSSE3 SlowCTZ" );
2831
cpu1 &= ~X264_CPU_SLOW_CTZ;
2832
2827
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_ATOM, "SSSE3 SlowAtom" );
2833
2828
ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSSE3 Cache64 SlowAtom" );
2834
2829
cpu1 &= ~X264_CPU_CACHELINE_64;
2860
2855
ret |= add_flags( &cpu0, &cpu1, X264_CPU_BMI2, "BMI2" );
2861
2856
if( cpu_detect & X264_CPU_AVX2 )
2862
2857
ret |= add_flags( &cpu0, &cpu1, X264_CPU_AVX2, "AVX2" );
2858
if( cpu_detect & X264_CPU_AVX512 )
2859
ret |= add_flags( &cpu0, &cpu1, X264_CPU_AVX512, "AVX512" );
2864
2861
if( cpu_detect & X264_CPU_ALTIVEC )
2916
2911
fprintf( stderr, "x264: using random seed %u\n", seed );
2919
buf1 = x264_malloc( 0x1e00 + 0x2000*sizeof(pixel) + 32*BENCH_ALIGNS );
2920
pbuf1 = x264_malloc( 0x1e00*sizeof(pixel) + 32*BENCH_ALIGNS );
2914
buf1 = x264_malloc( 0x1e00 + 0x2000*sizeof(pixel) );
2915
pbuf1 = x264_malloc( 0x1e00*sizeof(pixel) );
2921
2916
if( !buf1 || !pbuf1 )
2923
2918
fprintf( stderr, "malloc failed, unable to initiate tests!\n" );
2939
2934
memset( buf1+0x1e00, 0, 0x2000*sizeof(pixel) );
2941
/* 32-byte alignment is guaranteed whenever it's useful, but the speed of some functions also depends on the address modulo 64 */
2943
for( int i = 0; i < BENCH_ALIGNS && !ret; i++ )
2945
INIT_POINTER_OFFSETS;
2946
ret |= x264_stack_pagealign( check_all_flags, i*32 );
2950
fprintf( stderr, "%d/%d\r", i+1, BENCH_ALIGNS );
2953
ret = x264_stack_pagealign( check_all_flags, 0 );
2936
if( x264_stack_pagealign( check_all_flags, 0 ) )
2957
2938
fprintf( stderr, "x264: at least one test has failed. Go and fix that Right Now!\n" );