469
471
ALIGNED_ARRAY_64( uint16_t, bitcosts,[17] ); \
470
472
for( int i=0; i<17; i++ ) \
471
473
bitcosts[i] = 9*(i!=8); \
474
memcpy( pbuf3, pbuf2, 20*FDEC_STRIDE*sizeof(pixel) ); \
475
memcpy( pbuf4, pbuf2, 20*FDEC_STRIDE*sizeof(pixel) ); \
472
476
for( int i=0; i<32; i++ ) \
474
478
pixel *fenc = pbuf1+48+i*12; \
475
pixel *fdec = pbuf3+48+i*12; \
479
pixel *fdec1 = pbuf3+48+i*12; \
480
pixel *fdec2 = pbuf4+48+i*12; \
476
481
int pred_mode = i%9; \
477
482
int res_c = INT_MAX; \
478
483
for( int j=0; j<9; j++ ) \
480
predict_4x4[j]( fdec ); \
481
int cost = pixel_c.cmp[PIXEL_4x4]( fenc, FENC_STRIDE, fdec, FDEC_STRIDE ) + 9*(j!=pred_mode); \
485
predict_4x4[j]( fdec1 ); \
486
int cost = pixel_c.cmp[PIXEL_4x4]( fenc, FENC_STRIDE, fdec1, FDEC_STRIDE ) + 9*(j!=pred_mode); \
482
487
if( cost < (uint16_t)res_c ) \
483
488
res_c = cost + (j<<16); \
485
int res_a = call_a( pixel_asm.name, fenc, fdec, bitcosts+8-pred_mode ); \
490
predict_4x4[res_c>>16]( fdec1 ); \
491
int res_a = call_a( pixel_asm.name, fenc, fdec2, bitcosts+8-pred_mode ); \
486
492
if( res_c != res_a ) \
489
495
fprintf( stderr, #name": %d,%d != %d,%d [FAILED]\n", res_c>>16, res_c&0xffff, res_a>>16, res_a&0xffff ); \
498
if( memcmp(fdec1, fdec2, 4*FDEC_STRIDE*sizeof(pixel)) ) \
501
fprintf( stderr, #name" [FAILED]\n" ); \
502
for( int j=0; j<16; j++ ) \
503
fprintf( stderr, "%02x ", fdec1[(j&3)+(j>>2)*FDEC_STRIDE] ); \
504
fprintf( stderr, "\n" ); \
505
for( int j=0; j<16; j++ ) \
506
fprintf( stderr, "%02x ", fdec2[(j&3)+(j>>2)*FDEC_STRIDE] ); \
507
fprintf( stderr, "\n" ); \
513
#define TEST_INTRA8_X9( name, cmp ) \
514
if( pixel_asm.name && pixel_asm.name != pixel_ref.name ) \
516
set_func_name( #name ); \
518
ALIGNED_ARRAY_64( uint16_t, bitcosts,[17] ); \
519
ALIGNED_ARRAY_16( uint16_t, satds_c,[16] ); \
520
ALIGNED_ARRAY_16( uint16_t, satds_a,[16] ); \
521
memset( satds_c, 0, 16 * sizeof(*satds_c) ); \
522
memset( satds_a, 0, 16 * sizeof(*satds_a) ); \
523
for( int i=0; i<17; i++ ) \
524
bitcosts[i] = 9*(i!=8); \
525
for( int i=0; i<32; i++ ) \
527
pixel *fenc = pbuf1+48+i*12; \
528
pixel *fdec1 = pbuf3+48+i*12; \
529
pixel *fdec2 = pbuf4+48+i*12; \
530
int pred_mode = i%9; \
531
int res_c = INT_MAX; \
532
predict_8x8_filter( fdec1, edge, ALL_NEIGHBORS, ALL_NEIGHBORS ); \
533
for( int j=0; j<9; j++ ) \
535
predict_8x8[j]( fdec1, edge ); \
536
satds_c[j] = pixel_c.cmp[PIXEL_8x8]( fenc, FENC_STRIDE, fdec1, FDEC_STRIDE ) + 9*(j!=pred_mode); \
537
if( satds_c[j] < (uint16_t)res_c ) \
538
res_c = satds_c[j] + (j<<16); \
540
predict_8x8[res_c>>16]( fdec1, edge ); \
541
int res_a = call_a( pixel_asm.name, fenc, fdec2, edge, bitcosts+8-pred_mode, satds_a ); \
542
if( res_c != res_a || memcmp(satds_c, satds_a, sizeof(satds_c)) ) \
545
fprintf( stderr, #name": %d,%d != %d,%d [FAILED]\n", res_c>>16, res_c&0xffff, res_a>>16, res_a&0xffff ); \
546
for( int j = 0; j < 9; j++ ) \
547
fprintf( stderr, "%5d ", satds_c[j]); \
548
fprintf( stderr, "\n" ); \
549
for( int j = 0; j < 9; j++ ) \
550
fprintf( stderr, "%5d ", satds_a[j]); \
551
fprintf( stderr, "\n" ); \
554
for( int j=0; j<8; j++ ) \
555
if( memcmp(fdec1+j*FDEC_STRIDE, fdec2+j*FDEC_STRIDE, 8*sizeof(pixel)) ) \
559
fprintf( stderr, #name" [FAILED]\n" ); \
560
for( int j=0; j<8; j++ ) \
562
for( int k=0; k<8; k++ ) \
563
fprintf( stderr, "%02x ", fdec1[k+j*FDEC_STRIDE] ); \
564
fprintf( stderr, "\n" ); \
566
fprintf( stderr, "\n" ); \
567
for( int j=0; j<8; j++ ) \
569
for( int k=0; k<8; k++ ) \
570
fprintf( stderr, "%02x ", fdec2[k+j*FDEC_STRIDE] ); \
571
fprintf( stderr, "\n" ); \
573
fprintf( stderr, "\n" ); \
2249
2344
if( x264_cpu_detect() & X264_CPU_SSE_MISALIGN )
2251
cpu1 &= ~X264_CPU_CACHELINE_64;
2252
2346
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE_MISALIGN, "SSE_Misalign" );
2253
2347
cpu1 &= ~X264_CPU_SSE_MISALIGN;
2255
2349
if( x264_cpu_detect() & X264_CPU_LZCNT )
2257
cpu1 &= ~X264_CPU_CACHELINE_64;
2258
2351
ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "SSE_LZCNT" );
2259
2352
cpu1 &= ~X264_CPU_LZCNT;
2261
2354
if( x264_cpu_detect() & X264_CPU_SSE3 )
2262
2356
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE3 | X264_CPU_CACHELINE_64, "SSE3" );
2357
cpu1 &= ~X264_CPU_CACHELINE_64;
2263
2359
if( x264_cpu_detect() & X264_CPU_SSSE3 )
2265
cpu1 &= ~X264_CPU_CACHELINE_64;
2266
2361
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSSE3, "SSSE3" );
2267
2362
ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSSE3 Cache64" );
2363
cpu1 &= ~X264_CPU_CACHELINE_64;
2268
2364
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SHUFFLE_IS_FAST, "SSSE3 FastShuffle" );
2269
2365
cpu1 &= ~X264_CPU_SHUFFLE_IS_FAST;
2270
2366
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "SSSE3 SlowCTZ" );
2273
2369
cpu1 &= ~X264_CPU_SLOW_ATOM;
2275
2371
if( x264_cpu_detect() & X264_CPU_SSE4 )
2277
cpu1 &= ~X264_CPU_CACHELINE_64;
2278
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE4, "SSE4" );
2372
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE4 | X264_CPU_SHUFFLE_IS_FAST, "SSE4" );
2280
2373
if( x264_cpu_detect() & X264_CPU_AVX )
2281
2374
ret |= add_flags( &cpu0, &cpu1, X264_CPU_AVX, "AVX" );
2375
if( x264_cpu_detect() & X264_CPU_XOP )
2376
ret |= add_flags( &cpu0, &cpu1, X264_CPU_XOP, "XOP" );
2377
if( x264_cpu_detect() & X264_CPU_FMA4 )
2378
ret |= add_flags( &cpu0, &cpu1, X264_CPU_FMA4, "FMA4" );
2283
2380
if( x264_cpu_detect() & X264_CPU_ALTIVEC )