~ubuntu-branches/ubuntu/wily/x264/wily-proposed

Viewing changes to tools/checkasm.c

  • Committer: Package Import Robot
  • Author(s): Reinhard Tartler
  • Date: 2012-01-19 07:28:39 UTC
  • mfrom: (12.1.11 experimental)
  • Revision ID: package-import@ubuntu.com-20120119072839-0vj6g40ky09d9nru
Tags: 2:0.120.2127+gitf33c8cb-2ubuntu1
* Merge from Debian, remaining changes:
  - build against libgpac-dev to enable .mp4 output

--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -55,7 +55,7 @@
 #define BENCH_RUNS 100  // tradeoff between accuracy and speed
 #define BENCH_ALIGNS 16 // number of stack+heap data alignments (another accuracy vs speed tradeoff)
 #define MAX_FUNCS 1000  // just has to be big enough to hold all the existing functions
-#define MAX_CPUS 10     // number of different combinations of cpu flags
+#define MAX_CPUS 30     // number of different combinations of cpu flags
 
 typedef struct
 {
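
MAX_CPUS triples because this changeset enlarges the benchmark matrix: FMA4, XOP, and the extra Cache64/FastShuffle permutations added below each occupy one slot per function. A rough sketch of the bookkeeping this limit guards, with hypothetical type and field names (bench_entry_t and friends are illustrative, not the file's actual declarations):

    #include <stdint.h>
    #include <stdio.h>

    #define MAX_CPUS 30 /* one slot per benchmarked cpu-flag combination */

    /* Hypothetical record: each function keeps one timing entry per
     * cpu-flag combination it was benchmarked under. */
    typedef struct
    {
        uint32_t cpu;    /* flag combination this timing belongs to */
        uint64_t cycles; /* accumulated cycle count */
        uint32_t den;    /* number of accumulated measurements */
    } bench_entry_t;

    typedef struct
    {
        const char *name;
        bench_entry_t vers[MAX_CPUS]; /* overflows if combinations exceed MAX_CPUS */
    } bench_func_t;

    int main( void )
    {
        bench_func_t f = { "sad_4x4", {{ 0 }} };
        printf( "%s: up to %d flag combinations\n", f.name, MAX_CPUS );
        return 0;
    }
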
@@ -164,13 +164,14 @@
             if( k < j )
                 continue;
             printf( "%s_%s%s: %"PRId64"\n", benchs[i].name,
+                    b->cpu&X264_CPU_FMA4 ? "fma4" :
+                    b->cpu&X264_CPU_XOP ? "xop" :
                     b->cpu&X264_CPU_AVX ? "avx" :
                     b->cpu&X264_CPU_SSE4 ? "sse4" :
-                    b->cpu&X264_CPU_SHUFFLE_IS_FAST ? "fastshuffle" :
                     b->cpu&X264_CPU_SSSE3 ? "ssse3" :
                     b->cpu&X264_CPU_SSE3 ? "sse3" :
                     /* print sse2slow only if there's also a sse2fast version of the same func */
-                    b->cpu&X264_CPU_SSE2_IS_SLOW && j<MAX_CPUS && b[1].cpu&X264_CPU_SSE2_IS_FAST && !(b[1].cpu&X264_CPU_SSE3) ? "sse2slow" :
+                    b->cpu&X264_CPU_SSE2_IS_SLOW && j<MAX_CPUS-1 && b[1].cpu&X264_CPU_SSE2_IS_FAST && !(b[1].cpu&X264_CPU_SSE3) ? "sse2slow" :
                     b->cpu&X264_CPU_SSE2 ? "sse2" :
                     b->cpu&X264_CPU_MMX ? "mmx" :
                     b->cpu&X264_CPU_ALTIVEC ? "altivec" :
@@ -178,6 +179,7 @@
                     b->cpu&X264_CPU_ARMV6 ? "armv6" : "c",
                     b->cpu&X264_CPU_CACHELINE_32 ? "_c32" :
                     b->cpu&X264_CPU_CACHELINE_64 ? "_c64" :
+                    b->cpu&X264_CPU_SHUFFLE_IS_FAST && !(b->cpu&X264_CPU_SSE4) ? "_fastshuffle" :
                     b->cpu&X264_CPU_SSE_MISALIGN ? "_misalign" :
                     b->cpu&X264_CPU_LZCNT ? "_lzcnt" :
                     b->cpu&X264_CPU_FAST_NEON_MRC ? "_fast_mrc" :
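
Two changes land in this label chain. FMA4 and XOP are tested ahead of AVX so each benchmark line is named after the most specific instruction set it ran with, and the standalone fastshuffle label moves to a suffix below. Separately, the sse2slow guard changes from j<MAX_CPUS to j<MAX_CPUS-1: the label peeks at the next entry, b[1], so the loop index must stop one slot early or the final iteration reads past the array. A self-contained illustration of the corrected bound (entry_t and the flag value are stand-ins):

    #include <stdio.h>

    #define MAX_CPUS 30
    #define CPU_SSE2_IS_FAST 0x1

    typedef struct { unsigned cpu; } entry_t;

    /* Peeking at vers[j+1] is only safe while j < MAX_CPUS-1; the old
     * bound j < MAX_CPUS allowed an out-of-bounds read on the last slot. */
    static int next_is_sse2fast( const entry_t *vers, int j )
    {
        return j < MAX_CPUS-1 && (vers[j+1].cpu & CPU_SSE2_IS_FAST);
    }

    int main( void )
    {
        entry_t vers[MAX_CPUS] = {{ 0 }};
        printf( "%d\n", next_is_sse2fast( vers, MAX_CPUS-1 ) ); /* 0, no overread */
        return 0;
    }
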
@@ -469,26 +471,108 @@
         ALIGNED_ARRAY_64( uint16_t, bitcosts,[17] ); \
         for( int i=0; i<17; i++ ) \
             bitcosts[i] = 9*(i!=8); \
+        memcpy( pbuf3, pbuf2, 20*FDEC_STRIDE*sizeof(pixel) ); \
+        memcpy( pbuf4, pbuf2, 20*FDEC_STRIDE*sizeof(pixel) ); \
         for( int i=0; i<32; i++ ) \
         { \
             pixel *fenc = pbuf1+48+i*12; \
-            pixel *fdec = pbuf3+48+i*12; \
+            pixel *fdec1 = pbuf3+48+i*12; \
+            pixel *fdec2 = pbuf4+48+i*12; \
             int pred_mode = i%9; \
             int res_c = INT_MAX; \
             for( int j=0; j<9; j++ ) \
             { \
-                predict_4x4[j]( fdec ); \
-                int cost = pixel_c.cmp[PIXEL_4x4]( fenc, FENC_STRIDE, fdec, FDEC_STRIDE ) + 9*(j!=pred_mode); \
+                predict_4x4[j]( fdec1 ); \
+                int cost = pixel_c.cmp[PIXEL_4x4]( fenc, FENC_STRIDE, fdec1, FDEC_STRIDE ) + 9*(j!=pred_mode); \
                 if( cost < (uint16_t)res_c ) \
                     res_c = cost + (j<<16); \
             } \
-            int res_a = call_a( pixel_asm.name, fenc, fdec, bitcosts+8-pred_mode ); \
+            predict_4x4[res_c>>16]( fdec1 ); \
+            int res_a = call_a( pixel_asm.name, fenc, fdec2, bitcosts+8-pred_mode ); \
             if( res_c != res_a ) \
             { \
                 ok = 0; \
                 fprintf( stderr, #name": %d,%d != %d,%d [FAILED]\n", res_c>>16, res_c&0xffff, res_a>>16, res_a&0xffff ); \
                 break; \
             } \
+            if( memcmp(fdec1, fdec2, 4*FDEC_STRIDE*sizeof(pixel)) ) \
+            { \
+                ok = 0; \
+                fprintf( stderr, #name" [FAILED]\n" ); \
+                for( int j=0; j<16; j++ ) \
+                    fprintf( stderr, "%02x ", fdec1[(j&3)+(j>>2)*FDEC_STRIDE] ); \
+                fprintf( stderr, "\n" ); \
+                for( int j=0; j<16; j++ ) \
+                    fprintf( stderr, "%02x ", fdec2[(j&3)+(j>>2)*FDEC_STRIDE] ); \
+                fprintf( stderr, "\n" ); \
+                break; \
+            } \
+        } \
+    }
+
+#define TEST_INTRA8_X9( name, cmp ) \
+    if( pixel_asm.name && pixel_asm.name != pixel_ref.name ) \
+    { \
+        set_func_name( #name ); \
+        used_asm = 1; \
+        ALIGNED_ARRAY_64( uint16_t, bitcosts,[17] ); \
+        ALIGNED_ARRAY_16( uint16_t, satds_c,[16] ); \
+        ALIGNED_ARRAY_16( uint16_t, satds_a,[16] ); \
+        memset( satds_c, 0, 16 * sizeof(*satds_c) ); \
+        memset( satds_a, 0, 16 * sizeof(*satds_a) ); \
+        for( int i=0; i<17; i++ ) \
+            bitcosts[i] = 9*(i!=8); \
+        for( int i=0; i<32; i++ ) \
+        { \
+            pixel *fenc = pbuf1+48+i*12; \
+            pixel *fdec1 = pbuf3+48+i*12; \
+            pixel *fdec2 = pbuf4+48+i*12; \
+            int pred_mode = i%9; \
+            int res_c = INT_MAX; \
+            predict_8x8_filter( fdec1, edge, ALL_NEIGHBORS, ALL_NEIGHBORS ); \
+            for( int j=0; j<9; j++ ) \
+            { \
+                predict_8x8[j]( fdec1, edge ); \
+                satds_c[j] = pixel_c.cmp[PIXEL_8x8]( fenc, FENC_STRIDE, fdec1, FDEC_STRIDE ) + 9*(j!=pred_mode); \
+                if( satds_c[j] < (uint16_t)res_c ) \
+                    res_c = satds_c[j] + (j<<16); \
+            } \
+            predict_8x8[res_c>>16]( fdec1, edge ); \
+            int res_a = call_a( pixel_asm.name, fenc, fdec2, edge, bitcosts+8-pred_mode, satds_a ); \
+            if( res_c != res_a || memcmp(satds_c, satds_a, sizeof(satds_c)) ) \
+            { \
+                ok = 0; \
+                fprintf( stderr, #name": %d,%d != %d,%d [FAILED]\n", res_c>>16, res_c&0xffff, res_a>>16, res_a&0xffff ); \
+                for( int j = 0; j < 9; j++ ) \
+                    fprintf( stderr, "%5d ", satds_c[j]); \
+                fprintf( stderr, "\n" ); \
+                for( int j = 0; j < 9; j++ ) \
+                    fprintf( stderr, "%5d ", satds_a[j]); \
+                fprintf( stderr, "\n" ); \
+                break; \
+            } \
+            for( int j=0; j<8; j++ ) \
+                if( memcmp(fdec1+j*FDEC_STRIDE, fdec2+j*FDEC_STRIDE, 8*sizeof(pixel)) ) \
+                    ok = 0; \
+            if( !ok ) \
+            { \
+                fprintf( stderr, #name" [FAILED]\n" ); \
+                for( int j=0; j<8; j++ ) \
+                { \
+                    for( int k=0; k<8; k++ ) \
+                        fprintf( stderr, "%02x ", fdec1[k+j*FDEC_STRIDE] ); \
+                    fprintf( stderr, "\n" ); \
+                } \
+                fprintf( stderr, "\n" ); \
+                for( int j=0; j<8; j++ ) \
+                { \
+                    for( int k=0; k<8; k++ ) \
+                        fprintf( stderr, "%02x ", fdec2[k+j*FDEC_STRIDE] ); \
+                    fprintf( stderr, "\n" ); \
+                } \
+                fprintf( stderr, "\n" ); \
+                break; \
+            } \
         } \
     }
 
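The rewritten TEST_INTRA_X9 and the new TEST_INTRA8_X9 above keep two prediction buffers: the C reference writes into fdec1, the assembly under test into the identically initialized fdec2, and fdec1 is re-predicted with the winning mode before the final memcmp so both buffers should hold the same winning prediction. That lets the test catch an implementation that returns the right mode and cost but leaves wrong pixels behind. The pattern reduced to a skeleton (names here are illustrative, not checkasm's):

    #include <stdio.h>
    #include <string.h>

    typedef int (*pred_fn)( unsigned char *dst );

    /* Two-buffer pattern: identical inputs, then compare both the return
     * value and every output byte the function wrote. */
    static int check( pred_fn ref, pred_fn test )
    {
        unsigned char buf1[64], buf2[64];
        memset( buf1, 0x80, sizeof(buf1) );
        memcpy( buf2, buf1, sizeof(buf2) );
        int res_c = ref( buf1 );
        int res_a = test( buf2 );
        if( res_c != res_a )
            return fprintf( stderr, "return values differ\n" ), 0;
        if( memcmp( buf1, buf2, sizeof(buf1) ) )
            return fprintf( stderr, "output pixels differ\n" ), 0;
        return 1;
    }

    static int fill_dc( unsigned char *dst ) { dst[0] = 42; return 42; }

    int main( void )
    {
        printf( "%s\n", check( fill_dc, fill_dc ) ? "ok" : "FAILED" );
        return 0;
    }
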
@@ -509,9 +593,11 @@
     report( "intra sad_x3 :" );
     ok = 1; used_asm = 0;
     TEST_INTRA_X9( intra_satd_x9_4x4, satd );
+    TEST_INTRA8_X9( intra_sa8d_x9_8x8, sa8d );
     report( "intra satd_x9 :" );
     ok = 1; used_asm = 0;
     TEST_INTRA_X9( intra_sad_x9_4x4, sad );
+    TEST_INTRA8_X9( intra_sad_x9_8x8, sad );
     report( "intra sad_x9 :" );
 
     ok = 1; used_asm = 0;
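
Both macros pass bitcosts+8-pred_mode to the assembly. Since bitcosts[i] = 9*(i!=8), shifting the base pointer re-centres the table on the predicted mode: the candidate j == pred_mode lands on the zero entry and every other mode is charged a fixed cost of 9, matching the +9*(j!=pred_mode) term in the C loop. A standalone check of that indexing:

    #include <stdint.h>
    #include <stdio.h>

    int main( void )
    {
        /* Same table as the macros build: only the centre entry is free. */
        uint16_t bitcosts[17];
        for( int i = 0; i < 17; i++ )
            bitcosts[i] = 9*(i!=8);
        /* Re-centre on the predicted mode: bitcosts[8-pred_mode + j] is 0
         * exactly when j == pred_mode, and 9 for the other eight modes. */
        int pred_mode = 5;
        const uint16_t *cost = bitcosts + 8 - pred_mode;
        for( int j = 0; j < 9; j++ )
            printf( "mode %d: +%d\n", j, cost[j] );
        return 0;
    }
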
@@ -565,7 +651,8 @@
         {
             ALIGNED_16( uint16_t sums[72] );
             ALIGNED_16( int dc[4] );
-            int16_t mvs_a[32], mvs_c[32];
+            ALIGNED_16( int16_t mvs_a[32] );
+            ALIGNED_16( int16_t mvs_c[32] );
             int mvn_a, mvn_c;
             int thresh = rand() & 0x3fff;
             set_func_name( "esa_ads" );
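
The motion-vector result arrays gain explicit 16-byte alignment, presumably so SIMD implementations of the ADS routines can write candidate mvs with aligned stores. x264's real ALIGNED_16 goes through its own portability layer; a minimal GCC/Clang-style approximation:

    #include <stdint.h>
    #include <stdio.h>

    /* Approximation of an ALIGNED_16 declaration macro; MSVC would use
     * __declspec(align(16)) instead. */
    #define ALIGNED_16( var ) var __attribute__((aligned(16)))

    int main( void )
    {
        ALIGNED_16( int16_t mvs_a[32] );
        printf( "16-byte aligned: %d\n", (int)((uintptr_t)mvs_a % 16 == 0) );
        return 0;
    }
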
@@ -1365,8 +1452,12 @@
             call_a( mc_a.mbtree_propagate_cost, dsta, prop, intra, inter, qscale, &fps_factor, 100 );
             // I don't care about exact rounding, this is just how close the floating-point implementation happens to be
             x264_emms();
-            for( int j = 0; j < 100; j++ )
+            for( int j = 0; j < 100 && ok; j++ )
+            {
                 ok &= abs( dstc[j]-dsta[j] ) <= 1 || fabs( (double)dstc[j]/dsta[j]-1 ) < 1e-4;
+                if( !ok )
+                    fprintf( stderr, "mbtree_propagate FAILED: %f !~= %f\n", (double)dstc[j], (double)dsta[j] );
+            }
         }
         report( "mbtree propagate :" );
     }
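
The comparison accepts either an absolute error of at most 1 or a relative error below 1e-4, the usual way to check a fixed-point result against a floating-point reference across both small and large magnitudes; the loop now also stops at the first failure and prints the offending pair. The predicate in isolation:

    #include <math.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Accept c ~= a when within 1 absolutely or within 0.01% relatively. */
    static int close_enough( int c, int a )
    {
        return abs( c - a ) <= 1 || fabs( (double)c / a - 1 ) < 1e-4;
    }

    int main( void )
    {
        printf( "%d\n", close_enough( 5, 6 ) );           /* 1: absolute slack */
        printf( "%d\n", close_enough( 100000, 100009 ) ); /* 1: relative slack */
        printf( "%d\n", close_enough( 100, 120 ) );       /* 0: outside both */
        return 0;
    }
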
@@ -2205,6 +2296,9 @@
 {
     *cpu_ref = *cpu_new;
     *cpu_new |= flags;
+#if BROKEN_STACK_ALIGNMENT
+    *cpu_new |= X264_CPU_STACK_MOD4;
+#endif
     if( *cpu_new & X264_CPU_SSE2_IS_FAST )
         *cpu_new &= ~X264_CPU_SSE2_IS_SLOW;
     if( !quiet )
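
This hunk patches the body of add_flags, which rolls the cpu-flag state forward: the previous combination becomes the reference and the new bits are ORed into the candidate, so every newly enabled implementation is checked against the best code path already validated. Under BROKEN_STACK_ALIGNMENT it now also forces X264_CPU_STACK_MOD4, exercising functions with a stack guaranteed only 4-byte alignment. A reduced model of the ref/new contract (flag values and output are placeholders):

    #include <stdio.h>

    #define CPU_MMX  0x1 /* placeholder flag values */
    #define CPU_SSE2 0x2

    /* Model of add_flags: the previous combination becomes the reference,
     * the candidate gains the new bits. */
    static void add_flags( unsigned *cpu_ref, unsigned *cpu_new, unsigned flags, const char *name )
    {
        *cpu_ref = *cpu_new;
        *cpu_new |= flags;
        printf( "%s: ref=%#x new=%#x\n", name, *cpu_ref, *cpu_new );
    }

    int main( void )
    {
        unsigned cpu0 = 0, cpu1 = 0;
        add_flags( &cpu0, &cpu1, CPU_MMX,  "MMX"  ); /* ref=0,   new=MMX      */
        add_flags( &cpu0, &cpu1, CPU_SSE2, "SSE2" ); /* ref=MMX, new=MMX|SSE2 */
        return 0;
    }
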
@@ -2239,6 +2333,7 @@
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE | X264_CPU_SSE2 | X264_CPU_SSE2_IS_SLOW, "SSE2Slow" );
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE2_IS_FAST, "SSE2Fast" );
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSE2Fast Cache64" );
+        cpu1 &= ~X264_CPU_CACHELINE_64;
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_SHUFFLE_IS_FAST, "SSE2 FastShuffle" );
         cpu1 &= ~X264_CPU_SHUFFLE_IS_FAST;
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "SSE2 SlowCTZ" );
@@ -2248,23 +2343,24 @@
     }
     if( x264_cpu_detect() & X264_CPU_SSE_MISALIGN )
     {
-        cpu1 &= ~X264_CPU_CACHELINE_64;
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE_MISALIGN, "SSE_Misalign" );
         cpu1 &= ~X264_CPU_SSE_MISALIGN;
     }
     if( x264_cpu_detect() & X264_CPU_LZCNT )
     {
-        cpu1 &= ~X264_CPU_CACHELINE_64;
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "SSE_LZCNT" );
         cpu1 &= ~X264_CPU_LZCNT;
     }
     if( x264_cpu_detect() & X264_CPU_SSE3 )
+    {
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE3 | X264_CPU_CACHELINE_64, "SSE3" );
+        cpu1 &= ~X264_CPU_CACHELINE_64;
+    }
     if( x264_cpu_detect() & X264_CPU_SSSE3 )
     {
-        cpu1 &= ~X264_CPU_CACHELINE_64;
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSSE3, "SSSE3" );
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSSE3 Cache64" );
+        cpu1 &= ~X264_CPU_CACHELINE_64;
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_SHUFFLE_IS_FAST, "SSSE3 FastShuffle" );
         cpu1 &= ~X264_CPU_SHUFFLE_IS_FAST;
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "SSSE3 SlowCTZ" );
@@ -2273,12 +2369,13 @@
         cpu1 &= ~X264_CPU_SLOW_ATOM;
     }
     if( x264_cpu_detect() & X264_CPU_SSE4 )
-    {
-        cpu1 &= ~X264_CPU_CACHELINE_64;
-        ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE4, "SSE4" );
-    }
+        ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE4 | X264_CPU_SHUFFLE_IS_FAST, "SSE4" );
     if( x264_cpu_detect() & X264_CPU_AVX )
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_AVX, "AVX" );
+    if( x264_cpu_detect() & X264_CPU_XOP )
+        ret |= add_flags( &cpu0, &cpu1, X264_CPU_XOP, "XOP" );
+    if( x264_cpu_detect() & X264_CPU_FMA4 )
+        ret |= add_flags( &cpu0, &cpu1, X264_CPU_FMA4, "FMA4" );
 #elif ARCH_PPC
     if( x264_cpu_detect() & X264_CPU_ALTIVEC )
     {
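
The x86 cascade above is also reorganized around flag hygiene: X264_CPU_CACHELINE_64 is now cleared immediately after the combination that tests it instead of each later block clearing it defensively on entry, XOP and FMA4 slot in after AVX, and SSE4 is benchmarked together with SHUFFLE_IS_FAST. A toy illustration of the set-test-clear discipline (run_tests and the flag values are placeholders):

    #include <stdio.h>

    #define CPU_SSE2         0x1 /* placeholder flag values */
    #define CPU_CACHELINE_64 0x2

    static void run_tests( unsigned cpu ) { printf( "testing %#x\n", cpu ); }

    int main( void )
    {
        unsigned cpu = CPU_SSE2;
        run_tests( cpu );
        cpu |= CPU_CACHELINE_64;  /* enable the modifier for one combination */
        run_tests( cpu );
        cpu &= ~CPU_CACHELINE_64; /* clear it here, not in the next block */
        run_tests( cpu );
        return 0;
    }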