/*****************************************************************************
* mc-c.c: x86 motion compensation
*****************************************************************************
* Copyright (C) 2003-2014 x264 project
* Authors: Laurent Aimar <fenrir@via.ecp.fr>
* Loren Merritt <lorenm@u.washington.edu>
#define DECL_SUF( func, args )\
void func##_mmx2 args;\
void func##_sse2 args;\
void func##_ssse3 args;\
void func##_avx2 args;
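/* DECL_SUF declares one prototype per instruction-set suffix of a pixel
 * averaging kernel; e.g. the first use below expands to:
 *   void x264_pixel_avg_16x16_mmx2 ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
 *   void x264_pixel_avg_16x16_sse2 ( ... );
 *   void x264_pixel_avg_16x16_ssse3( ... );
 *   void x264_pixel_avg_16x16_avx2 ( ... );
 */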
DECL_SUF( x264_pixel_avg_16x16, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ))
DECL_SUF( x264_pixel_avg_16x8, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ))
void x264_mc_copy_w16_mmx( pixel *, intptr_t, pixel *, intptr_t, int );
void x264_mc_copy_w16_sse( pixel *, intptr_t, pixel *, intptr_t, int );
void x264_mc_copy_w16_aligned_sse( pixel *, intptr_t, pixel *, intptr_t, int );
void x264_mc_copy_w16_avx( uint16_t *, intptr_t, uint16_t *, intptr_t, int );
void x264_mc_copy_w16_aligned_avx( uint16_t *, intptr_t, uint16_t *, intptr_t, int );
void x264_prefetch_fenc_420_mmx2( pixel *, intptr_t, pixel *, intptr_t, int );
void x264_prefetch_fenc_422_mmx2( pixel *, intptr_t, pixel *, intptr_t, int );
void x264_prefetch_ref_mmx2( pixel *, intptr_t, int );
void x264_plane_copy_deinterleave_avx( uint16_t *dstu, intptr_t i_dstu,
uint16_t *dstv, intptr_t i_dstv,
uint16_t *src, intptr_t i_src, int w, int h );
void x264_plane_copy_deinterleave_v210_ssse3( uint16_t *dstu, intptr_t i_dstu,
uint16_t *dstv, intptr_t i_dstv,
uint32_t *src, intptr_t i_src, int w, int h );
void x264_plane_copy_deinterleave_v210_avx ( uint16_t *dstu, intptr_t i_dstu,
uint16_t *dstv, intptr_t i_dstv,
uint32_t *src, intptr_t i_src, int w, int h );
void x264_plane_copy_deinterleave_v210_avx2 ( uint16_t *dstu, intptr_t i_dstu,
uint16_t *dstv, intptr_t i_dstv,
uint32_t *src, intptr_t i_src, int w, int h );
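/* v210 is a packed 10-bit 4:2:2 format (three 10-bit samples per 32-bit word);
 * these routines unpack it into two planar 16-bit buffers. */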
void x264_store_interleave_chroma_mmx2( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
void x264_store_interleave_chroma_sse2( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
void x264_store_interleave_chroma_avx ( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
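/* x264 keeps chroma NV12-style (U and V interleaved in one plane); these
 * helpers convert between that layout and separate U/V planes. */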
void *x264_memcpy_aligned_sse( void *dst, const void *src, size_t n );
void x264_memzero_aligned_mmx( void *dst, size_t n );
void x264_memzero_aligned_sse( void *dst, size_t n );
void x264_memzero_aligned_avx( void *dst, size_t n );
void x264_integral_init4h_sse4( uint16_t *sum, uint8_t *pix, intptr_t stride );
void x264_integral_init4h_avx2( uint16_t *sum, uint8_t *pix, intptr_t stride );
void x264_integral_init8h_sse4( uint16_t *sum, uint8_t *pix, intptr_t stride );
void x264_integral_init8h_avx ( uint16_t *sum, uint8_t *pix, intptr_t stride );
void x264_integral_init8h_avx2( uint16_t *sum, uint8_t *pix, intptr_t stride );
void x264_integral_init4v_mmx ( uint16_t *sum8, uint16_t *sum4, intptr_t stride );
void x264_integral_init4v_sse2 ( uint16_t *sum8, uint16_t *sum4, intptr_t stride );
void x264_integral_init4v_ssse3( uint16_t *sum8, uint16_t *sum4, intptr_t stride );
void x264_integral_init4v_avx2( uint16_t *sum8, uint16_t *sum4, intptr_t stride );
void x264_integral_init8v_mmx ( uint16_t *sum8, intptr_t stride );
void x264_integral_init8v_sse2( uint16_t *sum8, intptr_t stride );
void x264_integral_init8v_avx2( uint16_t *sum8, intptr_t stride );
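/* Integral-image (summed-area table) helpers: the *h variants emit rows of
 * horizontal 4- or 8-pixel sums, which the *v variants then accumulate
 * vertically. */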
void x264_mbtree_propagate_cost_sse2( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
void x264_mbtree_propagate_cost_avx ( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
                                      uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
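/* Inner loop of macroblock-tree lookahead; per block, roughly:
 * dst = (propagate_in + intra*fps_factor) * (intra - inter) / intra,
 * with the costs prescaled by inv_qscale. */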
#define MC_CHROMA(cpu)\
void x264_mc_chroma_##cpu( pixel *dstu, pixel *dstv, intptr_t i_dst, pixel *src, intptr_t i_src,\
                           int dx, int dy, int i_width, int i_height );
MC_CHROMA(ssse3_cache64)
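/* frame_init_lowres_core downscales a frame by two in each dimension and also
 * produces the three half-pel-shifted copies (dsth/dstv/dstc) used by the
 * lookahead's motion search. */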
#define LOWRES(cpu)\
void x264_frame_init_lowres_core_##cpu( pixel *src0, pixel *dst0, pixel *dsth, pixel *dstv, pixel *dstc,\
                                        intptr_t src_stride, intptr_t dst_stride, int width, int height );
#define PIXEL_AVG_W(width,cpu)\
void x264_pixel_avg2_w##width##_##cpu( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t );
PIXEL_AVG_WALL(cache64_mmx2)
PIXEL_AVG_WALL(cache64_sse2)
PIXEL_AVG_WALL(sse2)
PIXEL_AVG_WALL(cache64_ssse3)
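/* Each wtab is an array of avg2 kernels indexed by width/4 (w4..w20);
 * name1..name5 select the implementation per width, so instruction sets that
 * only accelerate some widths fall back to older kernels for the rest. */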
#define PIXEL_AVG_WTAB(instr, name1, name2, name3, name4, name5)\
static void (* const x264_pixel_avg_wtab_##instr[6])( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t ) =\
#define x264_pixel_avg2_w20_mmx2 x264_pixel_avg2_w18_mmx2
#define x264_pixel_avg2_w12_sse2 x264_pixel_avg2_w10_sse2
#define x264_pixel_avg2_w20_sse2 x264_pixel_avg2_w18_sse2
#define x264_pixel_avg2_w12_avx2 x264_pixel_avg2_w16_avx2
#define x264_pixel_avg2_w20_avx2 x264_pixel_avg2_w18_avx2
/* w16 sse2 is faster than w12 mmx as long as the cacheline issue is resolved */
#define x264_pixel_avg2_w12_cache64_ssse3 x264_pixel_avg2_w16_cache64_ssse3
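/* Widths without a dedicated kernel are aliased to a neighbouring size: only
 * 9 (resp. 17) of the 12 (resp. 20) columns carry significant data, so a
 * w10/w18 kernel can stand in, and a wider kernel is substituted where it is
 * simply faster. */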
PIXEL_AVG_WTAB(mmx2, mmx2, mmx2, mmx2, mmx2, mmx2)
#if HIGH_BIT_DEPTH
PIXEL_AVG_WTAB(sse2, mmx2, sse2, sse2, sse2, sse2)
PIXEL_AVG_WTAB(avx2, mmx2, sse2, avx2, avx2, avx2)
#else // !HIGH_BIT_DEPTH
PIXEL_AVG_WTAB(cache32_mmx2, mmx2, cache32_mmx2, cache32_mmx2, cache32_mmx2, cache32_mmx2)
PIXEL_AVG_WTAB(cache64_mmx2, mmx2, cache64_mmx2, cache64_mmx2, cache64_mmx2, cache64_mmx2)
PIXEL_AVG_WTAB(sse2, mmx2, mmx2, sse2, sse2, sse2)
PIXEL_AVG_WTAB(cache64_sse2, mmx2, cache64_mmx2, cache64_sse2, cache64_sse2, cache64_sse2)
PIXEL_AVG_WTAB(cache64_ssse3, mmx2, cache64_mmx2, cache64_ssse3, cache64_ssse3, cache64_sse2)
PIXEL_AVG_WTAB(cache64_ssse3_atom, mmx2, mmx2, cache64_ssse3, cache64_ssse3, sse2)
PIXEL_AVG_WTAB(avx2, mmx2, mmx2, sse2, sse2, avx2)
#endif // HIGH_BIT_DEPTH
#define MC_COPY_WTAB(instr, name1, name2, name3)\
MC_COPY_WTAB(mmx,mmx,mmx,mmx)
#if HIGH_BIT_DEPTH
MC_COPY_WTAB(sse,mmx,sse,sse)
MC_COPY_WTAB(avx,mmx,sse,avx)
#else
MC_COPY_WTAB(sse,mmx,mmx,sse)
MC_WEIGHT_WTAB(offsetadd,sse2,mmx2,mmx2,16)
MC_WEIGHT_WTAB(offsetsub,sse2,mmx2,mmx2,16)
MC_WEIGHT_WTAB(weight,ssse3,ssse3,ssse3,16)
MC_WEIGHT_WTAB(weight,avx2,ssse3,avx2,16)
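/* Weighted prediction. The x264_weight_cache_* helpers precompute the packed
 * scale/offset/denom constants in the cached form that the matching asm
 * weight kernels expect. */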
static void x264_weight_cache_mmx2( x264_t *h, x264_weight_t *w )
width += realign;\
while( height-- )\
    {\
x264_hpel_filter_v_##cpuv( dstv, src, buf+16, stride, width );\
x264_hpel_filter_c_##cpuc( dstc, buf+16, width );\
x264_hpel_filter_h_##cpuh( dsth, src, width );\
void x264_hpel_filter_sse2 ( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, intptr_t stride, int width, int height, int16_t *buf );
void x264_hpel_filter_ssse3( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, intptr_t stride, int width, int height, int16_t *buf );
void x264_hpel_filter_avx ( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, intptr_t stride, int width, int height, int16_t *buf );
void x264_hpel_filter_avx2 ( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, intptr_t stride, int width, int height, int16_t *buf );
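/* HPEL(align, cpu, cpuv, cpuc, cpuh) composes a full half-pel filter from the
 * three directional kernels, staging 16-bit intermediates in an aligned
 * temporary buffer; the widest variant processes 32 pixels per iteration
 * (AVX2). */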
HPEL(16, sse2, sse2, sse2, sse2)
HPEL(16, ssse3, ssse3, ssse3, ssse3)
HPEL(16, avx, avx, avx, avx)
HPEL(32, avx2, avx2, avx2, avx2)
#endif // HIGH_BIT_DEPTH
static void x264_plane_copy_mmx2( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h )
pf->frame_init_lowres_core = x264_frame_init_lowres_core_ssse3;
pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_ssse3;
if( !(cpu&(X264_CPU_SLOW_SHUFFLE|X264_CPU_SLOW_ATOM|X264_CPU_SLOW_PALIGNR)) )
pf->integral_init4v = x264_integral_init4v_ssse3;
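/* Dispatch pattern: each cpu-flag block overwrites function pointers set by
 * earlier, more generic blocks, so the most specialized implementation
 * supported by the host CPU wins. */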
pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_avx;
pf->plane_copy_interleave = x264_plane_copy_interleave_avx;
pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_avx;
pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_avx;
pf->store_interleave_chroma = x264_store_interleave_chroma_avx;
pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_avx;
if( !(cpu&X264_CPU_STACK_MOD4) )
pf->mc_chroma = x264_mc_chroma_avx;
if( cpu&X264_CPU_XOP )
pf->frame_init_lowres_core = x264_frame_init_lowres_core_xop;
    if( cpu&X264_CPU_AVX2 )
    {
pf->mc_luma = mc_luma_avx2;
        pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_avx2;
    }
#else // !HIGH_BIT_DEPTH
#if ARCH_X86 // all x86_64 cpus with cacheline split issues use sse2 instead
pf->avg[PIXEL_8x8] = x264_pixel_avg_8x8_sse2;
pf->avg[PIXEL_8x4] = x264_pixel_avg_8x4_sse2;
pf->hpel_filter = x264_hpel_filter_sse2;
pf->frame_init_lowres_core = x264_frame_init_lowres_core_sse2;
if( !(cpu&X264_CPU_STACK_MOD4) )
pf->mc_chroma = x264_mc_chroma_sse2;
pf->mc_luma = mc_luma_cache64_sse2;
pf->get_ref = get_ref_cache64_sse2;
if( cpu&X264_CPU_XOP )
pf->frame_init_lowres_core = x264_frame_init_lowres_core_xop;
    if( cpu&X264_CPU_AVX2 )
    {
pf->hpel_filter = x264_hpel_filter_avx2;
pf->mc_chroma = x264_mc_chroma_avx2;
pf->weight = x264_mc_weight_wtab_avx2;
pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_avx2;
pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_avx2;
pf->integral_init8v = x264_integral_init8v_avx2;
pf->integral_init4v = x264_integral_init4v_avx2;
pf->integral_init8h = x264_integral_init8h_avx2;
pf->integral_init4h = x264_integral_init4h_avx2;
        pf->frame_init_lowres_core = x264_frame_init_lowres_core_avx2;
    }
#endif // HIGH_BIT_DEPTH
    if( !(cpu&X264_CPU_AVX) )
        return;
pf->memzero_aligned = x264_memzero_aligned_avx;
pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx;
if( cpu&X264_CPU_FMA4 )