221
224
/* put_pixels16_mmx / put_pixels8_mmx only copy rows (plain movq load/store
 * pairs), so the "no_rnd" names are pure textual aliases of them.
 * NOTE(review): "no_rnd" presumably means "no rounding" -- confirm. */
#define put_no_rnd_pixels16_mmx put_pixels16_mmx
222
225
#define put_no_rnd_pixels8_mmx put_pixels8_mmx
223
/* The _mmx2 and _3dnow put names below are textual aliases: each expands to
 * the corresponding plain MMX name (the no_rnd ones go through the
 * put_no_rnd_pixels*_mmx aliases), so no separate implementations exist
 * for these suffixes. */
#define put_pixels16_mmx2 put_pixels16_mmx
224
#define put_pixels8_mmx2 put_pixels8_mmx
225
#define put_pixels4_mmx2 put_pixels4_mmx
226
#define put_no_rnd_pixels16_mmx2 put_no_rnd_pixels16_mmx
227
#define put_no_rnd_pixels8_mmx2 put_no_rnd_pixels8_mmx
228
#define put_pixels16_3dnow put_pixels16_mmx
229
#define put_pixels8_3dnow put_pixels8_mmx
230
#define put_pixels4_3dnow put_pixels4_mmx
231
#define put_no_rnd_pixels16_3dnow put_no_rnd_pixels16_mmx
232
#define put_no_rnd_pixels8_3dnow put_no_rnd_pixels8_mmx
226
/* The _mmxext put names are textual aliases of the plain MMX names (the
 * no_rnd ones go through the put_no_rnd_pixels*_mmx aliases); there is no
 * separate mmxext implementation of these copies. */
#define put_pixels16_mmxext put_pixels16_mmx
227
#define put_pixels8_mmxext put_pixels8_mmx
228
#define put_pixels4_mmxext put_pixels4_mmx
229
#define put_no_rnd_pixels16_mmxext put_no_rnd_pixels16_mmx
230
#define put_no_rnd_pixels8_mmxext put_no_rnd_pixels8_mmx
234
232
/***********************************/
235
233
/* standard MMX */
237
void ff_put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
235
void ff_put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels,
239
238
const DCTELEM *p;
242
241
/* read the pixels */
245
244
/* unrolled loop */
247
"movq %3, %%mm0 \n\t"
248
"movq 8%3, %%mm1 \n\t"
249
"movq 16%3, %%mm2 \n\t"
250
"movq 24%3, %%mm3 \n\t"
251
"movq 32%3, %%mm4 \n\t"
252
"movq 40%3, %%mm5 \n\t"
253
"movq 48%3, %%mm6 \n\t"
254
"movq 56%3, %%mm7 \n\t"
255
"packuswb %%mm1, %%mm0 \n\t"
256
"packuswb %%mm3, %%mm2 \n\t"
257
"packuswb %%mm5, %%mm4 \n\t"
258
"packuswb %%mm7, %%mm6 \n\t"
259
"movq %%mm0, (%0) \n\t"
260
"movq %%mm2, (%0, %1) \n\t"
261
"movq %%mm4, (%0, %1, 2) \n\t"
262
"movq %%mm6, (%0, %2) \n\t"
263
::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "m"(*p)
246
"movq (%3), %%mm0 \n\t"
247
"movq 8(%3), %%mm1 \n\t"
248
"movq 16(%3), %%mm2 \n\t"
249
"movq 24(%3), %%mm3 \n\t"
250
"movq 32(%3), %%mm4 \n\t"
251
"movq 40(%3), %%mm5 \n\t"
252
"movq 48(%3), %%mm6 \n\t"
253
"movq 56(%3), %%mm7 \n\t"
254
"packuswb %%mm1, %%mm0 \n\t"
255
"packuswb %%mm3, %%mm2 \n\t"
256
"packuswb %%mm5, %%mm4 \n\t"
257
"packuswb %%mm7, %%mm6 \n\t"
258
"movq %%mm0, (%0) \n\t"
259
"movq %%mm2, (%0, %1) \n\t"
260
"movq %%mm4, (%0, %1, 2) \n\t"
261
"movq %%mm6, (%0, %2) \n\t"
262
:: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3),
265
pix += line_size * 4;
268
268
// if here would be an exact copy of the code above
269
269
// compiler would generate some very strange code
270
270
// thus using "r"
272
"movq (%3), %%mm0 \n\t"
273
"movq 8(%3), %%mm1 \n\t"
274
"movq 16(%3), %%mm2 \n\t"
275
"movq 24(%3), %%mm3 \n\t"
276
"movq 32(%3), %%mm4 \n\t"
277
"movq 40(%3), %%mm5 \n\t"
278
"movq 48(%3), %%mm6 \n\t"
279
"movq 56(%3), %%mm7 \n\t"
280
"packuswb %%mm1, %%mm0 \n\t"
281
"packuswb %%mm3, %%mm2 \n\t"
282
"packuswb %%mm5, %%mm4 \n\t"
283
"packuswb %%mm7, %%mm6 \n\t"
284
"movq %%mm0, (%0) \n\t"
285
"movq %%mm2, (%0, %1) \n\t"
286
"movq %%mm4, (%0, %1, 2) \n\t"
287
"movq %%mm6, (%0, %2) \n\t"
288
::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "r"(p)
272
"movq (%3), %%mm0 \n\t"
273
"movq 8(%3), %%mm1 \n\t"
274
"movq 16(%3), %%mm2 \n\t"
275
"movq 24(%3), %%mm3 \n\t"
276
"movq 32(%3), %%mm4 \n\t"
277
"movq 40(%3), %%mm5 \n\t"
278
"movq 48(%3), %%mm6 \n\t"
279
"movq 56(%3), %%mm7 \n\t"
280
"packuswb %%mm1, %%mm0 \n\t"
281
"packuswb %%mm3, %%mm2 \n\t"
282
"packuswb %%mm5, %%mm4 \n\t"
283
"packuswb %%mm7, %%mm6 \n\t"
284
"movq %%mm0, (%0) \n\t"
285
"movq %%mm2, (%0, %1) \n\t"
286
"movq %%mm4, (%0, %1, 2) \n\t"
287
"movq %%mm6, (%0, %2) \n\t"
288
:: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3), "r"(p)
292
/*
 * Inline-asm text fragment used by ff_put_signed_pixels_clamped_mmx().
 *
 * `off` is stringified into the displacement of every load from (%2).
 * One expansion:
 *   - loads the quadwords at off+0/16/32/48 from (%2) into mm1..mm4,
 *   - packsswb's each with the quadword 8 bytes further on, i.e. packs
 *     sixteen bytes of 16-bit words into eight signed-saturated bytes,
 *   - adds mm0 to every byte (paddb); mm0 must be preloaded by the user of
 *     this macro,
 *   - stores the four 8-byte results at (%0), (%0,%3), (%0,%3,2), (%0,%1).
 *
 * Operand numbering is fixed by the asm statement that embeds the fragment:
 * %0 is the destination base, %3 the row stride, %1 a register addressed as
 * the fourth row offset, %2 the source base. Clobbers mm1..mm4.
 */
#define put_signed_pixels_clamped_mmx_half(off) \
293
"movq "#off"(%2), %%mm1 \n\t"\
294
"movq 16+"#off"(%2), %%mm2 \n\t"\
295
"movq 32+"#off"(%2), %%mm3 \n\t"\
296
"movq 48+"#off"(%2), %%mm4 \n\t"\
297
"packsswb 8+"#off"(%2), %%mm1 \n\t"\
298
"packsswb 24+"#off"(%2), %%mm2 \n\t"\
299
"packsswb 40+"#off"(%2), %%mm3 \n\t"\
300
"packsswb 56+"#off"(%2), %%mm4 \n\t"\
301
"paddb %%mm0, %%mm1 \n\t"\
302
"paddb %%mm0, %%mm2 \n\t"\
303
"paddb %%mm0, %%mm3 \n\t"\
304
"paddb %%mm0, %%mm4 \n\t"\
305
"movq %%mm1, (%0) \n\t"\
306
"movq %%mm2, (%0, %3) \n\t"\
307
"movq %%mm3, (%0, %3, 2) \n\t"\
308
"movq %%mm4, (%0, %1) \n\t"
292
/*
 * Inline-asm text fragment used by ff_put_signed_pixels_clamped_mmx().
 *
 * `off` is stringified into the displacement of every load from (%2).
 * One expansion:
 *   - loads the quadwords at off+0/16/32/48 from (%2) into mm1..mm4,
 *   - packsswb's each with the quadword 8 bytes further on, i.e. packs
 *     sixteen bytes of 16-bit words into eight signed-saturated bytes,
 *   - adds mm0 to every byte (paddb); mm0 must be preloaded by the user of
 *     this macro,
 *   - stores the four 8-byte results at (%0), (%0,%3), (%0,%3,2), (%0,%1).
 *
 * Operand numbering is fixed by the asm statement that embeds the fragment:
 * %0 is the destination base, %3 the row stride, %1 a register addressed as
 * the fourth row offset, %2 the source base. Clobbers mm1..mm4.
 */
#define put_signed_pixels_clamped_mmx_half(off) \
293
"movq "#off"(%2), %%mm1 \n\t" \
294
"movq 16 + "#off"(%2), %%mm2 \n\t" \
295
"movq 32 + "#off"(%2), %%mm3 \n\t" \
296
"movq 48 + "#off"(%2), %%mm4 \n\t" \
297
"packsswb 8 + "#off"(%2), %%mm1 \n\t" \
298
"packsswb 24 + "#off"(%2), %%mm2 \n\t" \
299
"packsswb 40 + "#off"(%2), %%mm3 \n\t" \
300
"packsswb 56 + "#off"(%2), %%mm4 \n\t" \
301
"paddb %%mm0, %%mm1 \n\t" \
302
"paddb %%mm0, %%mm2 \n\t" \
303
"paddb %%mm0, %%mm3 \n\t" \
304
"paddb %%mm0, %%mm4 \n\t" \
305
"movq %%mm1, (%0) \n\t" \
306
"movq %%mm2, (%0, %3) \n\t" \
307
"movq %%mm3, (%0, %3, 2) \n\t" \
308
"movq %%mm4, (%0, %1) \n\t"
310
void ff_put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
310
void ff_put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels,
312
313
x86_reg line_skip = line_size;
313
314
x86_reg line_skip3;
315
316
__asm__ volatile (
316
"movq "MANGLE(ff_pb_80)", %%mm0 \n\t"
317
"lea (%3, %3, 2), %1 \n\t"
318
put_signed_pixels_clamped_mmx_half(0)
319
"lea (%0, %3, 4), %0 \n\t"
320
put_signed_pixels_clamped_mmx_half(64)
321
:"+&r" (pixels), "=&r" (line_skip3)
322
:"r" (block), "r"(line_skip)
317
"movq "MANGLE(ff_pb_80)", %%mm0 \n\t"
318
"lea (%3, %3, 2), %1 \n\t"
319
put_signed_pixels_clamped_mmx_half(0)
320
"lea (%0, %3, 4), %0 \n\t"
321
put_signed_pixels_clamped_mmx_half(64)
322
: "+&r"(pixels), "=&r"(line_skip3)
323
: "r"(block), "r"(line_skip)
326
void ff_add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
327
void ff_add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels,
328
330
const DCTELEM *p;
332
334
/* read the pixels */
339
"movq (%2), %%mm0 \n\t"
340
"movq 8(%2), %%mm1 \n\t"
341
"movq 16(%2), %%mm2 \n\t"
342
"movq 24(%2), %%mm3 \n\t"
343
"movq %0, %%mm4 \n\t"
344
"movq %1, %%mm6 \n\t"
345
"movq %%mm4, %%mm5 \n\t"
346
"punpcklbw %%mm7, %%mm4 \n\t"
347
"punpckhbw %%mm7, %%mm5 \n\t"
348
"paddsw %%mm4, %%mm0 \n\t"
349
"paddsw %%mm5, %%mm1 \n\t"
350
"movq %%mm6, %%mm5 \n\t"
351
"punpcklbw %%mm7, %%mm6 \n\t"
352
"punpckhbw %%mm7, %%mm5 \n\t"
353
"paddsw %%mm6, %%mm2 \n\t"
354
"paddsw %%mm5, %%mm3 \n\t"
355
"packuswb %%mm1, %%mm0 \n\t"
356
"packuswb %%mm3, %%mm2 \n\t"
357
"movq %%mm0, %0 \n\t"
358
"movq %%mm2, %1 \n\t"
359
:"+m"(*pix), "+m"(*(pix+line_size))
341
"movq (%2), %%mm0 \n\t"
342
"movq 8(%2), %%mm1 \n\t"
343
"movq 16(%2), %%mm2 \n\t"
344
"movq 24(%2), %%mm3 \n\t"
345
"movq %0, %%mm4 \n\t"
346
"movq %1, %%mm6 \n\t"
347
"movq %%mm4, %%mm5 \n\t"
348
"punpcklbw %%mm7, %%mm4 \n\t"
349
"punpckhbw %%mm7, %%mm5 \n\t"
350
"paddsw %%mm4, %%mm0 \n\t"
351
"paddsw %%mm5, %%mm1 \n\t"
352
"movq %%mm6, %%mm5 \n\t"
353
"punpcklbw %%mm7, %%mm6 \n\t"
354
"punpckhbw %%mm7, %%mm5 \n\t"
355
"paddsw %%mm6, %%mm2 \n\t"
356
"paddsw %%mm5, %%mm3 \n\t"
357
"packuswb %%mm1, %%mm0 \n\t"
358
"packuswb %%mm3, %%mm2 \n\t"
359
"movq %%mm0, %0 \n\t"
360
"movq %%mm2, %1 \n\t"
361
: "+m"(*pix), "+m"(*(pix + line_size))
364
pix += line_size * 2;
367
static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
370
"lea (%3, %3), %%"REG_a" \n\t"
373
"movd (%1), %%mm0 \n\t"
374
"movd (%1, %3), %%mm1 \n\t"
375
"movd %%mm0, (%2) \n\t"
376
"movd %%mm1, (%2, %3) \n\t"
377
"add %%"REG_a", %1 \n\t"
378
"add %%"REG_a", %2 \n\t"
379
"movd (%1), %%mm0 \n\t"
380
"movd (%1, %3), %%mm1 \n\t"
381
"movd %%mm0, (%2) \n\t"
382
"movd %%mm1, (%2, %3) \n\t"
383
"add %%"REG_a", %1 \n\t"
384
"add %%"REG_a", %2 \n\t"
387
: "+g"(h), "+r" (pixels), "+r" (block)
388
: "r"((x86_reg)line_size)
393
static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
396
"lea (%3, %3), %%"REG_a" \n\t"
399
"movq (%1), %%mm0 \n\t"
400
"movq (%1, %3), %%mm1 \n\t"
401
"movq %%mm0, (%2) \n\t"
402
"movq %%mm1, (%2, %3) \n\t"
403
"add %%"REG_a", %1 \n\t"
404
"add %%"REG_a", %2 \n\t"
405
"movq (%1), %%mm0 \n\t"
406
"movq (%1, %3), %%mm1 \n\t"
407
"movq %%mm0, (%2) \n\t"
408
"movq %%mm1, (%2, %3) \n\t"
409
"add %%"REG_a", %1 \n\t"
410
"add %%"REG_a", %2 \n\t"
413
: "+g"(h), "+r" (pixels), "+r" (block)
414
: "r"((x86_reg)line_size)
419
static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
422
"lea (%3, %3), %%"REG_a" \n\t"
425
"movq (%1), %%mm0 \n\t"
426
"movq 8(%1), %%mm4 \n\t"
427
"movq (%1, %3), %%mm1 \n\t"
428
"movq 8(%1, %3), %%mm5 \n\t"
429
"movq %%mm0, (%2) \n\t"
430
"movq %%mm4, 8(%2) \n\t"
431
"movq %%mm1, (%2, %3) \n\t"
432
"movq %%mm5, 8(%2, %3) \n\t"
433
"add %%"REG_a", %1 \n\t"
434
"add %%"REG_a", %2 \n\t"
435
"movq (%1), %%mm0 \n\t"
436
"movq 8(%1), %%mm4 \n\t"
437
"movq (%1, %3), %%mm1 \n\t"
438
"movq 8(%1, %3), %%mm5 \n\t"
439
"movq %%mm0, (%2) \n\t"
440
"movq %%mm4, 8(%2) \n\t"
441
"movq %%mm1, (%2, %3) \n\t"
442
"movq %%mm5, 8(%2, %3) \n\t"
443
"add %%"REG_a", %1 \n\t"
444
"add %%"REG_a", %2 \n\t"
447
: "+g"(h), "+r" (pixels), "+r" (block)
448
: "r"((x86_reg)line_size)
453
static void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
457
"movdqu (%1), %%xmm0 \n\t"
458
"movdqu (%1,%3), %%xmm1 \n\t"
459
"movdqu (%1,%3,2), %%xmm2 \n\t"
460
"movdqu (%1,%4), %%xmm3 \n\t"
461
"lea (%1,%3,4), %1 \n\t"
462
"movdqa %%xmm0, (%2) \n\t"
463
"movdqa %%xmm1, (%2,%3) \n\t"
464
"movdqa %%xmm2, (%2,%3,2) \n\t"
465
"movdqa %%xmm3, (%2,%4) \n\t"
467
"lea (%2,%3,4), %2 \n\t"
469
: "+g"(h), "+r" (pixels), "+r" (block)
470
: "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size)
475
static void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
479
"movdqu (%1), %%xmm0 \n\t"
480
"movdqu (%1,%3), %%xmm1 \n\t"
481
"movdqu (%1,%3,2), %%xmm2 \n\t"
482
"movdqu (%1,%4), %%xmm3 \n\t"
483
"lea (%1,%3,4), %1 \n\t"
484
"pavgb (%2), %%xmm0 \n\t"
485
"pavgb (%2,%3), %%xmm1 \n\t"
486
"pavgb (%2,%3,2), %%xmm2 \n\t"
487
"pavgb (%2,%4), %%xmm3 \n\t"
488
"movdqa %%xmm0, (%2) \n\t"
489
"movdqa %%xmm1, (%2,%3) \n\t"
490
"movdqa %%xmm2, (%2,%3,2) \n\t"
491
"movdqa %%xmm3, (%2,%4) \n\t"
493
"lea (%2,%3,4), %2 \n\t"
495
: "+g"(h), "+r" (pixels), "+r" (block)
496
: "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size)
501
#define CLEAR_BLOCKS(name,n) \
502
static void name(DCTELEM *blocks)\
505
"pxor %%mm7, %%mm7 \n\t"\
506
"mov %1, %%"REG_a" \n\t"\
508
"movq %%mm7, (%0, %%"REG_a") \n\t"\
509
"movq %%mm7, 8(%0, %%"REG_a") \n\t"\
510
"movq %%mm7, 16(%0, %%"REG_a") \n\t"\
511
"movq %%mm7, 24(%0, %%"REG_a") \n\t"\
512
"add $32, %%"REG_a" \n\t"\
514
: : "r" (((uint8_t *)blocks)+128*n),\
369
static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
370
int line_size, int h)
373
"lea (%3, %3), %%"REG_a" \n\t"
376
"movq (%1 ), %%mm0 \n\t"
377
"movq (%1, %3), %%mm1 \n\t"
378
"movq %%mm0, (%2) \n\t"
379
"movq %%mm1, (%2, %3) \n\t"
380
"add %%"REG_a", %1 \n\t"
381
"add %%"REG_a", %2 \n\t"
382
"movq (%1 ), %%mm0 \n\t"
383
"movq (%1, %3), %%mm1 \n\t"
384
"movq %%mm0, (%2) \n\t"
385
"movq %%mm1, (%2, %3) \n\t"
386
"add %%"REG_a", %1 \n\t"
387
"add %%"REG_a", %2 \n\t"
390
: "+g"(h), "+r"(pixels), "+r"(block)
391
: "r"((x86_reg)line_size)
396
static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
397
int line_size, int h)
400
"lea (%3, %3), %%"REG_a" \n\t"
403
"movq (%1 ), %%mm0 \n\t"
404
"movq 8(%1 ), %%mm4 \n\t"
405
"movq (%1, %3), %%mm1 \n\t"
406
"movq 8(%1, %3), %%mm5 \n\t"
407
"movq %%mm0, (%2) \n\t"
408
"movq %%mm4, 8(%2) \n\t"
409
"movq %%mm1, (%2, %3) \n\t"
410
"movq %%mm5, 8(%2, %3) \n\t"
411
"add %%"REG_a", %1 \n\t"
412
"add %%"REG_a", %2 \n\t"
413
"movq (%1 ), %%mm0 \n\t"
414
"movq 8(%1 ), %%mm4 \n\t"
415
"movq (%1, %3), %%mm1 \n\t"
416
"movq 8(%1, %3), %%mm5 \n\t"
417
"movq %%mm0, (%2) \n\t"
418
"movq %%mm4, 8(%2) \n\t"
419
"movq %%mm1, (%2, %3) \n\t"
420
"movq %%mm5, 8(%2, %3) \n\t"
421
"add %%"REG_a", %1 \n\t"
422
"add %%"REG_a", %2 \n\t"
425
: "+g"(h), "+r"(pixels), "+r"(block)
426
: "r"((x86_reg)line_size)
431
#define CLEAR_BLOCKS(name, n) \
432
static void name(DCTELEM *blocks) \
435
"pxor %%mm7, %%mm7 \n\t" \
436
"mov %1, %%"REG_a" \n\t" \
438
"movq %%mm7, (%0, %%"REG_a") \n\t" \
439
"movq %%mm7, 8(%0, %%"REG_a") \n\t" \
440
"movq %%mm7, 16(%0, %%"REG_a") \n\t" \
441
"movq %%mm7, 24(%0, %%"REG_a") \n\t" \
442
"add $32, %%"REG_a" \n\t" \
444
:: "r"(((uint8_t *)blocks) + 128 * n), \
519
449
/* Instantiate the CLEAR_BLOCKS template twice. The first argument becomes
 * the name of the generated static function taking a DCTELEM pointer; the
 * second (n) scales the 128-byte end offset that the template passes to its
 * asm loop -- 6 for clear_blocks_mmx, 1 for clear_block_mmx. */
CLEAR_BLOCKS(clear_blocks_mmx, 6)
520
450
CLEAR_BLOCKS(clear_block_mmx, 1)
522
452
static void clear_block_sse(DCTELEM *block)
525
"xorps %%xmm0, %%xmm0 \n"
526
"movaps %%xmm0, (%0) \n"
527
"movaps %%xmm0, 16(%0) \n"
528
"movaps %%xmm0, 32(%0) \n"
529
"movaps %%xmm0, 48(%0) \n"
530
"movaps %%xmm0, 64(%0) \n"
531
"movaps %%xmm0, 80(%0) \n"
532
"movaps %%xmm0, 96(%0) \n"
533
"movaps %%xmm0, 112(%0) \n"
455
"xorps %%xmm0, %%xmm0 \n"
456
"movaps %%xmm0, (%0) \n"
457
"movaps %%xmm0, 16(%0) \n"
458
"movaps %%xmm0, 32(%0) \n"
459
"movaps %%xmm0, 48(%0) \n"
460
"movaps %%xmm0, 64(%0) \n"
461
"movaps %%xmm0, 80(%0) \n"
462
"movaps %%xmm0, 96(%0) \n"
463
"movaps %%xmm0, 112(%0) \n"
539
469
static void clear_blocks_sse(DCTELEM *blocks)
542
"xorps %%xmm0, %%xmm0 \n"
543
"mov %1, %%"REG_a" \n"
545
"movaps %%xmm0, (%0, %%"REG_a") \n"
546
"movaps %%xmm0, 16(%0, %%"REG_a") \n"
547
"movaps %%xmm0, 32(%0, %%"REG_a") \n"
548
"movaps %%xmm0, 48(%0, %%"REG_a") \n"
549
"movaps %%xmm0, 64(%0, %%"REG_a") \n"
550
"movaps %%xmm0, 80(%0, %%"REG_a") \n"
551
"movaps %%xmm0, 96(%0, %%"REG_a") \n"
552
"movaps %%xmm0, 112(%0, %%"REG_a") \n"
553
"add $128, %%"REG_a" \n"
555
: : "r" (((uint8_t *)blocks)+128*6),
472
"xorps %%xmm0, %%xmm0 \n"
473
"mov %1, %%"REG_a" \n"
475
"movaps %%xmm0, (%0, %%"REG_a") \n"
476
"movaps %%xmm0, 16(%0, %%"REG_a") \n"
477
"movaps %%xmm0, 32(%0, %%"REG_a") \n"
478
"movaps %%xmm0, 48(%0, %%"REG_a") \n"
479
"movaps %%xmm0, 64(%0, %%"REG_a") \n"
480
"movaps %%xmm0, 80(%0, %%"REG_a") \n"
481
"movaps %%xmm0, 96(%0, %%"REG_a") \n"
482
"movaps %%xmm0, 112(%0, %%"REG_a") \n"
483
"add $128, %%"REG_a" \n"
485
:: "r"(((uint8_t *)blocks) + 128 * 6),
561
static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
566
"movq (%1, %0), %%mm0 \n\t"
567
"movq (%2, %0), %%mm1 \n\t"
568
"paddb %%mm0, %%mm1 \n\t"
569
"movq %%mm1, (%2, %0) \n\t"
570
"movq 8(%1, %0), %%mm0 \n\t"
571
"movq 8(%2, %0), %%mm1 \n\t"
572
"paddb %%mm0, %%mm1 \n\t"
573
"movq %%mm1, 8(%2, %0) \n\t"
579
: "r"(src), "r"(dst), "r"((x86_reg)w-15)
582
dst[i+0] += src[i+0];
585
static void add_bytes_l2_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
590
"movq (%2, %0), %%mm0 \n\t"
491
static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w)
497
"movq (%1, %0), %%mm0 \n\t"
498
"movq (%2, %0), %%mm1 \n\t"
499
"paddb %%mm0, %%mm1 \n\t"
500
"movq %%mm1, (%2, %0) \n\t"
501
"movq 8(%1, %0), %%mm0 \n\t"
591
502
"movq 8(%2, %0), %%mm1 \n\t"
592
"paddb (%3, %0), %%mm0 \n\t"
593
"paddb 8(%3, %0), %%mm1 \n\t"
594
"movq %%mm0, (%1, %0) \n\t"
595
"movq %%mm1, 8(%1, %0) \n\t"
503
"paddb %%mm0, %%mm1 \n\t"
504
"movq %%mm1, 8(%2, %0) \n\t"
601
: "r"(dst), "r"(src1), "r"(src2), "r"((x86_reg)w-15)
510
: "r"(src), "r"(dst), "r"((x86_reg)w - 15)
604
dst[i] = src1[i] + src2[i];
513
dst[i + 0] += src[i + 0];
608
static void add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top) {
517
static void add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top,
518
const uint8_t *diff, int w,
519
int *left, int *left_top)
611
int l = *left & 0xff;
523
int l = *left & 0xff;
612
524
int tl = *left_top & 0xff;
617
"movzbl (%3,%4), %2 \n"
630
"add (%6,%4), %b0 \n"
631
"mov %b0, (%5,%4) \n"
634
:"+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2)
635
:"r"(dst+w), "r"(diff+w), "rm"(top+w)
529
"movzbl (%3, %4), %2 \n"
542
"add (%6, %4), %b0 \n"
543
"mov %b0, (%5, %4) \n"
546
: "+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2)
547
: "r"(dst + w), "r"(diff + w), "rm"(top + w)
642
/*
 * Inline-asm text fragment shared by the H.263 loop-filter functions.
 * It performs no stores: results are left in MMX registers for the
 * embedding asm statement to write back or post-process.
 *
 * Inputs (operand numbers are fixed by the embedding asm statement):
 *   %0..%3  four 8-byte rows, read as memory operands
 *   %4      32-bit value whose low byte is broadcast to all eight bytes of
 *           mm2 (movd + three punpcklbw); the users in this file pass
 *           2 * strength
 *   %5      8-byte mask ANDed in before the final psrlw $2; the users in
 *           this file pass ff_pb_FC
 *
 * Outline of the computation, per byte lane:
 *   1. d = 4 * (%2 - %1) + (%0 - %3), computed on zero-extended words.
 *   2. |d| >> 3 is packed to bytes (mm4); the sign of d is kept as a byte
 *      mask (mm6).
 *   3. A correction magnitude is derived from |d| >> 3 and the broadcast %4
 *      with saturating subtractions (mm2).
 *   4. %1 and %2 are moved by that magnitude in opposite directions, the
 *      direction chosen by the sign mask, with unsigned saturation
 *      (results in mm3 and mm4).
 *   5. (%0 - %3), saturated to signed bytes, is limited in magnitude to
 *      twice the correction, masked with %5, shifted right by 2 and given
 *      its sign back; it is then subtracted from %0 (mm5) and added to
 *      %3 (mm6).
 *
 * Outputs: mm5 = new %0, mm3 = new %1, mm4 = new %2, mm6 = new %3.
 * All of mm0..mm7 are clobbered.
 */
#define H263_LOOP_FILTER \
643
"pxor %%mm7, %%mm7 \n\t"\
644
"movq %0, %%mm0 \n\t"\
645
"movq %0, %%mm1 \n\t"\
646
"movq %3, %%mm2 \n\t"\
647
"movq %3, %%mm3 \n\t"\
648
"punpcklbw %%mm7, %%mm0 \n\t"\
649
"punpckhbw %%mm7, %%mm1 \n\t"\
650
"punpcklbw %%mm7, %%mm2 \n\t"\
651
"punpckhbw %%mm7, %%mm3 \n\t"\
652
"psubw %%mm2, %%mm0 \n\t"\
653
"psubw %%mm3, %%mm1 \n\t"\
654
"movq %1, %%mm2 \n\t"\
655
"movq %1, %%mm3 \n\t"\
656
"movq %2, %%mm4 \n\t"\
657
"movq %2, %%mm5 \n\t"\
658
"punpcklbw %%mm7, %%mm2 \n\t"\
659
"punpckhbw %%mm7, %%mm3 \n\t"\
660
"punpcklbw %%mm7, %%mm4 \n\t"\
661
"punpckhbw %%mm7, %%mm5 \n\t"\
662
"psubw %%mm2, %%mm4 \n\t"\
663
"psubw %%mm3, %%mm5 \n\t"\
664
"psllw $2, %%mm4 \n\t"\
665
"psllw $2, %%mm5 \n\t"\
666
"paddw %%mm0, %%mm4 \n\t"\
667
"paddw %%mm1, %%mm5 \n\t"\
668
"pxor %%mm6, %%mm6 \n\t"\
669
"pcmpgtw %%mm4, %%mm6 \n\t"\
670
"pcmpgtw %%mm5, %%mm7 \n\t"\
671
"pxor %%mm6, %%mm4 \n\t"\
672
"pxor %%mm7, %%mm5 \n\t"\
673
"psubw %%mm6, %%mm4 \n\t"\
674
"psubw %%mm7, %%mm5 \n\t"\
675
"psrlw $3, %%mm4 \n\t"\
676
"psrlw $3, %%mm5 \n\t"\
677
"packuswb %%mm5, %%mm4 \n\t"\
678
"packsswb %%mm7, %%mm6 \n\t"\
679
"pxor %%mm7, %%mm7 \n\t"\
680
"movd %4, %%mm2 \n\t"\
681
"punpcklbw %%mm2, %%mm2 \n\t"\
682
"punpcklbw %%mm2, %%mm2 \n\t"\
683
"punpcklbw %%mm2, %%mm2 \n\t"\
684
"psubusb %%mm4, %%mm2 \n\t"\
685
"movq %%mm2, %%mm3 \n\t"\
686
"psubusb %%mm4, %%mm3 \n\t"\
687
"psubb %%mm3, %%mm2 \n\t"\
688
"movq %1, %%mm3 \n\t"\
689
"movq %2, %%mm4 \n\t"\
690
"pxor %%mm6, %%mm3 \n\t"\
691
"pxor %%mm6, %%mm4 \n\t"\
692
"paddusb %%mm2, %%mm3 \n\t"\
693
"psubusb %%mm2, %%mm4 \n\t"\
694
"pxor %%mm6, %%mm3 \n\t"\
695
"pxor %%mm6, %%mm4 \n\t"\
696
"paddusb %%mm2, %%mm2 \n\t"\
697
"packsswb %%mm1, %%mm0 \n\t"\
698
"pcmpgtb %%mm0, %%mm7 \n\t"\
699
"pxor %%mm7, %%mm0 \n\t"\
700
"psubb %%mm7, %%mm0 \n\t"\
701
"movq %%mm0, %%mm1 \n\t"\
702
"psubusb %%mm2, %%mm0 \n\t"\
703
"psubb %%mm0, %%mm1 \n\t"\
704
"pand %5, %%mm1 \n\t"\
705
"psrlw $2, %%mm1 \n\t"\
706
"pxor %%mm7, %%mm1 \n\t"\
707
"psubb %%mm7, %%mm1 \n\t"\
708
"movq %0, %%mm5 \n\t"\
709
"movq %3, %%mm6 \n\t"\
710
"psubb %%mm1, %%mm5 \n\t"\
711
"paddb %%mm1, %%mm6 \n\t"
713
static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){
714
if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
715
const int strength= ff_h263_loop_filter_strength[qscale];
721
"movq %%mm3, %1 \n\t"
722
"movq %%mm4, %2 \n\t"
723
"movq %%mm5, %0 \n\t"
724
"movq %%mm6, %3 \n\t"
725
: "+m" (*(uint64_t*)(src - 2*stride)),
726
"+m" (*(uint64_t*)(src - 1*stride)),
727
"+m" (*(uint64_t*)(src + 0*stride)),
728
"+m" (*(uint64_t*)(src + 1*stride))
729
: "g" (2*strength), "m"(ff_pb_FC)
734
static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){
735
if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
736
const int strength= ff_h263_loop_filter_strength[qscale];
737
DECLARE_ALIGNED(8, uint64_t, temp)[4];
738
uint8_t *btemp= (uint8_t*)temp;
742
transpose4x4(btemp , src , 8, stride);
743
transpose4x4(btemp+4, src + 4*stride, 8, stride);
745
H263_LOOP_FILTER // 5 3 4 6
751
: "g" (2*strength), "m"(ff_pb_FC)
755
"movq %%mm5, %%mm1 \n\t"
756
"movq %%mm4, %%mm0 \n\t"
757
"punpcklbw %%mm3, %%mm5 \n\t"
758
"punpcklbw %%mm6, %%mm4 \n\t"
759
"punpckhbw %%mm3, %%mm1 \n\t"
760
"punpckhbw %%mm6, %%mm0 \n\t"
761
"movq %%mm5, %%mm3 \n\t"
762
"movq %%mm1, %%mm6 \n\t"
763
"punpcklwd %%mm4, %%mm5 \n\t"
764
"punpcklwd %%mm0, %%mm1 \n\t"
765
"punpckhwd %%mm4, %%mm3 \n\t"
766
"punpckhwd %%mm0, %%mm6 \n\t"
767
"movd %%mm5, (%0) \n\t"
768
"punpckhdq %%mm5, %%mm5 \n\t"
769
"movd %%mm5, (%0,%2) \n\t"
770
"movd %%mm3, (%0,%2,2) \n\t"
771
"punpckhdq %%mm3, %%mm3 \n\t"
772
"movd %%mm3, (%0,%3) \n\t"
773
"movd %%mm1, (%1) \n\t"
554
static inline void transpose4x4(uint8_t *dst, uint8_t *src, x86_reg dst_stride, x86_reg src_stride){
555
__asm__ volatile( //FIXME could save 1 instruction if done as 8x4 ...
556
"movd (%1), %%mm0 \n\t"
558
"movd (%1), %%mm1 \n\t"
559
"movd (%1,%3,1), %%mm2 \n\t"
560
"movd (%1,%3,2), %%mm3 \n\t"
561
"punpcklbw %%mm1, %%mm0 \n\t"
562
"punpcklbw %%mm3, %%mm2 \n\t"
563
"movq %%mm0, %%mm1 \n\t"
564
"punpcklwd %%mm2, %%mm0 \n\t"
565
"punpckhwd %%mm2, %%mm1 \n\t"
566
"movd %%mm0, (%0) \n\t"
568
"punpckhdq %%mm0, %%mm0 \n\t"
569
"movd %%mm0, (%0) \n\t"
570
"movd %%mm1, (%0,%2,1) \n\t"
774
571
"punpckhdq %%mm1, %%mm1 \n\t"
775
"movd %%mm1, (%1,%2) \n\t"
776
"movd %%mm6, (%1,%2,2) \n\t"
777
"punpckhdq %%mm6, %%mm6 \n\t"
778
"movd %%mm6, (%1,%3) \n\t"
780
"r" (src + 4*stride),
781
"r" ((x86_reg) stride ),
782
"r" ((x86_reg)(3*stride))
572
"movd %%mm1, (%0,%2,2) \n\t"
787
/* draw the edges of width 'w' of an image of size width, height
788
this mmx version can only handle w==8 || w==16 */
789
static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w, int h, int sides)
582
/*
 * Inline-asm text fragment shared by the H.263 loop-filter functions.
 * It performs no stores: results are left in MMX registers for the
 * embedding asm statement to write back or post-process.
 *
 * Inputs (operand numbers are fixed by the embedding asm statement):
 *   %0..%3  four 8-byte rows, read as memory operands
 *   %4      32-bit value whose low byte is broadcast to all eight bytes of
 *           mm2 (movd + three punpcklbw); the users in this file pass
 *           2 * strength
 *   %5      8-byte mask ANDed in before the final psrlw $2; the users in
 *           this file pass ff_pb_FC
 *
 * Outline of the computation, per byte lane:
 *   1. d = 4 * (%2 - %1) + (%0 - %3), computed on zero-extended words.
 *   2. |d| >> 3 is packed to bytes (mm4); the sign of d is kept as a byte
 *      mask (mm6).
 *   3. A correction magnitude is derived from |d| >> 3 and the broadcast %4
 *      with saturating subtractions (mm2).
 *   4. %1 and %2 are moved by that magnitude in opposite directions, the
 *      direction chosen by the sign mask, with unsigned saturation
 *      (results in mm3 and mm4).
 *   5. (%0 - %3), saturated to signed bytes, is limited in magnitude to
 *      twice the correction, masked with %5, shifted right by 2 and given
 *      its sign back; it is then subtracted from %0 (mm5) and added to
 *      %3 (mm6).
 *
 * Outputs: mm5 = new %0, mm3 = new %1, mm4 = new %2, mm6 = new %3.
 * All of mm0..mm7 are clobbered.
 */
#define H263_LOOP_FILTER \
583
"pxor %%mm7, %%mm7 \n\t" \
584
"movq %0, %%mm0 \n\t" \
585
"movq %0, %%mm1 \n\t" \
586
"movq %3, %%mm2 \n\t" \
587
"movq %3, %%mm3 \n\t" \
588
"punpcklbw %%mm7, %%mm0 \n\t" \
589
"punpckhbw %%mm7, %%mm1 \n\t" \
590
"punpcklbw %%mm7, %%mm2 \n\t" \
591
"punpckhbw %%mm7, %%mm3 \n\t" \
592
"psubw %%mm2, %%mm0 \n\t" \
593
"psubw %%mm3, %%mm1 \n\t" \
594
"movq %1, %%mm2 \n\t" \
595
"movq %1, %%mm3 \n\t" \
596
"movq %2, %%mm4 \n\t" \
597
"movq %2, %%mm5 \n\t" \
598
"punpcklbw %%mm7, %%mm2 \n\t" \
599
"punpckhbw %%mm7, %%mm3 \n\t" \
600
"punpcklbw %%mm7, %%mm4 \n\t" \
601
"punpckhbw %%mm7, %%mm5 \n\t" \
602
"psubw %%mm2, %%mm4 \n\t" \
603
"psubw %%mm3, %%mm5 \n\t" \
604
"psllw $2, %%mm4 \n\t" \
605
"psllw $2, %%mm5 \n\t" \
606
"paddw %%mm0, %%mm4 \n\t" \
607
"paddw %%mm1, %%mm5 \n\t" \
608
"pxor %%mm6, %%mm6 \n\t" \
609
"pcmpgtw %%mm4, %%mm6 \n\t" \
610
"pcmpgtw %%mm5, %%mm7 \n\t" \
611
"pxor %%mm6, %%mm4 \n\t" \
612
"pxor %%mm7, %%mm5 \n\t" \
613
"psubw %%mm6, %%mm4 \n\t" \
614
"psubw %%mm7, %%mm5 \n\t" \
615
"psrlw $3, %%mm4 \n\t" \
616
"psrlw $3, %%mm5 \n\t" \
617
"packuswb %%mm5, %%mm4 \n\t" \
618
"packsswb %%mm7, %%mm6 \n\t" \
619
"pxor %%mm7, %%mm7 \n\t" \
620
"movd %4, %%mm2 \n\t" \
621
"punpcklbw %%mm2, %%mm2 \n\t" \
622
"punpcklbw %%mm2, %%mm2 \n\t" \
623
"punpcklbw %%mm2, %%mm2 \n\t" \
624
"psubusb %%mm4, %%mm2 \n\t" \
625
"movq %%mm2, %%mm3 \n\t" \
626
"psubusb %%mm4, %%mm3 \n\t" \
627
"psubb %%mm3, %%mm2 \n\t" \
628
"movq %1, %%mm3 \n\t" \
629
"movq %2, %%mm4 \n\t" \
630
"pxor %%mm6, %%mm3 \n\t" \
631
"pxor %%mm6, %%mm4 \n\t" \
632
"paddusb %%mm2, %%mm3 \n\t" \
633
"psubusb %%mm2, %%mm4 \n\t" \
634
"pxor %%mm6, %%mm3 \n\t" \
635
"pxor %%mm6, %%mm4 \n\t" \
636
"paddusb %%mm2, %%mm2 \n\t" \
637
"packsswb %%mm1, %%mm0 \n\t" \
638
"pcmpgtb %%mm0, %%mm7 \n\t" \
639
"pxor %%mm7, %%mm0 \n\t" \
640
"psubb %%mm7, %%mm0 \n\t" \
641
"movq %%mm0, %%mm1 \n\t" \
642
"psubusb %%mm2, %%mm0 \n\t" \
643
"psubb %%mm0, %%mm1 \n\t" \
644
"pand %5, %%mm1 \n\t" \
645
"psrlw $2, %%mm1 \n\t" \
646
"pxor %%mm7, %%mm1 \n\t" \
647
"psubb %%mm7, %%mm1 \n\t" \
648
"movq %0, %%mm5 \n\t" \
649
"movq %3, %%mm6 \n\t" \
650
"psubb %%mm1, %%mm5 \n\t" \
651
"paddb %%mm1, %%mm6 \n\t"
653
static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale)
655
if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
656
const int strength = ff_h263_loop_filter_strength[qscale];
661
"movq %%mm3, %1 \n\t"
662
"movq %%mm4, %2 \n\t"
663
"movq %%mm5, %0 \n\t"
664
"movq %%mm6, %3 \n\t"
665
: "+m"(*(uint64_t*)(src - 2 * stride)),
666
"+m"(*(uint64_t*)(src - 1 * stride)),
667
"+m"(*(uint64_t*)(src + 0 * stride)),
668
"+m"(*(uint64_t*)(src + 1 * stride))
669
: "g"(2 * strength), "m"(ff_pb_FC)
674
static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale)
676
if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
677
const int strength = ff_h263_loop_filter_strength[qscale];
678
DECLARE_ALIGNED(8, uint64_t, temp)[4];
679
uint8_t *btemp = (uint8_t*)temp;
683
transpose4x4(btemp, src, 8, stride);
684
transpose4x4(btemp + 4, src + 4 * stride, 8, stride);
686
H263_LOOP_FILTER // 5 3 4 6
692
: "g"(2 * strength), "m"(ff_pb_FC)
696
"movq %%mm5, %%mm1 \n\t"
697
"movq %%mm4, %%mm0 \n\t"
698
"punpcklbw %%mm3, %%mm5 \n\t"
699
"punpcklbw %%mm6, %%mm4 \n\t"
700
"punpckhbw %%mm3, %%mm1 \n\t"
701
"punpckhbw %%mm6, %%mm0 \n\t"
702
"movq %%mm5, %%mm3 \n\t"
703
"movq %%mm1, %%mm6 \n\t"
704
"punpcklwd %%mm4, %%mm5 \n\t"
705
"punpcklwd %%mm0, %%mm1 \n\t"
706
"punpckhwd %%mm4, %%mm3 \n\t"
707
"punpckhwd %%mm0, %%mm6 \n\t"
708
"movd %%mm5, (%0) \n\t"
709
"punpckhdq %%mm5, %%mm5 \n\t"
710
"movd %%mm5, (%0, %2) \n\t"
711
"movd %%mm3, (%0, %2, 2) \n\t"
712
"punpckhdq %%mm3, %%mm3 \n\t"
713
"movd %%mm3, (%0, %3) \n\t"
714
"movd %%mm1, (%1) \n\t"
715
"punpckhdq %%mm1, %%mm1 \n\t"
716
"movd %%mm1, (%1, %2) \n\t"
717
"movd %%mm6, (%1, %2, 2) \n\t"
718
"punpckhdq %%mm6, %%mm6 \n\t"
719
"movd %%mm6, (%1, %3) \n\t"
721
"r"(src + 4 * stride),
722
"r"((x86_reg)stride),
723
"r"((x86_reg)(3 * stride))
728
/* Draw the edges of width 'w' of an image of size width, height
729
* this MMX version can only handle w == 8 || w == 16. */
730
static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
731
int w, int h, int sides)
791
733
uint8_t *ptr, *last_line;
794
736
last_line = buf + (height - 1) * wrap;
795
737
/* left and right */
801
"movd (%0), %%mm0 \n\t"
802
"punpcklbw %%mm0, %%mm0 \n\t"
803
"punpcklwd %%mm0, %%mm0 \n\t"
804
"punpckldq %%mm0, %%mm0 \n\t"
805
"movq %%mm0, -8(%0) \n\t"
806
"movq -8(%0, %2), %%mm1 \n\t"
807
"punpckhbw %%mm1, %%mm1 \n\t"
808
"punpckhwd %%mm1, %%mm1 \n\t"
809
"punpckhdq %%mm1, %%mm1 \n\t"
810
"movq %%mm1, (%0, %2) \n\t"
815
: "r" ((x86_reg)wrap), "r" ((x86_reg)width), "r" (ptr + wrap*height)
822
"movd (%0), %%mm0 \n\t"
823
"punpcklbw %%mm0, %%mm0 \n\t"
824
"punpcklwd %%mm0, %%mm0 \n\t"
825
"punpckldq %%mm0, %%mm0 \n\t"
826
"movq %%mm0, -8(%0) \n\t"
827
"movq %%mm0, -16(%0) \n\t"
828
"movq -8(%0, %2), %%mm1 \n\t"
829
"punpckhbw %%mm1, %%mm1 \n\t"
830
"punpckhwd %%mm1, %%mm1 \n\t"
831
"punpckhdq %%mm1, %%mm1 \n\t"
832
"movq %%mm1, (%0, %2) \n\t"
833
"movq %%mm1, 8(%0, %2) \n\t"
838
: "r" ((x86_reg)wrap), "r" ((x86_reg)width), "r" (ptr + wrap*height)
742
"movd (%0), %%mm0 \n\t"
743
"punpcklbw %%mm0, %%mm0 \n\t"
744
"punpcklwd %%mm0, %%mm0 \n\t"
745
"punpckldq %%mm0, %%mm0 \n\t"
746
"movq %%mm0, -8(%0) \n\t"
747
"movq -8(%0, %2), %%mm1 \n\t"
748
"punpckhbw %%mm1, %%mm1 \n\t"
749
"punpckhwd %%mm1, %%mm1 \n\t"
750
"punpckhdq %%mm1, %%mm1 \n\t"
751
"movq %%mm1, (%0, %2) \n\t"
756
: "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
761
"movd (%0), %%mm0 \n\t"
762
"punpcklbw %%mm0, %%mm0 \n\t"
763
"punpcklwd %%mm0, %%mm0 \n\t"
764
"punpckldq %%mm0, %%mm0 \n\t"
765
"movq %%mm0, -8(%0) \n\t"
766
"movq %%mm0, -16(%0) \n\t"
767
"movq -8(%0, %2), %%mm1 \n\t"
768
"punpckhbw %%mm1, %%mm1 \n\t"
769
"punpckhwd %%mm1, %%mm1 \n\t"
770
"punpckhdq %%mm1, %%mm1 \n\t"
771
"movq %%mm1, (%0, %2) \n\t"
772
"movq %%mm1, 8(%0, %2) \n\t"
777
: "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
842
781
/* top and bottom (and hopefully also the corners) */
843
if (sides&EDGE_TOP) {
844
for(i = 0; i < h; i += 4) {
845
ptr= buf - (i + 1) * wrap - w;
848
"movq (%1, %0), %%mm0 \n\t"
849
"movq %%mm0, (%0) \n\t"
850
"movq %%mm0, (%0, %2) \n\t"
851
"movq %%mm0, (%0, %2, 2) \n\t"
852
"movq %%mm0, (%0, %3) \n\t"
857
: "r" ((x86_reg)buf - (x86_reg)ptr - w), "r" ((x86_reg)-wrap), "r" ((x86_reg)-wrap*3), "r" (ptr+width+2*w)
862
if (sides&EDGE_BOTTOM) {
863
for(i = 0; i < w; i += 4) {
864
ptr= last_line + (i + 1) * wrap - w;
867
"movq (%1, %0), %%mm0 \n\t"
868
"movq %%mm0, (%0) \n\t"
869
"movq %%mm0, (%0, %2) \n\t"
870
"movq %%mm0, (%0, %2, 2) \n\t"
871
"movq %%mm0, (%0, %3) \n\t"
876
: "r" ((x86_reg)last_line - (x86_reg)ptr - w), "r" ((x86_reg)wrap), "r" ((x86_reg)wrap*3), "r" (ptr+width+2*w)
882
#define PAETH(cpu, abs3)\
883
static void add_png_paeth_prediction_##cpu(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp)\
888
"pxor %%mm7, %%mm7 \n"\
889
"movd (%1,%0), %%mm0 \n"\
890
"movd (%2,%0), %%mm1 \n"\
891
"punpcklbw %%mm7, %%mm0 \n"\
892
"punpcklbw %%mm7, %%mm1 \n"\
895
"movq %%mm1, %%mm2 \n"\
896
"movd (%2,%0), %%mm1 \n"\
897
"movq %%mm2, %%mm3 \n"\
898
"punpcklbw %%mm7, %%mm1 \n"\
899
"movq %%mm2, %%mm4 \n"\
900
"psubw %%mm1, %%mm3 \n"\
901
"psubw %%mm0, %%mm4 \n"\
902
"movq %%mm3, %%mm5 \n"\
903
"paddw %%mm4, %%mm5 \n"\
905
"movq %%mm4, %%mm6 \n"\
906
"pminsw %%mm5, %%mm6 \n"\
907
"pcmpgtw %%mm6, %%mm3 \n"\
908
"pcmpgtw %%mm5, %%mm4 \n"\
909
"movq %%mm4, %%mm6 \n"\
910
"pand %%mm3, %%mm4 \n"\
911
"pandn %%mm3, %%mm6 \n"\
912
"pandn %%mm0, %%mm3 \n"\
913
"movd (%3,%0), %%mm0 \n"\
914
"pand %%mm1, %%mm6 \n"\
915
"pand %%mm4, %%mm2 \n"\
916
"punpcklbw %%mm7, %%mm0 \n"\
918
"paddw %%mm6, %%mm0 \n"\
919
"paddw %%mm2, %%mm3 \n"\
920
"paddw %%mm3, %%mm0 \n"\
921
"pand %%mm5, %%mm0 \n"\
922
"movq %%mm0, %%mm3 \n"\
923
"packuswb %%mm3, %%mm3 \n"\
924
"movd %%mm3, (%1,%0) \n"\
929
:"r"(dst), "r"(top), "r"(src), "r"((x86_reg)bpp), "g"(end),\
936
"psubw %%mm5, %%mm7 \n"\
937
"pmaxsw %%mm7, %%mm5 \n"\
938
"pxor %%mm6, %%mm6 \n"\
939
"pxor %%mm7, %%mm7 \n"\
940
"psubw %%mm3, %%mm6 \n"\
941
"psubw %%mm4, %%mm7 \n"\
942
"pmaxsw %%mm6, %%mm3 \n"\
943
"pmaxsw %%mm7, %%mm4 \n"\
944
"pxor %%mm7, %%mm7 \n"
947
"pabsw %%mm3, %%mm3 \n"\
948
"pabsw %%mm4, %%mm4 \n"\
949
"pabsw %%mm5, %%mm5 \n"
951
/* Instantiate the PAETH template twice. The first argument is pasted into
 * the generated function name (add_png_paeth_prediction_<cpu>); the second
 * is handed to the template as its abs3 parameter. */
PAETH(mmx2, ABS3_MMX2)
953
PAETH(ssse3, ABS3_SSSE3)
956
/*
 * Inline-asm text fragment producing one output of the vertical qpel
 * low-pass filter.
 *
 *   m3, m4, m5, m6   register operands; m3 is overwritten, m4..m6 are only
 *                    read
 *   in0, in1, in2,   source operands loaded with movq
 *   in7
 *   rnd              rounding term added before the shift
 *   out              destination, forwarded to OP
 *   OP               store macro, invoked as OP(%%mm5, out, %%mm7, d)
 *   pw_20, pw_3      NOTE(review): not referenced by the expansion -- the
 *                    constants are taken from MANGLE(ff_pw_20) and
 *                    MANGLE(ff_pw_3) directly
 *
 * With x1 = m3 + m4, x2 = in2 + m5, x3 = in1 + m6, x4 = in0 + in7 the
 * fragment computes (20*x1 - 6*x2 + 3*x3 - x4 + rnd) >> 5 (arithmetic
 * shift) on 16-bit lanes, packs it to bytes with unsigned saturation in
 * mm5 and hands mm5 to OP. Clobbers mm4, mm5, mm6 and m3.
 */
#define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\
957
"paddw " #m4 ", " #m3 " \n\t" /* x1 */\
958
"movq "MANGLE(ff_pw_20)", %%mm4 \n\t" /* 20 */\
959
"pmullw " #m3 ", %%mm4 \n\t" /* 20x1 */\
960
"movq "#in7", " #m3 " \n\t" /* d */\
961
"movq "#in0", %%mm5 \n\t" /* D */\
962
"paddw " #m3 ", %%mm5 \n\t" /* x4 */\
963
"psubw %%mm5, %%mm4 \n\t" /* 20x1 - x4 */\
964
"movq "#in1", %%mm5 \n\t" /* C */\
965
"movq "#in2", %%mm6 \n\t" /* B */\
966
"paddw " #m6 ", %%mm5 \n\t" /* x3 */\
967
"paddw " #m5 ", %%mm6 \n\t" /* x2 */\
968
"paddw %%mm6, %%mm6 \n\t" /* 2x2 */\
969
"psubw %%mm6, %%mm5 \n\t" /* -2x2 + x3 */\
970
"pmullw "MANGLE(ff_pw_3)", %%mm5 \n\t" /* -6x2 + 3x3 */\
971
"paddw " #rnd ", %%mm4 \n\t" /* 20x1 - x4 + rnd */\
972
"paddw %%mm4, %%mm5 \n\t" /* 20x1 - 6x2 + 3x3 - x4 + rnd */\
973
"psraw $5, %%mm5 \n\t"\
974
"packuswb %%mm5, %%mm5 \n\t"\
975
OP(%%mm5, out, %%mm7, d)
977
#define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\
978
static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
982
"pxor %%mm7, %%mm7 \n\t"\
984
"movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
985
"movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
986
"movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
987
"punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
988
"punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
989
"pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
990
"pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
991
"movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
992
"movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
993
"psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
994
"psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
995
"psllq $24, %%mm4 \n\t" /* 000ABCDE */\
996
"punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
997
"punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
998
"punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
999
"paddw %%mm3, %%mm5 \n\t" /* b */\
1000
"paddw %%mm2, %%mm6 \n\t" /* c */\
1001
"paddw %%mm5, %%mm5 \n\t" /* 2b */\
1002
"psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
1003
"pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
1004
"pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
1005
"paddw %%mm4, %%mm0 \n\t" /* a */\
1006
"paddw %%mm1, %%mm5 \n\t" /* d */\
1007
"pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
1008
"psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
1009
"paddw %6, %%mm6 \n\t"\
1010
"paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
1011
"psraw $5, %%mm0 \n\t"\
1012
"movq %%mm0, %5 \n\t"\
1013
/* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
1015
"movq 5(%0), %%mm0 \n\t" /* FGHIJKLM */\
1016
"movq %%mm0, %%mm5 \n\t" /* FGHIJKLM */\
1017
"movq %%mm0, %%mm6 \n\t" /* FGHIJKLM */\
1018
"psrlq $8, %%mm0 \n\t" /* GHIJKLM0 */\
1019
"psrlq $16, %%mm5 \n\t" /* HIJKLM00 */\
1020
"punpcklbw %%mm7, %%mm0 \n\t" /* 0G0H0I0J */\
1021
"punpcklbw %%mm7, %%mm5 \n\t" /* 0H0I0J0K */\
1022
"paddw %%mm0, %%mm2 \n\t" /* b */\
1023
"paddw %%mm5, %%mm3 \n\t" /* c */\
1024
"paddw %%mm2, %%mm2 \n\t" /* 2b */\
1025
"psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
1026
"movq %%mm6, %%mm2 \n\t" /* FGHIJKLM */\
1027
"psrlq $24, %%mm6 \n\t" /* IJKLM000 */\
1028
"punpcklbw %%mm7, %%mm2 \n\t" /* 0F0G0H0I */\
1029
"punpcklbw %%mm7, %%mm6 \n\t" /* 0I0J0K0L */\
1030
"pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
1031
"paddw %%mm2, %%mm1 \n\t" /* a */\
1032
"paddw %%mm6, %%mm4 \n\t" /* d */\
1033
"pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
1034
"psubw %%mm4, %%mm3 \n\t" /* - 6b +3c - d */\
1035
"paddw %6, %%mm1 \n\t"\
1036
"paddw %%mm1, %%mm3 \n\t" /* 20a - 6b +3c - d */\
1037
"psraw $5, %%mm3 \n\t"\
1038
"movq %5, %%mm1 \n\t"\
1039
"packuswb %%mm3, %%mm1 \n\t"\
1040
OP_MMX2(%%mm1, (%1),%%mm4, q)\
1041
/* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\
1043
"movq 9(%0), %%mm1 \n\t" /* JKLMNOPQ */\
1044
"movq %%mm1, %%mm4 \n\t" /* JKLMNOPQ */\
1045
"movq %%mm1, %%mm3 \n\t" /* JKLMNOPQ */\
1046
"psrlq $8, %%mm1 \n\t" /* KLMNOPQ0 */\
1047
"psrlq $16, %%mm4 \n\t" /* LMNOPQ00 */\
1048
"punpcklbw %%mm7, %%mm1 \n\t" /* 0K0L0M0N */\
1049
"punpcklbw %%mm7, %%mm4 \n\t" /* 0L0M0N0O */\
1050
"paddw %%mm1, %%mm5 \n\t" /* b */\
1051
"paddw %%mm4, %%mm0 \n\t" /* c */\
1052
"paddw %%mm5, %%mm5 \n\t" /* 2b */\
1053
"psubw %%mm5, %%mm0 \n\t" /* c - 2b */\
1054
"movq %%mm3, %%mm5 \n\t" /* JKLMNOPQ */\
1055
"psrlq $24, %%mm3 \n\t" /* MNOPQ000 */\
1056
"pmullw "MANGLE(ff_pw_3)", %%mm0 \n\t" /* 3c - 6b */\
1057
"punpcklbw %%mm7, %%mm3 \n\t" /* 0M0N0O0P */\
1058
"paddw %%mm3, %%mm2 \n\t" /* d */\
1059
"psubw %%mm2, %%mm0 \n\t" /* -6b + 3c - d */\
1060
"movq %%mm5, %%mm2 \n\t" /* JKLMNOPQ */\
1061
"punpcklbw %%mm7, %%mm2 \n\t" /* 0J0K0L0M */\
1062
"punpckhbw %%mm7, %%mm5 \n\t" /* 0N0O0P0Q */\
1063
"paddw %%mm2, %%mm6 \n\t" /* a */\
1064
"pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */\
1065
"paddw %6, %%mm0 \n\t"\
1066
"paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
1067
"psraw $5, %%mm0 \n\t"\
1068
/* mm1=KLMN, mm2=JKLM, mm3=MNOP, mm4=LMNO, mm5=NOPQ mm7=0 */\
1070
"paddw %%mm5, %%mm3 \n\t" /* a */\
1071
"pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0O0P0Q0Q */\
1072
"paddw %%mm4, %%mm6 \n\t" /* b */\
1073
"pshufw $0xBE, %%mm5, %%mm4 \n\t" /* 0P0Q0Q0P */\
1074
"pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0Q0Q0P0O */\
1075
"paddw %%mm1, %%mm4 \n\t" /* c */\
1076
"paddw %%mm2, %%mm5 \n\t" /* d */\
1077
"paddw %%mm6, %%mm6 \n\t" /* 2b */\
1078
"psubw %%mm6, %%mm4 \n\t" /* c - 2b */\
1079
"pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */\
1080
"pmullw "MANGLE(ff_pw_3)", %%mm4 \n\t" /* 3c - 6b */\
1081
"psubw %%mm5, %%mm3 \n\t" /* -6b + 3c - d */\
1082
"paddw %6, %%mm4 \n\t"\
1083
"paddw %%mm3, %%mm4 \n\t" /* 20a - 6b + 3c - d */\
1084
"psraw $5, %%mm4 \n\t"\
1085
"packuswb %%mm4, %%mm0 \n\t"\
1086
OP_MMX2(%%mm0, 8(%1), %%mm4, q)\
1092
: "+a"(src), "+c"(dst), "+D"(h)\
1093
: "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
1098
static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1101
/* quick HACK, XXX FIXME MUST be optimized */\
1104
temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
1105
temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
1106
temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
1107
temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
1108
temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
1109
temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]);\
1110
temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]);\
1111
temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]);\
1112
temp[ 8]= (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]);\
1113
temp[ 9]= (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]);\
1114
temp[10]= (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]);\
1115
temp[11]= (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]);\
1116
temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\
1117
temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]);\
1118
temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\
1119
temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\
1121
"movq (%0), %%mm0 \n\t"\
1122
"movq 8(%0), %%mm1 \n\t"\
1123
"paddw %2, %%mm0 \n\t"\
1124
"paddw %2, %%mm1 \n\t"\
1125
"psraw $5, %%mm0 \n\t"\
1126
"psraw $5, %%mm1 \n\t"\
1127
"packuswb %%mm1, %%mm0 \n\t"\
1128
OP_3DNOW(%%mm0, (%1), %%mm1, q)\
1129
"movq 16(%0), %%mm0 \n\t"\
1130
"movq 24(%0), %%mm1 \n\t"\
1131
"paddw %2, %%mm0 \n\t"\
1132
"paddw %2, %%mm1 \n\t"\
1133
"psraw $5, %%mm0 \n\t"\
1134
"psraw $5, %%mm1 \n\t"\
1135
"packuswb %%mm1, %%mm0 \n\t"\
1136
OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\
1137
:: "r"(temp), "r"(dst), "m"(ROUNDER)\
1145
static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1147
"pxor %%mm7, %%mm7 \n\t"\
1149
"movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
1150
"movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
1151
"movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
1152
"punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
1153
"punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
1154
"pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
1155
"pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
1156
"movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
1157
"movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
1158
"psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
1159
"psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
1160
"psllq $24, %%mm4 \n\t" /* 000ABCDE */\
1161
"punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
1162
"punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
1163
"punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
1164
"paddw %%mm3, %%mm5 \n\t" /* b */\
1165
"paddw %%mm2, %%mm6 \n\t" /* c */\
1166
"paddw %%mm5, %%mm5 \n\t" /* 2b */\
1167
"psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
1168
"pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
1169
"pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
1170
"paddw %%mm4, %%mm0 \n\t" /* a */\
1171
"paddw %%mm1, %%mm5 \n\t" /* d */\
1172
"pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
1173
"psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
1174
"paddw %5, %%mm6 \n\t"\
1175
"paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
1176
"psraw $5, %%mm0 \n\t"\
1177
/* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
1179
"movd 5(%0), %%mm5 \n\t" /* FGHI */\
1180
"punpcklbw %%mm7, %%mm5 \n\t" /* 0F0G0H0I */\
1181
"pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0G0H0I0I */\
1182
"paddw %%mm5, %%mm1 \n\t" /* a */\
1183
"paddw %%mm6, %%mm2 \n\t" /* b */\
1184
"pshufw $0xBE, %%mm5, %%mm6 \n\t" /* 0H0I0I0H */\
1185
"pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0I0I0H0G */\
1186
"paddw %%mm6, %%mm3 \n\t" /* c */\
1187
"paddw %%mm5, %%mm4 \n\t" /* d */\
1188
"paddw %%mm2, %%mm2 \n\t" /* 2b */\
1189
"psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
1190
"pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
1191
"pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
1192
"psubw %%mm4, %%mm3 \n\t" /* -6b + 3c - d */\
1193
"paddw %5, %%mm1 \n\t"\
1194
"paddw %%mm1, %%mm3 \n\t" /* 20a - 6b + 3c - d */\
1195
"psraw $5, %%mm3 \n\t"\
1196
"packuswb %%mm3, %%mm0 \n\t"\
1197
OP_MMX2(%%mm0, (%1), %%mm4, q)\
1203
: "+a"(src), "+c"(dst), "+d"(h)\
1204
: "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER)\
1209
static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1212
/* quick HACK, XXX FIXME MUST be optimized */\
1215
temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
1216
temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
1217
temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
1218
temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
1219
temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
1220
temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\
1221
temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\
1222
temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\
1224
"movq (%0), %%mm0 \n\t"\
1225
"movq 8(%0), %%mm1 \n\t"\
1226
"paddw %2, %%mm0 \n\t"\
1227
"paddw %2, %%mm1 \n\t"\
1228
"psraw $5, %%mm0 \n\t"\
1229
"psraw $5, %%mm1 \n\t"\
1230
"packuswb %%mm1, %%mm0 \n\t"\
1231
OP_3DNOW(%%mm0, (%1), %%mm1, q)\
1232
:: "r"(temp), "r"(dst), "m"(ROUNDER)\
1240
#define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\
1242
static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1243
uint64_t temp[17*4];\
1244
uint64_t *temp_ptr= temp;\
1249
"pxor %%mm7, %%mm7 \n\t"\
1251
"movq (%0), %%mm0 \n\t"\
1252
"movq (%0), %%mm1 \n\t"\
1253
"movq 8(%0), %%mm2 \n\t"\
1254
"movq 8(%0), %%mm3 \n\t"\
1255
"punpcklbw %%mm7, %%mm0 \n\t"\
1256
"punpckhbw %%mm7, %%mm1 \n\t"\
1257
"punpcklbw %%mm7, %%mm2 \n\t"\
1258
"punpckhbw %%mm7, %%mm3 \n\t"\
1259
"movq %%mm0, (%1) \n\t"\
1260
"movq %%mm1, 17*8(%1) \n\t"\
1261
"movq %%mm2, 2*17*8(%1) \n\t"\
1262
"movq %%mm3, 3*17*8(%1) \n\t"\
1267
: "+r" (src), "+r" (temp_ptr), "+r"(count)\
1268
: "r" ((x86_reg)srcStride)\
1275
/*FIXME reorder for speed */\
1277
/*"pxor %%mm7, %%mm7 \n\t"*/\
1279
"movq (%0), %%mm0 \n\t"\
1280
"movq 8(%0), %%mm1 \n\t"\
1281
"movq 16(%0), %%mm2 \n\t"\
1282
"movq 24(%0), %%mm3 \n\t"\
1283
QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
1284
QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
1286
QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
1288
QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
1290
QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
1291
QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\
1293
QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\
1294
QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\
1296
QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\
1297
QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\
1299
QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\
1300
QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\
1302
QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\
1304
QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\
1306
QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\
1307
QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\
1309
"add $136, %0 \n\t"\
1314
: "+r"(temp_ptr), "+r"(dst), "+g"(count)\
1315
: "r"((x86_reg)dstStride), "r"(2*(x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*(x86_reg)dstStride)\
1320
static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1321
uint64_t temp[9*2];\
1322
uint64_t *temp_ptr= temp;\
1327
"pxor %%mm7, %%mm7 \n\t"\
1329
"movq (%0), %%mm0 \n\t"\
1330
"movq (%0), %%mm1 \n\t"\
1331
"punpcklbw %%mm7, %%mm0 \n\t"\
1332
"punpckhbw %%mm7, %%mm1 \n\t"\
1333
"movq %%mm0, (%1) \n\t"\
1334
"movq %%mm1, 9*8(%1) \n\t"\
1339
: "+r" (src), "+r" (temp_ptr), "+r"(count)\
1340
: "r" ((x86_reg)srcStride)\
1347
/*FIXME reorder for speed */\
1349
/*"pxor %%mm7, %%mm7 \n\t"*/\
1351
"movq (%0), %%mm0 \n\t"\
1352
"movq 8(%0), %%mm1 \n\t"\
1353
"movq 16(%0), %%mm2 \n\t"\
1354
"movq 24(%0), %%mm3 \n\t"\
1355
QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
1356
QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
1358
QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
1360
QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
1362
QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
1364
QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\
1366
QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\
1367
QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\
1374
: "+r"(temp_ptr), "+r"(dst), "+g"(count)\
1375
: "r"((x86_reg)dstStride), "r"(2*(x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-6*(x86_reg)dstStride)\
1380
static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
1381
OPNAME ## pixels8_ ## MMX(dst, src, stride, 8);\
1384
static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1386
uint8_t * const half= (uint8_t*)temp;\
1387
put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
1388
OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
1391
static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1392
OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\
1395
static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1397
uint8_t * const half= (uint8_t*)temp;\
1398
put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
1399
OPNAME ## pixels8_l2_ ## MMX(dst, src+1, half, stride, stride, 8);\
1402
static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1404
uint8_t * const half= (uint8_t*)temp;\
1405
put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
1406
OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
1409
static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1410
OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\
1413
static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1415
uint8_t * const half= (uint8_t*)temp;\
1416
put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
1417
OPNAME ## pixels8_l2_ ## MMX(dst, src+stride, half, stride, stride, 8);\
1419
static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1420
uint64_t half[8 + 9];\
1421
uint8_t * const halfH= ((uint8_t*)half) + 64;\
1422
uint8_t * const halfHV= ((uint8_t*)half);\
1423
put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1424
put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
1425
put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1426
OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
1428
static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1429
uint64_t half[8 + 9];\
1430
uint8_t * const halfH= ((uint8_t*)half) + 64;\
1431
uint8_t * const halfHV= ((uint8_t*)half);\
1432
put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1433
put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
1434
put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1435
OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
1437
static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1438
uint64_t half[8 + 9];\
1439
uint8_t * const halfH= ((uint8_t*)half) + 64;\
1440
uint8_t * const halfHV= ((uint8_t*)half);\
1441
put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1442
put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
1443
put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1444
OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
1446
static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1447
uint64_t half[8 + 9];\
1448
uint8_t * const halfH= ((uint8_t*)half) + 64;\
1449
uint8_t * const halfHV= ((uint8_t*)half);\
1450
put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1451
put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
1452
put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1453
OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
1455
static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1456
uint64_t half[8 + 9];\
1457
uint8_t * const halfH= ((uint8_t*)half) + 64;\
1458
uint8_t * const halfHV= ((uint8_t*)half);\
1459
put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1460
put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1461
OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
1463
static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1464
uint64_t half[8 + 9];\
1465
uint8_t * const halfH= ((uint8_t*)half) + 64;\
1466
uint8_t * const halfHV= ((uint8_t*)half);\
1467
put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1468
put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1469
OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
1471
static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1472
uint64_t half[8 + 9];\
1473
uint8_t * const halfH= ((uint8_t*)half);\
1474
put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1475
put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
1476
OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
1478
static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1479
uint64_t half[8 + 9];\
1480
uint8_t * const halfH= ((uint8_t*)half);\
1481
put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1482
put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
1483
OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
1485
static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1487
uint8_t * const halfH= ((uint8_t*)half);\
1488
put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1489
OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
1491
static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
1492
OPNAME ## pixels16_ ## MMX(dst, src, stride, 16);\
1495
static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1497
uint8_t * const half= (uint8_t*)temp;\
1498
put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
1499
OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
1502
static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1503
OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\
1506
static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1508
uint8_t * const half= (uint8_t*)temp;\
1509
put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
1510
OPNAME ## pixels16_l2_ ## MMX(dst, src+1, half, stride, stride, 16);\
1513
static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1515
uint8_t * const half= (uint8_t*)temp;\
1516
put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
1517
OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
1520
static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1521
OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\
1524
static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1526
uint8_t * const half= (uint8_t*)temp;\
1527
put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
1528
OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, stride, stride, 16);\
1530
static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1531
uint64_t half[16*2 + 17*2];\
1532
uint8_t * const halfH= ((uint8_t*)half) + 256;\
1533
uint8_t * const halfHV= ((uint8_t*)half);\
1534
put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1535
put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
1536
put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
1537
OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
1539
static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1540
uint64_t half[16*2 + 17*2];\
1541
uint8_t * const halfH= ((uint8_t*)half) + 256;\
1542
uint8_t * const halfHV= ((uint8_t*)half);\
1543
put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1544
put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
1545
put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
1546
OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
1548
static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1549
uint64_t half[16*2 + 17*2];\
1550
uint8_t * const halfH= ((uint8_t*)half) + 256;\
1551
uint8_t * const halfHV= ((uint8_t*)half);\
1552
put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1553
put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
1554
put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
1555
OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
1557
static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1558
uint64_t half[16*2 + 17*2];\
1559
uint8_t * const halfH= ((uint8_t*)half) + 256;\
1560
uint8_t * const halfHV= ((uint8_t*)half);\
1561
put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1562
put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
1563
put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
1564
OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
1566
static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1567
uint64_t half[16*2 + 17*2];\
1568
uint8_t * const halfH= ((uint8_t*)half) + 256;\
1569
uint8_t * const halfHV= ((uint8_t*)half);\
1570
put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1571
put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
1572
OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
1574
static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1575
uint64_t half[16*2 + 17*2];\
1576
uint8_t * const halfH= ((uint8_t*)half) + 256;\
1577
uint8_t * const halfHV= ((uint8_t*)half);\
1578
put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1579
put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
1580
OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
1582
static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1583
uint64_t half[17*2];\
1584
uint8_t * const halfH= ((uint8_t*)half);\
1585
put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1586
put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
1587
OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
1589
static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1590
uint64_t half[17*2];\
1591
uint8_t * const halfH= ((uint8_t*)half);\
1592
put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1593
put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
1594
OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
1596
static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1597
uint64_t half[17*2];\
1598
uint8_t * const halfH= ((uint8_t*)half);\
1599
put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1600
OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
1603
#define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t"
1604
/* Averaging store op built on the 3DNow! pavgusb instruction: loads b into
 * temp, averages temp into a, then writes a back to b (mov<size> for both
 * moves). Both a and temp are overwritten. */
#define AVG_3DNOW_OP(a,b,temp, size) \
1605
"mov" #size " " #b ", " #temp " \n\t" /* temp = b */\
1606
"pavgusb " #temp ", " #a " \n\t" /* a = avg(a, temp) */\
1607
"mov" #size " " #a ", " #b " \n\t" /* b = a */
1608
/* Averaging store op built on the pavgb instruction: loads b into temp,
 * averages temp into a, then writes a back to b (mov<size> for both moves).
 * Both a and temp are overwritten. */
#define AVG_MMX2_OP(a,b,temp, size) \
1609
"mov" #size " " #b ", " #temp " \n\t" /* temp = b */\
1610
"pavgb " #temp ", " #a " \n\t" /* a = avg(a, temp) */\
1611
"mov" #size " " #a ", " #b " \n\t" /* b = a */
1613
/* Instantiate the qpel function families.
 * Arguments: name prefix, rounding constant, RND infix used when calling the
 * put helpers, then the store op(s).
 * put_ and avg_ use ff_pw_16 as rounder, put_no_rnd_ uses ff_pw_15.
 * QPEL_BASE receives one store op for its mmx2 functions and one for its
 * 3dnow functions; QPEL_OP is expanded once per CPU suffix (3dnow, mmx2)
 * with the store op matching that suffix. */
QPEL_BASE(put_ , ff_pw_16, _ , PUT_OP, PUT_OP)
1614
QPEL_BASE(avg_ , ff_pw_16, _ , AVG_MMX2_OP, AVG_3DNOW_OP)
1615
QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP)
1616
QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, 3dnow)
1617
QPEL_OP(avg_ , ff_pw_16, _ , AVG_3DNOW_OP, 3dnow)
1618
QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow)
1619
QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, mmx2)
1620
QPEL_OP(avg_ , ff_pw_16, _ , AVG_MMX2_OP, mmx2)
1621
QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2)
782
if (sides & EDGE_TOP) {
783
for (i = 0; i < h; i += 4) {
784
ptr = buf - (i + 1) * wrap - w;
787
"movq (%1, %0), %%mm0 \n\t"
788
"movq %%mm0, (%0) \n\t"
789
"movq %%mm0, (%0, %2) \n\t"
790
"movq %%mm0, (%0, %2, 2) \n\t"
791
"movq %%mm0, (%0, %3) \n\t"
796
: "r"((x86_reg)buf - (x86_reg)ptr - w), "r"((x86_reg) -wrap),
797
"r"((x86_reg) -wrap * 3), "r"(ptr + width + 2 * w)
802
if (sides & EDGE_BOTTOM) {
803
for (i = 0; i < h; i += 4) {
804
ptr = last_line + (i + 1) * wrap - w;
807
"movq (%1, %0), %%mm0 \n\t"
808
"movq %%mm0, (%0) \n\t"
809
"movq %%mm0, (%0, %2) \n\t"
810
"movq %%mm0, (%0, %2, 2) \n\t"
811
"movq %%mm0, (%0, %3) \n\t"
816
: "r"((x86_reg)last_line - (x86_reg)ptr - w),
817
"r"((x86_reg)wrap), "r"((x86_reg)wrap * 3),
818
"r"(ptr + width + 2 * w)
824
/*
 * Emit the inline-asm text for one step of the qpel lowpass filter.
 * NOTE(review): presumably consumed by the vertical lowpass functions, as the
 * name suggests -- the call sites are outside this view.
 *
 * With x1 = m3 + m4, x2 = in2 + m5, x3 = in1 + m6 and x4 = in0 + in7 the
 * sequence computes, per 16-bit word,
 *     (20*x1 - 6*x2 + 3*x3 - x4 + rnd) >> 5
 * packs the result to bytes with unsigned saturation and passes it to OP
 * together with the destination operand `out`, %%mm7 as scratch and the
 * size suffix d.
 *
 * m3..m6        MMX registers holding already-loaded operand rows; m3 is
 *               overwritten (first with x1, then with the value of in7).
 * pw_20, pw_3   not referenced by the body; the constants are taken from
 *               MANGLE(ff_pw_20) and MANGLE(ff_pw_3) directly.
 * rnd           operand added as rounding term before the shift.
 * in0,in1,in2,in7  memory operands of further rows.
 *
 * %%mm4, %%mm5 and %%mm6 are used as scratch by the body itself.
 */
#define QPEL_V_LOW(m3, m4, m5, m6, pw_20, pw_3, rnd, \
825
in0, in1, in2, in7, out, OP) \
826
"paddw "#m4", "#m3" \n\t" /* x1 = m3 + m4 */ \
827
"movq "MANGLE(ff_pw_20)", %%mm4 \n\t" /* 20 */ \
828
"pmullw "#m3", %%mm4 \n\t" /* 20x1 */ \
829
"movq "#in7", "#m3" \n\t" /* d (m3 now holds in7) */ \
830
"movq "#in0", %%mm5 \n\t" /* D */ \
831
"paddw "#m3", %%mm5 \n\t" /* x4 = in0 + in7 */ \
832
"psubw %%mm5, %%mm4 \n\t" /* 20x1 - x4 */ \
833
"movq "#in1", %%mm5 \n\t" /* C */ \
834
"movq "#in2", %%mm6 \n\t" /* B */ \
835
"paddw "#m6", %%mm5 \n\t" /* x3 = in1 + m6 */ \
836
"paddw "#m5", %%mm6 \n\t" /* x2 = in2 + m5 */ \
837
"paddw %%mm6, %%mm6 \n\t" /* 2x2 */ \
838
"psubw %%mm6, %%mm5 \n\t" /* -2x2 + x3 */ \
839
"pmullw "MANGLE(ff_pw_3)", %%mm5 \n\t" /* -6x2 + 3x3 */ \
840
"paddw "#rnd", %%mm4 \n\t" /* 20x1 - x4 + rnd (rounding term, not x2) */ \
841
"paddw %%mm4, %%mm5 \n\t" /* 20x1 - 6x2 + 3x3 - x4 + rnd */ \
842
"psraw $5, %%mm5 \n\t" \
843
"packuswb %%mm5, %%mm5 \n\t" \
844
OP(%%mm5, out, %%mm7, d)
846
#define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMXEXT) \
847
static void OPNAME ## mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, \
856
"pxor %%mm7, %%mm7 \n\t" \
858
"movq (%0), %%mm0 \n\t" /* ABCDEFGH */ \
859
"movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */ \
860
"movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */ \
861
"punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */ \
862
"punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */ \
863
"pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */ \
864
"pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */ \
865
"movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */ \
866
"movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */ \
867
"psllq $8, %%mm2 \n\t" /* 0ABCDEFG */ \
868
"psllq $16, %%mm3 \n\t" /* 00ABCDEF */ \
869
"psllq $24, %%mm4 \n\t" /* 000ABCDE */ \
870
"punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */ \
871
"punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */ \
872
"punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */ \
873
"paddw %%mm3, %%mm5 \n\t" /* b */ \
874
"paddw %%mm2, %%mm6 \n\t" /* c */ \
875
"paddw %%mm5, %%mm5 \n\t" /* 2b */ \
876
"psubw %%mm5, %%mm6 \n\t" /* c - 2b */ \
877
"pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */ \
878
"pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */ \
879
"paddw %%mm4, %%mm0 \n\t" /* a */ \
880
"paddw %%mm1, %%mm5 \n\t" /* d */ \
881
"pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */ \
882
"psubw %%mm5, %%mm0 \n\t" /* 20a - d */ \
883
"paddw %6, %%mm6 \n\t" \
884
"paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */ \
885
"psraw $5, %%mm0 \n\t" \
886
"movq %%mm0, %5 \n\t" \
887
/* mm1 = EFGH, mm2 = DEFG, mm3 = CDEF, mm4 = BCDE, mm7 = 0 */ \
889
"movq 5(%0), %%mm0 \n\t" /* FGHIJKLM */ \
890
"movq %%mm0, %%mm5 \n\t" /* FGHIJKLM */ \
891
"movq %%mm0, %%mm6 \n\t" /* FGHIJKLM */ \
892
"psrlq $8, %%mm0 \n\t" /* GHIJKLM0 */ \
893
"psrlq $16, %%mm5 \n\t" /* HIJKLM00 */ \
894
"punpcklbw %%mm7, %%mm0 \n\t" /* 0G0H0I0J */ \
895
"punpcklbw %%mm7, %%mm5 \n\t" /* 0H0I0J0K */ \
896
"paddw %%mm0, %%mm2 \n\t" /* b */ \
897
"paddw %%mm5, %%mm3 \n\t" /* c */ \
898
"paddw %%mm2, %%mm2 \n\t" /* 2b */ \
899
"psubw %%mm2, %%mm3 \n\t" /* c - 2b */ \
900
"movq %%mm6, %%mm2 \n\t" /* FGHIJKLM */ \
901
"psrlq $24, %%mm6 \n\t" /* IJKLM000 */ \
902
"punpcklbw %%mm7, %%mm2 \n\t" /* 0F0G0H0I */ \
903
"punpcklbw %%mm7, %%mm6 \n\t" /* 0I0J0K0L */ \
904
"pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */ \
905
"paddw %%mm2, %%mm1 \n\t" /* a */ \
906
"paddw %%mm6, %%mm4 \n\t" /* d */ \
907
"pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */ \
908
"psubw %%mm4, %%mm3 \n\t" /* - 6b +3c - d */ \
909
"paddw %6, %%mm1 \n\t" \
910
"paddw %%mm1, %%mm3 \n\t" /* 20a - 6b +3c - d */ \
911
"psraw $5, %%mm3 \n\t" \
912
"movq %5, %%mm1 \n\t" \
913
"packuswb %%mm3, %%mm1 \n\t" \
914
OP_MMXEXT(%%mm1, (%1), %%mm4, q) \
915
/* mm0 = GHIJ, mm2 = FGHI, mm5 = HIJK, mm6 = IJKL, mm7 = 0 */ \
917
"movq 9(%0), %%mm1 \n\t" /* JKLMNOPQ */ \
918
"movq %%mm1, %%mm4 \n\t" /* JKLMNOPQ */ \
919
"movq %%mm1, %%mm3 \n\t" /* JKLMNOPQ */ \
920
"psrlq $8, %%mm1 \n\t" /* KLMNOPQ0 */ \
921
"psrlq $16, %%mm4 \n\t" /* LMNOPQ00 */ \
922
"punpcklbw %%mm7, %%mm1 \n\t" /* 0K0L0M0N */ \
923
"punpcklbw %%mm7, %%mm4 \n\t" /* 0L0M0N0O */ \
924
"paddw %%mm1, %%mm5 \n\t" /* b */ \
925
"paddw %%mm4, %%mm0 \n\t" /* c */ \
926
"paddw %%mm5, %%mm5 \n\t" /* 2b */ \
927
"psubw %%mm5, %%mm0 \n\t" /* c - 2b */ \
928
"movq %%mm3, %%mm5 \n\t" /* JKLMNOPQ */ \
929
"psrlq $24, %%mm3 \n\t" /* MNOPQ000 */ \
930
"pmullw "MANGLE(ff_pw_3)", %%mm0 \n\t" /* 3c - 6b */ \
931
"punpcklbw %%mm7, %%mm3 \n\t" /* 0M0N0O0P */ \
932
"paddw %%mm3, %%mm2 \n\t" /* d */ \
933
"psubw %%mm2, %%mm0 \n\t" /* -6b + 3c - d */ \
934
"movq %%mm5, %%mm2 \n\t" /* JKLMNOPQ */ \
935
"punpcklbw %%mm7, %%mm2 \n\t" /* 0J0K0L0M */ \
936
"punpckhbw %%mm7, %%mm5 \n\t" /* 0N0O0P0Q */ \
937
"paddw %%mm2, %%mm6 \n\t" /* a */ \
938
"pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */ \
939
"paddw %6, %%mm0 \n\t" \
940
"paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */ \
941
"psraw $5, %%mm0 \n\t" \
942
/* mm1 = KLMN, mm2 = JKLM, mm3 = MNOP, */ \
943
/* mm4 = LMNO, mm5 = NOPQ mm7 = 0 */ \
945
"paddw %%mm5, %%mm3 \n\t" /* a */ \
946
"pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0O0P0Q0Q */ \
947
"paddw %%mm4, %%mm6 \n\t" /* b */ \
948
"pshufw $0xBE, %%mm5, %%mm4 \n\t" /* 0P0Q0Q0P */ \
949
"pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0Q0Q0P0O */ \
950
"paddw %%mm1, %%mm4 \n\t" /* c */ \
951
"paddw %%mm2, %%mm5 \n\t" /* d */ \
952
"paddw %%mm6, %%mm6 \n\t" /* 2b */ \
953
"psubw %%mm6, %%mm4 \n\t" /* c - 2b */ \
954
"pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */ \
955
"pmullw "MANGLE(ff_pw_3)", %%mm4 \n\t" /* 3c - 6b */ \
956
"psubw %%mm5, %%mm3 \n\t" /* -6b + 3c - d */ \
957
"paddw %6, %%mm4 \n\t" \
958
"paddw %%mm3, %%mm4 \n\t" /* 20a - 6b + 3c - d */ \
959
"psraw $5, %%mm4 \n\t" \
960
"packuswb %%mm4, %%mm0 \n\t" \
961
OP_MMXEXT(%%mm0, 8(%1), %%mm4, q) \
967
: "+a"(src), "+c"(dst), "+D"(h) \
968
: "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), \
969
/* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(temp), "m"(ROUNDER) \
974
static void OPNAME ## mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, \
981
"pxor %%mm7, %%mm7 \n\t" \
983
"movq (%0), %%mm0 \n\t" /* ABCDEFGH */ \
984
"movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */ \
985
"movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */ \
986
"punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */ \
987
"punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */ \
988
"pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */ \
989
"pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */ \
990
"movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */ \
991
"movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */ \
992
"psllq $8, %%mm2 \n\t" /* 0ABCDEFG */ \
993
"psllq $16, %%mm3 \n\t" /* 00ABCDEF */ \
994
"psllq $24, %%mm4 \n\t" /* 000ABCDE */ \
995
"punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */ \
996
"punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */ \
997
"punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */ \
998
"paddw %%mm3, %%mm5 \n\t" /* b */ \
999
"paddw %%mm2, %%mm6 \n\t" /* c */ \
1000
"paddw %%mm5, %%mm5 \n\t" /* 2b */ \
1001
"psubw %%mm5, %%mm6 \n\t" /* c - 2b */ \
1002
"pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */ \
1003
"pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */ \
1004
"paddw %%mm4, %%mm0 \n\t" /* a */ \
1005
"paddw %%mm1, %%mm5 \n\t" /* d */ \
1006
"pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */ \
1007
"psubw %%mm5, %%mm0 \n\t" /* 20a - d */ \
1008
"paddw %5, %%mm6 \n\t" \
1009
"paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */ \
1010
"psraw $5, %%mm0 \n\t" \
1011
/* mm1 = EFGH, mm2 = DEFG, mm3 = CDEF, mm4 = BCDE, mm7 = 0 */ \
1013
"movd 5(%0), %%mm5 \n\t" /* FGHI */ \
1014
"punpcklbw %%mm7, %%mm5 \n\t" /* 0F0G0H0I */ \
1015
"pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0G0H0I0I */ \
1016
"paddw %%mm5, %%mm1 \n\t" /* a */ \
1017
"paddw %%mm6, %%mm2 \n\t" /* b */ \
1018
"pshufw $0xBE, %%mm5, %%mm6 \n\t" /* 0H0I0I0H */ \
1019
"pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0I0I0H0G */ \
1020
"paddw %%mm6, %%mm3 \n\t" /* c */ \
1021
"paddw %%mm5, %%mm4 \n\t" /* d */ \
1022
"paddw %%mm2, %%mm2 \n\t" /* 2b */ \
1023
"psubw %%mm2, %%mm3 \n\t" /* c - 2b */ \
1024
"pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */ \
1025
"pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */ \
1026
"psubw %%mm4, %%mm3 \n\t" /* -6b + 3c - d */ \
1027
"paddw %5, %%mm1 \n\t" \
1028
"paddw %%mm1, %%mm3 \n\t" /* 20a - 6b + 3c - d */ \
1029
"psraw $5, %%mm3 \n\t" \
1030
"packuswb %%mm3, %%mm0 \n\t" \
1031
OP_MMXEXT(%%mm0, (%1), %%mm4, q) \
1037
: "+a"(src), "+c"(dst), "+d"(h) \
1038
: "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), \
1039
/* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(ROUNDER) \
1044
#define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX) \
1045
static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, \
1050
uint64_t temp[17 * 4]; \
1051
uint64_t *temp_ptr = temp; \
1054
/* FIXME unroll */ \
1055
__asm__ volatile ( \
1056
"pxor %%mm7, %%mm7 \n\t" \
1058
"movq (%0), %%mm0 \n\t" \
1059
"movq (%0), %%mm1 \n\t" \
1060
"movq 8(%0), %%mm2 \n\t" \
1061
"movq 8(%0), %%mm3 \n\t" \
1062
"punpcklbw %%mm7, %%mm0 \n\t" \
1063
"punpckhbw %%mm7, %%mm1 \n\t" \
1064
"punpcklbw %%mm7, %%mm2 \n\t" \
1065
"punpckhbw %%mm7, %%mm3 \n\t" \
1066
"movq %%mm0, (%1) \n\t" \
1067
"movq %%mm1, 17 * 8(%1) \n\t" \
1068
"movq %%mm2, 2 * 17 * 8(%1) \n\t" \
1069
"movq %%mm3, 3 * 17 * 8(%1) \n\t" \
1074
: "+r"(src), "+r"(temp_ptr), "+r"(count) \
1075
: "r"((x86_reg)srcStride) \
1082
/* FIXME reorder for speed */ \
1083
__asm__ volatile ( \
1084
/* "pxor %%mm7, %%mm7 \n\t" */ \
1086
"movq (%0), %%mm0 \n\t" \
1087
"movq 8(%0), %%mm1 \n\t" \
1088
"movq 16(%0), %%mm2 \n\t" \
1089
"movq 24(%0), %%mm3 \n\t" \
1090
QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP) \
1091
QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP) \
1093
QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP) \
1095
QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP) \
1097
QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP) \
1098
QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP) \
1100
QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP) \
1101
QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP) \
1103
QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP) \
1104
QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0), 104(%0), (%1, %3), OP) \
1106
QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0), 112(%0), (%1), OP) \
1107
QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0), 120(%0), (%1, %3), OP) \
1109
QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0), 128(%0), (%1), OP) \
1111
QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0), 128(%0), (%1, %3), OP) \
1113
QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0), 104(%0), 120(%0), (%1), OP) \
1114
QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0), 104(%0), 112(%0), 112(%0), (%1, %3), OP) \
1116
"add $136, %0 \n\t" \
1121
: "+r"(temp_ptr), "+r"(dst), "+g"(count) \
1122
: "r"((x86_reg)dstStride), "r"(2 * (x86_reg)dstStride), \
1123
/* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(ROUNDER), \
1124
"g"(4 - 14 * (x86_reg)dstStride) \
1129
static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, \
1134
uint64_t temp[9 * 2]; \
1135
uint64_t *temp_ptr = temp; \
1138
/* FIXME unroll */ \
1139
__asm__ volatile ( \
1140
"pxor %%mm7, %%mm7 \n\t" \
1142
"movq (%0), %%mm0 \n\t" \
1143
"movq (%0), %%mm1 \n\t" \
1144
"punpcklbw %%mm7, %%mm0 \n\t" \
1145
"punpckhbw %%mm7, %%mm1 \n\t" \
1146
"movq %%mm0, (%1) \n\t" \
1147
"movq %%mm1, 9*8(%1) \n\t" \
1152
: "+r"(src), "+r"(temp_ptr), "+r"(count) \
1153
: "r"((x86_reg)srcStride) \
1160
/* FIXME reorder for speed */ \
1161
__asm__ volatile ( \
1162
/* "pxor %%mm7, %%mm7 \n\t" */ \
1164
"movq (%0), %%mm0 \n\t" \
1165
"movq 8(%0), %%mm1 \n\t" \
1166
"movq 16(%0), %%mm2 \n\t" \
1167
"movq 24(%0), %%mm3 \n\t" \
1168
QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP) \
1169
QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP) \
1171
QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP) \
1173
QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP) \
1175
QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP) \
1177
QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP) \
1179
QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP) \
1180
QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP) \
1182
"add $72, %0 \n\t" \
1187
: "+r"(temp_ptr), "+r"(dst), "+g"(count) \
1188
: "r"((x86_reg)dstStride), "r"(2 * (x86_reg)dstStride), \
1189
/* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(ROUNDER), \
1190
"g"(4 - 6 * (x86_reg)dstStride) \
1195
static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, \
1198
OPNAME ## pixels8_ ## MMX(dst, src, stride, 8); \
1201
static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, \
1205
uint8_t * const half = (uint8_t*)temp; \
1206
put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \
1208
OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8); \
1211
static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, \
1214
OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, \
1218
static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, \
1222
uint8_t * const half = (uint8_t*)temp; \
1223
put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \
1225
OPNAME ## pixels8_l2_ ## MMX(dst, src + 1, half, stride, \
1229
static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, \
1233
uint8_t * const half = (uint8_t*)temp; \
1234
put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride); \
1235
OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8); \
1238
static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, \
1241
OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride); \
1244
static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, \
1248
uint8_t * const half = (uint8_t*)temp; \
1249
put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride); \
1250
OPNAME ## pixels8_l2_ ## MMX(dst, src + stride, half, stride, \
1254
static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, \
1257
uint64_t half[8 + 9]; \
1258
uint8_t * const halfH = ((uint8_t*)half) + 64; \
1259
uint8_t * const halfHV = ((uint8_t*)half); \
1260
put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
1262
put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9); \
1263
put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \
1264
OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8); \
1267
static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, \
1270
uint64_t half[8 + 9]; \
1271
uint8_t * const halfH = ((uint8_t*)half) + 64; \
1272
uint8_t * const halfHV = ((uint8_t*)half); \
1273
put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
1275
put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
1277
put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \
1278
OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8); \
1281
static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, \
1284
uint64_t half[8 + 9]; \
1285
uint8_t * const halfH = ((uint8_t*)half) + 64; \
1286
uint8_t * const halfHV = ((uint8_t*)half); \
1287
put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
1289
put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9); \
1290
put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \
1291
OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8); \
1294
static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, \
1297
uint64_t half[8 + 9]; \
1298
uint8_t * const halfH = ((uint8_t*)half) + 64; \
1299
uint8_t * const halfHV = ((uint8_t*)half); \
1300
put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
1302
put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
1304
put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \
1305
OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8); \
1308
static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, \
1311
uint64_t half[8 + 9]; \
1312
uint8_t * const halfH = ((uint8_t*)half) + 64; \
1313
uint8_t * const halfHV = ((uint8_t*)half); \
1314
put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
1316
put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \
1317
OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8); \
1320
static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, \
1323
uint64_t half[8 + 9]; \
1324
uint8_t * const halfH = ((uint8_t*)half) + 64; \
1325
uint8_t * const halfHV = ((uint8_t*)half); \
1326
put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
1328
put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \
1329
OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8); \
1332
static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, \
1335
uint64_t half[8 + 9]; \
1336
uint8_t * const halfH = ((uint8_t*)half); \
1337
put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
1339
put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9); \
1340
OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8); \
1343
static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, \
1346
uint64_t half[8 + 9]; \
1347
uint8_t * const halfH = ((uint8_t*)half); \
1348
put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
1350
put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
1352
OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8); \
1355
static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, \
1359
uint8_t * const halfH = ((uint8_t*)half); \
1360
put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
1362
OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8); \
1365
static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, \
1368
OPNAME ## pixels16_ ## MMX(dst, src, stride, 16); \
1371
static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, \
1374
uint64_t temp[32]; \
1375
uint8_t * const half = (uint8_t*)temp; \
1376
put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \
1378
OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16); \
1381
static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, \
1384
OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, \
1385
stride, stride, 16); \
1388
static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, \
1391
uint64_t temp[32]; \
1392
uint8_t * const half = (uint8_t*)temp; \
1393
put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \
1395
OPNAME ## pixels16_l2_ ## MMX(dst, src + 1, half, \
1396
stride, stride, 16); \
1399
static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, \
1402
uint64_t temp[32]; \
1403
uint8_t * const half = (uint8_t*)temp; \
1404
put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \
1406
OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16); \
1409
static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, \
1412
OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride); \
1415
static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, \
1418
uint64_t temp[32]; \
1419
uint8_t * const half = (uint8_t*)temp; \
1420
put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \
1422
OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, \
1423
stride, stride, 16); \
1426
static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, \
1429
uint64_t half[16 * 2 + 17 * 2]; \
1430
uint8_t * const halfH = ((uint8_t*)half) + 256; \
1431
uint8_t * const halfHV = ((uint8_t*)half); \
1432
put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1434
put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
1436
put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1438
OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16); \
1441
static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, \
1444
uint64_t half[16 * 2 + 17 * 2]; \
1445
uint8_t * const halfH = ((uint8_t*)half) + 256; \
1446
uint8_t * const halfHV = ((uint8_t*)half); \
1447
put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1449
put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
1451
put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1453
OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16); \
1456
static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, \
1459
uint64_t half[16 * 2 + 17 * 2]; \
1460
uint8_t * const halfH = ((uint8_t*)half) + 256; \
1461
uint8_t * const halfHV = ((uint8_t*)half); \
1462
put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1464
put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
1466
put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1468
OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride, \
1472
static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, \
1475
uint64_t half[16 * 2 + 17 * 2]; \
1476
uint8_t * const halfH = ((uint8_t*)half) + 256; \
1477
uint8_t * const halfHV = ((uint8_t*)half); \
1478
put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1480
put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
1482
put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1484
OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride, \
1488
static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, \
1491
uint64_t half[16 * 2 + 17 * 2]; \
1492
uint8_t * const halfH = ((uint8_t*)half) + 256; \
1493
uint8_t * const halfHV = ((uint8_t*)half); \
1494
put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1496
put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1498
OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16); \
1501
static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, \
1504
uint64_t half[16 * 2 + 17 * 2]; \
1505
uint8_t * const halfH = ((uint8_t*)half) + 256; \
1506
uint8_t * const halfHV = ((uint8_t*)half); \
1507
put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1509
put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1511
OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride, \
1515
static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, \
1518
uint64_t half[17 * 2]; \
1519
uint8_t * const halfH = ((uint8_t*)half); \
1520
put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1522
put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
1524
OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16); \
1527
static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, \
1530
uint64_t half[17 * 2]; \
1531
uint8_t * const halfH = ((uint8_t*)half); \
1532
put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1534
put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
1536
OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16); \
1539
static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, \
1542
uint64_t half[17 * 2]; \
1543
uint8_t * const halfH = ((uint8_t*)half); \
1544
put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1546
OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16); \
1549
/* Store-only op for the qpel kernels: expands to one "mov<size> a, b"
 * instruction string. The temp argument is not used; it is accepted so the
 * macro has the same parameter list as AVG_MMXEXT_OP. */
#define PUT_OP(a, b, temp, size) \
1550
"mov"#size" "#a", "#b" \n\t"
1552
/* Averaging store for the qpel kernels: loads the current contents of b
 * into temp, applies pavgb with temp as source and a as destination, then
 * writes a back to b. */
#define AVG_MMXEXT_OP(a, b, temp, size) \
1553
"mov"#size" "#b", "#temp" \n\t" \
1554
"pavgb "#temp", "#a" \n\t" \
1555
"mov"#size" "#a", "#b" \n\t"
1557
/* Expand the qpel code generators for three variants:
 *   put_        - rounder ff_pw_16, store op PUT_OP
 *   avg_        - rounder ff_pw_16, store op AVG_MMXEXT_OP
 *   put_no_rnd_ - rounder ff_pw_15, store op PUT_OP
 * QPEL_OP is additionally given the function-name suffix mmxext. */
QPEL_BASE(put_, ff_pw_16, _, PUT_OP)
1558
QPEL_BASE(avg_, ff_pw_16, _, AVG_MMXEXT_OP)
1559
QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP)
1560
QPEL_OP(put_, ff_pw_16, _, PUT_OP, mmxext)
1561
QPEL_OP(avg_, ff_pw_16, _, AVG_MMXEXT_OP, mmxext)
1562
QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmxext)
1623
1564
/***********************************/
1624
1565
/* bilinear qpel: not compliant with any spec, only for -lavdopts fast */
1626
/* Defines the 2tap_qpel mcXY function for one OPNAME/SIZE/MMX combination
 * as a direct call to the pixels function selected by HPEL, passing
 * dst, src, stride and SIZE. */
#define QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL)\
1627
static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1628
OPNAME ## pixels ## SIZE ## HPEL(dst, src, stride, SIZE);\
1630
/* Defines the 2tap_qpel mcXY function as a call to the matching _l3_
 * helper. S0 is added to src before the call; S1 and S2 are passed through
 * as the last two arguments (their meaning is defined by the _l3_ helper,
 * which is not part of this macro). */
#define QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2)\
1631
static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1632
OPNAME ## 2tap_qpel ## SIZE ## _l3_ ## MMX(dst, src+S0, stride, SIZE, S1, S2);\
1635
/* Generates the 2tap_qpel mcXY entry points for one OPNAME/SIZE/MMX
 * combination:
 *   mc20, mc02 - via QPEL_2TAP_XY with the _x2_ / _y2_ functions of MMX
 *   mc22       - via QPEL_2TAP_XY with _xy2_mmx (fixed suffix, not MMX)
 *   mc00       - function pointer aliasing the qpel mc00 function
 *   mc21, mc12 - function pointers aliasing 2tap mc20 / 2tap mc02
 *   mc32, mc23 - _y2_ on src+1 / _x2_ on src+stride
 *   mc10, mc30, mc01, mc03, mc11, mc31, mc13, mc33 - via QPEL_2TAP_L3 */
#define QPEL_2TAP(OPNAME, SIZE, MMX)\
1636
QPEL_2TAP_XY(OPNAME, SIZE, MMX, 20, _x2_ ## MMX)\
1637
QPEL_2TAP_XY(OPNAME, SIZE, MMX, 02, _y2_ ## MMX)\
1638
QPEL_2TAP_XY(OPNAME, SIZE, MMX, 22, _xy2_mmx)\
1639
static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc00_ ## MMX =\
1640
OPNAME ## qpel ## SIZE ## _mc00_ ## MMX;\
1641
static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc21_ ## MMX =\
1642
OPNAME ## 2tap_qpel ## SIZE ## _mc20_ ## MMX;\
1643
static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc12_ ## MMX =\
1644
OPNAME ## 2tap_qpel ## SIZE ## _mc02_ ## MMX;\
1645
static void OPNAME ## 2tap_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1646
OPNAME ## pixels ## SIZE ## _y2_ ## MMX(dst, src+1, stride, SIZE);\
1648
static void OPNAME ## 2tap_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1649
OPNAME ## pixels ## SIZE ## _x2_ ## MMX(dst, src+stride, stride, SIZE);\
1651
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 10, 0, 1, 0)\
1652
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 30, 1, -1, 0)\
1653
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 01, 0, stride, 0)\
1654
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 03, stride, -stride, 0)\
1655
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 11, 0, stride, 1)\
1656
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 31, 1, stride, -1)\
1657
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 13, stride, -stride, 1)\
1658
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 33, stride+1, -stride, -1)\
1660
/* Instantiate the 2-tap qpel functions for put_ and avg_, block sizes 16
 * and 8, once with the mmx2 suffix and once with the 3dnow suffix. */
QPEL_2TAP(put_, 16, mmx2)
1661
QPEL_2TAP(avg_, 16, mmx2)
1662
QPEL_2TAP(put_, 8, mmx2)
1663
QPEL_2TAP(avg_, 8, mmx2)
1664
QPEL_2TAP(put_, 16, 3dnow)
1665
QPEL_2TAP(avg_, 16, 3dnow)
1666
QPEL_2TAP(put_, 8, 3dnow)
1667
QPEL_2TAP(avg_, 8, 3dnow)
1671
typedef void emu_edge_core_func (uint8_t *buf, const uint8_t *src,
1672
x86_reg linesize, x86_reg start_y,
1673
x86_reg end_y, x86_reg block_h,
1674
x86_reg start_x, x86_reg end_x,
1676
/* Core edge-emulation routines, declared here only. They are handed to
 * emulated_edge_mc() as core_fn by emulated_edge_mc_mmx() and
 * emulated_edge_mc_sse(). NOTE(review): presumably implemented in external
 * assembly, given the HAVE_YASM guard that closes this section - confirm. */
extern emu_edge_core_func ff_emu_edge_core_mmx;
1677
extern emu_edge_core_func ff_emu_edge_core_sse;
1679
/* Shared front end for the edge-emulation wrappers.
 *
 * Derives the ranges [start_x, end_x) and [start_y, end_y) from src_x/src_y,
 * the block size (block_w x block_h) and the limits w/h, adjusts src, and
 * delegates the actual work to core_fn.
 *
 * start_x/start_y are bounded below by 0; end_x/end_y are bounded above by
 * block_w/block_h. The asserts require both ranges to be non-empty and the
 * block dimensions to be positive. */
static av_always_inline
1680
void emulated_edge_mc(uint8_t *buf, const uint8_t *src, int linesize,
1681
int block_w, int block_h,
1682
int src_x, int src_y, int w, int h,
1683
emu_edge_core_func *core_fn)
1685
int start_y, start_x, end_y, end_x, src_y_add=0;
1688
src_y_add = h-1-src_y;
1690
}else if(src_y<=-block_h){
1691
src_y_add = 1-block_h-src_y;
1697
}else if(src_x<=-block_w){
1698
src+= (1-block_w-src_x);
1702
start_y= FFMAX(0, -src_y);
1703
start_x= FFMAX(0, -src_x);
1704
end_y= FFMIN(block_h, h-src_y);
1705
end_x= FFMIN(block_w, w-src_x);
1706
assert(start_x < end_x && block_w > 0);
1707
assert(start_y < end_y && block_h > 0);
1709
// fill in the to-be-copied part plus all above/below
1710
/* advance src by the accumulated row adjustment plus start_y rows, and by start_x bytes */
src += (src_y_add+start_y)*linesize + start_x;
1712
core_fn(buf, src, linesize, start_y, end_y, block_h, start_x, end_x, block_w);
1717
/* Edge emulation using the MMX core: forwards every argument unchanged to
 * emulated_edge_mc() with ff_emu_edge_core_mmx as the core function. */
void emulated_edge_mc_mmx(uint8_t *buf, const uint8_t *src, int linesize,
1718
int block_w, int block_h,
1719
int src_x, int src_y, int w, int h)
1721
emulated_edge_mc(buf, src, linesize, block_w, block_h, src_x, src_y,
1722
w, h, &ff_emu_edge_core_mmx);
1726
/* Edge emulation using the SSE core: forwards every argument unchanged to
 * emulated_edge_mc() with ff_emu_edge_core_sse as the core function. */
void emulated_edge_mc_sse(uint8_t *buf, const uint8_t *src, int linesize,
1727
int block_w, int block_h,
1728
int src_x, int src_y, int w, int h)
1730
emulated_edge_mc(buf, src, linesize, block_w, block_h, src_x, src_y,
1731
w, h, &ff_emu_edge_core_sse);
1733
#endif /* HAVE_YASM */
1735
/* Function type of an edge-emulation routine. It has the same parameter
 * list as emulated_edge_mc_mmx() / emulated_edge_mc_sse() and is the type
 * of the emu_edge_fn argument taken by gmc(). */
typedef void emulated_edge_mc_func (uint8_t *dst, const uint8_t *src,
1736
int linesize, int block_w, int block_h,
1737
int src_x, int src_y, int w, int h);
1739
static av_always_inline
1740
void gmc(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
1741
int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height,
1742
emulated_edge_mc_func *emu_edge_fn)
1745
const int ix = ox>>(16+shift);
1746
const int iy = oy>>(16+shift);
1747
const int oxs = ox>>4;
1748
const int oys = oy>>4;
1749
const int dxxs = dxx>>4;
1750
const int dxys = dxy>>4;
1751
const int dyxs = dyx>>4;
1752
const int dyys = dyy>>4;
1753
const uint16_t r4[4] = {r,r,r,r};
1754
const uint16_t dxy4[4] = {dxys,dxys,dxys,dxys};
1755
const uint16_t dyy4[4] = {dyys,dyys,dyys,dyys};
1756
const uint64_t shift2 = 2*shift;
1757
uint8_t edge_buf[(h+1)*stride];
1567
/* Defines the 2tap_qpel mcXY function for one OPNAME/SIZE/MMX combination
 * as a direct call to the pixels function selected by HPEL, passing
 * dst, src, stride and SIZE. */
#define QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL) \
1568
static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, \
1572
OPNAME ## pixels ## SIZE ## HPEL(dst, src, stride, SIZE); \
1575
/* Defines the 2tap_qpel mcXY function as a call to the matching _l3_
 * helper, with S0 added to src before the call.
 * NOTE(review): the remaining call arguments are on a continuation line
 * that is not visible here - presumably S1 and S2; confirm. */
#define QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2) \
1576
static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, \
1580
OPNAME ## 2tap_qpel ## SIZE ## _l3_ ## MMX(dst, src + S0, stride, SIZE, \
1584
/* Generates the 2tap_qpel mcXY entry points for one OPNAME/SIZE/MMX
 * combination:
 *   mc20, mc02 - via QPEL_2TAP_XY with the _x2_ / _y2_ functions of MMX
 *   mc22       - via QPEL_2TAP_XY with _xy2_mmx (fixed suffix, not MMX)
 *   mc00       - function pointer aliasing the qpel mc00 function
 *   mc21, mc12 - function pointers aliasing 2tap mc20 / 2tap mc02
 *   mc32, mc23 - _y2_ on src + 1 / _x2_ on src + stride
 *   mc10, mc30, mc01, mc03, mc11, mc31, mc13, mc33 - via QPEL_2TAP_L3 */
#define QPEL_2TAP(OPNAME, SIZE, MMX) \
1585
QPEL_2TAP_XY(OPNAME, SIZE, MMX, 20, _x2_ ## MMX) \
1586
QPEL_2TAP_XY(OPNAME, SIZE, MMX, 02, _y2_ ## MMX) \
1587
QPEL_2TAP_XY(OPNAME, SIZE, MMX, 22, _xy2_mmx) \
1588
static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc00_ ## MMX = \
1589
OPNAME ## qpel ## SIZE ## _mc00_ ## MMX; \
1590
static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc21_ ## MMX = \
1591
OPNAME ## 2tap_qpel ## SIZE ## _mc20_ ## MMX; \
1592
static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc12_ ## MMX = \
1593
OPNAME ## 2tap_qpel ## SIZE ## _mc02_ ## MMX; \
1594
static void OPNAME ## 2tap_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, \
1598
OPNAME ## pixels ## SIZE ## _y2_ ## MMX(dst, src + 1, stride, SIZE); \
1600
static void OPNAME ## 2tap_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, \
1604
OPNAME ## pixels ## SIZE ## _x2_ ## MMX(dst, src + stride, \
1607
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 10, 0, 1, 0) \
1608
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 30, 1, -1, 0) \
1609
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 01, 0, stride, 0) \
1610
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 03, stride, -stride, 0) \
1611
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 11, 0, stride, 1) \
1612
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 31, 1, stride, -1) \
1613
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 13, stride, -stride, 1) \
1614
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 33, stride + 1, -stride, -1) \
1616
/* Instantiate the 2-tap qpel functions for put_ and avg_, block sizes 16
 * and 8, with the mmxext suffix. */
QPEL_2TAP(put_, 16, mmxext)
1617
QPEL_2TAP(avg_, 16, mmxext)
1618
QPEL_2TAP(put_, 8, mmxext)
1619
QPEL_2TAP(avg_, 8, mmxext)
1621
/* RV40 put, 8-wide, position mc33: implemented as a plain call to
 * put_pixels8_xy2_mmx with 8 as the last argument (presumably the block
 * height - confirm against put_pixels8_xy2_mmx). */
void ff_put_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
1623
put_pixels8_xy2_mmx(dst, src, stride, 8);
1625
/* RV40 put, 16-wide, position mc33: implemented as a plain call to
 * put_pixels16_xy2_mmx with 16 as the last argument (presumably the block
 * height - confirm against put_pixels16_xy2_mmx). */
void ff_put_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
1627
put_pixels16_xy2_mmx(dst, src, stride, 16);
1629
/* RV40 avg, 8-wide, position mc33: implemented as a plain call to
 * avg_pixels8_xy2_mmx with 8 as the last argument (presumably the block
 * height - confirm against avg_pixels8_xy2_mmx). */
void ff_avg_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
1631
avg_pixels8_xy2_mmx(dst, src, stride, 8);
1633
/* RV40 avg, 16-wide, position mc33: implemented as a plain call to
 * avg_pixels16_xy2_mmx with 16 as the last argument (presumably the block
 * height - confirm against avg_pixels16_xy2_mmx). */
void ff_avg_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
1635
avg_pixels16_xy2_mmx(dst, src, stride, 16);
1638
static void gmc_mmx(uint8_t *dst, uint8_t *src,
1639
int stride, int h, int ox, int oy,
1640
int dxx, int dxy, int dyx, int dyy,
1641
int shift, int r, int width, int height)
1644
const int ix = ox >> (16 + shift);
1645
const int iy = oy >> (16 + shift);
1646
const int oxs = ox >> 4;
1647
const int oys = oy >> 4;
1648
const int dxxs = dxx >> 4;
1649
const int dxys = dxy >> 4;
1650
const int dyxs = dyx >> 4;
1651
const int dyys = dyy >> 4;
1652
const uint16_t r4[4] = { r, r, r, r };
1653
const uint16_t dxy4[4] = { dxys, dxys, dxys, dxys };
1654
const uint16_t dyy4[4] = { dyys, dyys, dyys, dyys };
1655
const uint64_t shift2 = 2 * shift;
1760
const int dxw = (dxx-(1<<(16+shift)))*(w-1);
1761
const int dyh = (dyy-(1<<(16+shift)))*(h-1);
1762
const int dxh = dxy*(h-1);
1763
const int dyw = dyx*(w-1);
1764
if( // non-constant fullpel offset (3% of blocks)
1765
((ox^(ox+dxw)) | (ox^(ox+dxh)) | (ox^(ox+dxw+dxh)) |
1766
(oy^(oy+dyw)) | (oy^(oy+dyh)) | (oy^(oy+dyw+dyh))) >> (16+shift)
1658
const int dxw = (dxx - (1 << (16 + shift))) * (w - 1);
1659
const int dyh = (dyy - (1 << (16 + shift))) * (h - 1);
1660
const int dxh = dxy * (h - 1);
1661
const int dyw = dyx * (w - 1);
1662
if ( // non-constant fullpel offset (3% of blocks)
1663
((ox ^ (ox + dxw)) | (ox ^ (ox + dxh)) | (ox ^ (ox + dxw + dxh)) |
1664
(oy ^ (oy + dyw)) | (oy ^ (oy + dyh)) | (oy ^ (oy + dyw + dyh))) >> (16 + shift)
1767
1665
// uses more than 16 bits of subpel mv (only at huge resolution)
1768
|| (dxx|dxy|dyx|dyy)&15 )
1770
//FIXME could still use mmx for some of the rows
1771
ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, width, height);
1666
|| (dxx | dxy | dyx | dyy) & 15 ||
1667
(unsigned)ix >= width - w ||
1668
(unsigned)iy >= height - h) {
1669
// FIXME could still use mmx for some of the rows
1670
ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy,
1671
shift, r, width, height);
1775
src += ix + iy*stride;
1776
if( (unsigned)ix >= width-w ||
1777
(unsigned)iy >= height-h )
1779
emu_edge_fn(edge_buf, src, stride, w+1, h+1, ix, iy, width, height);
1675
src += ix + iy * stride;
1784
"movd %0, %%mm6 \n\t"
1785
"pxor %%mm7, %%mm7 \n\t"
1786
"punpcklwd %%mm6, %%mm6 \n\t"
1787
"punpcklwd %%mm6, %%mm6 \n\t"
1678
"movd %0, %%mm6 \n\t"
1679
"pxor %%mm7, %%mm7 \n\t"
1680
"punpcklwd %%mm6, %%mm6 \n\t"
1681
"punpcklwd %%mm6, %%mm6 \n\t"
1788
1682
:: "r"(1<<shift)
1791
for(x=0; x<w; x+=4){
1792
uint16_t dx4[4] = { oxs - dxys + dxxs*(x+0),
1793
oxs - dxys + dxxs*(x+1),
1794
oxs - dxys + dxxs*(x+2),
1795
oxs - dxys + dxxs*(x+3) };
1796
uint16_t dy4[4] = { oys - dyys + dyxs*(x+0),
1797
oys - dyys + dyxs*(x+1),
1798
oys - dyys + dyxs*(x+2),
1799
oys - dyys + dyxs*(x+3) };
1685
for (x = 0; x < w; x += 4) {
1686
uint16_t dx4[4] = { oxs - dxys + dxxs * (x + 0),
1687
oxs - dxys + dxxs * (x + 1),
1688
oxs - dxys + dxxs * (x + 2),
1689
oxs - dxys + dxxs * (x + 3) };
1690
uint16_t dy4[4] = { oys - dyys + dyxs * (x + 0),
1691
oys - dyys + dyxs * (x + 1),
1692
oys - dyys + dyxs * (x + 2),
1693
oys - dyys + dyxs * (x + 3) };
1803
"movq %0, %%mm4 \n\t"
1804
"movq %1, %%mm5 \n\t"
1805
"paddw %2, %%mm4 \n\t"
1806
"paddw %3, %%mm5 \n\t"
1807
"movq %%mm4, %0 \n\t"
1808
"movq %%mm5, %1 \n\t"
1809
"psrlw $12, %%mm4 \n\t"
1810
"psrlw $12, %%mm5 \n\t"
1695
for (y = 0; y < h; y++) {
1697
"movq %0, %%mm4 \n\t"
1698
"movq %1, %%mm5 \n\t"
1699
"paddw %2, %%mm4 \n\t"
1700
"paddw %3, %%mm5 \n\t"
1701
"movq %%mm4, %0 \n\t"
1702
"movq %%mm5, %1 \n\t"
1703
"psrlw $12, %%mm4 \n\t"
1704
"psrlw $12, %%mm5 \n\t"
1811
1705
: "+m"(*dx4), "+m"(*dy4)
1812
1706
: "m"(*dxy4), "m"(*dyy4)
1816
"movq %%mm6, %%mm2 \n\t"
1817
"movq %%mm6, %%mm1 \n\t"
1818
"psubw %%mm4, %%mm2 \n\t"
1819
"psubw %%mm5, %%mm1 \n\t"
1820
"movq %%mm2, %%mm0 \n\t"
1821
"movq %%mm4, %%mm3 \n\t"
1822
"pmullw %%mm1, %%mm0 \n\t" // (s-dx)*(s-dy)
1823
"pmullw %%mm5, %%mm3 \n\t" // dx*dy
1824
"pmullw %%mm5, %%mm2 \n\t" // (s-dx)*dy
1825
"pmullw %%mm4, %%mm1 \n\t" // dx*(s-dy)
1827
"movd %4, %%mm5 \n\t"
1828
"movd %3, %%mm4 \n\t"
1829
"punpcklbw %%mm7, %%mm5 \n\t"
1830
"punpcklbw %%mm7, %%mm4 \n\t"
1831
"pmullw %%mm5, %%mm3 \n\t" // src[1,1] * dx*dy
1832
"pmullw %%mm4, %%mm2 \n\t" // src[0,1] * (s-dx)*dy
1834
"movd %2, %%mm5 \n\t"
1835
"movd %1, %%mm4 \n\t"
1836
"punpcklbw %%mm7, %%mm5 \n\t"
1837
"punpcklbw %%mm7, %%mm4 \n\t"
1838
"pmullw %%mm5, %%mm1 \n\t" // src[1,0] * dx*(s-dy)
1839
"pmullw %%mm4, %%mm0 \n\t" // src[0,0] * (s-dx)*(s-dy)
1840
"paddw %5, %%mm1 \n\t"
1841
"paddw %%mm3, %%mm2 \n\t"
1842
"paddw %%mm1, %%mm0 \n\t"
1843
"paddw %%mm2, %%mm0 \n\t"
1845
"psrlw %6, %%mm0 \n\t"
1846
"packuswb %%mm0, %%mm0 \n\t"
1847
"movd %%mm0, %0 \n\t"
1849
: "=m"(dst[x+y*stride])
1710
"movq %%mm6, %%mm2 \n\t"
1711
"movq %%mm6, %%mm1 \n\t"
1712
"psubw %%mm4, %%mm2 \n\t"
1713
"psubw %%mm5, %%mm1 \n\t"
1714
"movq %%mm2, %%mm0 \n\t"
1715
"movq %%mm4, %%mm3 \n\t"
1716
"pmullw %%mm1, %%mm0 \n\t" // (s - dx) * (s - dy)
1717
"pmullw %%mm5, %%mm3 \n\t" // dx * dy
1718
"pmullw %%mm5, %%mm2 \n\t" // (s - dx) * dy
1719
"pmullw %%mm4, %%mm1 \n\t" // dx * (s - dy)
1721
"movd %4, %%mm5 \n\t"
1722
"movd %3, %%mm4 \n\t"
1723
"punpcklbw %%mm7, %%mm5 \n\t"
1724
"punpcklbw %%mm7, %%mm4 \n\t"
1725
"pmullw %%mm5, %%mm3 \n\t" // src[1, 1] * dx * dy
1726
"pmullw %%mm4, %%mm2 \n\t" // src[0, 1] * (s - dx) * dy
1728
"movd %2, %%mm5 \n\t"
1729
"movd %1, %%mm4 \n\t"
1730
"punpcklbw %%mm7, %%mm5 \n\t"
1731
"punpcklbw %%mm7, %%mm4 \n\t"
1732
"pmullw %%mm5, %%mm1 \n\t" // src[1, 0] * dx * (s - dy)
1733
"pmullw %%mm4, %%mm0 \n\t" // src[0, 0] * (s - dx) * (s - dy)
1734
"paddw %5, %%mm1 \n\t"
1735
"paddw %%mm3, %%mm2 \n\t"
1736
"paddw %%mm1, %%mm0 \n\t"
1737
"paddw %%mm2, %%mm0 \n\t"
1739
"psrlw %6, %%mm0 \n\t"
1740
"packuswb %%mm0, %%mm0 \n\t"
1741
"movd %%mm0, %0 \n\t"
1743
: "=m"(dst[x + y * stride])
1850
1744
: "m"(src[0]), "m"(src[1]),
1851
"m"(src[stride]), "m"(src[stride+1]),
1745
"m"(src[stride]), "m"(src[stride + 1]),
1852
1746
"m"(*r4), "m"(shift2)
1750
src += 4 - h * stride;
1862
/* gmc entry point that forwards every argument to gmc() and selects
 * emulated_edge_mc_mmx as the edge-emulation routine. */
static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
1863
int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
1865
gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
1866
width, height, &emulated_edge_mc_mmx);
1869
static void gmc_sse(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
1870
int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
1872
gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
1873
width, height, &emulated_edge_mc_sse);
1876
static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
1877
int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
1879
gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
1880
width, height, &ff_emulated_edge_mc_8);
1884
#define PREFETCH(name, op) \
1885
static void name(void *mem, int stride, int h){\
1886
const uint8_t *p= mem;\
1888
__asm__ volatile(#op" %0" :: "m"(*p));\
1892
PREFETCH(prefetch_mmx2, prefetcht0)
1893
PREFETCH(prefetch_3dnow, prefetch)
1896
#include "h264_qpel_mmx.c"
1898
void ff_put_h264_chroma_mc8_mmx_rnd (uint8_t *dst, uint8_t *src,
1899
int stride, int h, int x, int y);
1900
void ff_avg_h264_chroma_mc8_mmx2_rnd (uint8_t *dst, uint8_t *src,
1901
int stride, int h, int x, int y);
1902
void ff_avg_h264_chroma_mc8_3dnow_rnd (uint8_t *dst, uint8_t *src,
1903
int stride, int h, int x, int y);
1905
void ff_put_h264_chroma_mc4_mmx (uint8_t *dst, uint8_t *src,
1906
int stride, int h, int x, int y);
1907
void ff_avg_h264_chroma_mc4_mmx2 (uint8_t *dst, uint8_t *src,
1908
int stride, int h, int x, int y);
1909
void ff_avg_h264_chroma_mc4_3dnow (uint8_t *dst, uint8_t *src,
1910
int stride, int h, int x, int y);
1912
void ff_put_h264_chroma_mc2_mmx2 (uint8_t *dst, uint8_t *src,
1913
int stride, int h, int x, int y);
1914
void ff_avg_h264_chroma_mc2_mmx2 (uint8_t *dst, uint8_t *src,
1915
int stride, int h, int x, int y);
1917
void ff_put_h264_chroma_mc8_ssse3_rnd (uint8_t *dst, uint8_t *src,
1918
int stride, int h, int x, int y);
1919
void ff_put_h264_chroma_mc4_ssse3 (uint8_t *dst, uint8_t *src,
1920
int stride, int h, int x, int y);
1922
void ff_avg_h264_chroma_mc8_ssse3_rnd (uint8_t *dst, uint8_t *src,
1923
int stride, int h, int x, int y);
1924
void ff_avg_h264_chroma_mc4_ssse3 (uint8_t *dst, uint8_t *src,
1925
int stride, int h, int x, int y);
1927
#define CHROMA_MC(OP, NUM, DEPTH, OPT) \
1928
void ff_ ## OP ## _h264_chroma_mc ## NUM ## _ ## DEPTH ## _ ## OPT \
1929
(uint8_t *dst, uint8_t *src,\
1753
#endif /* HAVE_INLINE_ASM */
1755
#include "h264_qpel.c"
1757
void ff_put_h264_chroma_mc8_rnd_mmx (uint8_t *dst, uint8_t *src,
1758
int stride, int h, int x, int y);
1759
void ff_avg_h264_chroma_mc8_rnd_mmxext(uint8_t *dst, uint8_t *src,
1760
int stride, int h, int x, int y);
1761
void ff_avg_h264_chroma_mc8_rnd_3dnow(uint8_t *dst, uint8_t *src,
1762
int stride, int h, int x, int y);
1764
void ff_put_h264_chroma_mc4_mmx (uint8_t *dst, uint8_t *src,
1765
int stride, int h, int x, int y);
1766
void ff_avg_h264_chroma_mc4_mmxext (uint8_t *dst, uint8_t *src,
1767
int stride, int h, int x, int y);
1768
void ff_avg_h264_chroma_mc4_3dnow (uint8_t *dst, uint8_t *src,
1769
int stride, int h, int x, int y);
1771
void ff_put_h264_chroma_mc2_mmxext (uint8_t *dst, uint8_t *src,
1772
int stride, int h, int x, int y);
1773
void ff_avg_h264_chroma_mc2_mmxext (uint8_t *dst, uint8_t *src,
1774
int stride, int h, int x, int y);
1776
void ff_put_h264_chroma_mc8_rnd_ssse3(uint8_t *dst, uint8_t *src,
1777
int stride, int h, int x, int y);
1778
void ff_put_h264_chroma_mc4_ssse3 (uint8_t *dst, uint8_t *src,
1779
int stride, int h, int x, int y);
1781
void ff_avg_h264_chroma_mc8_rnd_ssse3(uint8_t *dst, uint8_t *src,
1782
int stride, int h, int x, int y);
1783
void ff_avg_h264_chroma_mc4_ssse3 (uint8_t *dst, uint8_t *src,
1784
int stride, int h, int x, int y);
1786
#define CHROMA_MC(OP, NUM, DEPTH, OPT) \
1787
void ff_ ## OP ## _h264_chroma_mc ## NUM ## _ ## DEPTH ## _ ## OPT \
1788
(uint8_t *dst, uint8_t *src, \
1930
1789
int stride, int h, int x, int y);
1932
1791
CHROMA_MC(put, 2, 10, mmxext)
2053
1882
"movaps %%xmm3, %%xmm4 \n\t"
2054
1883
"andps %%xmm1, %%xmm3 \n\t"
2055
1884
"andnps %%xmm1, %%xmm4 \n\t"
2056
"addps %%xmm0, %%xmm3 \n\t" // a = m + ((a<0) & (a ^ sign(m)))
2057
"subps %%xmm4, %%xmm0 \n\t" // m = m + ((a>0) & (a ^ sign(m)))
1885
"addps %%xmm0, %%xmm3 \n\t" // a = m + ((a < 0) & (a ^ sign(m)))
1886
"subps %%xmm4, %%xmm0 \n\t" // m = m + ((a > 0) & (a ^ sign(m)))
2058
1887
"movaps %%xmm3, %1 \n\t"
2059
1888
"movaps %%xmm0, %0 \n\t"
2060
:"+m"(mag[i]), "+m"(ang[i])
2069
#define MIX5(mono,stereo)\
2071
"movss 0(%2), %%xmm5 \n"\
2072
"movss 8(%2), %%xmm6 \n"\
2073
"movss 24(%2), %%xmm7 \n"\
2074
"shufps $0, %%xmm5, %%xmm5 \n"\
2075
"shufps $0, %%xmm6, %%xmm6 \n"\
2076
"shufps $0, %%xmm7, %%xmm7 \n"\
2078
"movaps (%0,%1), %%xmm0 \n"\
2079
"movaps 0x400(%0,%1), %%xmm1 \n"\
2080
"movaps 0x800(%0,%1), %%xmm2 \n"\
2081
"movaps 0xc00(%0,%1), %%xmm3 \n"\
2082
"movaps 0x1000(%0,%1), %%xmm4 \n"\
2083
"mulps %%xmm5, %%xmm0 \n"\
2084
"mulps %%xmm6, %%xmm1 \n"\
2085
"mulps %%xmm5, %%xmm2 \n"\
2086
"mulps %%xmm7, %%xmm3 \n"\
2087
"mulps %%xmm7, %%xmm4 \n"\
2088
stereo("addps %%xmm1, %%xmm0 \n")\
2089
"addps %%xmm1, %%xmm2 \n"\
2090
"addps %%xmm3, %%xmm0 \n"\
2091
"addps %%xmm4, %%xmm2 \n"\
2092
mono("addps %%xmm2, %%xmm0 \n")\
2093
"movaps %%xmm0, (%0,%1) \n"\
2094
stereo("movaps %%xmm2, 0x400(%0,%1) \n")\
2098
:"r"(samples[0]+len), "r"(matrix)\
2099
:XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \
2100
"%xmm4", "%xmm5", "%xmm6", "%xmm7",)\
2104
#define MIX_MISC(stereo)\
2107
"movaps (%3,%0), %%xmm0 \n"\
2108
stereo("movaps %%xmm0, %%xmm1 \n")\
2109
"mulps %%xmm4, %%xmm0 \n"\
2110
stereo("mulps %%xmm5, %%xmm1 \n")\
2111
"lea 1024(%3,%0), %1 \n"\
2114
"movaps (%1), %%xmm2 \n"\
2115
stereo("movaps %%xmm2, %%xmm3 \n")\
2116
"mulps (%4,%2), %%xmm2 \n"\
2117
stereo("mulps 16(%4,%2), %%xmm3 \n")\
2118
"addps %%xmm2, %%xmm0 \n"\
2119
stereo("addps %%xmm3, %%xmm1 \n")\
2123
"movaps %%xmm0, (%3,%0) \n"\
2124
stereo("movaps %%xmm1, 1024(%3,%0) \n")\
2127
:"+&r"(i), "=&r"(j), "=&r"(k)\
2128
:"r"(samples[0]+len), "r"(matrix_simd+in_ch), "g"((intptr_t)-32*(in_ch-1))\
2132
static void ac3_downmix_sse(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len)
2134
int (*matrix_cmp)[2] = (int(*)[2])matrix;
2137
i = -len*sizeof(float);
2138
if(in_ch == 5 && out_ch == 2 && !(matrix_cmp[0][1]|matrix_cmp[2][0]|matrix_cmp[3][1]|matrix_cmp[4][0]|(matrix_cmp[1][0]^matrix_cmp[1][1])|(matrix_cmp[0][0]^matrix_cmp[2][1]))) {
2140
} else if(in_ch == 5 && out_ch == 1 && matrix_cmp[0][0]==matrix_cmp[2][0] && matrix_cmp[3][0]==matrix_cmp[4][0]) {
2143
DECLARE_ALIGNED(16, float, matrix_simd)[AC3_MAX_CHANNELS][2][4];
2144
j = 2*in_ch*sizeof(float);
2148
"movss (%2,%0), %%xmm4 \n"
2149
"movss 4(%2,%0), %%xmm5 \n"
2150
"shufps $0, %%xmm4, %%xmm4 \n"
2151
"shufps $0, %%xmm5, %%xmm5 \n"
2152
"movaps %%xmm4, (%1,%0,4) \n"
2153
"movaps %%xmm5, 16(%1,%0,4) \n"
2156
:"r"(matrix_simd), "r"(matrix)
2167
static void vector_fmul_3dnow(float *dst, const float *src0, const float *src1, int len){
2168
x86_reg i = (len-4)*4;
2171
"movq (%2,%0), %%mm0 \n\t"
2172
"movq 8(%2,%0), %%mm1 \n\t"
2173
"pfmul (%3,%0), %%mm0 \n\t"
2174
"pfmul 8(%3,%0), %%mm1 \n\t"
2175
"movq %%mm0, (%1,%0) \n\t"
2176
"movq %%mm1, 8(%1,%0) \n\t"
2181
:"r"(dst), "r"(src0), "r"(src1)
2185
static void vector_fmul_sse(float *dst, const float *src0, const float *src1, int len){
2186
x86_reg i = (len-8)*4;
2189
"movaps (%2,%0), %%xmm0 \n\t"
2190
"movaps 16(%2,%0), %%xmm1 \n\t"
2191
"mulps (%3,%0), %%xmm0 \n\t"
2192
"mulps 16(%3,%0), %%xmm1 \n\t"
2193
"movaps %%xmm0, (%1,%0) \n\t"
2194
"movaps %%xmm1, 16(%1,%0) \n\t"
2198
:"r"(dst), "r"(src0), "r"(src1)
2203
static void vector_fmul_reverse_3dnow2(float *dst, const float *src0, const float *src1, int len){
2204
x86_reg i = len*4-16;
2207
"pswapd 8(%1), %%mm0 \n\t"
2208
"pswapd (%1), %%mm1 \n\t"
2209
"pfmul (%3,%0), %%mm0 \n\t"
2210
"pfmul 8(%3,%0), %%mm1 \n\t"
2211
"movq %%mm0, (%2,%0) \n\t"
2212
"movq %%mm1, 8(%2,%0) \n\t"
2216
:"+r"(i), "+r"(src1)
2217
:"r"(dst), "r"(src0)
2219
__asm__ volatile("femms");
2221
static void vector_fmul_reverse_sse(float *dst, const float *src0, const float *src1, int len){
2222
x86_reg i = len*4-32;
2225
"movaps 16(%1), %%xmm0 \n\t"
2226
"movaps (%1), %%xmm1 \n\t"
2227
"shufps $0x1b, %%xmm0, %%xmm0 \n\t"
2228
"shufps $0x1b, %%xmm1, %%xmm1 \n\t"
2229
"mulps (%3,%0), %%xmm0 \n\t"
2230
"mulps 16(%3,%0), %%xmm1 \n\t"
2231
"movaps %%xmm0, (%2,%0) \n\t"
2232
"movaps %%xmm1, 16(%2,%0) \n\t"
2236
:"+r"(i), "+r"(src1)
2237
:"r"(dst), "r"(src0)
2241
static void vector_fmul_add_3dnow(float *dst, const float *src0, const float *src1,
2242
const float *src2, int len){
2243
x86_reg i = (len-4)*4;
2246
"movq (%2,%0), %%mm0 \n\t"
2247
"movq 8(%2,%0), %%mm1 \n\t"
2248
"pfmul (%3,%0), %%mm0 \n\t"
2249
"pfmul 8(%3,%0), %%mm1 \n\t"
2250
"pfadd (%4,%0), %%mm0 \n\t"
2251
"pfadd 8(%4,%0), %%mm1 \n\t"
2252
"movq %%mm0, (%1,%0) \n\t"
2253
"movq %%mm1, 8(%1,%0) \n\t"
2257
:"r"(dst), "r"(src0), "r"(src1), "r"(src2)
2260
__asm__ volatile("femms");
2262
static void vector_fmul_add_sse(float *dst, const float *src0, const float *src1,
2263
const float *src2, int len){
2264
x86_reg i = (len-8)*4;
2267
"movaps (%2,%0), %%xmm0 \n\t"
2268
"movaps 16(%2,%0), %%xmm1 \n\t"
2269
"mulps (%3,%0), %%xmm0 \n\t"
2270
"mulps 16(%3,%0), %%xmm1 \n\t"
2271
"addps (%4,%0), %%xmm0 \n\t"
2272
"addps 16(%4,%0), %%xmm1 \n\t"
2273
"movaps %%xmm0, (%1,%0) \n\t"
2274
"movaps %%xmm1, 16(%1,%0) \n\t"
2278
:"r"(dst), "r"(src0), "r"(src1), "r"(src2)
1889
: "+m"(mag[i]), "+m"(ang[i])
2284
static void vector_fmul_window_3dnow2(float *dst, const float *src0, const float *src1,
2285
const float *win, int len){
2287
x86_reg j = len*4-8;
2290
"pswapd (%5,%1), %%mm1 \n"
2291
"movq (%5,%0), %%mm0 \n"
2292
"pswapd (%4,%1), %%mm5 \n"
2293
"movq (%3,%0), %%mm4 \n"
2294
"movq %%mm0, %%mm2 \n"
2295
"movq %%mm1, %%mm3 \n"
2296
"pfmul %%mm4, %%mm2 \n" // src0[len+i]*win[len+i]
2297
"pfmul %%mm5, %%mm3 \n" // src1[ j]*win[len+j]
2298
"pfmul %%mm4, %%mm1 \n" // src0[len+i]*win[len+j]
2299
"pfmul %%mm5, %%mm0 \n" // src1[ j]*win[len+i]
2300
"pfadd %%mm3, %%mm2 \n"
2301
"pfsub %%mm0, %%mm1 \n"
2302
"pswapd %%mm2, %%mm2 \n"
2303
"movq %%mm1, (%2,%0) \n"
2304
"movq %%mm2, (%2,%1) \n"
2310
:"r"(dst+len), "r"(src0+len), "r"(src1), "r"(win+len)
1896
static void vector_fmul_window_3dnowext(float *dst, const float *src0,
1897
const float *src1, const float *win,
1900
x86_reg i = -len * 4;
1901
x86_reg j = len * 4 - 8;
1904
"pswapd (%5, %1), %%mm1 \n"
1905
"movq (%5, %0), %%mm0 \n"
1906
"pswapd (%4, %1), %%mm5 \n"
1907
"movq (%3, %0), %%mm4 \n"
1908
"movq %%mm0, %%mm2 \n"
1909
"movq %%mm1, %%mm3 \n"
1910
"pfmul %%mm4, %%mm2 \n" // src0[len + i] * win[len + i]
1911
"pfmul %%mm5, %%mm3 \n" // src1[j] * win[len + j]
1912
"pfmul %%mm4, %%mm1 \n" // src0[len + i] * win[len + j]
1913
"pfmul %%mm5, %%mm0 \n" // src1[j] * win[len + i]
1914
"pfadd %%mm3, %%mm2 \n"
1915
"pfsub %%mm0, %%mm1 \n"
1916
"pswapd %%mm2, %%mm2 \n"
1917
"movq %%mm1, (%2, %0) \n"
1918
"movq %%mm2, (%2, %1) \n"
1924
: "r"(dst + len), "r"(src0 + len), "r"(src1), "r"(win + len)
2314
static void vector_fmul_window_sse(float *dst, const float *src0, const float *src1,
2315
const float *win, int len){
2317
x86_reg j = len*4-16;
2320
"movaps (%5,%1), %%xmm1 \n"
2321
"movaps (%5,%0), %%xmm0 \n"
2322
"movaps (%4,%1), %%xmm5 \n"
2323
"movaps (%3,%0), %%xmm4 \n"
2324
"shufps $0x1b, %%xmm1, %%xmm1 \n"
2325
"shufps $0x1b, %%xmm5, %%xmm5 \n"
2326
"movaps %%xmm0, %%xmm2 \n"
2327
"movaps %%xmm1, %%xmm3 \n"
2328
"mulps %%xmm4, %%xmm2 \n" // src0[len+i]*win[len+i]
2329
"mulps %%xmm5, %%xmm3 \n" // src1[ j]*win[len+j]
2330
"mulps %%xmm4, %%xmm1 \n" // src0[len+i]*win[len+j]
2331
"mulps %%xmm5, %%xmm0 \n" // src1[ j]*win[len+i]
2332
"addps %%xmm3, %%xmm2 \n"
2333
"subps %%xmm0, %%xmm1 \n"
2334
"shufps $0x1b, %%xmm2, %%xmm2 \n"
2335
"movaps %%xmm1, (%2,%0) \n"
2336
"movaps %%xmm2, (%2,%1) \n"
2341
:"r"(dst+len), "r"(src0+len), "r"(src1), "r"(win+len)
1928
static void vector_fmul_window_sse(float *dst, const float *src0,
1929
const float *src1, const float *win, int len)
1931
x86_reg i = -len * 4;
1932
x86_reg j = len * 4 - 16;
1935
"movaps (%5, %1), %%xmm1 \n"
1936
"movaps (%5, %0), %%xmm0 \n"
1937
"movaps (%4, %1), %%xmm5 \n"
1938
"movaps (%3, %0), %%xmm4 \n"
1939
"shufps $0x1b, %%xmm1, %%xmm1 \n"
1940
"shufps $0x1b, %%xmm5, %%xmm5 \n"
1941
"movaps %%xmm0, %%xmm2 \n"
1942
"movaps %%xmm1, %%xmm3 \n"
1943
"mulps %%xmm4, %%xmm2 \n" // src0[len + i] * win[len + i]
1944
"mulps %%xmm5, %%xmm3 \n" // src1[j] * win[len + j]
1945
"mulps %%xmm4, %%xmm1 \n" // src0[len + i] * win[len + j]
1946
"mulps %%xmm5, %%xmm0 \n" // src1[j] * win[len + i]
1947
"addps %%xmm3, %%xmm2 \n"
1948
"subps %%xmm0, %%xmm1 \n"
1949
"shufps $0x1b, %%xmm2, %%xmm2 \n"
1950
"movaps %%xmm1, (%2, %0) \n"
1951
"movaps %%xmm2, (%2, %1) \n"
1956
: "r"(dst + len), "r"(src0 + len), "r"(src1), "r"(win + len)
2344
1959
#endif /* HAVE_6REGS */
2346
static void vector_clipf_sse(float *dst, const float *src, float min, float max,
1961
static void vector_clipf_sse(float *dst, const float *src,
1962
float min, float max, int len)
2349
x86_reg i = (len-16)*4;
2351
"movss %3, %%xmm4 \n"
2352
"movss %4, %%xmm5 \n"
2353
"shufps $0, %%xmm4, %%xmm4 \n"
2354
"shufps $0, %%xmm5, %%xmm5 \n"
2356
"movaps (%2,%0), %%xmm0 \n\t" // 3/1 on intel
2357
"movaps 16(%2,%0), %%xmm1 \n\t"
2358
"movaps 32(%2,%0), %%xmm2 \n\t"
2359
"movaps 48(%2,%0), %%xmm3 \n\t"
2360
"maxps %%xmm4, %%xmm0 \n\t"
2361
"maxps %%xmm4, %%xmm1 \n\t"
2362
"maxps %%xmm4, %%xmm2 \n\t"
2363
"maxps %%xmm4, %%xmm3 \n\t"
2364
"minps %%xmm5, %%xmm0 \n\t"
2365
"minps %%xmm5, %%xmm1 \n\t"
2366
"minps %%xmm5, %%xmm2 \n\t"
2367
"minps %%xmm5, %%xmm3 \n\t"
2368
"movaps %%xmm0, (%1,%0) \n\t"
2369
"movaps %%xmm1, 16(%1,%0) \n\t"
2370
"movaps %%xmm2, 32(%1,%0) \n\t"
2371
"movaps %%xmm3, 48(%1,%0) \n\t"
2375
:"r"(dst), "r"(src), "m"(min), "m"(max)
1964
x86_reg i = (len - 16) * 4;
1966
"movss %3, %%xmm4 \n\t"
1967
"movss %4, %%xmm5 \n\t"
1968
"shufps $0, %%xmm4, %%xmm4 \n\t"
1969
"shufps $0, %%xmm5, %%xmm5 \n\t"
1971
"movaps (%2, %0), %%xmm0 \n\t" // 3/1 on intel
1972
"movaps 16(%2, %0), %%xmm1 \n\t"
1973
"movaps 32(%2, %0), %%xmm2 \n\t"
1974
"movaps 48(%2, %0), %%xmm3 \n\t"
1975
"maxps %%xmm4, %%xmm0 \n\t"
1976
"maxps %%xmm4, %%xmm1 \n\t"
1977
"maxps %%xmm4, %%xmm2 \n\t"
1978
"maxps %%xmm4, %%xmm3 \n\t"
1979
"minps %%xmm5, %%xmm0 \n\t"
1980
"minps %%xmm5, %%xmm1 \n\t"
1981
"minps %%xmm5, %%xmm2 \n\t"
1982
"minps %%xmm5, %%xmm3 \n\t"
1983
"movaps %%xmm0, (%1, %0) \n\t"
1984
"movaps %%xmm1, 16(%1, %0) \n\t"
1985
"movaps %%xmm2, 32(%1, %0) \n\t"
1986
"movaps %%xmm3, 48(%1, %0) \n\t"
1990
: "r"(dst), "r"(src), "m"(min), "m"(max)
2380
void ff_vp3_idct_mmx(int16_t *input_data);
2381
void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block);
2382
void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block);
2384
void ff_vp3_idct_dc_add_mmx2(uint8_t *dest, int line_size, const DCTELEM *block);
2386
void ff_vp3_v_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values);
2387
void ff_vp3_h_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values);
2389
void ff_vp3_idct_sse2(int16_t *input_data);
2390
void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block);
2391
void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, DCTELEM *block);
2393
int32_t ff_scalarproduct_int16_mmx2(const int16_t *v1, const int16_t *v2, int order, int shift);
2394
int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2, int order, int shift);
2395
int32_t ff_scalarproduct_and_madd_int16_mmx2(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
2396
int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
2397
int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
2399
void ff_apply_window_int16_mmxext (int16_t *output, const int16_t *input,
2400
const int16_t *window, unsigned int len);
2401
void ff_apply_window_int16_mmxext_ba (int16_t *output, const int16_t *input,
2402
const int16_t *window, unsigned int len);
2403
void ff_apply_window_int16_sse2 (int16_t *output, const int16_t *input,
2404
const int16_t *window, unsigned int len);
2405
void ff_apply_window_int16_sse2_ba (int16_t *output, const int16_t *input,
2406
const int16_t *window, unsigned int len);
2407
void ff_apply_window_int16_ssse3 (int16_t *output, const int16_t *input,
2408
const int16_t *window, unsigned int len);
1995
#endif /* HAVE_INLINE_ASM */
1997
int32_t ff_scalarproduct_int16_mmxext(const int16_t *v1, const int16_t *v2,
1999
int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2,
2001
int32_t ff_scalarproduct_and_madd_int16_mmxext(int16_t *v1, const int16_t *v2,
2003
int order, int mul);
2004
int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2,
2006
int order, int mul);
2007
int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
2009
int order, int mul);
2011
void ff_apply_window_int16_round_mmxext(int16_t *output, const int16_t *input,
2012
const int16_t *window, unsigned int len);
2013
void ff_apply_window_int16_round_sse2(int16_t *output, const int16_t *input,
2014
const int16_t *window, unsigned int len);
2015
void ff_apply_window_int16_mmxext(int16_t *output, const int16_t *input,
2016
const int16_t *window, unsigned int len);
2017
void ff_apply_window_int16_sse2(int16_t *output, const int16_t *input,
2018
const int16_t *window, unsigned int len);
2019
void ff_apply_window_int16_ssse3(int16_t *output, const int16_t *input,
2020
const int16_t *window, unsigned int len);
2409
2021
void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
2410
2022
const int16_t *window, unsigned int len);
2412
void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top);
2413
int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src, int w, int left);
2414
int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, int w, int left);
2024
void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w);
2025
void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w);
2027
void ff_add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top,
2028
const uint8_t *diff, int w,
2029
int *left, int *left_top);
2030
int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src,
2032
int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src,
2416
2035
float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);
2418
void ff_vector_clip_int32_mmx (int32_t *dst, const int32_t *src, int32_t min,
2419
int32_t max, unsigned int len);
2420
void ff_vector_clip_int32_sse2 (int32_t *dst, const int32_t *src, int32_t min,
2421
int32_t max, unsigned int len);
2422
void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src, int32_t min,
2423
int32_t max, unsigned int len);
2424
void ff_vector_clip_int32_sse4 (int32_t *dst, const int32_t *src, int32_t min,
2425
int32_t max, unsigned int len);
2037
void ff_vector_fmul_reverse_sse(float *dst, const float *src0,
2038
const float *src1, int len);
2039
void ff_vector_fmul_reverse_avx(float *dst, const float *src0,
2040
const float *src1, int len);
2042
void ff_vector_fmul_add_sse(float *dst, const float *src0, const float *src1,
2043
const float *src2, int len);
2044
void ff_vector_fmul_add_avx(float *dst, const float *src0, const float *src1,
2045
const float *src2, int len);
2047
void ff_vector_clip_int32_mmx (int32_t *dst, const int32_t *src,
2048
int32_t min, int32_t max, unsigned int len);
2049
void ff_vector_clip_int32_sse2 (int32_t *dst, const int32_t *src,
2050
int32_t min, int32_t max, unsigned int len);
2051
void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src,
2052
int32_t min, int32_t max, unsigned int len);
2053
void ff_vector_clip_int32_sse4 (int32_t *dst, const int32_t *src,
2054
int32_t min, int32_t max, unsigned int len);
2427
2056
extern void ff_butterflies_float_interleave_sse(float *dst, const float *src0,
2428
2057
const float *src1, int len);
2429
2058
extern void ff_butterflies_float_interleave_avx(float *dst, const float *src0,
2430
2059
const float *src1, int len);
2432
void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
2061
#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX) \
2063
c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
2064
c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
2065
c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
2066
c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \
2067
c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \
2068
c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \
2069
c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \
2070
c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \
2071
c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \
2072
c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \
2073
c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \
2074
c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \
2075
c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \
2076
c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \
2077
c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \
2078
c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU; \
2081
#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \
2083
c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## SIZE ## _ ## CPU; \
2084
c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \
2085
c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \
2086
c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU; \
2089
#define H264_QPEL_FUNCS(x, y, CPU) \
2091
c->put_h264_qpel_pixels_tab[0][x + y * 4] = put_h264_qpel16_mc ## x ## y ## _ ## CPU; \
2092
c->put_h264_qpel_pixels_tab[1][x + y * 4] = put_h264_qpel8_mc ## x ## y ## _ ## CPU; \
2093
c->avg_h264_qpel_pixels_tab[0][x + y * 4] = avg_h264_qpel16_mc ## x ## y ## _ ## CPU; \
2094
c->avg_h264_qpel_pixels_tab[1][x + y * 4] = avg_h264_qpel8_mc ## x ## y ## _ ## CPU; \
2097
#define H264_QPEL_FUNCS_10(x, y, CPU) \
2099
c->put_h264_qpel_pixels_tab[0][x + y * 4] = ff_put_h264_qpel16_mc ## x ## y ## _10_ ## CPU; \
2100
c->put_h264_qpel_pixels_tab[1][x + y * 4] = ff_put_h264_qpel8_mc ## x ## y ## _10_ ## CPU; \
2101
c->avg_h264_qpel_pixels_tab[0][x + y * 4] = ff_avg_h264_qpel16_mc ## x ## y ## _10_ ## CPU; \
2102
c->avg_h264_qpel_pixels_tab[1][x + y * 4] = ff_avg_h264_qpel8_mc ## x ## y ## _10_ ## CPU; \
2105
static void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx, int mm_flags)
2107
const int high_bit_depth = avctx->bits_per_raw_sample > 8;
2110
c->put_pixels_clamped = ff_put_pixels_clamped_mmx;
2111
c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
2112
c->add_pixels_clamped = ff_add_pixels_clamped_mmx;
2114
if (!high_bit_depth) {
2115
c->clear_block = clear_block_mmx;
2116
c->clear_blocks = clear_blocks_mmx;
2117
c->draw_edges = draw_edges_mmx;
2119
SET_HPEL_FUNCS(put, 0, 16, mmx);
2120
SET_HPEL_FUNCS(put_no_rnd, 0, 16, mmx);
2121
SET_HPEL_FUNCS(avg, 0, 16, mmx);
2122
SET_HPEL_FUNCS(avg_no_rnd, 0, 16, mmx);
2123
SET_HPEL_FUNCS(put, 1, 8, mmx);
2124
SET_HPEL_FUNCS(put_no_rnd, 1, 8, mmx);
2125
SET_HPEL_FUNCS(avg, 1, 8, mmx);
2126
SET_HPEL_FUNCS(avg_no_rnd, 1, 8, mmx);
2128
switch (avctx->idct_algo) {
2130
case FF_IDCT_SIMPLEMMX:
2131
c->idct_put = ff_simple_idct_put_mmx;
2132
c->idct_add = ff_simple_idct_add_mmx;
2133
c->idct = ff_simple_idct_mmx;
2134
c->idct_permutation_type = FF_SIMPLE_IDCT_PERM;
2136
case FF_IDCT_XVIDMMX:
2137
c->idct_put = ff_idct_xvid_mmx_put;
2138
c->idct_add = ff_idct_xvid_mmx_add;
2139
c->idct = ff_idct_xvid_mmx;
2146
c->add_bytes = add_bytes_mmx;
2148
if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2149
c->h263_v_loop_filter = h263_v_loop_filter_mmx;
2150
c->h263_h_loop_filter = h263_h_loop_filter_mmx;
2152
#endif /* HAVE_INLINE_ASM */
2155
if (!high_bit_depth && CONFIG_H264CHROMA) {
2156
c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_rnd_mmx;
2157
c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_mmx;
2160
c->vector_clip_int32 = ff_vector_clip_int32_mmx;
2165
static void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
2168
const int bit_depth = avctx->bits_per_raw_sample;
2169
const int high_bit_depth = bit_depth > 8;
2172
SET_QPEL_FUNCS(avg_qpel, 0, 16, mmxext, );
2173
SET_QPEL_FUNCS(avg_qpel, 1, 8, mmxext, );
2174
SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, mmxext, );
2175
SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, mmxext, );
2177
SET_QPEL_FUNCS(put_qpel, 0, 16, mmxext, );
2178
SET_QPEL_FUNCS(put_qpel, 1, 8, mmxext, );
2179
SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, mmxext, );
2180
SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, mmxext, );
2181
SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmxext, );
2182
SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmxext, );
2184
if (!high_bit_depth) {
2185
c->put_pixels_tab[0][1] = put_pixels16_x2_mmxext;
2186
c->put_pixels_tab[0][2] = put_pixels16_y2_mmxext;
2188
c->avg_pixels_tab[0][0] = avg_pixels16_mmxext;
2189
c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmxext;
2190
c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmxext;
2192
c->put_pixels_tab[1][1] = put_pixels8_x2_mmxext;
2193
c->put_pixels_tab[1][2] = put_pixels8_y2_mmxext;
2195
c->avg_pixels_tab[1][0] = avg_pixels8_mmxext;
2196
c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmxext;
2197
c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmxext;
2200
if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
2201
if (!high_bit_depth) {
2202
c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmxext;
2203
c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmxext;
2204
c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmxext;
2205
c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmxext;
2207
c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmxext;
2208
c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmxext;
2212
if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX) {
2213
c->idct_put = ff_idct_xvid_mmxext_put;
2214
c->idct_add = ff_idct_xvid_mmxext_add;
2215
c->idct = ff_idct_xvid_mmxext;
2218
if (CONFIG_VP3_DECODER && (avctx->codec_id == AV_CODEC_ID_VP3 ||
2219
avctx->codec_id == AV_CODEC_ID_THEORA)) {
2220
c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_mmxext;
2221
c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_mmxext;
2223
#endif /* HAVE_INLINE_ASM */
2225
#if HAVE_MMXEXT_EXTERNAL
2226
if (CONFIG_H264QPEL) {
2227
if (!high_bit_depth) {
2228
SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmxext, );
2229
SET_QPEL_FUNCS(put_h264_qpel, 1, 8, mmxext, );
2230
SET_QPEL_FUNCS(put_h264_qpel, 2, 4, mmxext, );
2231
SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmxext, );
2232
SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, mmxext, );
2233
SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, mmxext, );
2234
} else if (bit_depth == 10) {
2236
SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_mmxext, ff_);
2237
SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_mmxext, ff_);
2238
SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 10_mmxext, ff_);
2239
SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 10_mmxext, ff_);
2241
SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 10_mmxext, ff_);
2242
SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 10_mmxext, ff_);
2246
if (!high_bit_depth && CONFIG_H264CHROMA) {
2247
c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_mmxext;
2248
c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_mmxext;
2249
c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_mmxext;
2250
c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_mmxext;
2252
if (bit_depth == 10 && CONFIG_H264CHROMA) {
2253
c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_10_mmxext;
2254
c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_10_mmxext;
2255
c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_10_mmxext;
2256
c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_10_mmxext;
2259
/* slower than cmov version on AMD */
2260
if (!(mm_flags & AV_CPU_FLAG_3DNOW))
2261
c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmxext;
2263
c->scalarproduct_int16 = ff_scalarproduct_int16_mmxext;
2264
c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmxext;
2266
if (avctx->flags & CODEC_FLAG_BITEXACT) {
2267
c->apply_window_int16 = ff_apply_window_int16_mmxext;
2269
c->apply_window_int16 = ff_apply_window_int16_round_mmxext;
2271
#endif /* HAVE_MMXEXT_EXTERNAL */
2274
static void dsputil_init_3dnow(DSPContext *c, AVCodecContext *avctx,
2277
const int high_bit_depth = avctx->bits_per_raw_sample > 8;
2280
if (!high_bit_depth) {
2281
c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
2282
c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
2284
c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
2285
c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
2286
c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
2288
c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
2289
c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow;
2291
c->avg_pixels_tab[1][0] = avg_pixels8_3dnow;
2292
c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
2293
c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;
2295
if (!(avctx->flags & CODEC_FLAG_BITEXACT)){
2296
c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
2297
c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
2298
c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
2299
c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;
2301
c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
2302
c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
2306
if (CONFIG_VP3_DECODER && (avctx->codec_id == AV_CODEC_ID_VP3 ||
2307
avctx->codec_id == AV_CODEC_ID_THEORA)) {
2308
c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_3dnow;
2309
c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_3dnow;
2312
c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
2313
#endif /* HAVE_INLINE_ASM */
2316
if (!high_bit_depth && CONFIG_H264CHROMA) {
2317
c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_3dnow;
2318
c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_3dnow;
2320
#endif /* HAVE_YASM */
2323
static void dsputil_init_3dnowext(DSPContext *c, AVCodecContext *avctx,
2326
#if HAVE_AMD3DNOWEXT_INLINE && HAVE_6REGS
2327
c->vector_fmul_window = vector_fmul_window_3dnowext;
2331
static void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx, int mm_flags)
2333
const int high_bit_depth = avctx->bits_per_raw_sample > 8;
2336
if (!high_bit_depth) {
2337
if (!(CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)) {
2338
/* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
2339
c->clear_block = clear_block_sse;
2340
c->clear_blocks = clear_blocks_sse;
2344
c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
2347
c->vector_fmul_window = vector_fmul_window_sse;
2350
c->vector_clipf = vector_clipf_sse;
2351
#endif /* HAVE_INLINE_ASM */
2354
c->vector_fmul_reverse = ff_vector_fmul_reverse_sse;
2355
c->vector_fmul_add = ff_vector_fmul_add_sse;
2357
c->scalarproduct_float = ff_scalarproduct_float_sse;
2358
c->butterflies_float_interleave = ff_butterflies_float_interleave_sse;
2359
#endif /* HAVE_YASM */
2362
static void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
2365
const int bit_depth = avctx->bits_per_raw_sample;
2366
const int high_bit_depth = bit_depth > 8;
2368
#if HAVE_SSE2_INLINE
2369
if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX) {
2370
c->idct_put = ff_idct_xvid_sse2_put;
2371
c->idct_add = ff_idct_xvid_sse2_add;
2372
c->idct = ff_idct_xvid_sse2;
2373
c->idct_permutation_type = FF_SSE2_IDCT_PERM;
2375
#endif /* HAVE_SSE2_INLINE */
2377
#if HAVE_SSE2_EXTERNAL
2378
if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
2379
// these functions are slower than mmx on AMD, but faster on Intel
2380
if (!high_bit_depth) {
2381
c->put_pixels_tab[0][0] = ff_put_pixels16_sse2;
2382
c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2;
2383
c->avg_pixels_tab[0][0] = ff_avg_pixels16_sse2;
2384
if (CONFIG_H264QPEL)
2385
H264_QPEL_FUNCS(0, 0, sse2);
2389
if (!high_bit_depth && CONFIG_H264QPEL) {
2390
H264_QPEL_FUNCS(0, 1, sse2);
2391
H264_QPEL_FUNCS(0, 2, sse2);
2392
H264_QPEL_FUNCS(0, 3, sse2);
2393
H264_QPEL_FUNCS(1, 1, sse2);
2394
H264_QPEL_FUNCS(1, 2, sse2);
2395
H264_QPEL_FUNCS(1, 3, sse2);
2396
H264_QPEL_FUNCS(2, 1, sse2);
2397
H264_QPEL_FUNCS(2, 2, sse2);
2398
H264_QPEL_FUNCS(2, 3, sse2);
2399
H264_QPEL_FUNCS(3, 1, sse2);
2400
H264_QPEL_FUNCS(3, 2, sse2);
2401
H264_QPEL_FUNCS(3, 3, sse2);
2404
if (bit_depth == 10) {
2405
if (CONFIG_H264QPEL) {
2406
SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_sse2, ff_);
2407
SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 10_sse2, ff_);
2408
SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_sse2, ff_);
2409
SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 10_sse2, ff_);
2410
H264_QPEL_FUNCS_10(1, 0, sse2_cache64);
2411
H264_QPEL_FUNCS_10(2, 0, sse2_cache64);
2412
H264_QPEL_FUNCS_10(3, 0, sse2_cache64);
2414
if (CONFIG_H264CHROMA) {
2415
c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_sse2;
2416
c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_sse2;
2420
c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
2421
c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
2422
if (mm_flags & AV_CPU_FLAG_ATOM) {
2423
c->vector_clip_int32 = ff_vector_clip_int32_int_sse2;
2425
c->vector_clip_int32 = ff_vector_clip_int32_sse2;
2427
if (avctx->flags & CODEC_FLAG_BITEXACT) {
2428
c->apply_window_int16 = ff_apply_window_int16_sse2;
2429
} else if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
2430
c->apply_window_int16 = ff_apply_window_int16_round_sse2;
2432
c->bswap_buf = ff_bswap32_buf_sse2;
2433
#endif /* HAVE_SSE2_EXTERNAL */
2436
static void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx,
2439
#if HAVE_SSSE3_EXTERNAL
2440
const int high_bit_depth = avctx->bits_per_raw_sample > 8;
2441
const int bit_depth = avctx->bits_per_raw_sample;
2443
if (!high_bit_depth && CONFIG_H264QPEL) {
2444
H264_QPEL_FUNCS(1, 0, ssse3);
2445
H264_QPEL_FUNCS(1, 1, ssse3);
2446
H264_QPEL_FUNCS(1, 2, ssse3);
2447
H264_QPEL_FUNCS(1, 3, ssse3);
2448
H264_QPEL_FUNCS(2, 0, ssse3);
2449
H264_QPEL_FUNCS(2, 1, ssse3);
2450
H264_QPEL_FUNCS(2, 2, ssse3);
2451
H264_QPEL_FUNCS(2, 3, ssse3);
2452
H264_QPEL_FUNCS(3, 0, ssse3);
2453
H264_QPEL_FUNCS(3, 1, ssse3);
2454
H264_QPEL_FUNCS(3, 2, ssse3);
2455
H264_QPEL_FUNCS(3, 3, ssse3);
2457
if (bit_depth == 10 && CONFIG_H264QPEL) {
2458
H264_QPEL_FUNCS_10(1, 0, ssse3_cache64);
2459
H264_QPEL_FUNCS_10(2, 0, ssse3_cache64);
2460
H264_QPEL_FUNCS_10(3, 0, ssse3_cache64);
2462
if (!high_bit_depth && CONFIG_H264CHROMA) {
2463
c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_rnd_ssse3;
2464
c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_ssse3;
2465
c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_ssse3;
2466
c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_ssse3;
2468
c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_ssse3;
2469
if (mm_flags & AV_CPU_FLAG_SSE4) // not really sse4, just slow on Conroe
2470
c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4;
2472
if (mm_flags & AV_CPU_FLAG_ATOM)
2473
c->apply_window_int16 = ff_apply_window_int16_ssse3_atom;
2475
c->apply_window_int16 = ff_apply_window_int16_ssse3;
2476
if (!(mm_flags & (AV_CPU_FLAG_SSE42|AV_CPU_FLAG_3DNOW))) // cachesplit
2477
c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
2478
c->bswap_buf = ff_bswap32_buf_ssse3;
2479
#endif /* HAVE_SSSE3_EXTERNAL */
2482
static void dsputil_init_sse4(DSPContext *c, AVCodecContext *avctx,
2485
#if HAVE_SSE4_EXTERNAL
2486
c->vector_clip_int32 = ff_vector_clip_int32_sse4;
2487
#endif /* HAVE_SSE4_EXTERNAL */
2490
static void dsputil_init_avx(DSPContext *c, AVCodecContext *avctx, int mm_flags)
2492
#if HAVE_AVX_EXTERNAL
2493
const int bit_depth = avctx->bits_per_raw_sample;
2495
if (bit_depth == 10) {
2496
// AVX implies !cache64.
2497
// TODO: Port cache(32|64) detection from x264.
2498
if (CONFIG_H264QPEL) {
2499
H264_QPEL_FUNCS_10(1, 0, sse2);
2500
H264_QPEL_FUNCS_10(2, 0, sse2);
2501
H264_QPEL_FUNCS_10(3, 0, sse2);
2504
if (CONFIG_H264CHROMA) {
2505
c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_avx;
2506
c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_avx;
2509
c->butterflies_float_interleave = ff_butterflies_float_interleave_avx;
2510
c->vector_fmul_reverse = ff_vector_fmul_reverse_avx;
2511
c->vector_fmul_add = ff_vector_fmul_add_avx;
2512
#endif /* HAVE_AVX_EXTERNAL */
2515
void ff_dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx)
2434
2517
int mm_flags = av_get_cpu_flags();
2435
const int high_bit_depth = avctx->bits_per_raw_sample > 8;
2436
const int bit_depth = avctx->bits_per_raw_sample;
2438
if (avctx->dsp_mask) {
2439
if (avctx->dsp_mask & AV_CPU_FLAG_FORCE)
2440
mm_flags |= (avctx->dsp_mask & 0xffff);
2442
mm_flags &= ~(avctx->dsp_mask & 0xffff);
2446
av_log(avctx, AV_LOG_INFO, "libavcodec: CPU flags:");
2519
#if HAVE_7REGS && HAVE_INLINE_ASM
2520
if (mm_flags & AV_CPU_FLAG_CMOV)
2521
c->add_hfyu_median_prediction = add_hfyu_median_prediction_cmov;
2447
2524
if (mm_flags & AV_CPU_FLAG_MMX)
2448
av_log(avctx, AV_LOG_INFO, " mmx");
2449
if (mm_flags & AV_CPU_FLAG_MMX2)
2450
av_log(avctx, AV_LOG_INFO, " mmx2");
2525
dsputil_init_mmx(c, avctx, mm_flags);
2527
if (mm_flags & AV_CPU_FLAG_MMXEXT)
2528
dsputil_init_mmxext(c, avctx, mm_flags);
2451
2530
if (mm_flags & AV_CPU_FLAG_3DNOW)
2452
av_log(avctx, AV_LOG_INFO, " 3dnow");
2531
dsputil_init_3dnow(c, avctx, mm_flags);
2533
if (mm_flags & AV_CPU_FLAG_3DNOWEXT)
2534
dsputil_init_3dnowext(c, avctx, mm_flags);
2453
2536
if (mm_flags & AV_CPU_FLAG_SSE)
2454
av_log(avctx, AV_LOG_INFO, " sse");
2537
dsputil_init_sse(c, avctx, mm_flags);
2455
2539
if (mm_flags & AV_CPU_FLAG_SSE2)
2456
av_log(avctx, AV_LOG_INFO, " sse2");
2457
av_log(avctx, AV_LOG_INFO, "\n");
2460
if (mm_flags & AV_CPU_FLAG_MMX) {
2461
const int idct_algo= avctx->idct_algo;
2463
if (avctx->lowres == 0 && avctx->bits_per_raw_sample <= 8) {
2464
if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SIMPLEMMX){
2465
c->idct_put= ff_simple_idct_put_mmx;
2466
c->idct_add= ff_simple_idct_add_mmx;
2467
c->idct = ff_simple_idct_mmx;
2468
c->idct_permutation_type= FF_SIMPLE_IDCT_PERM;
2470
}else if(idct_algo==FF_IDCT_LIBMPEG2MMX){
2471
if(mm_flags & AV_CPU_FLAG_MMX2){
2472
c->idct_put= ff_libmpeg2mmx2_idct_put;
2473
c->idct_add= ff_libmpeg2mmx2_idct_add;
2474
c->idct = ff_mmxext_idct;
2476
c->idct_put= ff_libmpeg2mmx_idct_put;
2477
c->idct_add= ff_libmpeg2mmx_idct_add;
2478
c->idct = ff_mmx_idct;
2480
c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
2482
}else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER) &&
2483
idct_algo==FF_IDCT_VP3 && HAVE_YASM){
2484
if(mm_flags & AV_CPU_FLAG_SSE2){
2485
c->idct_put= ff_vp3_idct_put_sse2;
2486
c->idct_add= ff_vp3_idct_add_sse2;
2487
c->idct = ff_vp3_idct_sse2;
2488
c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
2490
c->idct_put= ff_vp3_idct_put_mmx;
2491
c->idct_add= ff_vp3_idct_add_mmx;
2492
c->idct = ff_vp3_idct_mmx;
2493
c->idct_permutation_type= FF_PARTTRANS_IDCT_PERM;
2495
}else if(idct_algo==FF_IDCT_CAVS){
2496
c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
2497
}else if(idct_algo==FF_IDCT_XVIDMMX){
2498
if(mm_flags & AV_CPU_FLAG_SSE2){
2499
c->idct_put= ff_idct_xvid_sse2_put;
2500
c->idct_add= ff_idct_xvid_sse2_add;
2501
c->idct = ff_idct_xvid_sse2;
2502
c->idct_permutation_type= FF_SSE2_IDCT_PERM;
2503
}else if(mm_flags & AV_CPU_FLAG_MMX2){
2504
c->idct_put= ff_idct_xvid_mmx2_put;
2505
c->idct_add= ff_idct_xvid_mmx2_add;
2506
c->idct = ff_idct_xvid_mmx2;
2508
c->idct_put= ff_idct_xvid_mmx_put;
2509
c->idct_add= ff_idct_xvid_mmx_add;
2510
c->idct = ff_idct_xvid_mmx;
2515
c->put_pixels_clamped = ff_put_pixels_clamped_mmx;
2516
c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
2517
c->add_pixels_clamped = ff_add_pixels_clamped_mmx;
2518
if (!high_bit_depth) {
2519
c->clear_block = clear_block_mmx;
2520
c->clear_blocks = clear_blocks_mmx;
2521
if ((mm_flags & AV_CPU_FLAG_SSE) &&
2522
!(CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)){
2523
/* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
2524
c->clear_block = clear_block_sse;
2525
c->clear_blocks = clear_blocks_sse;
2529
#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \
2530
c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## SIZE ## _ ## CPU; \
2531
c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \
2532
c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \
2533
c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU
2535
if (!high_bit_depth) {
2536
SET_HPEL_FUNCS(put, 0, 16, mmx);
2537
SET_HPEL_FUNCS(put_no_rnd, 0, 16, mmx);
2538
SET_HPEL_FUNCS(avg, 0, 16, mmx);
2539
SET_HPEL_FUNCS(avg_no_rnd, 0, 16, mmx);
2540
SET_HPEL_FUNCS(put, 1, 8, mmx);
2541
SET_HPEL_FUNCS(put_no_rnd, 1, 8, mmx);
2542
SET_HPEL_FUNCS(avg, 1, 8, mmx);
2543
SET_HPEL_FUNCS(avg_no_rnd, 1, 8, mmx);
2546
#if ARCH_X86_32 || !HAVE_YASM
2549
#if ARCH_X86_32 && HAVE_YASM
2550
if (!high_bit_depth)
2551
c->emulated_edge_mc = emulated_edge_mc_mmx;
2554
c->add_bytes= add_bytes_mmx;
2555
c->add_bytes_l2= add_bytes_l2_mmx;
2557
if (!high_bit_depth)
2558
c->draw_edges = draw_edges_mmx;
2560
if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2561
c->h263_v_loop_filter= h263_v_loop_filter_mmx;
2562
c->h263_h_loop_filter= h263_h_loop_filter_mmx;
2566
if (!high_bit_depth && CONFIG_H264CHROMA) {
2567
c->put_h264_chroma_pixels_tab[0]= ff_put_h264_chroma_mc8_mmx_rnd;
2568
c->put_h264_chroma_pixels_tab[1]= ff_put_h264_chroma_mc4_mmx;
2571
c->vector_clip_int32 = ff_vector_clip_int32_mmx;
2574
if (mm_flags & AV_CPU_FLAG_MMX2) {
2575
c->prefetch = prefetch_mmx2;
2577
if (!high_bit_depth) {
2578
c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
2579
c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2;
2581
c->avg_pixels_tab[0][0] = avg_pixels16_mmx2;
2582
c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2;
2583
c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2;
2585
c->put_pixels_tab[1][1] = put_pixels8_x2_mmx2;
2586
c->put_pixels_tab[1][2] = put_pixels8_y2_mmx2;
2588
c->avg_pixels_tab[1][0] = avg_pixels8_mmx2;
2589
c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
2590
c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;
2593
if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
2594
if (!high_bit_depth) {
2595
c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
2596
c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2;
2597
c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2;
2598
c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2;
2599
c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;
2600
c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
2603
if (CONFIG_VP3_DECODER && HAVE_YASM) {
2604
c->vp3_v_loop_filter= ff_vp3_v_loop_filter_mmx2;
2605
c->vp3_h_loop_filter= ff_vp3_h_loop_filter_mmx2;
2608
if (CONFIG_VP3_DECODER && HAVE_YASM) {
2609
c->vp3_idct_dc_add = ff_vp3_idct_dc_add_mmx2;
2612
if (CONFIG_VP3_DECODER
2613
&& (avctx->codec_id == CODEC_ID_VP3 || avctx->codec_id == CODEC_ID_THEORA)) {
2614
c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_mmx2;
2615
c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_mmx2;
2618
#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX) \
2619
c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
2620
c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
2621
c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
2622
c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \
2623
c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \
2624
c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \
2625
c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \
2626
c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \
2627
c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \
2628
c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \
2629
c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \
2630
c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \
2631
c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \
2632
c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \
2633
c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \
2634
c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU
2636
SET_QPEL_FUNCS(put_qpel, 0, 16, mmx2, );
2637
SET_QPEL_FUNCS(put_qpel, 1, 8, mmx2, );
2638
SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmx2, );
2639
SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmx2, );
2640
SET_QPEL_FUNCS(avg_qpel, 0, 16, mmx2, );
2641
SET_QPEL_FUNCS(avg_qpel, 1, 8, mmx2, );
2643
if (!high_bit_depth) {
2644
SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmx2, );
2645
SET_QPEL_FUNCS(put_h264_qpel, 1, 8, mmx2, );
2646
SET_QPEL_FUNCS(put_h264_qpel, 2, 4, mmx2, );
2647
SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmx2, );
2648
SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, mmx2, );
2649
SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, mmx2, );
2651
else if (bit_depth == 10) {
2654
SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_mmxext, ff_);
2655
SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_mmxext, ff_);
2656
SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 10_mmxext, ff_);
2657
SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 10_mmxext, ff_);
2659
SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 10_mmxext, ff_);
2660
SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 10_mmxext, ff_);
2664
SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, mmx2, );
2665
SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, mmx2, );
2666
SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, mmx2, );
2667
SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, mmx2, );
2670
if (!high_bit_depth && CONFIG_H264CHROMA) {
2671
c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_mmx2_rnd;
2672
c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_mmx2;
2673
c->avg_h264_chroma_pixels_tab[2]= ff_avg_h264_chroma_mc2_mmx2;
2674
c->put_h264_chroma_pixels_tab[2]= ff_put_h264_chroma_mc2_mmx2;
2676
if (bit_depth == 10 && CONFIG_H264CHROMA) {
2677
c->put_h264_chroma_pixels_tab[2]= ff_put_h264_chroma_mc2_10_mmxext;
2678
c->avg_h264_chroma_pixels_tab[2]= ff_avg_h264_chroma_mc2_10_mmxext;
2679
c->put_h264_chroma_pixels_tab[1]= ff_put_h264_chroma_mc4_10_mmxext;
2680
c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_10_mmxext;
2683
c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmx2;
2686
if (HAVE_AMD3DNOW && (mm_flags & AV_CPU_FLAG_3DNOW))
2687
c->add_hfyu_median_prediction = add_hfyu_median_prediction_cmov;
2690
c->add_png_paeth_prediction= add_png_paeth_prediction_mmx2;
2691
} else if (HAVE_AMD3DNOW && (mm_flags & AV_CPU_FLAG_3DNOW)) {
2692
c->prefetch = prefetch_3dnow;
2694
if (!high_bit_depth) {
2695
c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
2696
c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
2698
c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
2699
c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
2700
c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
2702
c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
2703
c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow;
2705
c->avg_pixels_tab[1][0] = avg_pixels8_3dnow;
2706
c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
2707
c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;
2709
if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
2710
c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
2711
c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
2712
c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
2713
c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;
2714
c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
2715
c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
2719
if (CONFIG_VP3_DECODER
2720
&& (avctx->codec_id == CODEC_ID_VP3 || avctx->codec_id == CODEC_ID_THEORA)) {
2721
c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_3dnow;
2722
c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_3dnow;
2725
SET_QPEL_FUNCS(put_qpel, 0, 16, 3dnow, );
2726
SET_QPEL_FUNCS(put_qpel, 1, 8, 3dnow, );
2727
SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, 3dnow, );
2728
SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, 3dnow, );
2729
SET_QPEL_FUNCS(avg_qpel, 0, 16, 3dnow, );
2730
SET_QPEL_FUNCS(avg_qpel, 1, 8, 3dnow, );
2732
if (!high_bit_depth) {
2733
SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 3dnow, );
2734
SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 3dnow, );
2735
SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 3dnow, );
2736
SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 3dnow, );
2737
SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 3dnow, );
2738
SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 3dnow, );
2741
SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, 3dnow, );
2742
SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, 3dnow, );
2743
SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, 3dnow, );
2744
SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, 3dnow, );
2747
if (!high_bit_depth && CONFIG_H264CHROMA) {
2748
c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_3dnow_rnd;
2749
c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_3dnow;
2756
#define H264_QPEL_FUNCS(x, y, CPU)\
2757
c->put_h264_qpel_pixels_tab[0][x+y*4] = put_h264_qpel16_mc##x##y##_##CPU;\
2758
c->put_h264_qpel_pixels_tab[1][x+y*4] = put_h264_qpel8_mc##x##y##_##CPU;\
2759
c->avg_h264_qpel_pixels_tab[0][x+y*4] = avg_h264_qpel16_mc##x##y##_##CPU;\
2760
c->avg_h264_qpel_pixels_tab[1][x+y*4] = avg_h264_qpel8_mc##x##y##_##CPU;
2761
if((mm_flags & AV_CPU_FLAG_SSE2) && !(mm_flags & AV_CPU_FLAG_3DNOW)){
2762
// these functions are slower than mmx on AMD, but faster on Intel
2763
if (!high_bit_depth) {
2764
c->put_pixels_tab[0][0] = put_pixels16_sse2;
2765
c->put_no_rnd_pixels_tab[0][0] = put_pixels16_sse2;
2766
c->avg_pixels_tab[0][0] = avg_pixels16_sse2;
2767
H264_QPEL_FUNCS(0, 0, sse2);
2770
if(mm_flags & AV_CPU_FLAG_SSE2){
2771
if (!high_bit_depth) {
2772
H264_QPEL_FUNCS(0, 1, sse2);
2773
H264_QPEL_FUNCS(0, 2, sse2);
2774
H264_QPEL_FUNCS(0, 3, sse2);
2775
H264_QPEL_FUNCS(1, 1, sse2);
2776
H264_QPEL_FUNCS(1, 2, sse2);
2777
H264_QPEL_FUNCS(1, 3, sse2);
2778
H264_QPEL_FUNCS(2, 1, sse2);
2779
H264_QPEL_FUNCS(2, 2, sse2);
2780
H264_QPEL_FUNCS(2, 3, sse2);
2781
H264_QPEL_FUNCS(3, 1, sse2);
2782
H264_QPEL_FUNCS(3, 2, sse2);
2783
H264_QPEL_FUNCS(3, 3, sse2);
2786
#define H264_QPEL_FUNCS_10(x, y, CPU)\
2787
c->put_h264_qpel_pixels_tab[0][x+y*4] = ff_put_h264_qpel16_mc##x##y##_10_##CPU;\
2788
c->put_h264_qpel_pixels_tab[1][x+y*4] = ff_put_h264_qpel8_mc##x##y##_10_##CPU;\
2789
c->avg_h264_qpel_pixels_tab[0][x+y*4] = ff_avg_h264_qpel16_mc##x##y##_10_##CPU;\
2790
c->avg_h264_qpel_pixels_tab[1][x+y*4] = ff_avg_h264_qpel8_mc##x##y##_10_##CPU;
2791
if (bit_depth == 10) {
2792
SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_sse2, ff_);
2793
SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 10_sse2, ff_);
2794
SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_sse2, ff_);
2795
SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 10_sse2, ff_);
2796
H264_QPEL_FUNCS_10(1, 0, sse2_cache64)
2797
H264_QPEL_FUNCS_10(2, 0, sse2_cache64)
2798
H264_QPEL_FUNCS_10(3, 0, sse2_cache64)
2800
if (CONFIG_H264CHROMA) {
2801
c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_sse2;
2802
c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_sse2;
2808
if(mm_flags & AV_CPU_FLAG_SSSE3){
2809
if (!high_bit_depth) {
2810
H264_QPEL_FUNCS(1, 0, ssse3);
2811
H264_QPEL_FUNCS(1, 1, ssse3);
2812
H264_QPEL_FUNCS(1, 2, ssse3);
2813
H264_QPEL_FUNCS(1, 3, ssse3);
2814
H264_QPEL_FUNCS(2, 0, ssse3);
2815
H264_QPEL_FUNCS(2, 1, ssse3);
2816
H264_QPEL_FUNCS(2, 2, ssse3);
2817
H264_QPEL_FUNCS(2, 3, ssse3);
2818
H264_QPEL_FUNCS(3, 0, ssse3);
2819
H264_QPEL_FUNCS(3, 1, ssse3);
2820
H264_QPEL_FUNCS(3, 2, ssse3);
2821
H264_QPEL_FUNCS(3, 3, ssse3);
2824
else if (bit_depth == 10) {
2825
H264_QPEL_FUNCS_10(1, 0, ssse3_cache64)
2826
H264_QPEL_FUNCS_10(2, 0, ssse3_cache64)
2827
H264_QPEL_FUNCS_10(3, 0, ssse3_cache64)
2830
c->add_png_paeth_prediction= add_png_paeth_prediction_ssse3;
2832
if (!high_bit_depth && CONFIG_H264CHROMA) {
2833
c->put_h264_chroma_pixels_tab[0]= ff_put_h264_chroma_mc8_ssse3_rnd;
2834
c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_ssse3_rnd;
2835
c->put_h264_chroma_pixels_tab[1]= ff_put_h264_chroma_mc4_ssse3;
2836
c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_ssse3;
2838
c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_ssse3;
2839
if (mm_flags & AV_CPU_FLAG_SSE4) // not really sse4, just slow on Conroe
2840
c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4;
2845
if (HAVE_AMD3DNOW && (mm_flags & AV_CPU_FLAG_3DNOW)) {
2846
c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
2847
c->vector_fmul = vector_fmul_3dnow;
2849
if (HAVE_AMD3DNOWEXT && (mm_flags & AV_CPU_FLAG_3DNOWEXT)) {
2850
c->vector_fmul_reverse = vector_fmul_reverse_3dnow2;
2852
c->vector_fmul_window = vector_fmul_window_3dnow2;
2855
if(mm_flags & AV_CPU_FLAG_MMX2){
2857
c->scalarproduct_int16 = ff_scalarproduct_int16_mmx2;
2858
c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmx2;
2859
if (avctx->flags & CODEC_FLAG_BITEXACT) {
2860
c->apply_window_int16 = ff_apply_window_int16_mmxext_ba;
2862
c->apply_window_int16 = ff_apply_window_int16_mmxext;
2866
if(mm_flags & AV_CPU_FLAG_SSE){
2867
c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
2868
c->ac3_downmix = ac3_downmix_sse;
2869
c->vector_fmul = vector_fmul_sse;
2870
c->vector_fmul_reverse = vector_fmul_reverse_sse;
2871
c->vector_fmul_add = vector_fmul_add_sse;
2873
c->vector_fmul_window = vector_fmul_window_sse;
2875
c->vector_clipf = vector_clipf_sse;
2877
c->scalarproduct_float = ff_scalarproduct_float_sse;
2878
c->butterflies_float_interleave = ff_butterflies_float_interleave_sse;
2880
if (!high_bit_depth)
2881
c->emulated_edge_mc = emulated_edge_mc_sse;
2885
if (HAVE_AMD3DNOW && (mm_flags & AV_CPU_FLAG_3DNOW))
2886
c->vector_fmul_add = vector_fmul_add_3dnow; // faster than sse
2887
if(mm_flags & AV_CPU_FLAG_SSE2){
2889
c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
2890
c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
2891
if (mm_flags & AV_CPU_FLAG_ATOM) {
2892
c->vector_clip_int32 = ff_vector_clip_int32_int_sse2;
2894
c->vector_clip_int32 = ff_vector_clip_int32_sse2;
2896
if (avctx->flags & CODEC_FLAG_BITEXACT) {
2897
c->apply_window_int16 = ff_apply_window_int16_sse2_ba;
2899
if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
2900
c->apply_window_int16 = ff_apply_window_int16_sse2;
2905
if (mm_flags & AV_CPU_FLAG_SSSE3) {
2907
if (mm_flags & AV_CPU_FLAG_ATOM) {
2908
c->apply_window_int16 = ff_apply_window_int16_ssse3_atom;
2910
c->apply_window_int16 = ff_apply_window_int16_ssse3;
2912
if (!(mm_flags & (AV_CPU_FLAG_SSE42|AV_CPU_FLAG_3DNOW))) { // cachesplit
2913
c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
2918
if (mm_flags & AV_CPU_FLAG_SSE4 && HAVE_SSE) {
2920
c->vector_clip_int32 = ff_vector_clip_int32_sse4;
2924
#if HAVE_AVX && HAVE_YASM
2925
if (mm_flags & AV_CPU_FLAG_AVX) {
2926
if (bit_depth == 10) {
2927
//AVX implies !cache64.
2928
//TODO: Port cache(32|64) detection from x264.
2929
H264_QPEL_FUNCS_10(1, 0, sse2)
2930
H264_QPEL_FUNCS_10(2, 0, sse2)
2931
H264_QPEL_FUNCS_10(3, 0, sse2)
2933
if (CONFIG_H264CHROMA) {
2934
c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_avx;
2935
c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_avx;
2938
c->butterflies_float_interleave = ff_butterflies_float_interleave_avx;
2540
dsputil_init_sse2(c, avctx, mm_flags);
2542
if (mm_flags & AV_CPU_FLAG_SSSE3)
2543
dsputil_init_ssse3(c, avctx, mm_flags);
2545
if (mm_flags & AV_CPU_FLAG_SSE4)
2546
dsputil_init_sse4(c, avctx, mm_flags);
2548
if (mm_flags & AV_CPU_FLAG_AVX)
2549
dsputil_init_avx(c, avctx, mm_flags);
2943
2551
if (CONFIG_ENCODERS)
2944
dsputilenc_init_mmx(c, avctx);
2552
ff_dsputilenc_init_mmx(c, avctx);