288
/* Short alias for AV_TOSTRING. The asm strings below use it to splice the
 * SHIFT_FRW_COL / SHIFT_FRW_ROW constants into immediate operands
 * (e.g. "psllw $"S(SHIFT_FRW_COL)", ..."). */
#define S(s) AV_TOSTRING(s) //AV_STRINGIFY is too long
289
290
/**
 * FDCT_COL(cpu, mm, mov) defines fdct_col_<cpu>(in, out, offset).
 *
 * Macro parameters:
 *   cpu - suffix pasted onto the generated function name (fdct_col_##cpu).
 *   mm  - register-name prefix; the body uses mm##0 .. mm##7.
 *   mov - move mnemonic used for every load, store and register copy
 *         (mov##_m2r / mov##_r2m / mov##_r2r, or #mov in the asm string).
 *
 * Generated function:
 *   in, out - int16_t buffers; only the eight locations
 *             in/out + offset + k * 8 (k = 0..7) are touched, each as one
 *             full register-wide access.
 *   offset  - element offset added to both in and out.
 *
 * What the instruction stream does:
 *   - Forms saturating sums and differences (paddsw / psubsw) of the input
 *     pairs 0/7, 1/6, 2/5 and 3/4.
 *   - Scales them with psllw by SHIFT_FRW_COL; the 1/6 and 2/5 differences
 *     are shifted by SHIFT_FRW_COL + 1 instead.
 *   - Multiplies with pmulhw against fdct_tg_all_16 (element offsets 0, 8
 *     and 16) and ocos_4_16.
 *   - ORs fdct_one_corr into the values stored at output positions 1, 2 and
 *     6, and into one intermediate (mm##2), before they are used.
 *   - Stores all eight results; output positions are written in the order
 *     2, 4, 0, 6, 1, 3, 5, 7.
 *
 * The instruction order is interleaved (loads and stores sit between
 * arithmetic steps on other registers). Registers are reused heavily, so
 * statements must not be reordered casually.
 *
 * NOTE(review): this span holds two renderings of the same instruction
 * sequence, and the bare integers between lines look like line-number
 * residue from a diff view. First comes a list of mmx.h-style macro calls,
 * then one inline-asm string. In the asm string the operands are
 *   %0 = in + offset, %1 = fdct_tg_all_16, %2 = fdct_one_corr,
 *   %3 = out + offset, %4 = ocos_4_16
 * and displacements are in bytes, i.e. twice the element offsets used by the
 * macro calls (in + offset + 1 * 8 <-> 16(%0), fdct_tg_all_16 + 8 <-> 16(%1)).
 * Function braces, the asm statement opener and any output/clobber lists are
 * not visible here - confirm against the real file before editing.
 *
 * NOTE(review): "$"S(SHIFT_FRW_COL)"+1" pastes the literal text "+1" after
 * the stringified constant, so the sum is presumably evaluated by the
 * assembler rather than by the C compiler - confirm.
 */
#define FDCT_COL(cpu, mm, mov)\
290
291
static av_always_inline void fdct_col_##cpu(const int16_t *in, int16_t *out, int offset)\
292
mov##_m2r(*(in + offset + 1 * 8), mm##0);\
293
mov##_m2r(*(in + offset + 6 * 8), mm##1);\
294
mov##_r2r(mm##0, mm##2);\
295
mov##_m2r(*(in + offset + 2 * 8), mm##3);\
296
paddsw_r2r(mm##1, mm##0);\
297
mov##_m2r(*(in + offset + 5 * 8), mm##4);\
298
psllw_i2r(SHIFT_FRW_COL, mm##0);\
299
mov##_m2r(*(in + offset + 0 * 8), mm##5);\
300
paddsw_r2r(mm##3, mm##4);\
301
paddsw_m2r(*(in + offset + 7 * 8), mm##5);\
302
psllw_i2r(SHIFT_FRW_COL, mm##4);\
303
mov##_r2r(mm##0, mm##6);\
304
psubsw_r2r(mm##1, mm##2);\
305
mov##_m2r(*(fdct_tg_all_16 + 8), mm##1);\
306
psubsw_r2r(mm##4, mm##0);\
307
mov##_m2r(*(in + offset + 3 * 8), mm##7);\
308
pmulhw_r2r(mm##0, mm##1);\
309
paddsw_m2r(*(in + offset + 4 * 8), mm##7);\
310
psllw_i2r(SHIFT_FRW_COL, mm##5);\
311
paddsw_r2r(mm##4, mm##6);\
312
psllw_i2r(SHIFT_FRW_COL, mm##7);\
313
mov##_r2r(mm##5, mm##4);\
314
psubsw_r2r(mm##7, mm##5);\
315
paddsw_r2r(mm##5, mm##1);\
316
paddsw_r2r(mm##7, mm##4);\
317
por_m2r(*fdct_one_corr, mm##1);\
318
psllw_i2r(SHIFT_FRW_COL + 1, mm##2);\
319
pmulhw_m2r(*(fdct_tg_all_16 + 8), mm##5);\
320
mov##_r2r(mm##4, mm##7);\
321
psubsw_m2r(*(in + offset + 5 * 8), mm##3);\
322
psubsw_r2r(mm##6, mm##4);\
323
mov##_r2m(mm##1, *(out + offset + 2 * 8));\
324
paddsw_r2r(mm##6, mm##7);\
325
mov##_m2r(*(in + offset + 3 * 8), mm##1);\
326
psllw_i2r(SHIFT_FRW_COL + 1, mm##3);\
327
psubsw_m2r(*(in + offset + 4 * 8), mm##1);\
328
mov##_r2r(mm##2, mm##6);\
329
mov##_r2m(mm##4, *(out + offset + 4 * 8));\
330
paddsw_r2r(mm##3, mm##2);\
331
pmulhw_m2r(*ocos_4_16, mm##2);\
332
psubsw_r2r(mm##3, mm##6);\
333
pmulhw_m2r(*ocos_4_16, mm##6);\
334
psubsw_r2r(mm##0, mm##5);\
335
por_m2r(*fdct_one_corr, mm##5);\
336
psllw_i2r(SHIFT_FRW_COL, mm##1);\
337
por_m2r(*fdct_one_corr, mm##2);\
338
mov##_r2r(mm##1, mm##4);\
339
mov##_m2r(*(in + offset + 0 * 8), mm##3);\
340
paddsw_r2r(mm##6, mm##1);\
341
psubsw_m2r(*(in + offset + 7 * 8), mm##3);\
342
psubsw_r2r(mm##6, mm##4);\
343
mov##_m2r(*(fdct_tg_all_16 + 0), mm##0);\
344
psllw_i2r(SHIFT_FRW_COL, mm##3);\
345
mov##_m2r(*(fdct_tg_all_16 + 16), mm##6);\
346
pmulhw_r2r(mm##1, mm##0);\
347
mov##_r2m(mm##7, *(out + offset + 0 * 8));\
348
pmulhw_r2r(mm##4, mm##6);\
349
mov##_r2m(mm##5, *(out + offset + 6 * 8));\
350
mov##_r2r(mm##3, mm##7);\
351
mov##_m2r(*(fdct_tg_all_16 + 16), mm##5);\
352
psubsw_r2r(mm##2, mm##7);\
353
paddsw_r2r(mm##2, mm##3);\
354
pmulhw_r2r(mm##7, mm##5);\
355
paddsw_r2r(mm##3, mm##0);\
356
paddsw_r2r(mm##4, mm##6);\
357
pmulhw_m2r(*(fdct_tg_all_16 + 0), mm##3);\
358
por_m2r(*fdct_one_corr, mm##0);\
359
paddsw_r2r(mm##7, mm##5);\
360
psubsw_r2r(mm##6, mm##7);\
361
mov##_r2m(mm##0, *(out + offset + 1 * 8));\
362
paddsw_r2r(mm##4, mm##5);\
363
mov##_r2m(mm##7, *(out + offset + 3 * 8));\
364
psubsw_r2r(mm##1, mm##3);\
365
mov##_r2m(mm##5, *(out + offset + 5 * 8));\
366
mov##_r2m(mm##3, *(out + offset + 7 * 8));\
294
#mov" 16(%0), %%"#mm"0 \n\t" \
295
#mov" 96(%0), %%"#mm"1 \n\t" \
296
#mov" %%"#mm"0, %%"#mm"2 \n\t" \
297
#mov" 32(%0), %%"#mm"3 \n\t" \
298
"paddsw %%"#mm"1, %%"#mm"0 \n\t" \
299
#mov" 80(%0), %%"#mm"4 \n\t" \
300
"psllw $"S(SHIFT_FRW_COL)", %%"#mm"0 \n\t" \
301
#mov" (%0), %%"#mm"5 \n\t" \
302
"paddsw %%"#mm"3, %%"#mm"4 \n\t" \
303
"paddsw 112(%0), %%"#mm"5 \n\t" \
304
"psllw $"S(SHIFT_FRW_COL)", %%"#mm"4 \n\t" \
305
#mov" %%"#mm"0, %%"#mm"6 \n\t" \
306
"psubsw %%"#mm"1, %%"#mm"2 \n\t" \
307
#mov" 16(%1), %%"#mm"1 \n\t" \
308
"psubsw %%"#mm"4, %%"#mm"0 \n\t" \
309
#mov" 48(%0), %%"#mm"7 \n\t" \
310
"pmulhw %%"#mm"0, %%"#mm"1 \n\t" \
311
"paddsw 64(%0), %%"#mm"7 \n\t" \
312
"psllw $"S(SHIFT_FRW_COL)", %%"#mm"5 \n\t" \
313
"paddsw %%"#mm"4, %%"#mm"6 \n\t" \
314
"psllw $"S(SHIFT_FRW_COL)", %%"#mm"7 \n\t" \
315
#mov" %%"#mm"5, %%"#mm"4 \n\t" \
316
"psubsw %%"#mm"7, %%"#mm"5 \n\t" \
317
"paddsw %%"#mm"5, %%"#mm"1 \n\t" \
318
"paddsw %%"#mm"7, %%"#mm"4 \n\t" \
319
"por (%2), %%"#mm"1 \n\t" \
320
"psllw $"S(SHIFT_FRW_COL)"+1, %%"#mm"2 \n\t" \
321
"pmulhw 16(%1), %%"#mm"5 \n\t" \
322
#mov" %%"#mm"4, %%"#mm"7 \n\t" \
323
"psubsw 80(%0), %%"#mm"3 \n\t" \
324
"psubsw %%"#mm"6, %%"#mm"4 \n\t" \
325
#mov" %%"#mm"1, 32(%3) \n\t" \
326
"paddsw %%"#mm"6, %%"#mm"7 \n\t" \
327
#mov" 48(%0), %%"#mm"1 \n\t" \
328
"psllw $"S(SHIFT_FRW_COL)"+1, %%"#mm"3 \n\t" \
329
"psubsw 64(%0), %%"#mm"1 \n\t" \
330
#mov" %%"#mm"2, %%"#mm"6 \n\t" \
331
#mov" %%"#mm"4, 64(%3) \n\t" \
332
"paddsw %%"#mm"3, %%"#mm"2 \n\t" \
333
"pmulhw (%4), %%"#mm"2 \n\t" \
334
"psubsw %%"#mm"3, %%"#mm"6 \n\t" \
335
"pmulhw (%4), %%"#mm"6 \n\t" \
336
"psubsw %%"#mm"0, %%"#mm"5 \n\t" \
337
"por (%2), %%"#mm"5 \n\t" \
338
"psllw $"S(SHIFT_FRW_COL)", %%"#mm"1 \n\t" \
339
"por (%2), %%"#mm"2 \n\t" \
340
#mov" %%"#mm"1, %%"#mm"4 \n\t" \
341
#mov" (%0), %%"#mm"3 \n\t" \
342
"paddsw %%"#mm"6, %%"#mm"1 \n\t" \
343
"psubsw 112(%0), %%"#mm"3 \n\t" \
344
"psubsw %%"#mm"6, %%"#mm"4 \n\t" \
345
#mov" (%1), %%"#mm"0 \n\t" \
346
"psllw $"S(SHIFT_FRW_COL)", %%"#mm"3 \n\t" \
347
#mov" 32(%1), %%"#mm"6 \n\t" \
348
"pmulhw %%"#mm"1, %%"#mm"0 \n\t" \
349
#mov" %%"#mm"7, (%3) \n\t" \
350
"pmulhw %%"#mm"4, %%"#mm"6 \n\t" \
351
#mov" %%"#mm"5, 96(%3) \n\t" \
352
#mov" %%"#mm"3, %%"#mm"7 \n\t" \
353
#mov" 32(%1), %%"#mm"5 \n\t" \
354
"psubsw %%"#mm"2, %%"#mm"7 \n\t" \
355
"paddsw %%"#mm"2, %%"#mm"3 \n\t" \
356
"pmulhw %%"#mm"7, %%"#mm"5 \n\t" \
357
"paddsw %%"#mm"3, %%"#mm"0 \n\t" \
358
"paddsw %%"#mm"4, %%"#mm"6 \n\t" \
359
"pmulhw (%1), %%"#mm"3 \n\t" \
360
"por (%2), %%"#mm"0 \n\t" \
361
"paddsw %%"#mm"7, %%"#mm"5 \n\t" \
362
"psubsw %%"#mm"6, %%"#mm"7 \n\t" \
363
#mov" %%"#mm"0, 16(%3) \n\t" \
364
"paddsw %%"#mm"4, %%"#mm"5 \n\t" \
365
#mov" %%"#mm"7, 48(%3) \n\t" \
366
"psubsw %%"#mm"1, %%"#mm"3 \n\t" \
367
#mov" %%"#mm"5, 80(%3) \n\t" \
368
#mov" %%"#mm"3, 112(%3) \n\t" \
370
: "r" (in + offset), "r" (fdct_tg_all_16), "r" (fdct_one_corr), \
371
"r" (out + offset), "r" (ocos_4_16)); \
369
374
/* Instantiate fdct_col_mmx: register names mm0..mm7, movq for all moves. */
FDCT_COL(mmx, mm, movq)
434
439
/**
 * Transform one row of eight int16_t values, using pshufw.
 *
 * @param in    row input; in[0..7] are read.
 * @param out   row output; out[0..3] and out[4..7] are each written with
 *              one movq.
 * @param table coefficient table; eight quadwords are read at element
 *              offsets 0, 4, ..., 28 (byte offsets 0..56 in the asm string).
 *
 * Steps, as spelled out by the asm-string listing:
 *   1. pshufw $0x1B loads in[4..7] with the four words in reversed order.
 *   2. mm0 = in[0..3] + reversed, mm1 = in[0..3] - reversed (saturating).
 *   3. punpckldq / punpckhdq regroup the sums and differences into mm0 / mm2.
 *   4. Eight pmaddwd products against the table are combined pairwise with
 *      paddd into four dword accumulators (mm3, mm5, mm7, mm2).
 *   5. The rounding constant at fdct_r_row is added, each accumulator is
 *      shifted right arithmetically by SHIFT_FRW_ROW, and packssdw narrows
 *      the results to int16_t with saturation before the two stores.
 *
 * NOTE(review): this span holds two renderings of the same routine (macro
 * calls first, then asm strings with %0 = in, %1 = table, %2 = fdct_r_row,
 * %3 = out); the bare integers look like line-number residue from a diff
 * view. Function braces and the asm statement opener are not visible here -
 * confirm against the real file.
 */
static av_always_inline void fdct_row_mmx2(const int16_t *in, int16_t *out, const int16_t *table)
436
// NOTE(review): compared with the asm-string listing further down, this
// macro-call listing has no register copies (mm0 -> mm1, mm0 -> mm2) and no
// paddd steps; the residue numbering also jumps (437 -> 439, 440 -> 442,
// 457 -> 462). Those lines were presumably dropped from this view - confirm.
pshufw_m2r(*(in + 4), mm5, 0x1B);
437
movq_m2r(*(in + 0), mm0);
439
paddsw_r2r(mm5, mm0);
440
psubsw_r2r(mm5, mm1);
442
punpckldq_r2r(mm1, mm0);
443
punpckhdq_r2r(mm1, mm2);
444
movq_m2r(*(table + 0), mm1);
445
movq_m2r(*(table + 4), mm3);
446
movq_m2r(*(table + 8), mm4);
447
movq_m2r(*(table + 12), mm5);
448
movq_m2r(*(table + 16), mm6);
449
movq_m2r(*(table + 20), mm7);
450
pmaddwd_r2r(mm0, mm1);
451
pmaddwd_r2r(mm2, mm3);
452
pmaddwd_r2r(mm0, mm4);
453
pmaddwd_r2r(mm2, mm5);
454
pmaddwd_r2r(mm0, mm6);
455
pmaddwd_r2r(mm2, mm7);
456
pmaddwd_m2r(*(table + 24), mm0);
457
pmaddwd_m2r(*(table + 28), mm2);
462
movq_m2r(*fdct_r_row, mm0);
467
psrad_i2r(SHIFT_FRW_ROW, mm3);
468
psrad_i2r(SHIFT_FRW_ROW, mm5);
469
psrad_i2r(SHIFT_FRW_ROW, mm7);
470
psrad_i2r(SHIFT_FRW_ROW, mm2);
471
packssdw_r2r(mm5, mm3);
472
packssdw_r2r(mm2, mm7);
473
movq_r2m(mm3, *(out + 0));
474
movq_r2m(mm7, *(out + 4));
442
// Asm-string rendering of the same routine; displacements are in bytes.
"pshufw $0x1B, 8(%0), %%mm5 \n\t"
443
"movq (%0), %%mm0 \n\t"
444
"movq %%mm0, %%mm1 \n\t"
445
"paddsw %%mm5, %%mm0 \n\t"
446
"psubsw %%mm5, %%mm1 \n\t"
447
"movq %%mm0, %%mm2 \n\t"
448
"punpckldq %%mm1, %%mm0 \n\t"
449
"punpckhdq %%mm1, %%mm2 \n\t"
450
"movq (%1), %%mm1 \n\t"
451
"movq 8(%1), %%mm3 \n\t"
452
"movq 16(%1), %%mm4 \n\t"
453
"movq 24(%1), %%mm5 \n\t"
454
"movq 32(%1), %%mm6 \n\t"
455
"movq 40(%1), %%mm7 \n\t"
456
"pmaddwd %%mm0, %%mm1 \n\t"
457
"pmaddwd %%mm2, %%mm3 \n\t"
458
"pmaddwd %%mm0, %%mm4 \n\t"
459
"pmaddwd %%mm2, %%mm5 \n\t"
460
"pmaddwd %%mm0, %%mm6 \n\t"
461
"pmaddwd %%mm2, %%mm7 \n\t"
462
"pmaddwd 48(%1), %%mm0 \n\t"
463
"pmaddwd 56(%1), %%mm2 \n\t"
464
"paddd %%mm1, %%mm3 \n\t"
465
"paddd %%mm4, %%mm5 \n\t"
466
"paddd %%mm6, %%mm7 \n\t"
467
"paddd %%mm0, %%mm2 \n\t"
468
"movq (%2), %%mm0 \n\t"
469
"paddd %%mm0, %%mm3 \n\t"
470
"paddd %%mm0, %%mm5 \n\t"
471
"paddd %%mm0, %%mm7 \n\t"
472
"paddd %%mm0, %%mm2 \n\t"
473
"psrad $"S(SHIFT_FRW_ROW)", %%mm3 \n\t"
474
"psrad $"S(SHIFT_FRW_ROW)", %%mm5 \n\t"
475
"psrad $"S(SHIFT_FRW_ROW)", %%mm7 \n\t"
476
"psrad $"S(SHIFT_FRW_ROW)", %%mm2 \n\t"
477
"packssdw %%mm5, %%mm3 \n\t"
478
"packssdw %%mm2, %%mm7 \n\t"
479
"movq %%mm3, (%3) \n\t"
480
"movq %%mm7, 8(%3) \n\t"
482
: "r" (in), "r" (table), "r" (fdct_r_row), "r" (out));
477
485
/**
 * Transform one row of eight int16_t values using only movd / punpcklwd /
 * psrlq shuffles (no pshufw).
 *
 * @param in    row input; in[0..7] are read.
 * @param out   row output; out[0..3] and out[4..7] are each written with
 *              one movq.
 * @param table coefficient table; eight quadwords are read at element
 *              offsets 0, 4, ..., 28 (byte offsets 0..56 in the asm string).
 *
 * Steps, as spelled out by the asm-string listing:
 *   1. Build in[4..7] in reversed word order in mm1:
 *        movd in+6          -> mm1 = {in6, in7, 0, 0}
 *        punpcklwd in+4     -> mm1 = {in6, in4, in7, in5}, copied to mm2
 *        psrlq $0x20        -> mm1 = {in7, in5, 0, 0}
 *        punpcklwd mm2, mm1 -> mm1 = {in7, in6, in5, in4}
 *      (words listed from least to most significant).
 *   2. mm0 = in[0..3] + reversed, mm5 = in[0..3] - reversed (saturating).
 *   3. From the punpckldq / punpckhdq regrouping onwards the sequence is the
 *      same as in fdct_row_mmx2, with mm5 holding the differences instead of
 *      mm1: eight pmaddwd products, pairwise paddd, add the rounding
 *      constant at fdct_r_row, psrad by SHIFT_FRW_ROW, packssdw, two stores.
 *
 * NOTE(review): this span holds two renderings of the same routine (macro
 * calls first, then asm strings with %0 = in, %1 = table, %2 = fdct_r_row,
 * %3 = out); the bare integers look like line-number residue from a diff
 * view. Function braces and the asm statement opener are not visible here -
 * confirm against the real file.
 */
static av_always_inline void fdct_row_mmx(const int16_t *in, int16_t *out, const int16_t *table)
479
//FIXME reorder (I do not have an old MMX-only CPU here to benchmark ...)
480
// NOTE(review): compared with the asm-string listing further down, this
// macro-call listing has no register copies (mm1 -> mm2, mm0 -> mm5,
// mm0 -> mm2) and no paddd steps; the residue numbering also jumps
// (481 -> 483, 485 -> 487, 505 -> 510). Presumably dropped from this view -
// confirm.
movd_m2r(*(in + 6), mm1);
481
punpcklwd_m2r(*(in + 4), mm1);
483
psrlq_i2r(0x20, mm1);
484
movq_m2r(*(in + 0), mm0);
485
punpcklwd_r2r(mm2, mm1);
487
paddsw_r2r(mm1, mm0);
488
psubsw_r2r(mm1, mm5);
490
punpckldq_r2r(mm5, mm0);
491
punpckhdq_r2r(mm5, mm2);
492
movq_m2r(*(table + 0), mm1);
493
movq_m2r(*(table + 4), mm3);
494
movq_m2r(*(table + 8), mm4);
495
movq_m2r(*(table + 12), mm5);
496
movq_m2r(*(table + 16), mm6);
497
movq_m2r(*(table + 20), mm7);
498
pmaddwd_r2r(mm0, mm1);
499
pmaddwd_r2r(mm2, mm3);
500
pmaddwd_r2r(mm0, mm4);
501
pmaddwd_r2r(mm2, mm5);
502
pmaddwd_r2r(mm0, mm6);
503
pmaddwd_r2r(mm2, mm7);
504
pmaddwd_m2r(*(table + 24), mm0);
505
pmaddwd_m2r(*(table + 28), mm2);
510
movq_m2r(*fdct_r_row, mm0);
515
psrad_i2r(SHIFT_FRW_ROW, mm3);
516
psrad_i2r(SHIFT_FRW_ROW, mm5);
517
psrad_i2r(SHIFT_FRW_ROW, mm7);
518
psrad_i2r(SHIFT_FRW_ROW, mm2);
519
packssdw_r2r(mm5, mm3);
520
packssdw_r2r(mm2, mm7);
521
movq_r2m(mm3, *(out + 0));
522
movq_r2m(mm7, *(out + 4));
487
//FIXME reorder (I do not have an old MMX-only CPU here to benchmark ...)
489
// Asm-string rendering of the same routine; displacements are in bytes.
"movd 12(%0), %%mm1 \n\t"
490
"punpcklwd 8(%0), %%mm1 \n\t"
491
"movq %%mm1, %%mm2 \n\t"
492
"psrlq $0x20, %%mm1 \n\t"
493
"movq 0(%0), %%mm0 \n\t"
494
"punpcklwd %%mm2, %%mm1 \n\t"
495
"movq %%mm0, %%mm5 \n\t"
496
"paddsw %%mm1, %%mm0 \n\t"
497
"psubsw %%mm1, %%mm5 \n\t"
498
"movq %%mm0, %%mm2 \n\t"
499
"punpckldq %%mm5, %%mm0 \n\t"
500
"punpckhdq %%mm5, %%mm2 \n\t"
501
"movq 0(%1), %%mm1 \n\t"
502
"movq 8(%1), %%mm3 \n\t"
503
"movq 16(%1), %%mm4 \n\t"
504
"movq 24(%1), %%mm5 \n\t"
505
"movq 32(%1), %%mm6 \n\t"
506
"movq 40(%1), %%mm7 \n\t"
507
"pmaddwd %%mm0, %%mm1 \n\t"
508
"pmaddwd %%mm2, %%mm3 \n\t"
509
"pmaddwd %%mm0, %%mm4 \n\t"
510
"pmaddwd %%mm2, %%mm5 \n\t"
511
"pmaddwd %%mm0, %%mm6 \n\t"
512
"pmaddwd %%mm2, %%mm7 \n\t"
513
"pmaddwd 48(%1), %%mm0 \n\t"
514
"pmaddwd 56(%1), %%mm2 \n\t"
515
"paddd %%mm1, %%mm3 \n\t"
516
"paddd %%mm4, %%mm5 \n\t"
517
"paddd %%mm6, %%mm7 \n\t"
518
"paddd %%mm0, %%mm2 \n\t"
519
"movq (%2), %%mm0 \n\t"
520
"paddd %%mm0, %%mm3 \n\t"
521
"paddd %%mm0, %%mm5 \n\t"
522
"paddd %%mm0, %%mm7 \n\t"
523
"paddd %%mm0, %%mm2 \n\t"
524
"psrad $"S(SHIFT_FRW_ROW)", %%mm3 \n\t"
525
"psrad $"S(SHIFT_FRW_ROW)", %%mm5 \n\t"
526
"psrad $"S(SHIFT_FRW_ROW)", %%mm7 \n\t"
527
"psrad $"S(SHIFT_FRW_ROW)", %%mm2 \n\t"
528
"packssdw %%mm5, %%mm3 \n\t"
529
"packssdw %%mm2, %%mm7 \n\t"
530
"movq %%mm3, 0(%3) \n\t"
531
"movq %%mm7, 8(%3) \n\t"
533
: "r" (in), "r" (table), "r" (fdct_r_row), "r" (out));
525
536
void ff_fdct_mmx(int16_t *block)