71
99
// first argument is unmodified and second is trashed
72
100
// regfe is supposed to contain 0xfefefefefefefefe
73
101
// Byte-wise average of rega and regb, truncating (no rounding up):
//   regr = (rega & regb) + (((rega ^ regb) & regfe) >> 1)
// regfe is expected to hold 0xfe in every byte; masking with it clears the
// low bit of each byte before the 64-bit psrlq, so nothing shifts across a
// byte boundary.
// rega is only read, regb is overwritten, regr receives the result.
#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
74
"movq " #rega ", " #regr " \n\t"\
75
"pand " #regb ", " #regr " \n\t"\
76
"pxor " #rega ", " #regb " \n\t"\
77
"pand " #regfe "," #regb " \n\t"\
78
"psrlq $1, " #regb " \n\t"\
79
"paddb " #regb ", " #regr " \n\t"
102
"movq " #rega ", " #regr " \n\t"\
103
"pand " #regb ", " #regr " \n\t"\
104
"pxor " #rega ", " #regb " \n\t"\
105
"pand " #regfe "," #regb " \n\t"\
106
"psrlq $1, " #regb " \n\t"\
107
"paddb " #regb ", " #regr " \n\t"
81
109
// Byte-wise average of rega and regb, rounding up:
//   regr = (rega | regb) - (((rega ^ regb) & regfe) >> 1)
// Same register contract as PAVGB_MMX_NO_RND: rega is only read, regb is
// overwritten, regr receives the result, and regfe is used as the per-byte
// mask applied before the 64-bit shift.
#define PAVGB_MMX(rega, regb, regr, regfe) \
82
"movq " #rega ", " #regr " \n\t"\
83
"por " #regb ", " #regr " \n\t"\
84
"pxor " #rega ", " #regb " \n\t"\
85
"pand " #regfe "," #regb " \n\t"\
86
"psrlq $1, " #regb " \n\t"\
87
"psubb " #regb ", " #regr " \n\t"
110
"movq " #rega ", " #regr " \n\t"\
111
"por " #regb ", " #regr " \n\t"\
112
"pxor " #rega ", " #regb " \n\t"\
113
"pand " #regfe "," #regb " \n\t"\
114
"psrlq $1, " #regb " \n\t"\
115
"psubb " #regb ", " #regr " \n\t"
89
117
// mm6 is supposed to contain 0xfefefefefefefefe
90
118
// Paired form of the truncating byte average: computes two independent
// averages with their instructions interleaved.
//   regr = (rega & regb) + (((rega ^ regb) & mm6) >> 1)
//   regp = (regc & regd) + (((regc ^ regd) & mm6) >> 1)
// The mask is always taken from %%mm6 rather than from a macro argument.
// rega and regc are only read; regb and regd are overwritten.
#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
91
"movq " #rega ", " #regr " \n\t"\
92
"movq " #regc ", " #regp " \n\t"\
93
"pand " #regb ", " #regr " \n\t"\
94
"pand " #regd ", " #regp " \n\t"\
95
"pxor " #rega ", " #regb " \n\t"\
96
"pxor " #regc ", " #regd " \n\t"\
97
"pand %%mm6, " #regb " \n\t"\
98
"pand %%mm6, " #regd " \n\t"\
99
"psrlq $1, " #regb " \n\t"\
100
"psrlq $1, " #regd " \n\t"\
101
"paddb " #regb ", " #regr " \n\t"\
102
"paddb " #regd ", " #regp " \n\t"
119
"movq " #rega ", " #regr " \n\t"\
120
"movq " #regc ", " #regp " \n\t"\
121
"pand " #regb ", " #regr " \n\t"\
122
"pand " #regd ", " #regp " \n\t"\
123
"pxor " #rega ", " #regb " \n\t"\
124
"pxor " #regc ", " #regd " \n\t"\
125
"pand %%mm6, " #regb " \n\t"\
126
"pand %%mm6, " #regd " \n\t"\
127
"psrlq $1, " #regb " \n\t"\
128
"psrlq $1, " #regd " \n\t"\
129
"paddb " #regb ", " #regr " \n\t"\
130
"paddb " #regd ", " #regp " \n\t"
104
132
// Paired form of the round-up byte average: computes two independent
// averages with their instructions interleaved.
//   regr = (rega | regb) - (((rega ^ regb) & mm6) >> 1)
//   regp = (regc | regd) - (((regc ^ regd) & mm6) >> 1)
// The mask is always taken from %%mm6 rather than from a macro argument.
// rega and regc are only read; regb and regd are overwritten.
#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
105
"movq " #rega ", " #regr " \n\t"\
106
"movq " #regc ", " #regp " \n\t"\
107
"por " #regb ", " #regr " \n\t"\
108
"por " #regd ", " #regp " \n\t"\
109
"pxor " #rega ", " #regb " \n\t"\
110
"pxor " #regc ", " #regd " \n\t"\
111
"pand %%mm6, " #regb " \n\t"\
112
"pand %%mm6, " #regd " \n\t"\
113
"psrlq $1, " #regd " \n\t"\
114
"psrlq $1, " #regb " \n\t"\
115
"psubb " #regb ", " #regr " \n\t"\
116
"psubb " #regd ", " #regp " \n\t"
133
"movq " #rega ", " #regr " \n\t"\
134
"movq " #regc ", " #regp " \n\t"\
135
"por " #regb ", " #regr " \n\t"\
136
"por " #regd ", " #regp " \n\t"\
137
"pxor " #rega ", " #regb " \n\t"\
138
"pxor " #regc ", " #regd " \n\t"\
139
"pand %%mm6, " #regb " \n\t"\
140
"pand %%mm6, " #regd " \n\t"\
141
"psrlq $1, " #regd " \n\t"\
142
"psrlq $1, " #regb " \n\t"\
143
"psubb " #regb ", " #regr " \n\t"\
144
"psubb " #regd ", " #regp " \n\t"
118
146
/***********************************/
119
147
/* MMX no rounding */
120
148
#define DEF(x, y) x ## _no_rnd_ ## y ##_mmx
121
149
#define SET_RND MOVQ_WONE
122
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
123
#define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)
150
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
151
#define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)
125
153
#include "dsputil_mmx_rnd.h"
299
"movq (%2), %%mm0\n\t"
300
"movq 8(%2), %%mm1\n\t"
301
"movq 16(%2), %%mm2\n\t"
302
"movq 24(%2), %%mm3\n\t"
305
"movq %%mm4, %%mm5\n\t"
306
"punpcklbw %%mm7, %%mm4\n\t"
307
"punpckhbw %%mm7, %%mm5\n\t"
308
"paddsw %%mm4, %%mm0\n\t"
309
"paddsw %%mm5, %%mm1\n\t"
310
"movq %%mm6, %%mm5\n\t"
311
"punpcklbw %%mm7, %%mm6\n\t"
312
"punpckhbw %%mm7, %%mm5\n\t"
313
"paddsw %%mm6, %%mm2\n\t"
314
"paddsw %%mm5, %%mm3\n\t"
315
"packuswb %%mm1, %%mm0\n\t"
316
"packuswb %%mm3, %%mm2\n\t"
319
:"+m"(*pix), "+m"(*(pix+line_size))
352
"movq (%2), %%mm0 \n\t"
353
"movq 8(%2), %%mm1 \n\t"
354
"movq 16(%2), %%mm2 \n\t"
355
"movq 24(%2), %%mm3 \n\t"
356
"movq %0, %%mm4 \n\t"
357
"movq %1, %%mm6 \n\t"
358
"movq %%mm4, %%mm5 \n\t"
359
"punpcklbw %%mm7, %%mm4 \n\t"
360
"punpckhbw %%mm7, %%mm5 \n\t"
361
"paddsw %%mm4, %%mm0 \n\t"
362
"paddsw %%mm5, %%mm1 \n\t"
363
"movq %%mm6, %%mm5 \n\t"
364
"punpcklbw %%mm7, %%mm6 \n\t"
365
"punpckhbw %%mm7, %%mm5 \n\t"
366
"paddsw %%mm6, %%mm2 \n\t"
367
"paddsw %%mm5, %%mm3 \n\t"
368
"packuswb %%mm1, %%mm0 \n\t"
369
"packuswb %%mm3, %%mm2 \n\t"
370
"movq %%mm0, %0 \n\t"
371
"movq %%mm2, %1 \n\t"
372
:"+m"(*pix), "+m"(*(pix+line_size))
322
375
pix += line_size*2;
380
static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
383
"lea (%3, %3), %%"REG_a" \n\t"
386
"movd (%1), %%mm0 \n\t"
387
"movd (%1, %3), %%mm1 \n\t"
388
"movd %%mm0, (%2) \n\t"
389
"movd %%mm1, (%2, %3) \n\t"
390
"add %%"REG_a", %1 \n\t"
391
"add %%"REG_a", %2 \n\t"
392
"movd (%1), %%mm0 \n\t"
393
"movd (%1, %3), %%mm1 \n\t"
394
"movd %%mm0, (%2) \n\t"
395
"movd %%mm1, (%2, %3) \n\t"
396
"add %%"REG_a", %1 \n\t"
397
"add %%"REG_a", %2 \n\t"
400
: "+g"(h), "+r" (pixels), "+r" (block)
401
: "r"((long)line_size)
327
406
static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
329
408
__asm __volatile(
330
"lea (%3, %3), %%eax \n\t"
333
"movq (%1), %%mm0 \n\t"
334
"movq (%1, %3), %%mm1 \n\t"
335
"movq %%mm0, (%2) \n\t"
336
"movq %%mm1, (%2, %3) \n\t"
337
"addl %%eax, %1 \n\t"
338
"addl %%eax, %2 \n\t"
339
"movq (%1), %%mm0 \n\t"
340
"movq (%1, %3), %%mm1 \n\t"
341
"movq %%mm0, (%2) \n\t"
342
"movq %%mm1, (%2, %3) \n\t"
343
"addl %%eax, %1 \n\t"
344
"addl %%eax, %2 \n\t"
347
: "+g"(h), "+r" (pixels), "+r" (block)
409
"lea (%3, %3), %%"REG_a" \n\t"
412
"movq (%1), %%mm0 \n\t"
413
"movq (%1, %3), %%mm1 \n\t"
414
"movq %%mm0, (%2) \n\t"
415
"movq %%mm1, (%2, %3) \n\t"
416
"add %%"REG_a", %1 \n\t"
417
"add %%"REG_a", %2 \n\t"
418
"movq (%1), %%mm0 \n\t"
419
"movq (%1, %3), %%mm1 \n\t"
420
"movq %%mm0, (%2) \n\t"
421
"movq %%mm1, (%2, %3) \n\t"
422
"add %%"REG_a", %1 \n\t"
423
"add %%"REG_a", %2 \n\t"
426
: "+g"(h), "+r" (pixels), "+r" (block)
427
: "r"((long)line_size)
353
432
static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
355
434
__asm __volatile(
356
"lea (%3, %3), %%eax \n\t"
359
"movq (%1), %%mm0 \n\t"
360
"movq 8(%1), %%mm4 \n\t"
361
"movq (%1, %3), %%mm1 \n\t"
362
"movq 8(%1, %3), %%mm5 \n\t"
363
"movq %%mm0, (%2) \n\t"
364
"movq %%mm4, 8(%2) \n\t"
365
"movq %%mm1, (%2, %3) \n\t"
366
"movq %%mm5, 8(%2, %3) \n\t"
367
"addl %%eax, %1 \n\t"
368
"addl %%eax, %2 \n\t"
369
"movq (%1), %%mm0 \n\t"
370
"movq 8(%1), %%mm4 \n\t"
371
"movq (%1, %3), %%mm1 \n\t"
372
"movq 8(%1, %3), %%mm5 \n\t"
373
"movq %%mm0, (%2) \n\t"
374
"movq %%mm4, 8(%2) \n\t"
375
"movq %%mm1, (%2, %3) \n\t"
376
"movq %%mm5, 8(%2, %3) \n\t"
377
"addl %%eax, %1 \n\t"
378
"addl %%eax, %2 \n\t"
381
: "+g"(h), "+r" (pixels), "+r" (block)
435
"lea (%3, %3), %%"REG_a" \n\t"
438
"movq (%1), %%mm0 \n\t"
439
"movq 8(%1), %%mm4 \n\t"
440
"movq (%1, %3), %%mm1 \n\t"
441
"movq 8(%1, %3), %%mm5 \n\t"
442
"movq %%mm0, (%2) \n\t"
443
"movq %%mm4, 8(%2) \n\t"
444
"movq %%mm1, (%2, %3) \n\t"
445
"movq %%mm5, 8(%2, %3) \n\t"
446
"add %%"REG_a", %1 \n\t"
447
"add %%"REG_a", %2 \n\t"
448
"movq (%1), %%mm0 \n\t"
449
"movq 8(%1), %%mm4 \n\t"
450
"movq (%1, %3), %%mm1 \n\t"
451
"movq 8(%1, %3), %%mm5 \n\t"
452
"movq %%mm0, (%2) \n\t"
453
"movq %%mm4, 8(%2) \n\t"
454
"movq %%mm1, (%2, %3) \n\t"
455
"movq %%mm5, 8(%2, %3) \n\t"
456
"add %%"REG_a", %1 \n\t"
457
"add %%"REG_a", %2 \n\t"
460
: "+g"(h), "+r" (pixels), "+r" (block)
461
: "r"((long)line_size)
387
466
static void clear_blocks_mmx(DCTELEM *blocks)
389
468
__asm __volatile(
390
"pxor %%mm7, %%mm7 \n\t"
391
"movl $-128*6, %%eax \n\t"
393
"movq %%mm7, (%0, %%eax) \n\t"
394
"movq %%mm7, 8(%0, %%eax) \n\t"
395
"movq %%mm7, 16(%0, %%eax) \n\t"
396
"movq %%mm7, 24(%0, %%eax) \n\t"
397
"addl $32, %%eax \n\t"
399
: : "r" (((int)blocks)+128*6)
469
"pxor %%mm7, %%mm7 \n\t"
470
"mov $-128*6, %%"REG_a" \n\t"
472
"movq %%mm7, (%0, %%"REG_a") \n\t"
473
"movq %%mm7, 8(%0, %%"REG_a") \n\t"
474
"movq %%mm7, 16(%0, %%"REG_a") \n\t"
475
"movq %%mm7, 24(%0, %%"REG_a") \n\t"
476
"add $32, %%"REG_a" \n\t"
478
: : "r" (((uint8_t *)blocks)+128*6)
483
#ifdef CONFIG_ENCODERS
404
484
static int pix_sum16_mmx(uint8_t * pix, int line_size){
407
int index= -line_size*h;
487
long index= -line_size*h;
409
489
__asm __volatile(
410
"pxor %%mm7, %%mm7 \n\t"
411
"pxor %%mm6, %%mm6 \n\t"
413
"movq (%2, %1), %%mm0 \n\t"
414
"movq (%2, %1), %%mm1 \n\t"
415
"movq 8(%2, %1), %%mm2 \n\t"
416
"movq 8(%2, %1), %%mm3 \n\t"
417
"punpcklbw %%mm7, %%mm0 \n\t"
418
"punpckhbw %%mm7, %%mm1 \n\t"
419
"punpcklbw %%mm7, %%mm2 \n\t"
420
"punpckhbw %%mm7, %%mm3 \n\t"
421
"paddw %%mm0, %%mm1 \n\t"
422
"paddw %%mm2, %%mm3 \n\t"
423
"paddw %%mm1, %%mm3 \n\t"
424
"paddw %%mm3, %%mm6 \n\t"
427
"movq %%mm6, %%mm5 \n\t"
428
"psrlq $32, %%mm6 \n\t"
429
"paddw %%mm5, %%mm6 \n\t"
430
"movq %%mm6, %%mm5 \n\t"
431
"psrlq $16, %%mm6 \n\t"
432
"paddw %%mm5, %%mm6 \n\t"
433
"movd %%mm6, %0 \n\t"
434
"andl $0xFFFF, %0 \n\t"
490
"pxor %%mm7, %%mm7 \n\t"
491
"pxor %%mm6, %%mm6 \n\t"
493
"movq (%2, %1), %%mm0 \n\t"
494
"movq (%2, %1), %%mm1 \n\t"
495
"movq 8(%2, %1), %%mm2 \n\t"
496
"movq 8(%2, %1), %%mm3 \n\t"
497
"punpcklbw %%mm7, %%mm0 \n\t"
498
"punpckhbw %%mm7, %%mm1 \n\t"
499
"punpcklbw %%mm7, %%mm2 \n\t"
500
"punpckhbw %%mm7, %%mm3 \n\t"
501
"paddw %%mm0, %%mm1 \n\t"
502
"paddw %%mm2, %%mm3 \n\t"
503
"paddw %%mm1, %%mm3 \n\t"
504
"paddw %%mm3, %%mm6 \n\t"
507
"movq %%mm6, %%mm5 \n\t"
508
"psrlq $32, %%mm6 \n\t"
509
"paddw %%mm5, %%mm6 \n\t"
510
"movq %%mm6, %%mm5 \n\t"
511
"psrlq $16, %%mm6 \n\t"
512
"paddw %%mm5, %%mm6 \n\t"
513
"movd %%mm6, %0 \n\t"
514
"andl $0xFFFF, %0 \n\t"
435
515
: "=&r" (sum), "+r" (index)
436
: "r" (pix - index), "r" (line_size)
516
: "r" (pix - index), "r" ((long)line_size)
521
#endif //CONFIG_ENCODERS
442
523
static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
446
"movq (%1, %0), %%mm0 \n\t"
447
"movq (%2, %0), %%mm1 \n\t"
448
"paddb %%mm0, %%mm1 \n\t"
449
"movq %%mm1, (%2, %0) \n\t"
450
"movq 8(%1, %0), %%mm0 \n\t"
451
"movq 8(%2, %0), %%mm1 \n\t"
452
"paddb %%mm0, %%mm1 \n\t"
453
"movq %%mm1, 8(%2, %0) \n\t"
527
"movq (%1, %0), %%mm0 \n\t"
528
"movq (%2, %0), %%mm1 \n\t"
529
"paddb %%mm0, %%mm1 \n\t"
530
"movq %%mm1, (%2, %0) \n\t"
531
"movq 8(%1, %0), %%mm0 \n\t"
532
"movq 8(%2, %0), %%mm1 \n\t"
533
"paddb %%mm0, %%mm1 \n\t"
534
"movq %%mm1, 8(%2, %0) \n\t"
458
: "r"(src), "r"(dst), "r"(w-15)
539
: "r"(src), "r"(dst), "r"((long)w-15)
461
542
dst[i+0] += src[i+0];
545
// Instruction sequence for the loop-filter asm blocks (used by
// h263_h_loop_filter_mmx; presumably also by h263_v_loop_filter_mmx -- confirm).
// Operands read:
//   %0..%3  four 8-byte values, each loaded with movq
//   %4      value loaded with movd; its low byte is replicated into all
//           eight bytes of mm2 by the three punpcklbw instructions
//   %5      mask applied with pand just before the psrlw $2
// The macro stores nothing. On exit the results are left in registers:
//   mm5 = %0 with the mm1 correction subtracted (psubb)
//   mm3 = adjusted copy of %1
//   mm4 = adjusted copy of %2
//   mm6 = %3 with the mm1 correction added (paddb)
// All of mm0..mm7 are overwritten along the way.
#define H263_LOOP_FILTER \
546
"pxor %%mm7, %%mm7 \n\t"\
547
"movq %0, %%mm0 \n\t"\
548
"movq %0, %%mm1 \n\t"\
549
"movq %3, %%mm2 \n\t"\
550
"movq %3, %%mm3 \n\t"\
551
"punpcklbw %%mm7, %%mm0 \n\t"\
552
"punpckhbw %%mm7, %%mm1 \n\t"\
553
"punpcklbw %%mm7, %%mm2 \n\t"\
554
"punpckhbw %%mm7, %%mm3 \n\t"\
555
"psubw %%mm2, %%mm0 \n\t"\
556
"psubw %%mm3, %%mm1 \n\t"\
557
"movq %1, %%mm2 \n\t"\
558
"movq %1, %%mm3 \n\t"\
559
"movq %2, %%mm4 \n\t"\
560
"movq %2, %%mm5 \n\t"\
561
"punpcklbw %%mm7, %%mm2 \n\t"\
562
"punpckhbw %%mm7, %%mm3 \n\t"\
563
"punpcklbw %%mm7, %%mm4 \n\t"\
564
"punpckhbw %%mm7, %%mm5 \n\t"\
565
"psubw %%mm2, %%mm4 \n\t"\
566
"psubw %%mm3, %%mm5 \n\t"\
567
"psllw $2, %%mm4 \n\t"\
568
"psllw $2, %%mm5 \n\t"\
569
"paddw %%mm0, %%mm4 \n\t"\
570
"paddw %%mm1, %%mm5 \n\t"\
571
"pxor %%mm6, %%mm6 \n\t"\
572
"pcmpgtw %%mm4, %%mm6 \n\t"\
573
"pcmpgtw %%mm5, %%mm7 \n\t"\
574
"pxor %%mm6, %%mm4 \n\t"\
575
"pxor %%mm7, %%mm5 \n\t"\
576
"psubw %%mm6, %%mm4 \n\t"\
577
"psubw %%mm7, %%mm5 \n\t"\
578
"psrlw $3, %%mm4 \n\t"\
579
"psrlw $3, %%mm5 \n\t"\
580
"packuswb %%mm5, %%mm4 \n\t"\
581
"packsswb %%mm7, %%mm6 \n\t"\
582
"pxor %%mm7, %%mm7 \n\t"\
583
"movd %4, %%mm2 \n\t"\
584
"punpcklbw %%mm2, %%mm2 \n\t"\
585
"punpcklbw %%mm2, %%mm2 \n\t"\
586
"punpcklbw %%mm2, %%mm2 \n\t"\
587
"psubusb %%mm4, %%mm2 \n\t"\
588
"movq %%mm2, %%mm3 \n\t"\
589
"psubusb %%mm4, %%mm3 \n\t"\
590
"psubb %%mm3, %%mm2 \n\t"\
591
"movq %1, %%mm3 \n\t"\
592
"movq %2, %%mm4 \n\t"\
593
"pxor %%mm6, %%mm3 \n\t"\
594
"pxor %%mm6, %%mm4 \n\t"\
595
"paddusb %%mm2, %%mm3 \n\t"\
596
"psubusb %%mm2, %%mm4 \n\t"\
597
"pxor %%mm6, %%mm3 \n\t"\
598
"pxor %%mm6, %%mm4 \n\t"\
599
"paddusb %%mm2, %%mm2 \n\t"\
600
"packsswb %%mm1, %%mm0 \n\t"\
601
"pcmpgtb %%mm0, %%mm7 \n\t"\
602
"pxor %%mm7, %%mm0 \n\t"\
603
"psubb %%mm7, %%mm0 \n\t"\
604
"movq %%mm0, %%mm1 \n\t"\
605
"psubusb %%mm2, %%mm0 \n\t"\
606
"psubb %%mm0, %%mm1 \n\t"\
607
"pand %5, %%mm1 \n\t"\
608
"psrlw $2, %%mm1 \n\t"\
609
"pxor %%mm7, %%mm1 \n\t"\
610
"psubb %%mm7, %%mm1 \n\t"\
611
"movq %0, %%mm5 \n\t"\
612
"movq %3, %%mm6 \n\t"\
613
"psubb %%mm1, %%mm5 \n\t"\
614
"paddb %%mm1, %%mm6 \n\t"
616
static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){
617
const int strength= ff_h263_loop_filter_strength[qscale];
623
"movq %%mm3, %1 \n\t"
624
"movq %%mm4, %2 \n\t"
625
"movq %%mm5, %0 \n\t"
626
"movq %%mm6, %3 \n\t"
627
: "+m" (*(uint64_t*)(src - 2*stride)),
628
"+m" (*(uint64_t*)(src - 1*stride)),
629
"+m" (*(uint64_t*)(src + 0*stride)),
630
"+m" (*(uint64_t*)(src + 1*stride))
631
: "g" (2*strength), "m"(ff_pb_FC)
635
static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){
636
asm volatile( //FIXME could save 1 instruction if done as 8x4 ...
637
"movd %4, %%mm0 \n\t"
638
"movd %5, %%mm1 \n\t"
639
"movd %6, %%mm2 \n\t"
640
"movd %7, %%mm3 \n\t"
641
"punpcklbw %%mm1, %%mm0 \n\t"
642
"punpcklbw %%mm3, %%mm2 \n\t"
643
"movq %%mm0, %%mm1 \n\t"
644
"punpcklwd %%mm2, %%mm0 \n\t"
645
"punpckhwd %%mm2, %%mm1 \n\t"
646
"movd %%mm0, %0 \n\t"
647
"punpckhdq %%mm0, %%mm0 \n\t"
648
"movd %%mm0, %1 \n\t"
649
"movd %%mm1, %2 \n\t"
650
"punpckhdq %%mm1, %%mm1 \n\t"
651
"movd %%mm1, %3 \n\t"
653
: "=m" (*(uint32_t*)(dst + 0*dst_stride)),
654
"=m" (*(uint32_t*)(dst + 1*dst_stride)),
655
"=m" (*(uint32_t*)(dst + 2*dst_stride)),
656
"=m" (*(uint32_t*)(dst + 3*dst_stride))
657
: "m" (*(uint32_t*)(src + 0*src_stride)),
658
"m" (*(uint32_t*)(src + 1*src_stride)),
659
"m" (*(uint32_t*)(src + 2*src_stride)),
660
"m" (*(uint32_t*)(src + 3*src_stride))
664
static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){
665
const int strength= ff_h263_loop_filter_strength[qscale];
666
uint64_t temp[4] __attribute__ ((aligned(8)));
667
uint8_t *btemp= (uint8_t*)temp;
671
transpose4x4(btemp , src , 8, stride);
672
transpose4x4(btemp+4, src + 4*stride, 8, stride);
674
H263_LOOP_FILTER // 5 3 4 6
680
: "g" (2*strength), "m"(ff_pb_FC)
684
"movq %%mm5, %%mm1 \n\t"
685
"movq %%mm4, %%mm0 \n\t"
686
"punpcklbw %%mm3, %%mm5 \n\t"
687
"punpcklbw %%mm6, %%mm4 \n\t"
688
"punpckhbw %%mm3, %%mm1 \n\t"
689
"punpckhbw %%mm6, %%mm0 \n\t"
690
"movq %%mm5, %%mm3 \n\t"
691
"movq %%mm1, %%mm6 \n\t"
692
"punpcklwd %%mm4, %%mm5 \n\t"
693
"punpcklwd %%mm0, %%mm1 \n\t"
694
"punpckhwd %%mm4, %%mm3 \n\t"
695
"punpckhwd %%mm0, %%mm6 \n\t"
696
"movd %%mm5, (%0) \n\t"
697
"punpckhdq %%mm5, %%mm5 \n\t"
698
"movd %%mm5, (%0,%2) \n\t"
699
"movd %%mm3, (%0,%2,2) \n\t"
700
"punpckhdq %%mm3, %%mm3 \n\t"
701
"movd %%mm3, (%0,%3) \n\t"
702
"movd %%mm1, (%1) \n\t"
703
"punpckhdq %%mm1, %%mm1 \n\t"
704
"movd %%mm1, (%1,%2) \n\t"
705
"movd %%mm6, (%1,%2,2) \n\t"
706
"punpckhdq %%mm6, %%mm6 \n\t"
707
"movd %%mm6, (%1,%3) \n\t"
709
"r" (src + 4*stride),
710
"r" ((long) stride ),
711
"r" ((long)(3*stride))
715
#ifdef CONFIG_ENCODERS
464
716
static int pix_norm1_mmx(uint8_t *pix, int line_size) {
468
720
"pxor %%mm0,%%mm0\n"
469
721
"pxor %%mm7,%%mm7\n"
471
"movq (%0),%%mm2\n" /* mm2 = pix[0-7] */
472
"movq 8(%0),%%mm3\n" /* mm3 = pix[8-15] */
474
"movq %%mm2,%%mm1\n" /* mm1 = mm2 = pix[0-7] */
476
"punpckhbw %%mm0,%%mm1\n" /* mm1 = [pix4-7] */
477
"punpcklbw %%mm0,%%mm2\n" /* mm2 = [pix0-3] */
479
"movq %%mm3,%%mm4\n" /* mm4 = mm3 = pix[8-15] */
480
"punpckhbw %%mm0,%%mm3\n" /* mm3 = [pix12-15] */
481
"punpcklbw %%mm0,%%mm4\n" /* mm4 = [pix8-11] */
483
"pmaddwd %%mm1,%%mm1\n" /* mm1 = (pix0^2+pix1^2,pix2^2+pix3^2) */
484
"pmaddwd %%mm2,%%mm2\n" /* mm2 = (pix4^2+pix5^2,pix6^2+pix7^2) */
723
"movq (%0),%%mm2\n" /* mm2 = pix[0-7] */
724
"movq 8(%0),%%mm3\n" /* mm3 = pix[8-15] */
726
"movq %%mm2,%%mm1\n" /* mm1 = mm2 = pix[0-7] */
728
"punpckhbw %%mm0,%%mm1\n" /* mm1 = [pix4-7] */
729
"punpcklbw %%mm0,%%mm2\n" /* mm2 = [pix0-3] */
731
"movq %%mm3,%%mm4\n" /* mm4 = mm3 = pix[8-15] */
732
"punpckhbw %%mm0,%%mm3\n" /* mm3 = [pix12-15] */
733
"punpcklbw %%mm0,%%mm4\n" /* mm4 = [pix8-11] */
735
"pmaddwd %%mm1,%%mm1\n" /* mm1 = (pix0^2+pix1^2,pix2^2+pix3^2) */
736
"pmaddwd %%mm2,%%mm2\n" /* mm2 = (pix4^2+pix5^2,pix6^2+pix7^2) */
486
738
"pmaddwd %%mm3,%%mm3\n"
487
739
"pmaddwd %%mm4,%%mm4\n"
489
"paddd %%mm1,%%mm2\n" /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2,
490
pix2^2+pix3^2+pix6^2+pix7^2) */
741
"paddd %%mm1,%%mm2\n" /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2,
742
pix2^2+pix3^2+pix6^2+pix7^2) */
491
743
"paddd %%mm3,%%mm4\n"
492
744
"paddd %%mm2,%%mm7\n"
495
747
"paddd %%mm4,%%mm7\n"
499
751
"movq %%mm7,%%mm1\n"
500
"psrlq $32, %%mm7\n" /* shift hi dword to lo */
752
"psrlq $32, %%mm7\n" /* shift hi dword to lo */
501
753
"paddd %%mm7,%%mm1\n"
502
754
"movd %%mm1,%1\n"
503
: "+r" (pix), "=r"(tmp) : "r" (line_size) : "%ecx" );
507
static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size) {
511
"pxor %%mm0,%%mm0\n" /* mm0 = 0 */
512
"pxor %%mm7,%%mm7\n" /* mm7 holds the sum */
514
"movq (%0),%%mm1\n" /* mm1 = pix1[0-7] */
515
"movq (%1),%%mm2\n" /* mm2 = pix2[0-7] */
516
"movq 8(%0),%%mm3\n" /* mm3 = pix1[8-15] */
517
"movq 8(%1),%%mm4\n" /* mm4 = pix2[8-15] */
519
/* todo: mm1-mm2, mm3-mm4 */
520
/* algo: subtract mm1 from mm2 with saturation and vice versa */
521
/* OR the results to get absolute difference */
524
"psubusb %%mm2,%%mm1\n"
525
"psubusb %%mm4,%%mm3\n"
526
"psubusb %%mm5,%%mm2\n"
527
"psubusb %%mm6,%%mm4\n"
532
/* now convert to 16-bit vectors so we can square them */
536
"punpckhbw %%mm0,%%mm2\n"
537
"punpckhbw %%mm0,%%mm4\n"
538
"punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
539
"punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */
541
"pmaddwd %%mm2,%%mm2\n"
542
"pmaddwd %%mm4,%%mm4\n"
543
"pmaddwd %%mm1,%%mm1\n"
544
"pmaddwd %%mm3,%%mm3\n"
549
"paddd %%mm2,%%mm1\n"
550
"paddd %%mm4,%%mm3\n"
551
"paddd %%mm1,%%mm7\n"
552
"paddd %%mm3,%%mm7\n"
558
"psrlq $32, %%mm7\n" /* shift hi dword to lo */
559
"paddd %%mm7,%%mm1\n"
561
: "+r" (pix1), "+r" (pix2), "=r"(tmp) : "r" (line_size) : "ecx");
755
: "+r" (pix), "=r"(tmp) : "r" ((long)line_size) : "%ecx" );
759
static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
764
"pxor %%mm0,%%mm0\n" /* mm0 = 0 */
765
"pxor %%mm7,%%mm7\n" /* mm7 holds the sum */
767
"movq (%0),%%mm1\n" /* mm1 = pix1[0][0-7] */
768
"movq (%1),%%mm2\n" /* mm2 = pix2[0][0-7] */
769
"movq (%0,%3),%%mm3\n" /* mm3 = pix1[1][0-7] */
770
"movq (%1,%3),%%mm4\n" /* mm4 = pix2[1][0-7] */
772
/* todo: mm1-mm2, mm3-mm4 */
773
/* algo: subtract mm1 from mm2 with saturation and vice versa */
774
/* OR the results to get absolute difference */
777
"psubusb %%mm2,%%mm1\n"
778
"psubusb %%mm4,%%mm3\n"
779
"psubusb %%mm5,%%mm2\n"
780
"psubusb %%mm6,%%mm4\n"
785
/* now convert to 16-bit vectors so we can square them */
789
"punpckhbw %%mm0,%%mm2\n"
790
"punpckhbw %%mm0,%%mm4\n"
791
"punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
792
"punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */
794
"pmaddwd %%mm2,%%mm2\n"
795
"pmaddwd %%mm4,%%mm4\n"
796
"pmaddwd %%mm1,%%mm1\n"
797
"pmaddwd %%mm3,%%mm3\n"
799
"lea (%0,%3,2), %0\n" /* pix1 += 2*line_size */
800
"lea (%1,%3,2), %1\n" /* pix2 += 2*line_size */
802
"paddd %%mm2,%%mm1\n"
803
"paddd %%mm4,%%mm3\n"
804
"paddd %%mm1,%%mm7\n"
805
"paddd %%mm3,%%mm7\n"
811
"psrlq $32, %%mm7\n" /* shift hi dword to lo */
812
"paddd %%mm7,%%mm1\n"
814
: "+r" (pix1), "+r" (pix2), "=r"(tmp)
815
: "r" ((long)line_size) , "m" (h)
820
static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
824
"pxor %%mm0,%%mm0\n" /* mm0 = 0 */
825
"pxor %%mm7,%%mm7\n" /* mm7 holds the sum */
827
"movq (%0),%%mm1\n" /* mm1 = pix1[0-7] */
828
"movq (%1),%%mm2\n" /* mm2 = pix2[0-7] */
829
"movq 8(%0),%%mm3\n" /* mm3 = pix1[8-15] */
830
"movq 8(%1),%%mm4\n" /* mm4 = pix2[8-15] */
832
/* todo: mm1-mm2, mm3-mm4 */
833
/* algo: subtract mm1 from mm2 with saturation and vice versa */
834
/* OR the results to get absolute difference */
837
"psubusb %%mm2,%%mm1\n"
838
"psubusb %%mm4,%%mm3\n"
839
"psubusb %%mm5,%%mm2\n"
840
"psubusb %%mm6,%%mm4\n"
845
/* now convert to 16-bit vectors so we can square them */
849
"punpckhbw %%mm0,%%mm2\n"
850
"punpckhbw %%mm0,%%mm4\n"
851
"punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
852
"punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */
854
"pmaddwd %%mm2,%%mm2\n"
855
"pmaddwd %%mm4,%%mm4\n"
856
"pmaddwd %%mm1,%%mm1\n"
857
"pmaddwd %%mm3,%%mm3\n"
862
"paddd %%mm2,%%mm1\n"
863
"paddd %%mm4,%%mm3\n"
864
"paddd %%mm1,%%mm7\n"
865
"paddd %%mm3,%%mm7\n"
871
"psrlq $32, %%mm7\n" /* shift hi dword to lo */
872
"paddd %%mm7,%%mm1\n"
874
: "+r" (pix1), "+r" (pix2), "=r"(tmp)
875
: "r" ((long)line_size) , "m" (h)
880
static int sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
884
"pxor %%xmm0,%%xmm0\n" /* mm0 = 0 */
885
"pxor %%xmm7,%%xmm7\n" /* mm7 holds the sum */
887
"movdqu (%0),%%xmm1\n" /* mm1 = pix1[0][0-15] */
888
"movdqu (%1),%%xmm2\n" /* mm2 = pix2[0][0-15] */
889
"movdqu (%0,%4),%%xmm3\n" /* mm3 = pix1[1][0-15] */
890
"movdqu (%1,%4),%%xmm4\n" /* mm4 = pix2[1][0-15] */
892
/* todo: mm1-mm2, mm3-mm4 */
893
/* algo: subtract mm1 from mm2 with saturation and vice versa */
894
/* OR the results to get absolute difference */
895
"movdqa %%xmm1,%%xmm5\n"
896
"movdqa %%xmm3,%%xmm6\n"
897
"psubusb %%xmm2,%%xmm1\n"
898
"psubusb %%xmm4,%%xmm3\n"
899
"psubusb %%xmm5,%%xmm2\n"
900
"psubusb %%xmm6,%%xmm4\n"
902
"por %%xmm1,%%xmm2\n"
903
"por %%xmm3,%%xmm4\n"
905
/* now convert to 16-bit vectors so we can square them */
906
"movdqa %%xmm2,%%xmm1\n"
907
"movdqa %%xmm4,%%xmm3\n"
909
"punpckhbw %%xmm0,%%xmm2\n"
910
"punpckhbw %%xmm0,%%xmm4\n"
911
"punpcklbw %%xmm0,%%xmm1\n" /* mm1 now spread over (mm1,mm2) */
912
"punpcklbw %%xmm0,%%xmm3\n" /* mm4 now spread over (mm3,mm4) */
914
"pmaddwd %%xmm2,%%xmm2\n"
915
"pmaddwd %%xmm4,%%xmm4\n"
916
"pmaddwd %%xmm1,%%xmm1\n"
917
"pmaddwd %%xmm3,%%xmm3\n"
919
"lea (%0,%4,2), %0\n" /* pix1 += 2*line_size */
920
"lea (%1,%4,2), %1\n" /* pix2 += 2*line_size */
922
"paddd %%xmm2,%%xmm1\n"
923
"paddd %%xmm4,%%xmm3\n"
924
"paddd %%xmm1,%%xmm7\n"
925
"paddd %%xmm3,%%xmm7\n"
930
"movdqa %%xmm7,%%xmm1\n"
931
"psrldq $8, %%xmm7\n" /* shift hi qword to lo */
932
"paddd %%xmm1,%%xmm7\n"
933
"movdqa %%xmm7,%%xmm1\n"
934
"psrldq $4, %%xmm7\n" /* shift hi dword to lo */
935
"paddd %%xmm1,%%xmm7\n"
937
: "+r" (pix1), "+r" (pix2), "+r"(h), "=r"(tmp)
938
: "r" ((long)line_size));
942
static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) {
950
"movq %%mm0, %%mm1\n"
954
"movq %%mm0, %%mm2\n"
955
"movq %%mm1, %%mm3\n"
956
"punpcklbw %%mm7,%%mm0\n"
957
"punpcklbw %%mm7,%%mm1\n"
958
"punpckhbw %%mm7,%%mm2\n"
959
"punpckhbw %%mm7,%%mm3\n"
960
"psubw %%mm1, %%mm0\n"
961
"psubw %%mm3, %%mm2\n"
966
"movq %%mm4, %%mm1\n"
970
"movq %%mm4, %%mm5\n"
971
"movq %%mm1, %%mm3\n"
972
"punpcklbw %%mm7,%%mm4\n"
973
"punpcklbw %%mm7,%%mm1\n"
974
"punpckhbw %%mm7,%%mm5\n"
975
"punpckhbw %%mm7,%%mm3\n"
976
"psubw %%mm1, %%mm4\n"
977
"psubw %%mm3, %%mm5\n"
978
"psubw %%mm4, %%mm0\n"
979
"psubw %%mm5, %%mm2\n"
980
"pxor %%mm3, %%mm3\n"
981
"pxor %%mm1, %%mm1\n"
982
"pcmpgtw %%mm0, %%mm3\n\t"
983
"pcmpgtw %%mm2, %%mm1\n\t"
984
"pxor %%mm3, %%mm0\n"
985
"pxor %%mm1, %%mm2\n"
986
"psubw %%mm3, %%mm0\n"
987
"psubw %%mm1, %%mm2\n"
988
"paddw %%mm0, %%mm2\n"
989
"paddw %%mm2, %%mm6\n"
995
"movq %%mm0, %%mm1\n"
999
"movq %%mm0, %%mm2\n"
1000
"movq %%mm1, %%mm3\n"
1001
"punpcklbw %%mm7,%%mm0\n"
1002
"punpcklbw %%mm7,%%mm1\n"
1003
"punpckhbw %%mm7,%%mm2\n"
1004
"punpckhbw %%mm7,%%mm3\n"
1005
"psubw %%mm1, %%mm0\n"
1006
"psubw %%mm3, %%mm2\n"
1007
"psubw %%mm0, %%mm4\n"
1008
"psubw %%mm2, %%mm5\n"
1009
"pxor %%mm3, %%mm3\n"
1010
"pxor %%mm1, %%mm1\n"
1011
"pcmpgtw %%mm4, %%mm3\n\t"
1012
"pcmpgtw %%mm5, %%mm1\n\t"
1013
"pxor %%mm3, %%mm4\n"
1014
"pxor %%mm1, %%mm5\n"
1015
"psubw %%mm3, %%mm4\n"
1016
"psubw %%mm1, %%mm5\n"
1017
"paddw %%mm4, %%mm5\n"
1018
"paddw %%mm5, %%mm6\n"
1023
"movq %%mm4, %%mm1\n"
1027
"movq %%mm4, %%mm5\n"
1028
"movq %%mm1, %%mm3\n"
1029
"punpcklbw %%mm7,%%mm4\n"
1030
"punpcklbw %%mm7,%%mm1\n"
1031
"punpckhbw %%mm7,%%mm5\n"
1032
"punpckhbw %%mm7,%%mm3\n"
1033
"psubw %%mm1, %%mm4\n"
1034
"psubw %%mm3, %%mm5\n"
1035
"psubw %%mm4, %%mm0\n"
1036
"psubw %%mm5, %%mm2\n"
1037
"pxor %%mm3, %%mm3\n"
1038
"pxor %%mm1, %%mm1\n"
1039
"pcmpgtw %%mm0, %%mm3\n\t"
1040
"pcmpgtw %%mm2, %%mm1\n\t"
1041
"pxor %%mm3, %%mm0\n"
1042
"pxor %%mm1, %%mm2\n"
1043
"psubw %%mm3, %%mm0\n"
1044
"psubw %%mm1, %%mm2\n"
1045
"paddw %%mm0, %%mm2\n"
1046
"paddw %%mm2, %%mm6\n"
1052
"movq %%mm6, %%mm0\n"
1053
"punpcklwd %%mm7,%%mm0\n"
1054
"punpckhwd %%mm7,%%mm6\n"
1055
"paddd %%mm0, %%mm6\n"
1057
"movq %%mm6,%%mm0\n"
1058
"psrlq $32, %%mm6\n"
1059
"paddd %%mm6,%%mm0\n"
1061
: "+r" (pix1), "=r"(tmp)
1062
: "r" ((long)line_size) , "g" (h-2)
1067
static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) {
1069
uint8_t * pix= pix1;
1072
"pxor %%mm7,%%mm7\n"
1073
"pxor %%mm6,%%mm6\n"
1076
"movq 1(%0),%%mm1\n"
1077
"movq %%mm0, %%mm2\n"
1078
"movq %%mm1, %%mm3\n"
1079
"punpcklbw %%mm7,%%mm0\n"
1080
"punpcklbw %%mm7,%%mm1\n"
1081
"punpckhbw %%mm7,%%mm2\n"
1082
"punpckhbw %%mm7,%%mm3\n"
1083
"psubw %%mm1, %%mm0\n"
1084
"psubw %%mm3, %%mm2\n"
1089
"movq 1(%0),%%mm1\n"
1090
"movq %%mm4, %%mm5\n"
1091
"movq %%mm1, %%mm3\n"
1092
"punpcklbw %%mm7,%%mm4\n"
1093
"punpcklbw %%mm7,%%mm1\n"
1094
"punpckhbw %%mm7,%%mm5\n"
1095
"punpckhbw %%mm7,%%mm3\n"
1096
"psubw %%mm1, %%mm4\n"
1097
"psubw %%mm3, %%mm5\n"
1098
"psubw %%mm4, %%mm0\n"
1099
"psubw %%mm5, %%mm2\n"
1100
"pxor %%mm3, %%mm3\n"
1101
"pxor %%mm1, %%mm1\n"
1102
"pcmpgtw %%mm0, %%mm3\n\t"
1103
"pcmpgtw %%mm2, %%mm1\n\t"
1104
"pxor %%mm3, %%mm0\n"
1105
"pxor %%mm1, %%mm2\n"
1106
"psubw %%mm3, %%mm0\n"
1107
"psubw %%mm1, %%mm2\n"
1108
"paddw %%mm0, %%mm2\n"
1109
"paddw %%mm2, %%mm6\n"
1115
"movq 1(%0),%%mm1\n"
1116
"movq %%mm0, %%mm2\n"
1117
"movq %%mm1, %%mm3\n"
1118
"punpcklbw %%mm7,%%mm0\n"
1119
"punpcklbw %%mm7,%%mm1\n"
1120
"punpckhbw %%mm7,%%mm2\n"
1121
"punpckhbw %%mm7,%%mm3\n"
1122
"psubw %%mm1, %%mm0\n"
1123
"psubw %%mm3, %%mm2\n"
1124
"psubw %%mm0, %%mm4\n"
1125
"psubw %%mm2, %%mm5\n"
1126
"pxor %%mm3, %%mm3\n"
1127
"pxor %%mm1, %%mm1\n"
1128
"pcmpgtw %%mm4, %%mm3\n\t"
1129
"pcmpgtw %%mm5, %%mm1\n\t"
1130
"pxor %%mm3, %%mm4\n"
1131
"pxor %%mm1, %%mm5\n"
1132
"psubw %%mm3, %%mm4\n"
1133
"psubw %%mm1, %%mm5\n"
1134
"paddw %%mm4, %%mm5\n"
1135
"paddw %%mm5, %%mm6\n"
1140
"movq 1(%0),%%mm1\n"
1141
"movq %%mm4, %%mm5\n"
1142
"movq %%mm1, %%mm3\n"
1143
"punpcklbw %%mm7,%%mm4\n"
1144
"punpcklbw %%mm7,%%mm1\n"
1145
"punpckhbw %%mm7,%%mm5\n"
1146
"punpckhbw %%mm7,%%mm3\n"
1147
"psubw %%mm1, %%mm4\n"
1148
"psubw %%mm3, %%mm5\n"
1149
"psubw %%mm4, %%mm0\n"
1150
"psubw %%mm5, %%mm2\n"
1151
"pxor %%mm3, %%mm3\n"
1152
"pxor %%mm1, %%mm1\n"
1153
"pcmpgtw %%mm0, %%mm3\n\t"
1154
"pcmpgtw %%mm2, %%mm1\n\t"
1155
"pxor %%mm3, %%mm0\n"
1156
"pxor %%mm1, %%mm2\n"
1157
"psubw %%mm3, %%mm0\n"
1158
"psubw %%mm1, %%mm2\n"
1159
"paddw %%mm0, %%mm2\n"
1160
"paddw %%mm2, %%mm6\n"
1166
"movq %%mm6, %%mm0\n"
1167
"punpcklwd %%mm7,%%mm0\n"
1168
"punpckhwd %%mm7,%%mm6\n"
1169
"paddd %%mm0, %%mm6\n"
1171
"movq %%mm6,%%mm0\n"
1172
"psrlq $32, %%mm6\n"
1173
"paddd %%mm6,%%mm0\n"
1175
: "+r" (pix1), "=r"(tmp)
1176
: "r" ((long)line_size) , "g" (h-2)
1178
return tmp + hf_noise8_mmx(pix+8, line_size, h);
1181
static int nsse16_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
1182
MpegEncContext *c = p;
1185
if(c) score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h);
1186
else score1 = sse16_mmx(c, pix1, pix2, line_size, h);
1187
score2= hf_noise16_mmx(pix1, line_size, h) - hf_noise16_mmx(pix2, line_size, h);
1189
if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1190
else return score1 + FFABS(score2)*8;
1193
static int nsse8_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
1194
MpegEncContext *c = p;
1195
int score1= sse8_mmx(c, pix1, pix2, line_size, h);
1196
int score2= hf_noise8_mmx(pix1, line_size, h) - hf_noise8_mmx(pix2, line_size, h);
1198
if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1199
else return score1 + FFABS(score2)*8;
1202
static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
1205
assert( (((int)pix) & 7) == 0);
1206
assert((line_size &7) ==0);
1208
#define SUM(in0, in1, out0, out1) \
1209
"movq (%0), %%mm2\n"\
1210
"movq 8(%0), %%mm3\n"\
1212
"movq %%mm2, " #out0 "\n"\
1213
"movq %%mm3, " #out1 "\n"\
1214
"psubusb " #in0 ", %%mm2\n"\
1215
"psubusb " #in1 ", %%mm3\n"\
1216
"psubusb " #out0 ", " #in0 "\n"\
1217
"psubusb " #out1 ", " #in1 "\n"\
1218
"por %%mm2, " #in0 "\n"\
1219
"por %%mm3, " #in1 "\n"\
1220
"movq " #in0 ", %%mm2\n"\
1221
"movq " #in1 ", %%mm3\n"\
1222
"punpcklbw %%mm7, " #in0 "\n"\
1223
"punpcklbw %%mm7, " #in1 "\n"\
1224
"punpckhbw %%mm7, %%mm2\n"\
1225
"punpckhbw %%mm7, %%mm3\n"\
1226
"paddw " #in1 ", " #in0 "\n"\
1227
"paddw %%mm3, %%mm2\n"\
1228
"paddw %%mm2, " #in0 "\n"\
1229
"paddw " #in0 ", %%mm6\n"
1234
"pxor %%mm6,%%mm6\n"
1235
"pxor %%mm7,%%mm7\n"
1237
"movq 8(%0),%%mm1\n"
1240
SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1243
SUM(%%mm4, %%mm5, %%mm0, %%mm1)
1245
SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1250
"movq %%mm6,%%mm0\n"
1251
"psrlq $32, %%mm6\n"
1252
"paddw %%mm6,%%mm0\n"
1253
"movq %%mm0,%%mm6\n"
1254
"psrlq $16, %%mm0\n"
1255
"paddw %%mm6,%%mm0\n"
1257
: "+r" (pix), "=r"(tmp)
1258
: "r" ((long)line_size) , "m" (h)
1260
return tmp & 0xFFFF;
1264
static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
1267
assert( (((int)pix) & 7) == 0);
1268
assert((line_size &7) ==0);
1270
#define SUM(in0, in1, out0, out1) \
1271
"movq (%0), " #out0 "\n"\
1272
"movq 8(%0), " #out1 "\n"\
1274
"psadbw " #out0 ", " #in0 "\n"\
1275
"psadbw " #out1 ", " #in1 "\n"\
1276
"paddw " #in1 ", " #in0 "\n"\
1277
"paddw " #in0 ", %%mm6\n"
1281
"pxor %%mm6,%%mm6\n"
1282
"pxor %%mm7,%%mm7\n"
1284
"movq 8(%0),%%mm1\n"
1287
SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1290
SUM(%%mm4, %%mm5, %%mm0, %%mm1)
1292
SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1298
: "+r" (pix), "=r"(tmp)
1299
: "r" ((long)line_size) , "m" (h)
1305
static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
1308
assert( (((int)pix1) & 7) == 0);
1309
assert( (((int)pix2) & 7) == 0);
1310
assert((line_size &7) ==0);
1312
#define SUM(in0, in1, out0, out1) \
1313
"movq (%0),%%mm2\n"\
1314
"movq (%1)," #out0 "\n"\
1315
"movq 8(%0),%%mm3\n"\
1316
"movq 8(%1)," #out1 "\n"\
1319
"psubb " #out0 ", %%mm2\n"\
1320
"psubb " #out1 ", %%mm3\n"\
1321
"pxor %%mm7, %%mm2\n"\
1322
"pxor %%mm7, %%mm3\n"\
1323
"movq %%mm2, " #out0 "\n"\
1324
"movq %%mm3, " #out1 "\n"\
1325
"psubusb " #in0 ", %%mm2\n"\
1326
"psubusb " #in1 ", %%mm3\n"\
1327
"psubusb " #out0 ", " #in0 "\n"\
1328
"psubusb " #out1 ", " #in1 "\n"\
1329
"por %%mm2, " #in0 "\n"\
1330
"por %%mm3, " #in1 "\n"\
1331
"movq " #in0 ", %%mm2\n"\
1332
"movq " #in1 ", %%mm3\n"\
1333
"punpcklbw %%mm7, " #in0 "\n"\
1334
"punpcklbw %%mm7, " #in1 "\n"\
1335
"punpckhbw %%mm7, %%mm2\n"\
1336
"punpckhbw %%mm7, %%mm3\n"\
1337
"paddw " #in1 ", " #in0 "\n"\
1338
"paddw %%mm3, %%mm2\n"\
1339
"paddw %%mm2, " #in0 "\n"\
1340
"paddw " #in0 ", %%mm6\n"
1345
"pxor %%mm6,%%mm6\n"
1346
"pcmpeqw %%mm7,%%mm7\n"
1347
"psllw $15, %%mm7\n"
1348
"packsswb %%mm7, %%mm7\n"
1351
"movq 8(%0),%%mm1\n"
1352
"movq 8(%1),%%mm3\n"
1356
"psubb %%mm2, %%mm0\n"
1357
"psubb %%mm3, %%mm1\n"
1358
"pxor %%mm7, %%mm0\n"
1359
"pxor %%mm7, %%mm1\n"
1360
SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1363
SUM(%%mm4, %%mm5, %%mm0, %%mm1)
1365
SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1370
"movq %%mm6,%%mm0\n"
1371
"psrlq $32, %%mm6\n"
1372
"paddw %%mm6,%%mm0\n"
1373
"movq %%mm0,%%mm6\n"
1374
"psrlq $16, %%mm0\n"
1375
"paddw %%mm6,%%mm0\n"
1377
: "+r" (pix1), "+r" (pix2), "=r"(tmp)
1378
: "r" ((long)line_size) , "m" (h)
1380
return tmp & 0x7FFF;
1384
static int vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
1387
assert( (((int)pix1) & 7) == 0);
1388
assert( (((int)pix2) & 7) == 0);
1389
assert((line_size &7) ==0);
1391
#define SUM(in0, in1, out0, out1) \
1392
"movq (%0)," #out0 "\n"\
1393
"movq (%1),%%mm2\n"\
1394
"movq 8(%0)," #out1 "\n"\
1395
"movq 8(%1),%%mm3\n"\
1398
"psubb %%mm2, " #out0 "\n"\
1399
"psubb %%mm3, " #out1 "\n"\
1400
"pxor %%mm7, " #out0 "\n"\
1401
"pxor %%mm7, " #out1 "\n"\
1402
"psadbw " #out0 ", " #in0 "\n"\
1403
"psadbw " #out1 ", " #in1 "\n"\
1404
"paddw " #in1 ", " #in0 "\n"\
1405
"paddw " #in0 ", %%mm6\n"
1409
"pxor %%mm6,%%mm6\n"
1410
"pcmpeqw %%mm7,%%mm7\n"
1411
"psllw $15, %%mm7\n"
1412
"packsswb %%mm7, %%mm7\n"
1415
"movq 8(%0),%%mm1\n"
1416
"movq 8(%1),%%mm3\n"
1420
"psubb %%mm2, %%mm0\n"
1421
"psubb %%mm3, %%mm1\n"
1422
"pxor %%mm7, %%mm0\n"
1423
"pxor %%mm7, %%mm1\n"
1424
SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1427
SUM(%%mm4, %%mm5, %%mm0, %%mm1)
1429
SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1435
: "+r" (pix1), "+r" (pix2), "=r"(tmp)
1436
: "r" ((long)line_size) , "m" (h)
565
1442
static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
569
"movq (%2, %0), %%mm0 \n\t"
570
"movq (%1, %0), %%mm1 \n\t"
571
"psubb %%mm0, %%mm1 \n\t"
572
"movq %%mm1, (%3, %0) \n\t"
573
"movq 8(%2, %0), %%mm0 \n\t"
574
"movq 8(%1, %0), %%mm1 \n\t"
575
"psubb %%mm0, %%mm1 \n\t"
576
"movq %%mm1, 8(%3, %0) \n\t"
1446
"movq (%2, %0), %%mm0 \n\t"
1447
"movq (%1, %0), %%mm1 \n\t"
1448
"psubb %%mm0, %%mm1 \n\t"
1449
"movq %%mm1, (%3, %0) \n\t"
1450
"movq 8(%2, %0), %%mm0 \n\t"
1451
"movq 8(%1, %0), %%mm1 \n\t"
1452
"psubb %%mm0, %%mm1 \n\t"
1453
"movq %%mm1, 8(%3, %0) \n\t"
581
: "r"(src1), "r"(src2), "r"(dst), "r"(w-15)
1458
: "r"(src1), "r"(src2), "r"(dst), "r"((long)w-15)
584
1461
dst[i+0] = src1[i+0]-src2[i+0];
1464
static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
1470
"movq -1(%1, %0), %%mm0 \n\t" // LT
1471
"movq (%1, %0), %%mm1 \n\t" // T
1472
"movq -1(%2, %0), %%mm2 \n\t" // L
1473
"movq (%2, %0), %%mm3 \n\t" // X
1474
"movq %%mm2, %%mm4 \n\t" // L
1475
"psubb %%mm0, %%mm2 \n\t"
1476
"paddb %%mm1, %%mm2 \n\t" // L + T - LT
1477
"movq %%mm4, %%mm5 \n\t" // L
1478
"pmaxub %%mm1, %%mm4 \n\t" // max(T, L)
1479
"pminub %%mm5, %%mm1 \n\t" // min(T, L)
1480
"pminub %%mm2, %%mm4 \n\t"
1481
"pmaxub %%mm1, %%mm4 \n\t"
1482
"psubb %%mm4, %%mm3 \n\t" // dst - pred
1483
"movq %%mm3, (%3, %0) \n\t"
1488
: "r"(src1), "r"(src2), "r"(dst), "r"((long)w)
1494
dst[0]= src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt)&0xFF);
1496
*left_top= src1[w-1];
586
1500
// Emits inline-asm text for two independent butterflies on packed 16-bit words.
// For each pair (a, b) the sequence is: a += b; b += b; b -= a.
// In terms of the values on entry, a ends up holding a+b and b ends up holding b-a.
// Both a and b registers are overwritten.
#define LBUTTERFLY2(a1,b1,a2,b2)\
587
"paddw " #b1 ", " #a1 " \n\t"\
588
"paddw " #b2 ", " #a2 " \n\t"\
589
"paddw " #b1 ", " #b1 " \n\t"\
590
"paddw " #b2 ", " #b2 " \n\t"\
591
"psubw " #a1 ", " #b1 " \n\t"\
592
"psubw " #a2 ", " #b2 " \n\t"
1501
"paddw " #b1 ", " #a1 " \n\t"\
1502
"paddw " #b2 ", " #a2 " \n\t"\
1503
"paddw " #b1 ", " #b1 " \n\t"\
1504
"paddw " #b2 ", " #b2 " \n\t"\
1505
"psubw " #a1 ", " #b1 " \n\t"\
1506
"psubw " #a2 ", " #b2 " \n\t"
594
1508
#define HADAMARD48\
595
1509
LBUTTERFLY2(%%mm0, %%mm1, %%mm2, %%mm3)\
635
1544
SBUTTERFLY(t,b,c,dq) /* t=cgko c=dhlp */
637
1546
// Emits inline-asm text that loads four quadwords into registers a, b, c, d.
// The loads read from byte offsets o, o+16, o+32 and o+48 relative to asm operand %1.
#define LOAD4(o, a, b, c, d)\
638
"movq "#o"(%1), " #a " \n\t"\
639
"movq "#o"+16(%1), " #b " \n\t"\
640
"movq "#o"+32(%1), " #c " \n\t"\
641
"movq "#o"+48(%1), " #d " \n\t"
1547
"movq "#o"(%1), " #a " \n\t"\
1548
"movq "#o"+16(%1), " #b " \n\t"\
1549
"movq "#o"+32(%1), " #c " \n\t"\
1550
"movq "#o"+48(%1), " #d " \n\t"
643
1552
// Emits inline-asm text that stores registers a, b, c, d as four quadwords.
// The stores write to byte offsets o, o+16, o+32 and o+48 relative to asm operand %1.
// NOTE(review): the last line still ends with a continuation backslash, so the line
// that follows is pulled into the macro body -- confirm that line is meant to be blank.
#define STORE4(o, a, b, c, d)\
644
"movq "#a", "#o"(%1) \n\t"\
645
"movq "#b", "#o"+16(%1) \n\t"\
646
"movq "#c", "#o"+32(%1) \n\t"\
647
"movq "#d", "#o"+48(%1) \n\t"\
649
static int hadamard8_diff_mmx(void *s, uint8_t *src1, uint8_t *src2, int stride){
650
uint64_t temp[16] __align8;
653
diff_pixels_mmx((DCTELEM*)temp, src1, src2, stride);
656
LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
657
LOAD4(64, %%mm4, %%mm5, %%mm6, %%mm7)
661
"movq %%mm7, 112(%1) \n\t"
663
TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
664
STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)
666
"movq 112(%1), %%mm7 \n\t"
667
TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
668
STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)
670
LOAD4(8 , %%mm0, %%mm1, %%mm2, %%mm3)
671
LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)
675
"movq %%mm7, 120(%1) \n\t"
677
TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
678
STORE4(8 , %%mm0, %%mm3, %%mm7, %%mm2)
680
"movq 120(%1), %%mm7 \n\t"
681
TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
682
"movq %%mm7, %%mm5 \n\t"//FIXME remove
683
"movq %%mm6, %%mm7 \n\t"
684
"movq %%mm0, %%mm6 \n\t"
685
// STORE4(72, %%mm4, %%mm7, %%mm0, %%mm6) //FIXME remove
687
LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)
688
// LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)
691
"movq %%mm7, 64(%1) \n\t"
693
MMABS_SUM(%%mm1, %%mm7, %%mm0)
694
MMABS_SUM(%%mm2, %%mm7, %%mm0)
695
MMABS_SUM(%%mm3, %%mm7, %%mm0)
696
MMABS_SUM(%%mm4, %%mm7, %%mm0)
697
MMABS_SUM(%%mm5, %%mm7, %%mm0)
698
MMABS_SUM(%%mm6, %%mm7, %%mm0)
699
"movq 64(%1), %%mm1 \n\t"
700
MMABS_SUM(%%mm1, %%mm7, %%mm0)
701
"movq %%mm0, 64(%1) \n\t"
703
LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
704
LOAD4(8 , %%mm4, %%mm5, %%mm6, %%mm7)
707
"movq %%mm7, (%1) \n\t"
709
MMABS_SUM(%%mm1, %%mm7, %%mm0)
710
MMABS_SUM(%%mm2, %%mm7, %%mm0)
711
MMABS_SUM(%%mm3, %%mm7, %%mm0)
712
MMABS_SUM(%%mm4, %%mm7, %%mm0)
713
MMABS_SUM(%%mm5, %%mm7, %%mm0)
714
MMABS_SUM(%%mm6, %%mm7, %%mm0)
715
"movq (%1), %%mm1 \n\t"
716
MMABS_SUM(%%mm1, %%mm7, %%mm0)
717
"movq 64(%1), %%mm1 \n\t"
718
MMABS_SUM(%%mm1, %%mm7, %%mm0)
720
"movq %%mm0, %%mm1 \n\t"
721
"psrlq $32, %%mm0 \n\t"
722
"paddusw %%mm1, %%mm0 \n\t"
723
"movq %%mm0, %%mm1 \n\t"
724
"psrlq $16, %%mm0 \n\t"
725
"paddusw %%mm1, %%mm0 \n\t"
726
"movd %%mm0, %0 \n\t"
734
static int hadamard8_diff_mmx2(void *s, uint8_t *src1, uint8_t *src2, int stride){
735
uint64_t temp[16] __align8;
738
diff_pixels_mmx((DCTELEM*)temp, src1, src2, stride);
741
LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
742
LOAD4(64, %%mm4, %%mm5, %%mm6, %%mm7)
746
"movq %%mm7, 112(%1) \n\t"
748
TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
749
STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)
751
"movq 112(%1), %%mm7 \n\t"
752
TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
753
STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)
755
LOAD4(8 , %%mm0, %%mm1, %%mm2, %%mm3)
756
LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)
760
"movq %%mm7, 120(%1) \n\t"
762
TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
763
STORE4(8 , %%mm0, %%mm3, %%mm7, %%mm2)
765
"movq 120(%1), %%mm7 \n\t"
766
TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
767
"movq %%mm7, %%mm5 \n\t"//FIXME remove
768
"movq %%mm6, %%mm7 \n\t"
769
"movq %%mm0, %%mm6 \n\t"
770
// STORE4(72, %%mm4, %%mm7, %%mm0, %%mm6) //FIXME remove
772
LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)
773
// LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)
776
"movq %%mm7, 64(%1) \n\t"
777
MMABS_MMX2(%%mm0, %%mm7)
778
MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
779
MMABS_SUM_MMX2(%%mm2, %%mm7, %%mm0)
780
MMABS_SUM_MMX2(%%mm3, %%mm7, %%mm0)
781
MMABS_SUM_MMX2(%%mm4, %%mm7, %%mm0)
782
MMABS_SUM_MMX2(%%mm5, %%mm7, %%mm0)
783
MMABS_SUM_MMX2(%%mm6, %%mm7, %%mm0)
784
"movq 64(%1), %%mm1 \n\t"
785
MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
786
"movq %%mm0, 64(%1) \n\t"
788
LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
789
LOAD4(8 , %%mm4, %%mm5, %%mm6, %%mm7)
792
"movq %%mm7, (%1) \n\t"
793
MMABS_MMX2(%%mm0, %%mm7)
794
MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
795
MMABS_SUM_MMX2(%%mm2, %%mm7, %%mm0)
796
MMABS_SUM_MMX2(%%mm3, %%mm7, %%mm0)
797
MMABS_SUM_MMX2(%%mm4, %%mm7, %%mm0)
798
MMABS_SUM_MMX2(%%mm5, %%mm7, %%mm0)
799
MMABS_SUM_MMX2(%%mm6, %%mm7, %%mm0)
800
"movq (%1), %%mm1 \n\t"
801
MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
802
"movq 64(%1), %%mm1 \n\t"
803
MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
805
"movq %%mm0, %%mm1 \n\t"
806
"psrlq $32, %%mm0 \n\t"
807
"paddusw %%mm1, %%mm0 \n\t"
808
"movq %%mm0, %%mm1 \n\t"
809
"psrlq $16, %%mm0 \n\t"
810
"paddusw %%mm1, %%mm0 \n\t"
811
"movd %%mm0, %0 \n\t"
820
WARPER88_1616(hadamard8_diff_mmx, hadamard8_diff16_mmx)
821
WARPER88_1616(hadamard8_diff_mmx2, hadamard8_diff16_mmx2)
1553
"movq "#a", "#o"(%1) \n\t"\
1554
"movq "#b", "#o"+16(%1) \n\t"\
1555
"movq "#c", "#o"+32(%1) \n\t"\
1556
"movq "#d", "#o"+48(%1) \n\t"\
1558
static int hadamard8_diff_mmx(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){
1559
DECLARE_ALIGNED_8(uint64_t, temp[16]);
1564
diff_pixels_mmx((DCTELEM*)temp, src1, src2, stride);
1567
LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
1568
LOAD4(64, %%mm4, %%mm5, %%mm6, %%mm7)
1572
"movq %%mm7, 112(%1) \n\t"
1574
TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
1575
STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)
1577
"movq 112(%1), %%mm7 \n\t"
1578
TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
1579
STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)
1581
LOAD4(8 , %%mm0, %%mm1, %%mm2, %%mm3)
1582
LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)
1586
"movq %%mm7, 120(%1) \n\t"
1588
TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
1589
STORE4(8 , %%mm0, %%mm3, %%mm7, %%mm2)
1591
"movq 120(%1), %%mm7 \n\t"
1592
TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
1593
"movq %%mm7, %%mm5 \n\t"//FIXME remove
1594
"movq %%mm6, %%mm7 \n\t"
1595
"movq %%mm0, %%mm6 \n\t"
1596
// STORE4(72, %%mm4, %%mm7, %%mm0, %%mm6) //FIXME remove
1598
LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)
1599
// LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)
1602
"movq %%mm7, 64(%1) \n\t"
1604
MMABS_SUM(%%mm1, %%mm7, %%mm0)
1605
MMABS_SUM(%%mm2, %%mm7, %%mm0)
1606
MMABS_SUM(%%mm3, %%mm7, %%mm0)
1607
MMABS_SUM(%%mm4, %%mm7, %%mm0)
1608
MMABS_SUM(%%mm5, %%mm7, %%mm0)
1609
MMABS_SUM(%%mm6, %%mm7, %%mm0)
1610
"movq 64(%1), %%mm1 \n\t"
1611
MMABS_SUM(%%mm1, %%mm7, %%mm0)
1612
"movq %%mm0, 64(%1) \n\t"
1614
LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
1615
LOAD4(8 , %%mm4, %%mm5, %%mm6, %%mm7)
1618
"movq %%mm7, (%1) \n\t"
1620
MMABS_SUM(%%mm1, %%mm7, %%mm0)
1621
MMABS_SUM(%%mm2, %%mm7, %%mm0)
1622
MMABS_SUM(%%mm3, %%mm7, %%mm0)
1623
MMABS_SUM(%%mm4, %%mm7, %%mm0)
1624
MMABS_SUM(%%mm5, %%mm7, %%mm0)
1625
MMABS_SUM(%%mm6, %%mm7, %%mm0)
1626
"movq (%1), %%mm1 \n\t"
1627
MMABS_SUM(%%mm1, %%mm7, %%mm0)
1628
"movq 64(%1), %%mm1 \n\t"
1629
MMABS_SUM(%%mm1, %%mm7, %%mm0)
1631
"movq %%mm0, %%mm1 \n\t"
1632
"psrlq $32, %%mm0 \n\t"
1633
"paddusw %%mm1, %%mm0 \n\t"
1634
"movq %%mm0, %%mm1 \n\t"
1635
"psrlq $16, %%mm0 \n\t"
1636
"paddusw %%mm1, %%mm0 \n\t"
1637
"movd %%mm0, %0 \n\t"
1645
static int hadamard8_diff_mmx2(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){
1646
DECLARE_ALIGNED_8(uint64_t, temp[16]);
1651
diff_pixels_mmx((DCTELEM*)temp, src1, src2, stride);
1654
LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
1655
LOAD4(64, %%mm4, %%mm5, %%mm6, %%mm7)
1659
"movq %%mm7, 112(%1) \n\t"
1661
TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
1662
STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)
1664
"movq 112(%1), %%mm7 \n\t"
1665
TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
1666
STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)
1668
LOAD4(8 , %%mm0, %%mm1, %%mm2, %%mm3)
1669
LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)
1673
"movq %%mm7, 120(%1) \n\t"
1675
TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
1676
STORE4(8 , %%mm0, %%mm3, %%mm7, %%mm2)
1678
"movq 120(%1), %%mm7 \n\t"
1679
TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
1680
"movq %%mm7, %%mm5 \n\t"//FIXME remove
1681
"movq %%mm6, %%mm7 \n\t"
1682
"movq %%mm0, %%mm6 \n\t"
1683
// STORE4(72, %%mm4, %%mm7, %%mm0, %%mm6) //FIXME remove
1685
LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)
1686
// LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)
1689
"movq %%mm7, 64(%1) \n\t"
1690
MMABS_MMX2(%%mm0, %%mm7)
1691
MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
1692
MMABS_SUM_MMX2(%%mm2, %%mm7, %%mm0)
1693
MMABS_SUM_MMX2(%%mm3, %%mm7, %%mm0)
1694
MMABS_SUM_MMX2(%%mm4, %%mm7, %%mm0)
1695
MMABS_SUM_MMX2(%%mm5, %%mm7, %%mm0)
1696
MMABS_SUM_MMX2(%%mm6, %%mm7, %%mm0)
1697
"movq 64(%1), %%mm1 \n\t"
1698
MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
1699
"movq %%mm0, 64(%1) \n\t"
1701
LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
1702
LOAD4(8 , %%mm4, %%mm5, %%mm6, %%mm7)
1705
"movq %%mm7, (%1) \n\t"
1706
MMABS_MMX2(%%mm0, %%mm7)
1707
MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
1708
MMABS_SUM_MMX2(%%mm2, %%mm7, %%mm0)
1709
MMABS_SUM_MMX2(%%mm3, %%mm7, %%mm0)
1710
MMABS_SUM_MMX2(%%mm4, %%mm7, %%mm0)
1711
MMABS_SUM_MMX2(%%mm5, %%mm7, %%mm0)
1712
MMABS_SUM_MMX2(%%mm6, %%mm7, %%mm0)
1713
"movq (%1), %%mm1 \n\t"
1714
MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
1715
"movq 64(%1), %%mm1 \n\t"
1716
MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
1718
"pshufw $0x0E, %%mm0, %%mm1 \n\t"
1719
"paddusw %%mm1, %%mm0 \n\t"
1720
"pshufw $0x01, %%mm0, %%mm1 \n\t"
1721
"paddusw %%mm1, %%mm0 \n\t"
1722
"movd %%mm0, %0 \n\t"
1731
WARPER8_16_SQ(hadamard8_diff_mmx, hadamard8_diff16_mmx)
1732
WARPER8_16_SQ(hadamard8_diff_mmx2, hadamard8_diff16_mmx2)
1733
#endif //CONFIG_ENCODERS
823
1735
// The no-rounding 8- and 16-wide "put" names are plain aliases: each expands to the
// corresponding put_pixels*_mmx call with the same four arguments.
#define put_no_rnd_pixels8_mmx(a,b,c,d) put_pixels8_mmx(a,b,c,d)
824
1736
#define put_no_rnd_pixels16_mmx(a,b,c,d) put_pixels16_mmx(a,b,c,d)
826
1738
// Emits inline-asm text for one step of the vertical low-pass filter on packed 16-bit words:
//   result = (20*x1 - 6*x2 + 3*x3 - x4 + rnd) >> 5      (arithmetic shift)
// with x1 = m3 + m4, x2 = in2 + m5, x3 = in1 + m6, x4 = in0 + in7.
// The result is packed to bytes with unsigned saturation in mm5 and then handed to
// OP(%%mm5, out, %%mm7, d).
// Side effects visible here: mm4, mm5 and mm6 are used as scratch, and m3 is first
// accumulated into and then reloaded from in7.
// The pw_20 and pw_3 parameters are not referenced in the body; the constants are taken
// from MANGLE(ff_pw_20) and MANGLE(ff_pw_3) instead.
#define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\
827
"paddw " #m4 ", " #m3 " \n\t" /* x1 */\
828
"movq "MANGLE(ff_pw_20)", %%mm4 \n\t" /* 20 */\
829
"pmullw " #m3 ", %%mm4 \n\t" /* 20x1 */\
830
"movq "#in7", " #m3 " \n\t" /* d */\
831
"movq "#in0", %%mm5 \n\t" /* D */\
832
"paddw " #m3 ", %%mm5 \n\t" /* x4 */\
833
"psubw %%mm5, %%mm4 \n\t" /* 20x1 - x4 */\
834
"movq "#in1", %%mm5 \n\t" /* C */\
835
"movq "#in2", %%mm6 \n\t" /* B */\
836
"paddw " #m6 ", %%mm5 \n\t" /* x3 */\
837
"paddw " #m5 ", %%mm6 \n\t" /* x2 */\
838
"paddw %%mm6, %%mm6 \n\t" /* 2x2 */\
839
"psubw %%mm6, %%mm5 \n\t" /* -2x2 + x3 */\
840
"pmullw "MANGLE(ff_pw_3)", %%mm5 \n\t" /* -6x2 + 3x3 */\
841
"paddw " #rnd ", %%mm4 \n\t" /* 20x1 - x4 + rnd */\
842
"paddw %%mm4, %%mm5 \n\t" /* 20x1 - 6x2 + 3x3 - x4 */\
843
"psraw $5, %%mm5 \n\t"\
844
"packuswb %%mm5, %%mm5 \n\t"\
1739
"paddw " #m4 ", " #m3 " \n\t" /* x1 */\
1740
"movq "MANGLE(ff_pw_20)", %%mm4 \n\t" /* 20 */\
1741
"pmullw " #m3 ", %%mm4 \n\t" /* 20x1 */\
1742
"movq "#in7", " #m3 " \n\t" /* d */\
1743
"movq "#in0", %%mm5 \n\t" /* D */\
1744
"paddw " #m3 ", %%mm5 \n\t" /* x4 */\
1745
"psubw %%mm5, %%mm4 \n\t" /* 20x1 - x4 */\
1746
"movq "#in1", %%mm5 \n\t" /* C */\
1747
"movq "#in2", %%mm6 \n\t" /* B */\
1748
"paddw " #m6 ", %%mm5 \n\t" /* x3 */\
1749
"paddw " #m5 ", %%mm6 \n\t" /* x2 */\
1750
"paddw %%mm6, %%mm6 \n\t" /* 2x2 */\
1751
"psubw %%mm6, %%mm5 \n\t" /* -2x2 + x3 */\
1752
"pmullw "MANGLE(ff_pw_3)", %%mm5 \n\t" /* -6x2 + 3x3 */\
1753
"paddw " #rnd ", %%mm4 \n\t" /* 20x1 - x4 + rnd */\
1754
"paddw %%mm4, %%mm5 \n\t" /* 20x1 - 6x2 + 3x3 - x4 */\
1755
"psraw $5, %%mm5 \n\t"\
1756
"packuswb %%mm5, %%mm5 \n\t"\
845
1757
OP(%%mm5, out, %%mm7, d)
847
1759
#define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\
852
"pxor %%mm7, %%mm7 \n\t"\
854
"movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
855
"movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
856
"movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
857
"punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
858
"punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
859
"pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
860
"pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
861
"movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
862
"movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
863
"psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
864
"psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
865
"psllq $24, %%mm4 \n\t" /* 000ABCDE */\
866
"punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
867
"punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
868
"punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
869
"paddw %%mm3, %%mm5 \n\t" /* b */\
870
"paddw %%mm2, %%mm6 \n\t" /* c */\
871
"paddw %%mm5, %%mm5 \n\t" /* 2b */\
872
"psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
873
"pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
874
"pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
875
"paddw %%mm4, %%mm0 \n\t" /* a */\
876
"paddw %%mm1, %%mm5 \n\t" /* d */\
877
"pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
878
"psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
879
"paddw %6, %%mm6 \n\t"\
880
"paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
881
"psraw $5, %%mm0 \n\t"\
882
"movq %%mm0, %5 \n\t"\
1764
"pxor %%mm7, %%mm7 \n\t"\
1766
"movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
1767
"movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
1768
"movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
1769
"punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
1770
"punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
1771
"pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
1772
"pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
1773
"movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
1774
"movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
1775
"psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
1776
"psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
1777
"psllq $24, %%mm4 \n\t" /* 000ABCDE */\
1778
"punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
1779
"punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
1780
"punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
1781
"paddw %%mm3, %%mm5 \n\t" /* b */\
1782
"paddw %%mm2, %%mm6 \n\t" /* c */\
1783
"paddw %%mm5, %%mm5 \n\t" /* 2b */\
1784
"psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
1785
"pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
1786
"pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
1787
"paddw %%mm4, %%mm0 \n\t" /* a */\
1788
"paddw %%mm1, %%mm5 \n\t" /* d */\
1789
"pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
1790
"psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
1791
"paddw %6, %%mm6 \n\t"\
1792
"paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
1793
"psraw $5, %%mm0 \n\t"\
1794
"movq %%mm0, %5 \n\t"\
883
1795
/* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
885
"movq 5(%0), %%mm0 \n\t" /* FGHIJKLM */\
886
"movq %%mm0, %%mm5 \n\t" /* FGHIJKLM */\
887
"movq %%mm0, %%mm6 \n\t" /* FGHIJKLM */\
888
"psrlq $8, %%mm0 \n\t" /* GHIJKLM0 */\
889
"psrlq $16, %%mm5 \n\t" /* HIJKLM00 */\
890
"punpcklbw %%mm7, %%mm0 \n\t" /* 0G0H0I0J */\
891
"punpcklbw %%mm7, %%mm5 \n\t" /* 0H0I0J0K */\
892
"paddw %%mm0, %%mm2 \n\t" /* b */\
893
"paddw %%mm5, %%mm3 \n\t" /* c */\
894
"paddw %%mm2, %%mm2 \n\t" /* 2b */\
895
"psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
896
"movq %%mm6, %%mm2 \n\t" /* FGHIJKLM */\
897
"psrlq $24, %%mm6 \n\t" /* IJKLM000 */\
898
"punpcklbw %%mm7, %%mm2 \n\t" /* 0F0G0H0I */\
899
"punpcklbw %%mm7, %%mm6 \n\t" /* 0I0J0K0L */\
900
"pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
901
"paddw %%mm2, %%mm1 \n\t" /* a */\
902
"paddw %%mm6, %%mm4 \n\t" /* d */\
903
"pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
904
"psubw %%mm4, %%mm3 \n\t" /* - 6b +3c - d */\
905
"paddw %6, %%mm1 \n\t"\
906
"paddw %%mm1, %%mm3 \n\t" /* 20a - 6b +3c - d */\
907
"psraw $5, %%mm3 \n\t"\
908
"movq %5, %%mm1 \n\t"\
909
"packuswb %%mm3, %%mm1 \n\t"\
1797
"movq 5(%0), %%mm0 \n\t" /* FGHIJKLM */\
1798
"movq %%mm0, %%mm5 \n\t" /* FGHIJKLM */\
1799
"movq %%mm0, %%mm6 \n\t" /* FGHIJKLM */\
1800
"psrlq $8, %%mm0 \n\t" /* GHIJKLM0 */\
1801
"psrlq $16, %%mm5 \n\t" /* HIJKLM00 */\
1802
"punpcklbw %%mm7, %%mm0 \n\t" /* 0G0H0I0J */\
1803
"punpcklbw %%mm7, %%mm5 \n\t" /* 0H0I0J0K */\
1804
"paddw %%mm0, %%mm2 \n\t" /* b */\
1805
"paddw %%mm5, %%mm3 \n\t" /* c */\
1806
"paddw %%mm2, %%mm2 \n\t" /* 2b */\
1807
"psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
1808
"movq %%mm6, %%mm2 \n\t" /* FGHIJKLM */\
1809
"psrlq $24, %%mm6 \n\t" /* IJKLM000 */\
1810
"punpcklbw %%mm7, %%mm2 \n\t" /* 0F0G0H0I */\
1811
"punpcklbw %%mm7, %%mm6 \n\t" /* 0I0J0K0L */\
1812
"pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
1813
"paddw %%mm2, %%mm1 \n\t" /* a */\
1814
"paddw %%mm6, %%mm4 \n\t" /* d */\
1815
"pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
1816
"psubw %%mm4, %%mm3 \n\t" /* - 6b +3c - d */\
1817
"paddw %6, %%mm1 \n\t"\
1818
"paddw %%mm1, %%mm3 \n\t" /* 20a - 6b +3c - d */\
1819
"psraw $5, %%mm3 \n\t"\
1820
"movq %5, %%mm1 \n\t"\
1821
"packuswb %%mm3, %%mm1 \n\t"\
910
1822
OP_MMX2(%%mm1, (%1),%%mm4, q)\
911
1823
/* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\
913
"movq 9(%0), %%mm1 \n\t" /* JKLMNOPQ */\
914
"movq %%mm1, %%mm4 \n\t" /* JKLMNOPQ */\
915
"movq %%mm1, %%mm3 \n\t" /* JKLMNOPQ */\
916
"psrlq $8, %%mm1 \n\t" /* KLMNOPQ0 */\
917
"psrlq $16, %%mm4 \n\t" /* LMNOPQ00 */\
918
"punpcklbw %%mm7, %%mm1 \n\t" /* 0K0L0M0N */\
919
"punpcklbw %%mm7, %%mm4 \n\t" /* 0L0M0N0O */\
920
"paddw %%mm1, %%mm5 \n\t" /* b */\
921
"paddw %%mm4, %%mm0 \n\t" /* c */\
922
"paddw %%mm5, %%mm5 \n\t" /* 2b */\
923
"psubw %%mm5, %%mm0 \n\t" /* c - 2b */\
924
"movq %%mm3, %%mm5 \n\t" /* JKLMNOPQ */\
925
"psrlq $24, %%mm3 \n\t" /* MNOPQ000 */\
926
"pmullw "MANGLE(ff_pw_3)", %%mm0 \n\t" /* 3c - 6b */\
927
"punpcklbw %%mm7, %%mm3 \n\t" /* 0M0N0O0P */\
928
"paddw %%mm3, %%mm2 \n\t" /* d */\
929
"psubw %%mm2, %%mm0 \n\t" /* -6b + 3c - d */\
930
"movq %%mm5, %%mm2 \n\t" /* JKLMNOPQ */\
931
"punpcklbw %%mm7, %%mm2 \n\t" /* 0J0K0L0M */\
932
"punpckhbw %%mm7, %%mm5 \n\t" /* 0N0O0P0Q */\
933
"paddw %%mm2, %%mm6 \n\t" /* a */\
934
"pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */\
935
"paddw %6, %%mm0 \n\t"\
936
"paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
937
"psraw $5, %%mm0 \n\t"\
1825
"movq 9(%0), %%mm1 \n\t" /* JKLMNOPQ */\
1826
"movq %%mm1, %%mm4 \n\t" /* JKLMNOPQ */\
1827
"movq %%mm1, %%mm3 \n\t" /* JKLMNOPQ */\
1828
"psrlq $8, %%mm1 \n\t" /* KLMNOPQ0 */\
1829
"psrlq $16, %%mm4 \n\t" /* LMNOPQ00 */\
1830
"punpcklbw %%mm7, %%mm1 \n\t" /* 0K0L0M0N */\
1831
"punpcklbw %%mm7, %%mm4 \n\t" /* 0L0M0N0O */\
1832
"paddw %%mm1, %%mm5 \n\t" /* b */\
1833
"paddw %%mm4, %%mm0 \n\t" /* c */\
1834
"paddw %%mm5, %%mm5 \n\t" /* 2b */\
1835
"psubw %%mm5, %%mm0 \n\t" /* c - 2b */\
1836
"movq %%mm3, %%mm5 \n\t" /* JKLMNOPQ */\
1837
"psrlq $24, %%mm3 \n\t" /* MNOPQ000 */\
1838
"pmullw "MANGLE(ff_pw_3)", %%mm0 \n\t" /* 3c - 6b */\
1839
"punpcklbw %%mm7, %%mm3 \n\t" /* 0M0N0O0P */\
1840
"paddw %%mm3, %%mm2 \n\t" /* d */\
1841
"psubw %%mm2, %%mm0 \n\t" /* -6b + 3c - d */\
1842
"movq %%mm5, %%mm2 \n\t" /* JKLMNOPQ */\
1843
"punpcklbw %%mm7, %%mm2 \n\t" /* 0J0K0L0M */\
1844
"punpckhbw %%mm7, %%mm5 \n\t" /* 0N0O0P0Q */\
1845
"paddw %%mm2, %%mm6 \n\t" /* a */\
1846
"pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */\
1847
"paddw %6, %%mm0 \n\t"\
1848
"paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
1849
"psraw $5, %%mm0 \n\t"\
938
1850
/* mm1=KLMN, mm2=JKLM, mm3=MNOP, mm4=LMNO, mm5=NOPQ mm7=0 */\
940
"paddw %%mm5, %%mm3 \n\t" /* a */\
941
"pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0O0P0Q0Q */\
942
"paddw %%mm4, %%mm6 \n\t" /* b */\
943
"pshufw $0xBE, %%mm5, %%mm4 \n\t" /* 0P0Q0Q0P */\
944
"pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0Q0Q0P0O */\
945
"paddw %%mm1, %%mm4 \n\t" /* c */\
946
"paddw %%mm2, %%mm5 \n\t" /* d */\
947
"paddw %%mm6, %%mm6 \n\t" /* 2b */\
948
"psubw %%mm6, %%mm4 \n\t" /* c - 2b */\
949
"pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */\
950
"pmullw "MANGLE(ff_pw_3)", %%mm4 \n\t" /* 3c - 6b */\
951
"psubw %%mm5, %%mm3 \n\t" /* -6b + 3c - d */\
952
"paddw %6, %%mm4 \n\t"\
953
"paddw %%mm3, %%mm4 \n\t" /* 20a - 6b + 3c - d */\
954
"psraw $5, %%mm4 \n\t"\
955
"packuswb %%mm4, %%mm0 \n\t"\
1852
"paddw %%mm5, %%mm3 \n\t" /* a */\
1853
"pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0O0P0Q0Q */\
1854
"paddw %%mm4, %%mm6 \n\t" /* b */\
1855
"pshufw $0xBE, %%mm5, %%mm4 \n\t" /* 0P0Q0Q0P */\
1856
"pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0Q0Q0P0O */\
1857
"paddw %%mm1, %%mm4 \n\t" /* c */\
1858
"paddw %%mm2, %%mm5 \n\t" /* d */\
1859
"paddw %%mm6, %%mm6 \n\t" /* 2b */\
1860
"psubw %%mm6, %%mm4 \n\t" /* c - 2b */\
1861
"pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */\
1862
"pmullw "MANGLE(ff_pw_3)", %%mm4 \n\t" /* 3c - 6b */\
1863
"psubw %%mm5, %%mm3 \n\t" /* -6b + 3c - d */\
1864
"paddw %6, %%mm4 \n\t"\
1865
"paddw %%mm3, %%mm4 \n\t" /* 20a - 6b + 3c - d */\
1866
"psraw $5, %%mm4 \n\t"\
1867
"packuswb %%mm4, %%mm0 \n\t"\
956
1868
OP_MMX2(%%mm0, 8(%1), %%mm4, q)\
962
1874
: "+a"(src), "+c"(dst), "+m"(h)\
963
: "d"(srcStride), "S"(dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
1875
: "d"((long)srcStride), "S"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
1016
1928
uint64_t temp;\
1019
"pxor %%mm7, %%mm7 \n\t"\
1021
"movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
1022
"movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
1023
"movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
1024
"punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
1025
"punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
1026
"pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
1027
"pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
1028
"movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
1029
"movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
1030
"psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
1031
"psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
1032
"psllq $24, %%mm4 \n\t" /* 000ABCDE */\
1033
"punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
1034
"punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
1035
"punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
1036
"paddw %%mm3, %%mm5 \n\t" /* b */\
1037
"paddw %%mm2, %%mm6 \n\t" /* c */\
1038
"paddw %%mm5, %%mm5 \n\t" /* 2b */\
1039
"psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
1040
"pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
1041
"pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
1042
"paddw %%mm4, %%mm0 \n\t" /* a */\
1043
"paddw %%mm1, %%mm5 \n\t" /* d */\
1044
"pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
1045
"psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
1046
"paddw %6, %%mm6 \n\t"\
1047
"paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
1048
"psraw $5, %%mm0 \n\t"\
1931
"pxor %%mm7, %%mm7 \n\t"\
1933
"movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
1934
"movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
1935
"movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
1936
"punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
1937
"punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
1938
"pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
1939
"pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
1940
"movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
1941
"movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
1942
"psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
1943
"psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
1944
"psllq $24, %%mm4 \n\t" /* 000ABCDE */\
1945
"punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
1946
"punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
1947
"punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
1948
"paddw %%mm3, %%mm5 \n\t" /* b */\
1949
"paddw %%mm2, %%mm6 \n\t" /* c */\
1950
"paddw %%mm5, %%mm5 \n\t" /* 2b */\
1951
"psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
1952
"pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
1953
"pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
1954
"paddw %%mm4, %%mm0 \n\t" /* a */\
1955
"paddw %%mm1, %%mm5 \n\t" /* d */\
1956
"pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
1957
"psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
1958
"paddw %6, %%mm6 \n\t"\
1959
"paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
1960
"psraw $5, %%mm0 \n\t"\
1049
1961
/* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
1051
"movd 5(%0), %%mm5 \n\t" /* FGHI */\
1052
"punpcklbw %%mm7, %%mm5 \n\t" /* 0F0G0H0I */\
1053
"pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0G0H0I0I */\
1054
"paddw %%mm5, %%mm1 \n\t" /* a */\
1055
"paddw %%mm6, %%mm2 \n\t" /* b */\
1056
"pshufw $0xBE, %%mm5, %%mm6 \n\t" /* 0H0I0I0H */\
1057
"pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0I0I0H0G */\
1058
"paddw %%mm6, %%mm3 \n\t" /* c */\
1059
"paddw %%mm5, %%mm4 \n\t" /* d */\
1060
"paddw %%mm2, %%mm2 \n\t" /* 2b */\
1061
"psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
1062
"pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
1063
"pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
1064
"psubw %%mm4, %%mm3 \n\t" /* -6b + 3c - d */\
1065
"paddw %6, %%mm1 \n\t"\
1066
"paddw %%mm1, %%mm3 \n\t" /* 20a - 6b + 3c - d */\
1067
"psraw $5, %%mm3 \n\t"\
1068
"packuswb %%mm3, %%mm0 \n\t"\
1963
"movd 5(%0), %%mm5 \n\t" /* FGHI */\
1964
"punpcklbw %%mm7, %%mm5 \n\t" /* 0F0G0H0I */\
1965
"pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0G0H0I0I */\
1966
"paddw %%mm5, %%mm1 \n\t" /* a */\
1967
"paddw %%mm6, %%mm2 \n\t" /* b */\
1968
"pshufw $0xBE, %%mm5, %%mm6 \n\t" /* 0H0I0I0H */\
1969
"pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0I0I0H0G */\
1970
"paddw %%mm6, %%mm3 \n\t" /* c */\
1971
"paddw %%mm5, %%mm4 \n\t" /* d */\
1972
"paddw %%mm2, %%mm2 \n\t" /* 2b */\
1973
"psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
1974
"pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
1975
"pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
1976
"psubw %%mm4, %%mm3 \n\t" /* -6b + 3c - d */\
1977
"paddw %6, %%mm1 \n\t"\
1978
"paddw %%mm1, %%mm3 \n\t" /* 20a - 6b + 3c - d */\
1979
"psraw $5, %%mm3 \n\t"\
1980
"packuswb %%mm3, %%mm0 \n\t"\
1069
1981
OP_MMX2(%%mm0, (%1), %%mm4, q)\
1075
1987
: "+a"(src), "+c"(dst), "+m"(h)\
1076
: "S"(srcStride), "D"(dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
1988
: "S"((long)srcStride), "D"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
1147
2059
/*FIXME reorder for speed */\
1149
/*"pxor %%mm7, %%mm7 \n\t"*/\
1151
"movq (%0), %%mm0 \n\t"\
1152
"movq 8(%0), %%mm1 \n\t"\
1153
"movq 16(%0), %%mm2 \n\t"\
1154
"movq 24(%0), %%mm3 \n\t"\
2061
/*"pxor %%mm7, %%mm7 \n\t"*/\
2063
"movq (%0), %%mm0 \n\t"\
2064
"movq 8(%0), %%mm1 \n\t"\
2065
"movq 16(%0), %%mm2 \n\t"\
2066
"movq 24(%0), %%mm3 \n\t"\
1155
2067
QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
1156
2068
QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
1158
2070
QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
1160
2072
QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
1162
2074
QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
1163
2075
QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\
1165
2077
QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\
1166
2078
QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\
1168
2080
QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\
1169
2081
QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\
1171
2083
QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\
1172
2084
QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\
1174
2086
QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\
1176
2088
QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\
1177
"addl %4, %1 \n\t" \
1178
2090
QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\
1179
2091
QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\
1181
"addl $136, %0 \n\t"\
2093
"add $136, %0 \n\t"\
1186
2098
: "+r"(temp_ptr), "+r"(dst), "+g"(count)\
1187
: "r"(dstStride), "r"(2*dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*dstStride)\
2099
: "r"((long)dstStride), "r"(2*(long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*(long)dstStride)\
1192
2104
static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1193
uint64_t temp[9*4];\
2105
uint64_t temp[9*2];\
1194
2106
uint64_t *temp_ptr= temp;\
1197
2109
/*FIXME unroll */\
1199
"pxor %%mm7, %%mm7 \n\t"\
1201
"movq (%0), %%mm0 \n\t"\
1202
"movq (%0), %%mm1 \n\t"\
1203
"punpcklbw %%mm7, %%mm0 \n\t"\
1204
"punpckhbw %%mm7, %%mm1 \n\t"\
1205
"movq %%mm0, (%1) \n\t"\
1206
"movq %%mm1, 9*8(%1) \n\t"\
2111
"pxor %%mm7, %%mm7 \n\t"\
2113
"movq (%0), %%mm0 \n\t"\
2114
"movq (%0), %%mm1 \n\t"\
2115
"punpcklbw %%mm7, %%mm0 \n\t"\
2116
"punpckhbw %%mm7, %%mm1 \n\t"\
2117
"movq %%mm0, (%1) \n\t"\
2118
"movq %%mm1, 9*8(%1) \n\t"\
1211
2123
: "+r" (src), "+r" (temp_ptr), "+r"(count)\
2124
: "r" ((long)srcStride)\
1286
2198
uint64_t temp[8];\
1287
2199
uint8_t * const half= (uint8_t*)temp;\
1288
2200
put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
1289
OPNAME ## pixels8_l2_mmx(dst, src+stride, half, stride, stride, 8);\
2201
OPNAME ## pixels8_l2_ ## MMX(dst, src+stride, half, stride, stride, 8);\
1291
2203
static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1292
2204
uint64_t half[8 + 9];\
1293
2205
uint8_t * const halfH= ((uint8_t*)half) + 64;\
1294
2206
uint8_t * const halfHV= ((uint8_t*)half);\
1295
2207
put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1296
put ## RND ## pixels8_l2_mmx(halfH, src, halfH, 8, stride, 9);\
2208
put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
1297
2209
put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1298
OPNAME ## pixels8_l2_mmx(dst, halfH, halfHV, stride, 8, 8);\
2210
OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
1300
2212
static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1301
2213
uint64_t half[8 + 9];\
1302
2214
uint8_t * const halfH= ((uint8_t*)half) + 64;\
1303
2215
uint8_t * const halfHV= ((uint8_t*)half);\
1304
2216
put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1305
put ## RND ## pixels8_l2_mmx(halfH, src+1, halfH, 8, stride, 9);\
2217
put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
1306
2218
put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1307
OPNAME ## pixels8_l2_mmx(dst, halfH, halfHV, stride, 8, 8);\
2219
OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
1309
2221
static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1310
2222
uint64_t half[8 + 9];\
1311
2223
uint8_t * const halfH= ((uint8_t*)half) + 64;\
1312
2224
uint8_t * const halfHV= ((uint8_t*)half);\
1313
2225
put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1314
put ## RND ## pixels8_l2_mmx(halfH, src, halfH, 8, stride, 9);\
2226
put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
1315
2227
put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1316
OPNAME ## pixels8_l2_mmx(dst, halfH+8, halfHV, stride, 8, 8);\
2228
OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
1318
2230
static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1319
2231
uint64_t half[8 + 9];\
1320
2232
uint8_t * const halfH= ((uint8_t*)half) + 64;\
1321
2233
uint8_t * const halfHV= ((uint8_t*)half);\
1322
2234
put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1323
put ## RND ## pixels8_l2_mmx(halfH, src+1, halfH, 8, stride, 9);\
2235
put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
1324
2236
put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1325
OPNAME ## pixels8_l2_mmx(dst, halfH+8, halfHV, stride, 8, 8);\
2237
OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
1327
2239
static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1328
2240
uint64_t half[8 + 9];\
1397
2309
uint64_t temp[32];\
1398
2310
uint8_t * const half= (uint8_t*)temp;\
1399
2311
put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
1400
OPNAME ## pixels16_l2_mmx(dst, src+stride, half, stride, stride, 16);\
2312
OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, stride, stride, 16);\
1402
2314
static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1403
2315
uint64_t half[16*2 + 17*2];\
1404
2316
uint8_t * const halfH= ((uint8_t*)half) + 256;\
1405
2317
uint8_t * const halfHV= ((uint8_t*)half);\
1406
2318
put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1407
put ## RND ## pixels16_l2_mmx(halfH, src, halfH, 16, stride, 17);\
2319
put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
1408
2320
put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
1409
OPNAME ## pixels16_l2_mmx(dst, halfH, halfHV, stride, 16, 16);\
2321
OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
1411
2323
static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1412
2324
uint64_t half[16*2 + 17*2];\
1413
2325
uint8_t * const halfH= ((uint8_t*)half) + 256;\
1414
2326
uint8_t * const halfHV= ((uint8_t*)half);\
1415
2327
put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1416
put ## RND ## pixels16_l2_mmx(halfH, src+1, halfH, 16, stride, 17);\
2328
put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
1417
2329
put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
1418
OPNAME ## pixels16_l2_mmx(dst, halfH, halfHV, stride, 16, 16);\
2330
OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
1420
2332
static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1421
2333
uint64_t half[16*2 + 17*2];\
1422
2334
uint8_t * const halfH= ((uint8_t*)half) + 256;\
1423
2335
uint8_t * const halfHV= ((uint8_t*)half);\
1424
2336
put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1425
put ## RND ## pixels16_l2_mmx(halfH, src, halfH, 16, stride, 17);\
2337
put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
1426
2338
put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
1427
OPNAME ## pixels16_l2_mmx(dst, halfH+16, halfHV, stride, 16, 16);\
2339
OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
1429
2341
static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1430
2342
uint64_t half[16*2 + 17*2];\
1431
2343
uint8_t * const halfH= ((uint8_t*)half) + 256;\
1432
2344
uint8_t * const halfHV= ((uint8_t*)half);\
1433
2345
put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1434
put ## RND ## pixels16_l2_mmx(halfH, src+1, halfH, 16, stride, 17);\
2346
put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
1435
2347
put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
1436
OPNAME ## pixels16_l2_mmx(dst, halfH+16, halfHV, stride, 16, 16);\
2348
OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
1438
2350
static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1439
2351
uint64_t half[16*2 + 17*2];\
1502
2460
c->put_no_rnd_ ## postfix1 = put_no_rnd_ ## postfix2;\
1503
2461
c->avg_ ## postfix1 = avg_ ## postfix2;
2463
static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
2464
int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height){
2466
const int ix = ox>>(16+shift);
2467
const int iy = oy>>(16+shift);
2468
const int oxs = ox>>4;
2469
const int oys = oy>>4;
2470
const int dxxs = dxx>>4;
2471
const int dxys = dxy>>4;
2472
const int dyxs = dyx>>4;
2473
const int dyys = dyy>>4;
2474
const uint16_t r4[4] = {r,r,r,r};
2475
const uint16_t dxy4[4] = {dxys,dxys,dxys,dxys};
2476
const uint16_t dyy4[4] = {dyys,dyys,dyys,dyys};
2477
const uint64_t shift2 = 2*shift;
2478
uint8_t edge_buf[(h+1)*stride];
2481
const int dxw = (dxx-(1<<(16+shift)))*(w-1);
2482
const int dyh = (dyy-(1<<(16+shift)))*(h-1);
2483
const int dxh = dxy*(h-1);
2484
const int dyw = dyx*(w-1);
2485
if( // non-constant fullpel offset (3% of blocks)
2486
(ox^(ox+dxw) | ox^(ox+dxh) | ox^(ox+dxw+dxh) |
2487
oy^(oy+dyw) | oy^(oy+dyh) | oy^(oy+dyw+dyh)) >> (16+shift)
2488
// uses more than 16 bits of subpel mv (only at huge resolution)
2489
|| (dxx|dxy|dyx|dyy)&15 )
2491
//FIXME could still use mmx for some of the rows
2492
ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, width, height);
2496
src += ix + iy*stride;
2497
if( (unsigned)ix >= width-w ||
2498
(unsigned)iy >= height-h )
2500
ff_emulated_edge_mc(edge_buf, src, stride, w+1, h+1, ix, iy, width, height);
2505
"movd %0, %%mm6 \n\t"
2506
"pxor %%mm7, %%mm7 \n\t"
2507
"punpcklwd %%mm6, %%mm6 \n\t"
2508
"punpcklwd %%mm6, %%mm6 \n\t"
2512
for(x=0; x<w; x+=4){
2513
uint16_t dx4[4] = { oxs - dxys + dxxs*(x+0),
2514
oxs - dxys + dxxs*(x+1),
2515
oxs - dxys + dxxs*(x+2),
2516
oxs - dxys + dxxs*(x+3) };
2517
uint16_t dy4[4] = { oys - dyys + dyxs*(x+0),
2518
oys - dyys + dyxs*(x+1),
2519
oys - dyys + dyxs*(x+2),
2520
oys - dyys + dyxs*(x+3) };
2524
"movq %0, %%mm4 \n\t"
2525
"movq %1, %%mm5 \n\t"
2526
"paddw %2, %%mm4 \n\t"
2527
"paddw %3, %%mm5 \n\t"
2528
"movq %%mm4, %0 \n\t"
2529
"movq %%mm5, %1 \n\t"
2530
"psrlw $12, %%mm4 \n\t"
2531
"psrlw $12, %%mm5 \n\t"
2532
: "+m"(*dx4), "+m"(*dy4)
2533
: "m"(*dxy4), "m"(*dyy4)
2537
"movq %%mm6, %%mm2 \n\t"
2538
"movq %%mm6, %%mm1 \n\t"
2539
"psubw %%mm4, %%mm2 \n\t"
2540
"psubw %%mm5, %%mm1 \n\t"
2541
"movq %%mm2, %%mm0 \n\t"
2542
"movq %%mm4, %%mm3 \n\t"
2543
"pmullw %%mm1, %%mm0 \n\t" // (s-dx)*(s-dy)
2544
"pmullw %%mm5, %%mm3 \n\t" // dx*dy
2545
"pmullw %%mm5, %%mm2 \n\t" // (s-dx)*dy
2546
"pmullw %%mm4, %%mm1 \n\t" // dx*(s-dy)
2548
"movd %4, %%mm5 \n\t"
2549
"movd %3, %%mm4 \n\t"
2550
"punpcklbw %%mm7, %%mm5 \n\t"
2551
"punpcklbw %%mm7, %%mm4 \n\t"
2552
"pmullw %%mm5, %%mm3 \n\t" // src[1,1] * dx*dy
2553
"pmullw %%mm4, %%mm2 \n\t" // src[0,1] * (s-dx)*dy
2555
"movd %2, %%mm5 \n\t"
2556
"movd %1, %%mm4 \n\t"
2557
"punpcklbw %%mm7, %%mm5 \n\t"
2558
"punpcklbw %%mm7, %%mm4 \n\t"
2559
"pmullw %%mm5, %%mm1 \n\t" // src[1,0] * dx*(s-dy)
2560
"pmullw %%mm4, %%mm0 \n\t" // src[0,0] * (s-dx)*(s-dy)
2561
"paddw %5, %%mm1 \n\t"
2562
"paddw %%mm3, %%mm2 \n\t"
2563
"paddw %%mm1, %%mm0 \n\t"
2564
"paddw %%mm2, %%mm0 \n\t"
2566
"psrlw %6, %%mm0 \n\t"
2567
"packuswb %%mm0, %%mm0 \n\t"
2568
"movd %%mm0, %0 \n\t"
2570
: "=m"(dst[x+y*stride])
2571
: "m"(src[0]), "m"(src[1]),
2572
"m"(src[stride]), "m"(src[stride+1]),
2573
"m"(*r4), "m"(shift2)
2581
#ifdef CONFIG_ENCODERS
2582
static int try_8x8basis_mmx(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
2585
assert(FFABS(scale) < 256);
2586
scale<<= 16 + 1 - BASIS_SHIFT + RECON_SHIFT;
2589
"pcmpeqw %%mm6, %%mm6 \n\t" // -1w
2590
"psrlw $15, %%mm6 \n\t" // 1w
2591
"pxor %%mm7, %%mm7 \n\t"
2592
"movd %4, %%mm5 \n\t"
2593
"punpcklwd %%mm5, %%mm5 \n\t"
2594
"punpcklwd %%mm5, %%mm5 \n\t"
2596
"movq (%1, %0), %%mm0 \n\t"
2597
"movq 8(%1, %0), %%mm1 \n\t"
2598
"pmulhw %%mm5, %%mm0 \n\t"
2599
"pmulhw %%mm5, %%mm1 \n\t"
2600
"paddw %%mm6, %%mm0 \n\t"
2601
"paddw %%mm6, %%mm1 \n\t"
2602
"psraw $1, %%mm0 \n\t"
2603
"psraw $1, %%mm1 \n\t"
2604
"paddw (%2, %0), %%mm0 \n\t"
2605
"paddw 8(%2, %0), %%mm1 \n\t"
2606
"psraw $6, %%mm0 \n\t"
2607
"psraw $6, %%mm1 \n\t"
2608
"pmullw (%3, %0), %%mm0 \n\t"
2609
"pmullw 8(%3, %0), %%mm1 \n\t"
2610
"pmaddwd %%mm0, %%mm0 \n\t"
2611
"pmaddwd %%mm1, %%mm1 \n\t"
2612
"paddd %%mm1, %%mm0 \n\t"
2613
"psrld $4, %%mm0 \n\t"
2614
"paddd %%mm0, %%mm7 \n\t"
2616
"cmp $128, %0 \n\t" //FIXME optimize & bench
2618
"movq %%mm7, %%mm6 \n\t"
2619
"psrlq $32, %%mm7 \n\t"
2620
"paddd %%mm6, %%mm7 \n\t"
2621
"psrld $2, %%mm7 \n\t"
2622
"movd %%mm7, %0 \n\t"
2625
: "r"(basis), "r"(rem), "r"(weight), "g"(scale)
2630
static void add_8x8basis_mmx(int16_t rem[64], int16_t basis[64], int scale){
2633
if(FFABS(scale) < 256){
2634
scale<<= 16 + 1 - BASIS_SHIFT + RECON_SHIFT;
2636
"pcmpeqw %%mm6, %%mm6 \n\t" // -1w
2637
"psrlw $15, %%mm6 \n\t" // 1w
2638
"movd %3, %%mm5 \n\t"
2639
"punpcklwd %%mm5, %%mm5 \n\t"
2640
"punpcklwd %%mm5, %%mm5 \n\t"
2642
"movq (%1, %0), %%mm0 \n\t"
2643
"movq 8(%1, %0), %%mm1 \n\t"
2644
"pmulhw %%mm5, %%mm0 \n\t"
2645
"pmulhw %%mm5, %%mm1 \n\t"
2646
"paddw %%mm6, %%mm0 \n\t"
2647
"paddw %%mm6, %%mm1 \n\t"
2648
"psraw $1, %%mm0 \n\t"
2649
"psraw $1, %%mm1 \n\t"
2650
"paddw (%2, %0), %%mm0 \n\t"
2651
"paddw 8(%2, %0), %%mm1 \n\t"
2652
"movq %%mm0, (%2, %0) \n\t"
2653
"movq %%mm1, 8(%2, %0) \n\t"
2655
"cmp $128, %0 \n\t" //FIXME optimize & bench
2659
: "r"(basis), "r"(rem), "g"(scale)
2662
for(i=0; i<8*8; i++){
2663
rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
2667
#endif /* CONFIG_ENCODERS */
2669
#define PREFETCH(name, op) \
2670
static void name(void *mem, int stride, int h){\
2671
const uint8_t *p= mem;\
2673
asm volatile(#op" %0" :: "m"(*p));\
2677
PREFETCH(prefetch_mmx2, prefetcht0)
2678
PREFETCH(prefetch_3dnow, prefetch)
2681
#include "h264dsp_mmx.c"
2684
void ff_cavsdsp_init_mmx2(DSPContext* c, AVCodecContext *avctx);
2686
void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
2687
put_pixels8_mmx(dst, src, stride, 8);
2689
void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
2690
avg_pixels8_mmx(dst, src, stride, 8);
2692
void ff_put_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
2693
put_pixels16_mmx(dst, src, stride, 16);
2695
void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
2696
avg_pixels16_mmx(dst, src, stride, 16);
1505
2699
/* external functions, from idct_mmx.c */
1506
2700
void ff_mmx_idct(DCTELEM *block);
1507
2701
void ff_mmxext_idct(DCTELEM *block);
2703
void ff_vp3_idct_sse2(int16_t *input_data);
2704
void ff_vp3_idct_mmx(int16_t *data);
2705
void ff_vp3_dsp_init_mmx(void);
1509
2707
/* XXX: those functions should be suppressed ASAP when all IDCTs are
1511
2710
static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
1513
2712
ff_mmx_idct (block);
1528
2727
ff_mmxext_idct (block);
1529
2728
add_pixels_clamped_mmx(block, dest, line_size);
2731
static void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block)
2733
ff_vp3_idct_sse2(block);
2734
put_signed_pixels_clamped_mmx(block, dest, line_size);
2736
static void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, DCTELEM *block)
2738
ff_vp3_idct_sse2(block);
2739
add_pixels_clamped_mmx(block, dest, line_size);
2741
static void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block)
2743
ff_vp3_idct_mmx(block);
2744
put_signed_pixels_clamped_mmx(block, dest, line_size);
2746
static void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block)
2748
ff_vp3_idct_mmx(block);
2749
add_pixels_clamped_mmx(block, dest, line_size);
2751
static void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, DCTELEM *block)
2753
ff_idct_xvid_mmx (block);
2754
put_pixels_clamped_mmx(block, dest, line_size);
2756
static void ff_idct_xvid_mmx_add(uint8_t *dest, int line_size, DCTELEM *block)
2758
ff_idct_xvid_mmx (block);
2759
add_pixels_clamped_mmx(block, dest, line_size);
2761
static void ff_idct_xvid_mmx2_put(uint8_t *dest, int line_size, DCTELEM *block)
2763
ff_idct_xvid_mmx2 (block);
2764
put_pixels_clamped_mmx(block, dest, line_size);
2766
static void ff_idct_xvid_mmx2_add(uint8_t *dest, int line_size, DCTELEM *block)
2768
ff_idct_xvid_mmx2 (block);
2769
add_pixels_clamped_mmx(block, dest, line_size);
2772
static void vorbis_inverse_coupling_3dnow(float *mag, float *ang, int blocksize)
2775
asm volatile("pxor %%mm7, %%mm7":);
2776
for(i=0; i<blocksize; i+=2) {
2778
"movq %0, %%mm0 \n\t"
2779
"movq %1, %%mm1 \n\t"
2780
"movq %%mm0, %%mm2 \n\t"
2781
"movq %%mm1, %%mm3 \n\t"
2782
"pfcmpge %%mm7, %%mm2 \n\t" // m <= 0.0
2783
"pfcmpge %%mm7, %%mm3 \n\t" // a <= 0.0
2784
"pslld $31, %%mm2 \n\t" // keep only the sign bit
2785
"pxor %%mm2, %%mm1 \n\t"
2786
"movq %%mm3, %%mm4 \n\t"
2787
"pand %%mm1, %%mm3 \n\t"
2788
"pandn %%mm1, %%mm4 \n\t"
2789
"pfadd %%mm0, %%mm3 \n\t" // a = m + ((a<0) & (a ^ sign(m)))
2790
"pfsub %%mm4, %%mm0 \n\t" // m = m + ((a>0) & (a ^ sign(m)))
2791
"movq %%mm3, %1 \n\t"
2792
"movq %%mm0, %0 \n\t"
2793
:"+m"(mag[i]), "+m"(ang[i])
2797
asm volatile("femms");
2799
static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize)
2804
"movaps %0, %%xmm5 \n\t"
2805
::"m"(ff_pdw_80000000[0])
2807
for(i=0; i<blocksize; i+=4) {
2809
"movaps %0, %%xmm0 \n\t"
2810
"movaps %1, %%xmm1 \n\t"
2811
"xorps %%xmm2, %%xmm2 \n\t"
2812
"xorps %%xmm3, %%xmm3 \n\t"
2813
"cmpleps %%xmm0, %%xmm2 \n\t" // m <= 0.0
2814
"cmpleps %%xmm1, %%xmm3 \n\t" // a <= 0.0
2815
"andps %%xmm5, %%xmm2 \n\t" // keep only the sign bit
2816
"xorps %%xmm2, %%xmm1 \n\t"
2817
"movaps %%xmm3, %%xmm4 \n\t"
2818
"andps %%xmm1, %%xmm3 \n\t"
2819
"andnps %%xmm1, %%xmm4 \n\t"
2820
"addps %%xmm0, %%xmm3 \n\t" // a = m + ((a<0) & (a ^ sign(m)))
2821
"subps %%xmm4, %%xmm0 \n\t" // m = m + ((a>0) & (a ^ sign(m)))
2822
"movaps %%xmm3, %1 \n\t"
2823
"movaps %%xmm0, %0 \n\t"
2824
:"+m"(mag[i]), "+m"(ang[i])
2830
static void vector_fmul_3dnow(float *dst, const float *src, int len){
2834
"movq (%1,%0), %%mm0 \n\t"
2835
"movq 8(%1,%0), %%mm1 \n\t"
2836
"pfmul (%2,%0), %%mm0 \n\t"
2837
"pfmul 8(%2,%0), %%mm1 \n\t"
2838
"movq %%mm0, (%1,%0) \n\t"
2839
"movq %%mm1, 8(%1,%0) \n\t"
2848
static void vector_fmul_sse(float *dst, const float *src, int len){
2852
"movaps (%1,%0), %%xmm0 \n\t"
2853
"movaps 16(%1,%0), %%xmm1 \n\t"
2854
"mulps (%2,%0), %%xmm0 \n\t"
2855
"mulps 16(%2,%0), %%xmm1 \n\t"
2856
"movaps %%xmm0, (%1,%0) \n\t"
2857
"movaps %%xmm1, 16(%1,%0) \n\t"
2866
static void vector_fmul_reverse_3dnow2(float *dst, const float *src0, const float *src1, int len){
2870
"pswapd 8(%1), %%mm0 \n\t"
2871
"pswapd (%1), %%mm1 \n\t"
2872
"pfmul (%3,%0), %%mm0 \n\t"
2873
"pfmul 8(%3,%0), %%mm1 \n\t"
2874
"movq %%mm0, (%2,%0) \n\t"
2875
"movq %%mm1, 8(%2,%0) \n\t"
2879
:"+r"(i), "+r"(src1)
2880
:"r"(dst), "r"(src0)
2882
asm volatile("femms");
2884
static void vector_fmul_reverse_sse(float *dst, const float *src0, const float *src1, int len){
2888
"movaps 16(%1), %%xmm0 \n\t"
2889
"movaps (%1), %%xmm1 \n\t"
2890
"shufps $0x1b, %%xmm0, %%xmm0 \n\t"
2891
"shufps $0x1b, %%xmm1, %%xmm1 \n\t"
2892
"mulps (%3,%0), %%xmm0 \n\t"
2893
"mulps 16(%3,%0), %%xmm1 \n\t"
2894
"movaps %%xmm0, (%2,%0) \n\t"
2895
"movaps %%xmm1, 16(%2,%0) \n\t"
2899
:"+r"(i), "+r"(src1)
2900
:"r"(dst), "r"(src0)
2904
static void vector_fmul_add_add_3dnow(float *dst, const float *src0, const float *src1,
2905
const float *src2, int src3, int len, int step){
2907
if(step == 2 && src3 == 0){
2911
"movq (%2,%0), %%mm0 \n\t"
2912
"movq 8(%2,%0), %%mm1 \n\t"
2913
"pfmul (%3,%0), %%mm0 \n\t"
2914
"pfmul 8(%3,%0), %%mm1 \n\t"
2915
"pfadd (%4,%0), %%mm0 \n\t"
2916
"pfadd 8(%4,%0), %%mm1 \n\t"
2917
"movd %%mm0, (%1) \n\t"
2918
"movd %%mm1, 16(%1) \n\t"
2919
"psrlq $32, %%mm0 \n\t"
2920
"psrlq $32, %%mm1 \n\t"
2921
"movd %%mm0, 8(%1) \n\t"
2922
"movd %%mm1, 24(%1) \n\t"
2927
:"r"(src0), "r"(src1), "r"(src2)
2931
else if(step == 1 && src3 == 0){
2934
"movq (%2,%0), %%mm0 \n\t"
2935
"movq 8(%2,%0), %%mm1 \n\t"
2936
"pfmul (%3,%0), %%mm0 \n\t"
2937
"pfmul 8(%3,%0), %%mm1 \n\t"
2938
"pfadd (%4,%0), %%mm0 \n\t"
2939
"pfadd 8(%4,%0), %%mm1 \n\t"
2940
"movq %%mm0, (%1,%0) \n\t"
2941
"movq %%mm1, 8(%1,%0) \n\t"
2945
:"r"(dst), "r"(src0), "r"(src1), "r"(src2)
2950
ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
2951
asm volatile("femms");
2953
static void vector_fmul_add_add_sse(float *dst, const float *src0, const float *src1,
2954
const float *src2, int src3, int len, int step){
2956
if(step == 2 && src3 == 0){
2960
"movaps (%2,%0), %%xmm0 \n\t"
2961
"movaps 16(%2,%0), %%xmm1 \n\t"
2962
"mulps (%3,%0), %%xmm0 \n\t"
2963
"mulps 16(%3,%0), %%xmm1 \n\t"
2964
"addps (%4,%0), %%xmm0 \n\t"
2965
"addps 16(%4,%0), %%xmm1 \n\t"
2966
"movss %%xmm0, (%1) \n\t"
2967
"movss %%xmm1, 32(%1) \n\t"
2968
"movhlps %%xmm0, %%xmm2 \n\t"
2969
"movhlps %%xmm1, %%xmm3 \n\t"
2970
"movss %%xmm2, 16(%1) \n\t"
2971
"movss %%xmm3, 48(%1) \n\t"
2972
"shufps $0xb1, %%xmm0, %%xmm0 \n\t"
2973
"shufps $0xb1, %%xmm1, %%xmm1 \n\t"
2974
"movss %%xmm0, 8(%1) \n\t"
2975
"movss %%xmm1, 40(%1) \n\t"
2976
"movhlps %%xmm0, %%xmm2 \n\t"
2977
"movhlps %%xmm1, %%xmm3 \n\t"
2978
"movss %%xmm2, 24(%1) \n\t"
2979
"movss %%xmm3, 56(%1) \n\t"
2984
:"r"(src0), "r"(src1), "r"(src2)
2988
else if(step == 1 && src3 == 0){
2991
"movaps (%2,%0), %%xmm0 \n\t"
2992
"movaps 16(%2,%0), %%xmm1 \n\t"
2993
"mulps (%3,%0), %%xmm0 \n\t"
2994
"mulps 16(%3,%0), %%xmm1 \n\t"
2995
"addps (%4,%0), %%xmm0 \n\t"
2996
"addps 16(%4,%0), %%xmm1 \n\t"
2997
"movaps %%xmm0, (%1,%0) \n\t"
2998
"movaps %%xmm1, 16(%1,%0) \n\t"
3002
:"r"(dst), "r"(src0), "r"(src1), "r"(src2)
3007
ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
3010
static void float_to_int16_3dnow(int16_t *dst, const float *src, int len){
3011
// not bit-exact: pf2id uses different rounding than C and SSE
3013
for(i=0; i<len; i+=4) {
3015
"pf2id %1, %%mm0 \n\t"
3016
"pf2id %2, %%mm1 \n\t"
3017
"packssdw %%mm1, %%mm0 \n\t"
3018
"movq %%mm0, %0 \n\t"
3020
:"m"(src[i]), "m"(src[i+2])
3023
asm volatile("femms");
3025
static void float_to_int16_sse(int16_t *dst, const float *src, int len){
3027
for(i=0; i<len; i+=4) {
3029
"cvtps2pi %1, %%mm0 \n\t"
3030
"cvtps2pi %2, %%mm1 \n\t"
3031
"packssdw %%mm1, %%mm0 \n\t"
3032
"movq %%mm0, %0 \n\t"
3034
:"m"(src[i]), "m"(src[i+2])
3037
asm volatile("emms");
3040
#ifdef CONFIG_SNOW_DECODER
3041
extern void ff_snow_horizontal_compose97i_sse2(DWTELEM *b, int width);
3042
extern void ff_snow_horizontal_compose97i_mmx(DWTELEM *b, int width);
3043
extern void ff_snow_vertical_compose97i_sse2(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width);
3044
extern void ff_snow_vertical_compose97i_mmx(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width);
3045
extern void ff_snow_inner_add_yblock_sse2(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
3046
int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
3047
extern void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
3048
int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
1532
3051
void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
1534
3053
mm_flags = mm_support();
1536
3055
if (avctx->dsp_mask) {
1537
if (avctx->dsp_mask & FF_MM_FORCE)
1538
mm_flags |= (avctx->dsp_mask & 0xffff);
1540
mm_flags &= ~(avctx->dsp_mask & 0xffff);
3056
if (avctx->dsp_mask & FF_MM_FORCE)
3057
mm_flags |= (avctx->dsp_mask & 0xffff);
3059
mm_flags &= ~(avctx->dsp_mask & 0xffff);
1544
fprintf(stderr, "libavcodec: CPU flags:");
3063
av_log(avctx, AV_LOG_INFO, "libavcodec: CPU flags:");
1545
3064
if (mm_flags & MM_MMX)
1546
fprintf(stderr, " mmx");
3065
av_log(avctx, AV_LOG_INFO, " mmx");
1547
3066
if (mm_flags & MM_MMXEXT)
1548
fprintf(stderr, " mmxext");
3067
av_log(avctx, AV_LOG_INFO, " mmxext");
1549
3068
if (mm_flags & MM_3DNOW)
1550
fprintf(stderr, " 3dnow");
3069
av_log(avctx, AV_LOG_INFO, " 3dnow");
1551
3070
if (mm_flags & MM_SSE)
1552
fprintf(stderr, " sse");
3071
av_log(avctx, AV_LOG_INFO, " sse");
1553
3072
if (mm_flags & MM_SSE2)
1554
fprintf(stderr, " sse2");
1555
fprintf(stderr, "\n");
3073
av_log(avctx, AV_LOG_INFO, " sse2");
3074
av_log(avctx, AV_LOG_INFO, "\n");
1558
3077
if (mm_flags & MM_MMX) {
3078
const int idct_algo= avctx->idct_algo;
3080
#ifdef CONFIG_ENCODERS
1559
3081
const int dct_algo = avctx->dct_algo;
1560
const int idct_algo= avctx->idct_algo;
1562
#ifdef CONFIG_ENCODERS
1563
if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX)
1564
c->fdct = ff_fdct_mmx;
1565
#endif //CONFIG_ENCODERS
1567
if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SIMPLEMMX){
1568
c->idct_put= ff_simple_idct_put_mmx;
1569
c->idct_add= ff_simple_idct_add_mmx;
1570
c->idct = ff_simple_idct_mmx;
1571
c->idct_permutation_type= FF_SIMPLE_IDCT_PERM;
1572
}else if(idct_algo==FF_IDCT_LIBMPEG2MMX){
1573
if(mm_flags & MM_MMXEXT){
1574
c->idct_put= ff_libmpeg2mmx2_idct_put;
1575
c->idct_add= ff_libmpeg2mmx2_idct_add;
1576
c->idct = ff_mmxext_idct;
3082
if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){
3083
if(mm_flags & MM_SSE2){
3084
c->fdct = ff_fdct_sse2;
3085
}else if(mm_flags & MM_MMXEXT){
3086
c->fdct = ff_fdct_mmx2;
1578
c->idct_put= ff_libmpeg2mmx_idct_put;
1579
c->idct_add= ff_libmpeg2mmx_idct_add;
1580
c->idct = ff_mmx_idct;
1582
c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
3088
c->fdct = ff_fdct_mmx;
3091
#endif //CONFIG_ENCODERS
3092
if(avctx->lowres==0){
3093
if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SIMPLEMMX){
3094
c->idct_put= ff_simple_idct_put_mmx;
3095
c->idct_add= ff_simple_idct_add_mmx;
3096
c->idct = ff_simple_idct_mmx;
3097
c->idct_permutation_type= FF_SIMPLE_IDCT_PERM;
3099
}else if(idct_algo==FF_IDCT_LIBMPEG2MMX){
3100
if(mm_flags & MM_MMXEXT){
3101
c->idct_put= ff_libmpeg2mmx2_idct_put;
3102
c->idct_add= ff_libmpeg2mmx2_idct_add;
3103
c->idct = ff_mmxext_idct;
3105
c->idct_put= ff_libmpeg2mmx_idct_put;
3106
c->idct_add= ff_libmpeg2mmx_idct_add;
3107
c->idct = ff_mmx_idct;
3109
c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
3111
}else if(idct_algo==FF_IDCT_VP3 &&
3112
avctx->codec->id!=CODEC_ID_THEORA &&
3113
!(avctx->flags & CODEC_FLAG_BITEXACT)){
3114
if(mm_flags & MM_SSE2){
3115
c->idct_put= ff_vp3_idct_put_sse2;
3116
c->idct_add= ff_vp3_idct_add_sse2;
3117
c->idct = ff_vp3_idct_sse2;
3118
c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
3120
ff_vp3_dsp_init_mmx();
3121
c->idct_put= ff_vp3_idct_put_mmx;
3122
c->idct_add= ff_vp3_idct_add_mmx;
3123
c->idct = ff_vp3_idct_mmx;
3124
c->idct_permutation_type= FF_PARTTRANS_IDCT_PERM;
3126
}else if(idct_algo==FF_IDCT_CAVS){
3127
c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
3128
}else if(idct_algo==FF_IDCT_XVIDMMX){
3129
if(mm_flags & MM_MMXEXT){
3130
c->idct_put= ff_idct_xvid_mmx2_put;
3131
c->idct_add= ff_idct_xvid_mmx2_add;
3132
c->idct = ff_idct_xvid_mmx2;
3134
c->idct_put= ff_idct_xvid_mmx_put;
3135
c->idct_add= ff_idct_xvid_mmx_add;
3136
c->idct = ff_idct_xvid_mmx;
3141
#ifdef CONFIG_ENCODERS
1585
3142
c->get_pixels = get_pixels_mmx;
1586
3143
c->diff_pixels = diff_pixels_mmx;
3144
#endif //CONFIG_ENCODERS
1587
3145
c->put_pixels_clamped = put_pixels_clamped_mmx;
3146
c->put_signed_pixels_clamped = put_signed_pixels_clamped_mmx;
1588
3147
c->add_pixels_clamped = add_pixels_clamped_mmx;
1589
3148
c->clear_blocks = clear_blocks_mmx;
3149
#ifdef CONFIG_ENCODERS
1590
3150
c->pix_sum = pix_sum16_mmx;
3151
#endif //CONFIG_ENCODERS
1592
3153
c->put_pixels_tab[0][0] = put_pixels16_mmx;
1593
3154
c->put_pixels_tab[0][1] = put_pixels16_x2_mmx;