3
* rgb2rgb.c, Software RGB to RGB convertor
4
* pluralize by Software PAL8 to RGB convertor
5
* Software YUV to YUV convertor
6
* Software YUV to RGB convertor
7
* Written by Nick Kurshev.
8
* palette & yuv & runtime cpu stuff by Michael (michaelni@gmx.at) (under GPL)
12
#include <inttypes.h> /* for __WORDSIZE */
15
// #warning You have misconfigured system and probably will lose performance!
16
#define __WORDSIZE MP_WORDSIZE
34
#define PREFETCH "prefetch"
35
#define PREFETCHW "prefetchw"
36
#define PAVGB "pavgusb"
37
#elif defined ( HAVE_MMX2 )
38
#define PREFETCH "prefetchnta"
39
#define PREFETCHW "prefetcht0"
42
#define PREFETCH "/nop"
43
#define PREFETCHW "/nop"
47
/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
54
#define MOVNTQ "movntq"
55
#define SFENCE "sfence"
61
static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,unsigned src_size)
64
const uint8_t *s = src;
67
const uint8_t *mm_end;
71
__asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
73
__asm __volatile("movq %0, %%mm7"::"m"(mask32):"memory");
79
"punpckldq 3%1, %%mm0\n\t"
81
"punpckldq 9%1, %%mm1\n\t"
82
"movd 12%1, %%mm2\n\t"
83
"punpckldq 15%1, %%mm2\n\t"
84
"movd 18%1, %%mm3\n\t"
85
"punpckldq 21%1, %%mm3\n\t"
86
"pand %%mm7, %%mm0\n\t"
87
"pand %%mm7, %%mm1\n\t"
88
"pand %%mm7, %%mm2\n\t"
89
"pand %%mm7, %%mm3\n\t"
90
MOVNTQ" %%mm0, %0\n\t"
91
MOVNTQ" %%mm1, 8%0\n\t"
92
MOVNTQ" %%mm2, 16%0\n\t"
100
__asm __volatile(SFENCE:::"memory");
101
__asm __volatile(EMMS:::"memory");
112
static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,unsigned src_size)
115
const uint8_t *s = src;
118
const uint8_t *mm_end;
122
__asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
129
"movq 8%1, %%mm1\n\t"
130
"movq 16%1, %%mm4\n\t"
131
"movq 24%1, %%mm5\n\t"
132
"movq %%mm0, %%mm2\n\t"
133
"movq %%mm1, %%mm3\n\t"
134
"movq %%mm4, %%mm6\n\t"
135
"movq %%mm5, %%mm7\n\t"
136
"psrlq $8, %%mm2\n\t"
137
"psrlq $8, %%mm3\n\t"
138
"psrlq $8, %%mm6\n\t"
139
"psrlq $8, %%mm7\n\t"
148
"por %%mm2, %%mm0\n\t"
149
"por %%mm3, %%mm1\n\t"
150
"por %%mm6, %%mm4\n\t"
151
"por %%mm7, %%mm5\n\t"
153
"movq %%mm1, %%mm2\n\t"
154
"movq %%mm4, %%mm3\n\t"
155
"psllq $48, %%mm2\n\t"
156
"psllq $32, %%mm3\n\t"
159
"por %%mm2, %%mm0\n\t"
160
"psrlq $16, %%mm1\n\t"
161
"psrlq $32, %%mm4\n\t"
162
"psllq $16, %%mm5\n\t"
163
"por %%mm3, %%mm1\n\t"
165
"por %%mm5, %%mm4\n\t"
167
MOVNTQ" %%mm0, %0\n\t"
168
MOVNTQ" %%mm1, 8%0\n\t"
171
:"m"(*s),"m"(mask24l),
172
"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
177
__asm __volatile(SFENCE:::"memory");
178
__asm __volatile(EMMS:::"memory");
190
Original by Strepto/Astral
191
ported to gcc & bugfixed : A'rpi
192
MMX2, 3DNOW optimization by Nick Kurshev
193
32-bit C version and the and&add trick by Michael Niedermayer
195
static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,unsigned src_size)
197
register const uint8_t* s=src;
198
register uint8_t* d=dst;
199
register const uint8_t *end;
200
const uint8_t *mm_end;
203
__asm __volatile(PREFETCH" %0"::"m"(*s));
204
__asm __volatile("movq %0, %%mm4"::"m"(mask15s));
211
"movq 8%1, %%mm2\n\t"
212
"movq %%mm0, %%mm1\n\t"
213
"movq %%mm2, %%mm3\n\t"
214
"pand %%mm4, %%mm0\n\t"
215
"pand %%mm4, %%mm2\n\t"
216
"paddw %%mm1, %%mm0\n\t"
217
"paddw %%mm3, %%mm2\n\t"
218
MOVNTQ" %%mm0, %0\n\t"
226
__asm __volatile(SFENCE:::"memory");
227
__asm __volatile(EMMS:::"memory");
232
register unsigned x= *((uint32_t *)s);
233
*((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
239
register unsigned short x= *((uint16_t *)s);
240
*((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
244
static inline void RENAME(rgb16to15)(const uint8_t *src,uint8_t *dst,unsigned src_size)
246
register const uint8_t* s=src;
247
register uint8_t* d=dst;
248
register const uint8_t *end;
249
const uint8_t *mm_end;
252
__asm __volatile(PREFETCH" %0"::"m"(*s));
253
__asm __volatile("movq %0, %%mm7"::"m"(mask15rg));
254
__asm __volatile("movq %0, %%mm6"::"m"(mask15b));
261
"movq 8%1, %%mm2\n\t"
262
"movq %%mm0, %%mm1\n\t"
263
"movq %%mm2, %%mm3\n\t"
264
"psrlq $1, %%mm0\n\t"
265
"psrlq $1, %%mm2\n\t"
266
"pand %%mm7, %%mm0\n\t"
267
"pand %%mm7, %%mm2\n\t"
268
"pand %%mm6, %%mm1\n\t"
269
"pand %%mm6, %%mm3\n\t"
270
"por %%mm1, %%mm0\n\t"
271
"por %%mm3, %%mm2\n\t"
272
MOVNTQ" %%mm0, %0\n\t"
280
__asm __volatile(SFENCE:::"memory");
281
__asm __volatile(EMMS:::"memory");
286
register uint32_t x= *((uint32_t *)s);
287
*((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
293
register uint16_t x= *((uint16_t *)s);
294
*((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
300
static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
302
const uint8_t *s = src;
305
const uint8_t *mm_end;
307
uint16_t *d = (uint16_t *)dst;
311
#if 1 //is faster only if multiplies are reasonable fast (FIXME figure out on which cpus this is faster, on Athlon its slightly faster)
313
"movq %3, %%mm5 \n\t"
314
"movq %4, %%mm6 \n\t"
315
"movq %5, %%mm7 \n\t"
318
PREFETCH" 32(%1) \n\t"
319
"movd (%1), %%mm0 \n\t"
320
"movd 4(%1), %%mm3 \n\t"
321
"punpckldq 8(%1), %%mm0 \n\t"
322
"punpckldq 12(%1), %%mm3 \n\t"
323
"movq %%mm0, %%mm1 \n\t"
324
"movq %%mm3, %%mm4 \n\t"
325
"pand %%mm6, %%mm0 \n\t"
326
"pand %%mm6, %%mm3 \n\t"
327
"pmaddwd %%mm7, %%mm0 \n\t"
328
"pmaddwd %%mm7, %%mm3 \n\t"
329
"pand %%mm5, %%mm1 \n\t"
330
"pand %%mm5, %%mm4 \n\t"
331
"por %%mm1, %%mm0 \n\t"
332
"por %%mm4, %%mm3 \n\t"
333
"psrld $5, %%mm0 \n\t"
334
"pslld $11, %%mm3 \n\t"
335
"por %%mm3, %%mm0 \n\t"
336
MOVNTQ" %%mm0, (%0) \n\t"
342
: "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
345
__asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
349
::"m"(red_16mask),"m"(green_16mask));
355
"movd 4%1, %%mm3\n\t"
356
"punpckldq 8%1, %%mm0\n\t"
357
"punpckldq 12%1, %%mm3\n\t"
358
"movq %%mm0, %%mm1\n\t"
359
"movq %%mm0, %%mm2\n\t"
360
"movq %%mm3, %%mm4\n\t"
361
"movq %%mm3, %%mm5\n\t"
362
"psrlq $3, %%mm0\n\t"
363
"psrlq $3, %%mm3\n\t"
366
"psrlq $5, %%mm1\n\t"
367
"psrlq $5, %%mm4\n\t"
368
"pand %%mm6, %%mm1\n\t"
369
"pand %%mm6, %%mm4\n\t"
370
"psrlq $8, %%mm2\n\t"
371
"psrlq $8, %%mm5\n\t"
372
"pand %%mm7, %%mm2\n\t"
373
"pand %%mm7, %%mm5\n\t"
374
"por %%mm1, %%mm0\n\t"
375
"por %%mm4, %%mm3\n\t"
376
"por %%mm2, %%mm0\n\t"
377
"por %%mm5, %%mm3\n\t"
378
"psllq $16, %%mm3\n\t"
379
"por %%mm3, %%mm0\n\t"
380
MOVNTQ" %%mm0, %0\n\t"
381
:"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
386
__asm __volatile(SFENCE:::"memory");
387
__asm __volatile(EMMS:::"memory");
391
const int src= *s; s += 4;
392
*d++ = ((src&0xFF)>>3) + ((src&0xFC00)>>5) + ((src&0xF80000)>>8);
393
// *d++ = ((src>>3)&0x1F) + ((src>>5)&0x7E0) + ((src>>8)&0xF800);
397
static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
399
const uint8_t *s = src;
402
const uint8_t *mm_end;
404
uint16_t *d = (uint16_t *)dst;
407
__asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
411
::"m"(red_16mask),"m"(green_16mask));
418
"movd 4%1, %%mm3\n\t"
419
"punpckldq 8%1, %%mm0\n\t"
420
"punpckldq 12%1, %%mm3\n\t"
421
"movq %%mm0, %%mm1\n\t"
422
"movq %%mm0, %%mm2\n\t"
423
"movq %%mm3, %%mm4\n\t"
424
"movq %%mm3, %%mm5\n\t"
425
"psllq $8, %%mm0\n\t"
426
"psllq $8, %%mm3\n\t"
427
"pand %%mm7, %%mm0\n\t"
428
"pand %%mm7, %%mm3\n\t"
429
"psrlq $5, %%mm1\n\t"
430
"psrlq $5, %%mm4\n\t"
431
"pand %%mm6, %%mm1\n\t"
432
"pand %%mm6, %%mm4\n\t"
433
"psrlq $19, %%mm2\n\t"
434
"psrlq $19, %%mm5\n\t"
437
"por %%mm1, %%mm0\n\t"
438
"por %%mm4, %%mm3\n\t"
439
"por %%mm2, %%mm0\n\t"
440
"por %%mm5, %%mm3\n\t"
441
"psllq $16, %%mm3\n\t"
442
"por %%mm3, %%mm0\n\t"
443
MOVNTQ" %%mm0, %0\n\t"
444
:"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
448
__asm __volatile(SFENCE:::"memory");
449
__asm __volatile(EMMS:::"memory");
453
const int src= *s; s += 4;
454
*d++ = ((src&0xF8)<<8) + ((src&0xFC00)>>5) + ((src&0xF80000)>>19);
458
static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
460
const uint8_t *s = src;
463
const uint8_t *mm_end;
465
uint16_t *d = (uint16_t *)dst;
469
#if 1 //is faster only if multiplies are reasonable fast (FIXME figure out on which cpus this is faster, on Athlon its slightly faster)
471
"movq %3, %%mm5 \n\t"
472
"movq %4, %%mm6 \n\t"
473
"movq %5, %%mm7 \n\t"
476
PREFETCH" 32(%1) \n\t"
477
"movd (%1), %%mm0 \n\t"
478
"movd 4(%1), %%mm3 \n\t"
479
"punpckldq 8(%1), %%mm0 \n\t"
480
"punpckldq 12(%1), %%mm3 \n\t"
481
"movq %%mm0, %%mm1 \n\t"
482
"movq %%mm3, %%mm4 \n\t"
483
"pand %%mm6, %%mm0 \n\t"
484
"pand %%mm6, %%mm3 \n\t"
485
"pmaddwd %%mm7, %%mm0 \n\t"
486
"pmaddwd %%mm7, %%mm3 \n\t"
487
"pand %%mm5, %%mm1 \n\t"
488
"pand %%mm5, %%mm4 \n\t"
489
"por %%mm1, %%mm0 \n\t"
490
"por %%mm4, %%mm3 \n\t"
491
"psrld $6, %%mm0 \n\t"
492
"pslld $10, %%mm3 \n\t"
493
"por %%mm3, %%mm0 \n\t"
494
MOVNTQ" %%mm0, (%0) \n\t"
500
: "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
503
__asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
507
::"m"(red_15mask),"m"(green_15mask));
513
"movd 4%1, %%mm3\n\t"
514
"punpckldq 8%1, %%mm0\n\t"
515
"punpckldq 12%1, %%mm3\n\t"
516
"movq %%mm0, %%mm1\n\t"
517
"movq %%mm0, %%mm2\n\t"
518
"movq %%mm3, %%mm4\n\t"
519
"movq %%mm3, %%mm5\n\t"
520
"psrlq $3, %%mm0\n\t"
521
"psrlq $3, %%mm3\n\t"
524
"psrlq $6, %%mm1\n\t"
525
"psrlq $6, %%mm4\n\t"
526
"pand %%mm6, %%mm1\n\t"
527
"pand %%mm6, %%mm4\n\t"
528
"psrlq $9, %%mm2\n\t"
529
"psrlq $9, %%mm5\n\t"
530
"pand %%mm7, %%mm2\n\t"
531
"pand %%mm7, %%mm5\n\t"
532
"por %%mm1, %%mm0\n\t"
533
"por %%mm4, %%mm3\n\t"
534
"por %%mm2, %%mm0\n\t"
535
"por %%mm5, %%mm3\n\t"
536
"psllq $16, %%mm3\n\t"
537
"por %%mm3, %%mm0\n\t"
538
MOVNTQ" %%mm0, %0\n\t"
539
:"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
544
__asm __volatile(SFENCE:::"memory");
545
__asm __volatile(EMMS:::"memory");
549
const int src= *s; s += 4;
550
*d++ = ((src&0xFF)>>3) + ((src&0xF800)>>6) + ((src&0xF80000)>>9);
554
static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
556
const uint8_t *s = src;
559
const uint8_t *mm_end;
561
uint16_t *d = (uint16_t *)dst;
564
__asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
568
::"m"(red_15mask),"m"(green_15mask));
575
"movd 4%1, %%mm3\n\t"
576
"punpckldq 8%1, %%mm0\n\t"
577
"punpckldq 12%1, %%mm3\n\t"
578
"movq %%mm0, %%mm1\n\t"
579
"movq %%mm0, %%mm2\n\t"
580
"movq %%mm3, %%mm4\n\t"
581
"movq %%mm3, %%mm5\n\t"
582
"psllq $7, %%mm0\n\t"
583
"psllq $7, %%mm3\n\t"
584
"pand %%mm7, %%mm0\n\t"
585
"pand %%mm7, %%mm3\n\t"
586
"psrlq $6, %%mm1\n\t"
587
"psrlq $6, %%mm4\n\t"
588
"pand %%mm6, %%mm1\n\t"
589
"pand %%mm6, %%mm4\n\t"
590
"psrlq $19, %%mm2\n\t"
591
"psrlq $19, %%mm5\n\t"
594
"por %%mm1, %%mm0\n\t"
595
"por %%mm4, %%mm3\n\t"
596
"por %%mm2, %%mm0\n\t"
597
"por %%mm5, %%mm3\n\t"
598
"psllq $16, %%mm3\n\t"
599
"por %%mm3, %%mm0\n\t"
600
MOVNTQ" %%mm0, %0\n\t"
601
:"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
605
__asm __volatile(SFENCE:::"memory");
606
__asm __volatile(EMMS:::"memory");
610
const int src= *s; s += 4;
611
*d++ = ((src&0xF8)<<7) + ((src&0xF800)>>6) + ((src&0xF80000)>>19);
615
static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
617
const uint8_t *s = src;
620
const uint8_t *mm_end;
622
uint16_t *d = (uint16_t *)dst;
625
__asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
629
::"m"(red_16mask),"m"(green_16mask));
636
"movd 3%1, %%mm3\n\t"
637
"punpckldq 6%1, %%mm0\n\t"
638
"punpckldq 9%1, %%mm3\n\t"
639
"movq %%mm0, %%mm1\n\t"
640
"movq %%mm0, %%mm2\n\t"
641
"movq %%mm3, %%mm4\n\t"
642
"movq %%mm3, %%mm5\n\t"
643
"psrlq $3, %%mm0\n\t"
644
"psrlq $3, %%mm3\n\t"
647
"psrlq $5, %%mm1\n\t"
648
"psrlq $5, %%mm4\n\t"
649
"pand %%mm6, %%mm1\n\t"
650
"pand %%mm6, %%mm4\n\t"
651
"psrlq $8, %%mm2\n\t"
652
"psrlq $8, %%mm5\n\t"
653
"pand %%mm7, %%mm2\n\t"
654
"pand %%mm7, %%mm5\n\t"
655
"por %%mm1, %%mm0\n\t"
656
"por %%mm4, %%mm3\n\t"
657
"por %%mm2, %%mm0\n\t"
658
"por %%mm5, %%mm3\n\t"
659
"psllq $16, %%mm3\n\t"
660
"por %%mm3, %%mm0\n\t"
661
MOVNTQ" %%mm0, %0\n\t"
662
:"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
666
__asm __volatile(SFENCE:::"memory");
667
__asm __volatile(EMMS:::"memory");
674
*d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
678
static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
680
const uint8_t *s = src;
683
const uint8_t *mm_end;
685
uint16_t *d = (uint16_t *)dst;
688
__asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
692
::"m"(red_16mask),"m"(green_16mask));
699
"movd 3%1, %%mm3\n\t"
700
"punpckldq 6%1, %%mm0\n\t"
701
"punpckldq 9%1, %%mm3\n\t"
702
"movq %%mm0, %%mm1\n\t"
703
"movq %%mm0, %%mm2\n\t"
704
"movq %%mm3, %%mm4\n\t"
705
"movq %%mm3, %%mm5\n\t"
706
"psllq $8, %%mm0\n\t"
707
"psllq $8, %%mm3\n\t"
708
"pand %%mm7, %%mm0\n\t"
709
"pand %%mm7, %%mm3\n\t"
710
"psrlq $5, %%mm1\n\t"
711
"psrlq $5, %%mm4\n\t"
712
"pand %%mm6, %%mm1\n\t"
713
"pand %%mm6, %%mm4\n\t"
714
"psrlq $19, %%mm2\n\t"
715
"psrlq $19, %%mm5\n\t"
718
"por %%mm1, %%mm0\n\t"
719
"por %%mm4, %%mm3\n\t"
720
"por %%mm2, %%mm0\n\t"
721
"por %%mm5, %%mm3\n\t"
722
"psllq $16, %%mm3\n\t"
723
"por %%mm3, %%mm0\n\t"
724
MOVNTQ" %%mm0, %0\n\t"
725
:"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
729
__asm __volatile(SFENCE:::"memory");
730
__asm __volatile(EMMS:::"memory");
737
*d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
741
static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
743
const uint8_t *s = src;
746
const uint8_t *mm_end;
748
uint16_t *d = (uint16_t *)dst;
751
__asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
755
::"m"(red_15mask),"m"(green_15mask));
762
"movd 3%1, %%mm3\n\t"
763
"punpckldq 6%1, %%mm0\n\t"
764
"punpckldq 9%1, %%mm3\n\t"
765
"movq %%mm0, %%mm1\n\t"
766
"movq %%mm0, %%mm2\n\t"
767
"movq %%mm3, %%mm4\n\t"
768
"movq %%mm3, %%mm5\n\t"
769
"psrlq $3, %%mm0\n\t"
770
"psrlq $3, %%mm3\n\t"
773
"psrlq $6, %%mm1\n\t"
774
"psrlq $6, %%mm4\n\t"
775
"pand %%mm6, %%mm1\n\t"
776
"pand %%mm6, %%mm4\n\t"
777
"psrlq $9, %%mm2\n\t"
778
"psrlq $9, %%mm5\n\t"
779
"pand %%mm7, %%mm2\n\t"
780
"pand %%mm7, %%mm5\n\t"
781
"por %%mm1, %%mm0\n\t"
782
"por %%mm4, %%mm3\n\t"
783
"por %%mm2, %%mm0\n\t"
784
"por %%mm5, %%mm3\n\t"
785
"psllq $16, %%mm3\n\t"
786
"por %%mm3, %%mm0\n\t"
787
MOVNTQ" %%mm0, %0\n\t"
788
:"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
792
__asm __volatile(SFENCE:::"memory");
793
__asm __volatile(EMMS:::"memory");
800
*d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
804
static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
806
const uint8_t *s = src;
809
const uint8_t *mm_end;
811
uint16_t *d = (uint16_t *)dst;
814
__asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
818
::"m"(red_15mask),"m"(green_15mask));
825
"movd 3%1, %%mm3\n\t"
826
"punpckldq 6%1, %%mm0\n\t"
827
"punpckldq 9%1, %%mm3\n\t"
828
"movq %%mm0, %%mm1\n\t"
829
"movq %%mm0, %%mm2\n\t"
830
"movq %%mm3, %%mm4\n\t"
831
"movq %%mm3, %%mm5\n\t"
832
"psllq $7, %%mm0\n\t"
833
"psllq $7, %%mm3\n\t"
834
"pand %%mm7, %%mm0\n\t"
835
"pand %%mm7, %%mm3\n\t"
836
"psrlq $6, %%mm1\n\t"
837
"psrlq $6, %%mm4\n\t"
838
"pand %%mm6, %%mm1\n\t"
839
"pand %%mm6, %%mm4\n\t"
840
"psrlq $19, %%mm2\n\t"
841
"psrlq $19, %%mm5\n\t"
844
"por %%mm1, %%mm0\n\t"
845
"por %%mm4, %%mm3\n\t"
846
"por %%mm2, %%mm0\n\t"
847
"por %%mm5, %%mm3\n\t"
848
"psllq $16, %%mm3\n\t"
849
"por %%mm3, %%mm0\n\t"
850
MOVNTQ" %%mm0, %0\n\t"
851
:"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
855
__asm __volatile(SFENCE:::"memory");
856
__asm __volatile(EMMS:::"memory");
863
*d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
868
Here I use a less accurate approximation by simply
869
left-shifting the input
870
value and filling the low order bits with
871
zeroes. This method improves png's
872
compression but this scheme cannot reproduce white exactly, since it does not
873
generate an all-ones maximum value; the net effect is to darken the
876
The better method should be "left bit replication":
886
| Leftmost Bits Repeated to Fill Open Bits
890
static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
894
const uint16_t *mm_end;
896
uint8_t *d = (uint8_t *)dst;
897
const uint16_t *s = (uint16_t *)src;
898
end = s + src_size/2;
900
__asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
912
"psllq $3, %%mm0\n\t"
913
"psrlq $2, %%mm1\n\t"
914
"psrlq $7, %%mm2\n\t"
915
"movq %%mm0, %%mm3\n\t"
916
"movq %%mm1, %%mm4\n\t"
917
"movq %%mm2, %%mm5\n\t"
918
"punpcklwd %5, %%mm0\n\t"
919
"punpcklwd %5, %%mm1\n\t"
920
"punpcklwd %5, %%mm2\n\t"
921
"punpckhwd %5, %%mm3\n\t"
922
"punpckhwd %5, %%mm4\n\t"
923
"punpckhwd %5, %%mm5\n\t"
924
"psllq $8, %%mm1\n\t"
925
"psllq $16, %%mm2\n\t"
926
"por %%mm1, %%mm0\n\t"
927
"por %%mm2, %%mm0\n\t"
928
"psllq $8, %%mm4\n\t"
929
"psllq $16, %%mm5\n\t"
930
"por %%mm4, %%mm3\n\t"
931
"por %%mm5, %%mm3\n\t"
933
"movq %%mm0, %%mm6\n\t"
934
"movq %%mm3, %%mm7\n\t"
936
"movq 8%1, %%mm0\n\t"
937
"movq 8%1, %%mm1\n\t"
938
"movq 8%1, %%mm2\n\t"
942
"psllq $3, %%mm0\n\t"
943
"psrlq $2, %%mm1\n\t"
944
"psrlq $7, %%mm2\n\t"
945
"movq %%mm0, %%mm3\n\t"
946
"movq %%mm1, %%mm4\n\t"
947
"movq %%mm2, %%mm5\n\t"
948
"punpcklwd %5, %%mm0\n\t"
949
"punpcklwd %5, %%mm1\n\t"
950
"punpcklwd %5, %%mm2\n\t"
951
"punpckhwd %5, %%mm3\n\t"
952
"punpckhwd %5, %%mm4\n\t"
953
"punpckhwd %5, %%mm5\n\t"
954
"psllq $8, %%mm1\n\t"
955
"psllq $16, %%mm2\n\t"
956
"por %%mm1, %%mm0\n\t"
957
"por %%mm2, %%mm0\n\t"
958
"psllq $8, %%mm4\n\t"
959
"psllq $16, %%mm5\n\t"
960
"por %%mm4, %%mm3\n\t"
961
"por %%mm5, %%mm3\n\t"
964
:"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
966
/* Borrowed 32 to 24 */
968
"movq %%mm0, %%mm4\n\t"
969
"movq %%mm3, %%mm5\n\t"
970
"movq %%mm6, %%mm0\n\t"
971
"movq %%mm7, %%mm1\n\t"
973
"movq %%mm4, %%mm6\n\t"
974
"movq %%mm5, %%mm7\n\t"
975
"movq %%mm0, %%mm2\n\t"
976
"movq %%mm1, %%mm3\n\t"
978
"psrlq $8, %%mm2\n\t"
979
"psrlq $8, %%mm3\n\t"
980
"psrlq $8, %%mm6\n\t"
981
"psrlq $8, %%mm7\n\t"
990
"por %%mm2, %%mm0\n\t"
991
"por %%mm3, %%mm1\n\t"
992
"por %%mm6, %%mm4\n\t"
993
"por %%mm7, %%mm5\n\t"
995
"movq %%mm1, %%mm2\n\t"
996
"movq %%mm4, %%mm3\n\t"
997
"psllq $48, %%mm2\n\t"
998
"psllq $32, %%mm3\n\t"
1000
"pand %5, %%mm3\n\t"
1001
"por %%mm2, %%mm0\n\t"
1002
"psrlq $16, %%mm1\n\t"
1003
"psrlq $32, %%mm4\n\t"
1004
"psllq $16, %%mm5\n\t"
1005
"por %%mm3, %%mm1\n\t"
1006
"pand %6, %%mm5\n\t"
1007
"por %%mm5, %%mm4\n\t"
1009
MOVNTQ" %%mm0, %0\n\t"
1010
MOVNTQ" %%mm1, 8%0\n\t"
1011
MOVNTQ" %%mm4, 16%0"
1014
:"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1019
__asm __volatile(SFENCE:::"memory");
1020
__asm __volatile(EMMS:::"memory");
1024
register uint16_t bgr;
1026
*d++ = (bgr&0x1F)<<3;
1027
*d++ = (bgr&0x3E0)>>2;
1028
*d++ = (bgr&0x7C00)>>7;
1032
static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
1034
const uint16_t *end;
1036
const uint16_t *mm_end;
1038
uint8_t *d = (uint8_t *)dst;
1039
const uint16_t *s = (const uint16_t *)src;
1040
end = s + src_size/2;
1042
__asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
1048
"movq %1, %%mm0\n\t"
1049
"movq %1, %%mm1\n\t"
1050
"movq %1, %%mm2\n\t"
1051
"pand %2, %%mm0\n\t"
1052
"pand %3, %%mm1\n\t"
1053
"pand %4, %%mm2\n\t"
1054
"psllq $3, %%mm0\n\t"
1055
"psrlq $3, %%mm1\n\t"
1056
"psrlq $8, %%mm2\n\t"
1057
"movq %%mm0, %%mm3\n\t"
1058
"movq %%mm1, %%mm4\n\t"
1059
"movq %%mm2, %%mm5\n\t"
1060
"punpcklwd %5, %%mm0\n\t"
1061
"punpcklwd %5, %%mm1\n\t"
1062
"punpcklwd %5, %%mm2\n\t"
1063
"punpckhwd %5, %%mm3\n\t"
1064
"punpckhwd %5, %%mm4\n\t"
1065
"punpckhwd %5, %%mm5\n\t"
1066
"psllq $8, %%mm1\n\t"
1067
"psllq $16, %%mm2\n\t"
1068
"por %%mm1, %%mm0\n\t"
1069
"por %%mm2, %%mm0\n\t"
1070
"psllq $8, %%mm4\n\t"
1071
"psllq $16, %%mm5\n\t"
1072
"por %%mm4, %%mm3\n\t"
1073
"por %%mm5, %%mm3\n\t"
1075
"movq %%mm0, %%mm6\n\t"
1076
"movq %%mm3, %%mm7\n\t"
1078
"movq 8%1, %%mm0\n\t"
1079
"movq 8%1, %%mm1\n\t"
1080
"movq 8%1, %%mm2\n\t"
1081
"pand %2, %%mm0\n\t"
1082
"pand %3, %%mm1\n\t"
1083
"pand %4, %%mm2\n\t"
1084
"psllq $3, %%mm0\n\t"
1085
"psrlq $3, %%mm1\n\t"
1086
"psrlq $8, %%mm2\n\t"
1087
"movq %%mm0, %%mm3\n\t"
1088
"movq %%mm1, %%mm4\n\t"
1089
"movq %%mm2, %%mm5\n\t"
1090
"punpcklwd %5, %%mm0\n\t"
1091
"punpcklwd %5, %%mm1\n\t"
1092
"punpcklwd %5, %%mm2\n\t"
1093
"punpckhwd %5, %%mm3\n\t"
1094
"punpckhwd %5, %%mm4\n\t"
1095
"punpckhwd %5, %%mm5\n\t"
1096
"psllq $8, %%mm1\n\t"
1097
"psllq $16, %%mm2\n\t"
1098
"por %%mm1, %%mm0\n\t"
1099
"por %%mm2, %%mm0\n\t"
1100
"psllq $8, %%mm4\n\t"
1101
"psllq $16, %%mm5\n\t"
1102
"por %%mm4, %%mm3\n\t"
1103
"por %%mm5, %%mm3\n\t"
1105
:"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
1107
/* Borrowed 32 to 24 */
1109
"movq %%mm0, %%mm4\n\t"
1110
"movq %%mm3, %%mm5\n\t"
1111
"movq %%mm6, %%mm0\n\t"
1112
"movq %%mm7, %%mm1\n\t"
1114
"movq %%mm4, %%mm6\n\t"
1115
"movq %%mm5, %%mm7\n\t"
1116
"movq %%mm0, %%mm2\n\t"
1117
"movq %%mm1, %%mm3\n\t"
1119
"psrlq $8, %%mm2\n\t"
1120
"psrlq $8, %%mm3\n\t"
1121
"psrlq $8, %%mm6\n\t"
1122
"psrlq $8, %%mm7\n\t"
1123
"pand %2, %%mm0\n\t"
1124
"pand %2, %%mm1\n\t"
1125
"pand %2, %%mm4\n\t"
1126
"pand %2, %%mm5\n\t"
1127
"pand %3, %%mm2\n\t"
1128
"pand %3, %%mm3\n\t"
1129
"pand %3, %%mm6\n\t"
1130
"pand %3, %%mm7\n\t"
1131
"por %%mm2, %%mm0\n\t"
1132
"por %%mm3, %%mm1\n\t"
1133
"por %%mm6, %%mm4\n\t"
1134
"por %%mm7, %%mm5\n\t"
1136
"movq %%mm1, %%mm2\n\t"
1137
"movq %%mm4, %%mm3\n\t"
1138
"psllq $48, %%mm2\n\t"
1139
"psllq $32, %%mm3\n\t"
1140
"pand %4, %%mm2\n\t"
1141
"pand %5, %%mm3\n\t"
1142
"por %%mm2, %%mm0\n\t"
1143
"psrlq $16, %%mm1\n\t"
1144
"psrlq $32, %%mm4\n\t"
1145
"psllq $16, %%mm5\n\t"
1146
"por %%mm3, %%mm1\n\t"
1147
"pand %6, %%mm5\n\t"
1148
"por %%mm5, %%mm4\n\t"
1150
MOVNTQ" %%mm0, %0\n\t"
1151
MOVNTQ" %%mm1, 8%0\n\t"
1152
MOVNTQ" %%mm4, 16%0"
1155
:"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1160
__asm __volatile(SFENCE:::"memory");
1161
__asm __volatile(EMMS:::"memory");
1165
register uint16_t bgr;
1167
*d++ = (bgr&0x1F)<<3;
1168
*d++ = (bgr&0x7E0)>>3;
1169
*d++ = (bgr&0xF800)>>8;
1173
static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
1175
const uint16_t *end;
1177
const uint16_t *mm_end;
1179
uint8_t *d = (uint8_t *)dst;
1180
const uint16_t *s = (const uint16_t *)src;
1181
end = s + src_size/2;
1183
__asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
1184
__asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory");
1190
"movq %1, %%mm0\n\t"
1191
"movq %1, %%mm1\n\t"
1192
"movq %1, %%mm2\n\t"
1193
"pand %2, %%mm0\n\t"
1194
"pand %3, %%mm1\n\t"
1195
"pand %4, %%mm2\n\t"
1196
"psllq $3, %%mm0\n\t"
1197
"psrlq $2, %%mm1\n\t"
1198
"psrlq $7, %%mm2\n\t"
1199
"movq %%mm0, %%mm3\n\t"
1200
"movq %%mm1, %%mm4\n\t"
1201
"movq %%mm2, %%mm5\n\t"
1202
"punpcklwd %%mm7, %%mm0\n\t"
1203
"punpcklwd %%mm7, %%mm1\n\t"
1204
"punpcklwd %%mm7, %%mm2\n\t"
1205
"punpckhwd %%mm7, %%mm3\n\t"
1206
"punpckhwd %%mm7, %%mm4\n\t"
1207
"punpckhwd %%mm7, %%mm5\n\t"
1208
"psllq $8, %%mm1\n\t"
1209
"psllq $16, %%mm2\n\t"
1210
"por %%mm1, %%mm0\n\t"
1211
"por %%mm2, %%mm0\n\t"
1212
"psllq $8, %%mm4\n\t"
1213
"psllq $16, %%mm5\n\t"
1214
"por %%mm4, %%mm3\n\t"
1215
"por %%mm5, %%mm3\n\t"
1216
MOVNTQ" %%mm0, %0\n\t"
1217
MOVNTQ" %%mm3, 8%0\n\t"
1219
:"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
1224
__asm __volatile(SFENCE:::"memory");
1225
__asm __volatile(EMMS:::"memory");
1229
#if 0 //slightly slower on athlon
1231
*((uint32_t*)d)++ = ((bgr&0x1F)<<3) + ((bgr&0x3E0)<<6) + ((bgr&0x7C00)<<9);
1233
//FIXME this is very likely wrong for big-endian (and the following converters too)
1234
register uint16_t bgr;
1236
*d++ = (bgr&0x1F)<<3;
1237
*d++ = (bgr&0x3E0)>>2;
1238
*d++ = (bgr&0x7C00)>>7;
1244
static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
1246
const uint16_t *end;
1248
const uint16_t *mm_end;
1250
uint8_t *d = (uint8_t *)dst;
1251
const uint16_t *s = (uint16_t *)src;
1252
end = s + src_size/2;
1254
__asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
1255
__asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory");
1261
"movq %1, %%mm0\n\t"
1262
"movq %1, %%mm1\n\t"
1263
"movq %1, %%mm2\n\t"
1264
"pand %2, %%mm0\n\t"
1265
"pand %3, %%mm1\n\t"
1266
"pand %4, %%mm2\n\t"
1267
"psllq $3, %%mm0\n\t"
1268
"psrlq $3, %%mm1\n\t"
1269
"psrlq $8, %%mm2\n\t"
1270
"movq %%mm0, %%mm3\n\t"
1271
"movq %%mm1, %%mm4\n\t"
1272
"movq %%mm2, %%mm5\n\t"
1273
"punpcklwd %%mm7, %%mm0\n\t"
1274
"punpcklwd %%mm7, %%mm1\n\t"
1275
"punpcklwd %%mm7, %%mm2\n\t"
1276
"punpckhwd %%mm7, %%mm3\n\t"
1277
"punpckhwd %%mm7, %%mm4\n\t"
1278
"punpckhwd %%mm7, %%mm5\n\t"
1279
"psllq $8, %%mm1\n\t"
1280
"psllq $16, %%mm2\n\t"
1281
"por %%mm1, %%mm0\n\t"
1282
"por %%mm2, %%mm0\n\t"
1283
"psllq $8, %%mm4\n\t"
1284
"psllq $16, %%mm5\n\t"
1285
"por %%mm4, %%mm3\n\t"
1286
"por %%mm5, %%mm3\n\t"
1287
MOVNTQ" %%mm0, %0\n\t"
1288
MOVNTQ" %%mm3, 8%0\n\t"
1290
:"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
1295
__asm __volatile(SFENCE:::"memory");
1296
__asm __volatile(EMMS:::"memory");
1300
register uint16_t bgr;
1302
*d++ = (bgr&0x1F)<<3;
1303
*d++ = (bgr&0x7E0)>>3;
1304
*d++ = (bgr&0xF800)>>8;
1309
static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
1312
/* TODO: unroll this loop */
1314
"xorl %%eax, %%eax \n\t"
1317
PREFETCH" 32(%0, %%eax) \n\t"
1318
"movq (%0, %%eax), %%mm0 \n\t"
1319
"movq %%mm0, %%mm1 \n\t"
1320
"movq %%mm0, %%mm2 \n\t"
1321
"pslld $16, %%mm0 \n\t"
1322
"psrld $16, %%mm1 \n\t"
1323
"pand "MANGLE(mask32r)", %%mm0 \n\t"
1324
"pand "MANGLE(mask32g)", %%mm2 \n\t"
1325
"pand "MANGLE(mask32b)", %%mm1 \n\t"
1326
"por %%mm0, %%mm2 \n\t"
1327
"por %%mm1, %%mm2 \n\t"
1328
MOVNTQ" %%mm2, (%1, %%eax) \n\t"
1329
"addl $8, %%eax \n\t"
1330
"cmpl %2, %%eax \n\t"
1332
:: "r" (src), "r"(dst), "r" (src_size-7)
1336
__asm __volatile(SFENCE:::"memory");
1337
__asm __volatile(EMMS:::"memory");
1340
unsigned num_pixels = src_size >> 2;
1341
for(i=0; i<num_pixels; i++)
1343
#ifdef WORDS_BIGENDIAN
1344
dst[4*i + 1] = src[4*i + 3];
1345
dst[4*i + 2] = src[4*i + 2];
1346
dst[4*i + 3] = src[4*i + 1];
1348
dst[4*i + 0] = src[4*i + 2];
1349
dst[4*i + 1] = src[4*i + 1];
1350
dst[4*i + 2] = src[4*i + 0];
1356
static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
1360
int mmx_size= 23 - src_size;
1362
"movq "MANGLE(mask24r)", %%mm5 \n\t"
1363
"movq "MANGLE(mask24g)", %%mm6 \n\t"
1364
"movq "MANGLE(mask24b)", %%mm7 \n\t"
1367
PREFETCH" 32(%1, %%eax) \n\t"
1368
"movq (%1, %%eax), %%mm0 \n\t" // BGR BGR BG
1369
"movq (%1, %%eax), %%mm1 \n\t" // BGR BGR BG
1370
"movq 2(%1, %%eax), %%mm2 \n\t" // R BGR BGR B
1371
"psllq $16, %%mm0 \n\t" // 00 BGR BGR
1372
"pand %%mm5, %%mm0 \n\t"
1373
"pand %%mm6, %%mm1 \n\t"
1374
"pand %%mm7, %%mm2 \n\t"
1375
"por %%mm0, %%mm1 \n\t"
1376
"por %%mm2, %%mm1 \n\t"
1377
"movq 6(%1, %%eax), %%mm0 \n\t" // BGR BGR BG
1378
MOVNTQ" %%mm1, (%2, %%eax) \n\t" // RGB RGB RG
1379
"movq 8(%1, %%eax), %%mm1 \n\t" // R BGR BGR B
1380
"movq 10(%1, %%eax), %%mm2 \n\t" // GR BGR BGR
1381
"pand %%mm7, %%mm0 \n\t"
1382
"pand %%mm5, %%mm1 \n\t"
1383
"pand %%mm6, %%mm2 \n\t"
1384
"por %%mm0, %%mm1 \n\t"
1385
"por %%mm2, %%mm1 \n\t"
1386
"movq 14(%1, %%eax), %%mm0 \n\t" // R BGR BGR B
1387
MOVNTQ" %%mm1, 8(%2, %%eax) \n\t" // B RGB RGB R
1388
"movq 16(%1, %%eax), %%mm1 \n\t" // GR BGR BGR
1389
"movq 18(%1, %%eax), %%mm2 \n\t" // BGR BGR BG
1390
"pand %%mm6, %%mm0 \n\t"
1391
"pand %%mm7, %%mm1 \n\t"
1392
"pand %%mm5, %%mm2 \n\t"
1393
"por %%mm0, %%mm1 \n\t"
1394
"por %%mm2, %%mm1 \n\t"
1395
MOVNTQ" %%mm1, 16(%2, %%eax) \n\t"
1396
"addl $24, %%eax \n\t"
1399
: "r" (src-mmx_size), "r"(dst-mmx_size)
1402
__asm __volatile(SFENCE:::"memory");
1403
__asm __volatile(EMMS:::"memory");
1405
if(mmx_size==23) return; //finihsed, was multiple of 8
1409
src_size= 23-mmx_size;
1413
for(i=0; i<src_size; i+=3)
1417
dst[i + 1] = src[i + 1];
1418
dst[i + 2] = src[i + 0];
1423
static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1424
unsigned int width, unsigned int height,
1425
int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
1428
const unsigned chromWidth= width>>1;
1429
for(y=0; y<height; y++)
1432
//FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely limited by memory anyway)
1434
"xorl %%eax, %%eax \n\t"
1437
PREFETCH" 32(%1, %%eax, 2) \n\t"
1438
PREFETCH" 32(%2, %%eax) \n\t"
1439
PREFETCH" 32(%3, %%eax) \n\t"
1440
"movq (%2, %%eax), %%mm0 \n\t" // U(0)
1441
"movq %%mm0, %%mm2 \n\t" // U(0)
1442
"movq (%3, %%eax), %%mm1 \n\t" // V(0)
1443
"punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1444
"punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1446
"movq (%1, %%eax,2), %%mm3 \n\t" // Y(0)
1447
"movq 8(%1, %%eax,2), %%mm5 \n\t" // Y(8)
1448
"movq %%mm3, %%mm4 \n\t" // Y(0)
1449
"movq %%mm5, %%mm6 \n\t" // Y(8)
1450
"punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
1451
"punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
1452
"punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
1453
"punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)
1455
MOVNTQ" %%mm3, (%0, %%eax, 4) \n\t"
1456
MOVNTQ" %%mm4, 8(%0, %%eax, 4) \n\t"
1457
MOVNTQ" %%mm5, 16(%0, %%eax, 4) \n\t"
1458
MOVNTQ" %%mm6, 24(%0, %%eax, 4) \n\t"
1460
"addl $8, %%eax \n\t"
1461
"cmpl %4, %%eax \n\t"
1463
::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1468
#if defined ARCH_ALPHA && defined HAVE_MVI
1469
#define pl2yuy2(n) \
1474
asm("unpkbw %1, %0" : "=r"(y1) : "r"(y1)); \
1475
asm("unpkbw %1, %0" : "=r"(y2) : "r"(y2)); \
1476
asm("unpkbl %1, %0" : "=r"(u) : "r"(u)); \
1477
asm("unpkbl %1, %0" : "=r"(v) : "r"(v)); \
1478
yuv1 = (u << 8) + (v << 24); \
1485
uint64_t *qdst = (uint64_t *) dst;
1486
uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
1487
const uint32_t *yc = (uint32_t *) ysrc;
1488
const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
1489
const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
1490
for(i = 0; i < chromWidth; i += 8){
1491
uint64_t y1, y2, yuv1, yuv2;
1494
asm("ldq $31,64(%0)" :: "r"(yc));
1495
asm("ldq $31,64(%0)" :: "r"(yc2));
1496
asm("ldq $31,64(%0)" :: "r"(uc));
1497
asm("ldq $31,64(%0)" :: "r"(vc));
1515
#elif __WORDSIZE >= 64
1517
uint64_t *ldst = (uint64_t *) dst;
1518
const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1519
for(i = 0; i < chromWidth; i += 2){
1521
k = yc[0] + (uc[0] << 8) +
1522
(yc[1] << 16) + (vc[0] << 24);
1523
l = yc[2] + (uc[1] << 8) +
1524
(yc[3] << 16) + (vc[1] << 24);
1525
*ldst++ = k + (l << 32);
1532
int i, *idst = (int32_t *) dst;
1533
const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1534
for(i = 0; i < chromWidth; i++){
1535
#ifdef WORDS_BIGENDIAN
1536
*idst++ = (yc[0] << 24)+ (uc[0] << 16) +
1537
(yc[1] << 8) + (vc[0] << 0);
1539
*idst++ = yc[0] + (uc[0] << 8) +
1540
(yc[1] << 16) + (vc[0] << 24);
1548
if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
1550
usrc += chromStride;
1551
vsrc += chromStride;
1565
* height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1566
* problem for anyone then tell me, and I'll fix it)
1568
static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1569
unsigned int width, unsigned int height,
1570
int lumStride, int chromStride, int dstStride)
1572
//FIXME interpolate chroma
1573
RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1576
// Packs planar Y, U and V into UYVY (byte order U Y V Y), one output line per
// luma line. chromWidth = width/2 chroma samples are consumed per line.
// usrc/vsrc only advance on lines where (y & (vertLumPerChroma-1)) equals
// vertLumPerChroma-1; that mask test acts as "every Nth line" only when
// vertLumPerChroma is a power of two.
// The visible code holds an MMX loop, a 64-bit C loop and a 32-bit C loop;
// some of the preprocessor lines selecting between them are outside this view.
static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1577
unsigned int width, unsigned int height,
1578
int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
1581
// Number of U (and V) samples per line: one per two luma samples.
const unsigned chromWidth= width>>1;
1582
for(y=0; y<height; y++)
1585
//FIXME handle 2 lines at once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
1587
// eax is the chroma sample index; Y is addressed with scale 2, dst with scale 4.
"xorl %%eax, %%eax \n\t"
1590
PREFETCH" 32(%1, %%eax, 2) \n\t"
1591
PREFETCH" 32(%2, %%eax) \n\t"
1592
PREFETCH" 32(%3, %%eax) \n\t"
1593
"movq (%2, %%eax), %%mm0 \n\t" // U(0)
1594
"movq %%mm0, %%mm2 \n\t" // U(0)
1595
"movq (%3, %%eax), %%mm1 \n\t" // V(0)
1596
"punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1597
"punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1599
"movq (%1, %%eax,2), %%mm3 \n\t" // Y(0)
1600
"movq 8(%1, %%eax,2), %%mm5 \n\t" // Y(8)
1601
"movq %%mm0, %%mm4 \n\t" // copy of UVUV UVUV(0)
1602
"movq %%mm2, %%mm6 \n\t" // copy of UVUV UVUV(8)
1603
"punpcklbw %%mm3, %%mm0 \n\t" // UYVY UYVY(0)
1604
"punpckhbw %%mm3, %%mm4 \n\t" // UYVY UYVY(4)
1605
"punpcklbw %%mm5, %%mm2 \n\t" // UYVY UYVY(8)
1606
"punpckhbw %%mm5, %%mm6 \n\t" // UYVY UYVY(12)
1608
// 32 output bytes (16 pixels) are stored per iteration.
MOVNTQ" %%mm0, (%0, %%eax, 4) \n\t"
1609
MOVNTQ" %%mm4, 8(%0, %%eax, 4) \n\t"
1610
MOVNTQ" %%mm2, 16(%0, %%eax, 4) \n\t"
1611
MOVNTQ" %%mm6, 24(%0, %%eax, 4) \n\t"
1613
// Advance by the 8 chroma samples just consumed and compare with chromWidth (%4).
"addl $8, %%eax \n\t"
1614
"cmpl %4, %%eax \n\t"
1616
// Operands: %0=dst, %1=ysrc, %2=usrc, %3=vsrc, %4=chromWidth.
::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1620
//FIXME adapt the alpha asm code from yv12->yuy2
1622
#if __WORDSIZE >= 64
1624
// 64-bit C path: two chroma samples and four luma samples per 64-bit store.
uint64_t *ldst = (uint64_t *) dst;
1625
const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1626
for(i = 0; i < chromWidth; i += 2){
1628
// k and l each hold one U Y V Y group, least significant byte first.
// NOTE(review): (yc[1] << 24) and (yc[3] << 24) are evaluated as int; for
// luma values >= 128 the shift reaches the sign bit. If k/l are 64-bit
// (their declarations are not visible here) the sign extension would
// disturb the upper half of the combined store - consider an unsigned cast.
k = uc[0] + (yc[0] << 8) +
1629
(vc[0] << 16) + (yc[1] << 24);
1630
l = uc[1] + (yc[2] << 8) +
1631
(vc[1] << 16) + (yc[3] << 24);
1632
// k lands in the low 32 bits, l in the high 32 bits.
*ldst++ = k + (l << 32);
1639
// 32-bit C path: one chroma sample pair and two luma samples per 32-bit store.
int i, *idst = (int32_t *) dst;
1640
const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1641
for(i = 0; i < chromWidth; i++){
1642
#ifdef WORDS_BIGENDIAN
1643
// Big-endian build: U goes into the most significant byte.
*idst++ = (uc[0] << 24)+ (yc[0] << 16) +
1644
(vc[0] << 8) + (yc[1] << 0);
1646
// Otherwise: U goes into the least significant byte.
*idst++ = uc[0] + (yc[0] << 8) +
1647
(vc[0] << 16) + (yc[1] << 24);
1655
// Move to the next chroma line only after the last luma line that shares it.
if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
1657
usrc += chromStride;
1658
vsrc += chromStride;
1672
* height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1673
* problem for anyone then tell me, and I'll fix it)
1675
// Converts planar Y/U/V input to packed UYVY by forwarding every argument
// unchanged to yuvPlanartouyvy with vertLumPerChroma = 2, i.e. the chroma
// pointers advance once per two luma lines.
static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1676
unsigned int width, unsigned int height,
1677
int lumStride, int chromStride, int dstStride)
1679
//FIXME interpolate chroma
1680
RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1685
* width should be a multiple of 16
1687
// Converts planar Y/U/V input to packed YUY2 by forwarding every argument
// unchanged to yuvPlanartoyuy2 and appending a constant 1 as the last argument.
// NOTE(review): the trailing 1 is presumably vertLumPerChroma (one chroma line
// per luma line); the callee's signature is not visible here - confirm.
static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1688
unsigned int width, unsigned int height,
1689
int lumStride, int chromStride, int dstStride)
1691
RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1696
* height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1697
* problem for anyone then tell me, and I'll fix it)
1699
// Splits packed YUYV (byte order Y U Y V) into separate Y, U and V planes.
// Lines are handled in pairs (y += 2). In the visible code the first pass of a
// pair writes Y, U and V, the second pass writes Y only, so chroma comes from
// one line of each pair. udst/vdst advance by chromStride once per pair.
// The pointer advances between the two passes are not visible in this view.
static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1700
unsigned int width, unsigned int height,
1701
int lumStride, int chromStride, int srcStride)
1704
// Number of U (and V) samples per line: one per two luma samples.
const unsigned chromWidth= width>>1;
1705
for(y=0; y<height; y+=2)
1709
// MMX pass 1: eax is the chroma sample index (src scale 4, ydst scale 2).
"xorl %%eax, %%eax \n\t"
1710
// mm7 = 0x00FF in every 16-bit word: mask that keeps the low byte of each word.
"pcmpeqw %%mm7, %%mm7 \n\t"
1711
"psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1714
PREFETCH" 64(%0, %%eax, 4) \n\t"
1715
"movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0)
1716
"movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4)
1717
"movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
1718
"movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
1719
"psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
1720
"psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
1721
"pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
1722
"pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
1723
"packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1724
"packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
1726
MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t"
1728
"movq 16(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(8)
1729
"movq 24(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(12)
1730
"movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
1731
"movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
1732
"psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
1733
"psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
1734
"pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
1735
"pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
1736
"packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
1737
"packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
1739
MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t"
1741
// Separate the interleaved UV bytes with the same shift/mask/pack scheme.
"movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
1742
"movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
1743
"psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
1744
"psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
1745
"pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
1746
"pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
1747
"packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
1748
"packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
1750
// V bytes go to operand %3 (vdst), U bytes to operand %2 (udst).
MOVNTQ" %%mm0, (%3, %%eax) \n\t"
1751
MOVNTQ" %%mm2, (%2, %%eax) \n\t"
1753
// 8 chroma samples (16 pixels, 32 source bytes) per iteration; bound is chromWidth (%4).
"addl $8, %%eax \n\t"
1754
"cmpl %4, %%eax \n\t"
1756
// Operands: %0=src, %1=ydst, %2=udst, %3=vdst, %4=chromWidth.
::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1764
// MMX pass 2: luma only. No mask setup is visible in this pass, so it relies
// on mm7 still holding the 0x00FF word mask built in pass 1.
"xorl %%eax, %%eax \n\t"
1767
PREFETCH" 64(%0, %%eax, 4) \n\t"
1768
"movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0)
1769
"movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4)
1770
"movq 16(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(8)
1771
"movq 24(%0, %%eax, 4), %%mm3 \n\t" // YUYV YUYV(12)
1772
"pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
1773
"pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
1774
"pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
1775
"pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
1776
"packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
1777
"packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
1779
MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t"
1780
MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t"
1782
"addl $8, %%eax \n\t"
1783
"cmpl %4, %%eax \n\t"
1786
// Same operand list as pass 1; %2 and %3 are not referenced by the visible instructions.
::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1791
// C pass 1: bytes 0/2 of each 4-byte group are luma, byte 1 is U, byte 3 is V.
for(i=0; i<chromWidth; i++)
1793
ydst[2*i+0] = src[4*i+0];
1794
udst[i] = src[4*i+1];
1795
ydst[2*i+1] = src[4*i+2];
1796
vdst[i] = src[4*i+3];
1801
// C pass 2: only luma is copied; the U/V bytes of this line are dropped.
for(i=0; i<chromWidth; i++)
1803
ydst[2*i+0] = src[4*i+0];
1804
ydst[2*i+1] = src[4*i+2];
1807
// One chroma line has been produced for the pair of source lines.
udst += chromStride;
1808
vdst += chromStride;
1813
// Leave MMX state (the remainder of this statement is outside the view).
asm volatile( EMMS" \n\t"
1819
// Copies the luma plane from ysrc to ydst with a single memcpy of
// width*height bytes. The visible code writes nothing to udst/vdst and does
// not read usrc, vsrc, lumStride or chromStride.
// NOTE(review): a single width*height copy is only correct if both luma
// planes are tightly packed (stride == width) - confirm with callers.
static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
1820
uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1821
unsigned int width, unsigned int height, int lumStride, int chromStride)
1824
memcpy(ydst, ysrc, width*height);
1826
/* XXX: implement upscaling for U,V */
1829
// Doubles a single plane in both directions using 3:1 / 1:3 weighted blends
// of neighbouring source samples ((3*a + b) >> 2). The visible code treats the
// first and last output lines separately (horizontal blend of one source row)
// and blends two adjacent source rows for everything in between.
// Several statements of the original (first/last pixel stores, pointer
// advances, asm opener and loop branch) are outside this view.
static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, int srcWidth, int srcHeight, int srcStride, int dstStride)
1836
// First output line: horizontal interpolation only.
for(x=0; x<srcWidth-1; x++){
1837
dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1838
dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1840
// Rightmost output pixel has no right neighbour: plain copy.
dst[2*srcWidth-1]= src[srcWidth-1];
1844
// Each iteration blends source rows src and src+srcStride into two output rows.
for(y=1; y<srcHeight; y++){
1845
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1846
// Width rounded down to a multiple of 16; the C loop below starts at mmxSize-1.
const int mmxSize= srcWidth&~15;
1848
// eax is loaded from operand %4, which is not visible in the operand list below.
// NOTE(review): presumably a negative offset counting up towards 0 - confirm.
"movl %4, %%eax \n\t"
1850
// mm0/mm1: rows 0/1 at x; mm2/mm3: rows 0/1 at x+1; mm4/mm5: rows 0/1 at x-1.
"movq (%0, %%eax), %%mm0 \n\t"
1851
"movq (%1, %%eax), %%mm1 \n\t"
1852
"movq 1(%0, %%eax), %%mm2 \n\t"
1853
"movq 1(%1, %%eax), %%mm3 \n\t"
1854
"movq -1(%0, %%eax), %%mm4 \n\t"
1855
"movq -1(%1, %%eax), %%mm5 \n\t"
1856
// Averaging the same register in twice approximates the 3:1 weighting
// (avg(avg(b, a), a) is roughly (3*a + b) / 4).
PAVGB" %%mm0, %%mm5 \n\t"
1857
PAVGB" %%mm0, %%mm3 \n\t"
1858
PAVGB" %%mm0, %%mm5 \n\t"
1859
PAVGB" %%mm0, %%mm3 \n\t"
1860
PAVGB" %%mm1, %%mm4 \n\t"
1861
PAVGB" %%mm1, %%mm2 \n\t"
1862
PAVGB" %%mm1, %%mm4 \n\t"
1863
PAVGB" %%mm1, %%mm2 \n\t"
1864
// Interleave the two blended byte vectors of each output row.
"movq %%mm5, %%mm7 \n\t"
1865
"movq %%mm4, %%mm6 \n\t"
1866
"punpcklbw %%mm3, %%mm5 \n\t"
1867
"punpckhbw %%mm3, %%mm7 \n\t"
1868
"punpcklbw %%mm2, %%mm4 \n\t"
1869
"punpckhbw %%mm2, %%mm6 \n\t"
1871
// Two alternative store sequences follow (MOVNTQ and plain movq); the
// preprocessor lines that choose between them are not visible here.
MOVNTQ" %%mm5, (%2, %%eax, 2) \n\t"
1872
MOVNTQ" %%mm7, 8(%2, %%eax, 2) \n\t"
1873
MOVNTQ" %%mm4, (%3, %%eax, 2) \n\t"
1874
MOVNTQ" %%mm6, 8(%3, %%eax, 2) \n\t"
1876
"movq %%mm5, (%2, %%eax, 2) \n\t"
1877
"movq %%mm7, 8(%2, %%eax, 2) \n\t"
1878
"movq %%mm4, (%3, %%eax, 2) \n\t"
1879
"movq %%mm6, 8(%3, %%eax, 2) \n\t"
1881
"addl $8, %%eax \n\t"
1883
// Operands: %0/%1 = the two source rows, %2/%3 = the two destination rows,
// all offset to the end of the MMX-covered span.
:: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ),
1884
"r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
1890
// Non-MMX build: with mmxSize 1 the C loop below starts at x = 0.
const int mmxSize=1;
1892
// Leftmost column: vertical blend only.
dst[0 ]= (3*src[0] + src[srcStride])>>2;
1893
dst[dstStride]= ( src[0] + 3*src[srcStride])>>2;
1895
// Interior: each output sample blends diagonal neighbours of the two rows.
for(x=mmxSize-1; x<srcWidth-1; x++){
1896
dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2;
1897
dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2;
1898
dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2;
1899
dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2;
1901
// Rightmost column: vertical blend only.
dst[srcWidth*2 -1 ]= (3*src[srcWidth-1] + src[srcWidth-1 + srcStride])>>2;
1902
dst[srcWidth*2 -1 + dstStride]= ( src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
1912
// Last output line: horizontal interpolation only, same as the first line.
for(x=0; x<srcWidth-1; x++){
1913
dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1914
dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1916
dst[2*srcWidth-1]= src[srcWidth-1];
1918
// Alternative last-line loop; its body and the guard selecting it are not visible here.
for(x=0; x<srcWidth; x++){
1925
// Leave MMX state (the remainder of this statement is outside the view).
asm volatile( EMMS" \n\t"
1933
* height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1934
* problem for anyone then tell me, and I'll fix it)
1935
* chrominance data is only taken from every second line, the others are ignored. FIXME: write HQ version
1937
// Splits packed UYVY (byte order U Y V Y) into separate Y, U and V planes.
// Lines are handled in pairs (y += 2). In the visible code the first pass of a
// pair writes Y, U and V, the second pass writes Y only, so chroma comes from
// one line of each pair. udst/vdst advance by chromStride once per pair.
// The pointer advances between the two passes are not visible in this view.
static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1938
unsigned int width, unsigned int height,
1939
int lumStride, int chromStride, int srcStride)
1942
// Number of U (and V) samples per line: one per two luma samples.
const unsigned chromWidth= width>>1;
1943
for(y=0; y<height; y+=2)
1947
// MMX pass 1: eax is the chroma sample index (src scale 4, ydst scale 2).
"xorl %%eax, %%eax \n\t"
1948
// mm7 = 0x00FF in every 16-bit word: mask that keeps the low byte of each word.
"pcmpeqw %%mm7, %%mm7 \n\t"
1949
"psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1952
PREFETCH" 64(%0, %%eax, 4) \n\t"
1953
"movq (%0, %%eax, 4), %%mm0 \n\t" // UYVY UYVY(0)
1954
"movq 8(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(4)
1955
"movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
1956
"movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
1957
"pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
1958
"pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
1959
"psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
1960
"psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
1961
"packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1962
"packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
1964
MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t"
1966
"movq 16(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(8)
1967
"movq 24(%0, %%eax, 4), %%mm2 \n\t" // UYVY UYVY(12)
1968
"movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
1969
"movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
1970
"pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
1971
"pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
1972
"psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
1973
"psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
1974
"packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
1975
"packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
1977
MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t"
1979
// Separate the interleaved UV bytes with the same shift/mask/pack scheme.
"movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
1980
"movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
1981
"psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
1982
"psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
1983
"pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
1984
"pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
1985
"packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
1986
"packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
1988
// V bytes go to operand %3 (vdst), U bytes to operand %2 (udst).
MOVNTQ" %%mm0, (%3, %%eax) \n\t"
1989
MOVNTQ" %%mm2, (%2, %%eax) \n\t"
1991
// 8 chroma samples (16 pixels, 32 source bytes) per iteration; bound is chromWidth (%4).
"addl $8, %%eax \n\t"
1992
"cmpl %4, %%eax \n\t"
1994
// Operands: %0=src, %1=ydst, %2=udst, %3=vdst, %4=chromWidth.
::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
2002
// MMX pass 2: luma only. Luma sits in the high byte of each word, so a
// right shift by 8 isolates it and no mask is needed.
"xorl %%eax, %%eax \n\t"
2005
PREFETCH" 64(%0, %%eax, 4) \n\t"
2006
"movq (%0, %%eax, 4), %%mm0 \n\t" // UYVY UYVY(0)
2007
"movq 8(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(4)
2008
"movq 16(%0, %%eax, 4), %%mm2 \n\t" // UYVY UYVY(8)
2009
"movq 24(%0, %%eax, 4), %%mm3 \n\t" // UYVY UYVY(12)
2010
"psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
2011
"psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
2012
"psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
2013
"psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
2014
"packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
2015
"packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
2017
MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t"
2018
MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t"
2020
"addl $8, %%eax \n\t"
2021
"cmpl %4, %%eax \n\t"
2024
// Same operand list as pass 1; %2 and %3 are not referenced by the visible instructions.
::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
2029
// C pass 1: byte 0 of each 4-byte group is U, byte 2 is V, bytes 1/3 are luma.
for(i=0; i<chromWidth; i++)
2031
udst[i] = src[4*i+0];
2032
ydst[2*i+0] = src[4*i+1];
2033
vdst[i] = src[4*i+2];
2034
ydst[2*i+1] = src[4*i+3];
2039
// C pass 2: only luma is copied; the U/V bytes of this line are dropped.
for(i=0; i<chromWidth; i++)
2041
ydst[2*i+0] = src[4*i+1];
2042
ydst[2*i+1] = src[4*i+3];
2045
// One chroma line has been produced for the pair of source lines.
udst += chromStride;
2046
vdst += chromStride;
2051
// Leave MMX state (the remainder of this statement is outside the view).
asm volatile( EMMS" \n\t"
2059
* height should be a multiple of 2 and width should be a multiple of 2 (if this is a
2060
* problem for anyone then tell me, and I'll fix it)
2061
* chrominance data is only taken from every second line, the others are ignored in the C version. FIXME: write HQ version
2063
// Converts packed 24-bit pixels (byte 0 = b, byte 1 = g, byte 2 = r in the C
// code below) into planar Y, U and V with chroma subsampled 2x2.
// The visible code has an MMX loop over line pairs up to height-2 followed by
// a C loop ("for(; y<height; y+=2)") that continues from wherever y stopped.
// Many lines of the original (asm openers, loop labels/branches, clobber
// lists, pointer advances, some #else/#endif) are outside this view.
static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2064
unsigned int width, unsigned int height,
2065
int lumStride, int chromStride, int srcStride)
2068
// Number of U (and V) samples per line: one per two luma samples.
const unsigned chromWidth= width>>1;
2070
// NOTE(review): height is unsigned, so height-2 wraps to a huge value when
// height < 2 - confirm callers never pass such heights.
for(y=0; y<height-2; y+=2)
2076
// --- Luma pass: 8 pixels per iteration ---
// eax starts at operand %2 (-width); ebx = 3*eax is the matching byte offset
// into the source, whose base pointer (%0) is src+width*3.
"movl %2, %%eax \n\t"
2077
"movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t"
2078
"movq "MANGLE(w1111)", %%mm5 \n\t"
2079
// mm7 = 0, used to widen bytes to words.
"pxor %%mm7, %%mm7 \n\t"
2080
"leal (%%eax, %%eax, 2), %%ebx \n\t"
2083
PREFETCH" 64(%0, %%ebx) \n\t"
2084
// Load four pixels at 3-byte steps and widen each to 16-bit words.
"movd (%0, %%ebx), %%mm0 \n\t"
2085
"movd 3(%0, %%ebx), %%mm1 \n\t"
2086
"punpcklbw %%mm7, %%mm0 \n\t"
2087
"punpcklbw %%mm7, %%mm1 \n\t"
2088
"movd 6(%0, %%ebx), %%mm2 \n\t"
2089
"movd 9(%0, %%ebx), %%mm3 \n\t"
2090
"punpcklbw %%mm7, %%mm2 \n\t"
2091
"punpcklbw %%mm7, %%mm3 \n\t"
2092
// Multiply-accumulate each pixel's components with the Y coefficients.
"pmaddwd %%mm6, %%mm0 \n\t"
2093
"pmaddwd %%mm6, %%mm1 \n\t"
2094
"pmaddwd %%mm6, %%mm2 \n\t"
2095
"pmaddwd %%mm6, %%mm3 \n\t"
2096
#ifndef FAST_BGR2YV12
2097
// Extra down-shift of the partial sums, skipped when FAST_BGR2YV12 is defined.
"psrad $8, %%mm0 \n\t"
2098
"psrad $8, %%mm1 \n\t"
2099
"psrad $8, %%mm2 \n\t"
2100
"psrad $8, %%mm3 \n\t"
2102
// Pack to words, then pmaddwd with w1111 adds adjacent words to finish each sum.
"packssdw %%mm1, %%mm0 \n\t"
2103
"packssdw %%mm3, %%mm2 \n\t"
2104
"pmaddwd %%mm5, %%mm0 \n\t"
2105
"pmaddwd %%mm5, %%mm2 \n\t"
2106
"packssdw %%mm2, %%mm0 \n\t"
2107
"psraw $7, %%mm0 \n\t"
2109
// Same sequence for the next four pixels, result in mm4.
"movd 12(%0, %%ebx), %%mm4 \n\t"
2110
"movd 15(%0, %%ebx), %%mm1 \n\t"
2111
"punpcklbw %%mm7, %%mm4 \n\t"
2112
"punpcklbw %%mm7, %%mm1 \n\t"
2113
"movd 18(%0, %%ebx), %%mm2 \n\t"
2114
"movd 21(%0, %%ebx), %%mm3 \n\t"
2115
"punpcklbw %%mm7, %%mm2 \n\t"
2116
"punpcklbw %%mm7, %%mm3 \n\t"
2117
"pmaddwd %%mm6, %%mm4 \n\t"
2118
"pmaddwd %%mm6, %%mm1 \n\t"
2119
"pmaddwd %%mm6, %%mm2 \n\t"
2120
"pmaddwd %%mm6, %%mm3 \n\t"
2121
#ifndef FAST_BGR2YV12
2122
"psrad $8, %%mm4 \n\t"
2123
"psrad $8, %%mm1 \n\t"
2124
"psrad $8, %%mm2 \n\t"
2125
"psrad $8, %%mm3 \n\t"
2127
"packssdw %%mm1, %%mm4 \n\t"
2128
"packssdw %%mm3, %%mm2 \n\t"
2129
"pmaddwd %%mm5, %%mm4 \n\t"
2130
"pmaddwd %%mm5, %%mm2 \n\t"
2131
// 8 pixels * 3 bytes consumed.
"addl $24, %%ebx \n\t"
2132
"packssdw %%mm2, %%mm4 \n\t"
2133
"psraw $7, %%mm4 \n\t"
2135
// Combine the eight results into bytes and add the luma offset with saturation.
"packuswb %%mm4, %%mm0 \n\t"
2136
"paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t"
2138
MOVNTQ" %%mm0, (%1, %%eax) \n\t"
2139
"addl $8, %%eax \n\t"
2141
// Operands: %0 = end of the source line, %1 = end of the luma line, %2 = -width.
: : "r" (src+width*3), "r" (ydst+width), "g" (-width)
2149
// --- Chroma pass: 4 U and 4 V samples per iteration from two source lines ---
// eax starts at operand %4 (-chromWidth); ebx = 6*eax (two 3-byte pixels per
// chroma sample).
"movl %4, %%eax \n\t"
2150
"movq "MANGLE(w1111)", %%mm5 \n\t"
2151
"movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t"
2152
"pxor %%mm7, %%mm7 \n\t"
2153
"leal (%%eax, %%eax, 2), %%ebx \n\t"
2154
"addl %%ebx, %%ebx \n\t"
2157
PREFETCH" 64(%0, %%ebx) \n\t"
2158
PREFETCH" 64(%1, %%ebx) \n\t"
2159
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2160
// PAVGB variant: average the two lines (%0 and %1), then average each result
// with itself shifted right by 24 bits (one 3-byte pixel).
"movq (%0, %%ebx), %%mm0 \n\t"
2161
"movq (%1, %%ebx), %%mm1 \n\t"
2162
"movq 6(%0, %%ebx), %%mm2 \n\t"
2163
"movq 6(%1, %%ebx), %%mm3 \n\t"
2164
PAVGB" %%mm1, %%mm0 \n\t"
2165
PAVGB" %%mm3, %%mm2 \n\t"
2166
"movq %%mm0, %%mm1 \n\t"
2167
"movq %%mm2, %%mm3 \n\t"
2168
"psrlq $24, %%mm0 \n\t"
2169
"psrlq $24, %%mm2 \n\t"
2170
PAVGB" %%mm1, %%mm0 \n\t"
2171
PAVGB" %%mm3, %%mm2 \n\t"
2172
"punpcklbw %%mm7, %%mm0 \n\t"
2173
"punpcklbw %%mm7, %%mm2 \n\t"
2175
// Variant without PAVGB: widen four pixels (2 columns x 2 lines) to words,
// sum them and divide by 4 with psrlw $2 below.
"movd (%0, %%ebx), %%mm0 \n\t"
2176
"movd (%1, %%ebx), %%mm1 \n\t"
2177
"movd 3(%0, %%ebx), %%mm2 \n\t"
2178
"movd 3(%1, %%ebx), %%mm3 \n\t"
2179
"punpcklbw %%mm7, %%mm0 \n\t"
2180
"punpcklbw %%mm7, %%mm1 \n\t"
2181
"punpcklbw %%mm7, %%mm2 \n\t"
2182
"punpcklbw %%mm7, %%mm3 \n\t"
2183
"paddw %%mm1, %%mm0 \n\t"
2184
"paddw %%mm3, %%mm2 \n\t"
2185
"paddw %%mm2, %%mm0 \n\t"
2186
"movd 6(%0, %%ebx), %%mm4 \n\t"
2187
"movd 6(%1, %%ebx), %%mm1 \n\t"
2188
"movd 9(%0, %%ebx), %%mm2 \n\t"
2189
"movd 9(%1, %%ebx), %%mm3 \n\t"
2190
"punpcklbw %%mm7, %%mm4 \n\t"
2191
"punpcklbw %%mm7, %%mm1 \n\t"
2192
"punpcklbw %%mm7, %%mm2 \n\t"
2193
"punpcklbw %%mm7, %%mm3 \n\t"
2194
"paddw %%mm1, %%mm4 \n\t"
2195
"paddw %%mm3, %%mm2 \n\t"
2196
"paddw %%mm4, %%mm2 \n\t"
2197
"psrlw $2, %%mm0 \n\t"
2198
"psrlw $2, %%mm2 \n\t"
2200
// mm1/mm3 take the V coefficients; mm6 already holds the U coefficients.
"movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
2201
"movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
2203
// V partial sums into mm1/mm3, U partial sums into mm0/mm2.
"pmaddwd %%mm0, %%mm1 \n\t"
2204
"pmaddwd %%mm2, %%mm3 \n\t"
2205
"pmaddwd %%mm6, %%mm0 \n\t"
2206
"pmaddwd %%mm6, %%mm2 \n\t"
2207
#ifndef FAST_BGR2YV12
2208
"psrad $8, %%mm0 \n\t"
2209
"psrad $8, %%mm1 \n\t"
2210
"psrad $8, %%mm2 \n\t"
2211
"psrad $8, %%mm3 \n\t"
2213
"packssdw %%mm2, %%mm0 \n\t"
2214
"packssdw %%mm3, %%mm1 \n\t"
2215
"pmaddwd %%mm5, %%mm0 \n\t"
2216
"pmaddwd %%mm5, %%mm1 \n\t"
2217
"packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
2218
"psraw $7, %%mm0 \n\t"
2220
// Second half of the iteration: the next two chroma samples, result in mm4.
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2221
"movq 12(%0, %%ebx), %%mm4 \n\t"
2222
"movq 12(%1, %%ebx), %%mm1 \n\t"
2223
"movq 18(%0, %%ebx), %%mm2 \n\t"
2224
"movq 18(%1, %%ebx), %%mm3 \n\t"
2225
PAVGB" %%mm1, %%mm4 \n\t"
2226
PAVGB" %%mm3, %%mm2 \n\t"
2227
"movq %%mm4, %%mm1 \n\t"
2228
"movq %%mm2, %%mm3 \n\t"
2229
"psrlq $24, %%mm4 \n\t"
2230
"psrlq $24, %%mm2 \n\t"
2231
PAVGB" %%mm1, %%mm4 \n\t"
2232
PAVGB" %%mm3, %%mm2 \n\t"
2233
"punpcklbw %%mm7, %%mm4 \n\t"
2234
"punpcklbw %%mm7, %%mm2 \n\t"
2236
"movd 12(%0, %%ebx), %%mm4 \n\t"
2237
"movd 12(%1, %%ebx), %%mm1 \n\t"
2238
"movd 15(%0, %%ebx), %%mm2 \n\t"
2239
"movd 15(%1, %%ebx), %%mm3 \n\t"
2240
"punpcklbw %%mm7, %%mm4 \n\t"
2241
"punpcklbw %%mm7, %%mm1 \n\t"
2242
"punpcklbw %%mm7, %%mm2 \n\t"
2243
"punpcklbw %%mm7, %%mm3 \n\t"
2244
"paddw %%mm1, %%mm4 \n\t"
2245
"paddw %%mm3, %%mm2 \n\t"
2246
"paddw %%mm2, %%mm4 \n\t"
2247
// mm5 (the w1111 constant) is borrowed as scratch here and reloaded below.
"movd 18(%0, %%ebx), %%mm5 \n\t"
2248
"movd 18(%1, %%ebx), %%mm1 \n\t"
2249
"movd 21(%0, %%ebx), %%mm2 \n\t"
2250
"movd 21(%1, %%ebx), %%mm3 \n\t"
2251
"punpcklbw %%mm7, %%mm5 \n\t"
2252
"punpcklbw %%mm7, %%mm1 \n\t"
2253
"punpcklbw %%mm7, %%mm2 \n\t"
2254
"punpcklbw %%mm7, %%mm3 \n\t"
2255
"paddw %%mm1, %%mm5 \n\t"
2256
"paddw %%mm3, %%mm2 \n\t"
2257
"paddw %%mm5, %%mm2 \n\t"
2258
// Restore w1111 in mm5 for the pmaddwd summation further down.
"movq "MANGLE(w1111)", %%mm5 \n\t"
2259
"psrlw $2, %%mm4 \n\t"
2260
"psrlw $2, %%mm2 \n\t"
2262
"movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
2263
"movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
2265
"pmaddwd %%mm4, %%mm1 \n\t"
2266
"pmaddwd %%mm2, %%mm3 \n\t"
2267
"pmaddwd %%mm6, %%mm4 \n\t"
2268
"pmaddwd %%mm6, %%mm2 \n\t"
2269
#ifndef FAST_BGR2YV12
2270
"psrad $8, %%mm4 \n\t"
2271
"psrad $8, %%mm1 \n\t"
2272
"psrad $8, %%mm2 \n\t"
2273
"psrad $8, %%mm3 \n\t"
2275
"packssdw %%mm2, %%mm4 \n\t"
2276
"packssdw %%mm3, %%mm1 \n\t"
2277
"pmaddwd %%mm5, %%mm4 \n\t"
2278
"pmaddwd %%mm5, %%mm1 \n\t"
2279
// 4 chroma samples * 6 source bytes consumed per line.
"addl $24, %%ebx \n\t"
2280
"packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
2281
"psraw $7, %%mm4 \n\t"
2283
// Regroup into four U words (mm0) and four V words (mm1), pack to bytes and
// add the chroma offset.
"movq %%mm0, %%mm1 \n\t"
2284
"punpckldq %%mm4, %%mm0 \n\t"
2285
"punpckhdq %%mm4, %%mm1 \n\t"
2286
"packsswb %%mm1, %%mm0 \n\t"
2287
"paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t"
2289
// Low dword (U bytes) goes to operand %2, high dword (V bytes) to operand %3.
"movd %%mm0, (%2, %%eax) \n\t"
2290
"punpckhdq %%mm0, %%mm0 \n\t"
2291
"movd %%mm0, (%3, %%eax) \n\t"
2292
"addl $4, %%eax \n\t"
2294
// Operands: %0/%1 = ends of the two source lines, %2/%3 = ends of the U/V
// lines, %4 = -chromWidth.
: : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
2298
udst += chromStride;
2299
vdst += chromStride;
2303
// Leave MMX state (the remainder of this statement is outside the view).
asm volatile( EMMS" \n\t"
2309
// C loop: continues with the current y and handles the remaining line pairs.
for(; y<height; y+=2)
2312
// First line of the pair: luma for every pixel plus one U/V per pixel pair.
for(i=0; i<chromWidth; i++)
2314
unsigned int b= src[6*i+0];
2315
unsigned int g= src[6*i+1];
2316
unsigned int r= src[6*i+2];
2318
// Weighted sums scaled down by RGB2YUV_SHIFT, then offset (16 for Y, 128 for U/V).
unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2319
unsigned int V = ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
2320
unsigned int U = ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
2330
// Luma of the second pixel of the pair (the reloads of b/g/r are outside the view).
Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2336
// Second line of the pair: luma only in the visible code.
for(i=0; i<chromWidth; i++)
2338
unsigned int b= src[6*i+0];
2339
unsigned int g= src[6*i+1];
2340
unsigned int r= src[6*i+2];
2342
unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2350
Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2353
// One chroma line has been produced for the pair of source lines.
udst += chromStride;
2354
vdst += chromStride;
2360
// Interleaves two byte planes line by line: dest[2*w] = src1[w] and
// dest[2*w+1] = src2[w] for each of the height lines.
// The visible code holds an SSE2 loop, an MMX loop, a C tail loop for the
// columns past the last multiple of 16, and a plain C loop; the preprocessor
// lines selecting between them and the per-line pointer advances are outside
// this view.
void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
2361
unsigned width, unsigned height, int src1Stride,
2362
int src2Stride, int dstStride){
2365
for(h=0; h < height; h++)
2372
// SSE2 loop: eax is the column index, 16 source bytes from each plane per iteration.
"xorl %%eax, %%eax \n\t"
2374
PREFETCH" 64(%1, %%eax) \n\t"
2375
PREFETCH" 64(%2, %%eax) \n\t"
2376
// NOTE(review): movdqa/movntdq require 16-byte aligned addresses; nothing in
// the visible code checks the alignment of src1, src2 or dest - confirm callers.
// src1 is loaded twice so the low and high halves can be unpacked separately.
"movdqa (%1, %%eax), %%xmm0 \n\t"
2377
"movdqa (%1, %%eax), %%xmm1 \n\t"
2378
"movdqa (%2, %%eax), %%xmm2 \n\t"
2379
"punpcklbw %%xmm2, %%xmm0 \n\t"
2380
"punpckhbw %%xmm2, %%xmm1 \n\t"
2381
"movntdq %%xmm0, (%0, %%eax, 2) \n\t"
2382
"movntdq %%xmm1, 16(%0, %%eax, 2)\n\t"
2383
"addl $16, %%eax \n\t"
2384
"cmpl %3, %%eax \n\t"
2386
// Operands: %0=dest, %1=src1, %2=src2, %3=width-15 (loop bound).
// NOTE(review): width is unsigned, so width-15 wraps to a huge value when
// width < 15 - confirm callers never pass such widths to this path.
::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
2391
// MMX loop: same scheme with two 8-byte loads per plane (16 bytes per iteration).
"xorl %%eax, %%eax \n\t"
2393
PREFETCH" 64(%1, %%eax) \n\t"
2394
PREFETCH" 64(%2, %%eax) \n\t"
2395
"movq (%1, %%eax), %%mm0 \n\t"
2396
"movq 8(%1, %%eax), %%mm2 \n\t"
2397
"movq %%mm0, %%mm1 \n\t"
2398
"movq %%mm2, %%mm3 \n\t"
2399
"movq (%2, %%eax), %%mm4 \n\t"
2400
"movq 8(%2, %%eax), %%mm5 \n\t"
2401
"punpcklbw %%mm4, %%mm0 \n\t"
2402
"punpckhbw %%mm4, %%mm1 \n\t"
2403
"punpcklbw %%mm5, %%mm2 \n\t"
2404
"punpckhbw %%mm5, %%mm3 \n\t"
2405
MOVNTQ" %%mm0, (%0, %%eax, 2) \n\t"
2406
MOVNTQ" %%mm1, 8(%0, %%eax, 2) \n\t"
2407
MOVNTQ" %%mm2, 16(%0, %%eax, 2) \n\t"
2408
MOVNTQ" %%mm3, 24(%0, %%eax, 2) \n\t"
2409
"addl $16, %%eax \n\t"
2410
"cmpl %3, %%eax \n\t"
2412
// Operands: %0=dest, %1=src1, %2=src2, %3=width-15 (same wrap caveat as above).
::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
2416
// Tail: columns from the last multiple of 16 up to width, done in C.
for(w= (width&(~15)); w < width; w++)
2418
dest[2*w+0] = src1[w];
2419
dest[2*w+1] = src2[w];
2422
// Plain C version covering the whole line.
for(w=0; w < width; w++)
2424
dest[2*w+0] = src1[w];
2425
dest[2*w+1] = src2[w];
2441
// Upscales two planes (src1 -> dst1, src2 -> dst2) by replication: every
// source byte is written twice horizontally (d[2*x] = d[2*x+1] = s[x]) and
// every source row is used for two destination rows (source row index y>>1).
// w = width/2 source bytes are read per row and h = height/2 destination rows
// are visible as the loop extent variables; the loop headers themselves, the
// asm openers and their operand lists are outside this view.
static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
2442
uint8_t *dst1, uint8_t *dst2,
2443
unsigned width, unsigned height,
2444
int srcStride1, int srcStride2,
2445
int dstStride1, int dstStride2)
2449
w=width/2; h=height/2;
2454
// Operand list of an asm statement whose instructions are not visible here;
// it references the second row of each source plane.
::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
2457
// First plane: source row y>>1 feeds destination row y.
const uint8_t* s1=src1+srcStride1*(y>>1);
2458
uint8_t* d=dst1+dstStride1*y;
2465
// Load 32 source bytes, duplicate each byte by unpacking a register with
// itself, and store the resulting 64 bytes.
"movq %1, %%mm0\n\t"
2466
"movq 8%1, %%mm2\n\t"
2467
"movq 16%1, %%mm4\n\t"
2468
"movq 24%1, %%mm6\n\t"
2469
"movq %%mm0, %%mm1\n\t"
2470
"movq %%mm2, %%mm3\n\t"
2471
"movq %%mm4, %%mm5\n\t"
2472
"movq %%mm6, %%mm7\n\t"
2473
"punpcklbw %%mm0, %%mm0\n\t"
2474
"punpckhbw %%mm1, %%mm1\n\t"
2475
"punpcklbw %%mm2, %%mm2\n\t"
2476
"punpckhbw %%mm3, %%mm3\n\t"
2477
"punpcklbw %%mm4, %%mm4\n\t"
2478
"punpckhbw %%mm5, %%mm5\n\t"
2479
"punpcklbw %%mm6, %%mm6\n\t"
2480
"punpckhbw %%mm7, %%mm7\n\t"
2481
MOVNTQ" %%mm0, %0\n\t"
2482
MOVNTQ" %%mm1, 8%0\n\t"
2483
MOVNTQ" %%mm2, 16%0\n\t"
2484
MOVNTQ" %%mm3, 24%0\n\t"
2485
MOVNTQ" %%mm4, 32%0\n\t"
2486
MOVNTQ" %%mm5, 40%0\n\t"
2487
MOVNTQ" %%mm6, 48%0\n\t"
2488
MOVNTQ" %%mm7, 56%0"
2494
// C tail (or whole row when no asm ran): duplicate each remaining byte.
for(;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
2497
// Second plane: identical processing with src2/dst2 and their strides.
const uint8_t* s2=src2+srcStride2*(y>>1);
2498
uint8_t* d=dst2+dstStride2*y;
2505
"movq %1, %%mm0\n\t"
2506
"movq 8%1, %%mm2\n\t"
2507
"movq 16%1, %%mm4\n\t"
2508
"movq 24%1, %%mm6\n\t"
2509
"movq %%mm0, %%mm1\n\t"
2510
"movq %%mm2, %%mm3\n\t"
2511
"movq %%mm4, %%mm5\n\t"
2512
"movq %%mm6, %%mm7\n\t"
2513
"punpcklbw %%mm0, %%mm0\n\t"
2514
"punpckhbw %%mm1, %%mm1\n\t"
2515
"punpcklbw %%mm2, %%mm2\n\t"
2516
"punpckhbw %%mm3, %%mm3\n\t"
2517
"punpcklbw %%mm4, %%mm4\n\t"
2518
"punpckhbw %%mm5, %%mm5\n\t"
2519
"punpcklbw %%mm6, %%mm6\n\t"
2520
"punpckhbw %%mm7, %%mm7\n\t"
2521
MOVNTQ" %%mm0, %0\n\t"
2522
MOVNTQ" %%mm1, 8%0\n\t"
2523
MOVNTQ" %%mm2, 16%0\n\t"
2524
MOVNTQ" %%mm3, 24%0\n\t"
2525
MOVNTQ" %%mm4, 32%0\n\t"
2526
MOVNTQ" %%mm5, 40%0\n\t"
2527
MOVNTQ" %%mm6, 48%0\n\t"
2528
MOVNTQ" %%mm7, 56%0"
2534
for(;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
2545
static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
2547
unsigned width, unsigned height,
2548
int srcStride1, int srcStride2,
2549
int srcStride3, int dstStride)
2552
w=width/2; h=height;
2554
const uint8_t* yp=src1+srcStride1*y;
2555
const uint8_t* up=src2+srcStride2*(y>>2);
2556
const uint8_t* vp=src3+srcStride3*(y>>2);
2557
uint8_t* d=dst+dstStride*y;
2563
PREFETCH" 32(%1, %0)\n\t"
2564
PREFETCH" 32(%2, %0)\n\t"
2565
PREFETCH" 32(%3, %0)\n\t"
2566
"movq (%1, %0, 4), %%mm0\n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2567
"movq (%2, %0), %%mm1\n\t" /* U0U1U2U3U4U5U6U7 */
2568
"movq (%3, %0), %%mm2\n\t" /* V0V1V2V3V4V5V6V7 */
2569
"movq %%mm0, %%mm3\n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2570
"movq %%mm1, %%mm4\n\t" /* U0U1U2U3U4U5U6U7 */
2571
"movq %%mm2, %%mm5\n\t" /* V0V1V2V3V4V5V6V7 */
2572
"punpcklbw %%mm1, %%mm1\n\t" /* U0U0 U1U1 U2U2 U3U3 */
2573
"punpcklbw %%mm2, %%mm2\n\t" /* V0V0 V1V1 V2V2 V3V3 */
2574
"punpckhbw %%mm4, %%mm4\n\t" /* U4U4 U5U5 U6U6 U7U7 */
2575
"punpckhbw %%mm5, %%mm5\n\t" /* V4V4 V5V5 V6V6 V7V7 */
2577
"movq %%mm1, %%mm6\n\t"
2578
"punpcklbw %%mm2, %%mm1\n\t" /* U0V0 U0V0 U1V1 U1V1*/
2579
"punpcklbw %%mm1, %%mm0\n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
2580
"punpckhbw %%mm1, %%mm3\n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
2581
MOVNTQ" %%mm0, (%4, %0, 8)\n\t"
2582
MOVNTQ" %%mm3, 8(%4, %0, 8)\n\t"
2584
"punpckhbw %%mm2, %%mm6\n\t" /* U2V2 U2V2 U3V3 U3V3*/
2585
"movq 8(%1, %0, 4), %%mm0\n\t"
2586
"movq %%mm0, %%mm3\n\t"
2587
"punpcklbw %%mm6, %%mm0\n\t" /* Y U2 Y V2 Y U2 Y V2*/
2588
"punpckhbw %%mm6, %%mm3\n\t" /* Y U3 Y V3 Y U3 Y V3*/
2589
MOVNTQ" %%mm0, 16(%4, %0, 8)\n\t"
2590
MOVNTQ" %%mm3, 24(%4, %0, 8)\n\t"
2592
"movq %%mm4, %%mm6\n\t"
2593
"movq 16(%1, %0, 4), %%mm0\n\t"
2594
"movq %%mm0, %%mm3\n\t"
2595
"punpcklbw %%mm5, %%mm4\n\t"
2596
"punpcklbw %%mm4, %%mm0\n\t" /* Y U4 Y V4 Y U4 Y V4*/
2597
"punpckhbw %%mm4, %%mm3\n\t" /* Y U5 Y V5 Y U5 Y V5*/
2598
MOVNTQ" %%mm0, 32(%4, %0, 8)\n\t"
2599
MOVNTQ" %%mm3, 40(%4, %0, 8)\n\t"
2601
"punpckhbw %%mm5, %%mm6\n\t"
2602
"movq 24(%1, %0, 4), %%mm0\n\t"
2603
"movq %%mm0, %%mm3\n\t"
2604
"punpcklbw %%mm6, %%mm0\n\t" /* Y U6 Y V6 Y U6 Y V6*/
2605
"punpckhbw %%mm6, %%mm3\n\t" /* Y U7 Y V7 Y U7 Y V7*/
2606
MOVNTQ" %%mm0, 48(%4, %0, 8)\n\t"
2607
MOVNTQ" %%mm3, 56(%4, %0, 8)\n\t"
2610
: "r"(yp), "r" (up), "r"(vp), "r"(d)