 * rgb2rgb.c, Software RGB to RGB converter
 * pluralize by Software PAL8 to RGB converter
 * Software YUV to YUV converter
 * Software YUV to RGB converter
 * Written by Nick Kurshev.
 * palette & yuv & runtime cpu stuff by Michael (michaelni@gmx.at) (under GPL)
 * lots of big-endian byteorder fixes by Alex Beregszaszi
13
#include <inttypes.h> /* for __WORDSIZE */
16
// #warning You have misconfigured system and probably will lose performance!
17
#define __WORDSIZE MP_WORDSIZE
35
#define PREFETCH "prefetch"
36
#define PREFETCHW "prefetchw"
37
#define PAVGB "pavgusb"
38
#elif defined ( HAVE_MMX2 )
39
#define PREFETCH "prefetchnta"
40
#define PREFETCHW "prefetcht0"
43
#define PREFETCH "/nop"
44
#define PREFETCHW "/nop"
48
/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
55
#define MOVNTQ "movntq"
56
#define SFENCE "sfence"
62
static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,unsigned src_size)
65
const uint8_t *s = src;
68
const uint8_t *mm_end;
72
__asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
74
__asm __volatile("movq %0, %%mm7"::"m"(mask32):"memory");
80
"punpckldq 3%1, %%mm0\n\t"
82
"punpckldq 9%1, %%mm1\n\t"
83
"movd 12%1, %%mm2\n\t"
84
"punpckldq 15%1, %%mm2\n\t"
85
"movd 18%1, %%mm3\n\t"
86
"punpckldq 21%1, %%mm3\n\t"
87
"pand %%mm7, %%mm0\n\t"
88
"pand %%mm7, %%mm1\n\t"
89
"pand %%mm7, %%mm2\n\t"
90
"pand %%mm7, %%mm3\n\t"
91
MOVNTQ" %%mm0, %0\n\t"
92
MOVNTQ" %%mm1, 8%0\n\t"
93
MOVNTQ" %%mm2, 16%0\n\t"
101
__asm __volatile(SFENCE:::"memory");
102
__asm __volatile(EMMS:::"memory");
106
#ifdef WORDS_BIGENDIAN
120
static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,unsigned src_size)
123
const uint8_t *s = src;
126
const uint8_t *mm_end;
130
__asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
137
"movq 8%1, %%mm1\n\t"
138
"movq 16%1, %%mm4\n\t"
139
"movq 24%1, %%mm5\n\t"
140
"movq %%mm0, %%mm2\n\t"
141
"movq %%mm1, %%mm3\n\t"
142
"movq %%mm4, %%mm6\n\t"
143
"movq %%mm5, %%mm7\n\t"
144
"psrlq $8, %%mm2\n\t"
145
"psrlq $8, %%mm3\n\t"
146
"psrlq $8, %%mm6\n\t"
147
"psrlq $8, %%mm7\n\t"
156
"por %%mm2, %%mm0\n\t"
157
"por %%mm3, %%mm1\n\t"
158
"por %%mm6, %%mm4\n\t"
159
"por %%mm7, %%mm5\n\t"
161
"movq %%mm1, %%mm2\n\t"
162
"movq %%mm4, %%mm3\n\t"
163
"psllq $48, %%mm2\n\t"
164
"psllq $32, %%mm3\n\t"
167
"por %%mm2, %%mm0\n\t"
168
"psrlq $16, %%mm1\n\t"
169
"psrlq $32, %%mm4\n\t"
170
"psllq $16, %%mm5\n\t"
171
"por %%mm3, %%mm1\n\t"
173
"por %%mm5, %%mm4\n\t"
175
MOVNTQ" %%mm0, %0\n\t"
176
MOVNTQ" %%mm1, 8%0\n\t"
179
:"m"(*s),"m"(mask24l),
180
"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
185
__asm __volatile(SFENCE:::"memory");
186
__asm __volatile(EMMS:::"memory");
190
#ifdef WORDS_BIGENDIAN
205
Original by Strepto/Astral
206
ported to gcc & bugfixed : A'rpi
207
MMX2, 3DNOW optimization by Nick Kurshev
208
32bit c version, and and&add trick by Michael Niedermayer
210
static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,unsigned src_size)
212
register const uint8_t* s=src;
213
register uint8_t* d=dst;
214
register const uint8_t *end;
215
const uint8_t *mm_end;
218
__asm __volatile(PREFETCH" %0"::"m"(*s));
219
__asm __volatile("movq %0, %%mm4"::"m"(mask15s));
226
"movq 8%1, %%mm2\n\t"
227
"movq %%mm0, %%mm1\n\t"
228
"movq %%mm2, %%mm3\n\t"
229
"pand %%mm4, %%mm0\n\t"
230
"pand %%mm4, %%mm2\n\t"
231
"paddw %%mm1, %%mm0\n\t"
232
"paddw %%mm3, %%mm2\n\t"
233
MOVNTQ" %%mm0, %0\n\t"
241
__asm __volatile(SFENCE:::"memory");
242
__asm __volatile(EMMS:::"memory");
247
register unsigned x= *((uint32_t *)s);
248
*((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
254
register unsigned short x= *((uint16_t *)s);
255
*((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
259
static inline void RENAME(rgb16to15)(const uint8_t *src,uint8_t *dst,unsigned src_size)
261
register const uint8_t* s=src;
262
register uint8_t* d=dst;
263
register const uint8_t *end;
264
const uint8_t *mm_end;
267
__asm __volatile(PREFETCH" %0"::"m"(*s));
268
__asm __volatile("movq %0, %%mm7"::"m"(mask15rg));
269
__asm __volatile("movq %0, %%mm6"::"m"(mask15b));
276
"movq 8%1, %%mm2\n\t"
277
"movq %%mm0, %%mm1\n\t"
278
"movq %%mm2, %%mm3\n\t"
279
"psrlq $1, %%mm0\n\t"
280
"psrlq $1, %%mm2\n\t"
281
"pand %%mm7, %%mm0\n\t"
282
"pand %%mm7, %%mm2\n\t"
283
"pand %%mm6, %%mm1\n\t"
284
"pand %%mm6, %%mm3\n\t"
285
"por %%mm1, %%mm0\n\t"
286
"por %%mm3, %%mm2\n\t"
287
MOVNTQ" %%mm0, %0\n\t"
295
__asm __volatile(SFENCE:::"memory");
296
__asm __volatile(EMMS:::"memory");
301
register uint32_t x= *((uint32_t *)s);
302
*((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
308
register uint16_t x= *((uint16_t *)s);
309
*((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
315
static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
317
const uint8_t *s = src;
320
const uint8_t *mm_end;
322
uint16_t *d = (uint16_t *)dst;
326
#if 1 //is faster only if multiplies are reasonable fast (FIXME figure out on which cpus this is faster, on Athlon its slightly faster)
328
"movq %3, %%mm5 \n\t"
329
"movq %4, %%mm6 \n\t"
330
"movq %5, %%mm7 \n\t"
333
PREFETCH" 32(%1) \n\t"
334
"movd (%1), %%mm0 \n\t"
335
"movd 4(%1), %%mm3 \n\t"
336
"punpckldq 8(%1), %%mm0 \n\t"
337
"punpckldq 12(%1), %%mm3 \n\t"
338
"movq %%mm0, %%mm1 \n\t"
339
"movq %%mm3, %%mm4 \n\t"
340
"pand %%mm6, %%mm0 \n\t"
341
"pand %%mm6, %%mm3 \n\t"
342
"pmaddwd %%mm7, %%mm0 \n\t"
343
"pmaddwd %%mm7, %%mm3 \n\t"
344
"pand %%mm5, %%mm1 \n\t"
345
"pand %%mm5, %%mm4 \n\t"
346
"por %%mm1, %%mm0 \n\t"
347
"por %%mm4, %%mm3 \n\t"
348
"psrld $5, %%mm0 \n\t"
349
"pslld $11, %%mm3 \n\t"
350
"por %%mm3, %%mm0 \n\t"
351
MOVNTQ" %%mm0, (%0) \n\t"
357
: "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
360
__asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
364
::"m"(red_16mask),"m"(green_16mask));
370
"movd 4%1, %%mm3\n\t"
371
"punpckldq 8%1, %%mm0\n\t"
372
"punpckldq 12%1, %%mm3\n\t"
373
"movq %%mm0, %%mm1\n\t"
374
"movq %%mm0, %%mm2\n\t"
375
"movq %%mm3, %%mm4\n\t"
376
"movq %%mm3, %%mm5\n\t"
377
"psrlq $3, %%mm0\n\t"
378
"psrlq $3, %%mm3\n\t"
381
"psrlq $5, %%mm1\n\t"
382
"psrlq $5, %%mm4\n\t"
383
"pand %%mm6, %%mm1\n\t"
384
"pand %%mm6, %%mm4\n\t"
385
"psrlq $8, %%mm2\n\t"
386
"psrlq $8, %%mm5\n\t"
387
"pand %%mm7, %%mm2\n\t"
388
"pand %%mm7, %%mm5\n\t"
389
"por %%mm1, %%mm0\n\t"
390
"por %%mm4, %%mm3\n\t"
391
"por %%mm2, %%mm0\n\t"
392
"por %%mm5, %%mm3\n\t"
393
"psllq $16, %%mm3\n\t"
394
"por %%mm3, %%mm0\n\t"
395
MOVNTQ" %%mm0, %0\n\t"
396
:"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
401
__asm __volatile(SFENCE:::"memory");
402
__asm __volatile(EMMS:::"memory");
406
register int rgb = *(uint32_t*)s; s += 4;
407
*d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
411
static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
413
const uint8_t *s = src;
416
const uint8_t *mm_end;
418
uint16_t *d = (uint16_t *)dst;
421
__asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
425
::"m"(red_16mask),"m"(green_16mask));
432
"movd 4%1, %%mm3\n\t"
433
"punpckldq 8%1, %%mm0\n\t"
434
"punpckldq 12%1, %%mm3\n\t"
435
"movq %%mm0, %%mm1\n\t"
436
"movq %%mm0, %%mm2\n\t"
437
"movq %%mm3, %%mm4\n\t"
438
"movq %%mm3, %%mm5\n\t"
439
"psllq $8, %%mm0\n\t"
440
"psllq $8, %%mm3\n\t"
441
"pand %%mm7, %%mm0\n\t"
442
"pand %%mm7, %%mm3\n\t"
443
"psrlq $5, %%mm1\n\t"
444
"psrlq $5, %%mm4\n\t"
445
"pand %%mm6, %%mm1\n\t"
446
"pand %%mm6, %%mm4\n\t"
447
"psrlq $19, %%mm2\n\t"
448
"psrlq $19, %%mm5\n\t"
451
"por %%mm1, %%mm0\n\t"
452
"por %%mm4, %%mm3\n\t"
453
"por %%mm2, %%mm0\n\t"
454
"por %%mm5, %%mm3\n\t"
455
"psllq $16, %%mm3\n\t"
456
"por %%mm3, %%mm0\n\t"
457
MOVNTQ" %%mm0, %0\n\t"
458
:"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
462
__asm __volatile(SFENCE:::"memory");
463
__asm __volatile(EMMS:::"memory");
467
// FIXME on bigendian
468
const int src= *s; s += 4;
469
*d++ = ((src&0xF8)<<8) + ((src&0xFC00)>>5) + ((src&0xF80000)>>19);
473
static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
475
const uint8_t *s = src;
478
const uint8_t *mm_end;
480
uint16_t *d = (uint16_t *)dst;
484
#if 1 //is faster only if multiplies are reasonable fast (FIXME figure out on which cpus this is faster, on Athlon its slightly faster)
486
"movq %3, %%mm5 \n\t"
487
"movq %4, %%mm6 \n\t"
488
"movq %5, %%mm7 \n\t"
491
PREFETCH" 32(%1) \n\t"
492
"movd (%1), %%mm0 \n\t"
493
"movd 4(%1), %%mm3 \n\t"
494
"punpckldq 8(%1), %%mm0 \n\t"
495
"punpckldq 12(%1), %%mm3 \n\t"
496
"movq %%mm0, %%mm1 \n\t"
497
"movq %%mm3, %%mm4 \n\t"
498
"pand %%mm6, %%mm0 \n\t"
499
"pand %%mm6, %%mm3 \n\t"
500
"pmaddwd %%mm7, %%mm0 \n\t"
501
"pmaddwd %%mm7, %%mm3 \n\t"
502
"pand %%mm5, %%mm1 \n\t"
503
"pand %%mm5, %%mm4 \n\t"
504
"por %%mm1, %%mm0 \n\t"
505
"por %%mm4, %%mm3 \n\t"
506
"psrld $6, %%mm0 \n\t"
507
"pslld $10, %%mm3 \n\t"
508
"por %%mm3, %%mm0 \n\t"
509
MOVNTQ" %%mm0, (%0) \n\t"
515
: "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
518
__asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
522
::"m"(red_15mask),"m"(green_15mask));
528
"movd 4%1, %%mm3\n\t"
529
"punpckldq 8%1, %%mm0\n\t"
530
"punpckldq 12%1, %%mm3\n\t"
531
"movq %%mm0, %%mm1\n\t"
532
"movq %%mm0, %%mm2\n\t"
533
"movq %%mm3, %%mm4\n\t"
534
"movq %%mm3, %%mm5\n\t"
535
"psrlq $3, %%mm0\n\t"
536
"psrlq $3, %%mm3\n\t"
539
"psrlq $6, %%mm1\n\t"
540
"psrlq $6, %%mm4\n\t"
541
"pand %%mm6, %%mm1\n\t"
542
"pand %%mm6, %%mm4\n\t"
543
"psrlq $9, %%mm2\n\t"
544
"psrlq $9, %%mm5\n\t"
545
"pand %%mm7, %%mm2\n\t"
546
"pand %%mm7, %%mm5\n\t"
547
"por %%mm1, %%mm0\n\t"
548
"por %%mm4, %%mm3\n\t"
549
"por %%mm2, %%mm0\n\t"
550
"por %%mm5, %%mm3\n\t"
551
"psllq $16, %%mm3\n\t"
552
"por %%mm3, %%mm0\n\t"
553
MOVNTQ" %%mm0, %0\n\t"
554
:"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
559
__asm __volatile(SFENCE:::"memory");
560
__asm __volatile(EMMS:::"memory");
564
// FIXME on bigendian
565
const int src= *s; s += 4;
566
*d++ = ((src&0xFF)>>3) + ((src&0xF800)>>6) + ((src&0xF80000)>>9);
570
static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
572
const uint8_t *s = src;
575
const uint8_t *mm_end;
577
uint16_t *d = (uint16_t *)dst;
580
__asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
584
::"m"(red_15mask),"m"(green_15mask));
591
"movd 4%1, %%mm3\n\t"
592
"punpckldq 8%1, %%mm0\n\t"
593
"punpckldq 12%1, %%mm3\n\t"
594
"movq %%mm0, %%mm1\n\t"
595
"movq %%mm0, %%mm2\n\t"
596
"movq %%mm3, %%mm4\n\t"
597
"movq %%mm3, %%mm5\n\t"
598
"psllq $7, %%mm0\n\t"
599
"psllq $7, %%mm3\n\t"
600
"pand %%mm7, %%mm0\n\t"
601
"pand %%mm7, %%mm3\n\t"
602
"psrlq $6, %%mm1\n\t"
603
"psrlq $6, %%mm4\n\t"
604
"pand %%mm6, %%mm1\n\t"
605
"pand %%mm6, %%mm4\n\t"
606
"psrlq $19, %%mm2\n\t"
607
"psrlq $19, %%mm5\n\t"
610
"por %%mm1, %%mm0\n\t"
611
"por %%mm4, %%mm3\n\t"
612
"por %%mm2, %%mm0\n\t"
613
"por %%mm5, %%mm3\n\t"
614
"psllq $16, %%mm3\n\t"
615
"por %%mm3, %%mm0\n\t"
616
MOVNTQ" %%mm0, %0\n\t"
617
:"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
621
__asm __volatile(SFENCE:::"memory");
622
__asm __volatile(EMMS:::"memory");
626
// FIXME on bigendian
627
const int src= *s; s += 4;
628
*d++ = ((src&0xF8)<<7) + ((src&0xF800)>>6) + ((src&0xF80000)>>19);
632
static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
634
const uint8_t *s = src;
637
const uint8_t *mm_end;
639
uint16_t *d = (uint16_t *)dst;
642
__asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
646
::"m"(red_16mask),"m"(green_16mask));
653
"movd 3%1, %%mm3\n\t"
654
"punpckldq 6%1, %%mm0\n\t"
655
"punpckldq 9%1, %%mm3\n\t"
656
"movq %%mm0, %%mm1\n\t"
657
"movq %%mm0, %%mm2\n\t"
658
"movq %%mm3, %%mm4\n\t"
659
"movq %%mm3, %%mm5\n\t"
660
"psrlq $3, %%mm0\n\t"
661
"psrlq $3, %%mm3\n\t"
664
"psrlq $5, %%mm1\n\t"
665
"psrlq $5, %%mm4\n\t"
666
"pand %%mm6, %%mm1\n\t"
667
"pand %%mm6, %%mm4\n\t"
668
"psrlq $8, %%mm2\n\t"
669
"psrlq $8, %%mm5\n\t"
670
"pand %%mm7, %%mm2\n\t"
671
"pand %%mm7, %%mm5\n\t"
672
"por %%mm1, %%mm0\n\t"
673
"por %%mm4, %%mm3\n\t"
674
"por %%mm2, %%mm0\n\t"
675
"por %%mm5, %%mm3\n\t"
676
"psllq $16, %%mm3\n\t"
677
"por %%mm3, %%mm0\n\t"
678
MOVNTQ" %%mm0, %0\n\t"
679
:"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
683
__asm __volatile(SFENCE:::"memory");
684
__asm __volatile(EMMS:::"memory");
691
*d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
695
static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
697
const uint8_t *s = src;
700
const uint8_t *mm_end;
702
uint16_t *d = (uint16_t *)dst;
705
__asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
709
::"m"(red_16mask),"m"(green_16mask));
716
"movd 3%1, %%mm3\n\t"
717
"punpckldq 6%1, %%mm0\n\t"
718
"punpckldq 9%1, %%mm3\n\t"
719
"movq %%mm0, %%mm1\n\t"
720
"movq %%mm0, %%mm2\n\t"
721
"movq %%mm3, %%mm4\n\t"
722
"movq %%mm3, %%mm5\n\t"
723
"psllq $8, %%mm0\n\t"
724
"psllq $8, %%mm3\n\t"
725
"pand %%mm7, %%mm0\n\t"
726
"pand %%mm7, %%mm3\n\t"
727
"psrlq $5, %%mm1\n\t"
728
"psrlq $5, %%mm4\n\t"
729
"pand %%mm6, %%mm1\n\t"
730
"pand %%mm6, %%mm4\n\t"
731
"psrlq $19, %%mm2\n\t"
732
"psrlq $19, %%mm5\n\t"
735
"por %%mm1, %%mm0\n\t"
736
"por %%mm4, %%mm3\n\t"
737
"por %%mm2, %%mm0\n\t"
738
"por %%mm5, %%mm3\n\t"
739
"psllq $16, %%mm3\n\t"
740
"por %%mm3, %%mm0\n\t"
741
MOVNTQ" %%mm0, %0\n\t"
742
:"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
746
__asm __volatile(SFENCE:::"memory");
747
__asm __volatile(EMMS:::"memory");
754
*d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
758
static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
760
const uint8_t *s = src;
763
const uint8_t *mm_end;
765
uint16_t *d = (uint16_t *)dst;
768
__asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
772
::"m"(red_15mask),"m"(green_15mask));
779
"movd 3%1, %%mm3\n\t"
780
"punpckldq 6%1, %%mm0\n\t"
781
"punpckldq 9%1, %%mm3\n\t"
782
"movq %%mm0, %%mm1\n\t"
783
"movq %%mm0, %%mm2\n\t"
784
"movq %%mm3, %%mm4\n\t"
785
"movq %%mm3, %%mm5\n\t"
786
"psrlq $3, %%mm0\n\t"
787
"psrlq $3, %%mm3\n\t"
790
"psrlq $6, %%mm1\n\t"
791
"psrlq $6, %%mm4\n\t"
792
"pand %%mm6, %%mm1\n\t"
793
"pand %%mm6, %%mm4\n\t"
794
"psrlq $9, %%mm2\n\t"
795
"psrlq $9, %%mm5\n\t"
796
"pand %%mm7, %%mm2\n\t"
797
"pand %%mm7, %%mm5\n\t"
798
"por %%mm1, %%mm0\n\t"
799
"por %%mm4, %%mm3\n\t"
800
"por %%mm2, %%mm0\n\t"
801
"por %%mm5, %%mm3\n\t"
802
"psllq $16, %%mm3\n\t"
803
"por %%mm3, %%mm0\n\t"
804
MOVNTQ" %%mm0, %0\n\t"
805
:"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
809
__asm __volatile(SFENCE:::"memory");
810
__asm __volatile(EMMS:::"memory");
817
*d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
821
static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
823
const uint8_t *s = src;
826
const uint8_t *mm_end;
828
uint16_t *d = (uint16_t *)dst;
831
__asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
835
::"m"(red_15mask),"m"(green_15mask));
842
"movd 3%1, %%mm3\n\t"
843
"punpckldq 6%1, %%mm0\n\t"
844
"punpckldq 9%1, %%mm3\n\t"
845
"movq %%mm0, %%mm1\n\t"
846
"movq %%mm0, %%mm2\n\t"
847
"movq %%mm3, %%mm4\n\t"
848
"movq %%mm3, %%mm5\n\t"
849
"psllq $7, %%mm0\n\t"
850
"psllq $7, %%mm3\n\t"
851
"pand %%mm7, %%mm0\n\t"
852
"pand %%mm7, %%mm3\n\t"
853
"psrlq $6, %%mm1\n\t"
854
"psrlq $6, %%mm4\n\t"
855
"pand %%mm6, %%mm1\n\t"
856
"pand %%mm6, %%mm4\n\t"
857
"psrlq $19, %%mm2\n\t"
858
"psrlq $19, %%mm5\n\t"
861
"por %%mm1, %%mm0\n\t"
862
"por %%mm4, %%mm3\n\t"
863
"por %%mm2, %%mm0\n\t"
864
"por %%mm5, %%mm3\n\t"
865
"psllq $16, %%mm3\n\t"
866
"por %%mm3, %%mm0\n\t"
867
MOVNTQ" %%mm0, %0\n\t"
868
:"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
872
__asm __volatile(SFENCE:::"memory");
873
__asm __volatile(EMMS:::"memory");
880
*d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
885
I use here a less accurate approximation by simply
886
left-shifting the input
887
value and filling the low order bits with
888
zeroes. This method improves png's
889
compression but this scheme cannot reproduce white exactly, since it does not
890
generate an all-ones maximum value; the net effect is to darken the
893
The better method should be "left bit replication":
903
| Leftmost Bits Repeated to Fill Open Bits
907
static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
911
const uint16_t *mm_end;
913
uint8_t *d = (uint8_t *)dst;
914
const uint16_t *s = (uint16_t *)src;
915
end = s + src_size/2;
917
__asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
929
"psllq $3, %%mm0\n\t"
930
"psrlq $2, %%mm1\n\t"
931
"psrlq $7, %%mm2\n\t"
932
"movq %%mm0, %%mm3\n\t"
933
"movq %%mm1, %%mm4\n\t"
934
"movq %%mm2, %%mm5\n\t"
935
"punpcklwd %5, %%mm0\n\t"
936
"punpcklwd %5, %%mm1\n\t"
937
"punpcklwd %5, %%mm2\n\t"
938
"punpckhwd %5, %%mm3\n\t"
939
"punpckhwd %5, %%mm4\n\t"
940
"punpckhwd %5, %%mm5\n\t"
941
"psllq $8, %%mm1\n\t"
942
"psllq $16, %%mm2\n\t"
943
"por %%mm1, %%mm0\n\t"
944
"por %%mm2, %%mm0\n\t"
945
"psllq $8, %%mm4\n\t"
946
"psllq $16, %%mm5\n\t"
947
"por %%mm4, %%mm3\n\t"
948
"por %%mm5, %%mm3\n\t"
950
"movq %%mm0, %%mm6\n\t"
951
"movq %%mm3, %%mm7\n\t"
953
"movq 8%1, %%mm0\n\t"
954
"movq 8%1, %%mm1\n\t"
955
"movq 8%1, %%mm2\n\t"
959
"psllq $3, %%mm0\n\t"
960
"psrlq $2, %%mm1\n\t"
961
"psrlq $7, %%mm2\n\t"
962
"movq %%mm0, %%mm3\n\t"
963
"movq %%mm1, %%mm4\n\t"
964
"movq %%mm2, %%mm5\n\t"
965
"punpcklwd %5, %%mm0\n\t"
966
"punpcklwd %5, %%mm1\n\t"
967
"punpcklwd %5, %%mm2\n\t"
968
"punpckhwd %5, %%mm3\n\t"
969
"punpckhwd %5, %%mm4\n\t"
970
"punpckhwd %5, %%mm5\n\t"
971
"psllq $8, %%mm1\n\t"
972
"psllq $16, %%mm2\n\t"
973
"por %%mm1, %%mm0\n\t"
974
"por %%mm2, %%mm0\n\t"
975
"psllq $8, %%mm4\n\t"
976
"psllq $16, %%mm5\n\t"
977
"por %%mm4, %%mm3\n\t"
978
"por %%mm5, %%mm3\n\t"
981
:"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
983
/* Borrowed 32 to 24 */
985
"movq %%mm0, %%mm4\n\t"
986
"movq %%mm3, %%mm5\n\t"
987
"movq %%mm6, %%mm0\n\t"
988
"movq %%mm7, %%mm1\n\t"
990
"movq %%mm4, %%mm6\n\t"
991
"movq %%mm5, %%mm7\n\t"
992
"movq %%mm0, %%mm2\n\t"
993
"movq %%mm1, %%mm3\n\t"
995
"psrlq $8, %%mm2\n\t"
996
"psrlq $8, %%mm3\n\t"
997
"psrlq $8, %%mm6\n\t"
998
"psrlq $8, %%mm7\n\t"
1000
"pand %2, %%mm1\n\t"
1001
"pand %2, %%mm4\n\t"
1002
"pand %2, %%mm5\n\t"
1003
"pand %3, %%mm2\n\t"
1004
"pand %3, %%mm3\n\t"
1005
"pand %3, %%mm6\n\t"
1006
"pand %3, %%mm7\n\t"
1007
"por %%mm2, %%mm0\n\t"
1008
"por %%mm3, %%mm1\n\t"
1009
"por %%mm6, %%mm4\n\t"
1010
"por %%mm7, %%mm5\n\t"
1012
"movq %%mm1, %%mm2\n\t"
1013
"movq %%mm4, %%mm3\n\t"
1014
"psllq $48, %%mm2\n\t"
1015
"psllq $32, %%mm3\n\t"
1016
"pand %4, %%mm2\n\t"
1017
"pand %5, %%mm3\n\t"
1018
"por %%mm2, %%mm0\n\t"
1019
"psrlq $16, %%mm1\n\t"
1020
"psrlq $32, %%mm4\n\t"
1021
"psllq $16, %%mm5\n\t"
1022
"por %%mm3, %%mm1\n\t"
1023
"pand %6, %%mm5\n\t"
1024
"por %%mm5, %%mm4\n\t"
1026
MOVNTQ" %%mm0, %0\n\t"
1027
MOVNTQ" %%mm1, 8%0\n\t"
1028
MOVNTQ" %%mm4, 16%0"
1031
:"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1036
__asm __volatile(SFENCE:::"memory");
1037
__asm __volatile(EMMS:::"memory");
1041
register uint16_t bgr;
1043
*d++ = (bgr&0x1F)<<3;
1044
*d++ = (bgr&0x3E0)>>2;
1045
*d++ = (bgr&0x7C00)>>7;
1049
static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
1051
const uint16_t *end;
1053
const uint16_t *mm_end;
1055
uint8_t *d = (uint8_t *)dst;
1056
const uint16_t *s = (const uint16_t *)src;
1057
end = s + src_size/2;
1059
__asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
1065
"movq %1, %%mm0\n\t"
1066
"movq %1, %%mm1\n\t"
1067
"movq %1, %%mm2\n\t"
1068
"pand %2, %%mm0\n\t"
1069
"pand %3, %%mm1\n\t"
1070
"pand %4, %%mm2\n\t"
1071
"psllq $3, %%mm0\n\t"
1072
"psrlq $3, %%mm1\n\t"
1073
"psrlq $8, %%mm2\n\t"
1074
"movq %%mm0, %%mm3\n\t"
1075
"movq %%mm1, %%mm4\n\t"
1076
"movq %%mm2, %%mm5\n\t"
1077
"punpcklwd %5, %%mm0\n\t"
1078
"punpcklwd %5, %%mm1\n\t"
1079
"punpcklwd %5, %%mm2\n\t"
1080
"punpckhwd %5, %%mm3\n\t"
1081
"punpckhwd %5, %%mm4\n\t"
1082
"punpckhwd %5, %%mm5\n\t"
1083
"psllq $8, %%mm1\n\t"
1084
"psllq $16, %%mm2\n\t"
1085
"por %%mm1, %%mm0\n\t"
1086
"por %%mm2, %%mm0\n\t"
1087
"psllq $8, %%mm4\n\t"
1088
"psllq $16, %%mm5\n\t"
1089
"por %%mm4, %%mm3\n\t"
1090
"por %%mm5, %%mm3\n\t"
1092
"movq %%mm0, %%mm6\n\t"
1093
"movq %%mm3, %%mm7\n\t"
1095
"movq 8%1, %%mm0\n\t"
1096
"movq 8%1, %%mm1\n\t"
1097
"movq 8%1, %%mm2\n\t"
1098
"pand %2, %%mm0\n\t"
1099
"pand %3, %%mm1\n\t"
1100
"pand %4, %%mm2\n\t"
1101
"psllq $3, %%mm0\n\t"
1102
"psrlq $3, %%mm1\n\t"
1103
"psrlq $8, %%mm2\n\t"
1104
"movq %%mm0, %%mm3\n\t"
1105
"movq %%mm1, %%mm4\n\t"
1106
"movq %%mm2, %%mm5\n\t"
1107
"punpcklwd %5, %%mm0\n\t"
1108
"punpcklwd %5, %%mm1\n\t"
1109
"punpcklwd %5, %%mm2\n\t"
1110
"punpckhwd %5, %%mm3\n\t"
1111
"punpckhwd %5, %%mm4\n\t"
1112
"punpckhwd %5, %%mm5\n\t"
1113
"psllq $8, %%mm1\n\t"
1114
"psllq $16, %%mm2\n\t"
1115
"por %%mm1, %%mm0\n\t"
1116
"por %%mm2, %%mm0\n\t"
1117
"psllq $8, %%mm4\n\t"
1118
"psllq $16, %%mm5\n\t"
1119
"por %%mm4, %%mm3\n\t"
1120
"por %%mm5, %%mm3\n\t"
1122
:"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
1124
/* Borrowed 32 to 24 */
1126
"movq %%mm0, %%mm4\n\t"
1127
"movq %%mm3, %%mm5\n\t"
1128
"movq %%mm6, %%mm0\n\t"
1129
"movq %%mm7, %%mm1\n\t"
1131
"movq %%mm4, %%mm6\n\t"
1132
"movq %%mm5, %%mm7\n\t"
1133
"movq %%mm0, %%mm2\n\t"
1134
"movq %%mm1, %%mm3\n\t"
1136
"psrlq $8, %%mm2\n\t"
1137
"psrlq $8, %%mm3\n\t"
1138
"psrlq $8, %%mm6\n\t"
1139
"psrlq $8, %%mm7\n\t"
1140
"pand %2, %%mm0\n\t"
1141
"pand %2, %%mm1\n\t"
1142
"pand %2, %%mm4\n\t"
1143
"pand %2, %%mm5\n\t"
1144
"pand %3, %%mm2\n\t"
1145
"pand %3, %%mm3\n\t"
1146
"pand %3, %%mm6\n\t"
1147
"pand %3, %%mm7\n\t"
1148
"por %%mm2, %%mm0\n\t"
1149
"por %%mm3, %%mm1\n\t"
1150
"por %%mm6, %%mm4\n\t"
1151
"por %%mm7, %%mm5\n\t"
1153
"movq %%mm1, %%mm2\n\t"
1154
"movq %%mm4, %%mm3\n\t"
1155
"psllq $48, %%mm2\n\t"
1156
"psllq $32, %%mm3\n\t"
1157
"pand %4, %%mm2\n\t"
1158
"pand %5, %%mm3\n\t"
1159
"por %%mm2, %%mm0\n\t"
1160
"psrlq $16, %%mm1\n\t"
1161
"psrlq $32, %%mm4\n\t"
1162
"psllq $16, %%mm5\n\t"
1163
"por %%mm3, %%mm1\n\t"
1164
"pand %6, %%mm5\n\t"
1165
"por %%mm5, %%mm4\n\t"
1167
MOVNTQ" %%mm0, %0\n\t"
1168
MOVNTQ" %%mm1, 8%0\n\t"
1169
MOVNTQ" %%mm4, 16%0"
1172
:"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1177
__asm __volatile(SFENCE:::"memory");
1178
__asm __volatile(EMMS:::"memory");
1182
register uint16_t bgr;
1184
*d++ = (bgr&0x1F)<<3;
1185
*d++ = (bgr&0x7E0)>>3;
1186
*d++ = (bgr&0xF800)>>8;
1190
static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
1192
const uint16_t *end;
1194
const uint16_t *mm_end;
1196
uint8_t *d = (uint8_t *)dst;
1197
const uint16_t *s = (const uint16_t *)src;
1198
end = s + src_size/2;
1200
__asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
1201
__asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory");
1207
"movq %1, %%mm0\n\t"
1208
"movq %1, %%mm1\n\t"
1209
"movq %1, %%mm2\n\t"
1210
"pand %2, %%mm0\n\t"
1211
"pand %3, %%mm1\n\t"
1212
"pand %4, %%mm2\n\t"
1213
"psllq $3, %%mm0\n\t"
1214
"psrlq $2, %%mm1\n\t"
1215
"psrlq $7, %%mm2\n\t"
1216
"movq %%mm0, %%mm3\n\t"
1217
"movq %%mm1, %%mm4\n\t"
1218
"movq %%mm2, %%mm5\n\t"
1219
"punpcklwd %%mm7, %%mm0\n\t"
1220
"punpcklwd %%mm7, %%mm1\n\t"
1221
"punpcklwd %%mm7, %%mm2\n\t"
1222
"punpckhwd %%mm7, %%mm3\n\t"
1223
"punpckhwd %%mm7, %%mm4\n\t"
1224
"punpckhwd %%mm7, %%mm5\n\t"
1225
"psllq $8, %%mm1\n\t"
1226
"psllq $16, %%mm2\n\t"
1227
"por %%mm1, %%mm0\n\t"
1228
"por %%mm2, %%mm0\n\t"
1229
"psllq $8, %%mm4\n\t"
1230
"psllq $16, %%mm5\n\t"
1231
"por %%mm4, %%mm3\n\t"
1232
"por %%mm5, %%mm3\n\t"
1233
MOVNTQ" %%mm0, %0\n\t"
1234
MOVNTQ" %%mm3, 8%0\n\t"
1236
:"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
1241
__asm __volatile(SFENCE:::"memory");
1242
__asm __volatile(EMMS:::"memory");
1246
#if 0 //slightly slower on athlon
1248
*((uint32_t*)d)++ = ((bgr&0x1F)<<3) + ((bgr&0x3E0)<<6) + ((bgr&0x7C00)<<9);
1250
//FIXME this is very likely wrong for bigendian (and the following converters too)
1251
register uint16_t bgr;
1253
#ifdef WORDS_BIGENDIAN
1255
*d++ = (bgr&0x1F)<<3;
1256
*d++ = (bgr&0x3E0)>>2;
1257
*d++ = (bgr&0x7C00)>>7;
1259
*d++ = (bgr&0x1F)<<3;
1260
*d++ = (bgr&0x3E0)>>2;
1261
*d++ = (bgr&0x7C00)>>7;
1269
static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
1271
const uint16_t *end;
1273
const uint16_t *mm_end;
1275
uint8_t *d = (uint8_t *)dst;
1276
const uint16_t *s = (uint16_t *)src;
1277
end = s + src_size/2;
1279
__asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
1280
__asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory");
1286
"movq %1, %%mm0\n\t"
1287
"movq %1, %%mm1\n\t"
1288
"movq %1, %%mm2\n\t"
1289
"pand %2, %%mm0\n\t"
1290
"pand %3, %%mm1\n\t"
1291
"pand %4, %%mm2\n\t"
1292
"psllq $3, %%mm0\n\t"
1293
"psrlq $3, %%mm1\n\t"
1294
"psrlq $8, %%mm2\n\t"
1295
"movq %%mm0, %%mm3\n\t"
1296
"movq %%mm1, %%mm4\n\t"
1297
"movq %%mm2, %%mm5\n\t"
1298
"punpcklwd %%mm7, %%mm0\n\t"
1299
"punpcklwd %%mm7, %%mm1\n\t"
1300
"punpcklwd %%mm7, %%mm2\n\t"
1301
"punpckhwd %%mm7, %%mm3\n\t"
1302
"punpckhwd %%mm7, %%mm4\n\t"
1303
"punpckhwd %%mm7, %%mm5\n\t"
1304
"psllq $8, %%mm1\n\t"
1305
"psllq $16, %%mm2\n\t"
1306
"por %%mm1, %%mm0\n\t"
1307
"por %%mm2, %%mm0\n\t"
1308
"psllq $8, %%mm4\n\t"
1309
"psllq $16, %%mm5\n\t"
1310
"por %%mm4, %%mm3\n\t"
1311
"por %%mm5, %%mm3\n\t"
1312
MOVNTQ" %%mm0, %0\n\t"
1313
MOVNTQ" %%mm3, 8%0\n\t"
1315
:"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
1320
__asm __volatile(SFENCE:::"memory");
1321
__asm __volatile(EMMS:::"memory");
1325
register uint16_t bgr;
1327
#ifdef WORDS_BIGENDIAN
1329
*d++ = (bgr&0x1F)<<3;
1330
*d++ = (bgr&0x7E0)>>3;
1331
*d++ = (bgr&0xF800)>>8;
1333
*d++ = (bgr&0x1F)<<3;
1334
*d++ = (bgr&0x7E0)>>3;
1335
*d++ = (bgr&0xF800)>>8;
1341
static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
1344
/* TODO: unroll this loop */
1346
"xor %%"REG_a", %%"REG_a" \n\t"
1349
PREFETCH" 32(%0, %%"REG_a") \n\t"
1350
"movq (%0, %%"REG_a"), %%mm0 \n\t"
1351
"movq %%mm0, %%mm1 \n\t"
1352
"movq %%mm0, %%mm2 \n\t"
1353
"pslld $16, %%mm0 \n\t"
1354
"psrld $16, %%mm1 \n\t"
1355
"pand "MANGLE(mask32r)", %%mm0 \n\t"
1356
"pand "MANGLE(mask32g)", %%mm2 \n\t"
1357
"pand "MANGLE(mask32b)", %%mm1 \n\t"
1358
"por %%mm0, %%mm2 \n\t"
1359
"por %%mm1, %%mm2 \n\t"
1360
MOVNTQ" %%mm2, (%1, %%"REG_a") \n\t"
1361
"add $8, %%"REG_a" \n\t"
1362
"cmp %2, %%"REG_a" \n\t"
1364
:: "r" (src), "r"(dst), "r" ((long)src_size-7)
1368
__asm __volatile(SFENCE:::"memory");
1369
__asm __volatile(EMMS:::"memory");
1372
unsigned num_pixels = src_size >> 2;
1373
for(i=0; i<num_pixels; i++)
1375
#ifdef WORDS_BIGENDIAN
1376
dst[4*i + 1] = src[4*i + 3];
1377
dst[4*i + 2] = src[4*i + 2];
1378
dst[4*i + 3] = src[4*i + 1];
1380
dst[4*i + 0] = src[4*i + 2];
1381
dst[4*i + 1] = src[4*i + 1];
1382
dst[4*i + 2] = src[4*i + 0];
1388
static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
1392
long mmx_size= 23 - src_size;
1394
"movq "MANGLE(mask24r)", %%mm5 \n\t"
1395
"movq "MANGLE(mask24g)", %%mm6 \n\t"
1396
"movq "MANGLE(mask24b)", %%mm7 \n\t"
1399
PREFETCH" 32(%1, %%"REG_a") \n\t"
1400
"movq (%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
1401
"movq (%1, %%"REG_a"), %%mm1 \n\t" // BGR BGR BG
1402
"movq 2(%1, %%"REG_a"), %%mm2 \n\t" // R BGR BGR B
1403
"psllq $16, %%mm0 \n\t" // 00 BGR BGR
1404
"pand %%mm5, %%mm0 \n\t"
1405
"pand %%mm6, %%mm1 \n\t"
1406
"pand %%mm7, %%mm2 \n\t"
1407
"por %%mm0, %%mm1 \n\t"
1408
"por %%mm2, %%mm1 \n\t"
1409
"movq 6(%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
1410
MOVNTQ" %%mm1, (%2, %%"REG_a")\n\t" // RGB RGB RG
1411
"movq 8(%1, %%"REG_a"), %%mm1 \n\t" // R BGR BGR B
1412
"movq 10(%1, %%"REG_a"), %%mm2 \n\t" // GR BGR BGR
1413
"pand %%mm7, %%mm0 \n\t"
1414
"pand %%mm5, %%mm1 \n\t"
1415
"pand %%mm6, %%mm2 \n\t"
1416
"por %%mm0, %%mm1 \n\t"
1417
"por %%mm2, %%mm1 \n\t"
1418
"movq 14(%1, %%"REG_a"), %%mm0 \n\t" // R BGR BGR B
1419
MOVNTQ" %%mm1, 8(%2, %%"REG_a")\n\t" // B RGB RGB R
1420
"movq 16(%1, %%"REG_a"), %%mm1 \n\t" // GR BGR BGR
1421
"movq 18(%1, %%"REG_a"), %%mm2 \n\t" // BGR BGR BG
1422
"pand %%mm6, %%mm0 \n\t"
1423
"pand %%mm7, %%mm1 \n\t"
1424
"pand %%mm5, %%mm2 \n\t"
1425
"por %%mm0, %%mm1 \n\t"
1426
"por %%mm2, %%mm1 \n\t"
1427
MOVNTQ" %%mm1, 16(%2, %%"REG_a")\n\t"
1428
"add $24, %%"REG_a" \n\t"
1431
: "r" (src-mmx_size), "r"(dst-mmx_size)
1434
__asm __volatile(SFENCE:::"memory");
1435
__asm __volatile(EMMS:::"memory");
1437
if(mmx_size==23) return; //finihsed, was multiple of 8
1441
src_size= 23-mmx_size;
1445
for(i=0; i<src_size; i+=3)
1449
dst[i + 1] = src[i + 1];
1450
dst[i + 2] = src[i + 0];
1455
static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1456
unsigned int width, unsigned int height,
1457
int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
1460
const unsigned chromWidth= width>>1;
1461
for(y=0; y<height; y++)
1464
//FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
1466
"xor %%"REG_a", %%"REG_a" \n\t"
1469
PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
1470
PREFETCH" 32(%2, %%"REG_a") \n\t"
1471
PREFETCH" 32(%3, %%"REG_a") \n\t"
1472
"movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
1473
"movq %%mm0, %%mm2 \n\t" // U(0)
1474
"movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
1475
"punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1476
"punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1478
"movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
1479
"movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
1480
"movq %%mm3, %%mm4 \n\t" // Y(0)
1481
"movq %%mm5, %%mm6 \n\t" // Y(8)
1482
"punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
1483
"punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
1484
"punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
1485
"punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)
1487
MOVNTQ" %%mm3, (%0, %%"REG_a", 4)\n\t"
1488
MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4)\n\t"
1489
MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4)\n\t"
1490
MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4)\n\t"
1492
"add $8, %%"REG_a" \n\t"
1493
"cmp %4, %%"REG_a" \n\t"
1495
::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" ((long)chromWidth)
1500
#if defined ARCH_ALPHA && defined HAVE_MVI
1501
#define pl2yuy2(n) \
1506
asm("unpkbw %1, %0" : "=r"(y1) : "r"(y1)); \
1507
asm("unpkbw %1, %0" : "=r"(y2) : "r"(y2)); \
1508
asm("unpkbl %1, %0" : "=r"(u) : "r"(u)); \
1509
asm("unpkbl %1, %0" : "=r"(v) : "r"(v)); \
1510
yuv1 = (u << 8) + (v << 24); \
1517
uint64_t *qdst = (uint64_t *) dst;
1518
uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
1519
const uint32_t *yc = (uint32_t *) ysrc;
1520
const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
1521
const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
1522
for(i = 0; i < chromWidth; i += 8){
1523
uint64_t y1, y2, yuv1, yuv2;
1526
asm("ldq $31,64(%0)" :: "r"(yc));
1527
asm("ldq $31,64(%0)" :: "r"(yc2));
1528
asm("ldq $31,64(%0)" :: "r"(uc));
1529
asm("ldq $31,64(%0)" :: "r"(vc));
1547
#elif __WORDSIZE >= 64
1549
uint64_t *ldst = (uint64_t *) dst;
1550
const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1551
for(i = 0; i < chromWidth; i += 2){
1553
k = yc[0] + (uc[0] << 8) +
1554
(yc[1] << 16) + (vc[0] << 24);
1555
l = yc[2] + (uc[1] << 8) +
1556
(yc[3] << 16) + (vc[1] << 24);
1557
*ldst++ = k + (l << 32);
1564
int i, *idst = (int32_t *) dst;
1565
const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1566
for(i = 0; i < chromWidth; i++){
1567
#ifdef WORDS_BIGENDIAN
1568
*idst++ = (yc[0] << 24)+ (uc[0] << 16) +
1569
(yc[1] << 8) + (vc[0] << 0);
1571
*idst++ = yc[0] + (uc[0] << 8) +
1572
(yc[1] << 16) + (vc[0] << 24);
1580
if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
1582
usrc += chromStride;
1583
vsrc += chromStride;
1597
* height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1598
 * problem for anyone then tell me, and I'll fix it)
1600
static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1601
unsigned int width, unsigned int height,
1602
int lumStride, int chromStride, int dstStride)
1604
//FIXME interpolate chroma
1605
RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1608
static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1609
unsigned int width, unsigned int height,
1610
int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
1613
const unsigned chromWidth= width>>1;
1614
for(y=0; y<height; y++)
1617
//FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
1619
"xor %%"REG_a", %%"REG_a" \n\t"
1622
PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
1623
PREFETCH" 32(%2, %%"REG_a") \n\t"
1624
PREFETCH" 32(%3, %%"REG_a") \n\t"
1625
"movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
1626
"movq %%mm0, %%mm2 \n\t" // U(0)
1627
"movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
1628
"punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1629
"punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1631
"movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
1632
"movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
1633
"movq %%mm0, %%mm4 \n\t" // Y(0)
1634
"movq %%mm2, %%mm6 \n\t" // Y(8)
1635
"punpcklbw %%mm3, %%mm0 \n\t" // YUYV YUYV(0)
1636
"punpckhbw %%mm3, %%mm4 \n\t" // YUYV YUYV(4)
1637
"punpcklbw %%mm5, %%mm2 \n\t" // YUYV YUYV(8)
1638
"punpckhbw %%mm5, %%mm6 \n\t" // YUYV YUYV(12)
1640
MOVNTQ" %%mm0, (%0, %%"REG_a", 4)\n\t"
1641
MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4)\n\t"
1642
MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4)\n\t"
1643
MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4)\n\t"
1645
"add $8, %%"REG_a" \n\t"
1646
"cmp %4, %%"REG_a" \n\t"
1648
::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" ((long)chromWidth)
1652
//FIXME adapt the alpha asm code from yv12->yuy2
1654
#if __WORDSIZE >= 64
1656
uint64_t *ldst = (uint64_t *) dst;
1657
const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1658
for(i = 0; i < chromWidth; i += 2){
1660
k = uc[0] + (yc[0] << 8) +
1661
(vc[0] << 16) + (yc[1] << 24);
1662
l = uc[1] + (yc[2] << 8) +
1663
(vc[1] << 16) + (yc[3] << 24);
1664
*ldst++ = k + (l << 32);
1671
int i, *idst = (int32_t *) dst;
1672
const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1673
for(i = 0; i < chromWidth; i++){
1674
#ifdef WORDS_BIGENDIAN
1675
*idst++ = (uc[0] << 24)+ (yc[0] << 16) +
1676
(vc[0] << 8) + (yc[1] << 0);
1678
*idst++ = uc[0] + (yc[0] << 8) +
1679
(vc[0] << 16) + (yc[1] << 24);
1687
if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
1689
usrc += chromStride;
1690
vsrc += chromStride;
1704
* height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1705
* problem for anyone then tell me, and ill fix it)
1707
static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1708
unsigned int width, unsigned int height,
1709
int lumStride, int chromStride, int dstStride)
1711
//FIXME interpolate chroma
1712
RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1717
* width should be a multiple of 16
1719
static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1720
unsigned int width, unsigned int height,
1721
int lumStride, int chromStride, int dstStride)
1723
RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1728
* height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1729
* problem for anyone then tell me, and ill fix it)
1731
static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1732
unsigned int width, unsigned int height,
1733
int lumStride, int chromStride, int srcStride)
1736
const unsigned chromWidth= width>>1;
1737
for(y=0; y<height; y+=2)
1741
"xor %%"REG_a", %%"REG_a" \n\t"
1742
"pcmpeqw %%mm7, %%mm7 \n\t"
1743
"psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1746
PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1747
"movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1748
"movq 8(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(4)
1749
"movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
1750
"movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
1751
"psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
1752
"psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
1753
"pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
1754
"pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
1755
"packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1756
"packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
1758
MOVNTQ" %%mm2, (%1, %%"REG_a", 2)\n\t"
1760
"movq 16(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(8)
1761
"movq 24(%0, %%"REG_a", 4), %%mm2\n\t" // YUYV YUYV(12)
1762
"movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
1763
"movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
1764
"psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
1765
"psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
1766
"pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
1767
"pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
1768
"packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
1769
"packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
1771
MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2)\n\t"
1773
"movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
1774
"movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
1775
"psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
1776
"psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
1777
"pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
1778
"pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
1779
"packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
1780
"packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
1782
MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
1783
MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
1785
"add $8, %%"REG_a" \n\t"
1786
"cmp %4, %%"REG_a" \n\t"
1788
::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" ((long)chromWidth)
1789
: "memory", "%"REG_a
1796
"xor %%"REG_a", %%"REG_a" \n\t"
1799
PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1800
"movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1801
"movq 8(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(4)
1802
"movq 16(%0, %%"REG_a", 4), %%mm2\n\t" // YUYV YUYV(8)
1803
"movq 24(%0, %%"REG_a", 4), %%mm3\n\t" // YUYV YUYV(12)
1804
"pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
1805
"pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
1806
"pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
1807
"pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
1808
"packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
1809
"packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
1811
MOVNTQ" %%mm0, (%1, %%"REG_a", 2)\n\t"
1812
MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2)\n\t"
1814
"add $8, %%"REG_a" \n\t"
1815
"cmp %4, %%"REG_a" \n\t"
1818
::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" ((long)chromWidth)
1819
: "memory", "%"REG_a
1823
for(i=0; i<chromWidth; i++)
1825
ydst[2*i+0] = src[4*i+0];
1826
udst[i] = src[4*i+1];
1827
ydst[2*i+1] = src[4*i+2];
1828
vdst[i] = src[4*i+3];
1833
for(i=0; i<chromWidth; i++)
1835
ydst[2*i+0] = src[4*i+0];
1836
ydst[2*i+1] = src[4*i+2];
1839
udst += chromStride;
1840
vdst += chromStride;
1845
asm volatile( EMMS" \n\t"
1851
static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
1852
uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1853
unsigned int width, unsigned int height, int lumStride, int chromStride)
1856
memcpy(ydst, ysrc, width*height);
1858
/* XXX: implement upscaling for U,V */
1861
static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, int srcWidth, int srcHeight, int srcStride, int dstStride)
1868
for(x=0; x<srcWidth-1; x++){
1869
dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1870
dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1872
dst[2*srcWidth-1]= src[srcWidth-1];
1876
for(y=1; y<srcHeight; y++){
1877
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1878
const long mmxSize= srcWidth&~15;
1880
"mov %4, %%"REG_a" \n\t"
1882
"movq (%0, %%"REG_a"), %%mm0 \n\t"
1883
"movq (%1, %%"REG_a"), %%mm1 \n\t"
1884
"movq 1(%0, %%"REG_a"), %%mm2 \n\t"
1885
"movq 1(%1, %%"REG_a"), %%mm3 \n\t"
1886
"movq -1(%0, %%"REG_a"), %%mm4 \n\t"
1887
"movq -1(%1, %%"REG_a"), %%mm5 \n\t"
1888
PAVGB" %%mm0, %%mm5 \n\t"
1889
PAVGB" %%mm0, %%mm3 \n\t"
1890
PAVGB" %%mm0, %%mm5 \n\t"
1891
PAVGB" %%mm0, %%mm3 \n\t"
1892
PAVGB" %%mm1, %%mm4 \n\t"
1893
PAVGB" %%mm1, %%mm2 \n\t"
1894
PAVGB" %%mm1, %%mm4 \n\t"
1895
PAVGB" %%mm1, %%mm2 \n\t"
1896
"movq %%mm5, %%mm7 \n\t"
1897
"movq %%mm4, %%mm6 \n\t"
1898
"punpcklbw %%mm3, %%mm5 \n\t"
1899
"punpckhbw %%mm3, %%mm7 \n\t"
1900
"punpcklbw %%mm2, %%mm4 \n\t"
1901
"punpckhbw %%mm2, %%mm6 \n\t"
1903
MOVNTQ" %%mm5, (%2, %%"REG_a", 2)\n\t"
1904
MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2)\n\t"
1905
MOVNTQ" %%mm4, (%3, %%"REG_a", 2)\n\t"
1906
MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2)\n\t"
1908
"movq %%mm5, (%2, %%"REG_a", 2) \n\t"
1909
"movq %%mm7, 8(%2, %%"REG_a", 2)\n\t"
1910
"movq %%mm4, (%3, %%"REG_a", 2) \n\t"
1911
"movq %%mm6, 8(%3, %%"REG_a", 2)\n\t"
1913
"add $8, %%"REG_a" \n\t"
1915
:: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ),
1916
"r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
1922
const int mmxSize=1;
1924
dst[0 ]= (3*src[0] + src[srcStride])>>2;
1925
dst[dstStride]= ( src[0] + 3*src[srcStride])>>2;
1927
for(x=mmxSize-1; x<srcWidth-1; x++){
1928
dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2;
1929
dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2;
1930
dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2;
1931
dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2;
1933
dst[srcWidth*2 -1 ]= (3*src[srcWidth-1] + src[srcWidth-1 + srcStride])>>2;
1934
dst[srcWidth*2 -1 + dstStride]= ( src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
1944
for(x=0; x<srcWidth-1; x++){
1945
dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1946
dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1948
dst[2*srcWidth-1]= src[srcWidth-1];
1950
for(x=0; x<srcWidth; x++){
1957
asm volatile( EMMS" \n\t"
1965
* height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1966
* problem for anyone then tell me, and ill fix it)
1967
* chrominance data is only taken from every secound line others are ignored FIXME write HQ version
1969
static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1970
unsigned int width, unsigned int height,
1971
int lumStride, int chromStride, int srcStride)
1974
const unsigned chromWidth= width>>1;
1975
for(y=0; y<height; y+=2)
1979
"xorl %%eax, %%eax \n\t"
1980
"pcmpeqw %%mm7, %%mm7 \n\t"
1981
"psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1984
PREFETCH" 64(%0, %%eax, 4) \n\t"
1985
"movq (%0, %%eax, 4), %%mm0 \n\t" // UYVY UYVY(0)
1986
"movq 8(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(4)
1987
"movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
1988
"movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
1989
"pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
1990
"pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
1991
"psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
1992
"psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
1993
"packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1994
"packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
1996
MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t"
1998
"movq 16(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(8)
1999
"movq 24(%0, %%eax, 4), %%mm2 \n\t" // UYVY UYVY(12)
2000
"movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
2001
"movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
2002
"pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
2003
"pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
2004
"psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
2005
"psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
2006
"packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
2007
"packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
2009
MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t"
2011
"movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
2012
"movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
2013
"psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
2014
"psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
2015
"pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
2016
"pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
2017
"packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
2018
"packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
2020
MOVNTQ" %%mm0, (%3, %%eax) \n\t"
2021
MOVNTQ" %%mm2, (%2, %%eax) \n\t"
2023
"addl $8, %%eax \n\t"
2024
"cmpl %4, %%eax \n\t"
2026
::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
2034
"xorl %%eax, %%eax \n\t"
2037
PREFETCH" 64(%0, %%eax, 4) \n\t"
2038
"movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0)
2039
"movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4)
2040
"movq 16(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(8)
2041
"movq 24(%0, %%eax, 4), %%mm3 \n\t" // YUYV YUYV(12)
2042
"psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
2043
"psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
2044
"psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
2045
"psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
2046
"packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
2047
"packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
2049
MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t"
2050
MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t"
2052
"addl $8, %%eax \n\t"
2053
"cmpl %4, %%eax \n\t"
2056
::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
2061
for(i=0; i<chromWidth; i++)
2063
udst[i] = src[4*i+0];
2064
ydst[2*i+0] = src[4*i+1];
2065
vdst[i] = src[4*i+2];
2066
ydst[2*i+1] = src[4*i+3];
2071
for(i=0; i<chromWidth; i++)
2073
ydst[2*i+0] = src[4*i+1];
2074
ydst[2*i+1] = src[4*i+3];
2077
udst += chromStride;
2078
vdst += chromStride;
2083
asm volatile( EMMS" \n\t"
2091
* height should be a multiple of 2 and width should be a multiple of 2 (if this is a
2092
* problem for anyone then tell me, and ill fix it)
2093
* chrominance data is only taken from every secound line others are ignored in the C version FIXME write HQ version
2095
static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2096
unsigned int width, unsigned int height,
2097
int lumStride, int chromStride, int srcStride)
2100
const unsigned chromWidth= width>>1;
2102
for(y=0; y<height-2; y+=2)
2108
"mov %2, %%"REG_a" \n\t"
2109
"movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t"
2110
"movq "MANGLE(w1111)", %%mm5 \n\t"
2111
"pxor %%mm7, %%mm7 \n\t"
2112
"lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t"
2115
PREFETCH" 64(%0, %%"REG_b") \n\t"
2116
"movd (%0, %%"REG_b"), %%mm0 \n\t"
2117
"movd 3(%0, %%"REG_b"), %%mm1 \n\t"
2118
"punpcklbw %%mm7, %%mm0 \n\t"
2119
"punpcklbw %%mm7, %%mm1 \n\t"
2120
"movd 6(%0, %%"REG_b"), %%mm2 \n\t"
2121
"movd 9(%0, %%"REG_b"), %%mm3 \n\t"
2122
"punpcklbw %%mm7, %%mm2 \n\t"
2123
"punpcklbw %%mm7, %%mm3 \n\t"
2124
"pmaddwd %%mm6, %%mm0 \n\t"
2125
"pmaddwd %%mm6, %%mm1 \n\t"
2126
"pmaddwd %%mm6, %%mm2 \n\t"
2127
"pmaddwd %%mm6, %%mm3 \n\t"
2128
#ifndef FAST_BGR2YV12
2129
"psrad $8, %%mm0 \n\t"
2130
"psrad $8, %%mm1 \n\t"
2131
"psrad $8, %%mm2 \n\t"
2132
"psrad $8, %%mm3 \n\t"
2134
"packssdw %%mm1, %%mm0 \n\t"
2135
"packssdw %%mm3, %%mm2 \n\t"
2136
"pmaddwd %%mm5, %%mm0 \n\t"
2137
"pmaddwd %%mm5, %%mm2 \n\t"
2138
"packssdw %%mm2, %%mm0 \n\t"
2139
"psraw $7, %%mm0 \n\t"
2141
"movd 12(%0, %%"REG_b"), %%mm4 \n\t"
2142
"movd 15(%0, %%"REG_b"), %%mm1 \n\t"
2143
"punpcklbw %%mm7, %%mm4 \n\t"
2144
"punpcklbw %%mm7, %%mm1 \n\t"
2145
"movd 18(%0, %%"REG_b"), %%mm2 \n\t"
2146
"movd 21(%0, %%"REG_b"), %%mm3 \n\t"
2147
"punpcklbw %%mm7, %%mm2 \n\t"
2148
"punpcklbw %%mm7, %%mm3 \n\t"
2149
"pmaddwd %%mm6, %%mm4 \n\t"
2150
"pmaddwd %%mm6, %%mm1 \n\t"
2151
"pmaddwd %%mm6, %%mm2 \n\t"
2152
"pmaddwd %%mm6, %%mm3 \n\t"
2153
#ifndef FAST_BGR2YV12
2154
"psrad $8, %%mm4 \n\t"
2155
"psrad $8, %%mm1 \n\t"
2156
"psrad $8, %%mm2 \n\t"
2157
"psrad $8, %%mm3 \n\t"
2159
"packssdw %%mm1, %%mm4 \n\t"
2160
"packssdw %%mm3, %%mm2 \n\t"
2161
"pmaddwd %%mm5, %%mm4 \n\t"
2162
"pmaddwd %%mm5, %%mm2 \n\t"
2163
"add $24, %%"REG_b" \n\t"
2164
"packssdw %%mm2, %%mm4 \n\t"
2165
"psraw $7, %%mm4 \n\t"
2167
"packuswb %%mm4, %%mm0 \n\t"
2168
"paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t"
2170
MOVNTQ" %%mm0, (%1, %%"REG_a") \n\t"
2171
"add $8, %%"REG_a" \n\t"
2173
: : "r" (src+width*3), "r" (ydst+width), "g" ((long)-width)
2174
: "%"REG_a, "%"REG_b
2181
"mov %4, %%"REG_a" \n\t"
2182
"movq "MANGLE(w1111)", %%mm5 \n\t"
2183
"movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t"
2184
"pxor %%mm7, %%mm7 \n\t"
2185
"lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t"
2186
"add %%"REG_b", %%"REG_b" \n\t"
2189
PREFETCH" 64(%0, %%"REG_b") \n\t"
2190
PREFETCH" 64(%1, %%"REG_b") \n\t"
2191
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2192
"movq (%0, %%"REG_b"), %%mm0 \n\t"
2193
"movq (%1, %%"REG_b"), %%mm1 \n\t"
2194
"movq 6(%0, %%"REG_b"), %%mm2 \n\t"
2195
"movq 6(%1, %%"REG_b"), %%mm3 \n\t"
2196
PAVGB" %%mm1, %%mm0 \n\t"
2197
PAVGB" %%mm3, %%mm2 \n\t"
2198
"movq %%mm0, %%mm1 \n\t"
2199
"movq %%mm2, %%mm3 \n\t"
2200
"psrlq $24, %%mm0 \n\t"
2201
"psrlq $24, %%mm2 \n\t"
2202
PAVGB" %%mm1, %%mm0 \n\t"
2203
PAVGB" %%mm3, %%mm2 \n\t"
2204
"punpcklbw %%mm7, %%mm0 \n\t"
2205
"punpcklbw %%mm7, %%mm2 \n\t"
2207
"movd (%0, %%"REG_b"), %%mm0 \n\t"
2208
"movd (%1, %%"REG_b"), %%mm1 \n\t"
2209
"movd 3(%0, %%"REG_b"), %%mm2 \n\t"
2210
"movd 3(%1, %%"REG_b"), %%mm3 \n\t"
2211
"punpcklbw %%mm7, %%mm0 \n\t"
2212
"punpcklbw %%mm7, %%mm1 \n\t"
2213
"punpcklbw %%mm7, %%mm2 \n\t"
2214
"punpcklbw %%mm7, %%mm3 \n\t"
2215
"paddw %%mm1, %%mm0 \n\t"
2216
"paddw %%mm3, %%mm2 \n\t"
2217
"paddw %%mm2, %%mm0 \n\t"
2218
"movd 6(%0, %%"REG_b"), %%mm4 \n\t"
2219
"movd 6(%1, %%"REG_b"), %%mm1 \n\t"
2220
"movd 9(%0, %%"REG_b"), %%mm2 \n\t"
2221
"movd 9(%1, %%"REG_b"), %%mm3 \n\t"
2222
"punpcklbw %%mm7, %%mm4 \n\t"
2223
"punpcklbw %%mm7, %%mm1 \n\t"
2224
"punpcklbw %%mm7, %%mm2 \n\t"
2225
"punpcklbw %%mm7, %%mm3 \n\t"
2226
"paddw %%mm1, %%mm4 \n\t"
2227
"paddw %%mm3, %%mm2 \n\t"
2228
"paddw %%mm4, %%mm2 \n\t"
2229
"psrlw $2, %%mm0 \n\t"
2230
"psrlw $2, %%mm2 \n\t"
2232
"movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
2233
"movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
2235
"pmaddwd %%mm0, %%mm1 \n\t"
2236
"pmaddwd %%mm2, %%mm3 \n\t"
2237
"pmaddwd %%mm6, %%mm0 \n\t"
2238
"pmaddwd %%mm6, %%mm2 \n\t"
2239
#ifndef FAST_BGR2YV12
2240
"psrad $8, %%mm0 \n\t"
2241
"psrad $8, %%mm1 \n\t"
2242
"psrad $8, %%mm2 \n\t"
2243
"psrad $8, %%mm3 \n\t"
2245
"packssdw %%mm2, %%mm0 \n\t"
2246
"packssdw %%mm3, %%mm1 \n\t"
2247
"pmaddwd %%mm5, %%mm0 \n\t"
2248
"pmaddwd %%mm5, %%mm1 \n\t"
2249
"packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
2250
"psraw $7, %%mm0 \n\t"
2252
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2253
"movq 12(%0, %%"REG_b"), %%mm4 \n\t"
2254
"movq 12(%1, %%"REG_b"), %%mm1 \n\t"
2255
"movq 18(%0, %%"REG_b"), %%mm2 \n\t"
2256
"movq 18(%1, %%"REG_b"), %%mm3 \n\t"
2257
PAVGB" %%mm1, %%mm4 \n\t"
2258
PAVGB" %%mm3, %%mm2 \n\t"
2259
"movq %%mm4, %%mm1 \n\t"
2260
"movq %%mm2, %%mm3 \n\t"
2261
"psrlq $24, %%mm4 \n\t"
2262
"psrlq $24, %%mm2 \n\t"
2263
PAVGB" %%mm1, %%mm4 \n\t"
2264
PAVGB" %%mm3, %%mm2 \n\t"
2265
"punpcklbw %%mm7, %%mm4 \n\t"
2266
"punpcklbw %%mm7, %%mm2 \n\t"
2268
"movd 12(%0, %%"REG_b"), %%mm4 \n\t"
2269
"movd 12(%1, %%"REG_b"), %%mm1 \n\t"
2270
"movd 15(%0, %%"REG_b"), %%mm2 \n\t"
2271
"movd 15(%1, %%"REG_b"), %%mm3 \n\t"
2272
"punpcklbw %%mm7, %%mm4 \n\t"
2273
"punpcklbw %%mm7, %%mm1 \n\t"
2274
"punpcklbw %%mm7, %%mm2 \n\t"
2275
"punpcklbw %%mm7, %%mm3 \n\t"
2276
"paddw %%mm1, %%mm4 \n\t"
2277
"paddw %%mm3, %%mm2 \n\t"
2278
"paddw %%mm2, %%mm4 \n\t"
2279
"movd 18(%0, %%"REG_b"), %%mm5 \n\t"
2280
"movd 18(%1, %%"REG_b"), %%mm1 \n\t"
2281
"movd 21(%0, %%"REG_b"), %%mm2 \n\t"
2282
"movd 21(%1, %%"REG_b"), %%mm3 \n\t"
2283
"punpcklbw %%mm7, %%mm5 \n\t"
2284
"punpcklbw %%mm7, %%mm1 \n\t"
2285
"punpcklbw %%mm7, %%mm2 \n\t"
2286
"punpcklbw %%mm7, %%mm3 \n\t"
2287
"paddw %%mm1, %%mm5 \n\t"
2288
"paddw %%mm3, %%mm2 \n\t"
2289
"paddw %%mm5, %%mm2 \n\t"
2290
"movq "MANGLE(w1111)", %%mm5 \n\t"
2291
"psrlw $2, %%mm4 \n\t"
2292
"psrlw $2, %%mm2 \n\t"
2294
"movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
2295
"movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
2297
"pmaddwd %%mm4, %%mm1 \n\t"
2298
"pmaddwd %%mm2, %%mm3 \n\t"
2299
"pmaddwd %%mm6, %%mm4 \n\t"
2300
"pmaddwd %%mm6, %%mm2 \n\t"
2301
#ifndef FAST_BGR2YV12
2302
"psrad $8, %%mm4 \n\t"
2303
"psrad $8, %%mm1 \n\t"
2304
"psrad $8, %%mm2 \n\t"
2305
"psrad $8, %%mm3 \n\t"
2307
"packssdw %%mm2, %%mm4 \n\t"
2308
"packssdw %%mm3, %%mm1 \n\t"
2309
"pmaddwd %%mm5, %%mm4 \n\t"
2310
"pmaddwd %%mm5, %%mm1 \n\t"
2311
"add $24, %%"REG_b" \n\t"
2312
"packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
2313
"psraw $7, %%mm4 \n\t"
2315
"movq %%mm0, %%mm1 \n\t"
2316
"punpckldq %%mm4, %%mm0 \n\t"
2317
"punpckhdq %%mm4, %%mm1 \n\t"
2318
"packsswb %%mm1, %%mm0 \n\t"
2319
"paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t"
2320
"movd %%mm0, (%2, %%"REG_a") \n\t"
2321
"punpckhdq %%mm0, %%mm0 \n\t"
2322
"movd %%mm0, (%3, %%"REG_a") \n\t"
2323
"add $4, %%"REG_a" \n\t"
2325
: : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" ((long)-chromWidth)
2326
: "%"REG_a, "%"REG_b
2329
udst += chromStride;
2330
vdst += chromStride;
2334
asm volatile( EMMS" \n\t"
2340
for(; y<height; y+=2)
2343
for(i=0; i<chromWidth; i++)
2345
unsigned int b= src[6*i+0];
2346
unsigned int g= src[6*i+1];
2347
unsigned int r= src[6*i+2];
2349
unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2350
unsigned int V = ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
2351
unsigned int U = ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
2361
Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2367
for(i=0; i<chromWidth; i++)
2369
unsigned int b= src[6*i+0];
2370
unsigned int g= src[6*i+1];
2371
unsigned int r= src[6*i+2];
2373
unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2381
Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2384
udst += chromStride;
2385
vdst += chromStride;
2391
void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
2392
unsigned width, unsigned height, int src1Stride,
2393
int src2Stride, int dstStride){
2396
for(h=0; h < height; h++)
2403
"xor %%"REG_a", %%"REG_a" \n\t"
2405
PREFETCH" 64(%1, %%"REG_a") \n\t"
2406
PREFETCH" 64(%2, %%"REG_a") \n\t"
2407
"movdqa (%1, %%"REG_a"), %%xmm0 \n\t"
2408
"movdqa (%1, %%"REG_a"), %%xmm1 \n\t"
2409
"movdqa (%2, %%"REG_a"), %%xmm2 \n\t"
2410
"punpcklbw %%xmm2, %%xmm0 \n\t"
2411
"punpckhbw %%xmm2, %%xmm1 \n\t"
2412
"movntdq %%xmm0, (%0, %%"REG_a", 2)\n\t"
2413
"movntdq %%xmm1, 16(%0, %%"REG_a", 2)\n\t"
2414
"add $16, %%"REG_a" \n\t"
2415
"cmp %3, %%"REG_a" \n\t"
2417
::"r"(dest), "r"(src1), "r"(src2), "r" ((long)width-15)
2418
: "memory", "%"REG_a""
2422
"xor %%"REG_a", %%"REG_a" \n\t"
2424
PREFETCH" 64(%1, %%"REG_a") \n\t"
2425
PREFETCH" 64(%2, %%"REG_a") \n\t"
2426
"movq (%1, %%"REG_a"), %%mm0 \n\t"
2427
"movq 8(%1, %%"REG_a"), %%mm2 \n\t"
2428
"movq %%mm0, %%mm1 \n\t"
2429
"movq %%mm2, %%mm3 \n\t"
2430
"movq (%2, %%"REG_a"), %%mm4 \n\t"
2431
"movq 8(%2, %%"REG_a"), %%mm5 \n\t"
2432
"punpcklbw %%mm4, %%mm0 \n\t"
2433
"punpckhbw %%mm4, %%mm1 \n\t"
2434
"punpcklbw %%mm5, %%mm2 \n\t"
2435
"punpckhbw %%mm5, %%mm3 \n\t"
2436
MOVNTQ" %%mm0, (%0, %%"REG_a", 2)\n\t"
2437
MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2)\n\t"
2438
MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2)\n\t"
2439
MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2)\n\t"
2440
"add $16, %%"REG_a" \n\t"
2441
"cmp %3, %%"REG_a" \n\t"
2443
::"r"(dest), "r"(src1), "r"(src2), "r" ((long)width-15)
2444
: "memory", "%"REG_a
2447
for(w= (width&(~15)); w < width; w++)
2449
dest[2*w+0] = src1[w];
2450
dest[2*w+1] = src2[w];
2453
for(w=0; w < width; w++)
2455
dest[2*w+0] = src1[w];
2456
dest[2*w+1] = src2[w];
2472
static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
2473
uint8_t *dst1, uint8_t *dst2,
2474
unsigned width, unsigned height,
2475
int srcStride1, int srcStride2,
2476
int dstStride1, int dstStride2)
2480
w=width/2; h=height/2;
2485
::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
2488
const uint8_t* s1=src1+srcStride1*(y>>1);
2489
uint8_t* d=dst1+dstStride1*y;
2496
"movq %1, %%mm0\n\t"
2497
"movq 8%1, %%mm2\n\t"
2498
"movq 16%1, %%mm4\n\t"
2499
"movq 24%1, %%mm6\n\t"
2500
"movq %%mm0, %%mm1\n\t"
2501
"movq %%mm2, %%mm3\n\t"
2502
"movq %%mm4, %%mm5\n\t"
2503
"movq %%mm6, %%mm7\n\t"
2504
"punpcklbw %%mm0, %%mm0\n\t"
2505
"punpckhbw %%mm1, %%mm1\n\t"
2506
"punpcklbw %%mm2, %%mm2\n\t"
2507
"punpckhbw %%mm3, %%mm3\n\t"
2508
"punpcklbw %%mm4, %%mm4\n\t"
2509
"punpckhbw %%mm5, %%mm5\n\t"
2510
"punpcklbw %%mm6, %%mm6\n\t"
2511
"punpckhbw %%mm7, %%mm7\n\t"
2512
MOVNTQ" %%mm0, %0\n\t"
2513
MOVNTQ" %%mm1, 8%0\n\t"
2514
MOVNTQ" %%mm2, 16%0\n\t"
2515
MOVNTQ" %%mm3, 24%0\n\t"
2516
MOVNTQ" %%mm4, 32%0\n\t"
2517
MOVNTQ" %%mm5, 40%0\n\t"
2518
MOVNTQ" %%mm6, 48%0\n\t"
2519
MOVNTQ" %%mm7, 56%0"
2525
for(;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
2528
const uint8_t* s2=src2+srcStride2*(y>>1);
2529
uint8_t* d=dst2+dstStride2*y;
2536
"movq %1, %%mm0\n\t"
2537
"movq 8%1, %%mm2\n\t"
2538
"movq 16%1, %%mm4\n\t"
2539
"movq 24%1, %%mm6\n\t"
2540
"movq %%mm0, %%mm1\n\t"
2541
"movq %%mm2, %%mm3\n\t"
2542
"movq %%mm4, %%mm5\n\t"
2543
"movq %%mm6, %%mm7\n\t"
2544
"punpcklbw %%mm0, %%mm0\n\t"
2545
"punpckhbw %%mm1, %%mm1\n\t"
2546
"punpcklbw %%mm2, %%mm2\n\t"
2547
"punpckhbw %%mm3, %%mm3\n\t"
2548
"punpcklbw %%mm4, %%mm4\n\t"
2549
"punpckhbw %%mm5, %%mm5\n\t"
2550
"punpcklbw %%mm6, %%mm6\n\t"
2551
"punpckhbw %%mm7, %%mm7\n\t"
2552
MOVNTQ" %%mm0, %0\n\t"
2553
MOVNTQ" %%mm1, 8%0\n\t"
2554
MOVNTQ" %%mm2, 16%0\n\t"
2555
MOVNTQ" %%mm3, 24%0\n\t"
2556
MOVNTQ" %%mm4, 32%0\n\t"
2557
MOVNTQ" %%mm5, 40%0\n\t"
2558
MOVNTQ" %%mm6, 48%0\n\t"
2559
MOVNTQ" %%mm7, 56%0"
2565
for(;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
2576
static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
2578
unsigned width, unsigned height,
2579
int srcStride1, int srcStride2,
2580
int srcStride3, int dstStride)
2582
unsigned long y,x,w,h;
2583
w=width/2; h=height;
2585
const uint8_t* yp=src1+srcStride1*y;
2586
const uint8_t* up=src2+srcStride2*(y>>2);
2587
const uint8_t* vp=src3+srcStride3*(y>>2);
2588
uint8_t* d=dst+dstStride*y;
2594
PREFETCH" 32(%1, %0)\n\t"
2595
PREFETCH" 32(%2, %0)\n\t"
2596
PREFETCH" 32(%3, %0)\n\t"
2597
"movq (%1, %0, 4), %%mm0\n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2598
"movq (%2, %0), %%mm1\n\t" /* U0U1U2U3U4U5U6U7 */
2599
"movq (%3, %0), %%mm2\n\t" /* V0V1V2V3V4V5V6V7 */
2600
"movq %%mm0, %%mm3\n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2601
"movq %%mm1, %%mm4\n\t" /* U0U1U2U3U4U5U6U7 */
2602
"movq %%mm2, %%mm5\n\t" /* V0V1V2V3V4V5V6V7 */
2603
"punpcklbw %%mm1, %%mm1\n\t" /* U0U0 U1U1 U2U2 U3U3 */
2604
"punpcklbw %%mm2, %%mm2\n\t" /* V0V0 V1V1 V2V2 V3V3 */
2605
"punpckhbw %%mm4, %%mm4\n\t" /* U4U4 U5U5 U6U6 U7U7 */
2606
"punpckhbw %%mm5, %%mm5\n\t" /* V4V4 V5V5 V6V6 V7V7 */
2608
"movq %%mm1, %%mm6\n\t"
2609
"punpcklbw %%mm2, %%mm1\n\t" /* U0V0 U0V0 U1V1 U1V1*/
2610
"punpcklbw %%mm1, %%mm0\n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
2611
"punpckhbw %%mm1, %%mm3\n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
2612
MOVNTQ" %%mm0, (%4, %0, 8)\n\t"
2613
MOVNTQ" %%mm3, 8(%4, %0, 8)\n\t"
2615
"punpckhbw %%mm2, %%mm6\n\t" /* U2V2 U2V2 U3V3 U3V3*/
2616
"movq 8(%1, %0, 4), %%mm0\n\t"
2617
"movq %%mm0, %%mm3\n\t"
2618
"punpcklbw %%mm6, %%mm0\n\t" /* Y U2 Y V2 Y U2 Y V2*/
2619
"punpckhbw %%mm6, %%mm3\n\t" /* Y U3 Y V3 Y U3 Y V3*/
2620
MOVNTQ" %%mm0, 16(%4, %0, 8)\n\t"
2621
MOVNTQ" %%mm3, 24(%4, %0, 8)\n\t"
2623
"movq %%mm4, %%mm6\n\t"
2624
"movq 16(%1, %0, 4), %%mm0\n\t"
2625
"movq %%mm0, %%mm3\n\t"
2626
"punpcklbw %%mm5, %%mm4\n\t"
2627
"punpcklbw %%mm4, %%mm0\n\t" /* Y U4 Y V4 Y U4 Y V4*/
2628
"punpckhbw %%mm4, %%mm3\n\t" /* Y U5 Y V5 Y U5 Y V5*/
2629
MOVNTQ" %%mm0, 32(%4, %0, 8)\n\t"
2630
MOVNTQ" %%mm3, 40(%4, %0, 8)\n\t"
2632
"punpckhbw %%mm5, %%mm6\n\t"
2633
"movq 24(%1, %0, 4), %%mm0\n\t"
2634
"movq %%mm0, %%mm3\n\t"
2635
"punpcklbw %%mm6, %%mm0\n\t" /* Y U6 Y V6 Y U6 Y V6*/
2636
"punpckhbw %%mm6, %%mm3\n\t" /* Y U7 Y V7 Y U7 Y V7*/
2637
MOVNTQ" %%mm0, 48(%4, %0, 8)\n\t"
2638
MOVNTQ" %%mm3, 56(%4, %0, 8)\n\t"
2641
: "r"(yp), "r" (up), "r"(vp), "r"(d)