2
* rgb2rgb.c, Software RGB to RGB convertor
3
* pluralize by Software PAL8 to RGB convertor
4
* Software YUV to YUV convertor
5
* Software YUV to RGB convertor
6
* Written by Nick Kurshev.
7
* palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
8
* lots of big-endian byte-order fixes by Alex Beregszaszi
10
* This file is part of FFmpeg.
12
* FFmpeg is free software; you can redistribute it and/or modify
13
* it under the terms of the GNU General Public License as published by
14
* the Free Software Foundation; either version 2 of the License, or
15
* (at your option) any later version.
17
* FFmpeg is distributed in the hope that it will be useful,
18
* but WITHOUT ANY WARRANTY; without even the implied warranty of
19
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20
* GNU General Public License for more details.
22
* You should have received a copy of the GNU General Public License
23
* along with FFmpeg; if not, write to the Free Software
24
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
26
* the C code (not assembly, mmx, ...) of this file can be used
27
* under the LGPL license too
31
#include <inttypes.h> /* for __WORDSIZE */
34
// #warning You have misconfigured system and probably will lose performance!
35
#define __WORDSIZE MP_WORDSIZE
53
#define PREFETCH "prefetch"
54
#define PREFETCHW "prefetchw"
55
#define PAVGB "pavgusb"
56
#elif defined ( HAVE_MMX2 )
57
#define PREFETCH "prefetchnta"
58
#define PREFETCHW "prefetcht0"
65
#define PREFETCH " # nop"
66
#define PREFETCHW " # nop"
71
/* On K6, femms is faster than emms. On K7, femms is directly mapped to emms. */
78
#define MOVNTQ "movntq"
79
#define SFENCE "sfence"
82
#define SFENCE " # nop"
85
static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,long src_size)
88
const uint8_t *s = src;
91
const uint8_t *mm_end;
95
__asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
97
__asm __volatile("movq %0, %%mm7"::"m"(mask32):"memory");
102
"movd %1, %%mm0 \n\t"
103
"punpckldq 3%1, %%mm0 \n\t"
104
"movd 6%1, %%mm1 \n\t"
105
"punpckldq 9%1, %%mm1 \n\t"
106
"movd 12%1, %%mm2 \n\t"
107
"punpckldq 15%1, %%mm2 \n\t"
108
"movd 18%1, %%mm3 \n\t"
109
"punpckldq 21%1, %%mm3 \n\t"
110
"pand %%mm7, %%mm0 \n\t"
111
"pand %%mm7, %%mm1 \n\t"
112
"pand %%mm7, %%mm2 \n\t"
113
"pand %%mm7, %%mm3 \n\t"
114
MOVNTQ" %%mm0, %0 \n\t"
115
MOVNTQ" %%mm1, 8%0 \n\t"
116
MOVNTQ" %%mm2, 16%0 \n\t"
124
__asm __volatile(SFENCE:::"memory");
125
__asm __volatile(EMMS:::"memory");
129
#ifdef WORDS_BIGENDIAN
130
/* RGB24 (= R,G,B) -> RGB32 (= A,B,G,R) */
145
static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,long src_size)
148
const uint8_t *s = src;
151
const uint8_t *mm_end;
155
__asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
161
"movq %1, %%mm0 \n\t"
162
"movq 8%1, %%mm1 \n\t"
163
"movq 16%1, %%mm4 \n\t"
164
"movq 24%1, %%mm5 \n\t"
165
"movq %%mm0, %%mm2 \n\t"
166
"movq %%mm1, %%mm3 \n\t"
167
"movq %%mm4, %%mm6 \n\t"
168
"movq %%mm5, %%mm7 \n\t"
169
"psrlq $8, %%mm2 \n\t"
170
"psrlq $8, %%mm3 \n\t"
171
"psrlq $8, %%mm6 \n\t"
172
"psrlq $8, %%mm7 \n\t"
173
"pand %2, %%mm0 \n\t"
174
"pand %2, %%mm1 \n\t"
175
"pand %2, %%mm4 \n\t"
176
"pand %2, %%mm5 \n\t"
177
"pand %3, %%mm2 \n\t"
178
"pand %3, %%mm3 \n\t"
179
"pand %3, %%mm6 \n\t"
180
"pand %3, %%mm7 \n\t"
181
"por %%mm2, %%mm0 \n\t"
182
"por %%mm3, %%mm1 \n\t"
183
"por %%mm6, %%mm4 \n\t"
184
"por %%mm7, %%mm5 \n\t"
186
"movq %%mm1, %%mm2 \n\t"
187
"movq %%mm4, %%mm3 \n\t"
188
"psllq $48, %%mm2 \n\t"
189
"psllq $32, %%mm3 \n\t"
190
"pand %4, %%mm2 \n\t"
191
"pand %5, %%mm3 \n\t"
192
"por %%mm2, %%mm0 \n\t"
193
"psrlq $16, %%mm1 \n\t"
194
"psrlq $32, %%mm4 \n\t"
195
"psllq $16, %%mm5 \n\t"
196
"por %%mm3, %%mm1 \n\t"
197
"pand %6, %%mm5 \n\t"
198
"por %%mm5, %%mm4 \n\t"
200
MOVNTQ" %%mm0, %0 \n\t"
201
MOVNTQ" %%mm1, 8%0 \n\t"
204
:"m"(*s),"m"(mask24l),
205
"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
210
__asm __volatile(SFENCE:::"memory");
211
__asm __volatile(EMMS:::"memory");
215
#ifdef WORDS_BIGENDIAN
216
/* RGB32 (= A,B,G,R) -> RGB24 (= R,G,B) */
232
Original by Strepto/Astral
233
ported to gcc & bugfixed : A'rpi
234
MMX2, 3DNOW optimization by Nick Kurshev
235
32-bit C version, and the and&add trick, by Michael Niedermayer
237
static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,long src_size)
239
register const uint8_t* s=src;
240
register uint8_t* d=dst;
241
register const uint8_t *end;
242
const uint8_t *mm_end;
245
__asm __volatile(PREFETCH" %0"::"m"(*s));
246
__asm __volatile("movq %0, %%mm4"::"m"(mask15s));
252
"movq %1, %%mm0 \n\t"
253
"movq 8%1, %%mm2 \n\t"
254
"movq %%mm0, %%mm1 \n\t"
255
"movq %%mm2, %%mm3 \n\t"
256
"pand %%mm4, %%mm0 \n\t"
257
"pand %%mm4, %%mm2 \n\t"
258
"paddw %%mm1, %%mm0 \n\t"
259
"paddw %%mm3, %%mm2 \n\t"
260
MOVNTQ" %%mm0, %0 \n\t"
268
__asm __volatile(SFENCE:::"memory");
269
__asm __volatile(EMMS:::"memory");
274
register unsigned x= *((uint32_t *)s);
275
*((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
281
register unsigned short x= *((uint16_t *)s);
282
*((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
286
static inline void RENAME(rgb16to15)(const uint8_t *src,uint8_t *dst,long src_size)
288
register const uint8_t* s=src;
289
register uint8_t* d=dst;
290
register const uint8_t *end;
291
const uint8_t *mm_end;
294
__asm __volatile(PREFETCH" %0"::"m"(*s));
295
__asm __volatile("movq %0, %%mm7"::"m"(mask15rg));
296
__asm __volatile("movq %0, %%mm6"::"m"(mask15b));
302
"movq %1, %%mm0 \n\t"
303
"movq 8%1, %%mm2 \n\t"
304
"movq %%mm0, %%mm1 \n\t"
305
"movq %%mm2, %%mm3 \n\t"
306
"psrlq $1, %%mm0 \n\t"
307
"psrlq $1, %%mm2 \n\t"
308
"pand %%mm7, %%mm0 \n\t"
309
"pand %%mm7, %%mm2 \n\t"
310
"pand %%mm6, %%mm1 \n\t"
311
"pand %%mm6, %%mm3 \n\t"
312
"por %%mm1, %%mm0 \n\t"
313
"por %%mm3, %%mm2 \n\t"
314
MOVNTQ" %%mm0, %0 \n\t"
322
__asm __volatile(SFENCE:::"memory");
323
__asm __volatile(EMMS:::"memory");
328
register uint32_t x= *((uint32_t *)s);
329
*((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
335
register uint16_t x= *((uint16_t *)s);
336
*((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
342
static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, long src_size)
344
const uint8_t *s = src;
347
const uint8_t *mm_end;
349
uint16_t *d = (uint16_t *)dst;
353
#if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster; on Athlon it's slightly faster)
355
"movq %3, %%mm5 \n\t"
356
"movq %4, %%mm6 \n\t"
357
"movq %5, %%mm7 \n\t"
361
PREFETCH" 32(%1) \n\t"
362
"movd (%1), %%mm0 \n\t"
363
"movd 4(%1), %%mm3 \n\t"
364
"punpckldq 8(%1), %%mm0 \n\t"
365
"punpckldq 12(%1), %%mm3 \n\t"
366
"movq %%mm0, %%mm1 \n\t"
367
"movq %%mm3, %%mm4 \n\t"
368
"pand %%mm6, %%mm0 \n\t"
369
"pand %%mm6, %%mm3 \n\t"
370
"pmaddwd %%mm7, %%mm0 \n\t"
371
"pmaddwd %%mm7, %%mm3 \n\t"
372
"pand %%mm5, %%mm1 \n\t"
373
"pand %%mm5, %%mm4 \n\t"
374
"por %%mm1, %%mm0 \n\t"
375
"por %%mm4, %%mm3 \n\t"
376
"psrld $5, %%mm0 \n\t"
377
"pslld $11, %%mm3 \n\t"
378
"por %%mm3, %%mm0 \n\t"
379
MOVNTQ" %%mm0, (%0) \n\t"
386
: "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
389
__asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
391
"movq %0, %%mm7 \n\t"
392
"movq %1, %%mm6 \n\t"
393
::"m"(red_16mask),"m"(green_16mask));
398
"movd %1, %%mm0 \n\t"
399
"movd 4%1, %%mm3 \n\t"
400
"punpckldq 8%1, %%mm0 \n\t"
401
"punpckldq 12%1, %%mm3 \n\t"
402
"movq %%mm0, %%mm1 \n\t"
403
"movq %%mm0, %%mm2 \n\t"
404
"movq %%mm3, %%mm4 \n\t"
405
"movq %%mm3, %%mm5 \n\t"
406
"psrlq $3, %%mm0 \n\t"
407
"psrlq $3, %%mm3 \n\t"
408
"pand %2, %%mm0 \n\t"
409
"pand %2, %%mm3 \n\t"
410
"psrlq $5, %%mm1 \n\t"
411
"psrlq $5, %%mm4 \n\t"
412
"pand %%mm6, %%mm1 \n\t"
413
"pand %%mm6, %%mm4 \n\t"
414
"psrlq $8, %%mm2 \n\t"
415
"psrlq $8, %%mm5 \n\t"
416
"pand %%mm7, %%mm2 \n\t"
417
"pand %%mm7, %%mm5 \n\t"
418
"por %%mm1, %%mm0 \n\t"
419
"por %%mm4, %%mm3 \n\t"
420
"por %%mm2, %%mm0 \n\t"
421
"por %%mm5, %%mm3 \n\t"
422
"psllq $16, %%mm3 \n\t"
423
"por %%mm3, %%mm0 \n\t"
424
MOVNTQ" %%mm0, %0 \n\t"
425
:"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
430
__asm __volatile(SFENCE:::"memory");
431
__asm __volatile(EMMS:::"memory");
435
register int rgb = *(uint32_t*)s; s += 4;
436
*d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
440
static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
442
const uint8_t *s = src;
445
const uint8_t *mm_end;
447
uint16_t *d = (uint16_t *)dst;
450
__asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
452
"movq %0, %%mm7 \n\t"
453
"movq %1, %%mm6 \n\t"
454
::"m"(red_16mask),"m"(green_16mask));
460
"movd %1, %%mm0 \n\t"
461
"movd 4%1, %%mm3 \n\t"
462
"punpckldq 8%1, %%mm0 \n\t"
463
"punpckldq 12%1, %%mm3 \n\t"
464
"movq %%mm0, %%mm1 \n\t"
465
"movq %%mm0, %%mm2 \n\t"
466
"movq %%mm3, %%mm4 \n\t"
467
"movq %%mm3, %%mm5 \n\t"
468
"psllq $8, %%mm0 \n\t"
469
"psllq $8, %%mm3 \n\t"
470
"pand %%mm7, %%mm0 \n\t"
471
"pand %%mm7, %%mm3 \n\t"
472
"psrlq $5, %%mm1 \n\t"
473
"psrlq $5, %%mm4 \n\t"
474
"pand %%mm6, %%mm1 \n\t"
475
"pand %%mm6, %%mm4 \n\t"
476
"psrlq $19, %%mm2 \n\t"
477
"psrlq $19, %%mm5 \n\t"
478
"pand %2, %%mm2 \n\t"
479
"pand %2, %%mm5 \n\t"
480
"por %%mm1, %%mm0 \n\t"
481
"por %%mm4, %%mm3 \n\t"
482
"por %%mm2, %%mm0 \n\t"
483
"por %%mm5, %%mm3 \n\t"
484
"psllq $16, %%mm3 \n\t"
485
"por %%mm3, %%mm0 \n\t"
486
MOVNTQ" %%mm0, %0 \n\t"
487
:"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
491
__asm __volatile(SFENCE:::"memory");
492
__asm __volatile(EMMS:::"memory");
496
register int rgb = *(uint32_t*)s; s += 4;
497
*d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
501
static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, long src_size)
503
const uint8_t *s = src;
506
const uint8_t *mm_end;
508
uint16_t *d = (uint16_t *)dst;
512
#if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster; on Athlon it's slightly faster)
514
"movq %3, %%mm5 \n\t"
515
"movq %4, %%mm6 \n\t"
516
"movq %5, %%mm7 \n\t"
520
PREFETCH" 32(%1) \n\t"
521
"movd (%1), %%mm0 \n\t"
522
"movd 4(%1), %%mm3 \n\t"
523
"punpckldq 8(%1), %%mm0 \n\t"
524
"punpckldq 12(%1), %%mm3 \n\t"
525
"movq %%mm0, %%mm1 \n\t"
526
"movq %%mm3, %%mm4 \n\t"
527
"pand %%mm6, %%mm0 \n\t"
528
"pand %%mm6, %%mm3 \n\t"
529
"pmaddwd %%mm7, %%mm0 \n\t"
530
"pmaddwd %%mm7, %%mm3 \n\t"
531
"pand %%mm5, %%mm1 \n\t"
532
"pand %%mm5, %%mm4 \n\t"
533
"por %%mm1, %%mm0 \n\t"
534
"por %%mm4, %%mm3 \n\t"
535
"psrld $6, %%mm0 \n\t"
536
"pslld $10, %%mm3 \n\t"
537
"por %%mm3, %%mm0 \n\t"
538
MOVNTQ" %%mm0, (%0) \n\t"
545
: "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
548
__asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
550
"movq %0, %%mm7 \n\t"
551
"movq %1, %%mm6 \n\t"
552
::"m"(red_15mask),"m"(green_15mask));
557
"movd %1, %%mm0 \n\t"
558
"movd 4%1, %%mm3 \n\t"
559
"punpckldq 8%1, %%mm0 \n\t"
560
"punpckldq 12%1, %%mm3 \n\t"
561
"movq %%mm0, %%mm1 \n\t"
562
"movq %%mm0, %%mm2 \n\t"
563
"movq %%mm3, %%mm4 \n\t"
564
"movq %%mm3, %%mm5 \n\t"
565
"psrlq $3, %%mm0 \n\t"
566
"psrlq $3, %%mm3 \n\t"
567
"pand %2, %%mm0 \n\t"
568
"pand %2, %%mm3 \n\t"
569
"psrlq $6, %%mm1 \n\t"
570
"psrlq $6, %%mm4 \n\t"
571
"pand %%mm6, %%mm1 \n\t"
572
"pand %%mm6, %%mm4 \n\t"
573
"psrlq $9, %%mm2 \n\t"
574
"psrlq $9, %%mm5 \n\t"
575
"pand %%mm7, %%mm2 \n\t"
576
"pand %%mm7, %%mm5 \n\t"
577
"por %%mm1, %%mm0 \n\t"
578
"por %%mm4, %%mm3 \n\t"
579
"por %%mm2, %%mm0 \n\t"
580
"por %%mm5, %%mm3 \n\t"
581
"psllq $16, %%mm3 \n\t"
582
"por %%mm3, %%mm0 \n\t"
583
MOVNTQ" %%mm0, %0 \n\t"
584
:"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
589
__asm __volatile(SFENCE:::"memory");
590
__asm __volatile(EMMS:::"memory");
594
register int rgb = *(uint32_t*)s; s += 4;
595
*d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
599
static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
601
const uint8_t *s = src;
604
const uint8_t *mm_end;
606
uint16_t *d = (uint16_t *)dst;
609
__asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
611
"movq %0, %%mm7 \n\t"
612
"movq %1, %%mm6 \n\t"
613
::"m"(red_15mask),"m"(green_15mask));
619
"movd %1, %%mm0 \n\t"
620
"movd 4%1, %%mm3 \n\t"
621
"punpckldq 8%1, %%mm0 \n\t"
622
"punpckldq 12%1, %%mm3 \n\t"
623
"movq %%mm0, %%mm1 \n\t"
624
"movq %%mm0, %%mm2 \n\t"
625
"movq %%mm3, %%mm4 \n\t"
626
"movq %%mm3, %%mm5 \n\t"
627
"psllq $7, %%mm0 \n\t"
628
"psllq $7, %%mm3 \n\t"
629
"pand %%mm7, %%mm0 \n\t"
630
"pand %%mm7, %%mm3 \n\t"
631
"psrlq $6, %%mm1 \n\t"
632
"psrlq $6, %%mm4 \n\t"
633
"pand %%mm6, %%mm1 \n\t"
634
"pand %%mm6, %%mm4 \n\t"
635
"psrlq $19, %%mm2 \n\t"
636
"psrlq $19, %%mm5 \n\t"
637
"pand %2, %%mm2 \n\t"
638
"pand %2, %%mm5 \n\t"
639
"por %%mm1, %%mm0 \n\t"
640
"por %%mm4, %%mm3 \n\t"
641
"por %%mm2, %%mm0 \n\t"
642
"por %%mm5, %%mm3 \n\t"
643
"psllq $16, %%mm3 \n\t"
644
"por %%mm3, %%mm0 \n\t"
645
MOVNTQ" %%mm0, %0 \n\t"
646
:"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
650
__asm __volatile(SFENCE:::"memory");
651
__asm __volatile(EMMS:::"memory");
655
register int rgb = *(uint32_t*)s; s += 4;
656
*d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
660
static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size)
662
const uint8_t *s = src;
665
const uint8_t *mm_end;
667
uint16_t *d = (uint16_t *)dst;
670
__asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
672
"movq %0, %%mm7 \n\t"
673
"movq %1, %%mm6 \n\t"
674
::"m"(red_16mask),"m"(green_16mask));
680
"movd %1, %%mm0 \n\t"
681
"movd 3%1, %%mm3 \n\t"
682
"punpckldq 6%1, %%mm0 \n\t"
683
"punpckldq 9%1, %%mm3 \n\t"
684
"movq %%mm0, %%mm1 \n\t"
685
"movq %%mm0, %%mm2 \n\t"
686
"movq %%mm3, %%mm4 \n\t"
687
"movq %%mm3, %%mm5 \n\t"
688
"psrlq $3, %%mm0 \n\t"
689
"psrlq $3, %%mm3 \n\t"
690
"pand %2, %%mm0 \n\t"
691
"pand %2, %%mm3 \n\t"
692
"psrlq $5, %%mm1 \n\t"
693
"psrlq $5, %%mm4 \n\t"
694
"pand %%mm6, %%mm1 \n\t"
695
"pand %%mm6, %%mm4 \n\t"
696
"psrlq $8, %%mm2 \n\t"
697
"psrlq $8, %%mm5 \n\t"
698
"pand %%mm7, %%mm2 \n\t"
699
"pand %%mm7, %%mm5 \n\t"
700
"por %%mm1, %%mm0 \n\t"
701
"por %%mm4, %%mm3 \n\t"
702
"por %%mm2, %%mm0 \n\t"
703
"por %%mm5, %%mm3 \n\t"
704
"psllq $16, %%mm3 \n\t"
705
"por %%mm3, %%mm0 \n\t"
706
MOVNTQ" %%mm0, %0 \n\t"
707
:"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
711
__asm __volatile(SFENCE:::"memory");
712
__asm __volatile(EMMS:::"memory");
719
*d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
723
static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
725
const uint8_t *s = src;
728
const uint8_t *mm_end;
730
uint16_t *d = (uint16_t *)dst;
733
__asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
735
"movq %0, %%mm7 \n\t"
736
"movq %1, %%mm6 \n\t"
737
::"m"(red_16mask),"m"(green_16mask));
743
"movd %1, %%mm0 \n\t"
744
"movd 3%1, %%mm3 \n\t"
745
"punpckldq 6%1, %%mm0 \n\t"
746
"punpckldq 9%1, %%mm3 \n\t"
747
"movq %%mm0, %%mm1 \n\t"
748
"movq %%mm0, %%mm2 \n\t"
749
"movq %%mm3, %%mm4 \n\t"
750
"movq %%mm3, %%mm5 \n\t"
751
"psllq $8, %%mm0 \n\t"
752
"psllq $8, %%mm3 \n\t"
753
"pand %%mm7, %%mm0 \n\t"
754
"pand %%mm7, %%mm3 \n\t"
755
"psrlq $5, %%mm1 \n\t"
756
"psrlq $5, %%mm4 \n\t"
757
"pand %%mm6, %%mm1 \n\t"
758
"pand %%mm6, %%mm4 \n\t"
759
"psrlq $19, %%mm2 \n\t"
760
"psrlq $19, %%mm5 \n\t"
761
"pand %2, %%mm2 \n\t"
762
"pand %2, %%mm5 \n\t"
763
"por %%mm1, %%mm0 \n\t"
764
"por %%mm4, %%mm3 \n\t"
765
"por %%mm2, %%mm0 \n\t"
766
"por %%mm5, %%mm3 \n\t"
767
"psllq $16, %%mm3 \n\t"
768
"por %%mm3, %%mm0 \n\t"
769
MOVNTQ" %%mm0, %0 \n\t"
770
:"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
774
__asm __volatile(SFENCE:::"memory");
775
__asm __volatile(EMMS:::"memory");
782
*d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
786
static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, long src_size)
788
const uint8_t *s = src;
791
const uint8_t *mm_end;
793
uint16_t *d = (uint16_t *)dst;
796
__asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
798
"movq %0, %%mm7 \n\t"
799
"movq %1, %%mm6 \n\t"
800
::"m"(red_15mask),"m"(green_15mask));
806
"movd %1, %%mm0 \n\t"
807
"movd 3%1, %%mm3 \n\t"
808
"punpckldq 6%1, %%mm0 \n\t"
809
"punpckldq 9%1, %%mm3 \n\t"
810
"movq %%mm0, %%mm1 \n\t"
811
"movq %%mm0, %%mm2 \n\t"
812
"movq %%mm3, %%mm4 \n\t"
813
"movq %%mm3, %%mm5 \n\t"
814
"psrlq $3, %%mm0 \n\t"
815
"psrlq $3, %%mm3 \n\t"
816
"pand %2, %%mm0 \n\t"
817
"pand %2, %%mm3 \n\t"
818
"psrlq $6, %%mm1 \n\t"
819
"psrlq $6, %%mm4 \n\t"
820
"pand %%mm6, %%mm1 \n\t"
821
"pand %%mm6, %%mm4 \n\t"
822
"psrlq $9, %%mm2 \n\t"
823
"psrlq $9, %%mm5 \n\t"
824
"pand %%mm7, %%mm2 \n\t"
825
"pand %%mm7, %%mm5 \n\t"
826
"por %%mm1, %%mm0 \n\t"
827
"por %%mm4, %%mm3 \n\t"
828
"por %%mm2, %%mm0 \n\t"
829
"por %%mm5, %%mm3 \n\t"
830
"psllq $16, %%mm3 \n\t"
831
"por %%mm3, %%mm0 \n\t"
832
MOVNTQ" %%mm0, %0 \n\t"
833
:"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
837
__asm __volatile(SFENCE:::"memory");
838
__asm __volatile(EMMS:::"memory");
845
*d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
849
static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
851
const uint8_t *s = src;
854
const uint8_t *mm_end;
856
uint16_t *d = (uint16_t *)dst;
859
__asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
861
"movq %0, %%mm7 \n\t"
862
"movq %1, %%mm6 \n\t"
863
::"m"(red_15mask),"m"(green_15mask));
869
"movd %1, %%mm0 \n\t"
870
"movd 3%1, %%mm3 \n\t"
871
"punpckldq 6%1, %%mm0 \n\t"
872
"punpckldq 9%1, %%mm3 \n\t"
873
"movq %%mm0, %%mm1 \n\t"
874
"movq %%mm0, %%mm2 \n\t"
875
"movq %%mm3, %%mm4 \n\t"
876
"movq %%mm3, %%mm5 \n\t"
877
"psllq $7, %%mm0 \n\t"
878
"psllq $7, %%mm3 \n\t"
879
"pand %%mm7, %%mm0 \n\t"
880
"pand %%mm7, %%mm3 \n\t"
881
"psrlq $6, %%mm1 \n\t"
882
"psrlq $6, %%mm4 \n\t"
883
"pand %%mm6, %%mm1 \n\t"
884
"pand %%mm6, %%mm4 \n\t"
885
"psrlq $19, %%mm2 \n\t"
886
"psrlq $19, %%mm5 \n\t"
887
"pand %2, %%mm2 \n\t"
888
"pand %2, %%mm5 \n\t"
889
"por %%mm1, %%mm0 \n\t"
890
"por %%mm4, %%mm3 \n\t"
891
"por %%mm2, %%mm0 \n\t"
892
"por %%mm5, %%mm3 \n\t"
893
"psllq $16, %%mm3 \n\t"
894
"por %%mm3, %%mm0 \n\t"
895
MOVNTQ" %%mm0, %0 \n\t"
896
:"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
900
__asm __volatile(SFENCE:::"memory");
901
__asm __volatile(EMMS:::"memory");
908
*d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
913
Here I use a less accurate approximation by simply
914
left-shifting the input
915
value and filling the low order bits with
916
zeroes. This method improves png's
917
compression but this scheme cannot reproduce white exactly, since it does not
918
generate an all-ones maximum value; the net effect is to darken the
921
A better method would be "left bit replication":
931
| Leftmost Bits Repeated to Fill Open Bits
935
static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, long src_size)
939
const uint16_t *mm_end;
941
uint8_t *d = (uint8_t *)dst;
942
const uint16_t *s = (uint16_t *)src;
943
end = s + src_size/2;
945
__asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
951
"movq %1, %%mm0 \n\t"
952
"movq %1, %%mm1 \n\t"
953
"movq %1, %%mm2 \n\t"
954
"pand %2, %%mm0 \n\t"
955
"pand %3, %%mm1 \n\t"
956
"pand %4, %%mm2 \n\t"
957
"psllq $3, %%mm0 \n\t"
958
"psrlq $2, %%mm1 \n\t"
959
"psrlq $7, %%mm2 \n\t"
960
"movq %%mm0, %%mm3 \n\t"
961
"movq %%mm1, %%mm4 \n\t"
962
"movq %%mm2, %%mm5 \n\t"
963
"punpcklwd %5, %%mm0 \n\t"
964
"punpcklwd %5, %%mm1 \n\t"
965
"punpcklwd %5, %%mm2 \n\t"
966
"punpckhwd %5, %%mm3 \n\t"
967
"punpckhwd %5, %%mm4 \n\t"
968
"punpckhwd %5, %%mm5 \n\t"
969
"psllq $8, %%mm1 \n\t"
970
"psllq $16, %%mm2 \n\t"
971
"por %%mm1, %%mm0 \n\t"
972
"por %%mm2, %%mm0 \n\t"
973
"psllq $8, %%mm4 \n\t"
974
"psllq $16, %%mm5 \n\t"
975
"por %%mm4, %%mm3 \n\t"
976
"por %%mm5, %%mm3 \n\t"
978
"movq %%mm0, %%mm6 \n\t"
979
"movq %%mm3, %%mm7 \n\t"
981
"movq 8%1, %%mm0 \n\t"
982
"movq 8%1, %%mm1 \n\t"
983
"movq 8%1, %%mm2 \n\t"
984
"pand %2, %%mm0 \n\t"
985
"pand %3, %%mm1 \n\t"
986
"pand %4, %%mm2 \n\t"
987
"psllq $3, %%mm0 \n\t"
988
"psrlq $2, %%mm1 \n\t"
989
"psrlq $7, %%mm2 \n\t"
990
"movq %%mm0, %%mm3 \n\t"
991
"movq %%mm1, %%mm4 \n\t"
992
"movq %%mm2, %%mm5 \n\t"
993
"punpcklwd %5, %%mm0 \n\t"
994
"punpcklwd %5, %%mm1 \n\t"
995
"punpcklwd %5, %%mm2 \n\t"
996
"punpckhwd %5, %%mm3 \n\t"
997
"punpckhwd %5, %%mm4 \n\t"
998
"punpckhwd %5, %%mm5 \n\t"
999
"psllq $8, %%mm1 \n\t"
1000
"psllq $16, %%mm2 \n\t"
1001
"por %%mm1, %%mm0 \n\t"
1002
"por %%mm2, %%mm0 \n\t"
1003
"psllq $8, %%mm4 \n\t"
1004
"psllq $16, %%mm5 \n\t"
1005
"por %%mm4, %%mm3 \n\t"
1006
"por %%mm5, %%mm3 \n\t"
1009
:"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
1011
/* Borrowed 32 to 24 */
1013
"movq %%mm0, %%mm4 \n\t"
1014
"movq %%mm3, %%mm5 \n\t"
1015
"movq %%mm6, %%mm0 \n\t"
1016
"movq %%mm7, %%mm1 \n\t"
1018
"movq %%mm4, %%mm6 \n\t"
1019
"movq %%mm5, %%mm7 \n\t"
1020
"movq %%mm0, %%mm2 \n\t"
1021
"movq %%mm1, %%mm3 \n\t"
1023
"psrlq $8, %%mm2 \n\t"
1024
"psrlq $8, %%mm3 \n\t"
1025
"psrlq $8, %%mm6 \n\t"
1026
"psrlq $8, %%mm7 \n\t"
1027
"pand %2, %%mm0 \n\t"
1028
"pand %2, %%mm1 \n\t"
1029
"pand %2, %%mm4 \n\t"
1030
"pand %2, %%mm5 \n\t"
1031
"pand %3, %%mm2 \n\t"
1032
"pand %3, %%mm3 \n\t"
1033
"pand %3, %%mm6 \n\t"
1034
"pand %3, %%mm7 \n\t"
1035
"por %%mm2, %%mm0 \n\t"
1036
"por %%mm3, %%mm1 \n\t"
1037
"por %%mm6, %%mm4 \n\t"
1038
"por %%mm7, %%mm5 \n\t"
1040
"movq %%mm1, %%mm2 \n\t"
1041
"movq %%mm4, %%mm3 \n\t"
1042
"psllq $48, %%mm2 \n\t"
1043
"psllq $32, %%mm3 \n\t"
1044
"pand %4, %%mm2 \n\t"
1045
"pand %5, %%mm3 \n\t"
1046
"por %%mm2, %%mm0 \n\t"
1047
"psrlq $16, %%mm1 \n\t"
1048
"psrlq $32, %%mm4 \n\t"
1049
"psllq $16, %%mm5 \n\t"
1050
"por %%mm3, %%mm1 \n\t"
1051
"pand %6, %%mm5 \n\t"
1052
"por %%mm5, %%mm4 \n\t"
1054
MOVNTQ" %%mm0, %0 \n\t"
1055
MOVNTQ" %%mm1, 8%0 \n\t"
1056
MOVNTQ" %%mm4, 16%0"
1059
:"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1064
__asm __volatile(SFENCE:::"memory");
1065
__asm __volatile(EMMS:::"memory");
1069
register uint16_t bgr;
1071
*d++ = (bgr&0x1F)<<3;
1072
*d++ = (bgr&0x3E0)>>2;
1073
*d++ = (bgr&0x7C00)>>7;
1077
static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, long src_size)
1079
const uint16_t *end;
1081
const uint16_t *mm_end;
1083
uint8_t *d = (uint8_t *)dst;
1084
const uint16_t *s = (const uint16_t *)src;
1085
end = s + src_size/2;
1087
__asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
1092
PREFETCH" 32%1 \n\t"
1093
"movq %1, %%mm0 \n\t"
1094
"movq %1, %%mm1 \n\t"
1095
"movq %1, %%mm2 \n\t"
1096
"pand %2, %%mm0 \n\t"
1097
"pand %3, %%mm1 \n\t"
1098
"pand %4, %%mm2 \n\t"
1099
"psllq $3, %%mm0 \n\t"
1100
"psrlq $3, %%mm1 \n\t"
1101
"psrlq $8, %%mm2 \n\t"
1102
"movq %%mm0, %%mm3 \n\t"
1103
"movq %%mm1, %%mm4 \n\t"
1104
"movq %%mm2, %%mm5 \n\t"
1105
"punpcklwd %5, %%mm0 \n\t"
1106
"punpcklwd %5, %%mm1 \n\t"
1107
"punpcklwd %5, %%mm2 \n\t"
1108
"punpckhwd %5, %%mm3 \n\t"
1109
"punpckhwd %5, %%mm4 \n\t"
1110
"punpckhwd %5, %%mm5 \n\t"
1111
"psllq $8, %%mm1 \n\t"
1112
"psllq $16, %%mm2 \n\t"
1113
"por %%mm1, %%mm0 \n\t"
1114
"por %%mm2, %%mm0 \n\t"
1115
"psllq $8, %%mm4 \n\t"
1116
"psllq $16, %%mm5 \n\t"
1117
"por %%mm4, %%mm3 \n\t"
1118
"por %%mm5, %%mm3 \n\t"
1120
"movq %%mm0, %%mm6 \n\t"
1121
"movq %%mm3, %%mm7 \n\t"
1123
"movq 8%1, %%mm0 \n\t"
1124
"movq 8%1, %%mm1 \n\t"
1125
"movq 8%1, %%mm2 \n\t"
1126
"pand %2, %%mm0 \n\t"
1127
"pand %3, %%mm1 \n\t"
1128
"pand %4, %%mm2 \n\t"
1129
"psllq $3, %%mm0 \n\t"
1130
"psrlq $3, %%mm1 \n\t"
1131
"psrlq $8, %%mm2 \n\t"
1132
"movq %%mm0, %%mm3 \n\t"
1133
"movq %%mm1, %%mm4 \n\t"
1134
"movq %%mm2, %%mm5 \n\t"
1135
"punpcklwd %5, %%mm0 \n\t"
1136
"punpcklwd %5, %%mm1 \n\t"
1137
"punpcklwd %5, %%mm2 \n\t"
1138
"punpckhwd %5, %%mm3 \n\t"
1139
"punpckhwd %5, %%mm4 \n\t"
1140
"punpckhwd %5, %%mm5 \n\t"
1141
"psllq $8, %%mm1 \n\t"
1142
"psllq $16, %%mm2 \n\t"
1143
"por %%mm1, %%mm0 \n\t"
1144
"por %%mm2, %%mm0 \n\t"
1145
"psllq $8, %%mm4 \n\t"
1146
"psllq $16, %%mm5 \n\t"
1147
"por %%mm4, %%mm3 \n\t"
1148
"por %%mm5, %%mm3 \n\t"
1150
:"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
1152
/* Borrowed 32 to 24 */
1154
"movq %%mm0, %%mm4 \n\t"
1155
"movq %%mm3, %%mm5 \n\t"
1156
"movq %%mm6, %%mm0 \n\t"
1157
"movq %%mm7, %%mm1 \n\t"
1159
"movq %%mm4, %%mm6 \n\t"
1160
"movq %%mm5, %%mm7 \n\t"
1161
"movq %%mm0, %%mm2 \n\t"
1162
"movq %%mm1, %%mm3 \n\t"
1164
"psrlq $8, %%mm2 \n\t"
1165
"psrlq $8, %%mm3 \n\t"
1166
"psrlq $8, %%mm6 \n\t"
1167
"psrlq $8, %%mm7 \n\t"
1168
"pand %2, %%mm0 \n\t"
1169
"pand %2, %%mm1 \n\t"
1170
"pand %2, %%mm4 \n\t"
1171
"pand %2, %%mm5 \n\t"
1172
"pand %3, %%mm2 \n\t"
1173
"pand %3, %%mm3 \n\t"
1174
"pand %3, %%mm6 \n\t"
1175
"pand %3, %%mm7 \n\t"
1176
"por %%mm2, %%mm0 \n\t"
1177
"por %%mm3, %%mm1 \n\t"
1178
"por %%mm6, %%mm4 \n\t"
1179
"por %%mm7, %%mm5 \n\t"
1181
"movq %%mm1, %%mm2 \n\t"
1182
"movq %%mm4, %%mm3 \n\t"
1183
"psllq $48, %%mm2 \n\t"
1184
"psllq $32, %%mm3 \n\t"
1185
"pand %4, %%mm2 \n\t"
1186
"pand %5, %%mm3 \n\t"
1187
"por %%mm2, %%mm0 \n\t"
1188
"psrlq $16, %%mm1 \n\t"
1189
"psrlq $32, %%mm4 \n\t"
1190
"psllq $16, %%mm5 \n\t"
1191
"por %%mm3, %%mm1 \n\t"
1192
"pand %6, %%mm5 \n\t"
1193
"por %%mm5, %%mm4 \n\t"
1195
MOVNTQ" %%mm0, %0 \n\t"
1196
MOVNTQ" %%mm1, 8%0 \n\t"
1197
MOVNTQ" %%mm4, 16%0"
1200
:"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1205
__asm __volatile(SFENCE:::"memory");
1206
__asm __volatile(EMMS:::"memory");
1210
register uint16_t bgr;
1212
*d++ = (bgr&0x1F)<<3;
1213
*d++ = (bgr&0x7E0)>>3;
1214
*d++ = (bgr&0xF800)>>8;
1218
static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, long src_size)
1220
const uint16_t *end;
1222
const uint16_t *mm_end;
1224
uint8_t *d = (uint8_t *)dst;
1225
const uint16_t *s = (const uint16_t *)src;
1226
end = s + src_size/2;
1228
__asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
1229
__asm __volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
1234
PREFETCH" 32%1 \n\t"
1235
"movq %1, %%mm0 \n\t"
1236
"movq %1, %%mm1 \n\t"
1237
"movq %1, %%mm2 \n\t"
1238
"pand %2, %%mm0 \n\t"
1239
"pand %3, %%mm1 \n\t"
1240
"pand %4, %%mm2 \n\t"
1241
"psllq $3, %%mm0 \n\t"
1242
"psrlq $2, %%mm1 \n\t"
1243
"psrlq $7, %%mm2 \n\t"
1244
"movq %%mm0, %%mm3 \n\t"
1245
"movq %%mm1, %%mm4 \n\t"
1246
"movq %%mm2, %%mm5 \n\t"
1247
"punpcklwd %%mm7, %%mm0 \n\t"
1248
"punpcklwd %%mm7, %%mm1 \n\t"
1249
"punpcklwd %%mm7, %%mm2 \n\t"
1250
"punpckhwd %%mm7, %%mm3 \n\t"
1251
"punpckhwd %%mm7, %%mm4 \n\t"
1252
"punpckhwd %%mm7, %%mm5 \n\t"
1253
"psllq $8, %%mm1 \n\t"
1254
"psllq $16, %%mm2 \n\t"
1255
"por %%mm1, %%mm0 \n\t"
1256
"por %%mm2, %%mm0 \n\t"
1257
"psllq $8, %%mm4 \n\t"
1258
"psllq $16, %%mm5 \n\t"
1259
"por %%mm4, %%mm3 \n\t"
1260
"por %%mm5, %%mm3 \n\t"
1261
MOVNTQ" %%mm0, %0 \n\t"
1262
MOVNTQ" %%mm3, 8%0 \n\t"
1264
:"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
1269
__asm __volatile(SFENCE:::"memory");
1270
__asm __volatile(EMMS:::"memory");
1274
#if 0 //slightly slower on athlon
1276
*((uint32_t*)d)++ = ((bgr&0x1F)<<3) + ((bgr&0x3E0)<<6) + ((bgr&0x7C00)<<9);
1278
register uint16_t bgr;
1280
#ifdef WORDS_BIGENDIAN
1282
*d++ = (bgr&0x7C00)>>7;
1283
*d++ = (bgr&0x3E0)>>2;
1284
*d++ = (bgr&0x1F)<<3;
1286
*d++ = (bgr&0x1F)<<3;
1287
*d++ = (bgr&0x3E0)>>2;
1288
*d++ = (bgr&0x7C00)>>7;
1296
static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, long src_size)
1298
const uint16_t *end;
1300
const uint16_t *mm_end;
1302
uint8_t *d = (uint8_t *)dst;
1303
const uint16_t *s = (uint16_t *)src;
1304
end = s + src_size/2;
1306
__asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
1307
__asm __volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
1312
PREFETCH" 32%1 \n\t"
1313
"movq %1, %%mm0 \n\t"
1314
"movq %1, %%mm1 \n\t"
1315
"movq %1, %%mm2 \n\t"
1316
"pand %2, %%mm0 \n\t"
1317
"pand %3, %%mm1 \n\t"
1318
"pand %4, %%mm2 \n\t"
1319
"psllq $3, %%mm0 \n\t"
1320
"psrlq $3, %%mm1 \n\t"
1321
"psrlq $8, %%mm2 \n\t"
1322
"movq %%mm0, %%mm3 \n\t"
1323
"movq %%mm1, %%mm4 \n\t"
1324
"movq %%mm2, %%mm5 \n\t"
1325
"punpcklwd %%mm7, %%mm0 \n\t"
1326
"punpcklwd %%mm7, %%mm1 \n\t"
1327
"punpcklwd %%mm7, %%mm2 \n\t"
1328
"punpckhwd %%mm7, %%mm3 \n\t"
1329
"punpckhwd %%mm7, %%mm4 \n\t"
1330
"punpckhwd %%mm7, %%mm5 \n\t"
1331
"psllq $8, %%mm1 \n\t"
1332
"psllq $16, %%mm2 \n\t"
1333
"por %%mm1, %%mm0 \n\t"
1334
"por %%mm2, %%mm0 \n\t"
1335
"psllq $8, %%mm4 \n\t"
1336
"psllq $16, %%mm5 \n\t"
1337
"por %%mm4, %%mm3 \n\t"
1338
"por %%mm5, %%mm3 \n\t"
1339
MOVNTQ" %%mm0, %0 \n\t"
1340
MOVNTQ" %%mm3, 8%0 \n\t"
1342
:"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
1347
__asm __volatile(SFENCE:::"memory");
1348
__asm __volatile(EMMS:::"memory");
1352
register uint16_t bgr;
1354
#ifdef WORDS_BIGENDIAN
1356
*d++ = (bgr&0xF800)>>8;
1357
*d++ = (bgr&0x7E0)>>3;
1358
*d++ = (bgr&0x1F)<<3;
1360
*d++ = (bgr&0x1F)<<3;
1361
*d++ = (bgr&0x7E0)>>3;
1362
*d++ = (bgr&0xF800)>>8;
1368
static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
1370
long idx = 15 - src_size;
1371
uint8_t *s = (uint8_t *) src-idx, *d = dst-idx;
1376
PREFETCH" (%1, %0) \n\t"
1377
"movq %3, %%mm7 \n\t"
1378
"pxor %4, %%mm7 \n\t"
1379
"movq %%mm7, %%mm6 \n\t"
1380
"pxor %5, %%mm7 \n\t"
1383
PREFETCH" 32(%1, %0) \n\t"
1384
"movq (%1, %0), %%mm0 \n\t"
1385
"movq 8(%1, %0), %%mm1 \n\t"
1387
"pshufw $177, %%mm0, %%mm3 \n\t"
1388
"pshufw $177, %%mm1, %%mm5 \n\t"
1389
"pand %%mm7, %%mm0 \n\t"
1390
"pand %%mm6, %%mm3 \n\t"
1391
"pand %%mm7, %%mm1 \n\t"
1392
"pand %%mm6, %%mm5 \n\t"
1393
"por %%mm3, %%mm0 \n\t"
1394
"por %%mm5, %%mm1 \n\t"
1396
"movq %%mm0, %%mm2 \n\t"
1397
"movq %%mm1, %%mm4 \n\t"
1398
"pand %%mm7, %%mm0 \n\t"
1399
"pand %%mm6, %%mm2 \n\t"
1400
"pand %%mm7, %%mm1 \n\t"
1401
"pand %%mm6, %%mm4 \n\t"
1402
"movq %%mm2, %%mm3 \n\t"
1403
"movq %%mm4, %%mm5 \n\t"
1404
"pslld $16, %%mm2 \n\t"
1405
"psrld $16, %%mm3 \n\t"
1406
"pslld $16, %%mm4 \n\t"
1407
"psrld $16, %%mm5 \n\t"
1408
"por %%mm2, %%mm0 \n\t"
1409
"por %%mm4, %%mm1 \n\t"
1410
"por %%mm3, %%mm0 \n\t"
1411
"por %%mm5, %%mm1 \n\t"
1413
MOVNTQ" %%mm0, (%2, %0) \n\t"
1414
MOVNTQ" %%mm1, 8(%2, %0) \n\t"
1421
: "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one)
1424
for (; idx<15; idx+=4) {
1425
register int v = *(uint32_t *)&s[idx], g = v & 0xff00ff00;
1427
*(uint32_t *)&d[idx] = (v>>16) + g + (v<<16);
1431
static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
1435
long mmx_size= 23 - src_size;
1437
"test %%"REG_a", %%"REG_a" \n\t"
1439
"movq "MANGLE(mask24r)", %%mm5 \n\t"
1440
"movq "MANGLE(mask24g)", %%mm6 \n\t"
1441
"movq "MANGLE(mask24b)", %%mm7 \n\t"
1444
PREFETCH" 32(%1, %%"REG_a") \n\t"
1445
"movq (%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
1446
"movq (%1, %%"REG_a"), %%mm1 \n\t" // BGR BGR BG
1447
"movq 2(%1, %%"REG_a"), %%mm2 \n\t" // R BGR BGR B
1448
"psllq $16, %%mm0 \n\t" // 00 BGR BGR
1449
"pand %%mm5, %%mm0 \n\t"
1450
"pand %%mm6, %%mm1 \n\t"
1451
"pand %%mm7, %%mm2 \n\t"
1452
"por %%mm0, %%mm1 \n\t"
1453
"por %%mm2, %%mm1 \n\t"
1454
"movq 6(%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
1455
MOVNTQ" %%mm1, (%2, %%"REG_a") \n\t" // RGB RGB RG
1456
"movq 8(%1, %%"REG_a"), %%mm1 \n\t" // R BGR BGR B
1457
"movq 10(%1, %%"REG_a"), %%mm2 \n\t" // GR BGR BGR
1458
"pand %%mm7, %%mm0 \n\t"
1459
"pand %%mm5, %%mm1 \n\t"
1460
"pand %%mm6, %%mm2 \n\t"
1461
"por %%mm0, %%mm1 \n\t"
1462
"por %%mm2, %%mm1 \n\t"
1463
"movq 14(%1, %%"REG_a"), %%mm0 \n\t" // R BGR BGR B
1464
MOVNTQ" %%mm1, 8(%2, %%"REG_a") \n\t" // B RGB RGB R
1465
"movq 16(%1, %%"REG_a"), %%mm1 \n\t" // GR BGR BGR
1466
"movq 18(%1, %%"REG_a"), %%mm2 \n\t" // BGR BGR BG
1467
"pand %%mm6, %%mm0 \n\t"
1468
"pand %%mm7, %%mm1 \n\t"
1469
"pand %%mm5, %%mm2 \n\t"
1470
"por %%mm0, %%mm1 \n\t"
1471
"por %%mm2, %%mm1 \n\t"
1472
MOVNTQ" %%mm1, 16(%2, %%"REG_a") \n\t"
1473
"add $24, %%"REG_a" \n\t"
1477
: "r" (src-mmx_size), "r"(dst-mmx_size)
1480
__asm __volatile(SFENCE:::"memory");
1481
__asm __volatile(EMMS:::"memory");
1483
if (mmx_size==23) return; //finished, was multiple of 8
1487
src_size= 23-mmx_size;
1491
for (i=0; i<src_size; i+=3)
1495
dst[i + 1] = src[i + 1];
1496
dst[i + 2] = src[i + 0];
1501
/*
 * Interleave planar Y, U and V into packed YUY2 (byte order Y0 U Y1 V).
 *
 * ysrc, usrc, vsrc - source planes
 * dst              - packed output
 * width, height    - picture size in luma samples (chromWidth is width>>1)
 * lumStride, chromStride, dstStride - strides of the luma plane, the chroma
 *                    planes and dst
 * vertLumPerChroma - number of luma lines that share one chroma line; usrc and
 *                    vsrc only advance when (y & (vertLumPerChroma-1)) equals
 *                    vertLumPerChroma-1, so the value is presumably a power of two
 *
 * One of four per-line implementations is used: MMX inline asm, Alpha MVI
 * asm, a 64-bit C loop or a 32-bit C loop.
 */
static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1502
long width, long height,
1503
long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
1506
const long chromWidth= width>>1;
1507
for (y=0; y<height; y++)
1510
//FIXME handle 2 lines at once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
1512
/* REG_a counts chroma samples; each pass reads 8 U, 8 V, 16 Y and writes 32 bytes. */
"xor %%"REG_a", %%"REG_a" \n\t"
1515
PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
1516
PREFETCH" 32(%2, %%"REG_a") \n\t"
1517
PREFETCH" 32(%3, %%"REG_a") \n\t"
1518
"movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
1519
"movq %%mm0, %%mm2 \n\t" // U(0)
1520
"movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
1521
"punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1522
"punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1524
"movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
1525
"movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
1526
"movq %%mm3, %%mm4 \n\t" // Y(0)
1527
"movq %%mm5, %%mm6 \n\t" // Y(8)
1528
"punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
1529
"punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
1530
"punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
1531
"punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)
1533
MOVNTQ" %%mm3, (%0, %%"REG_a", 4) \n\t"
1534
MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
1535
MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4) \n\t"
1536
MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"
1538
"add $8, %%"REG_a" \n\t"
1539
"cmp %4, %%"REG_a" \n\t"
1541
::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1546
/* Alpha MVI variant: works on this line and the next one (dst + dstStride, ysrc + lumStride). */
#if defined ARCH_ALPHA && defined HAVE_MVI
1547
#define pl2yuy2(n) \
1552
asm("unpkbw %1, %0" : "=r"(y1) : "r"(y1)); \
1553
asm("unpkbw %1, %0" : "=r"(y2) : "r"(y2)); \
1554
asm("unpkbl %1, %0" : "=r"(u) : "r"(u)); \
1555
asm("unpkbl %1, %0" : "=r"(v) : "r"(v)); \
1556
yuv1 = (u << 8) + (v << 24); \
1563
uint64_t *qdst = (uint64_t *) dst;
1564
uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
1565
const uint32_t *yc = (uint32_t *) ysrc;
1566
const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
1567
const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
1568
for (i = 0; i < chromWidth; i += 8){
1569
uint64_t y1, y2, yuv1, yuv2;
1572
/* Loads into $31 discard the result; presumably used as prefetches - TODO confirm. */
asm("ldq $31,64(%0)" :: "r"(yc));
1573
asm("ldq $31,64(%0)" :: "r"(yc2));
1574
asm("ldq $31,64(%0)" :: "r"(uc));
1575
asm("ldq $31,64(%0)" :: "r"(vc));
1593
/* Generic 64-bit variant: two chroma samples (four pixels) per 64-bit store. */
#elif __WORDSIZE >= 64
1595
uint64_t *ldst = (uint64_t *) dst;
1596
const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1597
for (i = 0; i < chromWidth; i += 2){
1599
/* NOTE(review): (vc[0] << 24) is evaluated as a signed int. For vc[0] >= 128
 * the value does not fit, and if k and l are 64-bit (their declaration is not
 * visible here) a negative intermediate would be sign-extended into the upper
 * half before "k + (l << 32)". An unsigned cast on the shifted byte avoids it. */
k = yc[0] + (uc[0] << 8) +
1600
(yc[1] << 16) + (vc[0] << 24);
1601
l = yc[2] + (uc[1] << 8) +
1602
(yc[3] << 16) + (vc[1] << 24);
1603
*ldst++ = k + (l << 32);
1610
/* 32-bit variant: one chroma sample (two pixels) per 32-bit store. */
int i, *idst = (int32_t *) dst;
1611
const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1612
for (i = 0; i < chromWidth; i++){
1613
/* Both branches produce the same byte order in memory: Y0 U Y1 V. */
#ifdef WORDS_BIGENDIAN
1614
*idst++ = (yc[0] << 24)+ (uc[0] << 16) +
1615
(yc[1] << 8) + (vc[0] << 0);
1617
*idst++ = yc[0] + (uc[0] << 8) +
1618
(yc[1] << 16) + (vc[0] << 24);
1626
/* Step to the next chroma line only after the last luma line that shares it. */
if ((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
1628
usrc += chromStride;
1629
vsrc += chromStride;
1643
* height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1644
* problem for anyone then tell me, and I'll fix it)
1646
static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1647
long width, long height,
1648
long lumStride, long chromStride, long dstStride)
1650
//FIXME interpolate chroma
1651
RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1654
/*
 * Interleave planar Y, U and V into packed UYVY (byte order U Y0 V Y1).
 *
 * ysrc, usrc, vsrc - source planes
 * dst              - packed output
 * width, height    - picture size in luma samples (chromWidth is width>>1)
 * chromStride      - stride of the chroma planes
 * vertLumPerChroma - number of luma lines that share one chroma line; usrc and
 *                    vsrc only advance when (y & (vertLumPerChroma-1)) equals
 *                    vertLumPerChroma-1, so the value is presumably a power of two
 * lumStride and dstStride are not referenced in the lines visible here.
 */
static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1655
long width, long height,
1656
long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
1659
const long chromWidth= width>>1;
1660
for (y=0; y<height; y++)
1663
//FIXME handle 2 lines at once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
1665
/* REG_a counts chroma samples; each pass reads 8 U, 8 V, 16 Y and writes 32 bytes. */
"xor %%"REG_a", %%"REG_a" \n\t"
1668
PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
1669
PREFETCH" 32(%2, %%"REG_a") \n\t"
1670
PREFETCH" 32(%3, %%"REG_a") \n\t"
1671
"movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
1672
"movq %%mm0, %%mm2 \n\t" // U(0)
1673
"movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
1674
"punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1675
"punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1677
"movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
1678
"movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
1679
"movq %%mm0, %%mm4 \n\t" // UVUV UVUV(0)
1680
"movq %%mm2, %%mm6 \n\t" // UVUV UVUV(8)
1681
"punpcklbw %%mm3, %%mm0 \n\t" // UYVY UYVY(0)
1682
"punpckhbw %%mm3, %%mm4 \n\t" // UYVY UYVY(4)
1683
"punpcklbw %%mm5, %%mm2 \n\t" // UYVY UYVY(8)
1684
"punpckhbw %%mm5, %%mm6 \n\t" // UYVY UYVY(12)
1686
MOVNTQ" %%mm0, (%0, %%"REG_a", 4) \n\t"
1687
MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
1688
MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4) \n\t"
1689
MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"
1691
"add $8, %%"REG_a" \n\t"
1692
"cmp %4, %%"REG_a" \n\t"
1694
::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1698
//FIXME adapt the alpha asm code from yv12->yuy2
1700
/* Generic 64-bit variant: two chroma samples (four pixels) per 64-bit store. */
#if __WORDSIZE >= 64
1702
uint64_t *ldst = (uint64_t *) dst;
1703
const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1704
for (i = 0; i < chromWidth; i += 2){
1706
/* NOTE(review): (yc[1] << 24) is evaluated as a signed int. For yc[1] >= 128
 * the value does not fit, and if k and l are 64-bit (their declaration is not
 * visible here) a negative intermediate would be sign-extended into the upper
 * half before "k + (l << 32)". An unsigned cast on the shifted byte avoids it. */
k = uc[0] + (yc[0] << 8) +
1707
(vc[0] << 16) + (yc[1] << 24);
1708
l = uc[1] + (yc[2] << 8) +
1709
(vc[1] << 16) + (yc[3] << 24);
1710
*ldst++ = k + (l << 32);
1717
/* 32-bit variant: one chroma sample (two pixels) per 32-bit store. */
int i, *idst = (int32_t *) dst;
1718
const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1719
for (i = 0; i < chromWidth; i++){
1720
/* Both branches produce the same byte order in memory: U Y0 V Y1. */
#ifdef WORDS_BIGENDIAN
1721
*idst++ = (uc[0] << 24)+ (yc[0] << 16) +
1722
(vc[0] << 8) + (yc[1] << 0);
1724
*idst++ = uc[0] + (yc[0] << 8) +
1725
(vc[0] << 16) + (yc[1] << 24);
1733
/* Step to the next chroma line only after the last luma line that shares it. */
if ((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
1735
usrc += chromStride;
1736
vsrc += chromStride;
1750
* height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1751
* problem for anyone then tell me, and I'll fix it)
1753
static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1754
long width, long height,
1755
long lumStride, long chromStride, long dstStride)
1757
//FIXME interpolate chroma
1758
RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1763
* width should be a multiple of 16
1765
static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1766
long width, long height,
1767
long lumStride, long chromStride, long dstStride)
1769
RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1774
* height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1775
* problem for anyone then tell me, and I'll fix it)
1777
/*
 * Split packed YUY2 (Y0 U Y1 V) into separate Y, U and V planes.
 *
 * src              - packed input
 * ydst, udst, vdst - destination planes
 * width, height    - picture size in luma samples (chromWidth is width>>1)
 * chromStride      - stride of the chroma planes
 *
 * The outer loop consumes two source lines per iteration: the first pass
 * writes Y, U and V, the second pass writes Y only, so chroma is taken from
 * every other line. udst/vdst advance by chromStride once per line pair.
 * NOTE(review): the statements that advance ydst/src between and after the two
 * passes (presumably by lumStride/srcStride) are not visible in this extract.
 */
static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1778
long width, long height,
1779
long lumStride, long chromStride, long srcStride)
1782
const long chromWidth= width>>1;
1783
for (y=0; y<height; y+=2)
1787
/* First line of the pair: extract Y, U and V. Each pass reads 32 packed bytes. */
"xor %%"REG_a", %%"REG_a" \n\t"
1788
"pcmpeqw %%mm7, %%mm7 \n\t"
1789
"psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1792
PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1793
"movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1794
"movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
1795
"movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
1796
"movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
1797
"psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
1798
"psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
1799
"pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
1800
"pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
1801
"packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1802
"packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
1804
MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"
1806
"movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(8)
1807
"movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(12)
1808
"movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
1809
"movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
1810
"psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
1811
"psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
1812
"pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
1813
"pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
1814
"packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
1815
"packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
1817
MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"
1819
/* Separate the interleaved UV bytes into a V register and a U register. */
"movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
1820
"movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
1821
"psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
1822
"psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
1823
"pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
1824
"pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
1825
"packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
1826
"packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
1828
MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
1829
MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
1831
"add $8, %%"REG_a" \n\t"
1832
"cmp %4, %%"REG_a" \n\t"
1834
::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1835
: "memory", "%"REG_a
1842
/* Second line of the pair: only Y is extracted; mm7 still holds the byte mask. */
"xor %%"REG_a", %%"REG_a" \n\t"
1845
PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1846
"movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1847
"movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
1848
"movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8)
1849
"movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12)
1850
"pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
1851
"pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
1852
"pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
1853
"pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
1854
"packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
1855
"packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
1857
MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
1858
MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"
1860
"add $8, %%"REG_a" \n\t"
1861
"cmp %4, %%"REG_a" \n\t"
1864
::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1865
: "memory", "%"REG_a
1869
/* Plain C variant, first line of the pair: Y, U and V. */
for (i=0; i<chromWidth; i++)
1871
ydst[2*i+0] = src[4*i+0];
1872
udst[i] = src[4*i+1];
1873
ydst[2*i+1] = src[4*i+2];
1874
vdst[i] = src[4*i+3];
1879
/* Plain C variant, second line of the pair: Y only. */
for (i=0; i<chromWidth; i++)
1881
ydst[2*i+0] = src[4*i+0];
1882
ydst[2*i+1] = src[4*i+2];
1885
udst += chromStride;
1886
vdst += chromStride;
1891
asm volatile( EMMS" \n\t"
1897
static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
1898
uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1899
long width, long height, long lumStride, long chromStride)
1902
memcpy(ydst, ysrc, width*height);
1904
/* XXX: implement upscaling for U,V */
1907
/*
 * Upscale one plane by a factor of two using 3:1 weighted interpolation.
 *
 * src, srcWidth, srcHeight, srcStride - input plane
 * dst, dstStride                      - output plane
 *
 * The first output line is interpolated horizontally only. For y = 1 and up,
 * output lines are built from src and the line at src + srcStride; with MMX2
 * or 3DNow the bulk of each line is done in inline asm and the C loop finishes
 * the remaining columns. A horizontal-only pass like the first one appears
 * again near the end.
 * NOTE(review): pointer updates between lines, the asm opener/labels and the
 * operand that supplies %4 are not visible in this extract.
 */
static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, long srcWidth, long srcHeight, long srcStride, long dstStride)
1914
/* First line: horizontal 3:1 interpolation between neighbouring samples. */
for (x=0; x<srcWidth-1; x++){
1915
dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1916
dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1918
dst[2*srcWidth-1]= src[srcWidth-1];
1922
for (y=1; y<srcHeight; y++){
1923
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1924
/* Number of columns covered by the asm loop (srcWidth rounded down to a multiple of 16). */
const long mmxSize= srcWidth&~15;
1926
"mov %4, %%"REG_a" \n\t"
1928
"movq (%0, %%"REG_a"), %%mm0 \n\t"
1929
"movq (%1, %%"REG_a"), %%mm1 \n\t"
1930
"movq 1(%0, %%"REG_a"), %%mm2 \n\t"
1931
"movq 1(%1, %%"REG_a"), %%mm3 \n\t"
1932
"movq -1(%0, %%"REG_a"), %%mm4 \n\t"
1933
"movq -1(%1, %%"REG_a"), %%mm5 \n\t"
1934
/* Averaging twice against the same register approximates a 3:1 weighting. */
PAVGB" %%mm0, %%mm5 \n\t"
1935
PAVGB" %%mm0, %%mm3 \n\t"
1936
PAVGB" %%mm0, %%mm5 \n\t"
1937
PAVGB" %%mm0, %%mm3 \n\t"
1938
PAVGB" %%mm1, %%mm4 \n\t"
1939
PAVGB" %%mm1, %%mm2 \n\t"
1940
PAVGB" %%mm1, %%mm4 \n\t"
1941
PAVGB" %%mm1, %%mm2 \n\t"
1942
"movq %%mm5, %%mm7 \n\t"
1943
"movq %%mm4, %%mm6 \n\t"
1944
"punpcklbw %%mm3, %%mm5 \n\t"
1945
"punpckhbw %%mm3, %%mm7 \n\t"
1946
"punpcklbw %%mm2, %%mm4 \n\t"
1947
"punpckhbw %%mm2, %%mm6 \n\t"
1949
/* Two store variants follow (MOVNTQ and plain movq); the preprocessor
 * condition that selects between them is not visible in this extract. */
MOVNTQ" %%mm5, (%2, %%"REG_a", 2) \n\t"
1950
MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2) \n\t"
1951
MOVNTQ" %%mm4, (%3, %%"REG_a", 2) \n\t"
1952
MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2) \n\t"
1954
"movq %%mm5, (%2, %%"REG_a", 2) \n\t"
1955
"movq %%mm7, 8(%2, %%"REG_a", 2) \n\t"
1956
"movq %%mm4, (%3, %%"REG_a", 2) \n\t"
1957
"movq %%mm6, 8(%3, %%"REG_a", 2) \n\t"
1959
"add $8, %%"REG_a" \n\t"
1961
/* Base pointers are biased by mmxSize (twice that for dst). */
:: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ),
1962
"r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
1968
/* Without the asm path the C loop below starts at column 0. */
const long mmxSize=1;
1970
/* Left edge: vertical interpolation only. */
dst[0 ]= (3*src[0] + src[srcStride])>>2;
1971
dst[dstStride]= ( src[0] + 3*src[srcStride])>>2;
1973
/* Each output sample mixes a sample of one line with the diagonal neighbour on the other line. */
for (x=mmxSize-1; x<srcWidth-1; x++){
1974
dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2;
1975
dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2;
1976
dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2;
1977
dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2;
1979
/* Right edge: vertical interpolation only. */
dst[srcWidth*2 -1 ]= (3*src[srcWidth-1] + src[srcWidth-1 + srcStride])>>2;
1980
dst[srcWidth*2 -1 + dstStride]= ( src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
1990
/* Same horizontal-only interpolation as for the first line. */
for (x=0; x<srcWidth-1; x++){
1991
dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1992
dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1994
dst[2*srcWidth-1]= src[srcWidth-1];
1996
for (x=0; x<srcWidth; x++){
2003
asm volatile( EMMS" \n\t"
2011
* height should be a multiple of 2 and width should be a multiple of 16 (if this is a
2012
* problem for anyone then tell me, and I'll fix it)
2013
* chrominance data is only taken from every second line, others are ignored FIXME write HQ version
2015
/*
 * Split packed UYVY (U Y0 V Y1) into separate Y, U and V planes.
 *
 * src              - packed input
 * ydst, udst, vdst - destination planes
 * width, height    - picture size in luma samples (chromWidth is width>>1)
 * chromStride      - stride of the chroma planes
 *
 * The outer loop consumes two source lines per iteration: the first pass
 * writes Y, U and V, the second pass writes Y only, so chroma is taken from
 * every other line. udst/vdst advance by chromStride once per line pair.
 *
 * NOTE(review): unlike yuy2toyv12, the asm here hard-codes the 32-bit %%eax
 * with xorl/addl/cmpl while chromWidth is a long and the base operands are
 * pointers. On a 64-bit x86 target that mixes 64-bit bases with a 32-bit
 * index and compares a long with cmpl; using REG_a as the sibling does looks
 * like the intended form - confirm before enabling this function.
 * NOTE(review): the statements that advance ydst/src between and after the two
 * passes are not visible in this extract.
 */
static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2016
long width, long height,
2017
long lumStride, long chromStride, long srcStride)
2020
const long chromWidth= width>>1;
2021
for (y=0; y<height; y+=2)
2025
/* First line of the pair: extract Y, U and V. Each pass reads 32 packed bytes. */
"xorl %%eax, %%eax \n\t"
2026
"pcmpeqw %%mm7, %%mm7 \n\t"
2027
"psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
2030
PREFETCH" 64(%0, %%eax, 4) \n\t"
2031
"movq (%0, %%eax, 4), %%mm0 \n\t" // UYVY UYVY(0)
2032
"movq 8(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(4)
2033
"movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
2034
"movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
2035
"pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
2036
"pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
2037
"psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
2038
"psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
2039
"packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
2040
"packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
2042
MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t"
2044
"movq 16(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(8)
2045
"movq 24(%0, %%eax, 4), %%mm2 \n\t" // UYVY UYVY(12)
2046
"movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
2047
"movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
2048
"pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
2049
"pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
2050
"psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
2051
"psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
2052
"packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
2053
"packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
2055
MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t"
2057
/* Separate the interleaved UV bytes into a V register and a U register. */
"movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
2058
"movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
2059
"psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
2060
"psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
2061
"pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
2062
"pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
2063
"packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
2064
"packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
2066
MOVNTQ" %%mm0, (%3, %%eax) \n\t"
2067
MOVNTQ" %%mm2, (%2, %%eax) \n\t"
2069
"addl $8, %%eax \n\t"
2070
"cmpl %4, %%eax \n\t"
2072
::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
2080
/* Second line of the pair: only Y (the high byte of every 16-bit word) is extracted. */
"xorl %%eax, %%eax \n\t"
2083
PREFETCH" 64(%0, %%eax, 4) \n\t"
2084
"movq (%0, %%eax, 4), %%mm0 \n\t" // UYVY UYVY(0)
2085
"movq 8(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(4)
2086
"movq 16(%0, %%eax, 4), %%mm2 \n\t" // UYVY UYVY(8)
2087
"movq 24(%0, %%eax, 4), %%mm3 \n\t" // UYVY UYVY(12)
2088
"psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
2089
"psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
2090
"psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
2091
"psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
2092
"packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
2093
"packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
2095
MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t"
2096
MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t"
2098
"addl $8, %%eax \n\t"
2099
"cmpl %4, %%eax \n\t"
2102
::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
2107
/* Plain C variant, first line of the pair: Y, U and V. */
for (i=0; i<chromWidth; i++)
2109
udst[i] = src[4*i+0];
2110
ydst[2*i+0] = src[4*i+1];
2111
vdst[i] = src[4*i+2];
2112
ydst[2*i+1] = src[4*i+3];
2117
/* Plain C variant, second line of the pair: Y only. */
for (i=0; i<chromWidth; i++)
2119
ydst[2*i+0] = src[4*i+1];
2120
ydst[2*i+1] = src[4*i+3];
2123
udst += chromStride;
2124
vdst += chromStride;
2129
asm volatile( EMMS" \n\t"
2137
* height should be a multiple of 2 and width should be a multiple of 2 (if this is a
2138
* problem for anyone then tell me, and I'll fix it)
2139
* chrominance data is only taken from every second line, others are ignored in the C version FIXME write HQ version
2141
/*
 * Convert packed 24-bit pixels (byte order B, G, R) to planar YV12.
 *
 * src              - packed input
 * ydst, udst, vdst - destination planes
 * width, height    - picture size in pixels (chromWidth is width>>1)
 * chromStride, srcStride - stride of the chroma planes and of src
 *
 * The first outer loop (y < height-2) uses inline asm: one block produces
 * luma, a second block produces chroma from two source lines (src and
 * src + srcStride). The trailing C loop starts at whatever y is left and
 * handles the remaining lines two at a time.
 * NOTE(review): asm openers, labels/branches, pointer updates for ydst/src and
 * the stores of the C loop are not visible in this extract.
 */
static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2142
long width, long height,
2143
long lumStride, long chromStride, long srcStride)
2146
const long chromWidth= width>>1;
2148
for (y=0; y<height-2; y+=2)
2154
/* Luma: REG_a starts at -width and counts up; REG_d = 3*REG_a is the byte
 * offset into src. Each pass converts 8 pixels (24 bytes) into 8 Y bytes. */
"mov %2, %%"REG_a" \n\t"
2155
"movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t"
2156
"movq "MANGLE(w1111)", %%mm5 \n\t"
2157
"pxor %%mm7, %%mm7 \n\t"
2158
"lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
2161
PREFETCH" 64(%0, %%"REG_d") \n\t"
2162
"movd (%0, %%"REG_d"), %%mm0 \n\t"
2163
"movd 3(%0, %%"REG_d"), %%mm1 \n\t"
2164
"punpcklbw %%mm7, %%mm0 \n\t"
2165
"punpcklbw %%mm7, %%mm1 \n\t"
2166
"movd 6(%0, %%"REG_d"), %%mm2 \n\t"
2167
"movd 9(%0, %%"REG_d"), %%mm3 \n\t"
2168
"punpcklbw %%mm7, %%mm2 \n\t"
2169
"punpcklbw %%mm7, %%mm3 \n\t"
2170
"pmaddwd %%mm6, %%mm0 \n\t"
2171
"pmaddwd %%mm6, %%mm1 \n\t"
2172
"pmaddwd %%mm6, %%mm2 \n\t"
2173
"pmaddwd %%mm6, %%mm3 \n\t"
2174
#ifndef FAST_BGR2YV12
2175
"psrad $8, %%mm0 \n\t"
2176
"psrad $8, %%mm1 \n\t"
2177
"psrad $8, %%mm2 \n\t"
2178
"psrad $8, %%mm3 \n\t"
2180
"packssdw %%mm1, %%mm0 \n\t"
2181
"packssdw %%mm3, %%mm2 \n\t"
2182
"pmaddwd %%mm5, %%mm0 \n\t"
2183
"pmaddwd %%mm5, %%mm2 \n\t"
2184
"packssdw %%mm2, %%mm0 \n\t"
2185
"psraw $7, %%mm0 \n\t"
2187
/* Pixels 4..7 of the group. */
"movd 12(%0, %%"REG_d"), %%mm4 \n\t"
2188
"movd 15(%0, %%"REG_d"), %%mm1 \n\t"
2189
"punpcklbw %%mm7, %%mm4 \n\t"
2190
"punpcklbw %%mm7, %%mm1 \n\t"
2191
"movd 18(%0, %%"REG_d"), %%mm2 \n\t"
2192
"movd 21(%0, %%"REG_d"), %%mm3 \n\t"
2193
"punpcklbw %%mm7, %%mm2 \n\t"
2194
"punpcklbw %%mm7, %%mm3 \n\t"
2195
"pmaddwd %%mm6, %%mm4 \n\t"
2196
"pmaddwd %%mm6, %%mm1 \n\t"
2197
"pmaddwd %%mm6, %%mm2 \n\t"
2198
"pmaddwd %%mm6, %%mm3 \n\t"
2199
#ifndef FAST_BGR2YV12
2200
"psrad $8, %%mm4 \n\t"
2201
"psrad $8, %%mm1 \n\t"
2202
"psrad $8, %%mm2 \n\t"
2203
"psrad $8, %%mm3 \n\t"
2205
"packssdw %%mm1, %%mm4 \n\t"
2206
"packssdw %%mm3, %%mm2 \n\t"
2207
"pmaddwd %%mm5, %%mm4 \n\t"
2208
"pmaddwd %%mm5, %%mm2 \n\t"
2209
"add $24, %%"REG_d" \n\t"
2210
"packssdw %%mm2, %%mm4 \n\t"
2211
"psraw $7, %%mm4 \n\t"
2213
"packuswb %%mm4, %%mm0 \n\t"
2214
"paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t"
2216
MOVNTQ" %%mm0, (%1, %%"REG_a") \n\t"
2217
"add $8, %%"REG_a" \n\t"
2219
/* Bases point one line past the start so the negative index walks up to zero. */
: : "r" (src+width*3), "r" (ydst+width), "g" (-width)
2220
: "%"REG_a, "%"REG_d
2227
/* Chroma: REG_a starts at -chromWidth; REG_d = 6*REG_a is the byte offset
 * into src. %0 and %1 address two vertically adjacent source lines. Each pass
 * writes 4 U bytes and 4 V bytes. */
"mov %4, %%"REG_a" \n\t"
2228
"movq "MANGLE(w1111)", %%mm5 \n\t"
2229
"movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t"
2230
"pxor %%mm7, %%mm7 \n\t"
2231
"lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
2232
"add %%"REG_d", %%"REG_d" \n\t"
2235
PREFETCH" 64(%0, %%"REG_d") \n\t"
2236
PREFETCH" 64(%1, %%"REG_d") \n\t"
2237
/* With PAVGB available the 2x2 pixel average is built from byte averages;
 * the alternative below sums widened words and shifts right by 2. */
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2238
"movq (%0, %%"REG_d"), %%mm0 \n\t"
2239
"movq (%1, %%"REG_d"), %%mm1 \n\t"
2240
"movq 6(%0, %%"REG_d"), %%mm2 \n\t"
2241
"movq 6(%1, %%"REG_d"), %%mm3 \n\t"
2242
PAVGB" %%mm1, %%mm0 \n\t"
2243
PAVGB" %%mm3, %%mm2 \n\t"
2244
"movq %%mm0, %%mm1 \n\t"
2245
"movq %%mm2, %%mm3 \n\t"
2246
"psrlq $24, %%mm0 \n\t"
2247
"psrlq $24, %%mm2 \n\t"
2248
PAVGB" %%mm1, %%mm0 \n\t"
2249
PAVGB" %%mm3, %%mm2 \n\t"
2250
"punpcklbw %%mm7, %%mm0 \n\t"
2251
"punpcklbw %%mm7, %%mm2 \n\t"
2253
"movd (%0, %%"REG_d"), %%mm0 \n\t"
2254
"movd (%1, %%"REG_d"), %%mm1 \n\t"
2255
"movd 3(%0, %%"REG_d"), %%mm2 \n\t"
2256
"movd 3(%1, %%"REG_d"), %%mm3 \n\t"
2257
"punpcklbw %%mm7, %%mm0 \n\t"
2258
"punpcklbw %%mm7, %%mm1 \n\t"
2259
"punpcklbw %%mm7, %%mm2 \n\t"
2260
"punpcklbw %%mm7, %%mm3 \n\t"
2261
"paddw %%mm1, %%mm0 \n\t"
2262
"paddw %%mm3, %%mm2 \n\t"
2263
"paddw %%mm2, %%mm0 \n\t"
2264
"movd 6(%0, %%"REG_d"), %%mm4 \n\t"
2265
"movd 6(%1, %%"REG_d"), %%mm1 \n\t"
2266
"movd 9(%0, %%"REG_d"), %%mm2 \n\t"
2267
"movd 9(%1, %%"REG_d"), %%mm3 \n\t"
2268
"punpcklbw %%mm7, %%mm4 \n\t"
2269
"punpcklbw %%mm7, %%mm1 \n\t"
2270
"punpcklbw %%mm7, %%mm2 \n\t"
2271
"punpcklbw %%mm7, %%mm3 \n\t"
2272
"paddw %%mm1, %%mm4 \n\t"
2273
"paddw %%mm3, %%mm2 \n\t"
2274
"paddw %%mm4, %%mm2 \n\t"
2275
"psrlw $2, %%mm0 \n\t"
2276
"psrlw $2, %%mm2 \n\t"
2278
/* mm6 holds the U coefficients; the V coefficients are loaded into mm1/mm3. */
"movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
2279
"movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
2281
"pmaddwd %%mm0, %%mm1 \n\t"
2282
"pmaddwd %%mm2, %%mm3 \n\t"
2283
"pmaddwd %%mm6, %%mm0 \n\t"
2284
"pmaddwd %%mm6, %%mm2 \n\t"
2285
#ifndef FAST_BGR2YV12
2286
"psrad $8, %%mm0 \n\t"
2287
"psrad $8, %%mm1 \n\t"
2288
"psrad $8, %%mm2 \n\t"
2289
"psrad $8, %%mm3 \n\t"
2291
"packssdw %%mm2, %%mm0 \n\t"
2292
"packssdw %%mm3, %%mm1 \n\t"
2293
"pmaddwd %%mm5, %%mm0 \n\t"
2294
"pmaddwd %%mm5, %%mm1 \n\t"
2295
"packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
2296
"psraw $7, %%mm0 \n\t"
2298
/* Same computation for the next two chroma samples. */
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2299
"movq 12(%0, %%"REG_d"), %%mm4 \n\t"
2300
"movq 12(%1, %%"REG_d"), %%mm1 \n\t"
2301
"movq 18(%0, %%"REG_d"), %%mm2 \n\t"
2302
"movq 18(%1, %%"REG_d"), %%mm3 \n\t"
2303
PAVGB" %%mm1, %%mm4 \n\t"
2304
PAVGB" %%mm3, %%mm2 \n\t"
2305
"movq %%mm4, %%mm1 \n\t"
2306
"movq %%mm2, %%mm3 \n\t"
2307
"psrlq $24, %%mm4 \n\t"
2308
"psrlq $24, %%mm2 \n\t"
2309
PAVGB" %%mm1, %%mm4 \n\t"
2310
PAVGB" %%mm3, %%mm2 \n\t"
2311
"punpcklbw %%mm7, %%mm4 \n\t"
2312
"punpcklbw %%mm7, %%mm2 \n\t"
2314
"movd 12(%0, %%"REG_d"), %%mm4 \n\t"
2315
"movd 12(%1, %%"REG_d"), %%mm1 \n\t"
2316
"movd 15(%0, %%"REG_d"), %%mm2 \n\t"
2317
"movd 15(%1, %%"REG_d"), %%mm3 \n\t"
2318
"punpcklbw %%mm7, %%mm4 \n\t"
2319
"punpcklbw %%mm7, %%mm1 \n\t"
2320
"punpcklbw %%mm7, %%mm2 \n\t"
2321
"punpcklbw %%mm7, %%mm3 \n\t"
2322
"paddw %%mm1, %%mm4 \n\t"
2323
"paddw %%mm3, %%mm2 \n\t"
2324
"paddw %%mm2, %%mm4 \n\t"
2325
"movd 18(%0, %%"REG_d"), %%mm5 \n\t"
2326
"movd 18(%1, %%"REG_d"), %%mm1 \n\t"
2327
"movd 21(%0, %%"REG_d"), %%mm2 \n\t"
2328
"movd 21(%1, %%"REG_d"), %%mm3 \n\t"
2329
"punpcklbw %%mm7, %%mm5 \n\t"
2330
"punpcklbw %%mm7, %%mm1 \n\t"
2331
"punpcklbw %%mm7, %%mm2 \n\t"
2332
"punpcklbw %%mm7, %%mm3 \n\t"
2333
"paddw %%mm1, %%mm5 \n\t"
2334
"paddw %%mm3, %%mm2 \n\t"
2335
"paddw %%mm5, %%mm2 \n\t"
2336
/* mm5 was used as scratch above, so the w1111 constant is reloaded. */
"movq "MANGLE(w1111)", %%mm5 \n\t"
2337
"psrlw $2, %%mm4 \n\t"
2338
"psrlw $2, %%mm2 \n\t"
2340
"movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
2341
"movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
2343
"pmaddwd %%mm4, %%mm1 \n\t"
2344
"pmaddwd %%mm2, %%mm3 \n\t"
2345
"pmaddwd %%mm6, %%mm4 \n\t"
2346
"pmaddwd %%mm6, %%mm2 \n\t"
2347
#ifndef FAST_BGR2YV12
2348
"psrad $8, %%mm4 \n\t"
2349
"psrad $8, %%mm1 \n\t"
2350
"psrad $8, %%mm2 \n\t"
2351
"psrad $8, %%mm3 \n\t"
2353
"packssdw %%mm2, %%mm4 \n\t"
2354
"packssdw %%mm3, %%mm1 \n\t"
2355
"pmaddwd %%mm5, %%mm4 \n\t"
2356
"pmaddwd %%mm5, %%mm1 \n\t"
2357
"add $24, %%"REG_d" \n\t"
2358
"packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
2359
"psraw $7, %%mm4 \n\t"
2361
/* Regroup into four U and four V values, add the offset and store 4 bytes each. */
"movq %%mm0, %%mm1 \n\t"
2362
"punpckldq %%mm4, %%mm0 \n\t"
2363
"punpckhdq %%mm4, %%mm1 \n\t"
2364
"packsswb %%mm1, %%mm0 \n\t"
2365
"paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t"
2366
"movd %%mm0, (%2, %%"REG_a") \n\t"
2367
"punpckhdq %%mm0, %%mm0 \n\t"
2368
"movd %%mm0, (%3, %%"REG_a") \n\t"
2369
"add $4, %%"REG_a" \n\t"
2371
: : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
2372
: "%"REG_a, "%"REG_d
2375
udst += chromStride;
2376
vdst += chromStride;
2380
asm volatile( EMMS" \n\t"
2386
/* C path: continues from the current y and handles the remaining lines in pairs. */
for (; y<height; y+=2)
2389
/* First line of the pair: Y, U and V are computed from the first pixel of
 * each 6-byte pair, then Y is computed again for the second pixel. */
for (i=0; i<chromWidth; i++)
2391
unsigned int b = src[6*i+0];
2392
unsigned int g = src[6*i+1];
2393
unsigned int r = src[6*i+2];
2395
unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2396
unsigned int V = ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
2397
unsigned int U = ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
2407
Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2413
/* Second line of the pair: only Y is computed. */
for (i=0; i<chromWidth; i++)
2415
unsigned int b = src[6*i+0];
2416
unsigned int g = src[6*i+1];
2417
unsigned int r = src[6*i+2];
2419
unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2427
Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2430
udst += chromStride;
2431
vdst += chromStride;
2437
/*
 * Interleave two byte planes: dest[2*w] = src1[w], dest[2*w+1] = src2[w]
 * for every column w of every row.
 *
 * src1, src2            - input planes
 * dest                  - output, twice as many bytes per row as each input
 * width, height         - size of the input planes in bytes / rows
 * src1Stride, src2Stride, dstStride - strides; not referenced in the lines
 *                         visible here (the per-row pointer updates are missing
 *                         from this extract)
 *
 * Two asm variants are visible (xmm registers and mm registers), each handling
 * 16 input columns per pass, followed by a C loop for the columns from
 * width&~15 onwards; a pure C loop over all columns is the fallback.
 */
void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
2438
long width, long height, long src1Stride,
2439
long src2Stride, long dstStride){
2442
for (h=0; h < height; h++)
2449
/* NOTE(review): movdqa and movntdq require 16-byte aligned addresses, so this
 * variant relies on src1, src2 and dest rows being aligned - confirm callers.
 * NOTE(review): the loop body precedes the compare against width-15, so unless
 * a guard exists in lines not visible here, one 16-byte group is processed
 * even when width < 16. */
"xor %%"REG_a", %%"REG_a" \n\t"
2451
PREFETCH" 64(%1, %%"REG_a") \n\t"
2452
PREFETCH" 64(%2, %%"REG_a") \n\t"
2453
"movdqa (%1, %%"REG_a"), %%xmm0 \n\t"
2454
"movdqa (%1, %%"REG_a"), %%xmm1 \n\t"
2455
"movdqa (%2, %%"REG_a"), %%xmm2 \n\t"
2456
"punpcklbw %%xmm2, %%xmm0 \n\t"
2457
"punpckhbw %%xmm2, %%xmm1 \n\t"
2458
"movntdq %%xmm0, (%0, %%"REG_a", 2) \n\t"
2459
"movntdq %%xmm1, 16(%0, %%"REG_a", 2) \n\t"
2460
"add $16, %%"REG_a" \n\t"
2461
"cmp %3, %%"REG_a" \n\t"
2463
::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
2464
: "memory", "%"REG_a""
2468
/* 64-bit register variant: two 8-byte loads per source, four 8-byte stores. */
"xor %%"REG_a", %%"REG_a" \n\t"
2470
PREFETCH" 64(%1, %%"REG_a") \n\t"
2471
PREFETCH" 64(%2, %%"REG_a") \n\t"
2472
"movq (%1, %%"REG_a"), %%mm0 \n\t"
2473
"movq 8(%1, %%"REG_a"), %%mm2 \n\t"
2474
"movq %%mm0, %%mm1 \n\t"
2475
"movq %%mm2, %%mm3 \n\t"
2476
"movq (%2, %%"REG_a"), %%mm4 \n\t"
2477
"movq 8(%2, %%"REG_a"), %%mm5 \n\t"
2478
"punpcklbw %%mm4, %%mm0 \n\t"
2479
"punpckhbw %%mm4, %%mm1 \n\t"
2480
"punpcklbw %%mm5, %%mm2 \n\t"
2481
"punpckhbw %%mm5, %%mm3 \n\t"
2482
MOVNTQ" %%mm0, (%0, %%"REG_a", 2) \n\t"
2483
MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2) \n\t"
2484
MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2) \n\t"
2485
MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2) \n\t"
2486
"add $16, %%"REG_a" \n\t"
2487
"cmp %3, %%"REG_a" \n\t"
2489
::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
2490
: "memory", "%"REG_a
2493
/* Columns beyond the last full group of 16 are interleaved in C. */
for (w= (width&(~15)); w < width; w++)
2495
dest[2*w+0] = src1[w];
2496
dest[2*w+1] = src2[w];
2499
/* Pure C variant: every column. */
for (w=0; w < width; w++)
2501
dest[2*w+0] = src1[w];
2502
dest[2*w+1] = src2[w];
2518
/*
 * Upscale two chroma planes by two in both directions through sample
 * duplication.
 *
 * src1, src2 - input planes with strides srcStride1, srcStride2
 * dst1, dst2 - output planes with strides dstStride1, dstStride2
 * width, height - halved into w and h before use
 *
 * For output line y the input line y>>1 is read, so every input line is used
 * twice; within a line every input byte is written to two adjacent output
 * bytes. The asm handles 32 input bytes (64 output bytes) per pass and the C
 * loop "for (;x<w;x++)" finishes the line.
 * NOTE(review): the loops over x and y and the asm openers are not visible in
 * this extract.
 */
static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
2519
uint8_t *dst1, uint8_t *dst2,
2520
long width, long height,
2521
long srcStride1, long srcStride2,
2522
long dstStride1, long dstStride2)
2525
w=width/2; h=height/2;
2530
::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
2533
/* First plane: source line y>>1 feeds destination line y. */
const uint8_t* s1=src1+srcStride1*(y>>1);
2534
uint8_t* d=dst1+dstStride1*y;
2540
PREFETCH" 32%1 \n\t"
2541
"movq %1, %%mm0 \n\t"
2542
"movq 8%1, %%mm2 \n\t"
2543
"movq 16%1, %%mm4 \n\t"
2544
"movq 24%1, %%mm6 \n\t"
2545
"movq %%mm0, %%mm1 \n\t"
2546
"movq %%mm2, %%mm3 \n\t"
2547
"movq %%mm4, %%mm5 \n\t"
2548
"movq %%mm6, %%mm7 \n\t"
2549
/* Unpacking a register with itself duplicates every byte. */
"punpcklbw %%mm0, %%mm0 \n\t"
2550
"punpckhbw %%mm1, %%mm1 \n\t"
2551
"punpcklbw %%mm2, %%mm2 \n\t"
2552
"punpckhbw %%mm3, %%mm3 \n\t"
2553
"punpcklbw %%mm4, %%mm4 \n\t"
2554
"punpckhbw %%mm5, %%mm5 \n\t"
2555
"punpcklbw %%mm6, %%mm6 \n\t"
2556
"punpckhbw %%mm7, %%mm7 \n\t"
2557
MOVNTQ" %%mm0, %0 \n\t"
2558
MOVNTQ" %%mm1, 8%0 \n\t"
2559
MOVNTQ" %%mm2, 16%0 \n\t"
2560
MOVNTQ" %%mm3, 24%0 \n\t"
2561
MOVNTQ" %%mm4, 32%0 \n\t"
2562
MOVNTQ" %%mm5, 40%0 \n\t"
2563
MOVNTQ" %%mm6, 48%0 \n\t"
2564
MOVNTQ" %%mm7, 56%0"
2570
/* Remaining columns: duplicate each sample in C. */
for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
2573
/* Second plane: identical processing. */
const uint8_t* s2=src2+srcStride2*(y>>1);
2574
uint8_t* d=dst2+dstStride2*y;
2580
PREFETCH" 32%1 \n\t"
2581
"movq %1, %%mm0 \n\t"
2582
"movq 8%1, %%mm2 \n\t"
2583
"movq 16%1, %%mm4 \n\t"
2584
"movq 24%1, %%mm6 \n\t"
2585
"movq %%mm0, %%mm1 \n\t"
2586
"movq %%mm2, %%mm3 \n\t"
2587
"movq %%mm4, %%mm5 \n\t"
2588
"movq %%mm6, %%mm7 \n\t"
2589
"punpcklbw %%mm0, %%mm0 \n\t"
2590
"punpckhbw %%mm1, %%mm1 \n\t"
2591
"punpcklbw %%mm2, %%mm2 \n\t"
2592
"punpckhbw %%mm3, %%mm3 \n\t"
2593
"punpcklbw %%mm4, %%mm4 \n\t"
2594
"punpckhbw %%mm5, %%mm5 \n\t"
2595
"punpcklbw %%mm6, %%mm6 \n\t"
2596
"punpckhbw %%mm7, %%mm7 \n\t"
2597
MOVNTQ" %%mm0, %0 \n\t"
2598
MOVNTQ" %%mm1, 8%0 \n\t"
2599
MOVNTQ" %%mm2, 16%0 \n\t"
2600
MOVNTQ" %%mm3, 24%0 \n\t"
2601
MOVNTQ" %%mm4, 32%0 \n\t"
2602
MOVNTQ" %%mm5, 40%0 \n\t"
2603
MOVNTQ" %%mm6, 48%0 \n\t"
2604
MOVNTQ" %%mm7, 56%0"
2610
for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
2621
/*
 * Convert planar YVU9 to packed YUY2.
 *
 * src1 is the luma plane, src2 and src3 are read as U and V (up, vp); the
 * destination pointer is called dst in the body (its parameter declaration is
 * not visible in this extract).
 *
 * Chroma is shared by four lines (chroma line y>>2 is used for luma line y)
 * and, horizontally, by four luma samples: for chroma index x the luma
 * samples start at x<<2 and the output bytes at 8*x. The asm handles 8 chroma
 * samples (32 luma samples, 64 output bytes) per pass.
 * NOTE(review): the loops over x and y, the asm opener and some of the C
 * stores are not visible in this extract.
 */
static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
2623
long width, long height,
2624
long srcStride1, long srcStride2,
2625
long srcStride3, long dstStride)
2628
w=width/2; h=height;
2630
const uint8_t* yp=src1+srcStride1*y;
2631
const uint8_t* up=src2+srcStride2*(y>>2);
2632
const uint8_t* vp=src3+srcStride3*(y>>2);
2633
uint8_t* d=dst+dstStride*y;
2639
/* %0 is the chroma index; luma is addressed with scale 4, output with scale 8. */
PREFETCH" 32(%1, %0) \n\t"
2640
PREFETCH" 32(%2, %0) \n\t"
2641
PREFETCH" 32(%3, %0) \n\t"
2642
"movq (%1, %0, 4), %%mm0 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2643
"movq (%2, %0), %%mm1 \n\t" /* U0U1U2U3U4U5U6U7 */
2644
"movq (%3, %0), %%mm2 \n\t" /* V0V1V2V3V4V5V6V7 */
2645
"movq %%mm0, %%mm3 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2646
"movq %%mm1, %%mm4 \n\t" /* U0U1U2U3U4U5U6U7 */
2647
"movq %%mm2, %%mm5 \n\t" /* V0V1V2V3V4V5V6V7 */
2648
"punpcklbw %%mm1, %%mm1 \n\t" /* U0U0 U1U1 U2U2 U3U3 */
2649
"punpcklbw %%mm2, %%mm2 \n\t" /* V0V0 V1V1 V2V2 V3V3 */
2650
"punpckhbw %%mm4, %%mm4 \n\t" /* U4U4 U5U5 U6U6 U7U7 */
2651
"punpckhbw %%mm5, %%mm5 \n\t" /* V4V4 V5V5 V6V6 V7V7 */
2653
"movq %%mm1, %%mm6 \n\t"
2654
"punpcklbw %%mm2, %%mm1 \n\t" /* U0V0 U0V0 U1V1 U1V1*/
2655
"punpcklbw %%mm1, %%mm0 \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
2656
"punpckhbw %%mm1, %%mm3 \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
2657
MOVNTQ" %%mm0, (%4, %0, 8) \n\t"
2658
MOVNTQ" %%mm3, 8(%4, %0, 8) \n\t"
2660
"punpckhbw %%mm2, %%mm6 \n\t" /* U2V2 U2V2 U3V3 U3V3*/
2661
"movq 8(%1, %0, 4), %%mm0 \n\t"
2662
"movq %%mm0, %%mm3 \n\t"
2663
"punpcklbw %%mm6, %%mm0 \n\t" /* Y U2 Y V2 Y U2 Y V2*/
2664
"punpckhbw %%mm6, %%mm3 \n\t" /* Y U3 Y V3 Y U3 Y V3*/
2665
MOVNTQ" %%mm0, 16(%4, %0, 8) \n\t"
2666
MOVNTQ" %%mm3, 24(%4, %0, 8) \n\t"
2668
"movq %%mm4, %%mm6 \n\t"
2669
"movq 16(%1, %0, 4), %%mm0 \n\t"
2670
"movq %%mm0, %%mm3 \n\t"
2671
"punpcklbw %%mm5, %%mm4 \n\t"
2672
"punpcklbw %%mm4, %%mm0 \n\t" /* Y U4 Y V4 Y U4 Y V4*/
2673
"punpckhbw %%mm4, %%mm3 \n\t" /* Y U5 Y V5 Y U5 Y V5*/
2674
MOVNTQ" %%mm0, 32(%4, %0, 8) \n\t"
2675
MOVNTQ" %%mm3, 40(%4, %0, 8) \n\t"
2677
"punpckhbw %%mm5, %%mm6 \n\t"
2678
"movq 24(%1, %0, 4), %%mm0 \n\t"
2679
"movq %%mm0, %%mm3 \n\t"
2680
"punpcklbw %%mm6, %%mm0 \n\t" /* Y U6 Y V6 Y U6 Y V6*/
2681
"punpckhbw %%mm6, %%mm3 \n\t" /* Y U7 Y V7 Y U7 Y V7*/
2682
MOVNTQ" %%mm0, 48(%4, %0, 8) \n\t"
2683
MOVNTQ" %%mm3, 56(%4, %0, 8) \n\t"
2686
: "r"(yp), "r" (up), "r"(vp), "r"(d)
2692
/* C remainder: x2 is the index of the first of the four luma samples for chroma index x. */
const long x2 = x<<2;
2695
d[8*x+2] = yp[x2+1];
2697
d[8*x+4] = yp[x2+2];
2699
d[8*x+6] = yp[x2+3];
2712
/*
 * Assign the conversion routines built from this template (under the current
 * RENAME() expansion) to the correspondingly named identifiers without the
 * RENAME() wrapper, which are declared outside this chunk.
 * uyvytoyv12 and yvu9toyv12 are left unassigned here: their assignments are
 * commented out.
 */
static inline void RENAME(rgb2rgb_init)(void){
2713
rgb15to16 = RENAME(rgb15to16);
2714
rgb15to24 = RENAME(rgb15to24);
2715
rgb15to32 = RENAME(rgb15to32);
2716
rgb16to24 = RENAME(rgb16to24);
2717
rgb16to32 = RENAME(rgb16to32);
2718
rgb16to15 = RENAME(rgb16to15);
2719
rgb24to16 = RENAME(rgb24to16);
2720
rgb24to15 = RENAME(rgb24to15);
2721
rgb24to32 = RENAME(rgb24to32);
2722
rgb32to16 = RENAME(rgb32to16);
2723
rgb32to15 = RENAME(rgb32to15);
2724
rgb32to24 = RENAME(rgb32to24);
2725
rgb24tobgr15 = RENAME(rgb24tobgr15);
2726
rgb24tobgr16 = RENAME(rgb24tobgr16);
2727
rgb24tobgr24 = RENAME(rgb24tobgr24);
2728
rgb32tobgr32 = RENAME(rgb32tobgr32);
2729
rgb32tobgr16 = RENAME(rgb32tobgr16);
2730
rgb32tobgr15 = RENAME(rgb32tobgr15);
2731
yv12toyuy2 = RENAME(yv12toyuy2);
2732
yv12touyvy = RENAME(yv12touyvy);
2733
yuv422ptoyuy2 = RENAME(yuv422ptoyuy2);
2734
yuy2toyv12 = RENAME(yuy2toyv12);
2735
// uyvytoyv12 = RENAME(uyvytoyv12);
2736
// yvu9toyv12 = RENAME(yvu9toyv12);
2737
planar2x = RENAME(planar2x);
2738
rgb24toyv12 = RENAME(rgb24toyv12);
2739
interleaveBytes = RENAME(interleaveBytes);
2740
vu9_to_vu12 = RENAME(vu9_to_vu12);
2741
yvu9_to_yuy2 = RENAME(yvu9_to_yuy2);