/*
 * software RGB to RGB converter
 * pluralize by software PAL8 to RGB converter
 *              software YUV to YUV converter
 *              software YUV to RGB converter
 * Written by Nick Kurshev.
 * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
 * lot of big-endian byte order fixes by Alex Beregszaszi
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
43
#define PREFETCH "prefetch"
44
#define PAVGB "pavgusb"
46
#define PREFETCH "prefetchnta"
49
#define PREFETCH " # nop"
53
/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
60
#define MOVNTQ "movntq"
61
#define SFENCE "sfence"
64
#define SFENCE " # nop"
67
static inline void RENAME(rgb24tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
70
const uint8_t *s = src;
73
const uint8_t *mm_end;
77
__asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
79
__asm__ volatile("movq %0, %%mm7"::"m"(mask32a):"memory");
84
"punpckldq 3%1, %%mm0 \n\t"
85
"movd 6%1, %%mm1 \n\t"
86
"punpckldq 9%1, %%mm1 \n\t"
87
"movd 12%1, %%mm2 \n\t"
88
"punpckldq 15%1, %%mm2 \n\t"
89
"movd 18%1, %%mm3 \n\t"
90
"punpckldq 21%1, %%mm3 \n\t"
91
"por %%mm7, %%mm0 \n\t"
92
"por %%mm7, %%mm1 \n\t"
93
"por %%mm7, %%mm2 \n\t"
94
"por %%mm7, %%mm3 \n\t"
95
MOVNTQ" %%mm0, %0 \n\t"
96
MOVNTQ" %%mm1, 8%0 \n\t"
97
MOVNTQ" %%mm2, 16%0 \n\t"
105
__asm__ volatile(SFENCE:::"memory");
106
__asm__ volatile(EMMS:::"memory");
110
/* RGB24 (= R,G,B) -> RGB32 (= A,B,G,R) */
125
#define STORE_BGR24_MMX \
126
"psrlq $8, %%mm2 \n\t" \
127
"psrlq $8, %%mm3 \n\t" \
128
"psrlq $8, %%mm6 \n\t" \
129
"psrlq $8, %%mm7 \n\t" \
130
"pand "MANGLE(mask24l)", %%mm0\n\t" \
131
"pand "MANGLE(mask24l)", %%mm1\n\t" \
132
"pand "MANGLE(mask24l)", %%mm4\n\t" \
133
"pand "MANGLE(mask24l)", %%mm5\n\t" \
134
"pand "MANGLE(mask24h)", %%mm2\n\t" \
135
"pand "MANGLE(mask24h)", %%mm3\n\t" \
136
"pand "MANGLE(mask24h)", %%mm6\n\t" \
137
"pand "MANGLE(mask24h)", %%mm7\n\t" \
138
"por %%mm2, %%mm0 \n\t" \
139
"por %%mm3, %%mm1 \n\t" \
140
"por %%mm6, %%mm4 \n\t" \
141
"por %%mm7, %%mm5 \n\t" \
143
"movq %%mm1, %%mm2 \n\t" \
144
"movq %%mm4, %%mm3 \n\t" \
145
"psllq $48, %%mm2 \n\t" \
146
"psllq $32, %%mm3 \n\t" \
147
"pand "MANGLE(mask24hh)", %%mm2\n\t" \
148
"pand "MANGLE(mask24hhh)", %%mm3\n\t" \
149
"por %%mm2, %%mm0 \n\t" \
150
"psrlq $16, %%mm1 \n\t" \
151
"psrlq $32, %%mm4 \n\t" \
152
"psllq $16, %%mm5 \n\t" \
153
"por %%mm3, %%mm1 \n\t" \
154
"pand "MANGLE(mask24hhhh)", %%mm5\n\t" \
155
"por %%mm5, %%mm4 \n\t" \
157
MOVNTQ" %%mm0, %0 \n\t" \
158
MOVNTQ" %%mm1, 8%0 \n\t" \
162
static inline void RENAME(rgb32tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
165
const uint8_t *s = src;
168
const uint8_t *mm_end;
172
__asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
177
"movq %1, %%mm0 \n\t"
178
"movq 8%1, %%mm1 \n\t"
179
"movq 16%1, %%mm4 \n\t"
180
"movq 24%1, %%mm5 \n\t"
181
"movq %%mm0, %%mm2 \n\t"
182
"movq %%mm1, %%mm3 \n\t"
183
"movq %%mm4, %%mm6 \n\t"
184
"movq %%mm5, %%mm7 \n\t"
192
__asm__ volatile(SFENCE:::"memory");
193
__asm__ volatile(EMMS:::"memory");
197
/* RGB32 (= A,B,G,R) -> RGB24 (= R,G,B) */
213
original by Strepto/Astral
214
ported to gcc & bugfixed: A'rpi
215
MMX2, 3DNOW optimization by Nick Kurshev
216
32-bit C version, and and&add trick by Michael Niedermayer
218
static inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, long src_size)
220
register const uint8_t* s=src;
221
register uint8_t* d=dst;
222
register const uint8_t *end;
223
const uint8_t *mm_end;
226
__asm__ volatile(PREFETCH" %0"::"m"(*s));
227
__asm__ volatile("movq %0, %%mm4"::"m"(mask15s));
232
"movq %1, %%mm0 \n\t"
233
"movq 8%1, %%mm2 \n\t"
234
"movq %%mm0, %%mm1 \n\t"
235
"movq %%mm2, %%mm3 \n\t"
236
"pand %%mm4, %%mm0 \n\t"
237
"pand %%mm4, %%mm2 \n\t"
238
"paddw %%mm1, %%mm0 \n\t"
239
"paddw %%mm3, %%mm2 \n\t"
240
MOVNTQ" %%mm0, %0 \n\t"
248
__asm__ volatile(SFENCE:::"memory");
249
__asm__ volatile(EMMS:::"memory");
253
register unsigned x= *((const uint32_t *)s);
254
*((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
259
register unsigned short x= *((const uint16_t *)s);
260
*((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
264
static inline void RENAME(rgb16to15)(const uint8_t *src, uint8_t *dst, long src_size)
266
register const uint8_t* s=src;
267
register uint8_t* d=dst;
268
register const uint8_t *end;
269
const uint8_t *mm_end;
272
__asm__ volatile(PREFETCH" %0"::"m"(*s));
273
__asm__ volatile("movq %0, %%mm7"::"m"(mask15rg));
274
__asm__ volatile("movq %0, %%mm6"::"m"(mask15b));
279
"movq %1, %%mm0 \n\t"
280
"movq 8%1, %%mm2 \n\t"
281
"movq %%mm0, %%mm1 \n\t"
282
"movq %%mm2, %%mm3 \n\t"
283
"psrlq $1, %%mm0 \n\t"
284
"psrlq $1, %%mm2 \n\t"
285
"pand %%mm7, %%mm0 \n\t"
286
"pand %%mm7, %%mm2 \n\t"
287
"pand %%mm6, %%mm1 \n\t"
288
"pand %%mm6, %%mm3 \n\t"
289
"por %%mm1, %%mm0 \n\t"
290
"por %%mm3, %%mm2 \n\t"
291
MOVNTQ" %%mm0, %0 \n\t"
299
__asm__ volatile(SFENCE:::"memory");
300
__asm__ volatile(EMMS:::"memory");
304
register uint32_t x= *((const uint32_t*)s);
305
*((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
310
register uint16_t x= *((const uint16_t*)s);
311
*((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
315
static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, long src_size)
317
const uint8_t *s = src;
320
const uint8_t *mm_end;
322
uint16_t *d = (uint16_t *)dst;
326
#if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
328
"movq %3, %%mm5 \n\t"
329
"movq %4, %%mm6 \n\t"
330
"movq %5, %%mm7 \n\t"
334
PREFETCH" 32(%1) \n\t"
335
"movd (%1), %%mm0 \n\t"
336
"movd 4(%1), %%mm3 \n\t"
337
"punpckldq 8(%1), %%mm0 \n\t"
338
"punpckldq 12(%1), %%mm3 \n\t"
339
"movq %%mm0, %%mm1 \n\t"
340
"movq %%mm3, %%mm4 \n\t"
341
"pand %%mm6, %%mm0 \n\t"
342
"pand %%mm6, %%mm3 \n\t"
343
"pmaddwd %%mm7, %%mm0 \n\t"
344
"pmaddwd %%mm7, %%mm3 \n\t"
345
"pand %%mm5, %%mm1 \n\t"
346
"pand %%mm5, %%mm4 \n\t"
347
"por %%mm1, %%mm0 \n\t"
348
"por %%mm4, %%mm3 \n\t"
349
"psrld $5, %%mm0 \n\t"
350
"pslld $11, %%mm3 \n\t"
351
"por %%mm3, %%mm0 \n\t"
352
MOVNTQ" %%mm0, (%0) \n\t"
359
: "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
362
__asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
364
"movq %0, %%mm7 \n\t"
365
"movq %1, %%mm6 \n\t"
366
::"m"(red_16mask),"m"(green_16mask));
370
"movd %1, %%mm0 \n\t"
371
"movd 4%1, %%mm3 \n\t"
372
"punpckldq 8%1, %%mm0 \n\t"
373
"punpckldq 12%1, %%mm3 \n\t"
374
"movq %%mm0, %%mm1 \n\t"
375
"movq %%mm0, %%mm2 \n\t"
376
"movq %%mm3, %%mm4 \n\t"
377
"movq %%mm3, %%mm5 \n\t"
378
"psrlq $3, %%mm0 \n\t"
379
"psrlq $3, %%mm3 \n\t"
380
"pand %2, %%mm0 \n\t"
381
"pand %2, %%mm3 \n\t"
382
"psrlq $5, %%mm1 \n\t"
383
"psrlq $5, %%mm4 \n\t"
384
"pand %%mm6, %%mm1 \n\t"
385
"pand %%mm6, %%mm4 \n\t"
386
"psrlq $8, %%mm2 \n\t"
387
"psrlq $8, %%mm5 \n\t"
388
"pand %%mm7, %%mm2 \n\t"
389
"pand %%mm7, %%mm5 \n\t"
390
"por %%mm1, %%mm0 \n\t"
391
"por %%mm4, %%mm3 \n\t"
392
"por %%mm2, %%mm0 \n\t"
393
"por %%mm5, %%mm3 \n\t"
394
"psllq $16, %%mm3 \n\t"
395
"por %%mm3, %%mm0 \n\t"
396
MOVNTQ" %%mm0, %0 \n\t"
397
:"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
402
__asm__ volatile(SFENCE:::"memory");
403
__asm__ volatile(EMMS:::"memory");
406
register int rgb = *(const uint32_t*)s; s += 4;
407
*d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
411
static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
413
const uint8_t *s = src;
416
const uint8_t *mm_end;
418
uint16_t *d = (uint16_t *)dst;
421
__asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
423
"movq %0, %%mm7 \n\t"
424
"movq %1, %%mm6 \n\t"
425
::"m"(red_16mask),"m"(green_16mask));
430
"movd %1, %%mm0 \n\t"
431
"movd 4%1, %%mm3 \n\t"
432
"punpckldq 8%1, %%mm0 \n\t"
433
"punpckldq 12%1, %%mm3 \n\t"
434
"movq %%mm0, %%mm1 \n\t"
435
"movq %%mm0, %%mm2 \n\t"
436
"movq %%mm3, %%mm4 \n\t"
437
"movq %%mm3, %%mm5 \n\t"
438
"psllq $8, %%mm0 \n\t"
439
"psllq $8, %%mm3 \n\t"
440
"pand %%mm7, %%mm0 \n\t"
441
"pand %%mm7, %%mm3 \n\t"
442
"psrlq $5, %%mm1 \n\t"
443
"psrlq $5, %%mm4 \n\t"
444
"pand %%mm6, %%mm1 \n\t"
445
"pand %%mm6, %%mm4 \n\t"
446
"psrlq $19, %%mm2 \n\t"
447
"psrlq $19, %%mm5 \n\t"
448
"pand %2, %%mm2 \n\t"
449
"pand %2, %%mm5 \n\t"
450
"por %%mm1, %%mm0 \n\t"
451
"por %%mm4, %%mm3 \n\t"
452
"por %%mm2, %%mm0 \n\t"
453
"por %%mm5, %%mm3 \n\t"
454
"psllq $16, %%mm3 \n\t"
455
"por %%mm3, %%mm0 \n\t"
456
MOVNTQ" %%mm0, %0 \n\t"
457
:"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
461
__asm__ volatile(SFENCE:::"memory");
462
__asm__ volatile(EMMS:::"memory");
465
register int rgb = *(const uint32_t*)s; s += 4;
466
*d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
470
static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, long src_size)
472
const uint8_t *s = src;
475
const uint8_t *mm_end;
477
uint16_t *d = (uint16_t *)dst;
481
#if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
483
"movq %3, %%mm5 \n\t"
484
"movq %4, %%mm6 \n\t"
485
"movq %5, %%mm7 \n\t"
489
PREFETCH" 32(%1) \n\t"
490
"movd (%1), %%mm0 \n\t"
491
"movd 4(%1), %%mm3 \n\t"
492
"punpckldq 8(%1), %%mm0 \n\t"
493
"punpckldq 12(%1), %%mm3 \n\t"
494
"movq %%mm0, %%mm1 \n\t"
495
"movq %%mm3, %%mm4 \n\t"
496
"pand %%mm6, %%mm0 \n\t"
497
"pand %%mm6, %%mm3 \n\t"
498
"pmaddwd %%mm7, %%mm0 \n\t"
499
"pmaddwd %%mm7, %%mm3 \n\t"
500
"pand %%mm5, %%mm1 \n\t"
501
"pand %%mm5, %%mm4 \n\t"
502
"por %%mm1, %%mm0 \n\t"
503
"por %%mm4, %%mm3 \n\t"
504
"psrld $6, %%mm0 \n\t"
505
"pslld $10, %%mm3 \n\t"
506
"por %%mm3, %%mm0 \n\t"
507
MOVNTQ" %%mm0, (%0) \n\t"
514
: "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
517
__asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
519
"movq %0, %%mm7 \n\t"
520
"movq %1, %%mm6 \n\t"
521
::"m"(red_15mask),"m"(green_15mask));
525
"movd %1, %%mm0 \n\t"
526
"movd 4%1, %%mm3 \n\t"
527
"punpckldq 8%1, %%mm0 \n\t"
528
"punpckldq 12%1, %%mm3 \n\t"
529
"movq %%mm0, %%mm1 \n\t"
530
"movq %%mm0, %%mm2 \n\t"
531
"movq %%mm3, %%mm4 \n\t"
532
"movq %%mm3, %%mm5 \n\t"
533
"psrlq $3, %%mm0 \n\t"
534
"psrlq $3, %%mm3 \n\t"
535
"pand %2, %%mm0 \n\t"
536
"pand %2, %%mm3 \n\t"
537
"psrlq $6, %%mm1 \n\t"
538
"psrlq $6, %%mm4 \n\t"
539
"pand %%mm6, %%mm1 \n\t"
540
"pand %%mm6, %%mm4 \n\t"
541
"psrlq $9, %%mm2 \n\t"
542
"psrlq $9, %%mm5 \n\t"
543
"pand %%mm7, %%mm2 \n\t"
544
"pand %%mm7, %%mm5 \n\t"
545
"por %%mm1, %%mm0 \n\t"
546
"por %%mm4, %%mm3 \n\t"
547
"por %%mm2, %%mm0 \n\t"
548
"por %%mm5, %%mm3 \n\t"
549
"psllq $16, %%mm3 \n\t"
550
"por %%mm3, %%mm0 \n\t"
551
MOVNTQ" %%mm0, %0 \n\t"
552
:"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
557
__asm__ volatile(SFENCE:::"memory");
558
__asm__ volatile(EMMS:::"memory");
561
register int rgb = *(const uint32_t*)s; s += 4;
562
*d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
566
static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
568
const uint8_t *s = src;
571
const uint8_t *mm_end;
573
uint16_t *d = (uint16_t *)dst;
576
__asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
578
"movq %0, %%mm7 \n\t"
579
"movq %1, %%mm6 \n\t"
580
::"m"(red_15mask),"m"(green_15mask));
585
"movd %1, %%mm0 \n\t"
586
"movd 4%1, %%mm3 \n\t"
587
"punpckldq 8%1, %%mm0 \n\t"
588
"punpckldq 12%1, %%mm3 \n\t"
589
"movq %%mm0, %%mm1 \n\t"
590
"movq %%mm0, %%mm2 \n\t"
591
"movq %%mm3, %%mm4 \n\t"
592
"movq %%mm3, %%mm5 \n\t"
593
"psllq $7, %%mm0 \n\t"
594
"psllq $7, %%mm3 \n\t"
595
"pand %%mm7, %%mm0 \n\t"
596
"pand %%mm7, %%mm3 \n\t"
597
"psrlq $6, %%mm1 \n\t"
598
"psrlq $6, %%mm4 \n\t"
599
"pand %%mm6, %%mm1 \n\t"
600
"pand %%mm6, %%mm4 \n\t"
601
"psrlq $19, %%mm2 \n\t"
602
"psrlq $19, %%mm5 \n\t"
603
"pand %2, %%mm2 \n\t"
604
"pand %2, %%mm5 \n\t"
605
"por %%mm1, %%mm0 \n\t"
606
"por %%mm4, %%mm3 \n\t"
607
"por %%mm2, %%mm0 \n\t"
608
"por %%mm5, %%mm3 \n\t"
609
"psllq $16, %%mm3 \n\t"
610
"por %%mm3, %%mm0 \n\t"
611
MOVNTQ" %%mm0, %0 \n\t"
612
:"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
616
__asm__ volatile(SFENCE:::"memory");
617
__asm__ volatile(EMMS:::"memory");
620
register int rgb = *(const uint32_t*)s; s += 4;
621
*d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
625
static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
627
const uint8_t *s = src;
630
const uint8_t *mm_end;
632
uint16_t *d = (uint16_t *)dst;
635
__asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
637
"movq %0, %%mm7 \n\t"
638
"movq %1, %%mm6 \n\t"
639
::"m"(red_16mask),"m"(green_16mask));
644
"movd %1, %%mm0 \n\t"
645
"movd 3%1, %%mm3 \n\t"
646
"punpckldq 6%1, %%mm0 \n\t"
647
"punpckldq 9%1, %%mm3 \n\t"
648
"movq %%mm0, %%mm1 \n\t"
649
"movq %%mm0, %%mm2 \n\t"
650
"movq %%mm3, %%mm4 \n\t"
651
"movq %%mm3, %%mm5 \n\t"
652
"psrlq $3, %%mm0 \n\t"
653
"psrlq $3, %%mm3 \n\t"
654
"pand %2, %%mm0 \n\t"
655
"pand %2, %%mm3 \n\t"
656
"psrlq $5, %%mm1 \n\t"
657
"psrlq $5, %%mm4 \n\t"
658
"pand %%mm6, %%mm1 \n\t"
659
"pand %%mm6, %%mm4 \n\t"
660
"psrlq $8, %%mm2 \n\t"
661
"psrlq $8, %%mm5 \n\t"
662
"pand %%mm7, %%mm2 \n\t"
663
"pand %%mm7, %%mm5 \n\t"
664
"por %%mm1, %%mm0 \n\t"
665
"por %%mm4, %%mm3 \n\t"
666
"por %%mm2, %%mm0 \n\t"
667
"por %%mm5, %%mm3 \n\t"
668
"psllq $16, %%mm3 \n\t"
669
"por %%mm3, %%mm0 \n\t"
670
MOVNTQ" %%mm0, %0 \n\t"
671
:"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
675
__asm__ volatile(SFENCE:::"memory");
676
__asm__ volatile(EMMS:::"memory");
682
*d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
686
static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size)
688
const uint8_t *s = src;
691
const uint8_t *mm_end;
693
uint16_t *d = (uint16_t *)dst;
696
__asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
698
"movq %0, %%mm7 \n\t"
699
"movq %1, %%mm6 \n\t"
700
::"m"(red_16mask),"m"(green_16mask));
705
"movd %1, %%mm0 \n\t"
706
"movd 3%1, %%mm3 \n\t"
707
"punpckldq 6%1, %%mm0 \n\t"
708
"punpckldq 9%1, %%mm3 \n\t"
709
"movq %%mm0, %%mm1 \n\t"
710
"movq %%mm0, %%mm2 \n\t"
711
"movq %%mm3, %%mm4 \n\t"
712
"movq %%mm3, %%mm5 \n\t"
713
"psllq $8, %%mm0 \n\t"
714
"psllq $8, %%mm3 \n\t"
715
"pand %%mm7, %%mm0 \n\t"
716
"pand %%mm7, %%mm3 \n\t"
717
"psrlq $5, %%mm1 \n\t"
718
"psrlq $5, %%mm4 \n\t"
719
"pand %%mm6, %%mm1 \n\t"
720
"pand %%mm6, %%mm4 \n\t"
721
"psrlq $19, %%mm2 \n\t"
722
"psrlq $19, %%mm5 \n\t"
723
"pand %2, %%mm2 \n\t"
724
"pand %2, %%mm5 \n\t"
725
"por %%mm1, %%mm0 \n\t"
726
"por %%mm4, %%mm3 \n\t"
727
"por %%mm2, %%mm0 \n\t"
728
"por %%mm5, %%mm3 \n\t"
729
"psllq $16, %%mm3 \n\t"
730
"por %%mm3, %%mm0 \n\t"
731
MOVNTQ" %%mm0, %0 \n\t"
732
:"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
736
__asm__ volatile(SFENCE:::"memory");
737
__asm__ volatile(EMMS:::"memory");
743
*d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
747
static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
749
const uint8_t *s = src;
752
const uint8_t *mm_end;
754
uint16_t *d = (uint16_t *)dst;
757
__asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
759
"movq %0, %%mm7 \n\t"
760
"movq %1, %%mm6 \n\t"
761
::"m"(red_15mask),"m"(green_15mask));
766
"movd %1, %%mm0 \n\t"
767
"movd 3%1, %%mm3 \n\t"
768
"punpckldq 6%1, %%mm0 \n\t"
769
"punpckldq 9%1, %%mm3 \n\t"
770
"movq %%mm0, %%mm1 \n\t"
771
"movq %%mm0, %%mm2 \n\t"
772
"movq %%mm3, %%mm4 \n\t"
773
"movq %%mm3, %%mm5 \n\t"
774
"psrlq $3, %%mm0 \n\t"
775
"psrlq $3, %%mm3 \n\t"
776
"pand %2, %%mm0 \n\t"
777
"pand %2, %%mm3 \n\t"
778
"psrlq $6, %%mm1 \n\t"
779
"psrlq $6, %%mm4 \n\t"
780
"pand %%mm6, %%mm1 \n\t"
781
"pand %%mm6, %%mm4 \n\t"
782
"psrlq $9, %%mm2 \n\t"
783
"psrlq $9, %%mm5 \n\t"
784
"pand %%mm7, %%mm2 \n\t"
785
"pand %%mm7, %%mm5 \n\t"
786
"por %%mm1, %%mm0 \n\t"
787
"por %%mm4, %%mm3 \n\t"
788
"por %%mm2, %%mm0 \n\t"
789
"por %%mm5, %%mm3 \n\t"
790
"psllq $16, %%mm3 \n\t"
791
"por %%mm3, %%mm0 \n\t"
792
MOVNTQ" %%mm0, %0 \n\t"
793
:"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
797
__asm__ volatile(SFENCE:::"memory");
798
__asm__ volatile(EMMS:::"memory");
804
*d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
808
static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, long src_size)
810
const uint8_t *s = src;
813
const uint8_t *mm_end;
815
uint16_t *d = (uint16_t *)dst;
818
__asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
820
"movq %0, %%mm7 \n\t"
821
"movq %1, %%mm6 \n\t"
822
::"m"(red_15mask),"m"(green_15mask));
827
"movd %1, %%mm0 \n\t"
828
"movd 3%1, %%mm3 \n\t"
829
"punpckldq 6%1, %%mm0 \n\t"
830
"punpckldq 9%1, %%mm3 \n\t"
831
"movq %%mm0, %%mm1 \n\t"
832
"movq %%mm0, %%mm2 \n\t"
833
"movq %%mm3, %%mm4 \n\t"
834
"movq %%mm3, %%mm5 \n\t"
835
"psllq $7, %%mm0 \n\t"
836
"psllq $7, %%mm3 \n\t"
837
"pand %%mm7, %%mm0 \n\t"
838
"pand %%mm7, %%mm3 \n\t"
839
"psrlq $6, %%mm1 \n\t"
840
"psrlq $6, %%mm4 \n\t"
841
"pand %%mm6, %%mm1 \n\t"
842
"pand %%mm6, %%mm4 \n\t"
843
"psrlq $19, %%mm2 \n\t"
844
"psrlq $19, %%mm5 \n\t"
845
"pand %2, %%mm2 \n\t"
846
"pand %2, %%mm5 \n\t"
847
"por %%mm1, %%mm0 \n\t"
848
"por %%mm4, %%mm3 \n\t"
849
"por %%mm2, %%mm0 \n\t"
850
"por %%mm5, %%mm3 \n\t"
851
"psllq $16, %%mm3 \n\t"
852
"por %%mm3, %%mm0 \n\t"
853
MOVNTQ" %%mm0, %0 \n\t"
854
:"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
858
__asm__ volatile(SFENCE:::"memory");
859
__asm__ volatile(EMMS:::"memory");
865
*d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
870
I use less accurate approximation here by simply left-shifting the input
871
value and filling the low order bits with zeroes. This method improves PNG
872
compression but this scheme cannot reproduce white exactly, since it does
873
not generate an all-ones maximum value; the net effect is to darken the
876
The better method should be "left bit replication":
886
| leftmost bits repeated to fill open bits
890
static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
894
const uint16_t *mm_end;
897
const uint16_t *s = (const uint16_t*)src;
898
end = s + src_size/2;
900
__asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
905
"movq %1, %%mm0 \n\t"
906
"movq %1, %%mm1 \n\t"
907
"movq %1, %%mm2 \n\t"
908
"pand %2, %%mm0 \n\t"
909
"pand %3, %%mm1 \n\t"
910
"pand %4, %%mm2 \n\t"
911
"psllq $3, %%mm0 \n\t"
912
"psrlq $2, %%mm1 \n\t"
913
"psrlq $7, %%mm2 \n\t"
914
"movq %%mm0, %%mm3 \n\t"
915
"movq %%mm1, %%mm4 \n\t"
916
"movq %%mm2, %%mm5 \n\t"
917
"punpcklwd %5, %%mm0 \n\t"
918
"punpcklwd %5, %%mm1 \n\t"
919
"punpcklwd %5, %%mm2 \n\t"
920
"punpckhwd %5, %%mm3 \n\t"
921
"punpckhwd %5, %%mm4 \n\t"
922
"punpckhwd %5, %%mm5 \n\t"
923
"psllq $8, %%mm1 \n\t"
924
"psllq $16, %%mm2 \n\t"
925
"por %%mm1, %%mm0 \n\t"
926
"por %%mm2, %%mm0 \n\t"
927
"psllq $8, %%mm4 \n\t"
928
"psllq $16, %%mm5 \n\t"
929
"por %%mm4, %%mm3 \n\t"
930
"por %%mm5, %%mm3 \n\t"
932
"movq %%mm0, %%mm6 \n\t"
933
"movq %%mm3, %%mm7 \n\t"
935
"movq 8%1, %%mm0 \n\t"
936
"movq 8%1, %%mm1 \n\t"
937
"movq 8%1, %%mm2 \n\t"
938
"pand %2, %%mm0 \n\t"
939
"pand %3, %%mm1 \n\t"
940
"pand %4, %%mm2 \n\t"
941
"psllq $3, %%mm0 \n\t"
942
"psrlq $2, %%mm1 \n\t"
943
"psrlq $7, %%mm2 \n\t"
944
"movq %%mm0, %%mm3 \n\t"
945
"movq %%mm1, %%mm4 \n\t"
946
"movq %%mm2, %%mm5 \n\t"
947
"punpcklwd %5, %%mm0 \n\t"
948
"punpcklwd %5, %%mm1 \n\t"
949
"punpcklwd %5, %%mm2 \n\t"
950
"punpckhwd %5, %%mm3 \n\t"
951
"punpckhwd %5, %%mm4 \n\t"
952
"punpckhwd %5, %%mm5 \n\t"
953
"psllq $8, %%mm1 \n\t"
954
"psllq $16, %%mm2 \n\t"
955
"por %%mm1, %%mm0 \n\t"
956
"por %%mm2, %%mm0 \n\t"
957
"psllq $8, %%mm4 \n\t"
958
"psllq $16, %%mm5 \n\t"
959
"por %%mm4, %%mm3 \n\t"
960
"por %%mm5, %%mm3 \n\t"
963
:"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
965
/* borrowed 32 to 24 */
967
"movq %%mm0, %%mm4 \n\t"
968
"movq %%mm3, %%mm5 \n\t"
969
"movq %%mm6, %%mm0 \n\t"
970
"movq %%mm7, %%mm1 \n\t"
972
"movq %%mm4, %%mm6 \n\t"
973
"movq %%mm5, %%mm7 \n\t"
974
"movq %%mm0, %%mm2 \n\t"
975
"movq %%mm1, %%mm3 \n\t"
985
__asm__ volatile(SFENCE:::"memory");
986
__asm__ volatile(EMMS:::"memory");
989
register uint16_t bgr;
991
*d++ = (bgr&0x1F)<<3;
992
*d++ = (bgr&0x3E0)>>2;
993
*d++ = (bgr&0x7C00)>>7;
997
static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
1001
const uint16_t *mm_end;
1003
uint8_t *d = (uint8_t *)dst;
1004
const uint16_t *s = (const uint16_t *)src;
1005
end = s + src_size/2;
1007
__asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
1009
while (s < mm_end) {
1011
PREFETCH" 32%1 \n\t"
1012
"movq %1, %%mm0 \n\t"
1013
"movq %1, %%mm1 \n\t"
1014
"movq %1, %%mm2 \n\t"
1015
"pand %2, %%mm0 \n\t"
1016
"pand %3, %%mm1 \n\t"
1017
"pand %4, %%mm2 \n\t"
1018
"psllq $3, %%mm0 \n\t"
1019
"psrlq $3, %%mm1 \n\t"
1020
"psrlq $8, %%mm2 \n\t"
1021
"movq %%mm0, %%mm3 \n\t"
1022
"movq %%mm1, %%mm4 \n\t"
1023
"movq %%mm2, %%mm5 \n\t"
1024
"punpcklwd %5, %%mm0 \n\t"
1025
"punpcklwd %5, %%mm1 \n\t"
1026
"punpcklwd %5, %%mm2 \n\t"
1027
"punpckhwd %5, %%mm3 \n\t"
1028
"punpckhwd %5, %%mm4 \n\t"
1029
"punpckhwd %5, %%mm5 \n\t"
1030
"psllq $8, %%mm1 \n\t"
1031
"psllq $16, %%mm2 \n\t"
1032
"por %%mm1, %%mm0 \n\t"
1033
"por %%mm2, %%mm0 \n\t"
1034
"psllq $8, %%mm4 \n\t"
1035
"psllq $16, %%mm5 \n\t"
1036
"por %%mm4, %%mm3 \n\t"
1037
"por %%mm5, %%mm3 \n\t"
1039
"movq %%mm0, %%mm6 \n\t"
1040
"movq %%mm3, %%mm7 \n\t"
1042
"movq 8%1, %%mm0 \n\t"
1043
"movq 8%1, %%mm1 \n\t"
1044
"movq 8%1, %%mm2 \n\t"
1045
"pand %2, %%mm0 \n\t"
1046
"pand %3, %%mm1 \n\t"
1047
"pand %4, %%mm2 \n\t"
1048
"psllq $3, %%mm0 \n\t"
1049
"psrlq $3, %%mm1 \n\t"
1050
"psrlq $8, %%mm2 \n\t"
1051
"movq %%mm0, %%mm3 \n\t"
1052
"movq %%mm1, %%mm4 \n\t"
1053
"movq %%mm2, %%mm5 \n\t"
1054
"punpcklwd %5, %%mm0 \n\t"
1055
"punpcklwd %5, %%mm1 \n\t"
1056
"punpcklwd %5, %%mm2 \n\t"
1057
"punpckhwd %5, %%mm3 \n\t"
1058
"punpckhwd %5, %%mm4 \n\t"
1059
"punpckhwd %5, %%mm5 \n\t"
1060
"psllq $8, %%mm1 \n\t"
1061
"psllq $16, %%mm2 \n\t"
1062
"por %%mm1, %%mm0 \n\t"
1063
"por %%mm2, %%mm0 \n\t"
1064
"psllq $8, %%mm4 \n\t"
1065
"psllq $16, %%mm5 \n\t"
1066
"por %%mm4, %%mm3 \n\t"
1067
"por %%mm5, %%mm3 \n\t"
1069
:"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
1071
/* borrowed 32 to 24 */
1073
"movq %%mm0, %%mm4 \n\t"
1074
"movq %%mm3, %%mm5 \n\t"
1075
"movq %%mm6, %%mm0 \n\t"
1076
"movq %%mm7, %%mm1 \n\t"
1078
"movq %%mm4, %%mm6 \n\t"
1079
"movq %%mm5, %%mm7 \n\t"
1080
"movq %%mm0, %%mm2 \n\t"
1081
"movq %%mm1, %%mm3 \n\t"
1091
__asm__ volatile(SFENCE:::"memory");
1092
__asm__ volatile(EMMS:::"memory");
1095
register uint16_t bgr;
1097
*d++ = (bgr&0x1F)<<3;
1098
*d++ = (bgr&0x7E0)>>3;
1099
*d++ = (bgr&0xF800)>>8;
1104
* mm0 = 00 B3 00 B2 00 B1 00 B0
1105
* mm1 = 00 G3 00 G2 00 G1 00 G0
1106
* mm2 = 00 R3 00 R2 00 R1 00 R0
1107
* mm6 = FF FF FF FF FF FF FF FF
1108
* mm7 = 00 00 00 00 00 00 00 00
1110
#define PACK_RGB32 \
1111
"packuswb %%mm7, %%mm0 \n\t" /* 00 00 00 00 B3 B2 B1 B0 */ \
1112
"packuswb %%mm7, %%mm1 \n\t" /* 00 00 00 00 G3 G2 G1 G0 */ \
1113
"packuswb %%mm7, %%mm2 \n\t" /* 00 00 00 00 R3 R2 R1 R0 */ \
1114
"punpcklbw %%mm1, %%mm0 \n\t" /* G3 B3 G2 B2 G1 B1 G0 B0 */ \
1115
"punpcklbw %%mm6, %%mm2 \n\t" /* FF R3 FF R2 FF R1 FF R0 */ \
1116
"movq %%mm0, %%mm3 \n\t" \
1117
"punpcklwd %%mm2, %%mm0 \n\t" /* FF R1 G1 B1 FF R0 G0 B0 */ \
1118
"punpckhwd %%mm2, %%mm3 \n\t" /* FF R3 G3 B3 FF R2 G2 B2 */ \
1119
MOVNTQ" %%mm0, %0 \n\t" \
1120
MOVNTQ" %%mm3, 8%0 \n\t" \
1122
static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, long src_size)
1124
const uint16_t *end;
1126
const uint16_t *mm_end;
1129
const uint16_t *s = (const uint16_t *)src;
1130
end = s + src_size/2;
1132
__asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
1133
__asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
1134
__asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
1136
while (s < mm_end) {
1138
PREFETCH" 32%1 \n\t"
1139
"movq %1, %%mm0 \n\t"
1140
"movq %1, %%mm1 \n\t"
1141
"movq %1, %%mm2 \n\t"
1142
"pand %2, %%mm0 \n\t"
1143
"pand %3, %%mm1 \n\t"
1144
"pand %4, %%mm2 \n\t"
1145
"psllq $3, %%mm0 \n\t"
1146
"psrlq $2, %%mm1 \n\t"
1147
"psrlq $7, %%mm2 \n\t"
1150
:"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
1155
__asm__ volatile(SFENCE:::"memory");
1156
__asm__ volatile(EMMS:::"memory");
1159
register uint16_t bgr;
1163
*d++ = (bgr&0x7C00)>>7;
1164
*d++ = (bgr&0x3E0)>>2;
1165
*d++ = (bgr&0x1F)<<3;
1167
*d++ = (bgr&0x1F)<<3;
1168
*d++ = (bgr&0x3E0)>>2;
1169
*d++ = (bgr&0x7C00)>>7;
1175
static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, long src_size)
1177
const uint16_t *end;
1179
const uint16_t *mm_end;
1182
const uint16_t *s = (const uint16_t*)src;
1183
end = s + src_size/2;
1185
__asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
1186
__asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
1187
__asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
1189
while (s < mm_end) {
1191
PREFETCH" 32%1 \n\t"
1192
"movq %1, %%mm0 \n\t"
1193
"movq %1, %%mm1 \n\t"
1194
"movq %1, %%mm2 \n\t"
1195
"pand %2, %%mm0 \n\t"
1196
"pand %3, %%mm1 \n\t"
1197
"pand %4, %%mm2 \n\t"
1198
"psllq $3, %%mm0 \n\t"
1199
"psrlq $3, %%mm1 \n\t"
1200
"psrlq $8, %%mm2 \n\t"
1203
:"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
1208
__asm__ volatile(SFENCE:::"memory");
1209
__asm__ volatile(EMMS:::"memory");
1212
register uint16_t bgr;
1216
*d++ = (bgr&0xF800)>>8;
1217
*d++ = (bgr&0x7E0)>>3;
1218
*d++ = (bgr&0x1F)<<3;
1220
*d++ = (bgr&0x1F)<<3;
1221
*d++ = (bgr&0x7E0)>>3;
1222
*d++ = (bgr&0xF800)>>8;
1228
static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
1230
x86_reg idx = 15 - src_size;
1231
const uint8_t *s = src-idx;
1232
uint8_t *d = dst-idx;
1237
PREFETCH" (%1, %0) \n\t"
1238
"movq %3, %%mm7 \n\t"
1239
"pxor %4, %%mm7 \n\t"
1240
"movq %%mm7, %%mm6 \n\t"
1241
"pxor %5, %%mm7 \n\t"
1244
PREFETCH" 32(%1, %0) \n\t"
1245
"movq (%1, %0), %%mm0 \n\t"
1246
"movq 8(%1, %0), %%mm1 \n\t"
1248
"pshufw $177, %%mm0, %%mm3 \n\t"
1249
"pshufw $177, %%mm1, %%mm5 \n\t"
1250
"pand %%mm7, %%mm0 \n\t"
1251
"pand %%mm6, %%mm3 \n\t"
1252
"pand %%mm7, %%mm1 \n\t"
1253
"pand %%mm6, %%mm5 \n\t"
1254
"por %%mm3, %%mm0 \n\t"
1255
"por %%mm5, %%mm1 \n\t"
1257
"movq %%mm0, %%mm2 \n\t"
1258
"movq %%mm1, %%mm4 \n\t"
1259
"pand %%mm7, %%mm0 \n\t"
1260
"pand %%mm6, %%mm2 \n\t"
1261
"pand %%mm7, %%mm1 \n\t"
1262
"pand %%mm6, %%mm4 \n\t"
1263
"movq %%mm2, %%mm3 \n\t"
1264
"movq %%mm4, %%mm5 \n\t"
1265
"pslld $16, %%mm2 \n\t"
1266
"psrld $16, %%mm3 \n\t"
1267
"pslld $16, %%mm4 \n\t"
1268
"psrld $16, %%mm5 \n\t"
1269
"por %%mm2, %%mm0 \n\t"
1270
"por %%mm4, %%mm1 \n\t"
1271
"por %%mm3, %%mm0 \n\t"
1272
"por %%mm5, %%mm1 \n\t"
1274
MOVNTQ" %%mm0, (%2, %0) \n\t"
1275
MOVNTQ" %%mm1, 8(%2, %0) \n\t"
1282
: "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one)
1285
for (; idx<15; idx+=4) {
1286
register int v = *(const uint32_t *)&s[idx], g = v & 0xff00ff00;
1288
*(uint32_t *)&d[idx] = (v>>16) + g + (v<<16);
1292
static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
1296
x86_reg mmx_size= 23 - src_size;
1298
"test %%"REG_a", %%"REG_a" \n\t"
1300
"movq "MANGLE(mask24r)", %%mm5 \n\t"
1301
"movq "MANGLE(mask24g)", %%mm6 \n\t"
1302
"movq "MANGLE(mask24b)", %%mm7 \n\t"
1305
PREFETCH" 32(%1, %%"REG_a") \n\t"
1306
"movq (%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
1307
"movq (%1, %%"REG_a"), %%mm1 \n\t" // BGR BGR BG
1308
"movq 2(%1, %%"REG_a"), %%mm2 \n\t" // R BGR BGR B
1309
"psllq $16, %%mm0 \n\t" // 00 BGR BGR
1310
"pand %%mm5, %%mm0 \n\t"
1311
"pand %%mm6, %%mm1 \n\t"
1312
"pand %%mm7, %%mm2 \n\t"
1313
"por %%mm0, %%mm1 \n\t"
1314
"por %%mm2, %%mm1 \n\t"
1315
"movq 6(%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
1316
MOVNTQ" %%mm1, (%2, %%"REG_a") \n\t" // RGB RGB RG
1317
"movq 8(%1, %%"REG_a"), %%mm1 \n\t" // R BGR BGR B
1318
"movq 10(%1, %%"REG_a"), %%mm2 \n\t" // GR BGR BGR
1319
"pand %%mm7, %%mm0 \n\t"
1320
"pand %%mm5, %%mm1 \n\t"
1321
"pand %%mm6, %%mm2 \n\t"
1322
"por %%mm0, %%mm1 \n\t"
1323
"por %%mm2, %%mm1 \n\t"
1324
"movq 14(%1, %%"REG_a"), %%mm0 \n\t" // R BGR BGR B
1325
MOVNTQ" %%mm1, 8(%2, %%"REG_a") \n\t" // B RGB RGB R
1326
"movq 16(%1, %%"REG_a"), %%mm1 \n\t" // GR BGR BGR
1327
"movq 18(%1, %%"REG_a"), %%mm2 \n\t" // BGR BGR BG
1328
"pand %%mm6, %%mm0 \n\t"
1329
"pand %%mm7, %%mm1 \n\t"
1330
"pand %%mm5, %%mm2 \n\t"
1331
"por %%mm0, %%mm1 \n\t"
1332
"por %%mm2, %%mm1 \n\t"
1333
MOVNTQ" %%mm1, 16(%2, %%"REG_a") \n\t"
1334
"add $24, %%"REG_a" \n\t"
1338
: "r" (src-mmx_size), "r"(dst-mmx_size)
1341
__asm__ volatile(SFENCE:::"memory");
1342
__asm__ volatile(EMMS:::"memory");
1344
if (mmx_size==23) return; //finished, was multiple of 8
1348
src_size= 23-mmx_size;
1352
for (i=0; i<src_size; i+=3) {
1355
dst[i + 1] = src[i + 1];
1356
dst[i + 2] = src[i + 0];
1361
static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1362
long width, long height,
1363
long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
1366
const x86_reg chromWidth= width>>1;
1367
for (y=0; y<height; y++) {
1369
//FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
1371
"xor %%"REG_a", %%"REG_a" \n\t"
1374
PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
1375
PREFETCH" 32(%2, %%"REG_a") \n\t"
1376
PREFETCH" 32(%3, %%"REG_a") \n\t"
1377
"movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
1378
"movq %%mm0, %%mm2 \n\t" // U(0)
1379
"movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
1380
"punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1381
"punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1383
"movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
1384
"movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
1385
"movq %%mm3, %%mm4 \n\t" // Y(0)
1386
"movq %%mm5, %%mm6 \n\t" // Y(8)
1387
"punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
1388
"punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
1389
"punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
1390
"punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)
1392
MOVNTQ" %%mm3, (%0, %%"REG_a", 4) \n\t"
1393
MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
1394
MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4) \n\t"
1395
MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"
1397
"add $8, %%"REG_a" \n\t"
1398
"cmp %4, %%"REG_a" \n\t"
1400
::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1405
#if ARCH_ALPHA && HAVE_MVI
1406
#define pl2yuy2(n) \
1411
__asm__("unpkbw %1, %0" : "=r"(y1) : "r"(y1)); \
1412
__asm__("unpkbw %1, %0" : "=r"(y2) : "r"(y2)); \
1413
__asm__("unpkbl %1, %0" : "=r"(u) : "r"(u)); \
1414
__asm__("unpkbl %1, %0" : "=r"(v) : "r"(v)); \
1415
yuv1 = (u << 8) + (v << 24); \
1422
uint64_t *qdst = (uint64_t *) dst;
1423
uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
1424
const uint32_t *yc = (uint32_t *) ysrc;
1425
const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
1426
const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
1427
for (i = 0; i < chromWidth; i += 8) {
1428
uint64_t y1, y2, yuv1, yuv2;
1431
__asm__("ldq $31,64(%0)" :: "r"(yc));
1432
__asm__("ldq $31,64(%0)" :: "r"(yc2));
1433
__asm__("ldq $31,64(%0)" :: "r"(uc));
1434
__asm__("ldq $31,64(%0)" :: "r"(vc));
1452
#elif HAVE_FAST_64BIT
1454
uint64_t *ldst = (uint64_t *) dst;
1455
const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1456
for (i = 0; i < chromWidth; i += 2) {
1458
k = yc[0] + (uc[0] << 8) +
1459
(yc[1] << 16) + (vc[0] << 24);
1460
l = yc[2] + (uc[1] << 8) +
1461
(yc[3] << 16) + (vc[1] << 24);
1462
*ldst++ = k + (l << 32);
1469
int i, *idst = (int32_t *) dst;
1470
const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1471
for (i = 0; i < chromWidth; i++) {
1473
*idst++ = (yc[0] << 24)+ (uc[0] << 16) +
1474
(yc[1] << 8) + (vc[0] << 0);
1476
*idst++ = yc[0] + (uc[0] << 8) +
1477
(yc[1] << 16) + (vc[0] << 24);
1485
if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
1486
usrc += chromStride;
1487
vsrc += chromStride;
1500
* Height should be a multiple of 2 and width should be a multiple of 16.
1501
* (If this is a problem for anyone then tell me, and I will fix it.)
1503
static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1504
long width, long height,
1505
long lumStride, long chromStride, long dstStride)
1507
//FIXME interpolate chroma
1508
RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1511
static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1512
long width, long height,
1513
long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
1516
const x86_reg chromWidth= width>>1;
1517
for (y=0; y<height; y++) {
1519
//FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
1521
"xor %%"REG_a", %%"REG_a" \n\t"
1524
PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
1525
PREFETCH" 32(%2, %%"REG_a") \n\t"
1526
PREFETCH" 32(%3, %%"REG_a") \n\t"
1527
"movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
1528
"movq %%mm0, %%mm2 \n\t" // U(0)
1529
"movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
1530
"punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1531
"punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1533
"movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
1534
"movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
1535
"movq %%mm0, %%mm4 \n\t" // Y(0)
1536
"movq %%mm2, %%mm6 \n\t" // Y(8)
1537
"punpcklbw %%mm3, %%mm0 \n\t" // YUYV YUYV(0)
1538
"punpckhbw %%mm3, %%mm4 \n\t" // YUYV YUYV(4)
1539
"punpcklbw %%mm5, %%mm2 \n\t" // YUYV YUYV(8)
1540
"punpckhbw %%mm5, %%mm6 \n\t" // YUYV YUYV(12)
1542
MOVNTQ" %%mm0, (%0, %%"REG_a", 4) \n\t"
1543
MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
1544
MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4) \n\t"
1545
MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"
1547
"add $8, %%"REG_a" \n\t"
1548
"cmp %4, %%"REG_a" \n\t"
1550
::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1554
//FIXME adapt the Alpha ASM code from yv12->yuy2
1558
uint64_t *ldst = (uint64_t *) dst;
1559
const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1560
for (i = 0; i < chromWidth; i += 2) {
1562
k = uc[0] + (yc[0] << 8) +
1563
(vc[0] << 16) + (yc[1] << 24);
1564
l = uc[1] + (yc[2] << 8) +
1565
(vc[1] << 16) + (yc[3] << 24);
1566
*ldst++ = k + (l << 32);
1573
int i, *idst = (int32_t *) dst;
1574
const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1575
for (i = 0; i < chromWidth; i++) {
1577
*idst++ = (uc[0] << 24)+ (yc[0] << 16) +
1578
(vc[0] << 8) + (yc[1] << 0);
1580
*idst++ = uc[0] + (yc[0] << 8) +
1581
(vc[0] << 16) + (yc[1] << 24);
1589
if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
1590
usrc += chromStride;
1591
vsrc += chromStride;
1604
* Height should be a multiple of 2 and width should be a multiple of 16
1605
* (If this is a problem for anyone then tell me, and I will fix it.)
1607
static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1608
long width, long height,
1609
long lumStride, long chromStride, long dstStride)
1611
//FIXME interpolate chroma
1612
RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1616
* Width should be a multiple of 16.
1618
static inline void RENAME(yuv422ptouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1619
long width, long height,
1620
long lumStride, long chromStride, long dstStride)
1622
RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1626
* Width should be a multiple of 16.
1628
static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1629
long width, long height,
1630
long lumStride, long chromStride, long dstStride)
1632
RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1636
* Height should be a multiple of 2 and width should be a multiple of 16.
1637
* (If this is a problem for anyone then tell me, and I will fix it.)
1639
static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1640
long width, long height,
1641
long lumStride, long chromStride, long srcStride)
1644
const x86_reg chromWidth= width>>1;
1645
for (y=0; y<height; y+=2) {
1648
"xor %%"REG_a", %%"REG_a" \n\t"
1649
"pcmpeqw %%mm7, %%mm7 \n\t"
1650
"psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1653
PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1654
"movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1655
"movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
1656
"movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
1657
"movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
1658
"psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
1659
"psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
1660
"pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
1661
"pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
1662
"packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1663
"packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
1665
MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"
1667
"movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(8)
1668
"movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(12)
1669
"movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
1670
"movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
1671
"psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
1672
"psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
1673
"pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
1674
"pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
1675
"packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
1676
"packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
1678
MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"
1680
"movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
1681
"movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
1682
"psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
1683
"psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
1684
"pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
1685
"pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
1686
"packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
1687
"packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
1689
MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
1690
MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
1692
"add $8, %%"REG_a" \n\t"
1693
"cmp %4, %%"REG_a" \n\t"
1695
::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1696
: "memory", "%"REG_a
1703
"xor %%"REG_a", %%"REG_a" \n\t"
1706
PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1707
"movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1708
"movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
1709
"movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8)
1710
"movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12)
1711
"pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
1712
"pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
1713
"pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
1714
"pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
1715
"packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
1716
"packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
1718
MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
1719
MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"
1721
"add $8, %%"REG_a" \n\t"
1722
"cmp %4, %%"REG_a" \n\t"
1725
::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1726
: "memory", "%"REG_a
1730
for (i=0; i<chromWidth; i++) {
1731
ydst[2*i+0] = src[4*i+0];
1732
udst[i] = src[4*i+1];
1733
ydst[2*i+1] = src[4*i+2];
1734
vdst[i] = src[4*i+3];
1739
for (i=0; i<chromWidth; i++) {
1740
ydst[2*i+0] = src[4*i+0];
1741
ydst[2*i+1] = src[4*i+2];
1744
udst += chromStride;
1745
vdst += chromStride;
1750
__asm__ volatile(EMMS" \n\t"
1756
static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
1757
uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1758
long width, long height, long lumStride, long chromStride)
1761
memcpy(ydst, ysrc, width*height);
1763
/* XXX: implement upscaling for U,V */
1766
static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, long srcWidth, long srcHeight, long srcStride, long dstStride)
1773
for (x=0; x<srcWidth-1; x++) {
1774
dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1775
dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1777
dst[2*srcWidth-1]= src[srcWidth-1];
1781
for (y=1; y<srcHeight; y++) {
1782
#if HAVE_MMX2 || HAVE_AMD3DNOW
1783
const x86_reg mmxSize= srcWidth&~15;
1785
"mov %4, %%"REG_a" \n\t"
1787
"movq (%0, %%"REG_a"), %%mm0 \n\t"
1788
"movq (%1, %%"REG_a"), %%mm1 \n\t"
1789
"movq 1(%0, %%"REG_a"), %%mm2 \n\t"
1790
"movq 1(%1, %%"REG_a"), %%mm3 \n\t"
1791
"movq -1(%0, %%"REG_a"), %%mm4 \n\t"
1792
"movq -1(%1, %%"REG_a"), %%mm5 \n\t"
1793
PAVGB" %%mm0, %%mm5 \n\t"
1794
PAVGB" %%mm0, %%mm3 \n\t"
1795
PAVGB" %%mm0, %%mm5 \n\t"
1796
PAVGB" %%mm0, %%mm3 \n\t"
1797
PAVGB" %%mm1, %%mm4 \n\t"
1798
PAVGB" %%mm1, %%mm2 \n\t"
1799
PAVGB" %%mm1, %%mm4 \n\t"
1800
PAVGB" %%mm1, %%mm2 \n\t"
1801
"movq %%mm5, %%mm7 \n\t"
1802
"movq %%mm4, %%mm6 \n\t"
1803
"punpcklbw %%mm3, %%mm5 \n\t"
1804
"punpckhbw %%mm3, %%mm7 \n\t"
1805
"punpcklbw %%mm2, %%mm4 \n\t"
1806
"punpckhbw %%mm2, %%mm6 \n\t"
1808
MOVNTQ" %%mm5, (%2, %%"REG_a", 2) \n\t"
1809
MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2) \n\t"
1810
MOVNTQ" %%mm4, (%3, %%"REG_a", 2) \n\t"
1811
MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2) \n\t"
1813
"movq %%mm5, (%2, %%"REG_a", 2) \n\t"
1814
"movq %%mm7, 8(%2, %%"REG_a", 2) \n\t"
1815
"movq %%mm4, (%3, %%"REG_a", 2) \n\t"
1816
"movq %%mm6, 8(%3, %%"REG_a", 2) \n\t"
1818
"add $8, %%"REG_a" \n\t"
1820
:: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ),
1821
"r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
1827
const x86_reg mmxSize=1;
1829
dst[0 ]= (3*src[0] + src[srcStride])>>2;
1830
dst[dstStride]= ( src[0] + 3*src[srcStride])>>2;
1832
for (x=mmxSize-1; x<srcWidth-1; x++) {
1833
dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2;
1834
dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2;
1835
dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2;
1836
dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2;
1838
dst[srcWidth*2 -1 ]= (3*src[srcWidth-1] + src[srcWidth-1 + srcStride])>>2;
1839
dst[srcWidth*2 -1 + dstStride]= ( src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
1849
for (x=0; x<srcWidth-1; x++) {
1850
dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1851
dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1853
dst[2*srcWidth-1]= src[srcWidth-1];
1855
for (x=0; x<srcWidth; x++) {
1862
__asm__ volatile(EMMS" \n\t"
1869
* Height should be a multiple of 2 and width should be a multiple of 16.
1870
* (If this is a problem for anyone then tell me, and I will fix it.)
1871
* Chrominance data is only taken from every second line, others are ignored.
1872
* FIXME: Write HQ version.
1874
static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1875
long width, long height,
1876
long lumStride, long chromStride, long srcStride)
1879
const x86_reg chromWidth= width>>1;
1880
for (y=0; y<height; y+=2) {
1883
"xor %%"REG_a", %%"REG_a" \n\t"
1884
"pcmpeqw %%mm7, %%mm7 \n\t"
1885
"psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1888
PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1889
"movq (%0, %%"REG_a", 4), %%mm0 \n\t" // UYVY UYVY(0)
1890
"movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(4)
1891
"movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
1892
"movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
1893
"pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
1894
"pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
1895
"psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
1896
"psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
1897
"packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1898
"packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
1900
MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"
1902
"movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(8)
1903
"movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // UYVY UYVY(12)
1904
"movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
1905
"movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
1906
"pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
1907
"pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
1908
"psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
1909
"psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
1910
"packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
1911
"packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
1913
MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"
1915
"movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
1916
"movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
1917
"psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
1918
"psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
1919
"pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
1920
"pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
1921
"packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
1922
"packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
1924
MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
1925
MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
1927
"add $8, %%"REG_a" \n\t"
1928
"cmp %4, %%"REG_a" \n\t"
1930
::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1931
: "memory", "%"REG_a
1938
"xor %%"REG_a", %%"REG_a" \n\t"
1941
PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1942
"movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1943
"movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
1944
"movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8)
1945
"movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12)
1946
"psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
1947
"psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
1948
"psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
1949
"psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
1950
"packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
1951
"packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
1953
MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
1954
MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"
1956
"add $8, %%"REG_a" \n\t"
1957
"cmp %4, %%"REG_a" \n\t"
1960
::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1961
: "memory", "%"REG_a
1965
for (i=0; i<chromWidth; i++) {
1966
udst[i] = src[4*i+0];
1967
ydst[2*i+0] = src[4*i+1];
1968
vdst[i] = src[4*i+2];
1969
ydst[2*i+1] = src[4*i+3];
1974
for (i=0; i<chromWidth; i++) {
1975
ydst[2*i+0] = src[4*i+1];
1976
ydst[2*i+1] = src[4*i+3];
1979
udst += chromStride;
1980
vdst += chromStride;
1985
__asm__ volatile(EMMS" \n\t"
1992
* Height should be a multiple of 2 and width should be a multiple of 2.
1993
* (If this is a problem for anyone then tell me, and I will fix it.)
1994
* Chrominance data is only taken from every second line,
1995
* others are ignored in the C version.
1996
* FIXME: Write HQ version.
1998
static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1999
long width, long height,
2000
long lumStride, long chromStride, long srcStride)
2003
const x86_reg chromWidth= width>>1;
2005
for (y=0; y<height-2; y+=2) {
2007
for (i=0; i<2; i++) {
2009
"mov %2, %%"REG_a" \n\t"
2010
"movq "MANGLE(ff_bgr2YCoeff)", %%mm6 \n\t"
2011
"movq "MANGLE(ff_w1111)", %%mm5 \n\t"
2012
"pxor %%mm7, %%mm7 \n\t"
2013
"lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
2016
PREFETCH" 64(%0, %%"REG_d") \n\t"
2017
"movd (%0, %%"REG_d"), %%mm0 \n\t"
2018
"movd 3(%0, %%"REG_d"), %%mm1 \n\t"
2019
"punpcklbw %%mm7, %%mm0 \n\t"
2020
"punpcklbw %%mm7, %%mm1 \n\t"
2021
"movd 6(%0, %%"REG_d"), %%mm2 \n\t"
2022
"movd 9(%0, %%"REG_d"), %%mm3 \n\t"
2023
"punpcklbw %%mm7, %%mm2 \n\t"
2024
"punpcklbw %%mm7, %%mm3 \n\t"
2025
"pmaddwd %%mm6, %%mm0 \n\t"
2026
"pmaddwd %%mm6, %%mm1 \n\t"
2027
"pmaddwd %%mm6, %%mm2 \n\t"
2028
"pmaddwd %%mm6, %%mm3 \n\t"
2029
#ifndef FAST_BGR2YV12
2030
"psrad $8, %%mm0 \n\t"
2031
"psrad $8, %%mm1 \n\t"
2032
"psrad $8, %%mm2 \n\t"
2033
"psrad $8, %%mm3 \n\t"
2035
"packssdw %%mm1, %%mm0 \n\t"
2036
"packssdw %%mm3, %%mm2 \n\t"
2037
"pmaddwd %%mm5, %%mm0 \n\t"
2038
"pmaddwd %%mm5, %%mm2 \n\t"
2039
"packssdw %%mm2, %%mm0 \n\t"
2040
"psraw $7, %%mm0 \n\t"
2042
"movd 12(%0, %%"REG_d"), %%mm4 \n\t"
2043
"movd 15(%0, %%"REG_d"), %%mm1 \n\t"
2044
"punpcklbw %%mm7, %%mm4 \n\t"
2045
"punpcklbw %%mm7, %%mm1 \n\t"
2046
"movd 18(%0, %%"REG_d"), %%mm2 \n\t"
2047
"movd 21(%0, %%"REG_d"), %%mm3 \n\t"
2048
"punpcklbw %%mm7, %%mm2 \n\t"
2049
"punpcklbw %%mm7, %%mm3 \n\t"
2050
"pmaddwd %%mm6, %%mm4 \n\t"
2051
"pmaddwd %%mm6, %%mm1 \n\t"
2052
"pmaddwd %%mm6, %%mm2 \n\t"
2053
"pmaddwd %%mm6, %%mm3 \n\t"
2054
#ifndef FAST_BGR2YV12
2055
"psrad $8, %%mm4 \n\t"
2056
"psrad $8, %%mm1 \n\t"
2057
"psrad $8, %%mm2 \n\t"
2058
"psrad $8, %%mm3 \n\t"
2060
"packssdw %%mm1, %%mm4 \n\t"
2061
"packssdw %%mm3, %%mm2 \n\t"
2062
"pmaddwd %%mm5, %%mm4 \n\t"
2063
"pmaddwd %%mm5, %%mm2 \n\t"
2064
"add $24, %%"REG_d" \n\t"
2065
"packssdw %%mm2, %%mm4 \n\t"
2066
"psraw $7, %%mm4 \n\t"
2068
"packuswb %%mm4, %%mm0 \n\t"
2069
"paddusb "MANGLE(ff_bgr2YOffset)", %%mm0 \n\t"
2071
MOVNTQ" %%mm0, (%1, %%"REG_a") \n\t"
2072
"add $8, %%"REG_a" \n\t"
2074
: : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width)
2075
: "%"REG_a, "%"REG_d
2082
"mov %4, %%"REG_a" \n\t"
2083
"movq "MANGLE(ff_w1111)", %%mm5 \n\t"
2084
"movq "MANGLE(ff_bgr2UCoeff)", %%mm6 \n\t"
2085
"pxor %%mm7, %%mm7 \n\t"
2086
"lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
2087
"add %%"REG_d", %%"REG_d" \n\t"
2090
PREFETCH" 64(%0, %%"REG_d") \n\t"
2091
PREFETCH" 64(%1, %%"REG_d") \n\t"
2092
#if HAVE_MMX2 || HAVE_AMD3DNOW
2093
"movq (%0, %%"REG_d"), %%mm0 \n\t"
2094
"movq (%1, %%"REG_d"), %%mm1 \n\t"
2095
"movq 6(%0, %%"REG_d"), %%mm2 \n\t"
2096
"movq 6(%1, %%"REG_d"), %%mm3 \n\t"
2097
PAVGB" %%mm1, %%mm0 \n\t"
2098
PAVGB" %%mm3, %%mm2 \n\t"
2099
"movq %%mm0, %%mm1 \n\t"
2100
"movq %%mm2, %%mm3 \n\t"
2101
"psrlq $24, %%mm0 \n\t"
2102
"psrlq $24, %%mm2 \n\t"
2103
PAVGB" %%mm1, %%mm0 \n\t"
2104
PAVGB" %%mm3, %%mm2 \n\t"
2105
"punpcklbw %%mm7, %%mm0 \n\t"
2106
"punpcklbw %%mm7, %%mm2 \n\t"
2108
"movd (%0, %%"REG_d"), %%mm0 \n\t"
2109
"movd (%1, %%"REG_d"), %%mm1 \n\t"
2110
"movd 3(%0, %%"REG_d"), %%mm2 \n\t"
2111
"movd 3(%1, %%"REG_d"), %%mm3 \n\t"
2112
"punpcklbw %%mm7, %%mm0 \n\t"
2113
"punpcklbw %%mm7, %%mm1 \n\t"
2114
"punpcklbw %%mm7, %%mm2 \n\t"
2115
"punpcklbw %%mm7, %%mm3 \n\t"
2116
"paddw %%mm1, %%mm0 \n\t"
2117
"paddw %%mm3, %%mm2 \n\t"
2118
"paddw %%mm2, %%mm0 \n\t"
2119
"movd 6(%0, %%"REG_d"), %%mm4 \n\t"
2120
"movd 6(%1, %%"REG_d"), %%mm1 \n\t"
2121
"movd 9(%0, %%"REG_d"), %%mm2 \n\t"
2122
"movd 9(%1, %%"REG_d"), %%mm3 \n\t"
2123
"punpcklbw %%mm7, %%mm4 \n\t"
2124
"punpcklbw %%mm7, %%mm1 \n\t"
2125
"punpcklbw %%mm7, %%mm2 \n\t"
2126
"punpcklbw %%mm7, %%mm3 \n\t"
2127
"paddw %%mm1, %%mm4 \n\t"
2128
"paddw %%mm3, %%mm2 \n\t"
2129
"paddw %%mm4, %%mm2 \n\t"
2130
"psrlw $2, %%mm0 \n\t"
2131
"psrlw $2, %%mm2 \n\t"
2133
"movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
2134
"movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"
2136
"pmaddwd %%mm0, %%mm1 \n\t"
2137
"pmaddwd %%mm2, %%mm3 \n\t"
2138
"pmaddwd %%mm6, %%mm0 \n\t"
2139
"pmaddwd %%mm6, %%mm2 \n\t"
2140
#ifndef FAST_BGR2YV12
2141
"psrad $8, %%mm0 \n\t"
2142
"psrad $8, %%mm1 \n\t"
2143
"psrad $8, %%mm2 \n\t"
2144
"psrad $8, %%mm3 \n\t"
2146
"packssdw %%mm2, %%mm0 \n\t"
2147
"packssdw %%mm3, %%mm1 \n\t"
2148
"pmaddwd %%mm5, %%mm0 \n\t"
2149
"pmaddwd %%mm5, %%mm1 \n\t"
2150
"packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
2151
"psraw $7, %%mm0 \n\t"
2153
#if HAVE_MMX2 || HAVE_AMD3DNOW
2154
"movq 12(%0, %%"REG_d"), %%mm4 \n\t"
2155
"movq 12(%1, %%"REG_d"), %%mm1 \n\t"
2156
"movq 18(%0, %%"REG_d"), %%mm2 \n\t"
2157
"movq 18(%1, %%"REG_d"), %%mm3 \n\t"
2158
PAVGB" %%mm1, %%mm4 \n\t"
2159
PAVGB" %%mm3, %%mm2 \n\t"
2160
"movq %%mm4, %%mm1 \n\t"
2161
"movq %%mm2, %%mm3 \n\t"
2162
"psrlq $24, %%mm4 \n\t"
2163
"psrlq $24, %%mm2 \n\t"
2164
PAVGB" %%mm1, %%mm4 \n\t"
2165
PAVGB" %%mm3, %%mm2 \n\t"
2166
"punpcklbw %%mm7, %%mm4 \n\t"
2167
"punpcklbw %%mm7, %%mm2 \n\t"
2169
"movd 12(%0, %%"REG_d"), %%mm4 \n\t"
2170
"movd 12(%1, %%"REG_d"), %%mm1 \n\t"
2171
"movd 15(%0, %%"REG_d"), %%mm2 \n\t"
2172
"movd 15(%1, %%"REG_d"), %%mm3 \n\t"
2173
"punpcklbw %%mm7, %%mm4 \n\t"
2174
"punpcklbw %%mm7, %%mm1 \n\t"
2175
"punpcklbw %%mm7, %%mm2 \n\t"
2176
"punpcklbw %%mm7, %%mm3 \n\t"
2177
"paddw %%mm1, %%mm4 \n\t"
2178
"paddw %%mm3, %%mm2 \n\t"
2179
"paddw %%mm2, %%mm4 \n\t"
2180
"movd 18(%0, %%"REG_d"), %%mm5 \n\t"
2181
"movd 18(%1, %%"REG_d"), %%mm1 \n\t"
2182
"movd 21(%0, %%"REG_d"), %%mm2 \n\t"
2183
"movd 21(%1, %%"REG_d"), %%mm3 \n\t"
2184
"punpcklbw %%mm7, %%mm5 \n\t"
2185
"punpcklbw %%mm7, %%mm1 \n\t"
2186
"punpcklbw %%mm7, %%mm2 \n\t"
2187
"punpcklbw %%mm7, %%mm3 \n\t"
2188
"paddw %%mm1, %%mm5 \n\t"
2189
"paddw %%mm3, %%mm2 \n\t"
2190
"paddw %%mm5, %%mm2 \n\t"
2191
"movq "MANGLE(ff_w1111)", %%mm5 \n\t"
2192
"psrlw $2, %%mm4 \n\t"
2193
"psrlw $2, %%mm2 \n\t"
2195
"movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
2196
"movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"
2198
"pmaddwd %%mm4, %%mm1 \n\t"
2199
"pmaddwd %%mm2, %%mm3 \n\t"
2200
"pmaddwd %%mm6, %%mm4 \n\t"
2201
"pmaddwd %%mm6, %%mm2 \n\t"
2202
#ifndef FAST_BGR2YV12
2203
"psrad $8, %%mm4 \n\t"
2204
"psrad $8, %%mm1 \n\t"
2205
"psrad $8, %%mm2 \n\t"
2206
"psrad $8, %%mm3 \n\t"
2208
"packssdw %%mm2, %%mm4 \n\t"
2209
"packssdw %%mm3, %%mm1 \n\t"
2210
"pmaddwd %%mm5, %%mm4 \n\t"
2211
"pmaddwd %%mm5, %%mm1 \n\t"
2212
"add $24, %%"REG_d" \n\t"
2213
"packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
2214
"psraw $7, %%mm4 \n\t"
2216
"movq %%mm0, %%mm1 \n\t"
2217
"punpckldq %%mm4, %%mm0 \n\t"
2218
"punpckhdq %%mm4, %%mm1 \n\t"
2219
"packsswb %%mm1, %%mm0 \n\t"
2220
"paddb "MANGLE(ff_bgr2UVOffset)", %%mm0 \n\t"
2221
"movd %%mm0, (%2, %%"REG_a") \n\t"
2222
"punpckhdq %%mm0, %%mm0 \n\t"
2223
"movd %%mm0, (%3, %%"REG_a") \n\t"
2224
"add $4, %%"REG_a" \n\t"
2226
: : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
2227
: "%"REG_a, "%"REG_d
2230
udst += chromStride;
2231
vdst += chromStride;
2235
__asm__ volatile(EMMS" \n\t"
2241
for (; y<height; y+=2) {
2243
for (i=0; i<chromWidth; i++) {
2244
unsigned int b = src[6*i+0];
2245
unsigned int g = src[6*i+1];
2246
unsigned int r = src[6*i+2];
2248
unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2249
unsigned int V = ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
2250
unsigned int U = ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
2260
Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2266
for (i=0; i<chromWidth; i++) {
2267
unsigned int b = src[6*i+0];
2268
unsigned int g = src[6*i+1];
2269
unsigned int r = src[6*i+2];
2271
unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2279
Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2282
udst += chromStride;
2283
vdst += chromStride;
2289
static void RENAME(interleaveBytes)(const uint8_t *src1, const uint8_t *src2, uint8_t *dest,
2290
long width, long height, long src1Stride,
2291
long src2Stride, long dstStride)
2295
for (h=0; h < height; h++) {
2301
"xor %%"REG_a", %%"REG_a" \n\t"
2303
PREFETCH" 64(%1, %%"REG_a") \n\t"
2304
PREFETCH" 64(%2, %%"REG_a") \n\t"
2305
"movdqa (%1, %%"REG_a"), %%xmm0 \n\t"
2306
"movdqa (%1, %%"REG_a"), %%xmm1 \n\t"
2307
"movdqa (%2, %%"REG_a"), %%xmm2 \n\t"
2308
"punpcklbw %%xmm2, %%xmm0 \n\t"
2309
"punpckhbw %%xmm2, %%xmm1 \n\t"
2310
"movntdq %%xmm0, (%0, %%"REG_a", 2) \n\t"
2311
"movntdq %%xmm1, 16(%0, %%"REG_a", 2) \n\t"
2312
"add $16, %%"REG_a" \n\t"
2313
"cmp %3, %%"REG_a" \n\t"
2315
::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
2316
: "memory", "%"REG_a""
2320
"xor %%"REG_a", %%"REG_a" \n\t"
2322
PREFETCH" 64(%1, %%"REG_a") \n\t"
2323
PREFETCH" 64(%2, %%"REG_a") \n\t"
2324
"movq (%1, %%"REG_a"), %%mm0 \n\t"
2325
"movq 8(%1, %%"REG_a"), %%mm2 \n\t"
2326
"movq %%mm0, %%mm1 \n\t"
2327
"movq %%mm2, %%mm3 \n\t"
2328
"movq (%2, %%"REG_a"), %%mm4 \n\t"
2329
"movq 8(%2, %%"REG_a"), %%mm5 \n\t"
2330
"punpcklbw %%mm4, %%mm0 \n\t"
2331
"punpckhbw %%mm4, %%mm1 \n\t"
2332
"punpcklbw %%mm5, %%mm2 \n\t"
2333
"punpckhbw %%mm5, %%mm3 \n\t"
2334
MOVNTQ" %%mm0, (%0, %%"REG_a", 2) \n\t"
2335
MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2) \n\t"
2336
MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2) \n\t"
2337
MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2) \n\t"
2338
"add $16, %%"REG_a" \n\t"
2339
"cmp %3, %%"REG_a" \n\t"
2341
::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
2342
: "memory", "%"REG_a
2345
for (w= (width&(~15)); w < width; w++) {
2346
dest[2*w+0] = src1[w];
2347
dest[2*w+1] = src2[w];
2350
for (w=0; w < width; w++) {
2351
dest[2*w+0] = src1[w];
2352
dest[2*w+1] = src2[w];
2368
static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
2369
uint8_t *dst1, uint8_t *dst2,
2370
long width, long height,
2371
long srcStride1, long srcStride2,
2372
long dstStride1, long dstStride2)
2376
w=width/2; h=height/2;
2381
::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
2384
const uint8_t* s1=src1+srcStride1*(y>>1);
2385
uint8_t* d=dst1+dstStride1*y;
2388
for (;x<w-31;x+=32) {
2390
PREFETCH" 32%1 \n\t"
2391
"movq %1, %%mm0 \n\t"
2392
"movq 8%1, %%mm2 \n\t"
2393
"movq 16%1, %%mm4 \n\t"
2394
"movq 24%1, %%mm6 \n\t"
2395
"movq %%mm0, %%mm1 \n\t"
2396
"movq %%mm2, %%mm3 \n\t"
2397
"movq %%mm4, %%mm5 \n\t"
2398
"movq %%mm6, %%mm7 \n\t"
2399
"punpcklbw %%mm0, %%mm0 \n\t"
2400
"punpckhbw %%mm1, %%mm1 \n\t"
2401
"punpcklbw %%mm2, %%mm2 \n\t"
2402
"punpckhbw %%mm3, %%mm3 \n\t"
2403
"punpcklbw %%mm4, %%mm4 \n\t"
2404
"punpckhbw %%mm5, %%mm5 \n\t"
2405
"punpcklbw %%mm6, %%mm6 \n\t"
2406
"punpckhbw %%mm7, %%mm7 \n\t"
2407
MOVNTQ" %%mm0, %0 \n\t"
2408
MOVNTQ" %%mm1, 8%0 \n\t"
2409
MOVNTQ" %%mm2, 16%0 \n\t"
2410
MOVNTQ" %%mm3, 24%0 \n\t"
2411
MOVNTQ" %%mm4, 32%0 \n\t"
2412
MOVNTQ" %%mm5, 40%0 \n\t"
2413
MOVNTQ" %%mm6, 48%0 \n\t"
2414
MOVNTQ" %%mm7, 56%0"
2420
for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
2423
const uint8_t* s2=src2+srcStride2*(y>>1);
2424
uint8_t* d=dst2+dstStride2*y;
2427
for (;x<w-31;x+=32) {
2429
PREFETCH" 32%1 \n\t"
2430
"movq %1, %%mm0 \n\t"
2431
"movq 8%1, %%mm2 \n\t"
2432
"movq 16%1, %%mm4 \n\t"
2433
"movq 24%1, %%mm6 \n\t"
2434
"movq %%mm0, %%mm1 \n\t"
2435
"movq %%mm2, %%mm3 \n\t"
2436
"movq %%mm4, %%mm5 \n\t"
2437
"movq %%mm6, %%mm7 \n\t"
2438
"punpcklbw %%mm0, %%mm0 \n\t"
2439
"punpckhbw %%mm1, %%mm1 \n\t"
2440
"punpcklbw %%mm2, %%mm2 \n\t"
2441
"punpckhbw %%mm3, %%mm3 \n\t"
2442
"punpcklbw %%mm4, %%mm4 \n\t"
2443
"punpckhbw %%mm5, %%mm5 \n\t"
2444
"punpcklbw %%mm6, %%mm6 \n\t"
2445
"punpckhbw %%mm7, %%mm7 \n\t"
2446
MOVNTQ" %%mm0, %0 \n\t"
2447
MOVNTQ" %%mm1, 8%0 \n\t"
2448
MOVNTQ" %%mm2, 16%0 \n\t"
2449
MOVNTQ" %%mm3, 24%0 \n\t"
2450
MOVNTQ" %%mm4, 32%0 \n\t"
2451
MOVNTQ" %%mm5, 40%0 \n\t"
2452
MOVNTQ" %%mm6, 48%0 \n\t"
2453
MOVNTQ" %%mm7, 56%0"
2459
for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
2470
static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
2472
long width, long height,
2473
long srcStride1, long srcStride2,
2474
long srcStride3, long dstStride)
2478
w=width/2; h=height;
2480
const uint8_t* yp=src1+srcStride1*y;
2481
const uint8_t* up=src2+srcStride2*(y>>2);
2482
const uint8_t* vp=src3+srcStride3*(y>>2);
2483
uint8_t* d=dst+dstStride*y;
2488
PREFETCH" 32(%1, %0) \n\t"
2489
PREFETCH" 32(%2, %0) \n\t"
2490
PREFETCH" 32(%3, %0) \n\t"
2491
"movq (%1, %0, 4), %%mm0 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2492
"movq (%2, %0), %%mm1 \n\t" /* U0U1U2U3U4U5U6U7 */
2493
"movq (%3, %0), %%mm2 \n\t" /* V0V1V2V3V4V5V6V7 */
2494
"movq %%mm0, %%mm3 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2495
"movq %%mm1, %%mm4 \n\t" /* U0U1U2U3U4U5U6U7 */
2496
"movq %%mm2, %%mm5 \n\t" /* V0V1V2V3V4V5V6V7 */
2497
"punpcklbw %%mm1, %%mm1 \n\t" /* U0U0 U1U1 U2U2 U3U3 */
2498
"punpcklbw %%mm2, %%mm2 \n\t" /* V0V0 V1V1 V2V2 V3V3 */
2499
"punpckhbw %%mm4, %%mm4 \n\t" /* U4U4 U5U5 U6U6 U7U7 */
2500
"punpckhbw %%mm5, %%mm5 \n\t" /* V4V4 V5V5 V6V6 V7V7 */
2502
"movq %%mm1, %%mm6 \n\t"
2503
"punpcklbw %%mm2, %%mm1 \n\t" /* U0V0 U0V0 U1V1 U1V1*/
2504
"punpcklbw %%mm1, %%mm0 \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
2505
"punpckhbw %%mm1, %%mm3 \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
2506
MOVNTQ" %%mm0, (%4, %0, 8) \n\t"
2507
MOVNTQ" %%mm3, 8(%4, %0, 8) \n\t"
2509
"punpckhbw %%mm2, %%mm6 \n\t" /* U2V2 U2V2 U3V3 U3V3*/
2510
"movq 8(%1, %0, 4), %%mm0 \n\t"
2511
"movq %%mm0, %%mm3 \n\t"
2512
"punpcklbw %%mm6, %%mm0 \n\t" /* Y U2 Y V2 Y U2 Y V2*/
2513
"punpckhbw %%mm6, %%mm3 \n\t" /* Y U3 Y V3 Y U3 Y V3*/
2514
MOVNTQ" %%mm0, 16(%4, %0, 8) \n\t"
2515
MOVNTQ" %%mm3, 24(%4, %0, 8) \n\t"
2517
"movq %%mm4, %%mm6 \n\t"
2518
"movq 16(%1, %0, 4), %%mm0 \n\t"
2519
"movq %%mm0, %%mm3 \n\t"
2520
"punpcklbw %%mm5, %%mm4 \n\t"
2521
"punpcklbw %%mm4, %%mm0 \n\t" /* Y U4 Y V4 Y U4 Y V4*/
2522
"punpckhbw %%mm4, %%mm3 \n\t" /* Y U5 Y V5 Y U5 Y V5*/
2523
MOVNTQ" %%mm0, 32(%4, %0, 8) \n\t"
2524
MOVNTQ" %%mm3, 40(%4, %0, 8) \n\t"
2526
"punpckhbw %%mm5, %%mm6 \n\t"
2527
"movq 24(%1, %0, 4), %%mm0 \n\t"
2528
"movq %%mm0, %%mm3 \n\t"
2529
"punpcklbw %%mm6, %%mm0 \n\t" /* Y U6 Y V6 Y U6 Y V6*/
2530
"punpckhbw %%mm6, %%mm3 \n\t" /* Y U7 Y V7 Y U7 Y V7*/
2531
MOVNTQ" %%mm0, 48(%4, %0, 8) \n\t"
2532
MOVNTQ" %%mm3, 56(%4, %0, 8) \n\t"
2535
: "r"(yp), "r" (up), "r"(vp), "r"(d)
2540
const long x2 = x<<2;
2543
d[8*x+2] = yp[x2+1];
2545
d[8*x+4] = yp[x2+2];
2547
d[8*x+6] = yp[x2+3];
2560
static void RENAME(extract_even)(const uint8_t *src, uint8_t *dst, x86_reg count)
2570
"pcmpeqw %%mm7, %%mm7 \n\t"
2571
"psrlw $8, %%mm7 \n\t"
2573
"movq -30(%1, %0, 2), %%mm0 \n\t"
2574
"movq -22(%1, %0, 2), %%mm1 \n\t"
2575
"movq -14(%1, %0, 2), %%mm2 \n\t"
2576
"movq -6(%1, %0, 2), %%mm3 \n\t"
2577
"pand %%mm7, %%mm0 \n\t"
2578
"pand %%mm7, %%mm1 \n\t"
2579
"pand %%mm7, %%mm2 \n\t"
2580
"pand %%mm7, %%mm3 \n\t"
2581
"packuswb %%mm1, %%mm0 \n\t"
2582
"packuswb %%mm3, %%mm2 \n\t"
2583
MOVNTQ" %%mm0,-15(%2, %0) \n\t"
2584
MOVNTQ" %%mm2,- 7(%2, %0) \n\t"
2588
: "r"(src), "r"(dst)
2594
dst[count]= src[2*count];
2599
static void RENAME(extract_even2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
2609
"pcmpeqw %%mm7, %%mm7 \n\t"
2610
"psrlw $8, %%mm7 \n\t"
2612
"movq -28(%1, %0, 4), %%mm0 \n\t"
2613
"movq -20(%1, %0, 4), %%mm1 \n\t"
2614
"movq -12(%1, %0, 4), %%mm2 \n\t"
2615
"movq -4(%1, %0, 4), %%mm3 \n\t"
2616
"pand %%mm7, %%mm0 \n\t"
2617
"pand %%mm7, %%mm1 \n\t"
2618
"pand %%mm7, %%mm2 \n\t"
2619
"pand %%mm7, %%mm3 \n\t"
2620
"packuswb %%mm1, %%mm0 \n\t"
2621
"packuswb %%mm3, %%mm2 \n\t"
2622
"movq %%mm0, %%mm1 \n\t"
2623
"movq %%mm2, %%mm3 \n\t"
2624
"psrlw $8, %%mm0 \n\t"
2625
"psrlw $8, %%mm2 \n\t"
2626
"pand %%mm7, %%mm1 \n\t"
2627
"pand %%mm7, %%mm3 \n\t"
2628
"packuswb %%mm2, %%mm0 \n\t"
2629
"packuswb %%mm3, %%mm1 \n\t"
2630
MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
2631
MOVNTQ" %%mm1,- 7(%2, %0) \n\t"
2635
: "r"(src), "r"(dst0), "r"(dst1)
2641
dst0[count]= src[4*count+0];
2642
dst1[count]= src[4*count+2];
2647
static void RENAME(extract_even2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
2658
"pcmpeqw %%mm7, %%mm7 \n\t"
2659
"psrlw $8, %%mm7 \n\t"
2661
"movq -28(%1, %0, 4), %%mm0 \n\t"
2662
"movq -20(%1, %0, 4), %%mm1 \n\t"
2663
"movq -12(%1, %0, 4), %%mm2 \n\t"
2664
"movq -4(%1, %0, 4), %%mm3 \n\t"
2665
PAVGB" -28(%2, %0, 4), %%mm0 \n\t"
2666
PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
2667
PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
2668
PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
2669
"pand %%mm7, %%mm0 \n\t"
2670
"pand %%mm7, %%mm1 \n\t"
2671
"pand %%mm7, %%mm2 \n\t"
2672
"pand %%mm7, %%mm3 \n\t"
2673
"packuswb %%mm1, %%mm0 \n\t"
2674
"packuswb %%mm3, %%mm2 \n\t"
2675
"movq %%mm0, %%mm1 \n\t"
2676
"movq %%mm2, %%mm3 \n\t"
2677
"psrlw $8, %%mm0 \n\t"
2678
"psrlw $8, %%mm2 \n\t"
2679
"pand %%mm7, %%mm1 \n\t"
2680
"pand %%mm7, %%mm3 \n\t"
2681
"packuswb %%mm2, %%mm0 \n\t"
2682
"packuswb %%mm3, %%mm1 \n\t"
2683
MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
2684
MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
2688
: "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
2694
dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
2695
dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
2700
/*
 * extract_odd2: de-interleave one packed 4:2:2 row into two planar streams,
 * taking the odd-positioned byte of each 16-bit word (the psrlw $8 below
 * discards the even byte).  For YUYV input this yields the U and V planes.
 *
 * NOTE(review): bare integer lines below are extraction artifacts (original
 * source line numbers); the braces, pointer setup, asm loop label and output
 * constraints are missing from this view — restore from upstream before use.
 * In particular the scalar tail's src offsets only make sense after the
 * missing pointer adjustment; do not read them as even-byte accesses.
 */
static void RENAME(extract_odd2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
2710
/* mm7 := 0x00FF per 16-bit lane — low-byte mask used in the second split */
"pcmpeqw %%mm7, %%mm7 \n\t"
2711
"psrlw $8, %%mm7 \n\t"
2713
/* load 32 source bytes per iteration (base + 4*count, count negative) */
"movq -28(%1, %0, 4), %%mm0 \n\t"
2714
"movq -20(%1, %0, 4), %%mm1 \n\t"
2715
"movq -12(%1, %0, 4), %%mm2 \n\t"
2716
"movq -4(%1, %0, 4), %%mm3 \n\t"
2717
/* keep the high (odd-positioned) byte of each word */
"psrlw $8, %%mm0 \n\t"
2718
"psrlw $8, %%mm1 \n\t"
2719
"psrlw $8, %%mm2 \n\t"
2720
"psrlw $8, %%mm3 \n\t"
2721
"packuswb %%mm1, %%mm0 \n\t"
2722
"packuswb %%mm3, %%mm2 \n\t"
2723
"movq %%mm0, %%mm1 \n\t"
2724
"movq %%mm2, %%mm3 \n\t"
2725
/* second split: high bytes -> mm0/mm2, low bytes -> mm1/mm3 */
"psrlw $8, %%mm0 \n\t"
2726
"psrlw $8, %%mm2 \n\t"
2727
"pand %%mm7, %%mm1 \n\t"
2728
"pand %%mm7, %%mm3 \n\t"
2729
"packuswb %%mm2, %%mm0 \n\t"
2730
"packuswb %%mm3, %%mm1 \n\t"
2731
/* non-temporal stores, 8 bytes per destination plane per iteration */
MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
2732
MOVNTQ" %%mm1,- 7(%2, %0) \n\t"
2736
: "r"(src), "r"(dst0), "r"(dst1)
2743
/* scalar tail for leftover pixels */
dst0[count]= src[4*count+0];
2744
dst1[count]= src[4*count+2];
2749
/*
 * extract_odd2avg: like extract_odd2, but first averages two packed rows
 * (src0, src1) byte-wise with PAVGB before splitting the odd-positioned
 * bytes into dst0/dst1.  Used to downsample chroma vertically when
 * converting packed YUYV to planar 4:2:0.
 *
 * NOTE(review): bare integer lines below are extraction artifacts (original
 * source line numbers); braces, pointer setup, asm loop label and output
 * constraints are missing from this view — restore from upstream before use.
 */
static void RENAME(extract_odd2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
2760
/* mm7 := 0x00FF per 16-bit lane */
"pcmpeqw %%mm7, %%mm7 \n\t"
2761
"psrlw $8, %%mm7 \n\t"
2763
/* 32 bytes of src0 per iteration */
"movq -28(%1, %0, 4), %%mm0 \n\t"
2764
"movq -20(%1, %0, 4), %%mm1 \n\t"
2765
"movq -12(%1, %0, 4), %%mm2 \n\t"
2766
"movq -4(%1, %0, 4), %%mm3 \n\t"
2767
/* byte-wise average with src1 (PAVGB rounds up) */
PAVGB" -28(%2, %0, 4), %%mm0 \n\t"
2768
PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
2769
PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
2770
PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
2771
/* keep the high (odd-positioned) byte of each word */
"psrlw $8, %%mm0 \n\t"
2772
"psrlw $8, %%mm1 \n\t"
2773
"psrlw $8, %%mm2 \n\t"
2774
"psrlw $8, %%mm3 \n\t"
2775
"packuswb %%mm1, %%mm0 \n\t"
2776
"packuswb %%mm3, %%mm2 \n\t"
2777
"movq %%mm0, %%mm1 \n\t"
2778
"movq %%mm2, %%mm3 \n\t"
2779
/* second split: high bytes -> mm0/mm2, low bytes -> mm1/mm3 */
"psrlw $8, %%mm0 \n\t"
2780
"psrlw $8, %%mm2 \n\t"
2781
"pand %%mm7, %%mm1 \n\t"
2782
"pand %%mm7, %%mm3 \n\t"
2783
"packuswb %%mm2, %%mm0 \n\t"
2784
"packuswb %%mm3, %%mm1 \n\t"
2785
/* non-temporal stores, 8 bytes per plane per iteration */
MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
2786
MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
2790
: "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
2798
/* scalar tail; NOTE(review): truncating >>1 here vs. rounding PAVGB above
 * can differ by 1 LSB — presumably accepted upstream, confirm before changing */
dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
2799
dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
2804
/*
 * yuyvtoyuv420: convert packed YUYV (Y at even bytes, U/V at odd bytes) to
 * planar 4:2:0.  Per row: luma is extracted from the even bytes; chroma is
 * produced by extract_odd2avg, which averages the current row with the
 * previous one (src-srcStride) — i.e. vertical 2:1 chroma downsampling.
 *
 * NOTE(review): bare integer lines are extraction artifacts; the loop's
 * local declaration of y, the odd/even-row gating around the chroma call,
 * the stride advances and closing braces are missing from this view —
 * restore from upstream before compiling.
 */
static void RENAME(yuyvtoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2805
long width, long height,
2806
long lumStride, long chromStride, long srcStride)
2809
/* ceil(width/2): negate, arithmetic shift, negate again */
const long chromWidth= -((-width)>>1);
2811
for (y=0; y<height; y++) {
2812
/* luma: every even byte of the packed row */
RENAME(extract_even)(src, ydst, width);
2814
/* chroma: odd bytes, averaged with the previous row */
RENAME(extract_odd2avg)(src-srcStride, src, udst, vdst, chromWidth);
2831
/*
 * yuyvtoyuv422: convert packed YUYV to planar 4:2:2.  Per row: luma from the
 * even bytes (extract_even), chroma from the odd bytes of the same row
 * (extract_odd2) — no vertical chroma downsampling.
 *
 * NOTE(review): bare integer lines are extraction artifacts; the declaration
 * of y, stride advances and closing braces are missing from this view —
 * restore from upstream before compiling.
 */
static void RENAME(yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2832
long width, long height,
2833
long lumStride, long chromStride, long srcStride)
2836
/* ceil(width/2) */
const long chromWidth= -((-width)>>1);
2838
for (y=0; y<height; y++) {
2839
RENAME(extract_even)(src, ydst, width);
2840
RENAME(extract_odd2)(src, udst, vdst, chromWidth);
2856
/*
 * uyvytoyuv420: convert packed UYVY (U/V at even bytes, Y at odd bytes) to
 * planar 4:2:0.  Per row: luma from the odd bytes (note the src+1); chroma
 * via extract_even2avg, averaging the even bytes of the current row with
 * those of the previous row (src-srcStride).
 *
 * NOTE(review): bare integer lines are extraction artifacts; the declaration
 * of y, the row gating around the chroma call, stride advances and closing
 * braces are missing from this view — restore from upstream before compiling.
 */
static void RENAME(uyvytoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2857
long width, long height,
2858
long lumStride, long chromStride, long srcStride)
2861
/* ceil(width/2) */
const long chromWidth= -((-width)>>1);
2863
for (y=0; y<height; y++) {
2864
/* luma sits at odd byte positions in UYVY, hence src+1 */
RENAME(extract_even)(src+1, ydst, width);
2866
RENAME(extract_even2avg)(src-srcStride, src, udst, vdst, chromWidth);
2883
/*
 * uyvytoyuv422: convert packed UYVY to planar 4:2:2.  Per row: luma from the
 * odd bytes (src+1), chroma from the even bytes of the same row
 * (extract_even2) — no vertical chroma downsampling.
 *
 * NOTE(review): bare integer lines are extraction artifacts; the declaration
 * of y, stride advances and closing braces are missing from this view —
 * restore from upstream before compiling.
 */
static void RENAME(uyvytoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2884
long width, long height,
2885
long lumStride, long chromStride, long srcStride)
2888
/* ceil(width/2) */
const long chromWidth= -((-width)>>1);
2890
for (y=0; y<height; y++) {
2891
/* luma at odd byte positions, hence src+1 */
RENAME(extract_even)(src+1, ydst, width);
2892
RENAME(extract_even2)(src, udst, vdst, chromWidth);
2908
static inline void RENAME(rgb2rgb_init)(void)
2910
rgb15to16 = RENAME(rgb15to16);
2911
rgb15tobgr24 = RENAME(rgb15tobgr24);
2912
rgb15to32 = RENAME(rgb15to32);
2913
rgb16tobgr24 = RENAME(rgb16tobgr24);
2914
rgb16to32 = RENAME(rgb16to32);
2915
rgb16to15 = RENAME(rgb16to15);
2916
rgb24tobgr16 = RENAME(rgb24tobgr16);
2917
rgb24tobgr15 = RENAME(rgb24tobgr15);
2918
rgb24tobgr32 = RENAME(rgb24tobgr32);
2919
rgb32to16 = RENAME(rgb32to16);
2920
rgb32to15 = RENAME(rgb32to15);
2921
rgb32tobgr24 = RENAME(rgb32tobgr24);
2922
rgb24to15 = RENAME(rgb24to15);
2923
rgb24to16 = RENAME(rgb24to16);
2924
rgb24tobgr24 = RENAME(rgb24tobgr24);
2925
rgb32tobgr32 = RENAME(rgb32tobgr32);
2926
rgb32tobgr16 = RENAME(rgb32tobgr16);
2927
rgb32tobgr15 = RENAME(rgb32tobgr15);
2928
yv12toyuy2 = RENAME(yv12toyuy2);
2929
yv12touyvy = RENAME(yv12touyvy);
2930
yuv422ptoyuy2 = RENAME(yuv422ptoyuy2);
2931
yuv422ptouyvy = RENAME(yuv422ptouyvy);
2932
yuy2toyv12 = RENAME(yuy2toyv12);
2933
// yvu9toyv12 = RENAME(yvu9toyv12);
2934
planar2x = RENAME(planar2x);
2935
rgb24toyv12 = RENAME(rgb24toyv12);
2936
interleaveBytes = RENAME(interleaveBytes);
2937
vu9_to_vu12 = RENAME(vu9_to_vu12);
2938
yvu9_to_yuy2 = RENAME(yvu9_to_yuy2);
2940
uyvytoyuv420 = RENAME(uyvytoyuv420);
2941
uyvytoyuv422 = RENAME(uyvytoyuv422);
2942
yuyvtoyuv420 = RENAME(yuyvtoyuv420);
2943
yuyvtoyuv422 = RENAME(yuyvtoyuv422);