1
/***************************************************************************
2
rgb2yuv16.c - description
5
copyright : (C) 2000 by Christian Gerlach
6
email : cgerlach@rhrk.uni-kl.de
7
***************************************************************************/
9
/***************************************************************************
11
* This program is free software; you can redistribute it and/or modify *
12
* it under the terms of the GNU General Public License as published by *
13
* the Free Software Foundation; either version 2 of the License, or *
14
* (at your option) any later version. *
16
***************************************************************************/
18
#include "rgb2yuv16.h"
20
// Bit masks selecting the three colour fields of a 16-bit pixel
// (63488 = 0xF800, 2016 = 0x07E0, 31 = 0x001F). Each mask is repeated four
// times so that a table can serve as the memory operand of "pand" in the
// inline assembly of this file.
// NOTE(review): only KEEPR is declared static; the remaining tables have
// external linkage - confirm whether that difference is intended.
static unsigned short KEEPR[4] = { 63488, 63488, 63488, 63488 };
21
unsigned short KEEPG[4] = { 2016, 2016, 2016, 2016 };
22
unsigned short KEEPB[4] = { 31, 31, 31, 31 };
24
// Y multipliers, one copy per lane; the *_fast routines apply them with
// "pmullw" and then shift the sum right by 7.
short Y_RED[4] = { 307, 307, 307, 307 };
25
short Y_GREEN[4] = { 302, 302, 302, 302 };
26
short Y_BLUE[4] = { 117, 117, 117, 117 };
28
// U multipliers, used the same way by rgb2yuv16bit_mmx422_row_fast.
short U_RED[4] = { -150, -150, -150, -150 };
29
short U_GREEN[4] = { -147, -147, -147, -147 };
30
short U_BLUE[4] = { 444, 444, 444, 444 };
32
// V multipliers, used the same way by rgb2yuv16bit_mmx422_row_fast.
short V_RED[4] = { 632, 632, 632, 632 };
33
short V_GREEN[4] = { -265, -265, -265, -265 };
34
short V_BLUE[4] = { -102, -102, -102, -102 };
37
// How to avoid these nasty compiler warnings?
38
// Here is one (maybe bad) method:
39
// Prints the addresses of the coefficient tables, which gives each of them a
// reference from C code (the inline assembly only names them inside strings).
// NOTE(review): the listing skips lines between the signature and the first
// printf shown here, and the closing brace is not shown either.
void dummyRGB2YUV16Bit() {
45
printf("%p\n",Y_GREEN);
46
printf("%p\n",Y_BLUE);
48
printf("%p\n",U_GREEN);
49
printf("%p\n",U_BLUE);
51
printf("%p\n",V_GREEN);
52
printf("%p\n",V_BLUE);
57
// Variant of rgb2yuv16bit_mmx whose parameters are unnamed and ignored: the
// only visible statement writes a "not compiled with INTEL" notice to cout.
// NOTE(review): a second definition with the same name appears further down;
// presumably a preprocessor condition (not visible here) selects one of them.
void rgb2yuv16bit_mmx(unsigned char* ,unsigned char* ,unsigned char* ,
58
unsigned char* ,int , int ) {
59
cout << "RGB->YUV not compiled with INTEL"<<endl;
63
// Variant of rgb2yuv16bit_mmx_fast whose parameters are unnamed and ignored:
// the only visible statement writes a "not compiled with INTEL" notice to cout.
// NOTE(review): a second definition with the same name appears further down;
// presumably a preprocessor condition (not visible here) selects one of them.
void rgb2yuv16bit_mmx_fast(unsigned char* ,unsigned char* ,unsigned char* ,
64
unsigned char* ,int , int ) {
65
cout << "RGB->YUV not compiled with INTEL"<<endl;
71
// Converts a single 16-bit pixel read from rgbSource into three bytes:
// dest[0] = Y_RGB, dest[1] = U_RGB, dest[2] = V_RGB.
// Y_RGB/U_RGB/V_RGB are not defined in this part of the file.
// NOTE(review): the statements that derive r, g and b from rgb are not
// present in this listing, nor are the braces of the function body.
void rgb2yuv16(unsigned char* rgbSource, unsigned char* dest)
73
// Reads 16 bits and post-increments through a cast. A cast result is not an
// lvalue in ISO C/C++, so this line depends on a compiler extension; the
// increment only changes this function's own copy of the pointer.
int rgb = *((unsigned short*) rgbSource)++;
78
dest[0] = Y_RGB(r, g, b);
79
dest[1] = U_RGB(r, g, b);
80
dest[2] = V_RGB(r, g, b);
83
// Scalar conversion of a 16-bit image into separate lum/cr/cb outputs.
// For each of height/2 row pairs: the first loop handles width/2 pixel pairs,
// writing lum, cr and cb for the first pixel of a pair and lum only for the
// second; the following loop writes lum only for width pixels.
// Note that cr receives U_RGB and cb receives V_RGB.
// NOTE(review): the lum/cr parameter line, the r/g/b extraction and all
// closing braces are missing from this listing.
void rgb2yuv16bit(unsigned char* rgbSource,
86
unsigned char* cb,int height, int width) {
88
int height2 = height / 2;
89
int width2 = width / 2;
90
int r, g, b, row, col, rgb;
92
for (row=0 ; row<height2 ; row++) {
93
for (col=0 ; col<width2 ; col++) {
94
// Cast-as-lvalue post-increment: not valid ISO C/C++, relies on a compiler
// extension (same construct is used for every pixel read below).
rgb = *((unsigned short*) rgbSource)++;
99
*lum++ = Y_RGB(r, g, b);
100
*cr++ = U_RGB(r, g, b);
101
*cb++ = V_RGB(r, g, b);
103
// Second pixel of the pair: luminance only.
rgb = *((unsigned short*) rgbSource)++;
108
*lum++ = Y_RGB(r, g, b);
110
// Luminance-only pass over width pixels.
for (col=0 ; col<width ; col++) {
111
rgb = *((unsigned short*) rgbSource)++;
116
*lum++ = Y_RGB(r, g, b);
124
// MMX driver: for each of height/2 iterations it passes one source row to
// rgb2yuv16bit_mmx422_row (lum, cr and cb) and the next row to
// rgb2y16bit_mmx_row (lum only), advancing rgbSource by width * 2 bytes
// after each call.
// NOTE(review): the lum/cr parameter line, any updates of lum/cr/cb, the use
// of width2 and the closing braces are not shown in this listing.
void rgb2yuv16bit_mmx(unsigned char* rgbSource,
127
unsigned char* cb,int height, int width) {
128
int height2 = height / 2;
129
int width2 = width / 2;
130
// Two bytes per source pixel.
int bytesPerLine = width * 2;
132
for (int row=0 ; row<height2 ; row++) {
133
rgb2yuv16bit_mmx422_row(rgbSource, lum, cr, cb, width);
134
rgbSource += bytesPerLine;
139
rgb2y16bit_mmx_row(rgbSource, lum, width);
140
rgbSource += bytesPerLine;
145
// Same driver structure as rgb2yuv16bit_mmx, but calling the *_fast row
// routines: rgb2yuv16bit_mmx422_row_fast for one row (lum, cr, cb) and
// rgb2y16bit_mmx_row_fast for the next (lum only), advancing rgbSource by
// width * 2 bytes after each call.
// NOTE(review): the lum/cr parameter line, any updates of lum/cr/cb, the use
// of width2 and the closing braces are not shown in this listing.
void rgb2yuv16bit_mmx_fast(unsigned char* rgbSource,
148
unsigned char* cb,int height, int width) {
150
int height2 = height / 2;
151
int width2 = width / 2;
152
// Two bytes per source pixel.
int bytesPerLine = width * 2;
154
for (int row=0 ; row<height2 ; row++) {
155
rgb2yuv16bit_mmx422_row_fast(rgbSource, lum, cr, cb, width);
156
rgbSource += bytesPerLine;
161
rgb2y16bit_mmx_row_fast(rgbSource, lum, width);
162
rgbSource += bytesPerLine;
167
// MMX row routine producing Y, U and V values from 16-bit source pixels.
// The visible templates first unpack the 16-bit pixels with the KEEPR/KEEPG/
// KEEPB masks, then run a pmaddwd-based "standard algorithm" on the unpacked
// data, scale with "psrad $15", and store eight Y bytes to (%1).
// Operand numbering used by the templates: %0 = rgb, %1 = lum, %2 = cr,
// %3 = cb, %4 = pixel (memory), %5 = buf (memory).
// ZEROSX, YR0GRX, YBG0BX, UR0GRX, UBG0BX, VR0GRX, VBG0BX, OFFSETWX, OFFSETDX,
// OFFSETBX and CLEARX are referenced by symbol name and are not defined in
// this part of the file.
// NOTE(review): the loop label targeted by "jnz", the pixel loads, the
// chroma stores, the pointer/counter updates and the closing of the asm
// statement are missing from this listing.
void rgb2yuv16bit_mmx422_row(unsigned char* rgb,
168
unsigned char* lum, unsigned char* cr,
169
unsigned char* cb, int pixel) {
170
// Scratch area addressed as "36%5", "44%5", "52%5" and "60%5" (8 bytes each);
// the highest slot ends at byte 68, i.e. 17 ints assuming 4-byte int.
// NOTE(review): prefixing a displacement to an "m" operand depends on how the
// compiler prints that operand - confirm it yields the intended address.
unsigned int buf[17];
177
__asm__ __volatile__ (
180
// unpack hicolor ( pixel 1 - 4)
183
"movq %%mm0, %%mm1\n"
184
"pand KEEPR, %%mm1\n"
185
"psrlq $8, %%mm1\n" // B3B2B1B0 -> mm1
186
"movq %%mm0, %%mm2\n"
187
"pand KEEPG, %%mm2\n"
188
"psrlq $3, %%mm2\n" // G3G2G1G0 -> mm2
189
"movq %%mm0, %%mm3\n"
190
"pand KEEPB, %%mm3\n"
191
"psllq $3, %%mm3\n" // KEEPB-masked field, shifted left by 3 -> mm3
193
"movq %%mm2, %%mm0\n"
194
"punpcklbw %%mm1, %%mm2\n"
195
"punpckhbw %%mm1, %%mm0\n"
197
"pxor %%mm5, %%mm5\n"
198
"movq %%mm3, %%mm4\n"
199
"punpcklbw %%mm5, %%mm3\n"
200
"punpckhbw %%mm5, %%mm4\n"
203
"por %%mm2, %%mm3\n" // 0B1G1R10B0G0G0 -> mm3
205
"por %%mm0, %%mm4\n" // 0B3G3R30B2G2G2 -> mm4
210
// next 4 pixels ------------------------------
212
"movq 8(%0), %%mm0\n"
214
"movq %%mm0, %%mm1\n"
215
"pand KEEPR, %%mm1\n"
216
"psrlq $8, %%mm1\n" // B3B2B1B0 -> mm1
217
"movq %%mm0, %%mm2\n"
218
"pand KEEPG, %%mm2\n"
219
"psrlq $3, %%mm2\n" // G3G2G1G0 -> mm2
220
"movq %%mm0, %%mm3\n"
221
"pand KEEPB, %%mm3\n"
222
"psllq $3, %%mm3\n" // KEEPB-masked field, shifted left by 3 -> mm3
224
"movq %%mm2, %%mm0\n"
225
"punpcklbw %%mm1, %%mm2\n"
226
"punpckhbw %%mm1, %%mm0\n"
228
"pxor %%mm5, %%mm5\n"
229
"movq %%mm3, %%mm4\n"
230
"punpcklbw %%mm5, %%mm3\n"
231
"punpckhbw %%mm5, %%mm4\n"
234
"por %%mm2, %%mm3\n" // 0B1G1R10B0G0G0 -> mm3
236
"por %%mm0, %%mm4\n" // 0B3G3R30B2G2G2 -> mm4
243
// standard algorithm --------------------------------------------------
246
// was: "movq (%0), %%mm1\n" // load G2R2B1G1R1B0G0R0
247
// ------------------------------
256
// ------------------------------
258
"pxor %%mm6, %%mm6\n" // 0 -> mm6
259
"movq %%mm1, %%mm0\n" // G2R2B1G1R1B0G0R0 -> mm0
260
"psrlq $16, %%mm1\n" // 00G2R2B1G1R1B0 -> mm1
261
"punpcklbw ZEROSX, %%mm0\n" // R1B0G0R0 -> mm0
262
"movq %%mm1, %%mm7\n" // 00G2R2B1G1R1B0 -> mm7
263
"punpcklbw ZEROSX, %%mm1\n" // B1G1R1B0 -> mm1
264
"movq %%mm0, %%mm2\n" // R1B0G0R0 -> mm2
265
"pmaddwd YR0GRX, %%mm0\n" // yrR1,ygG0+yrR0 -> mm0
267
"movq %%mm1, %%mm3\n" // B1G1R1B0 -> mm3
268
"pmaddwd YBG0BX, %%mm1\n" // ybB1+ygG1,ybB0 -> mm1
269
"movq %%mm2, %%mm4\n" // R1B0G0R0 -> mm4
270
"pmaddwd UR0GRX, %%mm2\n" // urR1,ugG0+urR0 -> mm2
271
"movq %%mm3, %%mm5\n" // B1G1R1B0 -> mm5
272
"pmaddwd UBG0BX, %%mm3\n" // ubB1+ugG1,ubB0 -> mm3
273
"punpckhbw %%mm6, %%mm7\n" // 00G2R2 -> mm7
274
"pmaddwd VR0GRX, %%mm4\n" // vrR1,vgG0+vrR0 -> mm4
275
"paddd %%mm1, %%mm0\n" // Y1Y0 -> mm0
277
"pmaddwd VBG0BX, %%mm5\n" // vbB1+vgG1,vbB0 -> mm5
280
// was: "movq 8(%0),%%mm1\n" // R5B4G4R4B3G3R3B2 -> mm1
281
// ------------------------------
294
// ------------------------------
296
"paddd %%mm3, %%mm2\n" // U1U0 -> mm2
298
"movq %%mm1, %%mm6\n" // R5B4G4R4B3G3R3B2 -> mm6
299
"punpcklbw ZEROSX, %%mm1\n" // B3G3R3B2 -> mm1
300
"paddd %%mm5, %%mm4\n" // V1V0 -> mm4
302
//----------------------------------------------------------------------
304
"movq %%mm1, %%mm5\n" // B3G3R3B2 -> mm5
305
"psllq $32, %%mm1\n" // R3B200 -> mm1
307
"paddd %%mm7, %%mm1\n" // R3B200+00G2R2=R3B2G2R2->mm1
309
"punpckhbw ZEROSX, %%mm6\n" // R5B4G4R3 -> mm6
310
"movq %%mm1, %%mm3\n" // R3B2G2R2 -> mm3
312
"pmaddwd YR0GRX, %%mm1\n" // yrR3,ygG2+yrR2 -> mm1
313
"movq %%mm5, %%mm7\n" // B3G3R3B2 -> mm7
315
"pmaddwd YBG0BX, %%mm5\n" // ybB3+ygG3,ybB2 -> mm5
316
"psrad $15, %%mm0\n" // 32-bit scaled Y1Y0 -> mm0
318
"movq %%mm6, 36%5\n" // R5B4G4R4 -> TEMP0
319
"movq %%mm3, %%mm6\n" // R3B2G2R2 -> mm6
320
"pmaddwd UR0GRX, %%mm6\n" // urR3,ugG2+urR2 -> mm6
321
"psrad $15, %%mm2\n" // 32-bit scaled U1U0 -> mm2
323
"paddd %%mm5, %%mm1\n" // Y3Y2 -> mm1
324
"movq %%mm7, %%mm5\n" // B3G3R3B2 -> mm5
325
"pmaddwd UBG0BX, %%mm7\n" // ubB3+ugG3,ubB2
326
"psrad $15, %%mm1\n" // 32-bit scaled Y3Y2 -> mm1
328
"pmaddwd VR0GRX, %%mm3\n" // vrR3,vgG2+vgR2
329
"packssdw %%mm1, %%mm0\n" // Y3Y2Y1Y0 -> mm0
331
"pmaddwd VBG0BX, %%mm5\n" // vbB3+vgG3,vbB2 -> mm5
332
"psrad $15, %%mm4\n" // 32-bit scaled V1V0 -> mm4
334
//----------------------------------------------------------------------
336
"paddd %%mm7, %%mm6\n" // U3U2 -> mm6
339
// was: "movq 16(%0), %%mm1\n" // B7G7R7B6G6R6B5G5 -> mm1
340
// ------------------------------
350
// ------------------------------
352
"movq %%mm1, %%mm7\n" // B7G7R7B6G6R6B5G5 -> mm7
353
"psrad $15, %%mm6\n" // 32-bit scaled U3U2 -> mm6
355
"paddd %%mm5, %%mm3\n" // V3V2 -> mm3
356
"psllq $16, %%mm7\n" // R7B6G6R6B5G500 -> mm7
358
"movq %%mm7, %%mm5\n" // R7B6G6R6B5G500 -> mm5
359
"psrad $15, %%mm3\n" // 32-bit scaled V3V2 -> mm3
361
"movq %%mm0, 44%5\n" // 32-bit scaled Y3Y2Y1Y0 -> TEMPY
363
"packssdw %%mm6, %%mm2\n" // 32-bit scaled U3U2U1U0 -> mm2
365
"movq 36%5, %%mm0\n" // R5B4G4R4 -> mm0
367
"punpcklbw ZEROSX, %%mm7\n" // B5G500 -> mm7
368
"movq %%mm0, %%mm6\n" // R5B4G4R4 -> mm6
370
"movq %%mm2, 52%5\n" // 32-bit scaled U3U2U1U0 -> TEMPU
371
"psrlq $32, %%mm0\n" // 00R5B4 -> mm0
373
"paddw %%mm0, %%mm7\n" // B5G5R5B4 -> mm7
374
"movq %%mm6, %%mm2\n" // B5B4G4R4 -> mm2
376
"pmaddwd YR0GRX, %%mm2\n" // yrR5,ygG4+yrR4 -> mm2
377
"movq %%mm7, %%mm0\n" // B5G5R5B4 -> mm0
379
"pmaddwd YBG0BX, %%mm7\n" // ybB5+ygG5,ybB4 -> mm7
380
"packssdw %%mm3, %%mm4\n" // 32-bit scaled V3V2V1V0 -> mm4
382
//----------------------------------------------------------------------
384
"movq %%mm4, 60%5\n" // 32-bit scaled V3V2V1V0 -> buf slot at offset 60
386
"movq %%mm6, %%mm4\n" // B5B4G4R4 -> mm4
388
"pmaddwd UR0GRX, %%mm6\n" // urR5,ugG4+urR4
389
"movq %%mm0, %%mm3\n" // B5G5R5B4 -> mm3
391
"pmaddwd UBG0BX, %%mm0\n" // ubB5+ugG5,ubB4
392
"paddd %%mm7, %%mm2\n" // Y5Y4 -> mm2
394
//----------------------------------------------------------------------
396
"pmaddwd VR0GRX, %%mm4\n" // vrR5,vgG4+vrR4 -> mm4
397
"pxor %%mm7, %%mm7\n" // 0 -> mm7
399
"pmaddwd VBG0BX, %%mm3\n" // vbB5+vgG5,vbB4 -> mm3
400
"punpckhbw %%mm7, %%mm1\n" // B7G7R7B6 -> mm1
402
"paddd %%mm6, %%mm0\n" // U5U4 -> mm0
403
"movq %%mm1, %%mm6\n" // B7G7R7B6 -> mm6
405
"pmaddwd YBG0BX, %%mm6\n" // ybB7+ygG7,ybB6 -> mm6
406
"punpckhbw %%mm7, %%mm5\n" // R7B6G6R6 -> mm5
408
"movq %%mm5, %%mm7\n" // R7B6G6R6 -> mm7
409
"paddd %%mm4, %%mm3\n" // V5V4 -> mm3
411
"pmaddwd YR0GRX, %%mm5\n" // yrR7,ygG6+yrR6 -> mm5
412
"movq %%mm1, %%mm4\n" // B7G7R7B6 -> mm4
414
"pmaddwd UBG0BX, %%mm4\n" // ubB7+ugG7,ubB6 -> mm4
415
"psrad $15, %%mm0\n" // 32-bit scaled U5U4 -> %%mm0
417
//----------------------------------------------------------------------
419
"paddd OFFSETWX, %%mm0\n" // add offset to U5U4 -> mm0
420
"psrad $15, %%mm2\n" // 32-bit scaled Y5Y4 -> mm2
422
"paddd %%mm5, %%mm6\n" // Y7Y6 -> mm6
423
"movq %%mm7, %%mm5\n" // R7B6G6R6 -> mm5
425
"pmaddwd UR0GRX, %%mm7\n" // urR7,ugG6+ugR6 -> mm7
426
"psrad $15, %%mm3\n" // 32-bit scaled V5V4 -> mm3
428
"pmaddwd VBG0BX, %%mm1\n" // vbB7+vgG7,vbB6 -> mm1
429
"psrad $15, %%mm6\n" // 32-bit scaled Y7Y6 -> mm6
431
"paddd OFFSETDX, %%mm4\n" // add offset to U7U6
432
"packssdw %%mm6, %%mm2\n" // Y7Y6Y5Y4 -> mm2
434
"pmaddwd VR0GRX, %%mm5\n" // vrR7,vgG6+vrR6 -> mm5
435
"paddd %%mm4, %%mm7\n" // U7U6 -> mm7
437
"psrad $15, %%mm7\n" // 32-bit scaled U7U6 -> mm7
439
//----------------------------------------------------------------------
441
"movq 44%5, %%mm6\n" // 32-bit scaled Y3Y2Y1Y0 -> mm6
442
"packssdw %%mm7, %%mm0\n" // 32-bit scaled U7U6U5U4 -> mm0
444
"movq 52%5, %%mm4\n" // 32-bit scaled U3U2U1U0 -> mm4
445
"packuswb %%mm2, %%mm6\n" // all 8 Y values -> mm6
447
"movq OFFSETBX, %%mm7\n" // 128,128,128,128 -> mm7
448
"paddd %%mm5, %%mm1\n" // V7V6 -> mm1
450
"paddw %%mm7, %%mm4\n" // add offset to U3U2U1U0/256
451
"psrad $15, %%mm1\n" // 32-bit scaled V7V6 -> mm1
453
//----------------------------------------------------------------------
455
"movq %%mm6, (%1)\n" // store Y
457
"packuswb %%mm0, %%mm4\n" // all 8 U values -> mm4
458
"movq 60%5, %%mm5\n" // 32-bit scaled V3V2V1V0 -> mm5
460
"packssdw %%mm1, %%mm3\n" // V7V6V5V4 -> mm3
461
"paddw %%mm7, %%mm5\n" // add offset to V3V2V1V0
462
"paddw %%mm7, %%mm3\n" // add offset to V7V6V5V4
464
"packuswb %%mm3, %%mm5\n" // ALL 8 V values -> mm5
466
"movq CLEARX, %%mm2\n"
467
"pand %%mm2, %%mm4\n"
468
"pand %%mm2, %%mm5\n"
470
"packuswb %%mm5, %%mm4\n"
481
// Loop back while the (not shown) preceding instruction leaves ZF clear.
"jnz rgb2yuv16_422\n"
486
// Operands: %0 rgb, %1 lum, %2 cr, %3 cb, %4 pixel, %5 buf.
: "r" (rgb), "r" (lum), "r" (cr), "r" (cb),
487
"m" (pixel), "m" (buf)
492
// "Fast" MMX row routine: works on eight 16-bit pixels per loop pass using
// 16-bit multiplies (pmullw) with the Y_*/U_*/V_* tables and a shift by 7.
// Visible steps: split pixels 0-3 (mm0) and 4-7 (loaded from 8(%0)) into
// three fields each, compute and store eight Y bytes to (%1)/4(%1), keep
// every second pixel via CLEARX, then compute four U bytes to (%2) and four
// V bytes to (%3).
// Operand numbering: %0 = rgb, %1 = lum, %2 = cr, %3 = cb, %4 = pixel.
// OFFSETBX and CLEARX are not defined in this part of the file.
// NOTE(review): the load of pixels 0-3, the pointer/counter updates before
// "jnz" and the closing of the asm statement are missing from this listing.
// NOTE(review): psrlw is a logical shift while the U/V sums can be negative
// (several U_*/V_* multipliers are negative); confirm that the following
// OFFSETBX add and CLEARX mask produce the intended bytes.
void rgb2yuv16bit_mmx422_row_fast(unsigned char* rgb,
493
unsigned char* lum, unsigned char* cr,
494
unsigned char* cb, int pixel)
496
__asm__ __volatile__ (
497
"rgb2yuv16_422_fast:\n"
499
// unpack hicolor ( pixel 0 - 3)
502
"movq %%mm0, %%mm1\n"
503
"pand KEEPR, %%mm1\n"
504
"psrlq $11, %%mm1\n" // B3B2B1B0 -> mm1
506
"movq %%mm0, %%mm2\n"
507
"pand KEEPG, %%mm2\n"
508
"psrlq $5, %%mm2\n" // G3G2G1G0 -> mm2
510
"movq %%mm0, %%mm3\n"
511
"pand KEEPB, %%mm3\n" // R3R2R1R0 -> mm3
513
// unpack hicolor ( pixel 4 - 7)
514
"movq 8(%0), %%mm0\n"
516
"movq %%mm0, %%mm4\n"
517
"pand KEEPR, %%mm4\n"
518
"psrlq $11, %%mm4\n" // B7B6B5B4 -> mm4
520
"movq %%mm0, %%mm5\n"
521
"pand KEEPG, %%mm5\n"
522
"psrlq $5, %%mm5\n" // G7G6G5G4 -> mm5
524
"movq %%mm0, %%mm6\n"
525
"pand KEEPB, %%mm6\n" // R7R6R5R4 -> mm6
528
// Y for pixels 4-7 (from mm6/mm5/mm4)
"movq %%mm6, %%mm7\n"
529
"pmullw Y_RED, %%mm7\n"
531
"movq %%mm5, %%mm0\n"
532
"pmullw Y_GREEN, %%mm0\n"
533
"paddw %%mm0, %%mm7\n"
535
"movq %%mm4, %%mm0\n"
536
"pmullw Y_BLUE, %%mm0\n"
537
"paddw %%mm0, %%mm7\n"
539
"psrlw $7, %%mm7\n" // Y7Y6Y5Y4 -> mm7
541
"pxor %%mm0, %%mm0\n"
542
"packuswb %%mm0, %%mm7\n"
543
"movd %%mm7, 4(%1)\n" // Y7Y6Y5Y4 -> lum + 4
547
// Y for pixels 0-3 (from mm3/mm2/mm1)
"movq %%mm3, %%mm7\n"
548
"pmullw Y_RED, %%mm7\n"
550
"movq %%mm2, %%mm0\n"
551
"pmullw Y_GREEN, %%mm0\n"
552
"paddw %%mm0, %%mm7\n"
554
"movq %%mm1, %%mm0\n"
555
"pmullw Y_BLUE, %%mm0\n"
556
"paddw %%mm0, %%mm7\n"
558
"psrlw $7, %%mm7\n" // Y3Y2Y1Y0 -> mm7
560
"pxor %%mm0, %%mm0\n"
561
"packuswb %%mm0, %%mm7\n"
562
"movd %%mm7, (%1)\n" // Y3Y2Y1Y0 -> lum
566
// Merge both halves to bytes and mask with CLEARX to keep every second pixel.
"packuswb %%mm4, %%mm1\n"
567
"pand CLEARX, %%mm1\n" // B6B4B2B0 -> mm1
568
"packuswb %%mm5, %%mm2\n"
569
"pand CLEARX, %%mm2\n" // G6G4G2G0 -> mm2
570
"packuswb %%mm6, %%mm3\n"
571
"pand CLEARX, %%mm3\n" // R6R4R2R0 -> mm3
574
// U for the four kept pixels
"movq %%mm3, %%mm7\n"
575
"pmullw U_RED, %%mm7\n"
577
"movq %%mm2, %%mm0\n"
578
"pmullw U_GREEN, %%mm0\n"
579
"paddw %%mm0, %%mm7\n"
581
"movq %%mm1, %%mm0\n"
582
"pmullw U_BLUE, %%mm0\n"
583
"paddw %%mm0, %%mm7\n"
585
"psrlw $7, %%mm7\n" // U3U2U1U0 -> mm7
586
"paddw OFFSETBX,%%mm7\n"
587
"pand CLEARX, %%mm7\n"
589
"pxor %%mm0, %%mm0\n"
590
"packuswb %%mm0, %%mm7\n"
591
"movd %%mm7, (%2)\n" // U3U2U1U0 -> cr
595
// V for the four kept pixels
"movq %%mm3, %%mm7\n"
596
"pmullw V_RED, %%mm7\n"
598
"movq %%mm2, %%mm0\n"
599
"pmullw V_GREEN, %%mm0\n"
600
"paddw %%mm0, %%mm7\n"
602
"movq %%mm1, %%mm0\n"
603
"pmullw V_BLUE, %%mm0\n"
604
"paddw %%mm0, %%mm7\n"
606
"psrlw $7, %%mm7\n" // V3V2V1V0 -> mm7
607
"paddw OFFSETBX,%%mm7\n"
608
"pand CLEARX, %%mm7\n"
610
"pxor %%mm0, %%mm0\n"
611
"packuswb %%mm0, %%mm7\n"
612
"movd %%mm7, (%3)\n" // V3V2V1V0 -> cb
618
// Loop back while the (not shown) preceding instruction leaves ZF clear.
"jnz rgb2yuv16_422_fast\n"
623
// Operands: %0 rgb, %1 lum, %2 cr, %3 cb, %4 pixel.
: "r" (rgb), "r" (lum), "r" (cr), "r" (cb), "m" (pixel)
628
// Luminance-only MMX row routine: the visible templates unpack 16-bit pixels
// with the KEEPR/KEEPG/KEEPB masks, compute Y with pmaddwd against YR0GRX and
// YBG0BX, scale with "psrad $15" and store eight Y bytes to (%1).
// Operand numbering: %0 = rgbSource, %1 = lum, %2 = pixel (memory),
// %3 = buf (memory).
// ZEROSX, YR0GRX and YBG0BX are not defined in this part of the file.
// NOTE(review): the loop label and branch, the pixel loads, the
// pointer/counter updates and the closing of the asm statement are missing
// from this listing.
void rgb2y16bit_mmx_row(unsigned char* rgbSource,
629
unsigned char* lum, int pixel)
631
// Scratch area addressed as "36%3" and "44%3" (8 bytes each).
// NOTE(review): prefixing a displacement to an "m" operand depends on how the
// compiler prints that operand - confirm it yields the intended address.
unsigned int buf[16];
636
__asm__ __volatile__ (
639
// unpack hicolor ( pixel 1 - 4)
642
"movq %%mm0, %%mm1\n"
643
"pand KEEPR, %%mm1\n"
644
"psrlq $8, %%mm1\n" // B3B2B1B0 -> mm1
645
"movq %%mm0, %%mm2\n"
646
"pand KEEPG, %%mm2\n"
647
"psrlq $3, %%mm2\n" // G3G2G1G0 -> mm2
648
"movq %%mm0, %%mm3\n"
649
"pand KEEPB, %%mm3\n"
650
"psllq $3, %%mm3\n" // KEEPB-masked field, shifted left by 3 -> mm3
652
"movq %%mm2, %%mm0\n"
653
"punpcklbw %%mm1, %%mm2\n"
654
"punpckhbw %%mm1, %%mm0\n"
656
"pxor %%mm5, %%mm5\n"
657
"movq %%mm3, %%mm4\n"
658
"punpcklbw %%mm5, %%mm3\n"
659
"punpckhbw %%mm5, %%mm4\n"
662
"por %%mm2, %%mm3\n" // 0B1G1R10B0G0G0 -> mm3
664
"por %%mm0, %%mm4\n" // 0B3G3R30B2G2G2 -> mm4
669
// next 4 pixels ------------------------------
671
"movq 8(%0), %%mm0\n"
673
"movq %%mm0, %%mm1\n"
674
"pand KEEPR, %%mm1\n"
675
"psrlq $8, %%mm1\n" // B3B2B1B0 -> mm1
676
"movq %%mm0, %%mm2\n"
677
"pand KEEPG, %%mm2\n"
678
"psrlq $3, %%mm2\n" // G3G2G1G0 -> mm2
679
"movq %%mm0, %%mm3\n"
680
"pand KEEPB, %%mm3\n"
681
"psllq $3, %%mm3\n" // KEEPB-masked field, shifted left by 3 -> mm3
683
"movq %%mm2, %%mm0\n"
684
"punpcklbw %%mm1, %%mm2\n"
685
"punpckhbw %%mm1, %%mm0\n"
687
"pxor %%mm5, %%mm5\n"
688
"movq %%mm3, %%mm4\n"
689
"punpcklbw %%mm5, %%mm3\n"
690
"punpckhbw %%mm5, %%mm4\n"
693
"por %%mm2, %%mm3\n" // 0B1G1R10B0G0G0 -> mm3
695
"por %%mm0, %%mm4\n" // 0B3G3R30B2G2G2 -> mm4
702
// standard algorithm --------------------------------------------------
705
// was: "movq (%0), %%mm1\n" // load G2R2B1G1R1B0G0R0
706
// ------------------------------
715
// ------------------------------
717
"pxor %%mm6, %%mm6\n" // 0 -> mm6
718
"movq %%mm1, %%mm0\n" // G2R2B1G1R1B0G0R0 -> mm0
719
"psrlq $16, %%mm1\n" // 00G2R2B1G1R1B0 -> mm1
720
"punpcklbw ZEROSX, %%mm0\n" // R1B0G0R0 -> mm0
721
"movq %%mm1, %%mm7\n" // 00G2R2B1G1R1B0 -> mm7
722
"punpcklbw ZEROSX, %%mm1\n" // B1G1R1B0 -> mm1
723
"movq %%mm0, %%mm2\n" // R1B0G0R0 -> mm2
724
"pmaddwd YR0GRX, %%mm0\n" // yrR1,ygG0+yrR0 -> mm0
725
"movq %%mm1, %%mm3\n" // B1G1R1B0 -> mm3
726
"pmaddwd YBG0BX, %%mm1\n" // ybB1+ygG1,ybB0 -> mm1
727
"movq %%mm2, %%mm4\n" // R1B0G0R0 -> mm4
728
"movq %%mm3, %%mm5\n" // B1G1R1B0 -> mm5
729
"punpckhbw %%mm6, %%mm7\n" // 00G2R2 -> mm7
730
"paddd %%mm1, %%mm0\n" // Y1Y0 -> mm0
733
// was: "movq 8(%0),%%mm1\n" // R5B4G4R4B3G3R3B2 -> mm1
734
// ------------------------------
747
// ------------------------------
749
"movq %%mm1, %%mm6\n" // R5B4G4R4B3G3R3B2 -> mm6
750
"punpcklbw ZEROSX, %%mm1\n" // B3G3R3B2 -> mm1
752
//----------------------------------------------------------------------
754
"movq %%mm1, %%mm5\n" // B3G3R3B2 -> mm5
755
"psllq $32, %%mm1\n" // R3B200 -> mm1
757
"paddd %%mm7, %%mm1\n" // R3B200+00G2R2=R3B2G2R2->mm1
759
"punpckhbw ZEROSX, %%mm6\n" // R5B4G4R3 -> mm6
760
"movq %%mm1, %%mm3\n" // R3B2G2R2 -> mm3
762
"pmaddwd YR0GRX, %%mm1\n" // yrR3,ygG2+yrR2 -> mm1
763
"movq %%mm5, %%mm7\n" // B3G3R3B2 -> mm7
765
"pmaddwd YBG0BX, %%mm5\n" // ybB3+ygG3,ybB2 -> mm5
766
"psrad $15, %%mm0\n" // 32-bit scaled Y1Y0 -> mm0
768
"movq %%mm6, 36%3\n" // R5B4G4R4 -> TEMP0
769
"movq %%mm3, %%mm6\n" // R3B2G2R2 -> mm6
771
"paddd %%mm5, %%mm1\n" // Y3Y2 -> mm1
772
"movq %%mm7, %%mm5\n" // B3G3R3B2 -> mm5
773
"psrad $15, %%mm1\n" // 32-bit scaled Y3Y2 -> mm1
775
"packssdw %%mm1, %%mm0\n" // Y3Y2Y1Y0 -> mm0
777
//----------------------------------------------------------------------
780
// was: "movq 16(%0), %%mm1\n" // B7G7R7B6G6R6B5G5 -> mm1
781
// ------------------------------
791
// ------------------------------
793
"movq %%mm1, %%mm7\n" // B7G7R7B6G6R6B5G5 -> mm7
795
"psllq $16, %%mm7\n" // R7B6G6R6B5G500 -> mm7
797
"movq %%mm7, %%mm5\n" // R7B6G6R6B5G500 -> mm5
799
"movq %%mm0, 44%3\n" // 32-bit scaled Y3Y2Y1Y0 -> TEMPY
801
"movq 36%3, %%mm0\n" // R5B4G4R4 -> mm0
803
"punpcklbw ZEROSX, %%mm7\n" // B5G500 -> mm7
804
"movq %%mm0, %%mm6\n" // R5B4G4R4 -> mm6
806
"psrlq $32, %%mm0\n" // 00R5B4 -> mm0
808
"paddw %%mm0, %%mm7\n" // B5G5R5B4 -> mm7
809
"movq %%mm6, %%mm2\n" // B5B4G4R4 -> mm2
811
"pmaddwd YR0GRX, %%mm2\n" // yrR5,ygG4+yrR4 -> mm2
813
"pmaddwd YBG0BX, %%mm7\n" // ybB5+ygG5,ybB4 -> mm7
815
//----------------------------------------------------------------------
816
"paddd %%mm7, %%mm2\n" // Y5Y4 -> mm2
818
//----------------------------------------------------------------------
820
"pxor %%mm7, %%mm7\n" // 0 -> mm7
822
"punpckhbw %%mm7, %%mm1\n" // B7G7R7B6 -> mm1
824
"movq %%mm1, %%mm6\n" // B7G7R7B6 -> mm6
826
"pmaddwd YBG0BX, %%mm6\n" // ybB7+ygG7,ybB6 -> mm6
827
"punpckhbw %%mm7, %%mm5\n" // R7B6G6R6 -> mm5
829
"pmaddwd YR0GRX, %%mm5\n" // yrR7,ygG6+yrR6 -> mm5
831
//----------------------------------------------------------------------
833
"psrad $15, %%mm2\n" // 32-bit scaled Y5Y4 -> mm2
835
"paddd %%mm5, %%mm6\n" // Y7Y6 -> mm6
836
"psrad $15, %%mm6\n" // 32-bit scaled Y7Y6 -> mm6
838
"packssdw %%mm6, %%mm2\n" // Y7Y6Y5Y4 -> mm2
840
//----------------------------------------------------------------------
842
"movq 44%3, %%mm6\n" // 32-bit scaled Y3Y2Y1Y0 -> mm6
843
"packuswb %%mm2, %%mm6\n" // all 8 Y values -> mm6
845
//----------------------------------------------------------------------
847
"movq %%mm6, (%1)\n" // store Y
856
// Operands: %0 rgbSource, %1 lum, %2 pixel, %3 buf.
: "r" (rgbSource), "r" (lum), "m" (pixel), "m" (buf)
861
// "Fast" luminance-only MMX row routine. The visible templates split the four
// 16-bit pixels held in mm0 into three fields with KEEPR/KEEPG/KEEPB, multiply
// them by Y_RED/Y_GREEN/Y_BLUE (pmullw), add the products, shift right by 7
// and pack the four results to bytes in mm4.
// Operands listed: rgb, lum and pixel (memory).
// NOTE(review): the load into mm0, the store of the result, the loop
// label/branch and the closing of the asm statement are missing from this
// listing.
void rgb2y16bit_mmx_row_fast(unsigned char* rgb, unsigned char* lum, int pixel)
863
__asm__ __volatile__ (
866
// unpack hicolor ( pixel 1 - 4)
869
"movq %%mm0, %%mm1\n"
870
"pand KEEPR, %%mm1\n"
871
"psrlq $11, %%mm1\n" // B3B2B1B0 -> mm1
873
"movq %%mm0, %%mm2\n"
874
"pand KEEPG, %%mm2\n"
875
"psrlq $5, %%mm2\n" // G3G2G1G0 -> mm2
876
"movq %%mm0, %%mm3\n"
877
"pand KEEPB, %%mm3\n" // R3R2R1R0 -> mm3
880
// Weighted sum of the three fields -> Y
"movq %%mm3, %%mm4\n"
881
"pmullw Y_RED, %%mm4\n"
883
"movq %%mm2, %%mm5\n"
884
"pmullw Y_GREEN, %%mm5\n"
885
"paddw %%mm5, %%mm4\n"
887
"movq %%mm1, %%mm6\n"
888
"pmullw Y_BLUE, %%mm6\n"
889
"paddw %%mm6, %%mm4\n"
891
"psrlw $7, %%mm4\n" // Y3Y2Y1Y0 -> mm4
893
"pxor %%mm5, %%mm5\n"
894
"packuswb %%mm5, %%mm4\n"
907
: "r" (rgb), "r" (lum), "m" (pixel)