~ubuntu-branches/ubuntu/hoary/kdemultimedia/hoary

« back to all changes in this revision

Viewing changes to mpeglib/lib/util/render/dither2YUV/rgb2yuv16.cpp

  • Committer: Bazaar Package Importer
  • Author(s): Martin Schulze
  • Date: 2003-01-22 15:00:51 UTC
  • Revision ID: james.westby@ubuntu.com-20030122150051-uihwkdoxf15mi1tn
Tags: upstream-2.2.2
Import upstream version 2.2.2

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
/***************************************************************************
 
2
                          rgb2yuv16.c  -  description
 
3
                             -------------------
 
4
    begin                : Tue Nov 2 2000
 
5
    copyright            : (C) 2000 by Christian Gerlach
 
6
    email                : cgerlach@rhrk.uni-kl.de
 
7
 ***************************************************************************/
 
8
 
 
9
/***************************************************************************
 
10
 *                                                                         *
 
11
 *   This program is free software; you can redistribute it and/or modify  *
 
12
 *   it under the terms of the GNU General Public License as published by  *
 
13
 *   the Free Software Foundation; either version 2 of the License, or     *
 
14
 *   (at your option) any later version.                                   *
 
15
 *                                                                         *
 
16
 ***************************************************************************/
 
17
 
 
18
#include "rgb2yuv16.h"
 
19
 
 
20
// Constant tables referenced by symbol name from the inline MMX assembly in
// this file. Each table repeats one 16-bit value four times so that a single
// 64-bit MMX operand applies it to four pixels at once.
//
// Bit masks for a 16-bit pixel:
// 63488 == 0xF800, the upper five bits.
static unsigned short KEEPR[4] = { 63488, 63488, 63488, 63488 };
 
21
// 2016 == 0x07E0, the middle six bits.
unsigned short KEEPG[4] = {  2016,  2016,  2016,  2016 };
 
22
// 31 == 0x001F, the lower five bits.
unsigned short KEEPB[4] = {    31,    31,    31,    31 };
 
23
 
 
24
// Luma weights: rgb2yuv16bit_mmx422_row_fast multiplies the three masked
// colour fields by these with pmullw, sums them and shifts right by 7.
short Y_RED[4]   = {  307,  307,  307,  307 };
 
25
short Y_GREEN[4] = {  302,  302,  302,  302 };
 
26
short Y_BLUE[4]  = {  117,  117,  117,  117 };
 
27
 
 
28
// Weights for the first chroma value (stored through the cr pointer by
// rgb2yuv16bit_mmx422_row_fast), applied the same way as the luma weights.
short U_RED[4]   = { -150, -150, -150, -150 };
 
29
short U_GREEN[4] = { -147, -147, -147, -147 };
 
30
short U_BLUE[4]  = {  444,  444,  444,  444 };
 
31
 
 
32
// Weights for the second chroma value (stored through the cb pointer by
// rgb2yuv16bit_mmx422_row_fast).
short V_RED[4]   = {  632,  632,  632,  632 };
 
33
short V_GREEN[4] = { -265, -265, -265, -265 };
 
34
short V_BLUE[4]  = { -102, -102, -102, -102 };
 
35
 
 
36
 
 
37
// how to avoid these nasty compiler warnings?
 
38
// here's one (maybe bad) method
 
39
void dummyRGB2YUV16Bit() {
 
40
  
 
41
  printf("%p\n",KEEPR);
 
42
  printf("%p\n",KEEPG);
 
43
  printf("%p\n",KEEPB);
 
44
  printf("%p\n",Y_RED);
 
45
  printf("%p\n",Y_GREEN);
 
46
  printf("%p\n",Y_BLUE);
 
47
  printf("%p\n",U_RED);
 
48
  printf("%p\n",U_GREEN);
 
49
  printf("%p\n",U_BLUE);
 
50
  printf("%p\n",V_RED);
 
51
  printf("%p\n",V_GREEN);
 
52
  printf("%p\n",V_BLUE);
 
53
}
 
54
  
 
55
 
 
56
#ifndef INTEL   
 
57
void rgb2yuv16bit_mmx(unsigned char* ,unsigned char* ,unsigned char* ,
 
58
                      unsigned char* ,int , int ) {
 
59
  cout << "RGB->YUV not compiled with INTEL"<<endl;
 
60
  exit(0);
 
61
}
 
62
 
 
63
void rgb2yuv16bit_mmx_fast(unsigned char* ,unsigned char* ,unsigned char* ,
 
64
                      unsigned char* ,int , int ) {
 
65
  cout << "RGB->YUV not compiled with INTEL"<<endl;
 
66
  exit(0);
 
67
}
 
68
#endif
 
69
 
 
70
 
 
71
void rgb2yuv16(unsigned char* rgbSource, unsigned char* dest)
 
72
{
 
73
        int rgb = *((unsigned short*) rgbSource)++;
 
74
        int r = RED(rgb);
 
75
        int g = GREEN(rgb);
 
76
        int b = BLUE(rgb);
 
77
        
 
78
        dest[0] = Y_RGB(r, g, b);
 
79
        dest[1] = U_RGB(r, g, b);
 
80
        dest[2] = V_RGB(r, g, b);
 
81
}
 
82
 
 
83
void rgb2yuv16bit(unsigned char* rgbSource,
 
84
                  unsigned char* lum,
 
85
                  unsigned char* cr,
 
86
                  unsigned char* cb,int height, int width) {
 
87
 
 
88
        int height2 = height / 2;
 
89
        int width2 = width / 2;
 
90
        int r, g, b, row, col, rgb;
 
91
 
 
92
        for (row=0 ; row<height2 ; row++) {
 
93
                for (col=0 ; col<width2 ; col++) {
 
94
                        rgb = *((unsigned short*) rgbSource)++;
 
95
                        r = RED(rgb);
 
96
                        g = GREEN(rgb);
 
97
                        b = BLUE(rgb);
 
98
                        
 
99
                        *lum++ = Y_RGB(r, g, b);
 
100
                        *cr++  = U_RGB(r, g, b);
 
101
                        *cb++  = V_RGB(r, g, b);
 
102
                        
 
103
                        rgb = *((unsigned short*) rgbSource)++;
 
104
                        r = RED(rgb);
 
105
                        g = GREEN(rgb);
 
106
                        b = BLUE(rgb);
 
107
                        
 
108
                        *lum++ = Y_RGB(r, g, b);
 
109
                }
 
110
                for (col=0 ; col<width ; col++) {
 
111
                        rgb = *((unsigned short*) rgbSource)++;
 
112
                        r = RED(rgb);
 
113
                        g = GREEN(rgb);
 
114
                        b = BLUE(rgb);
 
115
                        
 
116
                        *lum++ = Y_RGB(r, g, b);
 
117
                }
 
118
        }
 
119
}
 
120
 
 
121
 
 
122
#ifdef INTEL
 
123
 
 
124
void rgb2yuv16bit_mmx(unsigned char* rgbSource,
 
125
                      unsigned char* lum,
 
126
                      unsigned char* cr,
 
127
                      unsigned char* cb,int height, int width) {
 
128
        int height2 = height / 2;
 
129
        int width2 = width / 2;
 
130
        int bytesPerLine = width * 2;
 
131
        
 
132
        for (int row=0 ; row<height2 ; row++) {
 
133
                rgb2yuv16bit_mmx422_row(rgbSource, lum, cr, cb, width);
 
134
                rgbSource += bytesPerLine;
 
135
                lum += width;
 
136
                cr += width2;
 
137
                cb += width2;
 
138
 
 
139
                rgb2y16bit_mmx_row(rgbSource, lum, width);
 
140
                rgbSource += bytesPerLine;
 
141
                lum += width;
 
142
        }
 
143
}
 
144
 
 
145
void rgb2yuv16bit_mmx_fast(unsigned char* rgbSource,
 
146
                          unsigned char* lum,
 
147
                      unsigned char* cr,
 
148
                      unsigned char* cb,int height, int width) {
 
149
 
 
150
        int height2 = height / 2;
 
151
        int width2 = width / 2;
 
152
        int bytesPerLine = width * 2;
 
153
 
 
154
        for (int row=0 ; row<height2 ; row++) {
 
155
                rgb2yuv16bit_mmx422_row_fast(rgbSource, lum, cr, cb, width);
 
156
                rgbSource += bytesPerLine;
 
157
                lum += width;
 
158
                cr += width2;
 
159
                cb += width2;
 
160
 
 
161
                rgb2y16bit_mmx_row_fast(rgbSource, lum, width);
 
162
                rgbSource += bytesPerLine;
 
163
                lum += width;
 
164
        }
 
165
}
 
166
 
 
167
// Converts one row of 16-bit pixels with MMX, eight pixels per loop pass.
//   rgb    source pixels; 16 bytes are consumed per pass
//   lum    receives 8 luma bytes per pass
//   cr     receives 4 bytes per pass (the "U" results)
//   cb     receives 4 bytes per pass (the "V" results)
//   pixel  number of pixels in the row; rounded down to a multiple of 8,
//          nothing is done when fewer than 8 remain
//
// Each pass first expands the 8 pixels into 32-bit cells in buf (byte 0 =
// lower 5-bit field << 3, byte 1 = 6-bit field >> 3, byte 2 = upper 5-bit
// field >> 8, byte 3 = 0), then re-packs those cells into a 24-bit stream and
// runs the "standard algorithm" below on it. The register comments follow
// that algorithm's naming, in which byte 0 of a cell is called R.
// ZEROSX, YR0GRX, YBG0BX, UR0GRX, UBG0BX, VR0GRX, VBG0BX, OFFSETWX, OFFSETDX,
// OFFSETBX and CLEARX are constants defined outside this function.
void rgb2yuv16bit_mmx422_row(unsigned char* rgb, 
                             unsigned char* lum, unsigned char* cr,
                             unsigned char* cb, int pixel) {
        unsigned int buf[17];

        // scratch layout inside buf (byte offsets):
        //  0..31  = the 8 unpacked 32-bit pixel cells
        // 36+%5 = TEMP0
        // 44+%5 = TEMPY
        // 52+%5 = TEMPU
        // 60+%5 = TEMPV

        // The loop subtracts 8 per pass and stops only on exactly zero, so a
        // count that is not a positive multiple of 8 would never terminate.
        pixel &= ~7;
        if (pixel <= 0) {
                return;
        }

        // Notes on the asm interface:
        //  - numeric local label ("1:" / "1b") so the asm text may appear more
        //    than once in the object file without a duplicate-symbol error
        //  - every operand the code modifies is declared read-write ("+")
        //  - offsets into buf are written "N+%5" so they still form a valid
        //    address expression when %5 already prints with a displacement
        //  - "memory" because the code stores through lum/cr/cb, "cc" because
        //    add/sub change the flags
        __asm__ __volatile__ (
                "1:\n"

                // unpack hicolor ( pixel 1 - 4)
                "movq      (%0),    %%mm0\n"

                "movq      %%mm0,   %%mm1\n"
                "pand      KEEPR,   %%mm1\n"  
                "psrlq     $8,      %%mm1\n"  // B3B2B1B0 -> mm1
                "movq      %%mm0,   %%mm2\n"
                "pand      KEEPG,   %%mm2\n"
                "psrlq     $3,      %%mm2\n"  // G3G2G1G0 -> mm2
                "movq      %%mm0,   %%mm3\n"
                "pand      KEEPB,   %%mm3\n" 
                "psllq     $3,      %%mm3\n"  // R3R2R1R0 -> mm3
                
                "movq      %%mm2,   %%mm0\n"
                "punpcklbw %%mm1,   %%mm2\n" 
                "punpckhbw %%mm1,   %%mm0\n" 

                "pxor      %%mm5,   %%mm5\n"
                "movq      %%mm3,   %%mm4\n"
                "punpcklbw %%mm5,   %%mm3\n" 
                "punpckhbw %%mm5,   %%mm4\n" 
                
                "psllq     $8,      %%mm2\n"
                "por       %%mm2,   %%mm3\n"  // 0B1G1R10B0G0R0 -> mm3
                "psllq     $8,      %%mm0\n"
                "por       %%mm0,   %%mm4\n"  // 0B3G3R30B2G2R2 -> mm4

                "movq      %%mm3,   %5\n"
                "movq      %%mm4,   8+%5\n"

                // next 4 pixels ------------------------------

                "movq      8(%0),    %%mm0\n"

                "movq      %%mm0,   %%mm1\n"
                "pand      KEEPR,   %%mm1\n"  
                "psrlq     $8,      %%mm1\n"  // B7B6B5B4 -> mm1
                "movq      %%mm0,   %%mm2\n"
                "pand      KEEPG,   %%mm2\n"
                "psrlq     $3,      %%mm2\n"  // G7G6G5G4 -> mm2
                "movq      %%mm0,   %%mm3\n"
                "pand      KEEPB,   %%mm3\n" 
                "psllq     $3,      %%mm3\n"  // R7R6R5R4 -> mm3

                "movq      %%mm2,   %%mm0\n"
                "punpcklbw %%mm1,   %%mm2\n" 
                "punpckhbw %%mm1,   %%mm0\n" 

                "pxor      %%mm5,   %%mm5\n"
                "movq      %%mm3,   %%mm4\n"
                "punpcklbw %%mm5,   %%mm3\n" 
                "punpckhbw %%mm5,   %%mm4\n" 
                
                "psllq     $8,      %%mm2\n"
                "por       %%mm2,   %%mm3\n"  // 0B5G5R50B4G4R4 -> mm3
                "psllq     $8,      %%mm0\n"
                "por       %%mm0,   %%mm4\n"  // 0B7G7R70B6G6R6 -> mm4

                "movq      %%mm3,   16+%5\n"
                "movq      %%mm4,   24+%5\n"

                "add       $16,     %0\n"

                // standard algorithm --------------------------------------------------
                
                // pack rgb
                // was: "movq      (%0), %%mm1\n"  // load G2R2B1G1R1B0G0R0
                // ------------------------------
                // (uses: mm0, mm1)
                "movd      8+%5,    %%mm0\n"
                "psllq     $24,      %%mm0\n"  
                "movd      4+%5,    %%mm1\n"
                "por       %%mm1,    %%mm0\n"
                "psllq     $24,      %%mm0\n"
                "movd      %5,     %%mm1\n"
                "por       %%mm0,    %%mm1\n"
                // ------------------------------

                "pxor      %%mm6,   %%mm6\n"  // 0 -> mm6
                "movq      %%mm1,   %%mm0\n"  // G2R2B1G1R1B0G0R0 -> mm0
                "psrlq     $16,     %%mm1\n"  // 00G2R2B1G1R1B0 -> mm1
                "punpcklbw ZEROSX,  %%mm0\n"  // R1B0G0R0 -> mm0
                "movq      %%mm1,   %%mm7\n"  // 00G2R2B1G1R1B0 -> mm7
                "punpcklbw ZEROSX,  %%mm1\n"  // B1G1R1B0 -> mm1
                "movq      %%mm0,   %%mm2\n"  // R1B0G0R0 -> mm2
                "pmaddwd   YR0GRX,  %%mm0\n"  // yrR1,ygG0+yrR0 -> mm0

                "movq      %%mm1,   %%mm3\n"  // B1G1R1B0 -> mm3
                "pmaddwd   YBG0BX,  %%mm1\n"  // ybB1+ygG1,ybB0 -> mm1
                "movq      %%mm2,   %%mm4\n"  // R1B0G0R0 -> mm4
                "pmaddwd   UR0GRX,  %%mm2\n"  // urR1,ugG0+urR0 -> mm2
                "movq      %%mm3,   %%mm5\n"  // B1G1R1B0 -> mm5
                "pmaddwd   UBG0BX,  %%mm3\n"  // ubB1+ugG1,ubB0 -> mm3
                "punpckhbw %%mm6,   %%mm7\n"  // 00G2R2 -> mm7
                "pmaddwd   VR0GRX,  %%mm4\n"  // vrR1,vgG0+vrR0 -> mm4
                "paddd     %%mm1,   %%mm0\n"  // Y1Y0 -> mm0

                "pmaddwd   VBG0BX,  %%mm5\n"  // vbB1+vgG1,vbB0 -> mm5

                // pack rgb
                // was: "movq      8(%0),%%mm1\n"     // R5B4G4R4B3G3R3B2 -> mm1
                // ------------------------------
                // (uses: mm1, mm6)
                "movd      20+%5,  %%mm1\n"
                "psllq     $24,     %%mm1\n"
                "movd      16+%5,  %%mm6\n"
                "por       %%mm6,   %%mm1\n"
                "psllq     $24,     %%mm1\n"
                "movd      12+%5,  %%mm6\n"
                "por       %%mm6,   %%mm1\n"
                "psllq     $8,      %%mm1\n"      
                "movd      8+%5,   %%mm6\n"
                "psrlq     $16,     %%mm6\n"      
                "por       %%mm6,   %%mm1\n"
                // ------------------------------

                "paddd     %%mm3,   %%mm2\n"  // U1U0 -> mm2

                "movq      %%mm1,   %%mm6\n"  // R5B4G4R4B3G3R3B2 -> mm6
                "punpcklbw ZEROSX,  %%mm1\n"  // B3G3R3B2 -> mm1
                "paddd     %%mm5,   %%mm4\n"  // V1V0 -> mm4

                //----------------------------------------------------------------------                
                
                "movq      %%mm1,   %%mm5\n"  // B3G3R3B2 -> mm5
                "psllq     $32,     %%mm1\n"  // R3B200 -> mm1
                
                "paddd     %%mm7,   %%mm1\n"  // R3B200+00G2R2=R3B2G2R2->mm1
                
                "punpckhbw ZEROSX,  %%mm6\n"  // R5B4G4R4 -> mm6
                "movq      %%mm1,   %%mm3\n"  // R3B2G2R2 -> mm3
                
                "pmaddwd   YR0GRX,  %%mm1\n"  // yrR3,ygG2+yrR2 -> mm1
                "movq      %%mm5,   %%mm7\n"  // B3G3R3B2 -> mm7
                
                "pmaddwd   YBG0BX,  %%mm5\n"  // ybB3+ygG3,ybB2 -> mm5
                "psrad     $15,     %%mm0\n"  // 32-bit scaled Y1Y0 -> mm0
                
                "movq      %%mm6,   36+%5\n"  // R5B4G4R4 -> TEMP0
                "movq      %%mm3,   %%mm6\n"  // R3B2G2R2 -> mm6
                "pmaddwd   UR0GRX,  %%mm6\n"  // urR3,ugG2+urR2 -> mm6
                "psrad     $15,     %%mm2\n"  // 32-bit scaled U1U0 -> mm2
                
                "paddd     %%mm5,   %%mm1\n"  // Y3Y2 -> mm1
                "movq      %%mm7,   %%mm5\n"  // B3G3R3B2 -> mm5
                "pmaddwd   UBG0BX,  %%mm7\n"  // ubB3+ugG3,ubB2
                "psrad     $15,     %%mm1\n"  // 32-bit scaled Y3Y2 -> mm1
                
                "pmaddwd   VR0GRX,  %%mm3\n"  // vrR3,vgG2+vrR2
                "packssdw  %%mm1,   %%mm0\n"  // Y3Y2Y1Y0 -> mm0
                
                "pmaddwd   VBG0BX,  %%mm5\n"  // vbB3+vgG3,vbB2 -> mm5
                "psrad     $15,     %%mm4\n"  // 32-bit scaled V1V0 -> mm4

                //----------------------------------------------------------------------

                "paddd     %%mm7,    %%mm6\n"  // U3U2 -> mm6    

                // pack rgb
                // was: "movq      16(%0), %%mm1\n"     // B7G7R7B6G6R6B5G5 -> mm1
                // ------------------------------
                // (uses: mm1, mm7)
                "movd      28+%5,    %%mm1\n"
                "psllq     $24,     %%mm1\n"
                "movd      24+%5,    %%mm7\n"
                "por       %%mm7,   %%mm1\n"
                "psllq     $16,     %%mm1\n"
                "movd      20+%5,    %%mm7\n"
                "psrlq     $8,      %%mm7\n"      
                "por       %%mm7,   %%mm1\n"
                // ------------------------------

                "movq      %%mm1,    %%mm7\n"  // B7G7R7B6G6R6B5G5 -> mm7
                "psrad     $15,      %%mm6\n"  // 32-bit scaled U3U2 -> mm6

                "paddd     %%mm5,    %%mm3\n"  // V3V2 -> mm3
                "psllq     $16,      %%mm7\n"  // R7B6G6R6B5G500 -> mm7
                
                "movq      %%mm7,    %%mm5\n"  // R7B6G6R6B5G500 -> mm5
                "psrad     $15,      %%mm3\n"  // 32-bit scaled V3V2 -> mm3
                
                "movq      %%mm0,    44+%5\n"  // 32-bit scaled Y3Y2Y1Y0 -> TEMPY

                "packssdw  %%mm6,    %%mm2\n"  // 32-bit scaled U3U2U1U0 -> mm2
                
                "movq      36+%5,    %%mm0\n"  // R5B4G4R4 -> mm0
                
                "punpcklbw ZEROSX,   %%mm7\n"  // B5G500 -> mm7
                "movq      %%mm0,    %%mm6\n"  // R5B4G4R4 -> mm6
                
                "movq      %%mm2,    52+%5\n"  // 32-bit scaled U3U2U1U0 -> TEMPU
                "psrlq     $32,      %%mm0\n"  // 00R5B4 -> mm0
                
                "paddw     %%mm0,    %%mm7\n"  // B5G5R5B4 -> mm7
                "movq      %%mm6,    %%mm2\n"  // R5B4G4R4 -> mm2
                
                "pmaddwd   YR0GRX,   %%mm2\n"  // yrR5,ygG4+yrR4 -> mm2
                "movq      %%mm7,    %%mm0\n"  // B5G5R5B4 -> mm0
                
                "pmaddwd   YBG0BX,   %%mm7\n"  // ybB5+ygG5,ybB4 -> mm7
                "packssdw  %%mm3,    %%mm4\n"  // 32-bit scaled V3V2V1V0 -> mm4

                //----------------------------------------------------------------------
                
                "movq      %%mm4,    60+%5\n"  // 32-bit scaled V3V2V1V0 -> TEMPV

                "movq      %%mm6,    %%mm4\n"  // R5B4G4R4 -> mm4
                
                "pmaddwd   UR0GRX,   %%mm6\n"  // urR5,ugG4+urR4
                "movq      %%mm0,    %%mm3\n"  // B5G5R5B4 -> mm3
                
                "pmaddwd   UBG0BX,   %%mm0\n"  // ubB5+ugG5,ubB4
                "paddd     %%mm7,    %%mm2\n"  // Y5Y4 -> mm2

                //----------------------------------------------------------------------
                
                "pmaddwd   VR0GRX,   %%mm4\n"  // vrR5,vgG4+vrR4 -> mm4
                "pxor      %%mm7,    %%mm7\n"  // 0 -> mm7
                
                "pmaddwd   VBG0BX,   %%mm3\n"  // vbB5+vgG5,vbB4 -> mm3
                "punpckhbw %%mm7,    %%mm1\n"  // B7G7R7B6 -> mm1
                
                "paddd     %%mm6,    %%mm0\n"  // U5U4 -> mm0
                "movq      %%mm1,    %%mm6\n"  // B7G7R7B6 -> mm6
                
                "pmaddwd   YBG0BX,   %%mm6\n"  // ybB7+ygG7,ybB6 -> mm6
                "punpckhbw %%mm7,    %%mm5\n"  // R7B6G6R6 -> mm5
                
                "movq      %%mm5,    %%mm7\n"  // R7B6G6R6 -> mm7
                "paddd     %%mm4,    %%mm3\n"  // V5V4 -> mm3
                
                "pmaddwd   YR0GRX,   %%mm5\n"  // yrR7,ygG6+yrR6 -> mm5
                "movq      %%mm1,    %%mm4\n"  // B7G7R7B6 -> mm4
                
                "pmaddwd   UBG0BX,   %%mm4\n"  // ubB7+ugG7,ubB6 -> mm4
                "psrad     $15,      %%mm0\n"  // 32-bit scaled U5U4 -> %%mm0
                
                //----------------------------------------------------------------------
                
                "paddd     OFFSETWX, %%mm0\n"  // add offset to U5U4 -> mm0
                "psrad     $15,      %%mm2\n"  // 32-bit scaled Y5Y4 -> mm2
                
                "paddd     %%mm5,    %%mm6\n"  // Y7Y6 -> mm6
                "movq      %%mm7,    %%mm5\n"  // R7B6G6R6 -> mm5
                
                "pmaddwd   UR0GRX,   %%mm7\n"  // urR7,ugG6+urR6 -> mm7
                "psrad     $15,      %%mm3\n"  // 32-bit scaled V5V4 -> mm3
                
                "pmaddwd   VBG0BX,   %%mm1\n"  // vbB7+vgG7,vbB6 -> mm1
                "psrad     $15,      %%mm6\n"  // 32-bit scaled Y7Y6 -> mm6
                
                "paddd     OFFSETDX, %%mm4\n"  // add offset to U7U6 
                "packssdw  %%mm6,    %%mm2\n"  // Y7Y6Y5Y4 -> mm2
                
                "pmaddwd   VR0GRX,   %%mm5\n"  // vrR7,vgG6+vrR6 -> mm5
                "paddd     %%mm4,    %%mm7\n"  // U7U6 -> mm7    
                
                "psrad     $15,      %%mm7\n"  // 32-bit scaled U7U6 -> mm7

                //----------------------------------------------------------------------
                
                "movq      44+%5,    %%mm6\n"  // 32-bit scaled Y3Y2Y1Y0 -> mm6
                "packssdw  %%mm7,    %%mm0\n"  // 32-bit scaled U7U6U5U4 -> mm0
                
                "movq      52+%5,    %%mm4\n"  // 32-bit scaled U3U2U1U0 -> mm4
                "packuswb  %%mm2,    %%mm6\n"  // all 8 Y values -> mm6
                
                "movq      OFFSETBX, %%mm7\n"  // 128,128,128,128 -> mm7
                "paddd     %%mm5,    %%mm1\n"  // V7V6 -> mm1
                
                "paddw     %%mm7,    %%mm4\n"  // add offset to U3U2U1U0/256 
                "psrad     $15,      %%mm1\n"  // 32-bit scaled V7V6 -> mm1
                
                //----------------------------------------------------------------------

                "movq      %%mm6,    (%1)\n"     // store Y

                "packuswb  %%mm0,    %%mm4\n"    // all 8 U values -> mm4
                "movq      60+%5,    %%mm5\n"  // 32-bit scaled V3V2V1V0 -> mm5

                "packssdw  %%mm1,    %%mm3\n"  // V7V6V5V4 -> mm3
                "paddw     %%mm7,    %%mm5\n"  // add offset to  V3V2V1V0
                "paddw     %%mm7,    %%mm3\n"  // add offset to  V7V6V5V4
                
                "packuswb  %%mm3,    %%mm5\n"  // ALL 8 V values -> mm5

                // mask U and V with CLEARX, then pack both into mm4:
                // low dword = U bytes, high dword = V bytes
                // (presumably CLEARX keeps every second sample -- TODO confirm)
                "movq      CLEARX, %%mm2\n"
                "pand      %%mm2, %%mm4\n"
                "pand      %%mm2, %%mm5\n"

                "packuswb  %%mm5, %%mm4\n" 

                "movd      %%mm4,    (%2)\n"     // store 4 U bytes
                "psrlq     $32,      %%mm4\n"
                "movd      %%mm4,    (%3)\n"     // store 4 V bytes

                "add       $8,       %1\n"
                "add       $4,       %2\n"
                "add       $4,       %3\n"
                
                // explicit 32-bit suffix: the destination is the int in memory
                "subl      $8,       %4\n"
                "jnz       1b\n"

                "emms\n"

                : "+r" (rgb), "+r" (lum), "+r" (cr), "+r" (cb),
                "+m" (pixel), "=m" (buf)
                :
                : "memory", "cc"
                );
}
 
491
 
 
492
// Faster, lower-precision MMX conversion of one row of 16-bit pixels, eight
// pixels per loop pass.
//   rgb    source pixels; 16 bytes are consumed per pass
//   lum    receives 8 luma bytes per pass
//   cr     receives 4 bytes per pass (the "U" results)
//   cb     receives 4 bytes per pass (the "V" results)
//   pixel  number of pixels in the row; rounded down to a multiple of 8,
//          nothing is done when fewer than 8 remain
//
// The three bit fields of each pixel are isolated with KEEPR/KEEPG/KEEPB and
// shifted down to bit 0, multiplied by the Y_*/U_*/V_* weight tables with
// 16-bit multiplies, summed and shifted right by 7. The register comments
// call the KEEPB field R and the KEEPR field B.
// OFFSETBX and CLEARX are constants defined outside this function.
void rgb2yuv16bit_mmx422_row_fast(unsigned char* rgb, 
                                  unsigned char* lum, unsigned char* cr,
                                  unsigned char* cb, int pixel)
{
        // The loop subtracts 8 per pass and stops only on exactly zero, so a
        // count that is not a positive multiple of 8 would never terminate.
        pixel &= ~7;
        if (pixel <= 0) {
                return;
        }

        // Notes on the asm interface:
        //  - numeric local label ("1:" / "1b") so the asm text may appear more
        //    than once in the object file without a duplicate-symbol error
        //  - every operand the code modifies is declared read-write ("+")
        //  - "memory" because the code stores through lum/cr/cb, "cc" because
        //    add/sub change the flags
        __asm__ __volatile__ (
                "1:\n"

                // unpack hicolor ( pixel 0 - 3)
                "movq      (%0),    %%mm0\n"

                "movq      %%mm0,   %%mm1\n"
                "pand      KEEPR,   %%mm1\n"  
                "psrlq     $11,     %%mm1\n"  // B3B2B1B0 -> mm1

                "movq      %%mm0,   %%mm2\n"
                "pand      KEEPG,   %%mm2\n"
                "psrlq     $5,      %%mm2\n"  // G3G2G1G0 -> mm2

                "movq      %%mm0,   %%mm3\n"
                "pand      KEEPB,   %%mm3\n"  // R3R2R1R0 -> mm3

                // unpack hicolor ( pixel 4 - 7)
                "movq      8(%0),    %%mm0\n"

                "movq      %%mm0,   %%mm4\n"
                "pand      KEEPR,   %%mm4\n"  
                "psrlq     $11,     %%mm4\n"  // B7B6B5B4 -> mm4

                "movq      %%mm0,   %%mm5\n"
                "pand      KEEPG,   %%mm5\n"
                "psrlq     $5,      %%mm5\n"  // G7G6G5G4 -> mm5

                "movq      %%mm0,   %%mm6\n"
                "pand      KEEPB,   %%mm6\n"  // R7R6R5R4 -> mm6

                // calculate Y (pixel 4 - 7 first)
                "movq      %%mm6,   %%mm7\n"
                "pmullw    Y_RED,   %%mm7\n"

                "movq      %%mm5,   %%mm0\n"
                "pmullw    Y_GREEN, %%mm0\n"
                "paddw     %%mm0,   %%mm7\n"

                "movq      %%mm4,   %%mm0\n"
                "pmullw    Y_BLUE,  %%mm0\n"
                "paddw     %%mm0,   %%mm7\n"

                "psrlw     $7,      %%mm7\n"  // Y7Y6Y5Y4 -> mm7
                
                "pxor      %%mm0,   %%mm0\n"
                "packuswb  %%mm0,   %%mm7\n"
                "movd      %%mm7,   4(%1)\n"  // Y7Y6Y5Y4 -> lum[4..7]

                // -------- (pixel 0 - 3)
                
                "movq      %%mm3,   %%mm7\n"
                "pmullw    Y_RED,   %%mm7\n"

                "movq      %%mm2,   %%mm0\n"
                "pmullw    Y_GREEN, %%mm0\n"
                "paddw     %%mm0,   %%mm7\n"

                "movq      %%mm1,   %%mm0\n"
                "pmullw    Y_BLUE,  %%mm0\n"
                "paddw     %%mm0,   %%mm7\n"

                "psrlw     $7,      %%mm7\n"  // Y3Y2Y1Y0 -> mm7
                
                "pxor      %%mm0,   %%mm0\n"
                "packuswb  %%mm0,   %%mm7\n"
                "movd      %%mm7,   (%1)\n"   // Y3Y2Y1Y0 -> lum[0..3]
                "add       $8,      %1\n"

                // pack RGB: bytes of all 8 pixels into one register per
                // field, then mask with CLEARX
                // (presumably keeps every second pixel -- TODO confirm)
                "packuswb  %%mm4,   %%mm1\n"
                "pand      CLEARX,  %%mm1\n"  // B6B4B2B0 -> mm1
                "packuswb  %%mm5,   %%mm2\n"
                "pand      CLEARX,  %%mm2\n"  // G6G4G2G0 -> mm2
                "packuswb  %%mm6,   %%mm3\n"
                "pand      CLEARX,  %%mm3\n"  // R6R4R2R0 -> mm3

                // calculate U
                "movq      %%mm3,   %%mm7\n"
                "pmullw    U_RED,   %%mm7\n"

                "movq      %%mm2,   %%mm0\n"
                "pmullw    U_GREEN, %%mm0\n"
                "paddw     %%mm0,   %%mm7\n"

                "movq      %%mm1,   %%mm0\n"
                "pmullw    U_BLUE,  %%mm0\n"
                "paddw     %%mm0,   %%mm7\n"

                "psrlw     $7,      %%mm7\n"  // four U values -> mm7
                "paddw     OFFSETBX,%%mm7\n"
                "pand      CLEARX,  %%mm7\n"
                
                "pxor      %%mm0,   %%mm0\n"
                "packuswb  %%mm0,   %%mm7\n"
                "movd      %%mm7,   (%2)\n"   // four U bytes -> cr
                "add       $4,      %2\n"

                // calculate V
                "movq      %%mm3,   %%mm7\n"
                "pmullw    V_RED,   %%mm7\n"

                "movq      %%mm2,   %%mm0\n"
                "pmullw    V_GREEN, %%mm0\n"
                "paddw     %%mm0,   %%mm7\n"

                "movq      %%mm1,   %%mm0\n"
                "pmullw    V_BLUE,  %%mm0\n"
                "paddw     %%mm0,   %%mm7\n"

                "psrlw     $7,      %%mm7\n"  // four V values -> mm7
                "paddw     OFFSETBX,%%mm7\n"
                "pand      CLEARX,  %%mm7\n"
                
                "pxor      %%mm0,   %%mm0\n"
                "packuswb  %%mm0,   %%mm7\n"
                "movd      %%mm7,   (%3)\n"   // four V bytes -> cb
                "add       $4,      %3\n"

                "add       $16,     %0\n"
                
                // explicit 32-bit suffix: the destination is the int in memory
                "subl      $8,      %4\n"
                "jnz       1b\n"

                "emms\n"

                : "+r" (rgb), "+r" (lum), "+r" (cr), "+r" (cb), "+m" (pixel)
                :
                : "memory", "cc"
                );
}
 
627
 
 
628
void rgb2y16bit_mmx_row(unsigned char* rgbSource, 
 
629
                        unsigned char* lum, int pixel)
 
630
{
 
631
        unsigned int buf[16];
 
632
 
 
633
        // 36%3 = TEMP0
 
634
        // 44%3 = TEMPY
 
635
        
 
636
        __asm__ __volatile__ (
 
637
                "rgb2y_16:\n"
 
638
 
 
639
                // unpack hicolor ( pixel 1 - 4)
 
640
                "movq      (%0),    %%mm0\n"
 
641
 
 
642
                "movq      %%mm0,   %%mm1\n"
 
643
                "pand      KEEPR,   %%mm1\n"  
 
644
                "psrlq     $8,      %%mm1\n"  // B3B2B1B0 -> mm1
 
645
                "movq      %%mm0,   %%mm2\n"
 
646
                "pand      KEEPG,   %%mm2\n"
 
647
                "psrlq     $3,      %%mm2\n"  // G3G2G1G0 -> mm2
 
648
                "movq      %%mm0,   %%mm3\n"
 
649
                "pand      KEEPB,   %%mm3\n" 
 
650
                "psllq     $3,      %%mm3\n"  // G3G2G1G0 -> mm3
 
651
 
 
652
                "movq      %%mm2,   %%mm0\n"
 
653
                "punpcklbw %%mm1,   %%mm2\n" 
 
654
                "punpckhbw %%mm1,   %%mm0\n" 
 
655
 
 
656
                "pxor      %%mm5,   %%mm5\n"
 
657
                "movq      %%mm3,   %%mm4\n"
 
658
                "punpcklbw %%mm5,   %%mm3\n" 
 
659
                "punpckhbw %%mm5,   %%mm4\n" 
 
660
                
 
661
                "psllq     $8,      %%mm2\n"
 
662
                "por       %%mm2,   %%mm3\n"  // 0B1G1R10B0G0G0 -> mm3
 
663
                "psllq     $8,      %%mm0\n"
 
664
                "por       %%mm0,   %%mm4\n"  // 0B3G3R30B2G2G2 -> mm4
 
665
 
 
666
                "movq      %%mm3,   %3\n"
 
667
                "movq      %%mm4,   8%3\n"
 
668
 
 
669
                // next 4 pixels ------------------------------
 
670
 
 
671
                "movq      8(%0),    %%mm0\n"
 
672
 
 
673
                "movq      %%mm0,   %%mm1\n"
 
674
                "pand      KEEPR,   %%mm1\n"  
 
675
                "psrlq     $8,      %%mm1\n"  // B3B2B1B0 -> mm1
 
676
                "movq      %%mm0,   %%mm2\n"
 
677
                "pand      KEEPG,   %%mm2\n"
 
678
                "psrlq     $3,      %%mm2\n"  // G3G2G1G0 -> mm2
 
679
                "movq      %%mm0,   %%mm3\n"
 
680
                "pand      KEEPB,   %%mm3\n" 
 
681
                "psllq     $3,      %%mm3\n"  // G3G2G1G0 -> mm3
 
682
 
 
683
                "movq      %%mm2,   %%mm0\n"
 
684
                "punpcklbw %%mm1,   %%mm2\n" 
 
685
                "punpckhbw %%mm1,   %%mm0\n" 
 
686
 
 
687
                "pxor      %%mm5,   %%mm5\n"
 
688
                "movq      %%mm3,   %%mm4\n"
 
689
                "punpcklbw %%mm5,   %%mm3\n" 
 
690
                "punpckhbw %%mm5,   %%mm4\n" 
 
691
                
 
692
                "psllq     $8,      %%mm2\n"
 
693
                "por       %%mm2,   %%mm3\n"  // 0B1G1R10B0G0G0 -> mm3
 
694
                "psllq     $8,      %%mm0\n"
 
695
                "por       %%mm0,   %%mm4\n"  // 0B3G3R30B2G2G2 -> mm4
 
696
 
 
697
                "movq      %%mm3,   16%3\n"
 
698
                "movq      %%mm4,   24%3\n"
 
699
 
 
700
                "add       $16,     %0\n"
 
701
 
 
702
                // standard algorithm --------------------------------------------------
 
703
 
 
704
                // pack rgb
 
705
                // was: "movq      (%0), %%mm1\n"  // load G2R2B1G1R1B0G0R0
 
706
                // ------------------------------
 
707
                // (uses: mm0, mm1)
 
708
                "movd      8%3,    %%mm0\n"
 
709
                "psllq     $24,      %%mm0\n"  
 
710
                "movd      4%3,    %%mm1\n"
 
711
                "por       %%mm1,    %%mm0\n"
 
712
                "psllq     $24,      %%mm0\n"
 
713
                "movd      %3,     %%mm1\n"
 
714
                "por       %%mm0,    %%mm1\n"
 
715
                // ------------------------------
 
716
 
 
717
                "pxor      %%mm6,   %%mm6\n"  // 0 -> mm6
 
718
                "movq      %%mm1,   %%mm0\n"  // G2R2B1G1R1B0G0R0 -> mm0
 
719
                "psrlq     $16,     %%mm1\n"  // 00G2R2B1G1R1B0 -> mm1
 
720
                "punpcklbw ZEROSX,  %%mm0\n"  // R1B0G0R0 -> mm0
 
721
                "movq      %%mm1,   %%mm7\n"  // 00G2R2B1G1R1B0 -> mm7
 
722
                "punpcklbw ZEROSX,  %%mm1\n"  // B1G1R1B0 -> mm1
 
723
                "movq      %%mm0,   %%mm2\n"  // R1B0G0R0 -> mm2
 
724
                "pmaddwd   YR0GRX,  %%mm0\n"  // yrR1,ygG0+yrR0 -> mm0
 
725
                "movq      %%mm1,   %%mm3\n"  // B1G1R1B0 -> mm3
 
726
                "pmaddwd   YBG0BX,  %%mm1\n"  // ybB1+ygG1,ybB0 -> mm1
 
727
                "movq      %%mm2,   %%mm4\n"  // R1B0G0R0 -> mm4
 
728
                "movq      %%mm3,   %%mm5\n"  // B1G1R1B0 -> mm5
 
729
                "punpckhbw %%mm6,   %%mm7\n"  // 00G2R2 -> mm7
 
730
                "paddd     %%mm1,   %%mm0\n"  // Y1Y0 -> mm0
 
731
 
 
732
                // pack rgb
 
733
                // was: "movq      8(%0),%%mm1\n"     // R5B4G4R4B3G3R3B2 -> mm1
 
734
                // ------------------------------
 
735
                // (uses: mm1, mm6)
 
736
                "movd      20%3,  %%mm1\n"
 
737
                "psllq     $24,     %%mm1\n"
 
738
                "movd      16%3,  %%mm6\n"
 
739
                "por       %%mm6,   %%mm1\n"
 
740
                "psllq     $24,     %%mm1\n"
 
741
                "movd      12%3,  %%mm6\n"
 
742
                "por       %%mm6,   %%mm1\n"
 
743
                "psllq     $8,      %%mm1\n"      
 
744
                "movd      8%3,   %%mm6\n"
 
745
                "psrlq     $16,     %%mm6\n"      
 
746
                "por       %%mm6,   %%mm1\n"
 
747
                // ------------------------------
 
748
 
 
749
                "movq      %%mm1,   %%mm6\n"  // R5B4G4R4B3G3R3B2 -> mm6
 
750
                "punpcklbw ZEROSX,  %%mm1\n"  // B3G3R3B2 -> mm1
 
751
 
 
752
                //----------------------------------------------------------------------                
 
753
                
 
754
                "movq      %%mm1,   %%mm5\n"  // B3G3R3B2 -> mm5
 
755
                "psllq     $32,     %%mm1\n"  // R3B200 -> mm1
 
756
                
 
757
                "paddd     %%mm7,   %%mm1\n"  // R3B200+00G2R2=R3B2G2R2->mm1
 
758
                
 
759
                "punpckhbw ZEROSX,  %%mm6\n"  // R5B4G4R3 -> mm6
 
760
                "movq      %%mm1,   %%mm3\n"  // R3B2G2R2 -> mm3
 
761
                
 
762
                "pmaddwd   YR0GRX,  %%mm1\n"  // yrR3,ygG2+yrR2 -> mm1
 
763
                "movq      %%mm5,   %%mm7\n"  // B3G3R3B2 -> mm7
 
764
                
 
765
                "pmaddwd   YBG0BX,  %%mm5\n"  // ybB3+ygG3,ybB2 -> mm5
 
766
                "psrad     $15,     %%mm0\n"  // 32-bit scaled Y1Y0 -> mm0
 
767
                
 
768
                "movq      %%mm6,   36%3\n"  // R5B4G4R4 -> TEMP0
 
769
                "movq      %%mm3,   %%mm6\n"  // R3B2G2R2 -> mm6
 
770
                
 
771
                "paddd     %%mm5,   %%mm1\n"  // Y3Y2 -> mm1
 
772
                "movq      %%mm7,   %%mm5\n"  // B3G3R3B2 -> mm5
 
773
                "psrad     $15,     %%mm1\n"  // 32-bit scaled Y3Y2 -> mm1
 
774
                
 
775
                "packssdw  %%mm1,   %%mm0\n"  // Y3Y2Y1Y0 -> mm0
 
776
 
 
777
                //----------------------------------------------------------------------
 
778
 
 
779
                // pack rgb
 
780
                // was: "movq      16(%0), %%mm1\n"     // B7G7R7B6G6R6B5G5 -> mm1
 
781
                // ------------------------------
 
782
                // (uses: mm1, mm7)
 
783
                "movd      28%3,  %%mm1\n"
 
784
                "psllq     $24,     %%mm1\n"
 
785
                "movd      24%3,  %%mm7\n"
 
786
                "por       %%mm7,   %%mm1\n"
 
787
                "psllq     $16,     %%mm1\n"
 
788
                "movd      20%3,  %%mm7\n"
 
789
                "psrlq     $8,      %%mm7\n"      
 
790
                "por       %%mm7,   %%mm1\n"
 
791
                // ------------------------------
 
792
                
 
793
                "movq      %%mm1,    %%mm7\n"  // B7G7R7B6G6R6B5G5 -> mm1
 
794
                
 
795
                "psllq     $16,      %%mm7\n"  // R7B6G6R6B5G500 -> mm7
 
796
                
 
797
                "movq      %%mm7,    %%mm5\n"  // R7B6G6R6B5G500 -> mm5
 
798
                
 
799
                "movq      %%mm0,    44%3\n"  // 32-bit scaled Y3Y2Y1Y0 -> TEMPY
 
800
                
 
801
                "movq      36%3,    %%mm0\n"  // R5B4G4R4 -> mm0
 
802
                
 
803
                "punpcklbw ZEROSX,   %%mm7\n"  // B5G500 -> mm7
 
804
                "movq      %%mm0,    %%mm6\n"  // R5B4G4R4 -> mm6
 
805
                
 
806
                "psrlq     $32,      %%mm0\n"  // 00R5B4 -> mm0
 
807
                
 
808
                "paddw     %%mm0,    %%mm7\n"  // B5G5R5B4 -> mm7
 
809
                "movq      %%mm6,    %%mm2\n"  // B5B4G4R4 -> mm2
 
810
                
 
811
                "pmaddwd   YR0GRX,   %%mm2\n"  // yrR5,ygG4+yrR4 -> mm2
 
812
                
 
813
                "pmaddwd   YBG0BX,   %%mm7\n"  // ybB5+ygG5,ybB4 -> mm7
 
814
 
 
815
                //----------------------------------------------------------------------
 
816
                "paddd     %%mm7,    %%mm2\n"  // Y5Y4 -> mm2
 
817
                
 
818
                //----------------------------------------------------------------------
 
819
                
 
820
                "pxor      %%mm7,    %%mm7\n"  // 0 -> mm7
 
821
                
 
822
                "punpckhbw %%mm7,    %%mm1\n"  // B7G7R7B6 -> mm1
 
823
                
 
824
                "movq      %%mm1,    %%mm6\n"  // B7G7R7B6 -> mm6
 
825
                
 
826
                "pmaddwd   YBG0BX,   %%mm6\n"  // ybB7+ygG7,ybB6 -> mm6
 
827
                "punpckhbw %%mm7,    %%mm5\n"  // R7B6G6R6 -> mm5
 
828
                
 
829
                "pmaddwd   YR0GRX,   %%mm5\n"  // yrR7,ygG6+yrR6 -> mm5
 
830
                
 
831
                //----------------------------------------------------------------------
 
832
 
 
833
                "psrad     $15,      %%mm2\n"  // 32-bit scaled Y5Y4 -> mm2
 
834
                
 
835
                "paddd     %%mm5,    %%mm6\n"  // Y7Y6 -> mm6
 
836
                "psrad     $15,      %%mm6\n"  // 32-bit scaled Y7Y6 -> mm6
 
837
                
 
838
                "packssdw  %%mm6,    %%mm2\n"  // Y7Y6Y5Y4 -> mm2
 
839
 
 
840
                //----------------------------------------------------------------------
 
841
                
 
842
                "movq      44%3,    %%mm6\n"  // 32-bit scaled Y3Y2Y1Y0 -> mm6
 
843
                "packuswb  %%mm2,    %%mm6\n"  // all 8 Y values -> mm6
 
844
                
 
845
                //----------------------------------------------------------------------
 
846
 
 
847
                "movq      %%mm6,  (%1)\n"     // store Y
 
848
                
 
849
                "add       $8,     %1\n"
 
850
                
 
851
                "sub       $8,     %2\n"
 
852
                "jnz       rgb2y_16\n"
 
853
                "emms\n"
 
854
 
 
855
                :
 
856
                : "r" (rgbSource), "r" (lum), "m" (pixel), "m" (buf)
 
857
                
 
858
        );
 
859
}
 
860
 
 
861
/*
 * Convert one row of 16-bit packed pixels to 8-bit luminance using MMX.
 *
 * rgb   - source row, two bytes per pixel; read 8 bytes (4 pixels) at a time.
 * lum   - destination row, one byte per pixel; written 4 bytes at a time.
 * pixel - number of pixels in the row.  Only whole groups of four pixels are
 *         converted; a count below 4 converts nothing.  (The previous code
 *         looped until the counter hit exactly zero, so a non-positive count
 *         or one that was not a multiple of 4 ran far past both buffers.)
 *
 * Each pixel is split with the KEEPR/KEEPG/KEEPB word masks (0xF800, 0x07E0,
 * 0x001F) and the three fields are weighted with Y_BLUE, Y_GREEN and Y_RED
 * respectively, following the register comments of the original code.
 * NOTE(review): which mask really selects red vs. blue depends on the pixel
 * layout delivered by the caller - confirm against the caller.
 *
 * The mask and weight tables are referenced by symbol name from inside the
 * asm text, so they must stay visible to the assembler under exactly these
 * names.
 */
void rgb2y16bit_mmx_row_fast(unsigned char* rgb, unsigned char* lum, int pixel)
{
        int remaining;

        // Nothing to do for less than one full group of four pixels.
        if (pixel < 4)
                return;

        // Drop a trailing partial group so the counter reaches exactly zero.
        remaining = pixel & ~3;

        __asm__ __volatile__ (
        // Numeric local label: safe even if the compiler emits this asm twice.
        "1:\n"

                // load four 16-bit pixels
                "movq      (%0),    %%mm0\n"

                "movq      %%mm0,   %%mm1\n"
                "pand      KEEPR,   %%mm1\n"
                "psrlq     $11,     %%mm1\n"  // bits 15..11 of each pixel -> mm1

                "movq      %%mm0,   %%mm2\n"
                "pand      KEEPG,   %%mm2\n"
                "psrlq     $5,      %%mm2\n"  // bits 10..5 of each pixel -> mm2

                "movq      %%mm0,   %%mm3\n"
                "pand      KEEPB,   %%mm3\n"  // bits 4..0 of each pixel -> mm3

                // weighted sum; 31*307 + 63*302 + 31*117 = 32170 fits in
                // 16 bits, so keeping only the low word of each product is exact
                "movq      %%mm3,   %%mm4\n"
                "pmullw    Y_RED,   %%mm4\n"

                "movq      %%mm2,   %%mm5\n"
                "pmullw    Y_GREEN, %%mm5\n"
                "paddw     %%mm5,   %%mm4\n"

                "movq      %%mm1,   %%mm6\n"
                "pmullw    Y_BLUE,  %%mm6\n"
                "paddw     %%mm6,   %%mm4\n"

                "psrlw     $7,      %%mm4\n"  // scale: Y3Y2Y1Y0 (words) -> mm4

                // narrow the four words to four bytes
                "pxor      %%mm5,   %%mm5\n"
                "packuswb  %%mm5,   %%mm4\n"

                "movd      %%mm4,   (%1)\n"   // store four Y bytes
                "add       $4,      %1\n"

                "add       $8,      %0\n"

                "subl      $4,      %2\n"
                "jnz       1b\n"

                // leave MMX state so later x87 code works
                "emms\n"

                // All three operands are advanced inside the loop, so they
                // must be declared read-write rather than input-only.
                : "+r" (rgb), "+r" (lum), "+r" (remaining)
                :
                : "memory", "cc",
                  "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6"
    );
}
 
910
 
 
911
 
 
912
#endif
 
913
// INTEL
 
914
 
 
915