2
Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
4
This program is free software; you can redistribute it and/or modify
5
it under the terms of the GNU General Public License as published by
6
the Free Software Foundation; either version 2 of the License, or
7
(at your option) any later version.
9
This program is distributed in the hope that it will be useful,
10
but WITHOUT ANY WARRANTY; without even the implied warranty of
11
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12
GNU General Public License for more details.
14
You should have received a copy of the GNU General Public License
15
along with this program; if not, write to the Free Software
16
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
28
/* On K6 femms is faster of emms. On K7 femms is directly mapped on emms. */
35
#define PREFETCH "prefetch"
36
#define PREFETCHW "prefetchw"
37
#elif defined ( HAVE_MMX2 )
38
#define PREFETCH "prefetchnta"
39
#define PREFETCHW "prefetcht0"
41
#define PREFETCH "/nop"
42
#define PREFETCHW "/nop"
46
#define SFENCE "sfence"
52
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
53
#elif defined (HAVE_3DNOW)
54
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
58
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
60
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
62
#define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
65
#include "swscale_altivec_template.c"
68
#define YSCALEYUV2YV12X(x, offset) \
69
"xor %%"REG_a", %%"REG_a" \n\t"\
70
"movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
71
"movq %%mm3, %%mm4 \n\t"\
72
"lea " offset "(%0), %%"REG_d" \n\t"\
73
"mov (%%"REG_d"), %%"REG_S" \n\t"\
74
".balign 16 \n\t" /* FIXME Unroll? */\
76
"movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
77
"movq " #x "(%%"REG_S", %%"REG_a", 2), %%mm2\n\t" /* srcData */\
78
"movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm5\n\t" /* srcData */\
79
"add $16, %%"REG_d" \n\t"\
80
"mov (%%"REG_d"), %%"REG_S" \n\t"\
81
"test %%"REG_S", %%"REG_S" \n\t"\
82
"pmulhw %%mm0, %%mm2 \n\t"\
83
"pmulhw %%mm0, %%mm5 \n\t"\
84
"paddw %%mm2, %%mm3 \n\t"\
85
"paddw %%mm5, %%mm4 \n\t"\
87
"psraw $3, %%mm3 \n\t"\
88
"psraw $3, %%mm4 \n\t"\
89
"packuswb %%mm4, %%mm3 \n\t"\
90
MOVNTQ(%%mm3, (%1, %%REGa))\
91
"add $8, %%"REG_a" \n\t"\
92
"cmp %2, %%"REG_a" \n\t"\
93
"movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
94
"movq %%mm3, %%mm4 \n\t"\
95
"lea " offset "(%0), %%"REG_d" \n\t"\
96
"mov (%%"REG_d"), %%"REG_S" \n\t"\
99
#define YSCALEYUV2YV121 \
100
"mov %2, %%"REG_a" \n\t"\
101
".balign 16 \n\t" /* FIXME Unroll? */\
103
"movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
104
"movq 8(%0, %%"REG_a", 2), %%mm1\n\t"\
105
"psraw $7, %%mm0 \n\t"\
106
"psraw $7, %%mm1 \n\t"\
107
"packuswb %%mm1, %%mm0 \n\t"\
108
MOVNTQ(%%mm0, (%1, %%REGa))\
109
"add $8, %%"REG_a" \n\t"\
113
:: "m" (-lumFilterSize), "m" (-chrFilterSize),
114
"m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
115
"r" (dest), "m" (dstW),
116
"m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
117
: "%eax", "%ebx", "%ecx", "%edx", "%esi"
119
#define YSCALEYUV2PACKEDX \
120
"xor %%"REG_a", %%"REG_a" \n\t"\
124
"lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
125
"mov (%%"REG_d"), %%"REG_S" \n\t"\
126
"movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
127
"movq %%mm3, %%mm4 \n\t"\
130
"movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
131
"movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
132
"movq 4096(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
133
"add $16, %%"REG_d" \n\t"\
134
"mov (%%"REG_d"), %%"REG_S" \n\t"\
135
"pmulhw %%mm0, %%mm2 \n\t"\
136
"pmulhw %%mm0, %%mm5 \n\t"\
137
"paddw %%mm2, %%mm3 \n\t"\
138
"paddw %%mm5, %%mm4 \n\t"\
139
"test %%"REG_S", %%"REG_S" \n\t"\
142
"lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
143
"mov (%%"REG_d"), %%"REG_S" \n\t"\
144
"movq "VROUNDER_OFFSET"(%0), %%mm1\n\t"\
145
"movq %%mm1, %%mm7 \n\t"\
148
"movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
149
"movq (%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y1srcData */\
150
"movq 8(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* Y2srcData */\
151
"add $16, %%"REG_d" \n\t"\
152
"mov (%%"REG_d"), %%"REG_S" \n\t"\
153
"pmulhw %%mm0, %%mm2 \n\t"\
154
"pmulhw %%mm0, %%mm5 \n\t"\
155
"paddw %%mm2, %%mm1 \n\t"\
156
"paddw %%mm5, %%mm7 \n\t"\
157
"test %%"REG_S", %%"REG_S" \n\t"\
161
#define YSCALEYUV2RGBX \
163
"psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
164
"psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
165
"movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
166
"movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
167
"pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
168
"pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
169
/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
170
"pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
171
"pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
172
"psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
173
"psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
174
"pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
175
"pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
176
/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
177
"paddw %%mm3, %%mm4 \n\t"\
178
"movq %%mm2, %%mm0 \n\t"\
179
"movq %%mm5, %%mm6 \n\t"\
180
"movq %%mm4, %%mm3 \n\t"\
181
"punpcklwd %%mm2, %%mm2 \n\t"\
182
"punpcklwd %%mm5, %%mm5 \n\t"\
183
"punpcklwd %%mm4, %%mm4 \n\t"\
184
"paddw %%mm1, %%mm2 \n\t"\
185
"paddw %%mm1, %%mm5 \n\t"\
186
"paddw %%mm1, %%mm4 \n\t"\
187
"punpckhwd %%mm0, %%mm0 \n\t"\
188
"punpckhwd %%mm6, %%mm6 \n\t"\
189
"punpckhwd %%mm3, %%mm3 \n\t"\
190
"paddw %%mm7, %%mm0 \n\t"\
191
"paddw %%mm7, %%mm6 \n\t"\
192
"paddw %%mm7, %%mm3 \n\t"\
193
/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
194
"packuswb %%mm0, %%mm2 \n\t"\
195
"packuswb %%mm6, %%mm5 \n\t"\
196
"packuswb %%mm3, %%mm4 \n\t"\
197
"pxor %%mm7, %%mm7 \n\t"
199
#define FULL_YSCALEYUV2RGB \
200
"pxor %%mm7, %%mm7 \n\t"\
201
"movd %6, %%mm6 \n\t" /*yalpha1*/\
202
"punpcklwd %%mm6, %%mm6 \n\t"\
203
"punpcklwd %%mm6, %%mm6 \n\t"\
204
"movd %7, %%mm5 \n\t" /*uvalpha1*/\
205
"punpcklwd %%mm5, %%mm5 \n\t"\
206
"punpcklwd %%mm5, %%mm5 \n\t"\
207
"xor %%"REG_a", %%"REG_a" \n\t"\
210
"movq (%0, %%"REG_a", 2), %%mm0 \n\t" /*buf0[eax]*/\
211
"movq (%1, %%"REG_a", 2), %%mm1 \n\t" /*buf1[eax]*/\
212
"movq (%2, %%"REG_a",2), %%mm2 \n\t" /* uvbuf0[eax]*/\
213
"movq (%3, %%"REG_a",2), %%mm3 \n\t" /* uvbuf1[eax]*/\
214
"psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
215
"psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
216
"pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
217
"pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
218
"psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
219
"movq 4096(%2, %%"REG_a",2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
220
"psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
221
"paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
222
"movq 4096(%3, %%"REG_a",2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\
223
"paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
224
"psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
225
"psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
226
"psubw "MANGLE(w400)", %%mm3 \n\t" /* 8(U-128)*/\
227
"pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
230
"pmulhw %%mm5, %%mm4 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
231
"movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
232
"pmulhw "MANGLE(ubCoeff)", %%mm3\n\t"\
233
"psraw $4, %%mm0 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
234
"pmulhw "MANGLE(ugCoeff)", %%mm2\n\t"\
235
"paddw %%mm4, %%mm0 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
236
"psubw "MANGLE(w400)", %%mm0 \n\t" /* (V-128)8*/\
239
"movq %%mm0, %%mm4 \n\t" /* (V-128)8*/\
240
"pmulhw "MANGLE(vrCoeff)", %%mm0\n\t"\
241
"pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
242
"paddw %%mm1, %%mm3 \n\t" /* B*/\
243
"paddw %%mm1, %%mm0 \n\t" /* R*/\
244
"packuswb %%mm3, %%mm3 \n\t"\
246
"packuswb %%mm0, %%mm0 \n\t"\
247
"paddw %%mm4, %%mm2 \n\t"\
248
"paddw %%mm2, %%mm1 \n\t" /* G*/\
250
"packuswb %%mm1, %%mm1 \n\t"
253
#define REAL_YSCALEYUV2PACKED(index, c) \
254
"movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
255
"movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1\n\t"\
256
"psraw $3, %%mm0 \n\t"\
257
"psraw $3, %%mm1 \n\t"\
258
"movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c")\n\t"\
259
"movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c")\n\t"\
260
"xor "#index", "#index" \n\t"\
263
"movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
264
"movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
265
"movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
266
"movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
267
"psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
268
"psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
269
"movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
270
"pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
271
"pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
272
"psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
273
"psraw $7, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
274
"paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
275
"paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
276
"movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
277
"movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
278
"movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
279
"movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
280
"psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
281
"psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
282
"pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
283
"pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
284
"psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
285
"psraw $7, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
286
"paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
287
"paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
289
#define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
291
#define REAL_YSCALEYUV2RGB(index, c) \
292
"xor "#index", "#index" \n\t"\
295
"movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
296
"movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
297
"movq 4096(%2, "#index"), %%mm5\n\t" /* uvbuf0[eax+2048]*/\
298
"movq 4096(%3, "#index"), %%mm4\n\t" /* uvbuf1[eax+2048]*/\
299
"psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
300
"psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
301
"movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
302
"pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
303
"pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
304
"psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
305
"psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
306
"paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
307
"paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
308
"psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
309
"psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
310
"movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
311
"movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
312
"pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
313
"pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
314
/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
315
"movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
316
"movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
317
"movq 8(%0, "#index", 2), %%mm6\n\t" /*buf0[eax]*/\
318
"movq 8(%1, "#index", 2), %%mm7\n\t" /*buf1[eax]*/\
319
"psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
320
"psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
321
"pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
322
"pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
323
"psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
324
"psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
325
"paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
326
"paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
327
"pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
328
"pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
329
"psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
330
"psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
331
"pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
332
"pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
333
/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
334
"paddw %%mm3, %%mm4 \n\t"\
335
"movq %%mm2, %%mm0 \n\t"\
336
"movq %%mm5, %%mm6 \n\t"\
337
"movq %%mm4, %%mm3 \n\t"\
338
"punpcklwd %%mm2, %%mm2 \n\t"\
339
"punpcklwd %%mm5, %%mm5 \n\t"\
340
"punpcklwd %%mm4, %%mm4 \n\t"\
341
"paddw %%mm1, %%mm2 \n\t"\
342
"paddw %%mm1, %%mm5 \n\t"\
343
"paddw %%mm1, %%mm4 \n\t"\
344
"punpckhwd %%mm0, %%mm0 \n\t"\
345
"punpckhwd %%mm6, %%mm6 \n\t"\
346
"punpckhwd %%mm3, %%mm3 \n\t"\
347
"paddw %%mm7, %%mm0 \n\t"\
348
"paddw %%mm7, %%mm6 \n\t"\
349
"paddw %%mm7, %%mm3 \n\t"\
350
/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
351
"packuswb %%mm0, %%mm2 \n\t"\
352
"packuswb %%mm6, %%mm5 \n\t"\
353
"packuswb %%mm3, %%mm4 \n\t"\
354
"pxor %%mm7, %%mm7 \n\t"
355
#define YSCALEYUV2RGB(index, c) REAL_YSCALEYUV2RGB(index, c)
357
#define REAL_YSCALEYUV2PACKED1(index, c) \
358
"xor "#index", "#index" \n\t"\
361
"movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
362
"movq 4096(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
363
"psraw $7, %%mm3 \n\t" \
364
"psraw $7, %%mm4 \n\t" \
365
"movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
366
"movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
367
"psraw $7, %%mm1 \n\t" \
368
"psraw $7, %%mm7 \n\t" \
370
#define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
372
#define REAL_YSCALEYUV2RGB1(index, c) \
373
"xor "#index", "#index" \n\t"\
376
"movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
377
"movq 4096(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
378
"psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
379
"psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
380
"psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
381
"psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
382
"movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
383
"movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
384
"pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
385
"pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
386
/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
387
"movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
388
"movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
389
"psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
390
"psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
391
"pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
392
"pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
393
"psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
394
"psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
395
"pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
396
"pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
397
/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
398
"paddw %%mm3, %%mm4 \n\t"\
399
"movq %%mm2, %%mm0 \n\t"\
400
"movq %%mm5, %%mm6 \n\t"\
401
"movq %%mm4, %%mm3 \n\t"\
402
"punpcklwd %%mm2, %%mm2 \n\t"\
403
"punpcklwd %%mm5, %%mm5 \n\t"\
404
"punpcklwd %%mm4, %%mm4 \n\t"\
405
"paddw %%mm1, %%mm2 \n\t"\
406
"paddw %%mm1, %%mm5 \n\t"\
407
"paddw %%mm1, %%mm4 \n\t"\
408
"punpckhwd %%mm0, %%mm0 \n\t"\
409
"punpckhwd %%mm6, %%mm6 \n\t"\
410
"punpckhwd %%mm3, %%mm3 \n\t"\
411
"paddw %%mm7, %%mm0 \n\t"\
412
"paddw %%mm7, %%mm6 \n\t"\
413
"paddw %%mm7, %%mm3 \n\t"\
414
/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
415
"packuswb %%mm0, %%mm2 \n\t"\
416
"packuswb %%mm6, %%mm5 \n\t"\
417
"packuswb %%mm3, %%mm4 \n\t"\
418
"pxor %%mm7, %%mm7 \n\t"
419
#define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
421
#define REAL_YSCALEYUV2PACKED1b(index, c) \
422
"xor "#index", "#index" \n\t"\
425
"movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
426
"movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
427
"movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
428
"movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
429
"paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
430
"paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
431
"psrlw $8, %%mm3 \n\t" \
432
"psrlw $8, %%mm4 \n\t" \
433
"movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
434
"movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
435
"psraw $7, %%mm1 \n\t" \
436
"psraw $7, %%mm7 \n\t"
437
#define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
439
// do vertical chrominance interpolation
440
#define REAL_YSCALEYUV2RGB1b(index, c) \
441
"xor "#index", "#index" \n\t"\
444
"movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
445
"movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
446
"movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
447
"movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
448
"paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
449
"paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
450
"psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
451
"psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
452
"psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
453
"psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
454
"movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
455
"movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
456
"pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
457
"pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
458
/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
459
"movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
460
"movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
461
"psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
462
"psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
463
"pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
464
"pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
465
"psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
466
"psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
467
"pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
468
"pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
469
/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
470
"paddw %%mm3, %%mm4 \n\t"\
471
"movq %%mm2, %%mm0 \n\t"\
472
"movq %%mm5, %%mm6 \n\t"\
473
"movq %%mm4, %%mm3 \n\t"\
474
"punpcklwd %%mm2, %%mm2 \n\t"\
475
"punpcklwd %%mm5, %%mm5 \n\t"\
476
"punpcklwd %%mm4, %%mm4 \n\t"\
477
"paddw %%mm1, %%mm2 \n\t"\
478
"paddw %%mm1, %%mm5 \n\t"\
479
"paddw %%mm1, %%mm4 \n\t"\
480
"punpckhwd %%mm0, %%mm0 \n\t"\
481
"punpckhwd %%mm6, %%mm6 \n\t"\
482
"punpckhwd %%mm3, %%mm3 \n\t"\
483
"paddw %%mm7, %%mm0 \n\t"\
484
"paddw %%mm7, %%mm6 \n\t"\
485
"paddw %%mm7, %%mm3 \n\t"\
486
/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
487
"packuswb %%mm0, %%mm2 \n\t"\
488
"packuswb %%mm6, %%mm5 \n\t"\
489
"packuswb %%mm3, %%mm4 \n\t"\
490
"pxor %%mm7, %%mm7 \n\t"
491
#define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
493
#define REAL_WRITEBGR32(dst, dstw, index) \
494
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
495
"movq %%mm2, %%mm1 \n\t" /* B */\
496
"movq %%mm5, %%mm6 \n\t" /* R */\
497
"punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
498
"punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
499
"punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
500
"punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
501
"movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
502
"movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
503
"punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
504
"punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
505
"punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
506
"punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
508
MOVNTQ(%%mm0, (dst, index, 4))\
509
MOVNTQ(%%mm2, 8(dst, index, 4))\
510
MOVNTQ(%%mm1, 16(dst, index, 4))\
511
MOVNTQ(%%mm3, 24(dst, index, 4))\
513
"add $8, "#index" \n\t"\
514
"cmp "#dstw", "#index" \n\t"\
516
#define WRITEBGR32(dst, dstw, index) REAL_WRITEBGR32(dst, dstw, index)
518
#define REAL_WRITEBGR16(dst, dstw, index) \
519
"pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
520
"pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
521
"pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
522
"psrlq $3, %%mm2 \n\t"\
524
"movq %%mm2, %%mm1 \n\t"\
525
"movq %%mm4, %%mm3 \n\t"\
527
"punpcklbw %%mm7, %%mm3 \n\t"\
528
"punpcklbw %%mm5, %%mm2 \n\t"\
529
"punpckhbw %%mm7, %%mm4 \n\t"\
530
"punpckhbw %%mm5, %%mm1 \n\t"\
532
"psllq $3, %%mm3 \n\t"\
533
"psllq $3, %%mm4 \n\t"\
535
"por %%mm3, %%mm2 \n\t"\
536
"por %%mm4, %%mm1 \n\t"\
538
MOVNTQ(%%mm2, (dst, index, 2))\
539
MOVNTQ(%%mm1, 8(dst, index, 2))\
541
"add $8, "#index" \n\t"\
542
"cmp "#dstw", "#index" \n\t"\
544
#define WRITEBGR16(dst, dstw, index) REAL_WRITEBGR16(dst, dstw, index)
546
#define REAL_WRITEBGR15(dst, dstw, index) \
547
"pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
548
"pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
549
"pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
550
"psrlq $3, %%mm2 \n\t"\
551
"psrlq $1, %%mm5 \n\t"\
553
"movq %%mm2, %%mm1 \n\t"\
554
"movq %%mm4, %%mm3 \n\t"\
556
"punpcklbw %%mm7, %%mm3 \n\t"\
557
"punpcklbw %%mm5, %%mm2 \n\t"\
558
"punpckhbw %%mm7, %%mm4 \n\t"\
559
"punpckhbw %%mm5, %%mm1 \n\t"\
561
"psllq $2, %%mm3 \n\t"\
562
"psllq $2, %%mm4 \n\t"\
564
"por %%mm3, %%mm2 \n\t"\
565
"por %%mm4, %%mm1 \n\t"\
567
MOVNTQ(%%mm2, (dst, index, 2))\
568
MOVNTQ(%%mm1, 8(dst, index, 2))\
570
"add $8, "#index" \n\t"\
571
"cmp "#dstw", "#index" \n\t"\
573
#define WRITEBGR15(dst, dstw, index) REAL_WRITEBGR15(dst, dstw, index)
575
#define WRITEBGR24OLD(dst, dstw, index) \
576
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
577
"movq %%mm2, %%mm1 \n\t" /* B */\
578
"movq %%mm5, %%mm6 \n\t" /* R */\
579
"punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
580
"punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
581
"punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
582
"punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
583
"movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
584
"movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
585
"punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
586
"punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
587
"punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
588
"punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
590
"movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
591
"psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
592
"pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 0 */\
593
"pand "MANGLE(bm11111000)", %%mm0\n\t" /* 00RGB000 0.5 */\
594
"por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
595
"movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
596
"psllq $48, %%mm2 \n\t" /* GB000000 1 */\
597
"por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
599
"movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
600
"psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
601
"psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
602
"por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
603
"pand "MANGLE(bm00001111)", %%mm2\n\t" /* 0000RGBR 1 */\
604
"movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
605
"psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
606
"pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 2 */\
607
"pand "MANGLE(bm11111000)", %%mm1\n\t" /* 00RGB000 2.5 */\
608
"por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
609
"movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
610
"psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
611
"por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
613
"psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
614
"movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
615
"psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
616
"pand "MANGLE(bm00000111)", %%mm5\n\t" /* 00000RGB 3 */\
617
"pand "MANGLE(bm11111000)", %%mm3\n\t" /* 00RGB000 3.5 */\
618
"por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
619
"psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
620
"por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
622
MOVNTQ(%%mm0, (dst))\
623
MOVNTQ(%%mm2, 8(dst))\
624
MOVNTQ(%%mm3, 16(dst))\
625
"add $24, "#dst" \n\t"\
627
"add $8, "#index" \n\t"\
628
"cmp "#dstw", "#index" \n\t"\
631
#define WRITEBGR24MMX(dst, dstw, index) \
632
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
633
"movq %%mm2, %%mm1 \n\t" /* B */\
634
"movq %%mm5, %%mm6 \n\t" /* R */\
635
"punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
636
"punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
637
"punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
638
"punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
639
"movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
640
"movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
641
"punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
642
"punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
643
"punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
644
"punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
646
"movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
647
"movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
648
"movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
649
"movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
651
"psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
652
"psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
653
"psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
654
"psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
656
"punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
657
"punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
658
"punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
659
"punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
661
"psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
662
"movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
663
"psllq $40, %%mm2 \n\t" /* GB000000 1 */\
664
"por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
665
MOVNTQ(%%mm0, (dst))\
667
"psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
668
"movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
669
"psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
670
"por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
671
MOVNTQ(%%mm6, 8(dst))\
673
"psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
674
"psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
675
"por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
676
MOVNTQ(%%mm5, 16(dst))\
678
"add $24, "#dst" \n\t"\
680
"add $8, "#index" \n\t"\
681
"cmp "#dstw", "#index" \n\t"\
684
#define WRITEBGR24MMX2(dst, dstw, index) \
685
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
686
"movq "MANGLE(M24A)", %%mm0 \n\t"\
687
"movq "MANGLE(M24C)", %%mm7 \n\t"\
688
"pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
689
"pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
690
"pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
692
"pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
693
"pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
694
"pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
696
"psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
697
"por %%mm1, %%mm6 \n\t"\
698
"por %%mm3, %%mm6 \n\t"\
699
MOVNTQ(%%mm6, (dst))\
701
"psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
702
"pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
703
"pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
704
"pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
706
"pand "MANGLE(M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
707
"pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
708
"pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
710
"por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
711
"por %%mm3, %%mm6 \n\t"\
712
MOVNTQ(%%mm6, 8(dst))\
714
"pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
715
"pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
716
"pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
718
"pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
719
"pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
720
"pand "MANGLE(M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
722
"por %%mm1, %%mm3 \n\t"\
723
"por %%mm3, %%mm6 \n\t"\
724
MOVNTQ(%%mm6, 16(dst))\
726
"add $24, "#dst" \n\t"\
728
"add $8, "#index" \n\t"\
729
"cmp "#dstw", "#index" \n\t"\
734
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
737
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
740
#define REAL_WRITEYUY2(dst, dstw, index) \
741
"packuswb %%mm3, %%mm3 \n\t"\
742
"packuswb %%mm4, %%mm4 \n\t"\
743
"packuswb %%mm7, %%mm1 \n\t"\
744
"punpcklbw %%mm4, %%mm3 \n\t"\
745
"movq %%mm1, %%mm7 \n\t"\
746
"punpcklbw %%mm3, %%mm1 \n\t"\
747
"punpckhbw %%mm3, %%mm7 \n\t"\
749
MOVNTQ(%%mm1, (dst, index, 2))\
750
MOVNTQ(%%mm7, 8(dst, index, 2))\
752
"add $8, "#index" \n\t"\
753
"cmp "#dstw", "#index" \n\t"\
755
#define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
758
static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
759
int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
760
uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW, int chrDstW)
766
YSCALEYUV2YV12X(0, CHR_MMX_FILTER_OFFSET)
767
:: "r" (&c->redDither),
768
"r" (uDest), "p" ((long)chrDstW)
769
: "%"REG_a, "%"REG_d, "%"REG_S
773
YSCALEYUV2YV12X(4096, CHR_MMX_FILTER_OFFSET)
774
:: "r" (&c->redDither),
775
"r" (vDest), "p" ((long)chrDstW)
776
: "%"REG_a, "%"REG_d, "%"REG_S
781
YSCALEYUV2YV12X(0, LUM_MMX_FILTER_OFFSET)
782
:: "r" (&c->redDither),
783
"r" (dest), "p" ((long)dstW)
784
: "%"REG_a, "%"REG_d, "%"REG_S
788
yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
789
chrFilter, chrSrc, chrFilterSize,
790
dest, uDest, vDest, dstW, chrDstW);
792
yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
793
chrFilter, chrSrc, chrFilterSize,
794
dest, uDest, vDest, dstW, chrDstW);
795
#endif //!HAVE_ALTIVEC
799
static inline void RENAME(yuv2nv12X)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
800
int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
801
uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat)
803
yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
804
chrFilter, chrSrc, chrFilterSize,
805
dest, uDest, dstW, chrDstW, dstFormat);
808
static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc,
809
uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW, int chrDstW)
816
:: "r" (chrSrc + chrDstW), "r" (uDest + chrDstW),
823
:: "r" (chrSrc + 2048 + chrDstW), "r" (vDest + chrDstW),
831
:: "r" (lumSrc + dstW), "r" (dest + dstW),
837
for(i=0; i<dstW; i++)
839
int val= lumSrc[i]>>7;
850
for(i=0; i<chrDstW; i++)
853
int v=chrSrc[i + 2048]>>7;
857
else if (u>255) u=255;
859
else if (v>255) v=255;
870
* vertical scale YV12 to RGB
872
static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
873
int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
874
uint8_t *dest, int dstW, int dstY)
884
WRITEBGR32(%4, %5, %%REGa)
886
:: "r" (&c->redDither),
887
"m" (dummy), "m" (dummy), "m" (dummy),
888
"r" (dest), "m" (dstW)
889
: "%"REG_a, "%"REG_d, "%"REG_S
897
"lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t" //FIXME optimize
898
"add %4, %%"REG_b" \n\t"
899
WRITEBGR24(%%REGb, %5, %%REGa)
901
:: "r" (&c->redDither),
902
"m" (dummy), "m" (dummy), "m" (dummy),
903
"r" (dest), "m" (dstW)
904
: "%"REG_a, "%"REG_b, "%"REG_d, "%"REG_S //FIXME ebx
912
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
914
"paddusb "MANGLE(b5Dither)", %%mm2\n\t"
915
"paddusb "MANGLE(g5Dither)", %%mm4\n\t"
916
"paddusb "MANGLE(r5Dither)", %%mm5\n\t"
919
WRITEBGR15(%4, %5, %%REGa)
921
:: "r" (&c->redDither),
922
"m" (dummy), "m" (dummy), "m" (dummy),
923
"r" (dest), "m" (dstW)
924
: "%"REG_a, "%"REG_d, "%"REG_S
932
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
934
"paddusb "MANGLE(b5Dither)", %%mm2\n\t"
935
"paddusb "MANGLE(g6Dither)", %%mm4\n\t"
936
"paddusb "MANGLE(r5Dither)", %%mm5\n\t"
939
WRITEBGR16(%4, %5, %%REGa)
941
:: "r" (&c->redDither),
942
"m" (dummy), "m" (dummy), "m" (dummy),
943
"r" (dest), "m" (dstW)
944
: "%"REG_a, "%"REG_d, "%"REG_S
952
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
954
"psraw $3, %%mm3 \n\t"
955
"psraw $3, %%mm4 \n\t"
956
"psraw $3, %%mm1 \n\t"
957
"psraw $3, %%mm7 \n\t"
958
WRITEYUY2(%4, %5, %%REGa)
960
:: "r" (&c->redDither),
961
"m" (dummy), "m" (dummy), "m" (dummy),
962
"r" (dest), "m" (dstW)
963
: "%"REG_a, "%"REG_d, "%"REG_S
970
altivec_yuv2packedX (c, lumFilter, lumSrc, lumFilterSize,
971
chrFilter, chrSrc, chrFilterSize,
974
yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
975
chrFilter, chrSrc, chrFilterSize,
983
* vertical bilinear scale YV12 to RGB
985
static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
986
uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
988
int yalpha1=yalpha^4095;
989
int uvalpha1=uvalpha^4095;
993
if(flags&SWS_FULL_CHR_H_INT)
1003
"punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
1004
"punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
1006
"movq %%mm3, %%mm1 \n\t"
1007
"punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
1008
"punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
1010
MOVNTQ(%%mm3, (%4, %%REGa, 4))
1011
MOVNTQ(%%mm1, 8(%4, %%REGa, 4))
1013
"add $4, %%"REG_a" \n\t"
1014
"cmp %5, %%"REG_a" \n\t"
1018
:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" ((long)dstW),
1019
"m" (yalpha1), "m" (uvalpha1)
1029
"punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
1030
"punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
1032
"movq %%mm3, %%mm1 \n\t"
1033
"punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
1034
"punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
1036
"movq %%mm3, %%mm2 \n\t" // BGR0BGR0
1037
"psrlq $8, %%mm3 \n\t" // GR0BGR00
1038
"pand "MANGLE(bm00000111)", %%mm2\n\t" // BGR00000
1039
"pand "MANGLE(bm11111000)", %%mm3\n\t" // 000BGR00
1040
"por %%mm2, %%mm3 \n\t" // BGRBGR00
1041
"movq %%mm1, %%mm2 \n\t"
1042
"psllq $48, %%mm1 \n\t" // 000000BG
1043
"por %%mm1, %%mm3 \n\t" // BGRBGRBG
1045
"movq %%mm2, %%mm1 \n\t" // BGR0BGR0
1046
"psrld $16, %%mm2 \n\t" // R000R000
1047
"psrlq $24, %%mm1 \n\t" // 0BGR0000
1048
"por %%mm2, %%mm1 \n\t" // RBGRR000
1050
"mov %4, %%"REG_b" \n\t"
1051
"add %%"REG_a", %%"REG_b" \n\t"
1055
"movntq %%mm3, (%%"REG_b", %%"REG_a", 2)\n\t"
1056
"movntq %%mm1, 8(%%"REG_b", %%"REG_a", 2)\n\t"
1058
"movd %%mm3, (%%"REG_b", %%"REG_a", 2) \n\t"
1059
"psrlq $32, %%mm3 \n\t"
1060
"movd %%mm3, 4(%%"REG_b", %%"REG_a", 2) \n\t"
1061
"movd %%mm1, 8(%%"REG_b", %%"REG_a", 2) \n\t"
1063
"add $4, %%"REG_a" \n\t"
1064
"cmp %5, %%"REG_a" \n\t"
1067
:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
1068
"m" (yalpha1), "m" (uvalpha1)
1069
: "%"REG_a, "%"REG_b
1077
"paddusb "MANGLE(g5Dither)", %%mm1\n\t"
1078
"paddusb "MANGLE(r5Dither)", %%mm0\n\t"
1079
"paddusb "MANGLE(b5Dither)", %%mm3\n\t"
1081
"punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
1082
"punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
1083
"punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
1085
"psrlw $3, %%mm3 \n\t"
1086
"psllw $2, %%mm1 \n\t"
1087
"psllw $7, %%mm0 \n\t"
1088
"pand "MANGLE(g15Mask)", %%mm1 \n\t"
1089
"pand "MANGLE(r15Mask)", %%mm0 \n\t"
1091
"por %%mm3, %%mm1 \n\t"
1092
"por %%mm1, %%mm0 \n\t"
1094
MOVNTQ(%%mm0, (%4, %%REGa, 2))
1096
"add $4, %%"REG_a" \n\t"
1097
"cmp %5, %%"REG_a" \n\t"
1100
:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1101
"m" (yalpha1), "m" (uvalpha1)
1110
"paddusb "MANGLE(g6Dither)", %%mm1\n\t"
1111
"paddusb "MANGLE(r5Dither)", %%mm0\n\t"
1112
"paddusb "MANGLE(b5Dither)", %%mm3\n\t"
1114
"punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
1115
"punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
1116
"punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
1118
"psrlw $3, %%mm3 \n\t"
1119
"psllw $3, %%mm1 \n\t"
1120
"psllw $8, %%mm0 \n\t"
1121
"pand "MANGLE(g16Mask)", %%mm1 \n\t"
1122
"pand "MANGLE(r16Mask)", %%mm0 \n\t"
1124
"por %%mm3, %%mm1 \n\t"
1125
"por %%mm1, %%mm0 \n\t"
1127
MOVNTQ(%%mm0, (%4, %%REGa, 2))
1129
"add $4, %%"REG_a" \n\t"
1130
"cmp %5, %%"REG_a" \n\t"
1133
:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1134
"m" (yalpha1), "m" (uvalpha1)
1143
if(dstFormat==IMGFMT_BGR32)
1146
#ifdef WORDS_BIGENDIAN
1149
for(i=0;i<dstW;i++){
1150
// vertical linear interpolation && yuv2rgb in a single step:
1151
int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1152
int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1153
int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1154
dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1155
dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1156
dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1160
else if(dstFormat==IMGFMT_BGR24)
1163
for(i=0;i<dstW;i++){
1164
// vertical linear interpolation && yuv2rgb in a single step:
1165
int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1166
int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1167
int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1168
dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1169
dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1170
dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1174
else if(dstFormat==IMGFMT_BGR16)
1177
for(i=0;i<dstW;i++){
1178
// vertical linear interpolation && yuv2rgb in a single step:
1179
int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1180
int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1181
int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1183
((uint16_t*)dest)[i] =
1184
clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
1185
clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1186
clip_table16r[(Y + yuvtab_3343[V]) >>13];
1189
else if(dstFormat==IMGFMT_BGR15)
1192
for(i=0;i<dstW;i++){
1193
// vertical linear interpolation && yuv2rgb in a single step:
1194
int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1195
int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1196
int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1198
((uint16_t*)dest)[i] =
1199
clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
1200
clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1201
clip_table15r[(Y + yuvtab_3343[V]) >>13];
1209
switch(c->dstFormat)
1211
//Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1214
"mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1215
"mov %4, %%"REG_SP" \n\t"
1216
YSCALEYUV2RGB(%%REGa, %5)
1217
WRITEBGR32(%%REGSP, 8280(%5), %%REGa)
1218
"mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1220
:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1227
"mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1228
"mov %4, %%"REG_SP" \n\t"
1229
YSCALEYUV2RGB(%%REGa, %5)
1230
WRITEBGR24(%%REGSP, 8280(%5), %%REGa)
1231
"mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1232
:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1239
"mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1240
"mov %4, %%"REG_SP" \n\t"
1241
YSCALEYUV2RGB(%%REGa, %5)
1242
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1244
"paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1245
"paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1246
"paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1249
WRITEBGR15(%%REGSP, 8280(%5), %%REGa)
1250
"mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1252
:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1259
"mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1260
"mov %4, %%"REG_SP" \n\t"
1261
YSCALEYUV2RGB(%%REGa, %5)
1262
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1264
"paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1265
"paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1266
"paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1269
WRITEBGR16(%%REGSP, 8280(%5), %%REGa)
1270
"mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1271
:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1278
"mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1279
"mov %4, %%"REG_SP" \n\t"
1280
YSCALEYUV2PACKED(%%REGa, %5)
1281
WRITEYUY2(%%REGSP, 8280(%5), %%REGa)
1282
"mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1283
:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1291
YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C)
1295
* YV12 to RGB without scaling or interpolating
1297
static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
1298
uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
1300
const int yalpha1=0;
1303
uint16_t *buf1= buf0; //FIXME needed for the rgb1/bgr1
1304
const int yalpha= 4096; //FIXME ...
1306
if(flags&SWS_FULL_CHR_H_INT)
1308
RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y);
1313
if( uvalpha < 2048 ) // note this is not correct (shifts chrominance by 0.5 pixels) but its a bit faster
1319
"mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1320
"mov %4, %%"REG_SP" \n\t"
1321
YSCALEYUV2RGB1(%%REGa, %5)
1322
WRITEBGR32(%%REGSP, 8280(%5), %%REGa)
1323
"mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1325
:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1332
"mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1333
"mov %4, %%"REG_SP" \n\t"
1334
YSCALEYUV2RGB1(%%REGa, %5)
1335
WRITEBGR24(%%REGSP, 8280(%5), %%REGa)
1336
"mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1338
:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1345
"mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1346
"mov %4, %%"REG_SP" \n\t"
1347
YSCALEYUV2RGB1(%%REGa, %5)
1348
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1350
"paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1351
"paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1352
"paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1354
WRITEBGR15(%%REGSP, 8280(%5), %%REGa)
1355
"mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1357
:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1364
"mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1365
"mov %4, %%"REG_SP" \n\t"
1366
YSCALEYUV2RGB1(%%REGa, %5)
1367
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1369
"paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1370
"paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1371
"paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1374
WRITEBGR16(%%REGSP, 8280(%5), %%REGa)
1375
"mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1377
:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1384
"mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1385
"mov %4, %%"REG_SP" \n\t"
1386
YSCALEYUV2PACKED1(%%REGa, %5)
1387
WRITEYUY2(%%REGSP, 8280(%5), %%REGa)
1388
"mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1390
:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1403
"mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1404
"mov %4, %%"REG_SP" \n\t"
1405
YSCALEYUV2RGB1b(%%REGa, %5)
1406
WRITEBGR32(%%REGSP, 8280(%5), %%REGa)
1407
"mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1409
:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1416
"mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1417
"mov %4, %%"REG_SP" \n\t"
1418
YSCALEYUV2RGB1b(%%REGa, %5)
1419
WRITEBGR24(%%REGSP, 8280(%5), %%REGa)
1420
"mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1422
:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1429
"mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1430
"mov %4, %%"REG_SP" \n\t"
1431
YSCALEYUV2RGB1b(%%REGa, %5)
1432
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1434
"paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1435
"paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1436
"paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1438
WRITEBGR15(%%REGSP, 8280(%5), %%REGa)
1439
"mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1441
:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1448
"mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1449
"mov %4, %%"REG_SP" \n\t"
1450
YSCALEYUV2RGB1b(%%REGa, %5)
1451
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1453
"paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1454
"paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1455
"paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1458
WRITEBGR16(%%REGSP, 8280(%5), %%REGa)
1459
"mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1461
:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1468
"mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1469
"mov %4, %%"REG_SP" \n\t"
1470
YSCALEYUV2PACKED1b(%%REGa, %5)
1471
WRITEYUY2(%%REGSP, 8280(%5), %%REGa)
1472
"mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1474
:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1482
if( uvalpha < 2048 )
1484
YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C)
1486
YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C)
1490
//FIXME yuy2* can read upto 7 samples to much
1492
static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, int width)
1496
"movq "MANGLE(bm01010101)", %%mm2\n\t"
1497
"mov %0, %%"REG_a" \n\t"
1499
"movq (%1, %%"REG_a",2), %%mm0 \n\t"
1500
"movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1501
"pand %%mm2, %%mm0 \n\t"
1502
"pand %%mm2, %%mm1 \n\t"
1503
"packuswb %%mm1, %%mm0 \n\t"
1504
"movq %%mm0, (%2, %%"REG_a") \n\t"
1505
"add $8, %%"REG_a" \n\t"
1507
: : "g" ((long)-width), "r" (src+width*2), "r" (dst+width)
1512
for(i=0; i<width; i++)
1517
static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1519
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1521
"movq "MANGLE(bm01010101)", %%mm4\n\t"
1522
"mov %0, %%"REG_a" \n\t"
1524
"movq (%1, %%"REG_a",4), %%mm0 \n\t"
1525
"movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1526
"movq (%2, %%"REG_a",4), %%mm2 \n\t"
1527
"movq 8(%2, %%"REG_a",4), %%mm3 \n\t"
1530
"psrlw $8, %%mm0 \n\t"
1531
"psrlw $8, %%mm1 \n\t"
1532
"packuswb %%mm1, %%mm0 \n\t"
1533
"movq %%mm0, %%mm1 \n\t"
1534
"psrlw $8, %%mm0 \n\t"
1535
"pand %%mm4, %%mm1 \n\t"
1536
"packuswb %%mm0, %%mm0 \n\t"
1537
"packuswb %%mm1, %%mm1 \n\t"
1538
"movd %%mm0, (%4, %%"REG_a") \n\t"
1539
"movd %%mm1, (%3, %%"REG_a") \n\t"
1540
"add $4, %%"REG_a" \n\t"
1542
: : "g" ((long)-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width)
1547
for(i=0; i<width; i++)
1549
dstU[i]= (src1[4*i + 1] + src2[4*i + 1])>>1;
1550
dstV[i]= (src1[4*i + 3] + src2[4*i + 3])>>1;
1555
//this is allmost identical to the previous, end exists only cuz yuy2ToY/UV)(dst, src+1, ...) would have 100% unaligned accesses
1556
static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, int width)
1560
"mov %0, %%"REG_a" \n\t"
1562
"movq (%1, %%"REG_a",2), %%mm0 \n\t"
1563
"movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1564
"psrlw $8, %%mm0 \n\t"
1565
"psrlw $8, %%mm1 \n\t"
1566
"packuswb %%mm1, %%mm0 \n\t"
1567
"movq %%mm0, (%2, %%"REG_a") \n\t"
1568
"add $8, %%"REG_a" \n\t"
1570
: : "g" ((long)-width), "r" (src+width*2), "r" (dst+width)
1575
for(i=0; i<width; i++)
1580
static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1582
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1584
"movq "MANGLE(bm01010101)", %%mm4\n\t"
1585
"mov %0, %%"REG_a" \n\t"
1587
"movq (%1, %%"REG_a",4), %%mm0 \n\t"
1588
"movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1589
"movq (%2, %%"REG_a",4), %%mm2 \n\t"
1590
"movq 8(%2, %%"REG_a",4), %%mm3 \n\t"
1593
"pand %%mm4, %%mm0 \n\t"
1594
"pand %%mm4, %%mm1 \n\t"
1595
"packuswb %%mm1, %%mm0 \n\t"
1596
"movq %%mm0, %%mm1 \n\t"
1597
"psrlw $8, %%mm0 \n\t"
1598
"pand %%mm4, %%mm1 \n\t"
1599
"packuswb %%mm0, %%mm0 \n\t"
1600
"packuswb %%mm1, %%mm1 \n\t"
1601
"movd %%mm0, (%4, %%"REG_a") \n\t"
1602
"movd %%mm1, (%3, %%"REG_a") \n\t"
1603
"add $4, %%"REG_a" \n\t"
1605
: : "g" ((long)-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width)
1610
for(i=0; i<width; i++)
1612
dstU[i]= (src1[4*i + 0] + src2[4*i + 0])>>1;
1613
dstV[i]= (src1[4*i + 2] + src2[4*i + 2])>>1;
1618
static inline void RENAME(bgr32ToY)(uint8_t *dst, uint8_t *src, int width)
1621
for(i=0; i<width; i++)
1623
int b= ((uint32_t*)src)[i]&0xFF;
1624
int g= (((uint32_t*)src)[i]>>8)&0xFF;
1625
int r= (((uint32_t*)src)[i]>>16)&0xFF;
1627
dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
1631
static inline void RENAME(bgr32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1634
for(i=0; i<width; i++)
1636
const int a= ((uint32_t*)src1)[2*i+0];
1637
const int e= ((uint32_t*)src1)[2*i+1];
1638
const int c= ((uint32_t*)src2)[2*i+0];
1639
const int d= ((uint32_t*)src2)[2*i+1];
1640
const int l= (a&0xFF00FF) + (e&0xFF00FF) + (c&0xFF00FF) + (d&0xFF00FF);
1641
const int h= (a&0x00FF00) + (e&0x00FF00) + (c&0x00FF00) + (d&0x00FF00);
1642
const int b= l&0x3FF;
1646
dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
1647
dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
1651
static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, int width)
1655
"mov %2, %%"REG_a" \n\t"
1656
"movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t"
1657
"movq "MANGLE(w1111)", %%mm5 \n\t"
1658
"pxor %%mm7, %%mm7 \n\t"
1659
"lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t"
1662
PREFETCH" 64(%0, %%"REG_b") \n\t"
1663
"movd (%0, %%"REG_b"), %%mm0 \n\t"
1664
"movd 3(%0, %%"REG_b"), %%mm1 \n\t"
1665
"punpcklbw %%mm7, %%mm0 \n\t"
1666
"punpcklbw %%mm7, %%mm1 \n\t"
1667
"movd 6(%0, %%"REG_b"), %%mm2 \n\t"
1668
"movd 9(%0, %%"REG_b"), %%mm3 \n\t"
1669
"punpcklbw %%mm7, %%mm2 \n\t"
1670
"punpcklbw %%mm7, %%mm3 \n\t"
1671
"pmaddwd %%mm6, %%mm0 \n\t"
1672
"pmaddwd %%mm6, %%mm1 \n\t"
1673
"pmaddwd %%mm6, %%mm2 \n\t"
1674
"pmaddwd %%mm6, %%mm3 \n\t"
1675
#ifndef FAST_BGR2YV12
1676
"psrad $8, %%mm0 \n\t"
1677
"psrad $8, %%mm1 \n\t"
1678
"psrad $8, %%mm2 \n\t"
1679
"psrad $8, %%mm3 \n\t"
1681
"packssdw %%mm1, %%mm0 \n\t"
1682
"packssdw %%mm3, %%mm2 \n\t"
1683
"pmaddwd %%mm5, %%mm0 \n\t"
1684
"pmaddwd %%mm5, %%mm2 \n\t"
1685
"packssdw %%mm2, %%mm0 \n\t"
1686
"psraw $7, %%mm0 \n\t"
1688
"movd 12(%0, %%"REG_b"), %%mm4 \n\t"
1689
"movd 15(%0, %%"REG_b"), %%mm1 \n\t"
1690
"punpcklbw %%mm7, %%mm4 \n\t"
1691
"punpcklbw %%mm7, %%mm1 \n\t"
1692
"movd 18(%0, %%"REG_b"), %%mm2 \n\t"
1693
"movd 21(%0, %%"REG_b"), %%mm3 \n\t"
1694
"punpcklbw %%mm7, %%mm2 \n\t"
1695
"punpcklbw %%mm7, %%mm3 \n\t"
1696
"pmaddwd %%mm6, %%mm4 \n\t"
1697
"pmaddwd %%mm6, %%mm1 \n\t"
1698
"pmaddwd %%mm6, %%mm2 \n\t"
1699
"pmaddwd %%mm6, %%mm3 \n\t"
1700
#ifndef FAST_BGR2YV12
1701
"psrad $8, %%mm4 \n\t"
1702
"psrad $8, %%mm1 \n\t"
1703
"psrad $8, %%mm2 \n\t"
1704
"psrad $8, %%mm3 \n\t"
1706
"packssdw %%mm1, %%mm4 \n\t"
1707
"packssdw %%mm3, %%mm2 \n\t"
1708
"pmaddwd %%mm5, %%mm4 \n\t"
1709
"pmaddwd %%mm5, %%mm2 \n\t"
1710
"add $24, %%"REG_b" \n\t"
1711
"packssdw %%mm2, %%mm4 \n\t"
1712
"psraw $7, %%mm4 \n\t"
1714
"packuswb %%mm4, %%mm0 \n\t"
1715
"paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t"
1717
"movq %%mm0, (%1, %%"REG_a") \n\t"
1718
"add $8, %%"REG_a" \n\t"
1720
: : "r" (src+width*3), "r" (dst+width), "g" ((long)-width)
1721
: "%"REG_a, "%"REG_b
1725
for(i=0; i<width; i++)
1731
dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
1736
static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1740
"mov %4, %%"REG_a" \n\t"
1741
"movq "MANGLE(w1111)", %%mm5 \n\t"
1742
"movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t"
1743
"pxor %%mm7, %%mm7 \n\t"
1744
"lea (%%"REG_a", %%"REG_a", 2), %%"REG_b" \n\t"
1745
"add %%"REG_b", %%"REG_b" \n\t"
1748
PREFETCH" 64(%0, %%"REG_b") \n\t"
1749
PREFETCH" 64(%1, %%"REG_b") \n\t"
1750
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1751
"movq (%0, %%"REG_b"), %%mm0 \n\t"
1752
"movq (%1, %%"REG_b"), %%mm1 \n\t"
1753
"movq 6(%0, %%"REG_b"), %%mm2 \n\t"
1754
"movq 6(%1, %%"REG_b"), %%mm3 \n\t"
1757
"movq %%mm0, %%mm1 \n\t"
1758
"movq %%mm2, %%mm3 \n\t"
1759
"psrlq $24, %%mm0 \n\t"
1760
"psrlq $24, %%mm2 \n\t"
1763
"punpcklbw %%mm7, %%mm0 \n\t"
1764
"punpcklbw %%mm7, %%mm2 \n\t"
1766
"movd (%0, %%"REG_b"), %%mm0 \n\t"
1767
"movd (%1, %%"REG_b"), %%mm1 \n\t"
1768
"movd 3(%0, %%"REG_b"), %%mm2 \n\t"
1769
"movd 3(%1, %%"REG_b"), %%mm3 \n\t"
1770
"punpcklbw %%mm7, %%mm0 \n\t"
1771
"punpcklbw %%mm7, %%mm1 \n\t"
1772
"punpcklbw %%mm7, %%mm2 \n\t"
1773
"punpcklbw %%mm7, %%mm3 \n\t"
1774
"paddw %%mm1, %%mm0 \n\t"
1775
"paddw %%mm3, %%mm2 \n\t"
1776
"paddw %%mm2, %%mm0 \n\t"
1777
"movd 6(%0, %%"REG_b"), %%mm4 \n\t"
1778
"movd 6(%1, %%"REG_b"), %%mm1 \n\t"
1779
"movd 9(%0, %%"REG_b"), %%mm2 \n\t"
1780
"movd 9(%1, %%"REG_b"), %%mm3 \n\t"
1781
"punpcklbw %%mm7, %%mm4 \n\t"
1782
"punpcklbw %%mm7, %%mm1 \n\t"
1783
"punpcklbw %%mm7, %%mm2 \n\t"
1784
"punpcklbw %%mm7, %%mm3 \n\t"
1785
"paddw %%mm1, %%mm4 \n\t"
1786
"paddw %%mm3, %%mm2 \n\t"
1787
"paddw %%mm4, %%mm2 \n\t"
1788
"psrlw $2, %%mm0 \n\t"
1789
"psrlw $2, %%mm2 \n\t"
1791
"movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
1792
"movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
1794
"pmaddwd %%mm0, %%mm1 \n\t"
1795
"pmaddwd %%mm2, %%mm3 \n\t"
1796
"pmaddwd %%mm6, %%mm0 \n\t"
1797
"pmaddwd %%mm6, %%mm2 \n\t"
1798
#ifndef FAST_BGR2YV12
1799
"psrad $8, %%mm0 \n\t"
1800
"psrad $8, %%mm1 \n\t"
1801
"psrad $8, %%mm2 \n\t"
1802
"psrad $8, %%mm3 \n\t"
1804
"packssdw %%mm2, %%mm0 \n\t"
1805
"packssdw %%mm3, %%mm1 \n\t"
1806
"pmaddwd %%mm5, %%mm0 \n\t"
1807
"pmaddwd %%mm5, %%mm1 \n\t"
1808
"packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
1809
"psraw $7, %%mm0 \n\t"
1811
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1812
"movq 12(%0, %%"REG_b"), %%mm4 \n\t"
1813
"movq 12(%1, %%"REG_b"), %%mm1 \n\t"
1814
"movq 18(%0, %%"REG_b"), %%mm2 \n\t"
1815
"movq 18(%1, %%"REG_b"), %%mm3 \n\t"
1818
"movq %%mm4, %%mm1 \n\t"
1819
"movq %%mm2, %%mm3 \n\t"
1820
"psrlq $24, %%mm4 \n\t"
1821
"psrlq $24, %%mm2 \n\t"
1824
"punpcklbw %%mm7, %%mm4 \n\t"
1825
"punpcklbw %%mm7, %%mm2 \n\t"
1827
"movd 12(%0, %%"REG_b"), %%mm4 \n\t"
1828
"movd 12(%1, %%"REG_b"), %%mm1 \n\t"
1829
"movd 15(%0, %%"REG_b"), %%mm2 \n\t"
1830
"movd 15(%1, %%"REG_b"), %%mm3 \n\t"
1831
"punpcklbw %%mm7, %%mm4 \n\t"
1832
"punpcklbw %%mm7, %%mm1 \n\t"
1833
"punpcklbw %%mm7, %%mm2 \n\t"
1834
"punpcklbw %%mm7, %%mm3 \n\t"
1835
"paddw %%mm1, %%mm4 \n\t"
1836
"paddw %%mm3, %%mm2 \n\t"
1837
"paddw %%mm2, %%mm4 \n\t"
1838
"movd 18(%0, %%"REG_b"), %%mm5 \n\t"
1839
"movd 18(%1, %%"REG_b"), %%mm1 \n\t"
1840
"movd 21(%0, %%"REG_b"), %%mm2 \n\t"
1841
"movd 21(%1, %%"REG_b"), %%mm3 \n\t"
1842
"punpcklbw %%mm7, %%mm5 \n\t"
1843
"punpcklbw %%mm7, %%mm1 \n\t"
1844
"punpcklbw %%mm7, %%mm2 \n\t"
1845
"punpcklbw %%mm7, %%mm3 \n\t"
1846
"paddw %%mm1, %%mm5 \n\t"
1847
"paddw %%mm3, %%mm2 \n\t"
1848
"paddw %%mm5, %%mm2 \n\t"
1849
"movq "MANGLE(w1111)", %%mm5 \n\t"
1850
"psrlw $2, %%mm4 \n\t"
1851
"psrlw $2, %%mm2 \n\t"
1853
"movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
1854
"movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
1856
"pmaddwd %%mm4, %%mm1 \n\t"
1857
"pmaddwd %%mm2, %%mm3 \n\t"
1858
"pmaddwd %%mm6, %%mm4 \n\t"
1859
"pmaddwd %%mm6, %%mm2 \n\t"
1860
#ifndef FAST_BGR2YV12
1861
"psrad $8, %%mm4 \n\t"
1862
"psrad $8, %%mm1 \n\t"
1863
"psrad $8, %%mm2 \n\t"
1864
"psrad $8, %%mm3 \n\t"
1866
"packssdw %%mm2, %%mm4 \n\t"
1867
"packssdw %%mm3, %%mm1 \n\t"
1868
"pmaddwd %%mm5, %%mm4 \n\t"
1869
"pmaddwd %%mm5, %%mm1 \n\t"
1870
"add $24, %%"REG_b" \n\t"
1871
"packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
1872
"psraw $7, %%mm4 \n\t"
1874
"movq %%mm0, %%mm1 \n\t"
1875
"punpckldq %%mm4, %%mm0 \n\t"
1876
"punpckhdq %%mm4, %%mm1 \n\t"
1877
"packsswb %%mm1, %%mm0 \n\t"
1878
"paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t"
1880
"movd %%mm0, (%2, %%"REG_a") \n\t"
1881
"punpckhdq %%mm0, %%mm0 \n\t"
1882
"movd %%mm0, (%3, %%"REG_a") \n\t"
1883
"add $4, %%"REG_a" \n\t"
1885
: : "r" (src1+width*6), "r" (src2+width*6), "r" (dstU+width), "r" (dstV+width), "g" ((long)-width)
1886
: "%"REG_a, "%"REG_b
1890
for(i=0; i<width; i++)
1892
int b= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
1893
int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
1894
int r= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];
1896
dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
1897
dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
1902
static inline void RENAME(bgr16ToY)(uint8_t *dst, uint8_t *src, int width)
1905
for(i=0; i<width; i++)
1907
int d= ((uint16_t*)src)[i];
1910
int r= (d>>11)&0x1F;
1912
dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16;
1916
static inline void RENAME(bgr16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1919
for(i=0; i<width; i++)
1921
int d0= ((uint32_t*)src1)[i];
1922
int d1= ((uint32_t*)src2)[i];
1924
int dl= (d0&0x07E0F81F) + (d1&0x07E0F81F);
1925
int dh= ((d0>>5)&0x07C0F83F) + ((d1>>5)&0x07C0F83F);
1927
int dh2= (dh>>11) + (dh<<21);
1931
int r= (d>>11)&0x7F;
1933
dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
1934
dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
1938
static inline void RENAME(bgr15ToY)(uint8_t *dst, uint8_t *src, int width)
1941
for(i=0; i<width; i++)
1943
int d= ((uint16_t*)src)[i];
1946
int r= (d>>10)&0x1F;
1948
dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16;
1952
static inline void RENAME(bgr15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1955
for(i=0; i<width; i++)
1957
int d0= ((uint32_t*)src1)[i];
1958
int d1= ((uint32_t*)src2)[i];
1960
int dl= (d0&0x03E07C1F) + (d1&0x03E07C1F);
1961
int dh= ((d0>>5)&0x03E0F81F) + ((d1>>5)&0x03E0F81F);
1963
int dh2= (dh>>11) + (dh<<21);
1967
int r= (d>>10)&0x7F;
1969
dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
1970
dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
1975
static inline void RENAME(rgb32ToY)(uint8_t *dst, uint8_t *src, int width)
1978
for(i=0; i<width; i++)
1980
int r= ((uint32_t*)src)[i]&0xFF;
1981
int g= (((uint32_t*)src)[i]>>8)&0xFF;
1982
int b= (((uint32_t*)src)[i]>>16)&0xFF;
1984
dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
1988
static inline void RENAME(rgb32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1991
for(i=0; i<width; i++)
1993
const int a= ((uint32_t*)src1)[2*i+0];
1994
const int e= ((uint32_t*)src1)[2*i+1];
1995
const int c= ((uint32_t*)src2)[2*i+0];
1996
const int d= ((uint32_t*)src2)[2*i+1];
1997
const int l= (a&0xFF00FF) + (e&0xFF00FF) + (c&0xFF00FF) + (d&0xFF00FF);
1998
const int h= (a&0x00FF00) + (e&0x00FF00) + (c&0x00FF00) + (d&0x00FF00);
1999
const int r= l&0x3FF;
2003
dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
2004
dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
2008
static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, int width)
2011
for(i=0; i<width; i++)
2017
dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
2021
static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2024
for(i=0; i<width; i++)
2026
int r= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
2027
int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
2028
int b= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];
2030
dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
2031
dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
2036
// Bilinear / Bicubic scaling
2037
static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
2038
int16_t *filter, int16_t *filterPos, int filterSize)
2041
assert(filterSize % 4 == 0 && filterSize>0);
2042
if(filterSize==4) // allways true for upscaling, sometimes for down too
2044
long counter= -2*dstW;
2046
filterPos-= counter/2;
2049
"pxor %%mm7, %%mm7 \n\t"
2050
"movq "MANGLE(w02)", %%mm6 \n\t"
2051
"push %%"REG_BP" \n\t" // we use 7 regs here ...
2052
"mov %%"REG_a", %%"REG_BP" \n\t"
2055
"movzwl (%2, %%"REG_BP"), %%eax \n\t"
2056
"movzwl 2(%2, %%"REG_BP"), %%ebx\n\t"
2057
"movq (%1, %%"REG_BP", 4), %%mm1\n\t"
2058
"movq 8(%1, %%"REG_BP", 4), %%mm3\n\t"
2059
"movd (%3, %%"REG_a"), %%mm0 \n\t"
2060
"movd (%3, %%"REG_b"), %%mm2 \n\t"
2061
"punpcklbw %%mm7, %%mm0 \n\t"
2062
"punpcklbw %%mm7, %%mm2 \n\t"
2063
"pmaddwd %%mm1, %%mm0 \n\t"
2064
"pmaddwd %%mm2, %%mm3 \n\t"
2065
"psrad $8, %%mm0 \n\t"
2066
"psrad $8, %%mm3 \n\t"
2067
"packssdw %%mm3, %%mm0 \n\t"
2068
"pmaddwd %%mm6, %%mm0 \n\t"
2069
"packssdw %%mm0, %%mm0 \n\t"
2070
"movd %%mm0, (%4, %%"REG_BP") \n\t"
2071
"add $4, %%"REG_BP" \n\t"
2074
"pop %%"REG_BP" \n\t"
2076
: "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2080
else if(filterSize==8)
2082
long counter= -2*dstW;
2084
filterPos-= counter/2;
2087
"pxor %%mm7, %%mm7 \n\t"
2088
"movq "MANGLE(w02)", %%mm6 \n\t"
2089
"push %%"REG_BP" \n\t" // we use 7 regs here ...
2090
"mov %%"REG_a", %%"REG_BP" \n\t"
2093
"movzwl (%2, %%"REG_BP"), %%eax \n\t"
2094
"movzwl 2(%2, %%"REG_BP"), %%ebx\n\t"
2095
"movq (%1, %%"REG_BP", 8), %%mm1\n\t"
2096
"movq 16(%1, %%"REG_BP", 8), %%mm3\n\t"
2097
"movd (%3, %%"REG_a"), %%mm0 \n\t"
2098
"movd (%3, %%"REG_b"), %%mm2 \n\t"
2099
"punpcklbw %%mm7, %%mm0 \n\t"
2100
"punpcklbw %%mm7, %%mm2 \n\t"
2101
"pmaddwd %%mm1, %%mm0 \n\t"
2102
"pmaddwd %%mm2, %%mm3 \n\t"
2104
"movq 8(%1, %%"REG_BP", 8), %%mm1\n\t"
2105
"movq 24(%1, %%"REG_BP", 8), %%mm5\n\t"
2106
"movd 4(%3, %%"REG_a"), %%mm4 \n\t"
2107
"movd 4(%3, %%"REG_b"), %%mm2 \n\t"
2108
"punpcklbw %%mm7, %%mm4 \n\t"
2109
"punpcklbw %%mm7, %%mm2 \n\t"
2110
"pmaddwd %%mm1, %%mm4 \n\t"
2111
"pmaddwd %%mm2, %%mm5 \n\t"
2112
"paddd %%mm4, %%mm0 \n\t"
2113
"paddd %%mm5, %%mm3 \n\t"
2115
"psrad $8, %%mm0 \n\t"
2116
"psrad $8, %%mm3 \n\t"
2117
"packssdw %%mm3, %%mm0 \n\t"
2118
"pmaddwd %%mm6, %%mm0 \n\t"
2119
"packssdw %%mm0, %%mm0 \n\t"
2120
"movd %%mm0, (%4, %%"REG_BP") \n\t"
2121
"add $4, %%"REG_BP" \n\t"
2124
"pop %%"REG_BP" \n\t"
2126
: "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2132
uint8_t *offset = src+filterSize;
2133
long counter= -2*dstW;
2134
// filter-= counter*filterSize/2;
2135
filterPos-= counter/2;
2138
"pxor %%mm7, %%mm7 \n\t"
2139
"movq "MANGLE(w02)", %%mm6 \n\t"
2142
"mov %2, %%"REG_c" \n\t"
2143
"movzwl (%%"REG_c", %0), %%eax \n\t"
2144
"movzwl 2(%%"REG_c", %0), %%ebx \n\t"
2145
"mov %5, %%"REG_c" \n\t"
2146
"pxor %%mm4, %%mm4 \n\t"
2147
"pxor %%mm5, %%mm5 \n\t"
2149
"movq (%1), %%mm1 \n\t"
2150
"movq (%1, %6), %%mm3 \n\t"
2151
"movd (%%"REG_c", %%"REG_a"), %%mm0\n\t"
2152
"movd (%%"REG_c", %%"REG_b"), %%mm2\n\t"
2153
"punpcklbw %%mm7, %%mm0 \n\t"
2154
"punpcklbw %%mm7, %%mm2 \n\t"
2155
"pmaddwd %%mm1, %%mm0 \n\t"
2156
"pmaddwd %%mm2, %%mm3 \n\t"
2157
"paddd %%mm3, %%mm5 \n\t"
2158
"paddd %%mm0, %%mm4 \n\t"
2160
"add $4, %%"REG_c" \n\t"
2161
"cmp %4, %%"REG_c" \n\t"
2164
"psrad $8, %%mm4 \n\t"
2165
"psrad $8, %%mm5 \n\t"
2166
"packssdw %%mm5, %%mm4 \n\t"
2167
"pmaddwd %%mm6, %%mm4 \n\t"
2168
"packssdw %%mm4, %%mm4 \n\t"
2169
"mov %3, %%"REG_a" \n\t"
2170
"movd %%mm4, (%%"REG_a", %0) \n\t"
2174
: "+r" (counter), "+r" (filter)
2175
: "m" (filterPos), "m" (dst), "m"(offset),
2176
"m" (src), "r" ((long)filterSize*2)
2177
: "%"REG_b, "%"REG_a, "%"REG_c
2182
hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
2185
for(i=0; i<dstW; i++)
2188
int srcPos= filterPos[i];
2190
// printf("filterPos: %d\n", filterPos[i]);
2191
for(j=0; j<filterSize; j++)
2193
// printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
2194
val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2196
// filter += hFilterSize;
2197
dst[i] = MIN(MAX(0, val>>7), (1<<15)-1); // the cubic equation does overflow ...
2203
// *** horizontal scale Y line to temp buffer
2204
static inline void RENAME(hyscale)(uint16_t *dst, int dstWidth, uint8_t *src, int srcW, int xInc,
2205
int flags, int canMMX2BeUsed, int16_t *hLumFilter,
2206
int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode,
2207
int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2208
int32_t *mmx2FilterPos)
2210
if(srcFormat==IMGFMT_YUY2)
2212
RENAME(yuy2ToY)(formatConvBuffer, src, srcW);
2213
src= formatConvBuffer;
2215
else if(srcFormat==IMGFMT_UYVY)
2217
RENAME(uyvyToY)(formatConvBuffer, src, srcW);
2218
src= formatConvBuffer;
2220
else if(srcFormat==IMGFMT_BGR32)
2222
RENAME(bgr32ToY)(formatConvBuffer, src, srcW);
2223
src= formatConvBuffer;
2225
else if(srcFormat==IMGFMT_BGR24)
2227
RENAME(bgr24ToY)(formatConvBuffer, src, srcW);
2228
src= formatConvBuffer;
2230
else if(srcFormat==IMGFMT_BGR16)
2232
RENAME(bgr16ToY)(formatConvBuffer, src, srcW);
2233
src= formatConvBuffer;
2235
else if(srcFormat==IMGFMT_BGR15)
2237
RENAME(bgr15ToY)(formatConvBuffer, src, srcW);
2238
src= formatConvBuffer;
2240
else if(srcFormat==IMGFMT_RGB32)
2242
RENAME(rgb32ToY)(formatConvBuffer, src, srcW);
2243
src= formatConvBuffer;
2245
else if(srcFormat==IMGFMT_RGB24)
2247
RENAME(rgb24ToY)(formatConvBuffer, src, srcW);
2248
src= formatConvBuffer;
2252
// use the new MMX scaler if the mmx2 can't be used (its faster than the x86asm one)
2253
if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2255
if(!(flags&SWS_FAST_BILINEAR))
2258
RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
2260
else // Fast Bilinear upscale / crap downscale
2262
#if defined(ARCH_X86) || defined(ARCH_X86_64)
2268
"pxor %%mm7, %%mm7 \n\t"
2269
"mov %0, %%"REG_c" \n\t"
2270
"mov %1, %%"REG_D" \n\t"
2271
"mov %2, %%"REG_d" \n\t"
2272
"mov %3, %%"REG_b" \n\t"
2273
"xor %%"REG_a", %%"REG_a" \n\t" // i
2274
PREFETCH" (%%"REG_c") \n\t"
2275
PREFETCH" 32(%%"REG_c") \n\t"
2276
PREFETCH" 64(%%"REG_c") \n\t"
2280
#define FUNNY_Y_CODE \
2281
"movl (%%"REG_b"), %%esi \n\t"\
2283
"movl (%%"REG_b", %%"REG_a"), %%esi\n\t"\
2284
"add %%"REG_S", %%"REG_c" \n\t"\
2285
"add %%"REG_a", %%"REG_D" \n\t"\
2286
"xor %%"REG_a", %%"REG_a" \n\t"\
2290
#define FUNNY_Y_CODE \
2291
"movl (%%"REG_b"), %%esi \n\t"\
2293
"addl (%%"REG_b", %%"REG_a"), %%"REG_c"\n\t"\
2294
"add %%"REG_a", %%"REG_D" \n\t"\
2295
"xor %%"REG_a", %%"REG_a" \n\t"\
2308
:: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2310
: "%"REG_a, "%"REG_b, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
2312
for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
2317
int xInc_shr16 = xInc >> 16;
2318
int xInc_mask = xInc & 0xffff;
2319
//NO MMX just normal asm ...
2321
"xor %%"REG_a", %%"REG_a" \n\t" // i
2322
"xor %%"REG_b", %%"REG_b" \n\t" // xx
2323
"xorl %%ecx, %%ecx \n\t" // 2*xalpha
2326
"movzbl (%0, %%"REG_b"), %%edi \n\t" //src[xx]
2327
"movzbl 1(%0, %%"REG_b"), %%esi \n\t" //src[xx+1]
2328
"subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2329
"imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2330
"shll $16, %%edi \n\t"
2331
"addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2332
"mov %1, %%"REG_D" \n\t"
2333
"shrl $9, %%esi \n\t"
2334
"movw %%si, (%%"REG_D", %%"REG_a", 2)\n\t"
2335
"addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2336
"adc %3, %%"REG_b" \n\t" //xx+= xInc>>8 + carry
2338
"movzbl (%0, %%"REG_b"), %%edi \n\t" //src[xx]
2339
"movzbl 1(%0, %%"REG_b"), %%esi \n\t" //src[xx+1]
2340
"subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2341
"imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2342
"shll $16, %%edi \n\t"
2343
"addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2344
"mov %1, %%"REG_D" \n\t"
2345
"shrl $9, %%esi \n\t"
2346
"movw %%si, 2(%%"REG_D", %%"REG_a", 2)\n\t"
2347
"addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2348
"adc %3, %%"REG_b" \n\t" //xx+= xInc>>8 + carry
2351
"add $2, %%"REG_a" \n\t"
2352
"cmp %2, %%"REG_a" \n\t"
2356
:: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
2357
: "%"REG_a, "%"REG_b, "%ecx", "%"REG_D, "%esi"
2360
} //if MMX2 can't be used
2364
unsigned int xpos=0;
2365
for(i=0;i<dstWidth;i++)
2367
register unsigned int xx=xpos>>16;
2368
register unsigned int xalpha=(xpos&0xFFFF)>>9;
2369
dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
2376
inline static void RENAME(hcscale)(uint16_t *dst, int dstWidth, uint8_t *src1, uint8_t *src2,
2377
int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
2378
int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
2379
int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2380
int32_t *mmx2FilterPos)
2382
if(srcFormat==IMGFMT_YUY2)
2384
RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2385
src1= formatConvBuffer;
2386
src2= formatConvBuffer+2048;
2388
else if(srcFormat==IMGFMT_UYVY)
2390
RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2391
src1= formatConvBuffer;
2392
src2= formatConvBuffer+2048;
2394
else if(srcFormat==IMGFMT_BGR32)
2396
RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2397
src1= formatConvBuffer;
2398
src2= formatConvBuffer+2048;
2400
else if(srcFormat==IMGFMT_BGR24)
2402
RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2403
src1= formatConvBuffer;
2404
src2= formatConvBuffer+2048;
2406
else if(srcFormat==IMGFMT_BGR16)
2408
RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2409
src1= formatConvBuffer;
2410
src2= formatConvBuffer+2048;
2412
else if(srcFormat==IMGFMT_BGR15)
2414
RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2415
src1= formatConvBuffer;
2416
src2= formatConvBuffer+2048;
2418
else if(srcFormat==IMGFMT_RGB32)
2420
RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2421
src1= formatConvBuffer;
2422
src2= formatConvBuffer+2048;
2424
else if(srcFormat==IMGFMT_RGB24)
2426
RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2427
src1= formatConvBuffer;
2428
src2= formatConvBuffer+2048;
2430
else if(isGray(srcFormat))
2436
// use the new MMX scaler if the mmx2 one can't be used (it's faster than the x86asm one)
2437
if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2439
if(!(flags&SWS_FAST_BILINEAR))
2442
RENAME(hScale)(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2443
RENAME(hScale)(dst+2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2445
else // Fast Bilinear upscale / crap downscale
2447
#if defined(ARCH_X86) || defined(ARCH_X86_64)
2453
"pxor %%mm7, %%mm7 \n\t"
2454
"mov %0, %%"REG_c" \n\t"
2455
"mov %1, %%"REG_D" \n\t"
2456
"mov %2, %%"REG_d" \n\t"
2457
"mov %3, %%"REG_b" \n\t"
2458
"xor %%"REG_a", %%"REG_a" \n\t" // i
2459
PREFETCH" (%%"REG_c") \n\t"
2460
PREFETCH" 32(%%"REG_c") \n\t"
2461
PREFETCH" 64(%%"REG_c") \n\t"
2465
#define FUNNY_UV_CODE \
2466
"movl (%%"REG_b"), %%esi \n\t"\
2468
"movl (%%"REG_b", %%"REG_a"), %%esi\n\t"\
2469
"add %%"REG_S", %%"REG_c" \n\t"\
2470
"add %%"REG_a", %%"REG_D" \n\t"\
2471
"xor %%"REG_a", %%"REG_a" \n\t"\
2475
#define FUNNY_UV_CODE \
2476
"movl (%%"REG_b"), %%esi \n\t"\
2478
"addl (%%"REG_b", %%"REG_a"), %%"REG_c"\n\t"\
2479
"add %%"REG_a", %%"REG_D" \n\t"\
2480
"xor %%"REG_a", %%"REG_a" \n\t"\
2488
"xor %%"REG_a", %%"REG_a" \n\t" // i
2489
"mov %5, %%"REG_c" \n\t" // src
2490
"mov %1, %%"REG_D" \n\t" // buf1
2491
"add $4096, %%"REG_D" \n\t"
2492
PREFETCH" (%%"REG_c") \n\t"
2493
PREFETCH" 32(%%"REG_c") \n\t"
2494
PREFETCH" 64(%%"REG_c") \n\t"
2501
:: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2502
"m" (funnyUVCode), "m" (src2)
2503
: "%"REG_a, "%"REG_b, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
2505
for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
2507
// printf("%d %d %d\n", dstWidth, i, srcW);
2508
dst[i] = src1[srcW-1]*128;
2509
dst[i+2048] = src2[srcW-1]*128;
2515
long xInc_shr16 = (long) (xInc >> 16);
2516
int xInc_mask = xInc & 0xffff;
2518
"xor %%"REG_a", %%"REG_a" \n\t" // i
2519
"xor %%"REG_b", %%"REG_b" \n\t" // xx
2520
"xorl %%ecx, %%ecx \n\t" // 2*xalpha
2523
"mov %0, %%"REG_S" \n\t"
2524
"movzbl (%%"REG_S", %%"REG_b"), %%edi \n\t" //src[xx]
2525
"movzbl 1(%%"REG_S", %%"REG_b"), %%esi \n\t" //src[xx+1]
2526
"subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2527
"imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2528
"shll $16, %%edi \n\t"
2529
"addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2530
"mov %1, %%"REG_D" \n\t"
2531
"shrl $9, %%esi \n\t"
2532
"movw %%si, (%%"REG_D", %%"REG_a", 2)\n\t"
2534
"movzbl (%5, %%"REG_b"), %%edi \n\t" //src[xx]
2535
"movzbl 1(%5, %%"REG_b"), %%esi \n\t" //src[xx+1]
2536
"subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2537
"imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2538
"shll $16, %%edi \n\t"
2539
"addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2540
"mov %1, %%"REG_D" \n\t"
2541
"shrl $9, %%esi \n\t"
2542
"movw %%si, 4096(%%"REG_D", %%"REG_a", 2)\n\t"
2544
"addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2545
"adc %3, %%"REG_b" \n\t" //xx+= xInc>>8 + carry
2546
"add $1, %%"REG_a" \n\t"
2547
"cmp %2, %%"REG_a" \n\t"
2550
/* GCC-3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
2551
which is needed to support GCC-4.0 */
2552
#if defined(ARCH_X86_64) && ((__GNUC__ > 3) || ( __GNUC__ == 3 && __GNUC_MINOR__ >= 4))
2553
:: "m" (src1), "m" (dst), "g" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2555
:: "m" (src1), "m" (dst), "m" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2558
: "%"REG_a, "%"REG_b, "%ecx", "%"REG_D, "%esi"
2561
} //if MMX2 can't be used
2565
unsigned int xpos=0;
2566
for(i=0;i<dstWidth;i++)
2568
register unsigned int xx=xpos>>16;
2569
register unsigned int xalpha=(xpos&0xFFFF)>>9;
2570
dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2571
dst[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
2573
dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
2574
dst[i+2048]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
2582
static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
2583
int srcSliceH, uint8_t* dst[], int dstStride[]){
2585
/* load a few things into local vars to make the code more readable? and faster */
2586
const int srcW= c->srcW;
2587
const int dstW= c->dstW;
2588
const int dstH= c->dstH;
2589
const int chrDstW= c->chrDstW;
2590
const int chrSrcW= c->chrSrcW;
2591
const int lumXInc= c->lumXInc;
2592
const int chrXInc= c->chrXInc;
2593
const int dstFormat= c->dstFormat;
2594
const int srcFormat= c->srcFormat;
2595
const int flags= c->flags;
2596
const int canMMX2BeUsed= c->canMMX2BeUsed;
2597
int16_t *vLumFilterPos= c->vLumFilterPos;
2598
int16_t *vChrFilterPos= c->vChrFilterPos;
2599
int16_t *hLumFilterPos= c->hLumFilterPos;
2600
int16_t *hChrFilterPos= c->hChrFilterPos;
2601
int16_t *vLumFilter= c->vLumFilter;
2602
int16_t *vChrFilter= c->vChrFilter;
2603
int16_t *hLumFilter= c->hLumFilter;
2604
int16_t *hChrFilter= c->hChrFilter;
2605
int32_t *lumMmxFilter= c->lumMmxFilter;
2606
int32_t *chrMmxFilter= c->chrMmxFilter;
2607
const int vLumFilterSize= c->vLumFilterSize;
2608
const int vChrFilterSize= c->vChrFilterSize;
2609
const int hLumFilterSize= c->hLumFilterSize;
2610
const int hChrFilterSize= c->hChrFilterSize;
2611
int16_t **lumPixBuf= c->lumPixBuf;
2612
int16_t **chrPixBuf= c->chrPixBuf;
2613
const int vLumBufSize= c->vLumBufSize;
2614
const int vChrBufSize= c->vChrBufSize;
2615
uint8_t *funnyYCode= c->funnyYCode;
2616
uint8_t *funnyUVCode= c->funnyUVCode;
2617
uint8_t *formatConvBuffer= c->formatConvBuffer;
2618
const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
2619
const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
2622
/* vars which will change and which we need to store back in the context */
2624
int lumBufIndex= c->lumBufIndex;
2625
int chrBufIndex= c->chrBufIndex;
2626
int lastInLumBuf= c->lastInLumBuf;
2627
int lastInChrBuf= c->lastInChrBuf;
2629
if(isPacked(c->srcFormat)){
2635
srcStride[2]= srcStride[0];
2637
srcStride[1]<<= c->vChrDrop;
2638
srcStride[2]<<= c->vChrDrop;
2640
// printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
2641
// (int)dst[0], (int)dst[1], (int)dst[2]);
2643
#if 0 //self test FIXME move to a vfilter or something
2645
static volatile int i=0;
2647
if(srcFormat==IMGFMT_YV12 && i==1 && srcSliceH>= c->srcH)
2648
selfTest(src, srcStride, c->srcW, c->srcH);
2653
//printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
2654
//dstStride[0],dstStride[1],dstStride[2]);
2656
if(dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
2658
static int firstTime=1; //FIXME move this into the context perhaps
2659
if(flags & SWS_PRINT_INFO && firstTime)
2661
MSG_WARN("SwScaler: Warning: dstStride is not aligned!\n"
2662
"SwScaler: ->cannot do aligned memory acesses anymore\n");
2667
/* Note the user might start scaling the picture in the middle so this will not get executed
2668
this is not really intended but works currently, so people might do it */
2679
for(;dstY < dstH; dstY++){
2680
unsigned char *dest =dst[0]+dstStride[0]*dstY;
2681
const int chrDstY= dstY>>c->chrDstVSubSample;
2682
unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
2683
unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
2685
const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
2686
const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
2687
const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
2688
const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
2690
//printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n",
2691
// dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize, c->chrSrcVSubSample);
2692
//handle holes (FAST_BILINEAR & weird filters)
2693
if(firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
2694
if(firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
2695
//printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
2696
ASSERT(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1)
2697
ASSERT(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1)
2699
// Do we have enough lines in this slice to output the dstY line
2700
if(lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
2702
//Do horizontal scaling
2703
while(lastInLumBuf < lastLumSrcY)
2705
uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2707
// printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf, lastLumSrcY);
2708
ASSERT(lumBufIndex < 2*vLumBufSize)
2709
ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
2710
ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
2711
// printf("%d %d\n", lumBufIndex, vLumBufSize);
2712
RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
2713
flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
2714
funnyYCode, c->srcFormat, formatConvBuffer,
2715
c->lumMmx2Filter, c->lumMmx2FilterPos);
2718
while(lastInChrBuf < lastChrSrcY)
2720
uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2721
uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
2723
ASSERT(chrBufIndex < 2*vChrBufSize)
2724
ASSERT(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH))
2725
ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
2726
//FIXME replace parameters through context struct (some at least)
2728
if(!(isGray(srcFormat) || isGray(dstFormat)))
2729
RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
2730
flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
2731
funnyUVCode, c->srcFormat, formatConvBuffer,
2732
c->chrMmx2Filter, c->chrMmx2FilterPos);
2735
//wrap buf index around to stay inside the ring buffer
2736
if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
2737
if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
2739
else // not enough lines left in this slice -> load the rest in the buffer
2741
/* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
2742
firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
2743
lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
2744
vChrBufSize, vLumBufSize);*/
2746
//Do horizontal scaling
2747
while(lastInLumBuf+1 < srcSliceY + srcSliceH)
2749
uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2751
ASSERT(lumBufIndex < 2*vLumBufSize)
2752
ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
2753
ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
2754
RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
2755
flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
2756
funnyYCode, c->srcFormat, formatConvBuffer,
2757
c->lumMmx2Filter, c->lumMmx2FilterPos);
2760
while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH))
2762
uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2763
uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
2765
ASSERT(chrBufIndex < 2*vChrBufSize)
2766
ASSERT(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH)
2767
ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
2769
if(!(isGray(srcFormat) || isGray(dstFormat)))
2770
RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
2771
flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
2772
funnyUVCode, c->srcFormat, formatConvBuffer,
2773
c->chrMmx2Filter, c->chrMmx2FilterPos);
2776
//wrap buf index around to stay inside the ring buffer
2777
if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
2778
if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
2779
break; //we can't output a dstY line so let's try with the next slice
2783
b5Dither= dither8[dstY&1];
2784
g6Dither= dither4[dstY&1];
2785
g5Dither= dither8[dstY&1];
2786
r5Dither= dither8[(dstY+1)&1];
2790
int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2791
int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2794
for(i=0; i<vLumFilterSize; i++)
2796
lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
2797
lumMmxFilter[4*i+2]=
2798
lumMmxFilter[4*i+3]=
2799
((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
2801
for(i=0; i<vChrFilterSize; i++)
2803
chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
2804
chrMmxFilter[4*i+2]=
2805
chrMmxFilter[4*i+3]=
2806
((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
2809
if(dstFormat == IMGFMT_NV12 || dstFormat == IMGFMT_NV21){
2810
const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2811
if(dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
2812
RENAME(yuv2nv12X)(c,
2813
vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2814
vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2815
dest, uDest, dstW, chrDstW, dstFormat);
2817
else if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12 like
2819
const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2820
if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
2821
if(vLumFilterSize == 1 && vChrFilterSize == 1) // Unscaled YV12
2823
int16_t *lumBuf = lumPixBuf[0];
2824
int16_t *chrBuf= chrPixBuf[0];
2825
RENAME(yuv2yuv1)(lumBuf, chrBuf, dest, uDest, vDest, dstW, chrDstW);
2830
vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2831
vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2832
dest, uDest, vDest, dstW, chrDstW);
2837
ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2838
ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
2839
if(vLumFilterSize == 1 && vChrFilterSize == 2) //Unscaled RGB
2841
int chrAlpha= vChrFilter[2*dstY+1];
2842
RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
2843
dest, dstW, chrAlpha, dstFormat, flags, dstY);
2845
else if(vLumFilterSize == 2 && vChrFilterSize == 2) //BiLinear Upscale RGB
2847
int lumAlpha= vLumFilter[2*dstY+1];
2848
int chrAlpha= vChrFilter[2*dstY+1];
2849
RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
2850
dest, dstW, lumAlpha, chrAlpha, dstY);
2854
RENAME(yuv2packedX)(c,
2855
vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2856
vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2861
else // hmm looks like we can't use MMX here without overwriting this array's tail
2863
int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2864
int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2865
if(dstFormat == IMGFMT_NV12 || dstFormat == IMGFMT_NV21){
2866
const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2867
if(dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
2869
vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2870
vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2871
dest, uDest, dstW, chrDstW, dstFormat);
2873
else if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12
2875
const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2876
if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
2878
vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2879
vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2880
dest, uDest, vDest, dstW, chrDstW);
2884
ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2885
ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
2887
vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2888
vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2895
__asm __volatile(SFENCE:::"memory");
2896
__asm __volatile(EMMS:::"memory");
2898
/* store changed local vars back in the context */
2900
c->lumBufIndex= lumBufIndex;
2901
c->chrBufIndex= chrBufIndex;
2902
c->lastInLumBuf= lastInLumBuf;
2903
c->lastInChrBuf= lastInChrBuf;
2905
return dstY - lastDstY;