2
* Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
4
* This file is part of Libav.
6
* Libav is free software; you can redistribute it and/or
7
* modify it under the terms of the GNU Lesser General Public
8
* License as published by the Free Software Foundation; either
9
* version 2.1 of the License, or (at your option) any later version.
11
* Libav is distributed in the hope that it will be useful,
12
* but WITHOUT ANY WARRANTY; without even the implied warranty of
13
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
* Lesser General Public License for more details.
16
* You should have received a copy of the GNU Lesser General Public
17
* License along with Libav; if not, write to the Free Software
18
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21
#include "swscale_template.h"
28
/*
 * CPU-variant instruction-string helpers used by the inline asm below:
 *  - PREFETCH: "prefetch" (3DNow!), "prefetchnta" (MMX2), otherwise a nop.
 *  - PAVGB(a,b): byte average — "pavgb" (MMX2) or "pavgusb" (3DNow!).
 *  - MOVNTQ(a,b): non-temporal store "movntq" on MMX2, plain "movq"
 *    otherwise; MOVNTQ wraps REAL_MOVNTQ so arguments expand first.
 *
 * NOTE(review): the bare number lines interleaved through this file
 * (29, 30, ...) are the original file's own line numbers left behind by a
 * faulty extraction; they sever every '\'-continuation and preprocessor
 * block. The jumps in that numbering here (31 -> 33, 33 -> 36, 39 -> 42,
 * 43 -> 45, 45 -> 47) show the "#else"/"#endif" lines of these
 * conditionals were dropped — restore from the upstream file before this
 * can compile.
 */
#if COMPILE_TEMPLATE_AMD3DNOW
29
#define PREFETCH "prefetch"
30
#elif COMPILE_TEMPLATE_MMX2
31
#define PREFETCH "prefetchnta"
33
#define PREFETCH " # nop"
36
#if COMPILE_TEMPLATE_MMX2
37
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
38
#elif COMPILE_TEMPLATE_AMD3DNOW
39
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
42
#if COMPILE_TEMPLATE_MMX2
43
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
45
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
47
#define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
49
#define YSCALEYUV2YV12X(x, offset, dest, width) \
51
"xor %%"REG_a", %%"REG_a" \n\t"\
52
"movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
53
"movq %%mm3, %%mm4 \n\t"\
54
"lea " offset "(%0), %%"REG_d" \n\t"\
55
"mov (%%"REG_d"), %%"REG_S" \n\t"\
56
".p2align 4 \n\t" /* FIXME Unroll? */\
58
"movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
59
"movq " x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
60
"movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* srcData */\
61
"add $16, %%"REG_d" \n\t"\
62
"mov (%%"REG_d"), %%"REG_S" \n\t"\
63
"test %%"REG_S", %%"REG_S" \n\t"\
64
"pmulhw %%mm0, %%mm2 \n\t"\
65
"pmulhw %%mm0, %%mm5 \n\t"\
66
"paddw %%mm2, %%mm3 \n\t"\
67
"paddw %%mm5, %%mm4 \n\t"\
69
"psraw $3, %%mm3 \n\t"\
70
"psraw $3, %%mm4 \n\t"\
71
"packuswb %%mm4, %%mm3 \n\t"\
72
MOVNTQ(%%mm3, (%1, %%REGa))\
73
"add $8, %%"REG_a" \n\t"\
74
"cmp %2, %%"REG_a" \n\t"\
75
"movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
76
"movq %%mm3, %%mm4 \n\t"\
77
"lea " offset "(%0), %%"REG_d" \n\t"\
78
"mov (%%"REG_d"), %%"REG_S" \n\t"\
80
:: "r" (&c->redDither),\
81
"r" (dest), "g" ((x86_reg)width)\
82
: "%"REG_a, "%"REG_d, "%"REG_S\
85
#define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
87
"lea " offset "(%0), %%"REG_d" \n\t"\
88
"xor %%"REG_a", %%"REG_a" \n\t"\
89
"pxor %%mm4, %%mm4 \n\t"\
90
"pxor %%mm5, %%mm5 \n\t"\
91
"pxor %%mm6, %%mm6 \n\t"\
92
"pxor %%mm7, %%mm7 \n\t"\
93
"mov (%%"REG_d"), %%"REG_S" \n\t"\
96
"movq " x "(%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* srcData */\
97
"movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
98
"mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
99
"movq " x "(%%"REG_S", %%"REG_a", 2), %%mm1 \n\t" /* srcData */\
100
"movq %%mm0, %%mm3 \n\t"\
101
"punpcklwd %%mm1, %%mm0 \n\t"\
102
"punpckhwd %%mm1, %%mm3 \n\t"\
103
"movq "STR(APCK_COEF)"(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
104
"pmaddwd %%mm1, %%mm0 \n\t"\
105
"pmaddwd %%mm1, %%mm3 \n\t"\
106
"paddd %%mm0, %%mm4 \n\t"\
107
"paddd %%mm3, %%mm5 \n\t"\
108
"movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* srcData */\
109
"mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
110
"add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
111
"test %%"REG_S", %%"REG_S" \n\t"\
112
"movq %%mm2, %%mm0 \n\t"\
113
"punpcklwd %%mm3, %%mm2 \n\t"\
114
"punpckhwd %%mm3, %%mm0 \n\t"\
115
"pmaddwd %%mm1, %%mm2 \n\t"\
116
"pmaddwd %%mm1, %%mm0 \n\t"\
117
"paddd %%mm2, %%mm6 \n\t"\
118
"paddd %%mm0, %%mm7 \n\t"\
120
"psrad $16, %%mm4 \n\t"\
121
"psrad $16, %%mm5 \n\t"\
122
"psrad $16, %%mm6 \n\t"\
123
"psrad $16, %%mm7 \n\t"\
124
"movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
125
"packssdw %%mm5, %%mm4 \n\t"\
126
"packssdw %%mm7, %%mm6 \n\t"\
127
"paddw %%mm0, %%mm4 \n\t"\
128
"paddw %%mm0, %%mm6 \n\t"\
129
"psraw $3, %%mm4 \n\t"\
130
"psraw $3, %%mm6 \n\t"\
131
"packuswb %%mm6, %%mm4 \n\t"\
132
MOVNTQ(%%mm4, (%1, %%REGa))\
133
"add $8, %%"REG_a" \n\t"\
134
"cmp %2, %%"REG_a" \n\t"\
135
"lea " offset "(%0), %%"REG_d" \n\t"\
136
"pxor %%mm4, %%mm4 \n\t"\
137
"pxor %%mm5, %%mm5 \n\t"\
138
"pxor %%mm6, %%mm6 \n\t"\
139
"pxor %%mm7, %%mm7 \n\t"\
140
"mov (%%"REG_d"), %%"REG_S" \n\t"\
142
:: "r" (&c->redDither),\
143
"r" (dest), "g" ((x86_reg)width)\
144
: "%"REG_a, "%"REG_d, "%"REG_S\
147
/*
 * YSCALEYUV2YV121: emit one plane with no vertical filtering. Per
 * iteration: load 8 sixteen-bit intermediates from %0 at REG_a*2,
 * shift down by 7 (drop the fixed-point fraction), pack to unsigned
 * bytes and store 8 pixels via MOVNTQ at (%1, REG_a). REG_a starts
 * from %2 and advances by 8 — presumably a negative offset counting
 * up to zero; confirm against the asm operand list (not visible here).
 * NOTE(review): gaps in the embedded numbering (orig 150 and 158-159
 * absent) indicate the "1:" loop label and the closing "jb 1b" branch
 * lines were dropped by the extraction.
 */
#define YSCALEYUV2YV121 \
148
"mov %2, %%"REG_a" \n\t"\
149
".p2align 4 \n\t" /* FIXME Unroll? */\
151
"movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
152
"movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
153
"psraw $7, %%mm0 \n\t"\
154
"psraw $7, %%mm1 \n\t"\
155
"packuswb %%mm1, %%mm0 \n\t"\
156
MOVNTQ(%%mm0, (%1, %%REGa))\
157
"add $8, %%"REG_a" \n\t"\
160
/*
 * YSCALEYUV2YV121_ACCURATE: like YSCALEYUV2YV121 but rounds to nearest
 * instead of truncating. mm7 is built as the per-word constant 0x0040
 * (pcmpeqw -> all ones, psrlw $15 -> 0x0001, psllw $6 -> 0x0040), i.e.
 * half of the 1<<7 quantum, added with signed saturation before the
 * final ">>7" and byte pack.
 * NOTE(review): embedded-numbering gaps (orig 166 loop label, 176-178
 * branch/close lines absent) — dropped by the extraction, restore from
 * upstream.
 */
#define YSCALEYUV2YV121_ACCURATE \
161
"mov %2, %%"REG_a" \n\t"\
162
"pcmpeqw %%mm7, %%mm7 \n\t"\
163
"psrlw $15, %%mm7 \n\t"\
164
"psllw $6, %%mm7 \n\t"\
165
".p2align 4 \n\t" /* FIXME Unroll? */\
167
"movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
168
"movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
169
"paddsw %%mm7, %%mm0 \n\t"\
170
"paddsw %%mm7, %%mm1 \n\t"\
171
"psraw $7, %%mm0 \n\t"\
172
"psraw $7, %%mm1 \n\t"\
173
"packuswb %%mm1, %%mm0 \n\t"\
174
MOVNTQ(%%mm0, (%1, %%REGa))\
175
"add $8, %%"REG_a" \n\t"\
179
:: "m" (-lumFilterSize), "m" (-chrFilterSize),
180
"m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
181
"r" (dest), "m" (dstW_reg),
182
"m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
183
: "%eax", "%ebx", "%ecx", "%edx", "%esi"
185
#define YSCALEYUV2PACKEDX_UV \
187
"xor %%"REG_a", %%"REG_a" \n\t"\
191
"lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
192
"mov (%%"REG_d"), %%"REG_S" \n\t"\
193
"movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
194
"movq %%mm3, %%mm4 \n\t"\
197
"movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
198
"movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
199
"movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
200
"add $16, %%"REG_d" \n\t"\
201
"mov (%%"REG_d"), %%"REG_S" \n\t"\
202
"pmulhw %%mm0, %%mm2 \n\t"\
203
"pmulhw %%mm0, %%mm5 \n\t"\
204
"paddw %%mm2, %%mm3 \n\t"\
205
"paddw %%mm5, %%mm4 \n\t"\
206
"test %%"REG_S", %%"REG_S" \n\t"\
209
#define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
210
"lea "offset"(%0), %%"REG_d" \n\t"\
211
"mov (%%"REG_d"), %%"REG_S" \n\t"\
212
"movq "VROUNDER_OFFSET"(%0), "#dst1" \n\t"\
213
"movq "#dst1", "#dst2" \n\t"\
216
"movq 8(%%"REG_d"), "#coeff" \n\t" /* filterCoeff */\
217
"movq (%%"REG_S", %%"REG_a", 2), "#src1" \n\t" /* Y1srcData */\
218
"movq 8(%%"REG_S", %%"REG_a", 2), "#src2" \n\t" /* Y2srcData */\
219
"add $16, %%"REG_d" \n\t"\
220
"mov (%%"REG_d"), %%"REG_S" \n\t"\
221
"pmulhw "#coeff", "#src1" \n\t"\
222
"pmulhw "#coeff", "#src2" \n\t"\
223
"paddw "#src1", "#dst1" \n\t"\
224
"paddw "#src2", "#dst2" \n\t"\
225
"test %%"REG_S", %%"REG_S" \n\t"\
228
#define YSCALEYUV2PACKEDX \
229
YSCALEYUV2PACKEDX_UV \
230
YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \
232
#define YSCALEYUV2PACKEDX_END \
233
:: "r" (&c->redDither), \
234
"m" (dummy), "m" (dummy), "m" (dummy),\
235
"r" (dest), "m" (dstW_reg) \
236
: "%"REG_a, "%"REG_d, "%"REG_S \
239
#define YSCALEYUV2PACKEDX_ACCURATE_UV \
241
"xor %%"REG_a", %%"REG_a" \n\t"\
245
"lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
246
"mov (%%"REG_d"), %%"REG_S" \n\t"\
247
"pxor %%mm4, %%mm4 \n\t"\
248
"pxor %%mm5, %%mm5 \n\t"\
249
"pxor %%mm6, %%mm6 \n\t"\
250
"pxor %%mm7, %%mm7 \n\t"\
253
"movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
254
"movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
255
"mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
256
"movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
257
"movq %%mm0, %%mm3 \n\t"\
258
"punpcklwd %%mm1, %%mm0 \n\t"\
259
"punpckhwd %%mm1, %%mm3 \n\t"\
260
"movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1 \n\t" /* filterCoeff */\
261
"pmaddwd %%mm1, %%mm0 \n\t"\
262
"pmaddwd %%mm1, %%mm3 \n\t"\
263
"paddd %%mm0, %%mm4 \n\t"\
264
"paddd %%mm3, %%mm5 \n\t"\
265
"movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
266
"mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
267
"add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
268
"test %%"REG_S", %%"REG_S" \n\t"\
269
"movq %%mm2, %%mm0 \n\t"\
270
"punpcklwd %%mm3, %%mm2 \n\t"\
271
"punpckhwd %%mm3, %%mm0 \n\t"\
272
"pmaddwd %%mm1, %%mm2 \n\t"\
273
"pmaddwd %%mm1, %%mm0 \n\t"\
274
"paddd %%mm2, %%mm6 \n\t"\
275
"paddd %%mm0, %%mm7 \n\t"\
277
"psrad $16, %%mm4 \n\t"\
278
"psrad $16, %%mm5 \n\t"\
279
"psrad $16, %%mm6 \n\t"\
280
"psrad $16, %%mm7 \n\t"\
281
"movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
282
"packssdw %%mm5, %%mm4 \n\t"\
283
"packssdw %%mm7, %%mm6 \n\t"\
284
"paddw %%mm0, %%mm4 \n\t"\
285
"paddw %%mm0, %%mm6 \n\t"\
286
"movq %%mm4, "U_TEMP"(%0) \n\t"\
287
"movq %%mm6, "V_TEMP"(%0) \n\t"\
289
#define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
290
"lea "offset"(%0), %%"REG_d" \n\t"\
291
"mov (%%"REG_d"), %%"REG_S" \n\t"\
292
"pxor %%mm1, %%mm1 \n\t"\
293
"pxor %%mm5, %%mm5 \n\t"\
294
"pxor %%mm7, %%mm7 \n\t"\
295
"pxor %%mm6, %%mm6 \n\t"\
298
"movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
299
"movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
300
"mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
301
"movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
302
"movq %%mm0, %%mm3 \n\t"\
303
"punpcklwd %%mm4, %%mm0 \n\t"\
304
"punpckhwd %%mm4, %%mm3 \n\t"\
305
"movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
306
"pmaddwd %%mm4, %%mm0 \n\t"\
307
"pmaddwd %%mm4, %%mm3 \n\t"\
308
"paddd %%mm0, %%mm1 \n\t"\
309
"paddd %%mm3, %%mm5 \n\t"\
310
"movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
311
"mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
312
"add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
313
"test %%"REG_S", %%"REG_S" \n\t"\
314
"movq %%mm2, %%mm0 \n\t"\
315
"punpcklwd %%mm3, %%mm2 \n\t"\
316
"punpckhwd %%mm3, %%mm0 \n\t"\
317
"pmaddwd %%mm4, %%mm2 \n\t"\
318
"pmaddwd %%mm4, %%mm0 \n\t"\
319
"paddd %%mm2, %%mm7 \n\t"\
320
"paddd %%mm0, %%mm6 \n\t"\
322
"psrad $16, %%mm1 \n\t"\
323
"psrad $16, %%mm5 \n\t"\
324
"psrad $16, %%mm7 \n\t"\
325
"psrad $16, %%mm6 \n\t"\
326
"movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
327
"packssdw %%mm5, %%mm1 \n\t"\
328
"packssdw %%mm6, %%mm7 \n\t"\
329
"paddw %%mm0, %%mm1 \n\t"\
330
"paddw %%mm0, %%mm7 \n\t"\
331
"movq "U_TEMP"(%0), %%mm3 \n\t"\
332
"movq "V_TEMP"(%0), %%mm4 \n\t"\
334
#define YSCALEYUV2PACKEDX_ACCURATE \
335
YSCALEYUV2PACKEDX_ACCURATE_UV \
336
YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
338
#define YSCALEYUV2RGBX \
339
"psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
340
"psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
341
"movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
342
"movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
343
"pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
344
"pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
345
/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
346
"pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
347
"pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
348
"psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
349
"psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
350
"pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
351
"pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
352
/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
353
"paddw %%mm3, %%mm4 \n\t"\
354
"movq %%mm2, %%mm0 \n\t"\
355
"movq %%mm5, %%mm6 \n\t"\
356
"movq %%mm4, %%mm3 \n\t"\
357
"punpcklwd %%mm2, %%mm2 \n\t"\
358
"punpcklwd %%mm5, %%mm5 \n\t"\
359
"punpcklwd %%mm4, %%mm4 \n\t"\
360
"paddw %%mm1, %%mm2 \n\t"\
361
"paddw %%mm1, %%mm5 \n\t"\
362
"paddw %%mm1, %%mm4 \n\t"\
363
"punpckhwd %%mm0, %%mm0 \n\t"\
364
"punpckhwd %%mm6, %%mm6 \n\t"\
365
"punpckhwd %%mm3, %%mm3 \n\t"\
366
"paddw %%mm7, %%mm0 \n\t"\
367
"paddw %%mm7, %%mm6 \n\t"\
368
"paddw %%mm7, %%mm3 \n\t"\
369
/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
370
"packuswb %%mm0, %%mm2 \n\t"\
371
"packuswb %%mm6, %%mm5 \n\t"\
372
"packuswb %%mm3, %%mm4 \n\t"\
374
#define REAL_YSCALEYUV2PACKED(index, c) \
375
"movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
376
"movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
377
"psraw $3, %%mm0 \n\t"\
378
"psraw $3, %%mm1 \n\t"\
379
"movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
380
"movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
381
"xor "#index", "#index" \n\t"\
384
"movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
385
"movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
386
"movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
387
"movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
388
"psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
389
"psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
390
"movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
391
"pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
392
"pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
393
"psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
394
"psraw $7, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
395
"paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
396
"paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
397
"movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
398
"movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
399
"movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
400
"movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
401
"psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
402
"psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
403
"pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
404
"pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
405
"psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
406
"psraw $7, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
407
"paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
408
"paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
410
#define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
412
#define REAL_YSCALEYUV2RGB_UV(index, c) \
413
"xor "#index", "#index" \n\t"\
416
"movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
417
"movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
418
"movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
419
"movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
420
"psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
421
"psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
422
"movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
423
"pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
424
"pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
425
"psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
426
"psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
427
"paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
428
"paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
429
"psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
430
"psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
431
"movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
432
"movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
433
"pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
434
"pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
435
/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
437
#define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
438
"movq ("#b1", "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
439
"movq ("#b2", "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
440
"movq 8("#b1", "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
441
"movq 8("#b2", "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
442
"psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
443
"psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
444
"pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
445
"pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
446
"psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
447
"psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
448
"paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
449
"paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
451
#define REAL_YSCALEYUV2RGB_COEFF(c) \
452
"pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
453
"pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
454
"psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
455
"psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
456
"pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
457
"pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
458
/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
459
"paddw %%mm3, %%mm4 \n\t"\
460
"movq %%mm2, %%mm0 \n\t"\
461
"movq %%mm5, %%mm6 \n\t"\
462
"movq %%mm4, %%mm3 \n\t"\
463
"punpcklwd %%mm2, %%mm2 \n\t"\
464
"punpcklwd %%mm5, %%mm5 \n\t"\
465
"punpcklwd %%mm4, %%mm4 \n\t"\
466
"paddw %%mm1, %%mm2 \n\t"\
467
"paddw %%mm1, %%mm5 \n\t"\
468
"paddw %%mm1, %%mm4 \n\t"\
469
"punpckhwd %%mm0, %%mm0 \n\t"\
470
"punpckhwd %%mm6, %%mm6 \n\t"\
471
"punpckhwd %%mm3, %%mm3 \n\t"\
472
"paddw %%mm7, %%mm0 \n\t"\
473
"paddw %%mm7, %%mm6 \n\t"\
474
"paddw %%mm7, %%mm3 \n\t"\
475
/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
476
"packuswb %%mm0, %%mm2 \n\t"\
477
"packuswb %%mm6, %%mm5 \n\t"\
478
"packuswb %%mm3, %%mm4 \n\t"\
480
#define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)
482
#define YSCALEYUV2RGB(index, c) \
483
REAL_YSCALEYUV2RGB_UV(index, c) \
484
REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
485
REAL_YSCALEYUV2RGB_COEFF(c)
487
#define REAL_YSCALEYUV2PACKED1(index, c) \
488
"xor "#index", "#index" \n\t"\
491
"movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
492
"movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
493
"psraw $7, %%mm3 \n\t" \
494
"psraw $7, %%mm4 \n\t" \
495
"movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
496
"movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
497
"psraw $7, %%mm1 \n\t" \
498
"psraw $7, %%mm7 \n\t" \
500
#define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
502
#define REAL_YSCALEYUV2RGB1(index, c) \
503
"xor "#index", "#index" \n\t"\
506
"movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
507
"movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
508
"psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
509
"psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
510
"psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
511
"psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
512
"movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
513
"movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
514
"pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
515
"pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
516
/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
517
"movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
518
"movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
519
"psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
520
"psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
521
"pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
522
"pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
523
"psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
524
"psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
525
"pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
526
"pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
527
/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
528
"paddw %%mm3, %%mm4 \n\t"\
529
"movq %%mm2, %%mm0 \n\t"\
530
"movq %%mm5, %%mm6 \n\t"\
531
"movq %%mm4, %%mm3 \n\t"\
532
"punpcklwd %%mm2, %%mm2 \n\t"\
533
"punpcklwd %%mm5, %%mm5 \n\t"\
534
"punpcklwd %%mm4, %%mm4 \n\t"\
535
"paddw %%mm1, %%mm2 \n\t"\
536
"paddw %%mm1, %%mm5 \n\t"\
537
"paddw %%mm1, %%mm4 \n\t"\
538
"punpckhwd %%mm0, %%mm0 \n\t"\
539
"punpckhwd %%mm6, %%mm6 \n\t"\
540
"punpckhwd %%mm3, %%mm3 \n\t"\
541
"paddw %%mm7, %%mm0 \n\t"\
542
"paddw %%mm7, %%mm6 \n\t"\
543
"paddw %%mm7, %%mm3 \n\t"\
544
/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
545
"packuswb %%mm0, %%mm2 \n\t"\
546
"packuswb %%mm6, %%mm5 \n\t"\
547
"packuswb %%mm3, %%mm4 \n\t"\
549
#define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
551
#define REAL_YSCALEYUV2PACKED1b(index, c) \
552
"xor "#index", "#index" \n\t"\
555
"movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
556
"movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
557
"movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
558
"movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
559
"paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
560
"paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
561
"psrlw $8, %%mm3 \n\t" \
562
"psrlw $8, %%mm4 \n\t" \
563
"movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
564
"movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
565
"psraw $7, %%mm1 \n\t" \
566
"psraw $7, %%mm7 \n\t"
567
#define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
569
// do vertical chrominance interpolation
570
#define REAL_YSCALEYUV2RGB1b(index, c) \
571
"xor "#index", "#index" \n\t"\
574
"movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
575
"movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
576
"movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
577
"movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
578
"paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
579
"paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
580
"psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
581
"psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
582
"psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
583
"psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
584
"movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
585
"movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
586
"pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
587
"pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
588
/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
589
"movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
590
"movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
591
"psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
592
"psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
593
"pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
594
"pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
595
"psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
596
"psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
597
"pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
598
"pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
599
/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
600
"paddw %%mm3, %%mm4 \n\t"\
601
"movq %%mm2, %%mm0 \n\t"\
602
"movq %%mm5, %%mm6 \n\t"\
603
"movq %%mm4, %%mm3 \n\t"\
604
"punpcklwd %%mm2, %%mm2 \n\t"\
605
"punpcklwd %%mm5, %%mm5 \n\t"\
606
"punpcklwd %%mm4, %%mm4 \n\t"\
607
"paddw %%mm1, %%mm2 \n\t"\
608
"paddw %%mm1, %%mm5 \n\t"\
609
"paddw %%mm1, %%mm4 \n\t"\
610
"punpckhwd %%mm0, %%mm0 \n\t"\
611
"punpckhwd %%mm6, %%mm6 \n\t"\
612
"punpckhwd %%mm3, %%mm3 \n\t"\
613
"paddw %%mm7, %%mm0 \n\t"\
614
"paddw %%mm7, %%mm6 \n\t"\
615
"paddw %%mm7, %%mm3 \n\t"\
616
/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
617
"packuswb %%mm0, %%mm2 \n\t"\
618
"packuswb %%mm6, %%mm5 \n\t"\
619
"packuswb %%mm3, %%mm4 \n\t"\
621
#define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
623
/*
 * REAL_YSCALEYUV2RGB1_ALPHA: fetch 8 alpha intermediates for the
 * unscaled (single-source) RGB path. Loads abuf0 (asm operand %1) at
 * index*2, shifts the 16-bit values down by 7 and packs them to 8
 * unsigned bytes in mm7 for the subsequent ARGB interleave.
 * The YSCALEYUV2RGB1_ALPHA wrapper exists so the index argument is
 * macro-expanded before stringification.
 * NOTE(review): the bare number lines (624, 625, ...) are extraction
 * artifacts of the original line numbers; they sever the '\'
 * continuations and must be removed before this compiles — kept
 * byte-identical here.
 */
#define REAL_YSCALEYUV2RGB1_ALPHA(index) \
624
"movq (%1, "#index", 2), %%mm7 \n\t" /* abuf0[index  ] */\
625
"movq 8(%1, "#index", 2), %%mm1 \n\t" /* abuf0[index+4] */\
626
"psraw $7, %%mm7 \n\t" /* abuf0[index  ] >>7 */\
627
"psraw $7, %%mm1 \n\t" /* abuf0[index+4] >>7 */\
628
"packuswb %%mm1, %%mm7 \n\t"
629
#define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)
631
#define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
632
"movq "#b", "#q2" \n\t" /* B */\
633
"movq "#r", "#t" \n\t" /* R */\
634
"punpcklbw "#g", "#b" \n\t" /* GBGBGBGB 0 */\
635
"punpcklbw "#a", "#r" \n\t" /* ARARARAR 0 */\
636
"punpckhbw "#g", "#q2" \n\t" /* GBGBGBGB 2 */\
637
"punpckhbw "#a", "#t" \n\t" /* ARARARAR 2 */\
638
"movq "#b", "#q0" \n\t" /* GBGBGBGB 0 */\
639
"movq "#q2", "#q3" \n\t" /* GBGBGBGB 2 */\
640
"punpcklwd "#r", "#q0" \n\t" /* ARGBARGB 0 */\
641
"punpckhwd "#r", "#b" \n\t" /* ARGBARGB 1 */\
642
"punpcklwd "#t", "#q2" \n\t" /* ARGBARGB 2 */\
643
"punpckhwd "#t", "#q3" \n\t" /* ARGBARGB 3 */\
645
MOVNTQ( q0, (dst, index, 4))\
646
MOVNTQ( b, 8(dst, index, 4))\
647
MOVNTQ( q2, 16(dst, index, 4))\
648
MOVNTQ( q3, 24(dst, index, 4))\
650
"add $8, "#index" \n\t"\
651
"cmp "#dstw", "#index" \n\t"\
653
#define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
655
#define REAL_WRITERGB16(dst, dstw, index) \
656
"pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
657
"pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
658
"pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
659
"psrlq $3, %%mm2 \n\t"\
661
"movq %%mm2, %%mm1 \n\t"\
662
"movq %%mm4, %%mm3 \n\t"\
664
"punpcklbw %%mm7, %%mm3 \n\t"\
665
"punpcklbw %%mm5, %%mm2 \n\t"\
666
"punpckhbw %%mm7, %%mm4 \n\t"\
667
"punpckhbw %%mm5, %%mm1 \n\t"\
669
"psllq $3, %%mm3 \n\t"\
670
"psllq $3, %%mm4 \n\t"\
672
"por %%mm3, %%mm2 \n\t"\
673
"por %%mm4, %%mm1 \n\t"\
675
MOVNTQ(%%mm2, (dst, index, 2))\
676
MOVNTQ(%%mm1, 8(dst, index, 2))\
678
"add $8, "#index" \n\t"\
679
"cmp "#dstw", "#index" \n\t"\
681
#define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)
683
#define REAL_WRITERGB15(dst, dstw, index) \
684
"pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
685
"pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
686
"pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
687
"psrlq $3, %%mm2 \n\t"\
688
"psrlq $1, %%mm5 \n\t"\
690
"movq %%mm2, %%mm1 \n\t"\
691
"movq %%mm4, %%mm3 \n\t"\
693
"punpcklbw %%mm7, %%mm3 \n\t"\
694
"punpcklbw %%mm5, %%mm2 \n\t"\
695
"punpckhbw %%mm7, %%mm4 \n\t"\
696
"punpckhbw %%mm5, %%mm1 \n\t"\
698
"psllq $2, %%mm3 \n\t"\
699
"psllq $2, %%mm4 \n\t"\
701
"por %%mm3, %%mm2 \n\t"\
702
"por %%mm4, %%mm1 \n\t"\
704
MOVNTQ(%%mm2, (dst, index, 2))\
705
MOVNTQ(%%mm1, 8(dst, index, 2))\
707
"add $8, "#index" \n\t"\
708
"cmp "#dstw", "#index" \n\t"\
710
#define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)
712
#define WRITEBGR24OLD(dst, dstw, index) \
713
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
714
"movq %%mm2, %%mm1 \n\t" /* B */\
715
"movq %%mm5, %%mm6 \n\t" /* R */\
716
"punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
717
"punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
718
"punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
719
"punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
720
"movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
721
"movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
722
"punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
723
"punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
724
"punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
725
"punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
727
"movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
728
"psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
729
"pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 0 */\
730
"pand "MANGLE(bm11111000)", %%mm0 \n\t" /* 00RGB000 0.5 */\
731
"por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
732
"movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
733
"psllq $48, %%mm2 \n\t" /* GB000000 1 */\
734
"por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
736
"movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
737
"psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
738
"psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
739
"por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
740
"pand "MANGLE(bm00001111)", %%mm2 \n\t" /* 0000RGBR 1 */\
741
"movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
742
"psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
743
"pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 2 */\
744
"pand "MANGLE(bm11111000)", %%mm1 \n\t" /* 00RGB000 2.5 */\
745
"por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
746
"movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
747
"psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
748
"por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
750
"psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
751
"movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
752
"psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
753
"pand "MANGLE(bm00000111)", %%mm5 \n\t" /* 00000RGB 3 */\
754
"pand "MANGLE(bm11111000)", %%mm3 \n\t" /* 00RGB000 3.5 */\
755
"por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
756
"psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
757
"por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
759
MOVNTQ(%%mm0, (dst))\
760
MOVNTQ(%%mm2, 8(dst))\
761
MOVNTQ(%%mm3, 16(dst))\
762
"add $24, "#dst" \n\t"\
764
"add $8, "#index" \n\t"\
765
"cmp "#dstw", "#index" \n\t"\
768
#define WRITEBGR24MMX(dst, dstw, index) \
769
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
770
"movq %%mm2, %%mm1 \n\t" /* B */\
771
"movq %%mm5, %%mm6 \n\t" /* R */\
772
"punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
773
"punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
774
"punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
775
"punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
776
"movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
777
"movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
778
"punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
779
"punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
780
"punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
781
"punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
783
"movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
784
"movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
785
"movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
786
"movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
788
"psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
789
"psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
790
"psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
791
"psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
793
"punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
794
"punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
795
"punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
796
"punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
798
"psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
799
"movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
800
"psllq $40, %%mm2 \n\t" /* GB000000 1 */\
801
"por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
802
MOVNTQ(%%mm0, (dst))\
804
"psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
805
"movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
806
"psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
807
"por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
808
MOVNTQ(%%mm6, 8(dst))\
810
"psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
811
"psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
812
"por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
813
MOVNTQ(%%mm5, 16(dst))\
815
"add $24, "#dst" \n\t"\
817
"add $8, "#index" \n\t"\
818
"cmp "#dstw", "#index" \n\t"\
821
#define WRITEBGR24MMX2(dst, dstw, index) \
822
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
823
"movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
824
"movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
825
"pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
826
"pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
827
"pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
829
"pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
830
"pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
831
"pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
833
"psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
834
"por %%mm1, %%mm6 \n\t"\
835
"por %%mm3, %%mm6 \n\t"\
836
MOVNTQ(%%mm6, (dst))\
838
"psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
839
"pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
840
"pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
841
"pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
843
"pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
844
"pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
845
"pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
847
"por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
848
"por %%mm3, %%mm6 \n\t"\
849
MOVNTQ(%%mm6, 8(dst))\
851
"pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
852
"pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
853
"pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
855
"pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
856
"pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
857
"pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
859
"por %%mm1, %%mm3 \n\t"\
860
"por %%mm3, %%mm6 \n\t"\
861
MOVNTQ(%%mm6, 16(dst))\
863
"add $24, "#dst" \n\t"\
865
"add $8, "#index" \n\t"\
866
"cmp "#dstw", "#index" \n\t"\
869
#if COMPILE_TEMPLATE_MMX2
871
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
874
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
877
#define REAL_WRITEYUY2(dst, dstw, index) \
878
"packuswb %%mm3, %%mm3 \n\t"\
879
"packuswb %%mm4, %%mm4 \n\t"\
880
"packuswb %%mm7, %%mm1 \n\t"\
881
"punpcklbw %%mm4, %%mm3 \n\t"\
882
"movq %%mm1, %%mm7 \n\t"\
883
"punpcklbw %%mm3, %%mm1 \n\t"\
884
"punpckhbw %%mm3, %%mm7 \n\t"\
886
MOVNTQ(%%mm1, (dst, index, 2))\
887
MOVNTQ(%%mm7, 8(dst, index, 2))\
889
"add $8, "#index" \n\t"\
890
"cmp "#dstw", "#index" \n\t"\
892
#define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
895
static inline void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
896
const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize, const int16_t **alpSrc,
897
uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
899
if(!(c->flags & SWS_BITEXACT)) {
900
if (c->flags & SWS_ACCURATE_RND) {
902
YSCALEYUV2YV12X_ACCURATE( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
903
YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
905
if (CONFIG_SWSCALE_ALPHA && aDest) {
906
YSCALEYUV2YV12X_ACCURATE( "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
909
YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
912
YSCALEYUV2YV12X( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
913
YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
915
if (CONFIG_SWSCALE_ALPHA && aDest) {
916
YSCALEYUV2YV12X( "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
919
YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
923
yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
924
chrFilter, chrSrc, chrFilterSize,
925
alpSrc, dest, uDest, vDest, aDest, dstW, chrDstW);
928
static inline void RENAME(yuv2nv12X)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
929
const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
930
uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, enum PixelFormat dstFormat)
932
yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
933
chrFilter, chrSrc, chrFilterSize,
934
dest, uDest, dstW, chrDstW, dstFormat);
937
static inline void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc, const int16_t *chrSrc, const int16_t *alpSrc,
938
uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
941
if(!(c->flags & SWS_BITEXACT)) {
943
const uint8_t *src[4]= {alpSrc + dstW, lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW};
944
uint8_t *dst[4]= {aDest, dest, uDest, vDest};
945
x86_reg counter[4]= {dstW, dstW, chrDstW, chrDstW};
947
if (c->flags & SWS_ACCURATE_RND) {
951
YSCALEYUV2YV121_ACCURATE
952
:: "r" (src[p]), "r" (dst[p] + counter[p]),
963
:: "r" (src[p]), "r" (dst[p] + counter[p]),
972
for (i=0; i<dstW; i++) {
973
int val= (lumSrc[i]+64)>>7;
984
for (i=0; i<chrDstW; i++) {
985
int u=(chrSrc[i ]+64)>>7;
986
int v=(chrSrc[i + VOFW]+64)>>7;
990
else if (u>255) u=255;
992
else if (v>255) v=255;
999
if (CONFIG_SWSCALE_ALPHA && aDest)
1000
for (i=0; i<dstW; i++) {
1001
int val= (alpSrc[i]+64)>>7;
1002
aDest[i]= av_clip_uint8(val);
1008
* vertical scale YV12 to RGB
1010
static inline void RENAME(yuv2packedX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
1011
const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
1012
const int16_t **alpSrc, uint8_t *dest, long dstW, long dstY)
1015
x86_reg dstW_reg = dstW;
1016
if(!(c->flags & SWS_BITEXACT)) {
1017
if (c->flags & SWS_ACCURATE_RND) {
1018
switch(c->dstFormat) {
1020
if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1021
YSCALEYUV2PACKEDX_ACCURATE
1023
"movq %%mm2, "U_TEMP"(%0) \n\t"
1024
"movq %%mm4, "V_TEMP"(%0) \n\t"
1025
"movq %%mm5, "Y_TEMP"(%0) \n\t"
1026
YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET)
1027
"movq "Y_TEMP"(%0), %%mm5 \n\t"
1028
"psraw $3, %%mm1 \n\t"
1029
"psraw $3, %%mm7 \n\t"
1030
"packuswb %%mm7, %%mm1 \n\t"
1031
WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
1033
YSCALEYUV2PACKEDX_END
1035
YSCALEYUV2PACKEDX_ACCURATE
1037
"pcmpeqd %%mm7, %%mm7 \n\t"
1038
WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1040
YSCALEYUV2PACKEDX_END
1044
YSCALEYUV2PACKEDX_ACCURATE
1046
"pxor %%mm7, %%mm7 \n\t"
1047
"lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
1048
"add %4, %%"REG_c" \n\t"
1049
WRITEBGR24(%%REGc, %5, %%REGa)
1052
:: "r" (&c->redDither),
1053
"m" (dummy), "m" (dummy), "m" (dummy),
1054
"r" (dest), "m" (dstW_reg)
1055
: "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1058
case PIX_FMT_RGB555:
1059
YSCALEYUV2PACKEDX_ACCURATE
1061
"pxor %%mm7, %%mm7 \n\t"
1062
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1064
"paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
1065
"paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
1066
"paddusb "RED_DITHER"(%0), %%mm5\n\t"
1069
WRITERGB15(%4, %5, %%REGa)
1070
YSCALEYUV2PACKEDX_END
1072
case PIX_FMT_RGB565:
1073
YSCALEYUV2PACKEDX_ACCURATE
1075
"pxor %%mm7, %%mm7 \n\t"
1076
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1078
"paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
1079
"paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
1080
"paddusb "RED_DITHER"(%0), %%mm5\n\t"
1083
WRITERGB16(%4, %5, %%REGa)
1084
YSCALEYUV2PACKEDX_END
1086
case PIX_FMT_YUYV422:
1087
YSCALEYUV2PACKEDX_ACCURATE
1088
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1090
"psraw $3, %%mm3 \n\t"
1091
"psraw $3, %%mm4 \n\t"
1092
"psraw $3, %%mm1 \n\t"
1093
"psraw $3, %%mm7 \n\t"
1094
WRITEYUY2(%4, %5, %%REGa)
1095
YSCALEYUV2PACKEDX_END
1099
switch(c->dstFormat) {
1101
if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1104
YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
1105
"psraw $3, %%mm1 \n\t"
1106
"psraw $3, %%mm7 \n\t"
1107
"packuswb %%mm7, %%mm1 \n\t"
1108
WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1109
YSCALEYUV2PACKEDX_END
1113
"pcmpeqd %%mm7, %%mm7 \n\t"
1114
WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1115
YSCALEYUV2PACKEDX_END
1121
"pxor %%mm7, %%mm7 \n\t"
1122
"lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
1123
"add %4, %%"REG_c" \n\t"
1124
WRITEBGR24(%%REGc, %5, %%REGa)
1126
:: "r" (&c->redDither),
1127
"m" (dummy), "m" (dummy), "m" (dummy),
1128
"r" (dest), "m" (dstW_reg)
1129
: "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1132
case PIX_FMT_RGB555:
1135
"pxor %%mm7, %%mm7 \n\t"
1136
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1138
"paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
1139
"paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
1140
"paddusb "RED_DITHER"(%0), %%mm5 \n\t"
1143
WRITERGB15(%4, %5, %%REGa)
1144
YSCALEYUV2PACKEDX_END
1146
case PIX_FMT_RGB565:
1149
"pxor %%mm7, %%mm7 \n\t"
1150
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1152
"paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
1153
"paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
1154
"paddusb "RED_DITHER"(%0), %%mm5 \n\t"
1157
WRITERGB16(%4, %5, %%REGa)
1158
YSCALEYUV2PACKEDX_END
1160
case PIX_FMT_YUYV422:
1162
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1164
"psraw $3, %%mm3 \n\t"
1165
"psraw $3, %%mm4 \n\t"
1166
"psraw $3, %%mm1 \n\t"
1167
"psraw $3, %%mm7 \n\t"
1168
WRITEYUY2(%4, %5, %%REGa)
1169
YSCALEYUV2PACKEDX_END
1174
yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
1175
chrFilter, chrSrc, chrFilterSize,
1176
alpSrc, dest, dstW, dstY);
1180
* vertical bilinear scale YV12 to RGB
1182
static inline void RENAME(yuv2packed2)(SwsContext *c, const uint16_t *buf0, const uint16_t *buf1, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
1183
const uint16_t *abuf0, const uint16_t *abuf1, uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
1185
int yalpha1=4095- yalpha;
1186
int uvalpha1=4095-uvalpha;
1189
if(!(c->flags & SWS_BITEXACT)) {
1190
switch(c->dstFormat) {
1191
//Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1193
if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1196
YSCALEYUV2RGB(%%r8, %5)
1197
YSCALEYUV2RGB_YA(%%r8, %5, %6, %7)
1198
"psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1199
"psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1200
"packuswb %%mm7, %%mm1 \n\t"
1201
WRITEBGR32(%4, 8280(%5), %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1203
:: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "r" (dest),
1205
,"r" (abuf0), "r" (abuf1)
1209
*(const uint16_t **)(&c->u_temp)=abuf0;
1210
*(const uint16_t **)(&c->v_temp)=abuf1;
1212
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1213
"mov %4, %%"REG_b" \n\t"
1214
"push %%"REG_BP" \n\t"
1215
YSCALEYUV2RGB(%%REGBP, %5)
1218
"mov "U_TEMP"(%5), %0 \n\t"
1219
"mov "V_TEMP"(%5), %1 \n\t"
1220
YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1)
1221
"psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1222
"psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1223
"packuswb %%mm7, %%mm1 \n\t"
1226
WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1227
"pop %%"REG_BP" \n\t"
1228
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1230
:: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1236
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1237
"mov %4, %%"REG_b" \n\t"
1238
"push %%"REG_BP" \n\t"
1239
YSCALEYUV2RGB(%%REGBP, %5)
1240
"pcmpeqd %%mm7, %%mm7 \n\t"
1241
WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1242
"pop %%"REG_BP" \n\t"
1243
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1245
:: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1252
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1253
"mov %4, %%"REG_b" \n\t"
1254
"push %%"REG_BP" \n\t"
1255
YSCALEYUV2RGB(%%REGBP, %5)
1256
"pxor %%mm7, %%mm7 \n\t"
1257
WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1258
"pop %%"REG_BP" \n\t"
1259
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1260
:: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1264
case PIX_FMT_RGB555:
1266
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1267
"mov %4, %%"REG_b" \n\t"
1268
"push %%"REG_BP" \n\t"
1269
YSCALEYUV2RGB(%%REGBP, %5)
1270
"pxor %%mm7, %%mm7 \n\t"
1271
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1273
"paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1274
"paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1275
"paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1278
WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1279
"pop %%"REG_BP" \n\t"
1280
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1282
:: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1286
case PIX_FMT_RGB565:
1288
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1289
"mov %4, %%"REG_b" \n\t"
1290
"push %%"REG_BP" \n\t"
1291
YSCALEYUV2RGB(%%REGBP, %5)
1292
"pxor %%mm7, %%mm7 \n\t"
1293
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1295
"paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1296
"paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1297
"paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1300
WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1301
"pop %%"REG_BP" \n\t"
1302
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1303
:: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1307
case PIX_FMT_YUYV422:
1309
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1310
"mov %4, %%"REG_b" \n\t"
1311
"push %%"REG_BP" \n\t"
1312
YSCALEYUV2PACKED(%%REGBP, %5)
1313
WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1314
"pop %%"REG_BP" \n\t"
1315
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1316
:: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1323
YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C(void,0), YSCALE_YUV_2_GRAY16_2_C, YSCALE_YUV_2_MONO2_C)
1327
* YV12 to RGB without scaling or interpolating
1329
static inline void RENAME(yuv2packed1)(SwsContext *c, const uint16_t *buf0, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
1330
const uint16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, enum PixelFormat dstFormat, int flags, int y)
1332
const int yalpha1=0;
1335
const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1336
const int yalpha= 4096; //FIXME ...
1338
if (flags&SWS_FULL_CHR_H_INT) {
1339
c->yuv2packed2(c, buf0, buf0, uvbuf0, uvbuf1, abuf0, abuf0, dest, dstW, 0, uvalpha, y);
1343
if(!(flags & SWS_BITEXACT)) {
1344
if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1347
if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1349
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1350
"mov %4, %%"REG_b" \n\t"
1351
"push %%"REG_BP" \n\t"
1352
YSCALEYUV2RGB1(%%REGBP, %5)
1353
YSCALEYUV2RGB1_ALPHA(%%REGBP)
1354
WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1355
"pop %%"REG_BP" \n\t"
1356
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1358
:: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1363
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1364
"mov %4, %%"REG_b" \n\t"
1365
"push %%"REG_BP" \n\t"
1366
YSCALEYUV2RGB1(%%REGBP, %5)
1367
"pcmpeqd %%mm7, %%mm7 \n\t"
1368
WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1369
"pop %%"REG_BP" \n\t"
1370
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1372
:: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1379
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1380
"mov %4, %%"REG_b" \n\t"
1381
"push %%"REG_BP" \n\t"
1382
YSCALEYUV2RGB1(%%REGBP, %5)
1383
"pxor %%mm7, %%mm7 \n\t"
1384
WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1385
"pop %%"REG_BP" \n\t"
1386
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1388
:: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1392
case PIX_FMT_RGB555:
1394
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1395
"mov %4, %%"REG_b" \n\t"
1396
"push %%"REG_BP" \n\t"
1397
YSCALEYUV2RGB1(%%REGBP, %5)
1398
"pxor %%mm7, %%mm7 \n\t"
1399
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1401
"paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1402
"paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1403
"paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1405
WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1406
"pop %%"REG_BP" \n\t"
1407
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1409
:: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1413
case PIX_FMT_RGB565:
1415
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1416
"mov %4, %%"REG_b" \n\t"
1417
"push %%"REG_BP" \n\t"
1418
YSCALEYUV2RGB1(%%REGBP, %5)
1419
"pxor %%mm7, %%mm7 \n\t"
1420
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1422
"paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1423
"paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1424
"paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1427
WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1428
"pop %%"REG_BP" \n\t"
1429
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1431
:: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1435
case PIX_FMT_YUYV422:
1437
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1438
"mov %4, %%"REG_b" \n\t"
1439
"push %%"REG_BP" \n\t"
1440
YSCALEYUV2PACKED1(%%REGBP, %5)
1441
WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1442
"pop %%"REG_BP" \n\t"
1443
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1445
:: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1453
if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1455
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1456
"mov %4, %%"REG_b" \n\t"
1457
"push %%"REG_BP" \n\t"
1458
YSCALEYUV2RGB1b(%%REGBP, %5)
1459
YSCALEYUV2RGB1_ALPHA(%%REGBP)
1460
WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1461
"pop %%"REG_BP" \n\t"
1462
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1464
:: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1469
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1470
"mov %4, %%"REG_b" \n\t"
1471
"push %%"REG_BP" \n\t"
1472
YSCALEYUV2RGB1b(%%REGBP, %5)
1473
"pcmpeqd %%mm7, %%mm7 \n\t"
1474
WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1475
"pop %%"REG_BP" \n\t"
1476
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1478
:: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1485
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1486
"mov %4, %%"REG_b" \n\t"
1487
"push %%"REG_BP" \n\t"
1488
YSCALEYUV2RGB1b(%%REGBP, %5)
1489
"pxor %%mm7, %%mm7 \n\t"
1490
WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1491
"pop %%"REG_BP" \n\t"
1492
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1494
:: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1498
case PIX_FMT_RGB555:
1500
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1501
"mov %4, %%"REG_b" \n\t"
1502
"push %%"REG_BP" \n\t"
1503
YSCALEYUV2RGB1b(%%REGBP, %5)
1504
"pxor %%mm7, %%mm7 \n\t"
1505
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1507
"paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1508
"paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1509
"paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1511
WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1512
"pop %%"REG_BP" \n\t"
1513
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1515
:: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1519
case PIX_FMT_RGB565:
1521
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1522
"mov %4, %%"REG_b" \n\t"
1523
"push %%"REG_BP" \n\t"
1524
YSCALEYUV2RGB1b(%%REGBP, %5)
1525
"pxor %%mm7, %%mm7 \n\t"
1526
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1528
"paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1529
"paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1530
"paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1533
WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1534
"pop %%"REG_BP" \n\t"
1535
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1537
:: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1541
case PIX_FMT_YUYV422:
1543
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1544
"mov %4, %%"REG_b" \n\t"
1545
"push %%"REG_BP" \n\t"
1546
YSCALEYUV2PACKED1b(%%REGBP, %5)
1547
WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1548
"pop %%"REG_BP" \n\t"
1549
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1551
:: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1558
if (uvalpha < 2048) {
1559
YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
1561
YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
1565
//FIXME yuy2* can read up to 7 samples too much
1567
static inline void RENAME(yuy2ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1570
"movq "MANGLE(bm01010101)", %%mm2 \n\t"
1571
"mov %0, %%"REG_a" \n\t"
1573
"movq (%1, %%"REG_a",2), %%mm0 \n\t"
1574
"movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1575
"pand %%mm2, %%mm0 \n\t"
1576
"pand %%mm2, %%mm1 \n\t"
1577
"packuswb %%mm1, %%mm0 \n\t"
1578
"movq %%mm0, (%2, %%"REG_a") \n\t"
1579
"add $8, %%"REG_a" \n\t"
1581
: : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
1586
static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1589
"movq "MANGLE(bm01010101)", %%mm4 \n\t"
1590
"mov %0, %%"REG_a" \n\t"
1592
"movq (%1, %%"REG_a",4), %%mm0 \n\t"
1593
"movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1594
"psrlw $8, %%mm0 \n\t"
1595
"psrlw $8, %%mm1 \n\t"
1596
"packuswb %%mm1, %%mm0 \n\t"
1597
"movq %%mm0, %%mm1 \n\t"
1598
"psrlw $8, %%mm0 \n\t"
1599
"pand %%mm4, %%mm1 \n\t"
1600
"packuswb %%mm0, %%mm0 \n\t"
1601
"packuswb %%mm1, %%mm1 \n\t"
1602
"movd %%mm0, (%3, %%"REG_a") \n\t"
1603
"movd %%mm1, (%2, %%"REG_a") \n\t"
1604
"add $4, %%"REG_a" \n\t"
1606
: : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1609
assert(src1 == src2);
1612
static inline void RENAME(LEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1615
"mov %0, %%"REG_a" \n\t"
1617
"movq (%1, %%"REG_a",2), %%mm0 \n\t"
1618
"movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1619
"movq (%2, %%"REG_a",2), %%mm2 \n\t"
1620
"movq 8(%2, %%"REG_a",2), %%mm3 \n\t"
1621
"psrlw $8, %%mm0 \n\t"
1622
"psrlw $8, %%mm1 \n\t"
1623
"psrlw $8, %%mm2 \n\t"
1624
"psrlw $8, %%mm3 \n\t"
1625
"packuswb %%mm1, %%mm0 \n\t"
1626
"packuswb %%mm3, %%mm2 \n\t"
1627
"movq %%mm0, (%3, %%"REG_a") \n\t"
1628
"movq %%mm2, (%4, %%"REG_a") \n\t"
1629
"add $8, %%"REG_a" \n\t"
1631
: : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
1636
/* This is almost identical to the previous, end exists only because
1637
* yuy2ToY/UV)(dst, src+1, ...) would have 100% unaligned accesses. */
1638
static inline void RENAME(uyvyToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1641
"mov %0, %%"REG_a" \n\t"
1643
"movq (%1, %%"REG_a",2), %%mm0 \n\t"
1644
"movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1645
"psrlw $8, %%mm0 \n\t"
1646
"psrlw $8, %%mm1 \n\t"
1647
"packuswb %%mm1, %%mm0 \n\t"
1648
"movq %%mm0, (%2, %%"REG_a") \n\t"
1649
"add $8, %%"REG_a" \n\t"
1651
: : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
1656
static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1659
"movq "MANGLE(bm01010101)", %%mm4 \n\t"
1660
"mov %0, %%"REG_a" \n\t"
1662
"movq (%1, %%"REG_a",4), %%mm0 \n\t"
1663
"movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1664
"pand %%mm4, %%mm0 \n\t"
1665
"pand %%mm4, %%mm1 \n\t"
1666
"packuswb %%mm1, %%mm0 \n\t"
1667
"movq %%mm0, %%mm1 \n\t"
1668
"psrlw $8, %%mm0 \n\t"
1669
"pand %%mm4, %%mm1 \n\t"
1670
"packuswb %%mm0, %%mm0 \n\t"
1671
"packuswb %%mm1, %%mm1 \n\t"
1672
"movd %%mm0, (%3, %%"REG_a") \n\t"
1673
"movd %%mm1, (%2, %%"REG_a") \n\t"
1674
"add $4, %%"REG_a" \n\t"
1676
: : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1679
assert(src1 == src2);
1682
static inline void RENAME(BEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1685
"movq "MANGLE(bm01010101)", %%mm4 \n\t"
1686
"mov %0, %%"REG_a" \n\t"
1688
"movq (%1, %%"REG_a",2), %%mm0 \n\t"
1689
"movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1690
"movq (%2, %%"REG_a",2), %%mm2 \n\t"
1691
"movq 8(%2, %%"REG_a",2), %%mm3 \n\t"
1692
"pand %%mm4, %%mm0 \n\t"
1693
"pand %%mm4, %%mm1 \n\t"
1694
"pand %%mm4, %%mm2 \n\t"
1695
"pand %%mm4, %%mm3 \n\t"
1696
"packuswb %%mm1, %%mm0 \n\t"
1697
"packuswb %%mm3, %%mm2 \n\t"
1698
"movq %%mm0, (%3, %%"REG_a") \n\t"
1699
"movq %%mm2, (%4, %%"REG_a") \n\t"
1700
"add $8, %%"REG_a" \n\t"
1702
: : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
1707
static inline void RENAME(nvXXtoUV)(uint8_t *dst1, uint8_t *dst2,
1708
const uint8_t *src, long width)
1711
"movq "MANGLE(bm01010101)", %%mm4 \n\t"
1712
"mov %0, %%"REG_a" \n\t"
1714
"movq (%1, %%"REG_a",2), %%mm0 \n\t"
1715
"movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1716
"movq %%mm0, %%mm2 \n\t"
1717
"movq %%mm1, %%mm3 \n\t"
1718
"pand %%mm4, %%mm0 \n\t"
1719
"pand %%mm4, %%mm1 \n\t"
1720
"psrlw $8, %%mm2 \n\t"
1721
"psrlw $8, %%mm3 \n\t"
1722
"packuswb %%mm1, %%mm0 \n\t"
1723
"packuswb %%mm3, %%mm2 \n\t"
1724
"movq %%mm0, (%2, %%"REG_a") \n\t"
1725
"movq %%mm2, (%3, %%"REG_a") \n\t"
1726
"add $8, %%"REG_a" \n\t"
1728
: : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst1+width), "r" (dst2+width)
1733
static inline void RENAME(nv12ToUV)(uint8_t *dstU, uint8_t *dstV,
1734
const uint8_t *src1, const uint8_t *src2,
1735
long width, uint32_t *unused)
1737
RENAME(nvXXtoUV)(dstU, dstV, src1, width);
1740
static inline void RENAME(nv21ToUV)(uint8_t *dstU, uint8_t *dstV,
1741
const uint8_t *src1, const uint8_t *src2,
1742
long width, uint32_t *unused)
1744
RENAME(nvXXtoUV)(dstV, dstU, src1, width);
1747
static inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, const uint8_t *src, long width, enum PixelFormat srcFormat)
1750
if(srcFormat == PIX_FMT_BGR24) {
1752
"movq "MANGLE(ff_bgr24toY1Coeff)", %%mm5 \n\t"
1753
"movq "MANGLE(ff_bgr24toY2Coeff)", %%mm6 \n\t"
1758
"movq "MANGLE(ff_rgb24toY1Coeff)", %%mm5 \n\t"
1759
"movq "MANGLE(ff_rgb24toY2Coeff)", %%mm6 \n\t"
1765
"movq "MANGLE(ff_bgr24toYOffset)", %%mm4 \n\t"
1766
"mov %2, %%"REG_a" \n\t"
1767
"pxor %%mm7, %%mm7 \n\t"
1769
PREFETCH" 64(%0) \n\t"
1770
"movd (%0), %%mm0 \n\t"
1771
"movd 2(%0), %%mm1 \n\t"
1772
"movd 6(%0), %%mm2 \n\t"
1773
"movd 8(%0), %%mm3 \n\t"
1775
"punpcklbw %%mm7, %%mm0 \n\t"
1776
"punpcklbw %%mm7, %%mm1 \n\t"
1777
"punpcklbw %%mm7, %%mm2 \n\t"
1778
"punpcklbw %%mm7, %%mm3 \n\t"
1779
"pmaddwd %%mm5, %%mm0 \n\t"
1780
"pmaddwd %%mm6, %%mm1 \n\t"
1781
"pmaddwd %%mm5, %%mm2 \n\t"
1782
"pmaddwd %%mm6, %%mm3 \n\t"
1783
"paddd %%mm1, %%mm0 \n\t"
1784
"paddd %%mm3, %%mm2 \n\t"
1785
"paddd %%mm4, %%mm0 \n\t"
1786
"paddd %%mm4, %%mm2 \n\t"
1787
"psrad $15, %%mm0 \n\t"
1788
"psrad $15, %%mm2 \n\t"
1789
"packssdw %%mm2, %%mm0 \n\t"
1790
"packuswb %%mm0, %%mm0 \n\t"
1791
"movd %%mm0, (%1, %%"REG_a") \n\t"
1792
"add $4, %%"REG_a" \n\t"
1795
: "r" (dst+width), "g" ((x86_reg)-width)
1800
static inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src, long width, enum PixelFormat srcFormat)
1803
"movq 24(%4), %%mm6 \n\t"
1804
"mov %3, %%"REG_a" \n\t"
1805
"pxor %%mm7, %%mm7 \n\t"
1807
PREFETCH" 64(%0) \n\t"
1808
"movd (%0), %%mm0 \n\t"
1809
"movd 2(%0), %%mm1 \n\t"
1810
"punpcklbw %%mm7, %%mm0 \n\t"
1811
"punpcklbw %%mm7, %%mm1 \n\t"
1812
"movq %%mm0, %%mm2 \n\t"
1813
"movq %%mm1, %%mm3 \n\t"
1814
"pmaddwd (%4), %%mm0 \n\t"
1815
"pmaddwd 8(%4), %%mm1 \n\t"
1816
"pmaddwd 16(%4), %%mm2 \n\t"
1817
"pmaddwd %%mm6, %%mm3 \n\t"
1818
"paddd %%mm1, %%mm0 \n\t"
1819
"paddd %%mm3, %%mm2 \n\t"
1821
"movd 6(%0), %%mm1 \n\t"
1822
"movd 8(%0), %%mm3 \n\t"
1824
"punpcklbw %%mm7, %%mm1 \n\t"
1825
"punpcklbw %%mm7, %%mm3 \n\t"
1826
"movq %%mm1, %%mm4 \n\t"
1827
"movq %%mm3, %%mm5 \n\t"
1828
"pmaddwd (%4), %%mm1 \n\t"
1829
"pmaddwd 8(%4), %%mm3 \n\t"
1830
"pmaddwd 16(%4), %%mm4 \n\t"
1831
"pmaddwd %%mm6, %%mm5 \n\t"
1832
"paddd %%mm3, %%mm1 \n\t"
1833
"paddd %%mm5, %%mm4 \n\t"
1835
"movq "MANGLE(ff_bgr24toUVOffset)", %%mm3 \n\t"
1836
"paddd %%mm3, %%mm0 \n\t"
1837
"paddd %%mm3, %%mm2 \n\t"
1838
"paddd %%mm3, %%mm1 \n\t"
1839
"paddd %%mm3, %%mm4 \n\t"
1840
"psrad $15, %%mm0 \n\t"
1841
"psrad $15, %%mm2 \n\t"
1842
"psrad $15, %%mm1 \n\t"
1843
"psrad $15, %%mm4 \n\t"
1844
"packssdw %%mm1, %%mm0 \n\t"
1845
"packssdw %%mm4, %%mm2 \n\t"
1846
"packuswb %%mm0, %%mm0 \n\t"
1847
"packuswb %%mm2, %%mm2 \n\t"
1848
"movd %%mm0, (%1, %%"REG_a") \n\t"
1849
"movd %%mm2, (%2, %%"REG_a") \n\t"
1850
"add $4, %%"REG_a" \n\t"
1853
: "r" (dstU+width), "r" (dstV+width), "g" ((x86_reg)-width), "r"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24])
1858
static inline void RENAME(bgr24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1860
RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24);
1863
static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1865
RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24);
1866
assert(src1 == src2);
1869
static inline void RENAME(bgr24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1872
for (i=0; i<width; i++) {
1873
int b= src1[6*i + 0] + src1[6*i + 3];
1874
int g= src1[6*i + 1] + src1[6*i + 4];
1875
int r= src1[6*i + 2] + src1[6*i + 5];
1877
dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1878
dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1880
assert(src1 == src2);
1883
static inline void RENAME(rgb24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1885
RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
1888
static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1891
RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
1894
static inline void RENAME(rgb24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1898
for (i=0; i<width; i++) {
1899
int r= src1[6*i + 0] + src1[6*i + 3];
1900
int g= src1[6*i + 1] + src1[6*i + 4];
1901
int b= src1[6*i + 2] + src1[6*i + 5];
1903
dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1904
dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1909
// bilinear / bicubic scaling
1910
static inline void RENAME(hScale)(int16_t *dst, int dstW, const uint8_t *src, int srcW, int xInc,
1911
const int16_t *filter, const int16_t *filterPos, long filterSize)
1913
assert(filterSize % 4 == 0 && filterSize>0);
1914
if (filterSize==4) { // Always true for upscaling, sometimes for down, too.
1915
x86_reg counter= -2*dstW;
1917
filterPos-= counter/2;
1921
"push %%"REG_b" \n\t"
1923
"pxor %%mm7, %%mm7 \n\t"
1924
"push %%"REG_BP" \n\t" // we use 7 regs here ...
1925
"mov %%"REG_a", %%"REG_BP" \n\t"
1928
"movzwl (%2, %%"REG_BP"), %%eax \n\t"
1929
"movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
1930
"movq (%1, %%"REG_BP", 4), %%mm1 \n\t"
1931
"movq 8(%1, %%"REG_BP", 4), %%mm3 \n\t"
1932
"movd (%3, %%"REG_a"), %%mm0 \n\t"
1933
"movd (%3, %%"REG_b"), %%mm2 \n\t"
1934
"punpcklbw %%mm7, %%mm0 \n\t"
1935
"punpcklbw %%mm7, %%mm2 \n\t"
1936
"pmaddwd %%mm1, %%mm0 \n\t"
1937
"pmaddwd %%mm2, %%mm3 \n\t"
1938
"movq %%mm0, %%mm4 \n\t"
1939
"punpckldq %%mm3, %%mm0 \n\t"
1940
"punpckhdq %%mm3, %%mm4 \n\t"
1941
"paddd %%mm4, %%mm0 \n\t"
1942
"psrad $7, %%mm0 \n\t"
1943
"packssdw %%mm0, %%mm0 \n\t"
1944
"movd %%mm0, (%4, %%"REG_BP") \n\t"
1945
"add $4, %%"REG_BP" \n\t"
1948
"pop %%"REG_BP" \n\t"
1950
"pop %%"REG_b" \n\t"
1953
: "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
1958
} else if (filterSize==8) {
1959
x86_reg counter= -2*dstW;
1961
filterPos-= counter/2;
1965
"push %%"REG_b" \n\t"
1967
"pxor %%mm7, %%mm7 \n\t"
1968
"push %%"REG_BP" \n\t" // we use 7 regs here ...
1969
"mov %%"REG_a", %%"REG_BP" \n\t"
1972
"movzwl (%2, %%"REG_BP"), %%eax \n\t"
1973
"movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
1974
"movq (%1, %%"REG_BP", 8), %%mm1 \n\t"
1975
"movq 16(%1, %%"REG_BP", 8), %%mm3 \n\t"
1976
"movd (%3, %%"REG_a"), %%mm0 \n\t"
1977
"movd (%3, %%"REG_b"), %%mm2 \n\t"
1978
"punpcklbw %%mm7, %%mm0 \n\t"
1979
"punpcklbw %%mm7, %%mm2 \n\t"
1980
"pmaddwd %%mm1, %%mm0 \n\t"
1981
"pmaddwd %%mm2, %%mm3 \n\t"
1983
"movq 8(%1, %%"REG_BP", 8), %%mm1 \n\t"
1984
"movq 24(%1, %%"REG_BP", 8), %%mm5 \n\t"
1985
"movd 4(%3, %%"REG_a"), %%mm4 \n\t"
1986
"movd 4(%3, %%"REG_b"), %%mm2 \n\t"
1987
"punpcklbw %%mm7, %%mm4 \n\t"
1988
"punpcklbw %%mm7, %%mm2 \n\t"
1989
"pmaddwd %%mm1, %%mm4 \n\t"
1990
"pmaddwd %%mm2, %%mm5 \n\t"
1991
"paddd %%mm4, %%mm0 \n\t"
1992
"paddd %%mm5, %%mm3 \n\t"
1993
"movq %%mm0, %%mm4 \n\t"
1994
"punpckldq %%mm3, %%mm0 \n\t"
1995
"punpckhdq %%mm3, %%mm4 \n\t"
1996
"paddd %%mm4, %%mm0 \n\t"
1997
"psrad $7, %%mm0 \n\t"
1998
"packssdw %%mm0, %%mm0 \n\t"
1999
"movd %%mm0, (%4, %%"REG_BP") \n\t"
2000
"add $4, %%"REG_BP" \n\t"
2003
"pop %%"REG_BP" \n\t"
2005
"pop %%"REG_b" \n\t"
2008
: "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2014
const uint8_t *offset = src+filterSize;
2015
x86_reg counter= -2*dstW;
2016
//filter-= counter*filterSize/2;
2017
filterPos-= counter/2;
2020
"pxor %%mm7, %%mm7 \n\t"
2023
"mov %2, %%"REG_c" \n\t"
2024
"movzwl (%%"REG_c", %0), %%eax \n\t"
2025
"movzwl 2(%%"REG_c", %0), %%edx \n\t"
2026
"mov %5, %%"REG_c" \n\t"
2027
"pxor %%mm4, %%mm4 \n\t"
2028
"pxor %%mm5, %%mm5 \n\t"
2030
"movq (%1), %%mm1 \n\t"
2031
"movq (%1, %6), %%mm3 \n\t"
2032
"movd (%%"REG_c", %%"REG_a"), %%mm0 \n\t"
2033
"movd (%%"REG_c", %%"REG_d"), %%mm2 \n\t"
2034
"punpcklbw %%mm7, %%mm0 \n\t"
2035
"punpcklbw %%mm7, %%mm2 \n\t"
2036
"pmaddwd %%mm1, %%mm0 \n\t"
2037
"pmaddwd %%mm2, %%mm3 \n\t"
2038
"paddd %%mm3, %%mm5 \n\t"
2039
"paddd %%mm0, %%mm4 \n\t"
2041
"add $4, %%"REG_c" \n\t"
2042
"cmp %4, %%"REG_c" \n\t"
2045
"movq %%mm4, %%mm0 \n\t"
2046
"punpckldq %%mm5, %%mm4 \n\t"
2047
"punpckhdq %%mm5, %%mm0 \n\t"
2048
"paddd %%mm0, %%mm4 \n\t"
2049
"psrad $7, %%mm4 \n\t"
2050
"packssdw %%mm4, %%mm4 \n\t"
2051
"mov %3, %%"REG_a" \n\t"
2052
"movd %%mm4, (%%"REG_a", %0) \n\t"
2056
: "+r" (counter), "+r" (filter)
2057
: "m" (filterPos), "m" (dst), "m"(offset),
2058
"m" (src), "r" ((x86_reg)filterSize*2)
2059
: "%"REG_a, "%"REG_c, "%"REG_d
2064
//FIXME all pal and rgb srcFormats could do this convertion as well
2065
//FIXME all scalers more complex than bilinear could do half of this transform
2066
static void RENAME(chrRangeToJpeg)(uint16_t *dst, int width)
2069
for (i = 0; i < width; i++) {
2070
dst[i ] = (FFMIN(dst[i ],30775)*4663 - 9289992)>>12; //-264
2071
dst[i+VOFW] = (FFMIN(dst[i+VOFW],30775)*4663 - 9289992)>>12; //-264
2074
static void RENAME(chrRangeFromJpeg)(uint16_t *dst, int width)
2077
for (i = 0; i < width; i++) {
2078
dst[i ] = (dst[i ]*1799 + 4081085)>>11; //1469
2079
dst[i+VOFW] = (dst[i+VOFW]*1799 + 4081085)>>11; //1469
2082
static void RENAME(lumRangeToJpeg)(uint16_t *dst, int width)
2085
for (i = 0; i < width; i++)
2086
dst[i] = (FFMIN(dst[i],30189)*19077 - 39057361)>>14;
2088
static void RENAME(lumRangeFromJpeg)(uint16_t *dst, int width)
2091
for (i = 0; i < width; i++)
2092
dst[i] = (dst[i]*14071 + 33561947)>>14;
2095
#define FAST_BILINEAR_X86 \
2096
"subl %%edi, %%esi \n\t" /* src[xx+1] - src[xx] */ \
2097
"imull %%ecx, %%esi \n\t" /* (src[xx+1] - src[xx])*xalpha */ \
2098
"shll $16, %%edi \n\t" \
2099
"addl %%edi, %%esi \n\t" /* src[xx+1]*xalpha + src[xx]*(1-xalpha) */ \
2100
"mov %1, %%"REG_D"\n\t" \
2101
"shrl $9, %%esi \n\t" \
2103
static inline void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
2104
long dstWidth, const uint8_t *src, int srcW,
2107
#if COMPILE_TEMPLATE_MMX2
2108
int32_t *filterPos = c->hLumFilterPos;
2109
int16_t *filter = c->hLumFilter;
2110
int canMMX2BeUsed = c->canMMX2BeUsed;
2111
void *mmx2FilterCode= c->lumMmx2FilterCode;
2114
DECLARE_ALIGNED(8, uint64_t, ebxsave);
2116
if (canMMX2BeUsed) {
2119
"mov %%"REG_b", %5 \n\t"
2121
"pxor %%mm7, %%mm7 \n\t"
2122
"mov %0, %%"REG_c" \n\t"
2123
"mov %1, %%"REG_D" \n\t"
2124
"mov %2, %%"REG_d" \n\t"
2125
"mov %3, %%"REG_b" \n\t"
2126
"xor %%"REG_a", %%"REG_a" \n\t" // i
2127
PREFETCH" (%%"REG_c") \n\t"
2128
PREFETCH" 32(%%"REG_c") \n\t"
2129
PREFETCH" 64(%%"REG_c") \n\t"
2133
#define CALL_MMX2_FILTER_CODE \
2134
"movl (%%"REG_b"), %%esi \n\t"\
2136
"movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
2137
"add %%"REG_S", %%"REG_c" \n\t"\
2138
"add %%"REG_a", %%"REG_D" \n\t"\
2139
"xor %%"REG_a", %%"REG_a" \n\t"\
2143
#define CALL_MMX2_FILTER_CODE \
2144
"movl (%%"REG_b"), %%esi \n\t"\
2146
"addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2147
"add %%"REG_a", %%"REG_D" \n\t"\
2148
"xor %%"REG_a", %%"REG_a" \n\t"\
2150
#endif /* ARCH_X86_64 */
2152
CALL_MMX2_FILTER_CODE
2153
CALL_MMX2_FILTER_CODE
2154
CALL_MMX2_FILTER_CODE
2155
CALL_MMX2_FILTER_CODE
2156
CALL_MMX2_FILTER_CODE
2157
CALL_MMX2_FILTER_CODE
2158
CALL_MMX2_FILTER_CODE
2159
CALL_MMX2_FILTER_CODE
2162
"mov %5, %%"REG_b" \n\t"
2164
:: "m" (src), "m" (dst), "m" (filter), "m" (filterPos),
2165
"m" (mmx2FilterCode)
2169
: "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
2174
for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
2176
#endif /* COMPILE_TEMPLATE_MMX2 */
2177
x86_reg xInc_shr16 = xInc >> 16;
2178
uint16_t xInc_mask = xInc & 0xffff;
2179
x86_reg dstWidth_reg = dstWidth;
2180
//NO MMX just normal asm ...
2182
"xor %%"REG_a", %%"REG_a" \n\t" // i
2183
"xor %%"REG_d", %%"REG_d" \n\t" // xx
2184
"xorl %%ecx, %%ecx \n\t" // xalpha
2187
"movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
2188
"movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
2190
"movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
2191
"addw %4, %%cx \n\t" //xalpha += xInc&0xFFFF
2192
"adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry
2194
"movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
2195
"movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
2197
"movw %%si, 2(%%"REG_D", %%"REG_a", 2) \n\t"
2198
"addw %4, %%cx \n\t" //xalpha += xInc&0xFFFF
2199
"adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry
2202
"add $2, %%"REG_a" \n\t"
2203
"cmp %2, %%"REG_a" \n\t"
2207
:: "r" (src), "m" (dst), "m" (dstWidth_reg), "m" (xInc_shr16), "m" (xInc_mask)
2208
: "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2210
#if COMPILE_TEMPLATE_MMX2
2211
} //if MMX2 can't be used
2215
// *** horizontal scale Y line to temp buffer
2216
static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src, int srcW, int xInc,
                                   const int16_t *hLumFilter,
                                   const int16_t *hLumFilterPos, int hLumFilterSize,
                                   uint8_t *formatConvBuffer,
                                   uint32_t *pal, int isAlpha)
{
    /* Horizontally scale one luma (or alpha, when isAlpha) line into dst.
     * Pipeline: optional input->YV12 conversion, then either the generic
     * hScale filter or the fast bilinear path, then optional range conversion. */
    void (*toYV12)(uint8_t *, const uint8_t *, long, uint32_t *) = isAlpha ? c->alpToYV12 : c->lumToYV12;
    void (*convertRange)(uint16_t *, int) = isAlpha ? NULL : c->lumConvertRange;

    src += isAlpha ? c->alpSrcOffset : c->lumSrcOffset;

    if (toYV12) {
        /* Convert the source line into formatConvBuffer and scale from there. */
        toYV12(formatConvBuffer, src, srcW, pal);
        src = formatConvBuffer;
    }

    if (!c->hyscale_fast) {
        c->hScale(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
    } else { // fast bilinear upscale / crap downscale
        c->hyscale_fast(c, dst, dstWidth, src, srcW, xInc);
    }

    if (convertRange)
        convertRange(dst, dstWidth);
}
2242
static inline void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst,
2243
long dstWidth, const uint8_t *src1,
2244
const uint8_t *src2, int srcW, int xInc)
2246
#if COMPILE_TEMPLATE_MMX2
2247
int32_t *filterPos = c->hChrFilterPos;
2248
int16_t *filter = c->hChrFilter;
2249
int canMMX2BeUsed = c->canMMX2BeUsed;
2250
void *mmx2FilterCode= c->chrMmx2FilterCode;
2253
DECLARE_ALIGNED(8, uint64_t, ebxsave);
2255
if (canMMX2BeUsed) {
2258
"mov %%"REG_b", %6 \n\t"
2260
"pxor %%mm7, %%mm7 \n\t"
2261
"mov %0, %%"REG_c" \n\t"
2262
"mov %1, %%"REG_D" \n\t"
2263
"mov %2, %%"REG_d" \n\t"
2264
"mov %3, %%"REG_b" \n\t"
2265
"xor %%"REG_a", %%"REG_a" \n\t" // i
2266
PREFETCH" (%%"REG_c") \n\t"
2267
PREFETCH" 32(%%"REG_c") \n\t"
2268
PREFETCH" 64(%%"REG_c") \n\t"
2270
CALL_MMX2_FILTER_CODE
2271
CALL_MMX2_FILTER_CODE
2272
CALL_MMX2_FILTER_CODE
2273
CALL_MMX2_FILTER_CODE
2274
"xor %%"REG_a", %%"REG_a" \n\t" // i
2275
"mov %5, %%"REG_c" \n\t" // src
2276
"mov %1, %%"REG_D" \n\t" // buf1
2277
"add $"AV_STRINGIFY(VOF)", %%"REG_D" \n\t"
2278
PREFETCH" (%%"REG_c") \n\t"
2279
PREFETCH" 32(%%"REG_c") \n\t"
2280
PREFETCH" 64(%%"REG_c") \n\t"
2282
CALL_MMX2_FILTER_CODE
2283
CALL_MMX2_FILTER_CODE
2284
CALL_MMX2_FILTER_CODE
2285
CALL_MMX2_FILTER_CODE
2288
"mov %6, %%"REG_b" \n\t"
2290
:: "m" (src1), "m" (dst), "m" (filter), "m" (filterPos),
2291
"m" (mmx2FilterCode), "m" (src2)
2295
: "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
2300
for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
2301
//printf("%d %d %d\n", dstWidth, i, srcW);
2302
dst[i] = src1[srcW-1]*128;
2303
dst[i+VOFW] = src2[srcW-1]*128;
2306
#endif /* COMPILE_TEMPLATE_MMX2 */
2307
x86_reg xInc_shr16 = (x86_reg) (xInc >> 16);
2308
uint16_t xInc_mask = xInc & 0xffff;
2309
x86_reg dstWidth_reg = dstWidth;
2311
"xor %%"REG_a", %%"REG_a" \n\t" // i
2312
"xor %%"REG_d", %%"REG_d" \n\t" // xx
2313
"xorl %%ecx, %%ecx \n\t" // xalpha
2316
"mov %0, %%"REG_S" \n\t"
2317
"movzbl (%%"REG_S", %%"REG_d"), %%edi \n\t" //src[xx]
2318
"movzbl 1(%%"REG_S", %%"REG_d"), %%esi \n\t" //src[xx+1]
2320
"movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
2322
"movzbl (%5, %%"REG_d"), %%edi \n\t" //src[xx]
2323
"movzbl 1(%5, %%"REG_d"), %%esi \n\t" //src[xx+1]
2325
"movw %%si, "AV_STRINGIFY(VOF)"(%%"REG_D", %%"REG_a", 2) \n\t"
2327
"addw %4, %%cx \n\t" //xalpha += xInc&0xFFFF
2328
"adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry
2329
"add $1, %%"REG_a" \n\t"
2330
"cmp %2, %%"REG_a" \n\t"
2333
/* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
2334
which is needed to support GCC 4.0. */
2335
#if ARCH_X86_64 && AV_GCC_VERSION_AT_LEAST(3,4)
2336
:: "m" (src1), "m" (dst), "g" (dstWidth_reg), "m" (xInc_shr16), "m" (xInc_mask),
2338
:: "m" (src1), "m" (dst), "m" (dstWidth_reg), "m" (xInc_shr16), "m" (xInc_mask),
2341
: "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2343
#if COMPILE_TEMPLATE_MMX2
2344
} //if MMX2 can't be used
2348
inline static void RENAME(hcscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src1, const uint8_t *src2,
                                   int srcW, int xInc, const int16_t *hChrFilter,
                                   const int16_t *hChrFilterPos, int hChrFilterSize,
                                   uint8_t *formatConvBuffer,
                                   uint32_t *pal)
{
    /* Horizontally scale one pair of chroma lines into dst; the second
     * chroma plane is written VOFW elements past the first. Pipeline:
     * optional input->YV12 conversion, then generic or fast-bilinear
     * scaling, then optional range conversion. */
    src1 += c->chrSrcOffset;
    src2 += c->chrSrcOffset;

    if (c->chrToYV12) {
        /* Convert both chroma lines into formatConvBuffer and scale from there. */
        c->chrToYV12(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
        src1 = formatConvBuffer;
        src2 = formatConvBuffer+VOFW;
    }

    if (!c->hcscale_fast) {
        c->hScale(dst     , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
        c->hScale(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
    } else { // fast bilinear upscale / crap downscale
        c->hcscale_fast(c, dst, dstWidth, src1, src2, srcW, xInc);
    }

    if (c->chrConvertRange)
        c->chrConvertRange(dst, dstWidth);
}
2375
#define DEBUG_SWSCALE_BUFFERS 0
2376
#define DEBUG_BUFFERS(...) if (DEBUG_SWSCALE_BUFFERS) av_log(c, AV_LOG_DEBUG, __VA_ARGS__)
2378
static int RENAME(swScale)(SwsContext *c, const uint8_t* src[], int srcStride[], int srcSliceY,
2379
int srcSliceH, uint8_t* dst[], int dstStride[])
2381
/* load a few things into local vars to make the code more readable? and faster */
2382
const int srcW= c->srcW;
2383
const int dstW= c->dstW;
2384
const int dstH= c->dstH;
2385
const int chrDstW= c->chrDstW;
2386
const int chrSrcW= c->chrSrcW;
2387
const int lumXInc= c->lumXInc;
2388
const int chrXInc= c->chrXInc;
2389
const enum PixelFormat dstFormat= c->dstFormat;
2390
const int flags= c->flags;
2391
int16_t *vLumFilterPos= c->vLumFilterPos;
2392
int16_t *vChrFilterPos= c->vChrFilterPos;
2393
int16_t *hLumFilterPos= c->hLumFilterPos;
2394
int16_t *hChrFilterPos= c->hChrFilterPos;
2395
int16_t *vLumFilter= c->vLumFilter;
2396
int16_t *vChrFilter= c->vChrFilter;
2397
int16_t *hLumFilter= c->hLumFilter;
2398
int16_t *hChrFilter= c->hChrFilter;
2399
int32_t *lumMmxFilter= c->lumMmxFilter;
2400
int32_t *chrMmxFilter= c->chrMmxFilter;
2401
int32_t av_unused *alpMmxFilter= c->alpMmxFilter;
2402
const int vLumFilterSize= c->vLumFilterSize;
2403
const int vChrFilterSize= c->vChrFilterSize;
2404
const int hLumFilterSize= c->hLumFilterSize;
2405
const int hChrFilterSize= c->hChrFilterSize;
2406
int16_t **lumPixBuf= c->lumPixBuf;
2407
int16_t **chrPixBuf= c->chrPixBuf;
2408
int16_t **alpPixBuf= c->alpPixBuf;
2409
const int vLumBufSize= c->vLumBufSize;
2410
const int vChrBufSize= c->vChrBufSize;
2411
uint8_t *formatConvBuffer= c->formatConvBuffer;
2412
const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
2413
const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
2415
uint32_t *pal=c->pal_yuv;
2417
/* vars which will change and which we need to store back in the context */
2419
int lumBufIndex= c->lumBufIndex;
2420
int chrBufIndex= c->chrBufIndex;
2421
int lastInLumBuf= c->lastInLumBuf;
2422
int lastInChrBuf= c->lastInChrBuf;
2424
if (isPacked(c->srcFormat)) {
2432
srcStride[3]= srcStride[0];
2434
srcStride[1]<<= c->vChrDrop;
2435
srcStride[2]<<= c->vChrDrop;
2437
DEBUG_BUFFERS("swScale() %p[%d] %p[%d] %p[%d] %p[%d] -> %p[%d] %p[%d] %p[%d] %p[%d]\n",
2438
src[0], srcStride[0], src[1], srcStride[1], src[2], srcStride[2], src[3], srcStride[3],
2439
dst[0], dstStride[0], dst[1], dstStride[1], dst[2], dstStride[2], dst[3], dstStride[3]);
2440
DEBUG_BUFFERS("srcSliceY: %d srcSliceH: %d dstY: %d dstH: %d\n",
2441
srcSliceY, srcSliceH, dstY, dstH);
2442
DEBUG_BUFFERS("vLumFilterSize: %d vLumBufSize: %d vChrFilterSize: %d vChrBufSize: %d\n",
2443
vLumFilterSize, vLumBufSize, vChrFilterSize, vChrBufSize);
2445
if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0 || dstStride[3]%8 != 0) {
2446
static int warnedAlready=0; //FIXME move this into the context perhaps
2447
if (flags & SWS_PRINT_INFO && !warnedAlready) {
2448
av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
2449
" ->cannot do aligned memory accesses anymore\n");
2454
/* Note the user might start scaling the picture in the middle so this
2455
will not get executed. This is not really intended but works
2456
currently, so people might do it. */
2457
if (srcSliceY ==0) {
2467
for (;dstY < dstH; dstY++) {
2468
unsigned char *dest =dst[0]+dstStride[0]*dstY;
2469
const int chrDstY= dstY>>c->chrDstVSubSample;
2470
unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
2471
unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
2472
unsigned char *aDest=(CONFIG_SWSCALE_ALPHA && alpPixBuf) ? dst[3]+dstStride[3]*dstY : NULL;
2474
const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
2475
const int firstLumSrcY2= vLumFilterPos[FFMIN(dstY | ((1<<c->chrDstVSubSample) - 1), dstH-1)];
2476
const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
2477
int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
2478
int lastLumSrcY2=firstLumSrcY2+ vLumFilterSize -1; // Last line needed as input
2479
int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
2482
//handle holes (FAST_BILINEAR & weird filters)
2483
if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
2484
if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
2485
assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
2486
assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);
2488
DEBUG_BUFFERS("dstY: %d\n", dstY);
2489
DEBUG_BUFFERS("\tfirstLumSrcY: %d lastLumSrcY: %d lastInLumBuf: %d\n",
2490
firstLumSrcY, lastLumSrcY, lastInLumBuf);
2491
DEBUG_BUFFERS("\tfirstChrSrcY: %d lastChrSrcY: %d lastInChrBuf: %d\n",
2492
firstChrSrcY, lastChrSrcY, lastInChrBuf);
2494
// Do we have enough lines in this slice to output the dstY line
2495
enough_lines = lastLumSrcY2 < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample);
2497
if (!enough_lines) {
2498
lastLumSrcY = srcSliceY + srcSliceH - 1;
2499
lastChrSrcY = chrSrcSliceY + chrSrcSliceH - 1;
2500
DEBUG_BUFFERS("buffering slice: lastLumSrcY %d lastChrSrcY %d\n",
2501
lastLumSrcY, lastChrSrcY);
2504
//Do horizontal scaling
2505
while(lastInLumBuf < lastLumSrcY) {
2506
const uint8_t *src1= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2507
const uint8_t *src2= src[3]+(lastInLumBuf + 1 - srcSliceY)*srcStride[3];
2509
assert(lumBufIndex < 2*vLumBufSize);
2510
assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
2511
assert(lastInLumBuf + 1 - srcSliceY >= 0);
2512
RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, src1, srcW, lumXInc,
2513
hLumFilter, hLumFilterPos, hLumFilterSize,
2516
if (CONFIG_SWSCALE_ALPHA && alpPixBuf)
2517
RENAME(hyscale)(c, alpPixBuf[ lumBufIndex ], dstW, src2, srcW, lumXInc,
2518
hLumFilter, hLumFilterPos, hLumFilterSize,
2522
DEBUG_BUFFERS("\t\tlumBufIndex %d: lastInLumBuf: %d\n",
2523
lumBufIndex, lastInLumBuf);
2525
while(lastInChrBuf < lastChrSrcY) {
2526
const uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2527
const uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
2529
assert(chrBufIndex < 2*vChrBufSize);
2530
assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
2531
assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
2532
//FIXME replace parameters through context struct (some at least)
2534
if (c->needs_hcscale)
2535
RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
2536
hChrFilter, hChrFilterPos, hChrFilterSize,
2540
DEBUG_BUFFERS("\t\tchrBufIndex %d: lastInChrBuf: %d\n",
2541
chrBufIndex, lastInChrBuf);
2543
//wrap buf index around to stay inside the ring buffer
2544
if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
2545
if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
2547
break; //we can't output a dstY line so let's try with the next slice
2549
c->blueDither= ff_dither8[dstY&1];
2550
if (c->dstFormat == PIX_FMT_RGB555 || c->dstFormat == PIX_FMT_BGR555)
2551
c->greenDither= ff_dither8[dstY&1];
2553
c->greenDither= ff_dither4[dstY&1];
2554
c->redDither= ff_dither8[(dstY+1)&1];
2555
if (dstY < dstH-2) {
2556
const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2557
const int16_t **chrSrcPtr= (const int16_t **) chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2558
const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
2560
if (flags & SWS_ACCURATE_RND) {
2561
int s= APCK_SIZE / 8;
2562
for (i=0; i<vLumFilterSize; i+=2) {
2563
*(const void**)&lumMmxFilter[s*i ]= lumSrcPtr[i ];
2564
*(const void**)&lumMmxFilter[s*i+APCK_PTR2/4 ]= lumSrcPtr[i+(vLumFilterSize>1)];
2565
lumMmxFilter[s*i+APCK_COEF/4 ]=
2566
lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i ]
2567
+ (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
2568
if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
2569
*(const void**)&alpMmxFilter[s*i ]= alpSrcPtr[i ];
2570
*(const void**)&alpMmxFilter[s*i+APCK_PTR2/4 ]= alpSrcPtr[i+(vLumFilterSize>1)];
2571
alpMmxFilter[s*i+APCK_COEF/4 ]=
2572
alpMmxFilter[s*i+APCK_COEF/4+1]= lumMmxFilter[s*i+APCK_COEF/4 ];
2575
for (i=0; i<vChrFilterSize; i+=2) {
2576
*(const void**)&chrMmxFilter[s*i ]= chrSrcPtr[i ];
2577
*(const void**)&chrMmxFilter[s*i+APCK_PTR2/4 ]= chrSrcPtr[i+(vChrFilterSize>1)];
2578
chrMmxFilter[s*i+APCK_COEF/4 ]=
2579
chrMmxFilter[s*i+APCK_COEF/4+1]= vChrFilter[chrDstY*vChrFilterSize + i ]
2580
+ (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
2583
for (i=0; i<vLumFilterSize; i++) {
2584
lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
2585
lumMmxFilter[4*i+1]= (uint64_t)lumSrcPtr[i] >> 32;
2586
lumMmxFilter[4*i+2]=
2587
lumMmxFilter[4*i+3]=
2588
((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
2589
if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
2590
alpMmxFilter[4*i+0]= (int32_t)alpSrcPtr[i];
2591
alpMmxFilter[4*i+1]= (uint64_t)alpSrcPtr[i] >> 32;
2592
alpMmxFilter[4*i+2]=
2593
alpMmxFilter[4*i+3]= lumMmxFilter[4*i+2];
2596
for (i=0; i<vChrFilterSize; i++) {
2597
chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
2598
chrMmxFilter[4*i+1]= (uint64_t)chrSrcPtr[i] >> 32;
2599
chrMmxFilter[4*i+2]=
2600
chrMmxFilter[4*i+3]=
2601
((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
2604
if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
2605
const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2606
if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
2608
vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2609
vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2610
dest, uDest, dstW, chrDstW, dstFormat);
2611
} else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12 like
2612
const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2613
if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
2614
if (is16BPS(dstFormat)) {
2616
vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2617
vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2618
alpSrcPtr, (uint16_t *) dest, (uint16_t *) uDest, (uint16_t *) vDest, (uint16_t *) aDest, dstW, chrDstW,
2620
} else if (vLumFilterSize == 1 && vChrFilterSize == 1) { // unscaled YV12
2621
const int16_t *lumBuf = lumSrcPtr[0];
2622
const int16_t *chrBuf= chrSrcPtr[0];
2623
const int16_t *alpBuf= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? alpSrcPtr[0] : NULL;
2624
c->yuv2yuv1(c, lumBuf, chrBuf, alpBuf, dest, uDest, vDest, aDest, dstW, chrDstW);
2625
} else { //General YV12
2627
vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2628
vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2629
alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
2632
assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2633
assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
2634
if (vLumFilterSize == 1 && vChrFilterSize == 2) { //unscaled RGB
2635
int chrAlpha= vChrFilter[2*dstY+1];
2636
if(flags & SWS_FULL_CHR_H_INT) {
2637
yuv2rgbXinC_full(c, //FIXME write a packed1_full function
2638
vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2639
vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2640
alpSrcPtr, dest, dstW, dstY);
2642
c->yuv2packed1(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
2643
alpPixBuf ? *alpSrcPtr : NULL,
2644
dest, dstW, chrAlpha, dstFormat, flags, dstY);
2646
} else if (vLumFilterSize == 2 && vChrFilterSize == 2) { //bilinear upscale RGB
2647
int lumAlpha= vLumFilter[2*dstY+1];
2648
int chrAlpha= vChrFilter[2*dstY+1];
2650
lumMmxFilter[3]= vLumFilter[2*dstY ]*0x10001;
2652
chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
2653
if(flags & SWS_FULL_CHR_H_INT) {
2654
yuv2rgbXinC_full(c, //FIXME write a packed2_full function
2655
vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2656
vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2657
alpSrcPtr, dest, dstW, dstY);
2659
c->yuv2packed2(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
2660
alpPixBuf ? *alpSrcPtr : NULL, alpPixBuf ? *(alpSrcPtr+1) : NULL,
2661
dest, dstW, lumAlpha, chrAlpha, dstY);
2663
} else { //general RGB
2664
if(flags & SWS_FULL_CHR_H_INT) {
2666
vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2667
vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2668
alpSrcPtr, dest, dstW, dstY);
2671
vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2672
vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2673
alpSrcPtr, dest, dstW, dstY);
2677
} else { // hmm looks like we can't use MMX here without overwriting this array's tail
2678
const int16_t **lumSrcPtr= (const int16_t **)lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2679
const int16_t **chrSrcPtr= (const int16_t **)chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2680
const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **)alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
2681
if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
2682
const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2683
if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
2685
vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2686
vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2687
dest, uDest, dstW, chrDstW, dstFormat);
2688
} else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12
2689
const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2690
if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
2691
if (is16BPS(dstFormat)) {
2693
vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2694
vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2695
alpSrcPtr, (uint16_t *) dest, (uint16_t *) uDest, (uint16_t *) vDest, (uint16_t *) aDest, dstW, chrDstW,
2699
vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2700
vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2701
alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
2704
assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2705
assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
2706
if(flags & SWS_FULL_CHR_H_INT) {
2708
vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2709
vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2710
alpSrcPtr, dest, dstW, dstY);
2713
vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2714
vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2715
alpSrcPtr, dest, dstW, dstY);
2721
if ((dstFormat == PIX_FMT_YUVA420P) && !alpPixBuf)
2722
fillPlane(dst[3], dstStride[3], dstW, dstY-lastDstY, lastDstY, 255);
2724
if (flags & SWS_CPU_CAPS_MMX2 ) __asm__ volatile("sfence":::"memory");
2725
/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
2726
if (flags & SWS_CPU_CAPS_3DNOW) __asm__ volatile("femms" :::"memory");
2727
else __asm__ volatile("emms" :::"memory");
2728
/* store changed local vars back in the context */
2730
c->lumBufIndex= lumBufIndex;
2731
c->chrBufIndex= chrBufIndex;
2732
c->lastInLumBuf= lastInLumBuf;
2733
c->lastInChrBuf= lastInChrBuf;
2735
return dstY - lastDstY;
2738
static void RENAME(sws_init_swScale)(SwsContext *c)
2740
enum PixelFormat srcFormat = c->srcFormat;
2742
c->yuv2nv12X = RENAME(yuv2nv12X );
2743
c->yuv2yuv1 = RENAME(yuv2yuv1 );
2744
c->yuv2yuvX = RENAME(yuv2yuvX );
2745
c->yuv2packed1 = RENAME(yuv2packed1 );
2746
c->yuv2packed2 = RENAME(yuv2packed2 );
2747
c->yuv2packedX = RENAME(yuv2packedX );
2749
c->hScale = RENAME(hScale );
2751
// Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2752
if (c->flags & SWS_FAST_BILINEAR && c->canMMX2BeUsed)
2754
c->hyscale_fast = RENAME(hyscale_fast);
2755
c->hcscale_fast = RENAME(hcscale_fast);
2757
c->hyscale_fast = NULL;
2758
c->hcscale_fast = NULL;
2762
case PIX_FMT_YUYV422 : c->chrToYV12 = RENAME(yuy2ToUV); break;
2763
case PIX_FMT_UYVY422 : c->chrToYV12 = RENAME(uyvyToUV); break;
2764
case PIX_FMT_NV12 : c->chrToYV12 = RENAME(nv12ToUV); break;
2765
case PIX_FMT_NV21 : c->chrToYV12 = RENAME(nv21ToUV); break;
2766
case PIX_FMT_YUV420P16BE:
2767
case PIX_FMT_YUV422P16BE:
2768
case PIX_FMT_YUV444P16BE: c->chrToYV12 = RENAME(BEToUV); break;
2769
case PIX_FMT_YUV420P16LE:
2770
case PIX_FMT_YUV422P16LE:
2771
case PIX_FMT_YUV444P16LE: c->chrToYV12 = RENAME(LEToUV); break;
2774
if (c->chrSrcHSubSample) {
2776
case PIX_FMT_BGR24 : c->chrToYV12 = RENAME(bgr24ToUV_half); break;
2777
case PIX_FMT_RGB24 : c->chrToYV12 = RENAME(rgb24ToUV_half); break;
2782
case PIX_FMT_BGR24 : c->chrToYV12 = RENAME(bgr24ToUV); break;
2783
case PIX_FMT_RGB24 : c->chrToYV12 = RENAME(rgb24ToUV); break;
2788
switch (srcFormat) {
2789
case PIX_FMT_YUYV422 :
2790
case PIX_FMT_YUV420P16BE:
2791
case PIX_FMT_YUV422P16BE:
2792
case PIX_FMT_YUV444P16BE:
2793
case PIX_FMT_Y400A :
2794
case PIX_FMT_GRAY16BE : c->lumToYV12 = RENAME(yuy2ToY); break;
2795
case PIX_FMT_UYVY422 :
2796
case PIX_FMT_YUV420P16LE:
2797
case PIX_FMT_YUV422P16LE:
2798
case PIX_FMT_YUV444P16LE:
2799
case PIX_FMT_GRAY16LE : c->lumToYV12 = RENAME(uyvyToY); break;
2800
case PIX_FMT_BGR24 : c->lumToYV12 = RENAME(bgr24ToY); break;
2801
case PIX_FMT_RGB24 : c->lumToYV12 = RENAME(rgb24ToY); break;
2805
switch (srcFormat) {
2806
case PIX_FMT_Y400A : c->alpToYV12 = RENAME(yuy2ToY); break;
2811
if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
2813
c->lumConvertRange = RENAME(lumRangeFromJpeg);
2814
c->chrConvertRange = RENAME(chrRangeFromJpeg);
2816
c->lumConvertRange = RENAME(lumRangeToJpeg);
2817
c->chrConvertRange = RENAME(chrRangeToJpeg);