 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * the C code (not assembly, mmx, ...) of this file can be used
 * under the LGPL license too
33
/*
 * Per-CPU instruction-selection macros: PREFETCH/PREFETCHW, SFENCE, PAVGB
 * and MOVNTQ expand to the best available instruction text (3DNow, MMX2,
 * or a nop/plain-mov fallback) for splicing into inline-asm strings below.
 * NOTE(review): this chunk is extraction-garbled — the bare decimal lines
 * are original line numbers, and the #ifdef/#else/#endif guard lines that
 * selected between these alternatives appear to have been dropped; confirm
 * against the pristine file before building.
 */
/* On K6 femms is faster than emms. On K7 femms is directly mapped on emms. */
40
#define PREFETCH "prefetch"
41
#define PREFETCHW "prefetchw"
42
#elif defined ( HAVE_MMX2 )
43
#define PREFETCH "prefetchnta"
44
#define PREFETCHW "prefetcht0"
46
#define PREFETCH " # nop"
47
#define PREFETCHW " # nop"
51
#define SFENCE "sfence"
53
#define SFENCE " # nop"
57
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
58
#elif defined (HAVE_3DNOW)
59
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
63
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
65
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
67
#define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
70
#include "swscale_altivec_template.c"
73
/*
 * YSCALEYUV2YV12X(x, offset, dest, width): MMX vertical-filter fragment.
 * Walks a filter list at " offset "(%0) (coeff + src-pointer pairs, 16 bytes
 * apart), accumulates pmulhw(src, coeff) into a VROUNDER-initialized sum in
 * mm3/mm4, shifts >>3, packs to unsigned bytes and stores via MOVNTQ.
 * NOTE(review): extraction artifact — interleaved decimal lines are pristine
 * line numbers; the asm volatile( wrapper, the "1:"/"2:" loop labels and the
 * jnz/jb back-branches that pair with the visible test/cmp instructions seem
 * to be missing from this view. Verify against upstream before compiling.
 */
#define YSCALEYUV2YV12X(x, offset, dest, width) \
75
"xor %%"REG_a", %%"REG_a" \n\t"\
76
"movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
77
"movq %%mm3, %%mm4 \n\t"\
78
"lea " offset "(%0), %%"REG_d" \n\t"\
79
"mov (%%"REG_d"), %%"REG_S" \n\t"\
80
ASMALIGN(4) /* FIXME Unroll? */\
82
"movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
83
"movq " #x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
84
"movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* srcData */\
85
"add $16, %%"REG_d" \n\t"\
86
"mov (%%"REG_d"), %%"REG_S" \n\t"\
87
"test %%"REG_S", %%"REG_S" \n\t"\
88
"pmulhw %%mm0, %%mm2 \n\t"\
89
"pmulhw %%mm0, %%mm5 \n\t"\
90
"paddw %%mm2, %%mm3 \n\t"\
91
"paddw %%mm5, %%mm4 \n\t"\
93
"psraw $3, %%mm3 \n\t"\
94
"psraw $3, %%mm4 \n\t"\
95
"packuswb %%mm4, %%mm3 \n\t"\
96
MOVNTQ(%%mm3, (%1, %%REGa))\
97
"add $8, %%"REG_a" \n\t"\
98
"cmp %2, %%"REG_a" \n\t"\
99
"movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
100
"movq %%mm3, %%mm4 \n\t"\
101
"lea " offset "(%0), %%"REG_d" \n\t"\
102
"mov (%%"REG_d"), %%"REG_S" \n\t"\
104
:: "r" (&c->redDither),\
105
"r" (dest), "g" (width)\
106
: "%"REG_a, "%"REG_d, "%"REG_S\
109
/*
 * YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width): higher-precision variant
 * of YSCALEYUV2YV12X. Interleaves two source rows with punpcklwd/punpckhwd
 * and uses pmaddwd into 32-bit accumulators (mm4-mm7), then psrad $16,
 * packssdw, adds the VROUNDER bias, shifts >>3, packs and stores via MOVNTQ.
 * Same in/out/clobber contract as the plain variant.
 * NOTE(review): extraction artifact — interleaved decimal lines are pristine
 * line numbers; the asm wrapper and the loop labels/branches matching the
 * visible test/cmp instructions appear to be missing from this view.
 */
#define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
111
"lea " offset "(%0), %%"REG_d" \n\t"\
112
"xor %%"REG_a", %%"REG_a" \n\t"\
113
"pxor %%mm4, %%mm4 \n\t"\
114
"pxor %%mm5, %%mm5 \n\t"\
115
"pxor %%mm6, %%mm6 \n\t"\
116
"pxor %%mm7, %%mm7 \n\t"\
117
"mov (%%"REG_d"), %%"REG_S" \n\t"\
120
"movq " #x "(%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* srcData */\
121
"movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
122
"mov 4(%%"REG_d"), %%"REG_S" \n\t"\
123
"movq " #x "(%%"REG_S", %%"REG_a", 2), %%mm1 \n\t" /* srcData */\
124
"movq %%mm0, %%mm3 \n\t"\
125
"punpcklwd %%mm1, %%mm0 \n\t"\
126
"punpckhwd %%mm1, %%mm3 \n\t"\
127
"movq 8(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
128
"pmaddwd %%mm1, %%mm0 \n\t"\
129
"pmaddwd %%mm1, %%mm3 \n\t"\
130
"paddd %%mm0, %%mm4 \n\t"\
131
"paddd %%mm3, %%mm5 \n\t"\
132
"movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* srcData */\
133
"mov 16(%%"REG_d"), %%"REG_S" \n\t"\
134
"add $16, %%"REG_d" \n\t"\
135
"test %%"REG_S", %%"REG_S" \n\t"\
136
"movq %%mm2, %%mm0 \n\t"\
137
"punpcklwd %%mm3, %%mm2 \n\t"\
138
"punpckhwd %%mm3, %%mm0 \n\t"\
139
"pmaddwd %%mm1, %%mm2 \n\t"\
140
"pmaddwd %%mm1, %%mm0 \n\t"\
141
"paddd %%mm2, %%mm6 \n\t"\
142
"paddd %%mm0, %%mm7 \n\t"\
144
"psrad $16, %%mm4 \n\t"\
145
"psrad $16, %%mm5 \n\t"\
146
"psrad $16, %%mm6 \n\t"\
147
"psrad $16, %%mm7 \n\t"\
148
"movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
149
"packssdw %%mm5, %%mm4 \n\t"\
150
"packssdw %%mm7, %%mm6 \n\t"\
151
"paddw %%mm0, %%mm4 \n\t"\
152
"paddw %%mm0, %%mm6 \n\t"\
153
"psraw $3, %%mm4 \n\t"\
154
"psraw $3, %%mm6 \n\t"\
155
"packuswb %%mm6, %%mm4 \n\t"\
156
MOVNTQ(%%mm4, (%1, %%REGa))\
157
"add $8, %%"REG_a" \n\t"\
158
"cmp %2, %%"REG_a" \n\t"\
159
"lea " offset "(%0), %%"REG_d" \n\t"\
160
"pxor %%mm4, %%mm4 \n\t"\
161
"pxor %%mm5, %%mm5 \n\t"\
162
"pxor %%mm6, %%mm6 \n\t"\
163
"pxor %%mm7, %%mm7 \n\t"\
164
"mov (%%"REG_d"), %%"REG_S" \n\t"\
166
:: "r" (&c->redDither),\
167
"r" (dest), "g" (width)\
168
: "%"REG_a, "%"REG_d, "%"REG_S\
171
/*
 * YSCALEYUV2YV121: unfiltered 1:1 vertical pass — loads 16-bit samples,
 * shifts >>7 back to 8-bit range, packs and stores 8 pixels per iteration
 * via MOVNTQ.
 * NOTE(review): extraction artifact — interleaved decimal lines are pristine
 * line numbers; the loop label/branch are missing, and the operand/clobber
 * list that follows (":: \"m\" (-lumFilterSize) ...") does not belong to this
 * macro — in the pristine file it sits inside a commented-out example whose
 * comment delimiters were dropped here. Verify against upstream.
 */
#define YSCALEYUV2YV121 \
172
"mov %2, %%"REG_a" \n\t"\
173
ASMALIGN(4) /* FIXME Unroll? */\
175
"movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
176
"movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
177
"psraw $7, %%mm0 \n\t"\
178
"psraw $7, %%mm1 \n\t"\
179
"packuswb %%mm1, %%mm0 \n\t"\
180
MOVNTQ(%%mm0, (%1, %%REGa))\
181
"add $8, %%"REG_a" \n\t"\
185
:: "m" (-lumFilterSize), "m" (-chrFilterSize),
186
"m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
187
"r" (dest), "m" (dstW),
188
"m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
189
: "%eax", "%ebx", "%ecx", "%edx", "%esi"
191
/*
 * YSCALEYUV2PACKEDX: vertical filtering for packed output. First pass walks
 * the chroma filter list (U at (%%REG_S,%%REG_a), V at +4096) accumulating
 * into mm3/mm4; second pass walks the luma filter list accumulating Y1/Y2
 * into mm1/mm7. YSCALEYUV2PACKEDX_END supplies the shared operand/clobber
 * tail (c->redDither base pointer, dest, dstW).
 * NOTE(review): extraction artifact — interleaved decimal lines are pristine
 * line numbers; asm wrapper, loop labels and branches matching the visible
 * test instructions are missing from this view.
 */
#define YSCALEYUV2PACKEDX \
193
"xor %%"REG_a", %%"REG_a" \n\t"\
197
"lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
198
"mov (%%"REG_d"), %%"REG_S" \n\t"\
199
"movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
200
"movq %%mm3, %%mm4 \n\t"\
203
"movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
204
"movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
205
"movq 4096(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
206
"add $16, %%"REG_d" \n\t"\
207
"mov (%%"REG_d"), %%"REG_S" \n\t"\
208
"pmulhw %%mm0, %%mm2 \n\t"\
209
"pmulhw %%mm0, %%mm5 \n\t"\
210
"paddw %%mm2, %%mm3 \n\t"\
211
"paddw %%mm5, %%mm4 \n\t"\
212
"test %%"REG_S", %%"REG_S" \n\t"\
215
"lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
216
"mov (%%"REG_d"), %%"REG_S" \n\t"\
217
"movq "VROUNDER_OFFSET"(%0), %%mm1 \n\t"\
218
"movq %%mm1, %%mm7 \n\t"\
221
"movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
222
"movq (%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y1srcData */\
223
"movq 8(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* Y2srcData */\
224
"add $16, %%"REG_d" \n\t"\
225
"mov (%%"REG_d"), %%"REG_S" \n\t"\
226
"pmulhw %%mm0, %%mm2 \n\t"\
227
"pmulhw %%mm0, %%mm5 \n\t"\
228
"paddw %%mm2, %%mm1 \n\t"\
229
"paddw %%mm5, %%mm7 \n\t"\
230
"test %%"REG_S", %%"REG_S" \n\t"\
233
#define YSCALEYUV2PACKEDX_END \
234
:: "r" (&c->redDither), \
235
"m" (dummy), "m" (dummy), "m" (dummy),\
236
"r" (dest), "m" (dstW) \
237
: "%"REG_a, "%"REG_d, "%"REG_S \
240
/*
 * YSCALEYUV2PACKEDX_ACCURATE: high-precision variant of YSCALEYUV2PACKEDX.
 * Chroma pass: punpck + pmaddwd into 32-bit accumulators, psrad/packssdw,
 * VROUNDER bias added, results parked in U_TEMP/V_TEMP context slots.
 * Luma pass: same scheme into mm1/mm7, then U/V reloaded into mm3/mm4 so the
 * register layout matches what YSCALEYUV2RGBX expects. Closed by
 * YSCALEYUV2PACKEDX_END.
 * NOTE(review): extraction artifact — interleaved decimal lines are pristine
 * line numbers; asm wrapper and loop labels/branches are missing here.
 */
#define YSCALEYUV2PACKEDX_ACCURATE \
242
"xor %%"REG_a", %%"REG_a" \n\t"\
246
"lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
247
"mov (%%"REG_d"), %%"REG_S" \n\t"\
248
"pxor %%mm4, %%mm4 \n\t"\
249
"pxor %%mm5, %%mm5 \n\t"\
250
"pxor %%mm6, %%mm6 \n\t"\
251
"pxor %%mm7, %%mm7 \n\t"\
254
"movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
255
"movq 4096(%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
256
"mov 4(%%"REG_d"), %%"REG_S" \n\t"\
257
"movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
258
"movq %%mm0, %%mm3 \n\t"\
259
"punpcklwd %%mm1, %%mm0 \n\t"\
260
"punpckhwd %%mm1, %%mm3 \n\t"\
261
"movq 8(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
262
"pmaddwd %%mm1, %%mm0 \n\t"\
263
"pmaddwd %%mm1, %%mm3 \n\t"\
264
"paddd %%mm0, %%mm4 \n\t"\
265
"paddd %%mm3, %%mm5 \n\t"\
266
"movq 4096(%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
267
"mov 16(%%"REG_d"), %%"REG_S" \n\t"\
268
"add $16, %%"REG_d" \n\t"\
269
"test %%"REG_S", %%"REG_S" \n\t"\
270
"movq %%mm2, %%mm0 \n\t"\
271
"punpcklwd %%mm3, %%mm2 \n\t"\
272
"punpckhwd %%mm3, %%mm0 \n\t"\
273
"pmaddwd %%mm1, %%mm2 \n\t"\
274
"pmaddwd %%mm1, %%mm0 \n\t"\
275
"paddd %%mm2, %%mm6 \n\t"\
276
"paddd %%mm0, %%mm7 \n\t"\
278
"psrad $16, %%mm4 \n\t"\
279
"psrad $16, %%mm5 \n\t"\
280
"psrad $16, %%mm6 \n\t"\
281
"psrad $16, %%mm7 \n\t"\
282
"movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
283
"packssdw %%mm5, %%mm4 \n\t"\
284
"packssdw %%mm7, %%mm6 \n\t"\
285
"paddw %%mm0, %%mm4 \n\t"\
286
"paddw %%mm0, %%mm6 \n\t"\
287
"movq %%mm4, "U_TEMP"(%0) \n\t"\
288
"movq %%mm6, "V_TEMP"(%0) \n\t"\
290
"lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
291
"mov (%%"REG_d"), %%"REG_S" \n\t"\
292
"pxor %%mm1, %%mm1 \n\t"\
293
"pxor %%mm5, %%mm5 \n\t"\
294
"pxor %%mm7, %%mm7 \n\t"\
295
"pxor %%mm6, %%mm6 \n\t"\
298
"movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
299
"movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
300
"mov 4(%%"REG_d"), %%"REG_S" \n\t"\
301
"movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
302
"movq %%mm0, %%mm3 \n\t"\
303
"punpcklwd %%mm4, %%mm0 \n\t"\
304
"punpckhwd %%mm4, %%mm3 \n\t"\
305
"movq 8(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
306
"pmaddwd %%mm4, %%mm0 \n\t"\
307
"pmaddwd %%mm4, %%mm3 \n\t"\
308
"paddd %%mm0, %%mm1 \n\t"\
309
"paddd %%mm3, %%mm5 \n\t"\
310
"movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
311
"mov 16(%%"REG_d"), %%"REG_S" \n\t"\
312
"add $16, %%"REG_d" \n\t"\
313
"test %%"REG_S", %%"REG_S" \n\t"\
314
"movq %%mm2, %%mm0 \n\t"\
315
"punpcklwd %%mm3, %%mm2 \n\t"\
316
"punpckhwd %%mm3, %%mm0 \n\t"\
317
"pmaddwd %%mm4, %%mm2 \n\t"\
318
"pmaddwd %%mm4, %%mm0 \n\t"\
319
"paddd %%mm2, %%mm7 \n\t"\
320
"paddd %%mm0, %%mm6 \n\t"\
322
"psrad $16, %%mm1 \n\t"\
323
"psrad $16, %%mm5 \n\t"\
324
"psrad $16, %%mm7 \n\t"\
325
"psrad $16, %%mm6 \n\t"\
326
"movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
327
"packssdw %%mm5, %%mm1 \n\t"\
328
"packssdw %%mm6, %%mm7 \n\t"\
329
"paddw %%mm0, %%mm1 \n\t"\
330
"paddw %%mm0, %%mm7 \n\t"\
331
"movq "U_TEMP"(%0), %%mm3 \n\t"\
332
"movq "V_TEMP"(%0), %%mm4 \n\t"\
334
/*
 * YSCALEYUV2RGBX: YUV->RGB conversion stage. Expects mm1/mm7 = Y1/Y2 and
 * mm3/mm4 = U/V (as left by the PACKEDX macros above); applies the per-
 * context offsets/coefficients (U_OFFSET..Y_COEFF at (%0)), interleaves
 * low/high halves, and leaves packed bytes in mm2=B1B2, mm5=R1R2, mm4=G1G2
 * with mm7 zeroed for the WRITE* macros. Register roles are traced in the
 * inline comments.
 * NOTE(review): interleaved decimal lines are pristine line numbers left by
 * extraction; remove them before compiling.
 */
#define YSCALEYUV2RGBX \
335
"psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
336
"psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
337
"movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
338
"movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
339
"pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
340
"pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
341
/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
342
"pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
343
"pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
344
"psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
345
"psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
346
"pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
347
"pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
348
/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
349
"paddw %%mm3, %%mm4 \n\t"\
350
"movq %%mm2, %%mm0 \n\t"\
351
"movq %%mm5, %%mm6 \n\t"\
352
"movq %%mm4, %%mm3 \n\t"\
353
"punpcklwd %%mm2, %%mm2 \n\t"\
354
"punpcklwd %%mm5, %%mm5 \n\t"\
355
"punpcklwd %%mm4, %%mm4 \n\t"\
356
"paddw %%mm1, %%mm2 \n\t"\
357
"paddw %%mm1, %%mm5 \n\t"\
358
"paddw %%mm1, %%mm4 \n\t"\
359
"punpckhwd %%mm0, %%mm0 \n\t"\
360
"punpckhwd %%mm6, %%mm6 \n\t"\
361
"punpckhwd %%mm3, %%mm3 \n\t"\
362
"paddw %%mm7, %%mm0 \n\t"\
363
"paddw %%mm7, %%mm6 \n\t"\
364
"paddw %%mm7, %%mm3 \n\t"\
365
/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
366
"packuswb %%mm0, %%mm2 \n\t"\
367
"packuswb %%mm6, %%mm5 \n\t"\
368
"packuswb %%mm3, %%mm4 \n\t"\
369
"pxor %%mm7, %%mm7 \n\t"
371
/*
 * FULL_YSCALEYUV2RGB: full-chroma-resolution YUV->RGB with temporal blending.
 * Broadcasts yalpha1 (%6) / uvalpha1 (%7) across mm6/mm5, linearly
 * interpolates buf0/buf1 (luma) and uvbuf0/uvbuf1 (chroma, V at +4096), then
 * applies the MANGLE()d global coefficient tables (w80, w400, yCoeff,
 * ub/ug/vr/vgCoeff), leaving packed B in mm3, R in mm0, G in mm1.
 * NOTE(review): interleaved decimal lines are pristine line numbers left by
 * extraction; the asm wrapper and loop structure are not visible here.
 */
#define FULL_YSCALEYUV2RGB \
372
"pxor %%mm7, %%mm7 \n\t"\
373
"movd %6, %%mm6 \n\t" /*yalpha1*/\
374
"punpcklwd %%mm6, %%mm6 \n\t"\
375
"punpcklwd %%mm6, %%mm6 \n\t"\
376
"movd %7, %%mm5 \n\t" /*uvalpha1*/\
377
"punpcklwd %%mm5, %%mm5 \n\t"\
378
"punpcklwd %%mm5, %%mm5 \n\t"\
379
"xor %%"REG_a", %%"REG_a" \n\t"\
382
"movq (%0, %%"REG_a",2), %%mm0 \n\t" /*buf0[eax]*/\
383
"movq (%1, %%"REG_a",2), %%mm1 \n\t" /*buf1[eax]*/\
384
"movq (%2, %%"REG_a",2), %%mm2 \n\t" /* uvbuf0[eax]*/\
385
"movq (%3, %%"REG_a",2), %%mm3 \n\t" /* uvbuf1[eax]*/\
386
"psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
387
"psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
388
"pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
389
"pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
390
"psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
391
"movq 4096(%2, %%"REG_a",2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
392
"psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
393
"paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
394
"movq 4096(%3, %%"REG_a",2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\
395
"paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
396
"psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
397
"psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
398
"psubw "MANGLE(w400)", %%mm3 \n\t" /* 8(U-128)*/\
399
"pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
402
"pmulhw %%mm5, %%mm4 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
403
"movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
404
"pmulhw "MANGLE(ubCoeff)", %%mm3 \n\t"\
405
"psraw $4, %%mm0 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
406
"pmulhw "MANGLE(ugCoeff)", %%mm2 \n\t"\
407
"paddw %%mm4, %%mm0 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
408
"psubw "MANGLE(w400)", %%mm0 \n\t" /* (V-128)8*/\
411
"movq %%mm0, %%mm4 \n\t" /* (V-128)8*/\
412
"pmulhw "MANGLE(vrCoeff)", %%mm0 \n\t"\
413
"pmulhw "MANGLE(vgCoeff)", %%mm4 \n\t"\
414
"paddw %%mm1, %%mm3 \n\t" /* B*/\
415
"paddw %%mm1, %%mm0 \n\t" /* R*/\
416
"packuswb %%mm3, %%mm3 \n\t"\
418
"packuswb %%mm0, %%mm0 \n\t"\
419
"paddw %%mm4, %%mm2 \n\t"\
420
"paddw %%mm2, %%mm1 \n\t" /* G*/\
422
"packuswb %%mm1, %%mm1 \n\t"
425
/*
 * REAL_YSCALEYUV2PACKED(index, c): two-row blend producing packed YUV
 * (no RGB conversion). Pre-shifts the stored filter coefficients >>3,
 * interpolates chroma (uvbuf0/uvbuf1, V at +4096) into mm3/mm4 and luma
 * (buf0/buf1) into mm1/mm7 using pmulhw with the per-context coefficients.
 * YSCALEYUV2PACKED is the conventional argument-expanding wrapper.
 * NOTE(review): interleaved decimal lines are pristine line numbers left by
 * extraction; loop labels/branches are not visible in this view.
 */
#define REAL_YSCALEYUV2PACKED(index, c) \
426
"movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
427
"movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
428
"psraw $3, %%mm0 \n\t"\
429
"psraw $3, %%mm1 \n\t"\
430
"movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
431
"movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
432
"xor "#index", "#index" \n\t"\
435
"movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
436
"movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
437
"movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
438
"movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
439
"psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
440
"psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
441
"movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
442
"pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
443
"pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
444
"psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
445
"psraw $7, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
446
"paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
447
"paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
448
"movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
449
"movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
450
"movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
451
"movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
452
"psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
453
"psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
454
"pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
455
"pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
456
"psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
457
"psraw $7, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
458
"paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
459
"paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
461
#define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
463
/*
 * REAL_YSCALEYUV2RGB(index, c): two-row blend plus YUV->RGB conversion.
 * Interpolates chroma and luma between two source rows (as in
 * REAL_YSCALEYUV2PACKED but with >>4 precision), then applies the context
 * coefficient tables at ("#c") and interleaves into packed bytes:
 * mm2=B1B2, mm5=R1R2, mm4=G1G2, mm7=0, ready for the WRITE* macros.
 * YSCALEYUV2RGB is the argument-expanding wrapper.
 * NOTE(review): interleaved decimal lines are pristine line numbers left by
 * extraction; loop labels/branches are not visible in this view.
 */
#define REAL_YSCALEYUV2RGB(index, c) \
464
"xor "#index", "#index" \n\t"\
467
"movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
468
"movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
469
"movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
470
"movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
471
"psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
472
"psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
473
"movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
474
"pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
475
"pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
476
"psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
477
"psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
478
"paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
479
"paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
480
"psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
481
"psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
482
"movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
483
"movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
484
"pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
485
"pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
486
/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
487
"movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
488
"movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
489
"movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
490
"movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
491
"psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
492
"psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
493
"pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
494
"pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
495
"psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
496
"psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
497
"paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
498
"paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
499
"pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
500
"pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
501
"psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
502
"psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
503
"pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
504
"pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
505
/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
506
"paddw %%mm3, %%mm4 \n\t"\
507
"movq %%mm2, %%mm0 \n\t"\
508
"movq %%mm5, %%mm6 \n\t"\
509
"movq %%mm4, %%mm3 \n\t"\
510
"punpcklwd %%mm2, %%mm2 \n\t"\
511
"punpcklwd %%mm5, %%mm5 \n\t"\
512
"punpcklwd %%mm4, %%mm4 \n\t"\
513
"paddw %%mm1, %%mm2 \n\t"\
514
"paddw %%mm1, %%mm5 \n\t"\
515
"paddw %%mm1, %%mm4 \n\t"\
516
"punpckhwd %%mm0, %%mm0 \n\t"\
517
"punpckhwd %%mm6, %%mm6 \n\t"\
518
"punpckhwd %%mm3, %%mm3 \n\t"\
519
"paddw %%mm7, %%mm0 \n\t"\
520
"paddw %%mm7, %%mm6 \n\t"\
521
"paddw %%mm7, %%mm3 \n\t"\
522
/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
523
"packuswb %%mm0, %%mm2 \n\t"\
524
"packuswb %%mm6, %%mm5 \n\t"\
525
"packuswb %%mm3, %%mm4 \n\t"\
526
"pxor %%mm7, %%mm7 \n\t"
527
#define YSCALEYUV2RGB(index, c) REAL_YSCALEYUV2RGB(index, c)
529
/*
 * REAL_YSCALEYUV2PACKED1(index, c): single-source-row (no blending) packed
 * YUV path — loads one chroma row (V at +4096) and one luma row and simply
 * shifts >>7 back to 8-bit range. YSCALEYUV2PACKED1 is the wrapper.
 * NOTE(review): interleaved decimal lines are pristine line numbers left by
 * extraction; loop structure is not visible here.
 */
#define REAL_YSCALEYUV2PACKED1(index, c) \
530
"xor "#index", "#index" \n\t"\
533
"movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
534
"movq 4096(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
535
"psraw $7, %%mm3 \n\t" \
536
"psraw $7, %%mm4 \n\t" \
537
"movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
538
"movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
539
"psraw $7, %%mm1 \n\t" \
540
"psraw $7, %%mm7 \n\t" \
542
#define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
544
/*
 * REAL_YSCALEYUV2RGB1(index, c): single-source-row YUV->RGB path — no
 * vertical blending; samples one chroma row and one luma row (>>4), then
 * applies the same coefficient/interleave sequence as REAL_YSCALEYUV2RGB,
 * ending with mm2=B, mm5=R, mm4=G packed and mm7=0. YSCALEYUV2RGB1 wraps it.
 * NOTE(review): interleaved decimal lines are pristine line numbers left by
 * extraction; loop structure is not visible here.
 */
#define REAL_YSCALEYUV2RGB1(index, c) \
545
"xor "#index", "#index" \n\t"\
548
"movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
549
"movq 4096(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
550
"psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
551
"psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
552
"psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
553
"psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
554
"movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
555
"movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
556
"pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
557
"pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
558
/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
559
"movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
560
"movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
561
"psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
562
"psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
563
"pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
564
"pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
565
"psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
566
"psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
567
"pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
568
"pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
569
/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
570
"paddw %%mm3, %%mm4 \n\t"\
571
"movq %%mm2, %%mm0 \n\t"\
572
"movq %%mm5, %%mm6 \n\t"\
573
"movq %%mm4, %%mm3 \n\t"\
574
"punpcklwd %%mm2, %%mm2 \n\t"\
575
"punpcklwd %%mm5, %%mm5 \n\t"\
576
"punpcklwd %%mm4, %%mm4 \n\t"\
577
"paddw %%mm1, %%mm2 \n\t"\
578
"paddw %%mm1, %%mm5 \n\t"\
579
"paddw %%mm1, %%mm4 \n\t"\
580
"punpckhwd %%mm0, %%mm0 \n\t"\
581
"punpckhwd %%mm6, %%mm6 \n\t"\
582
"punpckhwd %%mm3, %%mm3 \n\t"\
583
"paddw %%mm7, %%mm0 \n\t"\
584
"paddw %%mm7, %%mm6 \n\t"\
585
"paddw %%mm7, %%mm3 \n\t"\
586
/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
587
"packuswb %%mm0, %%mm2 \n\t"\
588
"packuswb %%mm6, %%mm5 \n\t"\
589
"packuswb %%mm3, %%mm4 \n\t"\
590
"pxor %%mm7, %%mm7 \n\t"
591
#define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
593
/*
 * REAL_YSCALEYUV2PACKED1b(index, c): packed-YUV path averaging two chroma
 * rows (paddw then psrlw $8 == (a+b)/2 scaled back to 8 bits) while taking
 * luma from a single row (>>7). YSCALEYUV2PACKED1b is the wrapper.
 * NOTE(review): interleaved decimal lines are pristine line numbers left by
 * extraction; loop structure is not visible here.
 */
#define REAL_YSCALEYUV2PACKED1b(index, c) \
594
"xor "#index", "#index" \n\t"\
597
"movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
598
"movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
599
"movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
600
"movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
601
"paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
602
"paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
603
"psrlw $8, %%mm3 \n\t" \
604
"psrlw $8, %%mm4 \n\t" \
605
"movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
606
"movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
607
"psraw $7, %%mm1 \n\t" \
608
"psraw $7, %%mm7 \n\t"
609
#define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
611
// Vertical chrominance interpolation: averages the two chroma rows
// (paddw + psrlw $5) before the standard YUV->RGB coefficient stage.
/*
 * REAL_YSCALEYUV2RGB1b(index, c): like REAL_YSCALEYUV2RGB1 but chroma is the
 * average of uvbuf0/uvbuf1; ends with mm2=B, mm5=R, mm4=G packed, mm7=0.
 * YSCALEYUV2RGB1b is the wrapper.
 * NOTE(review): interleaved decimal lines are pristine line numbers left by
 * extraction; loop structure is not visible here.
 */
#define REAL_YSCALEYUV2RGB1b(index, c) \
613
"xor "#index", "#index" \n\t"\
616
"movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
617
"movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
618
"movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
619
"movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
620
"paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
621
"paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
622
"psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
623
"psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
624
"psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
625
"psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
626
"movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
627
"movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
628
"pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
629
"pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
630
/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
631
"movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
632
"movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
633
"psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
634
"psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
635
"pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
636
"pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
637
"psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
638
"psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
639
"pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
640
"pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
641
/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
642
"paddw %%mm3, %%mm4 \n\t"\
643
"movq %%mm2, %%mm0 \n\t"\
644
"movq %%mm5, %%mm6 \n\t"\
645
"movq %%mm4, %%mm3 \n\t"\
646
"punpcklwd %%mm2, %%mm2 \n\t"\
647
"punpcklwd %%mm5, %%mm5 \n\t"\
648
"punpcklwd %%mm4, %%mm4 \n\t"\
649
"paddw %%mm1, %%mm2 \n\t"\
650
"paddw %%mm1, %%mm5 \n\t"\
651
"paddw %%mm1, %%mm4 \n\t"\
652
"punpckhwd %%mm0, %%mm0 \n\t"\
653
"punpckhwd %%mm6, %%mm6 \n\t"\
654
"punpckhwd %%mm3, %%mm3 \n\t"\
655
"paddw %%mm7, %%mm0 \n\t"\
656
"paddw %%mm7, %%mm6 \n\t"\
657
"paddw %%mm7, %%mm3 \n\t"\
658
/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
659
"packuswb %%mm0, %%mm2 \n\t"\
660
"packuswb %%mm6, %%mm5 \n\t"\
661
"packuswb %%mm3, %%mm4 \n\t"\
662
"pxor %%mm7, %%mm7 \n\t"
663
#define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
665
/*
 * REAL_WRITEBGR32(dst, dstw, index): expands packed mm2=B, mm4=G, mm5=R
 * (mm7=0) into four 0RGB dwords per half and stores 32 output bytes with
 * MOVNTQ; advances index by 8 pixels and compares against dstw.
 * NOTE(review): interleaved decimal lines are pristine line numbers left by
 * extraction; the branch consuming the cmp result is not visible here.
 */
#define REAL_WRITEBGR32(dst, dstw, index) \
666
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
667
"movq %%mm2, %%mm1 \n\t" /* B */\
668
"movq %%mm5, %%mm6 \n\t" /* R */\
669
"punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
670
"punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
671
"punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
672
"punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
673
"movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
674
"movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
675
"punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
676
"punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
677
"punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
678
"punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
680
MOVNTQ(%%mm0, (dst, index, 4))\
681
MOVNTQ(%%mm2, 8(dst, index, 4))\
682
MOVNTQ(%%mm1, 16(dst, index, 4))\
683
MOVNTQ(%%mm3, 24(dst, index, 4))\
685
"add $8, "#index" \n\t"\
686
"cmp "#dstw", "#index" \n\t"\
688
#define WRITEBGR32(dst, dstw, index) REAL_WRITEBGR32(dst, dstw, index)
690
/*
 * REAL_WRITEBGR16(dst, dstw, index): packs mm2=B, mm4=G, mm5=R into RGB565 —
 * masks to 5/6/5 significant bits (bF8/bFC tables via MANGLE), interleaves
 * and ORs the fields, then stores 8 pixels (16 bytes) with MOVNTQ.
 * NOTE(review): interleaved decimal lines are pristine line numbers left by
 * extraction; the branch consuming the cmp result is not visible here.
 */
#define REAL_WRITEBGR16(dst, dstw, index) \
691
"pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
692
"pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
693
"pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
694
"psrlq $3, %%mm2 \n\t"\
696
"movq %%mm2, %%mm1 \n\t"\
697
"movq %%mm4, %%mm3 \n\t"\
699
"punpcklbw %%mm7, %%mm3 \n\t"\
700
"punpcklbw %%mm5, %%mm2 \n\t"\
701
"punpckhbw %%mm7, %%mm4 \n\t"\
702
"punpckhbw %%mm5, %%mm1 \n\t"\
704
"psllq $3, %%mm3 \n\t"\
705
"psllq $3, %%mm4 \n\t"\
707
"por %%mm3, %%mm2 \n\t"\
708
"por %%mm4, %%mm1 \n\t"\
710
MOVNTQ(%%mm2, (dst, index, 2))\
711
MOVNTQ(%%mm1, 8(dst, index, 2))\
713
"add $8, "#index" \n\t"\
714
"cmp "#dstw", "#index" \n\t"\
716
#define WRITEBGR16(dst, dstw, index) REAL_WRITEBGR16(dst, dstw, index)
718
/*
 * REAL_WRITEBGR15(dst, dstw, index): packs mm2=B, mm4=G, mm5=R into RGB555 —
 * like REAL_WRITEBGR16 but with 5 green bits (extra psrlq $1 on R, psllq $2
 * on the G fields). Stores 8 pixels (16 bytes) with MOVNTQ.
 * NOTE(review): interleaved decimal lines are pristine line numbers left by
 * extraction; the branch consuming the cmp result is not visible here.
 */
#define REAL_WRITEBGR15(dst, dstw, index) \
719
"pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
720
"pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
721
"pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
722
"psrlq $3, %%mm2 \n\t"\
723
"psrlq $1, %%mm5 \n\t"\
725
"movq %%mm2, %%mm1 \n\t"\
726
"movq %%mm4, %%mm3 \n\t"\
728
"punpcklbw %%mm7, %%mm3 \n\t"\
729
"punpcklbw %%mm5, %%mm2 \n\t"\
730
"punpckhbw %%mm7, %%mm4 \n\t"\
731
"punpckhbw %%mm5, %%mm1 \n\t"\
733
"psllq $2, %%mm3 \n\t"\
734
"psllq $2, %%mm4 \n\t"\
736
"por %%mm3, %%mm2 \n\t"\
737
"por %%mm4, %%mm1 \n\t"\
739
MOVNTQ(%%mm2, (dst, index, 2))\
740
MOVNTQ(%%mm1, 8(dst, index, 2))\
742
"add $8, "#index" \n\t"\
743
"cmp "#dstw", "#index" \n\t"\
745
#define WRITEBGR15(dst, dstw, index) REAL_WRITEBGR15(dst, dstw, index)
747
/*
 * WRITEBGR24OLD(dst, dstw, index): legacy 24-bit packer. Builds four 0RGB
 * dwords, then squeezes out the padding bytes with shift/mask/or sequences
 * (bm* masks via MANGLE) to emit three 8-byte stores = 8 RGB24 pixels;
 * advances dst by 24 and index by 8. Kept as reference next to the MMX/MMX2
 * variants below.
 * NOTE(review): interleaved decimal lines are pristine line numbers left by
 * extraction; the branch consuming the cmp result is not visible here.
 */
#define WRITEBGR24OLD(dst, dstw, index) \
748
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
749
"movq %%mm2, %%mm1 \n\t" /* B */\
750
"movq %%mm5, %%mm6 \n\t" /* R */\
751
"punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
752
"punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
753
"punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
754
"punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
755
"movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
756
"movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
757
"punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
758
"punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
759
"punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
760
"punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
762
"movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
763
"psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
764
"pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 0 */\
765
"pand "MANGLE(bm11111000)", %%mm0 \n\t" /* 00RGB000 0.5 */\
766
"por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
767
"movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
768
"psllq $48, %%mm2 \n\t" /* GB000000 1 */\
769
"por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
771
"movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
772
"psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
773
"psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
774
"por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
775
"pand "MANGLE(bm00001111)", %%mm2 \n\t" /* 0000RGBR 1 */\
776
"movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
777
"psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
778
"pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 2 */\
779
"pand "MANGLE(bm11111000)", %%mm1 \n\t" /* 00RGB000 2.5 */\
780
"por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
781
"movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
782
"psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
783
"por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
785
"psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
786
"movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
787
"psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
788
"pand "MANGLE(bm00000111)", %%mm5 \n\t" /* 00000RGB 3 */\
789
"pand "MANGLE(bm11111000)", %%mm3 \n\t" /* 00RGB000 3.5 */\
790
"por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
791
"psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
792
"por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
794
MOVNTQ(%%mm0, (dst))\
795
MOVNTQ(%%mm2, 8(dst))\
796
MOVNTQ(%%mm3, 16(dst))\
797
"add $24, "#dst" \n\t"\
799
"add $8, "#index" \n\t"\
800
"cmp "#dstw", "#index" \n\t"\
803
/*
 * WRITEBGR24MMX(dst, dstw, index): plain-MMX 24-bit packer. Builds the four
 * 0RGB dwords, then uses psllq/punpckhdq to form 0RGBRGB0 quads and a final
 * shift/or pass to tile 24 contiguous bytes across three MOVNTQ stores;
 * advances dst by 24 and index by 8.
 * NOTE(review): interleaved decimal lines are pristine line numbers left by
 * extraction; the branch consuming the cmp result is not visible here.
 */
#define WRITEBGR24MMX(dst, dstw, index) \
804
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
805
"movq %%mm2, %%mm1 \n\t" /* B */\
806
"movq %%mm5, %%mm6 \n\t" /* R */\
807
"punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
808
"punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
809
"punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
810
"punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
811
"movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
812
"movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
813
"punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
814
"punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
815
"punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
816
"punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
818
"movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
819
"movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
820
"movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
821
"movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
823
"psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
824
"psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
825
"psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
826
"psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
828
"punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
829
"punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
830
"punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
831
"punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
833
"psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
834
"movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
835
"psllq $40, %%mm2 \n\t" /* GB000000 1 */\
836
"por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
837
MOVNTQ(%%mm0, (dst))\
839
"psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
840
"movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
841
"psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
842
"por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
843
MOVNTQ(%%mm6, 8(dst))\
845
"psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
846
"psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
847
"por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
848
MOVNTQ(%%mm5, 16(dst))\
850
"add $24, "#dst" \n\t"\
852
"add $8, "#index" \n\t"\
853
"cmp "#dstw", "#index" \n\t"\
856
#define WRITEBGR24MMX2(dst, dstw, index) \
857
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
858
"movq "MANGLE(M24A)", %%mm0 \n\t"\
859
"movq "MANGLE(M24C)", %%mm7 \n\t"\
860
"pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
861
"pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
862
"pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
864
"pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
865
"pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
866
"pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
868
"psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
869
"por %%mm1, %%mm6 \n\t"\
870
"por %%mm3, %%mm6 \n\t"\
871
MOVNTQ(%%mm6, (dst))\
873
"psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
874
"pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
875
"pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
876
"pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
878
"pand "MANGLE(M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
879
"pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
880
"pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
882
"por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
883
"por %%mm3, %%mm6 \n\t"\
884
MOVNTQ(%%mm6, 8(dst))\
886
"pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
887
"pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
888
"pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
890
"pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
891
"pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
892
"pand "MANGLE(M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
894
"por %%mm1, %%mm3 \n\t"\
895
"por %%mm3, %%mm6 \n\t"\
896
MOVNTQ(%%mm6, 16(dst))\
898
"add $24, "#dst" \n\t"\
900
"add $8, "#index" \n\t"\
901
"cmp "#dstw", "#index" \n\t"\
906
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
909
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
912
#define REAL_WRITEYUY2(dst, dstw, index) \
913
"packuswb %%mm3, %%mm3 \n\t"\
914
"packuswb %%mm4, %%mm4 \n\t"\
915
"packuswb %%mm7, %%mm1 \n\t"\
916
"punpcklbw %%mm4, %%mm3 \n\t"\
917
"movq %%mm1, %%mm7 \n\t"\
918
"punpcklbw %%mm3, %%mm1 \n\t"\
919
"punpckhbw %%mm3, %%mm7 \n\t"\
921
MOVNTQ(%%mm1, (dst, index, 2))\
922
MOVNTQ(%%mm7, 8(dst, index, 2))\
924
"add $8, "#index" \n\t"\
925
"cmp "#dstw", "#index" \n\t"\
927
#define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
930
static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
931
int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
932
uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
935
if (c->flags & SWS_ACCURATE_RND){
937
YSCALEYUV2YV12X_ACCURATE( 0, CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
938
YSCALEYUV2YV12X_ACCURATE(4096, CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
941
YSCALEYUV2YV12X_ACCURATE(0, LUM_MMX_FILTER_OFFSET, dest, dstW)
944
YSCALEYUV2YV12X( 0, CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
945
YSCALEYUV2YV12X(4096, CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
948
YSCALEYUV2YV12X(0, LUM_MMX_FILTER_OFFSET, dest, dstW)
952
yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
953
chrFilter, chrSrc, chrFilterSize,
954
dest, uDest, vDest, dstW, chrDstW);
956
yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
957
chrFilter, chrSrc, chrFilterSize,
958
dest, uDest, vDest, dstW, chrDstW);
959
#endif //!HAVE_ALTIVEC
963
static inline void RENAME(yuv2nv12X)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
964
int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
965
uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat)
967
yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
968
chrFilter, chrSrc, chrFilterSize,
969
dest, uDest, dstW, chrDstW, dstFormat);
972
static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc,
973
uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
980
:: "r" (chrSrc + chrDstW), "r" (uDest + chrDstW),
987
:: "r" (chrSrc + 2048 + chrDstW), "r" (vDest + chrDstW),
995
:: "r" (lumSrc + dstW), "r" (dest + dstW),
1001
for (i=0; i<dstW; i++)
1003
int val= lumSrc[i]>>7;
1014
for (i=0; i<chrDstW; i++)
1017
int v=chrSrc[i + 2048]>>7;
1021
else if (u>255) u=255;
1023
else if (v>255) v=255;
1034
* vertical scale YV12 to RGB
1036
static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
1037
int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
1038
uint8_t *dest, long dstW, long dstY)
1042
if (c->flags & SWS_ACCURATE_RND){
1043
switch(c->dstFormat){
1045
YSCALEYUV2PACKEDX_ACCURATE
1047
WRITEBGR32(%4, %5, %%REGa)
1049
YSCALEYUV2PACKEDX_END
1052
YSCALEYUV2PACKEDX_ACCURATE
1054
"lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
1055
"add %4, %%"REG_c" \n\t"
1056
WRITEBGR24(%%REGc, %5, %%REGa)
1059
:: "r" (&c->redDither),
1060
"m" (dummy), "m" (dummy), "m" (dummy),
1061
"r" (dest), "m" (dstW)
1062
: "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1065
case PIX_FMT_BGR555:
1066
YSCALEYUV2PACKEDX_ACCURATE
1068
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1070
"paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1071
"paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1072
"paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1075
WRITEBGR15(%4, %5, %%REGa)
1076
YSCALEYUV2PACKEDX_END
1078
case PIX_FMT_BGR565:
1079
YSCALEYUV2PACKEDX_ACCURATE
1081
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1083
"paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1084
"paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1085
"paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1088
WRITEBGR16(%4, %5, %%REGa)
1089
YSCALEYUV2PACKEDX_END
1091
case PIX_FMT_YUYV422:
1092
YSCALEYUV2PACKEDX_ACCURATE
1093
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1095
"psraw $3, %%mm3 \n\t"
1096
"psraw $3, %%mm4 \n\t"
1097
"psraw $3, %%mm1 \n\t"
1098
"psraw $3, %%mm7 \n\t"
1099
WRITEYUY2(%4, %5, %%REGa)
1100
YSCALEYUV2PACKEDX_END
1104
switch(c->dstFormat)
1109
WRITEBGR32(%4, %5, %%REGa)
1110
YSCALEYUV2PACKEDX_END
1115
"lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
1116
"add %4, %%"REG_c" \n\t"
1117
WRITEBGR24(%%REGc, %5, %%REGa)
1119
:: "r" (&c->redDither),
1120
"m" (dummy), "m" (dummy), "m" (dummy),
1121
"r" (dest), "m" (dstW)
1122
: "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1125
case PIX_FMT_BGR555:
1128
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1130
"paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1131
"paddusb "MANGLE(g5Dither)", %%mm4 \n\t"
1132
"paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1135
WRITEBGR15(%4, %5, %%REGa)
1136
YSCALEYUV2PACKEDX_END
1138
case PIX_FMT_BGR565:
1141
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1143
"paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1144
"paddusb "MANGLE(g6Dither)", %%mm4 \n\t"
1145
"paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1148
WRITEBGR16(%4, %5, %%REGa)
1149
YSCALEYUV2PACKEDX_END
1151
case PIX_FMT_YUYV422:
1153
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1155
"psraw $3, %%mm3 \n\t"
1156
"psraw $3, %%mm4 \n\t"
1157
"psraw $3, %%mm1 \n\t"
1158
"psraw $3, %%mm7 \n\t"
1159
WRITEYUY2(%4, %5, %%REGa)
1160
YSCALEYUV2PACKEDX_END
1166
/* The following list of supported dstFormat values should
1167
match what's found in the body of altivec_yuv2packedX() */
1168
if (c->dstFormat==PIX_FMT_ABGR || c->dstFormat==PIX_FMT_BGRA ||
1169
c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 ||
1170
c->dstFormat==PIX_FMT_RGBA || c->dstFormat==PIX_FMT_ARGB)
1171
altivec_yuv2packedX (c, lumFilter, lumSrc, lumFilterSize,
1172
chrFilter, chrSrc, chrFilterSize,
1176
yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
1177
chrFilter, chrSrc, chrFilterSize,
1182
* vertical bilinear scale YV12 to RGB
1184
static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
1185
uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
1187
int yalpha1=yalpha^4095;
1188
int uvalpha1=uvalpha^4095;
1192
if (flags&SWS_FULL_CHR_H_INT)
1202
"punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
1203
"punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
1205
"movq %%mm3, %%mm1 \n\t"
1206
"punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
1207
"punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
1209
MOVNTQ(%%mm3, (%4, %%REGa, 4))
1210
MOVNTQ(%%mm1, 8(%4, %%REGa, 4))
1212
"add $4, %%"REG_a" \n\t"
1213
"cmp %5, %%"REG_a" \n\t"
1216
:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" ((long)dstW),
1217
"m" (yalpha1), "m" (uvalpha1)
1227
"punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
1228
"punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
1230
"movq %%mm3, %%mm1 \n\t"
1231
"punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
1232
"punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
1234
"movq %%mm3, %%mm2 \n\t" // BGR0BGR0
1235
"psrlq $8, %%mm3 \n\t" // GR0BGR00
1236
"pand "MANGLE(bm00000111)", %%mm2 \n\t" // BGR00000
1237
"pand "MANGLE(bm11111000)", %%mm3 \n\t" // 000BGR00
1238
"por %%mm2, %%mm3 \n\t" // BGRBGR00
1239
"movq %%mm1, %%mm2 \n\t"
1240
"psllq $48, %%mm1 \n\t" // 000000BG
1241
"por %%mm1, %%mm3 \n\t" // BGRBGRBG
1243
"movq %%mm2, %%mm1 \n\t" // BGR0BGR0
1244
"psrld $16, %%mm2 \n\t" // R000R000
1245
"psrlq $24, %%mm1 \n\t" // 0BGR0000
1246
"por %%mm2, %%mm1 \n\t" // RBGRR000
1248
"mov %4, %%"REG_b" \n\t"
1249
"add %%"REG_a", %%"REG_b" \n\t"
1253
"movntq %%mm3, (%%"REG_b", %%"REG_a", 2) \n\t"
1254
"movntq %%mm1, 8(%%"REG_b", %%"REG_a", 2) \n\t"
1256
"movd %%mm3, (%%"REG_b", %%"REG_a", 2) \n\t"
1257
"psrlq $32, %%mm3 \n\t"
1258
"movd %%mm3, 4(%%"REG_b", %%"REG_a", 2) \n\t"
1259
"movd %%mm1, 8(%%"REG_b", %%"REG_a", 2) \n\t"
1261
"add $4, %%"REG_a" \n\t"
1262
"cmp %5, %%"REG_a" \n\t"
1265
:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
1266
"m" (yalpha1), "m" (uvalpha1)
1267
: "%"REG_a, "%"REG_b
1270
case PIX_FMT_BGR555:
1275
"paddusb "MANGLE(g5Dither)", %%mm1 \n\t"
1276
"paddusb "MANGLE(r5Dither)", %%mm0 \n\t"
1277
"paddusb "MANGLE(b5Dither)", %%mm3 \n\t"
1279
"punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
1280
"punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
1281
"punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
1283
"psrlw $3, %%mm3 \n\t"
1284
"psllw $2, %%mm1 \n\t"
1285
"psllw $7, %%mm0 \n\t"
1286
"pand "MANGLE(g15Mask)", %%mm1 \n\t"
1287
"pand "MANGLE(r15Mask)", %%mm0 \n\t"
1289
"por %%mm3, %%mm1 \n\t"
1290
"por %%mm1, %%mm0 \n\t"
1292
MOVNTQ(%%mm0, (%4, %%REGa, 2))
1294
"add $4, %%"REG_a" \n\t"
1295
"cmp %5, %%"REG_a" \n\t"
1298
:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1299
"m" (yalpha1), "m" (uvalpha1)
1303
case PIX_FMT_BGR565:
1308
"paddusb "MANGLE(g6Dither)", %%mm1 \n\t"
1309
"paddusb "MANGLE(r5Dither)", %%mm0 \n\t"
1310
"paddusb "MANGLE(b5Dither)", %%mm3 \n\t"
1312
"punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
1313
"punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
1314
"punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
1316
"psrlw $3, %%mm3 \n\t"
1317
"psllw $3, %%mm1 \n\t"
1318
"psllw $8, %%mm0 \n\t"
1319
"pand "MANGLE(g16Mask)", %%mm1 \n\t"
1320
"pand "MANGLE(r16Mask)", %%mm0 \n\t"
1322
"por %%mm3, %%mm1 \n\t"
1323
"por %%mm1, %%mm0 \n\t"
1325
MOVNTQ(%%mm0, (%4, %%REGa, 2))
1327
"add $4, %%"REG_a" \n\t"
1328
"cmp %5, %%"REG_a" \n\t"
1331
:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1332
"m" (yalpha1), "m" (uvalpha1)
1341
if (dstFormat==PIX_FMT_RGB32)
1344
#ifdef WORDS_BIGENDIAN
1347
for (i=0;i<dstW;i++){
1348
// vertical linear interpolation && yuv2rgb in a single step:
1349
int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1350
int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1351
int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1352
dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1353
dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1354
dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1358
else if (dstFormat==PIX_FMT_BGR24)
1361
for (i=0;i<dstW;i++){
1362
// vertical linear interpolation && yuv2rgb in a single step:
1363
int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1364
int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1365
int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1366
dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1367
dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1368
dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1372
else if (dstFormat==PIX_FMT_BGR565)
1375
for (i=0;i<dstW;i++){
1376
// vertical linear interpolation && yuv2rgb in a single step:
1377
int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1378
int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1379
int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1381
((uint16_t*)dest)[i] =
1382
clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
1383
clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1384
clip_table16r[(Y + yuvtab_3343[V]) >>13];
1387
else if (dstFormat==PIX_FMT_BGR555)
1390
for (i=0;i<dstW;i++){
1391
// vertical linear interpolation && yuv2rgb in a single step:
1392
int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1393
int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1394
int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1396
((uint16_t*)dest)[i] =
1397
clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
1398
clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1399
clip_table15r[(Y + yuvtab_3343[V]) >>13];
1407
switch(c->dstFormat)
1409
//Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1412
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1413
"mov %4, %%"REG_b" \n\t"
1414
"push %%"REG_BP" \n\t"
1415
YSCALEYUV2RGB(%%REGBP, %5)
1416
WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1417
"pop %%"REG_BP" \n\t"
1418
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1420
:: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1426
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1427
"mov %4, %%"REG_b" \n\t"
1428
"push %%"REG_BP" \n\t"
1429
YSCALEYUV2RGB(%%REGBP, %5)
1430
WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1431
"pop %%"REG_BP" \n\t"
1432
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1433
:: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1437
case PIX_FMT_BGR555:
1439
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1440
"mov %4, %%"REG_b" \n\t"
1441
"push %%"REG_BP" \n\t"
1442
YSCALEYUV2RGB(%%REGBP, %5)
1443
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1445
"paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1446
"paddusb "MANGLE(g5Dither)", %%mm4 \n\t"
1447
"paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1450
WRITEBGR15(%%REGb, 8280(%5), %%REGBP)
1451
"pop %%"REG_BP" \n\t"
1452
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1454
:: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1458
case PIX_FMT_BGR565:
1460
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1461
"mov %4, %%"REG_b" \n\t"
1462
"push %%"REG_BP" \n\t"
1463
YSCALEYUV2RGB(%%REGBP, %5)
1464
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1466
"paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1467
"paddusb "MANGLE(g6Dither)", %%mm4 \n\t"
1468
"paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1471
WRITEBGR16(%%REGb, 8280(%5), %%REGBP)
1472
"pop %%"REG_BP" \n\t"
1473
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1474
:: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1478
case PIX_FMT_YUYV422:
1480
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1481
"mov %4, %%"REG_b" \n\t"
1482
"push %%"REG_BP" \n\t"
1483
YSCALEYUV2PACKED(%%REGBP, %5)
1484
WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1485
"pop %%"REG_BP" \n\t"
1486
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1487
:: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1494
YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C)
1498
* YV12 to RGB without scaling or interpolating
1500
static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
1501
uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
1503
const int yalpha1=0;
1506
uint16_t *buf1= buf0; //FIXME needed for the rgb1/bgr1
1507
const int yalpha= 4096; //FIXME ...
1509
if (flags&SWS_FULL_CHR_H_INT)
1511
RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y);
1516
if ( uvalpha < 2048 ) // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1522
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1523
"mov %4, %%"REG_b" \n\t"
1524
"push %%"REG_BP" \n\t"
1525
YSCALEYUV2RGB1(%%REGBP, %5)
1526
WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1527
"pop %%"REG_BP" \n\t"
1528
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1530
:: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1536
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1537
"mov %4, %%"REG_b" \n\t"
1538
"push %%"REG_BP" \n\t"
1539
YSCALEYUV2RGB1(%%REGBP, %5)
1540
WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1541
"pop %%"REG_BP" \n\t"
1542
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1544
:: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1548
case PIX_FMT_BGR555:
1550
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1551
"mov %4, %%"REG_b" \n\t"
1552
"push %%"REG_BP" \n\t"
1553
YSCALEYUV2RGB1(%%REGBP, %5)
1554
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1556
"paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1557
"paddusb "MANGLE(g5Dither)", %%mm4 \n\t"
1558
"paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1560
WRITEBGR15(%%REGb, 8280(%5), %%REGBP)
1561
"pop %%"REG_BP" \n\t"
1562
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1564
:: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1568
case PIX_FMT_BGR565:
1570
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1571
"mov %4, %%"REG_b" \n\t"
1572
"push %%"REG_BP" \n\t"
1573
YSCALEYUV2RGB1(%%REGBP, %5)
1574
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1576
"paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1577
"paddusb "MANGLE(g6Dither)", %%mm4 \n\t"
1578
"paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1581
WRITEBGR16(%%REGb, 8280(%5), %%REGBP)
1582
"pop %%"REG_BP" \n\t"
1583
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1585
:: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1589
case PIX_FMT_YUYV422:
1591
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1592
"mov %4, %%"REG_b" \n\t"
1593
"push %%"REG_BP" \n\t"
1594
YSCALEYUV2PACKED1(%%REGBP, %5)
1595
WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1596
"pop %%"REG_BP" \n\t"
1597
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1599
:: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1611
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1612
"mov %4, %%"REG_b" \n\t"
1613
"push %%"REG_BP" \n\t"
1614
YSCALEYUV2RGB1b(%%REGBP, %5)
1615
WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1616
"pop %%"REG_BP" \n\t"
1617
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1619
:: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1625
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1626
"mov %4, %%"REG_b" \n\t"
1627
"push %%"REG_BP" \n\t"
1628
YSCALEYUV2RGB1b(%%REGBP, %5)
1629
WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1630
"pop %%"REG_BP" \n\t"
1631
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1633
:: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1637
case PIX_FMT_BGR555:
1639
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1640
"mov %4, %%"REG_b" \n\t"
1641
"push %%"REG_BP" \n\t"
1642
YSCALEYUV2RGB1b(%%REGBP, %5)
1643
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1645
"paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1646
"paddusb "MANGLE(g5Dither)", %%mm4 \n\t"
1647
"paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1649
WRITEBGR15(%%REGb, 8280(%5), %%REGBP)
1650
"pop %%"REG_BP" \n\t"
1651
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1653
:: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1657
case PIX_FMT_BGR565:
1659
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1660
"mov %4, %%"REG_b" \n\t"
1661
"push %%"REG_BP" \n\t"
1662
YSCALEYUV2RGB1b(%%REGBP, %5)
1663
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1665
"paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1666
"paddusb "MANGLE(g6Dither)", %%mm4 \n\t"
1667
"paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1670
WRITEBGR16(%%REGb, 8280(%5), %%REGBP)
1671
"pop %%"REG_BP" \n\t"
1672
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1674
:: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1678
case PIX_FMT_YUYV422:
1680
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1681
"mov %4, %%"REG_b" \n\t"
1682
"push %%"REG_BP" \n\t"
1683
YSCALEYUV2PACKED1b(%%REGBP, %5)
1684
WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1685
"pop %%"REG_BP" \n\t"
1686
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1688
:: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1695
if ( uvalpha < 2048 )
1697
YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C)
1699
YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C)
1703
//FIXME yuy2* can read upto 7 samples to much
1705
static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, long width)
1709
"movq "MANGLE(bm01010101)", %%mm2 \n\t"
1710
"mov %0, %%"REG_a" \n\t"
1712
"movq (%1, %%"REG_a",2), %%mm0 \n\t"
1713
"movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1714
"pand %%mm2, %%mm0 \n\t"
1715
"pand %%mm2, %%mm1 \n\t"
1716
"packuswb %%mm1, %%mm0 \n\t"
1717
"movq %%mm0, (%2, %%"REG_a") \n\t"
1718
"add $8, %%"REG_a" \n\t"
1720
: : "g" (-width), "r" (src+width*2), "r" (dst+width)
1725
for (i=0; i<width; i++)
1730
static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
1734
"movq "MANGLE(bm01010101)", %%mm4 \n\t"
1735
"mov %0, %%"REG_a" \n\t"
1737
"movq (%1, %%"REG_a",4), %%mm0 \n\t"
1738
"movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1739
"psrlw $8, %%mm0 \n\t"
1740
"psrlw $8, %%mm1 \n\t"
1741
"packuswb %%mm1, %%mm0 \n\t"
1742
"movq %%mm0, %%mm1 \n\t"
1743
"psrlw $8, %%mm0 \n\t"
1744
"pand %%mm4, %%mm1 \n\t"
1745
"packuswb %%mm0, %%mm0 \n\t"
1746
"packuswb %%mm1, %%mm1 \n\t"
1747
"movd %%mm0, (%3, %%"REG_a") \n\t"
1748
"movd %%mm1, (%2, %%"REG_a") \n\t"
1749
"add $4, %%"REG_a" \n\t"
1751
: : "g" (-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1756
for (i=0; i<width; i++)
1758
dstU[i]= src1[4*i + 1];
1759
dstV[i]= src1[4*i + 3];
1762
assert(src1 == src2);
1765
//this is allmost identical to the previous, end exists only cuz yuy2ToY/UV)(dst, src+1, ...) would have 100% unaligned accesses
1766
static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, long width)
1770
"mov %0, %%"REG_a" \n\t"
1772
"movq (%1, %%"REG_a",2), %%mm0 \n\t"
1773
"movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1774
"psrlw $8, %%mm0 \n\t"
1775
"psrlw $8, %%mm1 \n\t"
1776
"packuswb %%mm1, %%mm0 \n\t"
1777
"movq %%mm0, (%2, %%"REG_a") \n\t"
1778
"add $8, %%"REG_a" \n\t"
1780
: : "g" (-width), "r" (src+width*2), "r" (dst+width)
1785
for (i=0; i<width; i++)
1790
static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
1794
"movq "MANGLE(bm01010101)", %%mm4 \n\t"
1795
"mov %0, %%"REG_a" \n\t"
1797
"movq (%1, %%"REG_a",4), %%mm0 \n\t"
1798
"movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1799
"pand %%mm4, %%mm0 \n\t"
1800
"pand %%mm4, %%mm1 \n\t"
1801
"packuswb %%mm1, %%mm0 \n\t"
1802
"movq %%mm0, %%mm1 \n\t"
1803
"psrlw $8, %%mm0 \n\t"
1804
"pand %%mm4, %%mm1 \n\t"
1805
"packuswb %%mm0, %%mm0 \n\t"
1806
"packuswb %%mm1, %%mm1 \n\t"
1807
"movd %%mm0, (%3, %%"REG_a") \n\t"
1808
"movd %%mm1, (%2, %%"REG_a") \n\t"
1809
"add $4, %%"REG_a" \n\t"
1811
: : "g" (-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1816
for (i=0; i<width; i++)
1818
dstU[i]= src1[4*i + 0];
1819
dstV[i]= src1[4*i + 2];
1822
assert(src1 == src2);
1825
static inline void RENAME(bgr32ToY)(uint8_t *dst, uint8_t *src, int width)
1828
for (i=0; i<width; i++)
1830
int b= ((uint32_t*)src)[i]&0xFF;
1831
int g= (((uint32_t*)src)[i]>>8)&0xFF;
1832
int r= (((uint32_t*)src)[i]>>16)&0xFF;
1834
dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
1838
static inline void RENAME(bgr32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1841
assert(src1 == src2);
1842
for (i=0; i<width; i++)
1844
const int a= ((uint32_t*)src1)[2*i+0];
1845
const int e= ((uint32_t*)src1)[2*i+1];
1846
const int l= (a&0xFF00FF) + (e&0xFF00FF);
1847
const int h= (a&0x00FF00) + (e&0x00FF00);
1848
const int b= l&0x3FF;
1852
dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1)) + 128;
1853
dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1)) + 128;
1857
static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, long width)
1861
"mov %2, %%"REG_a" \n\t"
1862
"movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t"
1863
"movq "MANGLE(w1111)", %%mm5 \n\t"
1864
"pxor %%mm7, %%mm7 \n\t"
1865
"lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
1868
PREFETCH" 64(%0, %%"REG_d") \n\t"
1869
"movd (%0, %%"REG_d"), %%mm0 \n\t"
1870
"movd 3(%0, %%"REG_d"), %%mm1 \n\t"
1871
"punpcklbw %%mm7, %%mm0 \n\t"
1872
"punpcklbw %%mm7, %%mm1 \n\t"
1873
"movd 6(%0, %%"REG_d"), %%mm2 \n\t"
1874
"movd 9(%0, %%"REG_d"), %%mm3 \n\t"
1875
"punpcklbw %%mm7, %%mm2 \n\t"
1876
"punpcklbw %%mm7, %%mm3 \n\t"
1877
"pmaddwd %%mm6, %%mm0 \n\t"
1878
"pmaddwd %%mm6, %%mm1 \n\t"
1879
"pmaddwd %%mm6, %%mm2 \n\t"
1880
"pmaddwd %%mm6, %%mm3 \n\t"
1881
#ifndef FAST_BGR2YV12
1882
"psrad $8, %%mm0 \n\t"
1883
"psrad $8, %%mm1 \n\t"
1884
"psrad $8, %%mm2 \n\t"
1885
"psrad $8, %%mm3 \n\t"
1887
"packssdw %%mm1, %%mm0 \n\t"
1888
"packssdw %%mm3, %%mm2 \n\t"
1889
"pmaddwd %%mm5, %%mm0 \n\t"
1890
"pmaddwd %%mm5, %%mm2 \n\t"
1891
"packssdw %%mm2, %%mm0 \n\t"
1892
"psraw $7, %%mm0 \n\t"
1894
"movd 12(%0, %%"REG_d"), %%mm4 \n\t"
1895
"movd 15(%0, %%"REG_d"), %%mm1 \n\t"
1896
"punpcklbw %%mm7, %%mm4 \n\t"
1897
"punpcklbw %%mm7, %%mm1 \n\t"
1898
"movd 18(%0, %%"REG_d"), %%mm2 \n\t"
1899
"movd 21(%0, %%"REG_d"), %%mm3 \n\t"
1900
"punpcklbw %%mm7, %%mm2 \n\t"
1901
"punpcklbw %%mm7, %%mm3 \n\t"
1902
"pmaddwd %%mm6, %%mm4 \n\t"
1903
"pmaddwd %%mm6, %%mm1 \n\t"
1904
"pmaddwd %%mm6, %%mm2 \n\t"
1905
"pmaddwd %%mm6, %%mm3 \n\t"
1906
#ifndef FAST_BGR2YV12
1907
"psrad $8, %%mm4 \n\t"
1908
"psrad $8, %%mm1 \n\t"
1909
"psrad $8, %%mm2 \n\t"
1910
"psrad $8, %%mm3 \n\t"
1912
"packssdw %%mm1, %%mm4 \n\t"
1913
"packssdw %%mm3, %%mm2 \n\t"
1914
"pmaddwd %%mm5, %%mm4 \n\t"
1915
"pmaddwd %%mm5, %%mm2 \n\t"
1916
"add $24, %%"REG_d" \n\t"
1917
"packssdw %%mm2, %%mm4 \n\t"
1918
"psraw $7, %%mm4 \n\t"
1920
"packuswb %%mm4, %%mm0 \n\t"
1921
"paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t"
1923
"movq %%mm0, (%1, %%"REG_a") \n\t"
1924
"add $8, %%"REG_a" \n\t"
1926
: : "r" (src+width*3), "r" (dst+width), "g" (-width)
1927
: "%"REG_a, "%"REG_d
1931
for (i=0; i<width; i++)
1937
dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
1942
static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
1946
"mov %3, %%"REG_a" \n\t"
1947
"movq "MANGLE(w1111)", %%mm5 \n\t"
1948
"movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t"
1949
"pxor %%mm7, %%mm7 \n\t"
1950
"lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
1951
"add %%"REG_d", %%"REG_d" \n\t"
1954
PREFETCH" 64(%0, %%"REG_d") \n\t"
1955
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1956
"movq (%0, %%"REG_d"), %%mm0 \n\t"
1957
"movq 6(%0, %%"REG_d"), %%mm2 \n\t"
1958
"movq %%mm0, %%mm1 \n\t"
1959
"movq %%mm2, %%mm3 \n\t"
1960
"psrlq $24, %%mm0 \n\t"
1961
"psrlq $24, %%mm2 \n\t"
1964
"punpcklbw %%mm7, %%mm0 \n\t"
1965
"punpcklbw %%mm7, %%mm2 \n\t"
1967
"movd (%0, %%"REG_d"), %%mm0 \n\t"
1968
"movd 3(%0, %%"REG_d"), %%mm2 \n\t"
1969
"punpcklbw %%mm7, %%mm0 \n\t"
1970
"punpcklbw %%mm7, %%mm2 \n\t"
1971
"paddw %%mm2, %%mm0 \n\t"
1972
"movd 6(%0, %%"REG_d"), %%mm4 \n\t"
1973
"movd 9(%0, %%"REG_d"), %%mm2 \n\t"
1974
"punpcklbw %%mm7, %%mm4 \n\t"
1975
"punpcklbw %%mm7, %%mm2 \n\t"
1976
"paddw %%mm4, %%mm2 \n\t"
1977
"psrlw $1, %%mm0 \n\t"
1978
"psrlw $1, %%mm2 \n\t"
1980
"movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
1981
"movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
1983
"pmaddwd %%mm0, %%mm1 \n\t"
1984
"pmaddwd %%mm2, %%mm3 \n\t"
1985
"pmaddwd %%mm6, %%mm0 \n\t"
1986
"pmaddwd %%mm6, %%mm2 \n\t"
1987
#ifndef FAST_BGR2YV12
1988
"psrad $8, %%mm0 \n\t"
1989
"psrad $8, %%mm1 \n\t"
1990
"psrad $8, %%mm2 \n\t"
1991
"psrad $8, %%mm3 \n\t"
1993
"packssdw %%mm2, %%mm0 \n\t"
1994
"packssdw %%mm3, %%mm1 \n\t"
1995
"pmaddwd %%mm5, %%mm0 \n\t"
1996
"pmaddwd %%mm5, %%mm1 \n\t"
1997
"packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
1998
"psraw $7, %%mm0 \n\t"
2000
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2001
"movq 12(%0, %%"REG_d"), %%mm4 \n\t"
2002
"movq 18(%0, %%"REG_d"), %%mm2 \n\t"
2003
"movq %%mm4, %%mm1 \n\t"
2004
"movq %%mm2, %%mm3 \n\t"
2005
"psrlq $24, %%mm4 \n\t"
2006
"psrlq $24, %%mm2 \n\t"
2009
"punpcklbw %%mm7, %%mm4 \n\t"
2010
"punpcklbw %%mm7, %%mm2 \n\t"
2012
"movd 12(%0, %%"REG_d"), %%mm4 \n\t"
2013
"movd 15(%0, %%"REG_d"), %%mm2 \n\t"
2014
"punpcklbw %%mm7, %%mm4 \n\t"
2015
"punpcklbw %%mm7, %%mm2 \n\t"
2016
"paddw %%mm2, %%mm4 \n\t"
2017
"movd 18(%0, %%"REG_d"), %%mm5 \n\t"
2018
"movd 21(%0, %%"REG_d"), %%mm2 \n\t"
2019
"punpcklbw %%mm7, %%mm5 \n\t"
2020
"punpcklbw %%mm7, %%mm2 \n\t"
2021
"paddw %%mm5, %%mm2 \n\t"
2022
"movq "MANGLE(w1111)", %%mm5 \n\t"
2023
"psrlw $2, %%mm4 \n\t"
2024
"psrlw $2, %%mm2 \n\t"
2026
"movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
2027
"movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
2029
"pmaddwd %%mm4, %%mm1 \n\t"
2030
"pmaddwd %%mm2, %%mm3 \n\t"
2031
"pmaddwd %%mm6, %%mm4 \n\t"
2032
"pmaddwd %%mm6, %%mm2 \n\t"
2033
#ifndef FAST_BGR2YV12
2034
"psrad $8, %%mm4 \n\t"
2035
"psrad $8, %%mm1 \n\t"
2036
"psrad $8, %%mm2 \n\t"
2037
"psrad $8, %%mm3 \n\t"
2039
"packssdw %%mm2, %%mm4 \n\t"
2040
"packssdw %%mm3, %%mm1 \n\t"
2041
"pmaddwd %%mm5, %%mm4 \n\t"
2042
"pmaddwd %%mm5, %%mm1 \n\t"
2043
"add $24, %%"REG_d" \n\t"
2044
"packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
2045
"psraw $7, %%mm4 \n\t"
2047
"movq %%mm0, %%mm1 \n\t"
2048
"punpckldq %%mm4, %%mm0 \n\t"
2049
"punpckhdq %%mm4, %%mm1 \n\t"
2050
"packsswb %%mm1, %%mm0 \n\t"
2051
"paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t"
2053
"movd %%mm0, (%1, %%"REG_a") \n\t"
2054
"punpckhdq %%mm0, %%mm0 \n\t"
2055
"movd %%mm0, (%2, %%"REG_a") \n\t"
2056
"add $4, %%"REG_a" \n\t"
2058
: : "r" (src1+width*6), "r" (dstU+width), "r" (dstV+width), "g" (-width)
2059
: "%"REG_a, "%"REG_d
2063
for (i=0; i<width; i++)
2065
int b= src1[6*i + 0] + src1[6*i + 3];
2066
int g= src1[6*i + 1] + src1[6*i + 4];
2067
int r= src1[6*i + 2] + src1[6*i + 5];
2069
dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1)) + 128;
2070
dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1)) + 128;
2073
assert(src1 == src2);
2076
static inline void RENAME(bgr16ToY)(uint8_t *dst, uint8_t *src, int width)
2079
for (i=0; i<width; i++)
2081
int d= ((uint16_t*)src)[i];
2084
int r= (d>>11)&0x1F;
2086
dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16;
2090
static inline void RENAME(bgr16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2094
for (i=0; i<width; i++)
2096
int d0= ((uint32_t*)src1)[i];
2098
int dl= (d0&0x07E0F81F);
2099
int dh= ((d0>>5)&0x07C0F83F);
2101
int dh2= (dh>>11) + (dh<<21);
2105
int r= (d>>11)&0x7F;
2107
dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+1-2)) + 128;
2108
dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+1-2)) + 128;
2112
static inline void RENAME(bgr15ToY)(uint8_t *dst, uint8_t *src, int width)
2115
for (i=0; i<width; i++)
2117
int d= ((uint16_t*)src)[i];
2120
int r= (d>>10)&0x1F;
2122
dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16;
2126
static inline void RENAME(bgr15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2130
for (i=0; i<width; i++)
2132
int d0= ((uint32_t*)src1)[i];
2134
int dl= (d0&0x03E07C1F);
2135
int dh= ((d0>>5)&0x03E0F81F);
2137
int dh2= (dh>>11) + (dh<<21);
2141
int r= (d>>10)&0x7F;
2143
dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1-3)) + 128;
2144
dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1-3)) + 128;
2149
static inline void RENAME(rgb32ToY)(uint8_t *dst, uint8_t *src, int width)
2152
for (i=0; i<width; i++)
2154
int r= ((uint32_t*)src)[i]&0xFF;
2155
int g= (((uint32_t*)src)[i]>>8)&0xFF;
2156
int b= (((uint32_t*)src)[i]>>16)&0xFF;
2158
dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
2162
static inline void RENAME(rgb32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2166
for (i=0; i<width; i++)
2168
const int a= ((uint32_t*)src1)[2*i+0];
2169
const int e= ((uint32_t*)src1)[2*i+1];
2170
const int l= (a&0xFF00FF) + (e&0xFF00FF);
2171
const int h= (a&0x00FF00) + (e&0x00FF00);
2172
const int r= l&0x3FF;
2176
dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1)) + 128;
2177
dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1)) + 128;
2181
static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, int width)
2184
for (i=0; i<width; i++)
2190
dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
2194
static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2198
for (i=0; i<width; i++)
2200
int r= src1[6*i + 0] + src1[6*i + 3];
2201
int g= src1[6*i + 1] + src1[6*i + 4];
2202
int b= src1[6*i + 2] + src1[6*i + 5];
2204
dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1)) + 128;
2205
dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1)) + 128;
2209
static inline void RENAME(rgb16ToY)(uint8_t *dst, uint8_t *src, int width)
2212
for (i=0; i<width; i++)
2214
int d= ((uint16_t*)src)[i];
2217
int b= (d>>11)&0x1F;
2219
dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16;
2223
static inline void RENAME(rgb16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2226
assert(src1 == src2);
2227
for (i=0; i<width; i++)
2229
int d0= ((uint32_t*)src1)[i];
2231
int dl= (d0&0x07E0F81F);
2232
int d= dl + (((d0>>16) + (d0<<16))&0x07E0F81F);
2235
int b= (d>>11)&0x3F;
2237
dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+1-2)) + 128;
2238
dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+1-2)) + 128;
2242
static inline void RENAME(rgb15ToY)(uint8_t *dst, uint8_t *src, int width)
2245
for (i=0; i<width; i++)
2247
int d= ((uint16_t*)src)[i];
2250
int b= (d>>10)&0x1F;
2252
dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16;
2256
static inline void RENAME(rgb15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2259
assert(src1 == src2);
2260
for (i=0; i<width; i++)
2262
int d0= ((uint32_t*)src1)[i];
2264
int dl= (d0&0x03E07C1F);
2265
int d= dl + (((d0>>16) + (d0<<16))&0x03E07C1F);
2268
int b= (d>>10)&0x3F;
2270
dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1-3)) + 128;
2271
dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1-3)) + 128;
2275
static inline void RENAME(palToY)(uint8_t *dst, uint8_t *src, int width, uint32_t *pal)
2278
for (i=0; i<width; i++)
2282
dst[i]= pal[d] & 0xFF;
2286
static inline void RENAME(palToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width, uint32_t *pal)
2289
assert(src1 == src2);
2290
for (i=0; i<width; i++)
2292
int p= pal[src1[i]];
2299
// Bilinear / Bicubic scaling
2300
static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
2301
int16_t *filter, int16_t *filterPos, long filterSize)
2304
assert(filterSize % 4 == 0 && filterSize>0);
2305
if (filterSize==4) // Always true for upscaling, sometimes for down, too.
2307
long counter= -2*dstW;
2309
filterPos-= counter/2;
2313
"push %%"REG_b" \n\t"
2315
"pxor %%mm7, %%mm7 \n\t"
2316
"movq "MANGLE(w02)", %%mm6 \n\t"
2317
"push %%"REG_BP" \n\t" // we use 7 regs here ...
2318
"mov %%"REG_a", %%"REG_BP" \n\t"
2321
"movzwl (%2, %%"REG_BP"), %%eax \n\t"
2322
"movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
2323
"movq (%1, %%"REG_BP", 4), %%mm1 \n\t"
2324
"movq 8(%1, %%"REG_BP", 4), %%mm3 \n\t"
2325
"movd (%3, %%"REG_a"), %%mm0 \n\t"
2326
"movd (%3, %%"REG_b"), %%mm2 \n\t"
2327
"punpcklbw %%mm7, %%mm0 \n\t"
2328
"punpcklbw %%mm7, %%mm2 \n\t"
2329
"pmaddwd %%mm1, %%mm0 \n\t"
2330
"pmaddwd %%mm2, %%mm3 \n\t"
2331
"psrad $8, %%mm0 \n\t"
2332
"psrad $8, %%mm3 \n\t"
2333
"packssdw %%mm3, %%mm0 \n\t"
2334
"pmaddwd %%mm6, %%mm0 \n\t"
2335
"packssdw %%mm0, %%mm0 \n\t"
2336
"movd %%mm0, (%4, %%"REG_BP") \n\t"
2337
"add $4, %%"REG_BP" \n\t"
2340
"pop %%"REG_BP" \n\t"
2342
"pop %%"REG_b" \n\t"
2345
: "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2351
else if (filterSize==8)
2353
long counter= -2*dstW;
2355
filterPos-= counter/2;
2359
"push %%"REG_b" \n\t"
2361
"pxor %%mm7, %%mm7 \n\t"
2362
"movq "MANGLE(w02)", %%mm6 \n\t"
2363
"push %%"REG_BP" \n\t" // we use 7 regs here ...
2364
"mov %%"REG_a", %%"REG_BP" \n\t"
2367
"movzwl (%2, %%"REG_BP"), %%eax \n\t"
2368
"movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
2369
"movq (%1, %%"REG_BP", 8), %%mm1 \n\t"
2370
"movq 16(%1, %%"REG_BP", 8), %%mm3 \n\t"
2371
"movd (%3, %%"REG_a"), %%mm0 \n\t"
2372
"movd (%3, %%"REG_b"), %%mm2 \n\t"
2373
"punpcklbw %%mm7, %%mm0 \n\t"
2374
"punpcklbw %%mm7, %%mm2 \n\t"
2375
"pmaddwd %%mm1, %%mm0 \n\t"
2376
"pmaddwd %%mm2, %%mm3 \n\t"
2378
"movq 8(%1, %%"REG_BP", 8), %%mm1 \n\t"
2379
"movq 24(%1, %%"REG_BP", 8), %%mm5 \n\t"
2380
"movd 4(%3, %%"REG_a"), %%mm4 \n\t"
2381
"movd 4(%3, %%"REG_b"), %%mm2 \n\t"
2382
"punpcklbw %%mm7, %%mm4 \n\t"
2383
"punpcklbw %%mm7, %%mm2 \n\t"
2384
"pmaddwd %%mm1, %%mm4 \n\t"
2385
"pmaddwd %%mm2, %%mm5 \n\t"
2386
"paddd %%mm4, %%mm0 \n\t"
2387
"paddd %%mm5, %%mm3 \n\t"
2389
"psrad $8, %%mm0 \n\t"
2390
"psrad $8, %%mm3 \n\t"
2391
"packssdw %%mm3, %%mm0 \n\t"
2392
"pmaddwd %%mm6, %%mm0 \n\t"
2393
"packssdw %%mm0, %%mm0 \n\t"
2394
"movd %%mm0, (%4, %%"REG_BP") \n\t"
2395
"add $4, %%"REG_BP" \n\t"
2398
"pop %%"REG_BP" \n\t"
2400
"pop %%"REG_b" \n\t"
2403
: "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2411
uint8_t *offset = src+filterSize;
2412
long counter= -2*dstW;
2413
//filter-= counter*filterSize/2;
2414
filterPos-= counter/2;
2417
"pxor %%mm7, %%mm7 \n\t"
2418
"movq "MANGLE(w02)", %%mm6 \n\t"
2421
"mov %2, %%"REG_c" \n\t"
2422
"movzwl (%%"REG_c", %0), %%eax \n\t"
2423
"movzwl 2(%%"REG_c", %0), %%edx \n\t"
2424
"mov %5, %%"REG_c" \n\t"
2425
"pxor %%mm4, %%mm4 \n\t"
2426
"pxor %%mm5, %%mm5 \n\t"
2428
"movq (%1), %%mm1 \n\t"
2429
"movq (%1, %6), %%mm3 \n\t"
2430
"movd (%%"REG_c", %%"REG_a"), %%mm0 \n\t"
2431
"movd (%%"REG_c", %%"REG_d"), %%mm2 \n\t"
2432
"punpcklbw %%mm7, %%mm0 \n\t"
2433
"punpcklbw %%mm7, %%mm2 \n\t"
2434
"pmaddwd %%mm1, %%mm0 \n\t"
2435
"pmaddwd %%mm2, %%mm3 \n\t"
2436
"paddd %%mm3, %%mm5 \n\t"
2437
"paddd %%mm0, %%mm4 \n\t"
2439
"add $4, %%"REG_c" \n\t"
2440
"cmp %4, %%"REG_c" \n\t"
2443
"psrad $8, %%mm4 \n\t"
2444
"psrad $8, %%mm5 \n\t"
2445
"packssdw %%mm5, %%mm4 \n\t"
2446
"pmaddwd %%mm6, %%mm4 \n\t"
2447
"packssdw %%mm4, %%mm4 \n\t"
2448
"mov %3, %%"REG_a" \n\t"
2449
"movd %%mm4, (%%"REG_a", %0) \n\t"
2453
: "+r" (counter), "+r" (filter)
2454
: "m" (filterPos), "m" (dst), "m"(offset),
2455
"m" (src), "r" (filterSize*2)
2456
: "%"REG_a, "%"REG_c, "%"REG_d
2461
hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
2464
for (i=0; i<dstW; i++)
2467
int srcPos= filterPos[i];
2469
//printf("filterPos: %d\n", filterPos[i]);
2470
for (j=0; j<filterSize; j++)
2472
//printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
2473
val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2475
//filter += hFilterSize;
2476
dst[i] = av_clip(val>>7, 0, (1<<15)-1); // the cubic equation does overflow ...
2482
// *** horizontal scale Y line to temp buffer
2483
static inline void RENAME(hyscale)(uint16_t *dst, long dstWidth, uint8_t *src, int srcW, int xInc,
2484
int flags, int canMMX2BeUsed, int16_t *hLumFilter,
2485
int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode,
2486
int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2487
int32_t *mmx2FilterPos, uint8_t *pal)
2489
if (srcFormat==PIX_FMT_YUYV422 || srcFormat==PIX_FMT_GRAY16BE)
2491
RENAME(yuy2ToY)(formatConvBuffer, src, srcW);
2492
src= formatConvBuffer;
2494
else if (srcFormat==PIX_FMT_UYVY422 || srcFormat==PIX_FMT_GRAY16LE)
2496
RENAME(uyvyToY)(formatConvBuffer, src, srcW);
2497
src= formatConvBuffer;
2499
else if (srcFormat==PIX_FMT_RGB32)
2501
RENAME(bgr32ToY)(formatConvBuffer, src, srcW);
2502
src= formatConvBuffer;
2504
else if (srcFormat==PIX_FMT_BGR24)
2506
RENAME(bgr24ToY)(formatConvBuffer, src, srcW);
2507
src= formatConvBuffer;
2509
else if (srcFormat==PIX_FMT_BGR565)
2511
RENAME(bgr16ToY)(formatConvBuffer, src, srcW);
2512
src= formatConvBuffer;
2514
else if (srcFormat==PIX_FMT_BGR555)
2516
RENAME(bgr15ToY)(formatConvBuffer, src, srcW);
2517
src= formatConvBuffer;
2519
else if (srcFormat==PIX_FMT_BGR32)
2521
RENAME(rgb32ToY)(formatConvBuffer, src, srcW);
2522
src= formatConvBuffer;
2524
else if (srcFormat==PIX_FMT_RGB24)
2526
RENAME(rgb24ToY)(formatConvBuffer, src, srcW);
2527
src= formatConvBuffer;
2529
else if (srcFormat==PIX_FMT_RGB565)
2531
RENAME(rgb16ToY)(formatConvBuffer, src, srcW);
2532
src= formatConvBuffer;
2534
else if (srcFormat==PIX_FMT_RGB555)
2536
RENAME(rgb15ToY)(formatConvBuffer, src, srcW);
2537
src= formatConvBuffer;
2539
else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE || srcFormat==PIX_FMT_RGB4_BYTE)
2541
RENAME(palToY)(formatConvBuffer, src, srcW, pal);
2542
src= formatConvBuffer;
2546
// use the new MMX scaler if the mmx2 can't be used (it is faster than the x86 ASM one)
2547
if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2549
if (!(flags&SWS_FAST_BILINEAR))
2552
RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
2554
else // Fast Bilinear upscale / crap downscale
2556
#if defined(ARCH_X86)
2560
uint64_t ebxsave __attribute__((aligned(8)));
2566
"mov %%"REG_b", %5 \n\t"
2568
"pxor %%mm7, %%mm7 \n\t"
2569
"mov %0, %%"REG_c" \n\t"
2570
"mov %1, %%"REG_D" \n\t"
2571
"mov %2, %%"REG_d" \n\t"
2572
"mov %3, %%"REG_b" \n\t"
2573
"xor %%"REG_a", %%"REG_a" \n\t" // i
2574
PREFETCH" (%%"REG_c") \n\t"
2575
PREFETCH" 32(%%"REG_c") \n\t"
2576
PREFETCH" 64(%%"REG_c") \n\t"
2580
#define FUNNY_Y_CODE \
2581
"movl (%%"REG_b"), %%esi \n\t"\
2583
"movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
2584
"add %%"REG_S", %%"REG_c" \n\t"\
2585
"add %%"REG_a", %%"REG_D" \n\t"\
2586
"xor %%"REG_a", %%"REG_a" \n\t"\
2590
#define FUNNY_Y_CODE \
2591
"movl (%%"REG_b"), %%esi \n\t"\
2593
"addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2594
"add %%"REG_a", %%"REG_D" \n\t"\
2595
"xor %%"REG_a", %%"REG_a" \n\t"\
2609
"mov %5, %%"REG_b" \n\t"
2611
:: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2616
: "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
2621
for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
2626
long xInc_shr16 = xInc >> 16;
2627
uint16_t xInc_mask = xInc & 0xffff;
2628
//NO MMX just normal asm ...
2630
"xor %%"REG_a", %%"REG_a" \n\t" // i
2631
"xor %%"REG_d", %%"REG_d" \n\t" // xx
2632
"xorl %%ecx, %%ecx \n\t" // 2*xalpha
2635
"movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
2636
"movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
2637
"subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2638
"imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2639
"shll $16, %%edi \n\t"
2640
"addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2641
"mov %1, %%"REG_D" \n\t"
2642
"shrl $9, %%esi \n\t"
2643
"movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
2644
"addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2645
"adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry
2647
"movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
2648
"movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
2649
"subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2650
"imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2651
"shll $16, %%edi \n\t"
2652
"addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2653
"mov %1, %%"REG_D" \n\t"
2654
"shrl $9, %%esi \n\t"
2655
"movw %%si, 2(%%"REG_D", %%"REG_a", 2) \n\t"
2656
"addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2657
"adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry
2660
"add $2, %%"REG_a" \n\t"
2661
"cmp %2, %%"REG_a" \n\t"
2665
:: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
2666
: "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2669
} //if MMX2 can't be used
2673
unsigned int xpos=0;
2674
for (i=0;i<dstWidth;i++)
2676
register unsigned int xx=xpos>>16;
2677
register unsigned int xalpha=(xpos&0xFFFF)>>9;
2678
dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
2685
inline static void RENAME(hcscale)(uint16_t *dst, long dstWidth, uint8_t *src1, uint8_t *src2,
2686
int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
2687
int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
2688
int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2689
int32_t *mmx2FilterPos, uint8_t *pal)
2691
if (srcFormat==PIX_FMT_YUYV422)
2693
RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2694
src1= formatConvBuffer;
2695
src2= formatConvBuffer+2048;
2697
else if (srcFormat==PIX_FMT_UYVY422)
2699
RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2700
src1= formatConvBuffer;
2701
src2= formatConvBuffer+2048;
2703
else if (srcFormat==PIX_FMT_RGB32)
2705
RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2706
src1= formatConvBuffer;
2707
src2= formatConvBuffer+2048;
2709
else if (srcFormat==PIX_FMT_BGR24)
2711
RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2712
src1= formatConvBuffer;
2713
src2= formatConvBuffer+2048;
2715
else if (srcFormat==PIX_FMT_BGR565)
2717
RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2718
src1= formatConvBuffer;
2719
src2= formatConvBuffer+2048;
2721
else if (srcFormat==PIX_FMT_BGR555)
2723
RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2724
src1= formatConvBuffer;
2725
src2= formatConvBuffer+2048;
2727
else if (srcFormat==PIX_FMT_BGR32)
2729
RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2730
src1= formatConvBuffer;
2731
src2= formatConvBuffer+2048;
2733
else if (srcFormat==PIX_FMT_RGB24)
2735
RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2736
src1= formatConvBuffer;
2737
src2= formatConvBuffer+2048;
2739
else if (srcFormat==PIX_FMT_RGB565)
2741
RENAME(rgb16ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2742
src1= formatConvBuffer;
2743
src2= formatConvBuffer+2048;
2745
else if (srcFormat==PIX_FMT_RGB555)
2747
RENAME(rgb15ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2748
src1= formatConvBuffer;
2749
src2= formatConvBuffer+2048;
2751
else if (isGray(srcFormat))
2755
else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE || srcFormat==PIX_FMT_RGB4_BYTE)
2757
RENAME(palToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW, pal);
2758
src1= formatConvBuffer;
2759
src2= formatConvBuffer+2048;
2763
// use the new MMX scaler if the mmx2 can't be used (it is faster than the x86 ASM one)
2764
if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2766
if (!(flags&SWS_FAST_BILINEAR))
2769
RENAME(hScale)(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2770
RENAME(hScale)(dst+2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2772
else // Fast Bilinear upscale / crap downscale
2774
#if defined(ARCH_X86)
2778
uint64_t ebxsave __attribute__((aligned(8)));
2784
"mov %%"REG_b", %6 \n\t"
2786
"pxor %%mm7, %%mm7 \n\t"
2787
"mov %0, %%"REG_c" \n\t"
2788
"mov %1, %%"REG_D" \n\t"
2789
"mov %2, %%"REG_d" \n\t"
2790
"mov %3, %%"REG_b" \n\t"
2791
"xor %%"REG_a", %%"REG_a" \n\t" // i
2792
PREFETCH" (%%"REG_c") \n\t"
2793
PREFETCH" 32(%%"REG_c") \n\t"
2794
PREFETCH" 64(%%"REG_c") \n\t"
2798
#define FUNNY_UV_CODE \
2799
"movl (%%"REG_b"), %%esi \n\t"\
2801
"movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
2802
"add %%"REG_S", %%"REG_c" \n\t"\
2803
"add %%"REG_a", %%"REG_D" \n\t"\
2804
"xor %%"REG_a", %%"REG_a" \n\t"\
2808
#define FUNNY_UV_CODE \
2809
"movl (%%"REG_b"), %%esi \n\t"\
2811
"addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2812
"add %%"REG_a", %%"REG_D" \n\t"\
2813
"xor %%"REG_a", %%"REG_a" \n\t"\
2821
"xor %%"REG_a", %%"REG_a" \n\t" // i
2822
"mov %5, %%"REG_c" \n\t" // src
2823
"mov %1, %%"REG_D" \n\t" // buf1
2824
"add $4096, %%"REG_D" \n\t"
2825
PREFETCH" (%%"REG_c") \n\t"
2826
PREFETCH" 32(%%"REG_c") \n\t"
2827
PREFETCH" 64(%%"REG_c") \n\t"
2835
"mov %6, %%"REG_b" \n\t"
2837
:: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2838
"m" (funnyUVCode), "m" (src2)
2842
: "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
2847
for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
2849
//printf("%d %d %d\n", dstWidth, i, srcW);
2850
dst[i] = src1[srcW-1]*128;
2851
dst[i+2048] = src2[srcW-1]*128;
2857
long xInc_shr16 = (long) (xInc >> 16);
2858
uint16_t xInc_mask = xInc & 0xffff;
2860
"xor %%"REG_a", %%"REG_a" \n\t" // i
2861
"xor %%"REG_d", %%"REG_d" \n\t" // xx
2862
"xorl %%ecx, %%ecx \n\t" // 2*xalpha
2865
"mov %0, %%"REG_S" \n\t"
2866
"movzbl (%%"REG_S", %%"REG_d"), %%edi \n\t" //src[xx]
2867
"movzbl 1(%%"REG_S", %%"REG_d"), %%esi \n\t" //src[xx+1]
2868
"subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2869
"imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2870
"shll $16, %%edi \n\t"
2871
"addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2872
"mov %1, %%"REG_D" \n\t"
2873
"shrl $9, %%esi \n\t"
2874
"movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
2876
"movzbl (%5, %%"REG_d"), %%edi \n\t" //src[xx]
2877
"movzbl 1(%5, %%"REG_d"), %%esi \n\t" //src[xx+1]
2878
"subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2879
"imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2880
"shll $16, %%edi \n\t"
2881
"addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2882
"mov %1, %%"REG_D" \n\t"
2883
"shrl $9, %%esi \n\t"
2884
"movw %%si, 4096(%%"REG_D", %%"REG_a", 2) \n\t"
2886
"addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2887
"adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry
2888
"add $1, %%"REG_a" \n\t"
2889
"cmp %2, %%"REG_a" \n\t"
2892
/* GCC-3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
2893
which is needed to support GCC-4.0 */
2894
#if defined(ARCH_X86_64) && ((__GNUC__ > 3) || ( __GNUC__ == 3 && __GNUC_MINOR__ >= 4))
2895
:: "m" (src1), "m" (dst), "g" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2897
:: "m" (src1), "m" (dst), "m" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2900
: "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2903
} //if MMX2 can't be used
2907
unsigned int xpos=0;
2908
for (i=0;i<dstWidth;i++)
2910
register unsigned int xx=xpos>>16;
2911
register unsigned int xalpha=(xpos&0xFFFF)>>9;
2912
dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2913
dst[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
2915
dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
2916
dst[i+2048]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
2924
static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
2925
int srcSliceH, uint8_t* dst[], int dstStride[]){
2927
/* load a few things into local vars to make the code more readable? and faster */
2928
const int srcW= c->srcW;
2929
const int dstW= c->dstW;
2930
const int dstH= c->dstH;
2931
const int chrDstW= c->chrDstW;
2932
const int chrSrcW= c->chrSrcW;
2933
const int lumXInc= c->lumXInc;
2934
const int chrXInc= c->chrXInc;
2935
const int dstFormat= c->dstFormat;
2936
const int srcFormat= c->srcFormat;
2937
const int flags= c->flags;
2938
const int canMMX2BeUsed= c->canMMX2BeUsed;
2939
int16_t *vLumFilterPos= c->vLumFilterPos;
2940
int16_t *vChrFilterPos= c->vChrFilterPos;
2941
int16_t *hLumFilterPos= c->hLumFilterPos;
2942
int16_t *hChrFilterPos= c->hChrFilterPos;
2943
int16_t *vLumFilter= c->vLumFilter;
2944
int16_t *vChrFilter= c->vChrFilter;
2945
int16_t *hLumFilter= c->hLumFilter;
2946
int16_t *hChrFilter= c->hChrFilter;
2947
int32_t *lumMmxFilter= c->lumMmxFilter;
2948
int32_t *chrMmxFilter= c->chrMmxFilter;
2949
const int vLumFilterSize= c->vLumFilterSize;
2950
const int vChrFilterSize= c->vChrFilterSize;
2951
const int hLumFilterSize= c->hLumFilterSize;
2952
const int hChrFilterSize= c->hChrFilterSize;
2953
int16_t **lumPixBuf= c->lumPixBuf;
2954
int16_t **chrPixBuf= c->chrPixBuf;
2955
const int vLumBufSize= c->vLumBufSize;
2956
const int vChrBufSize= c->vChrBufSize;
2957
uint8_t *funnyYCode= c->funnyYCode;
2958
uint8_t *funnyUVCode= c->funnyUVCode;
2959
uint8_t *formatConvBuffer= c->formatConvBuffer;
2960
const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
2961
const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
2965
/* vars whch will change and which we need to storw back in the context */
2967
int lumBufIndex= c->lumBufIndex;
2968
int chrBufIndex= c->chrBufIndex;
2969
int lastInLumBuf= c->lastInLumBuf;
2970
int lastInChrBuf= c->lastInChrBuf;
2972
if (isPacked(c->srcFormat)){
2979
srcStride[2]= srcStride[0];
2981
srcStride[1]<<= c->vChrDrop;
2982
srcStride[2]<<= c->vChrDrop;
2984
//printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
2985
// (int)dst[0], (int)dst[1], (int)dst[2]);
2987
#if 0 //self test FIXME move to a vfilter or something
2989
static volatile int i=0;
2991
if (srcFormat==PIX_FMT_YUV420P && i==1 && srcSliceH>= c->srcH)
2992
selfTest(src, srcStride, c->srcW, c->srcH);
2997
//printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
2998
//dstStride[0],dstStride[1],dstStride[2]);
3000
if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
3002
static int firstTime=1; //FIXME move this into the context perhaps
3003
if (flags & SWS_PRINT_INFO && firstTime)
3005
av_log(c, AV_LOG_WARNING, "SwScaler: Warning: dstStride is not aligned!\n"
3006
"SwScaler: ->cannot do aligned memory acesses anymore\n");
3011
/* Note the user might start scaling the picture in the middle so this will not get executed
3012
this is not really intended but works currently, so ppl might do it */
3023
for (;dstY < dstH; dstY++){
3024
unsigned char *dest =dst[0]+dstStride[0]*dstY;
3025
const int chrDstY= dstY>>c->chrDstVSubSample;
3026
unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
3027
unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
3029
const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
3030
const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
3031
const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
3032
const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
3034
//printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n",
3035
// dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize, c->chrSrcVSubSample);
3036
//handle holes (FAST_BILINEAR & weird filters)
3037
if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
3038
if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
3039
//printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
3040
ASSERT(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1)
3041
ASSERT(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1)
3043
// Do we have enough lines in this slice to output the dstY line
3044
if (lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
3046
//Do horizontal scaling
3047
while(lastInLumBuf < lastLumSrcY)
3049
uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
3051
//printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf, lastLumSrcY);
3052
ASSERT(lumBufIndex < 2*vLumBufSize)
3053
ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
3054
ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
3055
//printf("%d %d\n", lumBufIndex, vLumBufSize);
3056
RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
3057
flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
3058
funnyYCode, c->srcFormat, formatConvBuffer,
3059
c->lumMmx2Filter, c->lumMmx2FilterPos, pal);
3062
while(lastInChrBuf < lastChrSrcY)
3064
uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
3065
uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
3067
ASSERT(chrBufIndex < 2*vChrBufSize)
3068
ASSERT(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH))
3069
ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
3070
//FIXME replace parameters through context struct (some at least)
3072
if (!(isGray(srcFormat) || isGray(dstFormat)))
3073
RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
3074
flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
3075
funnyUVCode, c->srcFormat, formatConvBuffer,
3076
c->chrMmx2Filter, c->chrMmx2FilterPos, pal);
3079
//wrap buf index around to stay inside the ring buffer
3080
if (lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
3081
if (chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
3083
else // not enough lines left in this slice -> load the rest in the buffer
3085
/* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
3086
firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
3087
lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
3088
vChrBufSize, vLumBufSize);*/
3090
//Do horizontal scaling
3091
while(lastInLumBuf+1 < srcSliceY + srcSliceH)
3093
uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
3095
ASSERT(lumBufIndex < 2*vLumBufSize)
3096
ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
3097
ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
3098
RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
3099
flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
3100
funnyYCode, c->srcFormat, formatConvBuffer,
3101
c->lumMmx2Filter, c->lumMmx2FilterPos, pal);
3104
while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH))
3106
uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
3107
uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
3109
ASSERT(chrBufIndex < 2*vChrBufSize)
3110
ASSERT(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH)
3111
ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
3113
if (!(isGray(srcFormat) || isGray(dstFormat)))
3114
RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
3115
flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
3116
funnyUVCode, c->srcFormat, formatConvBuffer,
3117
c->chrMmx2Filter, c->chrMmx2FilterPos, pal);
3120
//wrap buf index around to stay inside the ring buffer
3121
if (lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
3122
if (chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
3123
break; //we can't output a dstY line so let's try with the next slice
3127
b5Dither= dither8[dstY&1];
3128
g6Dither= dither4[dstY&1];
3129
g5Dither= dither8[dstY&1];
3130
r5Dither= dither8[(dstY+1)&1];
3134
int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
3135
int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
3138
if (flags & SWS_ACCURATE_RND){
3139
for (i=0; i<vLumFilterSize; i+=2){
3140
lumMmxFilter[2*i+0]= (int32_t)lumSrcPtr[i ];
3141
lumMmxFilter[2*i+1]= (int32_t)lumSrcPtr[i+(vLumFilterSize>1)];
3142
lumMmxFilter[2*i+2]=
3143
lumMmxFilter[2*i+3]= vLumFilter[dstY*vLumFilterSize + i ]
3144
+ (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
3146
for (i=0; i<vChrFilterSize; i+=2){
3147
chrMmxFilter[2*i+0]= (int32_t)chrSrcPtr[i ];
3148
chrMmxFilter[2*i+1]= (int32_t)chrSrcPtr[i+(vChrFilterSize>1)];
3149
chrMmxFilter[2*i+2]=
3150
chrMmxFilter[2*i+3]= vChrFilter[chrDstY*vChrFilterSize + i ]
3151
+ (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
3154
for (i=0; i<vLumFilterSize; i++)
3156
lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
3157
lumMmxFilter[4*i+1]= (uint64_t)lumSrcPtr[i] >> 32;
3158
lumMmxFilter[4*i+2]=
3159
lumMmxFilter[4*i+3]=
3160
((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
3162
for (i=0; i<vChrFilterSize; i++)
3164
chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
3165
chrMmxFilter[4*i+1]= (uint64_t)chrSrcPtr[i] >> 32;
3166
chrMmxFilter[4*i+2]=
3167
chrMmxFilter[4*i+3]=
3168
((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
3172
if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){
3173
const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3174
if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
3175
RENAME(yuv2nv12X)(c,
3176
vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
3177
vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3178
dest, uDest, dstW, chrDstW, dstFormat);
3180
else if (isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12 like
3182
const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3183
if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
3184
if (vLumFilterSize == 1 && vChrFilterSize == 1) // Unscaled YV12
3186
int16_t *lumBuf = lumPixBuf[0];
3187
int16_t *chrBuf= chrPixBuf[0];
3188
RENAME(yuv2yuv1)(lumBuf, chrBuf, dest, uDest, vDest, dstW, chrDstW);
3193
vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
3194
vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3195
dest, uDest, vDest, dstW, chrDstW);
3200
ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
3201
ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
3202
if (vLumFilterSize == 1 && vChrFilterSize == 2) //Unscaled RGB
3204
int chrAlpha= vChrFilter[2*dstY+1];
3205
RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
3206
dest, dstW, chrAlpha, dstFormat, flags, dstY);
3208
else if (vLumFilterSize == 2 && vChrFilterSize == 2) //BiLinear Upscale RGB
3210
int lumAlpha= vLumFilter[2*dstY+1];
3211
int chrAlpha= vChrFilter[2*dstY+1];
3213
lumMmxFilter[3]= vLumFilter[2*dstY ]*0x10001;
3215
chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
3216
RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
3217
dest, dstW, lumAlpha, chrAlpha, dstY);
3221
RENAME(yuv2packedX)(c,
3222
vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
3223
vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3228
else // hmm looks like we can't use MMX here without overwriting this array's tail
3230
int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
3231
int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
3232
if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){
3233
const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3234
if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
3236
vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
3237
vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3238
dest, uDest, dstW, chrDstW, dstFormat);
3240
else if (isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12
3242
const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3243
if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
3245
vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
3246
vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3247
dest, uDest, vDest, dstW, chrDstW);
3251
ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
3252
ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
3254
vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
3255
vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3262
__asm __volatile(SFENCE:::"memory");
3263
__asm __volatile(EMMS:::"memory");
3265
/* store changed local vars back in the context */
3267
c->lumBufIndex= lumBufIndex;
3268
c->chrBufIndex= chrBufIndex;
3269
c->lastInLumBuf= lastInLumBuf;
3270
c->lastInChrBuf= lastInChrBuf;
3272
return dstY - lastDstY;