~ubuntu-branches/ubuntu/hardy/avidemux/hardy

Viewing changes to avidemux/MPlayer_pp/swscale_template.c

  • Committer: Bazaar Package Importer
  • Author(s): Daniel T Chen
  • Date: 2006-12-15 17:13:20 UTC
  • mfrom: (1.1.6 upstream)
  • Revision ID: james.westby@ubuntu.com-20061215171320-w79pvpehxx2fr217
Tags: 1:2.3.0-0.0ubuntu1
* Merge from debian-multimedia.org, remaining Ubuntu change:
  - desktop file,
  - no support for ccache and make -j.
* Closes Ubuntu: #69614.

/*
    Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/

#undef REAL_MOVNTQ
#undef MOVNTQ
#undef PAVGB
#undef PREFETCH
#undef PREFETCHW
#undef EMMS
#undef SFENCE

#ifdef HAVE_3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
#define EMMS     "femms"
#else
#define EMMS     "emms"
#endif

#ifdef HAVE_3DNOW
#define PREFETCH  "prefetch"
#define PREFETCHW "prefetchw"
#elif defined ( HAVE_MMX2 )
#define PREFETCH "prefetchnta"
#define PREFETCHW "prefetcht0"
#else
#define PREFETCH "/nop"
#define PREFETCHW "/nop"
#endif

#ifdef HAVE_MMX2
#define SFENCE "sfence"
#else
#define SFENCE "/nop"
#endif

#ifdef HAVE_MMX2
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif defined (HAVE_3DNOW)
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif

#ifdef HAVE_MMX2
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#else
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#endif
#define MOVNTQ(a,b)  REAL_MOVNTQ(a,b)
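
/* MOVNTQ wraps REAL_MOVNTQ (the same pattern is used by the REAL_YSCALEYUV2*
 * and REAL_WRITE* macros below): the extra indirection is the usual
 * preprocessor trick so that arguments which are themselves macros get
 * expanded before the inner macro stringizes them with '#'.  With MMX2 this
 * yields a non-temporal movntq store, otherwise a plain movq. */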

#ifdef HAVE_ALTIVEC
#include "swscale_altivec_template.c"
#endif
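
/* YSCALEYUV2YV12X: vertical filter for one plane.  It walks the
 * (coefficient, source-line pointer) pairs stored at "offset" inside the
 * context, accumulates pmulhw(filterCoeff, srcData) on top of the rounding
 * constant in mm3/mm4, then shifts down, packs and stores 8 output bytes per
 * iteration through MOVNTQ. */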

#define YSCALEYUV2YV12X(x, offset) \
                        "xor %%"REG_a", %%"REG_a"       \n\t"\
                        "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
                        "movq %%mm3, %%mm4              \n\t"\
                        "lea " offset "(%0), %%"REG_d"  \n\t"\
                        "mov (%%"REG_d"), %%"REG_S"     \n\t"\
                        ".balign 16                     \n\t" /* FIXME Unroll? */\
                        "1:                             \n\t"\
                        "movq 8(%%"REG_d"), %%mm0       \n\t" /* filterCoeff */\
                        "movq " #x "(%%"REG_S", %%"REG_a", 2), %%mm2\n\t" /* srcData */\
                        "movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm5\n\t" /* srcData */\
                        "add $16, %%"REG_d"             \n\t"\
                        "mov (%%"REG_d"), %%"REG_S"     \n\t"\
                        "test %%"REG_S", %%"REG_S"      \n\t"\
                        "pmulhw %%mm0, %%mm2            \n\t"\
                        "pmulhw %%mm0, %%mm5            \n\t"\
                        "paddw %%mm2, %%mm3             \n\t"\
                        "paddw %%mm5, %%mm4             \n\t"\
                        " jnz 1b                        \n\t"\
                        "psraw $3, %%mm3                \n\t"\
                        "psraw $3, %%mm4                \n\t"\
                        "packuswb %%mm4, %%mm3          \n\t"\
                        MOVNTQ(%%mm3, (%1, %%REGa))\
                        "add $8, %%"REG_a"              \n\t"\
                        "cmp %2, %%"REG_a"              \n\t"\
                        "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
                        "movq %%mm3, %%mm4              \n\t"\
                        "lea " offset "(%0), %%"REG_d"  \n\t"\
                        "mov (%%"REG_d"), %%"REG_S"     \n\t"\
                        "jb 1b                          \n\t"

#define YSCALEYUV2YV121 \
                        "mov %2, %%"REG_a"              \n\t"\
                        ".balign 16                     \n\t" /* FIXME Unroll? */\
                        "1:                             \n\t"\
                        "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
                        "movq 8(%0, %%"REG_a", 2), %%mm1\n\t"\
                        "psraw $7, %%mm0                \n\t"\
                        "psraw $7, %%mm1                \n\t"\
                        "packuswb %%mm1, %%mm0          \n\t"\
                        MOVNTQ(%%mm0, (%1, %%REGa))\
                        "add $8, %%"REG_a"              \n\t"\
                        "jnc 1b                         \n\t"

/*
                        :: "m" (-lumFilterSize), "m" (-chrFilterSize),
                           "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
                           "r" (dest), "m" (dstW),
                           "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
                        : "%eax", "%ebx", "%ecx", "%edx", "%esi"
*/
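
/* YSCALEYUV2PACKEDX: the same vertical filtering as YSCALEYUV2YV12X, but for
 * the packed output paths: the first inner loop accumulates the chroma
 * filter into mm3 (U) and mm4 (V), the second accumulates the luma filter
 * into mm1 and mm7 (two groups of four Y samples). */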
#define YSCALEYUV2PACKEDX \
                "xor %%"REG_a", %%"REG_a"       \n\t"\
                ".balign 16                     \n\t"\
                "nop                            \n\t"\
                "1:                             \n\t"\
                "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
                "mov (%%"REG_d"), %%"REG_S"     \n\t"\
                "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
                "movq %%mm3, %%mm4              \n\t"\
                ".balign 16                     \n\t"\
                "2:                             \n\t"\
                "movq 8(%%"REG_d"), %%mm0       \n\t" /* filterCoeff */\
                "movq (%%"REG_S", %%"REG_a"), %%mm2     \n\t" /* UsrcData */\
                "movq 4096(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
                "add $16, %%"REG_d"             \n\t"\
                "mov (%%"REG_d"), %%"REG_S"     \n\t"\
                "pmulhw %%mm0, %%mm2            \n\t"\
                "pmulhw %%mm0, %%mm5            \n\t"\
                "paddw %%mm2, %%mm3             \n\t"\
                "paddw %%mm5, %%mm4             \n\t"\
                "test %%"REG_S", %%"REG_S"      \n\t"\
                " jnz 2b                        \n\t"\
\
                "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
                "mov (%%"REG_d"), %%"REG_S"     \n\t"\
                "movq "VROUNDER_OFFSET"(%0), %%mm1\n\t"\
                "movq %%mm1, %%mm7              \n\t"\
                ".balign 16                     \n\t"\
                "2:                             \n\t"\
                "movq 8(%%"REG_d"), %%mm0       \n\t" /* filterCoeff */\
                "movq (%%"REG_S", %%"REG_a", 2), %%mm2  \n\t" /* Y1srcData */\
                "movq 8(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* Y2srcData */\
                "add $16, %%"REG_d"             \n\t"\
                "mov (%%"REG_d"), %%"REG_S"     \n\t"\
                "pmulhw %%mm0, %%mm2            \n\t"\
                "pmulhw %%mm0, %%mm5            \n\t"\
                "paddw %%mm2, %%mm1             \n\t"\
                "paddw %%mm5, %%mm7             \n\t"\
                "test %%"REG_S", %%"REG_S"      \n\t"\
                " jnz 2b                        \n\t"\


#define YSCALEYUV2RGBX \
                YSCALEYUV2PACKEDX\
                "psubw "U_OFFSET"(%0), %%mm3    \n\t" /* (U-128)8*/\
                "psubw "V_OFFSET"(%0), %%mm4    \n\t" /* (V-128)8*/\
                "movq %%mm3, %%mm2              \n\t" /* (U-128)8*/\
                "movq %%mm4, %%mm5              \n\t" /* (V-128)8*/\
                "pmulhw "UG_COEFF"(%0), %%mm3   \n\t"\
                "pmulhw "VG_COEFF"(%0), %%mm4   \n\t"\
        /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
                "pmulhw "UB_COEFF"(%0), %%mm2   \n\t"\
                "pmulhw "VR_COEFF"(%0), %%mm5   \n\t"\
                "psubw "Y_OFFSET"(%0), %%mm1    \n\t" /* 8(Y-16)*/\
                "psubw "Y_OFFSET"(%0), %%mm7    \n\t" /* 8(Y-16)*/\
                "pmulhw "Y_COEFF"(%0), %%mm1    \n\t"\
                "pmulhw "Y_COEFF"(%0), %%mm7    \n\t"\
        /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
                "paddw %%mm3, %%mm4             \n\t"\
                "movq %%mm2, %%mm0              \n\t"\
                "movq %%mm5, %%mm6              \n\t"\
                "movq %%mm4, %%mm3              \n\t"\
                "punpcklwd %%mm2, %%mm2         \n\t"\
                "punpcklwd %%mm5, %%mm5         \n\t"\
                "punpcklwd %%mm4, %%mm4         \n\t"\
                "paddw %%mm1, %%mm2             \n\t"\
                "paddw %%mm1, %%mm5             \n\t"\
                "paddw %%mm1, %%mm4             \n\t"\
                "punpckhwd %%mm0, %%mm0         \n\t"\
                "punpckhwd %%mm6, %%mm6         \n\t"\
                "punpckhwd %%mm3, %%mm3         \n\t"\
                "paddw %%mm7, %%mm0             \n\t"\
                "paddw %%mm7, %%mm6             \n\t"\
                "paddw %%mm7, %%mm3             \n\t"\
                /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
                "packuswb %%mm0, %%mm2          \n\t"\
                "packuswb %%mm6, %%mm5          \n\t"\
                "packuswb %%mm3, %%mm4          \n\t"\
                "pxor %%mm7, %%mm7              \n\t"
#if 0
#define FULL_YSCALEYUV2RGB \
                "pxor %%mm7, %%mm7              \n\t"\
                "movd %6, %%mm6                 \n\t" /*yalpha1*/\
                "punpcklwd %%mm6, %%mm6         \n\t"\
                "punpcklwd %%mm6, %%mm6         \n\t"\
                "movd %7, %%mm5                 \n\t" /*uvalpha1*/\
                "punpcklwd %%mm5, %%mm5         \n\t"\
                "punpcklwd %%mm5, %%mm5         \n\t"\
                "xor %%"REG_a", %%"REG_a"               \n\t"\
                ".balign 16                     \n\t"\
                "1:                             \n\t"\
                "movq (%0, %%"REG_a", 2), %%mm0 \n\t" /*buf0[eax]*/\
                "movq (%1, %%"REG_a", 2), %%mm1 \n\t" /*buf1[eax]*/\
                "movq (%2, %%"REG_a",2), %%mm2  \n\t" /* uvbuf0[eax]*/\
                "movq (%3, %%"REG_a",2), %%mm3  \n\t" /* uvbuf1[eax]*/\
                "psubw %%mm1, %%mm0             \n\t" /* buf0[eax] - buf1[eax]*/\
                "psubw %%mm3, %%mm2             \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
                "pmulhw %%mm6, %%mm0            \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
                "pmulhw %%mm5, %%mm2            \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
                "psraw $4, %%mm1                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
                "movq 4096(%2, %%"REG_a",2), %%mm4      \n\t" /* uvbuf0[eax+2048]*/\
                "psraw $4, %%mm3                \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
                "paddw %%mm0, %%mm1             \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
                "movq 4096(%3, %%"REG_a",2), %%mm0      \n\t" /* uvbuf1[eax+2048]*/\
                "paddw %%mm2, %%mm3             \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
                "psubw %%mm0, %%mm4             \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
                "psubw "MANGLE(w80)", %%mm1     \n\t" /* 8(Y-16)*/\
                "psubw "MANGLE(w400)", %%mm3    \n\t" /* 8(U-128)*/\
                "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
\
\
                "pmulhw %%mm5, %%mm4            \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
                "movq %%mm3, %%mm2              \n\t" /* (U-128)8*/\
                "pmulhw "MANGLE(ubCoeff)", %%mm3\n\t"\
                "psraw $4, %%mm0                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
                "pmulhw "MANGLE(ugCoeff)", %%mm2\n\t"\
                "paddw %%mm4, %%mm0             \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
                "psubw "MANGLE(w400)", %%mm0    \n\t" /* (V-128)8*/\
\
\
                "movq %%mm0, %%mm4              \n\t" /* (V-128)8*/\
                "pmulhw "MANGLE(vrCoeff)", %%mm0\n\t"\
                "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
                "paddw %%mm1, %%mm3             \n\t" /* B*/\
                "paddw %%mm1, %%mm0             \n\t" /* R*/\
                "packuswb %%mm3, %%mm3          \n\t"\
\
                "packuswb %%mm0, %%mm0          \n\t"\
                "paddw %%mm4, %%mm2             \n\t"\
                "paddw %%mm2, %%mm1             \n\t" /* G*/\
\
                "packuswb %%mm1, %%mm1          \n\t"
#endif

#define REAL_YSCALEYUV2PACKED(index, c) \
                "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
                "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1\n\t"\
                "psraw $3, %%mm0                \n\t"\
                "psraw $3, %%mm1                \n\t"\
                "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c")\n\t"\
                "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c")\n\t"\
                "xor "#index", "#index"         \n\t"\
                ".balign 16                     \n\t"\
                "1:                             \n\t"\
                "movq (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
                "movq (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
                "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
                "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
                "psubw %%mm3, %%mm2             \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
                "psubw %%mm4, %%mm5             \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
                "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
                "pmulhw %%mm0, %%mm2            \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
                "pmulhw %%mm0, %%mm5            \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
                "psraw $7, %%mm3                \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
                "psraw $7, %%mm4                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
                "paddw %%mm2, %%mm3             \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
                "paddw %%mm5, %%mm4             \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
                "movq (%0, "#index", 2), %%mm0  \n\t" /*buf0[eax]*/\
                "movq (%1, "#index", 2), %%mm1  \n\t" /*buf1[eax]*/\
                "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
                "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
                "psubw %%mm1, %%mm0             \n\t" /* buf0[eax] - buf1[eax]*/\
                "psubw %%mm7, %%mm6             \n\t" /* buf0[eax] - buf1[eax]*/\
                "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
                "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
                "psraw $7, %%mm1                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
                "psraw $7, %%mm7                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
                "paddw %%mm0, %%mm1             \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
                "paddw %%mm6, %%mm7             \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\

#define YSCALEYUV2PACKED(index, c)  REAL_YSCALEYUV2PACKED(index, c)

#define REAL_YSCALEYUV2RGB(index, c) \
                "xor "#index", "#index" \n\t"\
                ".balign 16                     \n\t"\
                "1:                             \n\t"\
                "movq (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
                "movq (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
                "movq 4096(%2, "#index"), %%mm5\n\t" /* uvbuf0[eax+2048]*/\
                "movq 4096(%3, "#index"), %%mm4\n\t" /* uvbuf1[eax+2048]*/\
                "psubw %%mm3, %%mm2             \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
                "psubw %%mm4, %%mm5             \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
                "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
                "pmulhw %%mm0, %%mm2            \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
                "pmulhw %%mm0, %%mm5            \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
                "psraw $4, %%mm3                \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
                "psraw $4, %%mm4                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
                "paddw %%mm2, %%mm3             \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
                "paddw %%mm5, %%mm4             \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
                "psubw "U_OFFSET"("#c"), %%mm3  \n\t" /* (U-128)8*/\
                "psubw "V_OFFSET"("#c"), %%mm4  \n\t" /* (V-128)8*/\
                "movq %%mm3, %%mm2              \n\t" /* (U-128)8*/\
                "movq %%mm4, %%mm5              \n\t" /* (V-128)8*/\
                "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
                "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
        /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
                "movq (%0, "#index", 2), %%mm0  \n\t" /*buf0[eax]*/\
                "movq (%1, "#index", 2), %%mm1  \n\t" /*buf1[eax]*/\
                "movq 8(%0, "#index", 2), %%mm6\n\t" /*buf0[eax]*/\
                "movq 8(%1, "#index", 2), %%mm7\n\t" /*buf1[eax]*/\
                "psubw %%mm1, %%mm0             \n\t" /* buf0[eax] - buf1[eax]*/\
                "psubw %%mm7, %%mm6             \n\t" /* buf0[eax] - buf1[eax]*/\
                "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
                "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
                "psraw $4, %%mm1                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
                "psraw $4, %%mm7                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
                "paddw %%mm0, %%mm1             \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
                "paddw %%mm6, %%mm7             \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
                "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
                "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
                "psubw "Y_OFFSET"("#c"), %%mm1  \n\t" /* 8(Y-16)*/\
                "psubw "Y_OFFSET"("#c"), %%mm7  \n\t" /* 8(Y-16)*/\
                "pmulhw "Y_COEFF"("#c"), %%mm1  \n\t"\
                "pmulhw "Y_COEFF"("#c"), %%mm7  \n\t"\
        /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
                "paddw %%mm3, %%mm4             \n\t"\
                "movq %%mm2, %%mm0              \n\t"\
                "movq %%mm5, %%mm6              \n\t"\
                "movq %%mm4, %%mm3              \n\t"\
                "punpcklwd %%mm2, %%mm2         \n\t"\
                "punpcklwd %%mm5, %%mm5         \n\t"\
                "punpcklwd %%mm4, %%mm4         \n\t"\
                "paddw %%mm1, %%mm2             \n\t"\
                "paddw %%mm1, %%mm5             \n\t"\
                "paddw %%mm1, %%mm4             \n\t"\
                "punpckhwd %%mm0, %%mm0         \n\t"\
                "punpckhwd %%mm6, %%mm6         \n\t"\
                "punpckhwd %%mm3, %%mm3         \n\t"\
                "paddw %%mm7, %%mm0             \n\t"\
                "paddw %%mm7, %%mm6             \n\t"\
                "paddw %%mm7, %%mm3             \n\t"\
                /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
                "packuswb %%mm0, %%mm2          \n\t"\
                "packuswb %%mm6, %%mm5          \n\t"\
                "packuswb %%mm3, %%mm4          \n\t"\
                "pxor %%mm7, %%mm7              \n\t"
#define YSCALEYUV2RGB(index, c)  REAL_YSCALEYUV2RGB(index, c)
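
/* Interpolation variants: YSCALEYUV2PACKED/YSCALEYUV2RGB above blend two
 * source lines with the yalpha/uvalpha weights stored in the context; the *1
 * versions below read a single line (no vertical interpolation); the *1b
 * versions additionally average two chroma lines. */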

#define REAL_YSCALEYUV2PACKED1(index, c) \
                "xor "#index", "#index"         \n\t"\
                ".balign 16                     \n\t"\
                "1:                             \n\t"\
                "movq (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
                "movq 4096(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
                "psraw $7, %%mm3                \n\t" \
                "psraw $7, %%mm4                \n\t" \
                "movq (%0, "#index", 2), %%mm1  \n\t" /*buf0[eax]*/\
                "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
                "psraw $7, %%mm1                \n\t" \
                "psraw $7, %%mm7                \n\t" \

#define YSCALEYUV2PACKED1(index, c)  REAL_YSCALEYUV2PACKED1(index, c)

#define REAL_YSCALEYUV2RGB1(index, c) \
                "xor "#index", "#index" \n\t"\
                ".balign 16                     \n\t"\
                "1:                             \n\t"\
                "movq (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
                "movq 4096(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
                "psraw $4, %%mm3                \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
                "psraw $4, %%mm4                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
                "psubw "U_OFFSET"("#c"), %%mm3  \n\t" /* (U-128)8*/\
                "psubw "V_OFFSET"("#c"), %%mm4  \n\t" /* (V-128)8*/\
                "movq %%mm3, %%mm2              \n\t" /* (U-128)8*/\
                "movq %%mm4, %%mm5              \n\t" /* (V-128)8*/\
                "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
                "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
        /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
                "movq (%0, "#index", 2), %%mm1  \n\t" /*buf0[eax]*/\
                "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
                "psraw $4, %%mm1                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
                "psraw $4, %%mm7                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
                "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
                "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
                "psubw "Y_OFFSET"("#c"), %%mm1  \n\t" /* 8(Y-16)*/\
                "psubw "Y_OFFSET"("#c"), %%mm7  \n\t" /* 8(Y-16)*/\
                "pmulhw "Y_COEFF"("#c"), %%mm1  \n\t"\
                "pmulhw "Y_COEFF"("#c"), %%mm7  \n\t"\
        /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
                "paddw %%mm3, %%mm4             \n\t"\
                "movq %%mm2, %%mm0              \n\t"\
                "movq %%mm5, %%mm6              \n\t"\
                "movq %%mm4, %%mm3              \n\t"\
                "punpcklwd %%mm2, %%mm2         \n\t"\
                "punpcklwd %%mm5, %%mm5         \n\t"\
                "punpcklwd %%mm4, %%mm4         \n\t"\
                "paddw %%mm1, %%mm2             \n\t"\
                "paddw %%mm1, %%mm5             \n\t"\
                "paddw %%mm1, %%mm4             \n\t"\
                "punpckhwd %%mm0, %%mm0         \n\t"\
                "punpckhwd %%mm6, %%mm6         \n\t"\
                "punpckhwd %%mm3, %%mm3         \n\t"\
                "paddw %%mm7, %%mm0             \n\t"\
                "paddw %%mm7, %%mm6             \n\t"\
                "paddw %%mm7, %%mm3             \n\t"\
                /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
                "packuswb %%mm0, %%mm2          \n\t"\
                "packuswb %%mm6, %%mm5          \n\t"\
                "packuswb %%mm3, %%mm4          \n\t"\
                "pxor %%mm7, %%mm7              \n\t"
#define YSCALEYUV2RGB1(index, c)  REAL_YSCALEYUV2RGB1(index, c)

#define REAL_YSCALEYUV2PACKED1b(index, c) \
                "xor "#index", "#index"         \n\t"\
                ".balign 16                     \n\t"\
                "1:                             \n\t"\
                "movq (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
                "movq (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
                "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
                "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
                "paddw %%mm2, %%mm3             \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
                "paddw %%mm5, %%mm4             \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
                "psrlw $8, %%mm3                \n\t" \
                "psrlw $8, %%mm4                \n\t" \
                "movq (%0, "#index", 2), %%mm1  \n\t" /*buf0[eax]*/\
                "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
                "psraw $7, %%mm1                \n\t" \
                "psraw $7, %%mm7                \n\t"
#define YSCALEYUV2PACKED1b(index, c)  REAL_YSCALEYUV2PACKED1b(index, c)

// do vertical chrominance interpolation
#define REAL_YSCALEYUV2RGB1b(index, c) \
                "xor "#index", "#index"         \n\t"\
                ".balign 16                     \n\t"\
                "1:                             \n\t"\
                "movq (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
                "movq (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
                "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
                "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
                "paddw %%mm2, %%mm3             \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
                "paddw %%mm5, %%mm4             \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
                "psrlw $5, %%mm3                \n\t" /*FIXME might overflow*/\
                "psrlw $5, %%mm4                \n\t" /*FIXME might overflow*/\
                "psubw "U_OFFSET"("#c"), %%mm3  \n\t" /* (U-128)8*/\
                "psubw "V_OFFSET"("#c"), %%mm4  \n\t" /* (V-128)8*/\
                "movq %%mm3, %%mm2              \n\t" /* (U-128)8*/\
                "movq %%mm4, %%mm5              \n\t" /* (V-128)8*/\
                "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
                "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
        /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
                "movq (%0, "#index", 2), %%mm1  \n\t" /*buf0[eax]*/\
                "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
                "psraw $4, %%mm1                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
                "psraw $4, %%mm7                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
                "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
                "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
                "psubw "Y_OFFSET"("#c"), %%mm1  \n\t" /* 8(Y-16)*/\
                "psubw "Y_OFFSET"("#c"), %%mm7  \n\t" /* 8(Y-16)*/\
                "pmulhw "Y_COEFF"("#c"), %%mm1  \n\t"\
                "pmulhw "Y_COEFF"("#c"), %%mm7  \n\t"\
        /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
                "paddw %%mm3, %%mm4             \n\t"\
                "movq %%mm2, %%mm0              \n\t"\
                "movq %%mm5, %%mm6              \n\t"\
                "movq %%mm4, %%mm3              \n\t"\
                "punpcklwd %%mm2, %%mm2         \n\t"\
                "punpcklwd %%mm5, %%mm5         \n\t"\
                "punpcklwd %%mm4, %%mm4         \n\t"\
                "paddw %%mm1, %%mm2             \n\t"\
                "paddw %%mm1, %%mm5             \n\t"\
                "paddw %%mm1, %%mm4             \n\t"\
                "punpckhwd %%mm0, %%mm0         \n\t"\
                "punpckhwd %%mm6, %%mm6         \n\t"\
                "punpckhwd %%mm3, %%mm3         \n\t"\
                "paddw %%mm7, %%mm0             \n\t"\
                "paddw %%mm7, %%mm6             \n\t"\
                "paddw %%mm7, %%mm3             \n\t"\
                /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
                "packuswb %%mm0, %%mm2          \n\t"\
                "packuswb %%mm6, %%mm5          \n\t"\
                "packuswb %%mm3, %%mm4          \n\t"\
                "pxor %%mm7, %%mm7              \n\t"
#define YSCALEYUV2RGB1b(index, c)  REAL_YSCALEYUV2RGB1b(index, c)
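
/* The WRITE* macros below take the packed components left in the MMX
 * registers by the macros above (B in mm2, G in mm4, R in mm5, zero in mm7
 * for the RGB writers), interleave them into the destination pixel format,
 * store the result with MOVNTQ and advance the loop counter by 8 pixels. */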

#define REAL_WRITEBGR32(dst, dstw, index) \
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
                        "movq %%mm2, %%mm1              \n\t" /* B */\
                        "movq %%mm5, %%mm6              \n\t" /* R */\
                        "punpcklbw %%mm4, %%mm2         \n\t" /* GBGBGBGB 0 */\
                        "punpcklbw %%mm7, %%mm5         \n\t" /* 0R0R0R0R 0 */\
                        "punpckhbw %%mm4, %%mm1         \n\t" /* GBGBGBGB 2 */\
                        "punpckhbw %%mm7, %%mm6         \n\t" /* 0R0R0R0R 2 */\
                        "movq %%mm2, %%mm0              \n\t" /* GBGBGBGB 0 */\
                        "movq %%mm1, %%mm3              \n\t" /* GBGBGBGB 2 */\
                        "punpcklwd %%mm5, %%mm0         \n\t" /* 0RGB0RGB 0 */\
                        "punpckhwd %%mm5, %%mm2         \n\t" /* 0RGB0RGB 1 */\
                        "punpcklwd %%mm6, %%mm1         \n\t" /* 0RGB0RGB 2 */\
                        "punpckhwd %%mm6, %%mm3         \n\t" /* 0RGB0RGB 3 */\
\
                        MOVNTQ(%%mm0, (dst, index, 4))\
                        MOVNTQ(%%mm2, 8(dst, index, 4))\
                        MOVNTQ(%%mm1, 16(dst, index, 4))\
                        MOVNTQ(%%mm3, 24(dst, index, 4))\
\
                        "add $8, "#index"               \n\t"\
                        "cmp "#dstw", "#index"          \n\t"\
                        " jb 1b                         \n\t"
#define WRITEBGR32(dst, dstw, index)  REAL_WRITEBGR32(dst, dstw, index)

#define REAL_WRITEBGR16(dst, dstw, index) \
                        "pand "MANGLE(bF8)", %%mm2      \n\t" /* B */\
                        "pand "MANGLE(bFC)", %%mm4      \n\t" /* G */\
                        "pand "MANGLE(bF8)", %%mm5      \n\t" /* R */\
                        "psrlq $3, %%mm2                \n\t"\
\
                        "movq %%mm2, %%mm1              \n\t"\
                        "movq %%mm4, %%mm3              \n\t"\
\
                        "punpcklbw %%mm7, %%mm3         \n\t"\
                        "punpcklbw %%mm5, %%mm2         \n\t"\
                        "punpckhbw %%mm7, %%mm4         \n\t"\
                        "punpckhbw %%mm5, %%mm1         \n\t"\
\
                        "psllq $3, %%mm3                \n\t"\
                        "psllq $3, %%mm4                \n\t"\
\
                        "por %%mm3, %%mm2               \n\t"\
                        "por %%mm4, %%mm1               \n\t"\
\
                        MOVNTQ(%%mm2, (dst, index, 2))\
                        MOVNTQ(%%mm1, 8(dst, index, 2))\
\
                        "add $8, "#index"               \n\t"\
                        "cmp "#dstw", "#index"          \n\t"\
                        " jb 1b                         \n\t"
#define WRITEBGR16(dst, dstw, index)  REAL_WRITEBGR16(dst, dstw, index)

#define REAL_WRITEBGR15(dst, dstw, index) \
                        "pand "MANGLE(bF8)", %%mm2      \n\t" /* B */\
                        "pand "MANGLE(bF8)", %%mm4      \n\t" /* G */\
                        "pand "MANGLE(bF8)", %%mm5      \n\t" /* R */\
                        "psrlq $3, %%mm2                \n\t"\
                        "psrlq $1, %%mm5                \n\t"\
\
                        "movq %%mm2, %%mm1              \n\t"\
                        "movq %%mm4, %%mm3              \n\t"\
\
                        "punpcklbw %%mm7, %%mm3         \n\t"\
                        "punpcklbw %%mm5, %%mm2         \n\t"\
                        "punpckhbw %%mm7, %%mm4         \n\t"\
                        "punpckhbw %%mm5, %%mm1         \n\t"\
\
                        "psllq $2, %%mm3                \n\t"\
                        "psllq $2, %%mm4                \n\t"\
\
                        "por %%mm3, %%mm2               \n\t"\
                        "por %%mm4, %%mm1               \n\t"\
\
                        MOVNTQ(%%mm2, (dst, index, 2))\
                        MOVNTQ(%%mm1, 8(dst, index, 2))\
\
                        "add $8, "#index"               \n\t"\
                        "cmp "#dstw", "#index"          \n\t"\
                        " jb 1b                         \n\t"
#define WRITEBGR15(dst, dstw, index)  REAL_WRITEBGR15(dst, dstw, index)

#define WRITEBGR24OLD(dst, dstw, index) \
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
                        "movq %%mm2, %%mm1              \n\t" /* B */\
                        "movq %%mm5, %%mm6              \n\t" /* R */\
                        "punpcklbw %%mm4, %%mm2         \n\t" /* GBGBGBGB 0 */\
                        "punpcklbw %%mm7, %%mm5         \n\t" /* 0R0R0R0R 0 */\
                        "punpckhbw %%mm4, %%mm1         \n\t" /* GBGBGBGB 2 */\
                        "punpckhbw %%mm7, %%mm6         \n\t" /* 0R0R0R0R 2 */\
                        "movq %%mm2, %%mm0              \n\t" /* GBGBGBGB 0 */\
                        "movq %%mm1, %%mm3              \n\t" /* GBGBGBGB 2 */\
                        "punpcklwd %%mm5, %%mm0         \n\t" /* 0RGB0RGB 0 */\
                        "punpckhwd %%mm5, %%mm2         \n\t" /* 0RGB0RGB 1 */\
                        "punpcklwd %%mm6, %%mm1         \n\t" /* 0RGB0RGB 2 */\
                        "punpckhwd %%mm6, %%mm3         \n\t" /* 0RGB0RGB 3 */\
\
                        "movq %%mm0, %%mm4              \n\t" /* 0RGB0RGB 0 */\
                        "psrlq $8, %%mm0                \n\t" /* 00RGB0RG 0 */\
                        "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 0 */\
                        "pand "MANGLE(bm11111000)", %%mm0\n\t" /* 00RGB000 0.5 */\
                        "por %%mm4, %%mm0               \n\t" /* 00RGBRGB 0 */\
                        "movq %%mm2, %%mm4              \n\t" /* 0RGB0RGB 1 */\
                        "psllq $48, %%mm2               \n\t" /* GB000000 1 */\
                        "por %%mm2, %%mm0               \n\t" /* GBRGBRGB 0 */\
\
                        "movq %%mm4, %%mm2              \n\t" /* 0RGB0RGB 1 */\
                        "psrld $16, %%mm4               \n\t" /* 000R000R 1 */\
                        "psrlq $24, %%mm2               \n\t" /* 0000RGB0 1.5 */\
                        "por %%mm4, %%mm2               \n\t" /* 000RRGBR 1 */\
                        "pand "MANGLE(bm00001111)", %%mm2\n\t" /* 0000RGBR 1 */\
                        "movq %%mm1, %%mm4              \n\t" /* 0RGB0RGB 2 */\
                        "psrlq $8, %%mm1                \n\t" /* 00RGB0RG 2 */\
                        "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 2 */\
                        "pand "MANGLE(bm11111000)", %%mm1\n\t" /* 00RGB000 2.5 */\
                        "por %%mm4, %%mm1               \n\t" /* 00RGBRGB 2 */\
                        "movq %%mm1, %%mm4              \n\t" /* 00RGBRGB 2 */\
                        "psllq $32, %%mm1               \n\t" /* BRGB0000 2 */\
                        "por %%mm1, %%mm2               \n\t" /* BRGBRGBR 1 */\
\
                        "psrlq $32, %%mm4               \n\t" /* 000000RG 2.5 */\
                        "movq %%mm3, %%mm5              \n\t" /* 0RGB0RGB 3 */\
                        "psrlq $8, %%mm3                \n\t" /* 00RGB0RG 3 */\
                        "pand "MANGLE(bm00000111)", %%mm5\n\t" /* 00000RGB 3 */\
                        "pand "MANGLE(bm11111000)", %%mm3\n\t" /* 00RGB000 3.5 */\
                        "por %%mm5, %%mm3               \n\t" /* 00RGBRGB 3 */\
                        "psllq $16, %%mm3               \n\t" /* RGBRGB00 3 */\
                        "por %%mm4, %%mm3               \n\t" /* RGBRGBRG 2.5 */\
\
                        MOVNTQ(%%mm0, (dst))\
                        MOVNTQ(%%mm2, 8(dst))\
                        MOVNTQ(%%mm3, 16(dst))\
                        "add $24, "#dst"                \n\t"\
\
                        "add $8, "#index"               \n\t"\
                        "cmp "#dstw", "#index"          \n\t"\
                        " jb 1b                         \n\t"

#define WRITEBGR24MMX(dst, dstw, index) \
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
                        "movq %%mm2, %%mm1              \n\t" /* B */\
                        "movq %%mm5, %%mm6              \n\t" /* R */\
                        "punpcklbw %%mm4, %%mm2         \n\t" /* GBGBGBGB 0 */\
                        "punpcklbw %%mm7, %%mm5         \n\t" /* 0R0R0R0R 0 */\
                        "punpckhbw %%mm4, %%mm1         \n\t" /* GBGBGBGB 2 */\
                        "punpckhbw %%mm7, %%mm6         \n\t" /* 0R0R0R0R 2 */\
                        "movq %%mm2, %%mm0              \n\t" /* GBGBGBGB 0 */\
                        "movq %%mm1, %%mm3              \n\t" /* GBGBGBGB 2 */\
                        "punpcklwd %%mm5, %%mm0         \n\t" /* 0RGB0RGB 0 */\
                        "punpckhwd %%mm5, %%mm2         \n\t" /* 0RGB0RGB 1 */\
                        "punpcklwd %%mm6, %%mm1         \n\t" /* 0RGB0RGB 2 */\
                        "punpckhwd %%mm6, %%mm3         \n\t" /* 0RGB0RGB 3 */\
\
                        "movq %%mm0, %%mm4              \n\t" /* 0RGB0RGB 0 */\
                        "movq %%mm2, %%mm6              \n\t" /* 0RGB0RGB 1 */\
                        "movq %%mm1, %%mm5              \n\t" /* 0RGB0RGB 2 */\
                        "movq %%mm3, %%mm7              \n\t" /* 0RGB0RGB 3 */\
\
                        "psllq $40, %%mm0               \n\t" /* RGB00000 0 */\
                        "psllq $40, %%mm2               \n\t" /* RGB00000 1 */\
                        "psllq $40, %%mm1               \n\t" /* RGB00000 2 */\
                        "psllq $40, %%mm3               \n\t" /* RGB00000 3 */\
\
                        "punpckhdq %%mm4, %%mm0         \n\t" /* 0RGBRGB0 0 */\
                        "punpckhdq %%mm6, %%mm2         \n\t" /* 0RGBRGB0 1 */\
                        "punpckhdq %%mm5, %%mm1         \n\t" /* 0RGBRGB0 2 */\
                        "punpckhdq %%mm7, %%mm3         \n\t" /* 0RGBRGB0 3 */\
\
                        "psrlq $8, %%mm0                \n\t" /* 00RGBRGB 0 */\
                        "movq %%mm2, %%mm6              \n\t" /* 0RGBRGB0 1 */\
                        "psllq $40, %%mm2               \n\t" /* GB000000 1 */\
                        "por %%mm2, %%mm0               \n\t" /* GBRGBRGB 0 */\
                        MOVNTQ(%%mm0, (dst))\
\
                        "psrlq $24, %%mm6               \n\t" /* 0000RGBR 1 */\
                        "movq %%mm1, %%mm5              \n\t" /* 0RGBRGB0 2 */\
                        "psllq $24, %%mm1               \n\t" /* BRGB0000 2 */\
                        "por %%mm1, %%mm6               \n\t" /* BRGBRGBR 1 */\
                        MOVNTQ(%%mm6, 8(dst))\
\
                        "psrlq $40, %%mm5               \n\t" /* 000000RG 2 */\
                        "psllq $8, %%mm3                \n\t" /* RGBRGB00 3 */\
                        "por %%mm3, %%mm5               \n\t" /* RGBRGBRG 2 */\
                        MOVNTQ(%%mm5, 16(dst))\
\
                        "add $24, "#dst"                \n\t"\
\
                        "add $8, "#index"                       \n\t"\
                        "cmp "#dstw", "#index"                  \n\t"\
                        " jb 1b                         \n\t"

#define WRITEBGR24MMX2(dst, dstw, index) \
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
                        "movq "MANGLE(M24A)", %%mm0     \n\t"\
                        "movq "MANGLE(M24C)", %%mm7     \n\t"\
                        "pshufw $0x50, %%mm2, %%mm1     \n\t" /* B3 B2 B3 B2  B1 B0 B1 B0 */\
                        "pshufw $0x50, %%mm4, %%mm3     \n\t" /* G3 G2 G3 G2  G1 G0 G1 G0 */\
                        "pshufw $0x00, %%mm5, %%mm6     \n\t" /* R1 R0 R1 R0  R1 R0 R1 R0 */\
\
                        "pand %%mm0, %%mm1              \n\t" /*    B2        B1       B0 */\
                        "pand %%mm0, %%mm3              \n\t" /*    G2        G1       G0 */\
                        "pand %%mm7, %%mm6              \n\t" /*       R1        R0       */\
\
                        "psllq $8, %%mm3                \n\t" /* G2        G1       G0    */\
                        "por %%mm1, %%mm6               \n\t"\
                        "por %%mm3, %%mm6               \n\t"\
                        MOVNTQ(%%mm6, (dst))\
\
                        "psrlq $8, %%mm4                \n\t" /* 00 G7 G6 G5  G4 G3 G2 G1 */\
                        "pshufw $0xA5, %%mm2, %%mm1     \n\t" /* B5 B4 B5 B4  B3 B2 B3 B2 */\
                        "pshufw $0x55, %%mm4, %%mm3     \n\t" /* G4 G3 G4 G3  G4 G3 G4 G3 */\
                        "pshufw $0xA5, %%mm5, %%mm6     \n\t" /* R5 R4 R5 R4  R3 R2 R3 R2 */\
\
                        "pand "MANGLE(M24B)", %%mm1     \n\t" /* B5       B4        B3    */\
                        "pand %%mm7, %%mm3              \n\t" /*       G4        G3       */\
                        "pand %%mm0, %%mm6              \n\t" /*    R4        R3       R2 */\
\
                        "por %%mm1, %%mm3               \n\t" /* B5    G4 B4     G3 B3    */\
                        "por %%mm3, %%mm6               \n\t"\
                        MOVNTQ(%%mm6, 8(dst))\
\
                        "pshufw $0xFF, %%mm2, %%mm1     \n\t" /* B7 B6 B7 B6  B7 B6 B6 B7 */\
                        "pshufw $0xFA, %%mm4, %%mm3     \n\t" /* 00 G7 00 G7  G6 G5 G6 G5 */\
                        "pshufw $0xFA, %%mm5, %%mm6     \n\t" /* R7 R6 R7 R6  R5 R4 R5 R4 */\
\
                        "pand %%mm7, %%mm1              \n\t" /*       B7        B6       */\
                        "pand %%mm0, %%mm3              \n\t" /*    G7        G6       G5 */\
                        "pand "MANGLE(M24B)", %%mm6     \n\t" /* R7       R6        R5    */\
\
                        "por %%mm1, %%mm3               \n\t"\
                        "por %%mm3, %%mm6               \n\t"\
                        MOVNTQ(%%mm6, 16(dst))\
\
                        "add $24, "#dst"                \n\t"\
\
                        "add $8, "#index"               \n\t"\
                        "cmp "#dstw", "#index"          \n\t"\
                        " jb 1b                         \n\t"

#ifdef HAVE_MMX2
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX2(dst, dstw, index)
#else
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX(dst, dstw, index)
#endif

#define REAL_WRITEYUY2(dst, dstw, index) \
                        "packuswb %%mm3, %%mm3          \n\t"\
                        "packuswb %%mm4, %%mm4          \n\t"\
                        "packuswb %%mm7, %%mm1          \n\t"\
                        "punpcklbw %%mm4, %%mm3         \n\t"\
                        "movq %%mm1, %%mm7              \n\t"\
                        "punpcklbw %%mm3, %%mm1         \n\t"\
                        "punpckhbw %%mm3, %%mm7         \n\t"\
\
                        MOVNTQ(%%mm1, (dst, index, 2))\
                        MOVNTQ(%%mm7, 8(dst, index, 2))\
\
                        "add $8, "#index"               \n\t"\
                        "cmp "#dstw", "#index"          \n\t"\
                        " jb 1b                         \n\t"
#define WRITEYUY2(dst, dstw, index)  REAL_WRITEYUY2(dst, dstw, index)
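
/* Roughly what one YSCALEYUV2YV12X invocation computes per output sample
 * (sketch only; the rounding constant and the filter layout live in the
 * context, and clip_to_0_255 here just stands for the packuswb saturation):
 *
 *     int acc = rounder;
 *     for (j = 0; j < lumFilterSize; j++)
 *         acc += (lumFilter[j] * lumSrc[j][i]) >> 16;    // pmulhw
 *     dest[i] = clip_to_0_255(acc >> 3);                 // psraw $3, packuswb
 */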


static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                    int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW, int chrDstW)
{
#ifdef HAVE_MMX
        if(uDest != NULL)
        {
                asm volatile(
                                YSCALEYUV2YV12X(0, CHR_MMX_FILTER_OFFSET)
                                :: "r" (&c->redDither),
                                "r" (uDest), "p" ((long)chrDstW)
                                : "%"REG_a, "%"REG_d, "%"REG_S
                        );

                asm volatile(
                                YSCALEYUV2YV12X(4096, CHR_MMX_FILTER_OFFSET)
                                :: "r" (&c->redDither),
                                "r" (vDest), "p" ((long)chrDstW)
                                : "%"REG_a, "%"REG_d, "%"REG_S
                        );
        }

        asm volatile(
                        YSCALEYUV2YV12X(0, LUM_MMX_FILTER_OFFSET)
                        :: "r" (&c->redDither),
                           "r" (dest), "p" ((long)dstW)
                        : "%"REG_a, "%"REG_d, "%"REG_S
                );
#else
#ifdef HAVE_ALTIVEC
yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
                      chrFilter, chrSrc, chrFilterSize,
                      dest, uDest, vDest, dstW, chrDstW);
#else //HAVE_ALTIVEC
yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
            chrFilter, chrSrc, chrFilterSize,
            dest, uDest, vDest, dstW, chrDstW);
#endif //!HAVE_ALTIVEC
#endif
}

static inline void RENAME(yuv2nv12X)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                     int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                     uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat)
{
yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
             chrFilter, chrSrc, chrFilterSize,
             dest, uDest, dstW, chrDstW, dstFormat);
}
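
/* yuv2yuv1 is the unscaled (1-tap) path: the 16-bit intermediates are just
 * shifted down by 7 and clamped to 0..255, both in the MMX variant
 * (YSCALEYUV2YV121) and in the C fallback below. */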

static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW, int chrDstW)
{
#ifdef HAVE_MMX
        if(uDest != NULL)
        {
                asm volatile(
                                YSCALEYUV2YV121
                                :: "r" (chrSrc + chrDstW), "r" (uDest + chrDstW),
                                "g" ((long)-chrDstW)
                                : "%"REG_a
                        );

                asm volatile(
                                YSCALEYUV2YV121
                                :: "r" (chrSrc + 2048 + chrDstW), "r" (vDest + chrDstW),
                                "g" ((long)-chrDstW)
                                : "%"REG_a
                        );
        }

        asm volatile(
                YSCALEYUV2YV121
                :: "r" (lumSrc + dstW), "r" (dest + dstW),
                "g" ((long)-dstW)
                : "%"REG_a
        );
#else
        int i;
        for(i=0; i<dstW; i++)
        {
                int val= lumSrc[i]>>7;

                if(val&256){
                        if(val<0) val=0;
                        else      val=255;
                }

                dest[i]= val;
        }

        if(uDest != NULL)
                for(i=0; i<chrDstW; i++)
                {
                        int u=chrSrc[i]>>7;
                        int v=chrSrc[i + 2048]>>7;

                        if((u|v)&256){
                                if(u<0)         u=0;
                                else if (u>255) u=255;
                                if(v<0)         v=0;
                                else if (v>255) v=255;
                        }

                        uDest[i]= u;
                        vDest[i]= v;
                }
#endif
}


/**
 * vertical scale YV12 to RGB
 */
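/* In the asm statements below %0 is &c->redDither (the base address used by
 * the *_OFFSET loads in the macros), %4 is dest and %5 is dstW; the three
 * "m"(dummy) operands appear to be padding so that dest and dstW end up as
 * %4 and %5, which is how the WRITE* macros are invoked here. */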
static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                    int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                            uint8_t *dest, int dstW, int dstY)
{
        int dummy=0;
        switch(c->dstFormat)
        {
#ifdef HAVE_MMX
        case IMGFMT_BGR32:
                {
                        asm volatile(
                                YSCALEYUV2RGBX
                                WRITEBGR32(%4, %5, %%REGa)

                        :: "r" (&c->redDither),
                           "m" (dummy), "m" (dummy), "m" (dummy),
                           "r" (dest), "m" (dstW)
                        : "%"REG_a, "%"REG_d, "%"REG_S
                        );
                }
                break;
        case IMGFMT_BGR24:
                {
                        asm volatile(
                                YSCALEYUV2RGBX
                                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t" //FIXME optimize
                                "add %4, %%"REG_b"                      \n\t"
                                WRITEBGR24(%%REGb, %5, %%REGa)

                        :: "r" (&c->redDither),
                           "m" (dummy), "m" (dummy), "m" (dummy),
                           "r" (dest), "m" (dstW)
                        : "%"REG_a, "%"REG_b, "%"REG_d, "%"REG_S //FIXME ebx
                        );
                }
                break;
        case IMGFMT_BGR15:
                {
                        asm volatile(
                                YSCALEYUV2RGBX
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                                "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
                                "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
                                "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif

                                WRITEBGR15(%4, %5, %%REGa)

                        :: "r" (&c->redDither),
                           "m" (dummy), "m" (dummy), "m" (dummy),
                           "r" (dest), "m" (dstW)
                        : "%"REG_a, "%"REG_d, "%"REG_S
                        );
                }
                break;
        case IMGFMT_BGR16:
                {
                        asm volatile(
                                YSCALEYUV2RGBX
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                                "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
                                "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
                                "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif

                                WRITEBGR16(%4, %5, %%REGa)

                        :: "r" (&c->redDither),
                           "m" (dummy), "m" (dummy), "m" (dummy),
                           "r" (dest), "m" (dstW)
                        : "%"REG_a, "%"REG_d, "%"REG_S
                        );
                }
                break;
        case IMGFMT_YUY2:
                {
                        asm volatile(
                                YSCALEYUV2PACKEDX
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */

                                "psraw $3, %%mm3                \n\t"
                                "psraw $3, %%mm4                \n\t"
                                "psraw $3, %%mm1                \n\t"
                                "psraw $3, %%mm7                \n\t"
                                WRITEYUY2(%4, %5, %%REGa)

                        :: "r" (&c->redDither),
                           "m" (dummy), "m" (dummy), "m" (dummy),
                           "r" (dest), "m" (dstW)
                        : "%"REG_a, "%"REG_d, "%"REG_S
                        );
                }
                break;
#endif
        default:
#ifdef HAVE_ALTIVEC
                altivec_yuv2packedX (c, lumFilter, lumSrc, lumFilterSize,
                            chrFilter, chrSrc, chrFilterSize,
                            dest, dstW, dstY);
#else
                yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
                            chrFilter, chrSrc, chrFilterSize,
                            dest, dstW, dstY);
#endif
                break;
        }
}

982
 
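/* Illustrative sketch (not part of the original source): per output pixel the
   vertical filtering above reduces lumFilterSize luma lines and chrFilterSize
   chroma lines to one value each before the packed write, roughly like the C
   fallback yuv2packedXinC.  The 19-bit shift matches the normalization used by
   the bilinear paths later in this file; the MMX code rounds slightly
   differently. */
#if 0
	int Y=0, U=0, V=0, j;
	for(j=0; j<lumFilterSize; j++) Y += lumSrc[j][i]        * lumFilter[j];
	for(j=0; j<chrFilterSize; j++) U += chrSrc[j][i]        * chrFilter[j];
	for(j=0; j<chrFilterSize; j++) V += chrSrc[j][i + 2048] * chrFilter[j];
	Y>>=19; U>>=19; V>>=19; // then clipped and packed into dstFormat
#endif
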
/**
 * vertical bilinear scale YV12 to RGB
 */
static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
			    uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
{
	int yalpha1=yalpha^4095;
	int uvalpha1=uvalpha^4095;
	int i;

#if 0 //isn't used
	if(flags&SWS_FULL_CHR_H_INT)
	{
		switch(dstFormat)
		{
#ifdef HAVE_MMX
		case IMGFMT_BGR32:
			asm volatile(

FULL_YSCALEYUV2RGB
			"punpcklbw %%mm1, %%mm3\n\t" // BGBGBGBG
			"punpcklbw %%mm7, %%mm0\n\t" // R0R0R0R0

			"movq %%mm3, %%mm1\n\t"
			"punpcklwd %%mm0, %%mm3\n\t" // BGR0BGR0
			"punpckhwd %%mm0, %%mm1\n\t" // BGR0BGR0

			MOVNTQ(%%mm3, (%4, %%REGa, 4))
			MOVNTQ(%%mm1, 8(%4, %%REGa, 4))

			"add $4, %%"REG_a"\n\t"
			"cmp %5, %%"REG_a"\n\t"
			" jb 1b\n\t"

			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" ((long)dstW),
			"m" (yalpha1), "m" (uvalpha1)
			: "%"REG_a
			);
			break;
		case IMGFMT_BGR24:
			asm volatile(

FULL_YSCALEYUV2RGB

								// lsb ... msb
			"punpcklbw %%mm1, %%mm3\n\t" // BGBGBGBG
			"punpcklbw %%mm7, %%mm0\n\t" // R0R0R0R0

			"movq %%mm3, %%mm1\n\t"
			"punpcklwd %%mm0, %%mm3\n\t" // BGR0BGR0
			"punpckhwd %%mm0, %%mm1\n\t" // BGR0BGR0

			"movq %%mm3, %%mm2\n\t" // BGR0BGR0
			"psrlq $8, %%mm3\n\t" // GR0BGR00
			"pand "MANGLE(bm00000111)", %%mm2\n\t" // BGR00000
			"pand "MANGLE(bm11111000)", %%mm3\n\t" // 000BGR00
			"por %%mm2, %%mm3\n\t" // BGRBGR00
			"movq %%mm1, %%mm2\n\t"
			"psllq $48, %%mm1\n\t" // 000000BG
			"por %%mm1, %%mm3\n\t" // BGRBGRBG

			"movq %%mm2, %%mm1\n\t" // BGR0BGR0
			"psrld $16, %%mm2\n\t" // R000R000
			"psrlq $24, %%mm1\n\t" // 0BGR0000
			"por %%mm2, %%mm1\n\t" // RBGRR000

			"mov %4, %%"REG_b"\n\t"
			"add %%"REG_a", %%"REG_b"\n\t"

#ifdef HAVE_MMX2
			//FIXME Alignment
			"movntq %%mm3, (%%"REG_b", %%"REG_a", 2)\n\t"
			"movntq %%mm1, 8(%%"REG_b", %%"REG_a", 2)\n\t"
#else
			"movd %%mm3, (%%"REG_b", %%"REG_a", 2)\n\t"
			"psrlq $32, %%mm3\n\t"
			"movd %%mm3, 4(%%"REG_b", %%"REG_a", 2)\n\t"
			"movd %%mm1, 8(%%"REG_b", %%"REG_a", 2)\n\t"
#endif
			"add $4, %%"REG_a"\n\t"
			"cmp %5, %%"REG_a"\n\t"
			" jb 1b\n\t"

			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
			"m" (yalpha1), "m" (uvalpha1)
			: "%"REG_a, "%"REG_b
			);
			break;
		case IMGFMT_BGR15:
			asm volatile(

FULL_YSCALEYUV2RGB
#ifdef DITHER1XBPP
			"paddusb "MANGLE(g5Dither)", %%mm1\n\t"
			"paddusb "MANGLE(r5Dither)", %%mm0\n\t"
			"paddusb "MANGLE(b5Dither)", %%mm3\n\t"
#endif
			"punpcklbw %%mm7, %%mm1\n\t" // 0G0G0G0G
			"punpcklbw %%mm7, %%mm3\n\t" // 0B0B0B0B
			"punpcklbw %%mm7, %%mm0\n\t" // 0R0R0R0R

			"psrlw $3, %%mm3\n\t"
			"psllw $2, %%mm1\n\t"
			"psllw $7, %%mm0\n\t"
			"pand "MANGLE(g15Mask)", %%mm1\n\t"
			"pand "MANGLE(r15Mask)", %%mm0\n\t"

			"por %%mm3, %%mm1\n\t"
			"por %%mm1, %%mm0\n\t"

			MOVNTQ(%%mm0, (%4, %%REGa, 2))

			"add $4, %%"REG_a"\n\t"
			"cmp %5, %%"REG_a"\n\t"
			" jb 1b\n\t"

			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
			"m" (yalpha1), "m" (uvalpha1)
			: "%"REG_a
			);
			break;
		case IMGFMT_BGR16:
			asm volatile(

FULL_YSCALEYUV2RGB
#ifdef DITHER1XBPP
			"paddusb "MANGLE(g6Dither)", %%mm1\n\t"
			"paddusb "MANGLE(r5Dither)", %%mm0\n\t"
			"paddusb "MANGLE(b5Dither)", %%mm3\n\t"
#endif
			"punpcklbw %%mm7, %%mm1\n\t" // 0G0G0G0G
			"punpcklbw %%mm7, %%mm3\n\t" // 0B0B0B0B
			"punpcklbw %%mm7, %%mm0\n\t" // 0R0R0R0R

			"psrlw $3, %%mm3\n\t"
			"psllw $3, %%mm1\n\t"
			"psllw $8, %%mm0\n\t"
			"pand "MANGLE(g16Mask)", %%mm1\n\t"
			"pand "MANGLE(r16Mask)", %%mm0\n\t"

			"por %%mm3, %%mm1\n\t"
			"por %%mm1, %%mm0\n\t"

			MOVNTQ(%%mm0, (%4, %%REGa, 2))

			"add $4, %%"REG_a"\n\t"
			"cmp %5, %%"REG_a"\n\t"
			" jb 1b\n\t"

			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
			"m" (yalpha1), "m" (uvalpha1)
			: "%"REG_a
			);
		break;
#endif
		case IMGFMT_RGB32:
#ifndef HAVE_MMX
		case IMGFMT_BGR32:
#endif
		if(dstFormat==IMGFMT_BGR32)
		{
			int i;
#ifdef WORDS_BIGENDIAN
			dest++;
#endif
			for(i=0;i<dstW;i++){
				// vertical linear interpolation && yuv2rgb in a single step:
				int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
				int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
				int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
				dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
				dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
				dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
				dest+= 4;
			}
		}
		else if(dstFormat==IMGFMT_BGR24)
		{
			int i;
			for(i=0;i<dstW;i++){
				// vertical linear interpolation && yuv2rgb in a single step:
				int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
				int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
				int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
				dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
				dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
				dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
				dest+= 3;
			}
		}
		else if(dstFormat==IMGFMT_BGR16)
		{
			int i;
			for(i=0;i<dstW;i++){
				// vertical linear interpolation && yuv2rgb in a single step:
				int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
				int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
				int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);

				((uint16_t*)dest)[i] =
					clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
					clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
					clip_table16r[(Y + yuvtab_3343[V]) >>13];
			}
		}
		else if(dstFormat==IMGFMT_BGR15)
		{
			int i;
			for(i=0;i<dstW;i++){
				// vertical linear interpolation && yuv2rgb in a single step:
				int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
				int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
				int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);

				((uint16_t*)dest)[i] =
					clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
					clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
					clip_table15r[(Y + yuvtab_3343[V]) >>13];
			}
		}
	}//FULL_UV_IPOL
	else
	{
#endif // if 0
#ifdef HAVE_MMX
	switch(c->dstFormat)
	{
//Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
	case IMGFMT_BGR32:
			asm volatile(
				"mov %%"REG_SP", "ESP_OFFSET"(%5)\n\t"
				"mov %4, %%"REG_SP"\n\t"
				YSCALEYUV2RGB(%%REGa, %5)
				WRITEBGR32(%%REGSP, 8280(%5), %%REGa)
				"mov "ESP_OFFSET"(%5), %%"REG_SP"\n\t"

			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
			"r" (&c->redDither)
			: "%"REG_a
			);
			return;
	case IMGFMT_BGR24:
			asm volatile(
				"mov %%"REG_SP", "ESP_OFFSET"(%5)\n\t"
				"mov %4, %%"REG_SP"\n\t"
				YSCALEYUV2RGB(%%REGa, %5)
				WRITEBGR24(%%REGSP, 8280(%5), %%REGa)
				"mov "ESP_OFFSET"(%5), %%"REG_SP"\n\t"
			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
			"r" (&c->redDither)
			: "%"REG_a
			);
			return;
	case IMGFMT_BGR15:
			asm volatile(
				"mov %%"REG_SP", "ESP_OFFSET"(%5)\n\t"
				"mov %4, %%"REG_SP"\n\t"
				YSCALEYUV2RGB(%%REGa, %5)
		/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
				"paddusb "MANGLE(b5Dither)", %%mm2\n\t"
				"paddusb "MANGLE(g5Dither)", %%mm4\n\t"
				"paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif

				WRITEBGR15(%%REGSP, 8280(%5), %%REGa)
				"mov "ESP_OFFSET"(%5), %%"REG_SP"\n\t"

			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
			"r" (&c->redDither)
			: "%"REG_a
			);
			return;
	case IMGFMT_BGR16:
			asm volatile(
				"mov %%"REG_SP", "ESP_OFFSET"(%5)\n\t"
				"mov %4, %%"REG_SP"\n\t"
				YSCALEYUV2RGB(%%REGa, %5)
		/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
				"paddusb "MANGLE(b5Dither)", %%mm2\n\t"
				"paddusb "MANGLE(g6Dither)", %%mm4\n\t"
				"paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif

				WRITEBGR16(%%REGSP, 8280(%5), %%REGa)
				"mov "ESP_OFFSET"(%5), %%"REG_SP"\n\t"
			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
			"r" (&c->redDither)
			: "%"REG_a
			);
			return;
	case IMGFMT_YUY2:
			asm volatile(
				"mov %%"REG_SP", "ESP_OFFSET"(%5)\n\t"
				"mov %4, %%"REG_SP"\n\t"
				YSCALEYUV2PACKED(%%REGa, %5)
				WRITEYUY2(%%REGSP, 8280(%5), %%REGa)
				"mov "ESP_OFFSET"(%5), %%"REG_SP"\n\t"
			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
			"r" (&c->redDither)
			: "%"REG_a
			);
			return;
	default: break;
	}
#endif //HAVE_MMX
YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C)
}

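/* Illustrative sketch (not part of the original source): the vertical bilinear
   blend performed by yuv2packed2() above, as it also appears in the disabled C
   code inside the function.  yalpha1/uvalpha1 are the complementary weights
   (alpha^4095), and the products of the 16-bit intermediate samples with the
   12-bit weights are brought back to 8-bit range with a 19-bit shift. */
#if 0
	int Y = (buf0[i]       *yalpha1  + buf1[i]       *yalpha ) >> 19;
	int U = (uvbuf0[i]     *uvalpha1 + uvbuf1[i]     *uvalpha) >> 19;
	int V = (uvbuf0[i+2048]*uvalpha1 + uvbuf1[i+2048]*uvalpha) >> 19;
#endif
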
/**
 * YV12 to RGB without scaling or interpolating
 */
static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
			    uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
{
	const int yalpha1=0;
	int i;

	uint16_t *buf1= buf0; //FIXME needed for the rgb1/bgr1
	const int yalpha= 4096; //FIXME ...

	if(flags&SWS_FULL_CHR_H_INT)
	{
		RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y);
		return;
	}

#ifdef HAVE_MMX
	if( uvalpha < 2048 ) // note this is not correct (shifts chrominance by 0.5 pixels) but it's a bit faster
	{
		switch(dstFormat)
		{
		case IMGFMT_BGR32:
			asm volatile(
				"mov %%"REG_SP", "ESP_OFFSET"(%5)\n\t"
				"mov %4, %%"REG_SP"\n\t"
				YSCALEYUV2RGB1(%%REGa, %5)
				WRITEBGR32(%%REGSP, 8280(%5), %%REGa)
				"mov "ESP_OFFSET"(%5), %%"REG_SP"\n\t"

			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
			"r" (&c->redDither)
			: "%"REG_a
			);
			return;
		case IMGFMT_BGR24:
			asm volatile(
				"mov %%"REG_SP", "ESP_OFFSET"(%5)\n\t"
				"mov %4, %%"REG_SP"\n\t"
				YSCALEYUV2RGB1(%%REGa, %5)
				WRITEBGR24(%%REGSP, 8280(%5), %%REGa)
				"mov "ESP_OFFSET"(%5), %%"REG_SP"\n\t"

			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
			"r" (&c->redDither)
			: "%"REG_a
			);
			return;
		case IMGFMT_BGR15:
			asm volatile(
				"mov %%"REG_SP", "ESP_OFFSET"(%5)\n\t"
				"mov %4, %%"REG_SP"\n\t"
				YSCALEYUV2RGB1(%%REGa, %5)
		/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
				"paddusb "MANGLE(b5Dither)", %%mm2\n\t"
				"paddusb "MANGLE(g5Dither)", %%mm4\n\t"
				"paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif
				WRITEBGR15(%%REGSP, 8280(%5), %%REGa)
				"mov "ESP_OFFSET"(%5), %%"REG_SP"\n\t"

			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
			"r" (&c->redDither)
			: "%"REG_a
			);
			return;
		case IMGFMT_BGR16:
			asm volatile(
				"mov %%"REG_SP", "ESP_OFFSET"(%5)\n\t"
				"mov %4, %%"REG_SP"\n\t"
				YSCALEYUV2RGB1(%%REGa, %5)
		/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
				"paddusb "MANGLE(b5Dither)", %%mm2\n\t"
				"paddusb "MANGLE(g6Dither)", %%mm4\n\t"
				"paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif

				WRITEBGR16(%%REGSP, 8280(%5), %%REGa)
				"mov "ESP_OFFSET"(%5), %%"REG_SP"\n\t"

			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
			"r" (&c->redDither)
			: "%"REG_a
			);
			return;
		case IMGFMT_YUY2:
			asm volatile(
				"mov %%"REG_SP", "ESP_OFFSET"(%5)\n\t"
				"mov %4, %%"REG_SP"\n\t"
				YSCALEYUV2PACKED1(%%REGa, %5)
				WRITEYUY2(%%REGSP, 8280(%5), %%REGa)
				"mov "ESP_OFFSET"(%5), %%"REG_SP"\n\t"

			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
			"r" (&c->redDither)
			: "%"REG_a
			);
			return;
		}
	}
	else
	{
		switch(dstFormat)
		{
		case IMGFMT_BGR32:
			asm volatile(
				"mov %%"REG_SP", "ESP_OFFSET"(%5)\n\t"
				"mov %4, %%"REG_SP"\n\t"
				YSCALEYUV2RGB1b(%%REGa, %5)
				WRITEBGR32(%%REGSP, 8280(%5), %%REGa)
				"mov "ESP_OFFSET"(%5), %%"REG_SP"\n\t"

			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
			"r" (&c->redDither)
			: "%"REG_a
			);
			return;
		case IMGFMT_BGR24:
			asm volatile(
				"mov %%"REG_SP", "ESP_OFFSET"(%5)\n\t"
				"mov %4, %%"REG_SP"\n\t"
				YSCALEYUV2RGB1b(%%REGa, %5)
				WRITEBGR24(%%REGSP, 8280(%5), %%REGa)
				"mov "ESP_OFFSET"(%5), %%"REG_SP"\n\t"

			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
			"r" (&c->redDither)
			: "%"REG_a
			);
			return;
		case IMGFMT_BGR15:
			asm volatile(
				"mov %%"REG_SP", "ESP_OFFSET"(%5)\n\t"
				"mov %4, %%"REG_SP"\n\t"
				YSCALEYUV2RGB1b(%%REGa, %5)
		/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
				"paddusb "MANGLE(b5Dither)", %%mm2\n\t"
				"paddusb "MANGLE(g5Dither)", %%mm4\n\t"
				"paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif
				WRITEBGR15(%%REGSP, 8280(%5), %%REGa)
				"mov "ESP_OFFSET"(%5), %%"REG_SP"\n\t"

			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
			"r" (&c->redDither)
			: "%"REG_a
			);
			return;
		case IMGFMT_BGR16:
			asm volatile(
				"mov %%"REG_SP", "ESP_OFFSET"(%5)\n\t"
				"mov %4, %%"REG_SP"\n\t"
				YSCALEYUV2RGB1b(%%REGa, %5)
		/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
				"paddusb "MANGLE(b5Dither)", %%mm2\n\t"
				"paddusb "MANGLE(g6Dither)", %%mm4\n\t"
				"paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif

				WRITEBGR16(%%REGSP, 8280(%5), %%REGa)
				"mov "ESP_OFFSET"(%5), %%"REG_SP"\n\t"

			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
			"r" (&c->redDither)
			: "%"REG_a
			);
			return;
		case IMGFMT_YUY2:
			asm volatile(
				"mov %%"REG_SP", "ESP_OFFSET"(%5)\n\t"
				"mov %4, %%"REG_SP"\n\t"
				YSCALEYUV2PACKED1b(%%REGa, %5)
				WRITEYUY2(%%REGSP, 8280(%5), %%REGa)
				"mov "ESP_OFFSET"(%5), %%"REG_SP"\n\t"

			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
			"r" (&c->redDither)
			: "%"REG_a
			);
			return;
		}
	}
#endif
	if( uvalpha < 2048 )
	{
		YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C)
	}else{
		YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C)
	}
}

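/* Illustrative note (not part of the original source): yuv2packed1() treats
   uvalpha < 2048 as "use the nearer chroma line only" (the YSCALEYUV2*1
   variants read just uvbuf0, which is what the 0.5-pixel-shift comment above
   refers to), while larger uvalpha falls through to the *1b variants that
   average uvbuf0 and uvbuf1.  A rough scalar equivalent of the selection,
   ignoring the exact rescaling of the 16-bit intermediates: */
#if 0
	int U, V;
	if(uvalpha < 2048){ U= uvbuf0[i];                V= uvbuf0[i+2048]; }
	else              { U= (uvbuf0[i]+uvbuf1[i])/2;  V= (uvbuf0[i+2048]+uvbuf1[i+2048])/2; }
#endif
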
//FIXME yuy2* can read up to 7 samples too many

static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, int width)
{
#ifdef HAVE_MMX
	asm volatile(
		"movq "MANGLE(bm01010101)", %%mm2\n\t"
		"mov %0, %%"REG_a"\n\t"
		"1:\n\t"
		"movq (%1, %%"REG_a",2), %%mm0\n\t"
		"movq 8(%1, %%"REG_a",2), %%mm1\n\t"
		"pand %%mm2, %%mm0\n\t"
		"pand %%mm2, %%mm1\n\t"
		"packuswb %%mm1, %%mm0\n\t"
		"movq %%mm0, (%2, %%"REG_a")\n\t"
		"add $8, %%"REG_a"\n\t"
		" js 1b\n\t"
		: : "g" ((long)-width), "r" (src+width*2), "r" (dst+width)
		: "%"REG_a
	);
#else
	int i;
	for(i=0; i<width; i++)
		dst[i]= src[2*i];
#endif
}

static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
	asm volatile(
		"movq "MANGLE(bm01010101)", %%mm4\n\t"
		"mov %0, %%"REG_a"\n\t"
		"1:\n\t"
		"movq (%1, %%"REG_a",4), %%mm0\n\t"
		"movq 8(%1, %%"REG_a",4), %%mm1\n\t"
		"movq (%2, %%"REG_a",4), %%mm2\n\t"
		"movq 8(%2, %%"REG_a",4), %%mm3\n\t"
		PAVGB(%%mm2, %%mm0)
		PAVGB(%%mm3, %%mm1)
		"psrlw $8, %%mm0\n\t"
		"psrlw $8, %%mm1\n\t"
		"packuswb %%mm1, %%mm0\n\t"
		"movq %%mm0, %%mm1\n\t"
		"psrlw $8, %%mm0\n\t"
		"pand %%mm4, %%mm1\n\t"
		"packuswb %%mm0, %%mm0\n\t"
		"packuswb %%mm1, %%mm1\n\t"
		"movd %%mm0, (%4, %%"REG_a")\n\t"
		"movd %%mm1, (%3, %%"REG_a")\n\t"
		"add $4, %%"REG_a"\n\t"
		" js 1b\n\t"
		: : "g" ((long)-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width)
		: "%"REG_a
	);
#else
	int i;
	for(i=0; i<width; i++)
	{
		dstU[i]= (src1[4*i + 1] + src2[4*i + 1])>>1;
		dstV[i]= (src1[4*i + 3] + src2[4*i + 3])>>1;
	}
#endif
}

//this is almost identical to the previous one, and exists only because yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses
static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, int width)
{
#ifdef HAVE_MMX
	asm volatile(
		"mov %0, %%"REG_a"\n\t"
		"1:\n\t"
		"movq (%1, %%"REG_a",2), %%mm0\n\t"
		"movq 8(%1, %%"REG_a",2), %%mm1\n\t"
		"psrlw $8, %%mm0\n\t"
		"psrlw $8, %%mm1\n\t"
		"packuswb %%mm1, %%mm0\n\t"
		"movq %%mm0, (%2, %%"REG_a")\n\t"
		"add $8, %%"REG_a"\n\t"
		" js 1b\n\t"
		: : "g" ((long)-width), "r" (src+width*2), "r" (dst+width)
		: "%"REG_a
	);
#else
	int i;
	for(i=0; i<width; i++)
		dst[i]= src[2*i+1];
#endif
}

static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
	asm volatile(
		"movq "MANGLE(bm01010101)", %%mm4\n\t"
		"mov %0, %%"REG_a"\n\t"
		"1:\n\t"
		"movq (%1, %%"REG_a",4), %%mm0\n\t"
		"movq 8(%1, %%"REG_a",4), %%mm1\n\t"
		"movq (%2, %%"REG_a",4), %%mm2\n\t"
		"movq 8(%2, %%"REG_a",4), %%mm3\n\t"
		PAVGB(%%mm2, %%mm0)
		PAVGB(%%mm3, %%mm1)
		"pand %%mm4, %%mm0\n\t"
		"pand %%mm4, %%mm1\n\t"
		"packuswb %%mm1, %%mm0\n\t"
		"movq %%mm0, %%mm1\n\t"
		"psrlw $8, %%mm0\n\t"
		"pand %%mm4, %%mm1\n\t"
		"packuswb %%mm0, %%mm0\n\t"
		"packuswb %%mm1, %%mm1\n\t"
		"movd %%mm0, (%4, %%"REG_a")\n\t"
		"movd %%mm1, (%3, %%"REG_a")\n\t"
		"add $4, %%"REG_a"\n\t"
		" js 1b\n\t"
		: : "g" ((long)-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width)
		: "%"REG_a
	);
#else
	int i;
	for(i=0; i<width; i++)
	{
		dstU[i]= (src1[4*i + 0] + src2[4*i + 0])>>1;
		dstV[i]= (src1[4*i + 2] + src2[4*i + 2])>>1;
	}
#endif
}

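/* Illustrative note (not part of the original source): the index arithmetic in
   the C fallbacks above encodes the packed 4:2:2 byte order.  For one pair of
   pixels the layouts are:
       YUY2: Y0 U0 Y1 V0  -> luma at src[2*i],   U at src[4*i+1], V at src[4*i+3]
       UYVY: U0 Y0 V0 Y1  -> luma at src[2*i+1], U at src[4*i+0], V at src[4*i+2]
   e.g. for the YUY2 bytes {0x10,0x80,0x20,0x90}, yuy2ToY writes 0x10,0x20 and
   yuy2ToUV writes U=0x80, V=0x90.  This is also why the MMX versions differ
   only in using psrlw $8 versus pand with bm01010101 to pick the lanes. */
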
static inline void RENAME(bgr32ToY)(uint8_t *dst, uint8_t *src, int width)
{
	int i;
	for(i=0; i<width; i++)
	{
		int b=  ((uint32_t*)src)[i]&0xFF;
		int g= (((uint32_t*)src)[i]>>8)&0xFF;
		int r= (((uint32_t*)src)[i]>>16)&0xFF;

		dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
	}
}

static inline void RENAME(bgr32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
	int i;
	for(i=0; i<width; i++)
	{
		const int a= ((uint32_t*)src1)[2*i+0];
		const int e= ((uint32_t*)src1)[2*i+1];
		const int c= ((uint32_t*)src2)[2*i+0];
		const int d= ((uint32_t*)src2)[2*i+1];
		const int l= (a&0xFF00FF) + (e&0xFF00FF) + (c&0xFF00FF) + (d&0xFF00FF);
		const int h= (a&0x00FF00) + (e&0x00FF00) + (c&0x00FF00) + (d&0x00FF00);
		const int b=  l&0x3FF;
		const int g=  h>>8;
		const int r=  l>>16;

		dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
		dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
	}
}

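/* Illustrative note (not part of the original source): bgr32ToUV() above sums
   four neighbouring pixels (a 2x2 block, for the subsampled chroma plane)
   without unpacking them.  The 0xFF00FF mask keeps the B and R bytes of each
   pixel in separate lanes of `l`, and 0x00FF00 keeps G in `h`, so four 8-bit
   values can be added per lane without overflow (4*255 = 1020 fits in the
   10 bits extracted with &0x3FF).  E.g. four pixels with B=0x10 give
   l&0x3FF == 0x40, the sum of the blue channels; the final
   >>(RGB2YUV_SHIFT+2) folds the division by 4 into the coefficient shift. */
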
static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, int width)
{
#ifdef HAVE_MMX
	asm volatile(
		"mov %2, %%"REG_a"\n\t"
		"movq "MANGLE(bgr2YCoeff)", %%mm6\n\t"
		"movq "MANGLE(w1111)", %%mm5\n\t"
		"pxor %%mm7, %%mm7\n\t"
		"lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t"
		".balign 16\n\t"
		"1:\n\t"
		PREFETCH" 64(%0, %%"REG_b")\n\t"
		"movd (%0, %%"REG_b"), %%mm0\n\t"
		"movd 3(%0, %%"REG_b"), %%mm1\n\t"
		"punpcklbw %%mm7, %%mm0\n\t"
		"punpcklbw %%mm7, %%mm1\n\t"
		"movd 6(%0, %%"REG_b"), %%mm2\n\t"
		"movd 9(%0, %%"REG_b"), %%mm3\n\t"
		"punpcklbw %%mm7, %%mm2\n\t"
		"punpcklbw %%mm7, %%mm3\n\t"
		"pmaddwd %%mm6, %%mm0\n\t"
		"pmaddwd %%mm6, %%mm1\n\t"
		"pmaddwd %%mm6, %%mm2\n\t"
		"pmaddwd %%mm6, %%mm3\n\t"
#ifndef FAST_BGR2YV12
		"psrad $8, %%mm0\n\t"
		"psrad $8, %%mm1\n\t"
		"psrad $8, %%mm2\n\t"
		"psrad $8, %%mm3\n\t"
#endif
		"packssdw %%mm1, %%mm0\n\t"
		"packssdw %%mm3, %%mm2\n\t"
		"pmaddwd %%mm5, %%mm0\n\t"
		"pmaddwd %%mm5, %%mm2\n\t"
		"packssdw %%mm2, %%mm0\n\t"
		"psraw $7, %%mm0\n\t"

		"movd 12(%0, %%"REG_b"), %%mm4\n\t"
		"movd 15(%0, %%"REG_b"), %%mm1\n\t"
		"punpcklbw %%mm7, %%mm4\n\t"
		"punpcklbw %%mm7, %%mm1\n\t"
		"movd 18(%0, %%"REG_b"), %%mm2\n\t"
		"movd 21(%0, %%"REG_b"), %%mm3\n\t"
		"punpcklbw %%mm7, %%mm2\n\t"
		"punpcklbw %%mm7, %%mm3\n\t"
		"pmaddwd %%mm6, %%mm4\n\t"
		"pmaddwd %%mm6, %%mm1\n\t"
		"pmaddwd %%mm6, %%mm2\n\t"
		"pmaddwd %%mm6, %%mm3\n\t"
#ifndef FAST_BGR2YV12
		"psrad $8, %%mm4\n\t"
		"psrad $8, %%mm1\n\t"
		"psrad $8, %%mm2\n\t"
		"psrad $8, %%mm3\n\t"
#endif
		"packssdw %%mm1, %%mm4\n\t"
		"packssdw %%mm3, %%mm2\n\t"
		"pmaddwd %%mm5, %%mm4\n\t"
		"pmaddwd %%mm5, %%mm2\n\t"
		"add $24, %%"REG_b"\n\t"
		"packssdw %%mm2, %%mm4\n\t"
		"psraw $7, %%mm4\n\t"

		"packuswb %%mm4, %%mm0\n\t"
		"paddusb "MANGLE(bgr2YOffset)", %%mm0\n\t"

		"movq %%mm0, (%1, %%"REG_a")\n\t"
		"add $8, %%"REG_a"\n\t"
		" js 1b\n\t"
		: : "r" (src+width*3), "r" (dst+width), "g" ((long)-width)
		: "%"REG_a, "%"REG_b
	);
#else
	int i;
	for(i=0; i<width; i++)
	{
		int b= src[i*3+0];
		int g= src[i*3+1];
		int r= src[i*3+2];

		dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
	}
#endif
}

static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
#ifdef HAVE_MMX
	asm volatile(
		"mov %4, %%"REG_a"\n\t"
		"movq "MANGLE(w1111)", %%mm5\n\t"
		"movq "MANGLE(bgr2UCoeff)", %%mm6\n\t"
		"pxor %%mm7, %%mm7\n\t"
		"lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t"
		"add %%"REG_b", %%"REG_b"\n\t"
		".balign 16\n\t"
		"1:\n\t"
		PREFETCH" 64(%0, %%"REG_b")\n\t"
		PREFETCH" 64(%1, %%"REG_b")\n\t"
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
		"movq (%0, %%"REG_b"), %%mm0\n\t"
		"movq (%1, %%"REG_b"), %%mm1\n\t"
		"movq 6(%0, %%"REG_b"), %%mm2\n\t"
		"movq 6(%1, %%"REG_b"), %%mm3\n\t"
		PAVGB(%%mm1, %%mm0)
		PAVGB(%%mm3, %%mm2)
		"movq %%mm0, %%mm1\n\t"
		"movq %%mm2, %%mm3\n\t"
		"psrlq $24, %%mm0\n\t"
		"psrlq $24, %%mm2\n\t"
		PAVGB(%%mm1, %%mm0)
		PAVGB(%%mm3, %%mm2)
		"punpcklbw %%mm7, %%mm0\n\t"
		"punpcklbw %%mm7, %%mm2\n\t"
#else
		"movd (%0, %%"REG_b"), %%mm0\n\t"
		"movd (%1, %%"REG_b"), %%mm1\n\t"
		"movd 3(%0, %%"REG_b"), %%mm2\n\t"
		"movd 3(%1, %%"REG_b"), %%mm3\n\t"
		"punpcklbw %%mm7, %%mm0\n\t"
		"punpcklbw %%mm7, %%mm1\n\t"
		"punpcklbw %%mm7, %%mm2\n\t"
		"punpcklbw %%mm7, %%mm3\n\t"
		"paddw %%mm1, %%mm0\n\t"
		"paddw %%mm3, %%mm2\n\t"
		"paddw %%mm2, %%mm0\n\t"
		"movd 6(%0, %%"REG_b"), %%mm4\n\t"
		"movd 6(%1, %%"REG_b"), %%mm1\n\t"
		"movd 9(%0, %%"REG_b"), %%mm2\n\t"
		"movd 9(%1, %%"REG_b"), %%mm3\n\t"
		"punpcklbw %%mm7, %%mm4\n\t"
		"punpcklbw %%mm7, %%mm1\n\t"
		"punpcklbw %%mm7, %%mm2\n\t"
		"punpcklbw %%mm7, %%mm3\n\t"
		"paddw %%mm1, %%mm4\n\t"
		"paddw %%mm3, %%mm2\n\t"
		"paddw %%mm4, %%mm2\n\t"
		"psrlw $2, %%mm0\n\t"
		"psrlw $2, %%mm2\n\t"
#endif
		"movq "MANGLE(bgr2VCoeff)", %%mm1\n\t"
		"movq "MANGLE(bgr2VCoeff)", %%mm3\n\t"

		"pmaddwd %%mm0, %%mm1\n\t"
		"pmaddwd %%mm2, %%mm3\n\t"
		"pmaddwd %%mm6, %%mm0\n\t"
		"pmaddwd %%mm6, %%mm2\n\t"
#ifndef FAST_BGR2YV12
		"psrad $8, %%mm0\n\t"
		"psrad $8, %%mm1\n\t"
		"psrad $8, %%mm2\n\t"
		"psrad $8, %%mm3\n\t"
#endif
		"packssdw %%mm2, %%mm0\n\t"
		"packssdw %%mm3, %%mm1\n\t"
		"pmaddwd %%mm5, %%mm0\n\t"
		"pmaddwd %%mm5, %%mm1\n\t"
		"packssdw %%mm1, %%mm0\n\t" // V1 V0 U1 U0
		"psraw $7, %%mm0\n\t"

#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
		"movq 12(%0, %%"REG_b"), %%mm4\n\t"
		"movq 12(%1, %%"REG_b"), %%mm1\n\t"
		"movq 18(%0, %%"REG_b"), %%mm2\n\t"
		"movq 18(%1, %%"REG_b"), %%mm3\n\t"
		PAVGB(%%mm1, %%mm4)
		PAVGB(%%mm3, %%mm2)
		"movq %%mm4, %%mm1\n\t"
		"movq %%mm2, %%mm3\n\t"
		"psrlq $24, %%mm4\n\t"
		"psrlq $24, %%mm2\n\t"
		PAVGB(%%mm1, %%mm4)
		PAVGB(%%mm3, %%mm2)
		"punpcklbw %%mm7, %%mm4\n\t"
		"punpcklbw %%mm7, %%mm2\n\t"
#else
		"movd 12(%0, %%"REG_b"), %%mm4\n\t"
		"movd 12(%1, %%"REG_b"), %%mm1\n\t"
		"movd 15(%0, %%"REG_b"), %%mm2\n\t"
		"movd 15(%1, %%"REG_b"), %%mm3\n\t"
		"punpcklbw %%mm7, %%mm4\n\t"
		"punpcklbw %%mm7, %%mm1\n\t"
		"punpcklbw %%mm7, %%mm2\n\t"
		"punpcklbw %%mm7, %%mm3\n\t"
		"paddw %%mm1, %%mm4\n\t"
		"paddw %%mm3, %%mm2\n\t"
		"paddw %%mm2, %%mm4\n\t"
		"movd 18(%0, %%"REG_b"), %%mm5\n\t"
		"movd 18(%1, %%"REG_b"), %%mm1\n\t"
		"movd 21(%0, %%"REG_b"), %%mm2\n\t"
		"movd 21(%1, %%"REG_b"), %%mm3\n\t"
		"punpcklbw %%mm7, %%mm5\n\t"
		"punpcklbw %%mm7, %%mm1\n\t"
		"punpcklbw %%mm7, %%mm2\n\t"
		"punpcklbw %%mm7, %%mm3\n\t"
		"paddw %%mm1, %%mm5\n\t"
		"paddw %%mm3, %%mm2\n\t"
		"paddw %%mm5, %%mm2\n\t"
		"movq "MANGLE(w1111)", %%mm5\n\t"
		"psrlw $2, %%mm4\n\t"
		"psrlw $2, %%mm2\n\t"
#endif
		"movq "MANGLE(bgr2VCoeff)", %%mm1\n\t"
		"movq "MANGLE(bgr2VCoeff)", %%mm3\n\t"

		"pmaddwd %%mm4, %%mm1\n\t"
		"pmaddwd %%mm2, %%mm3\n\t"
		"pmaddwd %%mm6, %%mm4\n\t"
		"pmaddwd %%mm6, %%mm2\n\t"
#ifndef FAST_BGR2YV12
		"psrad $8, %%mm4\n\t"
		"psrad $8, %%mm1\n\t"
		"psrad $8, %%mm2\n\t"
		"psrad $8, %%mm3\n\t"
#endif
		"packssdw %%mm2, %%mm4\n\t"
		"packssdw %%mm3, %%mm1\n\t"
		"pmaddwd %%mm5, %%mm4\n\t"
		"pmaddwd %%mm5, %%mm1\n\t"
		"add $24, %%"REG_b"\n\t"
		"packssdw %%mm1, %%mm4\n\t" // V3 V2 U3 U2
		"psraw $7, %%mm4\n\t"

		"movq %%mm0, %%mm1\n\t"
		"punpckldq %%mm4, %%mm0\n\t"
		"punpckhdq %%mm4, %%mm1\n\t"
		"packsswb %%mm1, %%mm0\n\t"
		"paddb "MANGLE(bgr2UVOffset)", %%mm0\n\t"

		"movd %%mm0, (%2, %%"REG_a")\n\t"
		"punpckhdq %%mm0, %%mm0\n\t"
		"movd %%mm0, (%3, %%"REG_a")\n\t"
		"add $4, %%"REG_a"\n\t"
		" js 1b\n\t"
		: : "r" (src1+width*6), "r" (src2+width*6), "r" (dstU+width), "r" (dstV+width), "g" ((long)-width)
		: "%"REG_a, "%"REG_b
	);
#else
	int i;
	for(i=0; i<width; i++)
	{
		int b= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
		int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
		int r= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];

		dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
		dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
	}
#endif
}

static inline void RENAME(bgr16ToY)(uint8_t *dst, uint8_t *src, int width)
{
	int i;
	for(i=0; i<width; i++)
	{
		int d= ((uint16_t*)src)[i];
		int b= d&0x1F;
		int g= (d>>5)&0x3F;
		int r= (d>>11)&0x1F;

		dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16;
	}
}

static inline void RENAME(bgr16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
	int i;
	for(i=0; i<width; i++)
	{
		int d0= ((uint32_t*)src1)[i];
		int d1= ((uint32_t*)src2)[i];

		int dl= (d0&0x07E0F81F) + (d1&0x07E0F81F);
		int dh= ((d0>>5)&0x07C0F83F) + ((d1>>5)&0x07C0F83F);

		int dh2= (dh>>11) + (dh<<21);
		int d= dh2 + dl;

		int b= d&0x7F;
		int r= (d>>11)&0x7F;
		int g= d>>21;
		dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
		dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
	}
}

static inline void RENAME(bgr15ToY)(uint8_t *dst, uint8_t *src, int width)
{
	int i;
	for(i=0; i<width; i++)
	{
		int d= ((uint16_t*)src)[i];
		int b= d&0x1F;
		int g= (d>>5)&0x1F;
		int r= (d>>10)&0x1F;

		dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16;
	}
}

static inline void RENAME(bgr15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
	int i;
	for(i=0; i<width; i++)
	{
		int d0= ((uint32_t*)src1)[i];
		int d1= ((uint32_t*)src2)[i];

		int dl= (d0&0x03E07C1F) + (d1&0x03E07C1F);
		int dh= ((d0>>5)&0x03E0F81F) + ((d1>>5)&0x03E0F81F);

		int dh2= (dh>>11) + (dh<<21);
		int d= dh2 + dl;

		int b= d&0x7F;
		int r= (d>>10)&0x7F;
		int g= d>>21;
		dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
		dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
	}
}

static inline void RENAME(rgb32ToY)(uint8_t *dst, uint8_t *src, int width)
{
	int i;
	for(i=0; i<width; i++)
	{
		int r=  ((uint32_t*)src)[i]&0xFF;
		int g= (((uint32_t*)src)[i]>>8)&0xFF;
		int b= (((uint32_t*)src)[i]>>16)&0xFF;

		dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
	}
}

static inline void RENAME(rgb32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
	int i;
	for(i=0; i<width; i++)
	{
		const int a= ((uint32_t*)src1)[2*i+0];
		const int e= ((uint32_t*)src1)[2*i+1];
		const int c= ((uint32_t*)src2)[2*i+0];
		const int d= ((uint32_t*)src2)[2*i+1];
		const int l= (a&0xFF00FF) + (e&0xFF00FF) + (c&0xFF00FF) + (d&0xFF00FF);
		const int h= (a&0x00FF00) + (e&0x00FF00) + (c&0x00FF00) + (d&0x00FF00);
		const int r=  l&0x3FF;
		const int g=  h>>8;
		const int b=  l>>16;

		dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
		dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
	}
}

static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, int width)
{
	int i;
	for(i=0; i<width; i++)
	{
		int r= src[i*3+0];
		int g= src[i*3+1];
		int b= src[i*3+2];

		dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
	}
}

static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
	int i;
	for(i=0; i<width; i++)
	{
		int r= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
		int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
		int b= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];

		dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
		dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
	}
}

// Bilinear / Bicubic scaling
static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
				  int16_t *filter, int16_t *filterPos, int filterSize)
{
#ifdef HAVE_MMX
	assert(filterSize % 4 == 0 && filterSize>0);
	if(filterSize==4) // always true for upscaling, sometimes for downscaling too
	{
		long counter= -2*dstW;
		filter-= counter*2;
		filterPos-= counter/2;
		dst-= counter/2;
		asm volatile(
			"pxor %%mm7, %%mm7\n\t"
			"movq "MANGLE(w02)", %%mm6\n\t"
			"push %%"REG_BP"\n\t" // we use 7 regs here ...
			"mov %%"REG_a", %%"REG_BP"\n\t"
			".balign 16\n\t"
			"1:\n\t"
			"movzwl (%2, %%"REG_BP"), %%eax\n\t"
			"movzwl 2(%2, %%"REG_BP"), %%ebx\n\t"
			"movq (%1, %%"REG_BP", 4), %%mm1\n\t"
			"movq 8(%1, %%"REG_BP", 4), %%mm3\n\t"
			"movd (%3, %%"REG_a"), %%mm0\n\t"
			"movd (%3, %%"REG_b"), %%mm2\n\t"
			"punpcklbw %%mm7, %%mm0\n\t"
			"punpcklbw %%mm7, %%mm2\n\t"
			"pmaddwd %%mm1, %%mm0\n\t"
			"pmaddwd %%mm2, %%mm3\n\t"
			"psrad $8, %%mm0\n\t"
			"psrad $8, %%mm3\n\t"
			"packssdw %%mm3, %%mm0\n\t"
			"pmaddwd %%mm6, %%mm0\n\t"
			"packssdw %%mm0, %%mm0\n\t"
			"movd %%mm0, (%4, %%"REG_BP")\n\t"
			"add $4, %%"REG_BP"\n\t"
			" jnc 1b\n\t"

			"pop %%"REG_BP"\n\t"
			: "+a" (counter)
			: "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
			: "%"REG_b
		);
	}
	else if(filterSize==8)
	{
		long counter= -2*dstW;
		filter-= counter*4;
		filterPos-= counter/2;
		dst-= counter/2;
		asm volatile(
			"pxor %%mm7, %%mm7\n\t"
			"movq "MANGLE(w02)", %%mm6\n\t"
			"push %%"REG_BP"\n\t" // we use 7 regs here ...
			"mov %%"REG_a", %%"REG_BP"\n\t"
			".balign 16\n\t"
			"1:\n\t"
			"movzwl (%2, %%"REG_BP"), %%eax\n\t"
			"movzwl 2(%2, %%"REG_BP"), %%ebx\n\t"
			"movq (%1, %%"REG_BP", 8), %%mm1\n\t"
			"movq 16(%1, %%"REG_BP", 8), %%mm3\n\t"
			"movd (%3, %%"REG_a"), %%mm0\n\t"
			"movd (%3, %%"REG_b"), %%mm2\n\t"
			"punpcklbw %%mm7, %%mm0\n\t"
			"punpcklbw %%mm7, %%mm2\n\t"
			"pmaddwd %%mm1, %%mm0\n\t"
			"pmaddwd %%mm2, %%mm3\n\t"

			"movq 8(%1, %%"REG_BP", 8), %%mm1\n\t"
			"movq 24(%1, %%"REG_BP", 8), %%mm5\n\t"
			"movd 4(%3, %%"REG_a"), %%mm4\n\t"
			"movd 4(%3, %%"REG_b"), %%mm2\n\t"
			"punpcklbw %%mm7, %%mm4\n\t"
			"punpcklbw %%mm7, %%mm2\n\t"
			"pmaddwd %%mm1, %%mm4\n\t"
			"pmaddwd %%mm2, %%mm5\n\t"
			"paddd %%mm4, %%mm0\n\t"
			"paddd %%mm5, %%mm3\n\t"

			"psrad $8, %%mm0\n\t"
			"psrad $8, %%mm3\n\t"
			"packssdw %%mm3, %%mm0\n\t"
			"pmaddwd %%mm6, %%mm0\n\t"
			"packssdw %%mm0, %%mm0\n\t"
			"movd %%mm0, (%4, %%"REG_BP")\n\t"
			"add $4, %%"REG_BP"\n\t"
			" jnc 1b\n\t"

			"pop %%"REG_BP"\n\t"
2125
 
                        : "+a" (counter)
2126
 
                        : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2127
 
                        : "%"REG_b
2128
 
                );
2129
 
        }
2130
 
        else
2131
 
        {
2132
 
                uint8_t *offset = src+filterSize;
2133
 
                long counter= -2*dstW;
2134
 
//              filter-= counter*filterSize/2;
2135
 
                filterPos-= counter/2;
2136
 
                dst-= counter/2;
2137
 
                asm volatile(
2138
 
                        "pxor %%mm7, %%mm7              \n\t"
2139
 
                        "movq "MANGLE(w02)", %%mm6      \n\t"
2140
 
                        ".balign 16                     \n\t"
2141
 
                        "1:                             \n\t"
2142
 
                        "mov %2, %%"REG_c"              \n\t"
2143
 
                        "movzwl (%%"REG_c", %0), %%eax  \n\t"
2144
 
                        "movzwl 2(%%"REG_c", %0), %%ebx \n\t"
2145
 
                        "mov %5, %%"REG_c"              \n\t"
2146
 
                        "pxor %%mm4, %%mm4              \n\t"
2147
 
                        "pxor %%mm5, %%mm5              \n\t"
2148
 
                        "2:                             \n\t"
2149
 
                        "movq (%1), %%mm1               \n\t"
2150
 
                        "movq (%1, %6), %%mm3           \n\t"
2151
 
                        "movd (%%"REG_c", %%"REG_a"), %%mm0\n\t"
2152
 
                        "movd (%%"REG_c", %%"REG_b"), %%mm2\n\t"
2153
 
                        "punpcklbw %%mm7, %%mm0         \n\t"
2154
 
                        "punpcklbw %%mm7, %%mm2         \n\t"
2155
 
                        "pmaddwd %%mm1, %%mm0           \n\t"
2156
 
                        "pmaddwd %%mm2, %%mm3           \n\t"
2157
 
                        "paddd %%mm3, %%mm5             \n\t"
2158
 
                        "paddd %%mm0, %%mm4             \n\t"
2159
 
                        "add $8, %1                     \n\t"
2160
 
                        "add $4, %%"REG_c"              \n\t"
2161
 
                        "cmp %4, %%"REG_c"              \n\t"
2162
 
                        " jb 2b                         \n\t"
2163
 
                        "add %6, %1                     \n\t"
2164
 
                        "psrad $8, %%mm4                \n\t"
2165
 
                        "psrad $8, %%mm5                \n\t"
2166
 
                        "packssdw %%mm5, %%mm4          \n\t"
2167
 
                        "pmaddwd %%mm6, %%mm4           \n\t"
2168
 
                        "packssdw %%mm4, %%mm4          \n\t"
2169
 
                        "mov %3, %%"REG_a"              \n\t"
2170
 
                        "movd %%mm4, (%%"REG_a", %0)    \n\t"
2171
 
                        "add $4, %0                     \n\t"
2172
 
                        " jnc 1b                        \n\t"
2173
 
 
2174
 
                        : "+r" (counter), "+r" (filter)
2175
 
                        : "m" (filterPos), "m" (dst), "m"(offset),
2176
 
                          "m" (src), "r" ((long)filterSize*2)
2177
 
                        : "%"REG_b, "%"REG_a, "%"REG_c
2178
 
                );
2179
 
        }
2180
 
#else
2181
 
#ifdef HAVE_ALTIVEC
2182
 
        hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
2183
 
#else
2184
 
        int i;
2185
 
        for(i=0; i<dstW; i++)
2186
 
        {
2187
 
                int j;
2188
 
                int srcPos= filterPos[i];
2189
 
                int val=0;
2190
 
//              printf("filterPos: %d\n", filterPos[i]);
2191
 
                for(j=0; j<filterSize; j++)
2192
 
                {
2193
 
//                      printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
2194
 
                        val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2195
 
                }
2196
 
//              filter += hFilterSize;
2197
 
                dst[i] = MIN(MAX(0, val>>7), (1<<15)-1); // the cubic equation does overflow ...
2198
 
//              dst[i] = val>>7;
2199
 
        }
2200
 
#endif
2201
 
#endif
2202
 
}
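#if 0   /* Illustrative sketch (not part of the build): the plain-C contract of hScale()
           above.  filterPos[i] is the first source pixel for output i and
           filter[filterSize*i + j] holds filterSize fixed-point taps per output pixel;
           with the final val>>7, taps that sum to 1<<14 give unity gain from the 8-bit
           source to the ~15-bit intermediate.  The 4-tap box filter below is made up
           for the demo, not generated by the real filter-init code. */
#include <stdint.h>
#include <stdio.h>

static void hscale_demo(void)
{
        enum { SRC_W = 10, DST_W = 4, FSIZE = 4 };
        uint8_t  src[SRC_W] = { 10, 20, 30, 40, 50, 60, 70, 80, 90, 100 };
        int16_t  filter[DST_W * FSIZE];
        int16_t  filterPos[DST_W];
        int16_t  dst[DST_W];
        int i, j;

        for (i = 0; i < DST_W; i++) {               /* rough 2:1 downscale, 4-pixel box */
                filterPos[i] = (int16_t)(i * 2);
                for (j = 0; j < FSIZE; j++)
                        filter[i * FSIZE + j] = (1 << 14) / FSIZE;
        }
        for (i = 0; i < DST_W; i++) {               /* same loop as the C fallback above */
                int val = 0;
                for (j = 0; j < FSIZE; j++)
                        val += src[filterPos[i] + j] * filter[i * FSIZE + j];
                dst[i] = (int16_t)(val >> 7);
                printf("dst[%d] = %d\n", i, dst[i]);
        }
}
#endif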
2203
 
      // *** horizontal scale Y line to temp buffer
2204
 
static inline void RENAME(hyscale)(uint16_t *dst, int dstWidth, uint8_t *src, int srcW, int xInc,
2205
 
                                   int flags, int canMMX2BeUsed, int16_t *hLumFilter,
2206
 
                                   int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode, 
2207
 
                                   int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2208
 
                                   int32_t *mmx2FilterPos)
2209
 
{
2210
 
    if(srcFormat==IMGFMT_YUY2)
2211
 
    {
2212
 
        RENAME(yuy2ToY)(formatConvBuffer, src, srcW);
2213
 
        src= formatConvBuffer;
2214
 
    }
2215
 
    else if(srcFormat==IMGFMT_UYVY)
2216
 
    {
2217
 
        RENAME(uyvyToY)(formatConvBuffer, src, srcW);
2218
 
        src= formatConvBuffer;
2219
 
    }
2220
 
    else if(srcFormat==IMGFMT_BGR32)
2221
 
    {
2222
 
        RENAME(bgr32ToY)(formatConvBuffer, src, srcW);
2223
 
        src= formatConvBuffer;
2224
 
    }
2225
 
    else if(srcFormat==IMGFMT_BGR24)
2226
 
    {
2227
 
        RENAME(bgr24ToY)(formatConvBuffer, src, srcW);
2228
 
        src= formatConvBuffer;
2229
 
    }
2230
 
    else if(srcFormat==IMGFMT_BGR16)
2231
 
    {
2232
 
        RENAME(bgr16ToY)(formatConvBuffer, src, srcW);
2233
 
        src= formatConvBuffer;
2234
 
    }
2235
 
    else if(srcFormat==IMGFMT_BGR15)
2236
 
    {
2237
 
        RENAME(bgr15ToY)(formatConvBuffer, src, srcW);
2238
 
        src= formatConvBuffer;
2239
 
    }
2240
 
    else if(srcFormat==IMGFMT_RGB32)
2241
 
    {
2242
 
        RENAME(rgb32ToY)(formatConvBuffer, src, srcW);
2243
 
        src= formatConvBuffer;
2244
 
    }
2245
 
    else if(srcFormat==IMGFMT_RGB24)
2246
 
    {
2247
 
        RENAME(rgb24ToY)(formatConvBuffer, src, srcW);
2248
 
        src= formatConvBuffer;
2249
 
    }
2250
 
 
2251
 
#ifdef HAVE_MMX
2252
 
        // use the new MMX scaler if the MMX2 one can't be used (it's faster than the x86 asm one)
2253
 
    if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2254
 
#else
2255
 
    if(!(flags&SWS_FAST_BILINEAR))
2256
 
#endif
2257
 
    {
2258
 
        RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
2259
 
    }
2260
 
    else // Fast Bilinear upscale / crap downscale
2261
 
    {
2262
 
#if defined(ARCH_X86) || defined(ARCH_X86_64)
2263
 
#ifdef HAVE_MMX2
2264
 
        int i;
2265
 
        if(canMMX2BeUsed)
2266
 
        {
2267
 
                asm volatile(
2268
 
                        "pxor %%mm7, %%mm7              \n\t"
2269
 
                        "mov %0, %%"REG_c"              \n\t"
2270
 
                        "mov %1, %%"REG_D"              \n\t"
2271
 
                        "mov %2, %%"REG_d"              \n\t"
2272
 
                        "mov %3, %%"REG_b"              \n\t"
2273
 
                        "xor %%"REG_a", %%"REG_a"       \n\t" // i
2274
 
                        PREFETCH" (%%"REG_c")           \n\t"
2275
 
                        PREFETCH" 32(%%"REG_c")         \n\t"
2276
 
                        PREFETCH" 64(%%"REG_c")         \n\t"
2277
 
 
2278
 
#ifdef ARCH_X86_64
2279
 
 
2280
 
#define FUNNY_Y_CODE \
2281
 
                        "movl (%%"REG_b"), %%esi        \n\t"\
2282
 
                        "call *%4                       \n\t"\
2283
 
                        "movl (%%"REG_b", %%"REG_a"), %%esi\n\t"\
2284
 
                        "add %%"REG_S", %%"REG_c"       \n\t"\
2285
 
                        "add %%"REG_a", %%"REG_D"       \n\t"\
2286
 
                        "xor %%"REG_a", %%"REG_a"       \n\t"\
2287
 
 
2288
 
#else
2289
 
 
2290
 
#define FUNNY_Y_CODE \
2291
 
                        "movl (%%"REG_b"), %%esi        \n\t"\
2292
 
                        "call *%4                       \n\t"\
2293
 
                        "addl (%%"REG_b", %%"REG_a"), %%"REG_c"\n\t"\
2294
 
                        "add %%"REG_a", %%"REG_D"       \n\t"\
2295
 
                        "xor %%"REG_a", %%"REG_a"       \n\t"\
2296
 
 
2297
 
#endif
2298
 
 
2299
 
FUNNY_Y_CODE
2300
 
FUNNY_Y_CODE
2301
 
FUNNY_Y_CODE
2302
 
FUNNY_Y_CODE
2303
 
FUNNY_Y_CODE
2304
 
FUNNY_Y_CODE
2305
 
FUNNY_Y_CODE
2306
 
FUNNY_Y_CODE
2307
 
 
2308
 
                        :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2309
 
                        "m" (funnyYCode)
2310
 
                        : "%"REG_a, "%"REG_b, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
2311
 
                );
2312
 
                for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
2313
 
        }
2314
 
        else
2315
 
        {
2316
 
#endif
2317
 
        int xInc_shr16 = xInc >> 16;
2318
 
        int xInc_mask = xInc & 0xffff;
2319
 
        //NO MMX just normal asm ...
2320
 
        asm volatile(
2321
 
                "xor %%"REG_a", %%"REG_a"       \n\t" // i
2322
 
                "xor %%"REG_b", %%"REG_b"       \n\t" // xx
2323
 
                "xorl %%ecx, %%ecx              \n\t" // 2*xalpha
2324
 
                ".balign 16                     \n\t"
2325
 
                "1:                             \n\t"
2326
 
                "movzbl  (%0, %%"REG_b"), %%edi \n\t" //src[xx]
2327
 
                "movzbl 1(%0, %%"REG_b"), %%esi \n\t" //src[xx+1]
2328
 
                "subl %%edi, %%esi              \n\t" //src[xx+1] - src[xx]
2329
 
                "imull %%ecx, %%esi             \n\t" //(src[xx+1] - src[xx])*2*xalpha
2330
 
                "shll $16, %%edi                \n\t"
2331
 
                "addl %%edi, %%esi              \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2332
 
                "mov %1, %%"REG_D"              \n\t"
2333
 
                "shrl $9, %%esi                 \n\t"
2334
 
                "movw %%si, (%%"REG_D", %%"REG_a", 2)\n\t"
2335
 
                "addw %4, %%cx                  \n\t" //2*xalpha += xInc&0xFF
2336
 
                "adc %3, %%"REG_b"              \n\t" //xx+= xInc>>8 + carry
2337
 
 
2338
 
                "movzbl (%0, %%"REG_b"), %%edi  \n\t" //src[xx]
2339
 
                "movzbl 1(%0, %%"REG_b"), %%esi \n\t" //src[xx+1]
2340
 
                "subl %%edi, %%esi              \n\t" //src[xx+1] - src[xx]
2341
 
                "imull %%ecx, %%esi             \n\t" //(src[xx+1] - src[xx])*2*xalpha
2342
 
                "shll $16, %%edi                \n\t"
2343
 
                "addl %%edi, %%esi              \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2344
 
                "mov %1, %%"REG_D"              \n\t"
2345
 
                "shrl $9, %%esi                 \n\t"
2346
 
                "movw %%si, 2(%%"REG_D", %%"REG_a", 2)\n\t"
2347
 
                "addw %4, %%cx                  \n\t" //2*xalpha += xInc&0xFF
2348
 
                "adc %3, %%"REG_b"              \n\t" //xx+= xInc>>8 + carry
2349
 
 
2350
 
 
2351
 
                "add $2, %%"REG_a"              \n\t"
2352
 
                "cmp %2, %%"REG_a"              \n\t"
2353
 
                " jb 1b                         \n\t"
2354
 
 
2355
 
 
2356
 
                :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
2357
 
                : "%"REG_a, "%"REG_b, "%ecx", "%"REG_D, "%esi"
2358
 
                );
2359
 
#ifdef HAVE_MMX2
2360
 
        } //if MMX2 can't be used
2361
 
#endif
2362
 
#else
2363
 
        int i;
2364
 
        unsigned int xpos=0;
2365
 
        for(i=0;i<dstWidth;i++)
2366
 
        {
2367
 
                register unsigned int xx=xpos>>16;
2368
 
                register unsigned int xalpha=(xpos&0xFFFF)>>9;
2369
 
                dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
2370
 
                xpos+=xInc;
2371
 
        }
2372
 
#endif
2373
 
    }
2374
 
}
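#if 0   /* Illustrative sketch (not part of the build): the 16.16 fixed-point stepping
           used by the fast-bilinear C fallback above.  xx is the integer source
           position, xalpha the top 7 bits of the fraction, and the result stays in the
           same <<7 intermediate range as hScale().  Computing xInc as (srcW<<16)/dstW
           is a simplification for the demo; the real value comes from the SwsContext
           setup. */
#include <stdint.h>
#include <stdio.h>

static void fast_bilinear_demo(void)
{
        enum { SRC_W = 6, DST_W = 10 };
        uint8_t  src[SRC_W + 1] = { 0, 50, 100, 150, 200, 250, 250 }; /* +1 guard pixel */
        uint16_t dst[DST_W];
        unsigned xInc = ((unsigned)SRC_W << 16) / DST_W;
        unsigned xpos = 0;
        int i;

        for (i = 0; i < DST_W; i++) {
                unsigned xx     = xpos >> 16;            /* integer source index */
                unsigned xalpha = (xpos & 0xFFFF) >> 9;  /* 7-bit blend factor   */
                dst[i] = (uint16_t)((src[xx] << 7) + (src[xx + 1] - src[xx]) * xalpha);
                xpos  += xInc;
                printf("dst[%d] = %u\n", i, dst[i]);
        }
}
#endif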
2375
 
 
2376
 
inline static void RENAME(hcscale)(uint16_t *dst, int dstWidth, uint8_t *src1, uint8_t *src2,
2377
 
                                   int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
2378
 
                                   int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
2379
 
                                   int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2380
 
                                   int32_t *mmx2FilterPos)
2381
 
{
2382
 
    if(srcFormat==IMGFMT_YUY2)
2383
 
    {
2384
 
        RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2385
 
        src1= formatConvBuffer;
2386
 
        src2= formatConvBuffer+2048;
2387
 
    }
2388
 
    else if(srcFormat==IMGFMT_UYVY)
2389
 
    {
2390
 
        RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2391
 
        src1= formatConvBuffer;
2392
 
        src2= formatConvBuffer+2048;
2393
 
    }
2394
 
    else if(srcFormat==IMGFMT_BGR32)
2395
 
    {
2396
 
        RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2397
 
        src1= formatConvBuffer;
2398
 
        src2= formatConvBuffer+2048;
2399
 
    }
2400
 
    else if(srcFormat==IMGFMT_BGR24)
2401
 
    {
2402
 
        RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2403
 
        src1= formatConvBuffer;
2404
 
        src2= formatConvBuffer+2048;
2405
 
    }
2406
 
    else if(srcFormat==IMGFMT_BGR16)
2407
 
    {
2408
 
        RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2409
 
        src1= formatConvBuffer;
2410
 
        src2= formatConvBuffer+2048;
2411
 
    }
2412
 
    else if(srcFormat==IMGFMT_BGR15)
2413
 
    {
2414
 
        RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2415
 
        src1= formatConvBuffer;
2416
 
        src2= formatConvBuffer+2048;
2417
 
    }
2418
 
    else if(srcFormat==IMGFMT_RGB32)
2419
 
    {
2420
 
        RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2421
 
        src1= formatConvBuffer;
2422
 
        src2= formatConvBuffer+2048;
2423
 
    }
2424
 
    else if(srcFormat==IMGFMT_RGB24)
2425
 
    {
2426
 
        RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2427
 
        src1= formatConvBuffer;
2428
 
        src2= formatConvBuffer+2048;
2429
 
    }
2430
 
    else if(isGray(srcFormat))
2431
 
    {
2432
 
        return;
2433
 
    }
2434
 
 
2435
 
#ifdef HAVE_MMX
2436
 
        // use the new MMX scaler if the MMX2 one can't be used (it's faster than the x86 asm one)
2437
 
    if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2438
 
#else
2439
 
    if(!(flags&SWS_FAST_BILINEAR))
2440
 
#endif
2441
 
    {
2442
 
        RENAME(hScale)(dst     , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2443
 
        RENAME(hScale)(dst+2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2444
 
    }
2445
 
    else // Fast Bilinear upscale / crap downscale
2446
 
    {
2447
 
#if defined(ARCH_X86) || defined(ARCH_X86_64)
2448
 
#ifdef HAVE_MMX2
2449
 
        int i;
2450
 
        if(canMMX2BeUsed)
2451
 
        {
2452
 
                asm volatile(
2453
 
                        "pxor %%mm7, %%mm7              \n\t"
2454
 
                        "mov %0, %%"REG_c"              \n\t"
2455
 
                        "mov %1, %%"REG_D"              \n\t"
2456
 
                        "mov %2, %%"REG_d"              \n\t"
2457
 
                        "mov %3, %%"REG_b"              \n\t"
2458
 
                        "xor %%"REG_a", %%"REG_a"       \n\t" // i
2459
 
                        PREFETCH" (%%"REG_c")           \n\t"
2460
 
                        PREFETCH" 32(%%"REG_c")         \n\t"
2461
 
                        PREFETCH" 64(%%"REG_c")         \n\t"
2462
 
 
2463
 
#ifdef ARCH_X86_64
2464
 
 
2465
 
#define FUNNY_UV_CODE \
2466
 
                        "movl (%%"REG_b"), %%esi        \n\t"\
2467
 
                        "call *%4                       \n\t"\
2468
 
                        "movl (%%"REG_b", %%"REG_a"), %%esi\n\t"\
2469
 
                        "add %%"REG_S", %%"REG_c"       \n\t"\
2470
 
                        "add %%"REG_a", %%"REG_D"       \n\t"\
2471
 
                        "xor %%"REG_a", %%"REG_a"       \n\t"\
2472
 
 
2473
 
#else
2474
 
 
2475
 
#define FUNNY_UV_CODE \
2476
 
                        "movl (%%"REG_b"), %%esi        \n\t"\
2477
 
                        "call *%4                       \n\t"\
2478
 
                        "addl (%%"REG_b", %%"REG_a"), %%"REG_c"\n\t"\
2479
 
                        "add %%"REG_a", %%"REG_D"       \n\t"\
2480
 
                        "xor %%"REG_a", %%"REG_a"       \n\t"\
2481
 
 
2482
 
#endif
2483
 
 
2484
 
FUNNY_UV_CODE
2485
 
FUNNY_UV_CODE
2486
 
FUNNY_UV_CODE
2487
 
FUNNY_UV_CODE
2488
 
                        "xor %%"REG_a", %%"REG_a"       \n\t" // i
2489
 
                        "mov %5, %%"REG_c"              \n\t" // src
2490
 
                        "mov %1, %%"REG_D"              \n\t" // buf1
2491
 
                        "add $4096, %%"REG_D"           \n\t"
2492
 
                        PREFETCH" (%%"REG_c")           \n\t"
2493
 
                        PREFETCH" 32(%%"REG_c")         \n\t"
2494
 
                        PREFETCH" 64(%%"REG_c")         \n\t"
2495
 
 
2496
 
FUNNY_UV_CODE
2497
 
FUNNY_UV_CODE
2498
 
FUNNY_UV_CODE
2499
 
FUNNY_UV_CODE
2500
 
 
2501
 
                        :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2502
 
                        "m" (funnyUVCode), "m" (src2)
2503
 
                        : "%"REG_a, "%"REG_b, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
2504
 
                );
2505
 
                for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
2506
 
                {
2507
 
//                      printf("%d %d %d\n", dstWidth, i, srcW);
2508
 
                        dst[i] = src1[srcW-1]*128;
2509
 
                        dst[i+2048] = src2[srcW-1]*128;
2510
 
                }
2511
 
        }
2512
 
        else
2513
 
        {
2514
 
#endif
2515
 
        long xInc_shr16 = (long) (xInc >> 16);
2516
 
        int xInc_mask = xInc & 0xffff; 
2517
 
        asm volatile(
2518
 
                "xor %%"REG_a", %%"REG_a"       \n\t" // i
2519
 
                "xor %%"REG_b", %%"REG_b"               \n\t" // xx
2520
 
                "xorl %%ecx, %%ecx              \n\t" // 2*xalpha
2521
 
                ".balign 16                     \n\t"
2522
 
                "1:                             \n\t"
2523
 
                "mov %0, %%"REG_S"              \n\t"
2524
 
                "movzbl  (%%"REG_S", %%"REG_b"), %%edi  \n\t" //src[xx]
2525
 
                "movzbl 1(%%"REG_S", %%"REG_b"), %%esi  \n\t" //src[xx+1]
2526
 
                "subl %%edi, %%esi              \n\t" //src[xx+1] - src[xx]
2527
 
                "imull %%ecx, %%esi             \n\t" //(src[xx+1] - src[xx])*2*xalpha
2528
 
                "shll $16, %%edi                \n\t"
2529
 
                "addl %%edi, %%esi              \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2530
 
                "mov %1, %%"REG_D"              \n\t"
2531
 
                "shrl $9, %%esi                 \n\t"
2532
 
                "movw %%si, (%%"REG_D", %%"REG_a", 2)\n\t"
2533
 
 
2534
 
                "movzbl  (%5, %%"REG_b"), %%edi \n\t" //src[xx]
2535
 
                "movzbl 1(%5, %%"REG_b"), %%esi \n\t" //src[xx+1]
2536
 
                "subl %%edi, %%esi              \n\t" //src[xx+1] - src[xx]
2537
 
                "imull %%ecx, %%esi             \n\t" //(src[xx+1] - src[xx])*2*xalpha
2538
 
                "shll $16, %%edi                \n\t"
2539
 
                "addl %%edi, %%esi              \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2540
 
                "mov %1, %%"REG_D"              \n\t"
2541
 
                "shrl $9, %%esi                 \n\t"
2542
 
                "movw %%si, 4096(%%"REG_D", %%"REG_a", 2)\n\t"
2543
 
 
2544
 
                "addw %4, %%cx                  \n\t" //2*xalpha += xInc&0xFF
2545
 
                "adc %3, %%"REG_b"              \n\t" //xx+= xInc>>8 + carry
2546
 
                "add $1, %%"REG_a"              \n\t"
2547
 
                "cmp %2, %%"REG_a"              \n\t"
2548
 
                " jb 1b                         \n\t"
2549
 
 
2550
 
/* GCC-3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
2551
 
   which is needed to support GCC-4.0 */
2552
 
#if defined(ARCH_X86_64) && ((__GNUC__ > 3) || ( __GNUC__ == 3 && __GNUC_MINOR__ >= 4))
2553
 
                :: "m" (src1), "m" (dst), "g" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2554
 
#else
2555
 
                :: "m" (src1), "m" (dst), "m" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2556
 
#endif
2557
 
                "r" (src2)
2558
 
                : "%"REG_a, "%"REG_b, "%ecx", "%"REG_D, "%esi"
2559
 
                );
2560
 
#ifdef HAVE_MMX2
2561
 
        } //if MMX2 can't be used
2562
 
#endif
2563
 
#else
2564
 
        int i;
2565
 
        unsigned int xpos=0;
2566
 
        for(i=0;i<dstWidth;i++)
2567
 
        {
2568
 
                register unsigned int xx=xpos>>16;
2569
 
                register unsigned int xalpha=(xpos&0xFFFF)>>9;
2570
 
                dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2571
 
                dst[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
2572
 
/* slower
2573
 
          dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
2574
 
          dst[i+2048]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
2575
 
*/
2576
 
                xpos+=xInc;
2577
 
        }
2578
 
#endif
2579
 
   }
2580
 
}
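#if 0   /* Illustrative sketch (not part of the build): in the C fallback above the
           blend weight pair is written as (xalpha^127, xalpha).  Because xalpha only
           ever holds 7 bits (0..127), xalpha^127 is identical to 127-xalpha, so the two
           taps always sum to 127. */
#include <assert.h>

static void xor_weight_demo(void)
{
        unsigned xalpha;
        for (xalpha = 0; xalpha <= 127; xalpha++)
                assert((xalpha ^ 127) == 127 - xalpha);
}
#endif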
2581
 
 
2582
 
static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
2583
 
             int srcSliceH, uint8_t* dst[], int dstStride[]){
2584
 
 
2585
 
        /* load a few things into local vars to make the code more readable and faster */
2586
 
        const int srcW= c->srcW;
2587
 
        const int dstW= c->dstW;
2588
 
        const int dstH= c->dstH;
2589
 
        const int chrDstW= c->chrDstW;
2590
 
        const int chrSrcW= c->chrSrcW;
2591
 
        const int lumXInc= c->lumXInc;
2592
 
        const int chrXInc= c->chrXInc;
2593
 
        const int dstFormat= c->dstFormat;
2594
 
        const int srcFormat= c->srcFormat;
2595
 
        const int flags= c->flags;
2596
 
        const int canMMX2BeUsed= c->canMMX2BeUsed;
2597
 
        int16_t *vLumFilterPos= c->vLumFilterPos;
2598
 
        int16_t *vChrFilterPos= c->vChrFilterPos;
2599
 
        int16_t *hLumFilterPos= c->hLumFilterPos;
2600
 
        int16_t *hChrFilterPos= c->hChrFilterPos;
2601
 
        int16_t *vLumFilter= c->vLumFilter;
2602
 
        int16_t *vChrFilter= c->vChrFilter;
2603
 
        int16_t *hLumFilter= c->hLumFilter;
2604
 
        int16_t *hChrFilter= c->hChrFilter;
2605
 
        int32_t *lumMmxFilter= c->lumMmxFilter;
2606
 
        int32_t *chrMmxFilter= c->chrMmxFilter;
2607
 
        const int vLumFilterSize= c->vLumFilterSize;
2608
 
        const int vChrFilterSize= c->vChrFilterSize;
2609
 
        const int hLumFilterSize= c->hLumFilterSize;
2610
 
        const int hChrFilterSize= c->hChrFilterSize;
2611
 
        int16_t **lumPixBuf= c->lumPixBuf;
2612
 
        int16_t **chrPixBuf= c->chrPixBuf;
2613
 
        const int vLumBufSize= c->vLumBufSize;
2614
 
        const int vChrBufSize= c->vChrBufSize;
2615
 
        uint8_t *funnyYCode= c->funnyYCode;
2616
 
        uint8_t *funnyUVCode= c->funnyUVCode;
2617
 
        uint8_t *formatConvBuffer= c->formatConvBuffer;
2618
 
        const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
2619
 
        const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
2620
 
        int lastDstY;
2621
 
 
2622
 
        /* vars which will change and which we need to store back in the context */
2623
 
        int dstY= c->dstY;
2624
 
        int lumBufIndex= c->lumBufIndex;
2625
 
        int chrBufIndex= c->chrBufIndex;
2626
 
        int lastInLumBuf= c->lastInLumBuf;
2627
 
        int lastInChrBuf= c->lastInChrBuf;
2628
 
        
2629
 
        if(isPacked(c->srcFormat)){
2630
 
                src[0]=
2631
 
                src[1]=
2632
 
                src[2]= src[0];
2633
 
                srcStride[0]=
2634
 
                srcStride[1]=
2635
 
                srcStride[2]= srcStride[0];
2636
 
        }
2637
 
        srcStride[1]<<= c->vChrDrop;
2638
 
        srcStride[2]<<= c->vChrDrop;
2639
 
 
2640
 
//      printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
2641
 
//              (int)dst[0], (int)dst[1], (int)dst[2]);
2642
 
 
2643
 
#if 0 //self test FIXME move to a vfilter or something
2644
 
{
2645
 
static volatile int i=0;
2646
 
i++;
2647
 
if(srcFormat==IMGFMT_YV12 && i==1 && srcSliceH>= c->srcH)
2648
 
        selfTest(src, srcStride, c->srcW, c->srcH);
2649
 
i--;
2650
 
}
2651
 
#endif
2652
 
 
2653
 
//printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
2654
 
//dstStride[0],dstStride[1],dstStride[2]);
2655
 
 
2656
 
        if(dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
2657
 
        {
2658
 
                static int firstTime=1; //FIXME move this into the context perhaps
2659
 
                if(flags & SWS_PRINT_INFO && firstTime)
2660
 
                {
2661
 
                        MSG_WARN("SwScaler: Warning: dstStride is not aligned!\n"
2662
 
                                        "SwScaler:          ->cannot do aligned memory acesses anymore\n");
2663
 
                        firstTime=0;
2664
 
                }
2665
 
        }
2666
 
 
2667
 
        /* Note: the user might start scaling the picture in the middle, so this will not get executed;
2668
 
           this is not really intended but currently works, so people might do it */
2669
 
        if(srcSliceY ==0){
2670
 
                lumBufIndex=0;
2671
 
                chrBufIndex=0;
2672
 
                dstY=0; 
2673
 
                lastInLumBuf= -1;
2674
 
                lastInChrBuf= -1;
2675
 
        }
2676
 
 
2677
 
        lastDstY= dstY;
2678
 
 
2679
 
        for(;dstY < dstH; dstY++){
2680
 
                unsigned char *dest =dst[0]+dstStride[0]*dstY;
2681
 
                const int chrDstY= dstY>>c->chrDstVSubSample;
2682
 
                unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
2683
 
                unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
2684
 
 
2685
 
                const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
2686
 
                const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
2687
 
                const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
2688
 
                const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
2689
 
 
2690
 
//printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n",
2691
 
// dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize,  c->chrSrcVSubSample);
2692
 
                //handle holes (FAST_BILINEAR & weird filters)
2693
 
                if(firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
2694
 
                if(firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
2695
 
//printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
2696
 
                ASSERT(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1)
2697
 
                ASSERT(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1)
2698
 
 
2699
 
                // Do we have enough lines in this slice to output the dstY line?
2700
 
                if(lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
2701
 
                {
2702
 
                        //Do horizontal scaling
2703
 
                        while(lastInLumBuf < lastLumSrcY)
2704
 
                        {
2705
 
                                uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2706
 
                                lumBufIndex++;
2707
 
//                              printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf,  lastLumSrcY);
2708
 
                                ASSERT(lumBufIndex < 2*vLumBufSize)
2709
 
                                ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
2710
 
                                ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
2711
 
//                              printf("%d %d\n", lumBufIndex, vLumBufSize);
2712
 
                                RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
2713
 
                                                flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
2714
 
                                                funnyYCode, c->srcFormat, formatConvBuffer, 
2715
 
                                                c->lumMmx2Filter, c->lumMmx2FilterPos);
2716
 
                                lastInLumBuf++;
2717
 
                        }
2718
 
                        while(lastInChrBuf < lastChrSrcY)
2719
 
                        {
2720
 
                                uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2721
 
                                uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
2722
 
                                chrBufIndex++;
2723
 
                                ASSERT(chrBufIndex < 2*vChrBufSize)
2724
 
                                ASSERT(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH))
2725
 
                                ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
2726
 
                                //FIXME replace parameters through context struct (some at least)
2727
 
 
2728
 
                                if(!(isGray(srcFormat) || isGray(dstFormat)))
2729
 
                                        RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
2730
 
                                                flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
2731
 
                                                funnyUVCode, c->srcFormat, formatConvBuffer, 
2732
 
                                                c->chrMmx2Filter, c->chrMmx2FilterPos);
2733
 
                                lastInChrBuf++;
2734
 
                        }
2735
 
                        //wrap buf index around to stay inside the ring buffer
2736
 
                        if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
2737
 
                        if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
2738
 
                }
2739
 
                else // not enough lines left in this slice -> load the rest in the buffer
2740
 
                {
2741
 
/*              printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
2742
 
                        firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
2743
 
                        lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
2744
 
                        vChrBufSize, vLumBufSize);*/
2745
 
 
2746
 
                        //Do horizontal scaling
2747
 
                        while(lastInLumBuf+1 < srcSliceY + srcSliceH)
2748
 
                        {
2749
 
                                uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2750
 
                                lumBufIndex++;
2751
 
                                ASSERT(lumBufIndex < 2*vLumBufSize)
2752
 
                                ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
2753
 
                                ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
2754
 
                                RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
2755
 
                                                flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
2756
 
                                                funnyYCode, c->srcFormat, formatConvBuffer, 
2757
 
                                                c->lumMmx2Filter, c->lumMmx2FilterPos);
2758
 
                                lastInLumBuf++;
2759
 
                        }
2760
 
                        while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH))
2761
 
                        {
2762
 
                                uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2763
 
                                uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
2764
 
                                chrBufIndex++;
2765
 
                                ASSERT(chrBufIndex < 2*vChrBufSize)
2766
 
                                ASSERT(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH)
2767
 
                                ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
2768
 
 
2769
 
                                if(!(isGray(srcFormat) || isGray(dstFormat)))
2770
 
                                        RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
2771
 
                                                flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
2772
 
                                                funnyUVCode, c->srcFormat, formatConvBuffer, 
2773
 
                                                c->chrMmx2Filter, c->chrMmx2FilterPos);
2774
 
                                lastInChrBuf++;
2775
 
                        }
2776
 
                        //wrap buf index around to stay inside the ring buffer
2777
 
                        if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
2778
 
                        if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
2779
 
                        break; //we can't output a dstY line so let's try with the next slice
2780
 
                }
2781
 
 
2782
 
#ifdef HAVE_MMX
2783
 
                b5Dither= dither8[dstY&1];
2784
 
                g6Dither= dither4[dstY&1];
2785
 
                g5Dither= dither8[dstY&1];
2786
 
                r5Dither= dither8[(dstY+1)&1];
2787
 
#endif
2788
 
            if(dstY < dstH-2)
2789
 
            {
2790
 
                int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2791
 
                int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2792
 
#ifdef HAVE_MMX
2793
 
                int i;
2794
 
                for(i=0; i<vLumFilterSize; i++)
2795
 
                {
2796
 
                        lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
2797
 
                        lumMmxFilter[4*i+2]= 
2798
 
                        lumMmxFilter[4*i+3]= 
2799
 
                                ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
2800
 
                }
2801
 
                for(i=0; i<vChrFilterSize; i++)
2802
 
                {
2803
 
                        chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
2804
 
                        chrMmxFilter[4*i+2]= 
2805
 
                        chrMmxFilter[4*i+3]= 
2806
 
                                ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
2807
 
                }
2808
 
#endif
2809
 
                if(dstFormat == IMGFMT_NV12 || dstFormat == IMGFMT_NV21){
2810
 
                        const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2811
 
                        if(dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
2812
 
                        RENAME(yuv2nv12X)(c,
2813
 
                                vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
2814
 
                                vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2815
 
                                dest, uDest, dstW, chrDstW, dstFormat);
2816
 
                }
2817
 
                else if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12 like
2818
 
                {
2819
 
                        const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2820
 
                        if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
2821
 
                        if(vLumFilterSize == 1 && vChrFilterSize == 1) // Unscaled YV12
2822
 
                        {
2823
 
                                int16_t *lumBuf = lumPixBuf[0];
2824
 
                                int16_t *chrBuf= chrPixBuf[0];
2825
 
                                RENAME(yuv2yuv1)(lumBuf, chrBuf, dest, uDest, vDest, dstW, chrDstW);
2826
 
                        }
2827
 
                        else //General YV12
2828
 
                        {
2829
 
                                RENAME(yuv2yuvX)(c,
2830
 
                                        vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
2831
 
                                        vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2832
 
                                        dest, uDest, vDest, dstW, chrDstW);
2833
 
                        }
2834
 
                }
2835
 
                else
2836
 
                {
2837
 
                        ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2838
 
                        ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
2839
 
                        if(vLumFilterSize == 1 && vChrFilterSize == 2) //Unscaled RGB
2840
 
                        {
2841
 
                                int chrAlpha= vChrFilter[2*dstY+1];
2842
 
                                RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
2843
 
                                                 dest, dstW, chrAlpha, dstFormat, flags, dstY);
2844
 
                        }
2845
 
                        else if(vLumFilterSize == 2 && vChrFilterSize == 2) //BiLinear Upscale RGB
2846
 
                        {
2847
 
                                int lumAlpha= vLumFilter[2*dstY+1];
2848
 
                                int chrAlpha= vChrFilter[2*dstY+1];
2849
 
                                RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
2850
 
                                                 dest, dstW, lumAlpha, chrAlpha, dstY);
2851
 
                        }
2852
 
                        else //General RGB
2853
 
                        {
2854
 
                                RENAME(yuv2packedX)(c,
2855
 
                                        vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2856
 
                                        vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2857
 
                                        dest, dstW, dstY);
2858
 
                        }
2859
 
                }
2860
 
            }
2861
 
            else // hmm looks like we can't use MMX here without overwriting this array's tail
2862
 
            {
2863
 
                int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2864
 
                int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2865
 
                if(dstFormat == IMGFMT_NV12 || dstFormat == IMGFMT_NV21){
2866
 
                        const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2867
 
                        if(dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
2868
 
                        yuv2nv12XinC(
2869
 
                                vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
2870
 
                                vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2871
 
                                dest, uDest, dstW, chrDstW, dstFormat);
2872
 
                }
2873
 
                else if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12
2874
 
                {
2875
 
                        const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2876
 
                        if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
2877
 
                        yuv2yuvXinC(
2878
 
                                vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
2879
 
                                vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2880
 
                                dest, uDest, vDest, dstW, chrDstW);
2881
 
                }
2882
 
                else
2883
 
                {
2884
 
                        ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2885
 
                        ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
2886
 
                        yuv2packedXinC(c, 
2887
 
                                vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2888
 
                                vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2889
 
                                dest, dstW, dstY);
2890
 
                }
2891
 
            }
2892
 
        }
2893
 
 
2894
 
#ifdef HAVE_MMX
2895
 
        __asm __volatile(SFENCE:::"memory");
2896
 
        __asm __volatile(EMMS:::"memory");
2897
 
#endif
2898
 
        /* store changed local vars back in the context */
2899
 
        c->dstY= dstY;
2900
 
        c->lumBufIndex= lumBufIndex;
2901
 
        c->chrBufIndex= chrBufIndex;
2902
 
        c->lastInLumBuf= lastInLumBuf;
2903
 
        c->lastInChrBuf= lastInChrBuf;
2904
 
 
2905
 
        return dstY - lastDstY;
2906
 
}
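#if 0   /* Illustrative sketch (not part of the build): the ring-buffer indexing used by
           swScale() above.  Horizontally scaled lines live in a ring of bufSize slots;
           bufIndex is the slot of the newest line and lastInBuf its source line number.
           Judging from the index expression and the ASSERTs above, the pointer array is
           allocated with 2*bufSize entries whose second half mirrors the first, so
           "bufIndex + firstSrcY - lastInBuf + bufSize" can be used directly without a
           modulo.  Everything below is simplified demo code with made-up numbers. */
#include <stdio.h>

static void ring_demo(void)
{
        enum { BUF_SIZE = 4, FILTER_SIZE = 3 };
        int slotOf[2 * BUF_SIZE];           /* stands in for lumPixBuf[]            */
        int bufIndex  = 2;                  /* slot of the newest scaled line       */
        int lastInBuf = 10;                 /* source line stored in that slot      */
        int firstSrcY = 8;                  /* first line the vertical filter needs */
        int i;

        for (i = 0; i < BUF_SIZE; i++)      /* mirror the ring into the second half */
                slotOf[i] = slotOf[i + BUF_SIZE] = i;

        for (i = 0; i < FILTER_SIZE; i++) { /* same arithmetic as lumSrcPtr above   */
                int idx = bufIndex + (firstSrcY + i) - lastInBuf + BUF_SIZE;
                printf("source line %d -> ring slot %d\n", firstSrcY + i, slotOf[idx]);
        }
}
#endif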