36
36
#define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
38
#define YSCALEYUV2YV12X(offset, dest, end, pos) \
40
"movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
41
"movq %%mm3, %%mm4 \n\t"\
42
"lea " offset "(%0), %%"REG_d" \n\t"\
43
"mov (%%"REG_d"), %%"REG_S" \n\t"\
44
".p2align 4 \n\t" /* FIXME Unroll? */\
46
"movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
47
"movq (%%"REG_S", %3, 2), %%mm2 \n\t" /* srcData */\
48
"movq 8(%%"REG_S", %3, 2), %%mm5 \n\t" /* srcData */\
49
"add $16, %%"REG_d" \n\t"\
50
"mov (%%"REG_d"), %%"REG_S" \n\t"\
51
"test %%"REG_S", %%"REG_S" \n\t"\
52
"pmulhw %%mm0, %%mm2 \n\t"\
53
"pmulhw %%mm0, %%mm5 \n\t"\
54
"paddw %%mm2, %%mm3 \n\t"\
55
"paddw %%mm5, %%mm4 \n\t"\
57
"psraw $3, %%mm3 \n\t"\
58
"psraw $3, %%mm4 \n\t"\
59
"packuswb %%mm4, %%mm3 \n\t"\
60
MOVNTQ(%%mm3, (%1, %3))\
63
"movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
64
"movq %%mm3, %%mm4 \n\t"\
65
"lea " offset "(%0), %%"REG_d" \n\t"\
66
"mov (%%"REG_d"), %%"REG_S" \n\t"\
68
:: "r" (&c->redDither),\
69
"r" (dest), "g" ((x86_reg)(end)), "r"((x86_reg)(pos))\
73
static void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter,
74
const int16_t **lumSrc, int lumFilterSize,
75
const int16_t *chrFilter, const int16_t **chrUSrc,
76
const int16_t **chrVSrc,
77
int chrFilterSize, const int16_t **alpSrc,
78
uint8_t *dest, uint8_t *uDest, uint8_t *vDest,
79
uint8_t *aDest, int dstW, int chrDstW)
82
x86_reg uv_off = c->uv_off;
83
YSCALEYUV2YV12X(CHR_MMX_FILTER_OFFSET, uDest, chrDstW, 0)
84
YSCALEYUV2YV12X(CHR_MMX_FILTER_OFFSET, vDest - uv_off, chrDstW + uv_off, uv_off)
86
if (CONFIG_SWSCALE_ALPHA && aDest) {
87
YSCALEYUV2YV12X(ALP_MMX_FILTER_OFFSET, aDest, dstW, 0)
90
YSCALEYUV2YV12X(LUM_MMX_FILTER_OFFSET, dest, dstW, 0)
93
#define YSCALEYUV2YV12X_ACCURATE(offset, dest, end, pos) \
95
"lea " offset "(%0), %%"REG_d" \n\t"\
96
"pxor %%mm4, %%mm4 \n\t"\
97
"pxor %%mm5, %%mm5 \n\t"\
98
"pxor %%mm6, %%mm6 \n\t"\
99
"pxor %%mm7, %%mm7 \n\t"\
100
"mov (%%"REG_d"), %%"REG_S" \n\t"\
103
"movq (%%"REG_S", %3, 2), %%mm0 \n\t" /* srcData */\
104
"movq 8(%%"REG_S", %3, 2), %%mm2 \n\t" /* srcData */\
105
"mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
106
"movq (%%"REG_S", %3, 2), %%mm1 \n\t" /* srcData */\
107
"movq %%mm0, %%mm3 \n\t"\
108
"punpcklwd %%mm1, %%mm0 \n\t"\
109
"punpckhwd %%mm1, %%mm3 \n\t"\
110
"movq "STR(APCK_COEF)"(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
111
"pmaddwd %%mm1, %%mm0 \n\t"\
112
"pmaddwd %%mm1, %%mm3 \n\t"\
113
"paddd %%mm0, %%mm4 \n\t"\
114
"paddd %%mm3, %%mm5 \n\t"\
115
"movq 8(%%"REG_S", %3, 2), %%mm3 \n\t" /* srcData */\
116
"mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
117
"add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
118
"test %%"REG_S", %%"REG_S" \n\t"\
119
"movq %%mm2, %%mm0 \n\t"\
120
"punpcklwd %%mm3, %%mm2 \n\t"\
121
"punpckhwd %%mm3, %%mm0 \n\t"\
122
"pmaddwd %%mm1, %%mm2 \n\t"\
123
"pmaddwd %%mm1, %%mm0 \n\t"\
124
"paddd %%mm2, %%mm6 \n\t"\
125
"paddd %%mm0, %%mm7 \n\t"\
127
"psrad $16, %%mm4 \n\t"\
128
"psrad $16, %%mm5 \n\t"\
129
"psrad $16, %%mm6 \n\t"\
130
"psrad $16, %%mm7 \n\t"\
131
"movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
132
"packssdw %%mm5, %%mm4 \n\t"\
133
"packssdw %%mm7, %%mm6 \n\t"\
134
"paddw %%mm0, %%mm4 \n\t"\
135
"paddw %%mm0, %%mm6 \n\t"\
136
"psraw $3, %%mm4 \n\t"\
137
"psraw $3, %%mm6 \n\t"\
138
"packuswb %%mm6, %%mm4 \n\t"\
139
MOVNTQ(%%mm4, (%1, %3))\
142
"lea " offset "(%0), %%"REG_d" \n\t"\
143
"pxor %%mm4, %%mm4 \n\t"\
144
"pxor %%mm5, %%mm5 \n\t"\
145
"pxor %%mm6, %%mm6 \n\t"\
146
"pxor %%mm7, %%mm7 \n\t"\
147
"mov (%%"REG_d"), %%"REG_S" \n\t"\
149
:: "r" (&c->redDither),\
150
"r" (dest), "g" ((x86_reg)(end)), "r"((x86_reg)(pos))\
151
: "%"REG_a, "%"REG_d, "%"REG_S\
154
static void RENAME(yuv2yuvX_ar)(SwsContext *c, const int16_t *lumFilter,
155
const int16_t **lumSrc, int lumFilterSize,
156
const int16_t *chrFilter, const int16_t **chrUSrc,
157
const int16_t **chrVSrc,
158
int chrFilterSize, const int16_t **alpSrc,
159
uint8_t *dest, uint8_t *uDest, uint8_t *vDest,
160
uint8_t *aDest, int dstW, int chrDstW)
163
x86_reg uv_off = c->uv_off;
164
YSCALEYUV2YV12X_ACCURATE(CHR_MMX_FILTER_OFFSET, uDest, chrDstW, 0)
165
YSCALEYUV2YV12X_ACCURATE(CHR_MMX_FILTER_OFFSET, vDest - uv_off, chrDstW + uv_off, uv_off)
167
if (CONFIG_SWSCALE_ALPHA && aDest) {
168
YSCALEYUV2YV12X_ACCURATE(ALP_MMX_FILTER_OFFSET, aDest, dstW, 0)
171
YSCALEYUV2YV12X_ACCURATE(LUM_MMX_FILTER_OFFSET, dest, dstW, 0)
174
static void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc,
175
const int16_t *chrUSrc, const int16_t *chrVSrc,
176
const int16_t *alpSrc,
177
uint8_t *dest, uint8_t *uDest, uint8_t *vDest,
178
uint8_t *aDest, int dstW, int chrDstW)
181
const int16_t *src[4]= { alpSrc + dstW, lumSrc + dstW, chrUSrc + chrDstW, chrVSrc + chrDstW };
182
uint8_t *dst[4]= { aDest, dest, uDest, vDest };
183
x86_reg counter[4]= { dstW, dstW, chrDstW, chrDstW };
188
"mov %2, %%"REG_a" \n\t"
189
".p2align 4 \n\t" /* FIXME Unroll? */
191
"movq (%0, %%"REG_a", 2), %%mm0 \n\t"
192
"movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"
193
"psraw $7, %%mm0 \n\t"
194
"psraw $7, %%mm1 \n\t"
195
"packuswb %%mm1, %%mm0 \n\t"
196
MOVNTQ(%%mm0, (%1, %%REGa))
197
"add $8, %%"REG_a" \n\t"
199
:: "r" (src[p]), "r" (dst[p] + counter[p]),
207
static void RENAME(yuv2yuv1_ar)(SwsContext *c, const int16_t *lumSrc,
208
const int16_t *chrUSrc, const int16_t *chrVSrc,
209
const int16_t *alpSrc,
210
uint8_t *dest, uint8_t *uDest, uint8_t *vDest,
211
uint8_t *aDest, int dstW, int chrDstW)
214
const int16_t *src[4]= { alpSrc + dstW, lumSrc + dstW, chrUSrc + chrDstW, chrVSrc + chrDstW };
215
uint8_t *dst[4]= { aDest, dest, uDest, vDest };
216
x86_reg counter[4]= { dstW, dstW, chrDstW, chrDstW };
221
"mov %2, %%"REG_a" \n\t"
222
"pcmpeqw %%mm7, %%mm7 \n\t"
223
"psrlw $15, %%mm7 \n\t"
224
"psllw $6, %%mm7 \n\t"
225
".p2align 4 \n\t" /* FIXME Unroll? */
227
"movq (%0, %%"REG_a", 2), %%mm0 \n\t"
228
"movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"
229
"paddsw %%mm7, %%mm0 \n\t"
230
"paddsw %%mm7, %%mm1 \n\t"
231
"psraw $7, %%mm0 \n\t"
232
"psraw $7, %%mm1 \n\t"
233
"packuswb %%mm1, %%mm0 \n\t"
234
MOVNTQ(%%mm0, (%1, %%REGa))
235
"add $8, %%"REG_a" \n\t"
237
:: "r" (src[p]), "r" (dst[p] + counter[p]),
245
38
#define YSCALEYUV2PACKEDX_UV \
246
39
__asm__ volatile(\
247
40
"xor %%"REG_a", %%"REG_a" \n\t"\
1289
1088
* YV12 to RGB without scaling or interpolating
1291
static void RENAME(yuv2rgb32_1)(SwsContext *c, const uint16_t *buf0,
1292
const uint16_t *ubuf0, const uint16_t *ubuf1,
1293
const uint16_t *vbuf0, const uint16_t *vbuf1,
1294
const uint16_t *abuf0, uint8_t *dest,
1295
int dstW, int uvalpha, enum PixelFormat dstFormat,
1298
const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1300
if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1301
if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1303
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1304
"mov %4, %%"REG_b" \n\t"
1305
"push %%"REG_BP" \n\t"
1306
YSCALEYUV2RGB1(%%REGBP, %5)
1307
YSCALEYUV2RGB1_ALPHA(%%REGBP)
1308
WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1309
"pop %%"REG_BP" \n\t"
1310
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1311
:: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1316
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1317
"mov %4, %%"REG_b" \n\t"
1318
"push %%"REG_BP" \n\t"
1319
YSCALEYUV2RGB1(%%REGBP, %5)
1320
"pcmpeqd %%mm7, %%mm7 \n\t"
1321
WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1322
"pop %%"REG_BP" \n\t"
1323
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1324
:: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1329
if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1331
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1332
"mov %4, %%"REG_b" \n\t"
1333
"push %%"REG_BP" \n\t"
1334
YSCALEYUV2RGB1b(%%REGBP, %5)
1335
YSCALEYUV2RGB1_ALPHA(%%REGBP)
1336
WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1337
"pop %%"REG_BP" \n\t"
1338
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1339
:: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1344
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1345
"mov %4, %%"REG_b" \n\t"
1346
"push %%"REG_BP" \n\t"
1347
YSCALEYUV2RGB1b(%%REGBP, %5)
1348
"pcmpeqd %%mm7, %%mm7 \n\t"
1349
WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1350
"pop %%"REG_BP" \n\t"
1351
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1352
:: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1359
static void RENAME(yuv2bgr24_1)(SwsContext *c, const uint16_t *buf0,
1360
const uint16_t *ubuf0, const uint16_t *ubuf1,
1361
const uint16_t *vbuf0, const uint16_t *vbuf1,
1362
const uint16_t *abuf0, uint8_t *dest,
1363
int dstW, int uvalpha, enum PixelFormat dstFormat,
1366
const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1368
if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1370
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1371
"mov %4, %%"REG_b" \n\t"
1372
"push %%"REG_BP" \n\t"
1373
YSCALEYUV2RGB1(%%REGBP, %5)
1374
"pxor %%mm7, %%mm7 \n\t"
1375
WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1376
"pop %%"REG_BP" \n\t"
1377
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1378
:: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1383
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1384
"mov %4, %%"REG_b" \n\t"
1385
"push %%"REG_BP" \n\t"
1386
YSCALEYUV2RGB1b(%%REGBP, %5)
1387
"pxor %%mm7, %%mm7 \n\t"
1388
WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1389
"pop %%"REG_BP" \n\t"
1390
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1391
:: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1397
static void RENAME(yuv2rgb555_1)(SwsContext *c, const uint16_t *buf0,
1398
const uint16_t *ubuf0, const uint16_t *ubuf1,
1399
const uint16_t *vbuf0, const uint16_t *vbuf1,
1400
const uint16_t *abuf0, uint8_t *dest,
1401
int dstW, int uvalpha, enum PixelFormat dstFormat,
1404
const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1406
if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1408
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1409
"mov %4, %%"REG_b" \n\t"
1410
"push %%"REG_BP" \n\t"
1411
YSCALEYUV2RGB1(%%REGBP, %5)
1412
"pxor %%mm7, %%mm7 \n\t"
1413
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1415
"paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1416
"paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1417
"paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1419
WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1420
"pop %%"REG_BP" \n\t"
1421
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1422
:: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1427
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1428
"mov %4, %%"REG_b" \n\t"
1429
"push %%"REG_BP" \n\t"
1430
YSCALEYUV2RGB1b(%%REGBP, %5)
1431
"pxor %%mm7, %%mm7 \n\t"
1432
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1434
"paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1435
"paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1436
"paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1438
WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1439
"pop %%"REG_BP" \n\t"
1440
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1441
:: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1447
static void RENAME(yuv2rgb565_1)(SwsContext *c, const uint16_t *buf0,
1448
const uint16_t *ubuf0, const uint16_t *ubuf1,
1449
const uint16_t *vbuf0, const uint16_t *vbuf1,
1450
const uint16_t *abuf0, uint8_t *dest,
1451
int dstW, int uvalpha, enum PixelFormat dstFormat,
1454
const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1090
static void RENAME(yuv2rgb32_1)(SwsContext *c, const int16_t *buf0,
1091
const int16_t *ubuf[2], const int16_t *bguf[2],
1092
const int16_t *abuf0, uint8_t *dest,
1093
int dstW, int uvalpha, int y)
1095
const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
1096
const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1098
if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1099
if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1101
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1102
"mov %4, %%"REG_b" \n\t"
1103
"push %%"REG_BP" \n\t"
1104
YSCALEYUV2RGB1(%%REGBP, %5)
1105
YSCALEYUV2RGB1_ALPHA(%%REGBP)
1106
WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1107
"pop %%"REG_BP" \n\t"
1108
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1109
:: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1114
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1115
"mov %4, %%"REG_b" \n\t"
1116
"push %%"REG_BP" \n\t"
1117
YSCALEYUV2RGB1(%%REGBP, %5)
1118
"pcmpeqd %%mm7, %%mm7 \n\t"
1119
WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1120
"pop %%"REG_BP" \n\t"
1121
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1122
:: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1127
if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1129
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1130
"mov %4, %%"REG_b" \n\t"
1131
"push %%"REG_BP" \n\t"
1132
YSCALEYUV2RGB1b(%%REGBP, %5)
1133
YSCALEYUV2RGB1_ALPHA(%%REGBP)
1134
WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1135
"pop %%"REG_BP" \n\t"
1136
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1137
:: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1142
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1143
"mov %4, %%"REG_b" \n\t"
1144
"push %%"REG_BP" \n\t"
1145
YSCALEYUV2RGB1b(%%REGBP, %5)
1146
"pcmpeqd %%mm7, %%mm7 \n\t"
1147
WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1148
"pop %%"REG_BP" \n\t"
1149
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1150
:: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1157
static void RENAME(yuv2bgr24_1)(SwsContext *c, const int16_t *buf0,
1158
const int16_t *ubuf[2], const int16_t *bguf[2],
1159
const int16_t *abuf0, uint8_t *dest,
1160
int dstW, int uvalpha, int y)
1162
const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
1163
const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1165
if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1167
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1168
"mov %4, %%"REG_b" \n\t"
1169
"push %%"REG_BP" \n\t"
1170
YSCALEYUV2RGB1(%%REGBP, %5)
1171
"pxor %%mm7, %%mm7 \n\t"
1172
WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1173
"pop %%"REG_BP" \n\t"
1174
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1175
:: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1180
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1181
"mov %4, %%"REG_b" \n\t"
1182
"push %%"REG_BP" \n\t"
1183
YSCALEYUV2RGB1b(%%REGBP, %5)
1184
"pxor %%mm7, %%mm7 \n\t"
1185
WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1186
"pop %%"REG_BP" \n\t"
1187
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1188
:: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1194
static void RENAME(yuv2rgb555_1)(SwsContext *c, const int16_t *buf0,
1195
const int16_t *ubuf[2], const int16_t *bguf[2],
1196
const int16_t *abuf0, uint8_t *dest,
1197
int dstW, int uvalpha, int y)
1199
const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
1200
const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1202
if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1204
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1205
"mov %4, %%"REG_b" \n\t"
1206
"push %%"REG_BP" \n\t"
1207
YSCALEYUV2RGB1(%%REGBP, %5)
1208
"pxor %%mm7, %%mm7 \n\t"
1209
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1211
"paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1212
"paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1213
"paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1215
WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1216
"pop %%"REG_BP" \n\t"
1217
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1218
:: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1223
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1224
"mov %4, %%"REG_b" \n\t"
1225
"push %%"REG_BP" \n\t"
1226
YSCALEYUV2RGB1b(%%REGBP, %5)
1227
"pxor %%mm7, %%mm7 \n\t"
1228
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1230
"paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1231
"paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1232
"paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1234
WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1235
"pop %%"REG_BP" \n\t"
1236
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1237
:: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1243
static void RENAME(yuv2rgb565_1)(SwsContext *c, const int16_t *buf0,
1244
const int16_t *ubuf[2], const int16_t *bguf[2],
1245
const int16_t *abuf0, uint8_t *dest,
1246
int dstW, int uvalpha, int y)
1248
const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
1249
const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1456
1251
if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1457
1252
__asm__ volatile(
1570
#if !COMPILE_TEMPLATE_MMX2
1571
//FIXME yuy2* can read up to 7 samples too many
1573
static void RENAME(yuy2ToY)(uint8_t *dst, const uint8_t *src,
1574
int width, uint32_t *unused)
1577
"movq "MANGLE(bm01010101)", %%mm2 \n\t"
1578
"mov %0, %%"REG_a" \n\t"
1580
"movq (%1, %%"REG_a",2), %%mm0 \n\t"
1581
"movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1582
"pand %%mm2, %%mm0 \n\t"
1583
"pand %%mm2, %%mm1 \n\t"
1584
"packuswb %%mm1, %%mm0 \n\t"
1585
"movq %%mm0, (%2, %%"REG_a") \n\t"
1586
"add $8, %%"REG_a" \n\t"
1588
: : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
1593
static void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV,
1594
const uint8_t *src1, const uint8_t *src2,
1595
int width, uint32_t *unused)
1598
"movq "MANGLE(bm01010101)", %%mm4 \n\t"
1599
"mov %0, %%"REG_a" \n\t"
1601
"movq (%1, %%"REG_a",4), %%mm0 \n\t"
1602
"movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1603
"psrlw $8, %%mm0 \n\t"
1604
"psrlw $8, %%mm1 \n\t"
1605
"packuswb %%mm1, %%mm0 \n\t"
1606
"movq %%mm0, %%mm1 \n\t"
1607
"psrlw $8, %%mm0 \n\t"
1608
"pand %%mm4, %%mm1 \n\t"
1609
"packuswb %%mm0, %%mm0 \n\t"
1610
"packuswb %%mm1, %%mm1 \n\t"
1611
"movd %%mm0, (%3, %%"REG_a") \n\t"
1612
"movd %%mm1, (%2, %%"REG_a") \n\t"
1613
"add $4, %%"REG_a" \n\t"
1615
: : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1618
assert(src1 == src2);
1621
static void RENAME(LEToUV)(uint8_t *dstU, uint8_t *dstV,
1622
const uint8_t *src1, const uint8_t *src2,
1623
int width, uint32_t *unused)
1626
"mov %0, %%"REG_a" \n\t"
1628
"movq (%1, %%"REG_a",2), %%mm0 \n\t"
1629
"movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1630
"movq (%2, %%"REG_a",2), %%mm2 \n\t"
1631
"movq 8(%2, %%"REG_a",2), %%mm3 \n\t"
1632
"psrlw $8, %%mm0 \n\t"
1633
"psrlw $8, %%mm1 \n\t"
1634
"psrlw $8, %%mm2 \n\t"
1635
"psrlw $8, %%mm3 \n\t"
1636
"packuswb %%mm1, %%mm0 \n\t"
1637
"packuswb %%mm3, %%mm2 \n\t"
1638
"movq %%mm0, (%3, %%"REG_a") \n\t"
1639
"movq %%mm2, (%4, %%"REG_a") \n\t"
1640
"add $8, %%"REG_a" \n\t"
1642
: : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
1647
/* This is almost identical to the previous, and exists only because
1648
* yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses. */
1649
static void RENAME(uyvyToY)(uint8_t *dst, const uint8_t *src,
1650
int width, uint32_t *unused)
1653
"mov %0, %%"REG_a" \n\t"
1655
"movq (%1, %%"REG_a",2), %%mm0 \n\t"
1656
"movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1657
"psrlw $8, %%mm0 \n\t"
1658
"psrlw $8, %%mm1 \n\t"
1659
"packuswb %%mm1, %%mm0 \n\t"
1660
"movq %%mm0, (%2, %%"REG_a") \n\t"
1661
"add $8, %%"REG_a" \n\t"
1663
: : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
1668
static void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV,
1669
const uint8_t *src1, const uint8_t *src2,
1670
int width, uint32_t *unused)
1673
"movq "MANGLE(bm01010101)", %%mm4 \n\t"
1674
"mov %0, %%"REG_a" \n\t"
1676
"movq (%1, %%"REG_a",4), %%mm0 \n\t"
1677
"movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1678
"pand %%mm4, %%mm0 \n\t"
1679
"pand %%mm4, %%mm1 \n\t"
1680
"packuswb %%mm1, %%mm0 \n\t"
1681
"movq %%mm0, %%mm1 \n\t"
1682
"psrlw $8, %%mm0 \n\t"
1683
"pand %%mm4, %%mm1 \n\t"
1684
"packuswb %%mm0, %%mm0 \n\t"
1685
"packuswb %%mm1, %%mm1 \n\t"
1686
"movd %%mm0, (%3, %%"REG_a") \n\t"
1687
"movd %%mm1, (%2, %%"REG_a") \n\t"
1688
"add $4, %%"REG_a" \n\t"
1690
: : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1693
assert(src1 == src2);
1696
static void RENAME(BEToUV)(uint8_t *dstU, uint8_t *dstV,
1697
const uint8_t *src1, const uint8_t *src2,
1698
int width, uint32_t *unused)
1701
"movq "MANGLE(bm01010101)", %%mm4 \n\t"
1702
"mov %0, %%"REG_a" \n\t"
1704
"movq (%1, %%"REG_a",2), %%mm0 \n\t"
1705
"movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1706
"movq (%2, %%"REG_a",2), %%mm2 \n\t"
1707
"movq 8(%2, %%"REG_a",2), %%mm3 \n\t"
1708
"pand %%mm4, %%mm0 \n\t"
1709
"pand %%mm4, %%mm1 \n\t"
1710
"pand %%mm4, %%mm2 \n\t"
1711
"pand %%mm4, %%mm3 \n\t"
1712
"packuswb %%mm1, %%mm0 \n\t"
1713
"packuswb %%mm3, %%mm2 \n\t"
1714
"movq %%mm0, (%3, %%"REG_a") \n\t"
1715
"movq %%mm2, (%4, %%"REG_a") \n\t"
1716
"add $8, %%"REG_a" \n\t"
1718
: : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
1723
static av_always_inline void RENAME(nvXXtoUV)(uint8_t *dst1, uint8_t *dst2,
1724
const uint8_t *src, int width)
1727
"movq "MANGLE(bm01010101)", %%mm4 \n\t"
1728
"mov %0, %%"REG_a" \n\t"
1730
"movq (%1, %%"REG_a",2), %%mm0 \n\t"
1731
"movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1732
"movq %%mm0, %%mm2 \n\t"
1733
"movq %%mm1, %%mm3 \n\t"
1734
"pand %%mm4, %%mm0 \n\t"
1735
"pand %%mm4, %%mm1 \n\t"
1736
"psrlw $8, %%mm2 \n\t"
1737
"psrlw $8, %%mm3 \n\t"
1738
"packuswb %%mm1, %%mm0 \n\t"
1739
"packuswb %%mm3, %%mm2 \n\t"
1740
"movq %%mm0, (%2, %%"REG_a") \n\t"
1741
"movq %%mm2, (%3, %%"REG_a") \n\t"
1742
"add $8, %%"REG_a" \n\t"
1744
: : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst1+width), "r" (dst2+width)
1749
static void RENAME(nv12ToUV)(uint8_t *dstU, uint8_t *dstV,
1750
const uint8_t *src1, const uint8_t *src2,
1751
int width, uint32_t *unused)
1753
RENAME(nvXXtoUV)(dstU, dstV, src1, width);
1756
static void RENAME(nv21ToUV)(uint8_t *dstU, uint8_t *dstV,
1757
const uint8_t *src1, const uint8_t *src2,
1758
int width, uint32_t *unused)
1760
RENAME(nvXXtoUV)(dstV, dstU, src1, width);
1762
#endif /* !COMPILE_TEMPLATE_MMX2 */
1764
1364
static av_always_inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, const uint8_t *src,
1765
1365
int width, enum PixelFormat srcFormat)
1903
1503
RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
1906
#if !COMPILE_TEMPLATE_MMX2
1907
// bilinear / bicubic scaling
1908
static void RENAME(hScale)(int16_t *dst, int dstW,
1909
const uint8_t *src, int srcW,
1910
int xInc, const int16_t *filter,
1911
const int16_t *filterPos, int filterSize)
1913
assert(filterSize % 4 == 0 && filterSize>0);
1914
if (filterSize==4) { // Always true for upscaling, sometimes for down, too.
1915
x86_reg counter= -2*dstW;
1917
filterPos-= counter/2;
1921
"push %%"REG_b" \n\t"
1923
"pxor %%mm7, %%mm7 \n\t"
1924
"push %%"REG_BP" \n\t" // we use 7 regs here ...
1925
"mov %%"REG_a", %%"REG_BP" \n\t"
1928
"movzwl (%2, %%"REG_BP"), %%eax \n\t"
1929
"movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
1930
"movq (%1, %%"REG_BP", 4), %%mm1 \n\t"
1931
"movq 8(%1, %%"REG_BP", 4), %%mm3 \n\t"
1932
"movd (%3, %%"REG_a"), %%mm0 \n\t"
1933
"movd (%3, %%"REG_b"), %%mm2 \n\t"
1934
"punpcklbw %%mm7, %%mm0 \n\t"
1935
"punpcklbw %%mm7, %%mm2 \n\t"
1936
"pmaddwd %%mm1, %%mm0 \n\t"
1937
"pmaddwd %%mm2, %%mm3 \n\t"
1938
"movq %%mm0, %%mm4 \n\t"
1939
"punpckldq %%mm3, %%mm0 \n\t"
1940
"punpckhdq %%mm3, %%mm4 \n\t"
1941
"paddd %%mm4, %%mm0 \n\t"
1942
"psrad $7, %%mm0 \n\t"
1943
"packssdw %%mm0, %%mm0 \n\t"
1944
"movd %%mm0, (%4, %%"REG_BP") \n\t"
1945
"add $4, %%"REG_BP" \n\t"
1948
"pop %%"REG_BP" \n\t"
1950
"pop %%"REG_b" \n\t"
1953
: "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
1958
} else if (filterSize==8) {
1959
x86_reg counter= -2*dstW;
1961
filterPos-= counter/2;
1965
"push %%"REG_b" \n\t"
1967
"pxor %%mm7, %%mm7 \n\t"
1968
"push %%"REG_BP" \n\t" // we use 7 regs here ...
1969
"mov %%"REG_a", %%"REG_BP" \n\t"
1972
"movzwl (%2, %%"REG_BP"), %%eax \n\t"
1973
"movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
1974
"movq (%1, %%"REG_BP", 8), %%mm1 \n\t"
1975
"movq 16(%1, %%"REG_BP", 8), %%mm3 \n\t"
1976
"movd (%3, %%"REG_a"), %%mm0 \n\t"
1977
"movd (%3, %%"REG_b"), %%mm2 \n\t"
1978
"punpcklbw %%mm7, %%mm0 \n\t"
1979
"punpcklbw %%mm7, %%mm2 \n\t"
1980
"pmaddwd %%mm1, %%mm0 \n\t"
1981
"pmaddwd %%mm2, %%mm3 \n\t"
1983
"movq 8(%1, %%"REG_BP", 8), %%mm1 \n\t"
1984
"movq 24(%1, %%"REG_BP", 8), %%mm5 \n\t"
1985
"movd 4(%3, %%"REG_a"), %%mm4 \n\t"
1986
"movd 4(%3, %%"REG_b"), %%mm2 \n\t"
1987
"punpcklbw %%mm7, %%mm4 \n\t"
1988
"punpcklbw %%mm7, %%mm2 \n\t"
1989
"pmaddwd %%mm1, %%mm4 \n\t"
1990
"pmaddwd %%mm2, %%mm5 \n\t"
1991
"paddd %%mm4, %%mm0 \n\t"
1992
"paddd %%mm5, %%mm3 \n\t"
1993
"movq %%mm0, %%mm4 \n\t"
1994
"punpckldq %%mm3, %%mm0 \n\t"
1995
"punpckhdq %%mm3, %%mm4 \n\t"
1996
"paddd %%mm4, %%mm0 \n\t"
1997
"psrad $7, %%mm0 \n\t"
1998
"packssdw %%mm0, %%mm0 \n\t"
1999
"movd %%mm0, (%4, %%"REG_BP") \n\t"
2000
"add $4, %%"REG_BP" \n\t"
2003
"pop %%"REG_BP" \n\t"
2005
"pop %%"REG_b" \n\t"
2008
: "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2014
const uint8_t *offset = src+filterSize;
2015
x86_reg counter= -2*dstW;
2016
//filter-= counter*filterSize/2;
2017
filterPos-= counter/2;
2020
"pxor %%mm7, %%mm7 \n\t"
2023
"mov %2, %%"REG_c" \n\t"
2024
"movzwl (%%"REG_c", %0), %%eax \n\t"
2025
"movzwl 2(%%"REG_c", %0), %%edx \n\t"
2026
"mov %5, %%"REG_c" \n\t"
2027
"pxor %%mm4, %%mm4 \n\t"
2028
"pxor %%mm5, %%mm5 \n\t"
2030
"movq (%1), %%mm1 \n\t"
2031
"movq (%1, %6), %%mm3 \n\t"
2032
"movd (%%"REG_c", %%"REG_a"), %%mm0 \n\t"
2033
"movd (%%"REG_c", %%"REG_d"), %%mm2 \n\t"
2034
"punpcklbw %%mm7, %%mm0 \n\t"
2035
"punpcklbw %%mm7, %%mm2 \n\t"
2036
"pmaddwd %%mm1, %%mm0 \n\t"
2037
"pmaddwd %%mm2, %%mm3 \n\t"
2038
"paddd %%mm3, %%mm5 \n\t"
2039
"paddd %%mm0, %%mm4 \n\t"
2041
"add $4, %%"REG_c" \n\t"
2042
"cmp %4, %%"REG_c" \n\t"
2045
"movq %%mm4, %%mm0 \n\t"
2046
"punpckldq %%mm5, %%mm4 \n\t"
2047
"punpckhdq %%mm5, %%mm0 \n\t"
2048
"paddd %%mm0, %%mm4 \n\t"
2049
"psrad $7, %%mm4 \n\t"
2050
"packssdw %%mm4, %%mm4 \n\t"
2051
"mov %3, %%"REG_a" \n\t"
2052
"movd %%mm4, (%%"REG_a", %0) \n\t"
2056
: "+r" (counter), "+r" (filter)
2057
: "m" (filterPos), "m" (dst), "m"(offset),
2058
"m" (src), "r" ((x86_reg)filterSize*2)
2059
: "%"REG_a, "%"REG_c, "%"REG_d
2063
#endif /* !COMPILE_TEMPLATE_MMX2 */
2065
1506
#if COMPILE_TEMPLATE_MMX2
2066
1507
static void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
2067
1508
int dstWidth, const uint8_t *src,