80
80
ASMALIGN(4) /* FIXME Unroll? */\
82
82
"movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
83
"movq " #x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
84
"movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* srcData */\
83
"movq " x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
84
"movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* srcData */\
85
85
"add $16, %%"REG_d" \n\t"\
86
86
"mov (%%"REG_d"), %%"REG_S" \n\t"\
87
87
"test %%"REG_S", %%"REG_S" \n\t"\
117
117
"mov (%%"REG_d"), %%"REG_S" \n\t"\
120
"movq " #x "(%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* srcData */\
121
"movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
120
"movq " x "(%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* srcData */\
121
"movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
122
122
"mov 4(%%"REG_d"), %%"REG_S" \n\t"\
123
"movq " #x "(%%"REG_S", %%"REG_a", 2), %%mm1 \n\t" /* srcData */\
123
"movq " x "(%%"REG_S", %%"REG_a", 2), %%mm1 \n\t" /* srcData */\
124
124
"movq %%mm0, %%mm3 \n\t"\
125
125
"punpcklwd %%mm1, %%mm0 \n\t"\
126
126
"punpckhwd %%mm1, %%mm3 \n\t"\
388
388
"pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
389
389
"pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
390
390
"psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
391
"movq 4096(%2, %%"REG_a",2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
391
"movq "AV_STRINGIFY(VOF)"(%2, %%"REG_a",2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
392
392
"psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
393
393
"paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
394
"movq 4096(%3, %%"REG_a",2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\
394
"movq "AV_STRINGIFY(VOF)"(%3, %%"REG_a",2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\
395
395
"paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
396
396
"psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
397
397
"psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
435
435
"movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
436
436
"movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
437
"movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
438
"movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
437
"movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
438
"movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
439
439
"psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
440
440
"psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
441
441
"movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
467
467
"movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
468
468
"movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
469
"movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
470
"movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
469
"movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
470
"movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
471
471
"psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
472
472
"psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
473
473
"movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
597
597
"movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
598
598
"movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
599
"movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
600
"movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
599
"movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
600
"movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
601
601
"paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
602
602
"paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
603
603
"psrlw $8, %%mm3 \n\t" \
616
616
"movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
617
617
"movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
618
"movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
619
"movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
618
"movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
619
"movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
620
620
"paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
621
621
"paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
622
622
"psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
856
856
#define WRITEBGR24MMX2(dst, dstw, index) \
857
857
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
858
"movq "MANGLE(M24A)", %%mm0 \n\t"\
859
"movq "MANGLE(M24C)", %%mm7 \n\t"\
858
"movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
859
"movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
860
860
"pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
861
861
"pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
862
862
"pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
935
935
if (c->flags & SWS_ACCURATE_RND){
937
YSCALEYUV2YV12X_ACCURATE( 0, CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
938
YSCALEYUV2YV12X_ACCURATE(4096, CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
937
YSCALEYUV2YV12X_ACCURATE( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
938
YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
941
YSCALEYUV2YV12X_ACCURATE(0, LUM_MMX_FILTER_OFFSET, dest, dstW)
941
YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
944
YSCALEYUV2YV12X( 0, CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
945
YSCALEYUV2YV12X(4096, CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
944
YSCALEYUV2YV12X( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
945
YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
948
YSCALEYUV2YV12X(0, LUM_MMX_FILTER_OFFSET, dest, dstW)
948
YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
951
951
#ifdef HAVE_ALTIVEC
2019
2020
"punpcklbw %%mm7, %%mm5 \n\t"
2020
2021
"punpcklbw %%mm7, %%mm2 \n\t"
2021
2022
"paddw %%mm5, %%mm2 \n\t"
2022
"movq "MANGLE(w1111)", %%mm5 \n\t"
2023
"movq "MANGLE(ff_w1111)", %%mm5 \n\t"
2023
2024
"psrlw $2, %%mm4 \n\t"
2024
2025
"psrlw $2, %%mm2 \n\t"
2026
"movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
2027
"movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
2027
"movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
2028
"movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"
2029
2030
"pmaddwd %%mm4, %%mm1 \n\t"
2030
2031
"pmaddwd %%mm2, %%mm3 \n\t"
2691
2692
if (srcFormat==PIX_FMT_YUYV422)
2693
RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2694
RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2694
2695
src1= formatConvBuffer;
2695
src2= formatConvBuffer+2048;
2696
src2= formatConvBuffer+VOFW;
2697
2698
else if (srcFormat==PIX_FMT_UYVY422)
2699
RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2700
RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2700
2701
src1= formatConvBuffer;
2701
src2= formatConvBuffer+2048;
2702
src2= formatConvBuffer+VOFW;
2703
2704
else if (srcFormat==PIX_FMT_RGB32)
2705
RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2706
RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2706
2707
src1= formatConvBuffer;
2707
src2= formatConvBuffer+2048;
2708
src2= formatConvBuffer+VOFW;
2709
2710
else if (srcFormat==PIX_FMT_BGR24)
2711
RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2712
RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2712
2713
src1= formatConvBuffer;
2713
src2= formatConvBuffer+2048;
2714
src2= formatConvBuffer+VOFW;
2715
2716
else if (srcFormat==PIX_FMT_BGR565)
2717
RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2718
RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2718
2719
src1= formatConvBuffer;
2719
src2= formatConvBuffer+2048;
2720
src2= formatConvBuffer+VOFW;
2721
2722
else if (srcFormat==PIX_FMT_BGR555)
2723
RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2724
RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2724
2725
src1= formatConvBuffer;
2725
src2= formatConvBuffer+2048;
2726
src2= formatConvBuffer+VOFW;
2727
2728
else if (srcFormat==PIX_FMT_BGR32)
2729
RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2730
RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2730
2731
src1= formatConvBuffer;
2731
src2= formatConvBuffer+2048;
2732
src2= formatConvBuffer+VOFW;
2733
2734
else if (srcFormat==PIX_FMT_RGB24)
2735
RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2736
RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2736
2737
src1= formatConvBuffer;
2737
src2= formatConvBuffer+2048;
2738
src2= formatConvBuffer+VOFW;
2739
2740
else if (srcFormat==PIX_FMT_RGB565)
2741
RENAME(rgb16ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2742
RENAME(rgb16ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2742
2743
src1= formatConvBuffer;
2743
src2= formatConvBuffer+2048;
2744
src2= formatConvBuffer+VOFW;
2745
2746
else if (srcFormat==PIX_FMT_RGB555)
2747
RENAME(rgb15ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2748
RENAME(rgb15ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2748
2749
src1= formatConvBuffer;
2749
src2= formatConvBuffer+2048;
2750
src2= formatConvBuffer+VOFW;
2751
2752
else if (isGray(srcFormat))
2755
2756
else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE || srcFormat==PIX_FMT_RGB4_BYTE)
2757
RENAME(palToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW, pal);
2758
RENAME(palToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2758
2759
src1= formatConvBuffer;
2759
src2= formatConvBuffer+2048;
2760
src2= formatConvBuffer+VOFW;
2762
2763
#ifdef HAVE_MMX
2763
// use the new MMX scaler if the mmx2 can't be used (its faster than the x86asm one)
2764
// use the new MMX scaler if the mmx2 can't be used (it is faster than the x86 ASM one)
2764
2765
if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2766
2767
if (!(flags&SWS_FAST_BILINEAR))
2769
2770
RENAME(hScale)(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2770
RENAME(hScale)(dst+2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2771
RENAME(hScale)(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2772
2773
else // Fast Bilinear upscale / crap downscale
3118
3119
lastInChrBuf++;
3120
3121
//wrap buf index around to stay inside the ring buffer
3121
if (lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
3122
if (chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
3122
if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
3123
if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
3123
3124
break; //we can't output a dstY line so let's try with the next slice
3126
3127
#ifdef HAVE_MMX
3127
b5Dither= dither8[dstY&1];
3128
g6Dither= dither4[dstY&1];
3129
g5Dither= dither8[dstY&1];
3130
r5Dither= dither8[(dstY+1)&1];
3128
b5Dither= ff_dither8[dstY&1];
3129
g6Dither= ff_dither4[dstY&1];
3130
g5Dither= ff_dither8[dstY&1];
3131
r5Dither= ff_dither8[(dstY+1)&1];
3132
3133
if (dstY < dstH-2)