~medibuntu-maintainers/mplayer/medibuntu.precise

« back to all changes in this revision

Viewing changes to ffmpeg/libswscale/x86/swscale_template.c

  • Committer: Gauvain Pocentek
  • Date: 2012-03-06 11:59:12 UTC
  • mfrom: (66.1.15 precise)
  • Revision ID: gauvain@pocentek.net-20120306115912-h9d6kt9j0l532oo5
* Merge from Ubuntu:
  - put back faac support
  - recommends apport-hooks-medibuntu
  - change Maintainer, Uploaders & Vcs-* fields.
* New upstream snapshot
* upload to unstable
* Build against external libmpeg2
* drop 51_FTBFS_arm.patch again
* no longer build depend on libcdparanoia-dev on the Hurd
* Fix FTBFS on the hurd.
  Thanks to Samuel Thibault <sthibault@debian.org> (Closes: #654974)
* Fix FTBFS on arm
* New upstream snapshot, Closes: #650339, #643621, #481807
* Imported Upstream version 1.0~rc4+svn34492
* Bump standards version
* Bump dependency on libav >= 4:0.8~, Closes: #653887
* Fix build-indep
* Build mplayer-gui again, Closes: #568514
* Drop debian/all-lang-config-mak.sh, no longer needed
* include .dfsg1 in version number
* remove get-orig-source target
* no longer prune compiler flags from the environment
* No longer advertise nor build 3dfx, mga and dxr3 backends,
  Closes: #496106, #442181, #533546
* beautify mplayer version identification string
* Brown paperbag upload.
* Next try to fix build failure on sparc after recent binutils change.
* Brown paperbag upload.
* Really fix build failure on sparc after recent binutils change.
* Properly set Replaces/Conflicts on mplayer2{,-dbg} to avoid
  file overwrite errors.
* Adjust versioning of mplayer listed in the mplayer-dbg's Depends field.
* Fix build failure on sparc after recent binutils change.
* Urgency medium bumped because of RC-level bugfix
  and speeding up x264 transition.
* Update to my @debian.org email.
* Upload to unstable
* Enable joystick support on Linux only, Closes: #638408
* Rebuild fixes toolchain issue on arm, Closes: #637077
* New upstream snapshot
* following the discussion started by Diego Biurrun <diego@biurrun.de>
  in debian-devel, I have prepared a new packaging of 'mplayer'
  (with code that comes from CVS)
* the upstream tar.bz cannot be distributed by Debian, since it contains
   CSS code; so I am repackaging it 
* I have tried my best to address all known issues:
  - the package contains the detailed Copyright made by Diego Biurrun 
  - the package does not contain CSS code, or  AFAIK other code on which 
     there is active patent enforcement
  - there is a script  debian/cvs-changelog.sh  that shows all changes
     done to files included in this source.
    This should comply with GPLv2 sec 2.a  (in spirit if not in letter)
    For this reason, the source code contains CVS directories.
* needs   make (>= 3.80) for 'html-chunked-$(1)' in DOCS/xml/Makefile

* some corrections, as suggested by Diego Biurrun
  - binary codecs should go into /usr/lib/codecs (upstream default)
  - better template 'mplayer/install_codecs'
  - an empty 'font=' in mplayer.conf breaks mplayer: postinst corrected
* correction in 'mplayer/cfgnote'
* better mplayer.postinst and mplayer.config

* New upstream release
* better debian/copyright file
* do not ship a skin
* New upstream release
* changed DEB_BUILD_OPTIONS to DEB_BUILD_CONFIGURE ,
  DEB_BUILD_OPTIONS is used as in debian policy
* use gcc-3.4
* changed xlibs-dev to a long list of dependencies, for Debian/etch
* try to adhere to  http://www.mplayerhq.hu/DOCS/tech/binary-packaging.txt
  (see README.Debian for details)
* removed dependency on xlibmesa-dev, disabled opengl
* New upstream release
* Simon McVittie <hacks@pseudorandom.co.uk> wonderful work:
- Work around Debian bug #267442 (glibc's sys/uio.h and gcc's altivec.h have
  conflicting uses for __vector) by re-ordering #includes
- Fix potential symlink attack in ./configure
- Disable support for binary codecs on platforms for which those codecs
  aren't available; also disable the corresponding Debconf note when it's
  inappropriate
- Changed Build-Depends: so it works in pbuilder
- Explicitly build-depend on libjpeg62-dev, libfontconfig1-dev,
  libungif4-dev 
- Tweak debian/rules to avoid certain errors being ignored
- Use --language=all
* provide a target  'debian/rules get-orig-source' 
  that recreates the orig.tar.gz ; then use the above orig.tar.gz
* rewrote some parts of debian/rules
* don't clean and recompile docs if upstream ships them
* mplayer-doc was shipping too much stuff
* translated man pages were not installed properly
* compile with libdv4-dev
* correct README.Debian
* Forgot build-dep on libtheora
* Must not depend on libxvidcore
* New upstream release
* new release.
* rc1 to become 0.90
* new pre-release
* new pre-release
* gtk bug fixed.
* new release.
* version bumped
* 0.60 pre2 release
* 0.60 pre-release.

Show diffs side-by-side

added added

removed removed

Lines of Context:
35
35
#endif
36
36
#define MOVNTQ(a,b)  REAL_MOVNTQ(a,b)
37
37
 
38
 
#define YSCALEYUV2YV12X(offset, dest, end, pos) \
39
 
    __asm__ volatile(\
40
 
        "movq             "VROUNDER_OFFSET"(%0), %%mm3      \n\t"\
41
 
        "movq                             %%mm3, %%mm4      \n\t"\
42
 
        "lea                     " offset "(%0), %%"REG_d"  \n\t"\
43
 
        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
44
 
        ".p2align                             4             \n\t" /* FIXME Unroll? */\
45
 
        "1:                                                 \n\t"\
46
 
        "movq                      8(%%"REG_d"), %%mm0      \n\t" /* filterCoeff */\
47
 
        "movq                (%%"REG_S", %3, 2), %%mm2      \n\t" /* srcData */\
48
 
        "movq               8(%%"REG_S", %3, 2), %%mm5      \n\t" /* srcData */\
49
 
        "add                                $16, %%"REG_d"  \n\t"\
50
 
        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
51
 
        "test                         %%"REG_S", %%"REG_S"  \n\t"\
52
 
        "pmulhw                           %%mm0, %%mm2      \n\t"\
53
 
        "pmulhw                           %%mm0, %%mm5      \n\t"\
54
 
        "paddw                            %%mm2, %%mm3      \n\t"\
55
 
        "paddw                            %%mm5, %%mm4      \n\t"\
56
 
        " jnz                                1b             \n\t"\
57
 
        "psraw                               $3, %%mm3      \n\t"\
58
 
        "psraw                               $3, %%mm4      \n\t"\
59
 
        "packuswb                         %%mm4, %%mm3      \n\t"\
60
 
        MOVNTQ(%%mm3, (%1, %3))\
61
 
        "add                                 $8, %3         \n\t"\
62
 
        "cmp                                 %2, %3         \n\t"\
63
 
        "movq             "VROUNDER_OFFSET"(%0), %%mm3      \n\t"\
64
 
        "movq                             %%mm3, %%mm4      \n\t"\
65
 
        "lea                     " offset "(%0), %%"REG_d"  \n\t"\
66
 
        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
67
 
        "jb                                  1b             \n\t"\
68
 
        :: "r" (&c->redDither),\
69
 
           "r" (dest), "g" ((x86_reg)(end)), "r"((x86_reg)(pos))\
70
 
        : "%"REG_d, "%"REG_S\
71
 
    );
72
 
 
73
 
static void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter,
74
 
                             const int16_t **lumSrc, int lumFilterSize,
75
 
                             const int16_t *chrFilter, const int16_t **chrUSrc,
76
 
                             const int16_t **chrVSrc,
77
 
                             int chrFilterSize, const int16_t **alpSrc,
78
 
                             uint8_t *dest, uint8_t *uDest, uint8_t *vDest,
79
 
                             uint8_t *aDest, int dstW, int chrDstW)
80
 
{
81
 
    if (uDest) {
82
 
        x86_reg uv_off = c->uv_off;
83
 
        YSCALEYUV2YV12X(CHR_MMX_FILTER_OFFSET, uDest, chrDstW, 0)
84
 
        YSCALEYUV2YV12X(CHR_MMX_FILTER_OFFSET, vDest - uv_off, chrDstW + uv_off, uv_off)
85
 
    }
86
 
    if (CONFIG_SWSCALE_ALPHA && aDest) {
87
 
        YSCALEYUV2YV12X(ALP_MMX_FILTER_OFFSET, aDest, dstW, 0)
88
 
    }
89
 
 
90
 
    YSCALEYUV2YV12X(LUM_MMX_FILTER_OFFSET, dest, dstW, 0)
91
 
}
92
 
 
93
 
#define YSCALEYUV2YV12X_ACCURATE(offset, dest, end, pos) \
94
 
    __asm__ volatile(\
95
 
        "lea                     " offset "(%0), %%"REG_d"  \n\t"\
96
 
        "pxor                             %%mm4, %%mm4      \n\t"\
97
 
        "pxor                             %%mm5, %%mm5      \n\t"\
98
 
        "pxor                             %%mm6, %%mm6      \n\t"\
99
 
        "pxor                             %%mm7, %%mm7      \n\t"\
100
 
        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
101
 
        ".p2align                             4             \n\t"\
102
 
        "1:                                                 \n\t"\
103
 
        "movq                (%%"REG_S", %3, 2), %%mm0      \n\t" /* srcData */\
104
 
        "movq               8(%%"REG_S", %3, 2), %%mm2      \n\t" /* srcData */\
105
 
        "mov        "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"  \n\t"\
106
 
        "movq                (%%"REG_S", %3, 2), %%mm1      \n\t" /* srcData */\
107
 
        "movq                             %%mm0, %%mm3      \n\t"\
108
 
        "punpcklwd                        %%mm1, %%mm0      \n\t"\
109
 
        "punpckhwd                        %%mm1, %%mm3      \n\t"\
110
 
        "movq       "STR(APCK_COEF)"(%%"REG_d"), %%mm1      \n\t" /* filterCoeff */\
111
 
        "pmaddwd                          %%mm1, %%mm0      \n\t"\
112
 
        "pmaddwd                          %%mm1, %%mm3      \n\t"\
113
 
        "paddd                            %%mm0, %%mm4      \n\t"\
114
 
        "paddd                            %%mm3, %%mm5      \n\t"\
115
 
        "movq               8(%%"REG_S", %3, 2), %%mm3      \n\t" /* srcData */\
116
 
        "mov        "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"  \n\t"\
117
 
        "add                  $"STR(APCK_SIZE)", %%"REG_d"  \n\t"\
118
 
        "test                         %%"REG_S", %%"REG_S"  \n\t"\
119
 
        "movq                             %%mm2, %%mm0      \n\t"\
120
 
        "punpcklwd                        %%mm3, %%mm2      \n\t"\
121
 
        "punpckhwd                        %%mm3, %%mm0      \n\t"\
122
 
        "pmaddwd                          %%mm1, %%mm2      \n\t"\
123
 
        "pmaddwd                          %%mm1, %%mm0      \n\t"\
124
 
        "paddd                            %%mm2, %%mm6      \n\t"\
125
 
        "paddd                            %%mm0, %%mm7      \n\t"\
126
 
        " jnz                                1b             \n\t"\
127
 
        "psrad                              $16, %%mm4      \n\t"\
128
 
        "psrad                              $16, %%mm5      \n\t"\
129
 
        "psrad                              $16, %%mm6      \n\t"\
130
 
        "psrad                              $16, %%mm7      \n\t"\
131
 
        "movq             "VROUNDER_OFFSET"(%0), %%mm0      \n\t"\
132
 
        "packssdw                         %%mm5, %%mm4      \n\t"\
133
 
        "packssdw                         %%mm7, %%mm6      \n\t"\
134
 
        "paddw                            %%mm0, %%mm4      \n\t"\
135
 
        "paddw                            %%mm0, %%mm6      \n\t"\
136
 
        "psraw                               $3, %%mm4      \n\t"\
137
 
        "psraw                               $3, %%mm6      \n\t"\
138
 
        "packuswb                         %%mm6, %%mm4      \n\t"\
139
 
        MOVNTQ(%%mm4, (%1, %3))\
140
 
        "add                                 $8, %3         \n\t"\
141
 
        "cmp                                 %2, %3         \n\t"\
142
 
        "lea                     " offset "(%0), %%"REG_d"  \n\t"\
143
 
        "pxor                             %%mm4, %%mm4      \n\t"\
144
 
        "pxor                             %%mm5, %%mm5      \n\t"\
145
 
        "pxor                             %%mm6, %%mm6      \n\t"\
146
 
        "pxor                             %%mm7, %%mm7      \n\t"\
147
 
        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
148
 
        "jb                                  1b             \n\t"\
149
 
        :: "r" (&c->redDither),\
150
 
        "r" (dest), "g" ((x86_reg)(end)), "r"((x86_reg)(pos))\
151
 
        : "%"REG_a, "%"REG_d, "%"REG_S\
152
 
    );
153
 
 
154
 
static void RENAME(yuv2yuvX_ar)(SwsContext *c, const int16_t *lumFilter,
155
 
                                const int16_t **lumSrc, int lumFilterSize,
156
 
                                const int16_t *chrFilter, const int16_t **chrUSrc,
157
 
                                const int16_t **chrVSrc,
158
 
                                int chrFilterSize, const int16_t **alpSrc,
159
 
                                uint8_t *dest, uint8_t *uDest, uint8_t *vDest,
160
 
                                uint8_t *aDest, int dstW, int chrDstW)
161
 
{
162
 
    if (uDest) {
163
 
        x86_reg uv_off = c->uv_off;
164
 
        YSCALEYUV2YV12X_ACCURATE(CHR_MMX_FILTER_OFFSET, uDest, chrDstW, 0)
165
 
        YSCALEYUV2YV12X_ACCURATE(CHR_MMX_FILTER_OFFSET, vDest - uv_off, chrDstW + uv_off, uv_off)
166
 
    }
167
 
    if (CONFIG_SWSCALE_ALPHA && aDest) {
168
 
        YSCALEYUV2YV12X_ACCURATE(ALP_MMX_FILTER_OFFSET, aDest, dstW, 0)
169
 
    }
170
 
 
171
 
    YSCALEYUV2YV12X_ACCURATE(LUM_MMX_FILTER_OFFSET, dest, dstW, 0)
172
 
}
173
 
 
174
 
static void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc,
175
 
                             const int16_t *chrUSrc, const int16_t *chrVSrc,
176
 
                             const int16_t *alpSrc,
177
 
                             uint8_t *dest, uint8_t *uDest, uint8_t *vDest,
178
 
                             uint8_t *aDest, int dstW, int chrDstW)
179
 
{
180
 
    int p= 4;
181
 
    const int16_t *src[4]= { alpSrc + dstW, lumSrc + dstW, chrUSrc + chrDstW, chrVSrc + chrDstW };
182
 
    uint8_t *dst[4]= { aDest, dest, uDest, vDest };
183
 
    x86_reg counter[4]= { dstW, dstW, chrDstW, chrDstW };
184
 
 
185
 
    while (p--) {
186
 
        if (dst[p]) {
187
 
            __asm__ volatile(
188
 
                "mov %2, %%"REG_a"                    \n\t"
189
 
                ".p2align               4             \n\t" /* FIXME Unroll? */
190
 
                "1:                                   \n\t"
191
 
                "movq  (%0, %%"REG_a", 2), %%mm0      \n\t"
192
 
                "movq 8(%0, %%"REG_a", 2), %%mm1      \n\t"
193
 
                "psraw                 $7, %%mm0      \n\t"
194
 
                "psraw                 $7, %%mm1      \n\t"
195
 
                "packuswb           %%mm1, %%mm0      \n\t"
196
 
                MOVNTQ(%%mm0, (%1, %%REGa))
197
 
                "add                   $8, %%"REG_a"  \n\t"
198
 
                "jnc                   1b             \n\t"
199
 
                :: "r" (src[p]), "r" (dst[p] + counter[p]),
200
 
                   "g" (-counter[p])
201
 
                : "%"REG_a
202
 
            );
203
 
        }
204
 
    }
205
 
}
206
 
 
207
 
static void RENAME(yuv2yuv1_ar)(SwsContext *c, const int16_t *lumSrc,
208
 
                                const int16_t *chrUSrc, const int16_t *chrVSrc,
209
 
                                const int16_t *alpSrc,
210
 
                                uint8_t *dest, uint8_t *uDest, uint8_t *vDest,
211
 
                                uint8_t *aDest, int dstW, int chrDstW)
212
 
{
213
 
    int p= 4;
214
 
    const int16_t *src[4]= { alpSrc + dstW, lumSrc + dstW, chrUSrc + chrDstW, chrVSrc + chrDstW };
215
 
    uint8_t *dst[4]= { aDest, dest, uDest, vDest };
216
 
    x86_reg counter[4]= { dstW, dstW, chrDstW, chrDstW };
217
 
 
218
 
    while (p--) {
219
 
        if (dst[p]) {
220
 
            __asm__ volatile(
221
 
                "mov %2, %%"REG_a"                    \n\t"
222
 
                "pcmpeqw %%mm7, %%mm7                 \n\t"
223
 
                "psrlw                 $15, %%mm7     \n\t"
224
 
                "psllw                  $6, %%mm7     \n\t"
225
 
                ".p2align                4            \n\t" /* FIXME Unroll? */
226
 
                "1:                                   \n\t"
227
 
                "movq  (%0, %%"REG_a", 2), %%mm0      \n\t"
228
 
                "movq 8(%0, %%"REG_a", 2), %%mm1      \n\t"
229
 
                "paddsw             %%mm7, %%mm0      \n\t"
230
 
                "paddsw             %%mm7, %%mm1      \n\t"
231
 
                "psraw                 $7, %%mm0      \n\t"
232
 
                "psraw                 $7, %%mm1      \n\t"
233
 
                "packuswb           %%mm1, %%mm0      \n\t"
234
 
                MOVNTQ(%%mm0, (%1, %%REGa))
235
 
                "add                   $8, %%"REG_a"  \n\t"
236
 
                "jnc                   1b             \n\t"
237
 
                :: "r" (src[p]), "r" (dst[p] + counter[p]),
238
 
                   "g" (-counter[p])
239
 
                : "%"REG_a
240
 
            );
241
 
        }
242
 
    }
243
 
}
244
 
 
245
38
#define YSCALEYUV2PACKEDX_UV \
246
39
    __asm__ volatile(\
247
40
        "xor                   %%"REG_a", %%"REG_a"     \n\t"\
467
260
{
468
261
    x86_reg dummy=0;
469
262
    x86_reg dstW_reg = dstW;
470
 
    x86_reg uv_off = c->uv_off << 1;
 
263
    x86_reg uv_off = c->uv_off_byte;
471
264
 
472
265
    if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
473
266
        YSCALEYUV2PACKEDX_ACCURATE
500
293
{
501
294
    x86_reg dummy=0;
502
295
    x86_reg dstW_reg = dstW;
503
 
    x86_reg uv_off = c->uv_off << 1;
 
296
    x86_reg uv_off = c->uv_off_byte;
504
297
 
505
298
    if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
506
299
        YSCALEYUV2PACKEDX
557
350
{
558
351
    x86_reg dummy=0;
559
352
    x86_reg dstW_reg = dstW;
560
 
    x86_reg uv_off = c->uv_off << 1;
 
353
    x86_reg uv_off = c->uv_off_byte;
561
354
 
562
355
    YSCALEYUV2PACKEDX_ACCURATE
563
356
    YSCALEYUV2RGBX
581
374
{
582
375
    x86_reg dummy=0;
583
376
    x86_reg dstW_reg = dstW;
584
 
    x86_reg uv_off = c->uv_off << 1;
 
377
    x86_reg uv_off = c->uv_off_byte;
585
378
 
586
379
    YSCALEYUV2PACKEDX
587
380
    YSCALEYUV2RGBX
634
427
{
635
428
    x86_reg dummy=0;
636
429
    x86_reg dstW_reg = dstW;
637
 
    x86_reg uv_off = c->uv_off << 1;
 
430
    x86_reg uv_off = c->uv_off_byte;
638
431
 
639
432
    YSCALEYUV2PACKEDX_ACCURATE
640
433
    YSCALEYUV2RGBX
658
451
{
659
452
    x86_reg dummy=0;
660
453
    x86_reg dstW_reg = dstW;
661
 
    x86_reg uv_off = c->uv_off << 1;
 
454
    x86_reg uv_off = c->uv_off_byte;
662
455
 
663
456
    YSCALEYUV2PACKEDX
664
457
    YSCALEYUV2RGBX
791
584
{
792
585
    x86_reg dummy=0;
793
586
    x86_reg dstW_reg = dstW;
794
 
    x86_reg uv_off = c->uv_off << 1;
 
587
    x86_reg uv_off = c->uv_off_byte;
795
588
 
796
589
    YSCALEYUV2PACKEDX_ACCURATE
797
590
    YSCALEYUV2RGBX
815
608
{
816
609
    x86_reg dummy=0;
817
610
    x86_reg dstW_reg = dstW;
818
 
    x86_reg uv_off = c->uv_off << 1;
 
611
    x86_reg uv_off = c->uv_off_byte;
819
612
 
820
613
    YSCALEYUV2PACKEDX
821
614
    YSCALEYUV2RGBX
856
649
{
857
650
    x86_reg dummy=0;
858
651
    x86_reg dstW_reg = dstW;
859
 
    x86_reg uv_off = c->uv_off << 1;
 
652
    x86_reg uv_off = c->uv_off_byte;
860
653
 
861
654
    YSCALEYUV2PACKEDX_ACCURATE
862
655
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
877
670
{
878
671
    x86_reg dummy=0;
879
672
    x86_reg dstW_reg = dstW;
880
 
    x86_reg uv_off = c->uv_off << 1;
 
673
    x86_reg uv_off = c->uv_off_byte;
881
674
 
882
675
    YSCALEYUV2PACKEDX
883
676
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
895
688
    "1:                                 \n\t"\
896
689
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
897
690
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
898
 
    "add           "UV_OFFx2"("#c"), "#index"  \n\t" \
 
691
    "add   "UV_OFF_PX"("#c"), "#index"  \n\t" \
899
692
    "movq     (%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
900
693
    "movq     (%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
901
 
    "sub           "UV_OFFx2"("#c"), "#index"  \n\t" \
 
694
    "sub   "UV_OFF_PX"("#c"), "#index"  \n\t" \
902
695
    "psubw             %%mm3, %%mm2     \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
903
696
    "psubw             %%mm4, %%mm5     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
904
697
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0    \n\t"\
969
762
/**
970
763
 * vertical bilinear scale YV12 to RGB
971
764
 */
972
 
static void RENAME(yuv2rgb32_2)(SwsContext *c, const uint16_t *buf0,
973
 
                                const uint16_t *buf1, const uint16_t *ubuf0,
974
 
                                const uint16_t *ubuf1, const uint16_t *vbuf0,
975
 
                                const uint16_t *vbuf1, const uint16_t *abuf0,
976
 
                                const uint16_t *abuf1, uint8_t *dest,
 
765
static void RENAME(yuv2rgb32_2)(SwsContext *c, const int16_t *buf[2],
 
766
                                const int16_t *ubuf[2], const int16_t *vbuf[2],
 
767
                                const int16_t *abuf[2], uint8_t *dest,
977
768
                                int dstW, int yalpha, int uvalpha, int y)
978
769
{
 
770
    const int16_t *buf0  = buf[0],  *buf1  = buf[1],
 
771
                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
 
772
 
979
773
    if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
 
774
        const int16_t *abuf0 = abuf[0], *abuf1 = abuf[1];
980
775
#if ARCH_X86_64
981
776
        __asm__ volatile(
982
777
            YSCALEYUV2RGB(%%r8, %5)
1031
826
    }
1032
827
}
1033
828
 
1034
 
static void RENAME(yuv2bgr24_2)(SwsContext *c, const uint16_t *buf0,
1035
 
                                const uint16_t *buf1, const uint16_t *ubuf0,
1036
 
                                const uint16_t *ubuf1, const uint16_t *vbuf0,
1037
 
                                const uint16_t *vbuf1, const uint16_t *abuf0,
1038
 
                                const uint16_t *abuf1, uint8_t *dest,
 
829
static void RENAME(yuv2bgr24_2)(SwsContext *c, const int16_t *buf[2],
 
830
                                const int16_t *ubuf[2], const int16_t *vbuf[2],
 
831
                                const int16_t *abuf[2], uint8_t *dest,
1039
832
                                int dstW, int yalpha, int uvalpha, int y)
1040
833
{
 
834
    const int16_t *buf0  = buf[0],  *buf1  = buf[1],
 
835
                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
 
836
 
1041
837
    //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1042
838
    __asm__ volatile(
1043
839
        "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1053
849
    );
1054
850
}
1055
851
 
1056
 
static void RENAME(yuv2rgb555_2)(SwsContext *c, const uint16_t *buf0,
1057
 
                                 const uint16_t *buf1, const uint16_t *ubuf0,
1058
 
                                 const uint16_t *ubuf1, const uint16_t *vbuf0,
1059
 
                                 const uint16_t *vbuf1, const uint16_t *abuf0,
1060
 
                                 const uint16_t *abuf1, uint8_t *dest,
 
852
static void RENAME(yuv2rgb555_2)(SwsContext *c, const int16_t *buf[2],
 
853
                                 const int16_t *ubuf[2], const int16_t *vbuf[2],
 
854
                                 const int16_t *abuf[2], uint8_t *dest,
1061
855
                                 int dstW, int yalpha, int uvalpha, int y)
1062
856
{
 
857
    const int16_t *buf0  = buf[0],  *buf1  = buf[1],
 
858
                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
 
859
 
1063
860
    //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1064
861
    __asm__ volatile(
1065
862
        "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1081
878
    );
1082
879
}
1083
880
 
1084
 
static void RENAME(yuv2rgb565_2)(SwsContext *c, const uint16_t *buf0,
1085
 
                                 const uint16_t *buf1, const uint16_t *ubuf0,
1086
 
                                 const uint16_t *ubuf1, const uint16_t *vbuf0,
1087
 
                                 const uint16_t *vbuf1, const uint16_t *abuf0,
1088
 
                                 const uint16_t *abuf1, uint8_t *dest,
 
881
static void RENAME(yuv2rgb565_2)(SwsContext *c, const int16_t *buf[2],
 
882
                                 const int16_t *ubuf[2], const int16_t *vbuf[2],
 
883
                                 const int16_t *abuf[2], uint8_t *dest,
1089
884
                                 int dstW, int yalpha, int uvalpha, int y)
1090
885
{
 
886
    const int16_t *buf0  = buf[0],  *buf1  = buf[1],
 
887
                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
 
888
 
1091
889
    //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1092
890
    __asm__ volatile(
1093
891
        "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1121
919
    "1:                                 \n\t"\
1122
920
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
1123
921
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
1124
 
    "add           "UV_OFFx2"("#c"), "#index"  \n\t" \
 
922
    "add   "UV_OFF_PX"("#c"), "#index"  \n\t" \
1125
923
    "movq     (%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
1126
924
    "movq     (%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
1127
 
    "sub           "UV_OFFx2"("#c"), "#index"  \n\t" \
 
925
    "sub   "UV_OFF_PX"("#c"), "#index"  \n\t" \
1128
926
    "psubw             %%mm3, %%mm2     \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
1129
927
    "psubw             %%mm4, %%mm5     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
1130
928
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0    \n\t"\
1149
947
 
1150
948
#define YSCALEYUV2PACKED(index, c)  REAL_YSCALEYUV2PACKED(index, c)
1151
949
 
1152
 
static void RENAME(yuv2yuyv422_2)(SwsContext *c, const uint16_t *buf0,
1153
 
                                  const uint16_t *buf1, const uint16_t *ubuf0,
1154
 
                                  const uint16_t *ubuf1, const uint16_t *vbuf0,
1155
 
                                  const uint16_t *vbuf1, const uint16_t *abuf0,
1156
 
                                  const uint16_t *abuf1, uint8_t *dest,
 
950
static void RENAME(yuv2yuyv422_2)(SwsContext *c, const int16_t *buf[2],
 
951
                                  const int16_t *ubuf[2], const int16_t *vbuf[2],
 
952
                                  const int16_t *abuf[2], uint8_t *dest,
1157
953
                                  int dstW, int yalpha, int uvalpha, int y)
1158
954
{
 
955
    const int16_t *buf0  = buf[0],  *buf1  = buf[1],
 
956
                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
 
957
 
1159
958
    //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1160
959
    __asm__ volatile(
1161
960
        "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1175
974
    ".p2align              4            \n\t"\
1176
975
    "1:                                 \n\t"\
1177
976
    "movq     (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
1178
 
    "add           "UV_OFFx2"("#c"), "#index"  \n\t" \
 
977
    "add   "UV_OFF_PX"("#c"), "#index"  \n\t" \
1179
978
    "movq     (%2, "#index"), %%mm4     \n\t" /* uvbuf0[eax+2048]*/\
1180
 
    "sub           "UV_OFFx2"("#c"), "#index"  \n\t" \
 
979
    "sub   "UV_OFF_PX"("#c"), "#index"  \n\t" \
1181
980
    "psraw                $4, %%mm3     \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
1182
981
    "psraw                $4, %%mm4     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
1183
982
    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
1228
1027
    "1:                                 \n\t"\
1229
1028
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
1230
1029
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
1231
 
    "add           "UV_OFFx2"("#c"), "#index"  \n\t" \
 
1030
    "add   "UV_OFF_PX"("#c"), "#index"  \n\t" \
1232
1031
    "movq     (%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
1233
1032
    "movq     (%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
1234
 
    "sub           "UV_OFFx2"("#c"), "#index"  \n\t" \
 
1033
    "sub   "UV_OFF_PX"("#c"), "#index"  \n\t" \
1235
1034
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
1236
1035
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
1237
1036
    "psrlw                $5, %%mm3     \n\t" /*FIXME might overflow*/\
1288
1087
/**
1289
1088
 * YV12 to RGB without scaling or interpolating
1290
1089
 */
1291
 
static void RENAME(yuv2rgb32_1)(SwsContext *c, const uint16_t *buf0,
1292
 
                                const uint16_t *ubuf0, const uint16_t *ubuf1,
1293
 
                                const uint16_t *vbuf0, const uint16_t *vbuf1,
1294
 
                                const uint16_t *abuf0, uint8_t *dest,
1295
 
                                int dstW, int uvalpha, enum PixelFormat dstFormat,
1296
 
                                int flags, int y)
1297
 
{
1298
 
    const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1299
 
 
1300
 
    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1301
 
        if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1302
 
            __asm__ volatile(
1303
 
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1304
 
                "mov        %4, %%"REG_b"               \n\t"
1305
 
                "push %%"REG_BP"                        \n\t"
1306
 
                YSCALEYUV2RGB1(%%REGBP, %5)
1307
 
                YSCALEYUV2RGB1_ALPHA(%%REGBP)
1308
 
                WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1309
 
                "pop %%"REG_BP"                         \n\t"
1310
 
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1311
 
                :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1312
 
                   "a" (&c->redDither)
1313
 
            );
1314
 
        } else {
1315
 
            __asm__ volatile(
1316
 
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1317
 
                "mov        %4, %%"REG_b"               \n\t"
1318
 
                "push %%"REG_BP"                        \n\t"
1319
 
                YSCALEYUV2RGB1(%%REGBP, %5)
1320
 
                "pcmpeqd %%mm7, %%mm7                   \n\t"
1321
 
                WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1322
 
                "pop %%"REG_BP"                         \n\t"
1323
 
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1324
 
                :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1325
 
                   "a" (&c->redDither)
1326
 
            );
1327
 
        }
1328
 
    } else {
1329
 
        if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1330
 
            __asm__ volatile(
1331
 
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1332
 
                "mov        %4, %%"REG_b"               \n\t"
1333
 
                "push %%"REG_BP"                        \n\t"
1334
 
                YSCALEYUV2RGB1b(%%REGBP, %5)
1335
 
                YSCALEYUV2RGB1_ALPHA(%%REGBP)
1336
 
                WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1337
 
                "pop %%"REG_BP"                         \n\t"
1338
 
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1339
 
                :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1340
 
                   "a" (&c->redDither)
1341
 
            );
1342
 
        } else {
1343
 
            __asm__ volatile(
1344
 
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1345
 
                "mov        %4, %%"REG_b"               \n\t"
1346
 
                "push %%"REG_BP"                        \n\t"
1347
 
                YSCALEYUV2RGB1b(%%REGBP, %5)
1348
 
                "pcmpeqd %%mm7, %%mm7                   \n\t"
1349
 
                WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1350
 
                "pop %%"REG_BP"                         \n\t"
1351
 
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1352
 
                :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1353
 
                   "a" (&c->redDither)
1354
 
            );
1355
 
        }
1356
 
    }
1357
 
}
1358
 
 
1359
 
static void RENAME(yuv2bgr24_1)(SwsContext *c, const uint16_t *buf0,
1360
 
                                const uint16_t *ubuf0, const uint16_t *ubuf1,
1361
 
                                const uint16_t *vbuf0, const uint16_t *vbuf1,
1362
 
                                const uint16_t *abuf0, uint8_t *dest,
1363
 
                                int dstW, int uvalpha, enum PixelFormat dstFormat,
1364
 
                                int flags, int y)
1365
 
{
1366
 
    const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1367
 
 
1368
 
    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1369
 
        __asm__ volatile(
1370
 
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1371
 
            "mov        %4, %%"REG_b"               \n\t"
1372
 
            "push %%"REG_BP"                        \n\t"
1373
 
            YSCALEYUV2RGB1(%%REGBP, %5)
1374
 
            "pxor    %%mm7, %%mm7                   \n\t"
1375
 
            WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1376
 
            "pop %%"REG_BP"                         \n\t"
1377
 
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1378
 
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1379
 
               "a" (&c->redDither)
1380
 
        );
1381
 
    } else {
1382
 
        __asm__ volatile(
1383
 
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1384
 
            "mov        %4, %%"REG_b"               \n\t"
1385
 
            "push %%"REG_BP"                        \n\t"
1386
 
            YSCALEYUV2RGB1b(%%REGBP, %5)
1387
 
            "pxor    %%mm7, %%mm7                   \n\t"
1388
 
            WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1389
 
            "pop %%"REG_BP"                         \n\t"
1390
 
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1391
 
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1392
 
               "a" (&c->redDither)
1393
 
        );
1394
 
    }
1395
 
}
1396
 
 
1397
 
/*
 * Old-signature variant: convert the samples in buf0 / ubuf0 / ubuf1 with
 * inline MMX asm and store them at dest through WRITERGB15.
 *
 * Asm operand wiring (identical in both branches):
 *   %0 "c" = buf0, %1 "d" = buf1 (a plain alias of buf0),
 *   %2 "S" = ubuf0, %3 "D" = ubuf1, %4 "m" = dest, %5 "a" = &c->redDither.
 * vbuf0, vbuf1, abuf0, dstW, dstFormat, flags and y are not referenced by
 * name anywhere in this function.
 *
 * uvalpha < 2048 selects YSCALEYUV2RGB1, otherwise YSCALEYUV2RGB1b; both
 * macros, as well as WRITERGB15, are defined outside this block.
 * NOTE(review): presumably the "1b" variant blends ubuf0 and ubuf1 while the
 * plain variant reads ubuf0 only -- confirm against the macro definitions.
 *
 * Each asm body saves REG_b at ESP_OFFSET(%5), reuses REG_b for dest, saves
 * REG_BP on the stack, and restores both before leaving.
 * NOTE(review): 8280(%5) is presumably the location of dstW relative to
 * redDither inside SwsContext -- confirm against the struct layout.
 */
static void RENAME(yuv2rgb555_1)(SwsContext *c, const uint16_t *buf0,
1398
 
                                 const uint16_t *ubuf0, const uint16_t *ubuf1,
1399
 
                                 const uint16_t *vbuf0, const uint16_t *vbuf1,
1400
 
                                 const uint16_t *abuf0, uint8_t *dest,
1401
 
                                 int dstW, int uvalpha, enum PixelFormat dstFormat,
1402
 
                                 int flags, int y)
1403
 
{
1404
 
    const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1405
 
 
1406
 
    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1407
 
        __asm__ volatile(
1408
 
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1409
 
            "mov        %4, %%"REG_b"               \n\t"
1410
 
            "push %%"REG_BP"                        \n\t"
1411
 
            YSCALEYUV2RGB1(%%REGBP, %5)
1412
 
            "pxor    %%mm7, %%mm7                   \n\t"
1413
 
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1414
 
#ifdef DITHER1XBPP
1415
 
            "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1416
 
            "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1417
 
            "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1418
 
#endif
1419
 
            WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1420
 
            "pop %%"REG_BP"                         \n\t"
1421
 
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1422
 
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1423
 
               "a" (&c->redDither)
1424
 
        );
1425
 
    } else {
1426
 
        __asm__ volatile(
1427
 
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1428
 
            "mov        %4, %%"REG_b"               \n\t"
1429
 
            "push %%"REG_BP"                        \n\t"
1430
 
            YSCALEYUV2RGB1b(%%REGBP, %5)
1431
 
            "pxor    %%mm7, %%mm7                   \n\t"
1432
 
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1433
 
#ifdef DITHER1XBPP
1434
 
            "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1435
 
            "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1436
 
            "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1437
 
#endif
1438
 
            WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1439
 
            "pop %%"REG_BP"                         \n\t"
1440
 
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1441
 
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1442
 
               "a" (&c->redDither)
1443
 
        );
1444
 
    }
1445
 
}
1446
 
 
1447
 
static void RENAME(yuv2rgb565_1)(SwsContext *c, const uint16_t *buf0,
1448
 
                                 const uint16_t *ubuf0, const uint16_t *ubuf1,
1449
 
                                 const uint16_t *vbuf0, const uint16_t *vbuf1,
1450
 
                                 const uint16_t *abuf0, uint8_t *dest,
1451
 
                                 int dstW, int uvalpha, enum PixelFormat dstFormat,
1452
 
                                 int flags, int y)
1453
 
{
1454
 
    const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
 
1090
/*
 * Convert the samples in buf0 / ubuf[0] / ubuf[1] (and optionally abuf0) with
 * inline MMX asm and store them at dest through WRITEBGR32.
 *
 * Two independent conditions select one of four asm bodies:
 *   - uvalpha < 2048 uses YSCALEYUV2RGB1, otherwise YSCALEYUV2RGB1b.
 *   - CONFIG_SWSCALE_ALPHA && c->alpPixBuf: abuf0 is passed as the "d"
 *     operand and YSCALEYUV2RGB1_ALPHA runs before the store. Otherwise
 *     buf1 (a plain alias of buf0) takes the "d" slot and pcmpeqd sets every
 *     bit of mm7 before the store.
 *     NOTE(review): mm7 is presumably the alpha lane consumed by WRITEBGR32,
 *     making the non-alpha output fully opaque -- confirm against the macro.
 *
 * Remaining operands: %0 "c" = buf0, %2 "S" = ubuf0, %3 "D" = ubuf1,
 * %4 "m" = dest, %5 "a" = &c->redDither.
 * bguf, dstW and y are not referenced by name anywhere in this function.
 *
 * Each asm body saves REG_b at ESP_OFFSET(%5), reuses REG_b for dest, saves
 * REG_BP on the stack, and restores both before leaving.
 * NOTE(review): 8280(%5) is presumably the location of dstW relative to
 * redDither inside SwsContext -- confirm against the struct layout.
 */
static void RENAME(yuv2rgb32_1)(SwsContext *c, const int16_t *buf0,
 
1091
                                const int16_t *ubuf[2], const int16_t *bguf[2],
 
1092
                                const int16_t *abuf0, uint8_t *dest,
 
1093
                                int dstW, int uvalpha, int y)
 
1094
{
 
1095
    const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
 
1096
    const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
 
1097
 
 
1098
    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
 
1099
        if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
 
1100
            __asm__ volatile(
 
1101
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
 
1102
                "mov        %4, %%"REG_b"               \n\t"
 
1103
                "push %%"REG_BP"                        \n\t"
 
1104
                YSCALEYUV2RGB1(%%REGBP, %5)
 
1105
                YSCALEYUV2RGB1_ALPHA(%%REGBP)
 
1106
                WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
 
1107
                "pop %%"REG_BP"                         \n\t"
 
1108
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
 
1109
                :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
 
1110
                   "a" (&c->redDither)
 
1111
            );
 
1112
        } else {
 
1113
            __asm__ volatile(
 
1114
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
 
1115
                "mov        %4, %%"REG_b"               \n\t"
 
1116
                "push %%"REG_BP"                        \n\t"
 
1117
                YSCALEYUV2RGB1(%%REGBP, %5)
 
1118
                "pcmpeqd %%mm7, %%mm7                   \n\t"
 
1119
                WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
 
1120
                "pop %%"REG_BP"                         \n\t"
 
1121
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
 
1122
                :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
 
1123
                   "a" (&c->redDither)
 
1124
            );
 
1125
        }
 
1126
    } else {
 
1127
        if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
 
1128
            __asm__ volatile(
 
1129
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
 
1130
                "mov        %4, %%"REG_b"               \n\t"
 
1131
                "push %%"REG_BP"                        \n\t"
 
1132
                YSCALEYUV2RGB1b(%%REGBP, %5)
 
1133
                YSCALEYUV2RGB1_ALPHA(%%REGBP)
 
1134
                WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
 
1135
                "pop %%"REG_BP"                         \n\t"
 
1136
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
 
1137
                :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
 
1138
                   "a" (&c->redDither)
 
1139
            );
 
1140
        } else {
 
1141
            __asm__ volatile(
 
1142
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
 
1143
                "mov        %4, %%"REG_b"               \n\t"
 
1144
                "push %%"REG_BP"                        \n\t"
 
1145
                YSCALEYUV2RGB1b(%%REGBP, %5)
 
1146
                "pcmpeqd %%mm7, %%mm7                   \n\t"
 
1147
                WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
 
1148
                "pop %%"REG_BP"                         \n\t"
 
1149
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
 
1150
                :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
 
1151
                   "a" (&c->redDither)
 
1152
            );
 
1153
        }
 
1154
    }
 
1155
}
 
1156
 
 
1157
/*
 * Convert the samples in buf0 / ubuf[0] / ubuf[1] with inline MMX asm and
 * store them at dest through WRITEBGR24.
 *
 * Asm operand wiring (identical in both branches):
 *   %0 "c" = buf0, %1 "d" = buf1 (a plain alias of buf0),
 *   %2 "S" = ubuf0, %3 "D" = ubuf1, %4 "m" = dest, %5 "a" = &c->redDither.
 * bguf, abuf0, dstW and y are not referenced by name in this function.
 *
 * uvalpha < 2048 selects YSCALEYUV2RGB1, otherwise YSCALEYUV2RGB1b; mm7 is
 * cleared with pxor before the store macro in both cases. The macros are
 * defined outside this block.
 *
 * Each asm body saves REG_b at ESP_OFFSET(%5), reuses REG_b for dest, saves
 * REG_BP on the stack, and restores both before leaving.
 * NOTE(review): 8280(%5) is presumably the location of dstW relative to
 * redDither inside SwsContext -- confirm against the struct layout.
 */
static void RENAME(yuv2bgr24_1)(SwsContext *c, const int16_t *buf0,
 
1158
                                const int16_t *ubuf[2], const int16_t *bguf[2],
 
1159
                                const int16_t *abuf0, uint8_t *dest,
 
1160
                                int dstW, int uvalpha, int y)
 
1161
{
 
1162
    const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
 
1163
    const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
 
1164
 
 
1165
    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
 
1166
        __asm__ volatile(
 
1167
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
 
1168
            "mov        %4, %%"REG_b"               \n\t"
 
1169
            "push %%"REG_BP"                        \n\t"
 
1170
            YSCALEYUV2RGB1(%%REGBP, %5)
 
1171
            "pxor    %%mm7, %%mm7                   \n\t"
 
1172
            WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
 
1173
            "pop %%"REG_BP"                         \n\t"
 
1174
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
 
1175
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
 
1176
               "a" (&c->redDither)
 
1177
        );
 
1178
    } else {
 
1179
        __asm__ volatile(
 
1180
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
 
1181
            "mov        %4, %%"REG_b"               \n\t"
 
1182
            "push %%"REG_BP"                        \n\t"
 
1183
            YSCALEYUV2RGB1b(%%REGBP, %5)
 
1184
            "pxor    %%mm7, %%mm7                   \n\t"
 
1185
            WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
 
1186
            "pop %%"REG_BP"                         \n\t"
 
1187
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
 
1188
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
 
1189
               "a" (&c->redDither)
 
1190
        );
 
1191
    }
 
1192
}
 
1193
 
 
1194
/*
 * Convert the samples in buf0 / ubuf[0] / ubuf[1] with inline MMX asm and
 * store them at dest through WRITERGB15.
 *
 * Asm operand wiring (identical in both branches):
 *   %0 "c" = buf0, %1 "d" = buf1 (a plain alias of buf0),
 *   %2 "S" = ubuf0, %3 "D" = ubuf1, %4 "m" = dest, %5 "a" = &c->redDither.
 * bguf, abuf0, dstW and y are not referenced by name in this function.
 *
 * uvalpha < 2048 selects YSCALEYUV2RGB1, otherwise YSCALEYUV2RGB1b. When
 * DITHER1XBPP is defined, the values at BLUE_DITHER / GREEN_DITHER /
 * RED_DITHER (relative to %5) are added with unsigned saturation to
 * mm2 / mm4 / mm5 before the store macro runs.
 *
 * Each asm body saves REG_b at ESP_OFFSET(%5), reuses REG_b for dest, saves
 * REG_BP on the stack, and restores both before leaving.
 * NOTE(review): 8280(%5) is presumably the location of dstW relative to
 * redDither inside SwsContext -- confirm against the struct layout.
 */
static void RENAME(yuv2rgb555_1)(SwsContext *c, const int16_t *buf0,
 
1195
                                 const int16_t *ubuf[2], const int16_t *bguf[2],
 
1196
                                 const int16_t *abuf0, uint8_t *dest,
 
1197
                                 int dstW, int uvalpha, int y)
 
1198
{
 
1199
    const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
 
1200
    const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
 
1201
 
 
1202
    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
 
1203
        __asm__ volatile(
 
1204
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
 
1205
            "mov        %4, %%"REG_b"               \n\t"
 
1206
            "push %%"REG_BP"                        \n\t"
 
1207
            YSCALEYUV2RGB1(%%REGBP, %5)
 
1208
            "pxor    %%mm7, %%mm7                   \n\t"
 
1209
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
 
1210
#ifdef DITHER1XBPP
 
1211
            "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
 
1212
            "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
 
1213
            "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
 
1214
#endif
 
1215
            WRITERGB15(%%REGb, 8280(%5), %%REGBP)
 
1216
            "pop %%"REG_BP"                         \n\t"
 
1217
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
 
1218
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
 
1219
               "a" (&c->redDither)
 
1220
        );
 
1221
    } else {
 
1222
        __asm__ volatile(
 
1223
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
 
1224
            "mov        %4, %%"REG_b"               \n\t"
 
1225
            "push %%"REG_BP"                        \n\t"
 
1226
            YSCALEYUV2RGB1b(%%REGBP, %5)
 
1227
            "pxor    %%mm7, %%mm7                   \n\t"
 
1228
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
 
1229
#ifdef DITHER1XBPP
 
1230
            "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
 
1231
            "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
 
1232
            "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
 
1233
#endif
 
1234
            WRITERGB15(%%REGb, 8280(%5), %%REGBP)
 
1235
            "pop %%"REG_BP"                         \n\t"
 
1236
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
 
1237
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
 
1238
               "a" (&c->redDither)
 
1239
        );
 
1240
    }
 
1241
}
 
1242
 
 
1243
static void RENAME(yuv2rgb565_1)(SwsContext *c, const int16_t *buf0,
 
1244
                                 const int16_t *ubuf[2], const int16_t *bguf[2],
 
1245
                                 const int16_t *abuf0, uint8_t *dest,
 
1246
                                 int dstW, int uvalpha, int y)
 
1247
{
 
1248
    const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
 
1249
    const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1455
1250
 
1456
1251
    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1457
1252
        __asm__ volatile(
1499
1294
    ".p2align              4            \n\t"\
1500
1295
    "1:                                 \n\t"\
1501
1296
    "movq     (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
1502
 
    "add           "UV_OFFx2"("#c"), "#index"  \n\t" \
 
1297
    "add   "UV_OFF_PX"("#c"), "#index"  \n\t" \
1503
1298
    "movq     (%2, "#index"), %%mm4     \n\t" /* uvbuf0[eax+2048]*/\
1504
 
    "sub           "UV_OFFx2"("#c"), "#index"  \n\t" \
 
1299
    "sub   "UV_OFF_PX"("#c"), "#index"  \n\t" \
1505
1300
    "psraw                $7, %%mm3     \n\t" \
1506
1301
    "psraw                $7, %%mm4     \n\t" \
1507
1302
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
1517
1312
    "1:                                 \n\t"\
1518
1313
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
1519
1314
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
1520
 
    "add           "UV_OFFx2"("#c"), "#index"  \n\t" \
 
1315
    "add   "UV_OFF_PX"("#c"), "#index"  \n\t" \
1521
1316
    "movq     (%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
1522
1317
    "movq     (%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
1523
 
    "sub           "UV_OFFx2"("#c"), "#index"  \n\t" \
 
1318
    "sub   "UV_OFF_PX"("#c"), "#index"  \n\t" \
1524
1319
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
1525
1320
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
1526
1321
    "psrlw                $8, %%mm3     \n\t" \
1531
1326
    "psraw                $7, %%mm7     \n\t"
1532
1327
#define YSCALEYUV2PACKED1b(index, c)  REAL_YSCALEYUV2PACKED1b(index, c)
1533
1328
 
1534
 
static void RENAME(yuv2yuyv422_1)(SwsContext *c, const uint16_t *buf0,
1535
 
                                  const uint16_t *ubuf0, const uint16_t *ubuf1,
1536
 
                                  const uint16_t *vbuf0, const uint16_t *vbuf1,
1537
 
                                  const uint16_t *abuf0, uint8_t *dest,
1538
 
                                  int dstW, int uvalpha, enum PixelFormat dstFormat,
1539
 
                                  int flags, int y)
 
1329
static void RENAME(yuv2yuyv422_1)(SwsContext *c, const int16_t *buf0,
 
1330
                                  const int16_t *ubuf[2], const int16_t *bguf[2],
 
1331
                                  const int16_t *abuf0, uint8_t *dest,
 
1332
                                  int dstW, int uvalpha, int y)
1540
1333
{
1541
 
    const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
 
1334
    const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
 
1335
    const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1542
1336
 
1543
1337
    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1544
1338
        __asm__ volatile(
1567
1361
    }
1568
1362
}
1569
1363
 
1570
 
#if !COMPILE_TEMPLATE_MMX2
1571
 
//FIXME yuy2* can read up to 7 samples too many
1572
 
 
1573
 
/*
 * Extract one byte out of every two from src into dst using MMX.
 *
 * Each loop iteration loads 16 bytes from src, ANDs every 16-bit lane with
 * bm01010101, packs the lanes down to bytes and stores 8 bytes to dst.
 * NOTE(review): bm01010101 is defined elsewhere; presumably it is 0x00FF in
 * every 16-bit lane, which would keep the bytes at even offsets (the Y
 * samples of YUYV) -- confirm.
 *
 * Both pointers are pre-advanced to the end of their data and indexed by a
 * counter that starts at -width and grows by 8 until it is no longer
 * negative. The loop body therefore always runs at least once and always
 * handles 8 output bytes per pass, so a width that is not a multiple of 8
 * touches memory past width.
 *
 * The `unused` parameter is not referenced.
 */
static void RENAME(yuy2ToY)(uint8_t *dst, const uint8_t *src,
1574
 
                            int width, uint32_t *unused)
1575
 
{
1576
 
    __asm__ volatile(
1577
 
        "movq "MANGLE(bm01010101)", %%mm2           \n\t"
1578
 
        "mov                    %0, %%"REG_a"       \n\t"
1579
 
        "1:                                         \n\t"
1580
 
        "movq    (%1, %%"REG_a",2), %%mm0           \n\t"
1581
 
        "movq   8(%1, %%"REG_a",2), %%mm1           \n\t"
1582
 
        "pand                %%mm2, %%mm0           \n\t"
1583
 
        "pand                %%mm2, %%mm1           \n\t"
1584
 
        "packuswb            %%mm1, %%mm0           \n\t"
1585
 
        "movq                %%mm0, (%2, %%"REG_a") \n\t"
1586
 
        "add                    $8, %%"REG_a"       \n\t"
1587
 
        " js                    1b                  \n\t"
1588
 
        : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
1589
 
        : "%"REG_a
1590
 
    );
1591
 
}
1592
 
 
1593
 
/*
 * Split the bytes at odd offsets of src1 into two planes using MMX.
 *
 * Per iteration: 16 bytes are loaded from src1, each 16-bit lane is shifted
 * right by 8 (keeping the byte at the odd offset) and the result is packed
 * to 8 bytes. That packed value is then split again: a second right shift
 * by 8 yields the bytes written to dstV, and an AND with bm01010101 yields
 * the bytes written to dstU. Four bytes are stored to each plane per pass.
 * NOTE(review): bm01010101 is defined elsewhere; presumably 0x00FF in every
 * 16-bit lane, so dstU receives source offsets 1, 5, 9, ... and dstV
 * receives offsets 3, 7, 11, ... -- confirm.
 *
 * The counter starts at -width and advances by 4 while negative; the
 * pointers are pre-advanced to the end of their data. The body always runs
 * at least once.
 *
 * src2 is only used by the trailing assert, which requires src1 == src2;
 * `unused` is not referenced.
 */
static void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV,
1594
 
                             const uint8_t *src1, const uint8_t *src2,
1595
 
                             int width, uint32_t *unused)
1596
 
{
1597
 
    __asm__ volatile(
1598
 
        "movq "MANGLE(bm01010101)", %%mm4           \n\t"
1599
 
        "mov                    %0, %%"REG_a"       \n\t"
1600
 
        "1:                                         \n\t"
1601
 
        "movq    (%1, %%"REG_a",4), %%mm0           \n\t"
1602
 
        "movq   8(%1, %%"REG_a",4), %%mm1           \n\t"
1603
 
        "psrlw                  $8, %%mm0           \n\t"
1604
 
        "psrlw                  $8, %%mm1           \n\t"
1605
 
        "packuswb            %%mm1, %%mm0           \n\t"
1606
 
        "movq                %%mm0, %%mm1           \n\t"
1607
 
        "psrlw                  $8, %%mm0           \n\t"
1608
 
        "pand                %%mm4, %%mm1           \n\t"
1609
 
        "packuswb            %%mm0, %%mm0           \n\t"
1610
 
        "packuswb            %%mm1, %%mm1           \n\t"
1611
 
        "movd                %%mm0, (%3, %%"REG_a") \n\t"
1612
 
        "movd                %%mm1, (%2, %%"REG_a") \n\t"
1613
 
        "add                    $4, %%"REG_a"       \n\t"
1614
 
        " js                    1b                  \n\t"
1615
 
        : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1616
 
        : "%"REG_a
1617
 
    );
1618
 
    assert(src1 == src2);
1619
 
}
1620
 
 
1621
 
/*
 * Reduce two streams of 16-bit words to bytes by keeping the upper 8 bits of
 * each word: src1 feeds dstU and src2 feeds dstV.
 *
 * Per iteration 16 bytes are loaded from each source, every 16-bit lane is
 * shifted right by 8, the lanes are packed to bytes, and 8 bytes are stored
 * to each destination. On x86 the upper 8 bits of a word are the byte at the
 * odd address.
 *
 * The counter starts at -width and advances by 8 while negative; all four
 * pointers are pre-advanced to the end of their data. The body always runs
 * at least once and always handles 8 output bytes per pass.
 *
 * `unused` is not referenced.
 */
static void RENAME(LEToUV)(uint8_t *dstU, uint8_t *dstV,
1622
 
                           const uint8_t *src1, const uint8_t *src2,
1623
 
                           int width, uint32_t *unused)
1624
 
{
1625
 
    __asm__ volatile(
1626
 
        "mov                    %0, %%"REG_a"       \n\t"
1627
 
        "1:                                         \n\t"
1628
 
        "movq    (%1, %%"REG_a",2), %%mm0           \n\t"
1629
 
        "movq   8(%1, %%"REG_a",2), %%mm1           \n\t"
1630
 
        "movq    (%2, %%"REG_a",2), %%mm2           \n\t"
1631
 
        "movq   8(%2, %%"REG_a",2), %%mm3           \n\t"
1632
 
        "psrlw                  $8, %%mm0           \n\t"
1633
 
        "psrlw                  $8, %%mm1           \n\t"
1634
 
        "psrlw                  $8, %%mm2           \n\t"
1635
 
        "psrlw                  $8, %%mm3           \n\t"
1636
 
        "packuswb            %%mm1, %%mm0           \n\t"
1637
 
        "packuswb            %%mm3, %%mm2           \n\t"
1638
 
        "movq                %%mm0, (%3, %%"REG_a") \n\t"
1639
 
        "movq                %%mm2, (%4, %%"REG_a") \n\t"
1640
 
        "add                    $8, %%"REG_a"       \n\t"
1641
 
        " js                    1b                  \n\t"
1642
 
        : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
1643
 
        : "%"REG_a
1644
 
    );
1645
 
}
1646
 
 
1647
 
/* This is almost identical to the previous, and exists only because
1648
 
 * yuy2ToY/UV)(dst, src+1, ...) would have 100% unaligned accesses. */
1649
 
/*
 * Extract the byte at every odd offset of src into dst using MMX.
 *
 * Per iteration 16 bytes are loaded from src, every 16-bit lane is shifted
 * right by 8 (keeping the byte at the odd address), the lanes are packed to
 * bytes and 8 bytes are stored to dst.
 *
 * The counter starts at -width and advances by 8 while negative; both
 * pointers are pre-advanced to the end of their data. The body always runs
 * at least once and always handles 8 output bytes per pass, so a width that
 * is not a multiple of 8 touches memory past width.
 *
 * `unused` is not referenced.
 */
static void RENAME(uyvyToY)(uint8_t *dst, const uint8_t *src,
1650
 
                            int width, uint32_t *unused)
1651
 
{
1652
 
    __asm__ volatile(
1653
 
        "mov                  %0, %%"REG_a"         \n\t"
1654
 
        "1:                                         \n\t"
1655
 
        "movq  (%1, %%"REG_a",2), %%mm0             \n\t"
1656
 
        "movq 8(%1, %%"REG_a",2), %%mm1             \n\t"
1657
 
        "psrlw                $8, %%mm0             \n\t"
1658
 
        "psrlw                $8, %%mm1             \n\t"
1659
 
        "packuswb          %%mm1, %%mm0             \n\t"
1660
 
        "movq              %%mm0, (%2, %%"REG_a")   \n\t"
1661
 
        "add                  $8, %%"REG_a"         \n\t"
1662
 
        " js                  1b                    \n\t"
1663
 
        : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
1664
 
        : "%"REG_a
1665
 
    );
1666
 
}
1667
 
 
1668
 
/*
 * Split the bytes selected by bm01010101 from src1 into two planes using MMX.
 *
 * Per iteration: 16 bytes are loaded from src1, each 16-bit lane is ANDed
 * with bm01010101 and the result is packed to 8 bytes. That packed value is
 * then split: a right shift by 8 yields the bytes written to dstV, and a
 * second AND with bm01010101 yields the bytes written to dstU. Four bytes
 * are stored to each plane per pass.
 * NOTE(review): bm01010101 is defined elsewhere; presumably 0x00FF in every
 * 16-bit lane, so dstU receives source offsets 0, 4, 8, ... and dstV
 * receives offsets 2, 6, 10, ... -- confirm.
 *
 * The counter starts at -width and advances by 4 while negative; the
 * pointers are pre-advanced to the end of their data. The body always runs
 * at least once.
 *
 * src2 is only used by the trailing assert, which requires src1 == src2;
 * `unused` is not referenced.
 */
static void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV,
1669
 
                             const uint8_t *src1, const uint8_t *src2,
1670
 
                             int width, uint32_t *unused)
1671
 
{
1672
 
    __asm__ volatile(
1673
 
        "movq "MANGLE(bm01010101)", %%mm4           \n\t"
1674
 
        "mov                    %0, %%"REG_a"       \n\t"
1675
 
        "1:                                         \n\t"
1676
 
        "movq    (%1, %%"REG_a",4), %%mm0           \n\t"
1677
 
        "movq   8(%1, %%"REG_a",4), %%mm1           \n\t"
1678
 
        "pand                %%mm4, %%mm0           \n\t"
1679
 
        "pand                %%mm4, %%mm1           \n\t"
1680
 
        "packuswb            %%mm1, %%mm0           \n\t"
1681
 
        "movq                %%mm0, %%mm1           \n\t"
1682
 
        "psrlw                  $8, %%mm0           \n\t"
1683
 
        "pand                %%mm4, %%mm1           \n\t"
1684
 
        "packuswb            %%mm0, %%mm0           \n\t"
1685
 
        "packuswb            %%mm1, %%mm1           \n\t"
1686
 
        "movd                %%mm0, (%3, %%"REG_a") \n\t"
1687
 
        "movd                %%mm1, (%2, %%"REG_a") \n\t"
1688
 
        "add                    $4, %%"REG_a"       \n\t"
1689
 
        " js                    1b                  \n\t"
1690
 
        : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1691
 
        : "%"REG_a
1692
 
    );
1693
 
    assert(src1 == src2);
1694
 
}
1695
 
 
1696
 
/*
 * Reduce two streams of 16-bit words to bytes by masking each word with
 * bm01010101: src1 feeds dstU and src2 feeds dstV.
 *
 * Per iteration 16 bytes are loaded from each source, every 16-bit lane is
 * ANDed with the mask, the lanes are packed to bytes, and 8 bytes are stored
 * to each destination.
 * NOTE(review): bm01010101 is defined elsewhere; presumably 0x00FF in every
 * 16-bit lane, which keeps the byte at the even address (the most
 * significant byte of a big-endian sample) -- confirm.
 *
 * The counter starts at -width and advances by 8 while negative; all four
 * pointers are pre-advanced to the end of their data. The body always runs
 * at least once and always handles 8 output bytes per pass.
 *
 * `unused` is not referenced.
 */
static void RENAME(BEToUV)(uint8_t *dstU, uint8_t *dstV,
1697
 
                           const uint8_t *src1, const uint8_t *src2,
1698
 
                           int width, uint32_t *unused)
1699
 
{
1700
 
    __asm__ volatile(
1701
 
        "movq "MANGLE(bm01010101)", %%mm4           \n\t"
1702
 
        "mov                    %0, %%"REG_a"       \n\t"
1703
 
        "1:                                         \n\t"
1704
 
        "movq    (%1, %%"REG_a",2), %%mm0           \n\t"
1705
 
        "movq   8(%1, %%"REG_a",2), %%mm1           \n\t"
1706
 
        "movq    (%2, %%"REG_a",2), %%mm2           \n\t"
1707
 
        "movq   8(%2, %%"REG_a",2), %%mm3           \n\t"
1708
 
        "pand                %%mm4, %%mm0           \n\t"
1709
 
        "pand                %%mm4, %%mm1           \n\t"
1710
 
        "pand                %%mm4, %%mm2           \n\t"
1711
 
        "pand                %%mm4, %%mm3           \n\t"
1712
 
        "packuswb            %%mm1, %%mm0           \n\t"
1713
 
        "packuswb            %%mm3, %%mm2           \n\t"
1714
 
        "movq                %%mm0, (%3, %%"REG_a") \n\t"
1715
 
        "movq                %%mm2, (%4, %%"REG_a") \n\t"
1716
 
        "add                    $8, %%"REG_a"       \n\t"
1717
 
        " js                    1b                  \n\t"
1718
 
        : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
1719
 
        : "%"REG_a
1720
 
    );
1721
 
}
1722
 
 
1723
 
/*
 * De-interleave a byte stream into two planes using MMX.
 *
 * Per iteration 16 bytes are loaded from src and copied. One copy is ANDed
 * with bm01010101 and packed into 8 bytes stored to dst1; the other copy has
 * each 16-bit lane shifted right by 8 (keeping the bytes at odd offsets) and
 * is packed into 8 bytes stored to dst2.
 * NOTE(review): bm01010101 is defined elsewhere; presumably 0x00FF in every
 * 16-bit lane, so dst1 receives the bytes at even offsets -- confirm.
 *
 * The counter starts at -width and advances by 8 while negative; the
 * pointers are pre-advanced to the end of their data. The body always runs
 * at least once and always handles 8 output bytes per plane per pass.
 */
static av_always_inline void RENAME(nvXXtoUV)(uint8_t *dst1, uint8_t *dst2,
1724
 
                                              const uint8_t *src, int width)
1725
 
{
1726
 
    __asm__ volatile(
1727
 
        "movq "MANGLE(bm01010101)", %%mm4           \n\t"
1728
 
        "mov                    %0, %%"REG_a"       \n\t"
1729
 
        "1:                                         \n\t"
1730
 
        "movq    (%1, %%"REG_a",2), %%mm0           \n\t"
1731
 
        "movq   8(%1, %%"REG_a",2), %%mm1           \n\t"
1732
 
        "movq                %%mm0, %%mm2           \n\t"
1733
 
        "movq                %%mm1, %%mm3           \n\t"
1734
 
        "pand                %%mm4, %%mm0           \n\t"
1735
 
        "pand                %%mm4, %%mm1           \n\t"
1736
 
        "psrlw                  $8, %%mm2           \n\t"
1737
 
        "psrlw                  $8, %%mm3           \n\t"
1738
 
        "packuswb            %%mm1, %%mm0           \n\t"
1739
 
        "packuswb            %%mm3, %%mm2           \n\t"
1740
 
        "movq                %%mm0, (%2, %%"REG_a") \n\t"
1741
 
        "movq                %%mm2, (%3, %%"REG_a") \n\t"
1742
 
        "add                    $8, %%"REG_a"       \n\t"
1743
 
        " js                    1b                  \n\t"
1744
 
        : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst1+width), "r" (dst2+width)
1745
 
        : "%"REG_a
1746
 
    );
1747
 
}
1748
 
 
1749
 
/*
 * De-interleave src1 through nvXXtoUV with dstU as the first output plane
 * and dstV as the second. src2 and `unused` are not referenced.
 */
static void RENAME(nv12ToUV)(uint8_t *dstU, uint8_t *dstV,
1750
 
                             const uint8_t *src1, const uint8_t *src2,
1751
 
                             int width, uint32_t *unused)
1752
 
{
1753
 
    RENAME(nvXXtoUV)(dstU, dstV, src1, width);
1754
 
}
1755
 
 
1756
 
/*
 * De-interleave src1 through nvXXtoUV with the planes swapped relative to
 * nv12ToUV: dstV is the first output plane and dstU the second.
 * src2 and `unused` are not referenced.
 */
static void RENAME(nv21ToUV)(uint8_t *dstU, uint8_t *dstV,
1757
 
                             const uint8_t *src1, const uint8_t *src2,
1758
 
                             int width, uint32_t *unused)
1759
 
{
1760
 
    RENAME(nvXXtoUV)(dstV, dstU, src1, width);
1761
 
}
1762
 
#endif /* !COMPILE_TEMPLATE_MMX2 */
1763
 
 
1764
1364
static av_always_inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, const uint8_t *src,
1765
1365
                                                  int width, enum PixelFormat srcFormat)
1766
1366
{
1903
1503
    RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
1904
1504
}
1905
1505
 
1906
 
#if !COMPILE_TEMPLATE_MMX2
1907
 
// bilinear / bicubic scaling
1908
 
/*
 * Horizontal FIR scaler: for each output sample, multiply filterSize source
 * bytes starting at src[filterPos[i]] by the matching 16-bit coefficients in
 * filter, sum the products, shift the sum right by 7 (arithmetic) and store
 * it to dst with signed saturation to 16 bits.
 *
 * filterSize must be a positive multiple of 4 (asserted). Three code paths:
 *   - filterSize == 4: fully unrolled, one pmaddwd per output sample.
 *   - filterSize == 8: fully unrolled, two pmaddwd per output sample.
 *   - otherwise: an inner loop (label 2) accumulates 4 taps per pass.
 *
 * Every path produces two output samples per outer iteration. The loop
 * counter starts at -2*dstW (a negative byte offset into dst and filterPos,
 * whose pointers are pre-advanced by dstW elements) and is advanced by 4
 * until the addition carries.
 *
 * filterPos entries are read with movzwl, i.e. as unsigned 16-bit values.
 * srcW and xInc are not referenced in this function.
 *
 * In the two unrolled paths REG_b is used as a scratch register: under PIC
 * it is saved/restored with push/pop inside the asm, otherwise it is listed
 * as a clobber.
 */
static void RENAME(hScale)(int16_t *dst, int dstW,
1909
 
                           const uint8_t *src, int srcW,
1910
 
                           int xInc, const int16_t *filter,
1911
 
                           const int16_t *filterPos, int filterSize)
1912
 
{
1913
 
    assert(filterSize % 4 == 0 && filterSize>0);
1914
 
    if (filterSize==4) { // Always true for upscaling, sometimes for down, too.
1915
 
        x86_reg counter= -2*dstW;
1916
 
        /* counter is negative, so these subtractions move the pointers forward:
         * filter by 4*dstW coefficients, filterPos and dst by dstW elements. */
        filter-= counter*2;
1917
 
        filterPos-= counter/2;
1918
 
        dst-= counter/2;
1919
 
        __asm__ volatile(
1920
 
#if defined(PIC)
1921
 
            "push            %%"REG_b"              \n\t"
1922
 
#endif
1923
 
            "pxor                %%mm7, %%mm7       \n\t"
1924
 
            "push           %%"REG_BP"              \n\t" // we use 7 regs here ...
1925
 
            "mov             %%"REG_a", %%"REG_BP"  \n\t"
1926
 
            ".p2align                4              \n\t"
1927
 
            "1:                                     \n\t"
1928
 
            "movzwl   (%2, %%"REG_BP"), %%eax       \n\t"
1929
 
            "movzwl  2(%2, %%"REG_BP"), %%ebx       \n\t"
1930
 
            "movq  (%1, %%"REG_BP", 4), %%mm1       \n\t"
1931
 
            "movq 8(%1, %%"REG_BP", 4), %%mm3       \n\t"
1932
 
            "movd      (%3, %%"REG_a"), %%mm0       \n\t"
1933
 
            "movd      (%3, %%"REG_b"), %%mm2       \n\t"
1934
 
            "punpcklbw           %%mm7, %%mm0       \n\t"
1935
 
            "punpcklbw           %%mm7, %%mm2       \n\t"
1936
 
            "pmaddwd             %%mm1, %%mm0       \n\t"
1937
 
            "pmaddwd             %%mm2, %%mm3       \n\t"
1938
 
            "movq                %%mm0, %%mm4       \n\t"
1939
 
            "punpckldq           %%mm3, %%mm0       \n\t"
1940
 
            "punpckhdq           %%mm3, %%mm4       \n\t"
1941
 
            "paddd               %%mm4, %%mm0       \n\t"
1942
 
            "psrad                  $7, %%mm0       \n\t"
1943
 
            "packssdw            %%mm0, %%mm0       \n\t"
1944
 
            "movd                %%mm0, (%4, %%"REG_BP")    \n\t"
1945
 
            "add                    $4, %%"REG_BP"  \n\t"
1946
 
            " jnc                   1b              \n\t"
1947
 
 
1948
 
            "pop            %%"REG_BP"              \n\t"
1949
 
#if defined(PIC)
1950
 
            "pop             %%"REG_b"              \n\t"
1951
 
#endif
1952
 
            : "+a" (counter)
1953
 
            : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
1954
 
#if !defined(PIC)
1955
 
            : "%"REG_b
1956
 
#endif
1957
 
        );
1958
 
    } else if (filterSize==8) {
1959
 
        x86_reg counter= -2*dstW;
1960
 
        /* Same pointer biasing as the 4-tap path, but filter advances by
         * 8*dstW coefficients because each output sample has 8 taps. */
        filter-= counter*4;
1961
 
        filterPos-= counter/2;
1962
 
        dst-= counter/2;
1963
 
        __asm__ volatile(
1964
 
#if defined(PIC)
1965
 
            "push             %%"REG_b"             \n\t"
1966
 
#endif
1967
 
            "pxor                 %%mm7, %%mm7      \n\t"
1968
 
            "push            %%"REG_BP"             \n\t" // we use 7 regs here ...
1969
 
            "mov              %%"REG_a", %%"REG_BP" \n\t"
1970
 
            ".p2align                 4             \n\t"
1971
 
            "1:                                     \n\t"
1972
 
            "movzwl    (%2, %%"REG_BP"), %%eax      \n\t"
1973
 
            "movzwl   2(%2, %%"REG_BP"), %%ebx      \n\t"
1974
 
            "movq   (%1, %%"REG_BP", 8), %%mm1      \n\t"
1975
 
            "movq 16(%1, %%"REG_BP", 8), %%mm3      \n\t"
1976
 
            "movd       (%3, %%"REG_a"), %%mm0      \n\t"
1977
 
            "movd       (%3, %%"REG_b"), %%mm2      \n\t"
1978
 
            "punpcklbw            %%mm7, %%mm0      \n\t"
1979
 
            "punpcklbw            %%mm7, %%mm2      \n\t"
1980
 
            "pmaddwd              %%mm1, %%mm0      \n\t"
1981
 
            "pmaddwd              %%mm2, %%mm3      \n\t"
1982
 
 
1983
 
            "movq  8(%1, %%"REG_BP", 8), %%mm1      \n\t"
1984
 
            "movq 24(%1, %%"REG_BP", 8), %%mm5      \n\t"
1985
 
            "movd      4(%3, %%"REG_a"), %%mm4      \n\t"
1986
 
            "movd      4(%3, %%"REG_b"), %%mm2      \n\t"
1987
 
            "punpcklbw            %%mm7, %%mm4      \n\t"
1988
 
            "punpcklbw            %%mm7, %%mm2      \n\t"
1989
 
            "pmaddwd              %%mm1, %%mm4      \n\t"
1990
 
            "pmaddwd              %%mm2, %%mm5      \n\t"
1991
 
            "paddd                %%mm4, %%mm0      \n\t"
1992
 
            "paddd                %%mm5, %%mm3      \n\t"
1993
 
            "movq                 %%mm0, %%mm4      \n\t"
1994
 
            "punpckldq            %%mm3, %%mm0      \n\t"
1995
 
            "punpckhdq            %%mm3, %%mm4      \n\t"
1996
 
            "paddd                %%mm4, %%mm0      \n\t"
1997
 
            "psrad                   $7, %%mm0      \n\t"
1998
 
            "packssdw             %%mm0, %%mm0      \n\t"
1999
 
            "movd                 %%mm0, (%4, %%"REG_BP")   \n\t"
2000
 
            "add                     $4, %%"REG_BP" \n\t"
2001
 
            " jnc                    1b             \n\t"
2002
 
 
2003
 
            "pop             %%"REG_BP"             \n\t"
2004
 
#if defined(PIC)
2005
 
            "pop              %%"REG_b"             \n\t"
2006
 
#endif
2007
 
            : "+a" (counter)
2008
 
            : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2009
 
#if !defined(PIC)
2010
 
            : "%"REG_b
2011
 
#endif
2012
 
        );
2013
 
    } else {
2014
 
        /* Generic path: `offset` is the end bound for the inner tap loop, and
         * operand %6 (filterSize*2 bytes) is the size of one output sample's
         * coefficient row. filter itself is walked forward inside the asm. */
        const uint8_t *offset = src+filterSize;
2015
 
        x86_reg counter= -2*dstW;
2016
 
        //filter-= counter*filterSize/2;
2017
 
        filterPos-= counter/2;
2018
 
        dst-= counter/2;
2019
 
        __asm__ volatile(
2020
 
            "pxor                  %%mm7, %%mm7     \n\t"
2021
 
            ".p2align                  4            \n\t"
2022
 
            "1:                                     \n\t"
2023
 
            "mov                      %2, %%"REG_c" \n\t"
2024
 
            "movzwl      (%%"REG_c", %0), %%eax     \n\t"
2025
 
            "movzwl     2(%%"REG_c", %0), %%edx     \n\t"
2026
 
            "mov                      %5, %%"REG_c" \n\t"
2027
 
            "pxor                  %%mm4, %%mm4     \n\t"
2028
 
            "pxor                  %%mm5, %%mm5     \n\t"
2029
 
            "2:                                     \n\t"
2030
 
            "movq                   (%1), %%mm1     \n\t"
2031
 
            "movq               (%1, %6), %%mm3     \n\t"
2032
 
            "movd (%%"REG_c", %%"REG_a"), %%mm0     \n\t"
2033
 
            "movd (%%"REG_c", %%"REG_d"), %%mm2     \n\t"
2034
 
            "punpcklbw             %%mm7, %%mm0     \n\t"
2035
 
            "punpcklbw             %%mm7, %%mm2     \n\t"
2036
 
            "pmaddwd               %%mm1, %%mm0     \n\t"
2037
 
            "pmaddwd               %%mm2, %%mm3     \n\t"
2038
 
            "paddd                 %%mm3, %%mm5     \n\t"
2039
 
            "paddd                 %%mm0, %%mm4     \n\t"
2040
 
            "add                      $8, %1        \n\t"
2041
 
            "add                      $4, %%"REG_c" \n\t"
2042
 
            "cmp                      %4, %%"REG_c" \n\t"
2043
 
            " jb                      2b            \n\t"
2044
 
            "add                      %6, %1        \n\t"
2045
 
            "movq                  %%mm4, %%mm0     \n\t"
2046
 
            "punpckldq             %%mm5, %%mm4     \n\t"
2047
 
            "punpckhdq             %%mm5, %%mm0     \n\t"
2048
 
            "paddd                 %%mm0, %%mm4     \n\t"
2049
 
            "psrad                    $7, %%mm4     \n\t"
2050
 
            "packssdw              %%mm4, %%mm4     \n\t"
2051
 
            "mov                      %3, %%"REG_a" \n\t"
2052
 
            "movd                  %%mm4, (%%"REG_a", %0)   \n\t"
2053
 
            "add                      $4, %0        \n\t"
2054
 
            " jnc                     1b            \n\t"
2055
 
 
2056
 
            : "+r" (counter), "+r" (filter)
2057
 
            : "m" (filterPos), "m" (dst), "m"(offset),
2058
 
            "m" (src), "r" ((x86_reg)filterSize*2)
2059
 
            : "%"REG_a, "%"REG_c, "%"REG_d
2060
 
        );
2061
 
    }
2062
 
}
2063
 
#endif /* !COMPILE_TEMPLATE_MMX2 */
2064
 
 
2065
1506
#if COMPILE_TEMPLATE_MMX2
2066
1507
static void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
2067
1508
                                 int dstWidth, const uint8_t *src,
2072
1513
    void    *mmx2FilterCode= c->lumMmx2FilterCode;
2073
1514
    int i;
2074
1515
#if defined(PIC)
2075
 
    DECLARE_ALIGNED(8, uint64_t, ebxsave);
 
1516
    uint64_t ebxsave;
 
1517
#endif
 
1518
#if ARCH_X86_64
 
1519
    uint64_t retsave;
2076
1520
#endif
2077
1521
 
2078
1522
    __asm__ volatile(
2079
1523
#if defined(PIC)
2080
1524
        "mov               %%"REG_b", %5        \n\t"
 
1525
#if ARCH_X86_64
 
1526
        "mov               -8(%%rsp), %%"REG_a" \n\t"
 
1527
        "mov               %%"REG_a", %6        \n\t"
 
1528
#endif
 
1529
#else
 
1530
#if ARCH_X86_64
 
1531
        "mov               -8(%%rsp), %%"REG_a" \n\t"
 
1532
        "mov               %%"REG_a", %5        \n\t"
 
1533
#endif
2081
1534
#endif
2082
1535
        "pxor                  %%mm7, %%mm7     \n\t"
2083
1536
        "mov                      %0, %%"REG_c" \n\t"
2119
1572
 
2120
1573
#if defined(PIC)
2121
1574
        "mov                      %5, %%"REG_b" \n\t"
 
1575
#if ARCH_X86_64
 
1576
        "mov                      %6, %%"REG_a" \n\t"
 
1577
        "mov               %%"REG_a", -8(%%rsp) \n\t"
 
1578
#endif
 
1579
#else
 
1580
#if ARCH_X86_64
 
1581
        "mov                      %5, %%"REG_a" \n\t"
 
1582
        "mov               %%"REG_a", -8(%%rsp) \n\t"
 
1583
#endif
2122
1584
#endif
2123
1585
        :: "m" (src), "m" (dst), "m" (filter), "m" (filterPos),
2124
1586
           "m" (mmx2FilterCode)
2125
1587
#if defined(PIC)
2126
1588
          ,"m" (ebxsave)
2127
1589
#endif
 
1590
#if ARCH_X86_64
 
1591
          ,"m"(retsave)
 
1592
#endif
2128
1593
        : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
2129
1594
#if !defined(PIC)
2130
1595
         ,"%"REG_b
2146
1611
#if defined(PIC)
2147
1612
    DECLARE_ALIGNED(8, uint64_t, ebxsave);
2148
1613
#endif
 
1614
#if ARCH_X86_64
 
1615
    DECLARE_ALIGNED(8, uint64_t, retsave);
 
1616
#endif
2149
1617
 
2150
1618
    __asm__ volatile(
2151
1619
#if defined(PIC)
2152
1620
        "mov          %%"REG_b", %7         \n\t"
 
1621
#if ARCH_X86_64
 
1622
        "mov          -8(%%rsp), %%"REG_a"  \n\t"
 
1623
        "mov          %%"REG_a", %8         \n\t"
 
1624
#endif
 
1625
#else
 
1626
#if ARCH_X86_64
 
1627
        "mov          -8(%%rsp), %%"REG_a"  \n\t"
 
1628
        "mov          %%"REG_a", %7         \n\t"
 
1629
#endif
2153
1630
#endif
2154
1631
        "pxor             %%mm7, %%mm7      \n\t"
2155
1632
        "mov                 %0, %%"REG_c"  \n\t"
2179
1656
 
2180
1657
#if defined(PIC)
2181
1658
        "mov %7, %%"REG_b"    \n\t"
 
1659
#if ARCH_X86_64
 
1660
        "mov                 %8, %%"REG_a"  \n\t"
 
1661
        "mov          %%"REG_a", -8(%%rsp)  \n\t"
 
1662
#endif
 
1663
#else
 
1664
#if ARCH_X86_64
 
1665
        "mov                 %7, %%"REG_a"  \n\t"
 
1666
        "mov          %%"REG_a", -8(%%rsp)  \n\t"
 
1667
#endif
2182
1668
#endif
2183
1669
        :: "m" (src1), "m" (dst1), "m" (filter), "m" (filterPos),
2184
1670
           "m" (mmx2FilterCode), "m" (src2), "m"(dst2)
2185
1671
#if defined(PIC)
2186
1672
          ,"m" (ebxsave)
2187
1673
#endif
 
1674
#if ARCH_X86_64
 
1675
          ,"m"(retsave)
 
1676
#endif
2188
1677
        : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
2189
1678
#if !defined(PIC)
2190
1679
         ,"%"REG_b
2203
1692
    enum PixelFormat srcFormat = c->srcFormat,
2204
1693
                     dstFormat = c->dstFormat;
2205
1694
 
2206
 
    if (!is16BPS(dstFormat) && !is9_OR_10BPS(dstFormat)) {
 
1695
    if (!is16BPS(dstFormat) && !is9_OR_10BPS(dstFormat) &&
 
1696
        dstFormat != PIX_FMT_NV12 && dstFormat != PIX_FMT_NV21) {
2207
1697
        if (!(c->flags & SWS_BITEXACT)) {
2208
1698
            if (c->flags & SWS_ACCURATE_RND) {
2209
 
                c->yuv2yuv1 = RENAME(yuv2yuv1_ar    );
2210
 
                c->yuv2yuvX = RENAME(yuv2yuvX_ar    );
2211
1699
                if (!(c->flags & SWS_FULL_CHR_H_INT)) {
2212
1700
                    switch (c->dstFormat) {
2213
1701
                    case PIX_FMT_RGB32:   c->yuv2packedX = RENAME(yuv2rgb32_X_ar);   break;
2219
1707
                    }
2220
1708
                }
2221
1709
            } else {
2222
 
                c->yuv2yuv1 = RENAME(yuv2yuv1    );
2223
 
                c->yuv2yuvX = RENAME(yuv2yuvX    );
2224
1710
                if (!(c->flags & SWS_FULL_CHR_H_INT)) {
2225
1711
                    switch (c->dstFormat) {
2226
1712
                    case PIX_FMT_RGB32:   c->yuv2packedX = RENAME(yuv2rgb32_X);   break;
2261
1747
        }
2262
1748
    }
2263
1749
 
2264
 
#if !COMPILE_TEMPLATE_MMX2
2265
 
    c->hScale       = RENAME(hScale      );
2266
 
#endif /* !COMPILE_TEMPLATE_MMX2 */
2267
 
 
 
1750
    if (c->srcBpc == 8 && c->dstBpc <= 10) {
2268
1751
    // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2269
1752
#if COMPILE_TEMPLATE_MMX2
2270
1753
    if (c->flags & SWS_FAST_BILINEAR && c->canMMX2BeUsed)
2278
1761
#if COMPILE_TEMPLATE_MMX2
2279
1762
    }
2280
1763
#endif /* COMPILE_TEMPLATE_MMX2 */
 
1764
    }
2281
1765
 
2282
 
#if !COMPILE_TEMPLATE_MMX2
2283
 
    switch(srcFormat) {
2284
 
        case PIX_FMT_YUYV422  : c->chrToYV12 = RENAME(yuy2ToUV); break;
2285
 
        case PIX_FMT_UYVY422  : c->chrToYV12 = RENAME(uyvyToUV); break;
2286
 
        case PIX_FMT_NV12     : c->chrToYV12 = RENAME(nv12ToUV); break;
2287
 
        case PIX_FMT_NV21     : c->chrToYV12 = RENAME(nv21ToUV); break;
2288
 
        case PIX_FMT_YUV420P16BE:
2289
 
        case PIX_FMT_YUV422P16BE:
2290
 
        case PIX_FMT_YUV444P16BE: c->chrToYV12 = RENAME(BEToUV); break;
2291
 
        case PIX_FMT_YUV420P16LE:
2292
 
        case PIX_FMT_YUV422P16LE:
2293
 
        case PIX_FMT_YUV444P16LE: c->chrToYV12 = RENAME(LEToUV); break;
2294
 
        default: break;
2295
 
    }
2296
 
#endif /* !COMPILE_TEMPLATE_MMX2 */
2297
1766
    if (!c->chrSrcHSubSample) {
2298
1767
        switch(srcFormat) {
2299
1768
        case PIX_FMT_BGR24  : c->chrToYV12 = RENAME(bgr24ToUV); break;
2303
1772
    }
2304
1773
 
2305
1774
    switch (srcFormat) {
2306
 
#if !COMPILE_TEMPLATE_MMX2
2307
 
    case PIX_FMT_YUYV422  :
2308
 
    case PIX_FMT_YUV420P16BE:
2309
 
    case PIX_FMT_YUV422P16BE:
2310
 
    case PIX_FMT_YUV444P16BE:
2311
 
    case PIX_FMT_Y400A    :
2312
 
    case PIX_FMT_GRAY16BE : c->lumToYV12 = RENAME(yuy2ToY); break;
2313
 
    case PIX_FMT_UYVY422  :
2314
 
    case PIX_FMT_YUV420P16LE:
2315
 
    case PIX_FMT_YUV422P16LE:
2316
 
    case PIX_FMT_YUV444P16LE:
2317
 
    case PIX_FMT_GRAY16LE : c->lumToYV12 = RENAME(uyvyToY); break;
2318
 
#endif /* !COMPILE_TEMPLATE_MMX2 */
2319
1775
    case PIX_FMT_BGR24    : c->lumToYV12 = RENAME(bgr24ToY); break;
2320
1776
    case PIX_FMT_RGB24    : c->lumToYV12 = RENAME(rgb24ToY); break;
2321
1777
    default: break;
2322
1778
    }
2323
 
#if !COMPILE_TEMPLATE_MMX2
2324
 
    if (c->alpPixBuf) {
2325
 
        switch (srcFormat) {
2326
 
        case PIX_FMT_Y400A  : c->alpToYV12 = RENAME(yuy2ToY); break;
2327
 
        default: break;
2328
 
        }
2329
 
    }
2330
 
#endif /* !COMPILE_TEMPLATE_MMX2 */
2331
1779
}