3
3
* Copyright (c) 2000, 2001 Fabrice Bellard.
4
4
* Copyright (c) 2002-2004 Michael Niedermayer
6
* MMX optimization by Nick Kurshev <nickols_k@mail.ru>
7
* mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
8
* and improved by Zdenek Kabelac <kabi@users.sf.net>
6
10
* This file is part of FFmpeg.
8
12
* FFmpeg is free software; you can redistribute it and/or
18
22
* You should have received a copy of the GNU Lesser General Public
19
23
* License along with FFmpeg; if not, write to the Free Software
20
24
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22
* MMX optimization by Nick Kurshev <nickols_k@mail.ru>
23
* mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
24
* and improved by Zdenek Kabelac <kabi@users.sf.net>
27
/* This header intentionally has no multiple inclusion guards. It is meant to
28
* be included multiple times and generates different code depending on the
29
* value of certain #defines. */
27
31
/* XXX: we use explicit registers to avoid a gcc 2.95.2 register asm
28
32
clobber bug - now it will work with 2.95.2 and also with -fPIC
30
34
static void DEF(put_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
33
37
"lea (%3, %3), %%"REG_a" \n\t"
35
39
"movq (%1), %%mm0 \n\t"
96
100
"add $16, %2 \n\t"
97
101
"subl $4, %0 \n\t"
99
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used
103
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
100
104
:"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
102
106
:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
143
147
"add $32, %2 \n\t"
144
148
"subl $4, %0 \n\t"
146
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used
150
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
147
151
:"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
149
153
:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
159
163
static void DEF(put_no_rnd_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
162
166
"pcmpeqb %%mm6, %%mm6 \n\t"
163
167
"testl $1, %0 \n\t"
213
217
"add $32, %2 \n\t"
214
218
"subl $4, %0 \n\t"
216
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used
220
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
217
221
:"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
219
223
:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
268
272
"add $16, %2 \n\t"
269
273
"subl $4, %0 \n\t"
271
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used
275
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
272
276
:"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
274
278
:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
320
324
"add $32, %2 \n\t"
321
325
"subl $4, %0 \n\t"
323
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used
327
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
324
328
:"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
326
330
:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
336
340
static void DEF(put_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
339
343
"lea (%3, %3), %%"REG_a" \n\t"
341
345
"movq (%1), %%mm0 \n\t"
408
412
"add $32, %2 \n\t"
409
413
"subl $2, %0 \n\t"
411
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used
415
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
412
416
:"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
414
418
:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
462
466
"add $32, %2 \n\t"
463
467
"subl $2, %0 \n\t"
465
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used
469
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
466
470
:"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
468
472
:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
478
482
static void DEF(put_no_rnd_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
481
485
"pcmpeqb %%mm6, %%mm6 \n\t"
482
486
"testl $1, %0 \n\t"
535
539
"add $32, %2 \n\t"
536
540
"subl $2, %0 \n\t"
538
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used
542
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
539
543
:"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
541
545
:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
552
556
static void DEF(put_no_rnd_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
556
560
"lea (%3, %3), %%"REG_a" \n\t"
558
562
"movq (%1), %%mm0 \n\t"
589
593
static void DEF(put_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
592
596
"lea (%3, %3), %%"REG_a" \n\t"
593
597
"movq (%1), %%mm0 \n\t"
594
598
"sub %3, %2 \n\t"
620
624
static void DEF(put_no_rnd_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
624
628
"lea (%3, %3), %%"REG_a" \n\t"
625
629
"movq (%1), %%mm0 \n\t"
626
630
"sub %3, %2 \n\t"
653
657
static void DEF(avg_pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
656
660
"lea (%3, %3), %%"REG_a" \n\t"
658
662
"movq (%2), %%mm0 \n\t"
681
685
static void DEF(avg_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
684
688
"lea (%3, %3), %%"REG_a" \n\t"
686
690
"movq (%1), %%mm0 \n\t"
713
717
static void DEF(avg_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
716
720
"lea (%3, %3), %%"REG_a" \n\t"
717
721
"movq (%1), %%mm0 \n\t"
718
722
"sub %3, %2 \n\t"
748
752
:"%"REG_a, "memory");
751
// Note this is not correctly rounded, but this function is only used for b frames so it doesnt matter
755
/* Note this is not correctly rounded, but this function is only
756
* used for B-frames so it does not matter. */
752
757
static void DEF(avg_pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
756
761
"lea (%3, %3), %%"REG_a" \n\t"
757
762
"movq (%1), %%mm0 \n\t"
758
763
PAVGB" 1(%1), %%mm0 \n\t"
790
795
:"%"REG_a, "memory");
798
static void DEF(avg_pixels4)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
802
"movd (%1), %%mm0 \n\t"
803
"movd (%1, %2), %%mm1 \n\t"
804
"movd (%1, %2, 2), %%mm2 \n\t"
805
"movd (%1, %3), %%mm3 \n\t"
806
PAVGB" (%0), %%mm0 \n\t"
807
PAVGB" (%0, %2), %%mm1 \n\t"
808
PAVGB" (%0, %2, 2), %%mm2 \n\t"
809
PAVGB" (%0, %3), %%mm3 \n\t"
810
"movd %%mm0, (%1) \n\t"
811
"movd %%mm1, (%1, %2) \n\t"
812
"movd %%mm2, (%1, %2, 2) \n\t"
813
"movd %%mm3, (%1, %3) \n\t"
814
::"S"(pixels), "D"(block),
815
"r" ((long)line_size), "r"(3L*line_size)
817
block += 4*line_size;
818
pixels += 4*line_size;
793
823
//FIXME the following could be optimized too ...
794
824
static void DEF(put_no_rnd_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
795
825
DEF(put_no_rnd_pixels8_x2)(block , pixels , line_size, h);