2
* MMX optimized DSP utils
3
* Copyright (c) 2000, 2001 Fabrice Bellard.
5
* This library is free software; you can redistribute it and/or
6
* modify it under the terms of the GNU Lesser General Public
7
* License as published by the Free Software Foundation; either
8
* version 2 of the License, or (at your option) any later version.
10
* This library is distributed in the hope that it will be useful,
11
* but WITHOUT ANY WARRANTY; without even the implied warranty of
12
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
* Lesser General Public License for more details.
15
* You should have received a copy of the GNU Lesser General Public
16
* License along with this library; if not, write to the Free Software
17
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19
* MMX optimization by Nick Kurshev <nickols_k@mail.ru>
22
#include "../dsputil.h"
23
#include "../simple_idct.h"
25
int mm_flags; /* multimedia extension flags (CPU capability bits; presumably filled in by CPU-detection code elsewhere — not visible in this chunk) */
27
/* pixel operations */
28
/* Broadcast constants for the MMX kernels: one value replicated across
 * all byte or word lanes of a 64-bit MMX register.  8-byte aligned so
 * they can be loaded with a single movq. */
static const uint64_t mm_bone __attribute__ ((aligned(8))) = 0x0101010101010101ULL; /* 8 x byte 1 */
static const uint64_t mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL; /* 4 x word 1 */
static const uint64_t mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002ULL; /* 4 x word 2 */

static const uint64_t ff_pw_20 __attribute__ ((aligned(8))) = 0x0014001400140014ULL; /* 4 x word 20 */
static const uint64_t ff_pw_3  __attribute__ ((aligned(8))) = 0x0003000300030003ULL; /* 4 x word 3 */
static const uint64_t ff_pw_16 __attribute__ ((aligned(8))) = 0x0010001000100010ULL; /* 4 x word 16 */
static const uint64_t ff_pw_15 __attribute__ ((aligned(8))) = 0x000F000F000F000FULL; /* 4 x word 15 */
37
/* Align the next asm label/loop top to an 8-byte boundary. */
#define JUMPALIGN() __asm __volatile (".balign 8"::)
/* Zero an MMX register: regd ^= regd. */
#define MOVQ_ZERO(regd) __asm __volatile ("pxor %%" #regd ", %%" #regd ::)
40
#define MOVQ_WONE(regd) \
42
"pcmpeqd %%" #regd ", %%" #regd " \n\t" \
43
"psrlw $15, %%" #regd ::)
45
#define MOVQ_BFE(regd) \
47
"pcmpeqd %%" #regd ", %%" #regd " \n\t"\
48
"paddb %%" #regd ", %%" #regd " \n\t" ::)
51
#define MOVQ_BONE(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_bone))
52
#define MOVQ_WTWO(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wtwo))
54
// for shared library it's better to use this way for accessing constants
56
#define MOVQ_BONE(regd) \
58
"pcmpeqd %%" #regd ", %%" #regd " \n\t" \
59
"psrlw $15, %%" #regd " \n\t" \
60
"packuswb %%" #regd ", %%" #regd " \n\t" ::)
62
#define MOVQ_WTWO(regd) \
64
"pcmpeqd %%" #regd ", %%" #regd " \n\t" \
65
"psrlw $15, %%" #regd " \n\t" \
66
"psllw $1, %%" #regd " \n\t"::)
70
// using regr as temporary and for the output result
71
// first argument is unmodifed and second is trashed
72
// regfe is supposed to contain 0xfefefefefefefefe
73
/* Byte-wise average without rounding: regr = (rega + regb) >> 1,
 * computed as (a & b) + (((a ^ b) & 0xfe..fe) >> 1).
 * regr receives the result, regb is trashed, rega stays unmodified.
 * regfe must hold 0xfefefefefefefefe. */
#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
    "movq " #rega ", " #regr "  \n\t"\
    "pand " #regb ", " #regr "  \n\t"\
    "pxor " #rega ", " #regb "  \n\t"\
    "pand " #regfe "," #regb "  \n\t"\
    "psrlq $1, " #regb "        \n\t"\
    "paddb " #regb ", " #regr " \n\t"
81
/* Byte-wise average with rounding: regr = (rega + regb + 1) >> 1,
 * computed as (a | b) - (((a ^ b) & 0xfe..fe) >> 1).
 * regr receives the result, regb is trashed, rega stays unmodified.
 * regfe must hold 0xfefefefefefefefe. */
#define PAVGB_MMX(rega, regb, regr, regfe) \
    "movq " #rega ", " #regr "  \n\t"\
    "por  " #regb ", " #regr "  \n\t"\
    "pxor " #rega ", " #regb "  \n\t"\
    "pand " #regfe "," #regb "  \n\t"\
    "psrlq $1, " #regb "        \n\t"\
    "psubb " #regb ", " #regr " \n\t"
89
// mm6 is supposed to contain 0xfefefefefefefefe
90
/* Two byte-wise averages without rounding, computed in parallel:
 * regr = (rega + regb) >> 1 and regp = (regc + regd) >> 1, via the
 * (a & b) + (((a ^ b) & mask) >> 1) identity.
 * regb and regd are trashed; mm6 must hold 0xfefefefefefefefe. */
#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
    "movq " #rega ", " #regr "  \n\t"\
    "movq " #regc ", " #regp "  \n\t"\
    "pand " #regb ", " #regr "  \n\t"\
    "pand " #regd ", " #regp "  \n\t"\
    "pxor " #rega ", " #regb "  \n\t"\
    "pxor " #regc ", " #regd "  \n\t"\
    "pand %%mm6, " #regb "      \n\t"\
    "pand %%mm6, " #regd "      \n\t"\
    "psrlq $1, " #regb "        \n\t"\
    "psrlq $1, " #regd "        \n\t"\
    "paddb " #regb ", " #regr " \n\t"\
    "paddb " #regd ", " #regp " \n\t"
104
/* Two byte-wise averages with rounding, computed in parallel:
 * regr = (rega + regb + 1) >> 1 and regp = (regc + regd + 1) >> 1,
 * via the (a | b) - (((a ^ b) & mask) >> 1) identity.
 * regb and regd are trashed; mm6 must hold 0xfefefefefefefefe. */
#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
    "movq " #rega ", " #regr "  \n\t"\
    "movq " #regc ", " #regp "  \n\t"\
    "por  " #regb ", " #regr "  \n\t"\
    "por  " #regd ", " #regp "  \n\t"\
    "pxor " #rega ", " #regb "  \n\t"\
    "pxor " #regc ", " #regd "  \n\t"\
    "pand %%mm6, " #regb "      \n\t"\
    "pand %%mm6, " #regd "      \n\t"\
    "psrlq $1, " #regd "        \n\t"\
    "psrlq $1, " #regb "        \n\t"\
    "psubb " #regb ", " #regr " \n\t"\
    "psubb " #regd ", " #regp " \n\t"
118
/***********************************/
119
/* MMX no rounding */
120
#define DEF(x, y) x ## _no_rnd_ ## y ##_mmx
121
#define SET_RND MOVQ_WONE
122
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
123
#define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)
125
#include "dsputil_mmx_rnd.h"
131
/***********************************/
134
#define DEF(x, y) x ## _ ## y ##_mmx
135
#define SET_RND MOVQ_WTWO
136
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
137
#define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)
139
#include "dsputil_mmx_rnd.h"
146
/***********************************/
149
#define DEF(x) x ## _3dnow
150
/* for Athlons PAVGUSB is prefered */
151
#define PAVGB "pavgusb"
153
#include "dsputil_mmx_avg.h"
158
/***********************************/
161
#define DEF(x) x ## _mmx2
163
/* Introduced only in MMX2 set */
164
#define PAVGB "pavgb"
166
#include "dsputil_mmx_avg.h"
171
/***********************************/
174
static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
177
"movl $-128, %%eax \n\t"
178
"pxor %%mm7, %%mm7 \n\t"
181
"movq (%0), %%mm0 \n\t"
182
"movq (%0, %2), %%mm2 \n\t"
183
"movq %%mm0, %%mm1 \n\t"
184
"movq %%mm2, %%mm3 \n\t"
185
"punpcklbw %%mm7, %%mm0 \n\t"
186
"punpckhbw %%mm7, %%mm1 \n\t"
187
"punpcklbw %%mm7, %%mm2 \n\t"
188
"punpckhbw %%mm7, %%mm3 \n\t"
189
"movq %%mm0, (%1, %%eax)\n\t"
190
"movq %%mm1, 8(%1, %%eax)\n\t"
191
"movq %%mm2, 16(%1, %%eax)\n\t"
192
"movq %%mm3, 24(%1, %%eax)\n\t"
194
"addl $32, %%eax \n\t"
197
: "r" (block+64), "r" (line_size), "r" (line_size*2)
202
static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
205
"pxor %%mm7, %%mm7 \n\t"
206
"movl $-128, %%eax \n\t"
209
"movq (%0), %%mm0 \n\t"
210
"movq (%1), %%mm2 \n\t"
211
"movq %%mm0, %%mm1 \n\t"
212
"movq %%mm2, %%mm3 \n\t"
213
"punpcklbw %%mm7, %%mm0 \n\t"
214
"punpckhbw %%mm7, %%mm1 \n\t"
215
"punpcklbw %%mm7, %%mm2 \n\t"
216
"punpckhbw %%mm7, %%mm3 \n\t"
217
"psubw %%mm2, %%mm0 \n\t"
218
"psubw %%mm3, %%mm1 \n\t"
219
"movq %%mm0, (%2, %%eax)\n\t"
220
"movq %%mm1, 8(%2, %%eax)\n\t"
223
"addl $16, %%eax \n\t"
225
: "+r" (s1), "+r" (s2)
226
: "r" (block+64), "r" (stride)
231
void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
236
/* read the pixels */
242
"movq 8%3, %%mm1\n\t"
243
"movq 16%3, %%mm2\n\t"
244
"movq 24%3, %%mm3\n\t"
245
"movq 32%3, %%mm4\n\t"
246
"movq 40%3, %%mm5\n\t"
247
"movq 48%3, %%mm6\n\t"
248
"movq 56%3, %%mm7\n\t"
249
"packuswb %%mm1, %%mm0\n\t"
250
"packuswb %%mm3, %%mm2\n\t"
251
"packuswb %%mm5, %%mm4\n\t"
252
"packuswb %%mm7, %%mm6\n\t"
253
"movq %%mm0, (%0)\n\t"
254
"movq %%mm2, (%0, %1)\n\t"
255
"movq %%mm4, (%0, %1, 2)\n\t"
256
"movq %%mm6, (%0, %2)\n\t"
257
::"r" (pix), "r" (line_size), "r" (line_size*3), "m"(*p)
262
// if here would be an exact copy of the code above
263
// compiler would generate some very strange code
266
"movq (%3), %%mm0\n\t"
267
"movq 8(%3), %%mm1\n\t"
268
"movq 16(%3), %%mm2\n\t"
269
"movq 24(%3), %%mm3\n\t"
270
"movq 32(%3), %%mm4\n\t"
271
"movq 40(%3), %%mm5\n\t"
272
"movq 48(%3), %%mm6\n\t"
273
"movq 56(%3), %%mm7\n\t"
274
"packuswb %%mm1, %%mm0\n\t"
275
"packuswb %%mm3, %%mm2\n\t"
276
"packuswb %%mm5, %%mm4\n\t"
277
"packuswb %%mm7, %%mm6\n\t"
278
"movq %%mm0, (%0)\n\t"
279
"movq %%mm2, (%0, %1)\n\t"
280
"movq %%mm4, (%0, %1, 2)\n\t"
281
"movq %%mm6, (%0, %2)\n\t"
282
::"r" (pix), "r" (line_size), "r" (line_size*3), "r"(p)
286
void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
292
/* read the pixels */
299
"movq (%2), %%mm0\n\t"
300
"movq 8(%2), %%mm1\n\t"
301
"movq 16(%2), %%mm2\n\t"
302
"movq 24(%2), %%mm3\n\t"
305
"movq %%mm4, %%mm5\n\t"
306
"punpcklbw %%mm7, %%mm4\n\t"
307
"punpckhbw %%mm7, %%mm5\n\t"
308
"paddsw %%mm4, %%mm0\n\t"
309
"paddsw %%mm5, %%mm1\n\t"
310
"movq %%mm6, %%mm5\n\t"
311
"punpcklbw %%mm7, %%mm6\n\t"
312
"punpckhbw %%mm7, %%mm5\n\t"
313
"paddsw %%mm6, %%mm2\n\t"
314
"paddsw %%mm5, %%mm3\n\t"
315
"packuswb %%mm1, %%mm0\n\t"
316
"packuswb %%mm3, %%mm2\n\t"
319
:"+m"(*pix), "+m"(*(pix+line_size))
327
static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
330
"lea (%3, %3), %%eax \n\t"
333
"movq (%1), %%mm0 \n\t"
334
"movq (%1, %3), %%mm1 \n\t"
335
"movq %%mm0, (%2) \n\t"
336
"movq %%mm1, (%2, %3) \n\t"
337
"addl %%eax, %1 \n\t"
338
"addl %%eax, %2 \n\t"
339
"movq (%1), %%mm0 \n\t"
340
"movq (%1, %3), %%mm1 \n\t"
341
"movq %%mm0, (%2) \n\t"
342
"movq %%mm1, (%2, %3) \n\t"
343
"addl %%eax, %1 \n\t"
344
"addl %%eax, %2 \n\t"
347
: "+g"(h), "+r" (pixels), "+r" (block)
353
static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
356
"lea (%3, %3), %%eax \n\t"
359
"movq (%1), %%mm0 \n\t"
360
"movq 8(%1), %%mm4 \n\t"
361
"movq (%1, %3), %%mm1 \n\t"
362
"movq 8(%1, %3), %%mm5 \n\t"
363
"movq %%mm0, (%2) \n\t"
364
"movq %%mm4, 8(%2) \n\t"
365
"movq %%mm1, (%2, %3) \n\t"
366
"movq %%mm5, 8(%2, %3) \n\t"
367
"addl %%eax, %1 \n\t"
368
"addl %%eax, %2 \n\t"
369
"movq (%1), %%mm0 \n\t"
370
"movq 8(%1), %%mm4 \n\t"
371
"movq (%1, %3), %%mm1 \n\t"
372
"movq 8(%1, %3), %%mm5 \n\t"
373
"movq %%mm0, (%2) \n\t"
374
"movq %%mm4, 8(%2) \n\t"
375
"movq %%mm1, (%2, %3) \n\t"
376
"movq %%mm5, 8(%2, %3) \n\t"
377
"addl %%eax, %1 \n\t"
378
"addl %%eax, %2 \n\t"
381
: "+g"(h), "+r" (pixels), "+r" (block)
387
static void clear_blocks_mmx(DCTELEM *blocks)
390
"pxor %%mm7, %%mm7 \n\t"
391
"movl $-128*6, %%eax \n\t"
393
"movq %%mm7, (%0, %%eax) \n\t"
394
"movq %%mm7, 8(%0, %%eax) \n\t"
395
"movq %%mm7, 16(%0, %%eax) \n\t"
396
"movq %%mm7, 24(%0, %%eax) \n\t"
397
"addl $32, %%eax \n\t"
399
: : "r" (((int)blocks)+128*6)
404
static int pix_sum16_mmx(uint8_t * pix, int line_size){
407
int index= -line_size*h;
410
"pxor %%mm7, %%mm7 \n\t"
411
"pxor %%mm6, %%mm6 \n\t"
413
"movq (%2, %1), %%mm0 \n\t"
414
"movq (%2, %1), %%mm1 \n\t"
415
"movq 8(%2, %1), %%mm2 \n\t"
416
"movq 8(%2, %1), %%mm3 \n\t"
417
"punpcklbw %%mm7, %%mm0 \n\t"
418
"punpckhbw %%mm7, %%mm1 \n\t"
419
"punpcklbw %%mm7, %%mm2 \n\t"
420
"punpckhbw %%mm7, %%mm3 \n\t"
421
"paddw %%mm0, %%mm1 \n\t"
422
"paddw %%mm2, %%mm3 \n\t"
423
"paddw %%mm1, %%mm3 \n\t"
424
"paddw %%mm3, %%mm6 \n\t"
427
"movq %%mm6, %%mm5 \n\t"
428
"psrlq $32, %%mm6 \n\t"
429
"paddw %%mm5, %%mm6 \n\t"
430
"movq %%mm6, %%mm5 \n\t"
431
"psrlq $16, %%mm6 \n\t"
432
"paddw %%mm5, %%mm6 \n\t"
433
"movd %%mm6, %0 \n\t"
434
"andl $0xFFFF, %0 \n\t"
435
: "=&r" (sum), "+r" (index)
436
: "r" (pix - index), "r" (line_size)
442
static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
446
"movq (%1, %0), %%mm0 \n\t"
447
"movq (%2, %0), %%mm1 \n\t"
448
"paddb %%mm0, %%mm1 \n\t"
449
"movq %%mm1, (%2, %0) \n\t"
450
"movq 8(%1, %0), %%mm0 \n\t"
451
"movq 8(%2, %0), %%mm1 \n\t"
452
"paddb %%mm0, %%mm1 \n\t"
453
"movq %%mm1, 8(%2, %0) \n\t"
458
: "r"(src), "r"(dst), "r"(w-15)
461
dst[i+0] += src[i+0];
464
static int pix_norm1_mmx(uint8_t *pix, int line_size) {
471
"movq (%0),%%mm2\n" /* mm2 = pix[0-7] */
472
"movq 8(%0),%%mm3\n" /* mm3 = pix[8-15] */
474
"movq %%mm2,%%mm1\n" /* mm1 = mm2 = pix[0-7] */
476
"punpckhbw %%mm0,%%mm1\n" /* mm1 = [pix4-7] */
477
"punpcklbw %%mm0,%%mm2\n" /* mm2 = [pix0-3] */
479
"movq %%mm3,%%mm4\n" /* mm4 = mm3 = pix[8-15] */
480
"punpckhbw %%mm0,%%mm3\n" /* mm3 = [pix12-15] */
481
"punpcklbw %%mm0,%%mm4\n" /* mm4 = [pix8-11] */
483
"pmaddwd %%mm1,%%mm1\n" /* mm1 = (pix0^2+pix1^2,pix2^2+pix3^2) */
484
"pmaddwd %%mm2,%%mm2\n" /* mm2 = (pix4^2+pix5^2,pix6^2+pix7^2) */
486
"pmaddwd %%mm3,%%mm3\n"
487
"pmaddwd %%mm4,%%mm4\n"
489
"paddd %%mm1,%%mm2\n" /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2,
490
pix2^2+pix3^2+pix6^2+pix7^2) */
491
"paddd %%mm3,%%mm4\n"
492
"paddd %%mm2,%%mm7\n"
495
"paddd %%mm4,%%mm7\n"
500
"psrlq $32, %%mm7\n" /* shift hi dword to lo */
501
"paddd %%mm7,%%mm1\n"
503
: "+r" (pix), "=r"(tmp) : "r" (line_size) : "%ecx" );
507
static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size) {
511
"pxor %%mm0,%%mm0\n" /* mm0 = 0 */
512
"pxor %%mm7,%%mm7\n" /* mm7 holds the sum */
514
"movq (%0),%%mm1\n" /* mm1 = pix1[0-7] */
515
"movq (%1),%%mm2\n" /* mm2 = pix2[0-7] */
516
"movq 8(%0),%%mm3\n" /* mm3 = pix1[8-15] */
517
"movq 8(%1),%%mm4\n" /* mm4 = pix2[8-15] */
519
/* todo: mm1-mm2, mm3-mm4 */
520
/* algo: substract mm1 from mm2 with saturation and vice versa */
521
/* OR the results to get absolute difference */
524
"psubusb %%mm2,%%mm1\n"
525
"psubusb %%mm4,%%mm3\n"
526
"psubusb %%mm5,%%mm2\n"
527
"psubusb %%mm6,%%mm4\n"
532
/* now convert to 16-bit vectors so we can square them */
536
"punpckhbw %%mm0,%%mm2\n"
537
"punpckhbw %%mm0,%%mm4\n"
538
"punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
539
"punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */
541
"pmaddwd %%mm2,%%mm2\n"
542
"pmaddwd %%mm4,%%mm4\n"
543
"pmaddwd %%mm1,%%mm1\n"
544
"pmaddwd %%mm3,%%mm3\n"
549
"paddd %%mm2,%%mm1\n"
550
"paddd %%mm4,%%mm3\n"
551
"paddd %%mm1,%%mm7\n"
552
"paddd %%mm3,%%mm7\n"
558
"psrlq $32, %%mm7\n" /* shift hi dword to lo */
559
"paddd %%mm7,%%mm1\n"
561
: "+r" (pix1), "+r" (pix2), "=r"(tmp) : "r" (line_size) : "ecx");
565
static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
569
"movq (%2, %0), %%mm0 \n\t"
570
"movq (%1, %0), %%mm1 \n\t"
571
"psubb %%mm0, %%mm1 \n\t"
572
"movq %%mm1, (%3, %0) \n\t"
573
"movq 8(%2, %0), %%mm0 \n\t"
574
"movq 8(%1, %0), %%mm1 \n\t"
575
"psubb %%mm0, %%mm1 \n\t"
576
"movq %%mm1, 8(%3, %0) \n\t"
581
: "r"(src1), "r"(src2), "r"(dst), "r"(w-15)
584
dst[i+0] = src1[i+0]-src2[i+0];
586
/* Two parallel word-wise butterflies: for each pair (a,b),
 * a := a + b and b := 2*b - (a+b) = b - a (using the updated a). */
#define LBUTTERFLY2(a1,b1,a2,b2)\
    "paddw " #b1 ", " #a1 " \n\t"\
    "paddw " #b2 ", " #a2 " \n\t"\
    "paddw " #b1 ", " #b1 " \n\t"\
    "paddw " #b2 ", " #b2 " \n\t"\
    "psubw " #a1 ", " #b1 " \n\t"\
    "psubw " #a2 ", " #b2 " \n\t"
595
LBUTTERFLY2(%%mm0, %%mm1, %%mm2, %%mm3)\
596
LBUTTERFLY2(%%mm4, %%mm5, %%mm6, %%mm7)\
597
LBUTTERFLY2(%%mm0, %%mm2, %%mm1, %%mm3)\
598
LBUTTERFLY2(%%mm4, %%mm6, %%mm5, %%mm7)\
599
LBUTTERFLY2(%%mm0, %%mm4, %%mm1, %%mm5)\
600
LBUTTERFLY2(%%mm2, %%mm6, %%mm3, %%mm7)\
603
"pxor " #z ", " #z " \n\t"\
604
"pcmpgtw " #a ", " #z " \n\t"\
605
"pxor " #z ", " #a " \n\t"\
606
"psubw " #z ", " #a " \n\t"
608
/* a := |a| per word (sign-mask trick: xor with sign mask, subtract it),
 * then sum += |a| with unsigned saturation.  z is trashed. */
#define MMABS_SUM(a,z, sum)\
    "pxor " #z ", " #z "        \n\t"\
    "pcmpgtw " #a ", " #z "     \n\t"\
    "pxor " #z ", " #a "        \n\t"\
    "psubw " #z ", " #a "       \n\t"\
    "paddusw " #a ", " #sum "   \n\t"
615
/* a := |a| per word using MMX2 pmaxsw: a = max(a, 0 - a).  z is trashed. */
#define MMABS_MMX2(a,z)\
    "pxor " #z ", " #z "        \n\t"\
    "psubw " #a ", " #z "       \n\t"\
    "pmaxsw " #z ", " #a "      \n\t"
620
/* MMX2 variant of MMABS_SUM: a := |a| via pmaxsw, then
 * sum += |a| with unsigned saturation.  z is trashed. */
#define MMABS_SUM_MMX2(a,z, sum)\
    "pxor " #z ", " #z "        \n\t"\
    "psubw " #a ", " #z "       \n\t"\
    "pmaxsw " #z ", " #a "      \n\t"\
    "paddusw " #a ", " #sum "   \n\t"
626
/* Interleave a and b: a gets the low-half interleave, t the high-half.
 * n selects the granularity ("wd" = words, "dq" = dwords). */
#define SBUTTERFLY(a,b,t,n)\
    "movq " #a ", " #t "                \n\t" /* abcd */\
    "punpckl" #n " " #b ", " #a "       \n\t" /* aebf */\
    "punpckh" #n " " #b ", " #t "       \n\t" /* cgdh */
631
/* Transpose a 4x4 block of words held in a,b,c,d using t as scratch.
 * After the three butterfly passes the rows end up in a, d, t, c
 * (see the per-step register-content comments). */
#define TRANSPOSE4(a,b,c,d,t)\
    SBUTTERFLY(a,b,t,wd) /* a=aebf t=cgdh */\
    SBUTTERFLY(c,d,b,wd) /* c=imjn b=kolp */\
    SBUTTERFLY(a,c,d,dq) /* a=aeim d=bfjn */\
    SBUTTERFLY(t,b,c,dq) /* t=cgko c=dhlp */
637
/* Load four rows of 4 words each (row stride = 16 bytes) starting at
 * byte offset o of the buffer in asm operand %1 into registers a..d. */
#define LOAD4(o, a, b, c, d)\
    "movq "#o"(%1), " #a "      \n\t"\
    "movq "#o"+16(%1), " #b "   \n\t"\
    "movq "#o"+32(%1), " #c "   \n\t"\
    "movq "#o"+48(%1), " #d "   \n\t"
643
/* Store registers a..d to four rows (row stride = 16 bytes) starting at
 * byte offset o of the buffer in asm operand %1.  Inverse of LOAD4. */
#define STORE4(o, a, b, c, d)\
    "movq "#a", "#o"(%1)        \n\t"\
    "movq "#b", "#o"+16(%1)     \n\t"\
    "movq "#c", "#o"+32(%1)     \n\t"\
    "movq "#d", "#o"+48(%1)     \n\t"
649
static int hadamard8_diff_mmx(void *s, uint8_t *src1, uint8_t *src2, int stride){
650
uint64_t temp[16] __align8;
653
diff_pixels_mmx((DCTELEM*)temp, src1, src2, stride);
656
LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
657
LOAD4(64, %%mm4, %%mm5, %%mm6, %%mm7)
661
"movq %%mm7, 112(%1) \n\t"
663
TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
664
STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)
666
"movq 112(%1), %%mm7 \n\t"
667
TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
668
STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)
670
LOAD4(8 , %%mm0, %%mm1, %%mm2, %%mm3)
671
LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)
675
"movq %%mm7, 120(%1) \n\t"
677
TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
678
STORE4(8 , %%mm0, %%mm3, %%mm7, %%mm2)
680
"movq 120(%1), %%mm7 \n\t"
681
TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
682
"movq %%mm7, %%mm5 \n\t"//FIXME remove
683
"movq %%mm6, %%mm7 \n\t"
684
"movq %%mm0, %%mm6 \n\t"
685
// STORE4(72, %%mm4, %%mm7, %%mm0, %%mm6) //FIXME remove
687
LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)
688
// LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)
691
"movq %%mm7, 64(%1) \n\t"
693
MMABS_SUM(%%mm1, %%mm7, %%mm0)
694
MMABS_SUM(%%mm2, %%mm7, %%mm0)
695
MMABS_SUM(%%mm3, %%mm7, %%mm0)
696
MMABS_SUM(%%mm4, %%mm7, %%mm0)
697
MMABS_SUM(%%mm5, %%mm7, %%mm0)
698
MMABS_SUM(%%mm6, %%mm7, %%mm0)
699
"movq 64(%1), %%mm1 \n\t"
700
MMABS_SUM(%%mm1, %%mm7, %%mm0)
701
"movq %%mm0, 64(%1) \n\t"
703
LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
704
LOAD4(8 , %%mm4, %%mm5, %%mm6, %%mm7)
707
"movq %%mm7, (%1) \n\t"
709
MMABS_SUM(%%mm1, %%mm7, %%mm0)
710
MMABS_SUM(%%mm2, %%mm7, %%mm0)
711
MMABS_SUM(%%mm3, %%mm7, %%mm0)
712
MMABS_SUM(%%mm4, %%mm7, %%mm0)
713
MMABS_SUM(%%mm5, %%mm7, %%mm0)
714
MMABS_SUM(%%mm6, %%mm7, %%mm0)
715
"movq (%1), %%mm1 \n\t"
716
MMABS_SUM(%%mm1, %%mm7, %%mm0)
717
"movq 64(%1), %%mm1 \n\t"
718
MMABS_SUM(%%mm1, %%mm7, %%mm0)
720
"movq %%mm0, %%mm1 \n\t"
721
"psrlq $32, %%mm0 \n\t"
722
"paddusw %%mm1, %%mm0 \n\t"
723
"movq %%mm0, %%mm1 \n\t"
724
"psrlq $16, %%mm0 \n\t"
725
"paddusw %%mm1, %%mm0 \n\t"
726
"movd %%mm0, %0 \n\t"
734
static int hadamard8_diff_mmx2(void *s, uint8_t *src1, uint8_t *src2, int stride){
735
uint64_t temp[16] __align8;
738
diff_pixels_mmx((DCTELEM*)temp, src1, src2, stride);
741
LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
742
LOAD4(64, %%mm4, %%mm5, %%mm6, %%mm7)
746
"movq %%mm7, 112(%1) \n\t"
748
TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
749
STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)
751
"movq 112(%1), %%mm7 \n\t"
752
TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
753
STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)
755
LOAD4(8 , %%mm0, %%mm1, %%mm2, %%mm3)
756
LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)
760
"movq %%mm7, 120(%1) \n\t"
762
TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
763
STORE4(8 , %%mm0, %%mm3, %%mm7, %%mm2)
765
"movq 120(%1), %%mm7 \n\t"
766
TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
767
"movq %%mm7, %%mm5 \n\t"//FIXME remove
768
"movq %%mm6, %%mm7 \n\t"
769
"movq %%mm0, %%mm6 \n\t"
770
// STORE4(72, %%mm4, %%mm7, %%mm0, %%mm6) //FIXME remove
772
LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)
773
// LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)
776
"movq %%mm7, 64(%1) \n\t"
777
MMABS_MMX2(%%mm0, %%mm7)
778
MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
779
MMABS_SUM_MMX2(%%mm2, %%mm7, %%mm0)
780
MMABS_SUM_MMX2(%%mm3, %%mm7, %%mm0)
781
MMABS_SUM_MMX2(%%mm4, %%mm7, %%mm0)
782
MMABS_SUM_MMX2(%%mm5, %%mm7, %%mm0)
783
MMABS_SUM_MMX2(%%mm6, %%mm7, %%mm0)
784
"movq 64(%1), %%mm1 \n\t"
785
MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
786
"movq %%mm0, 64(%1) \n\t"
788
LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
789
LOAD4(8 , %%mm4, %%mm5, %%mm6, %%mm7)
792
"movq %%mm7, (%1) \n\t"
793
MMABS_MMX2(%%mm0, %%mm7)
794
MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
795
MMABS_SUM_MMX2(%%mm2, %%mm7, %%mm0)
796
MMABS_SUM_MMX2(%%mm3, %%mm7, %%mm0)
797
MMABS_SUM_MMX2(%%mm4, %%mm7, %%mm0)
798
MMABS_SUM_MMX2(%%mm5, %%mm7, %%mm0)
799
MMABS_SUM_MMX2(%%mm6, %%mm7, %%mm0)
800
"movq (%1), %%mm1 \n\t"
801
MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
802
"movq 64(%1), %%mm1 \n\t"
803
MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
805
"movq %%mm0, %%mm1 \n\t"
806
"psrlq $32, %%mm0 \n\t"
807
"paddusw %%mm1, %%mm0 \n\t"
808
"movq %%mm0, %%mm1 \n\t"
809
"psrlq $16, %%mm0 \n\t"
810
"paddusw %%mm1, %%mm0 \n\t"
811
"movd %%mm0, %0 \n\t"
820
WARPER88_1616(hadamard8_diff_mmx, hadamard8_diff16_mmx)
821
WARPER88_1616(hadamard8_diff_mmx2, hadamard8_diff16_mmx2)
823
#define put_no_rnd_pixels8_mmx(a,b,c,d) put_pixels8_mmx(a,b,c,d)
824
#define put_no_rnd_pixels16_mmx(a,b,c,d) put_pixels16_mmx(a,b,c,d)
826
/* One output row of the MPEG-4 qpel vertical 6-tap lowpass filter:
 * computes 20*x1 - 6*x2 + 3*x3 - x4 (+ rounding), shifts right by 5,
 * packs to bytes and hands the result to OP for storing at `out`.
 * m3..m6 hold preloaded rows; in0/in1/in2/in7 are memory operands for
 * the remaining taps.  Clobbers mm4, mm5, mm6; mm7 is passed to OP.
 * Relies on MANGLE, ff_pw_20 and ff_pw_3 defined elsewhere in the file. */
#define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\
    "paddw " #m4 ", " #m3 "             \n\t" /* x1 */\
    "movq "MANGLE(ff_pw_20)", %%mm4     \n\t" /* 20 */\
    "pmullw " #m3 ", %%mm4              \n\t" /* 20x1 */\
    "movq "#in7", " #m3 "               \n\t" /* d */\
    "movq "#in0", %%mm5                 \n\t" /* D */\
    "paddw " #m3 ", %%mm5               \n\t" /* x4 */\
    "psubw %%mm5, %%mm4                 \n\t" /* 20x1 - x4 */\
    "movq "#in1", %%mm5                 \n\t" /* C */\
    "movq "#in2", %%mm6                 \n\t" /* B */\
    "paddw " #m6 ", %%mm5               \n\t" /* x3 */\
    "paddw " #m5 ", %%mm6               \n\t" /* x2 */\
    "paddw %%mm6, %%mm6                 \n\t" /* 2x2 */\
    "psubw %%mm6, %%mm5                 \n\t" /* -2x2 + x3 */\
    "pmullw "MANGLE(ff_pw_3)", %%mm5    \n\t" /* -6x2 + 3x3 */\
    "paddw " #rnd ", %%mm4              \n\t" /* 20x1 - x4 + rounding */\
    "paddw %%mm4, %%mm5                 \n\t" /* 20x1 - 6x2 + 3x3 - x4 */\
    "psraw $5, %%mm5                    \n\t"\
    "packuswb %%mm5, %%mm5              \n\t"\
    OP(%%mm5, out, %%mm7, d)
847
#define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\
848
static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
852
"pxor %%mm7, %%mm7 \n\t"\
854
"movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
855
"movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
856
"movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
857
"punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
858
"punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
859
"pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
860
"pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
861
"movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
862
"movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
863
"psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
864
"psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
865
"psllq $24, %%mm4 \n\t" /* 000ABCDE */\
866
"punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
867
"punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
868
"punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
869
"paddw %%mm3, %%mm5 \n\t" /* b */\
870
"paddw %%mm2, %%mm6 \n\t" /* c */\
871
"paddw %%mm5, %%mm5 \n\t" /* 2b */\
872
"psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
873
"pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
874
"pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
875
"paddw %%mm4, %%mm0 \n\t" /* a */\
876
"paddw %%mm1, %%mm5 \n\t" /* d */\
877
"pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
878
"psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
879
"paddw %6, %%mm6 \n\t"\
880
"paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
881
"psraw $5, %%mm0 \n\t"\
882
"movq %%mm0, %5 \n\t"\
883
/* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
885
"movq 5(%0), %%mm0 \n\t" /* FGHIJKLM */\
886
"movq %%mm0, %%mm5 \n\t" /* FGHIJKLM */\
887
"movq %%mm0, %%mm6 \n\t" /* FGHIJKLM */\
888
"psrlq $8, %%mm0 \n\t" /* GHIJKLM0 */\
889
"psrlq $16, %%mm5 \n\t" /* HIJKLM00 */\
890
"punpcklbw %%mm7, %%mm0 \n\t" /* 0G0H0I0J */\
891
"punpcklbw %%mm7, %%mm5 \n\t" /* 0H0I0J0K */\
892
"paddw %%mm0, %%mm2 \n\t" /* b */\
893
"paddw %%mm5, %%mm3 \n\t" /* c */\
894
"paddw %%mm2, %%mm2 \n\t" /* 2b */\
895
"psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
896
"movq %%mm6, %%mm2 \n\t" /* FGHIJKLM */\
897
"psrlq $24, %%mm6 \n\t" /* IJKLM000 */\
898
"punpcklbw %%mm7, %%mm2 \n\t" /* 0F0G0H0I */\
899
"punpcklbw %%mm7, %%mm6 \n\t" /* 0I0J0K0L */\
900
"pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
901
"paddw %%mm2, %%mm1 \n\t" /* a */\
902
"paddw %%mm6, %%mm4 \n\t" /* d */\
903
"pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
904
"psubw %%mm4, %%mm3 \n\t" /* - 6b +3c - d */\
905
"paddw %6, %%mm1 \n\t"\
906
"paddw %%mm1, %%mm3 \n\t" /* 20a - 6b +3c - d */\
907
"psraw $5, %%mm3 \n\t"\
908
"movq %5, %%mm1 \n\t"\
909
"packuswb %%mm3, %%mm1 \n\t"\
910
OP_MMX2(%%mm1, (%1),%%mm4, q)\
911
/* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\
913
"movq 9(%0), %%mm1 \n\t" /* JKLMNOPQ */\
914
"movq %%mm1, %%mm4 \n\t" /* JKLMNOPQ */\
915
"movq %%mm1, %%mm3 \n\t" /* JKLMNOPQ */\
916
"psrlq $8, %%mm1 \n\t" /* KLMNOPQ0 */\
917
"psrlq $16, %%mm4 \n\t" /* LMNOPQ00 */\
918
"punpcklbw %%mm7, %%mm1 \n\t" /* 0K0L0M0N */\
919
"punpcklbw %%mm7, %%mm4 \n\t" /* 0L0M0N0O */\
920
"paddw %%mm1, %%mm5 \n\t" /* b */\
921
"paddw %%mm4, %%mm0 \n\t" /* c */\
922
"paddw %%mm5, %%mm5 \n\t" /* 2b */\
923
"psubw %%mm5, %%mm0 \n\t" /* c - 2b */\
924
"movq %%mm3, %%mm5 \n\t" /* JKLMNOPQ */\
925
"psrlq $24, %%mm3 \n\t" /* MNOPQ000 */\
926
"pmullw "MANGLE(ff_pw_3)", %%mm0 \n\t" /* 3c - 6b */\
927
"punpcklbw %%mm7, %%mm3 \n\t" /* 0M0N0O0P */\
928
"paddw %%mm3, %%mm2 \n\t" /* d */\
929
"psubw %%mm2, %%mm0 \n\t" /* -6b + 3c - d */\
930
"movq %%mm5, %%mm2 \n\t" /* JKLMNOPQ */\
931
"punpcklbw %%mm7, %%mm2 \n\t" /* 0J0K0L0M */\
932
"punpckhbw %%mm7, %%mm5 \n\t" /* 0N0O0P0Q */\
933
"paddw %%mm2, %%mm6 \n\t" /* a */\
934
"pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */\
935
"paddw %6, %%mm0 \n\t"\
936
"paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
937
"psraw $5, %%mm0 \n\t"\
938
/* mm1=KLMN, mm2=JKLM, mm3=MNOP, mm4=LMNO, mm5=NOPQ mm7=0 */\
940
"paddw %%mm5, %%mm3 \n\t" /* a */\
941
"pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0O0P0Q0Q */\
942
"paddw %%mm4, %%mm6 \n\t" /* b */\
943
"pshufw $0xBE, %%mm5, %%mm4 \n\t" /* 0P0Q0Q0P */\
944
"pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0Q0Q0P0O */\
945
"paddw %%mm1, %%mm4 \n\t" /* c */\
946
"paddw %%mm2, %%mm5 \n\t" /* d */\
947
"paddw %%mm6, %%mm6 \n\t" /* 2b */\
948
"psubw %%mm6, %%mm4 \n\t" /* c - 2b */\
949
"pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */\
950
"pmullw "MANGLE(ff_pw_3)", %%mm4 \n\t" /* 3c - 6b */\
951
"psubw %%mm5, %%mm3 \n\t" /* -6b + 3c - d */\
952
"paddw %6, %%mm4 \n\t"\
953
"paddw %%mm3, %%mm4 \n\t" /* 20a - 6b + 3c - d */\
954
"psraw $5, %%mm4 \n\t"\
955
"packuswb %%mm4, %%mm0 \n\t"\
956
OP_MMX2(%%mm0, 8(%1), %%mm4, q)\
962
: "+a"(src), "+c"(dst), "+m"(h)\
963
: "d"(srcStride), "S"(dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
968
static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
971
/* quick HACK, XXX FIXME MUST be optimized */\
974
temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
975
temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
976
temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
977
temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
978
temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
979
temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]);\
980
temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]);\
981
temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]);\
982
temp[ 8]= (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]);\
983
temp[ 9]= (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]);\
984
temp[10]= (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]);\
985
temp[11]= (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]);\
986
temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\
987
temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]);\
988
temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\
989
temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\
991
"movq (%0), %%mm0 \n\t"\
992
"movq 8(%0), %%mm1 \n\t"\
993
"paddw %2, %%mm0 \n\t"\
994
"paddw %2, %%mm1 \n\t"\
995
"psraw $5, %%mm0 \n\t"\
996
"psraw $5, %%mm1 \n\t"\
997
"packuswb %%mm1, %%mm0 \n\t"\
998
OP_3DNOW(%%mm0, (%1), %%mm1, q)\
999
"movq 16(%0), %%mm0 \n\t"\
1000
"movq 24(%0), %%mm1 \n\t"\
1001
"paddw %2, %%mm0 \n\t"\
1002
"paddw %2, %%mm1 \n\t"\
1003
"psraw $5, %%mm0 \n\t"\
1004
"psraw $5, %%mm1 \n\t"\
1005
"packuswb %%mm1, %%mm0 \n\t"\
1006
OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\
1007
:: "r"(temp), "r"(dst), "m"(ROUNDER)\
1015
static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1019
"pxor %%mm7, %%mm7 \n\t"\
1021
"movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
1022
"movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
1023
"movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
1024
"punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
1025
"punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
1026
"pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
1027
"pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
1028
"movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
1029
"movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
1030
"psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
1031
"psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
1032
"psllq $24, %%mm4 \n\t" /* 000ABCDE */\
1033
"punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
1034
"punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
1035
"punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
1036
"paddw %%mm3, %%mm5 \n\t" /* b */\
1037
"paddw %%mm2, %%mm6 \n\t" /* c */\
1038
"paddw %%mm5, %%mm5 \n\t" /* 2b */\
1039
"psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
1040
"pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
1041
"pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
1042
"paddw %%mm4, %%mm0 \n\t" /* a */\
1043
"paddw %%mm1, %%mm5 \n\t" /* d */\
1044
"pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
1045
"psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
1046
"paddw %6, %%mm6 \n\t"\
1047
"paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
1048
"psraw $5, %%mm0 \n\t"\
1049
/* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
1051
"movd 5(%0), %%mm5 \n\t" /* FGHI */\
1052
"punpcklbw %%mm7, %%mm5 \n\t" /* 0F0G0H0I */\
1053
"pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0G0H0I0I */\
1054
"paddw %%mm5, %%mm1 \n\t" /* a */\
1055
"paddw %%mm6, %%mm2 \n\t" /* b */\
1056
"pshufw $0xBE, %%mm5, %%mm6 \n\t" /* 0H0I0I0H */\
1057
"pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0I0I0H0G */\
1058
"paddw %%mm6, %%mm3 \n\t" /* c */\
1059
"paddw %%mm5, %%mm4 \n\t" /* d */\
1060
"paddw %%mm2, %%mm2 \n\t" /* 2b */\
1061
"psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
1062
"pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
1063
"pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
1064
"psubw %%mm4, %%mm3 \n\t" /* -6b + 3c - d */\
1065
"paddw %6, %%mm1 \n\t"\
1066
"paddw %%mm1, %%mm3 \n\t" /* 20a - 6b + 3c - d */\
1067
"psraw $5, %%mm3 \n\t"\
1068
"packuswb %%mm3, %%mm0 \n\t"\
1069
OP_MMX2(%%mm0, (%1), %%mm4, q)\
1075
: "+a"(src), "+c"(dst), "+m"(h)\
1076
: "S"(srcStride), "D"(dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
1081
static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1084
/* quick HACK, XXX FIXME MUST be optimized */\
1087
temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
1088
temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
1089
temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
1090
temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
1091
temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
1092
temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\
1093
temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\
1094
temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\
1096
"movq (%0), %%mm0 \n\t"\
1097
"movq 8(%0), %%mm1 \n\t"\
1098
"paddw %2, %%mm0 \n\t"\
1099
"paddw %2, %%mm1 \n\t"\
1100
"psraw $5, %%mm0 \n\t"\
1101
"psraw $5, %%mm1 \n\t"\
1102
"packuswb %%mm1, %%mm0 \n\t"\
1103
OP_3DNOW(%%mm0, (%1), %%mm1, q)\
1104
:: "r"(temp), "r"(dst), "m"(ROUNDER)\
1112
/* Generates the full set of qpel entry points (8x8 and 16x16, vertical lowpass
 * helpers plus all 16 sub-pel positions) for one combination of operation
 * (OPNAME: put_/avg_/put_no_rnd_), rounder constant and instruction set (MMX). */
#define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\
1114
static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1115
uint64_t temp[17*4];\
1116
uint64_t *temp_ptr= temp;\
1121
"pxor %%mm7, %%mm7 \n\t"\
1123
"movq (%0), %%mm0 \n\t"\
1124
"movq (%0), %%mm1 \n\t"\
1125
"movq 8(%0), %%mm2 \n\t"\
1126
"movq 8(%0), %%mm3 \n\t"\
1127
"punpcklbw %%mm7, %%mm0 \n\t"\
1128
"punpckhbw %%mm7, %%mm1 \n\t"\
1129
"punpcklbw %%mm7, %%mm2 \n\t"\
1130
"punpckhbw %%mm7, %%mm3 \n\t"\
1131
"movq %%mm0, (%1) \n\t"\
1132
"movq %%mm1, 17*8(%1) \n\t"\
1133
"movq %%mm2, 2*17*8(%1) \n\t"\
1134
"movq %%mm3, 3*17*8(%1) \n\t"\
1139
: "+r" (src), "+r" (temp_ptr), "+r"(count)\
1147
/*FIXME reorder for speed */\
1149
/*"pxor %%mm7, %%mm7 \n\t"*/\
1151
"movq (%0), %%mm0 \n\t"\
1152
"movq 8(%0), %%mm1 \n\t"\
1153
"movq 16(%0), %%mm2 \n\t"\
1154
"movq 24(%0), %%mm3 \n\t"\
1155
QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
1156
QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
1158
QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
1160
QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
1162
QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
1163
QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\
1165
QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\
1166
QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\
1168
QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\
1169
QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\
1171
QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\
1172
QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\
1174
QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\
1176
QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\
1177
"addl %4, %1 \n\t" \
1178
QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\
1179
QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\
1181
"addl $136, %0 \n\t"\
1186
: "+r"(temp_ptr), "+r"(dst), "+g"(count)\
1187
: "r"(dstStride), "r"(2*dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*dstStride)\
1192
static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1193
uint64_t temp[9*4];\
1194
uint64_t *temp_ptr= temp;\
1199
"pxor %%mm7, %%mm7 \n\t"\
1201
"movq (%0), %%mm0 \n\t"\
1202
"movq (%0), %%mm1 \n\t"\
1203
"punpcklbw %%mm7, %%mm0 \n\t"\
1204
"punpckhbw %%mm7, %%mm1 \n\t"\
1205
"movq %%mm0, (%1) \n\t"\
1206
"movq %%mm1, 9*8(%1) \n\t"\
1211
: "+r" (src), "+r" (temp_ptr), "+r"(count)\
1219
/*FIXME reorder for speed */\
1221
/*"pxor %%mm7, %%mm7 \n\t"*/\
1223
"movq (%0), %%mm0 \n\t"\
1224
"movq 8(%0), %%mm1 \n\t"\
1225
"movq 16(%0), %%mm2 \n\t"\
1226
"movq 24(%0), %%mm3 \n\t"\
1227
QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
1228
QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
1230
QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
1232
QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
1234
QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
1236
QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\
1238
QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\
1239
QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\
1241
"addl $72, %0 \n\t"\
1246
: "+r"(temp_ptr), "+r"(dst), "+g"(count)\
1247
: "r"(dstStride), "r"(2*dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-6*dstStride)\
1252
static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
1253
OPNAME ## pixels8_mmx(dst, src, stride, 8);\
1256
static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1258
uint8_t * const half= (uint8_t*)temp;\
1259
put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
1260
OPNAME ## pixels8_l2_mmx(dst, src, half, stride, stride, 8);\
1263
static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1264
OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\
1267
static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1269
uint8_t * const half= (uint8_t*)temp;\
1270
put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
1271
OPNAME ## pixels8_l2_mmx(dst, src+1, half, stride, stride, 8);\
1274
static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1276
uint8_t * const half= (uint8_t*)temp;\
1277
put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
1278
OPNAME ## pixels8_l2_mmx(dst, src, half, stride, stride, 8);\
1281
static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1282
OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\
1285
static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1287
uint8_t * const half= (uint8_t*)temp;\
1288
put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
1289
OPNAME ## pixels8_l2_mmx(dst, src+stride, half, stride, stride, 8);\
1291
static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1292
uint64_t half[8 + 9];\
1293
uint8_t * const halfH= ((uint8_t*)half) + 64;\
1294
uint8_t * const halfHV= ((uint8_t*)half);\
1295
put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1296
put ## RND ## pixels8_l2_mmx(halfH, src, halfH, 8, stride, 9);\
1297
put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1298
OPNAME ## pixels8_l2_mmx(dst, halfH, halfHV, stride, 8, 8);\
1300
static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1301
uint64_t half[8 + 9];\
1302
uint8_t * const halfH= ((uint8_t*)half) + 64;\
1303
uint8_t * const halfHV= ((uint8_t*)half);\
1304
put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1305
put ## RND ## pixels8_l2_mmx(halfH, src+1, halfH, 8, stride, 9);\
1306
put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1307
OPNAME ## pixels8_l2_mmx(dst, halfH, halfHV, stride, 8, 8);\
1309
static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1310
uint64_t half[8 + 9];\
1311
uint8_t * const halfH= ((uint8_t*)half) + 64;\
1312
uint8_t * const halfHV= ((uint8_t*)half);\
1313
put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1314
put ## RND ## pixels8_l2_mmx(halfH, src, halfH, 8, stride, 9);\
1315
put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1316
OPNAME ## pixels8_l2_mmx(dst, halfH+8, halfHV, stride, 8, 8);\
1318
static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1319
uint64_t half[8 + 9];\
1320
uint8_t * const halfH= ((uint8_t*)half) + 64;\
1321
uint8_t * const halfHV= ((uint8_t*)half);\
1322
put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1323
put ## RND ## pixels8_l2_mmx(halfH, src+1, halfH, 8, stride, 9);\
1324
put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1325
OPNAME ## pixels8_l2_mmx(dst, halfH+8, halfHV, stride, 8, 8);\
1327
static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1328
uint64_t half[8 + 9];\
1329
uint8_t * const halfH= ((uint8_t*)half) + 64;\
1330
uint8_t * const halfHV= ((uint8_t*)half);\
1331
put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1332
put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1333
OPNAME ## pixels8_l2_mmx(dst, halfH, halfHV, stride, 8, 8);\
1335
static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1336
uint64_t half[8 + 9];\
1337
uint8_t * const halfH= ((uint8_t*)half) + 64;\
1338
uint8_t * const halfHV= ((uint8_t*)half);\
1339
put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1340
put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1341
OPNAME ## pixels8_l2_mmx(dst, halfH+8, halfHV, stride, 8, 8);\
1343
static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1344
uint64_t half[8 + 9];\
1345
uint8_t * const halfH= ((uint8_t*)half);\
1346
put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1347
put ## RND ## pixels8_l2_mmx(halfH, src, halfH, 8, stride, 9);\
1348
OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
1350
static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1351
uint64_t half[8 + 9];\
1352
uint8_t * const halfH= ((uint8_t*)half);\
1353
put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1354
put ## RND ## pixels8_l2_mmx(halfH, src+1, halfH, 8, stride, 9);\
1355
OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
1357
static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1359
uint8_t * const halfH= ((uint8_t*)half);\
1360
put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1361
OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
1363
static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
1364
OPNAME ## pixels16_mmx(dst, src, stride, 16);\
1367
static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1369
uint8_t * const half= (uint8_t*)temp;\
1370
put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
1371
OPNAME ## pixels16_l2_mmx(dst, src, half, stride, stride, 16);\
1374
static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1375
OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\
1378
static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1380
uint8_t * const half= (uint8_t*)temp;\
1381
put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
1382
OPNAME ## pixels16_l2_mmx(dst, src+1, half, stride, stride, 16);\
1385
static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1387
uint8_t * const half= (uint8_t*)temp;\
1388
put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
1389
OPNAME ## pixels16_l2_mmx(dst, src, half, stride, stride, 16);\
1392
static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1393
OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\
1396
static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1398
uint8_t * const half= (uint8_t*)temp;\
1399
put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
1400
OPNAME ## pixels16_l2_mmx(dst, src+stride, half, stride, stride, 16);\
1402
static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1403
uint64_t half[16*2 + 17*2];\
1404
uint8_t * const halfH= ((uint8_t*)half) + 256;\
1405
uint8_t * const halfHV= ((uint8_t*)half);\
1406
put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1407
put ## RND ## pixels16_l2_mmx(halfH, src, halfH, 16, stride, 17);\
1408
put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
1409
OPNAME ## pixels16_l2_mmx(dst, halfH, halfHV, stride, 16, 16);\
1411
static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1412
uint64_t half[16*2 + 17*2];\
1413
uint8_t * const halfH= ((uint8_t*)half) + 256;\
1414
uint8_t * const halfHV= ((uint8_t*)half);\
1415
put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1416
put ## RND ## pixels16_l2_mmx(halfH, src+1, halfH, 16, stride, 17);\
1417
put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
1418
OPNAME ## pixels16_l2_mmx(dst, halfH, halfHV, stride, 16, 16);\
1420
static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1421
uint64_t half[16*2 + 17*2];\
1422
uint8_t * const halfH= ((uint8_t*)half) + 256;\
1423
uint8_t * const halfHV= ((uint8_t*)half);\
1424
put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1425
put ## RND ## pixels16_l2_mmx(halfH, src, halfH, 16, stride, 17);\
1426
put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
1427
OPNAME ## pixels16_l2_mmx(dst, halfH+16, halfHV, stride, 16, 16);\
1429
static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1430
uint64_t half[16*2 + 17*2];\
1431
uint8_t * const halfH= ((uint8_t*)half) + 256;\
1432
uint8_t * const halfHV= ((uint8_t*)half);\
1433
put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1434
put ## RND ## pixels16_l2_mmx(halfH, src+1, halfH, 16, stride, 17);\
1435
put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
1436
OPNAME ## pixels16_l2_mmx(dst, halfH+16, halfHV, stride, 16, 16);\
1438
static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1439
uint64_t half[16*2 + 17*2];\
1440
uint8_t * const halfH= ((uint8_t*)half) + 256;\
1441
uint8_t * const halfHV= ((uint8_t*)half);\
1442
put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1443
put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
1444
OPNAME ## pixels16_l2_mmx(dst, halfH, halfHV, stride, 16, 16);\
1446
static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1447
uint64_t half[16*2 + 17*2];\
1448
uint8_t * const halfH= ((uint8_t*)half) + 256;\
1449
uint8_t * const halfHV= ((uint8_t*)half);\
1450
put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1451
put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
1452
OPNAME ## pixels16_l2_mmx(dst, halfH+16, halfHV, stride, 16, 16);\
1454
static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1455
uint64_t half[17*2];\
1456
uint8_t * const halfH= ((uint8_t*)half);\
1457
put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1458
put ## RND ## pixels16_l2_mmx(halfH, src, halfH, 16, stride, 17);\
1459
OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
1461
static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1462
uint64_t half[17*2];\
1463
uint8_t * const halfH= ((uint8_t*)half);\
1464
put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1465
put ## RND ## pixels16_l2_mmx(halfH, src+1, halfH, 16, stride, 17);\
1466
OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
1468
static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1469
uint64_t half[17*2];\
1470
uint8_t * const halfH= ((uint8_t*)half);\
1471
put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1472
OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
1476
/* PUT: store result a straight to destination b; scratch register temp is unused. */
#define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t"
1477
/* AVG via 3DNow! pavgusb: b = rounded byte-wise average of a and the old b.
 * temp is a scratch MMX register; size selects the mov width (d/q).
 * NOTE(review): extraction artifact lines that broke the \-continuation removed. */
#define AVG_3DNOW_OP(a,b,temp, size) \
"mov" #size " " #b ", " #temp " \n\t"\
"pavgusb " #temp ", " #a " \n\t"\
"mov" #size " " #a ", " #b " \n\t"
1481
/* AVG via MMX2/SSE pavgb: b = rounded byte-wise average of a and the old b.
 * temp is a scratch MMX register; size selects the mov width (d/q).
 * NOTE(review): extraction artifact lines that broke the \-continuation removed. */
#define AVG_MMX2_OP(a,b,temp, size) \
"mov" #size " " #b ", " #temp " \n\t"\
"pavgb " #temp ", " #a " \n\t"\
"mov" #size " " #a ", " #b " \n\t"
1486
QPEL_BASE(put_ , ff_pw_16, _ , PUT_OP, PUT_OP)
1487
QPEL_BASE(avg_ , ff_pw_16, _ , AVG_MMX2_OP, AVG_3DNOW_OP)
1488
QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP)
1489
QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, 3dnow)
1490
QPEL_OP(avg_ , ff_pw_16, _ , AVG_3DNOW_OP, 3dnow)
1491
QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow)
1492
QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, mmx2)
1493
QPEL_OP(avg_ , ff_pw_16, _ , AVG_MMX2_OP, mmx2)
1494
QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2)
1497
/* Dummy no-op used to stub out function pointers when speed-testing
 * (assignments appear in the disabled block below).
 * Empty parens declare an unprototyped function in C; use (void). */
static void just_return(void) { return; }
1500
/* Install the put / put_no_rnd / avg variants of one qpel function into the
 * DSPContext slot named by postfix1, from the functions named by postfix2.
 * NOTE(review): extraction artifact lines that broke the \-continuation removed. */
#define SET_QPEL_FUNC(postfix1, postfix2) \
    c->put_ ## postfix1 = put_ ## postfix2;\
    c->put_no_rnd_ ## postfix1 = put_no_rnd_ ## postfix2;\
    c->avg_ ## postfix1 = avg_ ## postfix2;
1505
/* external functions, from idct_mmx.c */
1506
void ff_mmx_idct(DCTELEM *block);
1507
void ff_mmxext_idct(DCTELEM *block);
1509
/* XXX: those functions should be suppressed ASAP when all IDCTs are
   converted to the new type */
1511
/* Run the plain-MMX libmpeg2 IDCT on block, then write the clamped result
 * to dest (line_size = row stride in bytes).
 * NOTE(review): the surrounding braces were lost in extraction; restored. */
static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_mmx_idct (block);
    put_pixels_clamped_mmx(block, dest, line_size);
}
1516
/* Run the plain-MMX libmpeg2 IDCT on block, then add the clamped result
 * into dest (line_size = row stride in bytes).
 * NOTE(review): the surrounding braces were lost in extraction; restored. */
static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_mmx_idct (block);
    add_pixels_clamped_mmx(block, dest, line_size);
}
1521
/* Run the MMXEXT libmpeg2 IDCT on block, then write the clamped result
 * to dest (line_size = row stride in bytes).
 * NOTE(review): the surrounding braces were lost in extraction; restored. */
static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_mmxext_idct (block);
    put_pixels_clamped_mmx(block, dest, line_size);
}
1526
/* Run the MMXEXT libmpeg2 IDCT on block, then add the clamped result
 * into dest (line_size = row stride in bytes).
 * NOTE(review): the surrounding braces were lost in extraction; restored. */
static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_mmxext_idct (block);
    add_pixels_clamped_mmx(block, dest, line_size);
}
1532
void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
1534
mm_flags = mm_support();
1536
if (avctx->dsp_mask) {
1537
if (avctx->dsp_mask & FF_MM_FORCE)
1538
mm_flags |= (avctx->dsp_mask & 0xffff);
1540
mm_flags &= ~(avctx->dsp_mask & 0xffff);
1544
fprintf(stderr, "libavcodec: CPU flags:");
1545
if (mm_flags & MM_MMX)
1546
fprintf(stderr, " mmx");
1547
if (mm_flags & MM_MMXEXT)
1548
fprintf(stderr, " mmxext");
1549
if (mm_flags & MM_3DNOW)
1550
fprintf(stderr, " 3dnow");
1551
if (mm_flags & MM_SSE)
1552
fprintf(stderr, " sse");
1553
if (mm_flags & MM_SSE2)
1554
fprintf(stderr, " sse2");
1555
fprintf(stderr, "\n");
1558
if (mm_flags & MM_MMX) {
1559
const int dct_algo = avctx->dct_algo;
1560
const int idct_algo= avctx->idct_algo;
1562
#ifdef CONFIG_ENCODERS
1563
if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX)
1564
c->fdct = ff_fdct_mmx;
1565
#endif //CONFIG_ENCODERS
1567
if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SIMPLEMMX){
1568
c->idct_put= ff_simple_idct_put_mmx;
1569
c->idct_add= ff_simple_idct_add_mmx;
1570
c->idct = ff_simple_idct_mmx;
1571
c->idct_permutation_type= FF_SIMPLE_IDCT_PERM;
1572
}else if(idct_algo==FF_IDCT_LIBMPEG2MMX){
1573
if(mm_flags & MM_MMXEXT){
1574
c->idct_put= ff_libmpeg2mmx2_idct_put;
1575
c->idct_add= ff_libmpeg2mmx2_idct_add;
1576
c->idct = ff_mmxext_idct;
1578
c->idct_put= ff_libmpeg2mmx_idct_put;
1579
c->idct_add= ff_libmpeg2mmx_idct_add;
1580
c->idct = ff_mmx_idct;
1582
c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
1585
c->get_pixels = get_pixels_mmx;
1586
c->diff_pixels = diff_pixels_mmx;
1587
c->put_pixels_clamped = put_pixels_clamped_mmx;
1588
c->add_pixels_clamped = add_pixels_clamped_mmx;
1589
c->clear_blocks = clear_blocks_mmx;
1590
c->pix_sum = pix_sum16_mmx;
1592
c->put_pixels_tab[0][0] = put_pixels16_mmx;
1593
c->put_pixels_tab[0][1] = put_pixels16_x2_mmx;
1594
c->put_pixels_tab[0][2] = put_pixels16_y2_mmx;
1595
c->put_pixels_tab[0][3] = put_pixels16_xy2_mmx;
1597
c->put_no_rnd_pixels_tab[0][0] = put_pixels16_mmx;
1598
c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx;
1599
c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx;
1600
c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_mmx;
1602
c->avg_pixels_tab[0][0] = avg_pixels16_mmx;
1603
c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx;
1604
c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx;
1605
c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx;
1607
c->avg_no_rnd_pixels_tab[0][0] = avg_no_rnd_pixels16_mmx;
1608
c->avg_no_rnd_pixels_tab[0][1] = avg_no_rnd_pixels16_x2_mmx;
1609
c->avg_no_rnd_pixels_tab[0][2] = avg_no_rnd_pixels16_y2_mmx;
1610
c->avg_no_rnd_pixels_tab[0][3] = avg_no_rnd_pixels16_xy2_mmx;
1612
c->put_pixels_tab[1][0] = put_pixels8_mmx;
1613
c->put_pixels_tab[1][1] = put_pixels8_x2_mmx;
1614
c->put_pixels_tab[1][2] = put_pixels8_y2_mmx;
1615
c->put_pixels_tab[1][3] = put_pixels8_xy2_mmx;
1617
c->put_no_rnd_pixels_tab[1][0] = put_pixels8_mmx;
1618
c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx;
1619
c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx;
1620
c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_mmx;
1622
c->avg_pixels_tab[1][0] = avg_pixels8_mmx;
1623
c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx;
1624
c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx;
1625
c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx;
1627
c->avg_no_rnd_pixels_tab[1][0] = avg_no_rnd_pixels8_mmx;
1628
c->avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels8_x2_mmx;
1629
c->avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels8_y2_mmx;
1630
c->avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels8_xy2_mmx;
1632
c->add_bytes= add_bytes_mmx;
1633
c->diff_bytes= diff_bytes_mmx;
1635
c->hadamard8_diff[0]= hadamard8_diff16_mmx;
1636
c->hadamard8_diff[1]= hadamard8_diff_mmx;
1638
c->pix_norm1 = pix_norm1_mmx;
1639
c->sse[0] = sse16_mmx;
1641
if (mm_flags & MM_MMXEXT) {
1642
c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
1643
c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2;
1645
c->avg_pixels_tab[0][0] = avg_pixels16_mmx2;
1646
c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2;
1647
c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2;
1649
c->put_pixels_tab[1][1] = put_pixels8_x2_mmx2;
1650
c->put_pixels_tab[1][2] = put_pixels8_y2_mmx2;
1652
c->avg_pixels_tab[1][0] = avg_pixels8_mmx2;
1653
c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
1654
c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;
1656
c->hadamard8_diff[0]= hadamard8_diff16_mmx2;
1657
c->hadamard8_diff[1]= hadamard8_diff_mmx2;
1659
if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
1660
c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
1661
c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2;
1662
c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2;
1663
c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2;
1664
c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;
1665
c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
1669
SET_QPEL_FUNC(qpel_pixels_tab[0][ 0], qpel16_mc00_mmx2)
1670
SET_QPEL_FUNC(qpel_pixels_tab[0][ 1], qpel16_mc10_mmx2)
1671
SET_QPEL_FUNC(qpel_pixels_tab[0][ 2], qpel16_mc20_mmx2)
1672
SET_QPEL_FUNC(qpel_pixels_tab[0][ 3], qpel16_mc30_mmx2)
1673
SET_QPEL_FUNC(qpel_pixels_tab[0][ 4], qpel16_mc01_mmx2)
1674
SET_QPEL_FUNC(qpel_pixels_tab[0][ 5], qpel16_mc11_mmx2)
1675
SET_QPEL_FUNC(qpel_pixels_tab[0][ 6], qpel16_mc21_mmx2)
1676
SET_QPEL_FUNC(qpel_pixels_tab[0][ 7], qpel16_mc31_mmx2)
1677
SET_QPEL_FUNC(qpel_pixels_tab[0][ 8], qpel16_mc02_mmx2)
1678
SET_QPEL_FUNC(qpel_pixels_tab[0][ 9], qpel16_mc12_mmx2)
1679
SET_QPEL_FUNC(qpel_pixels_tab[0][10], qpel16_mc22_mmx2)
1680
SET_QPEL_FUNC(qpel_pixels_tab[0][11], qpel16_mc32_mmx2)
1681
SET_QPEL_FUNC(qpel_pixels_tab[0][12], qpel16_mc03_mmx2)
1682
SET_QPEL_FUNC(qpel_pixels_tab[0][13], qpel16_mc13_mmx2)
1683
SET_QPEL_FUNC(qpel_pixels_tab[0][14], qpel16_mc23_mmx2)
1684
SET_QPEL_FUNC(qpel_pixels_tab[0][15], qpel16_mc33_mmx2)
1685
SET_QPEL_FUNC(qpel_pixels_tab[1][ 0], qpel8_mc00_mmx2)
1686
SET_QPEL_FUNC(qpel_pixels_tab[1][ 1], qpel8_mc10_mmx2)
1687
SET_QPEL_FUNC(qpel_pixels_tab[1][ 2], qpel8_mc20_mmx2)
1688
SET_QPEL_FUNC(qpel_pixels_tab[1][ 3], qpel8_mc30_mmx2)
1689
SET_QPEL_FUNC(qpel_pixels_tab[1][ 4], qpel8_mc01_mmx2)
1690
SET_QPEL_FUNC(qpel_pixels_tab[1][ 5], qpel8_mc11_mmx2)
1691
SET_QPEL_FUNC(qpel_pixels_tab[1][ 6], qpel8_mc21_mmx2)
1692
SET_QPEL_FUNC(qpel_pixels_tab[1][ 7], qpel8_mc31_mmx2)
1693
SET_QPEL_FUNC(qpel_pixels_tab[1][ 8], qpel8_mc02_mmx2)
1694
SET_QPEL_FUNC(qpel_pixels_tab[1][ 9], qpel8_mc12_mmx2)
1695
SET_QPEL_FUNC(qpel_pixels_tab[1][10], qpel8_mc22_mmx2)
1696
SET_QPEL_FUNC(qpel_pixels_tab[1][11], qpel8_mc32_mmx2)
1697
SET_QPEL_FUNC(qpel_pixels_tab[1][12], qpel8_mc03_mmx2)
1698
SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_mmx2)
1699
SET_QPEL_FUNC(qpel_pixels_tab[1][14], qpel8_mc23_mmx2)
1700
SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_mmx2)
1702
} else if (mm_flags & MM_3DNOW) {
1703
c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
1704
c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
1706
c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
1707
c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
1708
c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
1710
c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
1711
c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow;
1713
c->avg_pixels_tab[1][0] = avg_pixels8_3dnow;
1714
c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
1715
c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;
1717
if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
1718
c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
1719
c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
1720
c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
1721
c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;
1722
c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
1723
c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
1726
SET_QPEL_FUNC(qpel_pixels_tab[0][ 0], qpel16_mc00_3dnow)
1727
SET_QPEL_FUNC(qpel_pixels_tab[0][ 1], qpel16_mc10_3dnow)
1728
SET_QPEL_FUNC(qpel_pixels_tab[0][ 2], qpel16_mc20_3dnow)
1729
SET_QPEL_FUNC(qpel_pixels_tab[0][ 3], qpel16_mc30_3dnow)
1730
SET_QPEL_FUNC(qpel_pixels_tab[0][ 4], qpel16_mc01_3dnow)
1731
SET_QPEL_FUNC(qpel_pixels_tab[0][ 5], qpel16_mc11_3dnow)
1732
SET_QPEL_FUNC(qpel_pixels_tab[0][ 6], qpel16_mc21_3dnow)
1733
SET_QPEL_FUNC(qpel_pixels_tab[0][ 7], qpel16_mc31_3dnow)
1734
SET_QPEL_FUNC(qpel_pixels_tab[0][ 8], qpel16_mc02_3dnow)
1735
SET_QPEL_FUNC(qpel_pixels_tab[0][ 9], qpel16_mc12_3dnow)
1736
SET_QPEL_FUNC(qpel_pixels_tab[0][10], qpel16_mc22_3dnow)
1737
SET_QPEL_FUNC(qpel_pixels_tab[0][11], qpel16_mc32_3dnow)
1738
SET_QPEL_FUNC(qpel_pixels_tab[0][12], qpel16_mc03_3dnow)
1739
SET_QPEL_FUNC(qpel_pixels_tab[0][13], qpel16_mc13_3dnow)
1740
SET_QPEL_FUNC(qpel_pixels_tab[0][14], qpel16_mc23_3dnow)
1741
SET_QPEL_FUNC(qpel_pixels_tab[0][15], qpel16_mc33_3dnow)
1742
SET_QPEL_FUNC(qpel_pixels_tab[1][ 0], qpel8_mc00_3dnow)
1743
SET_QPEL_FUNC(qpel_pixels_tab[1][ 1], qpel8_mc10_3dnow)
1744
SET_QPEL_FUNC(qpel_pixels_tab[1][ 2], qpel8_mc20_3dnow)
1745
SET_QPEL_FUNC(qpel_pixels_tab[1][ 3], qpel8_mc30_3dnow)
1746
SET_QPEL_FUNC(qpel_pixels_tab[1][ 4], qpel8_mc01_3dnow)
1747
SET_QPEL_FUNC(qpel_pixels_tab[1][ 5], qpel8_mc11_3dnow)
1748
SET_QPEL_FUNC(qpel_pixels_tab[1][ 6], qpel8_mc21_3dnow)
1749
SET_QPEL_FUNC(qpel_pixels_tab[1][ 7], qpel8_mc31_3dnow)
1750
SET_QPEL_FUNC(qpel_pixels_tab[1][ 8], qpel8_mc02_3dnow)
1751
SET_QPEL_FUNC(qpel_pixels_tab[1][ 9], qpel8_mc12_3dnow)
1752
SET_QPEL_FUNC(qpel_pixels_tab[1][10], qpel8_mc22_3dnow)
1753
SET_QPEL_FUNC(qpel_pixels_tab[1][11], qpel8_mc32_3dnow)
1754
SET_QPEL_FUNC(qpel_pixels_tab[1][12], qpel8_mc03_3dnow)
1755
SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_3dnow)
1756
SET_QPEL_FUNC(qpel_pixels_tab[1][14], qpel8_mc23_3dnow)
1757
SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_3dnow)
1761
dsputil_init_pix_mmx(c, avctx);
1763
// for speed testing
1764
get_pixels = just_return;
1765
put_pixels_clamped = just_return;
1766
add_pixels_clamped = just_return;
1768
pix_abs16x16 = just_return;
1769
pix_abs16x16_x2 = just_return;
1770
pix_abs16x16_y2 = just_return;
1771
pix_abs16x16_xy2 = just_return;
1773
put_pixels_tab[0] = just_return;
1774
put_pixels_tab[1] = just_return;
1775
put_pixels_tab[2] = just_return;
1776
put_pixels_tab[3] = just_return;
1778
put_no_rnd_pixels_tab[0] = just_return;
1779
put_no_rnd_pixels_tab[1] = just_return;
1780
put_no_rnd_pixels_tab[2] = just_return;
1781
put_no_rnd_pixels_tab[3] = just_return;
1783
avg_pixels_tab[0] = just_return;
1784
avg_pixels_tab[1] = just_return;
1785
avg_pixels_tab[2] = just_return;
1786
avg_pixels_tab[3] = just_return;
1788
avg_no_rnd_pixels_tab[0] = just_return;
1789
avg_no_rnd_pixels_tab[1] = just_return;
1790
avg_no_rnd_pixels_tab[2] = just_return;
1791
avg_no_rnd_pixels_tab[3] = just_return;
1793
//av_fdct = just_return;
1794
//ff_idct = just_return;