2
* DSP utils : average functions are compiled twice for 3dnow/mmx2
3
* Copyright (c) 2000, 2001 Fabrice Bellard.
4
* Copyright (c) 2002-2004 Michael Niedermayer
6
* MMX optimization by Nick Kurshev <nickols_k@mail.ru>
7
* mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
8
* and improved by Zdenek Kabelac <kabi@users.sf.net>
10
* This file is part of FFmpeg.
12
* FFmpeg is free software; you can redistribute it and/or
13
* modify it under the terms of the GNU Lesser General Public
14
* License as published by the Free Software Foundation; either
15
* version 2.1 of the License, or (at your option) any later version.
17
* FFmpeg is distributed in the hope that it will be useful,
18
* but WITHOUT ANY WARRANTY; without even the implied warranty of
19
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20
* Lesser General Public License for more details.
22
* You should have received a copy of the GNU Lesser General Public
23
* License along with FFmpeg; if not, write to the Free Software
24
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
27
/* This header intentionally has no multiple inclusion guards. It is meant to
28
* be included multiple times and generates different code depending on the
29
* value of certain #defines. */
31
/* XXX: we use explicit registers to avoid a gcc 2.95.2 register asm
32
clobber bug - now it will work with 2.95.2 and also with -fPIC */
34
// put_pixels8_x2: horizontal half-pel "put" for an 8-byte-wide block.
// Each output row is PAVGB(pixels[x], pixels[x+1]) over 8 bytes, stored to block.
// Operands: %0 = h, %1 = pixels ("S"), %2 = block ("D"), %3 = line_size.
// REG_a holds 2*line_size so both pointers advance two rows per step.
// The visible sequence covers four rows. The asm opener, the loop control that
// consumes h, and the clobber list are not visible in this excerpt.
// PAVGB is a macro supplied by the including file (presumably pavgb or pavgusb).
static void DEF(put_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
37
// REG_a = 2 * line_size
"lea (%3, %3), %%"REG_a" \n\t"
39
// rows 0 and 1: load, average with the byte to the right, store
"movq (%1), %%mm0 \n\t"
40
"movq (%1, %3), %%mm1 \n\t"
41
PAVGB" 1(%1), %%mm0 \n\t"
42
PAVGB" 1(%1, %3), %%mm1 \n\t"
43
"movq %%mm0, (%2) \n\t"
44
"movq %%mm1, (%2, %3) \n\t"
45
"add %%"REG_a", %1 \n\t"
46
"add %%"REG_a", %2 \n\t"
47
// rows 2 and 3: same pattern
"movq (%1), %%mm0 \n\t"
48
"movq (%1, %3), %%mm1 \n\t"
49
PAVGB" 1(%1), %%mm0 \n\t"
50
PAVGB" 1(%1, %3), %%mm1 \n\t"
51
"add %%"REG_a", %1 \n\t"
52
"movq %%mm0, (%2) \n\t"
53
"movq %%mm1, (%2, %3) \n\t"
54
"add %%"REG_a", %2 \n\t"
57
:"+g"(h), "+S"(pixels), "+D"(block)
58
:"r" ((x86_reg)line_size)
62
// put_pixels4_l2: dst = PAVGB(src1, src2) for 4-byte-wide rows (movd accesses).
// Operands: %0 = h, %1 = src1 ("a"), %2 = src2 ("c"), %3 = dst ("d"),
// %4 = src1Stride ("S"), %5 = dstStride ("D").
// src2 is read at fixed byte offsets 0, 4, 8 and 12 rather than through a stride
// operand, which indicates tightly packed 4-byte rows.
// The visible code is one single-row step followed by two two-row steps. The
// pointer increments between accesses, the loop control and the clobber list
// are not visible in this excerpt.
// Under PIC, h is a memory operand ("+m"); otherwise it lives in "b"
// (presumably because EBX is reserved as the PIC base register; confirm).
static void DEF(put_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
67
"movd (%1), %%mm0 \n\t"
68
"movd (%2), %%mm1 \n\t"
71
PAVGB" %%mm1, %%mm0 \n\t"
72
"movd %%mm0, (%3) \n\t"
76
"movd (%1), %%mm0 \n\t"
78
"movd (%1), %%mm1 \n\t"
79
"movd (%2), %%mm2 \n\t"
80
"movd 4(%2), %%mm3 \n\t"
82
PAVGB" %%mm2, %%mm0 \n\t"
83
PAVGB" %%mm3, %%mm1 \n\t"
84
"movd %%mm0, (%3) \n\t"
86
"movd %%mm1, (%3) \n\t"
88
"movd (%1), %%mm0 \n\t"
90
"movd (%1), %%mm1 \n\t"
91
"movd 8(%2), %%mm2 \n\t"
92
"movd 12(%2), %%mm3 \n\t"
94
PAVGB" %%mm2, %%mm0 \n\t"
95
PAVGB" %%mm3, %%mm1 \n\t"
96
"movd %%mm0, (%3) \n\t"
98
"movd %%mm1, (%3) \n\t"
103
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
104
:"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
106
:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
108
:"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
113
// put_pixels8_l2: dst = PAVGB(src1, src2) for 8-byte-wide rows (movq accesses).
// Operands: %0 = h, %1 = src1 ("a"), %2 = src2 ("c"), %3 = dst ("d"),
// %4 = src1Stride ("S"), %5 = dstStride ("D").
// src2 is read at fixed byte offsets 0, 8, 16 and 24, which indicates tightly
// packed 8-byte rows.
// The visible code is one single-row step followed by two two-row steps. The
// pointer increments, the loop control and the clobber list are not visible
// in this excerpt.
static void DEF(put_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
118
"movq (%1), %%mm0 \n\t"
119
"movq (%2), %%mm1 \n\t"
122
PAVGB" %%mm1, %%mm0 \n\t"
123
"movq %%mm0, (%3) \n\t"
127
"movq (%1), %%mm0 \n\t"
129
"movq (%1), %%mm1 \n\t"
131
PAVGB" (%2), %%mm0 \n\t"
132
PAVGB" 8(%2), %%mm1 \n\t"
133
"movq %%mm0, (%3) \n\t"
135
"movq %%mm1, (%3) \n\t"
137
"movq (%1), %%mm0 \n\t"
139
"movq (%1), %%mm1 \n\t"
141
PAVGB" 16(%2), %%mm0 \n\t"
142
PAVGB" 24(%2), %%mm1 \n\t"
143
"movq %%mm0, (%3) \n\t"
145
"movq %%mm1, (%3) \n\t"
150
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
151
:"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
153
:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
155
:"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
157
//the following should be used, though better not with gcc ...
158
/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
159
:"r"(src1Stride), "r"(dstStride)
163
// put_no_rnd_pixels8_l2: "no rounding" average of src1 and src2 into dst, 8 bytes per row.
// mm6 is set to all ones (pcmpeqb of a register with itself). Both inputs are
// complemented with pxor, averaged with PAVGB, and the result is complemented again.
// With a PAVGB that rounds up (presumably the case; confirm for the including
// file), this yields the average rounded down.
// Operands: %0 = h, %1 = src1 ("a"), %2 = src2 ("c"), %3 = dst ("d"),
// %4 = src1Stride ("S"), %5 = dstStride ("D").
// src2 is read at fixed offsets 0, 8, 16 and 24, which indicates packed 8-byte rows.
// The pointer increments, the loop control and the clobber list are not visible
// in this excerpt.
static void DEF(put_no_rnd_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
166
// mm6 = 0xFF in every byte
"pcmpeqb %%mm6, %%mm6 \n\t"
169
"movq (%1), %%mm0 \n\t"
170
"movq (%2), %%mm1 \n\t"
173
"pxor %%mm6, %%mm0 \n\t"
174
"pxor %%mm6, %%mm1 \n\t"
175
PAVGB" %%mm1, %%mm0 \n\t"
176
"pxor %%mm6, %%mm0 \n\t"
177
"movq %%mm0, (%3) \n\t"
181
"movq (%1), %%mm0 \n\t"
183
"movq (%1), %%mm1 \n\t"
185
"movq (%2), %%mm2 \n\t"
186
"movq 8(%2), %%mm3 \n\t"
187
"pxor %%mm6, %%mm0 \n\t"
188
"pxor %%mm6, %%mm1 \n\t"
189
"pxor %%mm6, %%mm2 \n\t"
190
"pxor %%mm6, %%mm3 \n\t"
191
PAVGB" %%mm2, %%mm0 \n\t"
192
PAVGB" %%mm3, %%mm1 \n\t"
193
"pxor %%mm6, %%mm0 \n\t"
194
"pxor %%mm6, %%mm1 \n\t"
195
"movq %%mm0, (%3) \n\t"
197
"movq %%mm1, (%3) \n\t"
199
"movq (%1), %%mm0 \n\t"
201
"movq (%1), %%mm1 \n\t"
203
"movq 16(%2), %%mm2 \n\t"
204
"movq 24(%2), %%mm3 \n\t"
205
"pxor %%mm6, %%mm0 \n\t"
206
"pxor %%mm6, %%mm1 \n\t"
207
"pxor %%mm6, %%mm2 \n\t"
208
"pxor %%mm6, %%mm3 \n\t"
209
PAVGB" %%mm2, %%mm0 \n\t"
210
PAVGB" %%mm3, %%mm1 \n\t"
211
"pxor %%mm6, %%mm0 \n\t"
212
"pxor %%mm6, %%mm1 \n\t"
213
"movq %%mm0, (%3) \n\t"
215
"movq %%mm1, (%3) \n\t"
220
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
221
:"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
223
:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
225
:"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
227
//the following should be used, though better not with gcc ...
228
/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
229
:"r"(src1Stride), "r"(dstStride)
233
// avg_pixels4_l2: dst = PAVGB(PAVGB(src1, src2), dst) for 4-byte-wide rows.
// Unlike the put_ variant, the existing destination contents are averaged in
// (PAVGB with (%3)) before the store.
// Operands: %0 = h, %1 = src1 ("a"), %2 = src2 ("c"), %3 = dst ("d"),
// %4 = src1Stride ("S"), %5 = dstStride ("D").
// src2 is read at fixed offsets 0, 4, 8 and 12, which indicates packed 4-byte rows.
// NOTE(review): the data moves use movd (4 bytes), but PAVGB takes (%2) and (%3)
// as memory operands directly; the width of those memory reads depends on the
// PAVGB instruction chosen by the including file. Confirm.
// The pointer increments, the loop control and the clobber list are not visible
// in this excerpt.
static void DEF(avg_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
238
"movd (%1), %%mm0 \n\t"
239
"movd (%2), %%mm1 \n\t"
242
PAVGB" %%mm1, %%mm0 \n\t"
243
PAVGB" (%3), %%mm0 \n\t"
244
"movd %%mm0, (%3) \n\t"
248
"movd (%1), %%mm0 \n\t"
250
"movd (%1), %%mm1 \n\t"
252
PAVGB" (%2), %%mm0 \n\t"
253
PAVGB" 4(%2), %%mm1 \n\t"
254
PAVGB" (%3), %%mm0 \n\t"
255
"movd %%mm0, (%3) \n\t"
257
PAVGB" (%3), %%mm1 \n\t"
258
"movd %%mm1, (%3) \n\t"
260
"movd (%1), %%mm0 \n\t"
262
"movd (%1), %%mm1 \n\t"
264
PAVGB" 8(%2), %%mm0 \n\t"
265
PAVGB" 12(%2), %%mm1 \n\t"
266
PAVGB" (%3), %%mm0 \n\t"
267
"movd %%mm0, (%3) \n\t"
269
PAVGB" (%3), %%mm1 \n\t"
270
"movd %%mm1, (%3) \n\t"
275
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
276
:"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
278
:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
280
:"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
285
// avg_pixels8_l2: dst = PAVGB(PAVGB(src1, src2), dst) for 8-byte-wide rows.
// Operands: %0 = h, %1 = src1 ("a"), %2 = src2 ("c"), %3 = dst ("d"),
// %4 = src1Stride ("S"), %5 = dstStride ("D").
// src2 is read at fixed offsets 0, 8, 16 and 24, which indicates packed 8-byte rows.
// The visible code is one single-row step followed by two two-row steps. The
// pointer increments, the loop control and the clobber list are not visible
// in this excerpt.
static void DEF(avg_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
290
"movq (%1), %%mm0 \n\t"
291
"movq (%2), %%mm1 \n\t"
294
PAVGB" %%mm1, %%mm0 \n\t"
295
PAVGB" (%3), %%mm0 \n\t"
296
"movq %%mm0, (%3) \n\t"
300
"movq (%1), %%mm0 \n\t"
302
"movq (%1), %%mm1 \n\t"
304
PAVGB" (%2), %%mm0 \n\t"
305
PAVGB" 8(%2), %%mm1 \n\t"
306
PAVGB" (%3), %%mm0 \n\t"
307
"movq %%mm0, (%3) \n\t"
309
PAVGB" (%3), %%mm1 \n\t"
310
"movq %%mm1, (%3) \n\t"
312
"movq (%1), %%mm0 \n\t"
314
"movq (%1), %%mm1 \n\t"
316
PAVGB" 16(%2), %%mm0 \n\t"
317
PAVGB" 24(%2), %%mm1 \n\t"
318
PAVGB" (%3), %%mm0 \n\t"
319
"movq %%mm0, (%3) \n\t"
321
PAVGB" (%3), %%mm1 \n\t"
322
"movq %%mm1, (%3) \n\t"
327
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
328
:"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
330
:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
332
:"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
334
//the following should be used, though better not with gcc ...
335
/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
336
:"r"(src1Stride), "r"(dstStride)
340
// put_pixels16_x2: horizontal half-pel "put" for a 16-byte-wide block.
// Same scheme as the 8-wide version, with a second 8-byte column at offset 8:
// each output byte is PAVGB(pixels[x], pixels[x+1]).
// Operands: %0 = h, %1 = pixels ("S"), %2 = block ("D"), %3 = line_size;
// REG_a = 2*line_size and is listed as clobbered together with "memory".
// The visible sequence covers four rows. The asm opener and the loop control
// that consumes h are not visible in this excerpt.
static void DEF(put_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
343
"lea (%3, %3), %%"REG_a" \n\t"
345
// rows 0 and 1, left (mm0/mm1) and right (mm2/mm3) halves
"movq (%1), %%mm0 \n\t"
346
"movq (%1, %3), %%mm1 \n\t"
347
"movq 8(%1), %%mm2 \n\t"
348
"movq 8(%1, %3), %%mm3 \n\t"
349
PAVGB" 1(%1), %%mm0 \n\t"
350
PAVGB" 1(%1, %3), %%mm1 \n\t"
351
PAVGB" 9(%1), %%mm2 \n\t"
352
PAVGB" 9(%1, %3), %%mm3 \n\t"
353
"movq %%mm0, (%2) \n\t"
354
"movq %%mm1, (%2, %3) \n\t"
355
"movq %%mm2, 8(%2) \n\t"
356
"movq %%mm3, 8(%2, %3) \n\t"
357
"add %%"REG_a", %1 \n\t"
358
"add %%"REG_a", %2 \n\t"
359
// rows 2 and 3
"movq (%1), %%mm0 \n\t"
360
"movq (%1, %3), %%mm1 \n\t"
361
"movq 8(%1), %%mm2 \n\t"
362
"movq 8(%1, %3), %%mm3 \n\t"
363
PAVGB" 1(%1), %%mm0 \n\t"
364
PAVGB" 1(%1, %3), %%mm1 \n\t"
365
PAVGB" 9(%1), %%mm2 \n\t"
366
PAVGB" 9(%1, %3), %%mm3 \n\t"
367
"add %%"REG_a", %1 \n\t"
368
"movq %%mm0, (%2) \n\t"
369
"movq %%mm1, (%2, %3) \n\t"
370
"movq %%mm2, 8(%2) \n\t"
371
"movq %%mm3, 8(%2, %3) \n\t"
372
"add %%"REG_a", %2 \n\t"
375
:"+g"(h), "+S"(pixels), "+D"(block)
376
:"r" ((x86_reg)line_size)
377
:"%"REG_a, "memory");
380
// put_pixels16_l2: dst = PAVGB(src1, src2) for 16-byte-wide rows, handled as
// two 8-byte halves (offsets 0 and 8).
// Operands: %0 = h, %1 = src1 ("a"), %2 = src2 ("c"), %3 = dst ("d"),
// %4 = src1Stride ("S"), %5 = dstStride ("D").
// src2 is read at fixed offsets (0/8, then 0/8 and 16/24), which indicates
// packed 16-byte rows.
// The pointer increments, the loop control and the clobber list are not visible
// in this excerpt.
static void DEF(put_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
385
"movq (%1), %%mm0 \n\t"
386
"movq 8(%1), %%mm1 \n\t"
387
PAVGB" (%2), %%mm0 \n\t"
388
PAVGB" 8(%2), %%mm1 \n\t"
391
"movq %%mm0, (%3) \n\t"
392
"movq %%mm1, 8(%3) \n\t"
396
"movq (%1), %%mm0 \n\t"
397
"movq 8(%1), %%mm1 \n\t"
399
PAVGB" (%2), %%mm0 \n\t"
400
PAVGB" 8(%2), %%mm1 \n\t"
401
"movq %%mm0, (%3) \n\t"
402
"movq %%mm1, 8(%3) \n\t"
404
"movq (%1), %%mm0 \n\t"
405
"movq 8(%1), %%mm1 \n\t"
407
PAVGB" 16(%2), %%mm0 \n\t"
408
PAVGB" 24(%2), %%mm1 \n\t"
409
"movq %%mm0, (%3) \n\t"
410
"movq %%mm1, 8(%3) \n\t"
415
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
416
:"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
418
:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
420
:"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
422
//the following should be used, though better not with gcc ...
423
/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
424
:"r"(src1Stride), "r"(dstStride)
428
// avg_pixels16_l2: dst = PAVGB(PAVGB(src1, src2), dst) for 16-byte-wide rows,
// handled as two 8-byte halves (offsets 0 and 8).
// Operands: %0 = h, %1 = src1 ("a"), %2 = src2 ("c"), %3 = dst ("d"),
// %4 = src1Stride ("S"), %5 = dstStride ("D").
// src2 is read at fixed offsets (0/8, then 0/8 and 16/24), which indicates
// packed 16-byte rows.
// The pointer increments, the loop control and the clobber list are not visible
// in this excerpt.
static void DEF(avg_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
433
"movq (%1), %%mm0 \n\t"
434
"movq 8(%1), %%mm1 \n\t"
435
PAVGB" (%2), %%mm0 \n\t"
436
PAVGB" 8(%2), %%mm1 \n\t"
439
// fold in the current destination contents before storing
PAVGB" (%3), %%mm0 \n\t"
440
PAVGB" 8(%3), %%mm1 \n\t"
441
"movq %%mm0, (%3) \n\t"
442
"movq %%mm1, 8(%3) \n\t"
446
"movq (%1), %%mm0 \n\t"
447
"movq 8(%1), %%mm1 \n\t"
449
PAVGB" (%2), %%mm0 \n\t"
450
PAVGB" 8(%2), %%mm1 \n\t"
451
PAVGB" (%3), %%mm0 \n\t"
452
PAVGB" 8(%3), %%mm1 \n\t"
453
"movq %%mm0, (%3) \n\t"
454
"movq %%mm1, 8(%3) \n\t"
456
"movq (%1), %%mm0 \n\t"
457
"movq 8(%1), %%mm1 \n\t"
459
PAVGB" 16(%2), %%mm0 \n\t"
460
PAVGB" 24(%2), %%mm1 \n\t"
461
PAVGB" (%3), %%mm0 \n\t"
462
PAVGB" 8(%3), %%mm1 \n\t"
463
"movq %%mm0, (%3) \n\t"
464
"movq %%mm1, 8(%3) \n\t"
469
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
470
:"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
472
:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
474
:"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
476
//the following should be used, though better not with gcc ...
477
/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
478
:"r"(src1Stride), "r"(dstStride)
482
// put_no_rnd_pixels16_l2: "no rounding" average of src1 and src2 into dst,
// 16 bytes per row as two 8-byte halves.
// mm6 is all ones (pcmpeqb of a register with itself). Inputs are complemented
// with pxor, averaged with PAVGB, and the result is complemented again. With a
// round-up PAVGB (presumably; confirm) this gives the average rounded down.
// Operands: %0 = h, %1 = src1 ("a"), %2 = src2 ("c"), %3 = dst ("d"),
// %4 = src1Stride ("S"), %5 = dstStride ("D").
// src2 is read at fixed offsets (0/8, then 0/8 and 16/24), which indicates
// packed 16-byte rows.
// The pointer increments, the loop control and the clobber list are not visible
// in this excerpt.
static void DEF(put_no_rnd_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
485
// mm6 = 0xFF in every byte
"pcmpeqb %%mm6, %%mm6 \n\t"
488
"movq (%1), %%mm0 \n\t"
489
"movq 8(%1), %%mm1 \n\t"
490
"movq (%2), %%mm2 \n\t"
491
"movq 8(%2), %%mm3 \n\t"
492
"pxor %%mm6, %%mm0 \n\t"
493
"pxor %%mm6, %%mm1 \n\t"
494
"pxor %%mm6, %%mm2 \n\t"
495
"pxor %%mm6, %%mm3 \n\t"
496
PAVGB" %%mm2, %%mm0 \n\t"
497
PAVGB" %%mm3, %%mm1 \n\t"
498
"pxor %%mm6, %%mm0 \n\t"
499
"pxor %%mm6, %%mm1 \n\t"
502
"movq %%mm0, (%3) \n\t"
503
"movq %%mm1, 8(%3) \n\t"
507
"movq (%1), %%mm0 \n\t"
508
"movq 8(%1), %%mm1 \n\t"
510
"movq (%2), %%mm2 \n\t"
511
"movq 8(%2), %%mm3 \n\t"
512
"pxor %%mm6, %%mm0 \n\t"
513
"pxor %%mm6, %%mm1 \n\t"
514
"pxor %%mm6, %%mm2 \n\t"
515
"pxor %%mm6, %%mm3 \n\t"
516
PAVGB" %%mm2, %%mm0 \n\t"
517
PAVGB" %%mm3, %%mm1 \n\t"
518
"pxor %%mm6, %%mm0 \n\t"
519
"pxor %%mm6, %%mm1 \n\t"
520
"movq %%mm0, (%3) \n\t"
521
"movq %%mm1, 8(%3) \n\t"
523
"movq (%1), %%mm0 \n\t"
524
"movq 8(%1), %%mm1 \n\t"
526
"movq 16(%2), %%mm2 \n\t"
527
"movq 24(%2), %%mm3 \n\t"
528
"pxor %%mm6, %%mm0 \n\t"
529
"pxor %%mm6, %%mm1 \n\t"
530
"pxor %%mm6, %%mm2 \n\t"
531
"pxor %%mm6, %%mm3 \n\t"
532
PAVGB" %%mm2, %%mm0 \n\t"
533
PAVGB" %%mm3, %%mm1 \n\t"
534
"pxor %%mm6, %%mm0 \n\t"
535
"pxor %%mm6, %%mm1 \n\t"
536
"movq %%mm0, (%3) \n\t"
537
"movq %%mm1, 8(%3) \n\t"
542
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
543
:"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
545
:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
547
:"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
549
//the following should be used, though better not with gcc ...
550
/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
551
:"r"(src1Stride), "r"(dstStride)
555
/* GL: this function does incorrect rounding if overflow */
556
// put_no_rnd_pixels8_x2: horizontal half-pel "put" without rounding, 8 bytes per row.
// For each row the left sample (pixels[x]) has mm6 subtracted with unsigned
// saturation (psubusb) and is then PAVGB-averaged with pixels[x+1].
// The instruction that initialises mm6 is not visible in this excerpt
// (presumably a per-byte constant 1; confirm).
// Operands: %0 = h, %1 = pixels ("S"), %2 = block ("D"), %3 = line_size;
// REG_a = 2*line_size and is listed as clobbered together with "memory".
// The visible sequence covers four rows. The asm opener and the loop control
// that consumes h are not visible in this excerpt.
static void DEF(put_no_rnd_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
560
"lea (%3, %3), %%"REG_a" \n\t"
562
"movq (%1), %%mm0 \n\t"
563
"movq (%1, %3), %%mm2 \n\t"
564
"movq 1(%1), %%mm1 \n\t"
565
"movq 1(%1, %3), %%mm3 \n\t"
566
"add %%"REG_a", %1 \n\t"
567
// saturating subtract: a byte that is already 0 stays 0
"psubusb %%mm6, %%mm0 \n\t"
568
"psubusb %%mm6, %%mm2 \n\t"
569
PAVGB" %%mm1, %%mm0 \n\t"
570
PAVGB" %%mm3, %%mm2 \n\t"
571
"movq %%mm0, (%2) \n\t"
572
"movq %%mm2, (%2, %3) \n\t"
573
"movq (%1), %%mm0 \n\t"
574
"movq 1(%1), %%mm1 \n\t"
575
"movq (%1, %3), %%mm2 \n\t"
576
"movq 1(%1, %3), %%mm3 \n\t"
577
"add %%"REG_a", %2 \n\t"
578
"add %%"REG_a", %1 \n\t"
579
"psubusb %%mm6, %%mm0 \n\t"
580
"psubusb %%mm6, %%mm2 \n\t"
581
PAVGB" %%mm1, %%mm0 \n\t"
582
PAVGB" %%mm3, %%mm2 \n\t"
583
"movq %%mm0, (%2) \n\t"
584
"movq %%mm2, (%2, %3) \n\t"
585
"add %%"REG_a", %2 \n\t"
588
:"+g"(h), "+S"(pixels), "+D"(block)
589
:"r" ((x86_reg)line_size)
590
:"%"REG_a, "memory");
593
// put_pixels8_y2: vertical half-pel "put" for an 8-byte-wide block.
// Each output row is the PAVGB of two vertically adjacent source rows.
// The first source row is loaded into mm0 up front. After that, mm0, mm1 and
// mm2 rotate so that every source row is loaded only once.
// Operands: %0 = h, %1 = pixels ("S"), %2 = block ("D"), %3 = line_size;
// REG_a = 2*line_size and is listed as clobbered together with "memory".
// Stores go to block+line_size and block+2*line_size relative to the current
// %2. Any pre-adjustment of %2, the asm opener and the loop control are not
// visible in this excerpt.
static void DEF(put_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
596
"lea (%3, %3), %%"REG_a" \n\t"
597
"movq (%1), %%mm0 \n\t"
600
"movq (%1, %3), %%mm1 \n\t"
601
"movq (%1, %%"REG_a"), %%mm2 \n\t"
602
"add %%"REG_a", %1 \n\t"
603
PAVGB" %%mm1, %%mm0 \n\t"
604
PAVGB" %%mm2, %%mm1 \n\t"
605
"movq %%mm0, (%2, %3) \n\t"
606
"movq %%mm1, (%2, %%"REG_a") \n\t"
607
"movq (%1, %3), %%mm1 \n\t"
608
"movq (%1, %%"REG_a"), %%mm0 \n\t"
609
"add %%"REG_a", %2 \n\t"
610
"add %%"REG_a", %1 \n\t"
611
PAVGB" %%mm1, %%mm2 \n\t"
612
PAVGB" %%mm0, %%mm1 \n\t"
613
"movq %%mm2, (%2, %3) \n\t"
614
"movq %%mm1, (%2, %%"REG_a") \n\t"
615
"add %%"REG_a", %2 \n\t"
618
:"+g"(h), "+S"(pixels), "+D" (block)
619
:"r" ((x86_reg)line_size)
620
:"%"REG_a, "memory");
623
/* GL: this function does incorrect rounding if overflow */
624
// put_no_rnd_pixels8_y2: vertical half-pel "put" without rounding, 8 bytes per row.
// Same row-rotation scheme as the rounding version, except that mm1 (every
// second source row) has mm6 subtracted with unsigned saturation before it
// takes part in the two PAVGB operations.
// The instruction that initialises mm6 is not visible in this excerpt
// (presumably a per-byte constant 1; confirm).
// Operands: %0 = h, %1 = pixels ("S"), %2 = block ("D"), %3 = line_size;
// REG_a = 2*line_size and is listed as clobbered together with "memory".
// Any pre-adjustment of %2, the asm opener and the loop control are not
// visible in this excerpt.
static void DEF(put_no_rnd_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
628
"lea (%3, %3), %%"REG_a" \n\t"
629
"movq (%1), %%mm0 \n\t"
632
"movq (%1, %3), %%mm1 \n\t"
633
"movq (%1, %%"REG_a"), %%mm2 \n\t"
634
"add %%"REG_a", %1 \n\t"
635
"psubusb %%mm6, %%mm1 \n\t"
636
PAVGB" %%mm1, %%mm0 \n\t"
637
PAVGB" %%mm2, %%mm1 \n\t"
638
"movq %%mm0, (%2, %3) \n\t"
639
"movq %%mm1, (%2, %%"REG_a") \n\t"
640
"movq (%1, %3), %%mm1 \n\t"
641
"movq (%1, %%"REG_a"), %%mm0 \n\t"
642
"add %%"REG_a", %2 \n\t"
643
"add %%"REG_a", %1 \n\t"
644
"psubusb %%mm6, %%mm1 \n\t"
645
PAVGB" %%mm1, %%mm2 \n\t"
646
PAVGB" %%mm0, %%mm1 \n\t"
647
"movq %%mm2, (%2, %3) \n\t"
648
"movq %%mm1, (%2, %%"REG_a") \n\t"
649
"add %%"REG_a", %2 \n\t"
652
:"+g"(h), "+S"(pixels), "+D" (block)
653
:"r" ((x86_reg)line_size)
654
:"%"REG_a, "memory");
657
// avg_pixels8: block = PAVGB(block, pixels) for an 8-byte-wide block.
// The destination rows are loaded, averaged with the matching source rows and
// written back.
// Operands: %0 = h, %1 = pixels ("S"), %2 = block ("D"), %3 = line_size;
// REG_a = 2*line_size and is listed as clobbered together with "memory".
// The visible sequence covers four rows. The asm opener and the loop control
// that consumes h are not visible in this excerpt.
static void DEF(avg_pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
660
"lea (%3, %3), %%"REG_a" \n\t"
662
"movq (%2), %%mm0 \n\t"
663
"movq (%2, %3), %%mm1 \n\t"
664
PAVGB" (%1), %%mm0 \n\t"
665
PAVGB" (%1, %3), %%mm1 \n\t"
666
"movq %%mm0, (%2) \n\t"
667
"movq %%mm1, (%2, %3) \n\t"
668
"add %%"REG_a", %1 \n\t"
669
"add %%"REG_a", %2 \n\t"
670
"movq (%2), %%mm0 \n\t"
671
"movq (%2, %3), %%mm1 \n\t"
672
PAVGB" (%1), %%mm0 \n\t"
673
PAVGB" (%1, %3), %%mm1 \n\t"
674
"add %%"REG_a", %1 \n\t"
675
"movq %%mm0, (%2) \n\t"
676
"movq %%mm1, (%2, %3) \n\t"
677
"add %%"REG_a", %2 \n\t"
680
:"+g"(h), "+S"(pixels), "+D"(block)
681
:"r" ((x86_reg)line_size)
682
:"%"REG_a, "memory");
685
// avg_pixels8_x2: horizontal half-pel "avg" for an 8-byte-wide block.
// block = PAVGB(PAVGB(pixels[x], pixels[x+1]), block).
// Operands: %0 = h, %1 = pixels ("S"), %2 = block ("D"), %3 = line_size;
// REG_a = 2*line_size and is listed as clobbered together with "memory".
// The visible sequence covers four rows. The asm opener and the loop control
// that consumes h are not visible in this excerpt.
static void DEF(avg_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
688
"lea (%3, %3), %%"REG_a" \n\t"
690
"movq (%1), %%mm0 \n\t"
691
"movq (%1, %3), %%mm2 \n\t"
692
PAVGB" 1(%1), %%mm0 \n\t"
693
PAVGB" 1(%1, %3), %%mm2 \n\t"
694
PAVGB" (%2), %%mm0 \n\t"
695
PAVGB" (%2, %3), %%mm2 \n\t"
696
"add %%"REG_a", %1 \n\t"
697
"movq %%mm0, (%2) \n\t"
698
"movq %%mm2, (%2, %3) \n\t"
699
"movq (%1), %%mm0 \n\t"
700
"movq (%1, %3), %%mm2 \n\t"
701
PAVGB" 1(%1), %%mm0 \n\t"
702
PAVGB" 1(%1, %3), %%mm2 \n\t"
703
"add %%"REG_a", %2 \n\t"
704
"add %%"REG_a", %1 \n\t"
705
PAVGB" (%2), %%mm0 \n\t"
706
PAVGB" (%2, %3), %%mm2 \n\t"
707
"movq %%mm0, (%2) \n\t"
708
"movq %%mm2, (%2, %3) \n\t"
709
"add %%"REG_a", %2 \n\t"
712
:"+g"(h), "+S"(pixels), "+D"(block)
713
:"r" ((x86_reg)line_size)
714
:"%"REG_a, "memory");
717
// avg_pixels8_y2: vertical half-pel "avg" for an 8-byte-wide block.
// Two vertically adjacent source rows are PAVGB-averaged. That result is then
// averaged with the existing destination row (loaded into mm3/mm4) and stored.
// mm0, mm1 and mm2 rotate so that every source row is loaded only once.
// Operands: %0 = h, %1 = pixels ("S"), %2 = block ("D"), %3 = line_size;
// REG_a = 2*line_size and is listed as clobbered together with "memory".
// Destination accesses use block+line_size and block+2*line_size relative to
// the current %2. Any pre-adjustment of %2, the asm opener and the loop
// control are not visible in this excerpt.
static void DEF(avg_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
720
"lea (%3, %3), %%"REG_a" \n\t"
721
"movq (%1), %%mm0 \n\t"
724
"movq (%1, %3), %%mm1 \n\t"
725
"movq (%1, %%"REG_a"), %%mm2 \n\t"
726
"add %%"REG_a", %1 \n\t"
727
PAVGB" %%mm1, %%mm0 \n\t"
728
PAVGB" %%mm2, %%mm1 \n\t"
729
"movq (%2, %3), %%mm3 \n\t"
730
"movq (%2, %%"REG_a"), %%mm4 \n\t"
731
PAVGB" %%mm3, %%mm0 \n\t"
732
PAVGB" %%mm4, %%mm1 \n\t"
733
"movq %%mm0, (%2, %3) \n\t"
734
"movq %%mm1, (%2, %%"REG_a") \n\t"
735
"movq (%1, %3), %%mm1 \n\t"
736
"movq (%1, %%"REG_a"), %%mm0 \n\t"
737
PAVGB" %%mm1, %%mm2 \n\t"
738
PAVGB" %%mm0, %%mm1 \n\t"
739
"add %%"REG_a", %2 \n\t"
740
"add %%"REG_a", %1 \n\t"
741
"movq (%2, %3), %%mm3 \n\t"
742
"movq (%2, %%"REG_a"), %%mm4 \n\t"
743
PAVGB" %%mm3, %%mm2 \n\t"
744
PAVGB" %%mm4, %%mm1 \n\t"
745
"movq %%mm2, (%2, %3) \n\t"
746
"movq %%mm1, (%2, %%"REG_a") \n\t"
747
"add %%"REG_a", %2 \n\t"
750
:"+g"(h), "+S"(pixels), "+D"(block)
751
:"r" ((x86_reg)line_size)
752
:"%"REG_a, "memory");
755
/* Note this is not correctly rounded, but this function is only
756
* used for B-frames so it does not matter. */
757
// avg_pixels8_xy2: diagonal (x and y) half-pel "avg" for an 8-byte-wide block.
// Built from cascaded PAVGB steps: each source row is first averaged with its
// right neighbour (offset +1), then adjacent rows are averaged vertically, and
// finally the result is averaged with the existing destination row.
// mm6 is subtracted with unsigned saturation from one of the rows (mm2) before
// its horizontal average. The instruction that initialises mm6 is not visible
// in this excerpt (presumably a per-byte constant 1; confirm).
// Operands: %0 = h, %1 = pixels ("S"), %2 = block ("D"), %3 = line_size;
// REG_a = 2*line_size and is listed as clobbered together with "memory".
// The asm opener and the loop control that consumes h are not visible here.
static void DEF(avg_pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
761
"lea (%3, %3), %%"REG_a" \n\t"
762
// mm0 = horizontal average of the first source row
"movq (%1), %%mm0 \n\t"
763
PAVGB" 1(%1), %%mm0 \n\t"
766
"movq (%1, %%"REG_a"), %%mm2 \n\t"
767
"movq (%1, %3), %%mm1 \n\t"
768
"psubusb %%mm6, %%mm2 \n\t"
769
PAVGB" 1(%1, %3), %%mm1 \n\t"
770
PAVGB" 1(%1, %%"REG_a"), %%mm2 \n\t"
771
"add %%"REG_a", %1 \n\t"
772
PAVGB" %%mm1, %%mm0 \n\t"
773
PAVGB" %%mm2, %%mm1 \n\t"
774
PAVGB" (%2), %%mm0 \n\t"
775
PAVGB" (%2, %3), %%mm1 \n\t"
776
"movq %%mm0, (%2) \n\t"
777
"movq %%mm1, (%2, %3) \n\t"
778
"movq (%1, %3), %%mm1 \n\t"
779
"movq (%1, %%"REG_a"), %%mm0 \n\t"
780
PAVGB" 1(%1, %3), %%mm1 \n\t"
781
PAVGB" 1(%1, %%"REG_a"), %%mm0 \n\t"
782
"add %%"REG_a", %2 \n\t"
783
"add %%"REG_a", %1 \n\t"
784
PAVGB" %%mm1, %%mm2 \n\t"
785
PAVGB" %%mm0, %%mm1 \n\t"
786
PAVGB" (%2), %%mm2 \n\t"
787
PAVGB" (%2, %3), %%mm1 \n\t"
788
"movq %%mm2, (%2) \n\t"
789
"movq %%mm1, (%2, %3) \n\t"
790
"add %%"REG_a", %2 \n\t"
793
:"+g"(h), "+S"(pixels), "+D"(block)
794
:"r" ((x86_reg)line_size)
795
:"%"REG_a, "memory");
798
// avg_pixels4: block = PAVGB(block, pixels) for a 4-byte-wide block, four rows
// per asm statement.
// Operand numbering differs from the 8-wide routines because there are no
// output operands: %0 = pixels ("S"), %1 = block ("D"), %2 = line_size,
// %3 = 3*line_size.
// Rows 0..3 are addressed as (base), (base,%2), (base,%2,2) and (base,%3).
// After the asm, both pointers are advanced by four rows in C.
// The surrounding loop over h and any clobber list are not visible in this
// excerpt.
// NOTE(review): destination rows are loaded and stored with movd (4 bytes), but
// PAVGB takes its source directly from memory at (%0...); the width of that
// read depends on the PAVGB instruction chosen by the including file. Confirm.
static void DEF(avg_pixels4)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
802
"movd (%1), %%mm0 \n\t"
803
"movd (%1, %2), %%mm1 \n\t"
804
"movd (%1, %2, 2), %%mm2 \n\t"
805
"movd (%1, %3), %%mm3 \n\t"
806
PAVGB" (%0), %%mm0 \n\t"
807
PAVGB" (%0, %2), %%mm1 \n\t"
808
PAVGB" (%0, %2, 2), %%mm2 \n\t"
809
PAVGB" (%0, %3), %%mm3 \n\t"
810
"movd %%mm0, (%1) \n\t"
811
"movd %%mm1, (%1, %2) \n\t"
812
"movd %%mm2, (%1, %2, 2) \n\t"
813
"movd %%mm3, (%1, %3) \n\t"
814
::"S"(pixels), "D"(block),
815
"r" ((x86_reg)line_size), "r"((x86_reg)3L*line_size)
817
// step both pointers past the four rows just processed
block += 4*line_size;
818
pixels += 4*line_size;
823
//FIXME the following could be optimized too ...
824
// put_no_rnd_pixels16_x2: 16-byte-wide version built from two calls to the
// 8-wide routine, first for columns 0..7 and then for columns 8..15.
static void DEF(put_no_rnd_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
825
DEF(put_no_rnd_pixels8_x2)(block , pixels , line_size, h);
826
DEF(put_no_rnd_pixels8_x2)(block+8, pixels+8, line_size, h);
828
// put_pixels16_y2: 16-byte-wide version built from two calls to the 8-wide
// routine, first for columns 0..7 and then for columns 8..15.
static void DEF(put_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
829
DEF(put_pixels8_y2)(block , pixels , line_size, h);
830
DEF(put_pixels8_y2)(block+8, pixels+8, line_size, h);
832
// put_no_rnd_pixels16_y2: 16-byte-wide version built from two calls to the
// 8-wide routine, first for columns 0..7 and then for columns 8..15.
static void DEF(put_no_rnd_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
833
DEF(put_no_rnd_pixels8_y2)(block , pixels , line_size, h);
834
DEF(put_no_rnd_pixels8_y2)(block+8, pixels+8, line_size, h);
836
// avg_pixels16: 16-byte-wide version built from two calls to the 8-wide
// routine, first for columns 0..7 and then for columns 8..15.
static void DEF(avg_pixels16)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
837
DEF(avg_pixels8)(block , pixels , line_size, h);
838
DEF(avg_pixels8)(block+8, pixels+8, line_size, h);
840
// avg_pixels16_x2: 16-byte-wide version built from two calls to the 8-wide
// routine, first for columns 0..7 and then for columns 8..15.
static void DEF(avg_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
841
DEF(avg_pixels8_x2)(block , pixels , line_size, h);
842
DEF(avg_pixels8_x2)(block+8, pixels+8, line_size, h);
844
// avg_pixels16_y2: 16-byte-wide version built from two calls to the 8-wide
// routine, first for columns 0..7 and then for columns 8..15.
static void DEF(avg_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
845
DEF(avg_pixels8_y2)(block , pixels , line_size, h);
846
DEF(avg_pixels8_y2)(block+8, pixels+8, line_size, h);
848
// avg_pixels16_xy2: 16-byte-wide version built from two calls to the 8-wide
// routine, first for columns 0..7 and then for columns 8..15.
static void DEF(avg_pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
849
DEF(avg_pixels8_xy2)(block , pixels , line_size, h);
850
DEF(avg_pixels8_xy2)(block+8, pixels+8, line_size, h);
853
// QPEL_2TAP_L3(OPNAME): generates OPNAME##2tap_qpel16_l3 and OPNAME##2tap_qpel8_l3.
// Per row, the visible asm loads the sample at base+off1, PAVGB-averages it
// with base+off2, then with base itself, applies STORE_OP to the destination
// operand, and stores the result with movq.
// Input operands as listed: off1 (%2), off2 (%3), dst-src (%4), stride.
// The destination is addressed as (%1,%4), so %1 is presumably src; the
// output-operand line that would confirm this is not visible in this excerpt.
// STORE_OP must be defined before the macro is expanded. The asm opener, the
// loop control and the clobber list are not visible here.
#define QPEL_2TAP_L3(OPNAME) \
854
static void DEF(OPNAME ## 2tap_qpel16_l3)(uint8_t *dst, uint8_t *src, int stride, int h, int off1, int off2){\
857
"movq (%1,%2), %%mm0 \n\t"\
858
"movq 8(%1,%2), %%mm1 \n\t"\
859
PAVGB" (%1,%3), %%mm0 \n\t"\
860
PAVGB" 8(%1,%3), %%mm1 \n\t"\
861
PAVGB" (%1), %%mm0 \n\t"\
862
PAVGB" 8(%1), %%mm1 \n\t"\
863
STORE_OP( (%1,%4),%%mm0)\
864
STORE_OP(8(%1,%4),%%mm1)\
865
"movq %%mm0, (%1,%4) \n\t"\
866
"movq %%mm1, 8(%1,%4) \n\t"\
871
:"r"((x86_reg)off1), "r"((x86_reg)off2),\
872
"r"((x86_reg)(dst-src)), "r"((x86_reg)stride)\
876
static void DEF(OPNAME ## 2tap_qpel8_l3)(uint8_t *dst, uint8_t *src, int stride, int h, int off1, int off2){\
879
"movq (%1,%2), %%mm0 \n\t"\
880
PAVGB" (%1,%3), %%mm0 \n\t"\
881
PAVGB" (%1), %%mm0 \n\t"\
882
STORE_OP((%1,%4),%%mm0)\
883
"movq %%mm0, (%1,%4) \n\t"\
888
:"r"((x86_reg)off1), "r"((x86_reg)off2),\
889
"r"((x86_reg)(dst-src)), "r"((x86_reg)stride)\
894
// STORE_OP variant 1: emits a PAVGB of memory operand a into register b, so
// the value about to be stored is first averaged with the destination contents.
// Presumably paired with the avg_ expansion of QPEL_2TAP_L3; the expansion and
// the matching undef lines are not visible in this excerpt.
#define STORE_OP(a,b) PAVGB" "#a","#b" \n\t"
897
// STORE_OP variant 2: expands to nothing, leaving a plain store
// (presumably paired with the put_ expansion; confirm).
#define STORE_OP(a,b)