2
* DSP utils : average functions are compiled twice for 3dnow/mmx2
3
* Copyright (c) 2000, 2001 Fabrice Bellard.
4
* Copyright (c) 2002-2004 Michael Niedermayer
6
* This library is free software; you can redistribute it and/or
7
* modify it under the terms of the GNU Lesser General Public
8
* License as published by the Free Software Foundation; either
9
* version 2 of the License, or (at your option) any later version.
11
* This library is distributed in the hope that it will be useful,
12
* but WITHOUT ANY WARRANTY; without even the implied warranty of
13
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
* Lesser General Public License for more details.
16
* You should have received a copy of the GNU Lesser General Public
17
* License along with this library; if not, write to the Free Software
18
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
* MMX optimization by Nick Kurshev <nickols_k@mail.ru>
21
* mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
22
* and improved by Zdenek Kabelac <kabi@users.sf.net>
25
/* XXX: we use explicit registers to avoid a gcc 2.95.2 register asm
26
clobber bug - now it will work with 2.95.2 and also with -fPIC
28
// put_pixels8_x2: stores to block the PAVGB average of each 8-byte source row
// with the same row read one byte further on (offset 1).
// Operands (from the constraint lines): %0 = h, %1 = pixels, %2 = block,
// %3 = line_size widened to long. REG_a is loaded with 2 * line_size.
// The visible body handles four rows: two rows, advance both pointers by
// REG_a, then two more rows and another advance.
// PAVGB and DEF are macros defined outside this view.
// NOTE(review): this listing omits lines of the function (asm statement
// opener, loop label, h update and branch, clobber list, braces); only the
// instructions visible here are described.
static void DEF(put_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
31
"lea (%3, %3), %%"REG_a" \n\t"
33
// rows 0 and 1: load, average with the byte-shifted row, store
"movq (%1), %%mm0 \n\t"
34
"movq (%1, %3), %%mm1 \n\t"
35
PAVGB" 1(%1), %%mm0 \n\t"
36
PAVGB" 1(%1, %3), %%mm1 \n\t"
37
"movq %%mm0, (%2) \n\t"
38
"movq %%mm1, (%2, %3) \n\t"
39
"add %%"REG_a", %1 \n\t"
40
"add %%"REG_a", %2 \n\t"
41
// rows 2 and 3: same pattern after stepping two rows down
"movq (%1), %%mm0 \n\t"
42
"movq (%1, %3), %%mm1 \n\t"
43
PAVGB" 1(%1), %%mm0 \n\t"
44
PAVGB" 1(%1, %3), %%mm1 \n\t"
45
"add %%"REG_a", %1 \n\t"
46
"movq %%mm0, (%2) \n\t"
47
"movq %%mm1, (%2, %3) \n\t"
48
"add %%"REG_a", %2 \n\t"
51
:"+g"(h), "+S"(pixels), "+D"(block)
52
:"r" ((long)line_size)
56
// put_pixels4_l2: writes to dst the PAVGB average of 4-byte rows taken from
// src1 and src2.
// Operands (from the constraint lines): %0 = h, %1 = src1, %2 = src2,
// %3 = dst, %4 = src1Stride, %5 = dstStride.
// src2 is read at byte offsets 0, 4, 8 and 12, which suggests its rows are
// packed back to back -- TODO confirm against the pointer updates, which are
// not visible here.
// NOTE(review): this listing omits lines of the function (asm statement
// opener, labels and branches, pointer increments, #else/#endif, braces);
// only the instructions visible here are described.
static void DEF(put_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
61
// single-row group: one row from each source, averaged and stored
// (presumably a pre-loop step for odd h -- TODO confirm)
"movd (%1), %%mm0 \n\t"
62
"movd (%2), %%mm1 \n\t"
65
PAVGB" %%mm1, %%mm0 \n\t"
66
"movd %%mm0, (%3) \n\t"
70
// two rows: src1 rows into mm0/mm1, src2 rows from offsets 0 and 4
"movd (%1), %%mm0 \n\t"
72
"movd (%1), %%mm1 \n\t"
73
"movd (%2), %%mm2 \n\t"
74
"movd 4(%2), %%mm3 \n\t"
76
PAVGB" %%mm2, %%mm0 \n\t"
77
PAVGB" %%mm3, %%mm1 \n\t"
78
"movd %%mm0, (%3) \n\t"
80
"movd %%mm1, (%3) \n\t"
82
// two more rows: src2 rows from offsets 8 and 12
"movd (%1), %%mm0 \n\t"
84
"movd (%1), %%mm1 \n\t"
85
"movd 8(%2), %%mm2 \n\t"
86
"movd 12(%2), %%mm3 \n\t"
88
PAVGB" %%mm2, %%mm0 \n\t"
89
PAVGB" %%mm3, %%mm1 \n\t"
90
"movd %%mm0, (%3) \n\t"
92
"movd %%mm1, (%3) \n\t"
97
// Under PIC h is a memory operand; the alternative line keeps it in the b register.
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
98
:"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
100
:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
102
:"S"((long)src1Stride), "D"((long)dstStride)
107
// put_pixels8_l2: writes to dst the PAVGB average of 8-byte rows taken from
// src1 and src2.
// Operands (from the constraint lines): %0 = h, %1 = src1, %2 = src2,
// %3 = dst, %4 = src1Stride, %5 = dstStride.
// src2 is read at byte offsets 0, 8, 16 and 24, which suggests its rows are
// packed back to back -- TODO confirm against the pointer updates, which are
// not visible here.
// NOTE(review): this listing omits lines of the function (asm statement
// opener, labels and branches, pointer increments, #else/#endif, the end of
// the trailing block comment, braces); only visible instructions are described.
static void DEF(put_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
112
// single-row group (presumably a pre-loop step for odd h -- TODO confirm)
"movq (%1), %%mm0 \n\t"
113
"movq (%2), %%mm1 \n\t"
116
PAVGB" %%mm1, %%mm0 \n\t"
117
"movq %%mm0, (%3) \n\t"
121
// two rows: src1 rows averaged directly with src2 memory at offsets 0 and 8
"movq (%1), %%mm0 \n\t"
123
"movq (%1), %%mm1 \n\t"
125
PAVGB" (%2), %%mm0 \n\t"
126
PAVGB" 8(%2), %%mm1 \n\t"
127
"movq %%mm0, (%3) \n\t"
129
"movq %%mm1, (%3) \n\t"
131
// two more rows: src2 offsets 16 and 24
"movq (%1), %%mm0 \n\t"
133
"movq (%1), %%mm1 \n\t"
135
PAVGB" 16(%2), %%mm0 \n\t"
136
PAVGB" 24(%2), %%mm1 \n\t"
137
"movq %%mm0, (%3) \n\t"
139
"movq %%mm1, (%3) \n\t"
144
// Under PIC h is a memory operand; the alternative line keeps it in the b register.
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
145
:"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
147
:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
149
:"S"((long)src1Stride), "D"((long)dstStride)
151
//the following should be used, though better not with gcc ...
152
/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
153
:"r"(src1Stride), "r"(dstStride)
157
// put_no_rnd_pixels8_l2: writes to dst the average of 8-byte rows from src1
// and src2, with every PAVGB bracketed by bitwise complements.
// pcmpeqb sets mm6 to all ones; each input is XORed with mm6 before PAVGB and
// the result is XORed with mm6 again. Assuming PAVGB rounds halves up, the
// net effect is an average that rounds down -- TODO confirm the PAVGB
// definition in the including file.
// Operands (from the constraint lines): %0 = h, %1 = src1, %2 = src2,
// %3 = dst, %4 = src1Stride, %5 = dstStride. src2 is read at offsets 0, 8,
// 16 and 24.
// NOTE(review): this listing omits lines of the function (asm statement
// opener, labels and branches, pointer increments, #else/#endif, the end of
// the trailing block comment, braces); only visible instructions are described.
static void DEF(put_no_rnd_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
160
// mm6 = all ones, used as the complement mask below
"pcmpeqb %%mm6, %%mm6 \n\t"
163
// single-row group (presumably a pre-loop step for odd h -- TODO confirm)
"movq (%1), %%mm0 \n\t"
164
"movq (%2), %%mm1 \n\t"
167
"pxor %%mm6, %%mm0 \n\t"
168
"pxor %%mm6, %%mm1 \n\t"
169
PAVGB" %%mm1, %%mm0 \n\t"
170
"pxor %%mm6, %%mm0 \n\t"
171
"movq %%mm0, (%3) \n\t"
175
// two rows: src2 offsets 0 and 8
"movq (%1), %%mm0 \n\t"
177
"movq (%1), %%mm1 \n\t"
179
"movq (%2), %%mm2 \n\t"
180
"movq 8(%2), %%mm3 \n\t"
181
"pxor %%mm6, %%mm0 \n\t"
182
"pxor %%mm6, %%mm1 \n\t"
183
"pxor %%mm6, %%mm2 \n\t"
184
"pxor %%mm6, %%mm3 \n\t"
185
PAVGB" %%mm2, %%mm0 \n\t"
186
PAVGB" %%mm3, %%mm1 \n\t"
187
"pxor %%mm6, %%mm0 \n\t"
188
"pxor %%mm6, %%mm1 \n\t"
189
"movq %%mm0, (%3) \n\t"
191
"movq %%mm1, (%3) \n\t"
193
// two more rows: src2 offsets 16 and 24
"movq (%1), %%mm0 \n\t"
195
"movq (%1), %%mm1 \n\t"
197
"movq 16(%2), %%mm2 \n\t"
198
"movq 24(%2), %%mm3 \n\t"
199
"pxor %%mm6, %%mm0 \n\t"
200
"pxor %%mm6, %%mm1 \n\t"
201
"pxor %%mm6, %%mm2 \n\t"
202
"pxor %%mm6, %%mm3 \n\t"
203
PAVGB" %%mm2, %%mm0 \n\t"
204
PAVGB" %%mm3, %%mm1 \n\t"
205
"pxor %%mm6, %%mm0 \n\t"
206
"pxor %%mm6, %%mm1 \n\t"
207
"movq %%mm0, (%3) \n\t"
209
"movq %%mm1, (%3) \n\t"
214
// Under PIC h is a memory operand; the alternative line keeps it in the b register.
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
215
:"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
217
:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
219
:"S"((long)src1Stride), "D"((long)dstStride)
221
//the following should be used, though better not with gcc ...
222
/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
223
:"r"(src1Stride), "r"(dstStride)
227
// avg_pixels4_l2: averages 4-byte rows from src1 and src2 with PAVGB, then
// averages that result with what is already stored at dst, and writes it back.
// Operands (from the constraint lines): %0 = h, %1 = src1, %2 = src2,
// %3 = dst, %4 = src1Stride, %5 = dstStride. src2 is read at offsets 0, 4,
// 8 and 12.
// NOTE(review): this listing omits lines of the function (asm statement
// opener, labels and branches, pointer increments, #else/#endif, braces);
// only the instructions visible here are described.
static void DEF(avg_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
232
// single-row group (presumably a pre-loop step for odd h -- TODO confirm)
"movd (%1), %%mm0 \n\t"
233
"movd (%2), %%mm1 \n\t"
236
PAVGB" %%mm1, %%mm0 \n\t"
237
// blend with the existing destination row
PAVGB" (%3), %%mm0 \n\t"
238
"movd %%mm0, (%3) \n\t"
242
// two rows: src2 offsets 0 and 4
"movd (%1), %%mm0 \n\t"
244
"movd (%1), %%mm1 \n\t"
246
PAVGB" (%2), %%mm0 \n\t"
247
PAVGB" 4(%2), %%mm1 \n\t"
248
PAVGB" (%3), %%mm0 \n\t"
249
"movd %%mm0, (%3) \n\t"
251
PAVGB" (%3), %%mm1 \n\t"
252
"movd %%mm1, (%3) \n\t"
254
// two more rows: src2 offsets 8 and 12
"movd (%1), %%mm0 \n\t"
256
"movd (%1), %%mm1 \n\t"
258
PAVGB" 8(%2), %%mm0 \n\t"
259
PAVGB" 12(%2), %%mm1 \n\t"
260
PAVGB" (%3), %%mm0 \n\t"
261
"movd %%mm0, (%3) \n\t"
263
PAVGB" (%3), %%mm1 \n\t"
264
"movd %%mm1, (%3) \n\t"
269
// Under PIC h is a memory operand; the alternative line keeps it in the b register.
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
270
:"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
272
:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
274
:"S"((long)src1Stride), "D"((long)dstStride)
279
// avg_pixels8_l2: averages 8-byte rows from src1 and src2 with PAVGB, then
// averages that result with what is already stored at dst, and writes it back.
// Operands (from the constraint lines): %0 = h, %1 = src1, %2 = src2,
// %3 = dst, %4 = src1Stride, %5 = dstStride. src2 is read at offsets 0, 8,
// 16 and 24.
// NOTE(review): this listing omits lines of the function (asm statement
// opener, labels and branches, pointer increments, #else/#endif, the end of
// the trailing block comment, braces); only visible instructions are described.
static void DEF(avg_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
284
// single-row group (presumably a pre-loop step for odd h -- TODO confirm)
"movq (%1), %%mm0 \n\t"
285
"movq (%2), %%mm1 \n\t"
288
PAVGB" %%mm1, %%mm0 \n\t"
289
// blend with the existing destination row
PAVGB" (%3), %%mm0 \n\t"
290
"movq %%mm0, (%3) \n\t"
294
// two rows: src2 offsets 0 and 8
"movq (%1), %%mm0 \n\t"
296
"movq (%1), %%mm1 \n\t"
298
PAVGB" (%2), %%mm0 \n\t"
299
PAVGB" 8(%2), %%mm1 \n\t"
300
PAVGB" (%3), %%mm0 \n\t"
301
"movq %%mm0, (%3) \n\t"
303
PAVGB" (%3), %%mm1 \n\t"
304
"movq %%mm1, (%3) \n\t"
306
// two more rows: src2 offsets 16 and 24
"movq (%1), %%mm0 \n\t"
308
"movq (%1), %%mm1 \n\t"
310
PAVGB" 16(%2), %%mm0 \n\t"
311
PAVGB" 24(%2), %%mm1 \n\t"
312
PAVGB" (%3), %%mm0 \n\t"
313
"movq %%mm0, (%3) \n\t"
315
PAVGB" (%3), %%mm1 \n\t"
316
"movq %%mm1, (%3) \n\t"
321
// Under PIC h is a memory operand; the alternative line keeps it in the b register.
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
322
:"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
324
:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
326
:"S"((long)src1Stride), "D"((long)dstStride)
328
//the following should be used, though better not with gcc ...
329
/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
330
:"r"(src1Stride), "r"(dstStride)
334
// put_pixels16_x2: 16-byte-wide version of the horizontal average. Each
// source row is handled as two 8-byte halves (offsets 0 and 8), each averaged
// with PAVGB against the same data read one byte further on (offsets 1 and 9),
// and stored to block at offsets 0 and 8.
// Operands (from the constraint lines): %0 = h, %1 = pixels, %2 = block,
// %3 = line_size widened to long. REG_a holds 2 * line_size and is listed as
// clobbered together with "memory".
// The visible body handles four rows: two rows, advance, two rows, advance.
// NOTE(review): this listing omits lines of the function (asm statement
// opener, loop label, h update and branch, braces); only the instructions
// visible here are described.
static void DEF(put_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
337
"lea (%3, %3), %%"REG_a" \n\t"
339
// rows 0 and 1: mm0/mm1 hold the left halves, mm2/mm3 the right halves
"movq (%1), %%mm0 \n\t"
340
"movq (%1, %3), %%mm1 \n\t"
341
"movq 8(%1), %%mm2 \n\t"
342
"movq 8(%1, %3), %%mm3 \n\t"
343
PAVGB" 1(%1), %%mm0 \n\t"
344
PAVGB" 1(%1, %3), %%mm1 \n\t"
345
PAVGB" 9(%1), %%mm2 \n\t"
346
PAVGB" 9(%1, %3), %%mm3 \n\t"
347
"movq %%mm0, (%2) \n\t"
348
"movq %%mm1, (%2, %3) \n\t"
349
"movq %%mm2, 8(%2) \n\t"
350
"movq %%mm3, 8(%2, %3) \n\t"
351
"add %%"REG_a", %1 \n\t"
352
"add %%"REG_a", %2 \n\t"
353
// rows 2 and 3: same pattern after stepping two rows down
"movq (%1), %%mm0 \n\t"
354
"movq (%1, %3), %%mm1 \n\t"
355
"movq 8(%1), %%mm2 \n\t"
356
"movq 8(%1, %3), %%mm3 \n\t"
357
PAVGB" 1(%1), %%mm0 \n\t"
358
PAVGB" 1(%1, %3), %%mm1 \n\t"
359
PAVGB" 9(%1), %%mm2 \n\t"
360
PAVGB" 9(%1, %3), %%mm3 \n\t"
361
"add %%"REG_a", %1 \n\t"
362
"movq %%mm0, (%2) \n\t"
363
"movq %%mm1, (%2, %3) \n\t"
364
"movq %%mm2, 8(%2) \n\t"
365
"movq %%mm3, 8(%2, %3) \n\t"
366
"add %%"REG_a", %2 \n\t"
369
:"+g"(h), "+S"(pixels), "+D"(block)
370
:"r" ((long)line_size)
371
:"%"REG_a, "memory");
374
// put_pixels16_l2: writes to dst the PAVGB average of 16-byte rows from src1
// and src2. Each row is processed as two 8-byte halves: src1 at offsets 0 and
// 8 into mm0/mm1, averaged with src2 memory, then stored to dst at 0 and 8.
// Operands (from the constraint lines): %0 = h, %1 = src1, %2 = src2,
// %3 = dst, %4 = src1Stride, %5 = dstStride.
// src2 is read at offsets 0/8 and 16/24, which suggests 16-byte packed rows
// -- TODO confirm against the pointer updates, which are not visible here.
// NOTE(review): this listing omits lines of the function (asm statement
// opener, labels and branches, pointer increments, #else/#endif, the end of
// the trailing block comment, braces); only visible instructions are described.
static void DEF(put_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
379
// single-row group (presumably a pre-loop step for odd h -- TODO confirm)
"movq (%1), %%mm0 \n\t"
380
"movq 8(%1), %%mm1 \n\t"
381
PAVGB" (%2), %%mm0 \n\t"
382
PAVGB" 8(%2), %%mm1 \n\t"
385
"movq %%mm0, (%3) \n\t"
386
"movq %%mm1, 8(%3) \n\t"
390
// one row against src2 offsets 0 and 8
"movq (%1), %%mm0 \n\t"
391
"movq 8(%1), %%mm1 \n\t"
393
PAVGB" (%2), %%mm0 \n\t"
394
PAVGB" 8(%2), %%mm1 \n\t"
395
"movq %%mm0, (%3) \n\t"
396
"movq %%mm1, 8(%3) \n\t"
398
// next row against src2 offsets 16 and 24
"movq (%1), %%mm0 \n\t"
399
"movq 8(%1), %%mm1 \n\t"
401
PAVGB" 16(%2), %%mm0 \n\t"
402
PAVGB" 24(%2), %%mm1 \n\t"
403
"movq %%mm0, (%3) \n\t"
404
"movq %%mm1, 8(%3) \n\t"
409
// Under PIC h is a memory operand; the alternative line keeps it in the b register.
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
410
:"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
412
:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
414
:"S"((long)src1Stride), "D"((long)dstStride)
416
//the following should be used, though better not with gcc ...
417
/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
418
:"r"(src1Stride), "r"(dstStride)
422
// avg_pixels16_l2: averages 16-byte rows from src1 and src2 with PAVGB, then
// averages that result with what is already stored at dst, and writes it
// back. Each row is processed as two 8-byte halves at offsets 0 and 8.
// Operands (from the constraint lines): %0 = h, %1 = src1, %2 = src2,
// %3 = dst, %4 = src1Stride, %5 = dstStride. src2 is read at offsets 0/8 and
// 16/24.
// NOTE(review): this listing omits lines of the function (asm statement
// opener, labels and branches, pointer increments, #else/#endif, the end of
// the trailing block comment, braces); only visible instructions are described.
static void DEF(avg_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
427
// single-row group (presumably a pre-loop step for odd h -- TODO confirm)
"movq (%1), %%mm0 \n\t"
428
"movq 8(%1), %%mm1 \n\t"
429
PAVGB" (%2), %%mm0 \n\t"
430
PAVGB" 8(%2), %%mm1 \n\t"
433
// blend both halves with the existing destination row
PAVGB" (%3), %%mm0 \n\t"
434
PAVGB" 8(%3), %%mm1 \n\t"
435
"movq %%mm0, (%3) \n\t"
436
"movq %%mm1, 8(%3) \n\t"
440
// one row against src2 offsets 0 and 8
"movq (%1), %%mm0 \n\t"
441
"movq 8(%1), %%mm1 \n\t"
443
PAVGB" (%2), %%mm0 \n\t"
444
PAVGB" 8(%2), %%mm1 \n\t"
445
PAVGB" (%3), %%mm0 \n\t"
446
PAVGB" 8(%3), %%mm1 \n\t"
447
"movq %%mm0, (%3) \n\t"
448
"movq %%mm1, 8(%3) \n\t"
450
// next row against src2 offsets 16 and 24
"movq (%1), %%mm0 \n\t"
451
"movq 8(%1), %%mm1 \n\t"
453
PAVGB" 16(%2), %%mm0 \n\t"
454
PAVGB" 24(%2), %%mm1 \n\t"
455
PAVGB" (%3), %%mm0 \n\t"
456
PAVGB" 8(%3), %%mm1 \n\t"
457
"movq %%mm0, (%3) \n\t"
458
"movq %%mm1, 8(%3) \n\t"
463
// Under PIC h is a memory operand; the alternative line keeps it in the b register.
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
464
:"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
466
:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
468
:"S"((long)src1Stride), "D"((long)dstStride)
470
//the following should be used, though better not with gcc ...
471
/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
472
:"r"(src1Stride), "r"(dstStride)
476
// put_no_rnd_pixels16_l2: writes to dst the average of 16-byte rows from src1
// and src2, with every PAVGB bracketed by bitwise complements.
// pcmpeqb sets mm6 to all ones; each input is XORed with mm6 before PAVGB and
// the result is XORed with mm6 again. Assuming PAVGB rounds halves up, the
// net effect is an average that rounds down -- TODO confirm the PAVGB
// definition in the including file.
// Each row is processed as two 8-byte halves: src1 in mm0/mm1, src2 in
// mm2/mm3, results stored to dst at offsets 0 and 8.
// Operands (from the constraint lines): %0 = h, %1 = src1, %2 = src2,
// %3 = dst, %4 = src1Stride, %5 = dstStride. src2 is read at offsets 0/8 and
// 16/24.
// NOTE(review): this listing omits lines of the function (asm statement
// opener, labels and branches, pointer increments, #else/#endif, the end of
// the trailing block comment, braces); only visible instructions are described.
static void DEF(put_no_rnd_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
479
// mm6 = all ones, used as the complement mask below
"pcmpeqb %%mm6, %%mm6 \n\t"
482
// single-row group (presumably a pre-loop step for odd h -- TODO confirm)
"movq (%1), %%mm0 \n\t"
483
"movq 8(%1), %%mm1 \n\t"
484
"movq (%2), %%mm2 \n\t"
485
"movq 8(%2), %%mm3 \n\t"
486
"pxor %%mm6, %%mm0 \n\t"
487
"pxor %%mm6, %%mm1 \n\t"
488
"pxor %%mm6, %%mm2 \n\t"
489
"pxor %%mm6, %%mm3 \n\t"
490
PAVGB" %%mm2, %%mm0 \n\t"
491
PAVGB" %%mm3, %%mm1 \n\t"
492
"pxor %%mm6, %%mm0 \n\t"
493
"pxor %%mm6, %%mm1 \n\t"
496
"movq %%mm0, (%3) \n\t"
497
"movq %%mm1, 8(%3) \n\t"
501
// one row against src2 offsets 0 and 8
"movq (%1), %%mm0 \n\t"
502
"movq 8(%1), %%mm1 \n\t"
504
"movq (%2), %%mm2 \n\t"
505
"movq 8(%2), %%mm3 \n\t"
506
"pxor %%mm6, %%mm0 \n\t"
507
"pxor %%mm6, %%mm1 \n\t"
508
"pxor %%mm6, %%mm2 \n\t"
509
"pxor %%mm6, %%mm3 \n\t"
510
PAVGB" %%mm2, %%mm0 \n\t"
511
PAVGB" %%mm3, %%mm1 \n\t"
512
"pxor %%mm6, %%mm0 \n\t"
513
"pxor %%mm6, %%mm1 \n\t"
514
"movq %%mm0, (%3) \n\t"
515
"movq %%mm1, 8(%3) \n\t"
517
// next row against src2 offsets 16 and 24
"movq (%1), %%mm0 \n\t"
518
"movq 8(%1), %%mm1 \n\t"
520
"movq 16(%2), %%mm2 \n\t"
521
"movq 24(%2), %%mm3 \n\t"
522
"pxor %%mm6, %%mm0 \n\t"
523
"pxor %%mm6, %%mm1 \n\t"
524
"pxor %%mm6, %%mm2 \n\t"
525
"pxor %%mm6, %%mm3 \n\t"
526
PAVGB" %%mm2, %%mm0 \n\t"
527
PAVGB" %%mm3, %%mm1 \n\t"
528
"pxor %%mm6, %%mm0 \n\t"
529
"pxor %%mm6, %%mm1 \n\t"
530
"movq %%mm0, (%3) \n\t"
531
"movq %%mm1, 8(%3) \n\t"
536
// Under PIC h is a memory operand; the alternative line keeps it in the b register.
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
537
:"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
539
:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
541
:"S"((long)src1Stride), "D"((long)dstStride)
543
//the following should be used, though better not with gcc ...
544
/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
545
:"r"(src1Stride), "r"(dstStride)
549
/* GL: this function rounds incorrectly if overflow occurs */
550
// put_no_rnd_pixels8_x2: horizontal average of each 8-byte source row with
// the same row read one byte further on, with mm6 subtracted (unsigned,
// saturating) from the unshifted row before PAVGB.
// Operands (from the constraint lines): %0 = h, %1 = pixels, %2 = block,
// %3 = line_size widened to long. REG_a holds 2 * line_size and is listed as
// clobbered together with "memory".
// NOTE(review): mm6 is not initialised on any line visible here; presumably
// it is loaded with a small per-byte constant before the loop so that the
// subtraction biases PAVGB towards rounding down -- TODO confirm. Saturation
// in psubusb would explain the rounding caveat noted for this function.
// NOTE(review): this listing omits lines of the function (asm statement
// opener, mm6 setup, loop label, h update and branch, braces); only the
// instructions visible here are described.
static void DEF(put_no_rnd_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
554
"lea (%3, %3), %%"REG_a" \n\t"
556
// rows 0 and 1: mm0/mm2 = rows, mm1/mm3 = rows shifted by one byte
"movq (%1), %%mm0 \n\t"
557
"movq (%1, %3), %%mm2 \n\t"
558
"movq 1(%1), %%mm1 \n\t"
559
"movq 1(%1, %3), %%mm3 \n\t"
560
"add %%"REG_a", %1 \n\t"
561
"psubusb %%mm6, %%mm0 \n\t"
562
"psubusb %%mm6, %%mm2 \n\t"
563
PAVGB" %%mm1, %%mm0 \n\t"
564
PAVGB" %%mm3, %%mm2 \n\t"
565
"movq %%mm0, (%2) \n\t"
566
"movq %%mm2, (%2, %3) \n\t"
567
// rows 2 and 3: same pattern; pixels was already advanced above
"movq (%1), %%mm0 \n\t"
568
"movq 1(%1), %%mm1 \n\t"
569
"movq (%1, %3), %%mm2 \n\t"
570
"movq 1(%1, %3), %%mm3 \n\t"
571
"add %%"REG_a", %2 \n\t"
572
"add %%"REG_a", %1 \n\t"
573
"psubusb %%mm6, %%mm0 \n\t"
574
"psubusb %%mm6, %%mm2 \n\t"
575
PAVGB" %%mm1, %%mm0 \n\t"
576
PAVGB" %%mm3, %%mm2 \n\t"
577
"movq %%mm0, (%2) \n\t"
578
"movq %%mm2, (%2, %3) \n\t"
579
"add %%"REG_a", %2 \n\t"
582
:"+g"(h), "+S"(pixels), "+D"(block)
583
:"r" ((long)line_size)
584
:"%"REG_a, "memory");
587
// put_pixels8_y2: vertical average for an 8-byte-wide block. Each stored row
// is the PAVGB average of two vertically adjacent source rows.
// Operands (from the constraint lines): %0 = h, %1 = pixels, %2 = block,
// %3 = line_size widened to long. REG_a holds 2 * line_size and is listed as
// clobbered together with "memory".
// mm0 is preloaded with the first source row. The visible body loads the next
// two rows into mm1 and mm2, stores avg(mm0, mm1) and avg(mm1, mm2), then
// repeats with the register roles rotated (mm2, mm1, mm0) so that each source
// row is loaded only once.
// Stores go to (%2, %3) and (%2, REG_a), i.e. one and two rows past %2.
// NOTE(review): lines between the preload and the first row load are missing
// from this listing; the store offsets suggest block is adjusted by one row
// beforehand -- TODO confirm. The asm statement opener, loop label, h update,
// branch and braces are also not visible.
static void DEF(put_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
590
"lea (%3, %3), %%"REG_a" \n\t"
591
"movq (%1), %%mm0 \n\t"
594
"movq (%1, %3), %%mm1 \n\t"
595
"movq (%1, %%"REG_a"), %%mm2 \n\t"
596
"add %%"REG_a", %1 \n\t"
597
PAVGB" %%mm1, %%mm0 \n\t"
598
PAVGB" %%mm2, %%mm1 \n\t"
599
"movq %%mm0, (%2, %3) \n\t"
600
"movq %%mm1, (%2, %%"REG_a") \n\t"
601
// second pair: mm2 still holds the last row loaded above
"movq (%1, %3), %%mm1 \n\t"
602
"movq (%1, %%"REG_a"), %%mm0 \n\t"
603
"add %%"REG_a", %2 \n\t"
604
"add %%"REG_a", %1 \n\t"
605
PAVGB" %%mm1, %%mm2 \n\t"
606
PAVGB" %%mm0, %%mm1 \n\t"
607
"movq %%mm2, (%2, %3) \n\t"
608
"movq %%mm1, (%2, %%"REG_a") \n\t"
609
"add %%"REG_a", %2 \n\t"
612
:"+g"(h), "+S"(pixels), "+D" (block)
613
:"r" ((long)line_size)
614
:"%"REG_a, "memory");
617
/* GL: this function rounds incorrectly if overflow occurs */
618
// put_no_rnd_pixels8_y2: vertical average for an 8-byte-wide block, with mm6
// subtracted (unsigned, saturating) from the middle row of each group before
// the PAVGB operations.
// Operands (from the constraint lines): %0 = h, %1 = pixels, %2 = block,
// %3 = line_size widened to long. REG_a holds 2 * line_size and is listed as
// clobbered together with "memory".
// mm0 is preloaded with the first source row; the body then follows the same
// rotating three-register scheme as the rounding variant: load two rows,
// average adjacent pairs, store at (%2, %3) and (%2, REG_a).
// NOTE(review): mm6 is not initialised on any line visible here; presumably a
// small per-byte constant that biases PAVGB towards rounding down -- TODO
// confirm. Saturation in psubusb would explain the rounding caveat noted for
// this function.
// NOTE(review): this listing omits lines of the function (asm statement
// opener, mm6 setup, any adjustment of block before the loop, loop label,
// h update and branch, braces); only visible instructions are described.
static void DEF(put_no_rnd_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
622
"lea (%3, %3), %%"REG_a" \n\t"
623
"movq (%1), %%mm0 \n\t"
626
"movq (%1, %3), %%mm1 \n\t"
627
"movq (%1, %%"REG_a"), %%mm2 \n\t"
628
"add %%"REG_a", %1 \n\t"
629
// bias the shared middle row before it is used in both averages
"psubusb %%mm6, %%mm1 \n\t"
630
PAVGB" %%mm1, %%mm0 \n\t"
631
PAVGB" %%mm2, %%mm1 \n\t"
632
"movq %%mm0, (%2, %3) \n\t"
633
"movq %%mm1, (%2, %%"REG_a") \n\t"
634
// second pair: mm2 still holds the last row loaded above
"movq (%1, %3), %%mm1 \n\t"
635
"movq (%1, %%"REG_a"), %%mm0 \n\t"
636
"add %%"REG_a", %2 \n\t"
637
"add %%"REG_a", %1 \n\t"
638
"psubusb %%mm6, %%mm1 \n\t"
639
PAVGB" %%mm1, %%mm2 \n\t"
640
PAVGB" %%mm0, %%mm1 \n\t"
641
"movq %%mm2, (%2, %3) \n\t"
642
"movq %%mm1, (%2, %%"REG_a") \n\t"
643
"add %%"REG_a", %2 \n\t"
646
:"+g"(h), "+S"(pixels), "+D" (block)
647
:"r" ((long)line_size)
648
:"%"REG_a, "memory");
651
// avg_pixels8: blends an 8-byte-wide source into the destination. Each
// destination row is loaded from block, averaged with the matching row of
// pixels using PAVGB, and written back to block.
// Operands (from the constraint lines): %0 = h, %1 = pixels, %2 = block,
// %3 = line_size widened to long. REG_a holds 2 * line_size and is listed as
// clobbered together with "memory".
// The visible body handles four rows: two rows, advance both pointers by
// REG_a, then two more rows and another advance.
// NOTE(review): this listing omits lines of the function (asm statement
// opener, loop label, h update and branch, braces); only the instructions
// visible here are described.
static void DEF(avg_pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
654
"lea (%3, %3), %%"REG_a" \n\t"
656
// rows 0 and 1: destination rows into mm0/mm1, averaged with the source
"movq (%2), %%mm0 \n\t"
657
"movq (%2, %3), %%mm1 \n\t"
658
PAVGB" (%1), %%mm0 \n\t"
659
PAVGB" (%1, %3), %%mm1 \n\t"
660
"movq %%mm0, (%2) \n\t"
661
"movq %%mm1, (%2, %3) \n\t"
662
"add %%"REG_a", %1 \n\t"
663
"add %%"REG_a", %2 \n\t"
664
// rows 2 and 3
"movq (%2), %%mm0 \n\t"
665
"movq (%2, %3), %%mm1 \n\t"
666
PAVGB" (%1), %%mm0 \n\t"
667
PAVGB" (%1, %3), %%mm1 \n\t"
668
"add %%"REG_a", %1 \n\t"
669
"movq %%mm0, (%2) \n\t"
670
"movq %%mm1, (%2, %3) \n\t"
671
"add %%"REG_a", %2 \n\t"
674
:"+g"(h), "+S"(pixels), "+D"(block)
675
:"r" ((long)line_size)
676
:"%"REG_a, "memory");
679
// avg_pixels8_x2: horizontal average blended into the destination. Each
// 8-byte source row is averaged with the same row read one byte further on,
// the result is averaged with the existing row in block, and written back.
// Operands (from the constraint lines): %0 = h, %1 = pixels, %2 = block,
// %3 = line_size widened to long. REG_a holds 2 * line_size and is listed as
// clobbered together with "memory".
// The visible body handles four rows in two groups of two.
// NOTE(review): this listing omits lines of the function (asm statement
// opener, loop label, h update and branch, braces); only the instructions
// visible here are described.
static void DEF(avg_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
682
"lea (%3, %3), %%"REG_a" \n\t"
684
// rows 0 and 1: horizontal average, then blend with block
"movq (%1), %%mm0 \n\t"
685
"movq (%1, %3), %%mm2 \n\t"
686
PAVGB" 1(%1), %%mm0 \n\t"
687
PAVGB" 1(%1, %3), %%mm2 \n\t"
688
PAVGB" (%2), %%mm0 \n\t"
689
PAVGB" (%2, %3), %%mm2 \n\t"
690
"add %%"REG_a", %1 \n\t"
691
"movq %%mm0, (%2) \n\t"
692
"movq %%mm2, (%2, %3) \n\t"
693
// rows 2 and 3: block is advanced before its rows are read for blending
"movq (%1), %%mm0 \n\t"
694
"movq (%1, %3), %%mm2 \n\t"
695
PAVGB" 1(%1), %%mm0 \n\t"
696
PAVGB" 1(%1, %3), %%mm2 \n\t"
697
"add %%"REG_a", %2 \n\t"
698
"add %%"REG_a", %1 \n\t"
699
PAVGB" (%2), %%mm0 \n\t"
700
PAVGB" (%2, %3), %%mm2 \n\t"
701
"movq %%mm0, (%2) \n\t"
702
"movq %%mm2, (%2, %3) \n\t"
703
"add %%"REG_a", %2 \n\t"
706
:"+g"(h), "+S"(pixels), "+D"(block)
707
:"r" ((long)line_size)
708
:"%"REG_a, "memory");
711
// avg_pixels8_y2: vertical average blended into the destination. Adjacent
// source rows are averaged with PAVGB, the result is averaged with the
// existing destination row (loaded into mm3/mm4), and written back.
// Operands (from the constraint lines): %0 = h, %1 = pixels, %2 = block,
// %3 = line_size widened to long. REG_a holds 2 * line_size and is listed as
// clobbered together with "memory".
// mm0 is preloaded with the first source row; the body uses a rotating
// three-register scheme (mm0, mm1, mm2) so each source row is loaded once.
// Destination rows are read and written at (%2, %3) and (%2, REG_a).
// NOTE(review): lines between the preload and the first row load are missing
// from this listing; the destination offsets suggest block is adjusted by one
// row beforehand -- TODO confirm. The asm statement opener, loop label,
// h update, branch and braces are also not visible.
static void DEF(avg_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
714
"lea (%3, %3), %%"REG_a" \n\t"
715
"movq (%1), %%mm0 \n\t"
718
"movq (%1, %3), %%mm1 \n\t"
719
"movq (%1, %%"REG_a"), %%mm2 \n\t"
720
"add %%"REG_a", %1 \n\t"
721
PAVGB" %%mm1, %%mm0 \n\t"
722
PAVGB" %%mm2, %%mm1 \n\t"
723
// fetch the two destination rows and blend them in
"movq (%2, %3), %%mm3 \n\t"
724
"movq (%2, %%"REG_a"), %%mm4 \n\t"
725
PAVGB" %%mm3, %%mm0 \n\t"
726
PAVGB" %%mm4, %%mm1 \n\t"
727
"movq %%mm0, (%2, %3) \n\t"
728
"movq %%mm1, (%2, %%"REG_a") \n\t"
729
// second pair: mm2 still holds the last row loaded above
"movq (%1, %3), %%mm1 \n\t"
730
"movq (%1, %%"REG_a"), %%mm0 \n\t"
731
PAVGB" %%mm1, %%mm2 \n\t"
732
PAVGB" %%mm0, %%mm1 \n\t"
733
"add %%"REG_a", %2 \n\t"
734
"add %%"REG_a", %1 \n\t"
735
"movq (%2, %3), %%mm3 \n\t"
736
"movq (%2, %%"REG_a"), %%mm4 \n\t"
737
PAVGB" %%mm3, %%mm2 \n\t"
738
PAVGB" %%mm4, %%mm1 \n\t"
739
"movq %%mm2, (%2, %3) \n\t"
740
"movq %%mm1, (%2, %%"REG_a") \n\t"
741
"add %%"REG_a", %2 \n\t"
744
:"+g"(h), "+S"(pixels), "+D"(block)
745
:"r" ((long)line_size)
746
:"%"REG_a, "memory");
749
// Note: this is not correctly rounded, but this function is only used for B-frames, so it does not matter
750
// avg_pixels8_xy2: combined horizontal and vertical average blended into the
// destination, built from chained PAVGB operations.
// For each source row a horizontal average is formed (row with the same row
// read one byte further on); horizontally averaged rows that are vertically
// adjacent are then averaged together, the result is averaged with the
// existing row in block, and written back.
// Operands (from the constraint lines): %0 = h, %1 = pixels, %2 = block,
// %3 = line_size widened to long. REG_a holds 2 * line_size and is listed as
// clobbered together with "memory".
// mm0 is preloaded with the horizontal average of the first row; mm0, mm1 and
// mm2 then rotate so each source row is processed once.
// In the first group mm6 is subtracted (unsigned, saturating) from one row
// before its horizontal average; the second group has no such step.
// NOTE(review): mm6 is not initialised on any line visible here -- TODO
// confirm its value. The asm statement opener, mm6 setup, loop label,
// h update, branch and braces are also missing from this listing.
static void DEF(avg_pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
754
"lea (%3, %3), %%"REG_a" \n\t"
755
"movq (%1), %%mm0 \n\t"
756
PAVGB" 1(%1), %%mm0 \n\t"
759
// first group: horizontal averages of the next two rows into mm1 and mm2
"movq (%1, %%"REG_a"), %%mm2 \n\t"
760
"movq (%1, %3), %%mm1 \n\t"
761
"psubusb %%mm6, %%mm2 \n\t"
762
PAVGB" 1(%1, %3), %%mm1 \n\t"
763
PAVGB" 1(%1, %%"REG_a"), %%mm2 \n\t"
764
"add %%"REG_a", %1 \n\t"
765
// vertical step, then blend with the destination rows
PAVGB" %%mm1, %%mm0 \n\t"
766
PAVGB" %%mm2, %%mm1 \n\t"
767
PAVGB" (%2), %%mm0 \n\t"
768
PAVGB" (%2, %3), %%mm1 \n\t"
769
"movq %%mm0, (%2) \n\t"
770
"movq %%mm1, (%2, %3) \n\t"
771
// second group: mm2 carries over; new rows go into mm1 and mm0
"movq (%1, %3), %%mm1 \n\t"
772
"movq (%1, %%"REG_a"), %%mm0 \n\t"
773
PAVGB" 1(%1, %3), %%mm1 \n\t"
774
PAVGB" 1(%1, %%"REG_a"), %%mm0 \n\t"
775
"add %%"REG_a", %2 \n\t"
776
"add %%"REG_a", %1 \n\t"
777
PAVGB" %%mm1, %%mm2 \n\t"
778
PAVGB" %%mm0, %%mm1 \n\t"
779
PAVGB" (%2), %%mm2 \n\t"
780
PAVGB" (%2, %3), %%mm1 \n\t"
781
"movq %%mm2, (%2) \n\t"
782
"movq %%mm1, (%2, %3) \n\t"
783
"add %%"REG_a", %2 \n\t"
786
:"+g"(h), "+S"(pixels), "+D"(block)
787
:"r" ((long)line_size)
788
:"%"REG_a, "memory");
791
//FIXME the following could be optimized too ...
792
// put_no_rnd_pixels16_x2: 16-byte-wide variant built from two calls to the
// 8-wide put_no_rnd_pixels8_x2 routine, first on the left 8 columns, then on
// the right 8 columns (block+8, pixels+8), with the same line_size and h.
// NOTE(review): the closing brace is not visible in this listing.
static void DEF(put_no_rnd_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
793
DEF(put_no_rnd_pixels8_x2)(block , pixels , line_size, h);
794
DEF(put_no_rnd_pixels8_x2)(block+8, pixels+8, line_size, h);
796
// put_pixels16_y2: 16-byte-wide variant built from two calls to the 8-wide
// put_pixels8_y2 routine, first on the left 8 columns, then on the right
// 8 columns (block+8, pixels+8), with the same line_size and h.
// NOTE(review): the closing brace is not visible in this listing.
static void DEF(put_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
797
DEF(put_pixels8_y2)(block , pixels , line_size, h);
798
DEF(put_pixels8_y2)(block+8, pixels+8, line_size, h);
800
// put_no_rnd_pixels16_y2: 16-byte-wide variant built from two calls to the
// 8-wide put_no_rnd_pixels8_y2 routine, first on the left 8 columns, then on
// the right 8 columns (block+8, pixels+8), with the same line_size and h.
// NOTE(review): the closing brace is not visible in this listing.
static void DEF(put_no_rnd_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
801
DEF(put_no_rnd_pixels8_y2)(block , pixels , line_size, h);
802
DEF(put_no_rnd_pixels8_y2)(block+8, pixels+8, line_size, h);
804
// avg_pixels16: 16-byte-wide variant built from two calls to the 8-wide
// avg_pixels8 routine, first on the left 8 columns, then on the right
// 8 columns (block+8, pixels+8), with the same line_size and h.
// NOTE(review): the closing brace is not visible in this listing.
static void DEF(avg_pixels16)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
805
DEF(avg_pixels8)(block , pixels , line_size, h);
806
DEF(avg_pixels8)(block+8, pixels+8, line_size, h);
808
// avg_pixels16_x2: 16-byte-wide variant built from two calls to the 8-wide
// avg_pixels8_x2 routine, first on the left 8 columns, then on the right
// 8 columns (block+8, pixels+8), with the same line_size and h.
// NOTE(review): the closing brace is not visible in this listing.
static void DEF(avg_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
809
DEF(avg_pixels8_x2)(block , pixels , line_size, h);
810
DEF(avg_pixels8_x2)(block+8, pixels+8, line_size, h);
812
// avg_pixels16_y2: 16-byte-wide variant built from two calls to the 8-wide
// avg_pixels8_y2 routine, first on the left 8 columns, then on the right
// 8 columns (block+8, pixels+8), with the same line_size and h.
// NOTE(review): the closing brace is not visible in this listing.
static void DEF(avg_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
813
DEF(avg_pixels8_y2)(block , pixels , line_size, h);
814
DEF(avg_pixels8_y2)(block+8, pixels+8, line_size, h);
816
// avg_pixels16_xy2: 16-byte-wide variant built from two calls to the 8-wide
// avg_pixels8_xy2 routine, first on the left 8 columns, then on the right
// 8 columns (block+8, pixels+8), with the same line_size and h.
// NOTE(review): the closing brace is not visible in this listing.
static void DEF(avg_pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
817
DEF(avg_pixels8_xy2)(block , pixels , line_size, h);
818
DEF(avg_pixels8_xy2)(block+8, pixels+8, line_size, h);