2
* DSP utils mmx functions are compiled twice for rnd/no_rnd
3
* Copyright (c) 2000, 2001 Fabrice Bellard.
4
* Copyright (c) 2003-2004 Michael Niedermayer <michaelni@gmx.at>
6
* MMX optimization by Nick Kurshev <nickols_k@mail.ru>
7
* mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
8
* and improved by Zdenek Kabelac <kabi@users.sf.net>
10
* This file is part of FFmpeg.
12
* FFmpeg is free software; you can redistribute it and/or
13
* modify it under the terms of the GNU Lesser General Public
14
* License as published by the Free Software Foundation; either
15
* version 2.1 of the License, or (at your option) any later version.
17
* FFmpeg is distributed in the hope that it will be useful,
18
* but WITHOUT ANY WARRANTY; without even the implied warranty of
19
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20
* Lesser General Public License for more details.
22
* You should have received a copy of the GNU Lesser General Public
23
* License along with FFmpeg; if not, write to the Free Software
24
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
28
/*
 * put pixels8_x2: for each row, combines the 8 bytes at pixels with the
 * 8 bytes at pixels+1 through PAVGBP and stores the result to block.
 * Operand numbering per the output list below: %0 = h, %1 = pixels,
 * %2 = block.
 * NOTE(review): this excerpt has gaps (asm statement opener, loop
 * label/counter, input and clobber lists are not visible). %3 is presumably
 * line_size, and PAVGBP (defined outside this excerpt) presumably averages
 * (a,b)->c and (d,e)->f with the rnd/no_rnd rounding of the current build
 * -- confirm against the full file.
 */
static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
32
// REG_a = 2 * %3: the step used below to move both pointers down two rows.
"lea (%3, %3), %%"REG_a" \n\t"
35
// Rows 0 and 1: load each row at byte offsets 0 and 1.
"movq (%1), %%mm0 \n\t"
36
"movq 1(%1), %%mm1 \n\t"
37
"movq (%1, %3), %%mm2 \n\t"
38
"movq 1(%1, %3), %%mm3 \n\t"
39
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
40
"movq %%mm4, (%2) \n\t"
41
"movq %%mm5, (%2, %3) \n\t"
42
"add %%"REG_a", %1 \n\t"
43
"add %%"REG_a", %2 \n\t"
44
// Same pattern repeated for the next two rows.
"movq (%1), %%mm0 \n\t"
45
"movq 1(%1), %%mm1 \n\t"
46
"movq (%1, %3), %%mm2 \n\t"
47
"movq 1(%1, %3), %%mm3 \n\t"
48
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
49
"movq %%mm4, (%2) \n\t"
50
"movq %%mm5, (%2, %3) \n\t"
51
"add %%"REG_a", %1 \n\t"
52
"add %%"REG_a", %2 \n\t"
55
// Read-write operands: h in any location, pixels in the S register, block in the D register.
:"+g"(h), "+S"(pixels), "+D"(block)
60
/*
 * put pixels8_l2: combines 8-byte rows read from src1 with 8-byte rows read
 * from src2 (via PAVGB / PAVGBP) and stores the results to dst.
 * Operand numbering per the lists below: %0 = h, %1 = src1, %2 = src2,
 * %3 = dst, %4 = src1Stride, %5 = dstStride.
 * In the visible code src2 is read at offsets 0, 8, 16 and 24 from %2,
 * i.e. as consecutive 8-byte rows.
 * NOTE(review): the instructions that advance src1/src2/dst between rows,
 * the loop control and the asm opener/clobbers are not visible in this
 * excerpt; PAVGB/PAVGBP are defined elsewhere -- confirm their rounding.
 */
static void av_unused DEF(put, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
66
// Single-row step: one row of src1 combined with one row of src2.
"movq (%1), %%mm0 \n\t"
67
"movq (%2), %%mm1 \n\t"
70
PAVGB(%%mm0, %%mm1, %%mm4, %%mm6)
71
"movq %%mm4, (%3) \n\t"
76
// Two-row step: src2 rows at offsets 0 and 8.
"movq (%1), %%mm0 \n\t"
77
"movq (%2), %%mm1 \n\t"
79
"movq (%1), %%mm2 \n\t"
80
"movq 8(%2), %%mm3 \n\t"
82
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
83
"movq %%mm4, (%3) \n\t"
85
"movq %%mm5, (%3) \n\t"
87
// Two-row step: src2 rows at offsets 16 and 24.
"movq (%1), %%mm0 \n\t"
88
"movq 16(%2), %%mm1 \n\t"
90
"movq (%1), %%mm2 \n\t"
91
"movq 24(%2), %%mm3 \n\t"
94
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
95
"movq %%mm4, (%3) \n\t"
97
"movq %%mm5, (%3) \n\t"
101
// Two alternative output lists follow: h as a memory operand under PIC, or in
// the b register otherwise (presumably split by an #else not visible here).
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
102
:"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
104
:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
106
:"S"((long)src1Stride), "D"((long)dstStride)
110
/*
 * put pixels16_x2: 16-byte-wide variant of the x2 operation. For each row it
 * combines pixels[x] with pixels[x+1] through PAVGBP and stores to block,
 * handling the row as two 8-byte halves (offsets 0/1 and 8/9).
 * Operand numbering per the lists below: %0 = h, %1 = pixels, %2 = block,
 * %3 = line_size.
 * NOTE(review): asm opener, loop label/counter and clobber list are not
 * visible in this excerpt; PAVGBP is defined elsewhere -- confirm its
 * rounding behaviour against the macro definition.
 */
static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
114
// REG_a = 2 * line_size: step for moving down two rows at a time.
"lea (%3, %3), %%"REG_a" \n\t"
117
// Rows 0 and 1, left 8 bytes.
"movq (%1), %%mm0 \n\t"
118
"movq 1(%1), %%mm1 \n\t"
119
"movq (%1, %3), %%mm2 \n\t"
120
"movq 1(%1, %3), %%mm3 \n\t"
121
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
122
"movq %%mm4, (%2) \n\t"
123
"movq %%mm5, (%2, %3) \n\t"
124
// Rows 0 and 1, right 8 bytes.
"movq 8(%1), %%mm0 \n\t"
125
"movq 9(%1), %%mm1 \n\t"
126
"movq 8(%1, %3), %%mm2 \n\t"
127
"movq 9(%1, %3), %%mm3 \n\t"
128
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
129
"movq %%mm4, 8(%2) \n\t"
130
"movq %%mm5, 8(%2, %3) \n\t"
131
"add %%"REG_a", %1 \n\t"
132
"add %%"REG_a", %2 \n\t"
133
// Same pattern repeated for the next two rows, left 8 bytes.
"movq (%1), %%mm0 \n\t"
134
"movq 1(%1), %%mm1 \n\t"
135
"movq (%1, %3), %%mm2 \n\t"
136
"movq 1(%1, %3), %%mm3 \n\t"
137
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
138
"movq %%mm4, (%2) \n\t"
139
"movq %%mm5, (%2, %3) \n\t"
140
// Next two rows, right 8 bytes.
"movq 8(%1), %%mm0 \n\t"
141
"movq 9(%1), %%mm1 \n\t"
142
"movq 8(%1, %3), %%mm2 \n\t"
143
"movq 9(%1, %3), %%mm3 \n\t"
144
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
145
"movq %%mm4, 8(%2) \n\t"
146
"movq %%mm5, 8(%2, %3) \n\t"
147
"add %%"REG_a", %1 \n\t"
148
"add %%"REG_a", %2 \n\t"
151
// Read-write operands: h, pixels (S register), block (D register).
:"+g"(h), "+S"(pixels), "+D"(block)
152
// Input: the stride widened to long, in any general register.
:"r"((long)line_size)
156
/*
 * put pixels16_l2: 16-byte-wide variant of the l2 operation. Each row of
 * src1 (two 8-byte halves at offsets 0 and 8) is combined with a row of
 * src2 through PAVGBP and the result is stored to dst (offsets 0 and 8).
 * Operand numbering per the lists below: %0 = h, %1 = src1, %2 = src2,
 * %3 = dst, %4 = src1Stride, %5 = dstStride.
 * In the visible code src2 is read at offsets 0/8 and then 16/24 from %2,
 * i.e. as consecutive 16-byte rows.
 * NOTE(review): the pointer-advance instructions, loop control and asm
 * opener/clobbers are not visible in this excerpt; PAVGBP is defined
 * elsewhere -- confirm its rounding.
 */
static void av_unused DEF(put, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
162
// One full 16-byte row: both halves of src1 against both halves of src2.
"movq (%1), %%mm0 \n\t"
163
"movq (%2), %%mm1 \n\t"
164
"movq 8(%1), %%mm2 \n\t"
165
"movq 8(%2), %%mm3 \n\t"
168
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
169
"movq %%mm4, (%3) \n\t"
170
"movq %%mm5, 8(%3) \n\t"
175
// Row using src2 offsets 0 and 8.
"movq (%1), %%mm0 \n\t"
176
"movq (%2), %%mm1 \n\t"
177
"movq 8(%1), %%mm2 \n\t"
178
"movq 8(%2), %%mm3 \n\t"
180
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
181
"movq %%mm4, (%3) \n\t"
182
"movq %%mm5, 8(%3) \n\t"
184
// Row using src2 offsets 16 and 24.
"movq (%1), %%mm0 \n\t"
185
"movq 16(%2), %%mm1 \n\t"
186
"movq 8(%1), %%mm2 \n\t"
187
"movq 24(%2), %%mm3 \n\t"
189
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
190
"movq %%mm4, (%3) \n\t"
191
"movq %%mm5, 8(%3) \n\t"
196
// Two alternative output lists follow: h as a memory operand under PIC, or in
// the b register otherwise (presumably split by an #else not visible here).
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
197
:"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
199
:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
201
:"S"((long)src1Stride), "D"((long)dstStride)
205
/*
 * put pixels8_y2: combines each 8-byte row of pixels with the row directly
 * below it through PAVGBP and stores the result to block.
 * Operand numbering per the lists below: %0 = h, %1 = pixels, %2 = block,
 * %3 = line_size.
 * The last row loaded in one step stays in a register (mm2, then mm0) and
 * is reused as the upper row of the next step, so each source row is loaded
 * only once in the visible sequence.
 * NOTE(review): asm opener, loop label/counter and clobber list are not
 * visible in this excerpt; PAVGBP is defined elsewhere -- confirm its
 * rounding.
 */
static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
209
// REG_a = 2 * line_size.
"lea (%3, %3), %%"REG_a" \n\t"
210
// mm0 = first row.
"movq (%1), %%mm0 \n\t"
213
// mm1 = row +1, mm2 = row +2.
"movq (%1, %3), %%mm1 \n\t"
214
"movq (%1, %%"REG_a"),%%mm2 \n\t"
215
// Pairs (row+1, row+0) -> mm4 and (row+2, row+1) -> mm5.
PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5)
216
"movq %%mm4, (%2) \n\t"
217
"movq %%mm5, (%2, %3) \n\t"
218
"add %%"REG_a", %1 \n\t"
219
"add %%"REG_a", %2 \n\t"
220
// After the advance, mm2 still holds the row now at offset 0;
// load the next two rows into mm1 and mm0.
"movq (%1, %3), %%mm1 \n\t"
221
"movq (%1, %%"REG_a"),%%mm0 \n\t"
222
PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5)
223
"movq %%mm4, (%2) \n\t"
224
"movq %%mm5, (%2, %3) \n\t"
225
"add %%"REG_a", %1 \n\t"
226
"add %%"REG_a", %2 \n\t"
229
:"+g"(h), "+S"(pixels), "+D"(block)
230
:"r"((long)line_size)
234
/*
 * put pixels8_xy2: for each output row, sums four neighbouring source bytes
 * (x and x+1 on two adjacent rows) in 16-bit lanes, adds the constant held
 * in mm6, shifts right by 2 and stores the packed 8-byte result to block.
 * Operand numbering per the lists below: %0 = h, %1 = pixels, %2 = block,
 * %3 = line_size. REG_a is the running byte offset of the current row.
 * The horizontal sum of one row is kept in registers and reused for the
 * next output row; the two unrolled halves swap register roles
 * (mm4/mm5 <-> mm0/mm1).
 * NOTE(review): mm7 is used as the second unpack operand and is presumably
 * zeroed by an instruction not visible in this excerpt; loop control, any
 * pointer adjustment between the first row and the loop, and the clobber
 * list are also not visible -- confirm against the full file.
 */
static void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
237
SET_RND(mm6); // =2 for rnd and =1 for no_rnd version
239
// First row: mm4/mm5 = low/high word lanes of row[x] + row[x+1].
"movq (%1), %%mm0 \n\t"
240
"movq 1(%1), %%mm4 \n\t"
241
"movq %%mm0, %%mm1 \n\t"
242
"movq %%mm4, %%mm5 \n\t"
243
"punpcklbw %%mm7, %%mm0 \n\t"
244
"punpcklbw %%mm7, %%mm4 \n\t"
245
"punpckhbw %%mm7, %%mm1 \n\t"
246
"punpckhbw %%mm7, %%mm5 \n\t"
247
"paddusw %%mm0, %%mm4 \n\t"
248
"paddusw %%mm1, %%mm5 \n\t"
249
// Row offset starts at zero.
"xor %%"REG_a", %%"REG_a" \n\t"
253
// Next row: mm0/mm1 = row[x] + row[x+1] in word lanes.
"movq (%1, %%"REG_a"), %%mm0 \n\t"
254
"movq 1(%1, %%"REG_a"), %%mm2 \n\t"
255
"movq %%mm0, %%mm1 \n\t"
256
"movq %%mm2, %%mm3 \n\t"
257
"punpcklbw %%mm7, %%mm0 \n\t"
258
"punpcklbw %%mm7, %%mm2 \n\t"
259
"punpckhbw %%mm7, %%mm1 \n\t"
260
"punpckhbw %%mm7, %%mm3 \n\t"
261
"paddusw %%mm2, %%mm0 \n\t"
262
"paddusw %%mm3, %%mm1 \n\t"
263
// Previous-row sum + mm6 constant + current-row sum, then >> 2.
"paddusw %%mm6, %%mm4 \n\t"
264
"paddusw %%mm6, %%mm5 \n\t"
265
"paddusw %%mm0, %%mm4 \n\t"
266
"paddusw %%mm1, %%mm5 \n\t"
267
"psrlw $2, %%mm4 \n\t"
268
"psrlw $2, %%mm5 \n\t"
269
// Pack the two word halves back to 8 bytes and store the output row.
"packuswb %%mm5, %%mm4 \n\t"
270
"movq %%mm4, (%2, %%"REG_a") \n\t"
271
"add %3, %%"REG_a" \n\t"
273
// Second unrolled half: new row sum goes to mm4/mm5, the sum kept in
// mm0/mm1 acts as the previous row.
"movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3
274
"movq 1(%1, %%"REG_a"), %%mm4 \n\t"
275
"movq %%mm2, %%mm3 \n\t"
276
"movq %%mm4, %%mm5 \n\t"
277
"punpcklbw %%mm7, %%mm2 \n\t"
278
"punpcklbw %%mm7, %%mm4 \n\t"
279
"punpckhbw %%mm7, %%mm3 \n\t"
280
"punpckhbw %%mm7, %%mm5 \n\t"
281
"paddusw %%mm2, %%mm4 \n\t"
282
"paddusw %%mm3, %%mm5 \n\t"
283
"paddusw %%mm6, %%mm0 \n\t"
284
"paddusw %%mm6, %%mm1 \n\t"
285
"paddusw %%mm4, %%mm0 \n\t"
286
"paddusw %%mm5, %%mm1 \n\t"
287
"psrlw $2, %%mm0 \n\t"
288
"psrlw $2, %%mm1 \n\t"
289
"packuswb %%mm1, %%mm0 \n\t"
290
"movq %%mm0, (%2, %%"REG_a") \n\t"
291
"add %3, %%"REG_a" \n\t"
295
// h and pixels are read-write; block and the stride are inputs only, since
// rows are addressed through the REG_a offset.
:"+g"(h), "+S"(pixels)
296
:"D"(block), "r"((long)line_size)
301
/*
 * avg pixels4: 4-byte-wide step that loads operand %0 and operand %1,
 * combines them through PAVGB and writes the result back to operand %0.
 * NOTE(review): the operand lists, the surrounding loop and the pointer
 * updates are not visible in this excerpt; %0 is presumably the current
 * row of block and %1 the current row of pixels. PAVGB is defined
 * elsewhere -- confirm its rounding and what it expects in mm6.
 */
static void av_unused DEF(avg, pixels4)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
307
// movd moves 32 bits, i.e. four pixels.
"movd %0, %%mm0 \n\t"
308
"movd %1, %%mm1 \n\t"
309
PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
310
"movd %%mm2, %0 \n\t"
320
// in case more speed is needed - unrolling would certainly help
321
/*
 * avg pixels8: 8-byte-wide step that loads operand %0 and operand %1,
 * combines them through PAVGB and writes the result back to operand %0.
 * NOTE(review): the operand lists, the surrounding loop and the pointer
 * updates are not visible in this excerpt; %0 is presumably the current
 * row of block and %1 the current row of pixels. PAVGB is defined
 * elsewhere -- confirm its rounding and what it expects in mm6.
 */
static void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
327
// movq moves 64 bits, i.e. eight pixels.
"movq %0, %%mm0 \n\t"
328
"movq %1, %%mm1 \n\t"
329
PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
330
"movq %%mm2, %0 \n\t"
340
/*
 * avg pixels16: 16-byte-wide step done as two 8-byte halves. Each half loads
 * operand %0 and operand %1 (at offsets 0 and 8), combines them through
 * PAVGB and writes the result back to operand %0.
 * NOTE(review): the operand lists, the surrounding loop and the pointer
 * updates are not visible in this excerpt; %0 is presumably the current
 * row of block and %1 the current row of pixels. PAVGB is defined
 * elsewhere -- confirm its rounding and what it expects in mm6.
 */
static void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
346
// Left 8 bytes.
"movq %0, %%mm0 \n\t"
347
"movq %1, %%mm1 \n\t"
348
PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
349
"movq %%mm2, %0 \n\t"
350
// Right 8 bytes: the literal 8 is prepended to the memory operand text as a displacement.
"movq 8%0, %%mm0 \n\t"
351
"movq 8%1, %%mm1 \n\t"
352
PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
353
"movq %%mm2, 8%0 \n\t"
363
/*
 * avg pixels8_x2: 8-byte-wide step in two stages. First operand %1 at
 * offsets 0 and 1 are combined through PAVGB into mm2; then that result is
 * combined with the existing contents of operand %0 and written back to %0.
 * NOTE(review): the operand lists, the surrounding loop and the pointer
 * updates are not visible in this excerpt; %0 is presumably the current
 * row of block and %1 the current row of pixels. PAVGB is defined
 * elsewhere -- confirm its rounding and what it expects in mm6.
 */
static void DEF(avg, pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
369
"movq %1, %%mm0 \n\t"
370
"movq 1%1, %%mm1 \n\t"
371
// mm3 = current contents of the destination operand.
"movq %0, %%mm3 \n\t"
372
// Stage 1: horizontal pair -> mm2.
PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
373
// Stage 2: destination with stage-1 result -> mm0.
PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
374
"movq %%mm0, %0 \n\t"
383
/*
 * avg pixels8_l2: 8-byte-wide step in two stages. First *src1 (%1) and
 * *src2 (%2) are combined through PAVGB into mm2; then that result is
 * combined with the existing contents of operand %0 and written back to %0.
 * NOTE(review): the output operand list, the surrounding loop and the
 * pointer updates are not visible in this excerpt; %0 is presumably *dst.
 * PAVGB is defined elsewhere -- confirm its rounding and what it expects
 * in mm6.
 */
static av_unused void DEF(avg, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
389
"movq %1, %%mm0 \n\t"
390
"movq %2, %%mm1 \n\t"
391
// mm3 = current contents of the destination operand.
"movq %0, %%mm3 \n\t"
392
// Stage 1: src1 with src2 -> mm2.
PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
393
// Stage 2: destination with stage-1 result -> mm0.
PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
394
"movq %%mm0, %0 \n\t"
396
// Inputs: the two source rows as memory operands.
:"m"(*src1), "m"(*src2)
404
/*
 * avg pixels16_x2: 16-byte-wide variant of the two-stage x2 step, done as
 * two 8-byte halves. For each half, operand %1 at offsets n and n+1
 * (n = 0, then 8) are combined through PAVGB, the result is combined with
 * the existing contents of operand %0 at offset n and written back there.
 * NOTE(review): the operand lists, the surrounding loop and the pointer
 * updates are not visible in this excerpt; %0 is presumably the current
 * row of block and %1 the current row of pixels. PAVGB is defined
 * elsewhere -- confirm its rounding and what it expects in mm6.
 */
static void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
410
// Left 8 bytes.
"movq %1, %%mm0 \n\t"
411
"movq 1%1, %%mm1 \n\t"
412
"movq %0, %%mm3 \n\t"
413
PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
414
PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
415
"movq %%mm0, %0 \n\t"
416
// Right 8 bytes.
"movq 8%1, %%mm0 \n\t"
417
"movq 9%1, %%mm1 \n\t"
418
"movq 8%0, %%mm3 \n\t"
419
PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
420
PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
421
"movq %%mm0, 8%0 \n\t"
430
/*
 * avg pixels16_l2: 16-byte-wide variant of the two-stage l2 step, done as
 * two 8-byte halves. For each half, *src1 (%1) and *src2 (%2) at offset n
 * (n = 0, then 8) are combined through PAVGB, the result is combined with
 * the existing contents of operand %0 at offset n and written back there.
 * NOTE(review): the output operand list, the surrounding loop and the
 * pointer updates are not visible in this excerpt; %0 is presumably *dst.
 * PAVGB is defined elsewhere -- confirm its rounding and what it expects
 * in mm6.
 */
static av_unused void DEF(avg, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
436
// Left 8 bytes.
"movq %1, %%mm0 \n\t"
437
"movq %2, %%mm1 \n\t"
438
"movq %0, %%mm3 \n\t"
439
PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
440
PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
441
"movq %%mm0, %0 \n\t"
442
// Right 8 bytes.
"movq 8%1, %%mm0 \n\t"
443
"movq 8%2, %%mm1 \n\t"
444
"movq 8%0, %%mm3 \n\t"
445
PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
446
PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
447
"movq %%mm0, 8%0 \n\t"
449
// Inputs: the two source rows as memory operands.
:"m"(*src1), "m"(*src2)
457
/*
 * avg pixels8_y2: combines each 8-byte row of pixels with the row directly
 * below it through PAVGBP, then combines that result with the existing
 * contents of block through PAVGB and stores it back to block.
 * Operand numbering per the lists below: %0 = h, %1 = pixels, %2 = block,
 * %3 = line_size.
 * The last source row loaded in one step stays in a register (mm2, then
 * mm0) and is reused as the upper row of the next step.
 * NOTE(review): asm opener, loop label/counter, the setup of mm6 and the
 * clobber list are not visible in this excerpt; PAVGB/PAVGBP are defined
 * elsewhere -- confirm their rounding.
 */
static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
461
// REG_a = 2 * line_size.
"lea (%3, %3), %%"REG_a" \n\t"
462
// mm0 = first row.
"movq (%1), %%mm0 \n\t"
465
// mm1 = row +1, mm2 = row +2.
"movq (%1, %3), %%mm1 \n\t"
466
"movq (%1, %%"REG_a"), %%mm2 \n\t"
467
// Vertical pairs -> mm4 (rows 0,1) and mm5 (rows 1,2).
PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5)
468
// Blend each vertical result with the destination row already in block.
"movq (%2), %%mm3 \n\t"
469
PAVGB(%%mm3, %%mm4, %%mm0, %%mm6)
470
"movq (%2, %3), %%mm3 \n\t"
471
PAVGB(%%mm3, %%mm5, %%mm1, %%mm6)
472
"movq %%mm0, (%2) \n\t"
473
"movq %%mm1, (%2, %3) \n\t"
474
"add %%"REG_a", %1 \n\t"
475
"add %%"REG_a", %2 \n\t"
477
// After the advance, mm2 still holds the row now at offset 0;
// load the next two rows into mm1 and mm0.
"movq (%1, %3), %%mm1 \n\t"
478
"movq (%1, %%"REG_a"), %%mm0 \n\t"
479
PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5)
480
"movq (%2), %%mm3 \n\t"
481
PAVGB(%%mm3, %%mm4, %%mm2, %%mm6)
482
"movq (%2, %3), %%mm3 \n\t"
483
PAVGB(%%mm3, %%mm5, %%mm1, %%mm6)
484
"movq %%mm2, (%2) \n\t"
485
"movq %%mm1, (%2, %3) \n\t"
486
"add %%"REG_a", %1 \n\t"
487
"add %%"REG_a", %2 \n\t"
491
:"+g"(h), "+S"(pixels), "+D"(block)
492
:"r"((long)line_size)
496
// this routine is 'slightly' suboptimal but mostly unused
497
/*
 * avg pixels8_xy2: for each output row, sums four neighbouring source bytes
 * (x and x+1 on two adjacent rows) in 16-bit lanes, adds the constant held
 * in mm6, shifts right by 2, packs to 8 bytes, then combines that with the
 * existing contents of block through PAVGB and stores it back to block.
 * Operand numbering per the lists below: %0 = h, %1 = pixels, %2 = block,
 * %3 = line_size. REG_a is the running byte offset of the current row.
 * The two unrolled halves swap register roles (mm4/mm5 <-> mm0/mm1).
 * NOTE(review): mm7 is used as the second unpack operand and is presumably
 * zeroed by an instruction not visible in this excerpt; loop control, any
 * pointer adjustment between the first row and the loop, and the clobber
 * list are also not visible. PAVGB is defined elsewhere -- confirm that its
 * fourth argument is meant to hold the 0xFE byte mask built in mm2 below.
 */
static void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
500
SET_RND(mm6); // =2 for rnd and =1 for no_rnd version
502
// First row: mm4/mm5 = low/high word lanes of row[x] + row[x+1].
"movq (%1), %%mm0 \n\t"
503
"movq 1(%1), %%mm4 \n\t"
504
"movq %%mm0, %%mm1 \n\t"
505
"movq %%mm4, %%mm5 \n\t"
506
"punpcklbw %%mm7, %%mm0 \n\t"
507
"punpcklbw %%mm7, %%mm4 \n\t"
508
"punpckhbw %%mm7, %%mm1 \n\t"
509
"punpckhbw %%mm7, %%mm5 \n\t"
510
"paddusw %%mm0, %%mm4 \n\t"
511
"paddusw %%mm1, %%mm5 \n\t"
512
// Row offset starts at zero.
"xor %%"REG_a", %%"REG_a" \n\t"
516
// Next row: mm0/mm1 = row[x] + row[x+1] in word lanes.
"movq (%1, %%"REG_a"), %%mm0 \n\t"
517
"movq 1(%1, %%"REG_a"), %%mm2 \n\t"
518
"movq %%mm0, %%mm1 \n\t"
519
"movq %%mm2, %%mm3 \n\t"
520
"punpcklbw %%mm7, %%mm0 \n\t"
521
"punpcklbw %%mm7, %%mm2 \n\t"
522
"punpckhbw %%mm7, %%mm1 \n\t"
523
"punpckhbw %%mm7, %%mm3 \n\t"
524
"paddusw %%mm2, %%mm0 \n\t"
525
"paddusw %%mm3, %%mm1 \n\t"
526
// Previous-row sum + mm6 constant + current-row sum, then >> 2.
"paddusw %%mm6, %%mm4 \n\t"
527
"paddusw %%mm6, %%mm5 \n\t"
528
"paddusw %%mm0, %%mm4 \n\t"
529
"paddusw %%mm1, %%mm5 \n\t"
530
"psrlw $2, %%mm4 \n\t"
531
"psrlw $2, %%mm5 \n\t"
532
// mm3 = destination row currently in block.
"movq (%2, %%"REG_a"), %%mm3 \n\t"
533
"packuswb %%mm5, %%mm4 \n\t"
534
// mm6 holds the SET_RND constant here, so a separate constant is built in
// mm2: all bits set, then each byte doubled (0xFF + 0xFF wraps to 0xFE).
"pcmpeqd %%mm2, %%mm2 \n\t"
535
"paddb %%mm2, %%mm2 \n\t"
536
// Blend the destination row with the computed row -> mm5.
PAVGB(%%mm3, %%mm4, %%mm5, %%mm2)
537
"movq %%mm5, (%2, %%"REG_a") \n\t"
538
"add %3, %%"REG_a" \n\t"
540
// Second unrolled half: new row sum goes to mm4/mm5, the sum kept in
// mm0/mm1 acts as the previous row.
"movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3
541
"movq 1(%1, %%"REG_a"), %%mm4 \n\t"
542
"movq %%mm2, %%mm3 \n\t"
543
"movq %%mm4, %%mm5 \n\t"
544
"punpcklbw %%mm7, %%mm2 \n\t"
545
"punpcklbw %%mm7, %%mm4 \n\t"
546
"punpckhbw %%mm7, %%mm3 \n\t"
547
"punpckhbw %%mm7, %%mm5 \n\t"
548
"paddusw %%mm2, %%mm4 \n\t"
549
"paddusw %%mm3, %%mm5 \n\t"
550
"paddusw %%mm6, %%mm0 \n\t"
551
"paddusw %%mm6, %%mm1 \n\t"
552
"paddusw %%mm4, %%mm0 \n\t"
553
"paddusw %%mm5, %%mm1 \n\t"
554
"psrlw $2, %%mm0 \n\t"
555
"psrlw $2, %%mm1 \n\t"
556
"movq (%2, %%"REG_a"), %%mm3 \n\t"
557
"packuswb %%mm1, %%mm0 \n\t"
558
// Rebuild the 0xFE byte constant in mm2 (it was overwritten by the row load above).
"pcmpeqd %%mm2, %%mm2 \n\t"
559
"paddb %%mm2, %%mm2 \n\t"
560
PAVGB(%%mm3, %%mm0, %%mm1, %%mm2)
561
"movq %%mm1, (%2, %%"REG_a") \n\t"
562
"add %3, %%"REG_a" \n\t"
566
// h and pixels are read-write; block and the stride are inputs only, since
// rows are addressed through the REG_a offset.
:"+g"(h), "+S"(pixels)
567
:"D"(block), "r"((long)line_size)
572
/*
 * put pixels16_y2: 16-byte-wide y2 built from two calls to the 8-byte-wide
 * version, one at byte offset 0 and one at byte offset 8 of both
 * block and pixels, with the same line_size and h.
 */
static void DEF(put, pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
573
DEF(put, pixels8_y2)(block , pixels , line_size, h);
574
DEF(put, pixels8_y2)(block+8, pixels+8, line_size, h);
577
/*
 * put pixels16_xy2: 16-byte-wide xy2 built from two calls to the 8-byte-wide
 * version, one at byte offset 0 and one at byte offset 8 of both
 * block and pixels, with the same line_size and h.
 */
static void DEF(put, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
578
DEF(put, pixels8_xy2)(block , pixels , line_size, h);
579
DEF(put, pixels8_xy2)(block+8, pixels+8, line_size, h);
582
/*
 * avg pixels16_y2: 16-byte-wide y2 built from two calls to the 8-byte-wide
 * avg version, one at byte offset 0 and one at byte offset 8 of both
 * block and pixels, with the same line_size and h.
 */
static void DEF(avg, pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
583
DEF(avg, pixels8_y2)(block , pixels , line_size, h);
584
DEF(avg, pixels8_y2)(block+8, pixels+8, line_size, h);
587
/*
 * avg pixels16_xy2: 16-byte-wide xy2 built from two calls to the 8-byte-wide
 * avg version, one at byte offset 0 and one at byte offset 8 of both
 * block and pixels, with the same line_size and h.
 */
static void DEF(avg, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
588
DEF(avg, pixels8_xy2)(block , pixels , line_size, h);
589
DEF(avg, pixels8_xy2)(block+8, pixels+8, line_size, h);