/*
 * MMX optimized DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 */

#include "libavutil/x86_cpu.h"
#include "libavcodec/dsputil.h"
#include "libavcodec/mpegvideo.h"
#include "libavcodec/mathops.h"
#include "dsputil_mmx.h"

static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
        "mov $-128, %%"REG_a"           \n\t"
        "pxor %%mm7, %%mm7              \n\t"
        "movq (%0), %%mm0               \n\t"
        "movq (%0, %2), %%mm2           \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "movq %%mm0, (%1, %%"REG_a")    \n\t"
        "movq %%mm1, 8(%1, %%"REG_a")   \n\t"
        "movq %%mm2, 16(%1, %%"REG_a")  \n\t"
        "movq %%mm3, 24(%1, %%"REG_a")  \n\t"
        "add $32, %%"REG_a"             \n\t"
        : "r" (block+64), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*2)

static void get_pixels_sse2(DCTELEM *block, const uint8_t *pixels, int line_size)
        "pxor %%xmm7, %%xmm7            \n\t"
        "movq (%0), %%xmm0              \n\t"
        "movq (%0, %2), %%xmm1          \n\t"
        "movq (%0, %2,2), %%xmm2        \n\t"
        "movq (%0, %3), %%xmm3          \n\t"
        "lea (%0,%2,4), %0              \n\t"
        "punpcklbw %%xmm7, %%xmm0       \n\t"
        "punpcklbw %%xmm7, %%xmm1       \n\t"
        "punpcklbw %%xmm7, %%xmm2       \n\t"
        "punpcklbw %%xmm7, %%xmm3       \n\t"
        "movdqa %%xmm0, (%1)            \n\t"
        "movdqa %%xmm1, 16(%1)          \n\t"
        "movdqa %%xmm2, 32(%1)          \n\t"
        "movdqa %%xmm3, 48(%1)          \n\t"
        "movq (%0), %%xmm0              \n\t"
        "movq (%0, %2), %%xmm1          \n\t"
        "movq (%0, %2,2), %%xmm2        \n\t"
        "movq (%0, %3), %%xmm3          \n\t"
        "punpcklbw %%xmm7, %%xmm0       \n\t"
        "punpcklbw %%xmm7, %%xmm1       \n\t"
        "punpcklbw %%xmm7, %%xmm2       \n\t"
        "punpcklbw %%xmm7, %%xmm3       \n\t"
        "movdqa %%xmm0, 64(%1)          \n\t"
        "movdqa %%xmm1, 80(%1)          \n\t"
        "movdqa %%xmm2, 96(%1)          \n\t"
        "movdqa %%xmm3, 112(%1)         \n\t"
        : "r" (block), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3)

static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
        "pxor %%mm7, %%mm7              \n\t"
        "mov $-128, %%"REG_a"           \n\t"
        "movq (%0), %%mm0               \n\t"
        "movq (%1), %%mm2               \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "psubw %%mm2, %%mm0             \n\t"
        "psubw %%mm3, %%mm1             \n\t"
        "movq %%mm0, (%2, %%"REG_a")    \n\t"
        "movq %%mm1, 8(%2, %%"REG_a")   \n\t"
        "add $16, %%"REG_a"             \n\t"
        : "+r" (s1), "+r" (s2)
        : "r" (block+64), "r" ((x86_reg)stride)

static int pix_sum16_mmx(uint8_t * pix, int line_size){
    const int h=16;
    int sum;
    x86_reg index= -line_size*h;

        "pxor %%mm7, %%mm7              \n\t"
        "pxor %%mm6, %%mm6              \n\t"
        "movq (%2, %1), %%mm0           \n\t"
        "movq (%2, %1), %%mm1           \n\t"
        "movq 8(%2, %1), %%mm2          \n\t"
        "movq 8(%2, %1), %%mm3          \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "paddw %%mm0, %%mm1             \n\t"
        "paddw %%mm2, %%mm3             \n\t"
        "paddw %%mm1, %%mm3             \n\t"
        "paddw %%mm3, %%mm6             \n\t"
        "movq %%mm6, %%mm5              \n\t"
        "psrlq $32, %%mm6               \n\t"
        "paddw %%mm5, %%mm6             \n\t"
        "movq %%mm6, %%mm5              \n\t"
        "psrlq $16, %%mm6               \n\t"
        "paddw %%mm5, %%mm6             \n\t"
        "movd %%mm6, %0                 \n\t"
        "andl $0xFFFF, %0               \n\t"
        : "=&r" (sum), "+r" (index)
        : "r" (pix - index), "r" ((x86_reg)line_size)

static int pix_norm1_mmx(uint8_t *pix, int line_size) {
    int tmp;

        "movq (%0),%%mm2\n"     /* mm2 = pix[0-7] */
        "movq 8(%0),%%mm3\n"    /* mm3 = pix[8-15] */

        "movq %%mm2,%%mm1\n"    /* mm1 = mm2 = pix[0-7] */

        "punpckhbw %%mm0,%%mm1\n" /* mm1 = [pix4-7] */
        "punpcklbw %%mm0,%%mm2\n" /* mm2 = [pix0-3] */

        "movq %%mm3,%%mm4\n"    /* mm4 = mm3 = pix[8-15] */
        "punpckhbw %%mm0,%%mm3\n" /* mm3 = [pix12-15] */
        "punpcklbw %%mm0,%%mm4\n" /* mm4 = [pix8-11] */

        "pmaddwd %%mm1,%%mm1\n" /* mm1 = (pix4^2+pix5^2,pix6^2+pix7^2) */
        "pmaddwd %%mm2,%%mm2\n" /* mm2 = (pix0^2+pix1^2,pix2^2+pix3^2) */

        "pmaddwd %%mm3,%%mm3\n"
        "pmaddwd %%mm4,%%mm4\n"

        "paddd %%mm1,%%mm2\n"   /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2,
                                          pix2^2+pix3^2+pix6^2+pix7^2) */
        "paddd %%mm3,%%mm4\n"
        "paddd %%mm2,%%mm7\n"

        "paddd %%mm4,%%mm7\n"

        "psrlq $32, %%mm7\n"    /* shift hi dword to lo */
        "paddd %%mm7,%%mm1\n"

        : "+r" (pix), "=r"(tmp) : "r" ((x86_reg)line_size) : "%ecx" );

static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;

        "pxor %%mm0,%%mm0\n"    /* mm0 = 0 */
        "pxor %%mm7,%%mm7\n"    /* mm7 holds the sum */

        "movq (%0),%%mm1\n"     /* mm1 = pix1[0][0-7] */
        "movq (%1),%%mm2\n"     /* mm2 = pix2[0][0-7] */
        "movq (%0,%3),%%mm3\n"  /* mm3 = pix1[1][0-7] */
        "movq (%1,%3),%%mm4\n"  /* mm4 = pix2[1][0-7] */

        /* todo: mm1-mm2, mm3-mm4 */
        /* algo: subtract mm1 from mm2 with saturation and vice versa */
        /*       OR the results to get absolute difference */
        "psubusb %%mm2,%%mm1\n"
        "psubusb %%mm4,%%mm3\n"
        "psubusb %%mm5,%%mm2\n"
        "psubusb %%mm6,%%mm4\n"

        /* now convert to 16-bit vectors so we can square them */
        "punpckhbw %%mm0,%%mm2\n"
        "punpckhbw %%mm0,%%mm4\n"
        "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
        "punpcklbw %%mm0,%%mm3\n" /* mm3 now spread over (mm3,mm4) */

        "pmaddwd %%mm2,%%mm2\n"
        "pmaddwd %%mm4,%%mm4\n"
        "pmaddwd %%mm1,%%mm1\n"
        "pmaddwd %%mm3,%%mm3\n"

        "lea (%0,%3,2), %0\n"   /* pix1 += 2*line_size */
        "lea (%1,%3,2), %1\n"   /* pix2 += 2*line_size */

        "paddd %%mm2,%%mm1\n"
        "paddd %%mm4,%%mm3\n"
        "paddd %%mm1,%%mm7\n"
        "paddd %%mm3,%%mm7\n"

        "psrlq $32, %%mm7\n"    /* shift hi dword to lo */
        "paddd %%mm7,%%mm1\n"
        : "+r" (pix1), "+r" (pix2), "=r"(tmp)
        : "r" ((x86_reg)line_size), "m" (h)

static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;

        "pxor %%mm0,%%mm0\n"    /* mm0 = 0 */
        "pxor %%mm7,%%mm7\n"    /* mm7 holds the sum */

        "movq (%0),%%mm1\n"     /* mm1 = pix1[0-7] */
        "movq (%1),%%mm2\n"     /* mm2 = pix2[0-7] */
        "movq 8(%0),%%mm3\n"    /* mm3 = pix1[8-15] */
        "movq 8(%1),%%mm4\n"    /* mm4 = pix2[8-15] */

        /* todo: mm1-mm2, mm3-mm4 */
        /* algo: subtract mm1 from mm2 with saturation and vice versa */
        /*       OR the results to get absolute difference */
        "psubusb %%mm2,%%mm1\n"
        "psubusb %%mm4,%%mm3\n"
        "psubusb %%mm5,%%mm2\n"
        "psubusb %%mm6,%%mm4\n"

        /* now convert to 16-bit vectors so we can square them */
        "punpckhbw %%mm0,%%mm2\n"
        "punpckhbw %%mm0,%%mm4\n"
        "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
        "punpcklbw %%mm0,%%mm3\n" /* mm3 now spread over (mm3,mm4) */

        "pmaddwd %%mm2,%%mm2\n"
        "pmaddwd %%mm4,%%mm4\n"
        "pmaddwd %%mm1,%%mm1\n"
        "pmaddwd %%mm3,%%mm3\n"

        "paddd %%mm2,%%mm1\n"
        "paddd %%mm4,%%mm3\n"
        "paddd %%mm1,%%mm7\n"
        "paddd %%mm3,%%mm7\n"

        "psrlq $32, %%mm7\n"    /* shift hi dword to lo */
        "paddd %%mm7,%%mm1\n"
        : "+r" (pix1), "+r" (pix2), "=r"(tmp)
        : "r" ((x86_reg)line_size), "m" (h)

static int sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;

        "pxor %%xmm0,%%xmm0\n"      /* xmm0 = 0 */
        "pxor %%xmm7,%%xmm7\n"      /* xmm7 holds the sum */

        "movdqu (%0),%%xmm1\n"      /* xmm1 = pix1[0][0-15] */
        "movdqu (%1),%%xmm2\n"      /* xmm2 = pix2[0][0-15] */
        "movdqu (%0,%4),%%xmm3\n"   /* xmm3 = pix1[1][0-15] */
        "movdqu (%1,%4),%%xmm4\n"   /* xmm4 = pix2[1][0-15] */

        /* todo: xmm1-xmm2, xmm3-xmm4 */
        /* algo: subtract xmm1 from xmm2 with saturation and vice versa */
        /*       OR the results to get absolute difference */
        "movdqa %%xmm1,%%xmm5\n"
        "movdqa %%xmm3,%%xmm6\n"
        "psubusb %%xmm2,%%xmm1\n"
        "psubusb %%xmm4,%%xmm3\n"
        "psubusb %%xmm5,%%xmm2\n"
        "psubusb %%xmm6,%%xmm4\n"

        "por %%xmm1,%%xmm2\n"
        "por %%xmm3,%%xmm4\n"

        /* now convert to 16-bit vectors so we can square them */
        "movdqa %%xmm2,%%xmm1\n"
        "movdqa %%xmm4,%%xmm3\n"

        "punpckhbw %%xmm0,%%xmm2\n"
        "punpckhbw %%xmm0,%%xmm4\n"
        "punpcklbw %%xmm0,%%xmm1\n" /* xmm1 now spread over (xmm1,xmm2) */
        "punpcklbw %%xmm0,%%xmm3\n" /* xmm3 now spread over (xmm3,xmm4) */

        "pmaddwd %%xmm2,%%xmm2\n"
        "pmaddwd %%xmm4,%%xmm4\n"
        "pmaddwd %%xmm1,%%xmm1\n"
        "pmaddwd %%xmm3,%%xmm3\n"

        "lea (%0,%4,2), %0\n"       /* pix1 += 2*line_size */
        "lea (%1,%4,2), %1\n"       /* pix2 += 2*line_size */

        "paddd %%xmm2,%%xmm1\n"
        "paddd %%xmm4,%%xmm3\n"
        "paddd %%xmm1,%%xmm7\n"
        "paddd %%xmm3,%%xmm7\n"

        "movdqa %%xmm7,%%xmm1\n"
        "psrldq $8, %%xmm7\n"       /* shift hi qword to lo */
        "paddd %%xmm1,%%xmm7\n"
        "movdqa %%xmm7,%%xmm1\n"
        "psrldq $4, %%xmm7\n"       /* shift hi dword to lo */
        "paddd %%xmm1,%%xmm7\n"
        : "+r" (pix1), "+r" (pix2), "+r"(h), "=r"(tmp)
        : "r" ((x86_reg)line_size));

static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) {
    int tmp;

        "movq %%mm0, %%mm1\n"
        "movq %%mm0, %%mm2\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm0\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm2\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm0\n"
        "psubw %%mm3, %%mm2\n"

        "movq %%mm4, %%mm1\n"
        "movq %%mm4, %%mm5\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm4\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm5\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm4\n"
        "psubw %%mm3, %%mm5\n"
        "psubw %%mm4, %%mm0\n"
        "psubw %%mm5, %%mm2\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm0, %%mm3\n\t"
        "pcmpgtw %%mm2, %%mm1\n\t"
        "pxor %%mm3, %%mm0\n"
        "pxor %%mm1, %%mm2\n"
        "psubw %%mm3, %%mm0\n"
        "psubw %%mm1, %%mm2\n"
        "paddw %%mm0, %%mm2\n"
        "paddw %%mm2, %%mm6\n"

        "movq %%mm0, %%mm1\n"
        "movq %%mm0, %%mm2\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm0\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm2\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm0\n"
        "psubw %%mm3, %%mm2\n"
        "psubw %%mm0, %%mm4\n"
        "psubw %%mm2, %%mm5\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm4, %%mm3\n\t"
        "pcmpgtw %%mm5, %%mm1\n\t"
        "pxor %%mm3, %%mm4\n"
        "pxor %%mm1, %%mm5\n"
        "psubw %%mm3, %%mm4\n"
        "psubw %%mm1, %%mm5\n"
        "paddw %%mm4, %%mm5\n"
        "paddw %%mm5, %%mm6\n"

        "movq %%mm4, %%mm1\n"
        "movq %%mm4, %%mm5\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm4\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm5\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm4\n"
        "psubw %%mm3, %%mm5\n"
        "psubw %%mm4, %%mm0\n"
        "psubw %%mm5, %%mm2\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm0, %%mm3\n\t"
        "pcmpgtw %%mm2, %%mm1\n\t"
        "pxor %%mm3, %%mm0\n"
        "pxor %%mm1, %%mm2\n"
        "psubw %%mm3, %%mm0\n"
        "psubw %%mm1, %%mm2\n"
        "paddw %%mm0, %%mm2\n"
        "paddw %%mm2, %%mm6\n"

        "movq %%mm6, %%mm0\n"
        "punpcklwd %%mm7,%%mm0\n"
        "punpckhwd %%mm7,%%mm6\n"
        "paddd %%mm0, %%mm6\n"

        "paddd %%mm6,%%mm0\n"
        : "+r" (pix1), "=r"(tmp)
        : "r" ((x86_reg)line_size), "g" (h-2)

static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) {
    int tmp;
    uint8_t * pix= pix1;

        "movq %%mm0, %%mm2\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm0\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm2\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm0\n"
        "psubw %%mm3, %%mm2\n"

        "movq %%mm4, %%mm5\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm4\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm5\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm4\n"
        "psubw %%mm3, %%mm5\n"
        "psubw %%mm4, %%mm0\n"
        "psubw %%mm5, %%mm2\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm0, %%mm3\n\t"
        "pcmpgtw %%mm2, %%mm1\n\t"
        "pxor %%mm3, %%mm0\n"
        "pxor %%mm1, %%mm2\n"
        "psubw %%mm3, %%mm0\n"
        "psubw %%mm1, %%mm2\n"
        "paddw %%mm0, %%mm2\n"
        "paddw %%mm2, %%mm6\n"

        "movq %%mm0, %%mm2\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm0\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm2\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm0\n"
        "psubw %%mm3, %%mm2\n"
        "psubw %%mm0, %%mm4\n"
        "psubw %%mm2, %%mm5\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm4, %%mm3\n\t"
        "pcmpgtw %%mm5, %%mm1\n\t"
        "pxor %%mm3, %%mm4\n"
        "pxor %%mm1, %%mm5\n"
        "psubw %%mm3, %%mm4\n"
        "psubw %%mm1, %%mm5\n"
        "paddw %%mm4, %%mm5\n"
        "paddw %%mm5, %%mm6\n"

        "movq %%mm4, %%mm5\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm4\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm5\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm4\n"
        "psubw %%mm3, %%mm5\n"
        "psubw %%mm4, %%mm0\n"
        "psubw %%mm5, %%mm2\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm0, %%mm3\n\t"
        "pcmpgtw %%mm2, %%mm1\n\t"
        "pxor %%mm3, %%mm0\n"
        "pxor %%mm1, %%mm2\n"
        "psubw %%mm3, %%mm0\n"
        "psubw %%mm1, %%mm2\n"
        "paddw %%mm0, %%mm2\n"
        "paddw %%mm2, %%mm6\n"

        "movq %%mm6, %%mm0\n"
        "punpcklwd %%mm7,%%mm0\n"
        "punpckhwd %%mm7,%%mm6\n"
        "paddd %%mm0, %%mm6\n"

        "paddd %%mm6,%%mm0\n"
        : "+r" (pix1), "=r"(tmp)
        : "r" ((x86_reg)line_size), "g" (h-2)

    return tmp + hf_noise8_mmx(pix+8, line_size, h);

static int nsse16_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    MpegEncContext *c = p;
    int score1, score2;

    if(c) score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h);
    else  score1 = sse16_mmx(c, pix1, pix2, line_size, h);
    score2= hf_noise16_mmx(pix1, line_size, h) - hf_noise16_mmx(pix2, line_size, h);

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}

static int nsse8_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    MpegEncContext *c = p;
    int score1= sse8_mmx(c, pix1, pix2, line_size, h);
    int score2= hf_noise8_mmx(pix1, line_size, h) - hf_noise8_mmx(pix2, line_size, h);

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}

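/* Both nsse functions score a block pair as plain SSE plus a penalty for a
 * mismatch in high-frequency "noise", where the noise measure is, roughly,
 * the sum of absolute second differences of neighbouring pixels as computed
 * by hf_noise8/16_mmx above; i.e.
 *     score = sse + weight * |noise(pix1) - noise(pix2)|
 * with weight = c->avctx->nsse_weight, or 8 when no context is available. */
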
static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
    int tmp;

    assert((((int)pix) & 7) == 0);
    assert((line_size & 7) == 0);

#define SUM(in0, in1, out0, out1) \
    "movq (%0), %%mm2\n"\
    "movq 8(%0), %%mm3\n"\
    "movq %%mm2, " #out0 "\n"\
    "movq %%mm3, " #out1 "\n"\
    "psubusb " #in0 ", %%mm2\n"\
    "psubusb " #in1 ", %%mm3\n"\
    "psubusb " #out0 ", " #in0 "\n"\
    "psubusb " #out1 ", " #in1 "\n"\
    "por %%mm2, " #in0 "\n"\
    "por %%mm3, " #in1 "\n"\
    "movq " #in0 ", %%mm2\n"\
    "movq " #in1 ", %%mm3\n"\
    "punpcklbw %%mm7, " #in0 "\n"\
    "punpcklbw %%mm7, " #in1 "\n"\
    "punpckhbw %%mm7, %%mm2\n"\
    "punpckhbw %%mm7, %%mm3\n"\
    "paddw " #in1 ", " #in0 "\n"\
    "paddw %%mm3, %%mm2\n"\
    "paddw %%mm2, " #in0 "\n"\
    "paddw " #in0 ", %%mm6\n"

    SUM(%%mm4, %%mm5, %%mm0, %%mm1)

    SUM(%%mm0, %%mm1, %%mm4, %%mm5)

    "paddw %%mm6,%%mm0\n"

    "paddw %%mm6,%%mm0\n"

    : "+r" (pix), "=r"(tmp)
    : "r" ((x86_reg)line_size), "m" (h)

static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
    int tmp;

    assert((((int)pix) & 7) == 0);
    assert((line_size & 7) == 0);

#define SUM(in0, in1, out0, out1) \
    "movq (%0), " #out0 "\n"\
    "movq 8(%0), " #out1 "\n"\
    "psadbw " #out0 ", " #in0 "\n"\
    "psadbw " #out1 ", " #in1 "\n"\
    "paddw " #in1 ", " #in0 "\n"\
    "paddw " #in0 ", %%mm6\n"

    SUM(%%mm4, %%mm5, %%mm0, %%mm1)

    SUM(%%mm0, %%mm1, %%mm4, %%mm5)

    : "+r" (pix), "=r"(tmp)
    : "r" ((x86_reg)line_size), "m" (h)

static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;

    assert((((int)pix1) & 7) == 0);
    assert((((int)pix2) & 7) == 0);
    assert((line_size & 7) == 0);

#define SUM(in0, in1, out0, out1) \
    "movq (%0),%%mm2\n"\
    "movq (%1)," #out0 "\n"\
    "movq 8(%0),%%mm3\n"\
    "movq 8(%1)," #out1 "\n"\
    "psubb " #out0 ", %%mm2\n"\
    "psubb " #out1 ", %%mm3\n"\
    "pxor %%mm7, %%mm2\n"\
    "pxor %%mm7, %%mm3\n"\
    "movq %%mm2, " #out0 "\n"\
    "movq %%mm3, " #out1 "\n"\
    "psubusb " #in0 ", %%mm2\n"\
    "psubusb " #in1 ", %%mm3\n"\
    "psubusb " #out0 ", " #in0 "\n"\
    "psubusb " #out1 ", " #in1 "\n"\
    "por %%mm2, " #in0 "\n"\
    "por %%mm3, " #in1 "\n"\
    "movq " #in0 ", %%mm2\n"\
    "movq " #in1 ", %%mm3\n"\
    "punpcklbw %%mm7, " #in0 "\n"\
    "punpcklbw %%mm7, " #in1 "\n"\
    "punpckhbw %%mm7, %%mm2\n"\
    "punpckhbw %%mm7, %%mm3\n"\
    "paddw " #in1 ", " #in0 "\n"\
    "paddw %%mm3, %%mm2\n"\
    "paddw %%mm2, " #in0 "\n"\
    "paddw " #in0 ", %%mm6\n"

    "pcmpeqw %%mm7,%%mm7\n"
    "psllw $15, %%mm7\n"
    "packsswb %%mm7, %%mm7\n"

    "psubb %%mm2, %%mm0\n"
    "psubb %%mm3, %%mm1\n"
    "pxor %%mm7, %%mm0\n"
    "pxor %%mm7, %%mm1\n"

    SUM(%%mm4, %%mm5, %%mm0, %%mm1)

    SUM(%%mm0, %%mm1, %%mm4, %%mm5)

    "paddw %%mm6,%%mm0\n"

    "paddw %%mm6,%%mm0\n"
    : "+r" (pix1), "+r" (pix2), "=r"(tmp)
    : "r" ((x86_reg)line_size), "m" (h)

static int vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;

    assert((((int)pix1) & 7) == 0);
    assert((((int)pix2) & 7) == 0);
    assert((line_size & 7) == 0);

#define SUM(in0, in1, out0, out1) \
    "movq (%0)," #out0 "\n"\
    "movq (%1),%%mm2\n"\
    "movq 8(%0)," #out1 "\n"\
    "movq 8(%1),%%mm3\n"\
    "psubb %%mm2, " #out0 "\n"\
    "psubb %%mm3, " #out1 "\n"\
    "pxor %%mm7, " #out0 "\n"\
    "pxor %%mm7, " #out1 "\n"\
    "psadbw " #out0 ", " #in0 "\n"\
    "psadbw " #out1 ", " #in1 "\n"\
    "paddw " #in1 ", " #in0 "\n"\
    "paddw " #in0 ", %%mm6\n"

    "pcmpeqw %%mm7,%%mm7\n"
    "psllw $15, %%mm7\n"
    "packsswb %%mm7, %%mm7\n"

    "psubb %%mm2, %%mm0\n"
    "psubb %%mm3, %%mm1\n"
    "pxor %%mm7, %%mm0\n"
    "pxor %%mm7, %%mm1\n"

    SUM(%%mm4, %%mm5, %%mm0, %%mm1)

    SUM(%%mm0, %%mm1, %%mm4, %%mm5)

    : "+r" (pix1), "+r" (pix2), "=r"(tmp)
    : "r" ((x86_reg)line_size), "m" (h)

static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
        "movq (%2, %0), %%mm0           \n\t"
        "movq (%1, %0), %%mm1           \n\t"
        "psubb %%mm0, %%mm1             \n\t"
        "movq %%mm1, (%3, %0)           \n\t"
        "movq 8(%2, %0), %%mm0          \n\t"
        "movq 8(%1, %0), %%mm1          \n\t"
        "psubb %%mm0, %%mm1             \n\t"
        "movq %%mm1, 8(%3, %0)          \n\t"
        : "r"(src1), "r"(src2), "r"(dst), "r"((x86_reg)w-15)

        dst[i+0] = src1[i+0]-src2[i+0];

static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
        "movq -1(%1, %0), %%mm0         \n\t" // LT
        "movq (%1, %0), %%mm1           \n\t" // T
        "movq -1(%2, %0), %%mm2         \n\t" // L
        "movq (%2, %0), %%mm3           \n\t" // X
        "movq %%mm2, %%mm4              \n\t" // L
        "psubb %%mm0, %%mm2             \n\t"
        "paddb %%mm1, %%mm2             \n\t" // L + T - LT
        "movq %%mm4, %%mm5              \n\t" // L
        "pmaxub %%mm1, %%mm4            \n\t" // max(T, L)
        "pminub %%mm5, %%mm1            \n\t" // min(T, L)
        "pminub %%mm2, %%mm4            \n\t"
        "pmaxub %%mm1, %%mm4            \n\t"
        "psubb %%mm4, %%mm3             \n\t" // dst - pred
        "movq %%mm3, (%3, %0)           \n\t"
        : "r"(src1), "r"(src2), "r"(dst), "r"((x86_reg)w)

    dst[0]= src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt)&0xFF);

    *left_top= src1[w-1];

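/* Scalar sketch of the median prediction being subtracted above
 * (hypothetical reference): each output byte is src2[i] minus the median of
 * L (left), T (top) and L+T-LT, which the asm computes branch-free with
 * pmaxub/pminub. */
#if 0
static void sub_hfyu_median_prediction_c_ref(uint8_t *dst, const uint8_t *src1,
                                             const uint8_t *src2, int w,
                                             int *left, int *left_top)
{
    int i;
    uint8_t l = *left, lt = *left_top;
    for (i = 0; i < w; i++) {
        const int pred = mid_pred(l, src1[i], (l + src1[i] - lt) & 0xFF);
        lt = src1[i];
        l  = src2[i];
        dst[i] = l - pred;
    }
    *left     = l;
    *left_top = lt;
}
#endif
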
#define DIFF_PIXELS_1(m,a,t,p1,p2)\
    "mov"#m" "#p1", "#a"            \n\t"\
    "mov"#m" "#p2", "#t"            \n\t"\
    "punpcklbw "#a", "#t"           \n\t"\
    "punpcklbw "#a", "#a"           \n\t"\
    "psubw "#t", "#a"               \n\t"

#define DIFF_PIXELS_8(m0,m1,mm,p1,p2,stride,temp) {\
    uint8_t *p1b=p1, *p2b=p2;\
    DIFF_PIXELS_1(m0, mm##0, mm##7, (%1), (%2))\
    DIFF_PIXELS_1(m0, mm##1, mm##7, (%1,%3), (%2,%3))\
    DIFF_PIXELS_1(m0, mm##2, mm##7, (%1,%3,2), (%2,%3,2))\
    DIFF_PIXELS_1(m0, mm##3, mm##7, (%1), (%2))\
    DIFF_PIXELS_1(m0, mm##4, mm##7, (%1,%3), (%2,%3))\
    DIFF_PIXELS_1(m0, mm##5, mm##7, (%1,%3,2), (%2,%3,2))\
    DIFF_PIXELS_1(m0, mm##6, mm##7, (%1,%4), (%2,%4))\
    "mov"#m1" "#mm"0, %0            \n\t"\
    DIFF_PIXELS_1(m0, mm##7, mm##0, (%1,%3,4), (%2,%3,4))\
    "mov"#m1" %0, "#mm"0            \n\t"\
    : "+m"(temp), "+r"(p1b), "+r"(p2b)\
    : "r"((x86_reg)stride), "r"((x86_reg)stride*3)\
    );\
}
//the "+m"(temp) is needed as gcc 2.95 sometimes fails to compile "=m"(temp)

#define DIFF_PIXELS_4x8(p1,p2,stride,temp) DIFF_PIXELS_8(d, q, %%mm, p1, p2, stride, temp)
#define DIFF_PIXELS_8x8(p1,p2,stride,temp) DIFF_PIXELS_8(q, dqa, %%xmm, p1, p2, stride, temp)

#define LBUTTERFLY2(a1,b1,a2,b2)\
    "paddw " #b1 ", " #a1 "           \n\t"\
    "paddw " #b2 ", " #a2 "           \n\t"\
    "paddw " #b1 ", " #b1 "           \n\t"\
    "paddw " #b2 ", " #b2 "           \n\t"\
    "psubw " #a1 ", " #b1 "           \n\t"\
    "psubw " #a2 ", " #b2 "           \n\t"

#define HADAMARD8(m0, m1, m2, m3, m4, m5, m6, m7)\
    LBUTTERFLY2(m0, m1, m2, m3)\
    LBUTTERFLY2(m4, m5, m6, m7)\
    LBUTTERFLY2(m0, m2, m1, m3)\
    LBUTTERFLY2(m4, m6, m5, m7)\
    LBUTTERFLY2(m0, m4, m1, m5)\
    LBUTTERFLY2(m2, m6, m3, m7)\

#define HADAMARD48 HADAMARD8(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm6, %%mm7)

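/* LBUTTERFLY2 computes two butterflies with adds/subs only: after
 * "paddw b,a; paddw b,b; psubw a,b", a holds a+b and b holds 2b-(a+b) = b-a.
 * HADAMARD8 chains three rounds of such butterflies over eight registers,
 * i.e. an 8-point Walsh-Hadamard transform. A scalar sketch under that
 * reading (hypothetical, sign conventions irrelevant for the SATD sum): */
#if 0
static void wht8_ref(int16_t d[8])
{
    int pass, i;
    for (pass = 0; pass < 3; pass++) {      /* 3 = log2(8) butterfly rounds */
        int step = 1 << pass;
        for (i = 0; i < 8; i++) {
            if (!(i & step)) {
                int16_t a = d[i], b = d[i + step];
                d[i]        = a + b;
                d[i + step] = b - a;
            }
        }
    }
}
#endif
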
#define MMABS_MMX(a,z)\
    "pxor " #z ", " #z "              \n\t"\
    "pcmpgtw " #a ", " #z "           \n\t"\
    "pxor " #z ", " #a "              \n\t"\
    "psubw " #z ", " #a "             \n\t"

#define MMABS_MMX2(a,z)\
    "pxor " #z ", " #z "              \n\t"\
    "psubw " #a ", " #z "             \n\t"\
    "pmaxsw " #z ", " #a "            \n\t"

#define MMABS_SSSE3(a,z)\
    "pabsw " #a ", " #a "             \n\t"

#define MMABS_SUM(a,z, sum)\
    MMABS(a,z)\
    "paddusw " #a ", " #sum "         \n\t"

#define MMABS_SUM_8x8_NOSPILL\
    MMABS(%%xmm0, %%xmm8)\
    MMABS(%%xmm1, %%xmm9)\
    MMABS_SUM(%%xmm2, %%xmm8, %%xmm0)\
    MMABS_SUM(%%xmm3, %%xmm9, %%xmm1)\
    MMABS_SUM(%%xmm4, %%xmm8, %%xmm0)\
    MMABS_SUM(%%xmm5, %%xmm9, %%xmm1)\
    MMABS_SUM(%%xmm6, %%xmm8, %%xmm0)\
    MMABS_SUM(%%xmm7, %%xmm9, %%xmm1)\
    "paddusw %%xmm1, %%xmm0           \n\t"

#if ARCH_X86_64
#define MMABS_SUM_8x8_SSE2 MMABS_SUM_8x8_NOSPILL
#else
#define MMABS_SUM_8x8_SSE2\
    "movdqa %%xmm7, (%1)              \n\t"\
    MMABS(%%xmm0, %%xmm7)\
    MMABS(%%xmm1, %%xmm7)\
    MMABS_SUM(%%xmm2, %%xmm7, %%xmm0)\
    MMABS_SUM(%%xmm3, %%xmm7, %%xmm1)\
    MMABS_SUM(%%xmm4, %%xmm7, %%xmm0)\
    MMABS_SUM(%%xmm5, %%xmm7, %%xmm1)\
    MMABS_SUM(%%xmm6, %%xmm7, %%xmm0)\
    "movdqa (%1), %%xmm2              \n\t"\
    MMABS_SUM(%%xmm2, %%xmm7, %%xmm1)\
    "paddusw %%xmm1, %%xmm0           \n\t"
#endif

/* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to
 * about 100k on extreme inputs. But that's very unlikely to occur in natural video,
 * and it's even more unlikely to not have any alternative mvs/modes with lower cost. */
#define HSUM_MMX(a, t, dst)\
    "movq "#a", "#t"                  \n\t"\
    "psrlq $32, "#a"                  \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movq "#a", "#t"                  \n\t"\
    "psrlq $16, "#a"                  \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movd "#a", "#dst"                \n\t"\

#define HSUM_MMX2(a, t, dst)\
    "pshufw $0x0E, "#a", "#t"         \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "pshufw $0x01, "#a", "#t"         \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movd "#a", "#dst"                \n\t"\

#define HSUM_SSE2(a, t, dst)\
    "movhlps "#a", "#t"               \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "pshuflw $0x0E, "#a", "#t"        \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "pshuflw $0x01, "#a", "#t"        \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movd "#a", "#dst"                \n\t"\

#define HADAMARD8_DIFF_MMX(cpu) \
static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
    DECLARE_ALIGNED(8, uint64_t, temp)[13];\
    int sum;\
\
    DIFF_PIXELS_4x8(src1, src2, stride, temp[0]);\
\
    "movq %%mm7, 96(%1)             \n\t"\
\
    TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
    STORE4(8, 0(%1), %%mm0, %%mm3, %%mm7, %%mm2)\
\
    "movq 96(%1), %%mm7             \n\t"\
    TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
    STORE4(8, 64(%1), %%mm4, %%mm7, %%mm0, %%mm6)\
\
    DIFF_PIXELS_4x8(src1+4, src2+4, stride, temp[4]);\
\
    "movq %%mm7, 96(%1)             \n\t"\
\
    TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
    STORE4(8, 32(%1), %%mm0, %%mm3, %%mm7, %%mm2)\
\
    "movq 96(%1), %%mm7             \n\t"\
    TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
    "movq %%mm7, %%mm5              \n\t"/*FIXME remove*/\
    "movq %%mm6, %%mm7              \n\t"\
    "movq %%mm0, %%mm6              \n\t"\
\
    LOAD4(8, 64(%1), %%mm0, %%mm1, %%mm2, %%mm3)\
\
    "movq %%mm7, 64(%1)             \n\t"\
    MMABS(%%mm0, %%mm7)\
    MMABS(%%mm1, %%mm7)\
    MMABS_SUM(%%mm2, %%mm7, %%mm0)\
    MMABS_SUM(%%mm3, %%mm7, %%mm1)\
    MMABS_SUM(%%mm4, %%mm7, %%mm0)\
    MMABS_SUM(%%mm5, %%mm7, %%mm1)\
    MMABS_SUM(%%mm6, %%mm7, %%mm0)\
    "movq 64(%1), %%mm2             \n\t"\
    MMABS_SUM(%%mm2, %%mm7, %%mm1)\
    "paddusw %%mm1, %%mm0           \n\t"\
    "movq %%mm0, 64(%1)             \n\t"\
\
    LOAD4(8, 0(%1), %%mm0, %%mm1, %%mm2, %%mm3)\
    LOAD4(8, 32(%1), %%mm4, %%mm5, %%mm6, %%mm7)\
\
    "movq %%mm7, (%1)               \n\t"\
    MMABS(%%mm0, %%mm7)\
    MMABS(%%mm1, %%mm7)\
    MMABS_SUM(%%mm2, %%mm7, %%mm0)\
    MMABS_SUM(%%mm3, %%mm7, %%mm1)\
    MMABS_SUM(%%mm4, %%mm7, %%mm0)\
    MMABS_SUM(%%mm5, %%mm7, %%mm1)\
    MMABS_SUM(%%mm6, %%mm7, %%mm0)\
    "movq (%1), %%mm2               \n\t"\
    MMABS_SUM(%%mm2, %%mm7, %%mm1)\
    "paddusw 64(%1), %%mm0          \n\t"\
    "paddusw %%mm1, %%mm0           \n\t"\
\
    HSUM(%%mm0, %%mm1, %0)\
\
    : "=r" (sum)\
    : "r"(temp)\
    );\
    return sum&0xFFFF;\
}\
\
WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)

#define HADAMARD8_DIFF_SSE2(cpu) \
static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
    DECLARE_ALIGNED(16, uint64_t, temp)[4];\
    int sum;\
\
    DIFF_PIXELS_8x8(src1, src2, stride, temp[0]);\
\
    HADAMARD8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)\
    TRANSPOSE8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7, (%1))\
    HADAMARD8(%%xmm0, %%xmm5, %%xmm7, %%xmm3, %%xmm6, %%xmm4, %%xmm2, %%xmm1)\
\
    HSUM_SSE2(%%xmm0, %%xmm1, %0)\
\
    : "=r" (sum)\
    : "r"(temp)\
    );\
    return sum&0xFFFF;\
}\
\
WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)

#define MMABS(a,z)         MMABS_MMX(a,z)
#define HSUM(a,t,dst)      HSUM_MMX(a,t,dst)
HADAMARD8_DIFF_MMX(mmx)
#undef MMABS
#undef HSUM

#define MMABS(a,z)         MMABS_MMX2(a,z)
#define MMABS_SUM_8x8      MMABS_SUM_8x8_SSE2
#define HSUM(a,t,dst)      HSUM_MMX2(a,t,dst)
HADAMARD8_DIFF_MMX(mmx2)
HADAMARD8_DIFF_SSE2(sse2)
#undef MMABS
#undef MMABS_SUM_8x8
#undef HSUM

#if HAVE_SSSE3
#define MMABS(a,z)         MMABS_SSSE3(a,z)
#define MMABS_SUM_8x8      MMABS_SUM_8x8_NOSPILL
HADAMARD8_DIFF_SSE2(ssse3)
#undef MMABS
#undef MMABS_SUM_8x8
#endif

#define DCT_SAD4(m,mm,o)\
    "mov"#m" "#o"+ 0(%1), "#mm"2      \n\t"\
    "mov"#m" "#o"+16(%1), "#mm"3      \n\t"\
    "mov"#m" "#o"+32(%1), "#mm"4      \n\t"\
    "mov"#m" "#o"+48(%1), "#mm"5      \n\t"\
    MMABS_SUM(mm##2, mm##6, mm##0)\
    MMABS_SUM(mm##3, mm##7, mm##1)\
    MMABS_SUM(mm##4, mm##6, mm##0)\
    MMABS_SUM(mm##5, mm##7, mm##1)\

#define DCT_SAD_MMX\
    "pxor %%mm0, %%mm0                \n\t"\
    "pxor %%mm1, %%mm1                \n\t"\
    DCT_SAD4(q, %%mm, 0)\
    DCT_SAD4(q, %%mm, 8)\
    DCT_SAD4(q, %%mm, 64)\
    DCT_SAD4(q, %%mm, 72)\
    "paddusw %%mm1, %%mm0             \n\t"\
    HSUM(%%mm0, %%mm1, %0)

#define DCT_SAD_SSE2\
    "pxor %%xmm0, %%xmm0              \n\t"\
    "pxor %%xmm1, %%xmm1              \n\t"\
    DCT_SAD4(dqa, %%xmm, 0)\
    DCT_SAD4(dqa, %%xmm, 64)\
    "paddusw %%xmm1, %%xmm0           \n\t"\
    HSUM(%%xmm0, %%xmm1, %0)

#define DCT_SAD_FUNC(cpu) \
static int sum_abs_dctelem_##cpu(DCTELEM *block){\
    int sum;\
    __asm__ volatile(\
        DCT_SAD\
        :"=r"(sum)\
        :"r"(block)\
    );\
    return sum&0xFFFF;\
}

#define DCT_SAD       DCT_SAD_MMX
#define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
#define MMABS(a,z)    MMABS_MMX(a,z)
DCT_SAD_FUNC(mmx)
#undef MMABS
#undef HSUM

#define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)
#define MMABS(a,z)    MMABS_MMX2(a,z)
DCT_SAD_FUNC(mmx2)
#undef HSUM
#undef DCT_SAD

#define DCT_SAD       DCT_SAD_SSE2
#define HSUM(a,t,dst) HSUM_SSE2(a,t,dst)
DCT_SAD_FUNC(sse2)
#undef MMABS

#if HAVE_SSSE3
#define MMABS(a,z)    MMABS_SSSE3(a,z)
DCT_SAD_FUNC(ssse3)
#undef MMABS
#endif
#undef DCT_SAD
#undef HSUM

static int ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2, int size){
        "pxor %%mm4, %%mm4              \n"
        "movq (%2,%0), %%mm2            \n"
        "movq (%3,%0,2), %%mm0          \n"
        "movq 8(%3,%0,2), %%mm1         \n"
        "punpckhbw %%mm2, %%mm3         \n"
        "punpcklbw %%mm2, %%mm2         \n"
        "psraw $8, %%mm3                \n"
        "psraw $8, %%mm2                \n"
        "psubw %%mm3, %%mm1             \n"
        "psubw %%mm2, %%mm0             \n"
        "pmaddwd %%mm1, %%mm1           \n"
        "pmaddwd %%mm0, %%mm0           \n"
        "paddd %%mm1, %%mm4             \n"
        "paddd %%mm0, %%mm4             \n"

        "movq %%mm4, %%mm3              \n"
        "psrlq $32, %%mm3               \n"
        "paddd %%mm3, %%mm4             \n"
        :"r"(pix1), "r"(pix2)

#define PHADDD(a, t)\
    "movq "#a", "#t"                  \n\t"\
    "psrlq $32, "#a"                  \n\t"\
    "paddd "#t", "#a"                 \n\t"

/*
   pmulhw:   dst[0-15]=(src[0-15]*dst[0-15])[16-31]
   pmulhrw:  dst[0-15]=(src[0-15]*dst[0-15] + 0x8000)[16-31]
   pmulhrsw: dst[0-15]=(src[0-15]*dst[0-15] + 0x4000)[15-30]
 */
#define PMULHRW(x, y, s, o)\
    "pmulhw " #s ", "#x "             \n\t"\
    "pmulhw " #s ", "#y "             \n\t"\
    "paddw " #o ", "#x "              \n\t"\
    "paddw " #o ", "#y "              \n\t"\
    "psraw $1, "#x "                  \n\t"\
    "psraw $1, "#y "                  \n\t"

#define DEF(x) x ## _mmx
#define SET_RND MOVQ_WONE
#define SCALE_OFFSET 1

#include "dsputil_mmx_qns_template.c"

#define DEF(x) x ## _3dnow
#define SCALE_OFFSET 0
#define PMULHRW(x, y, s, o)\
    "pmulhrw " #s ", "#x "            \n\t"\
    "pmulhrw " #s ", "#y "            \n\t"

#include "dsputil_mmx_qns_template.c"

#define DEF(x) x ## _ssse3
#define SCALE_OFFSET -1
#define PHADDD(a, t)\
    "pshufw $0x0E, "#a", "#t"         \n\t"\
    "paddd "#t", "#a"                 \n\t" /* faster than phaddd on core2 */
#define PMULHRW(x, y, s, o)\
    "pmulhrsw " #s ", "#x "           \n\t"\
    "pmulhrsw " #s ", "#y "           \n\t"

#include "dsputil_mmx_qns_template.c"

void dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx)
{
    int mm_flags = mm_support();

    if (mm_flags & FF_MM_MMX) {
        const int dct_algo = avctx->dct_algo;
        if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){
            if(mm_flags & FF_MM_SSE2){
                c->fdct = ff_fdct_sse2;
            }else if(mm_flags & FF_MM_MMX2){
                c->fdct = ff_fdct_mmx2;
            }else{
                c->fdct = ff_fdct_mmx;
            }
        }

        c->get_pixels = get_pixels_mmx;
        c->diff_pixels = diff_pixels_mmx;
        c->pix_sum = pix_sum16_mmx;

        c->diff_bytes= diff_bytes_mmx;
        c->sum_abs_dctelem= sum_abs_dctelem_mmx;

        c->hadamard8_diff[0]= hadamard8_diff16_mmx;
        c->hadamard8_diff[1]= hadamard8_diff_mmx;

        c->pix_norm1 = pix_norm1_mmx;
        c->sse[0] = (mm_flags & FF_MM_SSE2) ? sse16_sse2 : sse16_mmx;
        c->sse[1] = sse8_mmx;
        c->vsad[4]= vsad_intra16_mmx;

        c->nsse[0] = nsse16_mmx;
        c->nsse[1] = nsse8_mmx;
        if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
            c->vsad[0] = vsad16_mmx;
        }

        if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
            c->try_8x8basis= try_8x8basis_mmx;
        }
        c->add_8x8basis= add_8x8basis_mmx;

        c->ssd_int8_vs_int16 = ssd_int8_vs_int16_mmx;

        if (mm_flags & FF_MM_MMX2) {
            c->sum_abs_dctelem= sum_abs_dctelem_mmx2;
            c->hadamard8_diff[0]= hadamard8_diff16_mmx2;
            c->hadamard8_diff[1]= hadamard8_diff_mmx2;
            c->vsad[4]= vsad_intra16_mmx2;

            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->vsad[0] = vsad16_mmx2;
            }

            c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2;
        }

        if(mm_flags & FF_MM_SSE2){
            c->get_pixels = get_pixels_sse2;
            c->sum_abs_dctelem= sum_abs_dctelem_sse2;
            c->hadamard8_diff[0]= hadamard8_diff16_sse2;
            c->hadamard8_diff[1]= hadamard8_diff_sse2;
            c->lpc_compute_autocorr = ff_lpc_compute_autocorr_sse2;
        }

        if(mm_flags & FF_MM_SSSE3){
            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->try_8x8basis= try_8x8basis_ssse3;
            }
            c->add_8x8basis= add_8x8basis_ssse3;
            c->sum_abs_dctelem= sum_abs_dctelem_ssse3;
            c->hadamard8_diff[0]= hadamard8_diff16_ssse3;
            c->hadamard8_diff[1]= hadamard8_diff_ssse3;
        }

        if(mm_flags & FF_MM_3DNOW){
            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->try_8x8basis= try_8x8basis_3dnow;
            }
            c->add_8x8basis= add_8x8basis_3dnow;
        }
    }

    dsputil_init_pix_mmx(c, avctx);
}