2
* The simplest mpeg encoder (well, it was the simplest!)
3
* Copyright (c) 2000,2001 Fabrice Bellard.
5
* Optimized for ia32 CPUs by Nick Kurshev <nickols_k@mail.ru>
6
* h263, mpeg1, mpeg2 dequantizer & draw_edges by Michael Niedermayer <michaelni@gmx.at>
8
* This file is part of FFmpeg.
10
* FFmpeg is free software; you can redistribute it and/or
11
* modify it under the terms of the GNU Lesser General Public
12
* License as published by the Free Software Foundation; either
13
* version 2.1 of the License, or (at your option) any later version.
15
* FFmpeg is distributed in the hope that it will be useful,
16
* but WITHOUT ANY WARRANTY; without even the implied warranty of
17
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18
* Lesser General Public License for more details.
20
* You should have received a copy of the GNU Lesser General Public
21
* License along with FFmpeg; if not, write to the Free Software
22
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
25
#include "libavutil/x86_cpu.h"
26
#include "libavcodec/avcodec.h"
27
#include "libavcodec/dsputil.h"
28
#include "libavcodec/mpegvideo.h"
29
#include "dsputil_mmx.h"
31
/* 64-entry table defined in another translation unit.  It is not referenced
 * by any code visible in this excerpt (presumably it is consumed by the
 * included mpegvideo_mmx_template.c -- TODO confirm). */
extern uint16_t inv_zigzag_direct16[64];
34
/**
 * H.263 intra-block dequantizer (MMX).
 *
 * The C part derives a DC value from block[0] and s->y_dc_scale or
 * s->c_dc_scale, sets qadd = (qscale - 1) | 1 and takes the coefficient
 * count from s->inter_scantable.raster_end.  The asm then replaces each
 * coefficient by block[i] * qmul + qadd (block[i] > 0) or
 * block[i] * qmul - qadd (block[i] < 0) and leaves zero coefficients at zero,
 * processing eight 16-bit coefficients per pass with wrap-around
 * multiplication (pmullw).
 *
 * NOTE(review): this excerpt omits lines of the function (braces, the
 * conditions selecting between the two `level` assignments, the qmul
 * assignment, the asm wrapper and the loop control), so only the statements
 * shown are documented.
 */
static void dct_unquantize_h263_intra_mmx(MpegEncContext *s,
35
DCTELEM *block, int n, int qscale)
37
x86_reg level, qmul, qadd, nCoeffs;
41
assert(s->block_last_index[n]>=0 || s->h263_aic);
45
level = block[0] * s->y_dc_scale;
47
level = block[0] * s->c_dc_scale;
48
qadd = (qscale - 1) | 1; // forces qadd to be odd
56
nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
57
//printf("%d %d ", qmul, qadd);
59
"movd %1, %%mm6 \n\t" //qmul
60
"packssdw %%mm6, %%mm6 \n\t"
61
"packssdw %%mm6, %%mm6 \n\t" // qmul now in all four 16-bit lanes
62
"movd %2, %%mm5 \n\t" //qadd
63
"pxor %%mm7, %%mm7 \n\t"
64
"packssdw %%mm5, %%mm5 \n\t"
65
"packssdw %%mm5, %%mm5 \n\t" // qadd now in all four 16-bit lanes
66
"psubw %%mm5, %%mm7 \n\t" // mm7 = -qadd
67
"pxor %%mm4, %%mm4 \n\t" // mm4 = 0
70
"movq (%0, %3), %%mm0 \n\t"
71
"movq 8(%0, %3), %%mm1 \n\t"
73
"pmullw %%mm6, %%mm0 \n\t" // block[i] * qmul (low 16 bits)
74
"pmullw %%mm6, %%mm1 \n\t"
76
"movq (%0, %3), %%mm2 \n\t"
77
"movq 8(%0, %3), %%mm3 \n\t"
79
"pcmpgtw %%mm4, %%mm2 \n\t" // block[i] > 0 ? -1 : 0
80
"pcmpgtw %%mm4, %%mm3 \n\t" // block[i] > 0 ? -1 : 0
82
"pxor %%mm2, %%mm0 \n\t" // complement the product where block[i] > 0
83
"pxor %%mm3, %%mm1 \n\t"
85
"paddw %%mm7, %%mm0 \n\t" // add -qadd
86
"paddw %%mm7, %%mm1 \n\t"
88
"pxor %%mm0, %%mm2 \n\t" // undo the complement: product +/- qadd
89
"pxor %%mm1, %%mm3 \n\t"
91
"pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? -1 : 0
92
"pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? -1 : 0
94
"pandn %%mm2, %%mm0 \n\t" // keep zero coefficients at zero
95
"pandn %%mm3, %%mm1 \n\t"
97
"movq %%mm0, (%0, %3) \n\t"
98
"movq %%mm1, 8(%0, %3) \n\t"
102
// %0 = block + nCoeffs, %1 = qmul, %2 = qadd, %3 = 2*(-nCoeffs) used as index
::"r" (block+nCoeffs), "rm"(qmul), "rm" (qadd), "r" (2*(-nCoeffs))
109
/**
 * H.263 inter-block dequantizer (MMX).
 *
 * Sets qadd = (qscale - 1) | 1, takes the coefficient count from
 * s->inter_scantable.raster_end, then the asm replaces each coefficient by
 * block[i] * qmul + qadd (block[i] > 0) or block[i] * qmul - qadd
 * (block[i] < 0), leaving zero coefficients at zero.  Eight 16-bit
 * coefficients are handled per pass.
 *
 * NOTE(review): this excerpt omits lines of the function (braces, the qmul
 * assignment, the asm wrapper and the loop control), so only the statements
 * shown are documented.
 */
static void dct_unquantize_h263_inter_mmx(MpegEncContext *s,
110
DCTELEM *block, int n, int qscale)
112
x86_reg qmul, qadd, nCoeffs;
115
qadd = (qscale - 1) | 1; // forces qadd to be odd
117
assert(s->block_last_index[n]>=0 || s->h263_aic);
119
nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
120
//printf("%d %d ", qmul, qadd);
122
"movd %1, %%mm6 \n\t" //qmul
123
"packssdw %%mm6, %%mm6 \n\t"
124
"packssdw %%mm6, %%mm6 \n\t" // qmul now in all four 16-bit lanes
125
"movd %2, %%mm5 \n\t" //qadd
126
"pxor %%mm7, %%mm7 \n\t"
127
"packssdw %%mm5, %%mm5 \n\t"
128
"packssdw %%mm5, %%mm5 \n\t" // qadd now in all four 16-bit lanes
129
"psubw %%mm5, %%mm7 \n\t" // mm7 = -qadd
130
"pxor %%mm4, %%mm4 \n\t" // mm4 = 0
133
"movq (%0, %3), %%mm0 \n\t"
134
"movq 8(%0, %3), %%mm1 \n\t"
136
"pmullw %%mm6, %%mm0 \n\t" // block[i] * qmul (low 16 bits)
137
"pmullw %%mm6, %%mm1 \n\t"
139
"movq (%0, %3), %%mm2 \n\t"
140
"movq 8(%0, %3), %%mm3 \n\t"
142
"pcmpgtw %%mm4, %%mm2 \n\t" // block[i] > 0 ? -1 : 0
143
"pcmpgtw %%mm4, %%mm3 \n\t" // block[i] > 0 ? -1 : 0
145
"pxor %%mm2, %%mm0 \n\t" // complement the product where block[i] > 0
146
"pxor %%mm3, %%mm1 \n\t"
148
"paddw %%mm7, %%mm0 \n\t" // add -qadd
149
"paddw %%mm7, %%mm1 \n\t"
151
"pxor %%mm0, %%mm2 \n\t" // undo the complement: product +/- qadd
152
"pxor %%mm1, %%mm3 \n\t"
154
"pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? -1 : 0
155
"pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? -1 : 0
157
"pandn %%mm2, %%mm0 \n\t" // keep zero coefficients at zero
158
"pandn %%mm3, %%mm1 \n\t"
160
"movq %%mm0, (%0, %3) \n\t"
161
"movq %%mm1, 8(%0, %3) \n\t"
165
// %0 = block + nCoeffs, %1 = qmul, %2 = qadd, %3 = 2*(-nCoeffs) used as index
::"r" (block+nCoeffs), "rm"(qmul), "rm" (qadd), "r" (2*(-nCoeffs))
173
Note: looking at PARANOID:
174
"enable all paranoid tests for rounding, overflows, etc..."
177
if (level < -2048 || level > 2047)
178
fprintf(stderr, "unquant error %d %d\n", i, level);
180
We can assume that the result of the two multiplications can't be greater than 0xFFFF
181
i.e. it fits in 16 bits, so we use only the PMULLW instruction here and can avoid
182
a complex multiplication.
183
=====================================================
184
Full formula for multiplication of 2 integer numbers
185
which are represented as high:low words:
186
input: value1 = high1:low1
188
output: value3 = value1*value2
189
value3=high3:low3 (on overflow: modulus 2^32 wrap-around)
190
this means that for 0x123456 * 0x123456 the correct result is 0x14b66cb0ce4
191
but this algorithm will compute only 0x66cb0ce4
192
this is limited by the 16-bit size of the operands
193
---------------------------------
196
tlow1 = tlow1 + tlow2
197
high3:low3 = low1*low2
200
/**
 * MPEG-1 intra-block dequantizer (MMX).
 *
 * The C part computes a DC value block0 from block[0] and s->y_dc_scale or
 * s->c_dc_scale and selects s->intra_matrix.  Per coefficient the asm
 * computes
 *     v = (abs(block[i]) * (qscale * quant_matrix[i])) >> 3;
 *     v = (v - 1) | 1;
 * then restores the sign of block[i] and keeps zero coefficients at zero.
 * All arithmetic is on 16-bit lanes (pmullw keeps only the low 16 bits);
 * the index register advances by 16 bytes, i.e. eight coefficients, per pass.
 *
 * NOTE(review): this excerpt omits lines of the function (braces, the
 * declarations of nCoeffs/block0, the condition selecting between the two
 * block0 assignments, the asm wrapper, loop label/branch and whatever stores
 * block0), so only the statements shown are documented.
 */
static void dct_unquantize_mpeg1_intra_mmx(MpegEncContext *s,
201
DCTELEM *block, int n, int qscale)
204
const uint16_t *quant_matrix;
207
assert(s->block_last_index[n]>=0);
209
nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1;
212
block0 = block[0] * s->y_dc_scale;
214
block0 = block[0] * s->c_dc_scale;
215
/* XXX: only mpeg1 */
216
quant_matrix = s->intra_matrix;
218
"pcmpeqw %%mm7, %%mm7 \n\t"
219
"psrlw $15, %%mm7 \n\t" // mm7 = 1 in every 16-bit lane
220
"movd %2, %%mm6 \n\t"
221
"packssdw %%mm6, %%mm6 \n\t"
222
"packssdw %%mm6, %%mm6 \n\t" // qscale now in all four 16-bit lanes
223
"mov %3, %%"REG_a" \n\t" // index starts at -2*nCoeffs
226
"movq (%0, %%"REG_a"), %%mm0 \n\t"
227
"movq 8(%0, %%"REG_a"), %%mm1 \n\t"
228
"movq (%1, %%"REG_a"), %%mm4 \n\t"
229
"movq 8(%1, %%"REG_a"), %%mm5 \n\t"
230
"pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i]
231
"pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i]
232
"pxor %%mm2, %%mm2 \n\t"
233
"pxor %%mm3, %%mm3 \n\t"
234
"pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
235
"pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
236
"pxor %%mm2, %%mm0 \n\t"
237
"pxor %%mm3, %%mm1 \n\t"
238
"psubw %%mm2, %%mm0 \n\t" // abs(block[i])
239
"psubw %%mm3, %%mm1 \n\t" // abs(block[i])
240
"pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*q
241
"pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q
242
"pxor %%mm4, %%mm4 \n\t"
243
"pxor %%mm5, %%mm5 \n\t" // FIXME slow
244
"pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
245
"pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
246
"psraw $3, %%mm0 \n\t"
247
"psraw $3, %%mm1 \n\t"
248
"psubw %%mm7, %%mm0 \n\t" // v - 1
249
"psubw %%mm7, %%mm1 \n\t"
250
"por %%mm7, %%mm0 \n\t" // (v - 1) | 1
251
"por %%mm7, %%mm1 \n\t"
252
"pxor %%mm2, %%mm0 \n\t" // xor + subtract of the sign mask restores the sign
253
"pxor %%mm3, %%mm1 \n\t"
254
"psubw %%mm2, %%mm0 \n\t"
255
"psubw %%mm3, %%mm1 \n\t"
256
"pandn %%mm0, %%mm4 \n\t" // keep zero coefficients at zero
257
"pandn %%mm1, %%mm5 \n\t"
258
"movq %%mm4, (%0, %%"REG_a") \n\t"
259
"movq %%mm5, 8(%0, %%"REG_a") \n\t"
261
"add $16, %%"REG_a" \n\t"
263
// %0 = block + nCoeffs, %1 = quant_matrix + nCoeffs, %2 = qscale, %3 = -2*nCoeffs
::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs)
269
/**
 * MPEG-1 inter-block dequantizer (MMX).
 *
 * Uses s->inter_matrix.  Per coefficient the asm computes
 *     v = ((2 * abs(block[i]) + 1) * (qscale * quant_matrix[i])) >> 4;
 *     v = (v - 1) | 1;
 * then restores the sign of block[i] and keeps zero coefficients at zero.
 * All arithmetic is on 16-bit lanes; eight coefficients are handled per pass.
 *
 * NOTE(review): this excerpt omits lines of the function (braces, the
 * nCoeffs declaration, the asm wrapper and the loop label/branch), so only
 * the statements shown are documented.
 */
static void dct_unquantize_mpeg1_inter_mmx(MpegEncContext *s,
270
DCTELEM *block, int n, int qscale)
273
const uint16_t *quant_matrix;
275
assert(s->block_last_index[n]>=0);
277
nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1;
279
quant_matrix = s->inter_matrix;
281
"pcmpeqw %%mm7, %%mm7 \n\t"
282
"psrlw $15, %%mm7 \n\t" // mm7 = 1 in every 16-bit lane
283
"movd %2, %%mm6 \n\t"
284
"packssdw %%mm6, %%mm6 \n\t"
285
"packssdw %%mm6, %%mm6 \n\t" // qscale now in all four 16-bit lanes
286
"mov %3, %%"REG_a" \n\t" // index starts at -2*nCoeffs
289
"movq (%0, %%"REG_a"), %%mm0 \n\t"
290
"movq 8(%0, %%"REG_a"), %%mm1 \n\t"
291
"movq (%1, %%"REG_a"), %%mm4 \n\t"
292
"movq 8(%1, %%"REG_a"), %%mm5 \n\t"
293
"pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i]
294
"pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i]
295
"pxor %%mm2, %%mm2 \n\t"
296
"pxor %%mm3, %%mm3 \n\t"
297
"pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
298
"pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
299
"pxor %%mm2, %%mm0 \n\t"
300
"pxor %%mm3, %%mm1 \n\t"
301
"psubw %%mm2, %%mm0 \n\t" // abs(block[i])
302
"psubw %%mm3, %%mm1 \n\t" // abs(block[i])
303
"paddw %%mm0, %%mm0 \n\t" // abs(block[i])*2
304
"paddw %%mm1, %%mm1 \n\t" // abs(block[i])*2
305
"paddw %%mm7, %%mm0 \n\t" // abs(block[i])*2 + 1
306
"paddw %%mm7, %%mm1 \n\t" // abs(block[i])*2 + 1
307
"pmullw %%mm4, %%mm0 \n\t" // (abs(block[i])*2 + 1)*q
308
"pmullw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q
309
"pxor %%mm4, %%mm4 \n\t"
310
"pxor %%mm5, %%mm5 \n\t" // FIXME slow
311
"pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
312
"pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
313
"psraw $4, %%mm0 \n\t"
314
"psraw $4, %%mm1 \n\t"
315
"psubw %%mm7, %%mm0 \n\t" // v - 1
316
"psubw %%mm7, %%mm1 \n\t"
317
"por %%mm7, %%mm0 \n\t" // (v - 1) | 1
318
"por %%mm7, %%mm1 \n\t"
319
"pxor %%mm2, %%mm0 \n\t" // xor + subtract of the sign mask restores the sign
320
"pxor %%mm3, %%mm1 \n\t"
321
"psubw %%mm2, %%mm0 \n\t"
322
"psubw %%mm3, %%mm1 \n\t"
323
"pandn %%mm0, %%mm4 \n\t" // keep zero coefficients at zero
324
"pandn %%mm1, %%mm5 \n\t"
325
"movq %%mm4, (%0, %%"REG_a") \n\t"
326
"movq %%mm5, 8(%0, %%"REG_a") \n\t"
328
"add $16, %%"REG_a" \n\t"
330
// %0 = block + nCoeffs, %1 = quant_matrix + nCoeffs, %2 = qscale, %3 = -2*nCoeffs
::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs)
335
/**
 * MPEG-2 intra-block dequantizer (MMX).
 *
 * nCoeffs is 63 when s->alternate_scan is set, otherwise it comes from
 * s->intra_scantable.raster_end.  The C part computes a DC value block0 from
 * block[0] and s->y_dc_scale or s->c_dc_scale and selects s->intra_matrix.
 * Per coefficient the asm computes
 *     v = (abs(block[i]) * (qscale * quant_matrix[i])) >> 3;
 * then restores the sign of block[i] and keeps zero coefficients at zero.
 * Unlike the MPEG-1 variants there is no "(v - 1) | 1" step.
 *
 * NOTE(review): this excerpt omits lines of the function (braces, the
 * declarations of nCoeffs/block0, the condition selecting between the two
 * block0 assignments, the asm wrapper, loop label/branch and whatever stores
 * block0), so only the statements shown are documented.
 */
static void dct_unquantize_mpeg2_intra_mmx(MpegEncContext *s,
336
DCTELEM *block, int n, int qscale)
339
const uint16_t *quant_matrix;
342
assert(s->block_last_index[n]>=0);
344
if(s->alternate_scan) nCoeffs= 63; //FIXME
345
else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
348
block0 = block[0] * s->y_dc_scale;
350
block0 = block[0] * s->c_dc_scale;
351
quant_matrix = s->intra_matrix;
353
"pcmpeqw %%mm7, %%mm7 \n\t"
354
"psrlw $15, %%mm7 \n\t" // mm7 = 1 in every 16-bit lane
355
"movd %2, %%mm6 \n\t"
356
"packssdw %%mm6, %%mm6 \n\t"
357
"packssdw %%mm6, %%mm6 \n\t" // qscale now in all four 16-bit lanes
358
"mov %3, %%"REG_a" \n\t" // index starts at -2*nCoeffs
361
"movq (%0, %%"REG_a"), %%mm0 \n\t"
362
"movq 8(%0, %%"REG_a"), %%mm1 \n\t"
363
"movq (%1, %%"REG_a"), %%mm4 \n\t"
364
"movq 8(%1, %%"REG_a"), %%mm5 \n\t"
365
"pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i]
366
"pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i]
367
"pxor %%mm2, %%mm2 \n\t"
368
"pxor %%mm3, %%mm3 \n\t"
369
"pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
370
"pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
371
"pxor %%mm2, %%mm0 \n\t"
372
"pxor %%mm3, %%mm1 \n\t"
373
"psubw %%mm2, %%mm0 \n\t" // abs(block[i])
374
"psubw %%mm3, %%mm1 \n\t" // abs(block[i])
375
"pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*q
376
"pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q
377
"pxor %%mm4, %%mm4 \n\t"
378
"pxor %%mm5, %%mm5 \n\t" // FIXME slow
379
"pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
380
"pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
381
"psraw $3, %%mm0 \n\t"
382
"psraw $3, %%mm1 \n\t"
383
"pxor %%mm2, %%mm0 \n\t" // xor + subtract of the sign mask restores the sign
384
"pxor %%mm3, %%mm1 \n\t"
385
"psubw %%mm2, %%mm0 \n\t"
386
"psubw %%mm3, %%mm1 \n\t"
387
"pandn %%mm0, %%mm4 \n\t" // keep zero coefficients at zero
388
"pandn %%mm1, %%mm5 \n\t"
389
"movq %%mm4, (%0, %%"REG_a") \n\t"
390
"movq %%mm5, 8(%0, %%"REG_a") \n\t"
392
"add $16, %%"REG_a" \n\t"
394
// %0 = block + nCoeffs, %1 = quant_matrix + nCoeffs, %2 = qscale, %3 = -2*nCoeffs
::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs)
398
//Note, we do not do mismatch control for intra as errors cannot accumulate
401
/**
 * MPEG-2 inter-block dequantizer (MMX) with a parity fix-up on the last
 * coefficient.
 *
 * nCoeffs is 63 when s->alternate_scan is set, otherwise it comes from
 * s->intra_scantable.raster_end; s->inter_matrix is used.  Per coefficient
 * the asm computes
 *     q = qscale * quant_matrix[i];
 *     v = (2 * abs(block[i]) * q + q) >> 4;     (logical shift, psrlw)
 * then restores the sign of block[i] and keeps zero coefficients at zero.
 * Every stored result is also XORed into mm7, which starts with only its
 * lowest 16-bit lane set to 0xFFFF.  After the loop mm7 is folded down to one
 * bit and XORed into bit 16 of the 32-bit value at byte offset 124 from
 * (%0 + %3); assuming DCTELEM is 16 bits wide that is the least significant
 * bit of block[63] -- TODO confirm the element size.
 *
 * NOTE(review): this excerpt omits lines of the function (braces, the
 * nCoeffs declaration, the asm wrapper and the loop label/branch), so only
 * the statements shown are documented.
 */
static void dct_unquantize_mpeg2_inter_mmx(MpegEncContext *s,
402
DCTELEM *block, int n, int qscale)
405
const uint16_t *quant_matrix;
407
assert(s->block_last_index[n]>=0);
409
if(s->alternate_scan) nCoeffs= 63; //FIXME
410
else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
412
quant_matrix = s->inter_matrix;
414
"pcmpeqw %%mm7, %%mm7 \n\t"
415
"psrlq $48, %%mm7 \n\t" // mm7 = 0xFFFF in the lowest lane only: XOR accumulator seed
416
"movd %2, %%mm6 \n\t"
417
"packssdw %%mm6, %%mm6 \n\t"
418
"packssdw %%mm6, %%mm6 \n\t" // qscale now in all four 16-bit lanes
419
"mov %3, %%"REG_a" \n\t" // index starts at -2*nCoeffs
422
"movq (%0, %%"REG_a"), %%mm0 \n\t"
423
"movq 8(%0, %%"REG_a"), %%mm1 \n\t"
424
"movq (%1, %%"REG_a"), %%mm4 \n\t"
425
"movq 8(%1, %%"REG_a"), %%mm5 \n\t"
426
"pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i]
427
"pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i]
428
"pxor %%mm2, %%mm2 \n\t"
429
"pxor %%mm3, %%mm3 \n\t"
430
"pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
431
"pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
432
"pxor %%mm2, %%mm0 \n\t"
433
"pxor %%mm3, %%mm1 \n\t"
434
"psubw %%mm2, %%mm0 \n\t" // abs(block[i])
435
"psubw %%mm3, %%mm1 \n\t" // abs(block[i])
436
"paddw %%mm0, %%mm0 \n\t" // abs(block[i])*2
437
"paddw %%mm1, %%mm1 \n\t" // abs(block[i])*2
438
"pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*2*q
439
"pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*2*q
440
"paddw %%mm4, %%mm0 \n\t" // (abs(block[i])*2 + 1)*q
441
"paddw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q
442
"pxor %%mm4, %%mm4 \n\t"
443
"pxor %%mm5, %%mm5 \n\t" // FIXME slow
444
"pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
445
"pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
446
"psrlw $4, %%mm0 \n\t"
447
"psrlw $4, %%mm1 \n\t"
448
"pxor %%mm2, %%mm0 \n\t" // xor + subtract of the sign mask restores the sign
449
"pxor %%mm3, %%mm1 \n\t"
450
"psubw %%mm2, %%mm0 \n\t"
451
"psubw %%mm3, %%mm1 \n\t"
452
"pandn %%mm0, %%mm4 \n\t" // keep zero coefficients at zero
453
"pandn %%mm1, %%mm5 \n\t"
454
"pxor %%mm4, %%mm7 \n\t" // accumulate XOR of all stored results
455
"pxor %%mm5, %%mm7 \n\t"
456
"movq %%mm4, (%0, %%"REG_a") \n\t"
457
"movq %%mm5, 8(%0, %%"REG_a") \n\t"
459
"add $16, %%"REG_a" \n\t"
461
"movd 124(%0, %3), %%mm0 \n\t" // 32 bits at byte offset 124 from (%0 + %3)
462
"movq %%mm7, %%mm6 \n\t"
463
"psrlq $32, %%mm7 \n\t"
464
"pxor %%mm6, %%mm7 \n\t" // fold 64 -> 32 bits
465
"movq %%mm7, %%mm6 \n\t"
466
"psrlq $16, %%mm7 \n\t"
467
"pxor %%mm6, %%mm7 \n\t" // fold 32 -> 16 bits
468
"pslld $31, %%mm7 \n\t" // keep only bit 0 of the folded value ...
469
"psrlq $15, %%mm7 \n\t" // ... and move it to bit 16
470
"pxor %%mm7, %%mm0 \n\t"
471
"movd %%mm0, 124(%0, %3) \n\t"
473
// %0 = block + nCoeffs, %1 = quant_matrix + nCoeffs, %2 = qscale, %3 = -2*nCoeffs
::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "r" (-2*nCoeffs)
478
/**
 * DCT denoiser (MMX).
 *
 * Increments s->dct_count[intra], where intra = s->mb_intra.  For each group
 * of eight 16-bit coefficients read through `block` the asm:
 *   - takes the absolute value,
 *   - subtracts the matching `offset` words with unsigned saturation (so the
 *     magnitude never drops below zero), restores the sign and writes the
 *     result back to `block`,
 *   - zero-extends the original absolute values to 32 bits and adds them to
 *     the matching eight ints of `sum`.
 * `sum` and `offset` point into s->dct_error_sum[intra] and
 * s->dct_offset[intra].
 *
 * NOTE(review): this excerpt omits lines of the function (closing brace, the
 * asm wrapper, pointer increments, loop label/branch and any further
 * operands/clobbers), so only the statements shown are documented.
 */
static void denoise_dct_mmx(MpegEncContext *s, DCTELEM *block){
479
const int intra= s->mb_intra;
480
int *sum= s->dct_error_sum[intra];
481
uint16_t *offset= s->dct_offset[intra];
483
s->dct_count[intra]++;
486
"pxor %%mm7, %%mm7 \n\t" // mm7 = 0, used for zero-extension below
488
"pxor %%mm0, %%mm0 \n\t"
489
"pxor %%mm1, %%mm1 \n\t"
490
"movq (%0), %%mm2 \n\t"
491
"movq 8(%0), %%mm3 \n\t"
492
"pcmpgtw %%mm2, %%mm0 \n\t" // block[i] < 0 ? -1 : 0
493
"pcmpgtw %%mm3, %%mm1 \n\t"
494
"pxor %%mm0, %%mm2 \n\t"
495
"pxor %%mm1, %%mm3 \n\t"
496
"psubw %%mm0, %%mm2 \n\t" // abs(block[i])
497
"psubw %%mm1, %%mm3 \n\t"
498
"movq %%mm2, %%mm4 \n\t" // keep a copy of abs(block[i]) for the sums
499
"movq %%mm3, %%mm5 \n\t"
500
"psubusw (%2), %%mm2 \n\t" // saturating abs(block[i]) - offset[i]
501
"psubusw 8(%2), %%mm3 \n\t"
502
"pxor %%mm0, %%mm2 \n\t" // xor + subtract of the sign mask restores the sign
503
"pxor %%mm1, %%mm3 \n\t"
504
"psubw %%mm0, %%mm2 \n\t"
505
"psubw %%mm1, %%mm3 \n\t"
506
"movq %%mm2, (%0) \n\t"
507
"movq %%mm3, 8(%0) \n\t"
508
"movq %%mm4, %%mm2 \n\t"
509
"movq %%mm5, %%mm3 \n\t"
510
"punpcklwd %%mm7, %%mm4 \n\t" // zero-extend the absolute values to 32 bits
511
"punpckhwd %%mm7, %%mm2 \n\t"
512
"punpcklwd %%mm7, %%mm5 \n\t"
513
"punpckhwd %%mm7, %%mm3 \n\t"
514
"paddd (%1), %%mm4 \n\t" // sum[i] += abs(block[i])
515
"paddd 8(%1), %%mm2 \n\t"
516
"paddd 16(%1), %%mm5 \n\t"
517
"paddd 24(%1), %%mm3 \n\t"
518
"movq %%mm4, (%1) \n\t"
519
"movq %%mm2, 8(%1) \n\t"
520
"movq %%mm5, 16(%1) \n\t"
521
"movq %%mm3, 24(%1) \n\t"
527
// %0 = block, %1 = sum, %2 = offset; all three are read-write operands
: "+r" (block), "+r" (sum), "+r" (offset)
532
/**
 * DCT denoiser (SSE2).
 *
 * Increments s->dct_count[intra], where intra = s->mb_intra.  For each group
 * of sixteen 16-bit coefficients read through `block` the asm:
 *   - takes the absolute value,
 *   - subtracts the matching `offset` words with unsigned saturation,
 *     restores the sign and writes the result back to `block`,
 *   - zero-extends the original absolute values to 32 bits and adds them to
 *     the matching sixteen ints of `sum`.
 * All explicit loads and stores use movdqa, and psubusw/paddd take memory
 * operands, so `block`, `sum` and `offset` must be 16-byte aligned.
 *
 * NOTE(review): this excerpt omits lines of the function (closing brace, the
 * asm wrapper, pointer increments, loop label/branch and any further
 * operands/clobbers), so only the statements shown are documented.
 */
static void denoise_dct_sse2(MpegEncContext *s, DCTELEM *block){
533
const int intra= s->mb_intra;
534
int *sum= s->dct_error_sum[intra];
535
uint16_t *offset= s->dct_offset[intra];
537
s->dct_count[intra]++;
540
"pxor %%xmm7, %%xmm7 \n\t" // xmm7 = 0, used for zero-extension below
542
"pxor %%xmm0, %%xmm0 \n\t"
543
"pxor %%xmm1, %%xmm1 \n\t"
544
"movdqa (%0), %%xmm2 \n\t"
545
"movdqa 16(%0), %%xmm3 \n\t"
546
"pcmpgtw %%xmm2, %%xmm0 \n\t" // block[i] < 0 ? -1 : 0
547
"pcmpgtw %%xmm3, %%xmm1 \n\t"
548
"pxor %%xmm0, %%xmm2 \n\t"
549
"pxor %%xmm1, %%xmm3 \n\t"
550
"psubw %%xmm0, %%xmm2 \n\t" // abs(block[i])
551
"psubw %%xmm1, %%xmm3 \n\t"
552
"movdqa %%xmm2, %%xmm4 \n\t" // keep a copy of abs(block[i]) for the sums
553
"movdqa %%xmm3, %%xmm5 \n\t"
554
"psubusw (%2), %%xmm2 \n\t" // saturating abs(block[i]) - offset[i]
555
"psubusw 16(%2), %%xmm3 \n\t"
556
"pxor %%xmm0, %%xmm2 \n\t" // xor + subtract of the sign mask restores the sign
557
"pxor %%xmm1, %%xmm3 \n\t"
558
"psubw %%xmm0, %%xmm2 \n\t"
559
"psubw %%xmm1, %%xmm3 \n\t"
560
"movdqa %%xmm2, (%0) \n\t"
561
"movdqa %%xmm3, 16(%0) \n\t"
562
"movdqa %%xmm4, %%xmm6 \n\t"
563
"movdqa %%xmm5, %%xmm0 \n\t" // the sign mask in xmm0 is no longer needed
564
"punpcklwd %%xmm7, %%xmm4 \n\t" // zero-extend the absolute values to 32 bits
565
"punpckhwd %%xmm7, %%xmm6 \n\t"
566
"punpcklwd %%xmm7, %%xmm5 \n\t"
567
"punpckhwd %%xmm7, %%xmm0 \n\t"
568
"paddd (%1), %%xmm4 \n\t" // sum[i] += abs(block[i])
569
"paddd 16(%1), %%xmm6 \n\t"
570
"paddd 32(%1), %%xmm5 \n\t"
571
"paddd 48(%1), %%xmm0 \n\t"
572
"movdqa %%xmm4, (%1) \n\t"
573
"movdqa %%xmm6, 16(%1) \n\t"
574
"movdqa %%xmm5, 32(%1) \n\t"
575
"movdqa %%xmm0, 48(%1) \n\t"
581
// %0 = block, %1 = sum, %2 = offset; all three are read-write operands
: "+r" (block), "+r" (sum), "+r" (offset)
587
/* mpegvideo_mmx_template.c is included four times below, each time with a
 * different pair of name-suffix macros: RENAME(a) appends an upper-case
 * suffix (_MMX, _MMX2, _SSE2, _SSSE3) and RENAMEl(a) a lower-case one.
 * Presumably each inclusion defines the dct_quantize_<SUFFIX> function that
 * MPV_common_init_mmx installs -- TODO confirm against the template.
 *
 * NOTE(review): the #undef / #if lines that would normally separate these
 * redefinitions are not visible in this excerpt. */
#define HAVE_SSSE3_BAK
593
#define RENAME(a) a ## _MMX
594
#define RENAMEl(a) a ## _mmx
595
#include "mpegvideo_mmx_template.c"
600
#define RENAME(a) a ## _MMX2
601
#define RENAMEl(a) a ## _mmx2
602
#include "mpegvideo_mmx_template.c"
607
#define RENAME(a) a ## _SSE2
608
#define RENAMEl(a) a ## _sse2
609
#include "mpegvideo_mmx_template.c"
611
#ifdef HAVE_SSSE3_BAK
615
#define RENAME(a) a ## _SSSE3
616
// the SSSE3 instantiation uses the _sse2 lower-case suffix
#define RENAMEl(a) a ## _sse2
617
#include "mpegvideo_mmx_template.c"
620
/**
 * Install the x86 SIMD implementations into the function pointers of `s`.
 *
 * Driven by the CPU capability bits in mm_flags:
 *   - with FF_MM_MMX the H.263 and MPEG-1 dequantizers and the MPEG-2 inter
 *     dequantizer are set to their _mmx versions; the MPEG-2 intra one only
 *     when CODEC_FLAG_BITEXACT is not set in s->flags;
 *   - s->denoise_dct is set to the SSE2 or the MMX version;
 *   - when s->avctx->dct_algo is FF_DCT_AUTO or FF_DCT_MMX, s->dct_quantize
 *     is set to the SSSE3, SSE2, MMX2 or MMX variant.
 *
 * NOTE(review): braces, `else` lines and any preprocessor guards are missing
 * from this excerpt, so the exact nesting of the branches below cannot be
 * confirmed from here.
 */
void MPV_common_init_mmx(MpegEncContext *s)
622
if (mm_flags & FF_MM_MMX) {
623
const int dct_algo = s->avctx->dct_algo;
625
s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_mmx;
626
s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_mmx;
627
s->dct_unquantize_mpeg1_intra = dct_unquantize_mpeg1_intra_mmx;
628
s->dct_unquantize_mpeg1_inter = dct_unquantize_mpeg1_inter_mmx;
629
// presumably skipped in bit-exact mode because the MMX version does not
// match the reference output exactly -- TODO confirm
if(!(s->flags & CODEC_FLAG_BITEXACT))
630
s->dct_unquantize_mpeg2_intra = dct_unquantize_mpeg2_intra_mmx;
631
s->dct_unquantize_mpeg2_inter = dct_unquantize_mpeg2_inter_mmx;
633
if (mm_flags & FF_MM_SSE2) {
634
s->denoise_dct= denoise_dct_sse2;
636
s->denoise_dct= denoise_dct_mmx;
639
// the quantizer is replaced only if the user left the DCT choice on auto or asked for MMX
if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){
641
if(mm_flags & FF_MM_SSSE3){
642
s->dct_quantize= dct_quantize_SSSE3;
645
if(mm_flags & FF_MM_SSE2){
646
s->dct_quantize= dct_quantize_SSE2;
647
} else if(mm_flags & FF_MM_MMXEXT){
648
s->dct_quantize= dct_quantize_MMX2;
650
s->dct_quantize= dct_quantize_MMX;