2
* Copyright (C) 2001-2002 Michael Niedermayer (michaelni@gmx.at)
4
* This file is part of Libav.
6
* Libav is free software; you can redistribute it and/or modify
7
* it under the terms of the GNU General Public License as published by
8
* the Free Software Foundation; either version 2 of the License, or
9
* (at your option) any later version.
11
* Libav is distributed in the hope that it will be useful,
12
* but WITHOUT ANY WARRANTY; without even the implied warranty of
13
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
* GNU General Public License for more details.
16
* You should have received a copy of the GNU General Public License
17
* along with Libav; if not, write to the Free Software
18
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23
* mmx/mmx2/3dnow postprocess code.
26
#include "libavutil/x86_cpu.h"
28
// Immediate operand for an `and` instruction; "$-8" clears the low three
// bits. Further down it is applied to an address derived from the stack
// pointer to align a scratch area.
#define ALIGN_MASK "$-8"
36
// Packed unsigned byte average of a and b, result in b.
// NOTE(review): two definitions are visible (pavgb and pavgusb); presumably
// each belongs to a different CPU-feature conditional whose #if/#elif lines
// are not visible in this listing -- confirm against the full file.
#define REAL_PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
38
#define REAL_PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
40
// Extra level of indirection: the arguments are macro-expanded here before
// REAL_PAVGB stringifies them (callers pass operands such as (%%REGa)).
#define PAVGB(a,b) REAL_PAVGB(a,b)
43
// Packed unsigned byte minimum, result in the second argument.
// t is a scratch register; this single-instruction form does not use it.
#define PMINUB(a,b,t) "pminub " #a ", " #b " \n\t"
45
// Three-instruction form of the same operation (note the swapped parameter
// names: the second argument is still the destination):
// t = a; t = saturate(t - b); a = a - t, i.e. a = min(a, b).
// NOTE(review): presumably the alternative for CPUs without pminub; the
// selecting conditional is not visible here.
#define PMINUB(b,a,t) \
46
"movq " #a ", " #t " \n\t"\
47
"psubusb " #b ", " #t " \n\t"\
48
"psubb " #t ", " #a " \n\t"
52
// Packed unsigned byte maximum, result in the second argument.
#define PMAXUB(a,b) "pmaxub " #a ", " #b " \n\t"
55
// Body of a two-instruction PMAXUB form: b = saturate(b - a) + a, i.e.
// b = max(a, b). Its #define line is not visible in this listing.
"psubusb " #a ", " #b " \n\t"\
56
"paddb " #a ", " #b " \n\t"
59
//FIXME? |255-0| = 1 (should not be a problem ...)
62
* Check if the middle 8x8 Block in the given 8x16 block is flat
64
// Classifies the 8x8 block that starts four lines below src.
// src: pointer to the top of the surrounding 8x16 area; stride: distance in
// bytes between lines; c: context supplying mmxDcOffset, mmxDcThreshold,
// nonBQP, pQPb and ppMode.flatnessThreshold.
// The asm produces two values: numEq (accumulated compare results of
// vertically adjacent lines) and dcOk (derived from a difference checked
// against 2*QP). The visible tail compares numEq with the flatness
// threshold; the return statements are not visible in this listing.
// NOTE(review): the stray numeric lines and the missing lines (declarations,
// asm openers, closing braces) look like extraction damage.
static inline int RENAME(vertClassify)(uint8_t src[], int stride, PPContext *c){
66
src+= stride*4; // src points to begin of the 8x8 Block
70
// Operand list of a first asm statement (no outputs, two memory inputs
// selected by c->nonBQP). Its instructions are not visible here; presumably
// it loads mm7/mm6, which the code below uses without setting them.
: : "m" (c->mmxDcOffset[c->nonBQP]), "m" (c->mmxDcThreshold[c->nonBQP])
74
// REG_a = src + stride (line 1); %2 is src and %3 is stride, see operands.
"lea (%2, %3), %%"REG_a" \n\t"
75
// 0 1 2 3 4 5 6 7 8 9
76
// %1 eax eax+%2 eax+2%2 %1+4%2 ecx ecx+%2 ecx+2%2 %1+8%2 ecx+4%2
78
// mm3 and mm4 start as copies of line 0; mm3 becomes the running byte-wise
// minimum over the lines visited below.
"movq (%2), %%mm0 \n\t"
79
"movq (%%"REG_a"), %%mm1 \n\t"
80
"movq %%mm0, %%mm3 \n\t"
81
"movq %%mm0, %%mm4 \n\t"
83
// Repeated pattern for each pair of adjacent lines: update the minimum,
// take the byte difference, add the offset in mm7, compare (signed) against
// mm6, and accumulate the 0x00/0xFF compare masks in mm0.
PMINUB(%%mm1, %%mm3, %%mm5)
84
"psubb %%mm1, %%mm0 \n\t" // mm0 = difference
85
"paddb %%mm7, %%mm0 \n\t"
86
"pcmpgtb %%mm6, %%mm0 \n\t"
88
"movq (%%"REG_a",%3), %%mm2 \n\t"
90
PMINUB(%%mm2, %%mm3, %%mm5)
91
"psubb %%mm2, %%mm1 \n\t"
92
"paddb %%mm7, %%mm1 \n\t"
93
"pcmpgtb %%mm6, %%mm1 \n\t"
94
"paddb %%mm1, %%mm0 \n\t"
96
"movq (%%"REG_a", %3, 2), %%mm1 \n\t"
98
PMINUB(%%mm1, %%mm3, %%mm5)
99
"psubb %%mm1, %%mm2 \n\t"
100
"paddb %%mm7, %%mm2 \n\t"
101
"pcmpgtb %%mm6, %%mm2 \n\t"
102
"paddb %%mm2, %%mm0 \n\t"
104
// Advance REG_a by four lines so the same addressing forms reach lines 5..7.
"lea (%%"REG_a", %3, 4), %%"REG_a" \n\t"
106
"movq (%2, %3, 4), %%mm2 \n\t"
108
PMINUB(%%mm2, %%mm3, %%mm5)
109
"psubb %%mm2, %%mm1 \n\t"
110
"paddb %%mm7, %%mm1 \n\t"
111
"pcmpgtb %%mm6, %%mm1 \n\t"
112
"paddb %%mm1, %%mm0 \n\t"
114
"movq (%%"REG_a"), %%mm1 \n\t"
116
PMINUB(%%mm1, %%mm3, %%mm5)
117
"psubb %%mm1, %%mm2 \n\t"
118
"paddb %%mm7, %%mm2 \n\t"
119
"pcmpgtb %%mm6, %%mm2 \n\t"
120
"paddb %%mm2, %%mm0 \n\t"
122
"movq (%%"REG_a", %3), %%mm2 \n\t"
124
PMINUB(%%mm2, %%mm3, %%mm5)
125
"psubb %%mm2, %%mm1 \n\t"
126
"paddb %%mm7, %%mm1 \n\t"
127
"pcmpgtb %%mm6, %%mm1 \n\t"
128
"paddb %%mm1, %%mm0 \n\t"
130
"movq (%%"REG_a", %3, 2), %%mm1 \n\t"
132
PMINUB(%%mm1, %%mm3, %%mm5)
133
"psubb %%mm1, %%mm2 \n\t"
134
"paddb %%mm7, %%mm2 \n\t"
135
"pcmpgtb %%mm6, %%mm2 \n\t"
136
"paddb %%mm2, %%mm0 \n\t"
137
// mm4 = saturate(mm4 - running minimum).
// NOTE(review): no visible line updates mm4 after the copy of line 0; if the
// full file keeps a running maximum in mm4 this is max - min -- confirm.
"psubusb %%mm3, %%mm4 \n\t"
141
// Horizontal sum of the eight accumulator bytes in mm0. Both a psadbw
// against zero and a shift/add reduction are visible; presumably they are
// alternatives under a conditional that is not visible here.
"pxor %%mm7, %%mm7 \n\t"
142
"psadbw %%mm7, %%mm0 \n\t"
144
"movq %%mm0, %%mm1 \n\t"
145
"psrlw $8, %%mm0 \n\t"
146
"paddb %%mm1, %%mm0 \n\t"
147
"movq %%mm0, %%mm1 \n\t"
148
"psrlq $16, %%mm0 \n\t"
149
"paddb %%mm1, %%mm0 \n\t"
150
"movq %%mm0, %%mm1 \n\t"
151
"psrlq $32, %%mm0 \n\t"
152
"paddb %%mm1, %%mm0 \n\t"
154
// Threshold the difference in mm4 against 2*QP; any byte that remains
// non-zero exceeds the threshold.
"movq %4, %%mm7 \n\t" // QP,..., QP
155
"paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP
156
"psubusb %%mm7, %%mm4 \n\t" // Diff <= 2QP -> 0
157
"packssdw %%mm4, %%mm4 \n\t"
158
"movd %%mm0, %0 \n\t"
159
"movd %%mm4, %1 \n\t"
161
// Outputs: %0 = numEq, %1 = dcOk. Inputs: %2 = src, %3 = stride, %4 = QP bytes.
: "=r" (numEq), "=r" (dcOk)
162
: "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb)
166
// Each successful pcmpgtb contributed 0xFF (-1) per byte, so negating the
// sum and keeping the low 8 bits gives the number of hits.
numEq= (-numEq) &0xFF;
167
if(numEq > c->ppMode.flatnessThreshold){
177
* Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle)
178
* using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16
181
// Vertical low-pass filter over an 8-pixel-wide column of lines starting at
// src; lines 1..8 are rewritten in place (see the stores below).
// src: top line of the area; stride: distance in bytes between lines;
// c: context supplying the QP threshold (pQPb for the asm path, QP for C).
// Two implementations are visible: an MMX2/3DNow! asm path that builds each
// output from a tree of PAVGB averages, and a C path using sliding sums.
// NOTE(review): the stray numeric lines and the missing braces/declarations
// look like extraction damage.
static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, PPContext *c)
183
#if HAVE_MMX2 || HAVE_AMD3DNOW
185
__asm__ volatile( //"movv %0 %1 %2\n\t"
186
"movq %2, %%mm0 \n\t" // QP,..., QP
187
"pxor %%mm4, %%mm4 \n\t"
189
// Top edge: mm6 = line 0 where |line0 - line1| <= QP, otherwise line 1.
"movq (%0), %%mm6 \n\t"
190
"movq (%0, %1), %%mm5 \n\t"
191
"movq %%mm5, %%mm1 \n\t"
192
"movq %%mm6, %%mm2 \n\t"
193
"psubusb %%mm6, %%mm5 \n\t"
194
"psubusb %%mm1, %%mm2 \n\t"
195
"por %%mm5, %%mm2 \n\t" // ABS Diff of lines
196
"psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0
197
"pcmpeqb %%mm4, %%mm2 \n\t" // diff <= QP -> FF
199
"pand %%mm2, %%mm6 \n\t"
200
"pandn %%mm1, %%mm2 \n\t"
201
"por %%mm2, %%mm6 \n\t"// First Line to Filter
203
// Bottom edge: mm5 is loaded from line 8 before %0 is advanced by one line,
// mm7 from line 9 afterwards. REG_a = line 4, REG_c = line 7 (old %0 based).
"movq (%0, %1, 8), %%mm5 \n\t"
204
"lea (%0, %1, 4), %%"REG_a" \n\t"
205
"lea (%0, %1, 8), %%"REG_c" \n\t"
206
"sub %1, %%"REG_c" \n\t"
207
"add %1, %0 \n\t" // %0 points to line 1 not 0
208
"movq (%0, %1, 8), %%mm7 \n\t"
209
"movq %%mm5, %%mm1 \n\t"
210
"movq %%mm7, %%mm2 \n\t"
211
"psubusb %%mm7, %%mm5 \n\t"
212
"psubusb %%mm1, %%mm2 \n\t"
213
"por %%mm5, %%mm2 \n\t" // ABS Diff of lines
214
"psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0
215
"pcmpeqb %%mm4, %%mm2 \n\t" // diff <= QP -> FF
217
"pand %%mm2, %%mm7 \n\t"
218
"pandn %%mm1, %%mm2 \n\t"
219
"por %%mm2, %%mm7 \n\t" // Last Line to Filter (line 9 if diff <= QP, else line 8)
223
// %0 %0+%1 %0+2%1 eax %0+4%1 eax+2%1 ecx eax+4%1
228
// Filter body: every output line is a chain of PAVGB averages; the trailing
// comments give the resulting tap weights and divisor. Each output is stored
// ("X") only after the original value of that line has been loaded.
"movq (%0, %1), %%mm0 \n\t" // 1
229
"movq %%mm0, %%mm1 \n\t" // 1
230
PAVGB(%%mm6, %%mm0) //1 1 /2
231
PAVGB(%%mm6, %%mm0) //3 1 /4
233
"movq (%0, %1, 4), %%mm2 \n\t" // 1
234
"movq %%mm2, %%mm5 \n\t" // 1
235
PAVGB((%%REGa), %%mm2) // 11 /2
236
PAVGB((%0, %1, 2), %%mm2) // 211 /4
237
"movq %%mm2, %%mm3 \n\t" // 211 /4
238
"movq (%0), %%mm4 \n\t" // 1
239
PAVGB(%%mm4, %%mm3) // 4 211 /8
240
PAVGB(%%mm0, %%mm3) //642211 /16
241
"movq %%mm3, (%0) \n\t" // X
242
// mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9
243
"movq %%mm1, %%mm0 \n\t" // 1
244
PAVGB(%%mm6, %%mm0) //1 1 /2
245
"movq %%mm4, %%mm3 \n\t" // 1
246
PAVGB((%0,%1,2), %%mm3) // 1 1 /2
247
PAVGB((%%REGa,%1,2), %%mm5) // 11 /2
248
PAVGB((%%REGa), %%mm5) // 211 /4
249
PAVGB(%%mm5, %%mm3) // 2 2211 /8
250
PAVGB(%%mm0, %%mm3) //4242211 /16
251
"movq %%mm3, (%0,%1) \n\t" // X
252
// mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9
253
PAVGB(%%mm4, %%mm6) //11 /2
254
"movq (%%"REG_c"), %%mm0 \n\t" // 1
255
PAVGB((%%REGa, %1, 2), %%mm0) // 11/2
256
"movq %%mm0, %%mm3 \n\t" // 11/2
257
PAVGB(%%mm1, %%mm0) // 2 11/4
258
PAVGB(%%mm6, %%mm0) //222 11/8
259
PAVGB(%%mm2, %%mm0) //22242211/16
260
"movq (%0, %1, 2), %%mm2 \n\t" // 1
261
"movq %%mm0, (%0, %1, 2) \n\t" // X
262
// mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9
263
"movq (%%"REG_a", %1, 4), %%mm0 \n\t" // 1
264
PAVGB((%%REGc), %%mm0) // 11 /2
265
PAVGB(%%mm0, %%mm6) //11 11 /4
266
PAVGB(%%mm1, %%mm4) // 11 /2
267
PAVGB(%%mm2, %%mm1) // 11 /2
268
PAVGB(%%mm1, %%mm6) //1122 11 /8
269
PAVGB(%%mm5, %%mm6) //112242211 /16
270
"movq (%%"REG_a"), %%mm5 \n\t" // 1
271
"movq %%mm6, (%%"REG_a") \n\t" // X
272
// mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9
273
"movq (%%"REG_a", %1, 4), %%mm6 \n\t" // 1
274
PAVGB(%%mm7, %%mm6) // 11 /2
275
PAVGB(%%mm4, %%mm6) // 11 11 /4
276
PAVGB(%%mm3, %%mm6) // 11 2211 /8
277
PAVGB(%%mm5, %%mm2) // 11 /2
278
"movq (%0, %1, 4), %%mm4 \n\t" // 1
279
PAVGB(%%mm4, %%mm2) // 112 /4
280
PAVGB(%%mm2, %%mm6) // 112242211 /16
281
"movq %%mm6, (%0, %1, 4) \n\t" // X
282
// mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9
283
PAVGB(%%mm7, %%mm1) // 11 2 /4
284
PAVGB(%%mm4, %%mm5) // 11 /2
285
PAVGB(%%mm5, %%mm0) // 11 11 /4
286
"movq (%%"REG_a", %1, 2), %%mm6 \n\t" // 1
287
PAVGB(%%mm6, %%mm1) // 11 4 2 /8
288
PAVGB(%%mm0, %%mm1) // 11224222 /16
289
"movq %%mm1, (%%"REG_a", %1, 2) \n\t" // X
290
// mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9
291
PAVGB((%%REGc), %%mm2) // 112 4 /8
292
"movq (%%"REG_a", %1, 4), %%mm0 \n\t" // 1
293
PAVGB(%%mm0, %%mm6) // 1 1 /2
294
PAVGB(%%mm7, %%mm6) // 1 12 /4
295
PAVGB(%%mm2, %%mm6) // 1122424 /4
296
"movq %%mm6, (%%"REG_c") \n\t" // X
297
// mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9
298
PAVGB(%%mm7, %%mm5) // 11 2 /4
299
PAVGB(%%mm7, %%mm5) // 11 6 /8
301
PAVGB(%%mm3, %%mm0) // 112 /4
302
PAVGB(%%mm0, %%mm5) // 112246 /16
303
"movq %%mm5, (%%"REG_a", %1, 4) \n\t" // X
307
// Inputs: %0 = src, %1 = stride, %2 = QP bytes. The output/clobber lines of
// this asm statement are not visible in this listing.
: "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb)
310
#else //HAVE_MMX2 || HAVE_AMD3DNOW
311
// C path. l1..l9 are the offsets of lines 1..9 relative to src.
const int l1= stride;
312
const int l2= stride + l1;
313
const int l3= stride + l2;
314
const int l4= stride + l3;
315
const int l5= stride + l4;
316
const int l6= stride + l5;
317
const int l7= stride + l6;
318
const int l8= stride + l7;
319
const int l9= stride + l8;
322
// One column per iteration; the statement advancing src is not visible here.
for(x=0; x<BLOCK_SIZE; x++){
323
// Edge samples: use the outer line only when it is within QP of its
// inner neighbour, otherwise repeat the inner line.
const int first= FFABS(src[0] - src[l1]) < c->QP ? src[0] : src[l1];
324
const int last= FFABS(src[l8] - src[l9]) < c->QP ? src[l9] : src[l8];
327
// Sliding sums: each sums[i] is derived from sums[i-1] by dropping one sample
// and adding the next. The +4 in sums[0] propagates to every sum, so each
// pair added below carries +8, the rounding term for the >>4.
sums[0] = 4*first + src[l1] + src[l2] + src[l3] + 4;
328
sums[1] = sums[0] - first + src[l4];
329
sums[2] = sums[1] - first + src[l5];
330
sums[3] = sums[2] - first + src[l6];
331
sums[4] = sums[3] - first + src[l7];
332
sums[5] = sums[4] - src[l1] + src[l8];
333
sums[6] = sums[5] - src[l2] + last;
334
sums[7] = sums[6] - src[l3] + last;
335
sums[8] = sums[7] - src[l4] + last;
336
sums[9] = sums[8] - src[l5] + last;
338
// Output line k = (sums[k-1] + sums[k+1] + 2*line k) / 16.
src[l1]= (sums[0] + sums[2] + 2*src[l1])>>4;
339
src[l2]= (sums[1] + sums[3] + 2*src[l2])>>4;
340
src[l3]= (sums[2] + sums[4] + 2*src[l3])>>4;
341
src[l4]= (sums[3] + sums[5] + 2*src[l4])>>4;
342
src[l5]= (sums[4] + sums[6] + 2*src[l5])>>4;
343
src[l6]= (sums[5] + sums[7] + 2*src[l6])>>4;
344
src[l7]= (sums[6] + sums[8] + 2*src[l7])>>4;
345
src[l8]= (sums[7] + sums[9] + 2*src[l8])>>4;
349
#endif //HAVE_MMX2 || HAVE_AMD3DNOW
351
#endif //HAVE_ALTIVEC
354
* Experimental Filter 1
355
* will not damage linear gradients
356
* Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
357
* can only smooth blocks at the expected locations (it cannot smooth them if they did move)
358
* MMX2 version does correct clipping; the C version does not
360
// Vertical deblocking filter working on lines 2..7 below src.
// src: top line of the area; stride: distance in bytes between lines;
// co: context supplying the QP bytes (pQPb) for the asm path.
// The asm path computes a correction d from |l4 - l5| and the neighbouring
// line differences, zeroes it where it exceeds the threshold, and applies
// 3d/8, d/4 and d/8 to the line pairs (4,5), (3,6) and (2,7) respectively.
// NOTE(review): the stray numeric lines and the missing braces/declarations
// look like extraction damage; most of the C path is not visible.
static inline void RENAME(vertX1Filter)(uint8_t *src, int stride, PPContext *co)
362
#if HAVE_MMX2 || HAVE_AMD3DNOW
366
// REG_a = src + stride (line 1), REG_c = REG_a + 4*stride (line 5).
"pxor %%mm7, %%mm7 \n\t" // 0
367
"lea (%0, %1), %%"REG_a" \n\t"
368
"lea (%%"REG_a", %1, 4), %%"REG_c" \n\t"
369
// 0 1 2 3 4 5 6 7 8 9
370
// %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1
371
// Absolute differences are formed as the OR of the two saturating
// subtractions (one of them is always zero).
"movq (%%"REG_a", %1, 2), %%mm0 \n\t" // line 3
372
"movq (%0, %1, 4), %%mm1 \n\t" // line 4
373
"movq %%mm1, %%mm2 \n\t" // line 4
374
"psubusb %%mm0, %%mm1 \n\t"
375
"psubusb %%mm2, %%mm0 \n\t"
376
"por %%mm1, %%mm0 \n\t" // |l2 - l3|
377
"movq (%%"REG_c"), %%mm3 \n\t" // line 5
378
"movq (%%"REG_c", %1), %%mm4 \n\t" // line 6
379
"movq %%mm3, %%mm5 \n\t" // line 5
380
"psubusb %%mm4, %%mm3 \n\t"
381
"psubusb %%mm5, %%mm4 \n\t"
382
"por %%mm4, %%mm3 \n\t" // |l5 - l6|
383
PAVGB(%%mm3, %%mm0) // (|l2 - l3| + |l5 - l6|)/2
384
"movq %%mm2, %%mm1 \n\t" // line 4
385
"psubusb %%mm5, %%mm2 \n\t"
386
"movq %%mm2, %%mm4 \n\t"
387
// mm2 becomes the per-byte direction mask reused by every store below.
"pcmpeqb %%mm7, %%mm2 \n\t" // (l4 - l5) <= 0 ? -1 : 0
388
"psubusb %%mm1, %%mm5 \n\t"
389
"por %%mm5, %%mm4 \n\t" // |l4 - l5|
390
"psubusb %%mm0, %%mm4 \n\t" //d = MAX(0, |l4-l5| - (|l2-l3| + |l5-l6|)/2)
391
"movq %%mm4, %%mm3 \n\t" // d
392
// The QP bytes are doubled (saturating) before the comparison, so the mask
// below is set where d <= 2QP.
"movq %2, %%mm0 \n\t"
393
"paddusb %%mm0, %%mm0 \n\t"
394
"psubusb %%mm0, %%mm4 \n\t"
395
"pcmpeqb %%mm7, %%mm4 \n\t" // d <= 2QP ? -1 : 0
396
"psubusb "MANGLE(b01)", %%mm3 \n\t"
397
"pand %%mm4, %%mm3 \n\t" // d <= QP ? d : 0
399
PAVGB(%%mm7, %%mm3) // d/2
400
"movq %%mm3, %%mm1 \n\t" // d/2
401
PAVGB(%%mm7, %%mm3) // d/4
402
PAVGB(%%mm1, %%mm3) // 3*d/8
404
// Apply the correction. XOR with the mask complements the bytes in lanes
// where l4 <= l5, so one saturating subtract/add moves the two lines of
// a pair towards each other in both cases; the second XOR undoes the
// complement.
"movq (%0, %1, 4), %%mm0 \n\t" // line 4
405
"pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
406
"psubusb %%mm3, %%mm0 \n\t"
407
"pxor %%mm2, %%mm0 \n\t"
408
"movq %%mm0, (%0, %1, 4) \n\t" // line 4
410
"movq (%%"REG_c"), %%mm0 \n\t" // line 5
411
"pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
412
"paddusb %%mm3, %%mm0 \n\t"
413
"pxor %%mm2, %%mm0 \n\t"
414
"movq %%mm0, (%%"REG_c") \n\t" // line 5
416
PAVGB(%%mm7, %%mm1) // d/4
418
"movq (%%"REG_a", %1, 2), %%mm0 \n\t" // line 3
419
"pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
420
"psubusb %%mm1, %%mm0 \n\t"
421
"pxor %%mm2, %%mm0 \n\t"
422
"movq %%mm0, (%%"REG_a", %1, 2) \n\t" // line 3
424
"movq (%%"REG_c", %1), %%mm0 \n\t" // line 6
425
"pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
426
"paddusb %%mm1, %%mm0 \n\t"
427
"pxor %%mm2, %%mm0 \n\t"
428
"movq %%mm0, (%%"REG_c", %1) \n\t" // line 6
430
PAVGB(%%mm7, %%mm1) // d/8
432
"movq (%%"REG_a", %1), %%mm0 \n\t" // line 2
433
"pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2
434
"psubusb %%mm1, %%mm0 \n\t"
435
"pxor %%mm2, %%mm0 \n\t"
436
"movq %%mm0, (%%"REG_a", %1) \n\t" // line 2
438
"movq (%%"REG_c", %1, 2), %%mm0 \n\t" // line 7
439
"pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7
440
"paddusb %%mm1, %%mm0 \n\t"
441
"pxor %%mm2, %%mm0 \n\t"
442
"movq %%mm0, (%%"REG_c", %1, 2) \n\t" // line 7
445
// Inputs: %0 = src, %1 = stride, %2 = QP bytes.
: "r" (src), "r" ((x86_reg)stride), "m" (co->pQPb)
448
#else //HAVE_MMX2 || HAVE_AMD3DNOW
450
// C path. l1..l7 are the offsets of lines 1..7 relative to src.
const int l1= stride;
451
const int l2= stride + l1;
452
const int l3= stride + l2;
453
const int l4= stride + l3;
454
const int l5= stride + l4;
455
const int l6= stride + l5;
456
const int l7= stride + l6;
457
// const int l8= stride + l7;
458
// const int l9= stride + l8;
462
for(x=0; x<BLOCK_SIZE; x++){
463
// a, b, c: differences across the line pairs (3,4), (4,5) and (5,6).
int a= src[l3] - src[l4];
464
int b= src[l4] - src[l5];
465
int c= src[l5] - src[l6];
467
// d: how much the middle step exceeds the average of its neighbours.
int d= FFABS(b) - ((FFABS(a) + FFABS(c))>>1);
471
// v: d with the sign opposite to b. The lines that clamp d and apply v are
// not visible in this listing.
int v = d * FFSIGN(-b);
482
#endif //HAVE_MMX2 || HAVE_AMD3DNOW
486
// Default vertical deblocking filter for the edge between lines 4 and 5 of
// the area starting at src (8 pixels wide).
// src: top line of the area; stride: distance in bytes between lines;
// c: context supplying the QP threshold (pQPb for asm, QP for the C path).
// Per the C path at the end: middleEnergy = 5*(l5-l4) + 2*(l3-l6); when
// |middleEnergy| < 8*QP a correction d is derived from |middleEnergy| minus
// the smaller of |leftEnergy| and |rightEnergy|.
// Several variants are interleaved below: a tmp-buffer based variant, an
// "#if 0" byte-precision asm version, a second byte-precision asm version
// built on PAVGB, a 16-bit-precision asm version with a stack scratch area,
// and the C path.
// NOTE(review): the stray numeric lines and the missing lines (comment
// delimiters, #else/#endif, declarations, braces) look like extraction
// damage; which variant is active cannot be determined from this listing.
static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext *c)
488
#if HAVE_MMX2 || HAVE_AMD3DNOW
491
// Line offsets for the tmp-based variant: l4 and l5 are offsets from src
// into the tmp buffer, which receives copies of lines 7 and 8 below.
// NOTE(review): tmp is not declared in the visible lines; presumably this
// region is disabled or commented out in the full file -- confirm.
const int l1= stride;
492
const int l2= stride + l1;
493
const int l3= stride + l2;
494
const int l4= (int)tmp - (int)src - stride*3;
495
const int l5= (int)tmp - (int)src - stride*3 + 8;
496
const int l6= stride*3 + l3;
497
const int l7= stride + l6;
498
const int l8= stride + l7;
500
memcpy(tmp, src+stride*7, 8);
501
memcpy(tmp+8, src+stride*8, 8);
506
// Disabled byte-precision version: energies are approximated with PAVGB
// chains and absolute values via paired saturating subtractions.
#if 0 //slightly more accurate and slightly slower
507
"pxor %%mm7, %%mm7 \n\t" // 0
508
"lea (%0, %1), %%"REG_a" \n\t"
509
"lea (%%"REG_a", %1, 4), %%"REG_c" \n\t"
511
// %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ecx+%1 ecx+2%1
512
// %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1
515
"movq (%0, %1, 2), %%mm0 \n\t" // l2
516
"movq (%0), %%mm1 \n\t" // l0
517
"movq %%mm0, %%mm2 \n\t" // l2
518
PAVGB(%%mm7, %%mm0) // ~l2/2
519
PAVGB(%%mm1, %%mm0) // ~(l2 + 2l0)/4
520
PAVGB(%%mm2, %%mm0) // ~(5l2 + 2l0)/8
522
"movq (%%"REG_a"), %%mm1 \n\t" // l1
523
"movq (%%"REG_a", %1, 2), %%mm3 \n\t" // l3
524
"movq %%mm1, %%mm4 \n\t" // l1
525
PAVGB(%%mm7, %%mm1) // ~l1/2
526
PAVGB(%%mm3, %%mm1) // ~(l1 + 2l3)/4
527
PAVGB(%%mm4, %%mm1) // ~(5l1 + 2l3)/8
529
"movq %%mm0, %%mm4 \n\t" // ~(5l2 + 2l0)/8
530
"psubusb %%mm1, %%mm0 \n\t"
531
"psubusb %%mm4, %%mm1 \n\t"
532
"por %%mm0, %%mm1 \n\t" // ~|2l0 - 5l1 + 5l2 - 2l3|/8
533
// mm1= |lenergy|, mm2= l2, mm3= l3, mm7=0
535
"movq (%0, %1, 4), %%mm0 \n\t" // l4
536
"movq %%mm0, %%mm4 \n\t" // l4
537
PAVGB(%%mm7, %%mm0) // ~l4/2
538
PAVGB(%%mm2, %%mm0) // ~(l4 + 2l2)/4
539
PAVGB(%%mm4, %%mm0) // ~(5l4 + 2l2)/8
541
"movq (%%"REG_c"), %%mm2 \n\t" // l5
542
"movq %%mm3, %%mm5 \n\t" // l3
543
PAVGB(%%mm7, %%mm3) // ~l3/2
544
PAVGB(%%mm2, %%mm3) // ~(l3 + 2l5)/4
545
PAVGB(%%mm5, %%mm3) // ~(5l3 + 2l5)/8
547
"movq %%mm0, %%mm6 \n\t" // ~(5l4 + 2l2)/8
548
"psubusb %%mm3, %%mm0 \n\t"
549
"psubusb %%mm6, %%mm3 \n\t"
550
"por %%mm0, %%mm3 \n\t" // ~|2l2 - 5l3 + 5l4 - 2l5|/8
551
"pcmpeqb %%mm7, %%mm0 \n\t" // SIGN(2l2 - 5l3 + 5l4 - 2l5)
552
// mm0= SIGN(menergy), mm1= |lenergy|, mm2= l5, mm3= |menergy|, mm4=l4, mm5= l3, mm7=0
554
"movq (%%"REG_c", %1), %%mm6 \n\t" // l6
555
"movq %%mm6, %%mm5 \n\t" // l6
556
PAVGB(%%mm7, %%mm6) // ~l6/2
557
PAVGB(%%mm4, %%mm6) // ~(l6 + 2l4)/4
558
PAVGB(%%mm5, %%mm6) // ~(5l6 + 2l4)/8
560
"movq (%%"REG_c", %1, 2), %%mm5 \n\t" // l7
561
"movq %%mm2, %%mm4 \n\t" // l5
562
PAVGB(%%mm7, %%mm2) // ~l5/2
563
PAVGB(%%mm5, %%mm2) // ~(l5 + 2l7)/4
564
PAVGB(%%mm4, %%mm2) // ~(5l5 + 2l7)/8
566
"movq %%mm6, %%mm4 \n\t" // ~(5l6 + 2l4)/8
567
"psubusb %%mm2, %%mm6 \n\t"
568
"psubusb %%mm4, %%mm2 \n\t"
569
"por %%mm6, %%mm2 \n\t" // ~|2l4 - 5l5 + 5l6 - 2l7|/8
570
// mm0= SIGN(menergy), mm1= |lenergy|/8, mm2= |renergy|/8, mm3= |menergy|/8, mm7=0
573
PMINUB(%%mm2, %%mm1, %%mm4) // MIN(|lenergy|,|renergy|)/8
574
"movq %2, %%mm4 \n\t" // QP //FIXME QP+1 ?
575
"paddusb "MANGLE(b01)", %%mm4 \n\t"
576
"pcmpgtb %%mm3, %%mm4 \n\t" // |menergy|/8 < QP
577
"psubusb %%mm1, %%mm3 \n\t" // d=|menergy|/8-MIN(|lenergy|,|renergy|)/8
578
"pand %%mm4, %%mm3 \n\t"
580
"movq %%mm3, %%mm1 \n\t"
581
// "psubusb "MANGLE(b01)", %%mm3 \n\t"
584
"paddusb %%mm1, %%mm3 \n\t"
585
// "paddusb "MANGLE(b01)", %%mm3 \n\t"
587
"movq (%%"REG_a", %1, 2), %%mm6 \n\t" //l3
588
"movq (%0, %1, 4), %%mm5 \n\t" //l4
589
"movq (%0, %1, 4), %%mm4 \n\t" //l4
590
"psubusb %%mm6, %%mm5 \n\t"
591
"psubusb %%mm4, %%mm6 \n\t"
592
"por %%mm6, %%mm5 \n\t" // |l3-l4|
593
"pcmpeqb %%mm7, %%mm6 \n\t" // SIGN(l3-l4)
594
"pxor %%mm6, %%mm0 \n\t"
595
"pand %%mm0, %%mm3 \n\t"
596
PMINUB(%%mm5, %%mm3, %%mm0)
598
"psubusb "MANGLE(b01)", %%mm3 \n\t"
601
// Apply the correction to l3 and l4 using the XOR-with-sign-mask trick.
"movq (%%"REG_a", %1, 2), %%mm0 \n\t"
602
"movq (%0, %1, 4), %%mm2 \n\t"
603
"pxor %%mm6, %%mm0 \n\t"
604
"pxor %%mm6, %%mm2 \n\t"
605
"psubb %%mm3, %%mm0 \n\t"
606
"paddb %%mm3, %%mm2 \n\t"
607
"pxor %%mm6, %%mm0 \n\t"
608
"pxor %%mm6, %%mm2 \n\t"
609
"movq %%mm0, (%%"REG_a", %1, 2) \n\t"
610
"movq %%mm2, (%0, %1, 4) \n\t"
613
// Second byte-precision version. Signed differences are kept biased by 128:
// a line is complemented with the all-ones register mm6 (giving -x-1) and
// averaged with another line, yielding (a - b + 256)/2.
// NOTE(review): the directive separating this from the "#if 0" body above
// is not visible in this listing.
"lea (%0, %1), %%"REG_a" \n\t"
614
"pcmpeqb %%mm6, %%mm6 \n\t" // -1
616
// %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ecx+%1 ecx+2%1
617
// %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1
620
"movq (%%"REG_a", %1, 2), %%mm1 \n\t" // l3
621
"movq (%0, %1, 4), %%mm0 \n\t" // l4
622
"pxor %%mm6, %%mm1 \n\t" // -l3-1
623
PAVGB(%%mm1, %%mm0) // -q+128 = (l4-l3+256)/2
624
// mm1=-l3-1, mm0=128-q
626
"movq (%%"REG_a", %1, 4), %%mm2 \n\t" // l5
627
"movq (%%"REG_a", %1), %%mm3 \n\t" // l2
628
"pxor %%mm6, %%mm2 \n\t" // -l5-1
629
"movq %%mm2, %%mm5 \n\t" // -l5-1
630
"movq "MANGLE(b80)", %%mm4 \n\t" // 128
631
"lea (%%"REG_a", %1, 4), %%"REG_c" \n\t"
632
PAVGB(%%mm3, %%mm2) // (l2-l5+256)/2
633
PAVGB(%%mm0, %%mm4) // ~(l4-l3)/4 + 128
634
PAVGB(%%mm2, %%mm4) // ~(l2-l5)/4 +(l4-l3)/8 + 128
635
PAVGB(%%mm0, %%mm4) // ~(l2-l5)/8 +5(l4-l3)/16 + 128
636
// mm1=-l3-1, mm0=128-q, mm3=l2, mm4=menergy/16 + 128, mm5= -l5-1
638
"movq (%%"REG_a"), %%mm2 \n\t" // l1
639
"pxor %%mm6, %%mm2 \n\t" // -l1-1
640
PAVGB(%%mm3, %%mm2) // (l2-l1+256)/2
641
PAVGB((%0), %%mm1) // (l0-l3+256)/2
642
"movq "MANGLE(b80)", %%mm3 \n\t" // 128
643
PAVGB(%%mm2, %%mm3) // ~(l2-l1)/4 + 128
644
PAVGB(%%mm1, %%mm3) // ~(l0-l3)/4 +(l2-l1)/8 + 128
645
PAVGB(%%mm2, %%mm3) // ~(l0-l3)/8 +5(l2-l1)/16 + 128
646
// mm0=128-q, mm3=lenergy/16 + 128, mm4= menergy/16 + 128, mm5= -l5-1
648
PAVGB((%%REGc, %1), %%mm5) // (l6-l5+256)/2
649
"movq (%%"REG_c", %1, 2), %%mm1 \n\t" // l7
650
"pxor %%mm6, %%mm1 \n\t" // -l7-1
651
PAVGB((%0, %1, 4), %%mm1) // (l4-l7+256)/2
652
"movq "MANGLE(b80)", %%mm2 \n\t" // 128
653
PAVGB(%%mm5, %%mm2) // ~(l6-l5)/4 + 128
654
PAVGB(%%mm1, %%mm2) // ~(l4-l7)/4 +(l6-l5)/8 + 128
655
PAVGB(%%mm5, %%mm2) // ~(l4-l7)/8 +5(l6-l5)/16 + 128
656
// mm0=128-q, mm2=renergy/16 + 128, mm3=lenergy/16 + 128, mm4= menergy/16 + 128
658
// Absolute values in the biased domain: max(x, 0 - x) with unsigned compare.
"movq "MANGLE(b00)", %%mm1 \n\t" // 0
659
"movq "MANGLE(b00)", %%mm5 \n\t" // 0
660
"psubb %%mm2, %%mm1 \n\t" // 128 - renergy/16
661
"psubb %%mm3, %%mm5 \n\t" // 128 - lenergy/16
662
PMAXUB(%%mm1, %%mm2) // 128 + |renergy/16|
663
PMAXUB(%%mm5, %%mm3) // 128 + |lenergy/16|
664
PMINUB(%%mm2, %%mm3, %%mm1) // 128 + MIN(|lenergy|,|renergy|)/16
666
// mm0=128-q, mm3=128 + MIN(|lenergy|,|renergy|)/16, mm4= menergy/16 + 128
668
"movq "MANGLE(b00)", %%mm7 \n\t" // 0
669
"movq %2, %%mm2 \n\t" // QP
670
PAVGB(%%mm6, %%mm2) // 128 + QP/2
671
"psubb %%mm6, %%mm2 \n\t"
673
"movq %%mm4, %%mm1 \n\t"
674
"pcmpgtb %%mm7, %%mm1 \n\t" // SIGN(menergy)
675
"pxor %%mm1, %%mm4 \n\t"
676
"psubb %%mm1, %%mm4 \n\t" // 128 + |menergy|/16
677
"pcmpgtb %%mm4, %%mm2 \n\t" // |menergy|/16 < QP/2
678
"psubusb %%mm3, %%mm4 \n\t" //d=|menergy|/16 - MIN(|lenergy|,|renergy|)/16
679
// mm0=128-q, mm1= SIGN(menergy), mm2= |menergy|/16 < QP/2, mm4= d/16
681
"movq %%mm4, %%mm3 \n\t" // d
682
"psubusb "MANGLE(b01)", %%mm4 \n\t"
683
PAVGB(%%mm7, %%mm4) // d/32
684
PAVGB(%%mm7, %%mm4) // (d + 32)/64
685
"paddb %%mm3, %%mm4 \n\t" // 5d/64
686
"pand %%mm2, %%mm4 \n\t"
688
"movq "MANGLE(b80)", %%mm5 \n\t" // 128
689
"psubb %%mm0, %%mm5 \n\t" // q
690
"paddsb %%mm6, %%mm5 \n\t" // fix bad rounding
691
"pcmpgtb %%mm5, %%mm7 \n\t" // SIGN(q)
692
"pxor %%mm7, %%mm5 \n\t"
694
PMINUB(%%mm5, %%mm4, %%mm3) // MIN(|q|, 5d/64)
695
"pxor %%mm1, %%mm7 \n\t" // SIGN(d*q)
697
"pand %%mm7, %%mm4 \n\t"
698
"movq (%%"REG_a", %1, 2), %%mm0 \n\t"
699
"movq (%0, %1, 4), %%mm2 \n\t"
700
"pxor %%mm1, %%mm0 \n\t"
701
"pxor %%mm1, %%mm2 \n\t"
702
"paddb %%mm4, %%mm0 \n\t"
703
"psubb %%mm4, %%mm2 \n\t"
704
"pxor %%mm1, %%mm0 \n\t"
705
"pxor %%mm1, %%mm2 \n\t"
706
"movq %%mm0, (%%"REG_a", %1, 2) \n\t"
707
"movq %%mm2, (%0, %1, 4) \n\t"
710
// Inputs: %0 = src, %1 = stride, %2 = QP bytes.
: "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb)
718
// C code working through the l1..l8 offsets defined at the top of the
// function, followed by what looks like comparison/statistics code.
// NOTE(review): QP, tmp, num, sum, max and bias are not declared in the
// visible lines; presumably this is disabled debugging code -- confirm.
for(x=0; x<BLOCK_SIZE; x++){
719
const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
720
if(FFABS(middleEnergy)< 8*QP){
721
const int q=(src[l4] - src[l5])/2;
722
const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
723
const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
725
int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
729
d*= FFSIGN(-middleEnergy);
748
int d= src[x+y*stride] - tmp[x+(y-4)*8];
758
src[0] = src[7] = src[stride*7] = src[(stride+1)*7]=255;
762
if(num%1000000 == 0){
763
av_log(c, AV_LOG_INFO, " %d %d %d %d\n", num, sum, max, bias);
772
// 16-bit precision version: each 8-byte line is unpacked into a low and a
// high half of four words (Lx / Hx in the comments). REG_c points at an
// aligned scratch area below the stack pointer used for intermediates.
"pxor %%mm7, %%mm7 \n\t"
773
"lea -40(%%"REG_SP"), %%"REG_c" \n\t" // make space for 4 8-byte vars
774
"and "ALIGN_MASK", %%"REG_c" \n\t" // align
776
// %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 edx+%1 edx+2%1
777
// %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1
779
"movq (%0), %%mm0 \n\t"
780
"movq %%mm0, %%mm1 \n\t"
781
"punpcklbw %%mm7, %%mm0 \n\t" // low part of line 0
782
"punpckhbw %%mm7, %%mm1 \n\t" // high part of line 0
784
"movq (%0, %1), %%mm2 \n\t"
785
"lea (%0, %1, 2), %%"REG_a" \n\t"
786
"movq %%mm2, %%mm3 \n\t"
787
"punpcklbw %%mm7, %%mm2 \n\t" // low part of line 1
788
"punpckhbw %%mm7, %%mm3 \n\t" // high part of line 1
790
"movq (%%"REG_a"), %%mm4 \n\t"
791
"movq %%mm4, %%mm5 \n\t"
792
"punpcklbw %%mm7, %%mm4 \n\t" // low part of line 2
793
"punpckhbw %%mm7, %%mm5 \n\t" // high part of line 2
795
"paddw %%mm0, %%mm0 \n\t" // 2L0
796
"paddw %%mm1, %%mm1 \n\t" // 2H0
797
"psubw %%mm4, %%mm2 \n\t" // L1 - L2
798
"psubw %%mm5, %%mm3 \n\t" // H1 - H2
799
"psubw %%mm2, %%mm0 \n\t" // 2L0 - L1 + L2
800
"psubw %%mm3, %%mm1 \n\t" // 2H0 - H1 + H2
802
"psllw $2, %%mm2 \n\t" // 4L1 - 4L2
803
"psllw $2, %%mm3 \n\t" // 4H1 - 4H2
804
"psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2
805
"psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2
807
"movq (%%"REG_a", %1), %%mm2 \n\t"
808
"movq %%mm2, %%mm3 \n\t"
809
"punpcklbw %%mm7, %%mm2 \n\t" // L3
810
"punpckhbw %%mm7, %%mm3 \n\t" // H3
812
"psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - L3
813
"psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - H3
814
"psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
815
"psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
816
// Scratch slots 0 and 8: first energy term, reloaded further down.
"movq %%mm0, (%%"REG_c") \n\t" // 2L0 - 5L1 + 5L2 - 2L3
817
"movq %%mm1, 8(%%"REG_c") \n\t" // 2H0 - 5H1 + 5H2 - 2H3
819
"movq (%%"REG_a", %1, 2), %%mm0 \n\t"
820
"movq %%mm0, %%mm1 \n\t"
821
"punpcklbw %%mm7, %%mm0 \n\t" // L4
822
"punpckhbw %%mm7, %%mm1 \n\t" // H4
824
"psubw %%mm0, %%mm2 \n\t" // L3 - L4
825
"psubw %%mm1, %%mm3 \n\t" // H3 - H4
826
// Scratch slots 16 and 24: the L3 - L4 difference, used later as the clamp.
"movq %%mm2, 16(%%"REG_c") \n\t" // L3 - L4
827
"movq %%mm3, 24(%%"REG_c") \n\t" // H3 - H4
828
"paddw %%mm4, %%mm4 \n\t" // 2L2
829
"paddw %%mm5, %%mm5 \n\t" // 2H2
830
"psubw %%mm2, %%mm4 \n\t" // 2L2 - L3 + L4
831
"psubw %%mm3, %%mm5 \n\t" // 2H2 - H3 + H4
833
// %0 is overwritten here: it now points three lines below its initial value
// (REG_a is two lines below, plus one more line).
"lea (%%"REG_a", %1), %0 \n\t"
834
"psllw $2, %%mm2 \n\t" // 4L3 - 4L4
835
"psllw $2, %%mm3 \n\t" // 4H3 - 4H4
836
"psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4
837
"psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4
839
"movq (%0, %1, 2), %%mm2 \n\t"
840
"movq %%mm2, %%mm3 \n\t"
841
"punpcklbw %%mm7, %%mm2 \n\t" // L5
842
"punpckhbw %%mm7, %%mm3 \n\t" // H5
843
"psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - L5
844
"psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - H5
845
"psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - 2L5
846
"psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - 2H5
848
"movq (%%"REG_a", %1, 4), %%mm6 \n\t"
849
"punpcklbw %%mm7, %%mm6 \n\t" // L6
850
"psubw %%mm6, %%mm2 \n\t" // L5 - L6
851
"movq (%%"REG_a", %1, 4), %%mm6 \n\t"
852
"punpckhbw %%mm7, %%mm6 \n\t" // H6
853
"psubw %%mm6, %%mm3 \n\t" // H5 - H6
855
"paddw %%mm0, %%mm0 \n\t" // 2L4
856
"paddw %%mm1, %%mm1 \n\t" // 2H4
857
"psubw %%mm2, %%mm0 \n\t" // 2L4 - L5 + L6
858
"psubw %%mm3, %%mm1 \n\t" // 2H4 - H5 + H6
860
"psllw $2, %%mm2 \n\t" // 4L5 - 4L6
861
"psllw $2, %%mm3 \n\t" // 4H5 - 4H6
862
"psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6
863
"psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6
865
"movq (%0, %1, 4), %%mm2 \n\t"
866
"movq %%mm2, %%mm3 \n\t"
867
"punpcklbw %%mm7, %%mm2 \n\t" // L7
868
"punpckhbw %%mm7, %%mm3 \n\t" // H7
870
"paddw %%mm2, %%mm2 \n\t" // 2L7
871
"paddw %%mm3, %%mm3 \n\t" // 2H7
872
"psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 - 2L7
873
"psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7
875
"movq (%%"REG_c"), %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
876
"movq 8(%%"REG_c"), %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
879
// Absolute values of the two outer energy terms. Two forms are visible:
// max(x, 0 - x) via pmaxsw, and the sign-mask form (pcmpgtw/pxor/psubw);
// presumably alternatives under a conditional not visible here.
"movq %%mm7, %%mm6 \n\t" // 0
880
"psubw %%mm0, %%mm6 \n\t"
881
"pmaxsw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
882
"movq %%mm7, %%mm6 \n\t" // 0
883
"psubw %%mm1, %%mm6 \n\t"
884
"pmaxsw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
885
"movq %%mm7, %%mm6 \n\t" // 0
886
"psubw %%mm2, %%mm6 \n\t"
887
"pmaxsw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
888
"movq %%mm7, %%mm6 \n\t" // 0
889
"psubw %%mm3, %%mm6 \n\t"
890
"pmaxsw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
892
"movq %%mm7, %%mm6 \n\t" // 0
893
"pcmpgtw %%mm0, %%mm6 \n\t"
894
"pxor %%mm6, %%mm0 \n\t"
895
"psubw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
896
"movq %%mm7, %%mm6 \n\t" // 0
897
"pcmpgtw %%mm1, %%mm6 \n\t"
898
"pxor %%mm6, %%mm1 \n\t"
899
"psubw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
900
"movq %%mm7, %%mm6 \n\t" // 0
901
"pcmpgtw %%mm2, %%mm6 \n\t"
902
"pxor %%mm6, %%mm2 \n\t"
903
"psubw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
904
"movq %%mm7, %%mm6 \n\t" // 0
905
"pcmpgtw %%mm3, %%mm6 \n\t"
906
"pxor %%mm6, %%mm3 \n\t"
907
"psubw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
911
// mm0/mm1 = minimum of the two outer energies. Again two forms are visible:
// pminsw, and x - saturate(x - y).
"pminsw %%mm2, %%mm0 \n\t"
912
"pminsw %%mm3, %%mm1 \n\t"
914
"movq %%mm0, %%mm6 \n\t"
915
"psubusw %%mm2, %%mm6 \n\t"
916
"psubw %%mm6, %%mm0 \n\t"
917
"movq %%mm1, %%mm6 \n\t"
918
"psubusw %%mm3, %%mm6 \n\t"
919
"psubw %%mm6, %%mm1 \n\t"
922
"movd %2, %%mm2 \n\t" // QP
923
"punpcklbw %%mm7, %%mm2 \n\t"
925
// Sign masks of the middle energy stay in mm6 (low) and mm7 (high); mm7 is
// no longer zero after this point.
"movq %%mm7, %%mm6 \n\t" // 0
926
"pcmpgtw %%mm4, %%mm6 \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5)
927
"pxor %%mm6, %%mm4 \n\t"
928
"psubw %%mm6, %%mm4 \n\t" // |2L2 - 5L3 + 5L4 - 2L5|
929
"pcmpgtw %%mm5, %%mm7 \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5)
930
"pxor %%mm7, %%mm5 \n\t"
931
"psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5|
933
// Keep the middle energy only where it is below 8*QP.
"psllw $3, %%mm2 \n\t" // 8QP
934
"movq %%mm2, %%mm3 \n\t" // 8QP
935
"pcmpgtw %%mm4, %%mm2 \n\t"
936
"pcmpgtw %%mm5, %%mm3 \n\t"
937
"pand %%mm2, %%mm4 \n\t"
938
"pand %%mm3, %%mm5 \n\t"
941
"psubusw %%mm0, %%mm4 \n\t" // hd
942
"psubusw %%mm1, %%mm5 \n\t" // ld
945
// d = (5*d + 32) >> 6
"movq "MANGLE(w05)", %%mm2 \n\t" // 5
946
"pmullw %%mm2, %%mm4 \n\t"
947
"pmullw %%mm2, %%mm5 \n\t"
948
"movq "MANGLE(w20)", %%mm2 \n\t" // 32
949
"paddw %%mm2, %%mm4 \n\t"
950
"paddw %%mm2, %%mm5 \n\t"
951
"psrlw $6, %%mm4 \n\t"
952
"psrlw $6, %%mm5 \n\t"
954
"movq 16(%%"REG_c"), %%mm0 \n\t" // L3 - L4
955
"movq 24(%%"REG_c"), %%mm1 \n\t" // H3 - H4
957
"pxor %%mm2, %%mm2 \n\t"
958
"pxor %%mm3, %%mm3 \n\t"
960
"pcmpgtw %%mm0, %%mm2 \n\t" // sign (L3-L4)
961
"pcmpgtw %%mm1, %%mm3 \n\t" // sign (H3-H4)
962
"pxor %%mm2, %%mm0 \n\t"
963
"pxor %%mm3, %%mm1 \n\t"
964
"psubw %%mm2, %%mm0 \n\t" // |L3-L4|
965
"psubw %%mm3, %%mm1 \n\t" // |H3-H4|
966
"psrlw $1, %%mm0 \n\t" // |L3 - L4|/2
967
"psrlw $1, %%mm1 \n\t" // |H3 - H4|/2
969
// Zero d in lanes where the sign of (L3-L4) equals the middle-energy sign
// (XOR of the two masks is all-ones only where the signs differ).
"pxor %%mm6, %%mm2 \n\t"
970
"pxor %%mm7, %%mm3 \n\t"
971
"pand %%mm2, %%mm4 \n\t"
972
"pand %%mm3, %%mm5 \n\t"
975
// Clamp d to |L3 - L4|/2; pminsw and its x - saturate(x - y) equivalent are
// both visible.
"pminsw %%mm0, %%mm4 \n\t"
976
"pminsw %%mm1, %%mm5 \n\t"
978
"movq %%mm4, %%mm2 \n\t"
979
"psubusw %%mm0, %%mm2 \n\t"
980
"psubw %%mm2, %%mm4 \n\t"
981
"movq %%mm5, %%mm2 \n\t"
982
"psubusw %%mm1, %%mm2 \n\t"
983
"psubw %%mm2, %%mm5 \n\t"
985
// Re-apply the middle-energy sign, pack back to bytes, then add d to the
// line at %0 and subtract it from the following line.
"pxor %%mm6, %%mm4 \n\t"
986
"pxor %%mm7, %%mm5 \n\t"
987
"psubw %%mm6, %%mm4 \n\t"
988
"psubw %%mm7, %%mm5 \n\t"
989
"packsswb %%mm5, %%mm4 \n\t"
990
"movq (%0), %%mm0 \n\t"
991
"paddb %%mm4, %%mm0 \n\t"
992
"movq %%mm0, (%0) \n\t"
993
"movq (%0, %1), %%mm0 \n\t"
994
"psubb %%mm4, %%mm0 \n\t"
995
"movq %%mm0, (%0, %1) \n\t"
998
// Inputs: %1 = stride, %2 = QP bytes. %0 is written by the asm above, so it
// is presumably declared as an in/out operand on a line not visible here.
: "r" ((x86_reg)stride), "m" (c->pQPb)
1001
#else //HAVE_MMX2 || HAVE_AMD3DNOW
1002
// C path. l1..l8 are the offsets of lines 1..8 relative to src.
const int l1= stride;
1003
const int l2= stride + l1;
1004
const int l3= stride + l2;
1005
const int l4= stride + l3;
1006
const int l5= stride + l4;
1007
const int l6= stride + l5;
1008
const int l7= stride + l6;
1009
const int l8= stride + l7;
1010
// const int l9= stride + l8;
1013
for(x=0; x<BLOCK_SIZE; x++){
1014
const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
1015
// Only filter where the step across the edge is small relative to QP.
if(FFABS(middleEnergy) < 8*c->QP){
1016
const int q=(src[l4] - src[l5])/2;
1017
const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
1018
const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
1020
int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
1024
// The lines that scale and clamp d and write it back are not visible here.
d*= FFSIGN(-middleEnergy);
1039
#endif //HAVE_MMX2 || HAVE_AMD3DNOW
1041
#endif //HAVE_ALTIVEC
1044
/*
 * Deringing filter for one block.
 * src    - pixel data, addressed as src[stride*y + x] in the C version
 * stride - distance between two lines of src
 * c      - context; this function reads c->QP (C version) or c->pQPb and
 *          c->pQPb2 (asm version)
 * Both variants determine the minimum and maximum pixel value, use their
 * rounded mean as a threshold and leave the block alone when max - min is
 * below deringThreshold (C version; the asm performs the same comparison).
 * Selected pixels are moved towards a smoothed value, by a QP-derived amount.
 */
static inline void RENAME(dering)(uint8_t src[], int stride, PPContext *c)
1046
#if HAVE_MMX2 || HAVE_AMD3DNOW
1048
// mm6 = 0 (start value of the per-byte maximum), mm7 = all ones (start value of the per-byte minimum)
"pxor %%mm6, %%mm6 \n\t"
1049
"pcmpeqb %%mm7, %%mm7 \n\t"
1050
// %3 = (%2 >> 1) + 1 per byte: the low four bytes of %2 are widened to words,
// halved, incremented (psubw of -1) and packed back; the C version uses QP/2 + 1
// NOTE(review): %3 (c->pQPb2) is stored to here although the operand list below
// declares it as an input operand - confirm that this is intended
"movq %2, %%mm0 \n\t"
1051
"punpcklbw %%mm6, %%mm0 \n\t"
1052
"psrlw $1, %%mm0 \n\t"
1053
"psubw %%mm7, %%mm0 \n\t"
1054
"packuswb %%mm0, %%mm0 \n\t"
1055
"movq %%mm0, %3 \n\t"
1057
// REG_a = %0 + %1 and REG_d = REG_a + 4*%1, i.e. lines 1 and 5 of the table below
"lea (%0, %1), %%"REG_a" \n\t"
1058
"lea (%%"REG_a", %1, 4), %%"REG_d" \n\t"
1060
// 0 1 2 3 4 5 6 7 8 9
1061
// %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
1063
// FIND_MIN_MAX(addr) folds the 8 bytes at addr into the running per-byte
// minimum (mm7) and maximum (mm6).
// NOTE(review): REAL_FIND_MIN_MAX is defined twice in a row below (a
// pminub/pmaxub form and a psubusb/paddb/psubb form computing the same
// result); presumably each belongs to one branch of a preprocessor
// conditional that is not visible in this excerpt.
#undef REAL_FIND_MIN_MAX
1066
#define REAL_FIND_MIN_MAX(addr)\
1067
"movq " #addr ", %%mm0 \n\t"\
1068
"pminub %%mm0, %%mm7 \n\t"\
1069
"pmaxub %%mm0, %%mm6 \n\t"
1071
#define REAL_FIND_MIN_MAX(addr)\
1072
"movq " #addr ", %%mm0 \n\t"\
1073
"movq %%mm7, %%mm1 \n\t"\
1074
"psubusb %%mm0, %%mm6 \n\t"\
1075
"paddb %%mm0, %%mm6 \n\t"\
1076
"psubusb %%mm0, %%mm1 \n\t"\
1077
"psubb %%mm1, %%mm7 \n\t"
1079
#define FIND_MIN_MAX(addr) REAL_FIND_MIN_MAX(addr)
1081
// accumulate the minimum / maximum over lines 1 to 8 of the table above
FIND_MIN_MAX((%%REGa))
1082
FIND_MIN_MAX((%%REGa, %1))
1083
FIND_MIN_MAX((%%REGa, %1, 2))
1084
FIND_MIN_MAX((%0, %1, 4))
1085
FIND_MIN_MAX((%%REGd))
1086
FIND_MIN_MAX((%%REGd, %1))
1087
FIND_MIN_MAX((%%REGd, %1, 2))
1088
FIND_MIN_MAX((%0, %1, 8))
1090
// horizontal reduction of mm7: combining it with copies of itself shifted by
// 1, 2 and 4 bytes leaves the minimum of all eight columns in the lowest byte.
// The pminub/pshufw steps and the psubusb/psubb steps that follow are two
// spellings of that reduction (see the note at REAL_FIND_MIN_MAX).
"movq %%mm7, %%mm4 \n\t"
1091
"psrlq $8, %%mm7 \n\t"
1093
"pminub %%mm4, %%mm7 \n\t" // min of pixels
1094
"pshufw $0xF9, %%mm7, %%mm4 \n\t"
1095
"pminub %%mm4, %%mm7 \n\t" // min of pixels
1096
"pshufw $0xFE, %%mm7, %%mm4 \n\t"
1097
"pminub %%mm4, %%mm7 \n\t"
1099
"movq %%mm7, %%mm1 \n\t"
1100
"psubusb %%mm4, %%mm1 \n\t"
1101
"psubb %%mm1, %%mm7 \n\t"
1102
"movq %%mm7, %%mm4 \n\t"
1103
"psrlq $16, %%mm7 \n\t"
1104
"movq %%mm7, %%mm1 \n\t"
1105
"psubusb %%mm4, %%mm1 \n\t"
1106
"psubb %%mm1, %%mm7 \n\t"
1107
"movq %%mm7, %%mm4 \n\t"
1108
"psrlq $32, %%mm7 \n\t"
1109
"movq %%mm7, %%mm1 \n\t"
1110
"psubusb %%mm4, %%mm1 \n\t"
1111
"psubb %%mm1, %%mm7 \n\t"
1115
// the same horizontal reduction for the maximum held in mm6
"movq %%mm6, %%mm4 \n\t"
1116
"psrlq $8, %%mm6 \n\t"
1118
"pmaxub %%mm4, %%mm6 \n\t" // max of pixels
1119
"pshufw $0xF9, %%mm6, %%mm4 \n\t"
1120
"pmaxub %%mm4, %%mm6 \n\t"
1121
"pshufw $0xFE, %%mm6, %%mm4 \n\t"
1122
"pmaxub %%mm4, %%mm6 \n\t"
1124
"psubusb %%mm4, %%mm6 \n\t"
1125
"paddb %%mm4, %%mm6 \n\t"
1126
"movq %%mm6, %%mm4 \n\t"
1127
"psrlq $16, %%mm6 \n\t"
1128
"psubusb %%mm4, %%mm6 \n\t"
1129
"paddb %%mm4, %%mm6 \n\t"
1130
"movq %%mm6, %%mm4 \n\t"
1131
"psrlq $32, %%mm6 \n\t"
1132
"psubusb %%mm4, %%mm6 \n\t"
1133
"paddb %%mm4, %%mm6 \n\t"
1135
// keep max in mm0, turn mm6 into the range (max - min) and compare its low
// byte with deringThreshold
"movq %%mm6, %%mm0 \n\t" // max
1136
"psubb %%mm7, %%mm6 \n\t" // max - min
1137
"movd %%mm6, %%ecx \n\t"
1138
"cmpb "MANGLE(deringThreshold)", %%cl \n\t"
1140
// REG_c = scratch area 24 bytes below the stack pointer, aligned with
// ALIGN_MASK; (REG_c) receives the threshold, 8(REG_c) is the temporary
// slot used by DERING_CORE
"lea -24(%%"REG_SP"), %%"REG_c" \n\t"
1141
"and "ALIGN_MASK", %%"REG_c" \n\t"
1142
PAVGB(%%mm0, %%mm7) // a=(max + min)/2
1143
// the three punpcklbw copy the lowest byte (a) into all eight bytes of mm7
"punpcklbw %%mm7, %%mm7 \n\t"
1144
"punpcklbw %%mm7, %%mm7 \n\t"
1145
"punpcklbw %%mm7, %%mm7 \n\t"
1146
"movq %%mm7, (%%"REG_c") \n\t"
1148
// line 0: L10 = the 8 pixels, L00 = the same line shifted so that each byte
// holds its left neighbour (the byte at -1 is shifted in), L20 = shifted so
// that each byte holds its right neighbour (the byte at +8 is shifted in).
// Result: mm1 = horizontally smoothed line, mm0 = sum of three 0/-1 masks.
"movq (%0), %%mm0 \n\t" // L10
1149
"movq %%mm0, %%mm1 \n\t" // L10
1150
"movq %%mm0, %%mm2 \n\t" // L10
1151
"psllq $8, %%mm1 \n\t"
1152
"psrlq $8, %%mm2 \n\t"
1153
"movd -4(%0), %%mm3 \n\t"
1154
"movd 8(%0), %%mm4 \n\t"
1155
"psrlq $24, %%mm3 \n\t"
1156
"psllq $56, %%mm4 \n\t"
1157
"por %%mm3, %%mm1 \n\t" // L00
1158
"por %%mm4, %%mm2 \n\t" // L20
1159
"movq %%mm1, %%mm3 \n\t" // L00
1160
PAVGB(%%mm2, %%mm1) // (L20 + L00)/2
1161
PAVGB(%%mm0, %%mm1) // (L20 + L00 + 2L10)/4
1162
"psubusb %%mm7, %%mm0 \n\t"
1163
"psubusb %%mm7, %%mm2 \n\t"
1164
"psubusb %%mm7, %%mm3 \n\t"
1165
"pcmpeqb "MANGLE(b00)", %%mm0 \n\t" // L10 > a ? 0 : -1
1166
"pcmpeqb "MANGLE(b00)", %%mm2 \n\t" // L20 > a ? 0 : -1
1167
"pcmpeqb "MANGLE(b00)", %%mm3 \n\t" // L00 > a ? 0 : -1
1168
"paddb %%mm2, %%mm0 \n\t"
1169
"paddb %%mm3, %%mm0 \n\t"
1171
// line 1, same scheme: mm3 = horizontally smoothed line, mm2 = mask sum
"movq (%%"REG_a"), %%mm2 \n\t" // L11
1172
"movq %%mm2, %%mm3 \n\t" // L11
1173
"movq %%mm2, %%mm4 \n\t" // L11
1174
"psllq $8, %%mm3 \n\t"
1175
"psrlq $8, %%mm4 \n\t"
1176
"movd -4(%%"REG_a"), %%mm5 \n\t"
1177
"movd 8(%%"REG_a"), %%mm6 \n\t"
1178
"psrlq $24, %%mm5 \n\t"
1179
"psllq $56, %%mm6 \n\t"
1180
"por %%mm5, %%mm3 \n\t" // L01
1181
"por %%mm6, %%mm4 \n\t" // L21
1182
"movq %%mm3, %%mm5 \n\t" // L01
1183
PAVGB(%%mm4, %%mm3) // (L21 + L01)/2
1184
PAVGB(%%mm2, %%mm3) // (L21 + L01 + 2L11)/4
1185
"psubusb %%mm7, %%mm2 \n\t"
1186
"psubusb %%mm7, %%mm4 \n\t"
1187
"psubusb %%mm7, %%mm5 \n\t"
1188
"pcmpeqb "MANGLE(b00)", %%mm2 \n\t" // L11 > a ? 0 : -1
1189
"pcmpeqb "MANGLE(b00)", %%mm4 \n\t" // L21 > a ? 0 : -1
1190
"pcmpeqb "MANGLE(b00)", %%mm5 \n\t" // L01 > a ? 0 : -1
1191
"paddb %%mm4, %%mm2 \n\t"
1192
"paddb %%mm5, %%mm2 \n\t"
1194
// DERING_CORE filters the line at dst; src is the line that follows it.
// As far as the macro body shows:
//  - src is loaded into sx, lx becomes the horizontally smoothed src line and
//    sx ends up as the sum of three 0/-1 masks comparing src[-1], src[+1] and
//    src[0] with the threshold stored at (REG_c)
//  - pplx is averaged with plx ("filtered") and capped at dst + %3 (saturated)
//    by PMINUB; t0 holds dst - %3
//  - ppsx + psx + sx, masked with the b08 constant and compared with b00,
//    decides per byte whether pplx or the old dst value is written to dst
//  - t0 and t1 are temporaries; lx is restored from 8(REG_c) at the end
// NOTE(review): b00, b02 and b08 are constants defined outside this excerpt.
#define REAL_DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \
1195
"movq " #src ", " #sx " \n\t" /* src[0] */\
1196
"movq " #sx ", " #lx " \n\t" /* src[0] */\
1197
"movq " #sx ", " #t0 " \n\t" /* src[0] */\
1198
"psllq $8, " #lx " \n\t"\
1199
"psrlq $8, " #t0 " \n\t"\
1200
"movd -4" #src ", " #t1 " \n\t"\
1201
"psrlq $24, " #t1 " \n\t"\
1202
"por " #t1 ", " #lx " \n\t" /* src[-1] */\
1203
"movd 8" #src ", " #t1 " \n\t"\
1204
"psllq $56, " #t1 " \n\t"\
1205
"por " #t1 ", " #t0 " \n\t" /* src[+1] */\
1206
"movq " #lx ", " #t1 " \n\t" /* src[-1] */\
1207
PAVGB(t0, lx) /* (src[-1] + src[+1])/2 */\
1208
PAVGB(sx, lx) /* (src[-1] + 2src[0] + src[+1])/4 */\
1210
"movq " #lx ", 8(%%"REG_c") \n\t"\
1211
"movq (%%"REG_c"), " #lx " \n\t"\
1212
"psubusb " #lx ", " #t1 " \n\t"\
1213
"psubusb " #lx ", " #t0 " \n\t"\
1214
"psubusb " #lx ", " #sx " \n\t"\
1215
"movq "MANGLE(b00)", " #lx " \n\t"\
1216
"pcmpeqb " #lx ", " #t1 " \n\t" /* src[-1] > a ? 0 : -1*/\
1217
"pcmpeqb " #lx ", " #t0 " \n\t" /* src[+1] > a ? 0 : -1*/\
1218
"pcmpeqb " #lx ", " #sx " \n\t" /* src[0] > a ? 0 : -1*/\
1219
"paddb " #t1 ", " #t0 " \n\t"\
1220
"paddb " #t0 ", " #sx " \n\t"\
1222
PAVGB(plx, pplx) /* filtered */\
1223
"movq " #dst ", " #t0 " \n\t" /* dst */\
1224
"movq " #t0 ", " #t1 " \n\t" /* dst */\
1225
"psubusb %3, " #t0 " \n\t"\
1226
"paddusb %3, " #t1 " \n\t"\
1228
PMINUB(t1, pplx, t0)\
1229
"paddb " #sx ", " #ppsx " \n\t"\
1230
"paddb " #psx ", " #ppsx " \n\t"\
1231
"#paddb "MANGLE(b02)", " #ppsx " \n\t"\
1232
"pand "MANGLE(b08)", " #ppsx " \n\t"\
1233
"pcmpeqb " #lx ", " #ppsx " \n\t"\
1234
"pand " #ppsx ", " #pplx " \n\t"\
1235
"pandn " #dst ", " #ppsx " \n\t"\
1236
"por " #pplx ", " #ppsx " \n\t"\
1237
"movq " #ppsx ", " #dst " \n\t"\
1238
"movq 8(%%"REG_c"), " #lx " \n\t"
1240
#define DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \
1241
REAL_DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1)
1257
// lines 1 to 8 are filtered in turn, each with the following line as src;
// the mask registers (mm0/mm2/mm4) and the smoothed-line registers
// (mm1/mm3/mm5) rotate their roles by one line per invocation
//DERING_CORE(dst ,src ,ppsx ,psx ,sx ,pplx ,plx ,lx ,t0 ,t1)
1258
DERING_CORE((%%REGa) ,(%%REGa, %1) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1259
DERING_CORE((%%REGa, %1) ,(%%REGa, %1, 2),%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1260
DERING_CORE((%%REGa, %1, 2),(%0, %1, 4) ,%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
1261
DERING_CORE((%0, %1, 4) ,(%%REGd) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1262
DERING_CORE((%%REGd) ,(%%REGd, %1) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1263
DERING_CORE((%%REGd, %1) ,(%%REGd, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
1264
DERING_CORE((%%REGd, %1, 2),(%0, %1, 8) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1265
DERING_CORE((%0, %1, 8) ,(%%REGd, %1, 4),%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1268
// operands: %0 = src, %1 = stride, %2 = c->pQPb, %3 = c->pQPb2 (all inputs);
// REG_a, REG_d and REG_c are declared as clobbered
: : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb), "m"(c->pQPb2)
1269
: "%"REG_a, "%"REG_d, "%"REG_c
1271
#else //HAVE_MMX2 || HAVE_AMD3DNOW
1278
// QP2 is the step by which a pixel is moved towards f further down
const int QP2= c->QP/2 + 1;
1285
// track the largest and smallest pixel value seen through p
if(*p > max) max= *p;
1286
if(*p < min) min= *p;
1289
// rounded mean of min and max; the threshold used for the bit masks below
avg= (min + max + 1)>>1;
1291
// not enough contrast in this block: leave it untouched
if(max - min <deringThreshold) return;
1293
// for each of the 10 lines, bit x of t is set when pixel x is above avg
for(y=0; y<10; y++){
1296
if(src[stride*y + 0] > avg) t+= 1;
1297
if(src[stride*y + 1] > avg) t+= 2;
1298
if(src[stride*y + 2] > avg) t+= 4;
1299
if(src[stride*y + 3] > avg) t+= 8;
1300
if(src[stride*y + 4] > avg) t+= 16;
1301
if(src[stride*y + 5] > avg) t+= 32;
1302
if(src[stride*y + 6] > avg) t+= 64;
1303
if(src[stride*y + 7] > avg) t+= 128;
1304
if(src[stride*y + 8] > avg) t+= 256;
1305
if(src[stride*y + 9] > avg) t+= 512;
1308
// keep a bit only when both horizontally adjacent bits are set as well
t &= (t<<1) & (t>>1);
1313
// ... and when the same bit is set in the line above and the line below
int t = s[y-1] & s[y] & s[y+1];
1326
// f = 3x3 weighted sum around p with weights 1 2 1 / 2 4 2 / 1 2 1 (sum 16)
int f= (*(p-stride-1)) + 2*(*(p-stride)) + (*(p-stride+1))
1327
+2*(*(p -1)) + 4*(*p ) + 2*(*(p +1))
1328
+(*(p+stride-1)) + 2*(*(p+stride)) + (*(p+stride+1));
1331
// debug-only statistics: pixel and skip counters, summed squared difference
// and the worst case seen so far, reported through av_log
#ifdef DEBUG_DERING_THRESHOLD
1332
__asm__ volatile("emms\n\t":);
1334
static long long numPixels=0;
1335
if(x!=1 && x!=8 && y!=1 && y!=8) numPixels++;
1336
// if((max-min)<20 || (max-min)*QP<200)
1337
// if((max-min)*QP < 500)
1340
static int numSkipped=0;
1341
static int errorSum=0;
1342
static int worstQP=0;
1343
static int worstRange=0;
1344
static int worstDiff=0;
1346
int absDiff= FFABS(diff);
1347
int error= diff*diff;
1349
if(x==1 || x==8 || y==1 || y==8) continue;
1352
if(absDiff > worstDiff){
1355
worstRange= max-min;
1359
if(1024LL*1024LL*1024LL % numSkipped == 0){
1360
av_log(c, AV_LOG_INFO, "sum:%1.3f, skip:%d, wQP:%d, "
1361
"wRange:%d, wDiff:%d, relSkip:%1.3f\n",
1362
(float)errorSum/numSkipped, numSkipped, worstQP, worstRange,
1363
worstDiff, (float)numSkipped/numPixels);
1368
// when f is more than QP2 away from *p, *p moves by QP2 in the direction of f
// NOTE(review): f is presumably normalised (divided by 16) on a line that is
// not visible in this excerpt - confirm
if (*p + QP2 < f) *p= *p + QP2;
1369
else if(*p - QP2 > f) *p= *p - QP2;
1374
#ifdef DEBUG_DERING_THRESHOLD
1382
// debug aid: brighten the pixel by 20, saturating at 255
*p = FFMIN(*p + 20, 255);
1385
// src[0] = src[7]=src[stride*7]=src[stride*7 + 7]=255;
1388
#endif //HAVE_MMX2 || HAVE_AMD3DNOW
1390
#endif //HAVE_ALTIVEC
1393
* Deinterlace the given block by linearly interpolating every second line.
1394
* will be called for every 8x8 block and can read & write from line 4-15
1395
* lines 0-3 have been passed through the deblock / dering filters already, but can be read, too.
1396
* lines 4-12 will be read into the deblocking filter and should be deinterlaced
1398
/*
 * src    - first line handled by this call (line 0 of the tables below)
 * stride - distance between two lines
 * Lines 1, 3, 5 and 7 are overwritten; lines 0, 2, 4, 6 and 8 are only read.
 * In the C version each written line becomes the per-byte average (rounded
 * up) of the lines directly above and below it.
 */
static inline void RENAME(deInterlaceInterpolateLinear)(uint8_t src[], int stride)
1400
#if HAVE_MMX2 || HAVE_AMD3DNOW
1403
// REG_a = line 1, REG_c = line 5 (see the table below)
"lea (%0, %1), %%"REG_a" \n\t"
1404
"lea (%%"REG_a", %1, 4), %%"REG_c" \n\t"
1405
// 0 1 2 3 4 5 6 7 8 9
1406
// %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1
1408
// mm0 and mm1 alternately hold an even line (0, 2, 4, 6, 8); the stores go
// to lines 1, 3, 5 and 7
// NOTE(review): the instructions that combine mm0 and mm1 between a load and
// the following store are not visible in this excerpt
"movq (%0), %%mm0 \n\t"
1409
"movq (%%"REG_a", %1), %%mm1 \n\t"
1411
"movq %%mm0, (%%"REG_a") \n\t"
1412
"movq (%0, %1, 4), %%mm0 \n\t"
1414
"movq %%mm1, (%%"REG_a", %1, 2) \n\t"
1415
"movq (%%"REG_c", %1), %%mm1 \n\t"
1417
"movq %%mm0, (%%"REG_c") \n\t"
1418
"movq (%0, %1, 8), %%mm0 \n\t"
1420
"movq %%mm1, (%%"REG_c", %1, 2) \n\t"
1422
// operands: %0 = src, %1 = stride; REG_a and REG_c are clobbered
: : "r" (src), "r" ((x86_reg)stride)
1423
: "%"REG_a, "%"REG_c
1430
// C version, four pixels per 32-bit word:
// (a|b) - (((a^b)&0xFEFEFEFE)>>1) is the per-byte average of a and b,
// rounded up; the mask keeps the shift from leaking bits across bytes
// NOTE(review): pixel bytes are accessed through uint32_t pointers here
a= *(uint32_t*)&src[stride*0];
1431
b= *(uint32_t*)&src[stride*2];
1432
*(uint32_t*)&src[stride*1]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1433
a= *(uint32_t*)&src[stride*4];
1434
*(uint32_t*)&src[stride*3]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1435
b= *(uint32_t*)&src[stride*6];
1436
*(uint32_t*)&src[stride*5]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1437
a= *(uint32_t*)&src[stride*8];
1438
*(uint32_t*)&src[stride*7]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1445
* Deinterlace the given block by cubic interpolating every second line.
1446
* will be called for every 8x8 block and can read & write from line 4-15
1447
* lines 0-3 have been passed through the deblock / dering filters already, but can be read, too.
1448
* lines 4-12 will be read into the deblocking filter and should be deinterlaced
1449
* this filter will read lines 3-15 and write 7-13
1451
/*
 * src    - line 0 of the tables below
 * stride - distance between two lines
 * Lines 3, 5, 7 and 9 are replaced by (-a + 9b + 9d - e)/16, where b and d
 * are the lines directly above and below the written line and a and e are
 * the lines three above and three below it.
 */
static inline void RENAME(deInterlaceInterpolateCubic)(uint8_t src[], int stride)
1453
#if HAVE_MMX2 || HAVE_AMD3DNOW
1456
// REG_a = line 1, REG_d = line 5, REG_c = line 10; mm7 = 0 for byte->word unpacking
"lea (%0, %1), %%"REG_a" \n\t"
1457
"lea (%%"REG_a", %1, 4), %%"REG_d" \n\t"
1458
"lea (%%"REG_d", %1, 4), %%"REG_c" \n\t"
1459
"add %1, %%"REG_c" \n\t"
1460
"pxor %%mm7, %%mm7 \n\t"
1461
// 0 1 2 3 4 5 6 7 8 9 10
1462
// %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx
1464
// DEINT_CUBIC(a,b,c,d,e): reads lines a, b, d, e and writes line c.
// The two PAVGB results are widened to words, combined as
// (b+d)/2 - ((a+e)/2 - (b+d)/2)/8 and packed back with unsigned saturation.
#define REAL_DEINT_CUBIC(a,b,c,d,e)\
1465
"movq " #a ", %%mm0 \n\t"\
1466
"movq " #b ", %%mm1 \n\t"\
1467
"movq " #d ", %%mm2 \n\t"\
1468
"movq " #e ", %%mm3 \n\t"\
1469
PAVGB(%%mm2, %%mm1) /* (b+d) /2 */\
1470
PAVGB(%%mm3, %%mm0) /* (a+e) /2 */\
1471
"movq %%mm0, %%mm2 \n\t"\
1472
"punpcklbw %%mm7, %%mm0 \n\t"\
1473
"punpckhbw %%mm7, %%mm2 \n\t"\
1474
"movq %%mm1, %%mm3 \n\t"\
1475
"punpcklbw %%mm7, %%mm1 \n\t"\
1476
"punpckhbw %%mm7, %%mm3 \n\t"\
1477
"psubw %%mm1, %%mm0 \n\t" /* L(a+e - (b+d))/2 */\
1478
"psubw %%mm3, %%mm2 \n\t" /* H(a+e - (b+d))/2 */\
1479
"psraw $3, %%mm0 \n\t" /* L(a+e - (b+d))/16 */\
1480
"psraw $3, %%mm2 \n\t" /* H(a+e - (b+d))/16 */\
1481
"psubw %%mm0, %%mm1 \n\t" /* L(9b + 9d - a - e)/16 */\
1482
"psubw %%mm2, %%mm3 \n\t" /* H(9b + 9d - a - e)/16 */\
1483
"packuswb %%mm3, %%mm1 \n\t"\
1484
"movq %%mm1, " #c " \n\t"
1485
#define DEINT_CUBIC(a,b,c,d,e) REAL_DEINT_CUBIC(a,b,c,d,e)
1487
// written lines: 3 (from 0,2,4,6), 5 (from 2,4,6,8), 7 (from 4,6,8,10), 9 (from 6,8,10,12)
DEINT_CUBIC((%0) , (%%REGa, %1), (%%REGa, %1, 2), (%0, %1, 4) , (%%REGd, %1))
1488
DEINT_CUBIC((%%REGa, %1), (%0, %1, 4) , (%%REGd) , (%%REGd, %1), (%0, %1, 8))
1489
DEINT_CUBIC((%0, %1, 4) , (%%REGd, %1), (%%REGd, %1, 2), (%0, %1, 8) , (%%REGc))
1490
DEINT_CUBIC((%%REGd, %1), (%0, %1, 8) , (%%REGd, %1, 4), (%%REGc) , (%%REGc, %1, 2))
1492
// operands: %0 = src, %1 = stride; REG_a, REG_d and REG_c are clobbered
: : "r" (src), "r" ((x86_reg)stride)
1493
: "%"REG_a, "%"REG_d, "%"REG_c
1495
#else //HAVE_MMX2 || HAVE_AMD3DNOW
1499
// C version: same taps (-1 9 9 -1)/16, shifted right by 4 and passed through CLIP
src[stride*3] = CLIP((-src[0] + 9*src[stride*2] + 9*src[stride*4] - src[stride*6])>>4);
1500
src[stride*5] = CLIP((-src[stride*2] + 9*src[stride*4] + 9*src[stride*6] - src[stride*8])>>4);
1501
src[stride*7] = CLIP((-src[stride*4] + 9*src[stride*6] + 9*src[stride*8] - src[stride*10])>>4);
1502
src[stride*9] = CLIP((-src[stride*6] + 9*src[stride*8] + 9*src[stride*10] - src[stride*12])>>4);
1505
#endif //HAVE_MMX2 || HAVE_AMD3DNOW
1509
* Deinterlace the given block by filtering every second line with a (-1 4 2 4 -1) filter.
1510
* will be called for every 8x8 block and can read & write from line 4-15
1511
* lines 0-3 have been passed through the deblock / dering filters already, but can be read, too.
1512
* lines 4-12 will be read into the deblocking filter and should be deinterlaced
1513
* this filter will read lines 4-13 and write 5-11
1515
/*
 * src    - line 0 of the tables below
 * stride - distance between two lines
 * tmp    - 8 bytes of state: read at the start as the line preceding line 0
 *          and overwritten at the end (asm version) with the unfiltered
 *          contents of the last line that was filtered
 * Lines 1, 3, 5 and 7 are replaced by the (-1 4 2 4 -1)/8 filter result.
 */
static inline void RENAME(deInterlaceFF)(uint8_t src[], int stride, uint8_t *tmp)
1517
#if HAVE_MMX2 || HAVE_AMD3DNOW
1520
// REG_a = line 1, REG_d = line 5, mm7 = 0, mm0 = saved line from tmp
"lea (%0, %1), %%"REG_a" \n\t"
1521
"lea (%%"REG_a", %1, 4), %%"REG_d" \n\t"
1522
"pxor %%mm7, %%mm7 \n\t"
1523
"movq (%2), %%mm0 \n\t"
1524
// 0 1 2 3 4 5 6 7 8 9 10
1525
// %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx
1527
// DEINT_FF(a,b,c,d): filters line b in place from the lines a, b, c, d and
// the value carried in mm0:
//   b = (4*avg(a,c) - avg(mm0,d) + b) >> 2, packed with unsigned saturation.
// mm0 is reloaded with the unfiltered b before b is overwritten, so it is
// carried into the next invocation.
#define REAL_DEINT_FF(a,b,c,d)\
1528
"movq " #a ", %%mm1 \n\t"\
1529
"movq " #b ", %%mm2 \n\t"\
1530
"movq " #c ", %%mm3 \n\t"\
1531
"movq " #d ", %%mm4 \n\t"\
1532
PAVGB(%%mm3, %%mm1) \
1533
PAVGB(%%mm4, %%mm0) \
1534
"movq %%mm0, %%mm3 \n\t"\
1535
"punpcklbw %%mm7, %%mm0 \n\t"\
1536
"punpckhbw %%mm7, %%mm3 \n\t"\
1537
"movq %%mm1, %%mm4 \n\t"\
1538
"punpcklbw %%mm7, %%mm1 \n\t"\
1539
"punpckhbw %%mm7, %%mm4 \n\t"\
1540
"psllw $2, %%mm1 \n\t"\
1541
"psllw $2, %%mm4 \n\t"\
1542
"psubw %%mm0, %%mm1 \n\t"\
1543
"psubw %%mm3, %%mm4 \n\t"\
1544
"movq %%mm2, %%mm5 \n\t"\
1545
"movq %%mm2, %%mm0 \n\t"\
1546
"punpcklbw %%mm7, %%mm2 \n\t"\
1547
"punpckhbw %%mm7, %%mm5 \n\t"\
1548
"paddw %%mm2, %%mm1 \n\t"\
1549
"paddw %%mm5, %%mm4 \n\t"\
1550
"psraw $2, %%mm1 \n\t"\
1551
"psraw $2, %%mm4 \n\t"\
1552
"packuswb %%mm4, %%mm1 \n\t"\
1553
"movq %%mm1, " #b " \n\t"\
1555
#define DEINT_FF(a,b,c,d) REAL_DEINT_FF(a,b,c,d)
1557
// filtered lines (argument b): 1, 3, 5 and 7
DEINT_FF((%0) , (%%REGa) , (%%REGa, %1), (%%REGa, %1, 2))
1558
DEINT_FF((%%REGa, %1), (%%REGa, %1, 2), (%0, %1, 4) , (%%REGd) )
1559
DEINT_FF((%0, %1, 4) , (%%REGd) , (%%REGd, %1), (%%REGd, %1, 2))
1560
DEINT_FF((%%REGd, %1), (%%REGd, %1, 2), (%0, %1, 8) , (%%REGd, %1, 4))
1562
// save the unfiltered copy of the last filtered line back to tmp
"movq %%mm0, (%2) \n\t"
1563
// operands: %0 = src, %1 = stride, %2 = tmp; REG_a and REG_d are clobbered
: : "r" (src), "r" ((x86_reg)stride), "r"(tmp)
1564
: "%"REG_a, "%"REG_d
1566
#else //HAVE_MMX2 || HAVE_AMD3DNOW
1571
// C version: t1 and t2 alternately hold the unfiltered value of an odd line
// so that the next odd line can still use it after it has been overwritten;
// + 4 rounds the >> 3
int t2= src[stride*1];
1573
src[stride*1]= CLIP((-t1 + 4*src[stride*0] + 2*t2 + 4*src[stride*2] - src[stride*3] + 4)>>3);
1575
src[stride*3]= CLIP((-t2 + 4*src[stride*2] + 2*t1 + 4*src[stride*4] - src[stride*5] + 4)>>3);
1577
src[stride*5]= CLIP((-t1 + 4*src[stride*4] + 2*t2 + 4*src[stride*6] - src[stride*7] + 4)>>3);
1579
src[stride*7]= CLIP((-t2 + 4*src[stride*6] + 2*t1 + 4*src[stride*8] - src[stride*9] + 4)>>3);
1584
#endif //HAVE_MMX2 || HAVE_AMD3DNOW
1588
* Deinterlace the given block by filtering every line with a (-1 2 6 2 -1) filter.
1589
* will be called for every 8x8 block and can read & write from line 4-15
1590
* lines 0-3 have been passed through the deblock / dering filters already, but can be read, too.
1591
* lines 4-12 will be read into the deblocking filter and should be deinterlaced
1592
* this filter will read lines 4-13 and write 4-11
1594
/*
 * src       - line 0 of the tables below
 * stride    - distance between two lines
 * tmp, tmp2 - 8 bytes of state each; loaded into mm0 / mm1 at the start of
 *             the asm version and written back from them at the end
 * Lines 0 to 7 are filtered in place; in the C version every line becomes
 * (-1 2 6 2 -1)/8 applied to the two lines above, the line itself and the
 * two lines below, using saved unfiltered values for lines already overwritten.
 */
static inline void RENAME(deInterlaceL5)(uint8_t src[], int stride, uint8_t *tmp, uint8_t *tmp2)
1596
#if HAVE_MMX2 || HAVE_AMD3DNOW
1599
// REG_a = line 1, REG_d = line 5, mm7 = 0, mm0 / mm1 = saved lines from tmp / tmp2
"lea (%0, %1), %%"REG_a" \n\t"
1600
"lea (%%"REG_a", %1, 4), %%"REG_d" \n\t"
1601
"pxor %%mm7, %%mm7 \n\t"
1602
"movq (%2), %%mm0 \n\t"
1603
"movq (%3), %%mm1 \n\t"
1604
// 0 1 2 3 4 5 6 7 8 9 10
1605
// %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx
1607
// DEINT_L5(t1,t2,a,b,c): filters line a in place. Visible steps: the
// unfiltered a is copied into t1, then a = (3*a + 2*mm3 - mm4) >> 2 on
// words, packed with unsigned saturation (mm3 / mm4 are loaded from b / c).
// NOTE(review): t2 is not referenced in the visible part of the macro;
// instructions that fold t1 / t2 into mm3 / mm4 are presumably among the
// lines missing from this excerpt.
#define REAL_DEINT_L5(t1,t2,a,b,c)\
1608
"movq " #a ", %%mm2 \n\t"\
1609
"movq " #b ", %%mm3 \n\t"\
1610
"movq " #c ", %%mm4 \n\t"\
1613
"movq %%mm2, %%mm5 \n\t"\
1614
"movq %%mm2, " #t1 " \n\t"\
1615
"punpcklbw %%mm7, %%mm2 \n\t"\
1616
"punpckhbw %%mm7, %%mm5 \n\t"\
1617
"movq %%mm2, %%mm6 \n\t"\
1618
"paddw %%mm2, %%mm2 \n\t"\
1619
"paddw %%mm6, %%mm2 \n\t"\
1620
"movq %%mm5, %%mm6 \n\t"\
1621
"paddw %%mm5, %%mm5 \n\t"\
1622
"paddw %%mm6, %%mm5 \n\t"\
1623
"movq %%mm3, %%mm6 \n\t"\
1624
"punpcklbw %%mm7, %%mm3 \n\t"\
1625
"punpckhbw %%mm7, %%mm6 \n\t"\
1626
"paddw %%mm3, %%mm3 \n\t"\
1627
"paddw %%mm6, %%mm6 \n\t"\
1628
"paddw %%mm3, %%mm2 \n\t"\
1629
"paddw %%mm6, %%mm5 \n\t"\
1630
"movq %%mm4, %%mm6 \n\t"\
1631
"punpcklbw %%mm7, %%mm4 \n\t"\
1632
"punpckhbw %%mm7, %%mm6 \n\t"\
1633
"psubw %%mm4, %%mm2 \n\t"\
1634
"psubw %%mm6, %%mm5 \n\t"\
1635
"psraw $2, %%mm2 \n\t"\
1636
"psraw $2, %%mm5 \n\t"\
1637
"packuswb %%mm5, %%mm2 \n\t"\
1638
"movq %%mm2, " #a " \n\t"\
1640
#define DEINT_L5(t1,t2,a,b,c) REAL_DEINT_L5(t1,t2,a,b,c)
1642
// lines 0 to 7 are filtered in turn; mm0 and mm1 swap the t1 / t2 roles on
// every invocation
DEINT_L5(%%mm0, %%mm1, (%0) , (%%REGa) , (%%REGa, %1) )
1643
DEINT_L5(%%mm1, %%mm0, (%%REGa) , (%%REGa, %1) , (%%REGa, %1, 2))
1644
DEINT_L5(%%mm0, %%mm1, (%%REGa, %1) , (%%REGa, %1, 2), (%0, %1, 4) )
1645
DEINT_L5(%%mm1, %%mm0, (%%REGa, %1, 2), (%0, %1, 4) , (%%REGd) )
1646
DEINT_L5(%%mm0, %%mm1, (%0, %1, 4) , (%%REGd) , (%%REGd, %1) )
1647
DEINT_L5(%%mm1, %%mm0, (%%REGd) , (%%REGd, %1) , (%%REGd, %1, 2))
1648
DEINT_L5(%%mm0, %%mm1, (%%REGd, %1) , (%%REGd, %1, 2), (%0, %1, 8) )
1649
DEINT_L5(%%mm1, %%mm0, (%%REGd, %1, 2), (%0, %1, 8) , (%%REGd, %1, 4))
1651
// write the two carried lines back to tmp and tmp2
"movq %%mm0, (%2) \n\t"
1652
"movq %%mm1, (%3) \n\t"
1653
// operands: %0 = src, %1 = stride, %2 = tmp, %3 = tmp2; REG_a and REG_d are clobbered
: : "r" (src), "r" ((x86_reg)stride), "r"(tmp), "r"(tmp2)
1654
: "%"REG_a, "%"REG_d
1656
#else //HAVE_MMX2 || HAVE_AMD3DNOW
1664
// C version: t1, t2 and t3 rotate through the roles "two lines above",
// "one line above" and "current line" (unfiltered values); + 4 rounds the >> 3
// NOTE(review): the assignments that refresh t1 / t2 / t3 between these
// statements are not visible in this excerpt
src[stride*0]= CLIP((-(t1 + src[stride*2]) + 2*(t2 + src[stride*1]) + 6*t3 + 4)>>3);
1666
src[stride*1]= CLIP((-(t2 + src[stride*3]) + 2*(t3 + src[stride*2]) + 6*t1 + 4)>>3);
1668
src[stride*2]= CLIP((-(t3 + src[stride*4]) + 2*(t1 + src[stride*3]) + 6*t2 + 4)>>3);
1670
src[stride*3]= CLIP((-(t1 + src[stride*5]) + 2*(t2 + src[stride*4]) + 6*t3 + 4)>>3);
1672
src[stride*4]= CLIP((-(t2 + src[stride*6]) + 2*(t3 + src[stride*5]) + 6*t1 + 4)>>3);
1674
src[stride*5]= CLIP((-(t3 + src[stride*7]) + 2*(t1 + src[stride*6]) + 6*t2 + 4)>>3);
1676
src[stride*6]= CLIP((-(t1 + src[stride*8]) + 2*(t2 + src[stride*7]) + 6*t3 + 4)>>3);
1678
src[stride*7]= CLIP((-(t2 + src[stride*9]) + 2*(t3 + src[stride*8]) + 6*t1 + 4)>>3);
1685
#endif //HAVE_MMX2 || HAVE_AMD3DNOW
1689
* Deinterlace the given block by filtering all lines with a (1 2 1) filter.
1690
* will be called for every 8x8 block and can read & write from line 4-15
1691
* lines 0-3 have been passed through the deblock / dering filters already, but can be read, too.
1692
* lines 4-12 will be read into the deblocking filter and should be deinterlaced
1693
* this filter will read lines 4-13 and write 4-11
1695
/*
 * src    - first line to be filtered
 * stride - distance between two lines
 * tmp    - saved line that precedes src; read first and overwritten at the
 *          end with an unfiltered line for use by a later call
 * Eight consecutive lines starting at src are replaced by a (1 2 1)/4 blend
 * of the line above, the line itself and the line below, built from two
 * averaging steps.
 */
static inline void RENAME(deInterlaceBlendLinear)(uint8_t src[], int stride, uint8_t *tmp)
1697
#if HAVE_MMX2 || HAVE_AMD3DNOW
1700
// REG_a = %0 + %1, REG_d = REG_a + 4*%1 (see the table below)
"lea (%0, %1), %%"REG_a" \n\t"
1701
"lea (%%"REG_a", %1, 4), %%"REG_d" \n\t"
1702
// 0 1 2 3 4 5 6 7 8 9
1703
// %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
1705
// In the L<n> comments below L0 is the line saved in tmp, L1 the line at %0,
// L2 the line at REG_a and so on. mm0, mm1 and mm2 rotate through the roles
// "line above", "current line" and "line below"; every store writes
// avg(avg(above, below), current).
"movq (%2), %%mm0 \n\t" // L0
1706
"movq (%%"REG_a"), %%mm1 \n\t" // L2
1707
PAVGB(%%mm1, %%mm0) // L0+L2
1708
"movq (%0), %%mm2 \n\t" // L1
1710
"movq %%mm0, (%0) \n\t"
1711
"movq (%%"REG_a", %1), %%mm0 \n\t" // L3
1712
PAVGB(%%mm0, %%mm2) // L1+L3
1713
PAVGB(%%mm1, %%mm2) // 2L2 + L1 + L3
1714
"movq %%mm2, (%%"REG_a") \n\t"
1715
"movq (%%"REG_a", %1, 2), %%mm2 \n\t" // L4
1716
PAVGB(%%mm2, %%mm1) // L2+L4
1717
PAVGB(%%mm0, %%mm1) // 2L3 + L2 + L4
1718
"movq %%mm1, (%%"REG_a", %1) \n\t"
1719
"movq (%0, %1, 4), %%mm1 \n\t" // L5
1720
PAVGB(%%mm1, %%mm0) // L3+L5
1721
PAVGB(%%mm2, %%mm0) // 2L4 + L3 + L5
1722
"movq %%mm0, (%%"REG_a", %1, 2) \n\t"
1723
"movq (%%"REG_d"), %%mm0 \n\t" // L6
1724
PAVGB(%%mm0, %%mm2) // L4+L6
1725
PAVGB(%%mm1, %%mm2) // 2L5 + L4 + L6
1726
"movq %%mm2, (%0, %1, 4) \n\t"
1727
"movq (%%"REG_d", %1), %%mm2 \n\t" // L7
1728
PAVGB(%%mm2, %%mm1) // L5+L7
1729
PAVGB(%%mm0, %%mm1) // 2L6 + L5 + L7
1730
"movq %%mm1, (%%"REG_d") \n\t"
1731
"movq (%%"REG_d", %1, 2), %%mm1 \n\t" // L8
1732
PAVGB(%%mm1, %%mm0) // L6+L8
1733
PAVGB(%%mm2, %%mm0) // 2L7 + L6 + L8
1734
"movq %%mm0, (%%"REG_d", %1) \n\t"
1735
"movq (%0, %1, 8), %%mm0 \n\t" // L9
1736
PAVGB(%%mm0, %%mm2) // L7+L9
1737
PAVGB(%%mm1, %%mm2) // 2L8 + L7 + L9
1738
"movq %%mm2, (%%"REG_d", %1, 2) \n\t"
1739
// mm1 still holds the unfiltered L8; keep it in tmp
"movq %%mm1, (%2) \n\t"
1741
// operands: %0 = src, %1 = stride, %2 = tmp; REG_a and REG_d are clobbered
: : "r" (src), "r" ((x86_reg)stride), "r" (tmp)
1742
: "%"REG_a, "%"REG_d
1744
#else //HAVE_MMX2 || HAVE_AMD3DNOW
1749
// C version, four pixels per 32-bit word. Two per-byte averages are used:
//   (x&y) + (((x^y)&0xFEFEFEFE)>>1)  average rounded down (above / below)
//   (x|y) - (((x^y)&0xFEFEFEFE)>>1)  average rounded up (with the current line)
// a, b and c rotate through the roles above / current / below.
// NOTE(review): pixel bytes are accessed through uint32_t pointers here
a= *(uint32_t*)&tmp[stride*0];
1750
b= *(uint32_t*)&src[stride*0];
1751
c= *(uint32_t*)&src[stride*1];
1752
a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1);
1753
*(uint32_t*)&src[stride*0]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1755
a= *(uint32_t*)&src[stride*2];
1756
b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1);
1757
*(uint32_t*)&src[stride*1]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1);
1759
b= *(uint32_t*)&src[stride*3];
1760
c= (b&c) + (((b^c)&0xFEFEFEFEUL)>>1);
1761
*(uint32_t*)&src[stride*2]= (c|a) - (((c^a)&0xFEFEFEFEUL)>>1);
1763
c= *(uint32_t*)&src[stride*4];
1764
a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1);
1765
*(uint32_t*)&src[stride*3]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1767
a= *(uint32_t*)&src[stride*5];
1768
b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1);
1769
*(uint32_t*)&src[stride*4]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1);
1771
b= *(uint32_t*)&src[stride*6];
1772
c= (b&c) + (((b^c)&0xFEFEFEFEUL)>>1);
1773
*(uint32_t*)&src[stride*5]= (c|a) - (((c^a)&0xFEFEFEFEUL)>>1);
1775
c= *(uint32_t*)&src[stride*7];
1776
a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1);
1777
*(uint32_t*)&src[stride*6]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1779
a= *(uint32_t*)&src[stride*8];
1780
b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1);
1781
*(uint32_t*)&src[stride*7]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1);
1783
// c is the unfiltered value read from src[stride*7]; save it in tmp
*(uint32_t*)&tmp[stride*0]= c;
1787
#endif //HAVE_MMX2 || HAVE_AMD3DNOW
1791
* Deinterlace the given block by applying a median filter to every second line.
1792
* will be called for every 8x8 block and can read & write from line 4-15,
1793
* lines 0-3 have been passed through the deblock / dering filters already, but can be read, too.
1794
* lines 4-12 will be read into the deblocking filter and should be deinterlaced
1796
/*
 * src    - line 0 of the tables below
 * stride - distance between two lines
 * Lines 1, 3, 5 and 7 are replaced by the per-pixel median of the line
 * itself and the lines directly above and below it. Three variants follow:
 * pmaxub/pminub based, compare-mask based, and plain C.
 */
static inline void RENAME(deInterlaceMedian)(uint8_t src[], int stride)
1802
// REG_a = line 1, REG_d = line 5 (see the table below)
"lea (%0, %1), %%"REG_a" \n\t"
1803
"lea (%%"REG_a", %1, 4), %%"REG_d" \n\t"
1804
// 0 1 2 3 4 5 6 7 8 9
1805
// %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
1807
// each group computes min(max(x, y), max(min(x, y), z)), which is the
// median of three lines x, y, z
// line 1 = median(line 0, line 1, line 2); mm2 keeps line 2 for the next group
"movq (%0), %%mm0 \n\t" //
1808
"movq (%%"REG_a", %1), %%mm2 \n\t" //
1809
"movq (%%"REG_a"), %%mm1 \n\t" //
1810
"movq %%mm0, %%mm3 \n\t"
1811
"pmaxub %%mm1, %%mm0 \n\t" //
1812
"pminub %%mm3, %%mm1 \n\t" //
1813
"pmaxub %%mm2, %%mm1 \n\t" //
1814
"pminub %%mm1, %%mm0 \n\t"
1815
"movq %%mm0, (%%"REG_a") \n\t"
1817
// line 3 = median(line 2, line 3, line 4); mm0 keeps line 4 for the next group
"movq (%0, %1, 4), %%mm0 \n\t" //
1818
"movq (%%"REG_a", %1, 2), %%mm1 \n\t" //
1819
"movq %%mm2, %%mm3 \n\t"
1820
"pmaxub %%mm1, %%mm2 \n\t" //
1821
"pminub %%mm3, %%mm1 \n\t" //
1822
"pmaxub %%mm0, %%mm1 \n\t" //
1823
"pminub %%mm1, %%mm2 \n\t"
1824
"movq %%mm2, (%%"REG_a", %1, 2) \n\t"
1826
// line 5 = median(line 4, line 5, line 6); mm1 keeps line 6 for the next group
"movq (%%"REG_d"), %%mm2 \n\t" //
1827
"movq (%%"REG_d", %1), %%mm1 \n\t" //
1828
"movq %%mm2, %%mm3 \n\t"
1829
"pmaxub %%mm0, %%mm2 \n\t" //
1830
"pminub %%mm3, %%mm0 \n\t" //
1831
"pmaxub %%mm1, %%mm0 \n\t" //
1832
"pminub %%mm0, %%mm2 \n\t"
1833
"movq %%mm2, (%%"REG_d") \n\t"
1835
// line 7 = median(line 6, line 7, line 8)
"movq (%%"REG_d", %1, 2), %%mm2 \n\t" //
1836
"movq (%0, %1, 8), %%mm0 \n\t" //
1837
"movq %%mm2, %%mm3 \n\t"
1838
"pmaxub %%mm0, %%mm2 \n\t" //
1839
"pminub %%mm3, %%mm0 \n\t" //
1840
"pmaxub %%mm1, %%mm0 \n\t" //
1841
"pminub %%mm0, %%mm2 \n\t"
1842
"movq %%mm2, (%%"REG_d", %1, 2) \n\t"
1845
// operands: %0 = src, %1 = stride; REG_a and REG_d are clobbered
: : "r" (src), "r" ((x86_reg)stride)
1846
: "%"REG_a, "%"REG_d
1849
#else // MMX without MMX2
1851
"lea (%0, %1), %%"REG_a" \n\t"
1852
"lea (%%"REG_a", %1, 4), %%"REG_d" \n\t"
1853
// 0 1 2 3 4 5 6 7 8 9
1854
// %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
1855
"pxor %%mm7, %%mm7 \n\t"
1857
// MEDIAN(a,b,c): writes the per-byte median of lines a, b, c to line b
// without pminub/pmaxub. psubusb + pcmpeqb with mm7 (0) build the three
// "less or equal" masks; XORing pairs of them marks the values that are not
// the median, por forces those bytes to 0xFF and the final pand chain leaves
// the median.
#define REAL_MEDIAN(a,b,c)\
1858
"movq " #a ", %%mm0 \n\t"\
1859
"movq " #b ", %%mm2 \n\t"\
1860
"movq " #c ", %%mm1 \n\t"\
1861
"movq %%mm0, %%mm3 \n\t"\
1862
"movq %%mm1, %%mm4 \n\t"\
1863
"movq %%mm2, %%mm5 \n\t"\
1864
"psubusb %%mm1, %%mm3 \n\t"\
1865
"psubusb %%mm2, %%mm4 \n\t"\
1866
"psubusb %%mm0, %%mm5 \n\t"\
1867
"pcmpeqb %%mm7, %%mm3 \n\t"\
1868
"pcmpeqb %%mm7, %%mm4 \n\t"\
1869
"pcmpeqb %%mm7, %%mm5 \n\t"\
1870
"movq %%mm3, %%mm6 \n\t"\
1871
"pxor %%mm4, %%mm3 \n\t"\
1872
"pxor %%mm5, %%mm4 \n\t"\
1873
"pxor %%mm6, %%mm5 \n\t"\
1874
"por %%mm3, %%mm1 \n\t"\
1875
"por %%mm4, %%mm2 \n\t"\
1876
"por %%mm5, %%mm0 \n\t"\
1877
"pand %%mm2, %%mm0 \n\t"\
1878
"pand %%mm1, %%mm0 \n\t"\
1879
"movq %%mm0, " #b " \n\t"
1880
#define MEDIAN(a,b,c) REAL_MEDIAN(a,b,c)
1882
// written lines (argument b): 1, 3, 5 and 7
MEDIAN((%0) , (%%REGa) , (%%REGa, %1))
1883
MEDIAN((%%REGa, %1), (%%REGa, %1, 2), (%0, %1, 4))
1884
MEDIAN((%0, %1, 4) , (%%REGd) , (%%REGd, %1))
1885
MEDIAN((%%REGd, %1), (%%REGd, %1, 2), (%0, %1, 8))
1887
// operands: %0 = src, %1 = stride; REG_a and REG_d are clobbered
: : "r" (src), "r" ((x86_reg)stride)
1888
: "%"REG_a, "%"REG_d
1894
// FIXME - there should be a way to do a few columns in parallel like w/mmx
1896
// C version: one column at a time, four output lines per column
uint8_t *colsrc = src;
1897
for (y=0; y<4; y++){
1898
int a, b, c, d, e, f;
1900
b = colsrc[stride ];
1901
c = colsrc[stride*2];
1905
// same selection scheme as the MMX macro above
// NOTE(review): a, d, e and f are assigned on lines not visible in this
// excerpt; d, e, f are presumably the pairwise comparison masks - confirm
colsrc[stride ] = (a|(d^f)) & (b|(d^e)) & (c|(e^f));
1915
* Transpose and shift the given 8x8 Block into dst1 and dst2.
1917
/*
 * dst1, dst2 - destination buffers; only fixed byte offsets from them are written
 * src        - top-left pixel of the 8x8 source block
 * srcStride  - distance between two source lines
 * Each transposed row is stored in two 4-byte halves (source lines 0-3 at
 * +0, source lines 4-7 at +4); consecutive rows are 16 bytes apart.
 * Source columns 0-4 go to dst1 at byte offsets 128-192 and source columns
 * 3-7 go to dst2 at byte offsets 48-112.
 */
static inline void RENAME(transpose1)(uint8_t *dst1, uint8_t *dst2, uint8_t *src, int srcStride)
1920
// first half: source lines 0 to 3
"lea (%0, %1), %%"REG_a" \n\t"
1921
// 0 1 2 3 4 5 6 7 8 9
1922
// %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
1923
"movq (%0), %%mm0 \n\t" // 12345678
1924
"movq (%%"REG_a"), %%mm1 \n\t" // abcdefgh
1925
"movq %%mm0, %%mm2 \n\t" // 12345678
1926
"punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
1927
"punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
1929
// the same byte interleave for lines 2 and 3
"movq (%%"REG_a", %1), %%mm1 \n\t"
1930
"movq (%%"REG_a", %1, 2), %%mm3 \n\t"
1931
"movq %%mm1, %%mm4 \n\t"
1932
"punpcklbw %%mm3, %%mm1 \n\t"
1933
"punpckhbw %%mm3, %%mm4 \n\t"
1935
// word interleave: every 32-bit group now holds one source column of lines 0-3
// (mm0: columns 0,1  mm3: columns 2,3  mm2: columns 4,5  mm1: columns 6,7)
"movq %%mm0, %%mm3 \n\t"
1936
"punpcklwd %%mm1, %%mm0 \n\t"
1937
"punpckhwd %%mm1, %%mm3 \n\t"
1938
"movq %%mm2, %%mm1 \n\t"
1939
"punpcklwd %%mm4, %%mm2 \n\t"
1940
"punpckhwd %%mm4, %%mm1 \n\t"
1942
// store the columns: 0-4 to %2 (dst1), 3-7 to %3 (dst2); columns 3 and 4 go to both
"movd %%mm0, 128(%2) \n\t"
1943
"psrlq $32, %%mm0 \n\t"
1944
"movd %%mm0, 144(%2) \n\t"
1945
"movd %%mm3, 160(%2) \n\t"
1946
"psrlq $32, %%mm3 \n\t"
1947
"movd %%mm3, 176(%2) \n\t"
1948
"movd %%mm3, 48(%3) \n\t"
1949
"movd %%mm2, 192(%2) \n\t"
1950
"movd %%mm2, 64(%3) \n\t"
1951
"psrlq $32, %%mm2 \n\t"
1952
"movd %%mm2, 80(%3) \n\t"
1953
"movd %%mm1, 96(%3) \n\t"
1954
"psrlq $32, %%mm1 \n\t"
1955
"movd %%mm1, 112(%3) \n\t"
1957
// second half: source lines 4 to 7 (REG_a advanced by 4 lines); same
// sequence, stored 4 bytes further into every destination row
"lea (%%"REG_a", %1, 4), %%"REG_a" \n\t"
1959
"movq (%0, %1, 4), %%mm0 \n\t" // 12345678
1960
"movq (%%"REG_a"), %%mm1 \n\t" // abcdefgh
1961
"movq %%mm0, %%mm2 \n\t" // 12345678
1962
"punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
1963
"punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
1965
"movq (%%"REG_a", %1), %%mm1 \n\t"
1966
"movq (%%"REG_a", %1, 2), %%mm3 \n\t"
1967
"movq %%mm1, %%mm4 \n\t"
1968
"punpcklbw %%mm3, %%mm1 \n\t"
1969
"punpckhbw %%mm3, %%mm4 \n\t"
1971
"movq %%mm0, %%mm3 \n\t"
1972
"punpcklwd %%mm1, %%mm0 \n\t"
1973
"punpckhwd %%mm1, %%mm3 \n\t"
1974
"movq %%mm2, %%mm1 \n\t"
1975
"punpcklwd %%mm4, %%mm2 \n\t"
1976
"punpckhwd %%mm4, %%mm1 \n\t"
1978
"movd %%mm0, 132(%2) \n\t"
1979
"psrlq $32, %%mm0 \n\t"
1980
"movd %%mm0, 148(%2) \n\t"
1981
"movd %%mm3, 164(%2) \n\t"
1982
"psrlq $32, %%mm3 \n\t"
1983
"movd %%mm3, 180(%2) \n\t"
1984
"movd %%mm3, 52(%3) \n\t"
1985
"movd %%mm2, 196(%2) \n\t"
1986
"movd %%mm2, 68(%3) \n\t"
1987
"psrlq $32, %%mm2 \n\t"
1988
"movd %%mm2, 84(%3) \n\t"
1989
"movd %%mm1, 100(%3) \n\t"
1990
"psrlq $32, %%mm1 \n\t"
1991
"movd %%mm1, 116(%3) \n\t"
1994
// operands: %0 = src, %1 = srcStride, %2 = dst1, %3 = dst2
// NOTE(review): REG_a is modified above; its clobber declaration is not
// visible in this excerpt - confirm it is present
:: "r" (src), "r" ((x86_reg)srcStride), "r" (dst1), "r" (dst2)
2000
* Transpose the given 8x8 block.
2002
/*
 * dst       - top-left pixel of the 8x8 destination block
 * dstStride - distance between two destination lines
 * src       - source buffer holding 8 rows of 8 bytes, consecutive rows
 *             16 bytes apart (offsets 0, 16, ... 112)
 * Destination line n receives source column n: bytes 0-3 come from source
 * rows 0-3, bytes 4-7 from source rows 4-7.
 */
static inline void RENAME(transpose2)(uint8_t *dst, int dstStride, uint8_t *src)
2005
// REG_a = destination line 1, REG_d = destination line 5 (see the table below)
"lea (%0, %1), %%"REG_a" \n\t"
2006
"lea (%%"REG_a",%1,4), %%"REG_d" \n\t"
2007
// 0 1 2 3 4 5 6 7 8 9
2008
// %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
2009
// first half: source rows 0 to 3
"movq (%2), %%mm0 \n\t" // 12345678
2010
"movq 16(%2), %%mm1 \n\t" // abcdefgh
2011
"movq %%mm0, %%mm2 \n\t" // 12345678
2012
"punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
2013
"punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
2015
"movq 32(%2), %%mm1 \n\t"
2016
"movq 48(%2), %%mm3 \n\t"
2017
"movq %%mm1, %%mm4 \n\t"
2018
"punpcklbw %%mm3, %%mm1 \n\t"
2019
"punpckhbw %%mm3, %%mm4 \n\t"
2021
// word interleave: every 32-bit group now holds one source column of rows 0-3
// (mm0: columns 0,1  mm3: columns 2,3  mm2: columns 4,5  mm1: columns 6,7)
"movq %%mm0, %%mm3 \n\t"
2022
"punpcklwd %%mm1, %%mm0 \n\t"
2023
"punpckhwd %%mm1, %%mm3 \n\t"
2024
"movq %%mm2, %%mm1 \n\t"
2025
"punpcklwd %%mm4, %%mm2 \n\t"
2026
"punpckhwd %%mm4, %%mm1 \n\t"
2028
// columns 0 to 7 become bytes 0-3 of destination lines 0 to 7
"movd %%mm0, (%0) \n\t"
2029
"psrlq $32, %%mm0 \n\t"
2030
"movd %%mm0, (%%"REG_a") \n\t"
2031
"movd %%mm3, (%%"REG_a", %1) \n\t"
2032
"psrlq $32, %%mm3 \n\t"
2033
"movd %%mm3, (%%"REG_a", %1, 2) \n\t"
2034
"movd %%mm2, (%0, %1, 4) \n\t"
2035
"psrlq $32, %%mm2 \n\t"
2036
"movd %%mm2, (%%"REG_d") \n\t"
2037
"movd %%mm1, (%%"REG_d", %1) \n\t"
2038
"psrlq $32, %%mm1 \n\t"
2039
"movd %%mm1, (%%"REG_d", %1, 2) \n\t"
2042
// second half: source rows 4 to 7, stored to bytes 4-7 of the same destination lines
"movq 64(%2), %%mm0 \n\t" // 12345678
2043
"movq 80(%2), %%mm1 \n\t" // abcdefgh
2044
"movq %%mm0, %%mm2 \n\t" // 12345678
2045
"punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
2046
"punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
2048
"movq 96(%2), %%mm1 \n\t"
2049
"movq 112(%2), %%mm3 \n\t"
2050
"movq %%mm1, %%mm4 \n\t"
2051
"punpcklbw %%mm3, %%mm1 \n\t"
2052
"punpckhbw %%mm3, %%mm4 \n\t"
2054
"movq %%mm0, %%mm3 \n\t"
2055
"punpcklwd %%mm1, %%mm0 \n\t"
2056
"punpckhwd %%mm1, %%mm3 \n\t"
2057
"movq %%mm2, %%mm1 \n\t"
2058
"punpcklwd %%mm4, %%mm2 \n\t"
2059
"punpckhwd %%mm4, %%mm1 \n\t"
2061
"movd %%mm0, 4(%0) \n\t"
2062
"psrlq $32, %%mm0 \n\t"
2063
"movd %%mm0, 4(%%"REG_a") \n\t"
2064
"movd %%mm3, 4(%%"REG_a", %1) \n\t"
2065
"psrlq $32, %%mm3 \n\t"
2066
"movd %%mm3, 4(%%"REG_a", %1, 2) \n\t"
2067
"movd %%mm2, 4(%0, %1, 4) \n\t"
2068
"psrlq $32, %%mm2 \n\t"
2069
"movd %%mm2, 4(%%"REG_d") \n\t"
2070
"movd %%mm1, 4(%%"REG_d", %1) \n\t"
2071
"psrlq $32, %%mm1 \n\t"
2072
"movd %%mm1, 4(%%"REG_d", %1, 2) \n\t"
2074
// operands: %0 = dst, %1 = dstStride, %2 = src; REG_a and REG_d are clobbered
:: "r" (dst), "r" ((x86_reg)dstStride), "r" (src)
2075
: "%"REG_a, "%"REG_d
2079
//static long test=0;
2082
static inline void RENAME(tempNoiseReducer)(uint8_t *src, int stride,
2083
uint8_t *tempBlurred, uint32_t *tempBlurredPast, int *maxNoise)
2085
// to save a register (FIXME do this outside of the loops)
2086
tempBlurredPast[127]= maxNoise[0];
2087
tempBlurredPast[128]= maxNoise[1];
2088
tempBlurredPast[129]= maxNoise[2];
2090
#define FAST_L2_DIFF
2091
//#define L1_DIFF //u should change the thresholds too if u try that one
2092
#if HAVE_MMX2 || HAVE_AMD3DNOW
2094
"lea (%2, %2, 2), %%"REG_a" \n\t" // 3*stride
2095
"lea (%2, %2, 4), %%"REG_d" \n\t" // 5*stride
2096
"lea (%%"REG_d", %2, 2), %%"REG_c" \n\t" // 7*stride
2097
// 0 1 2 3 4 5 6 7 8 9
2098
// %x %x+%2 %x+2%2 %x+eax %x+4%2 %x+edx %x+2eax %x+ecx %x+8%2
2100
#ifdef L1_DIFF //needs mmx2
2101
"movq (%0), %%mm0 \n\t" // L0
2102
"psadbw (%1), %%mm0 \n\t" // |L0-R0|
2103
"movq (%0, %2), %%mm1 \n\t" // L1
2104
"psadbw (%1, %2), %%mm1 \n\t" // |L1-R1|
2105
"movq (%0, %2, 2), %%mm2 \n\t" // L2
2106
"psadbw (%1, %2, 2), %%mm2 \n\t" // |L2-R2|
2107
"movq (%0, %%"REG_a"), %%mm3 \n\t" // L3
2108
"psadbw (%1, %%"REG_a"), %%mm3 \n\t" // |L3-R3|
2110
"movq (%0, %2, 4), %%mm4 \n\t" // L4
2111
"paddw %%mm1, %%mm0 \n\t"
2112
"psadbw (%1, %2, 4), %%mm4 \n\t" // |L4-R4|
2113
"movq (%0, %%"REG_d"), %%mm5 \n\t" // L5
2114
"paddw %%mm2, %%mm0 \n\t"
2115
"psadbw (%1, %%"REG_d"), %%mm5 \n\t" // |L5-R5|
2116
"movq (%0, %%"REG_a", 2), %%mm6 \n\t" // L6
2117
"paddw %%mm3, %%mm0 \n\t"
2118
"psadbw (%1, %%"REG_a", 2), %%mm6 \n\t" // |L6-R6|
2119
"movq (%0, %%"REG_c"), %%mm7 \n\t" // L7
2120
"paddw %%mm4, %%mm0 \n\t"
2121
"psadbw (%1, %%"REG_c"), %%mm7 \n\t" // |L7-R7|
2122
"paddw %%mm5, %%mm6 \n\t"
2123
"paddw %%mm7, %%mm6 \n\t"
2124
"paddw %%mm6, %%mm0 \n\t"
2126
#if defined (FAST_L2_DIFF)
2127
"pcmpeqb %%mm7, %%mm7 \n\t"
2128
"movq "MANGLE(b80)", %%mm6 \n\t"
2129
"pxor %%mm0, %%mm0 \n\t"
2130
#define REAL_L2_DIFF_CORE(a, b)\
2131
"movq " #a ", %%mm5 \n\t"\
2132
"movq " #b ", %%mm2 \n\t"\
2133
"pxor %%mm7, %%mm2 \n\t"\
2134
PAVGB(%%mm2, %%mm5)\
2135
"paddb %%mm6, %%mm5 \n\t"\
2136
"movq %%mm5, %%mm2 \n\t"\
2137
"psllw $8, %%mm5 \n\t"\
2138
"pmaddwd %%mm5, %%mm5 \n\t"\
2139
"pmaddwd %%mm2, %%mm2 \n\t"\
2140
"paddd %%mm2, %%mm5 \n\t"\
2141
"psrld $14, %%mm5 \n\t"\
2142
"paddd %%mm5, %%mm0 \n\t"
2144
#else //defined (FAST_L2_DIFF)
2145
"pxor %%mm7, %%mm7 \n\t"
2146
"pxor %%mm0, %%mm0 \n\t"
2147
#define REAL_L2_DIFF_CORE(a, b)\
2148
"movq " #a ", %%mm5 \n\t"\
2149
"movq " #b ", %%mm2 \n\t"\
2150
"movq %%mm5, %%mm1 \n\t"\
2151
"movq %%mm2, %%mm3 \n\t"\
2152
"punpcklbw %%mm7, %%mm5 \n\t"\
2153
"punpckhbw %%mm7, %%mm1 \n\t"\
2154
"punpcklbw %%mm7, %%mm2 \n\t"\
2155
"punpckhbw %%mm7, %%mm3 \n\t"\
2156
"psubw %%mm2, %%mm5 \n\t"\
2157
"psubw %%mm3, %%mm1 \n\t"\
2158
"pmaddwd %%mm5, %%mm5 \n\t"\
2159
"pmaddwd %%mm1, %%mm1 \n\t"\
2160
"paddd %%mm1, %%mm5 \n\t"\
2161
"paddd %%mm5, %%mm0 \n\t"
2163
#endif //defined (FAST_L2_DIFF)
2165
#define L2_DIFF_CORE(a, b) REAL_L2_DIFF_CORE(a, b)
2167
L2_DIFF_CORE((%0) , (%1))
2168
L2_DIFF_CORE((%0, %2) , (%1, %2))
2169
L2_DIFF_CORE((%0, %2, 2) , (%1, %2, 2))
2170
L2_DIFF_CORE((%0, %%REGa) , (%1, %%REGa))
2171
L2_DIFF_CORE((%0, %2, 4) , (%1, %2, 4))
2172
L2_DIFF_CORE((%0, %%REGd) , (%1, %%REGd))
2173
L2_DIFF_CORE((%0, %%REGa,2), (%1, %%REGa,2))
2174
L2_DIFF_CORE((%0, %%REGc) , (%1, %%REGc))
2178
"movq %%mm0, %%mm4 \n\t"
2179
"psrlq $32, %%mm0 \n\t"
2180
"paddd %%mm0, %%mm4 \n\t"
2181
"movd %%mm4, %%ecx \n\t"
2182
"shll $2, %%ecx \n\t"
2183
"mov %3, %%"REG_d" \n\t"
2184
"addl -4(%%"REG_d"), %%ecx \n\t"
2185
"addl 4(%%"REG_d"), %%ecx \n\t"
2186
"addl -1024(%%"REG_d"), %%ecx \n\t"
2187
"addl $4, %%ecx \n\t"
2188
"addl 1024(%%"REG_d"), %%ecx \n\t"
2189
"shrl $3, %%ecx \n\t"
2190
"movl %%ecx, (%%"REG_d") \n\t"
2192
// "mov %3, %%"REG_c" \n\t"
2193
// "mov %%"REG_c", test \n\t"
2195
"cmpl 512(%%"REG_d"), %%ecx \n\t"
2197
"cmpl 516(%%"REG_d"), %%ecx \n\t"
2200
"lea (%%"REG_a", %2, 2), %%"REG_d" \n\t" // 5*stride
2201
"lea (%%"REG_d", %2, 2), %%"REG_c" \n\t" // 7*stride
2202
"movq (%0), %%mm0 \n\t" // L0
2203
"movq (%0, %2), %%mm1 \n\t" // L1
2204
"movq (%0, %2, 2), %%mm2 \n\t" // L2
2205
"movq (%0, %%"REG_a"), %%mm3 \n\t" // L3
2206
"movq (%0, %2, 4), %%mm4 \n\t" // L4
2207
"movq (%0, %%"REG_d"), %%mm5 \n\t" // L5
2208
"movq (%0, %%"REG_a", 2), %%mm6 \n\t" // L6
2209
"movq (%0, %%"REG_c"), %%mm7 \n\t" // L7
2210
"movq %%mm0, (%1) \n\t" // L0
2211
"movq %%mm1, (%1, %2) \n\t" // L1
2212
"movq %%mm2, (%1, %2, 2) \n\t" // L2
2213
"movq %%mm3, (%1, %%"REG_a") \n\t" // L3
2214
"movq %%mm4, (%1, %2, 4) \n\t" // L4
2215
"movq %%mm5, (%1, %%"REG_d") \n\t" // L5
2216
"movq %%mm6, (%1, %%"REG_a", 2) \n\t" // L6
2217
"movq %%mm7, (%1, %%"REG_c") \n\t" // L7
2221
"lea (%%"REG_a", %2, 2), %%"REG_d" \n\t" // 5*stride
2222
"lea (%%"REG_d", %2, 2), %%"REG_c" \n\t" // 7*stride
2223
"movq (%0), %%mm0 \n\t" // L0
2224
PAVGB((%1), %%mm0) // L0
2225
"movq (%0, %2), %%mm1 \n\t" // L1
2226
PAVGB((%1, %2), %%mm1) // L1
2227
"movq (%0, %2, 2), %%mm2 \n\t" // L2
2228
PAVGB((%1, %2, 2), %%mm2) // L2
2229
"movq (%0, %%"REG_a"), %%mm3 \n\t" // L3
2230
PAVGB((%1, %%REGa), %%mm3) // L3
2231
"movq (%0, %2, 4), %%mm4 \n\t" // L4
2232
PAVGB((%1, %2, 4), %%mm4) // L4
2233
"movq (%0, %%"REG_d"), %%mm5 \n\t" // L5
2234
PAVGB((%1, %%REGd), %%mm5) // L5
2235
"movq (%0, %%"REG_a", 2), %%mm6 \n\t" // L6
2236
PAVGB((%1, %%REGa, 2), %%mm6) // L6
2237
"movq (%0, %%"REG_c"), %%mm7 \n\t" // L7
2238
PAVGB((%1, %%REGc), %%mm7) // L7
2239
"movq %%mm0, (%1) \n\t" // R0
2240
"movq %%mm1, (%1, %2) \n\t" // R1
2241
"movq %%mm2, (%1, %2, 2) \n\t" // R2
2242
"movq %%mm3, (%1, %%"REG_a") \n\t" // R3
2243
"movq %%mm4, (%1, %2, 4) \n\t" // R4
2244
"movq %%mm5, (%1, %%"REG_d") \n\t" // R5
2245
"movq %%mm6, (%1, %%"REG_a", 2) \n\t" // R6
2246
"movq %%mm7, (%1, %%"REG_c") \n\t" // R7
2247
"movq %%mm0, (%0) \n\t" // L0
2248
"movq %%mm1, (%0, %2) \n\t" // L1
2249
"movq %%mm2, (%0, %2, 2) \n\t" // L2
2250
"movq %%mm3, (%0, %%"REG_a") \n\t" // L3
2251
"movq %%mm4, (%0, %2, 4) \n\t" // L4
2252
"movq %%mm5, (%0, %%"REG_d") \n\t" // L5
2253
"movq %%mm6, (%0, %%"REG_a", 2) \n\t" // L6
2254
"movq %%mm7, (%0, %%"REG_c") \n\t" // L7
2258
"cmpl 508(%%"REG_d"), %%ecx \n\t"
2261
"lea (%%"REG_a", %2, 2), %%"REG_d" \n\t" // 5*stride
2262
"lea (%%"REG_d", %2, 2), %%"REG_c" \n\t" // 7*stride
2263
"movq (%0), %%mm0 \n\t" // L0
2264
"movq (%0, %2), %%mm1 \n\t" // L1
2265
"movq (%0, %2, 2), %%mm2 \n\t" // L2
2266
"movq (%0, %%"REG_a"), %%mm3 \n\t" // L3
2267
"movq (%1), %%mm4 \n\t" // R0
2268
"movq (%1, %2), %%mm5 \n\t" // R1
2269
"movq (%1, %2, 2), %%mm6 \n\t" // R2
2270
"movq (%1, %%"REG_a"), %%mm7 \n\t" // R3
2279
"movq %%mm0, (%1) \n\t" // R0
2280
"movq %%mm1, (%1, %2) \n\t" // R1
2281
"movq %%mm2, (%1, %2, 2) \n\t" // R2
2282
"movq %%mm3, (%1, %%"REG_a") \n\t" // R3
2283
"movq %%mm0, (%0) \n\t" // L0
2284
"movq %%mm1, (%0, %2) \n\t" // L1
2285
"movq %%mm2, (%0, %2, 2) \n\t" // L2
2286
"movq %%mm3, (%0, %%"REG_a") \n\t" // L3
2288
"movq (%0, %2, 4), %%mm0 \n\t" // L4
2289
"movq (%0, %%"REG_d"), %%mm1 \n\t" // L5
2290
"movq (%0, %%"REG_a", 2), %%mm2 \n\t" // L6
2291
"movq (%0, %%"REG_c"), %%mm3 \n\t" // L7
2292
"movq (%1, %2, 4), %%mm4 \n\t" // R4
2293
"movq (%1, %%"REG_d"), %%mm5 \n\t" // R5
2294
"movq (%1, %%"REG_a", 2), %%mm6 \n\t" // R6
2295
"movq (%1, %%"REG_c"), %%mm7 \n\t" // R7
2304
"movq %%mm0, (%1, %2, 4) \n\t" // R4
2305
"movq %%mm1, (%1, %%"REG_d") \n\t" // R5
2306
"movq %%mm2, (%1, %%"REG_a", 2) \n\t" // R6
2307
"movq %%mm3, (%1, %%"REG_c") \n\t" // R7
2308
"movq %%mm0, (%0, %2, 4) \n\t" // L4
2309
"movq %%mm1, (%0, %%"REG_d") \n\t" // L5
2310
"movq %%mm2, (%0, %%"REG_a", 2) \n\t" // L6
2311
"movq %%mm3, (%0, %%"REG_c") \n\t" // L7
2315
"lea (%%"REG_a", %2, 2), %%"REG_d" \n\t" // 5*stride
2316
"lea (%%"REG_d", %2, 2), %%"REG_c" \n\t" // 7*stride
2317
"movq (%0), %%mm0 \n\t" // L0
2318
"movq (%0, %2), %%mm1 \n\t" // L1
2319
"movq (%0, %2, 2), %%mm2 \n\t" // L2
2320
"movq (%0, %%"REG_a"), %%mm3 \n\t" // L3
2321
"movq (%1), %%mm4 \n\t" // R0
2322
"movq (%1, %2), %%mm5 \n\t" // R1
2323
"movq (%1, %2, 2), %%mm6 \n\t" // R2
2324
"movq (%1, %%"REG_a"), %%mm7 \n\t" // R3
2337
"movq %%mm0, (%1) \n\t" // R0
2338
"movq %%mm1, (%1, %2) \n\t" // R1
2339
"movq %%mm2, (%1, %2, 2) \n\t" // R2
2340
"movq %%mm3, (%1, %%"REG_a") \n\t" // R3
2341
"movq %%mm0, (%0) \n\t" // L0
2342
"movq %%mm1, (%0, %2) \n\t" // L1
2343
"movq %%mm2, (%0, %2, 2) \n\t" // L2
2344
"movq %%mm3, (%0, %%"REG_a") \n\t" // L3
2346
"movq (%0, %2, 4), %%mm0 \n\t" // L4
2347
"movq (%0, %%"REG_d"), %%mm1 \n\t" // L5
2348
"movq (%0, %%"REG_a", 2), %%mm2 \n\t" // L6
2349
"movq (%0, %%"REG_c"), %%mm3 \n\t" // L7
2350
"movq (%1, %2, 4), %%mm4 \n\t" // R4
2351
"movq (%1, %%"REG_d"), %%mm5 \n\t" // R5
2352
"movq (%1, %%"REG_a", 2), %%mm6 \n\t" // R6
2353
"movq (%1, %%"REG_c"), %%mm7 \n\t" // R7
2366
"movq %%mm0, (%1, %2, 4) \n\t" // R4
2367
"movq %%mm1, (%1, %%"REG_d") \n\t" // R5
2368
"movq %%mm2, (%1, %%"REG_a", 2) \n\t" // R6
2369
"movq %%mm3, (%1, %%"REG_c") \n\t" // R7
2370
"movq %%mm0, (%0, %2, 4) \n\t" // L4
2371
"movq %%mm1, (%0, %%"REG_d") \n\t" // L5
2372
"movq %%mm2, (%0, %%"REG_a", 2) \n\t" // L6
2373
"movq %%mm3, (%0, %%"REG_c") \n\t" // L7
2377
:: "r" (src), "r" (tempBlurred), "r"((x86_reg)stride), "m" (tempBlurredPast)
2378
: "%"REG_a, "%"REG_d, "%"REG_c, "memory"
2380
#else //HAVE_MMX2 || HAVE_AMD3DNOW
2390
int ref= tempBlurred[ x + y*stride ];
2391
int cur= src[ x + y*stride ];
2393
// if(x==0 || x==7) d1+= d1>>1;
2394
// if(y==0 || y==7) d1+= d1>>1;
2403
+(*(tempBlurredPast-256))
2404
+(*(tempBlurredPast-1))+ (*(tempBlurredPast+1))
2405
+(*(tempBlurredPast+256))
2408
// ((*tempBlurredPast)*3 + d + 2)>>2;
2413
64 32 16 8 4 2 1 (1)
2414
64 48 36 27 20 15 11 (33) (approx)
2415
64 56 49 43 37 33 29 (200) (approx)
2417
if(d > maxNoise[1]){
2418
if(d < maxNoise[2]){
2422
int ref= tempBlurred[ x + y*stride ];
2423
int cur= src[ x + y*stride ];
2424
tempBlurred[ x + y*stride ]=
2425
src[ x + y*stride ]=
2433
tempBlurred[ x + y*stride ]= src[ x + y*stride ];
2438
if(d < maxNoise[0]){
2442
int ref= tempBlurred[ x + y*stride ];
2443
int cur= src[ x + y*stride ];
2444
tempBlurred[ x + y*stride ]=
2445
src[ x + y*stride ]=
2446
(ref*7 + cur + 4)>>3;
2453
int ref= tempBlurred[ x + y*stride ];
2454
int cur= src[ x + y*stride ];
2455
tempBlurred[ x + y*stride ]=
2456
src[ x + y*stride ]=
2457
(ref*3 + cur + 2)>>2;
2463
#endif //HAVE_MMX2 || HAVE_AMD3DNOW
2465
#endif //HAVE_ALTIVEC
2469
* accurate deblock filter
2471
static av_always_inline void RENAME(do_a_deblock)(uint8_t *src, int step, int stride, PPContext *c){
2472
int64_t dc_mask, eq_mask, both_masks;
2473
int64_t sums[10*8*2];
2474
src+= step*3; // src points to begin of the 8x8 Block
2477
"movq %0, %%mm7 \n\t"
2478
"movq %1, %%mm6 \n\t"
2479
: : "m" (c->mmxDcOffset[c->nonBQP]), "m" (c->mmxDcThreshold[c->nonBQP])
2483
"lea (%2, %3), %%"REG_a" \n\t"
2484
// 0 1 2 3 4 5 6 7 8 9
2485
// %1 eax eax+%2 eax+2%2 %1+4%2 ecx ecx+%2 ecx+2%2 %1+8%2 ecx+4%2
2487
"movq (%2), %%mm0 \n\t"
2488
"movq (%%"REG_a"), %%mm1 \n\t"
2489
"movq %%mm1, %%mm3 \n\t"
2490
"movq %%mm1, %%mm4 \n\t"
2491
"psubb %%mm1, %%mm0 \n\t" // mm0 = differnece
2492
"paddb %%mm7, %%mm0 \n\t"
2493
"pcmpgtb %%mm6, %%mm0 \n\t"
2495
"movq (%%"REG_a",%3), %%mm2 \n\t"
2496
PMAXUB(%%mm2, %%mm4)
2497
PMINUB(%%mm2, %%mm3, %%mm5)
2498
"psubb %%mm2, %%mm1 \n\t"
2499
"paddb %%mm7, %%mm1 \n\t"
2500
"pcmpgtb %%mm6, %%mm1 \n\t"
2501
"paddb %%mm1, %%mm0 \n\t"
2503
"movq (%%"REG_a", %3, 2), %%mm1 \n\t"
2504
PMAXUB(%%mm1, %%mm4)
2505
PMINUB(%%mm1, %%mm3, %%mm5)
2506
"psubb %%mm1, %%mm2 \n\t"
2507
"paddb %%mm7, %%mm2 \n\t"
2508
"pcmpgtb %%mm6, %%mm2 \n\t"
2509
"paddb %%mm2, %%mm0 \n\t"
2511
"lea (%%"REG_a", %3, 4), %%"REG_a" \n\t"
2513
"movq (%2, %3, 4), %%mm2 \n\t"
2514
PMAXUB(%%mm2, %%mm4)
2515
PMINUB(%%mm2, %%mm3, %%mm5)
2516
"psubb %%mm2, %%mm1 \n\t"
2517
"paddb %%mm7, %%mm1 \n\t"
2518
"pcmpgtb %%mm6, %%mm1 \n\t"
2519
"paddb %%mm1, %%mm0 \n\t"
2521
"movq (%%"REG_a"), %%mm1 \n\t"
2522
PMAXUB(%%mm1, %%mm4)
2523
PMINUB(%%mm1, %%mm3, %%mm5)
2524
"psubb %%mm1, %%mm2 \n\t"
2525
"paddb %%mm7, %%mm2 \n\t"
2526
"pcmpgtb %%mm6, %%mm2 \n\t"
2527
"paddb %%mm2, %%mm0 \n\t"
2529
"movq (%%"REG_a", %3), %%mm2 \n\t"
2530
PMAXUB(%%mm2, %%mm4)
2531
PMINUB(%%mm2, %%mm3, %%mm5)
2532
"psubb %%mm2, %%mm1 \n\t"
2533
"paddb %%mm7, %%mm1 \n\t"
2534
"pcmpgtb %%mm6, %%mm1 \n\t"
2535
"paddb %%mm1, %%mm0 \n\t"
2537
"movq (%%"REG_a", %3, 2), %%mm1 \n\t"
2538
PMAXUB(%%mm1, %%mm4)
2539
PMINUB(%%mm1, %%mm3, %%mm5)
2540
"psubb %%mm1, %%mm2 \n\t"
2541
"paddb %%mm7, %%mm2 \n\t"
2542
"pcmpgtb %%mm6, %%mm2 \n\t"
2543
"paddb %%mm2, %%mm0 \n\t"
2545
"movq (%2, %3, 8), %%mm2 \n\t"
2546
PMAXUB(%%mm2, %%mm4)
2547
PMINUB(%%mm2, %%mm3, %%mm5)
2548
"psubb %%mm2, %%mm1 \n\t"
2549
"paddb %%mm7, %%mm1 \n\t"
2550
"pcmpgtb %%mm6, %%mm1 \n\t"
2551
"paddb %%mm1, %%mm0 \n\t"
2553
"movq (%%"REG_a", %3, 4), %%mm1 \n\t"
2554
"psubb %%mm1, %%mm2 \n\t"
2555
"paddb %%mm7, %%mm2 \n\t"
2556
"pcmpgtb %%mm6, %%mm2 \n\t"
2557
"paddb %%mm2, %%mm0 \n\t"
2558
"psubusb %%mm3, %%mm4 \n\t"
2560
"pxor %%mm6, %%mm6 \n\t"
2561
"movq %4, %%mm7 \n\t" // QP,..., QP
2562
"paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP
2563
"psubusb %%mm4, %%mm7 \n\t" // Diff >=2QP -> 0
2564
"pcmpeqb %%mm6, %%mm7 \n\t" // Diff < 2QP -> 0
2565
"pcmpeqb %%mm6, %%mm7 \n\t" // Diff < 2QP -> 0
2566
"movq %%mm7, %1 \n\t"
2568
"movq %5, %%mm7 \n\t"
2569
"punpcklbw %%mm7, %%mm7 \n\t"
2570
"punpcklbw %%mm7, %%mm7 \n\t"
2571
"punpcklbw %%mm7, %%mm7 \n\t"
2572
"psubb %%mm0, %%mm6 \n\t"
2573
"pcmpgtb %%mm7, %%mm6 \n\t"
2574
"movq %%mm6, %0 \n\t"
2576
: "=m" (eq_mask), "=m" (dc_mask)
2577
: "r" (src), "r" ((x86_reg)step), "m" (c->pQPb), "m"(c->ppMode.flatnessThreshold)
2581
both_masks = dc_mask & eq_mask;
2584
x86_reg offset= -8*step;
2585
int64_t *temp_sums= sums;
2588
"movq %2, %%mm0 \n\t" // QP,..., QP
2589
"pxor %%mm4, %%mm4 \n\t"
2591
"movq (%0), %%mm6 \n\t"
2592
"movq (%0, %1), %%mm5 \n\t"
2593
"movq %%mm5, %%mm1 \n\t"
2594
"movq %%mm6, %%mm2 \n\t"
2595
"psubusb %%mm6, %%mm5 \n\t"
2596
"psubusb %%mm1, %%mm2 \n\t"
2597
"por %%mm5, %%mm2 \n\t" // ABS Diff of lines
2598
"psubusb %%mm2, %%mm0 \n\t" // diff >= QP -> 0
2599
"pcmpeqb %%mm4, %%mm0 \n\t" // diff >= QP -> FF
2601
"pxor %%mm6, %%mm1 \n\t"
2602
"pand %%mm0, %%mm1 \n\t"
2603
"pxor %%mm1, %%mm6 \n\t"
2606
"movq (%0, %1, 8), %%mm5 \n\t"
2607
"add %1, %0 \n\t" // %0 points to line 1 not 0
2608
"movq (%0, %1, 8), %%mm7 \n\t"
2609
"movq %%mm5, %%mm1 \n\t"
2610
"movq %%mm7, %%mm2 \n\t"
2611
"psubusb %%mm7, %%mm5 \n\t"
2612
"psubusb %%mm1, %%mm2 \n\t"
2613
"por %%mm5, %%mm2 \n\t" // ABS Diff of lines
2614
"movq %2, %%mm0 \n\t" // QP,..., QP
2615
"psubusb %%mm2, %%mm0 \n\t" // diff >= QP -> 0
2616
"pcmpeqb %%mm4, %%mm0 \n\t" // diff >= QP -> FF
2618
"pxor %%mm7, %%mm1 \n\t"
2619
"pand %%mm0, %%mm1 \n\t"
2620
"pxor %%mm1, %%mm7 \n\t"
2622
"movq %%mm6, %%mm5 \n\t"
2623
"punpckhbw %%mm4, %%mm6 \n\t"
2624
"punpcklbw %%mm4, %%mm5 \n\t"
2625
// 4:0 5/6:First 7:Last
2627
"movq %%mm5, %%mm0 \n\t"
2628
"movq %%mm6, %%mm1 \n\t"
2629
"psllw $2, %%mm0 \n\t"
2630
"psllw $2, %%mm1 \n\t"
2631
"paddw "MANGLE(w04)", %%mm0 \n\t"
2632
"paddw "MANGLE(w04)", %%mm1 \n\t"
2635
"movq (%0), %%mm2 \n\t"\
2636
"movq (%0), %%mm3 \n\t"\
2638
"punpcklbw %%mm4, %%mm2 \n\t"\
2639
"punpckhbw %%mm4, %%mm3 \n\t"\
2640
"paddw %%mm2, %%mm0 \n\t"\
2641
"paddw %%mm3, %%mm1 \n\t"
2644
"movq (%0), %%mm2 \n\t"\
2645
"movq (%0), %%mm3 \n\t"\
2647
"punpcklbw %%mm4, %%mm2 \n\t"\
2648
"punpckhbw %%mm4, %%mm3 \n\t"\
2649
"psubw %%mm2, %%mm0 \n\t"\
2650
"psubw %%mm3, %%mm1 \n\t"
2656
"movq %%mm0, (%3) \n\t"
2657
"movq %%mm1, 8(%3) \n\t"
2660
"psubw %%mm5, %%mm0 \n\t"
2661
"psubw %%mm6, %%mm1 \n\t"
2662
"movq %%mm0, 16(%3) \n\t"
2663
"movq %%mm1, 24(%3) \n\t"
2666
"psubw %%mm5, %%mm0 \n\t"
2667
"psubw %%mm6, %%mm1 \n\t"
2668
"movq %%mm0, 32(%3) \n\t"
2669
"movq %%mm1, 40(%3) \n\t"
2672
"psubw %%mm5, %%mm0 \n\t"
2673
"psubw %%mm6, %%mm1 \n\t"
2674
"movq %%mm0, 48(%3) \n\t"
2675
"movq %%mm1, 56(%3) \n\t"
2678
"psubw %%mm5, %%mm0 \n\t"
2679
"psubw %%mm6, %%mm1 \n\t"
2680
"movq %%mm0, 64(%3) \n\t"
2681
"movq %%mm1, 72(%3) \n\t"
2683
"movq %%mm7, %%mm6 \n\t"
2684
"punpckhbw %%mm4, %%mm7 \n\t"
2685
"punpcklbw %%mm4, %%mm6 \n\t"
2691
"movq %%mm0, 80(%3) \n\t"
2692
"movq %%mm1, 88(%3) \n\t"
2695
"paddw %%mm6, %%mm0 \n\t"
2696
"paddw %%mm7, %%mm1 \n\t"
2697
"movq %%mm0, 96(%3) \n\t"
2698
"movq %%mm1, 104(%3) \n\t"
2701
"paddw %%mm6, %%mm0 \n\t"
2702
"paddw %%mm7, %%mm1 \n\t"
2703
"movq %%mm0, 112(%3) \n\t"
2704
"movq %%mm1, 120(%3) \n\t"
2707
"paddw %%mm6, %%mm0 \n\t"
2708
"paddw %%mm7, %%mm1 \n\t"
2709
"movq %%mm0, 128(%3) \n\t"
2710
"movq %%mm1, 136(%3) \n\t"
2713
"paddw %%mm6, %%mm0 \n\t"
2714
"paddw %%mm7, %%mm1 \n\t"
2715
"movq %%mm0, 144(%3) \n\t"
2716
"movq %%mm1, 152(%3) \n\t"
2718
"mov %4, %0 \n\t" //FIXME
2721
: "r" ((x86_reg)step), "m" (c->pQPb), "r"(sums), "g"(src)
2724
src+= step; // src points to begin of the 8x8 Block
2727
"movq %4, %%mm6 \n\t"
2728
"pcmpeqb %%mm5, %%mm5 \n\t"
2729
"pxor %%mm6, %%mm5 \n\t"
2730
"pxor %%mm7, %%mm7 \n\t"
2733
"movq (%1), %%mm0 \n\t"
2734
"movq 8(%1), %%mm1 \n\t"
2735
"paddw 32(%1), %%mm0 \n\t"
2736
"paddw 40(%1), %%mm1 \n\t"
2737
"movq (%0, %3), %%mm2 \n\t"
2738
"movq %%mm2, %%mm3 \n\t"
2739
"movq %%mm2, %%mm4 \n\t"
2740
"punpcklbw %%mm7, %%mm2 \n\t"
2741
"punpckhbw %%mm7, %%mm3 \n\t"
2742
"paddw %%mm2, %%mm0 \n\t"
2743
"paddw %%mm3, %%mm1 \n\t"
2744
"paddw %%mm2, %%mm0 \n\t"
2745
"paddw %%mm3, %%mm1 \n\t"
2746
"psrlw $4, %%mm0 \n\t"
2747
"psrlw $4, %%mm1 \n\t"
2748
"packuswb %%mm1, %%mm0 \n\t"
2749
"pand %%mm6, %%mm0 \n\t"
2750
"pand %%mm5, %%mm4 \n\t"
2751
"por %%mm4, %%mm0 \n\t"
2752
"movq %%mm0, (%0, %3) \n\t"
2757
: "+r"(offset), "+r"(temp_sums)
2758
: "r" ((x86_reg)step), "r"(src - offset), "m"(both_masks)
2761
src+= step; // src points to begin of the 8x8 Block
2763
if(eq_mask != -1LL){
2764
uint8_t *temp_src= src;
2766
"pxor %%mm7, %%mm7 \n\t"
2767
"lea -40(%%"REG_SP"), %%"REG_c" \n\t" // make space for 4 8-byte vars
2768
"and "ALIGN_MASK", %%"REG_c" \n\t" // align
2769
// 0 1 2 3 4 5 6 7 8 9
2770
// %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %1+8%1 ecx+4%1
2772
"movq (%0), %%mm0 \n\t"
2773
"movq %%mm0, %%mm1 \n\t"
2774
"punpcklbw %%mm7, %%mm0 \n\t" // low part of line 0
2775
"punpckhbw %%mm7, %%mm1 \n\t" // high part of line 0
2777
"movq (%0, %1), %%mm2 \n\t"
2778
"lea (%0, %1, 2), %%"REG_a" \n\t"
2779
"movq %%mm2, %%mm3 \n\t"
2780
"punpcklbw %%mm7, %%mm2 \n\t" // low part of line 1
2781
"punpckhbw %%mm7, %%mm3 \n\t" // high part of line 1
2783
"movq (%%"REG_a"), %%mm4 \n\t"
2784
"movq %%mm4, %%mm5 \n\t"
2785
"punpcklbw %%mm7, %%mm4 \n\t" // low part of line 2
2786
"punpckhbw %%mm7, %%mm5 \n\t" // high part of line 2
2788
"paddw %%mm0, %%mm0 \n\t" // 2L0
2789
"paddw %%mm1, %%mm1 \n\t" // 2H0
2790
"psubw %%mm4, %%mm2 \n\t" // L1 - L2
2791
"psubw %%mm5, %%mm3 \n\t" // H1 - H2
2792
"psubw %%mm2, %%mm0 \n\t" // 2L0 - L1 + L2
2793
"psubw %%mm3, %%mm1 \n\t" // 2H0 - H1 + H2
2795
"psllw $2, %%mm2 \n\t" // 4L1 - 4L2
2796
"psllw $2, %%mm3 \n\t" // 4H1 - 4H2
2797
"psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2
2798
"psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2
2800
"movq (%%"REG_a", %1), %%mm2 \n\t"
2801
"movq %%mm2, %%mm3 \n\t"
2802
"punpcklbw %%mm7, %%mm2 \n\t" // L3
2803
"punpckhbw %%mm7, %%mm3 \n\t" // H3
2805
"psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - L3
2806
"psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - H3
2807
"psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
2808
"psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
2809
"movq %%mm0, (%%"REG_c") \n\t" // 2L0 - 5L1 + 5L2 - 2L3
2810
"movq %%mm1, 8(%%"REG_c") \n\t" // 2H0 - 5H1 + 5H2 - 2H3
2812
"movq (%%"REG_a", %1, 2), %%mm0 \n\t"
2813
"movq %%mm0, %%mm1 \n\t"
2814
"punpcklbw %%mm7, %%mm0 \n\t" // L4
2815
"punpckhbw %%mm7, %%mm1 \n\t" // H4
2817
"psubw %%mm0, %%mm2 \n\t" // L3 - L4
2818
"psubw %%mm1, %%mm3 \n\t" // H3 - H4
2819
"movq %%mm2, 16(%%"REG_c") \n\t" // L3 - L4
2820
"movq %%mm3, 24(%%"REG_c") \n\t" // H3 - H4
2821
"paddw %%mm4, %%mm4 \n\t" // 2L2
2822
"paddw %%mm5, %%mm5 \n\t" // 2H2
2823
"psubw %%mm2, %%mm4 \n\t" // 2L2 - L3 + L4
2824
"psubw %%mm3, %%mm5 \n\t" // 2H2 - H3 + H4
2826
"lea (%%"REG_a", %1), %0 \n\t"
2827
"psllw $2, %%mm2 \n\t" // 4L3 - 4L4
2828
"psllw $2, %%mm3 \n\t" // 4H3 - 4H4
2829
"psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4
2830
"psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4
2832
"movq (%0, %1, 2), %%mm2 \n\t"
2833
"movq %%mm2, %%mm3 \n\t"
2834
"punpcklbw %%mm7, %%mm2 \n\t" // L5
2835
"punpckhbw %%mm7, %%mm3 \n\t" // H5
2836
"psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - L5
2837
"psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - H5
2838
"psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - 2L5
2839
"psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - 2H5
2841
"movq (%%"REG_a", %1, 4), %%mm6 \n\t"
2842
"punpcklbw %%mm7, %%mm6 \n\t" // L6
2843
"psubw %%mm6, %%mm2 \n\t" // L5 - L6
2844
"movq (%%"REG_a", %1, 4), %%mm6 \n\t"
2845
"punpckhbw %%mm7, %%mm6 \n\t" // H6
2846
"psubw %%mm6, %%mm3 \n\t" // H5 - H6
2848
"paddw %%mm0, %%mm0 \n\t" // 2L4
2849
"paddw %%mm1, %%mm1 \n\t" // 2H4
2850
"psubw %%mm2, %%mm0 \n\t" // 2L4 - L5 + L6
2851
"psubw %%mm3, %%mm1 \n\t" // 2H4 - H5 + H6
2853
"psllw $2, %%mm2 \n\t" // 4L5 - 4L6
2854
"psllw $2, %%mm3 \n\t" // 4H5 - 4H6
2855
"psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6
2856
"psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6
2858
"movq (%0, %1, 4), %%mm2 \n\t"
2859
"movq %%mm2, %%mm3 \n\t"
2860
"punpcklbw %%mm7, %%mm2 \n\t" // L7
2861
"punpckhbw %%mm7, %%mm3 \n\t" // H7
2863
"paddw %%mm2, %%mm2 \n\t" // 2L7
2864
"paddw %%mm3, %%mm3 \n\t" // 2H7
2865
"psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 - 2L7
2866
"psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7
2868
"movq (%%"REG_c"), %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
2869
"movq 8(%%"REG_c"), %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
2872
"movq %%mm7, %%mm6 \n\t" // 0
2873
"psubw %%mm0, %%mm6 \n\t"
2874
"pmaxsw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
2875
"movq %%mm7, %%mm6 \n\t" // 0
2876
"psubw %%mm1, %%mm6 \n\t"
2877
"pmaxsw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
2878
"movq %%mm7, %%mm6 \n\t" // 0
2879
"psubw %%mm2, %%mm6 \n\t"
2880
"pmaxsw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
2881
"movq %%mm7, %%mm6 \n\t" // 0
2882
"psubw %%mm3, %%mm6 \n\t"
2883
"pmaxsw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
2885
"movq %%mm7, %%mm6 \n\t" // 0
2886
"pcmpgtw %%mm0, %%mm6 \n\t"
2887
"pxor %%mm6, %%mm0 \n\t"
2888
"psubw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
2889
"movq %%mm7, %%mm6 \n\t" // 0
2890
"pcmpgtw %%mm1, %%mm6 \n\t"
2891
"pxor %%mm6, %%mm1 \n\t"
2892
"psubw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
2893
"movq %%mm7, %%mm6 \n\t" // 0
2894
"pcmpgtw %%mm2, %%mm6 \n\t"
2895
"pxor %%mm6, %%mm2 \n\t"
2896
"psubw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
2897
"movq %%mm7, %%mm6 \n\t" // 0
2898
"pcmpgtw %%mm3, %%mm6 \n\t"
2899
"pxor %%mm6, %%mm3 \n\t"
2900
"psubw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
2904
"pminsw %%mm2, %%mm0 \n\t"
2905
"pminsw %%mm3, %%mm1 \n\t"
2907
"movq %%mm0, %%mm6 \n\t"
2908
"psubusw %%mm2, %%mm6 \n\t"
2909
"psubw %%mm6, %%mm0 \n\t"
2910
"movq %%mm1, %%mm6 \n\t"
2911
"psubusw %%mm3, %%mm6 \n\t"
2912
"psubw %%mm6, %%mm1 \n\t"
2915
"movd %2, %%mm2 \n\t" // QP
2916
"punpcklbw %%mm7, %%mm2 \n\t"
2918
"movq %%mm7, %%mm6 \n\t" // 0
2919
"pcmpgtw %%mm4, %%mm6 \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5)
2920
"pxor %%mm6, %%mm4 \n\t"
2921
"psubw %%mm6, %%mm4 \n\t" // |2L2 - 5L3 + 5L4 - 2L5|
2922
"pcmpgtw %%mm5, %%mm7 \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5)
2923
"pxor %%mm7, %%mm5 \n\t"
2924
"psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5|
2926
"psllw $3, %%mm2 \n\t" // 8QP
2927
"movq %%mm2, %%mm3 \n\t" // 8QP
2928
"pcmpgtw %%mm4, %%mm2 \n\t"
2929
"pcmpgtw %%mm5, %%mm3 \n\t"
2930
"pand %%mm2, %%mm4 \n\t"
2931
"pand %%mm3, %%mm5 \n\t"
2934
"psubusw %%mm0, %%mm4 \n\t" // hd
2935
"psubusw %%mm1, %%mm5 \n\t" // ld
2938
"movq "MANGLE(w05)", %%mm2 \n\t" // 5
2939
"pmullw %%mm2, %%mm4 \n\t"
2940
"pmullw %%mm2, %%mm5 \n\t"
2941
"movq "MANGLE(w20)", %%mm2 \n\t" // 32
2942
"paddw %%mm2, %%mm4 \n\t"
2943
"paddw %%mm2, %%mm5 \n\t"
2944
"psrlw $6, %%mm4 \n\t"
2945
"psrlw $6, %%mm5 \n\t"
2947
"movq 16(%%"REG_c"), %%mm0 \n\t" // L3 - L4
2948
"movq 24(%%"REG_c"), %%mm1 \n\t" // H3 - H4
2950
"pxor %%mm2, %%mm2 \n\t"
2951
"pxor %%mm3, %%mm3 \n\t"
2953
"pcmpgtw %%mm0, %%mm2 \n\t" // sign (L3-L4)
2954
"pcmpgtw %%mm1, %%mm3 \n\t" // sign (H3-H4)
2955
"pxor %%mm2, %%mm0 \n\t"
2956
"pxor %%mm3, %%mm1 \n\t"
2957
"psubw %%mm2, %%mm0 \n\t" // |L3-L4|
2958
"psubw %%mm3, %%mm1 \n\t" // |H3-H4|
2959
"psrlw $1, %%mm0 \n\t" // |L3 - L4|/2
2960
"psrlw $1, %%mm1 \n\t" // |H3 - H4|/2
2962
"pxor %%mm6, %%mm2 \n\t"
2963
"pxor %%mm7, %%mm3 \n\t"
2964
"pand %%mm2, %%mm4 \n\t"
2965
"pand %%mm3, %%mm5 \n\t"
2968
"pminsw %%mm0, %%mm4 \n\t"
2969
"pminsw %%mm1, %%mm5 \n\t"
2971
"movq %%mm4, %%mm2 \n\t"
2972
"psubusw %%mm0, %%mm2 \n\t"
2973
"psubw %%mm2, %%mm4 \n\t"
2974
"movq %%mm5, %%mm2 \n\t"
2975
"psubusw %%mm1, %%mm2 \n\t"
2976
"psubw %%mm2, %%mm5 \n\t"
2978
"pxor %%mm6, %%mm4 \n\t"
2979
"pxor %%mm7, %%mm5 \n\t"
2980
"psubw %%mm6, %%mm4 \n\t"
2981
"psubw %%mm7, %%mm5 \n\t"
2982
"packsswb %%mm5, %%mm4 \n\t"
2983
"movq %3, %%mm1 \n\t"
2984
"pandn %%mm4, %%mm1 \n\t"
2985
"movq (%0), %%mm0 \n\t"
2986
"paddb %%mm1, %%mm0 \n\t"
2987
"movq %%mm0, (%0) \n\t"
2988
"movq (%0, %1), %%mm0 \n\t"
2989
"psubb %%mm1, %%mm0 \n\t"
2990
"movq %%mm0, (%0, %1) \n\t"
2993
: "r" ((x86_reg)step), "m" (c->pQPb), "m"(eq_mask)
2994
: "%"REG_a, "%"REG_c
2998
STOP_TIMER("step16")
3006
static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
3007
const QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c);
3010
* Copy a block from src to dst and fix the black level.
3011
* levelFix == 0 -> do not touch the brightness & contrast
3013
#undef REAL_SCALED_CPY
3016
static inline void RENAME(blockCopy)(uint8_t dst[], int dstStride, const uint8_t src[], int srcStride,
3017
int levelFix, int64_t *packedOffsetAndScale)
3025
"movq (%%"REG_a"), %%mm2 \n\t" // packedYOffset
3026
"movq 8(%%"REG_a"), %%mm3 \n\t" // packedYScale
3027
"lea (%2,%4), %%"REG_a" \n\t"
3028
"lea (%3,%5), %%"REG_d" \n\t"
3029
"pxor %%mm4, %%mm4 \n\t"
3031
#define REAL_SCALED_CPY(src1, src2, dst1, dst2) \
3032
"movq " #src1 ", %%mm0 \n\t"\
3033
"movq " #src1 ", %%mm5 \n\t"\
3034
"movq " #src2 ", %%mm1 \n\t"\
3035
"movq " #src2 ", %%mm6 \n\t"\
3036
"punpcklbw %%mm0, %%mm0 \n\t"\
3037
"punpckhbw %%mm5, %%mm5 \n\t"\
3038
"punpcklbw %%mm1, %%mm1 \n\t"\
3039
"punpckhbw %%mm6, %%mm6 \n\t"\
3040
"pmulhuw %%mm3, %%mm0 \n\t"\
3041
"pmulhuw %%mm3, %%mm5 \n\t"\
3042
"pmulhuw %%mm3, %%mm1 \n\t"\
3043
"pmulhuw %%mm3, %%mm6 \n\t"\
3044
"psubw %%mm2, %%mm0 \n\t"\
3045
"psubw %%mm2, %%mm5 \n\t"\
3046
"psubw %%mm2, %%mm1 \n\t"\
3047
"psubw %%mm2, %%mm6 \n\t"\
3048
"packuswb %%mm5, %%mm0 \n\t"\
3049
"packuswb %%mm6, %%mm1 \n\t"\
3050
"movq %%mm0, " #dst1 " \n\t"\
3051
"movq %%mm1, " #dst2 " \n\t"\
3054
#define REAL_SCALED_CPY(src1, src2, dst1, dst2) \
3055
"movq " #src1 ", %%mm0 \n\t"\
3056
"movq " #src1 ", %%mm5 \n\t"\
3057
"punpcklbw %%mm4, %%mm0 \n\t"\
3058
"punpckhbw %%mm4, %%mm5 \n\t"\
3059
"psubw %%mm2, %%mm0 \n\t"\
3060
"psubw %%mm2, %%mm5 \n\t"\
3061
"movq " #src2 ", %%mm1 \n\t"\
3062
"psllw $6, %%mm0 \n\t"\
3063
"psllw $6, %%mm5 \n\t"\
3064
"pmulhw %%mm3, %%mm0 \n\t"\
3065
"movq " #src2 ", %%mm6 \n\t"\
3066
"pmulhw %%mm3, %%mm5 \n\t"\
3067
"punpcklbw %%mm4, %%mm1 \n\t"\
3068
"punpckhbw %%mm4, %%mm6 \n\t"\
3069
"psubw %%mm2, %%mm1 \n\t"\
3070
"psubw %%mm2, %%mm6 \n\t"\
3071
"psllw $6, %%mm1 \n\t"\
3072
"psllw $6, %%mm6 \n\t"\
3073
"pmulhw %%mm3, %%mm1 \n\t"\
3074
"pmulhw %%mm3, %%mm6 \n\t"\
3075
"packuswb %%mm5, %%mm0 \n\t"\
3076
"packuswb %%mm6, %%mm1 \n\t"\
3077
"movq %%mm0, " #dst1 " \n\t"\
3078
"movq %%mm1, " #dst2 " \n\t"\
3081
#define SCALED_CPY(src1, src2, dst1, dst2)\
3082
REAL_SCALED_CPY(src1, src2, dst1, dst2)
3084
SCALED_CPY((%2) , (%2, %4) , (%3) , (%3, %5))
3085
SCALED_CPY((%2, %4, 2), (%%REGa, %4, 2), (%3, %5, 2), (%%REGd, %5, 2))
3086
SCALED_CPY((%2, %4, 4), (%%REGa, %4, 4), (%3, %5, 4), (%%REGd, %5, 4))
3087
"lea (%%"REG_a",%4,4), %%"REG_a" \n\t"
3088
"lea (%%"REG_d",%5,4), %%"REG_d" \n\t"
3089
SCALED_CPY((%%REGa, %4), (%%REGa, %4, 2), (%%REGd, %5), (%%REGd, %5, 2))
3092
: "=&a" (packedOffsetAndScale)
3093
: "0" (packedOffsetAndScale),
3096
"r" ((x86_reg)srcStride),
3097
"r" ((x86_reg)dstStride)
3102
memcpy( &(dst[dstStride*i]),
3103
&(src[srcStride*i]), BLOCK_SIZE);
3108
"lea (%0,%2), %%"REG_a" \n\t"
3109
"lea (%1,%3), %%"REG_d" \n\t"
3111
#define REAL_SIMPLE_CPY(src1, src2, dst1, dst2) \
3112
"movq " #src1 ", %%mm0 \n\t"\
3113
"movq " #src2 ", %%mm1 \n\t"\
3114
"movq %%mm0, " #dst1 " \n\t"\
3115
"movq %%mm1, " #dst2 " \n\t"\
3117
#define SIMPLE_CPY(src1, src2, dst1, dst2)\
3118
REAL_SIMPLE_CPY(src1, src2, dst1, dst2)
3120
SIMPLE_CPY((%0) , (%0, %2) , (%1) , (%1, %3))
3121
SIMPLE_CPY((%0, %2, 2), (%%REGa, %2, 2), (%1, %3, 2), (%%REGd, %3, 2))
3122
SIMPLE_CPY((%0, %2, 4), (%%REGa, %2, 4), (%1, %3, 4), (%%REGd, %3, 4))
3123
"lea (%%"REG_a",%2,4), %%"REG_a" \n\t"
3124
"lea (%%"REG_d",%3,4), %%"REG_d" \n\t"
3125
SIMPLE_CPY((%%REGa, %2), (%%REGa, %2, 2), (%%REGd, %3), (%%REGd, %3, 2))
3129
"r" ((x86_reg)srcStride),
3130
"r" ((x86_reg)dstStride)
3131
: "%"REG_a, "%"REG_d
3135
memcpy( &(dst[dstStride*i]),
3136
&(src[srcStride*i]), BLOCK_SIZE);
3142
* Duplicate the given 8 src pixels ? times upward
3144
static inline void RENAME(duplicate)(uint8_t src[], int stride)
3148
"movq (%0), %%mm0 \n\t"
3150
"movq %%mm0, (%0) \n\t"
3151
"movq %%mm0, (%0, %1) \n\t"
3152
"movq %%mm0, (%0, %1, 2) \n\t"
3154
: "r" ((x86_reg)-stride)
3167
* Filter array of bytes (Y or U or V values)
3169
static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
3170
const QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c2)
3172
DECLARE_ALIGNED(8, PPContext, c)= *c2; //copy to stack for faster access
3174
#ifdef COMPILE_TIME_MODE
3175
const int mode= COMPILE_TIME_MODE;
3177
const int mode= isColor ? c.ppMode.chromMode : c.ppMode.lumMode;
3179
int black=0, white=255; // blackest black and whitest white in the picture
3180
int QPCorrecture= 256*256;
3187
const int qpHShift= isColor ? 4-c.hChromaSubSample : 4;
3188
const int qpVShift= isColor ? 4-c.vChromaSubSample : 4;
3191
uint64_t * const yHistogram= c.yHistogram;
3192
uint8_t * const tempSrc= srcStride > 0 ? c.tempSrc : c.tempSrc - 23*srcStride;
3193
uint8_t * const tempDst= dstStride > 0 ? c.tempDst : c.tempDst - 23*dstStride;
3194
//const int mbWidth= isColor ? (width+7)>>3 : (width+15)>>4;
3197
for(i=0; i<57; i++){
3198
int offset= ((i*c.ppMode.baseDcDiff)>>8) + 1;
3199
int threshold= offset*2 + 1;
3200
c.mmxDcOffset[i]= 0x7F - offset;
3201
c.mmxDcThreshold[i]= 0x7F - threshold;
3202
c.mmxDcOffset[i]*= 0x0101010101010101LL;
3203
c.mmxDcThreshold[i]*= 0x0101010101010101LL;
3207
if(mode & CUBIC_IPOL_DEINT_FILTER) copyAhead=16;
3208
else if( (mode & LINEAR_BLEND_DEINT_FILTER)
3209
|| (mode & FFMPEG_DEINT_FILTER)
3210
|| (mode & LOWPASS5_DEINT_FILTER)) copyAhead=14;
3211
else if( (mode & V_DEBLOCK)
3212
|| (mode & LINEAR_IPOL_DEINT_FILTER)
3213
|| (mode & MEDIAN_DEINT_FILTER)
3214
|| (mode & V_A_DEBLOCK)) copyAhead=13;
3215
else if(mode & V_X1_FILTER) copyAhead=11;
3216
// else if(mode & V_RK1_FILTER) copyAhead=10;
3217
else if(mode & DERING) copyAhead=9;
3225
uint64_t maxClipped;
3230
// first frame is fscked so we ignore it
3231
if(c.frameNum == 1) yHistogram[0]= width*height/64*15/256;
3233
for(i=0; i<256; i++){
3234
sum+= yHistogram[i];
3237
/* We always get a completely black picture first. */
3238
maxClipped= (uint64_t)(sum * c.ppMode.maxClippedThreshold);
3241
for(black=255; black>0; black--){
3242
if(clipped < maxClipped) break;
3243
clipped-= yHistogram[black];
3247
for(white=0; white<256; white++){
3248
if(clipped < maxClipped) break;
3249
clipped-= yHistogram[white];
3252
scale= (double)(c.ppMode.maxAllowedY - c.ppMode.minAllowedY) / (double)(white-black);
3255
c.packedYScale= (uint16_t)(scale*256.0 + 0.5);
3256
c.packedYOffset= (((black*c.packedYScale)>>8) - c.ppMode.minAllowedY) & 0xFFFF;
3258
c.packedYScale= (uint16_t)(scale*1024.0 + 0.5);
3259
c.packedYOffset= (black - c.ppMode.minAllowedY) & 0xFFFF;
3262
c.packedYOffset|= c.packedYOffset<<32;
3263
c.packedYOffset|= c.packedYOffset<<16;
3265
c.packedYScale|= c.packedYScale<<32;
3266
c.packedYScale|= c.packedYScale<<16;
3268
if(mode & LEVEL_FIX) QPCorrecture= (int)(scale*256*256 + 0.5);
3269
else QPCorrecture= 256*256;
3271
c.packedYScale= 0x0100010001000100LL;
3273
QPCorrecture= 256*256;
3276
/* copy & deinterlace first row of blocks */
3279
const uint8_t *srcBlock= &(src[y*srcStride]);
3280
uint8_t *dstBlock= tempDst + dstStride;
3282
// From this point on it is guaranteed that we can read and write 16 lines downward
3283
// finish 1 block before the next otherwise we might have a problem
3284
// with the L1 Cache of the P4 ... or only a few blocks at a time or something
3285
for(x=0; x<width; x+=BLOCK_SIZE){
3289
prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
3290
prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
3291
prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
3292
prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
3296
"mov %4, %%"REG_a" \n\t"
3297
"shr $2, %%"REG_a" \n\t"
3298
"and $6, %%"REG_a" \n\t"
3299
"add %5, %%"REG_a" \n\t"
3300
"mov %%"REG_a", %%"REG_d" \n\t"
3301
"imul %1, %%"REG_a" \n\t"
3302
"imul %3, %%"REG_d" \n\t"
3303
"prefetchnta 32(%%"REG_a", %0) \n\t"
3304
"prefetcht0 32(%%"REG_d", %2) \n\t"
3305
"add %1, %%"REG_a" \n\t"
3306
"add %3, %%"REG_d" \n\t"
3307
"prefetchnta 32(%%"REG_a", %0) \n\t"
3308
"prefetcht0 32(%%"REG_d", %2) \n\t"
3309
:: "r" (srcBlock), "r" ((x86_reg)srcStride), "r" (dstBlock), "r" ((x86_reg)dstStride),
3310
"g" ((x86_reg)x), "g" ((x86_reg)copyAhead)
3311
: "%"REG_a, "%"REG_d
3315
//FIXME check if this is faster on a 3dnow chip or if it is faster without the prefetch or ...
3316
/* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
3317
prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
3318
prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
3319
prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
3323
RENAME(blockCopy)(dstBlock + dstStride*8, dstStride,
3324
srcBlock + srcStride*8, srcStride, mode & LEVEL_FIX, &c.packedYOffset);
3326
RENAME(duplicate)(dstBlock + dstStride*8, dstStride);
3328
if(mode & LINEAR_IPOL_DEINT_FILTER)
3329
RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride);
3330
else if(mode & LINEAR_BLEND_DEINT_FILTER)
3331
RENAME(deInterlaceBlendLinear)(dstBlock, dstStride, c.deintTemp + x);
3332
else if(mode & MEDIAN_DEINT_FILTER)
3333
RENAME(deInterlaceMedian)(dstBlock, dstStride);
3334
else if(mode & CUBIC_IPOL_DEINT_FILTER)
3335
RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride);
3336
else if(mode & FFMPEG_DEINT_FILTER)
3337
RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x);
3338
else if(mode & LOWPASS5_DEINT_FILTER)
3339
RENAME(deInterlaceL5)(dstBlock, dstStride, c.deintTemp + x, c.deintTemp + width + x);
3340
/* else if(mode & CUBIC_BLEND_DEINT_FILTER)
3341
RENAME(deInterlaceBlendCubic)(dstBlock, dstStride);
3346
if(width==FFABS(dstStride))
3347
linecpy(dst, tempDst + 9*dstStride, copyAhead, dstStride);
3350
for(i=0; i<copyAhead; i++){
3351
memcpy(dst + i*dstStride, tempDst + (9+i)*dstStride, width);
3356
for(y=0; y<height; y+=BLOCK_SIZE){
3357
//1% speedup if these are here instead of the inner loop
3358
const uint8_t *srcBlock= &(src[y*srcStride]);
3359
uint8_t *dstBlock= &(dst[y*dstStride]);
3361
uint8_t *tempBlock1= c.tempBlocks;
3362
uint8_t *tempBlock2= c.tempBlocks + 8;
3364
const int8_t *QPptr= &QPs[(y>>qpVShift)*QPStride];
3365
int8_t *nonBQPptr= &c.nonBQPTable[(y>>qpVShift)*FFABS(QPStride)];
3367
/* can we mess with an 8x16 block from srcBlock/dstBlock downwards and 1 line upwards
3368
if not then use a temporary buffer */
3371
/* copy from line (copyAhead) to (copyAhead+7) of src, these will be copied with
3372
blockcopy to dst later */
3373
linecpy(tempSrc + srcStride*copyAhead, srcBlock + srcStride*copyAhead,
3374
FFMAX(height-y-copyAhead, 0), srcStride);
3376
/* duplicate last line of src to fill the void up to line (copyAhead+7) */
3377
for(i=FFMAX(height-y, 8); i<copyAhead+8; i++)
3378
memcpy(tempSrc + srcStride*i, src + srcStride*(height-1), FFABS(srcStride));
3380
/* copy up to (copyAhead+1) lines of dst (line -1 to (copyAhead-1))*/
3381
linecpy(tempDst, dstBlock - dstStride, FFMIN(height-y+1, copyAhead+1), dstStride);
3383
/* duplicate last line of dst to fill the void up to line (copyAhead) */
3384
for(i=height-y+1; i<=copyAhead; i++)
3385
memcpy(tempDst + dstStride*i, dst + dstStride*(height-1), FFABS(dstStride));
3387
dstBlock= tempDst + dstStride;
3391
// From this point on it is guaranteed that we can read and write 16 lines downward
3392
// finish 1 block before the next otherwise we might have a problem
3393
// with the L1 Cache of the P4 ... or only a few blocks at a time or something
3394
for(x=0; x<width; x+=BLOCK_SIZE){
3395
const int stride= dstStride;
3400
QP= QPptr[x>>qpHShift];
3401
c.nonBQP= nonBQPptr[x>>qpHShift];
3404
QP= (QP* QPCorrecture + 256*128)>>16;
3405
c.nonBQP= nonBQPptr[x>>4];
3406
c.nonBQP= (c.nonBQP* QPCorrecture + 256*128)>>16;
3407
yHistogram[ srcBlock[srcStride*12 + 4] ]++;
3412
"movd %1, %%mm7 \n\t"
3413
"packuswb %%mm7, %%mm7 \n\t" // 0, 0, 0, QP, 0, 0, 0, QP
3414
"packuswb %%mm7, %%mm7 \n\t" // 0,QP, 0, QP, 0,QP, 0, QP
3415
"packuswb %%mm7, %%mm7 \n\t" // QP,..., QP
3416
"movq %%mm7, %0 \n\t"
3425
prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
3426
prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
3427
prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
3428
prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
3432
"mov %4, %%"REG_a" \n\t"
3433
"shr $2, %%"REG_a" \n\t"
3434
"and $6, %%"REG_a" \n\t"
3435
"add %5, %%"REG_a" \n\t"
3436
"mov %%"REG_a", %%"REG_d" \n\t"
3437
"imul %1, %%"REG_a" \n\t"
3438
"imul %3, %%"REG_d" \n\t"
3439
"prefetchnta 32(%%"REG_a", %0) \n\t"
3440
"prefetcht0 32(%%"REG_d", %2) \n\t"
3441
"add %1, %%"REG_a" \n\t"
3442
"add %3, %%"REG_d" \n\t"
3443
"prefetchnta 32(%%"REG_a", %0) \n\t"
3444
"prefetcht0 32(%%"REG_d", %2) \n\t"
3445
:: "r" (srcBlock), "r" ((x86_reg)srcStride), "r" (dstBlock), "r" ((x86_reg)dstStride),
3446
"g" ((x86_reg)x), "g" ((x86_reg)copyAhead)
3447
: "%"REG_a, "%"REG_d
3451
//FIXME check if this is faster on a 3dnow chip or if it is faster without the prefetch or ...
3452
/* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
3453
prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
3454
prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
3455
prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
3459
RENAME(blockCopy)(dstBlock + dstStride*copyAhead, dstStride,
3460
srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX, &c.packedYOffset);
3462
if(mode & LINEAR_IPOL_DEINT_FILTER)
3463
RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride);
3464
else if(mode & LINEAR_BLEND_DEINT_FILTER)
3465
RENAME(deInterlaceBlendLinear)(dstBlock, dstStride, c.deintTemp + x);
3466
else if(mode & MEDIAN_DEINT_FILTER)
3467
RENAME(deInterlaceMedian)(dstBlock, dstStride);
3468
else if(mode & CUBIC_IPOL_DEINT_FILTER)
3469
RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride);
3470
else if(mode & FFMPEG_DEINT_FILTER)
3471
RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x);
3472
else if(mode & LOWPASS5_DEINT_FILTER)
3473
RENAME(deInterlaceL5)(dstBlock, dstStride, c.deintTemp + x, c.deintTemp + width + x);
3474
/* else if(mode & CUBIC_BLEND_DEINT_FILTER)
3475
RENAME(deInterlaceBlendCubic)(dstBlock, dstStride);
3478
/* only deblock if we have 2 blocks */
3480
if(mode & V_X1_FILTER)
3481
RENAME(vertX1Filter)(dstBlock, stride, &c);
3482
else if(mode & V_DEBLOCK){
3483
const int t= RENAME(vertClassify)(dstBlock, stride, &c);
3486
RENAME(doVertLowPass)(dstBlock, stride, &c);
3488
RENAME(doVertDefFilter)(dstBlock, stride, &c);
3489
}else if(mode & V_A_DEBLOCK){
3490
RENAME(do_a_deblock)(dstBlock, stride, 1, &c);
3495
RENAME(transpose1)(tempBlock1, tempBlock2, dstBlock, dstStride);
3497
/* check if we have a previous block to deblock it with dstBlock */
3500
if(mode & H_X1_FILTER)
3501
RENAME(vertX1Filter)(tempBlock1, 16, &c);
3502
else if(mode & H_DEBLOCK){
3504
const int t= RENAME(vertClassify)(tempBlock1, 16, &c);
3505
//STOP_TIMER("dc & minmax")
3507
RENAME(doVertLowPass)(tempBlock1, 16, &c);
3509
RENAME(doVertDefFilter)(tempBlock1, 16, &c);
3510
}else if(mode & H_A_DEBLOCK){
3511
RENAME(do_a_deblock)(tempBlock1, 16, 1, &c);
3514
RENAME(transpose2)(dstBlock-4, dstStride, tempBlock1 + 4*16);
3517
if(mode & H_X1_FILTER)
3518
horizX1Filter(dstBlock-4, stride, QP);
3519
else if(mode & H_DEBLOCK){
3521
DECLARE_ALIGNED(16, unsigned char, tempBlock)[272];
3523
transpose_16x8_char_toPackedAlign_altivec(tempBlock, dstBlock - (4 + 1), stride);
3525
t = vertClassify_altivec(tempBlock-48, 16, &c);
3527
doVertLowPass_altivec(tempBlock-48, 16, &c);
3528
transpose_8x16_char_fromPackedAlign_altivec(dstBlock - (4 + 1), tempBlock, stride);
3531
doVertDefFilter_altivec(tempBlock-48, 16, &c);
3532
transpose_8x16_char_fromPackedAlign_altivec(dstBlock - (4 + 1), tempBlock, stride);
3535
const int t= RENAME(horizClassify)(dstBlock-4, stride, &c);
3538
RENAME(doHorizLowPass)(dstBlock-4, stride, &c);
3540
RENAME(doHorizDefFilter)(dstBlock-4, stride, &c);
3542
}else if(mode & H_A_DEBLOCK){
3543
RENAME(do_a_deblock)(dstBlock-8, 1, stride, &c);
3547
//FIXME filter first line
3548
if(y>0) RENAME(dering)(dstBlock - stride - 8, stride, &c);
3551
if(mode & TEMP_NOISE_FILTER)
3553
RENAME(tempNoiseReducer)(dstBlock-8, stride,
3554
c.tempBlurred[isColor] + y*dstStride + x,
3555
c.tempBlurredPast[isColor] + (y>>3)*256 + (x>>3),
3556
c.ppMode.maxTmpNoise);
3564
tmpXchg= tempBlock1;
3565
tempBlock1= tempBlock2;
3566
tempBlock2 = tmpXchg;
3571
if(y > 0) RENAME(dering)(dstBlock - dstStride - 8, dstStride, &c);
3574
if((mode & TEMP_NOISE_FILTER)){
3575
RENAME(tempNoiseReducer)(dstBlock-8, dstStride,
3576
c.tempBlurred[isColor] + y*dstStride + x,
3577
c.tempBlurredPast[isColor] + (y>>3)*256 + (x>>3),
3578
c.ppMode.maxTmpNoise);
3581
/* did we use a tmp buffer for the last lines*/
3583
uint8_t *dstBlock= &(dst[y*dstStride]);
3584
if(width==FFABS(dstStride))
3585
linecpy(dstBlock, tempDst + dstStride, height-y, dstStride);
3588
for(i=0; i<height-y; i++){
3589
memcpy(dstBlock + i*dstStride, tempDst + (i+1)*dstStride, width);
3594
for(x=0; x<width; x+=32){
3596
i+= + dstBlock[x + 7*dstStride] + dstBlock[x + 8*dstStride]
3597
+ dstBlock[x + 9*dstStride] + dstBlock[x +10*dstStride]
3598
+ dstBlock[x +11*dstStride] + dstBlock[x +12*dstStride];
3599
+ dstBlock[x +13*dstStride]
3600
+ dstBlock[x +14*dstStride] + dstBlock[x +15*dstStride];
3604
__asm__ volatile("femms");
3606
__asm__ volatile("emms");
3609
#ifdef DEBUG_BRIGHTNESS
3613
for(i=0; i<256; i++)
3614
if(yHistogram[i] > max) max=yHistogram[i];
3616
for(i=1; i<256; i++){
3618
int start=yHistogram[i-1]/(max/256+1);
3619
int end=yHistogram[i]/(max/256+1);
3620
int inc= end > start ? 1 : -1;
3621
for(x=start; x!=end+inc; x+=inc)
3622
dst[ i*dstStride + x]+=128;
3625
for(i=0; i<100; i+=2){
3626
dst[ (white)*dstStride + i]+=128;
3627
dst[ (black)*dstStride + i]+=128;
3632
*c2= c; //copy local context back