2
Copyright (C) 2001-2003 Michael Niedermayer (michaelni@gmx.at)
4
AltiVec optimizations (C) 2004 Romain Dolbeau <romain@dolbeau.org>
6
This program is free software; you can redistribute it and/or modify
7
it under the terms of the GNU General Public License as published by
8
the Free Software Foundation; either version 2 of the License, or
9
(at your option) any later version.
11
This program is distributed in the hope that it will be useful,
12
but WITHOUT ANY WARRANTY; without even the implied warranty of
13
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
GNU General Public License for more details.
16
You should have received a copy of the GNU General Public License
17
along with this program; if not, write to the Free Software
18
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
27
C MMX MMX2 3DNow AltiVec
29
isVertMinMaxOk Ec Ec Ec
30
doVertLowPass E e e Ec
31
doVertDefFilter Ec Ec e e Ec
33
isHorizMinMaxOk a E Ec
34
doHorizLowPass E e e Ec
35
doHorizDefFilter Ec Ec e e Ec
36
do_a_deblock Ec E Ec E
38
Vertical RKAlgo1 E a a
39
Horizontal RKAlgo1 a a
42
LinIpolDeinterlace e E E*
43
CubicIpolDeinterlace a e e*
44
LinBlendDeinterlace e E E*
45
MedianDeinterlace# E Ec Ec
46
TempDeNoiser# E e e Ec
48
* I don't have a 3DNow CPU -> it's untested, but no one said it doesn't work, so it seems to work
49
# more or less self-invented filters, so the exactness isn't too meaningful
50
E = Exact implementation
51
e = almost exact implementation (slightly different rounding, ...)
52
a = alternative / approximate impl
53
c = checked against the other implementations (-vo md5)
54
p = partially optimized, still some work to do
59
reduce the time wasted on the mem transfer
60
unroll stuff if instructions depend too much on the prior one
61
move YScale thing to the end instead of fixing QP
62
write a faster and higher quality deblocking filter :)
63
make the mainloop more flexible (variable number of blocks at once
64
(the if/else stuff per block is slowing things down)
65
compare the quality & speed of all filters
68
try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks
72
//Changelog: use the CVS log
86
//#define DEBUG_BRIGHTNESS
88
#include "fastmemcpy.h"
90
#include "postprocess.h"
91
#include "postprocess_internal.h"
93
#include "mangle.h" //FIXME should be supressed
100
// Maps memalign(a,b) onto plain malloc(b): the requested alignment `a` is
// discarded, so callers only get malloc's default alignment.
#define memalign(a,b) malloc(b)
103
// Smaller of the two arguments (returns `a` when they compare equal).
// Each argument may be evaluated twice, so do not pass expressions with side effects.
#define MIN(a,b) ((b) < (a) ? (b) : (a))
104
// Larger of the two arguments (returns `a` when they compare equal).
// Each argument may be evaluated twice, so do not pass expressions with side effects.
#define MAX(a,b) ((b) > (a) ? (b) : (a))
105
// Absolute value: positive inputs pass through, everything else is negated.
// The argument may be evaluated twice, so do not pass expressions with side effects.
#define ABS(a) (0 < (a) ? (a) : -(a))
106
// Two-valued sign: 1 for strictly positive input, -1 otherwise.
// Note that SIGN(0) is -1, not 0.
#define SIGN(a) (((a) > 0) * 2 - 1)
108
// Size in bytes of the scratch buffer `temp` in pp_get_mode_by_name_and_quality():
// the mode string is copied into it and replaceTable aliases are expanded in place,
// so this also bounds the length of the expanded string.
#define GET_MODE_BUFFER_SIZE 500
109
// Capacity of the per-filter options[] array in pp_get_mode_by_name_and_quality().
// Option collection stops at OPTIONS_ARRAY_SIZE-1 entries so the terminating NULL
// slot always fits.
#define OPTIONS_ARRAY_SIZE 10
111
#define TEMP_STRIDE 8
112
//#define NUM_BLOCKS_AT_ONCE 16 //not used yet
114
#if defined(__GNUC__) && (__GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ > 0)
115
# define attribute_used __attribute__((used))
116
# define always_inline __attribute__((always_inline)) inline
118
# define attribute_used
119
# define always_inline inline
122
#if defined(ARCH_X86) || defined(ARCH_X86_64)
123
static uint64_t __attribute__((aligned(8))) attribute_used w05= 0x0005000500050005LL;
124
static uint64_t __attribute__((aligned(8))) attribute_used w04= 0x0004000400040004LL;
125
static uint64_t __attribute__((aligned(8))) attribute_used w20= 0x0020002000200020LL;
126
static uint64_t __attribute__((aligned(8))) attribute_used b00= 0x0000000000000000LL;
127
static uint64_t __attribute__((aligned(8))) attribute_used b01= 0x0101010101010101LL;
128
static uint64_t __attribute__((aligned(8))) attribute_used b02= 0x0202020202020202LL;
129
static uint64_t __attribute__((aligned(8))) attribute_used b08= 0x0808080808080808LL;
130
static uint64_t __attribute__((aligned(8))) attribute_used b80= 0x8080808080808080LL;
133
// Backing storage for clip_tab: three consecutive 256-entry ranges.
// global_init() zero-fills the first and the last range and runs a loop over
// indices 256..511 for the middle one (loop body not visible in this view).
static uint8_t clip_table[3*256];
134
// Pointer 256 entries into clip_table, so indices -256..511 stay inside the
// backing array.
static uint8_t * const clip_tab= clip_table + 256;
136
// Compile-time debug switch: the printf diagnostics guarded by `verbose>1`
// are disabled while this is 0.
static const int verbose= 0;
138
static const int attribute_used deringThreshold= 20;
141
static struct PPFilter filters[]=
143
{"hb", "hdeblock", 1, 1, 3, H_DEBLOCK},
144
{"vb", "vdeblock", 1, 2, 4, V_DEBLOCK},
145
/* {"hr", "rkhdeblock", 1, 1, 3, H_RK1_FILTER},
146
{"vr", "rkvdeblock", 1, 2, 4, V_RK1_FILTER},*/
147
{"h1", "x1hdeblock", 1, 1, 3, H_X1_FILTER},
148
{"v1", "x1vdeblock", 1, 2, 4, V_X1_FILTER},
149
{"ha", "ahdeblock", 1, 1, 3, H_A_DEBLOCK},
150
{"va", "avdeblock", 1, 2, 4, V_A_DEBLOCK},
151
{"dr", "dering", 1, 5, 6, DERING},
152
{"al", "autolevels", 0, 1, 2, LEVEL_FIX},
153
{"lb", "linblenddeint", 1, 1, 4, LINEAR_BLEND_DEINT_FILTER},
154
{"li", "linipoldeint", 1, 1, 4, LINEAR_IPOL_DEINT_FILTER},
155
{"ci", "cubicipoldeint", 1, 1, 4, CUBIC_IPOL_DEINT_FILTER},
156
{"md", "mediandeint", 1, 1, 4, MEDIAN_DEINT_FILTER},
157
{"fd", "ffmpegdeint", 1, 1, 4, FFMPEG_DEINT_FILTER},
158
{"l5", "lowpass5", 1, 1, 4, LOWPASS5_DEINT_FILTER},
159
{"tn", "tmpnoise", 1, 7, 8, TEMP_NOISE_FILTER},
160
{"fq", "forcequant", 1, 0, 0, FORCE_QUANT},
161
{NULL, NULL,0,0,0,0} //End Marker
164
static char *replaceTable[]=
166
"default", "hdeblock:a,vdeblock:a,dering:a",
167
"de", "hdeblock:a,vdeblock:a,dering:a",
168
"fast", "x1hdeblock:a,x1vdeblock:a,dering:a",
169
"fa", "x1hdeblock:a,x1vdeblock:a,dering:a",
170
"ac", "ha:a:128:7,va:a,dering:a",
175
#if defined(ARCH_X86) || defined(ARCH_X86_64)
176
static inline void prefetchnta(void *p)
178
asm volatile( "prefetchnta (%0)\n\t"
183
static inline void prefetcht0(void *p)
185
asm volatile( "prefetcht0 (%0)\n\t"
190
static inline void prefetcht1(void *p)
192
asm volatile( "prefetcht1 (%0)\n\t"
197
static inline void prefetcht2(void *p)
199
asm volatile( "prefetcht2 (%0)\n\t"
205
// The horizontal functions exist only in C because the MMX code is faster with vertical filters and transposing
208
* Check if the given 8x8 Block is mostly "flat"
210
static inline int isHorizDC_C(uint8_t src[], int stride, PPContext *c)
214
const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
215
const int dcThreshold= dcOffset*2 + 1;
217
for(y=0; y<BLOCK_SIZE; y++)
219
if(((unsigned)(src[0] - src[1] + dcOffset)) < dcThreshold) numEq++;
220
if(((unsigned)(src[1] - src[2] + dcOffset)) < dcThreshold) numEq++;
221
if(((unsigned)(src[2] - src[3] + dcOffset)) < dcThreshold) numEq++;
222
if(((unsigned)(src[3] - src[4] + dcOffset)) < dcThreshold) numEq++;
223
if(((unsigned)(src[4] - src[5] + dcOffset)) < dcThreshold) numEq++;
224
if(((unsigned)(src[5] - src[6] + dcOffset)) < dcThreshold) numEq++;
225
if(((unsigned)(src[6] - src[7] + dcOffset)) < dcThreshold) numEq++;
228
return numEq > c->ppMode.flatnessThreshold;
232
* Check if the middle 8x8 Block in the given 8x16 block is flat
234
static inline int isVertDC_C(uint8_t src[], int stride, PPContext *c){
237
const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
238
const int dcThreshold= dcOffset*2 + 1;
240
src+= stride*4; // src points to begin of the 8x8 Block
241
for(y=0; y<BLOCK_SIZE-1; y++)
243
if(((unsigned)(src[0] - src[0+stride] + dcOffset)) < dcThreshold) numEq++;
244
if(((unsigned)(src[1] - src[1+stride] + dcOffset)) < dcThreshold) numEq++;
245
if(((unsigned)(src[2] - src[2+stride] + dcOffset)) < dcThreshold) numEq++;
246
if(((unsigned)(src[3] - src[3+stride] + dcOffset)) < dcThreshold) numEq++;
247
if(((unsigned)(src[4] - src[4+stride] + dcOffset)) < dcThreshold) numEq++;
248
if(((unsigned)(src[5] - src[5+stride] + dcOffset)) < dcThreshold) numEq++;
249
if(((unsigned)(src[6] - src[6+stride] + dcOffset)) < dcThreshold) numEq++;
250
if(((unsigned)(src[7] - src[7+stride] + dcOffset)) < dcThreshold) numEq++;
253
return numEq > c->ppMode.flatnessThreshold;
256
static inline int isHorizMinMaxOk_C(uint8_t src[], int stride, int QP)
261
if((unsigned)(src[0] - src[5] + 2*QP) > 4*QP) return 0;
263
if((unsigned)(src[2] - src[7] + 2*QP) > 4*QP) return 0;
265
if((unsigned)(src[4] - src[1] + 2*QP) > 4*QP) return 0;
267
if((unsigned)(src[6] - src[3] + 2*QP) > 4*QP) return 0;
272
if((unsigned)(src[0] - src[7] + 2*QP) > 4*QP) return 0;
279
static inline int isVertMinMaxOk_C(uint8_t src[], int stride, int QP)
285
for(x=0; x<BLOCK_SIZE; x+=4)
287
if((unsigned)(src[ x + 0*stride] - src[ x + 5*stride] + 2*QP) > 4*QP) return 0;
288
if((unsigned)(src[1+x + 2*stride] - src[1+x + 7*stride] + 2*QP) > 4*QP) return 0;
289
if((unsigned)(src[2+x + 4*stride] - src[2+x + 1*stride] + 2*QP) > 4*QP) return 0;
290
if((unsigned)(src[3+x + 6*stride] - src[3+x + 3*stride] + 2*QP) > 4*QP) return 0;
295
for(x=0; x<BLOCK_SIZE; x++)
297
if((unsigned)(src[x + stride] - src[x + (stride<<3)] + 2*QP) > 4*QP) return 0;
304
for(x=0; x<BLOCK_SIZE; x++)
310
int v= src[x + y*stride];
314
if(max-min > 2*QP) return 0;
320
static inline int horizClassify_C(uint8_t src[], int stride, PPContext *c){
321
if( isHorizDC_C(src, stride, c) ){
322
if( isHorizMinMaxOk_C(src, stride, c->QP) )
331
static inline int vertClassify_C(uint8_t src[], int stride, PPContext *c){
332
if( isVertDC_C(src, stride, c) ){
333
if( isVertMinMaxOk_C(src, stride, c->QP) )
342
static inline void doHorizDefFilter_C(uint8_t dst[], int stride, PPContext *c)
345
for(y=0; y<BLOCK_SIZE; y++)
347
const int middleEnergy= 5*(dst[4] - dst[3]) + 2*(dst[2] - dst[5]);
349
if(ABS(middleEnergy) < 8*c->QP)
351
const int q=(dst[3] - dst[4])/2;
352
const int leftEnergy= 5*(dst[2] - dst[1]) + 2*(dst[0] - dst[3]);
353
const int rightEnergy= 5*(dst[6] - dst[5]) + 2*(dst[4] - dst[7]);
355
int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
359
d*= SIGN(-middleEnergy);
380
* Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block)
381
* using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version)
383
static inline void doHorizLowPass_C(uint8_t dst[], int stride, PPContext *c)
386
for(y=0; y<BLOCK_SIZE; y++)
388
const int first= ABS(dst[-1] - dst[0]) < c->QP ? dst[-1] : dst[0];
389
const int last= ABS(dst[8] - dst[7]) < c->QP ? dst[8] : dst[7];
392
sums[0] = 4*first + dst[0] + dst[1] + dst[2] + 4;
393
sums[1] = sums[0] - first + dst[3];
394
sums[2] = sums[1] - first + dst[4];
395
sums[3] = sums[2] - first + dst[5];
396
sums[4] = sums[3] - first + dst[6];
397
sums[5] = sums[4] - dst[0] + dst[7];
398
sums[6] = sums[5] - dst[1] + last;
399
sums[7] = sums[6] - dst[2] + last;
400
sums[8] = sums[7] - dst[3] + last;
401
sums[9] = sums[8] - dst[4] + last;
403
dst[0]= (sums[0] + sums[2] + 2*dst[0])>>4;
404
dst[1]= (sums[1] + sums[3] + 2*dst[1])>>4;
405
dst[2]= (sums[2] + sums[4] + 2*dst[2])>>4;
406
dst[3]= (sums[3] + sums[5] + 2*dst[3])>>4;
407
dst[4]= (sums[4] + sums[6] + 2*dst[4])>>4;
408
dst[5]= (sums[5] + sums[7] + 2*dst[5])>>4;
409
dst[6]= (sums[6] + sums[8] + 2*dst[6])>>4;
410
dst[7]= (sums[7] + sums[9] + 2*dst[7])>>4;
417
* Experimental Filter 1 (Horizontal)
418
* will not damage linear gradients
419
* Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
420
* can only smooth blocks at the expected locations (it can't smooth them if they moved)
421
* MMX2 version does correct clipping, the C version doesn't
422
* not identical with the vertical one
424
static inline void horizX1Filter(uint8_t *src, int stride, int QP)
427
static uint64_t *lut= NULL;
431
lut= (uint64_t*)memalign(8, 256*8);
434
int v= i < 128 ? 2*i : 2*(i-256);
436
//Simulate 112242211 9-Tap filter
437
uint64_t a= (v/16) & 0xFF;
438
uint64_t b= (v/8) & 0xFF;
439
uint64_t c= (v/4) & 0xFF;
440
uint64_t d= (3*v/8) & 0xFF;
442
//Simulate piecewise linear interpolation
443
uint64_t a= (v/16) & 0xFF;
444
uint64_t b= (v*3/16) & 0xFF;
445
uint64_t c= (v*5/16) & 0xFF;
446
uint64_t d= (7*v/16) & 0xFF;
447
uint64_t A= (0x100 - a)&0xFF;
448
uint64_t B= (0x100 - b)&0xFF;
449
uint64_t C= (0x100 - c)&0xFF;
450
uint64_t D= (0x100 - c)&0xFF;
452
lut[i] = (a<<56) | (b<<48) | (c<<40) | (d<<32) |
453
(D<<24) | (C<<16) | (B<<8) | (A);
454
//lut[i] = (v<<32) | (v<<24);
458
for(y=0; y<BLOCK_SIZE; y++)
460
int a= src[1] - src[2];
461
int b= src[3] - src[4];
462
int c= src[5] - src[6];
464
int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0);
468
int v = d * SIGN(-b);
483
* accurate deblock filter
485
static always_inline void do_a_deblock_C(uint8_t *src, int step, int stride, PPContext *c){
488
const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
489
const int dcThreshold= dcOffset*2 + 1;
491
src+= step*4; // src points to begin of the 8x8 Block
495
if(((unsigned)(src[-1*step] - src[0*step] + dcOffset)) < dcThreshold) numEq++;
496
if(((unsigned)(src[ 0*step] - src[1*step] + dcOffset)) < dcThreshold) numEq++;
497
if(((unsigned)(src[ 1*step] - src[2*step] + dcOffset)) < dcThreshold) numEq++;
498
if(((unsigned)(src[ 2*step] - src[3*step] + dcOffset)) < dcThreshold) numEq++;
499
if(((unsigned)(src[ 3*step] - src[4*step] + dcOffset)) < dcThreshold) numEq++;
500
if(((unsigned)(src[ 4*step] - src[5*step] + dcOffset)) < dcThreshold) numEq++;
501
if(((unsigned)(src[ 5*step] - src[6*step] + dcOffset)) < dcThreshold) numEq++;
502
if(((unsigned)(src[ 6*step] - src[7*step] + dcOffset)) < dcThreshold) numEq++;
503
if(((unsigned)(src[ 7*step] - src[8*step] + dcOffset)) < dcThreshold) numEq++;
504
if(numEq > c->ppMode.flatnessThreshold){
507
if(src[0] > src[step]){
515
if(src[x*step] > src[(x+1)*step]){
516
if(src[x *step] > max) max= src[ x *step];
517
if(src[(x+1)*step] < min) min= src[(x+1)*step];
519
if(src[(x+1)*step] > max) max= src[(x+1)*step];
520
if(src[ x *step] < min) min= src[ x *step];
524
const int first= ABS(src[-1*step] - src[0]) < QP ? src[-1*step] : src[0];
525
const int last= ABS(src[8*step] - src[7*step]) < QP ? src[8*step] : src[7*step];
528
sums[0] = 4*first + src[0*step] + src[1*step] + src[2*step] + 4;
529
sums[1] = sums[0] - first + src[3*step];
530
sums[2] = sums[1] - first + src[4*step];
531
sums[3] = sums[2] - first + src[5*step];
532
sums[4] = sums[3] - first + src[6*step];
533
sums[5] = sums[4] - src[0*step] + src[7*step];
534
sums[6] = sums[5] - src[1*step] + last;
535
sums[7] = sums[6] - src[2*step] + last;
536
sums[8] = sums[7] - src[3*step] + last;
537
sums[9] = sums[8] - src[4*step] + last;
539
src[0*step]= (sums[0] + sums[2] + 2*src[0*step])>>4;
540
src[1*step]= (sums[1] + sums[3] + 2*src[1*step])>>4;
541
src[2*step]= (sums[2] + sums[4] + 2*src[2*step])>>4;
542
src[3*step]= (sums[3] + sums[5] + 2*src[3*step])>>4;
543
src[4*step]= (sums[4] + sums[6] + 2*src[4*step])>>4;
544
src[5*step]= (sums[5] + sums[7] + 2*src[5*step])>>4;
545
src[6*step]= (sums[6] + sums[8] + 2*src[6*step])>>4;
546
src[7*step]= (sums[7] + sums[9] + 2*src[7*step])>>4;
549
const int middleEnergy= 5*(src[4*step] - src[3*step]) + 2*(src[2*step] - src[5*step]);
551
if(ABS(middleEnergy) < 8*QP)
553
const int q=(src[3*step] - src[4*step])/2;
554
const int leftEnergy= 5*(src[2*step] - src[1*step]) + 2*(src[0*step] - src[3*step]);
555
const int rightEnergy= 5*(src[6*step] - src[5*step]) + 2*(src[4*step] - src[7*step]);
557
int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
561
d*= SIGN(-middleEnergy);
588
//Note: we have C, MMX, MMX2 and 3DNow versions; there is no 3DNow+MMX2 one
590
#if !defined (HAVE_MMX) || defined (RUNTIME_CPUDETECT)
596
#define COMPILE_ALTIVEC
597
#endif //HAVE_ALTIVEC
598
#endif //ARCH_POWERPC
600
#if defined(ARCH_X86) || defined(ARCH_X86_64)
602
#if (defined (HAVE_MMX) && !defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
606
#if defined (HAVE_MMX2) || defined (RUNTIME_CPUDETECT)
610
#if (defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
611
#define COMPILE_3DNOW
624
#define RENAME(a) a ## _C
625
#include "postprocess_template.c"
629
#ifdef COMPILE_ALTIVEC
632
#define RENAME(a) a ## _altivec
633
#include "postprocess_altivec_template.c"
634
#include "postprocess_template.c"
636
#endif //ARCH_POWERPC
644
#define RENAME(a) a ## _MMX
645
#include "postprocess_template.c"
654
#define RENAME(a) a ## _MMX2
655
#include "postprocess_template.c"
664
#define RENAME(a) a ## _3DNow
665
#include "postprocess_template.c"
668
// minor note: HAVE_xyz is messed up after this line, so don't use it
670
static inline void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
671
QP_STORE_T QPs[], int QPStride, int isColor, pp_mode_t *vm, pp_context_t *vc)
673
PPContext *c= (PPContext *)vc;
674
PPMode *ppMode= (PPMode *)vm;
675
c->ppMode= *ppMode; //FIXME
677
// using ifs here as they are faster than function pointers, although the
678
// difference wouldn't be measurable here, but it's much better because
679
// someone might exchange the CPU without restarting mplayer ;)
680
#ifdef RUNTIME_CPUDETECT
681
#if defined(ARCH_X86) || defined(ARCH_X86_64)
682
// ordered by speed, fastest first
683
if(c->cpuCaps & PP_CPU_CAPS_MMX2)
684
postProcess_MMX2(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
685
else if(c->cpuCaps & PP_CPU_CAPS_3DNOW)
686
postProcess_3DNow(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
687
else if(c->cpuCaps & PP_CPU_CAPS_MMX)
688
postProcess_MMX(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
690
postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
694
if(c->cpuCaps & PP_CPU_CAPS_ALTIVEC)
695
postProcess_altivec(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
699
postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
701
#else //RUNTIME_CPUDETECT
703
postProcess_MMX2(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
704
#elif defined (HAVE_3DNOW)
705
postProcess_3DNow(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
706
#elif defined (HAVE_MMX)
707
postProcess_MMX(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
708
#elif defined (HAVE_ALTIVEC)
709
postProcess_altivec(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
711
postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
713
#endif //!RUNTIME_CPUDETECT
716
//static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
717
// QP_STORE_T QPs[], int QPStride, int isColor, struct PPMode *ppMode);
719
/* -pp Command line Help
722
"Available postprocessing filters:\n"
724
"short long name short long option Description\n"
725
"* * a autoq CPU power dependent enabler\n"
726
" c chrom chrominance filtering enabled\n"
727
" y nochrom chrominance filtering disabled\n"
728
" n noluma luma filtering disabled\n"
729
"hb hdeblock (2 threshold) horizontal deblocking filter\n"
730
" 1. difference factor: default=32, higher -> more deblocking\n"
731
" 2. flatness threshold: default=39, lower -> more deblocking\n"
732
" the h & v deblocking filters share these\n"
733
" so you can't set different thresholds for h / v\n"
734
"vb vdeblock (2 threshold) vertical deblocking filter\n"
735
// long name must match the filters[] entry "ahdeblock" that the parser compares against
"ha ahdeblock (2 threshold) horizontal deblocking filter\n"
736
// long name must match the filters[] entry "avdeblock" that the parser compares against
"va avdeblock (2 threshold) vertical deblocking filter\n"
737
"h1 x1hdeblock experimental h deblock filter 1\n"
738
"v1 x1vdeblock experimental v deblock filter 1\n"
739
"dr dering deringing filter\n"
740
"al autolevels automatic brightness / contrast\n"
741
" f fullyrange stretch luminance to (0..255)\n"
742
"lb linblenddeint linear blend deinterlacer\n"
743
"li linipoldeint linear interpolating deinterlace\n"
744
"ci cubicipoldeint cubic interpolating deinterlacer\n"
745
"md mediandeint median deinterlacer\n"
746
"fd ffmpegdeint ffmpeg deinterlacer\n"
747
"l5 lowpass5 FIR lowpass deinterlacer\n"
748
"de default hb:a,vb:a,dr:a\n"
749
"fa fast h1:a,v1:a,dr:a\n"
750
"ac ha:a:128:7,va:a,dr:a\n"
751
"tn tmpnoise (3 threshold) temporal noise reducer\n"
752
" 1. <= 2. <= 3. larger -> stronger filtering\n"
753
// filter names are matched with case-sensitive strcmp, so spell it like the filters[] entry "forcequant"
"fq forcequant <quantizer> force quantizer\n"
755
"<filterName>[:<option>[:<option>...]][[,|/][-]<filterName>[:<option>...]]...\n"
756
"long form example:\n"
757
"vdeblock:autoq/hdeblock:autoq/linblenddeint default,-vdeblock\n"
758
"short form example:\n"
759
"vb:a/hb:a/lb de,-vb\n"
764
pp_mode_t *pp_get_mode_by_name_and_quality(char *name, int quality)
766
char temp[GET_MODE_BUFFER_SIZE];
768
char *filterDelimiters= ",/";
769
char *optionDelimiters= ":";
770
struct PPMode *ppMode;
773
ppMode= memalign(8, sizeof(PPMode));
776
ppMode->chromMode= 0;
777
ppMode->maxTmpNoise[0]= 700;
778
ppMode->maxTmpNoise[1]= 1500;
779
ppMode->maxTmpNoise[2]= 3000;
780
ppMode->maxAllowedY= 234;
781
ppMode->minAllowedY= 16;
782
ppMode->baseDcDiff= 256/8;
783
ppMode->flatnessThreshold= 56-16-1;
784
ppMode->maxClippedThreshold= 0.01;
787
strncpy(temp, name, GET_MODE_BUFFER_SIZE);
789
if(verbose>1) printf("pp: %s\n", name);
793
int q= 1000000; //PP_QUALITY_MAX;
797
char *options[OPTIONS_ARRAY_SIZE];
800
int numOfUnknownOptions=0;
801
int enable=1; //does the user want us to enabled or disabled the filter
803
filterToken= strtok(p, filterDelimiters);
804
if(filterToken == NULL) break;
805
p+= strlen(filterToken) + 1; // p points to next filterToken
806
filterName= strtok(filterToken, optionDelimiters);
807
if(verbose>1) printf("pp: %s::%s\n", filterToken, filterName);
809
if(*filterName == '-')
815
for(;;){ //for all options
816
option= strtok(NULL, optionDelimiters);
817
if(option == NULL) break;
819
if(verbose>1) printf("pp: option: %s\n", option);
820
if(!strcmp("autoq", option) || !strcmp("a", option)) q= quality;
821
else if(!strcmp("nochrom", option) || !strcmp("y", option)) chrom=0;
822
else if(!strcmp("chrom", option) || !strcmp("c", option)) chrom=1;
823
else if(!strcmp("noluma", option) || !strcmp("n", option)) luma=0;
826
options[numOfUnknownOptions] = option;
827
numOfUnknownOptions++;
829
if(numOfUnknownOptions >= OPTIONS_ARRAY_SIZE-1) break;
831
options[numOfUnknownOptions] = NULL;
833
/* replace stuff from the replace Table */
834
for(i=0; replaceTable[2*i]!=NULL; i++)
836
if(!strcmp(replaceTable[2*i], filterName))
838
int newlen= strlen(replaceTable[2*i + 1]);
842
if(p==NULL) p= temp, *p=0; //last filter
843
else p--, *p=','; //not last filter
846
spaceLeft= p - temp + plen;
847
if(spaceLeft + newlen >= GET_MODE_BUFFER_SIZE)
852
memmove(p + newlen, p, plen+1);
853
memcpy(p, replaceTable[2*i + 1], newlen);
858
for(i=0; filters[i].shortName!=NULL; i++)
860
// printf("Compareing %s, %s, %s\n", filters[i].shortName,filters[i].longName, filterName);
861
if( !strcmp(filters[i].longName, filterName)
862
|| !strcmp(filters[i].shortName, filterName))
864
ppMode->lumMode &= ~filters[i].mask;
865
ppMode->chromMode &= ~filters[i].mask;
868
if(!enable) break; // user wants to disable it
870
if(q >= filters[i].minLumQuality && luma)
871
ppMode->lumMode|= filters[i].mask;
872
if(chrom==1 || (chrom==-1 && filters[i].chromDefault))
873
if(q >= filters[i].minChromQuality)
874
ppMode->chromMode|= filters[i].mask;
876
if(filters[i].mask == LEVEL_FIX)
879
ppMode->minAllowedY= 16;
880
ppMode->maxAllowedY= 234;
881
for(o=0; options[o]!=NULL; o++)
883
if( !strcmp(options[o],"fullyrange")
884
||!strcmp(options[o],"f"))
886
ppMode->minAllowedY= 0;
887
ppMode->maxAllowedY= 255;
888
numOfUnknownOptions--;
892
else if(filters[i].mask == TEMP_NOISE_FILTER)
897
for(o=0; options[o]!=NULL; o++)
900
ppMode->maxTmpNoise[numOfNoises]=
901
strtol(options[o], &tail, 0);
905
numOfUnknownOptions--;
906
if(numOfNoises >= 3) break;
910
else if(filters[i].mask == V_DEBLOCK || filters[i].mask == H_DEBLOCK
911
|| filters[i].mask == V_A_DEBLOCK || filters[i].mask == H_A_DEBLOCK)
915
for(o=0; options[o]!=NULL && o<2; o++)
918
int val= strtol(options[o], &tail, 0);
919
if(tail==options[o]) break;
921
numOfUnknownOptions--;
922
if(o==0) ppMode->baseDcDiff= val;
923
else ppMode->flatnessThreshold= val;
926
else if(filters[i].mask == FORCE_QUANT)
929
ppMode->forcedQuant= 15;
931
for(o=0; options[o]!=NULL && o<1; o++)
934
int val= strtol(options[o], &tail, 0);
935
if(tail==options[o]) break;
937
numOfUnknownOptions--;
938
ppMode->forcedQuant= val;
943
if(!filterNameOk) ppMode->error++;
944
ppMode->error += numOfUnknownOptions;
947
if(verbose>1) printf("pp: lumMode=%X, chromMode=%X\n", ppMode->lumMode, ppMode->chromMode);
950
fprintf(stderr, "%d errors in postprocess string \"%s\"\n", ppMode->error, name);
957
void pp_free_mode(pp_mode_t *mode){
961
static void reallocAlign(void **p, int alignment, int size){
963
*p= memalign(alignment, size);
967
static void reallocBuffers(PPContext *c, int width, int height, int stride, int qpStride){
968
int mbWidth = (width+15)>>4;
969
int mbHeight= (height+15)>>4;
973
c->qpStride= qpStride;
975
reallocAlign((void **)&c->tempDst, 8, stride*24);
976
reallocAlign((void **)&c->tempSrc, 8, stride*24);
977
reallocAlign((void **)&c->tempBlocks, 8, 2*16*8);
978
reallocAlign((void **)&c->yHistogram, 8, 256*sizeof(uint64_t));
980
c->yHistogram[i]= width*height/64*15/256;
984
//Note: the +17*1024 is just there so I don't have to worry about reads/writes past the end
985
reallocAlign((void **)&c->tempBlured[i], 8, stride*mbHeight*16 + 17*1024);
986
reallocAlign((void **)&c->tempBluredPast[i], 8, 256*((height+7)&(~7))/2 + 17*1024);//FIXME size
989
reallocAlign((void **)&c->deintTemp, 8, 2*width+32);
990
reallocAlign((void **)&c->nonBQPTable, 8, qpStride*mbHeight*sizeof(QP_STORE_T));
991
reallocAlign((void **)&c->stdQPTable, 8, qpStride*mbHeight*sizeof(QP_STORE_T));
992
reallocAlign((void **)&c->forcedQPTable, 8, mbWidth*sizeof(QP_STORE_T));
995
static void global_init(void){
997
memset(clip_table, 0, 256);
998
for(i=256; i<512; i++)
1000
memset(clip_table+512, 0, 256);
1003
pp_context_t *pp_get_context(int width, int height, int cpuCaps){
1004
PPContext *c= memalign(32, sizeof(PPContext));
1005
int stride= (width+15)&(~15); //assumed / will realloc if needed
1006
int qpStride= (width+15)/16 + 2; //assumed / will realloc if needed
1010
memset(c, 0, sizeof(PPContext));
1011
c->cpuCaps= cpuCaps;
1012
if(cpuCaps&PP_FORMAT){
1013
c->hChromaSubSample= cpuCaps&0x3;
1014
c->vChromaSubSample= (cpuCaps>>4)&0x3;
1016
c->hChromaSubSample= 1;
1017
c->vChromaSubSample= 1;
1020
reallocBuffers(c, width, height, stride, qpStride);
1027
void pp_free_context(void *vc){
1028
PPContext *c = (PPContext*)vc;
1031
for(i=0; i<3; i++) free(c->tempBlured[i]);
1032
for(i=0; i<3; i++) free(c->tempBluredPast[i]);
1034
free(c->tempBlocks);
1035
free(c->yHistogram);
1039
free(c->stdQPTable);
1040
free(c->nonBQPTable);
1041
free(c->forcedQPTable);
1043
memset(c, 0, sizeof(PPContext));
1048
void pp_postprocess(uint8_t * src[3], int srcStride[3],
1049
uint8_t * dst[3], int dstStride[3],
1050
int width, int height,
1051
QP_STORE_T *QP_store, int QPStride,
1052
pp_mode_t *vm, void *vc, int pict_type)
1054
int mbWidth = (width+15)>>4;
1055
int mbHeight= (height+15)>>4;
1056
PPMode *mode = (PPMode*)vm;
1057
PPContext *c = (PPContext*)vc;
1058
int minStride= MAX(ABS(srcStride[0]), ABS(dstStride[0]));
1059
int absQPStride = ABS(QPStride);
1061
// c->stride and c->qpStride are always positive
1062
if(c->stride < minStride || c->qpStride < absQPStride)
1063
reallocBuffers(c, width, height,
1064
MAX(minStride, c->stride),
1065
MAX(c->qpStride, absQPStride));
1067
if(QP_store==NULL || (mode->lumMode & FORCE_QUANT))
1070
QP_store= c->forcedQPTable;
1071
absQPStride = QPStride = 0;
1072
if(mode->lumMode & FORCE_QUANT)
1073
for(i=0; i<mbWidth; i++) QP_store[i]= mode->forcedQuant;
1075
for(i=0; i<mbWidth; i++) QP_store[i]= 1;
1077
//printf("pict_type:%d\n", pict_type);
1079
if(pict_type & PP_PICT_TYPE_QP2){
1081
const int count= mbHeight * absQPStride;
1082
for(i=0; i<(count>>2); i++){
1083
((uint32_t*)c->stdQPTable)[i] = (((uint32_t*)QP_store)[i]>>1) & 0x7F7F7F7F;
1085
for(i<<=2; i<count; i++){
1086
c->stdQPTable[i] = QP_store[i]>>1;
1088
QP_store= c->stdQPTable;
1089
QPStride= absQPStride;
1094
for(y=0; y<mbHeight; y++){
1095
for(x=0; x<mbWidth; x++){
1096
printf("%2d ", QP_store[x + y*QPStride]);
1103
if((pict_type&7)!=3)
1105
if (QPStride >= 0) {
1107
const int count= mbHeight * QPStride;
1108
for(i=0; i<(count>>2); i++){
1109
((uint32_t*)c->nonBQPTable)[i] = ((uint32_t*)QP_store)[i] & 0x3F3F3F3F;
1111
for(i<<=2; i<count; i++){
1112
c->nonBQPTable[i] = QP_store[i] & 0x3F;
1116
for(i=0; i<mbHeight; i++) {
1117
for(j=0; j<absQPStride; j++) {
1118
c->nonBQPTable[i*absQPStride+j] = QP_store[i*QPStride+j] & 0x3F;
1126
printf("using npp filters 0x%X/0x%X\n", mode->lumMode, mode->chromMode);
1129
postProcess(src[0], srcStride[0], dst[0], dstStride[0],
1130
width, height, QP_store, QPStride, 0, mode, c);
1132
width = (width )>>c->hChromaSubSample;
1133
height = (height)>>c->vChromaSubSample;
1137
postProcess(src[1], srcStride[1], dst[1], dstStride[1],
1138
width, height, QP_store, QPStride, 1, mode, c);
1139
postProcess(src[2], srcStride[2], dst[2], dstStride[2],
1140
width, height, QP_store, QPStride, 2, mode, c);
1142
else if(srcStride[1] == dstStride[1] && srcStride[2] == dstStride[2])
1144
linecpy(dst[1], src[1], height, srcStride[1]);
1145
linecpy(dst[2], src[2], height, srcStride[2]);
1150
for(y=0; y<height; y++)
1152
memcpy(&(dst[1][y*dstStride[1]]), &(src[1][y*srcStride[1]]), width);
1153
memcpy(&(dst[2][y*dstStride[2]]), &(src[2][y*srcStride[2]]), width);