1
/*****************************************************************************
2
* postprocessing_mmxext.c: Post Processing plugin MMXEXT
3
*****************************************************************************
4
* Copyright (C) 2001 VideoLAN
5
* $Id: postprocessing_mmxext.c 6961 2004-03-05 17:34:23Z sam $
7
* Authors: Laurent Aimar <fenrir@via.ecp.fr>
9
* This program is free software; you can redistribute it and/or modify
10
* it under the terms of the GNU General Public License as published by
11
* the Free Software Foundation; either version 2 of the License, or
12
* (at your option) any later version.
14
* This program is distributed in the hope that it will be useful,
15
* but WITHOUT ANY WARRANTY; without even the implied warranty of
16
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17
* GNU General Public License for more details.
19
* You should have received a copy of the GNU General Public License
20
* along with this program; if not, write to the Free Software
21
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
22
*****************************************************************************/
24
#include <vlc/vlc.h> /* only use uint8_t, uint32_t .... */
26
#include "postprocessing.h"
27
#include "postprocessing_common.h"
29
/*****************************************************************************
31
* Internal functions common to pp_deblock_V and pp_deblock_H
33
*****************************************************************************/
35
/*****************************************************************************
37
*****************************************************************************/
40
/* XXX PP_THR1 needs to be defined as ULL */
42
/* Same trick as in the idct code, but how does it work? */
43
#define UNUSED_LONGLONG( foo ) \
44
static const unsigned long long foo __asm__ (#foo) __attribute__((unused))
46
/* to calculate isDC_mode for mmx */
47
UNUSED_LONGLONG( mmx_thr1 ) = ( PP_THR1 << 56 )|
56
UNUSED_LONGLONG( mmx_127_thr1 ) = ( ( 127ULL - PP_THR1 ) << 56 )|
57
( ( 127ULL - PP_THR1 ) << 48 )|
58
( ( 127ULL - PP_THR1 ) << 40 )|
59
( ( 127ULL - PP_THR1 ) << 32 )|
60
( ( 127ULL - PP_THR1 ) << 24 )|
61
( ( 127ULL - PP_THR1 ) << 16 )|
62
( ( 127ULL - PP_THR1 ) << 8 )|
63
( ( 127ULL - PP_THR1 ) );
65
UNUSED_LONGLONG( mmx_127_2xthr1_1 ) = ( ( 127ULL - PP_2xTHR1 -1) << 56 )|
66
( ( 127ULL - PP_2xTHR1 -1 ) << 48 )|
67
( ( 127ULL - PP_2xTHR1 -1 ) << 40 )|
68
( ( 127ULL - PP_2xTHR1 -1 ) << 32 )|
69
( ( 127ULL - PP_2xTHR1 -1 ) << 24 )|
70
( ( 127ULL - PP_2xTHR1 -1 ) << 16 )|
71
( ( 127ULL - PP_2xTHR1 -1 ) << 8 )|
72
( ( 127ULL - PP_2xTHR1 -1 ) );
74
UNUSED_LONGLONG( mmx_m2_5_m5_2 ) = 0xfffe0005fffb0002ULL;
77
/* find min byte from r and set it in r, t is destroyed */
78
#define MMXEXT_GET_PMIN( r, t ) \
79
"movq " #r ", " #t " \n" \
80
"psrlq $8, " #t " \n" \
81
"pminub " #t ", " #r " \n" \
82
"pshufw $0xf5, " #r ", " #t " #instead of shift with tmp reg \n" \
83
"pminub " #t ", " #r " \n" \
84
"pshufw $0xfe, " #r ", " #t " \n" \
85
"pminub " #t ", " #r " \n"
87
/* find max byte from r and set it in r, t is destroyed */
88
#define MMXEXT_GET_PMAX( r, t ) \
89
"movq " #r ", " #t " \n" \
90
"psrlq $8, " #t " \n" \
91
"pmaxub " #t ", " #r " \n" \
92
"pshufw $0xf5, " #r ", " #t " \n" \
93
"pmaxub " #t ", " #r " \n" \
94
"pshufw $0xfe, " #r ", " #t " \n" \
95
"pmaxub " #t ", " #r " \n"
99
#define MMXEXT_GET_LMINMAX( s, m, M, t ) \
100
"movq " #s ", " #t " \n" \
101
"pminub " #t ", " #m " \n" \
102
"pmaxub " #t ", " #M " \n"
107
d1 = a - b with unsigned saturate
114
/****************************************************************************
115
* pp_deblock_isDC_mode : Check if we will use DC mode or Default mode
116
****************************************************************************
117
* Use constant PP_THR1 and PP_THR2 ( PP_2xTHR1 )
119
* Called for each pixel on a block boundary when doing deblocking
120
* so need to be fast ...
122
****************************************************************************/
123
static inline int pp_deblock_isDC_mode( uint8_t *p_v )
125
unsigned int i_eq_cnt;
129
x = v[i] - v[i+1] without signed saturation
130
( XXX see if there isn't a problem, but can't be with signed
131
sat because pixel will be saturate :(
132
so x within [-128, 127] and we have to test if it fit in [-M, M]
133
we add 127-M with wrap around -> good value fit in [ 127-2*M, 127]
134
and if x >= 127 - 2 * M ie x > 127 -2*M - 1 value is good
137
__asm__ __volatile__ (
138
" #* Do (v0-v1) to (v7-v8) \n"
139
"movq (%1), %%mm1 # load v0->v7 \n"
140
"movq 1(%1), %%mm2 # load v1->v8 \n"
141
"psubb %%mm2, %%mm1 # v[i]-v[i+1] \n"
142
"paddb mmx_127_thr1, %%mm1 # + 127-THR1 with wrap \n"
143
"pcmpgtb mmx_127_2xthr1_1, %%mm1 # > 127 -2*thr1 - 1 \n"
144
"pxor %%mm0, %%mm0 # mm0 = 0 \n"
145
"psadbw %%mm1, %%mm0 \n"
146
"movd %%mm0, %0 # \n"
150
: "=r"(i_eq_cnt) : "r" (p_v) );
152
__asm__ __volatile__ (
153
" #* Do (v0-v1) to (v7-v8) \n"
154
"movq (%1), %%mm1 # load v0->v7 \n"
155
"pxor %%mm0, %%mm0 # mm0 = 0 \n"
156
"movq 1(%1), %%mm2 # load v1->v8 \n"
157
"psubb %%mm2, %%mm1 # v[i]-v[i+1] \n"
158
"paddb mmx_127_thr1, %%mm1 # + 127-THR1 with wrap \n"
159
"pcmpgtb mmx_127_2xthr1_1, %%mm1 # > 127 -2*thr1 - 1 \n"
160
"psadbw %%mm1, %%mm0 \n"
161
"movd %%mm0, %0 # \n"
164
: "=r"(i_eq_cnt) : "r" (p_v) );
166
/* last test: the 9th difference doesn't fit in the MMX register */
167
if(( ( p_v[8] - p_v[9] + PP_THR1 )&0xffff )<= PP_2xTHR1 )
173
/* algo : if ( | v[i] -v[i+1] | <= PP_THR1 ) { i_eq_cnt++; } */
176
for( i =0; i < 9; i++ )
178
if(( ( p_v[i] - p_v[i+1] + PP_THR1 )&0xffff )<= PP_2xTHR1 )
185
return( (i_eq_cnt >= PP_THR2 ) ? 1 : 0 );
188
static inline int pp_deblock_isMinMaxOk( uint8_t *p_v, int i_QP )
192
__asm__ __volatile__ (
193
"movq 1(%1), %%mm0 # 8 bytes \n"
194
"movq %%mm0, %%mm1 \n"
195
MMXEXT_GET_PMIN( %%mm0, %%mm7 )
196
MMXEXT_GET_PMAX( %%mm1, %%mm7 )
197
"psubd %%mm0, %%mm1 # max - min \n"
199
"andl $255, %0" : "=r"(i_range) : "r"(p_v) );
205
i_min = i_max = p_v[1];
206
for( i = 2; i < 9; i++ )
208
if( i_max < p_v[i] ) i_max = p_v[i];
209
if( i_min > p_v[i] ) i_min = p_v[i];
211
i_range = i_max - i_min;
214
return( i_range< 2*i_QP ? 1 : 0 );
218
static inline void pp_deblock_DefaultMode( uint8_t i_v[10], int i_stride,
222
int a3x0, a3x0_, a3x1, a3x2;
225
/* d = CLIP( 5(a3x0' - a3x0)//8, 0, (v4-v5)/2 ).d( abs(a3x0) < QP ) */
227
/* First calculate a3x0 */
228
__asm__ __volatile__ (
229
"pxor %%mm7, %%mm7 # mm7 = 0 \n"
230
"movq mmx_m2_5_m5_2, %%mm6 # mm6 =(2,-5,5,-2) \n"
231
"movd 3(%1), %%mm0 \n"
232
"punpcklbw %%mm7,%%mm0 \n"
233
"pmaddwd %%mm6, %%mm0 \n"
234
"pshufw $0xfe, %%mm0, %%mm1 \n"
235
"paddd %%mm1, %%mm0 \n"
236
"movd %%mm0, %0" : "=r"(a3x0) :"r"(i_v) );
238
a3x0 = 2 * ( i_v[3] - i_v[6] ) + 5 *( i_v[5] - i_v[4] );
250
/* XXX Now a3x0 is abs( a3x0 ) */
251
if( ( a3x0 < 8 * i_QP )&&( a3x0 != 0 ) ) /* |a3x0| < 8*i_QP */
253
/* calculate a3x1 and a3x2 */
254
__asm__ __volatile__ (
256
" # mm6 = ( 2, -5, 5, -2 ) \n"
257
"movd 1(%2), %%mm0 \n"
258
"movd 5(%2), %%mm2 \n"
259
"punpcklbw %%mm7,%%mm0 \n"
260
"punpcklbw %%mm7,%%mm2 \n"
261
"pmaddwd %%mm6, %%mm0 \n"
262
"pmaddwd %%mm6, %%mm2 \n"
263
"pshufw $0xfe, %%mm0, %%mm1 \n"
264
"paddd %%mm1, %%mm0 # mm0 = a3x1 \n"
266
"pshufw $0xfe, %%mm2, %%mm1 \n"
267
"paddd %%mm1, %%mm2 # mm2 = a3x2 \n"
269
: "=r"(a3x1), "=r"(a3x2) : "r"(i_v) );
271
a3x1 = 2 * ( i_v[1] - i_v[4] ) + 5 * ( i_v[3] - i_v[2] );
272
a3x2 = 2 * ( i_v[5] - i_v[8] ) + 5 * ( i_v[7] - i_v[6] );
275
if( a3x1 < 0) a3x1 = -a3x1; /* abs( a3x1 ) */
276
if( a3x2 < 0) a3x2 = -a3x2; /* abs( a3x2 ) */
278
a3x0_ = PP_MIN3( a3x0, a3x1, a3x2 );
280
d = 5 *( a3x0 - a3x0_ ) / 8; /* always > 0 */
282
i_delta = ( i_v[4] - i_v[5] ) / 2;
283
/* clip into [0, i_delta] or [i_delta, 0] */
286
if( !b_neg ) /* since true d has sgn(d) = - sgn( a3x0 ) */
289
if( d < i_delta ) d = i_delta;
298
if( d > i_delta ) d = i_delta;
308
static inline void pp_deblock_DCMode( uint8_t *p_v, /* = int i_v[10] */
312
i_p0 = PP_ABS( p_v[1] - p_v[0] ) < i_QP ? p_v[0] : p_v[1];
313
i_p9 = PP_ABS( p_v[8] - p_v[9] ) < i_QP ? p_v[9] : p_v[8];
315
/* mm0 = 8 pix unmodified
316
-We will process first 4 pixel
317
mm0 = 8 pix unmodified
318
mm1 = for the first part of the 4 first pix
319
(v1) -> (p0) -> ... ( word )
324
= for the common part between first and last pix
325
(v2) -> (v3) -> ... ( word )
330
= for the last part of the 4 last pix
331
(v5) -> (v6) -> ... ( word )
336
mm2 = acu for first new pix
337
mm3 = acu for last pix
342
__asm__ __volatile__ (
343
"pxor %%mm7, %%mm7 \n"
344
"movq 1(%0), %%mm0 # get 8 pix \n"
345
" # unpack into mm1 \n"
346
"movq %%mm0, %%mm1 \n"
347
"punpcklbw %%mm7, %%mm1 \n"
348
" # get p_0 and i_p9 \n"
351
"psllq $48, %%mm6 \n"
353
"movq %%mm1, %%mm3 # p_v[5-8] = v[1-4] !! \n"
354
"movq %%mm1, %%mm2 \n"
355
"psllw $2, %%mm2 # p_v[1-4] = 4*v[1-4] \n"
357
"psllq $16, %%mm1 \n"
358
"por %%mm5, %%mm1 # mm1 =( p0, v1, v2 ,v3)\n"
360
"paddw %%mm1, %%mm2 \n"
361
"paddw %%mm1, %%mm2 \n"
363
"pshufw $0x90,%%mm1,%%mm1 # mm1 =( p0, p0, v1, v2)\n"
364
"paddw %%mm1, %%mm2 \n"
365
"paddw %%mm1, %%mm2 \n"
367
"pshufw $0x90,%%mm1,%%mm1 # mm1 =( p0, p0, p0, v2)\n"
368
"paddw %%mm1, %%mm2 \n"
370
"pshufw $0x90,%%mm1,%%mm1 # mm1 =( p0, p0, p0, p0)\n"
371
"paddw %%mm1, %%mm2 \n"
372
" # Now last part a little borring\n"
373
" # last part for mm2, beginig for mm3\n"
374
"movq %%mm0, %%mm1 \n"
376
"punpcklbw %%mm7, %%mm1 # mm1 =( v2, v3, v4, v5 )\n"
377
"paddw %%mm1, %%mm2 \n"
378
"paddw %%mm1, %%mm2 \n"
379
"paddw %%mm1, %%mm3 \n"
382
"movq %%mm0, %%mm1 \n"
383
"psrlq $16, %%mm1 \n"
384
"punpcklbw %%mm7, %%mm1 # mm1 =( v3, v4, v5, v6 )\n"
386
"paddw %%mm1, %%mm2 \n"
387
"paddw %%mm1, %%mm3 \n"
389
"movq %%mm0, %%mm1 \n"
390
"psrlq $24, %%mm1 \n"
391
"punpcklbw %%mm7, %%mm1 # mm1 =( v4, v5, v6, v7) \n"
392
"paddw %%mm1, %%mm2 \n"
393
"paddw %%mm1, %%mm3 \n"
394
"paddw %%mm1, %%mm3 \n"
396
"movq %%mm0, %%mm1 \n"
397
"psrlq $32, %%mm1 \n"
398
"punpcklbw %%mm7, %%mm1 # mm1 =( v5, v6, v7, v8) \n"
399
"paddw %%mm1, %%mm2 \n"
401
"paddw %%mm1, %%mm3 \n"
402
" # Now last part for last 4 pix \n"
404
"movq %%mm0, %%mm1 \n"
405
"punpckhbw %%mm7, %%mm1 # mm1 = ( v5, v6, v7, v8) \n"
407
"psrlq $16, %%mm1 \n"
408
"por %%mm6, %%mm1 # mm1 =( v6, v7, v8, p9 )\n"
410
"paddw %%mm1, %%mm3 \n"
411
"paddw %%mm1, %%mm3 \n"
413
"pshufw $0xf9,%%mm1,%%mm1 # mm1 =( v7, v8, p9, p9)\n"
414
"paddw %%mm1, %%mm3 \n"
415
"paddw %%mm1, %%mm3 \n"
417
"pshufw $0xf9,%%mm1,%%mm1 # mm1 =( v8, p9, p9, p9)\n"
418
"paddw %%mm1, %%mm3 \n"
420
"pshufw $0xf9,%%mm1,%%mm1 # mm1 =( p9, p9, p9, p9)\n"
421
"paddw %%mm1, %%mm3 \n"
425
"packuswb %%mm3, %%mm2 \n"
426
"movq %%mm2, 1(%0) \n"
428
: : "r"(p_v), "r"(i_p0), "r"(i_p9) : "memory" );
431
for( i = 1; i < 9; i++ )
433
v[i] = p_v[i]; /* save 8 pix that will be modified */
436
p_v[1] = ( 6 * i_p0 + 4 * v[1]
437
+ 2 *( v[2] + v[3]) + v[4] + v[5]) >> 4;
439
p_v[2] = ( 4 * i_p0 + 2 * v[1] + 4 * v[2]
440
+ 2 *( v[3] + v[4]) + v[5] + v[6]) >> 4;
442
p_v[3] = ( 2 * i_p0 + 2 * (v[1] + v[2]) + 4 * v[3]
443
+ 2 *( v[4] + v[5]) + v[6] + v[7]) >> 4;
445
p_v[4] = ( i_p0 + v[1] + 2 * (v[2] + v[3]) + 4 * v[4]
446
+ 2 *( v[5] + v[6]) + v[7] + v[8]) >> 4;
448
p_v[5] = ( v[1] + v[2] + 2 * (v[3] + v[4]) + 4 * v[5]
449
+ 2 *( v[6] + v[7]) + v[8] + i_p9) >> 4;
451
p_v[6] = ( v[2] + v[3] + 2 * (v[4] + v[5]) + 4 * v[6]
452
+ 2 *( v[7] + v[8]) + 2 * i_p9) >> 4;
454
p_v[7] = ( v[3] + v[4] + 2 * (v[5] + v[6]) + 4 * v[7]
455
+ 2 * v[8] + 4 * i_p9) >> 4;
457
p_v[8] = ( v[4] + v[5] + 2 * (v[6] + v[7]) + 4 * v[8]
465
/*****************************************************************************/
466
/*---------------------------------------------------------------------------*/
468
/* ---------- filter Vertical lines so follow horizontal edges -------- */
470
/*---------------------------------------------------------------------------*/
471
/*****************************************************************************/
473
void E_( pp_deblock_V )( uint8_t *p_plane,
474
int i_width, int i_height, int i_stride,
475
QT_STORE_T *p_QP_store, int i_QP_stride,
480
int i_QP_scale; /* use to do ( ? >> i_QP_scale ) */
485
i_QP_scale = b_chroma ? 5 : 4 ;
487
for( y = 8; y < i_height - 4; y += 8 )
489
p_v = p_plane + ( y - 5 )* i_stride;
490
for( x = 0; x < i_width; x++ )
492
/* First get 10 vert pix to use them without i_stride */
493
for( i = 0; i < 10; i++ )
495
i_v[i] = p_v[i*i_stride + x];
498
i_QP = p_QP_store[(y>>i_QP_scale)*i_QP_stride+
500
/* XXX QP is for v5 */
501
if( pp_deblock_isDC_mode( i_v ) )
503
if( pp_deblock_isMinMaxOk( i_v, i_QP ) )
505
pp_deblock_DCMode( i_v, i_QP );
510
pp_deblock_DefaultMode( i_v, i_stride, i_QP );
513
/* Copy back, XXX only 1-8 were modified */
514
for( i = 1; i < 9; i++ )
516
p_v[i*i_stride + x] = i_v[i];
524
/*****************************************************************************/
525
/*---------------------------------------------------------------------------*/
527
/* --------- filter Horizontal lines so follow vertical edges -------- */
529
/*---------------------------------------------------------------------------*/
530
/*****************************************************************************/
532
void E_( pp_deblock_H )( uint8_t *p_plane,
533
int i_width, int i_height, int i_stride,
534
QT_STORE_T *p_QP_store, int i_QP_stride,
542
i_QP_scale = b_chroma ? 5 : 4 ;
544
for( y = 0; y < i_height; y++ )
546
p_v = p_plane + y * i_stride - 5;
547
for( x = 8; x < i_width - 4; x += 8 )
549
/* p_v points 5 pix before a block boundary */
550
/* XXX QP is for v5 */
551
i_QP = p_QP_store[(y>>i_QP_scale)*i_QP_stride+
553
if( pp_deblock_isDC_mode( p_v + x ) )
555
if( pp_deblock_isMinMaxOk( p_v+ x, i_QP ) )
557
pp_deblock_DCMode( p_v+x, i_QP );
562
pp_deblock_DefaultMode( p_v+x, i_stride, i_QP );
571
/*****************************************************************************
573
* Internal functions common to pp_dering_Y and pp_dering_C
575
*****************************************************************************/
577
static inline void pp_dering_MinMax( uint8_t *p_block, int i_stride,
578
int *pi_min, int *pi_max )
581
/* First we will extract min/max for each pix on vertical line
582
and next extract global min/max */
583
__asm__ __volatile__(
585
"leal (%2,%3), %%eax \n"
586
"movq (%2), %%mm0 #load line \n"
587
"movq %%mm0, %%mm1 \n"
589
MMXEXT_GET_LMINMAX( (%%eax), %%mm0, %%mm1, %%mm7 )
590
MMXEXT_GET_LMINMAX( (%%eax, %3), %%mm0, %%mm1, %%mm7 )
591
MMXEXT_GET_LMINMAX( (%%eax, %3,2), %%mm0, %%mm1, %%mm7 )
592
MMXEXT_GET_LMINMAX( (%2, %3, 4), %%mm0, %%mm1, %%mm7 )
593
"leal (%%eax,%3,4), %%eax \n"
595
MMXEXT_GET_LMINMAX( (%%eax), %%mm0, %%mm1, %%mm7 )
596
MMXEXT_GET_LMINMAX( (%%eax, %3), %%mm0, %%mm1, %%mm7 )
597
MMXEXT_GET_LMINMAX( (%%eax, %3,2), %%mm0, %%mm1, %%mm7 )
598
MMXEXT_GET_PMIN( %%mm0, %%mm7 )
599
MMXEXT_GET_PMAX( %%mm1, %%mm7 )
600
"movd %%mm0, %%eax \n"
601
"andl $255, %%eax \n"
602
"movl %%eax, (%0) \n"
603
"movd %%mm1, %%eax \n"
604
"andl $255, %%eax \n"
605
"movl %%eax, (%1) \n"
607
: : "r"(pi_min), "r"(pi_max), "r"(p_block), "r"(i_stride) : "%eax", "memory" );
610
i_min = 255; i_max = 0;
612
for( y = 0; y < 8; y++ )
614
for( x = 0; x < 8; x++ )
616
if( i_min > p_block[x] ) i_min = p_block[x];
617
if( i_max < p_block[x] ) i_max = p_block[x];
628
static inline void pp_dering_BinIndex( uint8_t *p_block, int i_stride,
629
int i_thr, uint32_t *p_bin )
634
/* first create mm7 with all bytes set to thr and mm6 = 0 */
635
__asm__ __volatile__(
638
"movd %%eax, %%mm7 \n"
639
"pshufw $0x00, %%mm7, %%mm7 \n"
640
"pxor %%mm6, %%mm6 \n"
641
: : "r"(i_thr) : "%eax" );
643
for( y = 0; y < 10; y++ )
645
__asm__ __volatile__(
646
"movq (%1), %%mm0 \n"
647
"psubusb %%mm7, %%mm0 \n" /* sat makes that x <= thr --> 0 */
648
"pcmpeqb %%mm6, %%mm0 \n" /* p_block <= i_thr ? -1 : 0 */
649
"pmovmskb %%mm0, %0 \n" /* i_bin msb of each bytes */
650
: "=r"(i_bin) :"r"(p_block) );
651
/* Now last 2 tests */
652
if( p_block[8] <= i_thr ) i_bin |= 1 << 8;
653
if( p_block[9] <= i_thr ) i_bin |= 1 << 9;
655
i_bin |= (~i_bin) << 16; /* for detect three 1 or three 0*/
656
/* Keep only the bit positions where three consecutive bits of i_bin are set
   (the mask ANDed with itself shifted by one in each direction). This has
   to be a bitwise '&', matching the non-asm loop further down in this
   function: a logical '&&' would collapse the whole mask to 0 or 1. */
*p_bin = ( i_bin >> 1 ) & ( i_bin ) & ( i_bin << 1 );
664
for( y = 0; y < 10; y++ )
667
for( x = 0; x < 10; x++ )
669
if( p_block[x] > i_thr )
674
i_bin |= (~i_bin) << 16; /* for detect also three 0 */
675
*p_bin = i_bin&( i_bin >> 1 )&( i_bin << 1 );
684
static inline void pp_dering_Filter( uint8_t *p_block, int i_stride,
698
for( y = 0; y < 8; y++ )
700
i_bin = p_bin[y] & p_bin[y+1] & p_bin[y+2]; /* To be optimised */
701
i_bin |= i_bin >> 16; /* detect 0 or 1 */
703
for( x = 0; x < 8; x++ )
705
if( i_bin&0x02 ) /* 0x02 since 10 index but want 1-9 */
711
i_f = p_block[x - i_stride - 1] +
712
( p_block[x - i_stride ] << 1)+
713
p_block[x - i_stride + 1] +
715
( p_block[x - 1] << 1 )+
716
( p_block[x ] << 2 )+
717
( p_block[x + 1] << 1 )+
719
p_block[x + i_stride - 1] +
720
( p_block[x + i_stride ] << 1 ) +
721
p_block[x + i_stride + 1];
723
i_flt[y][x] = ( 8 + i_f ) >> 4;
727
i_flt[y][x] = p_block[x];
736
/* Create mm7 with all bytes set to QP/2 */
737
__asm__ __volatile__(
739
"shrl $1, %%eax \n" /* i_QP/2 */
741
"movd %%eax, %%mm7 \n"
742
"pshufw $0x00, %%mm7, %%mm7 \n"
743
: : "r"(i_QP) : "%eax" );
745
for( y = 0; y < 8; y++ )
747
/* clamp those values and copy them */
748
__asm__ __volatile__(
749
"movq (%0), %%mm0 \n" /* mm0 = i_ftl[y][0] ... i_ftl[y][7] */
750
"movq (%1), %%mm1 \n" /* mm1 = p_sav[0] ... p_sav[7] */
751
"movq %%mm1, %%mm2 \n"
752
"psubusb %%mm7, %%mm1 \n" /* mm1 = psav - i_QP/2 ( >= 0 ) */
753
"paddusb %%mm7, %%mm2 \n" /* mm2 = psav + i_QP/2 ( <= 255 ) */
754
"pmaxub %%mm1, %%mm0 \n" /* psav - i_QP/2 <= mm0 */
755
"pminub %%mm2, %%mm0 \n" /* mm0 <= psav + i_QP/2 */
756
"movq %%mm0, (%1) \n"
757
: :"r"(i_flt[y]), "r"(p_sav) : "memory" );
764
/*****************************************************************************/
765
/*---------------------------------------------------------------------------*/
767
/* ----------------- Dering filter on Y and C blocks ----------------- */
769
/*---------------------------------------------------------------------------*/
770
/*****************************************************************************/
772
void E_( pp_dering_Y )( uint8_t *p_plane,
773
int i_width, int i_height, int i_stride,
774
QT_STORE_T *p_QP_store, int i_QP_stride )
777
int i_max[4], i_min[4], i_range[4];
779
int i_max_range, i_kmax;
780
uint32_t i_bin[4][10];
784
/* We process 4 blocks/loop*/
785
for( y = 8; y < i_height-8; y += 16 )
793
p_block[0] = p_plane + y * i_stride + 8;
794
p_block[1] = p_block[0] + 8;
795
p_block[2] = p_block[0] + ( i_stride << 3 );
796
p_block[3] = p_block[2] + 8;
798
for( x = 8; x < i_width-8; x += 16 )
800
/* 1: Calculate threshold */
801
/* Calculate max/min for each block */
802
pp_dering_MinMax( p_block[0], i_stride, &i_min[0], &i_max[0] );
803
pp_dering_MinMax( p_block[1], i_stride, &i_min[1], &i_max[1] );
804
pp_dering_MinMax( p_block[2], i_stride, &i_min[2], &i_max[2] );
805
pp_dering_MinMax( p_block[3], i_stride, &i_min[3], &i_max[3] );
806
/* Calculate range, max_range and thr */
807
i_max_range = 0; i_kmax = 0;
808
for( k = 0; k < 4; k++ )
810
i_range[k] = i_max[k] - i_min[k];
811
i_thr[k] = ( i_max[k] + i_min[k] + 1 )/2;
812
if( i_max_range < i_max[k])
814
i_max_range = i_max[k];
818
/* Now rearrange thr */
819
if( i_max_range > 64 )
821
/* i_range[] holds 4 entries (blocks 0..3): iterate over 0..3 so that
   block 0 is not skipped and i_range[4] is never read */
for( k = 0; k < 4; k++ )
823
if( i_range[k] < 16 )
828
if( i_range[k] < 32 )
830
i_thr[k] = i_thr[i_kmax];
836
/* i_range[] holds 4 entries (blocks 0..3): iterate over 0..3 so that
   block 0 is not skipped and i_range[4] is never read */
for( k = 0; k < 4; k++ )
838
if( i_range[k] < 16 )
844
/* 2: Index acquisition 10x10 ! so " -i_stride - 1"*/
845
pp_dering_BinIndex( p_block[0] - i_stride - 1, i_stride,
846
i_thr[0], i_bin[0] );
847
pp_dering_BinIndex( p_block[1] - i_stride - 1, i_stride,
848
i_thr[1], i_bin[1] );
849
pp_dering_BinIndex( p_block[2] - i_stride - 1, i_stride,
850
i_thr[2], i_bin[2] );
851
pp_dering_BinIndex( p_block[3] - i_stride - 1, i_stride,
852
i_thr[3], i_bin[3] );
855
/* 3: adaptive smoothing */
856
/* since we begin at (8,8) QP can be different for each block */
857
p_QP = &( p_QP_store[( y >> 4) * i_QP_stride + (x >> 4)] );
859
pp_dering_Filter( p_block[0], i_stride,
862
pp_dering_Filter( p_block[1], i_stride,
865
pp_dering_Filter( p_block[2], i_stride,
866
i_bin[2], p_QP[i_QP_stride] );
868
pp_dering_Filter( p_block[3], i_stride,
869
i_bin[3], p_QP[i_QP_stride+1] );
879
void E_( pp_dering_C )( uint8_t *p_plane,
880
int i_width, int i_height, int i_stride,
881
QT_STORE_T *p_QP_store, int i_QP_stride )
890
for( y = 8; y < i_height-8; y += 8 )
893
p_block = p_plane + y * i_stride + 8;
894
for( x = 8; x < i_width-8; x += 8 )
897
/* 1: Calculate threshold */
898
/* Calculate max/min for each block */
899
pp_dering_MinMax( p_block, i_stride,
902
i_thr = ( i_max + i_min + 1 )/2;
904
/* 2: Index acquisition 10x10 */
905
/* point on the 10x10 area in which we have our 8x8 block */
906
pp_dering_BinIndex( p_block - i_stride -1, i_stride,
910
/* 3: adaptive smoothing */
911
pp_dering_Filter( p_block, i_stride,
913
p_QP_store[(y>>5)*i_QP_stride+ (x>>5)]);