1
/* Copyright (C) 2003 Epic Games (written by Jean-Marc Valin)
2
Copyright (C) 2004-2006 Epic Games
5
Preprocessor with denoising based on the algorithm by Ephraim and Malah
7
Redistribution and use in source and binary forms, with or without
8
modification, are permitted provided that the following conditions are
11
1. Redistributions of source code must retain the above copyright notice,
12
this list of conditions and the following disclaimer.
14
2. Redistributions in binary form must reproduce the above copyright
15
notice, this list of conditions and the following disclaimer in the
16
documentation and/or other materials provided with the distribution.
18
3. The name of the author may not be used to endorse or promote products
19
derived from this software without specific prior written permission.
21
THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
22
IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
23
OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24
DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
25
INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
26
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
27
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28
HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
29
STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
30
ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31
POSSIBILITY OF SUCH DAMAGE.
38
Y. Ephraim and D. Malah, "Speech enhancement using minimum mean-square error
39
short-time spectral amplitude estimator". IEEE Transactions on Acoustics,
40
Speech and Signal Processing, vol. ASSP-32, no. 6, pp. 1109-1121, 1984.
42
Y. Ephraim and D. Malah, "Speech enhancement using minimum mean-square error
43
log-spectral amplitude estimator". IEEE Transactions on Acoustics, Speech and
44
Signal Processing, vol. ASSP-33, no. 2, pp. 443-445, 1985.
46
I. Cohen and B. Berdugo, "Speech enhancement for non-stationary noise environments".
47
Signal Processing, vol. 81, no. 2, pp. 2403-2418, 2001.
49
Stefan Gustafsson, Rainer Martin, Peter Jax, and Peter Vary. "A psychoacoustic
50
approach to combined acoustic echo cancellation and noise reduction". IEEE
51
Transactions on Speech and Audio Processing, 2002.
53
J.-M. Valin, J. Rouat, and F. Michaud, "Microphone array post-filter for separation
54
of simultaneous non-stationary sources". In Proceedings IEEE International
55
Conference on Acoustics, Speech, and Signal Processing, 2004.
63
#include "speex/speex_preprocess.h"
64
#include "speex/speex_echo.h"
67
#include "filterbank.h"
68
#include "math_approx.h"
69
#include "os_support.h"
72
#define M_PI 3.14159263
75
#define LOUDNESS_EXP 5.f
76
#define AMP_SCALE .001f
77
#define AMP_SCALE_1 1000.f
81
#define SPEECH_PROB_START_DEFAULT QCONST16(0.35f,15)
82
#define SPEECH_PROB_CONTINUE_DEFAULT QCONST16(0.20f,15)
83
#define NOISE_SUPPRESS_DEFAULT -15
84
#define ECHO_SUPPRESS_DEFAULT -40
85
#define ECHO_SUPPRESS_ACTIVE_DEFAULT -15
91
#define SQR(x) ((x)*(x))
92
#define SQR16(x) (MULT16_16((x),(x)))
93
#define SQR16_Q15(x) (MULT16_16_Q15((x),(x)))
96
static inline spx_word16_t DIV32_16_Q8(spx_word32_t a, spx_word32_t b)
102
if (b>=QCONST32(1,23))
107
if (b>=QCONST32(1,19))
112
if (b>=QCONST32(1,15))
118
return PDIV32_16(a,b);
122
static inline spx_word16_t DIV32_16_Q15(spx_word32_t a, spx_word32_t b)
124
if (SHR32(a,15) >= b)
128
if (b>=QCONST32(1,23))
133
if (b>=QCONST32(1,19))
138
if (b>=QCONST32(1,15))
144
return DIV32_16(a,b);
147
#define SNR_SCALING 256.f
148
#define SNR_SCALING_1 0.0039062f
151
#define FRAC_SCALING 32767.f
152
#define FRAC_SCALING_1 3.0518e-05
155
#define EXPIN_SCALING 2048.f
156
#define EXPIN_SCALING_1 0.00048828f
157
#define EXPIN_SHIFT 11
158
#define EXPOUT_SCALING_1 1.5259e-05
160
#define NOISE_SHIFT 7
164
#define DIV32_16_Q8(a,b) ((a)/(b))
165
#define DIV32_16_Q15(a,b) ((a)/(b))
166
#define SNR_SCALING 1.f
167
#define SNR_SCALING_1 1.f
169
#define FRAC_SCALING 1.f
170
#define FRAC_SCALING_1 1.f
172
#define NOISE_SHIFT 0
174
#define EXPIN_SCALING 1.f
175
#define EXPIN_SCALING_1 1.f
176
#define EXPOUT_SCALING_1 1.f
180
/** Speex pre-processor state. */
181
struct SpeexPreprocessState_ {
183
int frame_size; /**< Number of samples processed each time */
184
int ps_size; /**< Number of points in the power spectrum */
185
int sampling_rate; /**< Sampling rate of the input/output */
192
int dereverb_enabled;
193
spx_word16_t reverb_decay;
194
spx_word16_t reverb_level;
195
spx_word16_t speech_prob_start;
196
spx_word16_t speech_prob_continue;
199
int echo_suppress_active;
200
SpeexEchoState *echo_state;
202
spx_word16_t speech_prob; /**< Probability last frame was speech */
204
/* DSP-related arrays */
205
spx_word16_t *frame; /**< Processing frame (2*ps_size) */
206
spx_word16_t *ft; /**< Processing frame in freq domain (2*ps_size) */
207
spx_word32_t *ps; /**< Current power spectrum */
208
spx_word16_t *gain2; /**< Adjusted gains */
209
spx_word16_t *gain_floor; /**< Minimum gain allowed */
210
spx_word16_t *window; /**< Analysis/Synthesis window */
211
spx_word32_t *noise; /**< Noise estimate */
212
spx_word32_t *reverb_estimate; /**< Estimate of reverb energy */
213
spx_word32_t *old_ps; /**< Power spectrum for last frame */
214
spx_word16_t *gain; /**< Ephraim Malah gain */
215
spx_word16_t *prior; /**< A-priori SNR */
216
spx_word16_t *post; /**< A-posteriori SNR */
218
spx_word32_t *S; /**< Smoothed power spectrum */
219
spx_word32_t *Smin; /**< See Cohen paper */
220
spx_word32_t *Stmp; /**< See Cohen paper */
221
int *update_prob; /**< Probability of speech presence for noise update */
223
spx_word16_t *zeta; /**< Smoothed a priori SNR */
224
spx_word32_t *echo_noise;
225
spx_word32_t *residual_echo;
228
spx_word16_t *inbuf; /**< Input buffer (overlapped analysis) */
229
spx_word16_t *outbuf; /**< Output buffer (for overlap and add) */
231
/* AGC stuff, only for floating point for now */
235
float loudness_accum;
236
float *loudness_weight; /**< Perceptual loudness curve */
237
float loudness; /**< Loudness estimate */
238
float agc_gain; /**< Current AGC gain */
239
float max_gain; /**< Maximum gain allowed */
240
float max_increase_step; /**< Maximum increase in gain from one frame to another */
241
float max_decrease_step; /**< Maximum decrease in gain from one frame to another */
242
float prev_loudness; /**< Loudness of previous frame */
243
float init_max; /**< Current gain limit during initialisation */
245
int nb_adapt; /**< Number of frames used for adaptation so far */
247
int min_count; /**< Number of frames processed so far */
248
void *fft_lookup; /**< Lookup table for the FFT */
255
static void conj_window(spx_word16_t *w, int len)
262
spx_word16_t x = DIV32_16(MULT16_16(32767,i),len);
264
spx_word16_t x = DIV32_16(MULT16_16(QCONST16(4.f,13),i),len);
267
if (x<QCONST16(1.f,13))
269
} else if (x<QCONST16(2.f,13))
271
x=QCONST16(2.f,13)-x;
273
} else if (x<QCONST16(3.f,13))
275
x=x-QCONST16(2.f,13);
278
x=QCONST16(2.f,13)-x+QCONST16(2.f,13); /* 4 - x */
280
x = MULT16_16_Q14(QCONST16(1.271903f,14), x);
281
tmp = SQR16_Q15(QCONST16(.5f,15)-MULT16_16_P15(QCONST16(.5f,15),spx_cos_norm(SHL32(EXTEND32(x),2))));
283
tmp=SUB16(Q15_ONE,tmp);
284
w[i]=spx_sqrt(SHL32(EXTEND32(tmp),15));
290
/* This function approximates the gain function
291
y = gamma(1.25)^2 * M(-.25;1;-x) / sqrt(x)
292
which multiplied by xi/(1+xi) is the optimal gain
293
in the loudness domain ( sqrt[amplitude] )
294
Input in Q11 format, output in Q15
296
static inline spx_word32_t hypergeom_gain(spx_word32_t xx)
301
static const spx_word16_t table[21] = {
302
6730, 8357, 9868, 11267, 12563, 13770, 14898,
303
15959, 16961, 17911, 18816, 19682, 20512, 21311,
304
22082, 22827, 23549, 24250, 24931, 25594, 26241};
309
return ADD32(EXTEND32(Q15_ONE),EXTEND32(DIV32_16(QCONST32(.1296,23), SHR32(xx,EXPIN_SHIFT-SNR_SHIFT))));
310
frac = SHL32(xx-SHL32(ind,10),5);
311
return SHL32(DIV32_16(PSHR32(MULT16_16(Q15_ONE-frac,table[ind]) + MULT16_16(frac,table[ind+1]),7),(spx_sqrt(SHL32(xx,15)+6711))),7);
314
static inline spx_word16_t qcurve(spx_word16_t x)
317
return DIV32_16(SHL32(EXTEND32(32767),9),ADD16(512,MULT16_16_Q15(QCONST16(.60f,15),DIV32_16(32767,x))));
320
/* Compute the gain floor based on different floors for the background noise and residual echo */
321
static void compute_gain_floor(int noise_suppress, int effective_echo_suppress, spx_word32_t *noise, spx_word32_t *echo, spx_word16_t *gain_floor, int len)
325
if (noise_suppress > effective_echo_suppress)
327
spx_word16_t noise_gain, gain_ratio;
328
noise_gain = EXTRACT16(MIN32(Q15_ONE,SHR32(spx_exp(MULT16_16(QCONST16(0.11513,11),noise_suppress)),1)));
329
gain_ratio = EXTRACT16(MIN32(Q15_ONE,SHR32(spx_exp(MULT16_16(QCONST16(.2302585f,11),effective_echo_suppress-noise_suppress)),1)));
331
/* gain_floor = sqrt [ (noise*noise_floor + echo*echo_floor) / (noise+echo) ] */
333
gain_floor[i] = MULT16_16_Q15(noise_gain,
334
spx_sqrt(SHL32(EXTEND32(DIV32_16_Q15(PSHR32(noise[i],NOISE_SHIFT) + MULT16_32_Q15(gain_ratio,echo[i]),
335
(1+PSHR32(noise[i],NOISE_SHIFT) + echo[i]) )),15)));
337
spx_word16_t echo_gain, gain_ratio;
338
echo_gain = EXTRACT16(MIN32(Q15_ONE,SHR32(spx_exp(MULT16_16(QCONST16(0.11513,11),effective_echo_suppress)),1)));
339
gain_ratio = EXTRACT16(MIN32(Q15_ONE,SHR32(spx_exp(MULT16_16(QCONST16(.2302585f,11),noise_suppress-effective_echo_suppress)),1)));
341
/* gain_floor = sqrt [ (noise*noise_floor + echo*echo_floor) / (noise+echo) ] */
343
gain_floor[i] = MULT16_16_Q15(echo_gain,
344
spx_sqrt(SHL32(EXTEND32(DIV32_16_Q15(MULT16_32_Q15(gain_ratio,PSHR32(noise[i],NOISE_SHIFT)) + echo[i],
345
(1+PSHR32(noise[i],NOISE_SHIFT) + echo[i]) )),15)));
350
/* This function approximates the gain function
351
y = gamma(1.25)^2 * M(-.25;1;-x) / sqrt(x)
352
which multiplied by xi/(1+xi) is the optimal gain
353
in the loudness domain ( sqrt[amplitude] )
355
static inline spx_word32_t hypergeom_gain(spx_word32_t xx)
360
static const float table[21] = {
361
0.82157f, 1.02017f, 1.20461f, 1.37534f, 1.53363f, 1.68092f, 1.81865f,
362
1.94811f, 2.07038f, 2.18638f, 2.29688f, 2.40255f, 2.50391f, 2.60144f,
363
2.69551f, 2.78647f, 2.87458f, 2.96015f, 3.04333f, 3.12431f, 3.20326f};
364
x = EXPIN_SCALING_1*xx;
365
integer = floor(2*x);
370
return FRAC_SCALING*(1+.1296/x);
372
return FRAC_SCALING*((1-frac)*table[ind] + frac*table[ind+1])/sqrt(x+.0001f);
375
static inline spx_word16_t qcurve(spx_word16_t x)
377
return 1.f/(1.f+.15f/(SNR_SCALING_1*x));
380
static void compute_gain_floor(int noise_suppress, int effective_echo_suppress, spx_word32_t *noise, spx_word32_t *echo, spx_word16_t *gain_floor, int len)
386
noise_floor = exp(.2302585f*noise_suppress);
387
echo_floor = exp(.2302585f*effective_echo_suppress);
389
/* Compute the gain floor based on different floors for the background noise and residual echo */
391
gain_floor[i] = FRAC_SCALING*sqrt(noise_floor*PSHR32(noise[i],NOISE_SHIFT) + echo_floor*echo[i])/sqrt(1+PSHR32(noise[i],NOISE_SHIFT) + echo[i]);
395
EXPORT SpeexPreprocessState *speex_preprocess_state_init(int frame_size, int sampling_rate)
400
SpeexPreprocessState *st = (SpeexPreprocessState *)speex_alloc(sizeof(SpeexPreprocessState));
401
st->frame_size = frame_size;
403
/* Round ps_size down to the nearest power of two */
406
st->ps_size = st->frame_size;
409
if (st->ps_size & ~i)
419
if (st->ps_size < 3*st->frame_size/4)
420
st->ps_size = st->ps_size * 3 / 2;
422
st->ps_size = st->frame_size;
426
N3 = 2*N - st->frame_size;
427
N4 = st->frame_size - N3;
429
st->sampling_rate = sampling_rate;
430
st->denoise_enabled = 1;
432
st->dereverb_enabled = 0;
433
st->reverb_decay = 0;
434
st->reverb_level = 0;
435
st->noise_suppress = NOISE_SUPPRESS_DEFAULT;
436
st->echo_suppress = ECHO_SUPPRESS_DEFAULT;
437
st->echo_suppress_active = ECHO_SUPPRESS_ACTIVE_DEFAULT;
439
st->speech_prob_start = SPEECH_PROB_START_DEFAULT;
440
st->speech_prob_continue = SPEECH_PROB_CONTINUE_DEFAULT;
442
st->echo_state = NULL;
444
st->nbands = NB_BANDS;
446
st->bank = filterbank_new(M, sampling_rate, N, 1);
448
st->frame = (spx_word16_t*)speex_alloc(2*N*sizeof(spx_word16_t));
449
st->window = (spx_word16_t*)speex_alloc(2*N*sizeof(spx_word16_t));
450
st->ft = (spx_word16_t*)speex_alloc(2*N*sizeof(spx_word16_t));
452
st->ps = (spx_word32_t*)speex_alloc((N+M)*sizeof(spx_word32_t));
453
st->noise = (spx_word32_t*)speex_alloc((N+M)*sizeof(spx_word32_t));
454
st->echo_noise = (spx_word32_t*)speex_alloc((N+M)*sizeof(spx_word32_t));
455
st->residual_echo = (spx_word32_t*)speex_alloc((N+M)*sizeof(spx_word32_t));
456
st->reverb_estimate = (spx_word32_t*)speex_alloc((N+M)*sizeof(spx_word32_t));
457
st->old_ps = (spx_word32_t*)speex_alloc((N+M)*sizeof(spx_word32_t));
458
st->prior = (spx_word16_t*)speex_alloc((N+M)*sizeof(spx_word16_t));
459
st->post = (spx_word16_t*)speex_alloc((N+M)*sizeof(spx_word16_t));
460
st->gain = (spx_word16_t*)speex_alloc((N+M)*sizeof(spx_word16_t));
461
st->gain2 = (spx_word16_t*)speex_alloc((N+M)*sizeof(spx_word16_t));
462
st->gain_floor = (spx_word16_t*)speex_alloc((N+M)*sizeof(spx_word16_t));
463
st->zeta = (spx_word16_t*)speex_alloc((N+M)*sizeof(spx_word16_t));
465
st->S = (spx_word32_t*)speex_alloc(N*sizeof(spx_word32_t));
466
st->Smin = (spx_word32_t*)speex_alloc(N*sizeof(spx_word32_t));
467
st->Stmp = (spx_word32_t*)speex_alloc(N*sizeof(spx_word32_t));
468
st->update_prob = (int*)speex_alloc(N*sizeof(int));
470
st->inbuf = (spx_word16_t*)speex_alloc(N3*sizeof(spx_word16_t));
471
st->outbuf = (spx_word16_t*)speex_alloc(N3*sizeof(spx_word16_t));
473
conj_window(st->window, 2*N3);
474
for (i=2*N3;i<2*st->ps_size;i++)
475
st->window[i]=Q15_ONE;
479
for (i=N3-1;i>=0;i--)
481
st->window[i+N3+N4]=st->window[i+N3];
487
st->noise[i]=QCONST32(1.f,NOISE_SHIFT);
488
st->reverb_estimate[i]=0;
491
st->post[i]=SHL16(1, SNR_SHIFT);
492
st->prior[i]=SHL16(1, SNR_SHIFT);
496
st->update_prob[i] = 1;
504
st->agc_level = 8000;
505
st->loudness_weight = (float*)speex_alloc(N*sizeof(float));
508
float ff=((float)i)*.5*sampling_rate/((float)N);
509
/*st->loudness_weight[i] = .5f*(1.f/(1.f+ff/8000.f))+1.f*exp(-.5f*(ff-3800.f)*(ff-3800.f)/9e5f);*/
510
st->loudness_weight[i] = .35f-.35f*ff/16000.f+.73f*exp(-.5f*(ff-3800)*(ff-3800)/9e5f);
511
if (st->loudness_weight[i]<.01f)
512
st->loudness_weight[i]=.01f;
513
st->loudness_weight[i] *= st->loudness_weight[i];
515
/*st->loudness = pow(AMP_SCALE*st->agc_level,LOUDNESS_EXP);*/
516
st->loudness = 1e-15;
519
st->max_increase_step = exp(0.11513f * 12.*st->frame_size / st->sampling_rate);
520
st->max_decrease_step = exp(-0.11513f * 40.*st->frame_size / st->sampling_rate);
521
st->prev_loudness = 1;
526
st->fft_lookup = spx_fft_init(2*N);
533
EXPORT void speex_preprocess_state_destroy(SpeexPreprocessState *st)
535
speex_free(st->frame);
538
speex_free(st->gain2);
539
speex_free(st->gain_floor);
540
speex_free(st->window);
541
speex_free(st->noise);
542
speex_free(st->reverb_estimate);
543
speex_free(st->old_ps);
544
speex_free(st->gain);
545
speex_free(st->prior);
546
speex_free(st->post);
548
speex_free(st->loudness_weight);
550
speex_free(st->echo_noise);
551
speex_free(st->residual_echo);
554
speex_free(st->Smin);
555
speex_free(st->Stmp);
556
speex_free(st->update_prob);
557
speex_free(st->zeta);
559
speex_free(st->inbuf);
560
speex_free(st->outbuf);
562
spx_fft_destroy(st->fft_lookup);
563
filterbank_destroy(st->bank);
567
/* FIXME: The AGC doesn't work yet with fixed-point*/
569
static void speex_compute_agc(SpeexPreprocessState *st, spx_word16_t Pframe, spx_word16_t *ft)
579
loudness += 2.f*N*st->ps[i]* st->loudness_weight[i];
581
loudness=sqrt(loudness);
582
/*if (loudness < 2*pow(st->loudness, 1.0/LOUDNESS_EXP) &&
583
loudness*2 > pow(st->loudness, 1.0/LOUDNESS_EXP))*/
586
/*rate=2.0f*Pframe*Pframe/(1+st->nb_loudness_adapt);*/
587
rate = .03*Pframe*Pframe;
588
st->loudness = (1-rate)*st->loudness + (rate)*pow(AMP_SCALE*loudness, LOUDNESS_EXP);
589
st->loudness_accum = (1-rate)*st->loudness_accum + rate;
590
if (st->init_max < st->max_gain && st->nb_adapt > 20)
591
st->init_max *= 1.f + .1f*Pframe*Pframe;
593
/*printf ("%f %f %f %f\n", Pframe, loudness, pow(st->loudness, 1.0f/LOUDNESS_EXP), st->loudness2);*/
595
target_gain = AMP_SCALE*st->agc_level*pow(st->loudness/(1e-4+st->loudness_accum), -1.0f/LOUDNESS_EXP);
597
if ((Pframe>.5 && st->nb_adapt > 20) || target_gain < st->agc_gain)
599
if (target_gain > st->max_increase_step*st->agc_gain)
600
target_gain = st->max_increase_step*st->agc_gain;
601
if (target_gain < st->max_decrease_step*st->agc_gain && loudness < 10*st->prev_loudness)
602
target_gain = st->max_decrease_step*st->agc_gain;
603
if (target_gain > st->max_gain)
604
target_gain = st->max_gain;
605
if (target_gain > st->init_max)
606
target_gain = st->init_max;
608
st->agc_gain = target_gain;
610
/*fprintf (stderr, "%f %f %f\n", loudness, (float)AMP_SCALE_1*pow(st->loudness, 1.0f/LOUDNESS_EXP), st->agc_gain);*/
613
ft[i] *= st->agc_gain;
614
st->prev_loudness = loudness;
618
static void preprocess_analysis(SpeexPreprocessState *st, spx_int16_t *x)
622
int N3 = 2*N - st->frame_size;
623
int N4 = st->frame_size - N3;
624
spx_word32_t *ps=st->ps;
626
/* 'Build' input frame */
628
st->frame[i]=st->inbuf[i];
629
for (i=0;i<st->frame_size;i++)
630
st->frame[N3+i]=x[i];
634
st->inbuf[i]=x[N4+i];
638
st->frame[i] = MULT16_16_Q15(st->frame[i], st->window[i]);
642
spx_word16_t max_val=0;
644
max_val = MAX16(max_val, ABS16(st->frame[i]));
645
st->frame_shift = 14-spx_ilog2(EXTEND32(max_val));
647
st->frame[i] = SHL16(st->frame[i], st->frame_shift);
652
spx_fft(st->fft_lookup, st->frame, st->ft);
655
ps[0]=MULT16_16(st->ft[0],st->ft[0]);
657
ps[i]=MULT16_16(st->ft[2*i-1],st->ft[2*i-1]) + MULT16_16(st->ft[2*i],st->ft[2*i]);
659
st->ps[i] = PSHR32(st->ps[i], 2*st->frame_shift);
661
filterbank_compute_bank32(st->bank, ps, ps+N);
664
static void update_noise_prob(SpeexPreprocessState *st)
671
st->S[i] = MULT16_32_Q15(QCONST16(.8f,15),st->S[i]) + MULT16_32_Q15(QCONST16(.05f,15),st->ps[i-1])
672
+ MULT16_32_Q15(QCONST16(.1f,15),st->ps[i]) + MULT16_32_Q15(QCONST16(.05f,15),st->ps[i+1]);
673
st->S[0] = MULT16_32_Q15(QCONST16(.8f,15),st->S[0]) + MULT16_32_Q15(QCONST16(.2f,15),st->ps[0]);
674
st->S[N-1] = MULT16_32_Q15(QCONST16(.8f,15),st->S[N-1]) + MULT16_32_Q15(QCONST16(.2f,15),st->ps[N-1]);
679
st->Smin[i] = st->Stmp[i] = 0;
682
if (st->nb_adapt < 100)
684
else if (st->nb_adapt < 1000)
686
else if (st->nb_adapt < 10000)
690
if (st->min_count > min_range)
695
st->Smin[i] = MIN32(st->Stmp[i], st->S[i]);
696
st->Stmp[i] = st->S[i];
701
st->Smin[i] = MIN32(st->Smin[i], st->S[i]);
702
st->Stmp[i] = MIN32(st->Stmp[i], st->S[i]);
707
if (MULT16_32_Q15(QCONST16(.4f,15),st->S[i]) > st->Smin[i])
708
st->update_prob[i] = 1;
710
st->update_prob[i] = 0;
711
/*fprintf (stderr, "%f ", st->S[i]/st->Smin[i]);*/
712
/*fprintf (stderr, "%f ", st->update_prob[i]);*/
717
#define NOISE_OVERCOMPENS 1.
719
void speex_echo_get_residual(SpeexEchoState *st, spx_word32_t *Yout, int len);
721
EXPORT int speex_preprocess(SpeexPreprocessState *st, spx_int16_t *x, spx_int32_t *echo)
723
return speex_preprocess_run(st, x);
726
EXPORT int speex_preprocess_run(SpeexPreprocessState *st, spx_int16_t *x)
731
int N3 = 2*N - st->frame_size;
732
int N4 = st->frame_size - N3;
733
spx_word32_t *ps=st->ps;
736
spx_word16_t beta, beta_1;
737
spx_word16_t effective_echo_suppress;
740
if (st->nb_adapt>20000)
741
st->nb_adapt = 20000;
744
beta = MAX16(QCONST16(.03,15),DIV32_16(Q15_ONE,st->nb_adapt));
745
beta_1 = Q15_ONE-beta;
747
/* Deal with residual echo if provided */
750
speex_echo_get_residual(st->echo_state, st->residual_echo, N);
752
/* If there are NaNs or ridiculous values, it'll show up in the DC and we just reset everything to zero */
753
if (!(st->residual_echo[0] >=0 && st->residual_echo[0]<N*1e9f))
756
st->residual_echo[i] = 0;
760
st->echo_noise[i] = MAX32(MULT16_32_Q15(QCONST16(.6f,15),st->echo_noise[i]), st->residual_echo[i]);
761
filterbank_compute_bank32(st->bank, st->echo_noise, st->echo_noise+N);
764
st->echo_noise[i] = 0;
766
preprocess_analysis(st, x);
768
update_noise_prob(st);
770
/* Noise estimation always updated for the 10 first frames */
771
/*if (st->nb_adapt<10)
774
st->update_prob[i] = 0;
778
/* Update the noise estimate for the frequencies where it can be */
781
if (!st->update_prob[i] || st->ps[i] < PSHR32(st->noise[i], NOISE_SHIFT))
782
st->noise[i] = MAX32(EXTEND32(0),MULT16_32_Q15(beta_1,st->noise[i]) + MULT16_32_Q15(beta,SHL32(st->ps[i],NOISE_SHIFT)));
784
filterbank_compute_bank32(st->bank, st->noise, st->noise+N);
786
/* Special case for first frame */
789
st->old_ps[i] = ps[i];
791
/* Compute a posteriori SNR */
796
/* Total noise estimate including residual echo and reverberation */
797
spx_word32_t tot_noise = ADD32(ADD32(ADD32(EXTEND32(1), PSHR32(st->noise[i],NOISE_SHIFT)) , st->echo_noise[i]) , st->reverb_estimate[i]);
799
/* A posteriori SNR = ps/noise - 1*/
800
st->post[i] = SUB16(DIV32_16_Q8(ps[i],tot_noise), QCONST16(1.f,SNR_SHIFT));
801
st->post[i]=MIN16(st->post[i], QCONST16(100.f,SNR_SHIFT));
803
/* Computing update gamma = .1 + .9*(old/(old+noise))^2 */
804
gamma = QCONST16(.1f,15)+MULT16_16_Q15(QCONST16(.89f,15),SQR16_Q15(DIV32_16_Q15(st->old_ps[i],ADD32(st->old_ps[i],tot_noise))));
806
/* A priori SNR update = gamma*max(0,post) + (1-gamma)*old/noise */
807
st->prior[i] = EXTRACT16(PSHR32(ADD32(MULT16_16(gamma,MAX16(0,st->post[i])), MULT16_16(Q15_ONE-gamma,DIV32_16_Q8(st->old_ps[i],tot_noise))), 15));
808
st->prior[i]=MIN16(st->prior[i], QCONST16(100.f,SNR_SHIFT));
811
/*print_vec(st->post, N+M, "");*/
813
/* Recursive average of the a priori SNR. A bit smoothed for the psd components */
814
st->zeta[0] = PSHR32(ADD32(MULT16_16(QCONST16(.7f,15),st->zeta[0]), MULT16_16(QCONST16(.3f,15),st->prior[0])),15);
816
st->zeta[i] = PSHR32(ADD32(ADD32(ADD32(MULT16_16(QCONST16(.7f,15),st->zeta[i]), MULT16_16(QCONST16(.15f,15),st->prior[i])),
817
MULT16_16(QCONST16(.075f,15),st->prior[i-1])), MULT16_16(QCONST16(.075f,15),st->prior[i+1])),15);
818
for (i=N-1;i<N+M;i++)
819
st->zeta[i] = PSHR32(ADD32(MULT16_16(QCONST16(.7f,15),st->zeta[i]), MULT16_16(QCONST16(.3f,15),st->prior[i])),15);
821
/* Speech probability of presence for the entire frame is based on the average filterbank a priori SNR */
824
Zframe = ADD32(Zframe, EXTEND32(st->zeta[i]));
825
Pframe = QCONST16(.1f,15)+MULT16_16_Q15(QCONST16(.899f,15),qcurve(DIV32_16(Zframe,st->nbands)));
827
effective_echo_suppress = EXTRACT16(PSHR32(ADD32(MULT16_16(SUB16(Q15_ONE,Pframe), st->echo_suppress), MULT16_16(Pframe, st->echo_suppress_active)),15));
829
compute_gain_floor(st->noise_suppress, effective_echo_suppress, st->noise+N, st->echo_noise+N, st->gain_floor+N, M);
831
/* Compute Ephraim & Malah gain speech probability of presence for each critical band (Bark scale)
832
Technically this is actually wrong because the EM gaim assumes a slightly different probability
836
/* See EM and Cohen papers*/
838
/* Gain from hypergeometric function */
840
/* Weiner filter gain */
841
spx_word16_t prior_ratio;
842
/* a priority probability of speech presence based on Bark sub-band alone */
844
/* Speech absence a priori probability (considering sub-band and frame) */
850
prior_ratio = PDIV32_16(SHL32(EXTEND32(st->prior[i]), 15), ADD16(st->prior[i], SHL32(1,SNR_SHIFT)));
851
theta = MULT16_32_P15(prior_ratio, QCONST32(1.f,EXPIN_SHIFT)+SHL32(EXTEND32(st->post[i]),EXPIN_SHIFT-SNR_SHIFT));
853
MM = hypergeom_gain(theta);
854
/* Gain with bound */
855
st->gain[i] = EXTRACT16(MIN32(Q15_ONE, MULT16_32_Q15(prior_ratio, MM)));
856
/* Save old Bark power spectrum */
857
st->old_ps[i] = MULT16_32_P15(QCONST16(.2f,15),st->old_ps[i]) + MULT16_32_P15(MULT16_16_P15(QCONST16(.8f,15),SQR16_Q15(st->gain[i])),ps[i]);
859
P1 = QCONST16(.199f,15)+MULT16_16_Q15(QCONST16(.8f,15),qcurve (st->zeta[i]));
860
q = Q15_ONE-MULT16_16_Q15(Pframe,P1);
862
theta = MIN32(theta, EXTEND32(32767));
863
/*Q8*/tmp = MULT16_16_Q15((SHL32(1,SNR_SHIFT)+st->prior[i]),EXTRACT16(MIN32(Q15ONE,SHR32(spx_exp(-EXTRACT16(theta)),1))));
864
tmp = MIN16(QCONST16(3.,SNR_SHIFT), tmp); /* Prevent overflows in the next line*/
865
/*Q8*/tmp = EXTRACT16(PSHR32(MULT16_16(PDIV32_16(SHL32(EXTEND32(q),8),(Q15_ONE-q)),tmp),8));
866
st->gain2[i]=DIV32_16(SHL32(EXTEND32(32767),SNR_SHIFT), ADD16(256,tmp));
868
st->gain2[i]=1/(1.f + (q/(1.f-q))*(1+st->prior[i])*exp(-theta));
871
/* Convert the EM gains and speech prob to linear frequency */
872
filterbank_compute_psd16(st->bank,st->gain2+N, st->gain2);
873
filterbank_compute_psd16(st->bank,st->gain+N, st->gain);
875
/* Use 1 for linear gain resolution (best) or 0 for Bark gain resolution (faster) */
878
filterbank_compute_psd16(st->bank,st->gain_floor+N, st->gain_floor);
880
/* Compute gain according to the Ephraim-Malah algorithm -- linear frequency */
885
spx_word16_t prior_ratio;
890
/* Wiener filter gain */
891
prior_ratio = PDIV32_16(SHL32(EXTEND32(st->prior[i]), 15), ADD16(st->prior[i], SHL32(1,SNR_SHIFT)));
892
theta = MULT16_32_P15(prior_ratio, QCONST32(1.f,EXPIN_SHIFT)+SHL32(EXTEND32(st->post[i]),EXPIN_SHIFT-SNR_SHIFT));
894
/* Optimal estimator for loudness domain */
895
MM = hypergeom_gain(theta);
896
/* EM gain with bound */
897
g = EXTRACT16(MIN32(Q15_ONE, MULT16_32_Q15(prior_ratio, MM)));
898
/* Interpolated speech probability of presence */
901
/* Constrain the gain to be close to the Bark scale gain */
902
if (MULT16_16_Q15(QCONST16(.333f,15),g) > st->gain[i])
903
g = MULT16_16(3,st->gain[i]);
906
/* Save old power spectrum */
907
st->old_ps[i] = MULT16_32_P15(QCONST16(.2f,15),st->old_ps[i]) + MULT16_32_P15(MULT16_16_P15(QCONST16(.8f,15),SQR16_Q15(st->gain[i])),ps[i]);
909
/* Apply gain floor */
910
if (st->gain[i] < st->gain_floor[i])
911
st->gain[i] = st->gain_floor[i];
913
/* Exponential decay model for reverberation (unused) */
914
/*st->reverb_estimate[i] = st->reverb_decay*st->reverb_estimate[i] + st->reverb_decay*st->reverb_level*st->gain[i]*st->gain[i]*st->ps[i];*/
916
/* Take into account speech probability of presence (loudness domain MMSE estimator) */
917
/* gain2 = [p*sqrt(gain)+(1-p)*sqrt(gain _floor) ]^2 */
918
tmp = MULT16_16_P15(p,spx_sqrt(SHL32(EXTEND32(st->gain[i]),15))) + MULT16_16_P15(SUB16(Q15_ONE,p),spx_sqrt(SHL32(EXTEND32(st->gain_floor[i]),15)));
919
st->gain2[i]=SQR16_Q15(tmp);
921
/* Use this if you want a log-domain MMSE estimator instead */
922
/*st->gain2[i] = pow(st->gain[i], p) * pow(st->gain_floor[i],1.f-p);*/
928
spx_word16_t p = st->gain2[i];
929
st->gain[i] = MAX16(st->gain[i], st->gain_floor[i]);
930
tmp = MULT16_16_P15(p,spx_sqrt(SHL32(EXTEND32(st->gain[i]),15))) + MULT16_16_P15(SUB16(Q15_ONE,p),spx_sqrt(SHL32(EXTEND32(st->gain_floor[i]),15)));
931
st->gain2[i]=SQR16_Q15(tmp);
933
filterbank_compute_psd16(st->bank,st->gain2+N, st->gain2);
936
/* If noise suppression is off, don't apply the gain (but then why call this in the first place!) */
937
if (!st->denoise_enabled)
940
st->gain2[i]=Q15_ONE;
943
/* Apply computed gain */
946
st->ft[2*i-1] = MULT16_16_P15(st->gain2[i],st->ft[2*i-1]);
947
st->ft[2*i] = MULT16_16_P15(st->gain2[i],st->ft[2*i]);
949
st->ft[0] = MULT16_16_P15(st->gain2[0],st->ft[0]);
950
st->ft[2*N-1] = MULT16_16_P15(st->gain2[N-1],st->ft[2*N-1]);
952
/*FIXME: This *will* not work for fixed-point */
955
speex_compute_agc(st, Pframe, st->ft);
958
/* Inverse FFT with 1/N scaling */
959
spx_ifft(st->fft_lookup, st->ft, st->frame);
960
/* Scale back to original (lower) amplitude */
962
st->frame[i] = PSHR16(st->frame[i], st->frame_shift);
964
/*FIXME: This *will* not work for fixed-point */
970
if (fabs(st->frame[i])>max_sample)
971
max_sample = fabs(st->frame[i]);
972
if (max_sample>28000.f)
974
float damp = 28000.f/max_sample;
976
st->frame[i] *= damp;
981
/* Synthesis window (for WOLA) */
983
st->frame[i] = MULT16_16_Q15(st->frame[i], st->window[i]);
985
/* Perform overlap and add */
987
x[i] = st->outbuf[i] + st->frame[i];
989
x[N3+i] = st->frame[N3+i];
993
st->outbuf[i] = st->frame[st->frame_size+i];
995
/* FIXME: This VAD is a kludge */
996
st->speech_prob = Pframe;
999
if (st->speech_prob > st->speech_prob_start || (st->was_speech && st->speech_prob > st->speech_prob_continue))
1013
EXPORT void speex_preprocess_estimate_update(SpeexPreprocessState *st, spx_int16_t *x)
1016
int N = st->ps_size;
1017
int N3 = 2*N - st->frame_size;
1019
spx_word32_t *ps=st->ps;
1024
preprocess_analysis(st, x);
1026
update_noise_prob(st);
1030
if (!st->update_prob[i] || st->ps[i] < PSHR32(st->noise[i],NOISE_SHIFT))
1032
st->noise[i] = MULT16_32_Q15(QCONST16(.95f,15),st->noise[i]) + MULT16_32_Q15(QCONST16(.05f,15),SHL32(st->ps[i],NOISE_SHIFT));
1037
st->outbuf[i] = MULT16_16_Q15(x[st->frame_size-N3+i],st->window[st->frame_size+i]);
1039
/* Save old power spectrum */
1041
st->old_ps[i] = ps[i];
1044
st->reverb_estimate[i] = MULT16_32_Q15(st->reverb_decay, st->reverb_estimate[i]);
1048
EXPORT int speex_preprocess_ctl(SpeexPreprocessState *state, int request, void *ptr)
1051
SpeexPreprocessState *st;
1052
st=(SpeexPreprocessState*)state;
1055
case SPEEX_PREPROCESS_SET_DENOISE:
1056
st->denoise_enabled = (*(spx_int32_t*)ptr);
1058
case SPEEX_PREPROCESS_GET_DENOISE:
1059
(*(spx_int32_t*)ptr) = st->denoise_enabled;
1062
case SPEEX_PREPROCESS_SET_AGC:
1063
st->agc_enabled = (*(spx_int32_t*)ptr);
1065
case SPEEX_PREPROCESS_GET_AGC:
1066
(*(spx_int32_t*)ptr) = st->agc_enabled;
1068
#ifndef DISABLE_FLOAT_API
1069
case SPEEX_PREPROCESS_SET_AGC_LEVEL:
1070
st->agc_level = (*(float*)ptr);
1071
if (st->agc_level<1)
1073
if (st->agc_level>32768)
1074
st->agc_level=32768;
1076
case SPEEX_PREPROCESS_GET_AGC_LEVEL:
1077
(*(float*)ptr) = st->agc_level;
1079
#endif /* #ifndef DISABLE_FLOAT_API */
1080
case SPEEX_PREPROCESS_SET_AGC_INCREMENT:
1081
st->max_increase_step = exp(0.11513f * (*(spx_int32_t*)ptr)*st->frame_size / st->sampling_rate);
1083
case SPEEX_PREPROCESS_GET_AGC_INCREMENT:
1084
(*(spx_int32_t*)ptr) = floor(.5+8.6858*log(st->max_increase_step)*st->sampling_rate/st->frame_size);
1086
case SPEEX_PREPROCESS_SET_AGC_DECREMENT:
1087
st->max_decrease_step = exp(0.11513f * (*(spx_int32_t*)ptr)*st->frame_size / st->sampling_rate);
1089
case SPEEX_PREPROCESS_GET_AGC_DECREMENT:
1090
(*(spx_int32_t*)ptr) = floor(.5+8.6858*log(st->max_decrease_step)*st->sampling_rate/st->frame_size);
1092
case SPEEX_PREPROCESS_SET_AGC_MAX_GAIN:
1093
st->max_gain = exp(0.11513f * (*(spx_int32_t*)ptr));
1095
case SPEEX_PREPROCESS_GET_AGC_MAX_GAIN:
1096
(*(spx_int32_t*)ptr) = floor(.5+8.6858*log(st->max_gain));
1099
case SPEEX_PREPROCESS_SET_VAD:
1100
speex_warning("The VAD has been replaced by a hack pending a complete rewrite");
1101
st->vad_enabled = (*(spx_int32_t*)ptr);
1103
case SPEEX_PREPROCESS_GET_VAD:
1104
(*(spx_int32_t*)ptr) = st->vad_enabled;
1107
case SPEEX_PREPROCESS_SET_DEREVERB:
1108
st->dereverb_enabled = (*(spx_int32_t*)ptr);
1109
for (i=0;i<st->ps_size;i++)
1110
st->reverb_estimate[i]=0;
1112
case SPEEX_PREPROCESS_GET_DEREVERB:
1113
(*(spx_int32_t*)ptr) = st->dereverb_enabled;
1116
case SPEEX_PREPROCESS_SET_DEREVERB_LEVEL:
1117
/* FIXME: Re-enable when de-reverberation is actually enabled again */
1118
/*st->reverb_level = (*(float*)ptr);*/
1120
case SPEEX_PREPROCESS_GET_DEREVERB_LEVEL:
1121
/* FIXME: Re-enable when de-reverberation is actually enabled again */
1122
/*(*(float*)ptr) = st->reverb_level;*/
1125
case SPEEX_PREPROCESS_SET_DEREVERB_DECAY:
1126
/* FIXME: Re-enable when de-reverberation is actually enabled again */
1127
/*st->reverb_decay = (*(float*)ptr);*/
1129
case SPEEX_PREPROCESS_GET_DEREVERB_DECAY:
1130
/* FIXME: Re-enable when de-reverberation is actually enabled again */
1131
/*(*(float*)ptr) = st->reverb_decay;*/
1134
case SPEEX_PREPROCESS_SET_PROB_START:
1135
*(spx_int32_t*)ptr = MIN32(100,MAX32(0, *(spx_int32_t*)ptr));
1136
st->speech_prob_start = DIV32_16(MULT16_16(Q15ONE,*(spx_int32_t*)ptr), 100);
1138
case SPEEX_PREPROCESS_GET_PROB_START:
1139
(*(spx_int32_t*)ptr) = MULT16_16_Q15(st->speech_prob_start, 100);
1142
case SPEEX_PREPROCESS_SET_PROB_CONTINUE:
1143
*(spx_int32_t*)ptr = MIN32(100,MAX32(0, *(spx_int32_t*)ptr));
1144
st->speech_prob_continue = DIV32_16(MULT16_16(Q15ONE,*(spx_int32_t*)ptr), 100);
1146
case SPEEX_PREPROCESS_GET_PROB_CONTINUE:
1147
(*(spx_int32_t*)ptr) = MULT16_16_Q15(st->speech_prob_continue, 100);
1150
case SPEEX_PREPROCESS_SET_NOISE_SUPPRESS:
1151
st->noise_suppress = -ABS(*(spx_int32_t*)ptr);
1153
case SPEEX_PREPROCESS_GET_NOISE_SUPPRESS:
1154
(*(spx_int32_t*)ptr) = st->noise_suppress;
1156
case SPEEX_PREPROCESS_SET_ECHO_SUPPRESS:
1157
st->echo_suppress = -ABS(*(spx_int32_t*)ptr);
1159
case SPEEX_PREPROCESS_GET_ECHO_SUPPRESS:
1160
(*(spx_int32_t*)ptr) = st->echo_suppress;
1162
case SPEEX_PREPROCESS_SET_ECHO_SUPPRESS_ACTIVE:
1163
st->echo_suppress_active = -ABS(*(spx_int32_t*)ptr);
1165
case SPEEX_PREPROCESS_GET_ECHO_SUPPRESS_ACTIVE:
1166
(*(spx_int32_t*)ptr) = st->echo_suppress_active;
1168
case SPEEX_PREPROCESS_SET_ECHO_STATE:
1169
st->echo_state = (SpeexEchoState*)ptr;
1171
case SPEEX_PREPROCESS_GET_ECHO_STATE:
1172
(*(SpeexEchoState**)ptr) = (SpeexEchoState*)st->echo_state;
1175
case SPEEX_PREPROCESS_GET_AGC_LOUDNESS:
1176
(*(spx_int32_t*)ptr) = pow(st->loudness, 1.0/LOUDNESS_EXP);
1178
case SPEEX_PREPROCESS_GET_AGC_GAIN:
1179
(*(spx_int32_t*)ptr) = floor(.5+8.6858*log(st->agc_gain));
1182
case SPEEX_PREPROCESS_GET_PSD_SIZE:
1183
case SPEEX_PREPROCESS_GET_NOISE_PSD_SIZE:
1184
(*(spx_int32_t*)ptr) = st->ps_size;
1186
case SPEEX_PREPROCESS_GET_PSD:
1187
for(i=0;i<st->ps_size;i++)
1188
((spx_int32_t *)ptr)[i] = (spx_int32_t) st->ps[i];
1190
case SPEEX_PREPROCESS_GET_NOISE_PSD:
1191
for(i=0;i<st->ps_size;i++)
1192
((spx_int32_t *)ptr)[i] = (spx_int32_t) PSHR32(st->noise[i], NOISE_SHIFT);
1194
case SPEEX_PREPROCESS_GET_PROB:
1195
(*(spx_int32_t*)ptr) = MULT16_16_Q15(st->speech_prob, 100);
1198
case SPEEX_PREPROCESS_SET_AGC_TARGET:
1199
st->agc_level = (*(spx_int32_t*)ptr);
1200
if (st->agc_level<1)
1202
if (st->agc_level>32768)
1203
st->agc_level=32768;
1205
case SPEEX_PREPROCESS_GET_AGC_TARGET:
1206
(*(spx_int32_t*)ptr) = st->agc_level;
1210
speex_warning_int("Unknown speex_preprocess_ctl request: ", request);
1217
long long spx_mips=0;