1
/* ====================================================================
2
* Copyright (c) 1995-2000 Carnegie Mellon University. All rights
5
* Redistribution and use in source and binary forms, with or without
6
* modification, are permitted provided that the following conditions
9
* 1. Redistributions of source code must retain the above copyright
10
* notice, this list of conditions and the following disclaimer.
12
* 2. Redistributions in binary form must reproduce the above copyright
13
* notice, this list of conditions and the following disclaimer in
14
* the documentation and/or other materials provided with the
17
* This work was supported in part by funding from the Defense Advanced
18
* Research Projects Agency and the National Science Foundation of the
19
* United States of America, and the CMU Sphinx Speech Consortium.
21
* THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
22
* ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
23
* THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
24
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
25
* NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33
* ====================================================================
36
/*********************************************************************
41
* This is the top level routine for SPHINX-III Baum-Welch
45
* Eric Thayer (eht+@cmu.edu) 20-Jun-95
47
*********************************************************************/
49
#include "train_cmd_ln.h"
52
#include "next_utt_states.h"
53
#include "baum_welch.h"
56
#include <s3/common.h>
57
#include <s3/mk_phone_list.h>
58
#include <s3/cvt2triphone.h>
59
#include <s3/mk_sseq.h>
60
#include <s3/mk_trans_seq.h>
61
#include <s3/model_inventory.h>
62
#include <s3/model_def_io.h>
63
#include <s3/s3ts2cb_io.h>
65
#include <s3/mllr_io.h>
67
#include <s3/s3cb2mllr_io.h>
68
#include <sys_compat/misc.h>
69
#include <sys_compat/time.h>
70
#include <sys_compat/file.h>
72
#include <sphinxbase/ckd_alloc.h>
73
#include <sphinxbase/profile.h>
74
#include <sphinxbase/feat.h>
82
#define DUMP_RETRY_PERIOD 3 /* If a count dump fails, retry every # of sec's */
84
/* the following parameters are used for MMIE training */
85
/* Initial value assigned to each arc's alpha, beta and gamma and to the
 * lattice probability in lat_fwd_bwd() before scores are accumulated with
 * log_add(); presumably stands in for log(0) -- confirm. */
#define LOG_ZERO -1.0E10
86
/* lat_fwd_bwd() divides each arc's acoustic score by this factor before
 * adding the language-model score. */
static float32 lm_scale = 11.5;
88
/* FIXME: Should go in libutil */
90
string_join(const char *base, ...)
99
while ((c = va_arg(args, const char *)) != NULL) {
105
out = ckd_calloc(len, 1);
106
va_start(args, base);
108
while ((c = va_arg(args, const char *)) != NULL) {
117
print_all_timers(bw_timers_t *timers, int32 n_frame)
119
printf(" utt %4.3fx %4.3fe"
124
" rsts %4.3fx %4.3fe"
125
" rstf %4.3fx %4.3fe"
126
" rstu %4.3fx %4.3fe",
128
timers->utt_timer.t_cpu/(n_frame*0.01),
129
(timers->utt_timer.t_cpu > 0 ? timers->utt_timer.t_elapsed / timers->utt_timer.t_cpu : 0.0),
131
timers->upd_timer.t_cpu/(n_frame*0.01),
132
(timers->upd_timer.t_cpu > 0 ? timers->upd_timer.t_elapsed / timers->upd_timer.t_cpu : 0.0),
134
timers->fwd_timer.t_cpu/(n_frame*0.01),
135
(timers->fwd_timer.t_cpu > 0 ? timers->fwd_timer.t_elapsed / timers->fwd_timer.t_cpu : 0.0),
137
timers->bwd_timer.t_cpu/(n_frame*0.01),
138
(timers->bwd_timer.t_cpu > 0 ? timers->bwd_timer.t_elapsed / timers->bwd_timer.t_cpu : 0.0),
140
timers->gau_timer.t_cpu/(n_frame*0.01),
141
(timers->gau_timer.t_cpu > 0 ? timers->gau_timer.t_elapsed / timers->gau_timer.t_cpu : 0.0),
143
timers->rsts_timer.t_cpu/(n_frame*0.01),
144
(timers->rsts_timer.t_cpu > 0 ? timers->rsts_timer.t_elapsed / timers->rsts_timer.t_cpu : 0.0),
146
timers->rstf_timer.t_cpu/(n_frame*0.01),
147
(timers->rstf_timer.t_cpu > 0 ? timers->rstf_timer.t_elapsed / timers->rstf_timer.t_cpu : 0.0),
149
timers->rstu_timer.t_cpu/(n_frame*0.01),
150
(timers->rstu_timer.t_cpu > 0 ? timers->rstu_timer.t_elapsed / timers->rstu_timer.t_cpu : 0.0));
155
/*********************************************************************
161
* Construct data structures and precompute values necessary
162
* for Baum-Welch reestimation.
166
* The number of command line arguments
168
* Array of command line argument strings
170
* The model inventory data structure created
171
* by this routine. (see libmodinv/modinv.c)
173
* A word -> phone dictionary for the training set.
180
* This value is returned when no error condition
183
* This value is returned when an error condition
195
*********************************************************************/
198
main_initialize(int argc,
200
model_inventory_t **out_inv,
202
model_def_t **out_mdef,
205
model_inventory_t *inv; /* the model inventory */
206
lexicon_t *lex; /* the lexicon to be returned to the caller */
217
int did_restore = FALSE;
219
int32 *mllr_idx = NULL;
221
const char *mdeffn, *meanfn, *varfn, *mixwfn, *tmatfn, *fdictfn;
223
/* Note these are forward transforms for use
224
in training. The inverse transform of the accumulators is now
225
done externally by mllr_transform. */
226
float32 ****sxfrm_a = NULL;
227
float32 ***sxfrm_b = NULL;
229
E_INFO("Compiled on %s at %s\n", __DATE__, __TIME__);
231
/* define, parse and (partially) validate the command line */
232
train_cmd_ln_parse(argc, argv);
235
feat_init(cmd_ln_str("-feat"),
236
cmn_type_from_str(cmd_ln_str("-cmn")),
237
cmd_ln_boolean("-varnorm"),
238
agc_type_from_str(cmd_ln_str("-agc")),
239
1, cmd_ln_int32("-ceplen"));
243
if (cmd_ln_str("-lda")) {
244
E_INFO("Reading linear feature transformation from %s\n",
246
if (feat_read_lda(feat,
248
cmd_ln_int32("-ldadim")) < 0)
252
if (cmd_ln_str("-svspec")) {
254
E_INFO("Using subvector specification %s\n",
255
cmd_ln_str("-svspec"));
256
if ((subvecs = parse_subvecs(cmd_ln_str("-svspec"))) == NULL)
258
if ((feat_set_subvecs(feat, subvecs)) < 0)
262
if (cmd_ln_exists("-agcthresh")
263
&& 0 != strcmp(cmd_ln_str("-agc"), "none")) {
264
agc_set_threshold(feat->agc_struct,
265
cmd_ln_float32("-agcthresh"));
269
&& cmd_ln_exists("-cmninit")) {
270
char *c, *cc, *vallist;
273
vallist = ckd_salloc(cmd_ln_str("-cmninit"));
276
while (nvals < feat->cmn_struct->veclen
277
&& (cc = strchr(c, ',')) != NULL) {
279
feat->cmn_struct->cmn_mean[nvals] = FLOAT2MFCC(atof(c));
283
if (nvals < feat->cmn_struct->veclen && *c != '\0') {
284
feat->cmn_struct->cmn_mean[nvals] = FLOAT2MFCC(atof(c));
290
/* create a new model inventory structure */
291
*out_inv = inv = mod_inv_new();
293
mod_inv_set_n_feat(inv, feat_dimension1(feat));
295
mdeffn = cmd_ln_str("-moddeffn");
296
meanfn = cmd_ln_str("-meanfn");
297
varfn = cmd_ln_str("-varfn");
298
mixwfn = cmd_ln_str("-mixwfn");
299
tmatfn = cmd_ln_str("-tmatfn");
300
fdictfn = cmd_ln_str("-fdictfn");
302
/* Note: this will leak a small amount of memory but we really
304
if ((hmmdir = cmd_ln_str("-hmmdir")) != NULL) {
306
mdeffn = string_join(hmmdir, "/mdef", NULL);
308
meanfn = string_join(hmmdir, "/means", NULL);
310
varfn = string_join(hmmdir, "/variances", NULL);
312
mixwfn = string_join(hmmdir, "/mixture_weights", NULL);
314
tmatfn = string_join(hmmdir, "/transition_matrices", NULL);
316
fdictfn = string_join(hmmdir, "/noisedict", NULL);
318
E_INFO("Reading %s\n", mdeffn);
320
/* Read in the model definitions. Defines the set of
321
CI phones and context dependent phones. Defines the
322
transition matrix tying and state level tying. */
323
if (model_def_read(&mdef, mdeffn) != S3_SUCCESS) {
329
fn = cmd_ln_str("-ts2cbfn");
331
E_FATAL("Specify -ts2cbfn\n");
333
if (strcmp(fn, SEMI_LABEL) == 0) {
334
mdef->cb = semi_ts2cb(mdef->n_tied_state);
335
n_ts = mdef->n_tied_state;
338
else if (strcmp(fn, CONT_LABEL) == 0) {
339
mdef->cb = cont_ts2cb(mdef->n_tied_state);
340
n_ts = mdef->n_tied_state;
341
n_cb = mdef->n_tied_state;
343
else if (strcmp(fn, PTM_LABEL) == 0) {
344
mdef->cb = ptm_ts2cb(mdef);
345
n_ts = mdef->n_tied_state;
346
n_cb = mdef->acmod_set->n_ci;
348
else if (s3ts2cb_read(fn,
351
&n_cb) != S3_SUCCESS) {
355
inv->acmod_set = mdef->acmod_set;
358
if (mod_inv_read_mixw(inv, mdef, mixwfn,
359
cmd_ln_float32("-mwfloor")) != S3_SUCCESS)
362
if (n_ts != inv->n_mixw) {
363
E_WARN("%u mappings from tied-state to cb, but %u tied-state in %s\n",
364
mdef->n_cb, inv->n_mixw, mixwfn);
367
if (mod_inv_read_tmat(inv, tmatfn,
368
cmd_ln_float32("-tpfloor")) != S3_SUCCESS)
371
if (mod_inv_read_gauden(inv, meanfn, varfn,
372
cmd_ln_float32("-varfloor"),
373
cmd_ln_int32("-topn"),
374
cmd_ln_int32("-fullvar")) != S3_SUCCESS) {
375
if (!cmd_ln_int32("-fullvar")) {
379
/* If reading full variances failed, try reading
380
* them as diagonal variances (allows us to
381
* initialize full vars from diagonal ones) */
382
if (mod_inv_read_gauden(inv, meanfn, varfn,
383
cmd_ln_float32("-varfloor"),
384
cmd_ln_int32("-topn"),
385
FALSE) != S3_SUCCESS) {
392
/* If we want to use diagonals only, and we didn't read diagonals
393
* above, then we have to extract them here. */
394
if (cmd_ln_int32("-diagfull") && inv->gauden->var == NULL) {
395
/* Extract diagonals and use them for Gaussian computation. */
400
g->var = gauden_alloc_param(g->n_mgau,
404
for (i = 0; i < g->n_mgau; ++i)
405
for (j = 0; j < g->n_feat; ++j)
406
for (k = 0; k < g->n_density; ++k)
407
for (l = 0; l < g->veclen[j]; ++l)
409
g->fullvar[i][j][k][l][l];
410
gauden_free_param_full(g->fullvar);
412
gauden_floor_variance(g);
415
if (gauden_eval_precomp(inv->gauden) != S3_SUCCESS) {
416
E_ERROR("Problems precomputing values used during Gaussian density evaluation\n");
421
if (inv->gauden->n_mgau != n_cb) {
422
printf("# of codebooks in mean/var files, %u, inconsistent with ts2cb mapping %u\n", inv->gauden->n_mgau, n_cb);
425
mixw_reest = cmd_ln_int32("-mixwreest");
426
mean_reest = cmd_ln_int32("-meanreest");
427
var_reest = cmd_ln_int32("-varreest");
428
tmat_reest = cmd_ln_int32("-tmatreest");
430
E_INFO("Will %sreestimate mixing weights.\n",
431
(mixw_reest ? "" : "NOT "));
432
E_INFO("Will %sreestimate means.\n",
433
(mean_reest ? "" : "NOT "));
434
E_INFO("Will %sreestimate variances.\n",
435
(var_reest ? "" : "NOT "));
437
if (cmd_ln_int32("-mixwreest")) {
438
if (mod_inv_alloc_mixw_acc(inv) != S3_SUCCESS)
442
E_INFO("Will %sreestimate transition matrices\n",
443
(cmd_ln_int32("-tmatreest") ? "" : "NOT "));
444
if (cmd_ln_int32("-tmatreest")) {
445
if (mod_inv_alloc_tmat_acc(inv) != S3_SUCCESS)
449
if (cmd_ln_int32("-meanreest") ||
450
cmd_ln_int32("-varreest")) {
451
if (mod_inv_alloc_gauden_acc(inv) != S3_SUCCESS)
455
E_INFO("Reading main lexicon: %s\n",
456
cmd_ln_str("-dictfn"));
458
lex = lexicon_read(NULL,
459
cmd_ln_str("-dictfn"),
465
E_INFO("Reading filler lexicon: %s\n",
467
(void)lexicon_read(lex,
476
* Configure corpus module (controls sequencing/access of per utterance data)
479
/* set the data directory and extension for cepstrum files */
480
corpus_set_mfcc_dir(cmd_ln_str("-cepdir"));
481
corpus_set_mfcc_ext(cmd_ln_str("-cepext"));
483
if (cmd_ln_str("-lsnfn")) {
484
/* use a LSN file which has all the transcripts */
485
corpus_set_lsn_filename(cmd_ln_str("-lsnfn"));
488
/* set the data directory and extension for word transcript
490
corpus_set_sent_dir(cmd_ln_str("-sentdir"));
491
corpus_set_sent_ext(cmd_ln_str("-sentext"));
494
if (cmd_ln_str("-ctlfn")) {
495
corpus_set_ctl_filename(cmd_ln_str("-ctlfn"));
498
if (cmd_ln_str("-phsegdir")) {
499
corpus_set_phseg_dir(cmd_ln_str("-phsegdir"));
500
corpus_set_phseg_ext(cmd_ln_str("-phsegext"));
503
if (cmd_ln_str("-accumdir")) {
504
char fn[MAXPATHLEN+1];
507
sprintf(fn, "%s/ckpt", cmd_ln_str("-accumdir"));
511
const uint32* feat_veclen;
514
E_INFO("RESTORING CHECKPOINTED COUNTS IN %s\n", cmd_ln_str("-accumdir"));
516
feat_veclen = (uint32 *)feat_stream_lengths(feat);
518
if (mod_inv_restore_acc(inv,
519
cmd_ln_str("-accumdir"),
524
feat_veclen) != S3_SUCCESS) {
525
E_FATAL("Unable to restore checkpoint information\n");
528
if (corpus_ckpt_set_interval(fn) != S3_SUCCESS) {
529
E_FATAL("Unable to restore corpus state information\n");
532
E_INFO("Resuming at utt %u\n", corpus_get_begin());
538
if (cmd_ln_int32("-nskip") && cmd_ln_int32("-runlen")) {
539
corpus_set_interval(cmd_ln_int32("-nskip"),
540
cmd_ln_int32("-runlen"));
541
} else if (cmd_ln_int32("-part") && cmd_ln_int32("-npart")) {
542
corpus_set_partition(cmd_ln_int32("-part"),
543
cmd_ln_int32("-npart"));
547
/* BEWARE: this function call must be done after all the other corpus
551
if (cmd_ln_str("-mllrmat")) {
552
uint32 *tmp_veclen, *feat_veclen;
553
uint32 tmp_n_mllrcls;
557
if (read_reg_mat(cmd_ln_str("-mllrmat"),
561
&sxfrm_a, &sxfrm_b) != S3_SUCCESS) {
562
E_FATAL("Unable to read %s\n", cmd_ln_str("-mllrmat"));
565
if (feat_dimension1(feat) != tmp_n_stream) {
566
E_FATAL("# feature streams in -mllrmat %s != # feature streams configured on cmd ln\n");
569
feat_veclen = (uint32 *)feat_stream_lengths(feat);
571
for (j = 0; j < tmp_n_stream; j++) {
572
if (feat_veclen[j] != tmp_veclen[j]) {
573
E_FATAL("# components of stream %u in -mllrmat inconsistent w/ -feat config (%u != %u)\n",
574
j, tmp_veclen[j], feat_veclen[j]);
577
ckd_free((void *)tmp_veclen);
579
fn = cmd_ln_str("-cb2mllrfn");
581
if (strcmp(fn, ".1cls.") == 0) {
582
mllr_idx = ckd_calloc(inv->gauden->n_mgau, sizeof(int32));
584
n_map = inv->gauden->n_mgau;
586
else if (s3cb2mllr_read(cmd_ln_str("-cb2mllrfn"),
589
&n_mllr) != S3_SUCCESS) {
592
if (n_map != inv->gauden->n_mgau) {
593
E_FATAL("cb2mllr maps %u cb, but read %u cb from files\n",
594
n_map, inv->gauden->n_mgau);
598
/* Transform the means using the speaker transform if available. */
599
mllr_transform_mean(inv->gauden->mean,
601
0, inv->gauden->n_mgau,
603
inv->gauden->n_density,
608
free_mllr_A(sxfrm_a, n_mllr, tmp_n_stream);
609
free_mllr_B(sxfrm_b, n_mllr, tmp_n_stream);
616
main_reestimate(model_inventory_t *inv,
622
vector_t *mfcc; /* utterance cepstra */
623
int32 n_frame; /* # of cepstrum frames */
624
uint32 svd_n_frame; /* # of cepstrum frames */
625
vector_t **f; /* independent feature streams derived
627
state_t *state_seq; /* sentence HMM state sequence for the
629
uint32 n_state = 0; /* # of sentence HMM states */
630
float64 total_log_lik; /* total log liklihood over corpus */
631
float64 log_lik; /* log liklihood for an utterance */
632
uint32 total_frames; /* # of frames over the corpus */
633
float64 a_beam; /* alpha pruning beam */
634
float64 b_beam; /* beta pruning beam */
635
float32 spthresh; /* state posterior probability threshold */
636
uint32 seq_no; /* sequence # of utterance in corpus */
637
uint32 mixw_reest; /* if TRUE, reestimate mixing weights */
638
uint32 tmat_reest; /* if TRUE, reestimate transition probability matrices */
639
uint32 mean_reest; /* if TRUE, reestimate means */
640
uint32 var_reest; /* if TRUE, reestimate variances */
642
const char *pdumpdir;
646
bw_timers_t* timers = NULL;
654
s3phseg_t *phseg = NULL;
657
uint32 n_frame_skipped = 0;
659
uint32 ckpt_intv = 0;
660
uint32 no_retries = 0;
662
uint32 outputfullpath = 0;
663
uint32 fullsuffixmatch = 0;
665
E_INFO("Reestimation: %s\n",
666
(viterbi ? "Viterbi" : "Baum-Welch"));
668
profile = cmd_ln_int32("-timing");
670
E_INFO("Generating profiling information consumes significant CPU resources.\n");
671
E_INFO("If you are not interested in profiling, use -timing no\n");
673
outputfullpath = cmd_ln_int32("-outputfullpath");
674
fullsuffixmatch = cmd_ln_int32("-fullsuffixmatch");
676
corpus_set_full_suffix_match(fullsuffixmatch);
679
timers = ckd_calloc(1, sizeof(bw_timers_t));
680
ptmr_init(&timers->utt_timer);
681
ptmr_init(&timers->upd_timer);
682
ptmr_init(&timers->fwd_timer);
683
ptmr_init(&timers->bwd_timer);
684
ptmr_init(&timers->gau_timer);
685
ptmr_init(&timers->rsts_timer);
686
ptmr_init(&timers->rstf_timer);
687
ptmr_init(&timers->rstu_timer);
690
mixw_reest = cmd_ln_int32("-mixwreest");
691
tmat_reest = cmd_ln_int32("-tmatreest");
692
mean_reest = cmd_ln_int32("-meanreest");
693
var_reest = cmd_ln_int32("-varreest");
694
pass2var = cmd_ln_int32("-2passvar");
695
var_is_full = cmd_ln_int32("-fullvar");
696
pdumpdir = cmd_ln_str("-pdumpdir");
697
in_veclen = cmd_ln_int32("-ceplen");
699
if (cmd_ln_str("-ckptintv")) {
700
ckpt_intv = cmd_ln_int32("-ckptintv");
703
if (cmd_ln_str("-accumdir") == NULL) {
704
E_WARN("NO ACCUMDIR SET. No counts will be written; assuming debug\n");
707
if (!mixw_reest && !tmat_reest && !mean_reest && !var_reest) {
708
E_WARN("No reestimation specified! None done.\n");
716
a_beam = cmd_ln_float64("-abeam");
717
b_beam = cmd_ln_float64("-bbeam");
718
spthresh = cmd_ln_float32("-spthresh");
719
maxuttlen = cmd_ln_int32("-maxuttlen");
721
/* Begin by skipping over some (possibly zero) # of utterances.
722
* Continue to process utterances until there are no more (either EOF
725
seq_no = corpus_get_begin();
727
printf("column defns\n");
730
printf("\t<n_frame_in>\n");
731
printf("\t<n_frame_del>\n");
732
printf("\t<n_state_shmm>\n");
733
printf("\t<avg_states_alpha>\n");
734
if (!cmd_ln_int32("-viterbi")) {
735
printf("\t<avg_states_beta>\n");
736
printf("\t<avg_states_reest>\n");
737
printf("\t<avg_posterior_prune>\n");
739
printf("\t<frame_log_lik>\n");
740
printf("\t<utt_log_lik>\n");
741
printf("\t... timing info ... \n");
745
while (corpus_next_utt()) {
746
/* Zero timers before utt processing begins */
748
ptmr_reset(&timers->utt_timer);
749
ptmr_reset(&timers->upd_timer);
750
ptmr_reset(&timers->fwd_timer);
751
ptmr_reset(&timers->bwd_timer);
752
ptmr_reset(&timers->gau_timer);
753
ptmr_reset(&timers->rsts_timer);
754
ptmr_reset(&timers->rstf_timer);
755
ptmr_reset(&timers->rstu_timer);
759
ptmr_start(&timers->utt_timer);
761
printf("utt> %5u %25s",
763
(outputfullpath ? corpus_utt_full_name() : corpus_utt()));
765
if (corpus_get_generic_featurevec(&mfcc, &n_frame, in_veclen) < 0) {
766
E_FATAL("Can't read input features\n");
769
printf(" %4u", n_frame);
772
E_WARN("utt %s too short\n", corpus_utt());
780
if ((maxuttlen > 0) && (n_frame > maxuttlen)) {
781
E_INFO("utt # frames > -maxuttlen; skipping\n");
782
n_frame_skipped += n_frame;
791
svd_n_frame = n_frame;
793
/* Hack to not apply the LDA, it will be applied later during accum_dir
794
* Pretty useless thing to be honest, what to do with CMN after that for example?
796
if (cmd_ln_boolean("-ldaaccum")) {
797
float32 ***lda = feat->lda;
799
f = feat_array_alloc(feat, n_frame + feat_window_size(feat));
800
feat_s2mfc2feat_live(feat, mfcc, &n_frame, TRUE, TRUE, f);
803
f = feat_array_alloc(feat, n_frame + feat_window_size(feat));
804
feat_s2mfc2feat_live(feat, mfcc, &n_frame, TRUE, TRUE, f);
807
printf(" %4u", n_frame - svd_n_frame);
809
/* Get the transcript */
810
corpus_get_sent(&trans);
812
/* Get the phone segmentation */
813
corpus_get_phseg(inv->acmod_set, &phseg);
815
/* Open a dump file if required. */
817
char *pdumpfn, *uttid;
819
uttid = (outputfullpath ? corpus_utt_full_name() : corpus_utt());
820
pdumpfn = ckd_calloc(strlen(pdumpdir) + 1
822
+ strlen(".pdump") + 1, 1);
823
strcpy(pdumpfn, pdumpdir);
824
strcat(pdumpfn, "/");
825
strcat(pdumpfn, uttid);
826
strcat(pdumpfn, ".pdump");
827
if ((pdumpfh = fopen(pdumpfn, "w")) == NULL)
828
E_FATAL_SYSTEM("Failed to open %s for writing", pdumpfn);
835
ptmr_start(&timers->upd_timer);
836
/* create a sentence HMM */
837
state_seq = next_utt_states(&n_state, lex, inv, mdef, trans);
838
printf(" %5u", n_state);
840
if (state_seq == NULL) {
841
E_WARN("Skipped utterance '%s'\n", trans);
842
} else if (!viterbi) {
843
/* accumulate reestimation sums for the utterance */
844
if (baum_welch_update(&log_lik,
860
feat) == S3_SUCCESS) {
861
total_frames += n_frame;
862
total_log_lik += log_lik;
865
(n_frame > 0 ? log_lik / n_frame : 0.0),
870
/* Viterbi search and accumulate in it */
871
if (viterbi_update(&log_lik,
886
feat) == S3_SUCCESS) {
887
total_frames += n_frame;
888
total_log_lik += log_lik;
890
(n_frame > 0 ? log_lik / n_frame : 0.0),
896
ptmr_stop(&timers->upd_timer);
903
free(trans); /* alloc'ed using strdup() */
909
ptmr_stop(&timers->utt_timer);
912
print_all_timers(timers, n_frame);
917
if ((ckpt_intv > 0) &&
918
((n_utt % ckpt_intv) == 0) &&
919
(cmd_ln_str("-accumdir") != NULL)) {
920
while (accum_dump(cmd_ln_str("-accumdir"),
928
TRUE) != S3_SUCCESS) {
929
static int notified = FALSE;
934
* If we were not able to dump the parameters, write one log entry
937
if (notified == FALSE) {
939
strcpy(time_str, (const char *)ctime((const time_t *)&t));
940
/* nuke the newline at the end of this. */
941
time_str[strlen(time_str)-1] = '\0';
942
E_WARN("Ckpt count dump failed on %s. Retrying dump every %3.1f hour until success.\n",
943
time_str, DUMP_RETRY_PERIOD/3600.0);
948
E_FATAL("Failed to get the files after 10 retries(about 5 minutes).\n ");
951
sleep(DUMP_RETRY_PERIOD);
956
printf("overall> stats %u (-%u) %e %e",
959
(total_frames > 0 ? total_log_lik / total_frames : 0.0),
962
printf(" %4.3fx %4.3fe",
963
(total_frames > 0 ? timers->utt_timer.t_tot_cpu/(total_frames*0.01) : 0.0),
964
(timers->utt_timer.t_tot_cpu > 0 ? timers->utt_timer.t_tot_elapsed / timers->utt_timer.t_tot_cpu : 0.0));
970
/* dump the accumulators to a file system */
971
while (cmd_ln_str("-accumdir") != NULL &&
972
accum_dump(cmd_ln_str("-accumdir"), inv,
979
FALSE) != S3_SUCCESS) {
980
static int notified = FALSE;
985
* If we were not able to dump the parameters, write one log entry
988
if (notified == FALSE) {
990
strcpy(time_str, (const char *)ctime((const time_t *)&t));
991
/* nuke the newline at the end of this. */
992
time_str[strlen(time_str)-1] = '\0';
993
E_WARN("Count dump failed on %s. Retrying dump every %3.1f hour until success.\n",
994
time_str, DUMP_RETRY_PERIOD/3600.0);
999
E_FATAL("Failed to get the files after 10 retries(about 5 minutes).\n ");
1003
sleep(DUMP_RETRY_PERIOD);
1012
/* Write a log entry on success */
1013
if (cmd_ln_str("-accumdir"))
1014
E_INFO("Counts saved to %s\n", cmd_ln_str("-accumdir"));
1016
E_INFO("Counts NOT saved.\n");
1019
/* x=log(a) y=log(b), log_add(x,y) = log(a+b) */
1021
log_add(float64 x, float64 y)
1026
return log_add(y, x);
1032
return x+log(1.0+z);
1036
/* forward-backward computation on lattice */
1038
lat_fwd_bwd(s3lattice_t *lat)
1042
float64 ac_score, lm_score;
1045
for (i=0; i<lat->n_arcs; i++) {
1046
/* initialise alpha */
1047
lat->arc[i].alpha = LOG_ZERO;
1048
if (lat->arc[i].good_arc == 1) {
1049
/* get the acoustic and lm score for a word hypothesis */
1050
ac_score = lat->arc[i].ac_score / lm_scale;
1051
lm_score = lat->arc[i].lm_score;
1054
for (j=0; j<lat->arc[i].n_prev_arcs; j++) {
1055
id = lat->arc[i].prev_arcs[j];
1057
if (lat->arc[i].sf == 1) {
1058
lat->arc[i].alpha = log_add(lat->arc[i].alpha, 0);
1062
if (lat->arc[id-1].good_arc == 1)
1063
lat->arc[i].alpha = log_add(lat->arc[i].alpha, lat->arc[id-1].alpha);
1066
lat->arc[i].alpha += ac_score + lm_score;
1070
/* initialise overall log-likelihood */
1071
lat->prob = LOG_ZERO;
1074
for (i=lat->n_arcs-1; i>=0 ;i--) {
1075
/* initialise beta */
1076
lat->arc[i].beta = LOG_ZERO;
1078
if (lat->arc[i].good_arc == 1) {
1079
/* get the acoustic and lm score for a word hypothesis */
1080
ac_score = lat->arc[i].ac_score / lm_scale;
1081
lm_score = lat->arc[i].lm_score;
1084
for (j=0; j<lat->arc[i].n_next_arcs; j++) {
1085
id = lat->arc[i].next_arcs[j];
1087
lat->arc[i].beta = log_add(lat->arc[i].beta, 0);
1090
/* Only accumulate beta from successor arcs whose Viterbi run succeeded,
 * mirroring the alpha pass.  There must be no ';' after the condition:
 * an empty statement there makes the following log_add unconditional. */
if (lat->arc[id-1].good_arc == 1)
1091
lat->arc[i].beta = log_add(lat->arc[i].beta, lat->arc[id-1].beta);
1094
lat->arc[i].beta += ac_score + lm_score;
1096
/* compute overall log-likelihood loglid=beta(1)=alpha(Q) */
1097
if (lat->arc[i].sf == 1)
1098
lat->prob = log_add(lat->prob, lat->arc[i].beta);
1103
for (i=0; i<lat->n_arcs; i++)
1105
/* initialise gamma */
1106
lat->arc[i].gamma = LOG_ZERO;
1107
if (lat->arc[i].good_arc == 1)
1109
ac_score = lat->arc[i].ac_score / lm_scale;
1110
lm_score = lat->arc[i].lm_score;
1111
lat->arc[i].gamma = lat->arc[i].alpha + lat->arc[i].beta - (ac_score + lm_score + lat->prob);
1115
/* compute the posterior probability of the true path */
1117
for (i=lat->n_arcs-lat->n_true_arcs; i<lat->n_arcs; i++)
1118
lat->postprob += lat->arc[i].gamma;
1123
/* mmie training: take random left and right context for viterbi run */
1125
mmi_rand_train(model_inventory_t *inv,
1136
uint32 n_rand;/* random number */
1137
uint32 n_max_run;/* the maximum number of viterbi run */
1138
char pword[128], cword[128], nword[128]; /* previous, current, next word */
1139
vector_t **arc_f = NULL;/* feature vector for a word arc */
1140
uint32 n_word_obs;/* frames of a word arc */
1141
uint32 rand_prev_id, rand_next_id;/* randomly selected previous and next arc id */
1142
uint32 *lphone, *rphone; /* the last and first phone of previous and next word hypothesis */
1143
state_t *state_seq;/* HMM state sequence for an arc */
1144
uint32 n_state = 0;/* number of HMM states */
1145
float64 log_lik;/* log-likelihood of an arc */
1147
/* viterbi run on each arc */
1148
printf(" %5u", lat->n_arcs);
1150
for(n=0; n<lat->n_arcs; n++) {
1152
/* total observations of this arc */
1153
/* this is not very accurate, as it consumes one more frame for each word at the end */
1154
n_word_obs = lat->arc[n].ef - lat->arc[n].sf + 1;
1156
/* get the feature for this arc */
1157
arc_f = (vector_t **) ckd_calloc(n_word_obs, sizeof(vector_t *));
1158
for (k=0; k<n_word_obs; k++)
1159
arc_f[k] = f[k+lat->arc[n].sf-1];
1161
/* in case the viterbi run fails at a certain left and right context,
1162
at most randomly pick context n_prev_arcs * n_next_arcs times */
1163
n_max_run = lat->arc[n].n_prev_arcs * lat->arc[n].n_next_arcs;
1165
/* seed the random-number generator with current time */
1166
srand( (unsigned)time( NULL ) );
1168
/* randomly pick the left and right context */
1169
while (n_max_run > 0 && lat->arc[n].good_arc == 0) {
1171
/* get left arc id */
1172
if (lat->arc[n].n_prev_arcs == 1) {
1176
n_rand = (uint32) (((double) rand() / (((double) RAND_MAX) + 1)) * lat->arc[n].n_prev_arcs );
1178
rand_prev_id = lat->arc[n].prev_arcs[n_rand];
1180
/* get right arc id */
1181
if (lat->arc[n].n_next_arcs == 1) {
1185
n_rand = (uint32) (((double) rand() / (((double) RAND_MAX) + 1)) * lat->arc[n].n_next_arcs );
1187
rand_next_id = lat->arc[n].next_arcs[n_rand];
1189
/* get the triphone list */
1190
strcpy(cword, lat->arc[n].word);
1191
if (rand_prev_id == 0)
1192
strcpy(pword, "<s>");
1194
strcpy(pword, lat->arc[rand_prev_id-1].word);
1195
lphone = mk_boundary_phone(pword, 0, lex);
1196
if (rand_next_id == 0)
1197
strcpy(nword, "</s>");
1199
strcpy(nword, lat->arc[rand_next_id-1].word);
1200
rphone = mk_boundary_phone(nword, 1, lex);
1202
state_seq = next_utt_states_mmie(&n_state, lex, inv, mdef, cword, lphone, rphone);
1204
/* viterbi computation to get the acoustic score for a word hypothesis */
1205
if (mmi_viterbi_run(&log_lik,
1209
a_beam) == S3_SUCCESS) {
1210
lat->arc[n].good_arc = 1;
1211
lat->arc[n].ac_score = log_lik;
1212
lat->arc[n].best_prev_arc = rand_prev_id;
1213
lat->arc[n].best_next_arc = rand_next_id;
1223
if (lat->arc[n].good_arc == 0) {
1224
E_INFO("arc_%d is ignored (viterbi run failed)\n", n+1);
1228
/* lattice-based forward-backward computation */
1231
/* update Gaussian parameters */
1232
for (n=0; n<lat->n_arcs; n++) {
1234
/* only if the arc was successful in viterbi run */
1235
if (lat->arc[n].good_arc == 1) {
1237
/* total observations of this arc */
1238
n_word_obs = lat->arc[n].ef - lat->arc[n].sf + 1;
1239
arc_f = (vector_t **) ckd_calloc(n_word_obs, sizeof(vector_t *));
1240
for (k=0; k<n_word_obs; k++)
1241
arc_f[k] = f[k+lat->arc[n].sf-1];
1243
/* get the randomly picked left and right context */
1244
rand_prev_id = lat->arc[n].best_prev_arc;
1245
rand_next_id = lat->arc[n].best_next_arc;
1247
/* get the triphone list */
1248
strcpy(cword, lat->arc[n].word);
1249
if (rand_prev_id == 0)
1250
strcpy(pword, "<s>");
1252
strcpy(pword, lat->arc[rand_prev_id-1].word);
1253
lphone = mk_boundary_phone(pword, 0, lex);
1254
if (rand_next_id == 0)
1255
strcpy(nword, "</s>");
1257
strcpy(nword, lat->arc[rand_next_id-1].word);
1258
rphone = mk_boundary_phone(nword, 1, lex);
1260
/* make state list */
1261
state_seq = next_utt_states_mmie(&n_state, lex, inv, mdef, cword, lphone, rphone);
1263
/* viterbi update model parameters */
1264
if (mmi_viterbi_update(arc_f, n_word_obs,
1271
fcb) != S3_SUCCESS) {
1272
E_ERROR("arc_%d is ignored (viterbi update failed)\n", n+1);
1283
/*
 * mmie training: take the best left and right context for viterbi run.
 *
 * First loop: for every lattice arc, run Viterbi with the boundary phones
 * derived from its preceding and succeeding arcs (skipping a context whose
 * boundary phone equals the one tried immediately before) and keep the
 * highest-scoring combination in ac_score / best_prev_arc / best_next_arc.
 * Second loop: for every arc with good_arc set, rebuild the state sequence
 * from the stored best context and call mmi_viterbi_update().
 */
1285
mmi_best_train(model_inventory_t *inv,
1296
char pword[128], cword[128], nword[128]; /* previous, current and next word hypothesis */
1297
vector_t **arc_f = NULL;/* feature vector for a word arc */
1298
uint32 n_word_obs;/* frames of a word arc */
1299
uint32 prev_id, next_id;/* previous and next arc id */
1300
uint32 *lphone, *rphone;/* the last and first phone of previous and next arc */
1301
uint32 prev_lphone, prev_rphone;/* the lphone and rphone of previous viterbi run on arc */
1302
state_t *state_seq;/* HMM state sequence for an arc */
1303
uint32 n_state = 0;/* number of HMM states */
1304
float64 log_lik;/* log-likelihood of an arc */
1306
/* viterbi run on each arc */
1307
printf(" %5u", lat->n_arcs);
1309
for(n=0; n<lat->n_arcs; n++) {
1311
/* total observations of this arc */
1312
/* this is not very accurate, as it consumes one more frame for each word at the end */
1313
n_word_obs = lat->arc[n].ef - lat->arc[n].sf + 1;
1315
/* get the feature for this arc */
1316
arc_f = (vector_t **) ckd_calloc(n_word_obs, sizeof(vector_t *));
1317
for (k=0; k<n_word_obs; k++)
1318
/* arc_f only aliases rows of f (pointers, no copy): arc frame k is f[k + sf - 1] */
arc_f[k] = f[k+lat->arc[n].sf-1];
1320
/* now try to find the best left and right context for viterbi run */
1321
/* current word hypothesis */
1322
strcpy(cword, lat->arc[n].word);
1324
/* initialise previous lphone */
1327
/* try all left context */
1328
for (i=0; i<lat->arc[n].n_prev_arcs; i++) {
1329
/* preceding word */
1330
prev_id = lat->arc[n].prev_arcs[i];
1332
/* preceding word is either the "<s>" marker or the word of arc prev_id;
 * arc ids are offset by one from lat->arc indices */
strcpy(pword, "<s>");
1335
strcpy(pword, lat->arc[prev_id-1].word);
1338
/* get the left boundary triphone */
1339
lphone = mk_boundary_phone(pword, 0, lex);
1341
/* rerun only if this left context differs from the one tried just before, or it is the first one */
1342
if (*lphone != prev_lphone || i == 0) {
1344
/* initialize rphone */
1347
/* try all right context */
1348
for(j=0; j<lat->arc[n].n_next_arcs; j++) {
1349
/* succeeding word */
1350
next_id = lat->arc[n].next_arcs[j];
1352
/* succeeding word is either the "</s>" marker or the word of arc next_id */
strcpy(nword, "</s>");
1354
strcpy(nword, lat->arc[next_id-1].word);
1356
/* get the right boundary triphone */
1357
rphone = mk_boundary_phone(nword, 1, lex);
1359
/* rerun only if this right context differs from the one tried just before, or it is the first one */
1360
if (*rphone != prev_rphone || j == 0) {
1362
/* make state list */
1363
state_seq = next_utt_states_mmie(&n_state, lex, inv, mdef, cword, lphone, rphone);
1365
/* viterbi computation to get the acoustic score for a word hypothesis */
1366
if (mmi_viterbi_run(&log_lik,
1370
a_beam) == S3_SUCCESS) {
1371
/* first successful run for this arc: accept its score and context as the current best */
if (lat->arc[n].good_arc == 0) {
1372
lat->arc[n].good_arc = 1;
1373
lat->arc[n].ac_score = log_lik;
1374
lat->arc[n].best_prev_arc = lat->arc[n].prev_arcs[i];
1375
lat->arc[n].best_next_arc = lat->arc[n].next_arcs[j];
1377
/* later successful runs replace the stored context only when the score is strictly higher */
else if (log_lik > lat->arc[n].ac_score) {
1378
lat->arc[n].ac_score = log_lik;
1379
lat->arc[n].best_prev_arc = lat->arc[n].prev_arcs[i];
1380
lat->arc[n].best_next_arc = lat->arc[n].next_arcs[j];
1383
/* save the current right context */
1384
prev_rphone = *rphone;
1388
/* save the current left context */
1389
prev_lphone = *lphone;
1396
/* good_arc still 0 here means no context produced a successful viterbi run */
if (lat->arc[n].good_arc == 0) {
1397
/* arcs are reported with 1-based numbering (n+1) */
E_INFO("arc_%d is ignored (viterbi run failed)\n", n+1);
1401
/* lattice-based forward-backward computation */
1404
/* update Gaussian parameters */
1405
for (n=0; n<lat->n_arcs; n++) {
1407
/* only if the arc was successful in viterbi run */
1408
if (lat->arc[n].good_arc == 1) {
1410
/* total observations of this arc */
1411
n_word_obs = lat->arc[n].ef - lat->arc[n].sf + 1;
1412
arc_f = (vector_t **) ckd_calloc(n_word_obs, sizeof(vector_t *));
1413
for (k=0; k<n_word_obs; k++)
1414
arc_f[k] = f[k+lat->arc[n].sf-1];
1416
/* get the best left and right context */
1417
prev_id = lat->arc[n].best_prev_arc;
1418
next_id = lat->arc[n].best_next_arc;
1420
/* get best triphone list */
1421
strcpy(cword, lat->arc[n].word);
1423
/* rebuild the boundary phones from the stored best context, same way as in the search loop */
strcpy(pword, "<s>");
1425
strcpy(pword, lat->arc[prev_id-1].word);
1426
lphone = mk_boundary_phone(pword, 0, lex);
1428
strcpy(nword, "</s>");
1430
strcpy(nword, lat->arc[next_id-1].word);
1431
rphone = mk_boundary_phone(nword, 1, lex);
1433
/* make state list */
1434
state_seq = next_utt_states_mmie(&n_state, lex, inv, mdef, cword, lphone, rphone);
1436
/* viterbi update model parameters */
1437
if (mmi_viterbi_update(arc_f, n_word_obs,
1444
fcb) != S3_SUCCESS) {
1445
E_ERROR("arc_%d is ignored (viterbi update failed)\n", n+1);
1456
/*
 * mmie training: use context-independent hmms for word boundary models.
 *
 * The state sequence of each arc is built by next_utt_states() from the
 * arc's word alone; no neighbouring-arc boundary phones are passed in.
 * First loop: viterbi run per arc, storing good_arc and ac_score.
 * Second loop: for every arc with good_arc set, call mmi_viterbi_update().
 */
1458
mmi_ci_train(model_inventory_t *inv,
1469
vector_t **arc_f = NULL;/* feature vector for a word arc */
1470
uint32 n_word_obs;/* frames of a word arc */
1471
state_t *state_seq;/* HMM state sequence for an arc */
1472
uint32 n_state = 0;/* number of HMM states */
1473
float64 log_lik;/* log-likelihood of an arc */
1475
/* viterbi run on each arc */
1476
printf(" %5u", lat->n_arcs);
1478
for(n=0; n<lat->n_arcs; n++) {
1480
/* total observations of this arc */
1481
/* this is not very accurate, as it consumes one more frame for each word at the end */
1482
n_word_obs = lat->arc[n].ef - lat->arc[n].sf + 1;
1484
/* get the feature for this arc */
1485
arc_f = (vector_t **) ckd_calloc(n_word_obs, sizeof(vector_t *));
1486
for (k=0; k<n_word_obs; k++)
1487
/* arc_f only aliases rows of f (pointers, no copy): arc frame k is f[k + sf - 1] */
arc_f[k] = f[k+lat->arc[n].sf-1];
1489
/* make state list */
1490
state_seq = next_utt_states(&n_state, lex, inv, mdef, lat->arc[n].word);
1492
/* viterbi computation to get the acoustic score for a word hypothesis */
1493
if (mmi_viterbi_run(&log_lik,
1497
a_beam) == S3_SUCCESS) {
1498
lat->arc[n].good_arc = 1;
1499
lat->arc[n].ac_score = log_lik;
1504
/* good_arc still 0 here means the viterbi run above did not succeed */
if (lat->arc[n].good_arc == 0) {
1505
/* arcs are reported with 1-based numbering (n+1) */
E_INFO("arc_%d is ignored (viterbi run failed)\n", n+1);
1509
/* lattice-based forward-backward computation */
1512
/* update Gaussian parameters */
1513
for (n=0; n<lat->n_arcs; n++) {
1515
/* only if the arc was successful in viterbi run */
1516
if (lat->arc[n].good_arc == 1) {
1518
/* total observations of this arc */
1519
n_word_obs = lat->arc[n].ef - lat->arc[n].sf + 1;
1520
arc_f = (vector_t **) ckd_calloc(n_word_obs, sizeof(vector_t *));
1521
for (k=0; k<n_word_obs; k++)
1522
arc_f[k] = f[k+lat->arc[n].sf-1];
1524
/* make state list (same word-only construction as in the first loop) */
1525
state_seq = next_utt_states(&n_state, lex, inv, mdef, lat->arc[n].word);
1527
/* viterbi update model parameters */
1528
if (mmi_viterbi_update(arc_f, n_word_obs,
1535
fcb) != S3_SUCCESS) {
1536
E_ERROR("arc_%d is ignored (viterbi update failed)\n", n+1);
1546
/*
 * main mmie training program.
 *
 * Rejects command-line options that MMIE training does not support, reads
 * the lattice and pruning parameters, then for each corpus utterance
 * computes features, loads its lattice and runs one of mmi_rand_train(),
 * mmi_best_train() or mmi_ci_train(). Per-utterance and overall statistics
 * are printed, and the accumulators are dumped to -accumdir when it is set.
 */
1548
main_mmi_reestimate(model_inventory_t *inv,
1553
vector_t *mfcc;/* utterance cepstra */
1554
int32 n_frame;/* # of cepstrum frames */
1555
uint32 svd_n_frame; /* n_frame as saved before feature computation */
1556
vector_t **f;/* independent feature streams derived from cepstra */
1557
float32 ***lda = NULL;
1558
uint32 total_frames; /* # of frames over the corpus */
1559
float64 a_beam;/* alpha pruning beam */
1560
float64 b_beam;/* beta pruning beam */
1561
float32 spthresh; /* state posterior probability threshold */
1562
uint32 seq_no;/* sequence # of utterance in corpus */
1563
uint32 mean_reest; /* if TRUE, reestimate means */
1564
uint32 var_reest; /* if TRUE, reestimate variances */
1566
const char *lat_dir; /* lattice directory */
1567
const char *lat_ext;/* denominator or numerator lattice */
1568
const char *mmi_type;/* different methods to get left and right context for Viterbi run on lattice */
1569
uint32 n_mmi_type = 0;/* the mmi_type string converted to an int */
1570
s3lattice_t *lat = NULL;/* input lattice */
1571
float64 total_log_postprob = 0;/* total posterior probability of the correct hypotheses */
1572
uint32 n_utt_fail = 0; /* number of sentences failed */
1579
uint32 no_retries=0;
1582
uint32 n_frame_skipped = 0;
1584
/* reject options that MMIE training does not support; each check is fatal */
1585
if (cmd_ln_int32("-2passvar")) {
1586
E_FATAL("for MMIE training, set -2passvar to no\n");
1588
if (cmd_ln_int32("-fullvar")) {
1589
E_FATAL("current MMIE training don't support full variance matrix, set -fullvar to no\n");
1591
if (cmd_ln_int32("-timing")) {
1592
E_FATAL("current MMIE training don't support timing, set -timing to no\n");
1594
if (cmd_ln_int32("-mixwreest")) {
1595
E_FATAL("current MMIE training don't support mixture weight reestimation, set -mixwreest to no\n");
1597
if (cmd_ln_int32("-tmatreest")) {
1598
E_FATAL("current MMIE training don't support transition matrix reestimation, set -tmatreest to no\n");
1600
if (cmd_ln_int32("-outputfullpath")) {
1601
E_FATAL("current MMIE training don't support outputfullpath, set -outputfullpath to no\n");
1603
if (cmd_ln_int32("-fullsuffixmatch")) {
1604
E_FATAL("current MMIE training don't support fullsuffixmatch, set -fullsuffixmatch to no\n");
1606
/* -ckptintv and -pdumpdir are tested as strings: any non-NULL value is rejected */
if (cmd_ln_str("-ckptintv")) {
1607
E_FATAL("current MMIE training don't support ckptintv, remove -ckptintv\n");
1609
if (cmd_ln_str("-pdumpdir")) {
1610
E_FATAL("current MMIE training don't support pdumpdir, set -pdumpdir to no\n");
1613
/* get lattice related parameters */
1614
lat_dir = cmd_ln_str("-latdir");
1615
lat_ext = cmd_ln_str("-latext");
1616
/* NOTE(review): lat_ext is passed to strcmp without a NULL check — confirm -latext always has a value */
if (strcmp(lat_ext, "denlat") != 0 && strcmp(lat_ext, "numlat") != 0) {
1617
E_FATAL("-latext should be either denlat or numlat\n");
1620
printf("MMIE training for %s \n", lat_ext);
1622
/* -mmie_type must be one of rand, best or ci; anything else is fatal */
mmi_type = cmd_ln_str("-mmie_type");
1623
if (strcmp(mmi_type, "rand") == 0) {
1625
printf("MMIE training: take random left and right context for Viterbi run \n");
1627
else if (strcmp(mmi_type, "best") == 0) {
1629
printf("MMIE training: take the best left and right context for Viterbi run \n");
1631
else if (strcmp(mmi_type, "ci") == 0) {
1632
printf("MMIE training: use context-independent hmms for boundary word models \n");
1636
E_FATAL("-mmie_type should be rand, best or ci\n");
1638
lm_scale = cmd_ln_float32("-lw");
1640
mean_reest = cmd_ln_int32("-meanreest");
1641
var_reest = cmd_ln_int32("-varreest");
1642
in_veclen = cmd_ln_int32("-ceplen");
1644
/* Read in an LDA matrix for accumulation. */
1645
if (cmd_ln_str("-lda")) {
1646
feat_read_lda(feat, cmd_ln_str("-lda"),
1647
cmd_ln_int32("-ldadim"));
1651
/* a missing -accumdir only produces a warning; the run continues */
if (cmd_ln_str("-accumdir") == NULL) {
1652
E_WARN("NO ACCUMDIR SET. No counts will be written; assuming debug\n");
1656
/* at least one of mean or variance reestimation must be requested */
if (!mean_reest && !var_reest) {
1657
E_FATAL("No reestimation specified! Nothing done. Set -meanreest or -varreest \n");
1663
a_beam = cmd_ln_float64("-abeam");
1664
b_beam = cmd_ln_float64("-bbeam");
1665
spthresh = cmd_ln_float32("-spthresh");
1666
maxuttlen = cmd_ln_int32("-maxuttlen");
1668
/* Begin by skipping over some (possibly zero) # of utterances.
1669
* Continue to process utterances until there are no more (either EOF
1670
* or end of run). */
1671
seq_no = corpus_get_begin();
1673
/* legend for the columns of the per-utterance "utt>" lines printed below */
printf("column defns\n");
1674
printf("\t<seq>\n");
1676
printf("\t<n_frame_in>\n");
1677
printf("\t<n_frame_del>\n");
1678
printf("\t<lattice_cat>\n");
1679
printf("\t<n_word>\n");
1680
printf("\t<lattice_log_postprob>\n");
1682
/* accumulate density for each training sentence */
1684
while (corpus_next_utt()) {
1685
printf("utt> %5u %25s", seq_no, corpus_utt());
1687
if (corpus_get_generic_featurevec(&mfcc, &n_frame, in_veclen) < 0) {
1688
E_FATAL("Can't read input features\n");
1691
/* NOTE(review): n_frame is declared int32 but printed with %u — confirm this is intended */
printf(" %4u", n_frame);
1694
E_WARN("utt %s too short\n", corpus_utt());
1702
/* utterances longer than -maxuttlen are skipped; the limit is ignored when it is <= 0 */
if ((maxuttlen > 0) && (n_frame > maxuttlen)) {
1703
E_INFO("utt # frames > -maxuttlen; skipping\n");
1704
n_frame_skipped += n_frame;
1713
/* remember the input frame count; n_frame is passed by address to the feature computation below */
svd_n_frame = n_frame;
1715
/* the feature array is sized for n_frame plus the feature window size */
f = feat_array_alloc(feat, n_frame + feat_window_size(feat));
1716
feat_s2mfc2feat_live(feat, mfcc, &n_frame, TRUE, TRUE, f);
1718
/* frame count after feature computation minus the saved input count (presumably the <n_frame_del> column) */
printf(" %4u", n_frame - svd_n_frame);
1720
/* Get the transcript */
1721
corpus_get_sent(&trans);
1723
/* accumulate density counts on lattice */
1724
if (corpus_load_lattice(&lat, lat_dir, lat_ext) == S3_SUCCESS) {
1726
/* different type of mmie training */
1727
switch (n_mmi_type) {
1728
/* take random left and right context for viterbi run */
1731
if (mmi_rand_train(inv, mdef, lex, f, lat,
1733
var_reest, feat) == S3_SUCCESS) {
1734
/* on success, add this lattice's postprob to the corpus total and print it */
total_log_postprob += lat->postprob;
1735
printf(" %e", lat->postprob);
1742
/* take the best left and right context for viterbi run */
1745
if (mmi_best_train(inv, mdef, lex, f, lat,
1747
var_reest, feat) == S3_SUCCESS) {
1748
total_log_postprob += lat->postprob;
1749
printf(" %e", lat->postprob);
1756
/* use context-independent hmms for word boundary models */
1759
if (mmi_ci_train(inv, mdef, lex, f, lat,
1761
var_reest, feat) == S3_SUCCESS) {
1762
total_log_postprob += lat->postprob;
1763
printf(" %e", lat->postprob);
1770
/* mmi_type error */
1773
E_FATAL("Invalid -mmie_type, try rand, best or ci \n");
1778
/* free memory for lattice */
1779
for(i=0; i<lat->n_arcs; i++) {
1780
/* release the per-arc predecessor and successor id arrays */
ckd_free(lat->arc[i].prev_arcs);
1781
ckd_free(lat->arc[i].next_arcs);
1787
/* NOTE(review): unlike the other E_WARN calls in this function, this message has no trailing newline */
E_WARN("Can't read input lattice");
1801
printf ("overall> stats %u (-%u) %e %e",
1804
/* average log posterior over the utterances that did not fail; 0.0 when there are none */
(n_utt-n_utt_fail>0 ? total_log_postprob/(n_utt-n_utt_fail) : 0.0),
1805
total_log_postprob);
1809
/* dump the accumulators to a file system */
/* the loop repeats while -accumdir is set and accum_mmie_dump() does not return S3_SUCCESS */
1810
while (cmd_ln_str("-accumdir") != NULL &&
1811
accum_mmie_dump(cmd_ln_str("-accumdir"),
1815
var_reest) != S3_SUCCESS) {
1816
/* static flag gating the warning below (presumably so it is logged only once) */
static int notified = FALSE;
1821
* If we were not able to dump the parameters, write one log entry
1824
if (notified == FALSE) {
1826
strcpy(time_str, (const char *)ctime((const time_t *)&t));
1827
/* nuke the newline at the end of this. */
1828
time_str[strlen(time_str)-1] = '\0';
1829
/* NOTE(review): this message speaks of hourly retries while the fatal message below says
 * "10 retries(about 5 minutes)" — confirm both against DUMP_RETRY_PERIOD */
E_WARN("Count dump failed on %s. Retrying dump every %3.1f hour until success.\n",
1830
time_str, DUMP_RETRY_PERIOD/3600.0);
1835
E_FATAL("Failed to get the files after 10 retries(about 5 minutes).\n ");
1838
/* wait DUMP_RETRY_PERIOD before the next dump attempt */
sleep(DUMP_RETRY_PERIOD);
1841
/* Write a log entry on success */
1842
if (cmd_ln_str("-accumdir"))
1843
E_INFO("Counts saved to %s\n", cmd_ln_str("-accumdir"));
1845
E_INFO("Counts NOT saved.\n");
1848
int main(int argc, char *argv[])
1850
model_inventory_t *inv;
1851
lexicon_t *lex = NULL;
1852
model_def_t *mdef = NULL;
1853
feat_t *feat = NULL;
1855
if (main_initialize(argc, argv,
1856
&inv, &lex, &mdef, &feat) != S3_SUCCESS) {
1857
E_FATAL("initialization failed\n");
1860
if (cmd_ln_int32("-mmie")) {
1861
main_mmi_reestimate(inv, lex, mdef, feat);
1864
main_reestimate(inv, lex, mdef, feat, cmd_ln_int32("-viterbi"));
1870
model_def_free(mdef);