1
/* ====================================================================
2
* Copyright (c) 1994-2000 Carnegie Mellon University. All rights
5
* Redistribution and use in source and binary forms, with or without
6
* modification, are permitted provided that the following conditions
9
* 1. Redistributions of source code must retain the above copyright
10
* notice, this list of conditions and the following disclaimer.
12
* 2. Redistributions in binary form must reproduce the above copyright
13
* notice, this list of conditions and the following disclaimer in
14
* the documentation and/or other materials provided with the
17
* This work was supported in part by funding from the Defense Advanced
18
* Research Projects Agency and the National Science Foundation of the
19
* United States of America, and the CMU Sphinx Speech Consortium.
21
* THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
22
* ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
23
* THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
24
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
25
* NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33
* ====================================================================
36
/*********************************************************************
41
* Prune decision trees.
45
*********************************************************************/
47
#include "parse_cmd_ln.h"
49
#include <sphinxbase/ckd_alloc.h>
50
#include <sphinxbase/err.h>
53
#include <s3/model_def_io.h>
55
#include <s3/pset_io.h>
57
#include <sys_compat/file.h>
62
init(model_def_t **out_mdef,
73
moddeffn = cmd_ln_str("-moddeffn");
75
E_FATAL("Specify -moddeffn\n");
76
E_INFO("Reading: %s\n", moddeffn);
77
if (model_def_read(&mdef, moddeffn) != S3_SUCCESS)
81
psetfn = cmd_ln_str("-psetfn");
82
E_INFO("Reading: %s\n", psetfn);
83
*out_pset = pset = read_pset_file(psetfn, mdef->acmod_set, &n_pset);
104
for (i = 0; i < sz-1; i++) {
107
if (phn[k_i] == NO_ID) {
108
E_ERROR("A hole is on the heap for key %u!\n", k_i);
114
for (j = i+1; j < sz; j++) {
117
if (phn[k_j] == NO_ID)
120
if ((phn[k_i] == phn[k_j]) &&
121
(st[k_i] == st[k_j]) &&
122
(nd[k_i] == nd[k_j])) {
123
E_ERROR("tree (%s %u) node %u on heap more than once\n",
124
acmod_set_id2name(mdef->acmod_set, phn[k_i]),
132
if (phn[hkey[sz-1]] == NO_ID) {
133
E_ERROR("A hole is on the heap for key %u!\n", sz-1);
142
read_phone_trees(model_def_t *mdef,
143
const char *itreedir,
156
out_tree[p] = (dtree_t **)ckd_calloc(n_state, sizeof(dtree_t *));
157
for (s = 0; s < n_state; s++) {
158
char fn[MAXPATHLEN+1];
162
sprintf(fn, "%s/%s-%u.dtree",
167
E_FATAL_SYSTEM("Unable to open %s for reading",fn);
170
out_tree[p][s] = tr = read_final_tree(fp, pset, n_pset);
173
E_ERROR("Error(s) while reading tree\n");
179
lt_minocc = prune_lowcnt(&tr->node[0], cmd_ln_float32("-minocc"));
181
reindex(&tr->node[0], &n);
182
*out_n_seno += n = cnt_leaf(&tr->node[0]);
183
E_INFO("%s-%u\t%u [%u < %e]\n",
184
pname, s, n, lt_minocc,
185
cmd_ln_float32("-minocc"));
186
*out_n_twig += cnt_twig(&tr->node[0]);
187
*out_n_node += cnt_node(&tr->node[0]);
196
prune_tree(model_def_t *mdef,
200
const char *itreedir;
201
const char *otreedir;
202
char fn[MAXPATHLEN+1];
205
dtree_t ***tree; /* Decision trees indexed by phone and state */
207
dtree_node_t *node, *prnt;
208
float32 *twig_heap; /* Heap of wt_ent_dec of split quest */
209
uint32 *twig_hkey; /* Keys for items in the heap */
210
uint32 *twig_phnid; /* Phone id of items on heap */
211
uint32 *twig_state; /* State id of items on heap */
212
uint32 *twig_nid; /* Node id of items on heap */
213
uint32 free_key; /* Next unused heap key */
214
uint32 free_idx; /* Next unused node index */
215
uint32 n_ci, p, s, n;
216
uint32 *n_state_ci; /* # of state of models in the same base phone class */
218
uint32 n_seno_wanted;
228
itreedir = cmd_ln_str("-itreedir");
229
allphones = cmd_ln_int32("-allphones");
230
n_ci = acmod_set_n_ci(mdef->acmod_set);
232
tree = (dtree_t ***)ckd_calloc(n_ci, sizeof(dtree_t **));
233
n_state_ci = (uint32 *)ckd_calloc(n_ci, sizeof(uint32));
239
n_state = mdef->defn[n_ci].n_state-1;
240
n_state_ci[0] = n_state;
244
if (read_phone_trees(mdef, itreedir, "ALLPHONES", 0, n_state,
245
pset, n_pset, tree, &n_seno, &n_twig, &n_node) < 0)
249
for (p = 0, err = FALSE, n_seno = 0, n_twig = 0; p < n_ci; p++) {
250
if (!acmod_set_has_attrib(mdef->acmod_set, (acmod_id_t)p, "filler")) {
254
pname = acmod_set_id2name(mdef->acmod_set, (acmod_id_t)p);
255
n_state = mdef->defn[p].n_state-1;
256
n_state_ci[p] = n_state;
257
if (read_phone_trees(mdef, itreedir, pname, p, n_state,
259
&n_seno, &n_twig, &n_node) < 0)
266
E_ERROR("Error(s) while reading trees; pruning not done\n");
270
E_INFO("Prior to pruning n_seno= %u\n", n_seno);
272
n_seno_wanted = cmd_ln_int32("-nseno");
273
if (n_seno < n_seno_wanted) {
274
E_WARN("n_seno_wanted= %u, but only %u defined by trees\n",
275
n_seno_wanted, n_seno);
278
E_INFO("n_twig= %u\n", n_twig);
280
if (n_seno_wanted < n_seno) {
281
/* Heap of wt_ent_dec for each "twig" question */
282
twig_heap = (float32 *)ckd_calloc(n_twig, sizeof(float32));
283
twig_hkey = (uint32 *)ckd_calloc(n_twig, sizeof(uint32));
285
twig_phnid = (uint32 *)ckd_calloc(n_twig, sizeof(uint32));
286
twig_state = (uint32 *)ckd_calloc(n_twig, sizeof(uint32));
287
twig_nid = (uint32 *)ckd_calloc(n_twig, sizeof(uint32));
289
/* Insert all twig questions over all trees into the heap */
290
for (p = 0, free_key = 0; p < n_ci; p++) {
291
for (s = 0; s < n_state_ci[p]; s++) {
293
|| !acmod_set_has_attrib(mdef->acmod_set, (acmod_id_t)p, "filler")) {
296
ins_twigs(&tr->node[0],
308
E_INFO("Pruning %u nodes\n", n_seno - n_seno_wanted);
310
for (i = n_seno, sz = n_twig; (i > n_seno_wanted) && (sz > 0); i--) {
312
if (!heap_ok(twig_hkey, sz,
313
twig_phnid, twig_state, twig_nid,
315
E_FATAL("heap problems; bug.\n");
319
/* extract the top (minimum wt_ent_dec) node off the
320
heap; this is the worst question of the tree. */
321
sz = heap32b_extr_top(&wt_ent_dec, &key,
322
twig_heap, twig_hkey, sz, heap32b_min_comp);
324
/* Get the node to prune */
329
node = get_node(&tr->node[0], n);
331
assert(IS_TWIG(node));
333
/* Make twig node a leaf by pruning its leaves */
334
prune_subtrees(node);
336
assert(IS_LEAF(node));
340
E_INFO("Root node extracted (%s %u) from heap\n",
341
acmod_set_id2name(mdef->acmod_set, (acmod_id_t)p), s);
344
/* Is the parent (if any) now a twig? */
345
if (prnt && IS_TWIG(prnt)) {
346
/* Put it on the heap and reuse the heap-key for the child */
348
twig_nid[key] = prnt->node_id;
350
sz = heap32b_ins(twig_heap, twig_hkey, sz,
351
prnt->wt_ent_dec, key, heap32b_min_comp);
354
/* Parent node not a "twig" as a result of pruning */
356
/* Set "holes" to values that are almost certain to
357
* cause a seg fault if used as an index */
358
twig_phnid[key] = NO_ID;
359
twig_state[key] = NO_ID;
360
twig_nid[key] = NO_ID;
364
if ((sz == 0) && (i > n_seno_wanted)) {
365
E_WARN("%u seno's not generated because heap ran out\n", n_seno_wanted);
369
otreedir = cmd_ln_str("-otreedir");
370
for (p = 0, n_node = 0; p < n_ci; p++) {
376
pname = acmod_set_id2name(mdef->acmod_set, (acmod_id_t)p);
378
for (s = 0; s < n_state_ci[p]; s++) {
380
|| !acmod_set_has_attrib(mdef->acmod_set, (acmod_id_t)p, "filler")) {
384
n_node += n = reindex(&tr->node[0], &free_idx);
386
E_INFO("%s-%u\t%u\n", pname, s, n);
388
sprintf(fn, "%s/%s-%u.dtree", otreedir, pname, s);
392
E_FATAL_SYSTEM("Unable to open %s for writing", fn);
394
print_final_tree(fp, &tr->node[0], pset);
406
main(int argc, char *argv[])
408
model_def_t *mdef = NULL;
412
parse_cmd_ln(argc, argv);
414
if (init(&mdef, &pset, &n_pset) != S3_SUCCESS) {
415
E_FATAL("Initialization failed\n");
418
prune_tree(mdef, pset, n_pset);
424
* Log record. Maintained by RCS.
427
* Revision 1.7 2005/06/13 22:18:22 dhdfu
428
* Add -allphones arguments to decision tree and state tying code. Allows senones to be shared across multiple base phones (though they are currently still restricted to the same state). This can improve implicit pronunciation modeling in some cases, such as grapheme-based models, though it usually has little effect. Building the big trees can take a very long time.
430
* Revision 1.6 2004/07/21 19:17:26 egouvea
431
* Changed the license terms to make it the same as sphinx2 and sphinx3.
433
* Revision 1.5 2004/06/17 19:17:24 arthchan2003
434
* Code Update for silence deletion and standardize the name for command-line arguments
436
* Revision 1.4 2001/04/05 20:02:31 awb
437
* *** empty log message ***
439
* Revision 1.3 2000/11/25 22:03:03 awb
440
* *** empty log message ***
442
* Revision 1.2 2000/09/29 22:35:14 awb
443
* *** empty log message ***
445
* Revision 1.1 2000/09/24 21:38:32 awb
446
* *** empty log message ***
448
* Revision 1.1 97/07/16 11:36:22 eht