~ubuntu-branches/ubuntu/maverick/speech-tools/maverick : contents of main/wagon

~ubuntu-branches/ubuntu/maverick/speech-tools/maverick : (revision 19)
/*************************************************************************/
/*                                                                       */
/*                Centre for Speech Technology Research                  */
/*                     University of Edinburgh, UK                       */
/*                     Copyright (c) 1996-2006                           */
/*                        All Rights Reserved.                           */
/*                                                                       */
/*  Permission is hereby granted, free of charge, to use and distribute  */
/*  this software and its documentation without restriction, including   */
/*  without limitation the rights to use, copy, modify, merge, publish,  */
/*  distribute, sublicense, and/or sell copies of this work, and to      */
/*  permit persons to whom this work is furnished to do so, subject to   */
/*  the following conditions:                                            */
/*   1. The code must retain the above copyright notice, this list of    */
/*      conditions and the following disclaimer.                         */
/*   2. Any modifications must be clearly marked as such.                */
/*   3. Original authors' names are not deleted.                         */
/*   4. The authors' names are not used to endorse or promote products   */
/*      derived from this software without specific prior written        */
/*      permission.                                                      */
/*                                                                       */
/*  THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK        */
/*  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      */
/*  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   */
/*  SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE     */
/*  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    */
/*  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   */
/*  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          */
/*  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       */
/*  THIS SOFTWARE.                                                       */
/*                                                                       */
/*************************************************************************/
/*                     Author :  Alan W Black                            */
/*                     Date   :  May 1996                                */
/*-----------------------------------------------------------------------*/
/*  A Classification and Regression Tree (CART) Program                  */
/*  A basic implementation of many of the techniques in                  */
/*  Briemen et al. 1984                                                  */
/*                                                                       */
/*  Added decision list support, Feb 1997                                */
/*                                                                       */
/*  Added vector support for Clustergen 2005/2006                        */
/*                                                                       */
/*=======================================================================*/
#include <cstdlib>
#include <iostream>
#include <fstream>
#include <cstring>
#include "EST_Wagon.h"
#include "EST_cmd_line.h"

enum wn_strategy_type {wn_decision_list, wn_decision_tree};

static wn_strategy_type wagon_type = wn_decision_tree;

static int wagon_main(int argc, char **argv);

/** @name <command>wagon</command> <emphasis>CART building program</emphasis>
    @id wagon_manual
  * @toc
 */

//@{


/**@name Synopsis
  */
//@{

//@synopsis

/**
wagon is used to build CART tress from feature data, its basic
features include:

<itemizedlist>
<listitem><para>both decisions trees and decision lists are supported</para></listitem>
<listitem><para>predictees can be discrete or continuous</para></listitem>
<listitem><para>input features may be discrete or continuous</para></listitem>
<listitem><para>many options for controlling tree building</para>
<itemizedlist>
<listitem><para>fixed stop value</para></listitem>
<listitem><para>balancing</para></listitem>
<listitem><para>held-out data and pruning</para></listitem>
<listitem><para>stepwise use of input features</para></listitem>
<listitem><para>choice of optimization criteria correct/entropy (for 
classification and rmse/correlation (for regression)</para></listitem>
</itemizedlist>
</listitem>
</itemizedlist>

A detailed description of building CART models can be found in the
<link linkend="cart-overview">CART model overview</link> section.

*/

//@}

/**@name OPTIONS
  */
//@{

//@options

//@}

int main(int argc, char **argv)
{

    wagon_main(argc,argv);

    exit(0);
    return 0;
}

static int set_Vertex_Feats(EST_Track &wgn_VertexFeats,
                            EST_String &wagon_track_features)
{
    int i,s=0,e;
    EST_TokenStream ts;

    for (i=0; i<wgn_VertexFeats.num_channels(); i++)
        wgn_VertexFeats.a(0,i) = 0.0;

    ts.open_string(wagon_track_features);
    ts.set_WhiteSpaceChars(",- ");
    ts.set_PunctuationSymbols("");
    ts.set_PrePunctuationSymbols("");
    ts.set_SingleCharSymbols("");

    while (!ts.eof())
    {
        EST_Token &token = ts.get();
        const EST_String ws = (const char *)token.whitespace();
        if (token == "all")
        {
            for (i=0; i<wgn_VertexFeats.num_channels(); i++)
                wgn_VertexFeats.a(0,i) = 1.0;
            break;
        } else if ((ws == ",") || (ws == ""))
        {
            s = atoi(token.string());
            wgn_VertexFeats.a(0,s) = 1.0;
        } else if (ws == "-")
        {
            if (token == "")
                e = wgn_VertexFeats.num_channels()-1;
            else
                e = atoi(token.string());
            for (i=s; i<=e && i<wgn_VertexFeats.num_channels(); i++)
                wgn_VertexFeats.a(0,i) = 1.0;
        } else
        {
            printf("wagon: track_feats invalid: %s at position %d\n",
                   (const char *)wagon_track_features,
                   ts.filepos());
            exit(-1);
        }
    }

    return 0;
}

static int wagon_main(int argc, char **argv)
{
    // Top level function sets up data and creates a tree
    EST_Option al;
    EST_StrList files;
    EST_String wgn_oname;
    ostream *wgn_coutput = 0;
    float stepwise_limit = 0;
    int feats_start=0, feats_end=0;
    int i;

    parse_command_line
	(argc, argv,
	 EST_String("[options]\n") +
	 "Summary: CART building program\n"+
	 "-desc <ifile>     Field description file\n"+
	 "-data <ifile>     Datafile, one vector per line\n"+
	 "-stop <int> {50}  Minimum number of examples for leaf nodes\n"+
	 "-test <ifile>     Datafile to test tree on\n"+
	 "-frs <float> {10} Float range split, number of partitions to\n"+
	 "                  split a float feature range into\n"+
	 "-dlist            Build a decision list (rather than tree)\n"+
	 "-dtree            Build a decision tree (rather than list) default\n"+
	 "-output <ofile>   \n"+
	 "-o <ofile>        File to save output tree in\n"+
	 "-distmatrix <ifile>\n"+
	 "                  A distance matrix for clustering\n"+
	 "-track <ifile>\n"+
         "                  track for vertex indices\n"+
	 "-track_start <int>\n"+
         "                  start channel vertex indices\n"+
	 "-track_end <int>\n"+
         "                  end (inclusive) channel for vertex indices\n"+
	 "-track_feats <string>\n"+
         "                  Track features to use, comma separated list\n"+
         "                  with feature numbers and/or ranges, 0 start\n"+
	 "-unittrack <ifile>\n"+
         "                  track for unit start and length in vertex track\n"+
	 "-quiet            No questions printed during building\n"+
	 "-verbose          Lost of information printing during build\n"+
	 "-predictee <string>\n"+
	 "                  name of field to predict (default is first field)\n"+
	 "-ignore <string>\n"+
	 "                  Filename or bracket list of fields to ignore\n"+
	 "-count_field <string>\n"+
	 "                  Name of field containing count weight for samples\n"+
	 "-stepwise         Incrementally find best features\n"+
	 "-swlimit <float> {0.0}\n"+
	 "                  Percentage necessary improvement for stepwise,\n"+
	 "                  may be negative.\n"+
	 "-swopt <string>   Parameter to optimize for stepwise, for \n"+
	 "                  classification options are correct or entropy\n"+
	 "                  for regression options are rmse or correlation\n"+
	 "                  correct and correlation are the defaults\n"+
	 "-balance <float>  For derived stop size, if dataset at node, divided\n"+
	 "                  by balance is greater than stop it is used as stop\n"+
	 "                  if balance is 0 (default) always use stop as is.\n"+
         "-vertex_output <string> Output <mean> or <best> of cluster\n"+
	 "-held_out <int>   Percent to hold out for pruning\n"+
	 "-heap <int> {210000}\n"+
	 "              Set size of Lisp heap, should not normally need\n"+
	 "              to be changed from its default, only with *very*\n"+
	 "              large description files (> 1M)\n"+
	 "-noprune          No (same class) pruning required\n",
		       files, al);

    if (al.present("-held_out"))
	wgn_held_out = al.ival("-held_out");
    if (al.present("-balance"))
	wgn_balance = al.fval("-balance");
    if ((!al.present("-desc")) || ((!al.present("-data"))))
    {
	cerr << argv[0] << ": missing description and/or datafile" << endl;
	cerr << "use -h for description of arguments" << endl;
    }

    if (al.present("-quiet"))
	wgn_quiet = TRUE;
    if (al.present("-verbose"))
	wgn_verbose = TRUE;

    if (al.present("-stop"))
	wgn_min_cluster_size = atoi(al.val("-stop"));
    if (al.present("-noprune"))
	wgn_prune = FALSE;
    if (al.present("-predictee"))
	wgn_predictee_name = al.val("-predictee");
    if (al.present("-count_field"))
	wgn_count_field_name = al.val("-count_field");
    if (al.present("-swlimit"))
	stepwise_limit = al.fval("-swlimit");
    if (al.present("-frs"))   // number of partitions to try in floats
	wgn_float_range_split = atof(al.val("-frs"));
    if (al.present("-swopt"))
	wgn_opt_param = al.val("-swopt");
    if (al.present("-vertex_output"))
	wgn_vertex_output = al.val("-vertex_output");
    if (al.present("-output") || al.present("-o"))
    {
	if (al.present("-o"))
	    wgn_oname = al.val("-o");
	else
	    wgn_oname = al.val("-output");
	wgn_coutput = new ofstream(wgn_oname);
	if (!(*wgn_coutput))
	{
	    cerr << "Wagon: can't open file \"" << wgn_oname <<
		"\" for output " << endl;
	    exit(-1);
	}
    }
    else
	wgn_coutput = &cout;
    if (al.present("-distmatrix"))
    {
	if (wgn_DistMatrix.load(al.val("-distmatrix")) != 0)
	{
	    cerr << "Wagon: failed to load Distance Matrix from \"" <<
		al.val("-distmatrix") << "\"\n" << endl;
	    exit(-1);
	}
    }
    if (al.present("-dlist"))
	wagon_type = wn_decision_list;

    WNode *tree;
    float score;
    LISP ignores = NIL;

    siod_init(al.ival("-heap"));

    if (al.present("-ignore"))
    {
	EST_String ig = al.val("-ignore");
	if (ig[0] == '(')
	    ignores = read_from_string(ig);
	else
	    ignores = vload(ig,1);
    }
    // Load in the data
    wgn_load_datadescription(al.val("-desc"),ignores);
    wgn_load_dataset(wgn_dataset,al.val("-data"));
    if (al.present("-distmatrix") &&
	(wgn_DistMatrix.num_rows() < wgn_dataset.length()))
    {
	cerr << "wagon: distance matrix is smaller than number of training elements\n";
	exit(-1);
    }
    else if (al.present("-track"))
    {
        wgn_VertexTrack.load(al.val("-track"));
        wgn_VertexFeats.resize(1,wgn_VertexTrack.num_channels());
        for (i=0; i<wgn_VertexFeats.num_channels(); i++)
            wgn_VertexFeats.a(0,i) = 1.0;
    }

    if (al.present("-track_start"))
    {
        feats_start = al.ival("-track_start");
        if ((feats_start < 0) ||
            (feats_start > wgn_VertexTrack.num_channels()))
        {
            printf("wagon: track_start invalid: %d out of %d channels\n",
                   feats_start,
                   wgn_VertexTrack.num_channels());
            exit(-1);
        }
        for (i=0; i<feats_start; i++)
            wgn_VertexFeats.a(0,i) = 0.0; /* don't do feats up to start */
            
    }

    if (al.present("-track_end"))
    {
        feats_end = al.ival("-track_end");
        if ((feats_end < feats_start) ||
            (feats_end > wgn_VertexTrack.num_channels()))
        {
            printf("wagon: track_end invalid: %d between start %d out of %d channels\n",
                   feats_end,
                   feats_start,
                   wgn_VertexTrack.num_channels());
            exit(-1);
        }
        for (i=feats_end+1; i<wgn_VertexTrack.num_channels(); i++)
            wgn_VertexFeats.a(0,i) = 0.0; /* don't do feats after end */
    }
    if (al.present("-track_feats"))
    {   /* overrides start and end numbers */
        EST_String wagon_track_features = al.val("-track_feats");
        set_Vertex_Feats(wgn_VertexFeats,wagon_track_features);
    }

    //    printf("Track feats\n");
    //    for (i=0; i<wgn_VertexTrack.num_channels(); i++)
    //        if (wgn_VertexFeats.a(0,i) > 0.0)
    //            printf("%d ",i);
    //    printf("\n");

    if (al.present("-unittrack"))
    {   /* contains two features, a start and length.  start indexes */
        /* into VertexTrack to the first vector in the segment */
        wgn_UnitTrack.load(al.val("-unittrack"));
    }

    if (al.present("-test"))
	wgn_load_dataset(wgn_test_dataset,al.val("-test"));

    // Build and test the model 
    if (al.present("-stepwise"))
	tree = wagon_stepwise(stepwise_limit);
    else if (wagon_type == wn_decision_tree)
	tree = wgn_build_tree(score);  // default operation
    else if (wagon_type == wn_decision_list)
	// dlist is printed with build_dlist rather than returned
	tree = wgn_build_dlist(score,wgn_coutput);
    else
    {
	cerr << "Wagon: unknown operation, not tree or list" << endl;
	exit(-1);
    }

    if (tree != 0)
    {
	*wgn_coutput << *tree;
	summary_results(*tree,wgn_coutput);
    }

    if (wgn_coutput != &cout)
	delete wgn_coutput;
    return 0;
}

/** @name Building Trees

To build a decision tree (or list) Wagon requires data and a description
of it.  A data file consists a set of samples, one per line each
consisting of the same set of features.   Features may be categorial
or continuous.  By default the first feature is the predictee and the
others are used as predictors.  A typical data file will look like
this
</para>
<para>
<screen>
0.399 pau sh  0   0     0 1 1 0 0 0 0 0 0 
0.082 sh  iy  pau onset 0 1 0 0 1 1 0 0 1
0.074 iy  hh  sh  coda  1 0 1 0 1 1 0 0 1
0.048 hh  ae  iy  onset 0 1 0 1 1 1 0 1 1
0.062 ae  d   hh  coda  1 0 0 1 1 1 0 1 1
0.020 d   y   ae  coda  2 0 1 1 1 1 0 1 1
0.082 y   ax  d   onset 0 1 0 1 1 1 1 1 1
0.082 ax  r   y   coda  1 0 0 1 1 1 1 1 1
0.036 r   d   ax  coda  2 0 1 1 1 1 1 1 1
...
</screen>
</para>
<para>
The data may come from any source, such as the festival script 
dumpfeats which allows the creation of such files easily from utterance
files.  
</para><para>
In addition to a data file a description file is also require that 
gives a name and a type to each of the features in the datafile.
For the above example it would look like
</para><para>
<screen>
((segment_duration float)
 ( name  aa ae ah ao aw ax ay b ch d dh dx eh el em en er ey f g 
    hh ih iy jh k l m n nx ng ow oy p r s sh t th uh uw v w y z zh pau )
 ( n.name 0 aa ae ah ao aw ax ay b ch d dh dx eh el em en er ey f g 
    hh ih iy jh k l m n nx ng ow oy p r s sh t th uh uw v w y z zh pau )
 ( p.name 0 aa ae ah ao aw ax ay b ch d dh dx eh el em en er ey f g 
    hh ih iy jh k l m n nx ng ow oy p r s sh t th uh uw v w y z zh pau )
 (position_type 0 onset coda)
 (pos_in_syl float)
 (syl_initial 0 1)
 (syl_final   0 1)
 (R:Sylstructure.parent.R:Syllable.p.syl_break float)
 (R:Sylstructure.parent.syl_break float)
 (R:Sylstructure.parent.R:Syllable.n.syl_break float)
 (R:Sylstructure.parent.R:Syllable.p.stress 0 1)
 (R:Sylstructure.parent.stress 0 1)
 (R:Sylstructure.parent.R:Syllable.n.stress 0 1)
)
</screen>
</para><para>
The feature names are arbitrary, but as they appear in the generated
trees is most useful if the trees are to be used in prediction of
an utterance that the names are features and/or pathnames.  
</para><para>
Wagon can be used to build a tree with such files with the command
<screen>
wagon -data feats.data -desc fest.desc -stop 10 -output feats.tree
</screen>
A test data set may also be given which must match the given data description.
If specified the built tree will be tested on the test set and results
on that will be presented on completion, without a test set the
results are given with respect to the training data.  However in
stepwise case the test set is used in the multi-level training process
thus it cannot be considered as true test data and more reasonable 
results should found on applying the generate tree to truly
held out data (via the program wagon_test).

*/

//@}