2
* This program is free software; you can redistribute it and/or modify
3
* it under the terms of the GNU General Public License as published by
4
* the Free Software Foundation; either version 2 of the License, or
5
* (at your option) any later version.
7
* This program is distributed in the hope that it will be useful,
8
* but WITHOUT ANY WARRANTY; without even the implied warranty of
9
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10
* GNU General Public License for more details.
12
* You should have received a copy of the GNU General Public License
13
* along with this program; if not, write to the Free Software
14
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18
* SubsetSizeForwardSelection.java
19
* Copyright (C) 2007 Martin GĆ¼tlein
22
package weka.attributeSelection;
24
import weka.core.Instances;
25
import weka.core.Option;
26
import weka.core.OptionHandler;
27
import weka.core.SelectedTag;
29
import weka.core.Utils;
30
import weka.core.TechnicalInformation;
31
import weka.core.TechnicalInformationHandler;
32
import weka.core.TechnicalInformation.Field;
33
import weka.core.TechnicalInformation.Type;
35
import java.util.BitSet;
36
import java.util.Enumeration;
37
import java.util.Random;
38
import java.util.Vector;
42
<!-- globalinfo-start -->
43
* SubsetSizeForwardSelection :<br/>
44
* Class for performing a subset size forward selection
46
<!-- globalinfo-end -->
48
<!-- options-start -->
53
* Perform initial ranking to select top-ranked attributes.</pre>
55
* <pre> -K <num>
56
* Number of top-ranked attributes that are taken into account.</pre>
58
* <pre> -T <0 = fixed-set | 1 = fixed-width>
59
* Type of Linear Forward Selection (default = 0).</pre>
61
* <pre> -S <num>
62
* Size of lookup cache for evaluated subsets. Expressed as a multiple of the
63
* number of attributes in the data set. (default = 1).</pre>
65
* <pre> -E <string>
66
* class name of subset evaluator to use for subset size determination (default =
67
* null, same subset evaluator as for ranking and final forward selection is
68
* used). Place any evaluator options LAST on the command line following a "--".
69
* eg. -A weka.attributeSelection.ClassifierSubsetEval ... -- -M<pre>
71
* <pre> -F <num>
72
* Number of cross validation folds for subset size determination (default = 5).</pre>
74
* <pre> -R <num>
75
* Seed for cross validation subset size determination. (default = 1)</pre>
78
* verbose on/off.</pre>
82
* @author Martin Guetlein (martin.guetlein@gmail.com)
83
* @version $Revision: 1.1 $
85
public class SubsetSizeForwardSelection extends ASSearch
86
implements OptionHandler {
87
/** search directions */
88
protected static final int TYPE_FIXED_SET = 0;
89
protected static final int TYPE_FIXED_WIDTH = 1;
90
public static final Tag[] TAGS_TYPE = {
91
new Tag(TYPE_FIXED_SET, "Fixed-set"),
92
new Tag(TYPE_FIXED_WIDTH, "Fixed-width"),
96
/** perform initial ranking to select top-ranked attributes */
97
protected boolean m_performRanking;
100
* number of top-ranked attributes that are taken into account for the
103
protected int m_numUsedAttributes;
105
/** 0 == fixed-set, 1 == fixed-width */
106
protected int m_linearSelectionType;
108
/** the subset evaluator to use for subset size determination */
109
private SubsetEvaluator m_setSizeEval;
112
* Number of cross validation folds for subset size determination (default =
115
protected int m_numFolds;
117
/** Seed for cross validation subset size determination. (default = 1) */
118
protected int m_seed;
120
/** number of attributes in the data */
121
protected int m_numAttribs;
123
/** total number of subsets evaluated during a search */
124
protected int m_totalEvals;
127
protected boolean m_verbose;
129
/** holds the merit of the best subset found */
130
protected double m_bestMerit;
132
/** holds the maximum size of the lookup cache for evaluated subsets */
133
protected int m_cacheSize;
138
public SubsetSizeForwardSelection() {
143
* Returns a string describing this search method
145
* @return a description of the search method suitable for displaying in the
146
* explorer/experimenter gui
148
public String globalInfo() {
149
return "SubsetSizeForwardSelection:\n\n" +
150
"Extension of LinearForwardSelection. The search performs an interior " +
151
"cross-validation (seed and number of folds can be specified). A " +
152
"LinearForwardSelection is performed on each foldto determine the optimal " +
153
"subset-size (using the given SubsetSizeEvaluator). Finally, a " +
154
"LinearForwardSelection up to the optimal subset-size is performed on " +
155
"the whole data.\n\n"
156
+ "For more information see:\n\n"
157
+ getTechnicalInformation().toString();
161
* Returns an instance of a TechnicalInformation object, containing
162
* detailed information about the technical background of this class,
163
* e.g., paper reference or book this class is based on.
165
* @return the technical information about this class
167
public TechnicalInformation getTechnicalInformation() {
168
TechnicalInformation result;
170
result = new TechnicalInformation(Type.MASTERSTHESIS);
171
result.setValue(Field.AUTHOR, "Martin Guetlein");
172
result.setValue(Field.YEAR, "2006");
173
result.setValue(Field.TITLE, "Large Scale Attribute Selection Using Wrappers");
174
result.setValue(Field.SCHOOL, "Albert-Ludwigs-Universitat");
175
result.setValue(Field.ADDRESS, "Freiburg, Germany");
181
* Returns an enumeration describing the available options.
183
* @return an enumeration of all the available options.
186
public Enumeration listOptions() {
187
Vector newVector = new Vector(9);
189
newVector.addElement(new Option("\tPerform initial ranking to select the" +
190
"\n\ttop-ranked attributes.", "I", 0, "-I"));
191
newVector.addElement(new Option(
192
"\tNumber of top-ranked attributes that are " +
193
"\n\ttaken into account by the search.", "K", 1, "-K <num>"));
194
newVector.addElement(new Option(
195
"\tType of Linear Forward Selection (default = 0).", "T", 1,
196
"-T <0 = fixed-set | 1 = fixed-width>"));
197
newVector.addElement(new Option(
198
"\tSize of lookup cache for evaluated subsets." +
199
"\n\tExpressed as a multiple of the number of" +
200
"\n\tattributes in the data set. (default = 1)", "S", 1, "-S <num>"));
201
newVector.addElement(new Option(
202
"\tSubset-evaluator used for subset-size determination." + "-- -M",
203
"E", 1, "-E <subset evaluator>"));
204
newVector.addElement(new Option("\tNumber of cross validation folds" +
205
"\n\tfor subset size determination (default = 5).", "F", 1, "-F <num>"));
206
newVector.addElement(new Option("\tSeed for cross validation" +
207
"\n\tsubset size determination. (default = 1)", "R", 1, "-R <num>"));
208
newVector.addElement(new Option("\tverbose on/off", "Z", 0, "-Z"));
210
if ((m_setSizeEval != null) && (m_setSizeEval instanceof OptionHandler)) {
211
newVector.addElement(new Option("", "", 0,
212
"\nOptions specific to " + "evaluator " +
213
m_setSizeEval.getClass().getName() + ":"));
215
Enumeration enu = ((OptionHandler) m_setSizeEval).listOptions();
217
while (enu.hasMoreElements()) {
218
newVector.addElement(enu.nextElement());
222
return newVector.elements();
226
* Parses a given list of options.
232
* Perform initial ranking to select top-ranked attributes.
236
* Number of top-ranked attributes that are taken into account.
239
* -T <0 = fixed-set | 1 = fixed-width> <br>
240
* Typ of Linear Forward Selection (default = 0).
244
* Size of lookup cache for evaluated subsets. Expressed as a multiple of
245
* the number of attributes in the data set. (default = 1).
249
* class name of subset evaluator to use for subset size determination
250
* (default = null, same subset evaluator as for ranking and final forward
251
* selection is used). Place any evaluator options LAST on the command line
252
* following a "--". eg. -A weka.attributeSelection.ClassifierSubsetEval ... --
258
* Number of cross validation folds for subset size determination (default =
263
* Seed for cross validation subset size determination. (default = 1)
271
* the list of options as an array of strings
272
* @exception Exception
273
* if an option is not supported
276
public void setOptions(String[] options) throws Exception {
280
setPerformRanking(Utils.getFlag('I', options));
282
optionString = Utils.getOption('K', options);
284
if (optionString.length() != 0) {
285
setNumUsedAttributes(Integer.parseInt(optionString));
288
optionString = Utils.getOption('T', options);
290
if (optionString.length() != 0) {
291
setType(new SelectedTag(Integer.parseInt(optionString), TAGS_TYPE));
293
setType(new SelectedTag(TYPE_FIXED_SET, TAGS_TYPE));
296
optionString = Utils.getOption('S', options);
298
if (optionString.length() != 0) {
299
setLookupCacheSize(Integer.parseInt(optionString));
302
optionString = Utils.getOption('E', options);
304
if (optionString.length() == 0) {
306
"No subset size evaluator given, using evaluator that is used for final search.");
307
m_setSizeEval = null;
309
setSubsetSizeEvaluator(ASEvaluation.forName(optionString,
310
Utils.partitionOptions(options)));
313
optionString = Utils.getOption('F', options);
315
if (optionString.length() != 0) {
316
setNumSubsetSizeCVFolds(Integer.parseInt(optionString));
319
optionString = Utils.getOption('R', options);
321
if (optionString.length() != 0) {
322
setSeed(Integer.parseInt(optionString));
325
m_verbose = Utils.getFlag('Z', options);
329
* Set the maximum size of the evaluated subset cache (hashtable). This is
330
* expressed as a multiplier for the number of attributes in the data set.
334
* the maximum size of the hashtable
336
public void setLookupCacheSize(int size) {
343
* Return the maximum size of the evaluated subset cache (expressed as a
344
* multiplier for the number of attributes in a data set.
346
* @return the maximum size of the hashtable.
348
public int getLookupCacheSize() {
353
* Returns the tip text for this property
355
* @return tip text for this property suitable for displaying in the
356
* explorer/experimenter gui
358
public String lookupCacheSizeTipText() {
359
return "Set the maximum size of the lookup cache of evaluated subsets. This is " +
360
"expressed as a multiplier of the number of attributes in the data set. " +
365
* Returns the tip text for this property
367
* @return tip text for this property suitable for displaying in the
368
* explorer/experimenter gui
370
public String performRankingTipText() {
371
return "Perform initial ranking to select top-ranked attributes.";
375
* Perform initial ranking to select top-ranked attributes.
378
* true if initial ranking should be performed
380
public void setPerformRanking(boolean b) {
381
m_performRanking = b;
385
* Get boolean if initial ranking should be performed to select the
386
* top-ranked attributes
388
* @return true if initial ranking should be performed
390
public boolean getPerformRanking() {
391
return m_performRanking;
395
* Returns the tip text for this property
397
* @return tip text for this property suitable for displaying in the
398
* explorer/experimenter gui
400
public String numUsedAttributesTipText() {
401
return "Set the amount of top-ranked attributes that are taken into account by the search process.";
405
* Set the number of top-ranked attributes that taken into account by the
409
* the number of attributes
410
* @exception Exception
411
* if k is less than 2
413
public void setNumUsedAttributes(int k) throws Exception {
415
throw new Exception("Value of -K must be >= 2.");
418
m_numUsedAttributes = k;
422
* Get the number of top-ranked attributes that taken into account by the
425
* @return the number of top-ranked attributes that taken into account
427
public int getNumUsedAttributes() {
428
return m_numUsedAttributes;
432
* Returns the tip text for this property
434
* @return tip text for this property suitable for displaying in the
435
* explorer/experimenter gui
437
public String typeTipText() {
438
return "Set the type of the search.";
445
* the Linear Forward Selection type
447
public void setType(SelectedTag t) {
448
if (t.getTags() == TAGS_TYPE) {
449
m_linearSelectionType = t.getSelectedTag().getID();
456
* @return the Linear Forward Selection type
458
public SelectedTag getType() {
459
return new SelectedTag(m_linearSelectionType, TAGS_TYPE);
463
* Returns the tip text for this property
465
* @return tip text for this property suitable for displaying in the
466
* explorer/experimenter gui
468
public String subsetSizeEvaluatorTipText() {
469
return "Subset evaluator to use for subset size determination.";
473
* Set the subset evaluator to use for subset size determination.
476
* the subset evaluator to use for subset size determination.
478
public void setSubsetSizeEvaluator(ASEvaluation eval)
480
if (!SubsetEvaluator.class.isInstance(eval)) {
481
throw new Exception(eval.getClass().getName() +
482
" is no subset evaluator.");
485
m_setSizeEval = (SubsetEvaluator) eval;
489
* Get the subset evaluator used for subset size determination.
491
* @return the evaluator used for subset size determination.
493
public ASEvaluation getSubsetSizeEvaluator() {
494
return m_setSizeEval;
498
* Returns the tip text for this property
500
* @return tip text for this property suitable for displaying in the
501
* explorer/experimenter gui
503
public String numSubsetSizeCVFoldsTipText() {
504
return "Number of cross validation folds for subset size determination";
508
* Set the number of cross validation folds for subset size determination
514
public void setNumSubsetSizeCVFolds(int f) {
519
* Get the number of cross validation folds for subset size determination
522
* @return number of folds
524
public int getNumSubsetSizeCVFolds() {
529
* Returns the tip text for this property
531
* @return tip text for this property suitable for displaying in the
532
* explorer/experimenter gui
534
public String seedTipText() {
535
return "Seed for cross validation subset size determination. (default = 1)";
539
* Seed for cross validation subset size determination. (default = 1)
544
public void setSeed(int s) {
549
* Seed for cross validation subset size determination. (default = 1)
553
public int getSeed() {
558
* Returns the tip text for this property
560
* @return tip text for this property suitable for displaying in the
561
* explorer/experimenter gui
563
public String verboseTipText() {
564
return "Turn on verbose output for monitoring the search's progress.";
568
* Set whether verbose output should be generated.
571
* true if output is to be verbose.
573
public void setVerbose(boolean b) {
578
* Get whether output is to be verbose
580
* @return true if output will be verbose
582
public boolean getVerbose() {
587
* Gets the current settings of LinearForwardSelection.
589
* @return an array of strings suitable for passing to setOptions()
591
public String[] getOptions() {
592
String[] evaluatorOptions = new String[0];
594
if ((m_setSizeEval != null) && (m_setSizeEval instanceof OptionHandler)) {
595
evaluatorOptions = ((OptionHandler) m_setSizeEval).getOptions();
598
String[] options = new String[15 + evaluatorOptions.length];
601
if (m_performRanking) {
602
options[current++] = "-I";
605
options[current++] = "-K";
606
options[current++] = "" + m_numUsedAttributes;
607
options[current++] = "-T";
608
options[current++] = "" + m_linearSelectionType;
610
options[current++] = "-F";
611
options[current++] = "" + m_numFolds;
612
options[current++] = "-S";
613
options[current++] = "" + m_seed;
615
options[current++] = "-Z";
616
options[current++] = "" + m_verbose;
618
if (m_setSizeEval != null) {
619
options[current++] = "-E";
620
options[current++] = m_setSizeEval.getClass().getName();
623
options[current++] = "--";
624
System.arraycopy(evaluatorOptions, 0, options, current,
625
evaluatorOptions.length);
626
current += evaluatorOptions.length;
628
while (current < options.length) {
629
options[current++] = "";
636
* returns a description of the search as a String
638
* @return a description of the search
640
public String toString() {
641
StringBuffer LFSString = new StringBuffer();
643
LFSString.append("\tSubset Size Forward Selection.\n");
645
LFSString.append("\tLinear Forward Selection Type: ");
647
if (m_linearSelectionType == TYPE_FIXED_SET) {
648
LFSString.append("fixed-set\n");
650
LFSString.append("fixed-width\n");
653
LFSString.append("\tNumber of top-ranked attributes that are used: " +
654
m_numUsedAttributes + "\n");
657
"\tNumber of cross validation folds for subset size determination: " +
659
LFSString.append("\tSeed for cross validation subset size determination: " +
662
LFSString.append("\tTotal number of subsets evaluated: " + m_totalEvals +
664
LFSString.append("\tMerit of best subset found: " +
665
Utils.doubleToString(Math.abs(m_bestMerit), 8, 3) + "\n");
667
return LFSString.toString();
671
* Searches the attribute subset space by subset size forward selection
674
* the attribute evaluator to guide the search
676
* the training instances.
677
* @return an array (not necessarily ordered) of selected attribute indexes
678
* @exception Exception
679
* if the search can't be completed
681
public int[] search(ASEvaluation ASEval, Instances data)
685
if (!(ASEval instanceof SubsetEvaluator)) {
686
throw new Exception(ASEval.getClass().getName() + " is not a " +
687
"Subset evaluator!");
690
if (m_setSizeEval == null) {
691
m_setSizeEval = (SubsetEvaluator) ASEval;
694
m_numAttribs = data.numAttributes();
696
if (m_numUsedAttributes > m_numAttribs) {
698
"Decreasing number of top-ranked attributes to total number of attributes: " +
699
data.numAttributes());
700
m_numUsedAttributes = m_numAttribs;
703
Instances[] trainData = new Instances[m_numFolds];
704
Instances[] testData = new Instances[m_numFolds];
705
LFSMethods[] searchResults = new LFSMethods[m_numFolds];
707
Random random = new Random(m_seed);
708
Instances dataCopy = new Instances(data);
709
dataCopy.randomize(random);
711
if (dataCopy.classAttribute().isNominal()) {
712
dataCopy.stratify(m_numFolds);
715
for (int f = 0; f < m_numFolds; f++) {
716
trainData[f] = dataCopy.trainCV(m_numFolds, f, random);
717
testData[f] = dataCopy.testCV(m_numFolds, f);
720
LFSMethods LSF = new LFSMethods();
724
if (m_performRanking) {
725
((SubsetEvaluator) ASEval).buildEvaluator(data);
726
ranking = LSF.rankAttributes(data, (SubsetEvaluator) ASEval, m_verbose);
728
ranking = new int[m_numAttribs];
730
for (int i = 0; i < ranking.length; i++) {
735
int maxSubsetSize = 0;
737
for (int f = 0; f < m_numFolds; f++) {
739
System.out.println("perform search on internal fold: " + (f + 1) + "/" +
743
m_setSizeEval.buildEvaluator(trainData[f]);
744
searchResults[f] = new LFSMethods();
745
searchResults[f].forwardSearch(m_cacheSize, new BitSet(m_numAttribs),
746
ranking, m_numUsedAttributes,
747
m_linearSelectionType == TYPE_FIXED_WIDTH, 1, -1, trainData[f],
748
m_setSizeEval, m_verbose);
750
maxSubsetSize = Math.max(maxSubsetSize,
751
searchResults[f].getBestGroup().cardinality());
756
"continue searches on internal folds to maxSubsetSize (" +
757
maxSubsetSize + ")");
760
for (int f = 0; f < m_numFolds; f++) {
762
System.out.print("perform search on internal fold: " + (f + 1) + "/" +
763
m_numFolds + " with starting set ");
764
LFSMethods.printGroup(searchResults[f].getBestGroup(),
765
trainData[f].numAttributes());
768
if (searchResults[f].getBestGroup().cardinality() < maxSubsetSize) {
769
m_setSizeEval.buildEvaluator(trainData[f]);
770
searchResults[f].forwardSearch(m_cacheSize,
771
searchResults[f].getBestGroup(), ranking, m_numUsedAttributes,
772
m_linearSelectionType == TYPE_FIXED_WIDTH, 1, maxSubsetSize,
773
trainData[f], m_setSizeEval, m_verbose);
777
double[][] testMerit = new double[m_numFolds][maxSubsetSize + 1];
779
for (int f = 0; f < m_numFolds; f++) {
780
for (int s = 1; s <= maxSubsetSize; s++) {
781
if (HoldOutSubsetEvaluator.class.isInstance(m_setSizeEval)) {
782
m_setSizeEval.buildEvaluator(trainData[f]);
783
testMerit[f][s] = ((HoldOutSubsetEvaluator) m_setSizeEval).evaluateSubset(searchResults[f].getBestGroupOfSize(
786
m_setSizeEval.buildEvaluator(testData[f]);
787
testMerit[f][s] = m_setSizeEval.evaluateSubset(searchResults[f].getBestGroupOfSize(
793
double[] avgTestMerit = new double[maxSubsetSize + 1];
794
int finalSubsetSize = -1;
796
for (int s = 1; s <= maxSubsetSize; s++) {
797
for (int f = 0; f < m_numFolds; f++) {
798
avgTestMerit[s] = ((avgTestMerit[s] * f) + testMerit[f][s]) / (double) (f +
802
if ((finalSubsetSize == -1) ||
803
(avgTestMerit[s] > avgTestMerit[finalSubsetSize])) {
808
System.out.println("average merit for subset-size " + s + ": " +
814
System.out.println("performing final forward selection to subset-size: " +
818
((SubsetEvaluator) ASEval).buildEvaluator(data);
819
LSF.forwardSearch(m_cacheSize, new BitSet(m_numAttribs), ranking,
820
m_numUsedAttributes, m_linearSelectionType == TYPE_FIXED_WIDTH, 1,
821
finalSubsetSize, data, (SubsetEvaluator) ASEval, m_verbose);
823
m_totalEvals = LSF.getNumEvalsTotal();
824
m_bestMerit = LSF.getBestMerit();
826
return attributeList(LSF.getBestGroup());
830
* Reset options to default values
832
protected void resetOptions() {
833
m_performRanking = true;
834
m_numUsedAttributes = 50;
835
m_linearSelectionType = TYPE_FIXED_SET;
836
m_setSizeEval = new ClassifierSubsetEval();
845
* converts a BitSet into a list of attribute indexes
848
* the BitSet to convert
849
* @return an array of attribute indexes
851
protected int[] attributeList(BitSet group) {
854
// count how many were selected
855
for (int i = 0; i < m_numAttribs; i++) {
861
int[] list = new int[count];
864
for (int i = 0; i < m_numAttribs; i++) {