2
* This program is free software; you can redistribute it and/or modify
3
* it under the terms of the GNU General Public License as published by
4
* the Free Software Foundation; either version 2 of the License, or
5
* (at your option) any later version.
7
* This program is distributed in the hope that it will be useful,
8
* but WITHOUT ANY WARRANTY; without even the implied warranty of
9
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10
* GNU General Public License for more details.
12
* You should have received a copy of the GNU General Public License
13
* along with this program; if not, write to the Free Software
14
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19
* Copyright (C) 2004 University of Waikato, Hamilton, New Zealand
23
package weka.attributeSelection;
25
import weka.core.Instances;
26
import weka.core.Option;
27
import weka.core.OptionHandler;
28
import weka.core.Range;
29
import weka.core.Utils;
31
import java.util.BitSet;
32
import java.util.Enumeration;
33
import java.util.Vector;
36
<!-- globalinfo-start -->
37
* GreedyStepwise :<br/>
39
* Performs a greedy forward or backward search through the space of attribute subsets. May start with no/all attributes or from an arbitrary point in the space. Stops when the addition/deletion of any remaining attributes results in a decrease in evaluation. Can also produce a ranked list of attributes by traversing the space from one side to the other and recording the order that attributes are selected.<br/>
41
<!-- globalinfo-end -->
43
<!-- options-start -->
44
* Valid options are: <p/>
47
* Use conservative forward search</pre>
50
* Use a backward search instead of a
53
* <pre> -P <start set>
54
* Specify a starting set of attributes.
58
* Produce a ranked list of attributes.</pre>
60
* <pre> -T <threshold>
61
* Specify a threshold by which attributes
62
* may be discarded from the ranking.
63
* Use in conjunction with -R</pre>
65
* <pre> -N <num to select>
66
* Specify number of attributes to select</pre>
71
* @version $Revision: 1.9 $
73
public class GreedyStepwise
75
implements RankedOutputSearch, StartSetHandler, OptionHandler {
77
/** for serialization */
78
static final long serialVersionUID = -6312951970168325471L;
80
/** does the data have a class */
81
protected boolean m_hasClass;
83
/** holds the class index */
84
protected int m_classIndex;
86
/** number of attributes in the data */
87
protected int m_numAttribs;
89
/** true if the user has requested a ranked list of attributes */
90
protected boolean m_rankingRequested;
93
/** go from one side of the search space to the other in order to generate a ranking */
96
protected boolean m_doRank;
98
/** used to indicate whether or not ranking has been performed */
99
protected boolean m_doneRanking;
102
/**
 * A threshold by which to discard attributes---used by the
 * AttributeSelection module
 */
105
protected double m_threshold;
107
/** The number of attributes to select. -1 indicates that all attributes
108
are to be retained. Has precedence over m_threshold */
109
protected int m_numToSelect = -1;
111
protected int m_calculatedNumToSelect;
113
/** the merit of the best subset found */
114
protected double m_bestMerit;
116
/** a ranked list of attribute indexes */
117
protected double [][] m_rankedAtts;
118
protected int m_rankedSoFar;
120
/** the best subset found */
121
protected BitSet m_best_group;
122
protected ASEvaluation m_ASEval;
124
protected Instances m_Instances;
126
/** holds the start set for the search as a Range */
127
protected Range m_startRange;
129
/** holds an array of starting attributes */
130
protected int [] m_starting;
132
/** Use a backwards search instead of a forwards one */
133
protected boolean m_backward = false;
135
/** If set then attributes will continue to be added during a forward
136
search as long as the merit does not degrade */
137
protected boolean m_conservativeSelection = false;
142
public GreedyStepwise () {
143
m_threshold = -Double.MAX_VALUE;
144
m_doneRanking = false;
145
m_startRange = new Range();
151
* Returns a string describing this search method
152
* @return a description of the search suitable for
153
* displaying in the explorer/experimenter gui
155
public String globalInfo() {
156
return "GreedyStepwise :\n\nPerforms a greedy forward or backward search "
158
+"the space of attribute subsets. May start with no/all attributes or from "
159
+"an arbitrary point in the space. Stops when the addition/deletion of any "
160
+"remaining attributes results in a decrease in evaluation. "
161
+"Can also produce a ranked list of "
162
+"attributes by traversing the space from one side to the other and "
163
+"recording the order that attributes are selected.\n";
167
* Returns the tip text for this property
168
* @return tip text for this property suitable for
169
* displaying in the explorer/experimenter gui
171
public String searchBackwardsTipText() {
172
return "Search backwards rather than forwards.";
176
* Set whether to search backwards instead of forwards
178
* @param back true to search backwards
180
public void setSearchBackwards(boolean back) {
183
setGenerateRanking(false);
188
* Get whether to search backwards
190
* @return true if the search will proceed backwards
192
public boolean getSearchBackwards() {
197
* Returns the tip text for this property
198
* @return tip text for this property suitable for
199
* displaying in the explorer/experimenter gui
201
public String thresholdTipText() {
202
return "Set threshold by which attributes can be discarded. Default value "
203
+ "results in no attributes being discarded. Use in conjunction with "
208
* Set the threshold by which the AttributeSelection module can discard
210
* @param threshold the threshold.
212
public void setThreshold(double threshold) {
213
m_threshold = threshold;
217
* Returns the threshold so that the AttributeSelection module can
218
* discard attributes from the ranking.
220
public double getThreshold() {
225
* Returns the tip text for this property
226
* @return tip text for this property suitable for
227
* displaying in the explorer/experimenter gui
229
public String numToSelectTipText() {
230
return "Specify the number of attributes to retain. The default value "
231
+"(-1) indicates that all attributes are to be retained. Use either "
232
+"this option or a threshold to reduce the attribute set.";
236
* Specify the number of attributes to select from the ranked list
237
* (if generating a ranking). -1
238
* indicates that all attributes are to be retained.
239
* @param n the number of attributes to retain
241
public void setNumToSelect(int n) {
246
* Gets the number of attributes to be retained.
247
* @return the number of attributes to retain
249
public int getNumToSelect() {
250
return m_numToSelect;
254
* Gets the calculated number of attributes to retain. This is the
255
* actual number of attributes to retain. This is the same as
256
* getNumToSelect if the user specifies a number which is not less
257
* than zero. Otherwise it should be the number of attributes in the
258
* (potentially transformed) data.
260
public int getCalculatedNumToSelect() {
261
if (m_numToSelect >= 0) {
262
m_calculatedNumToSelect = m_numToSelect;
264
return m_calculatedNumToSelect;
268
* Returns the tip text for this property
269
* @return tip text for this property suitable for
270
* displaying in the explorer/experimenter gui
272
public String generateRankingTipText() {
273
return "Set to true if a ranked list is required.";
277
* Records whether the user has requested a ranked list of attributes.
278
* @param doRank true if ranking is requested
280
public void setGenerateRanking(boolean doRank) {
281
m_rankingRequested = doRank;
285
* Gets whether ranking has been requested. This is used by the
286
* AttributeSelection module to determine if rankedAttributes()
288
* @return true if ranking has been requested.
290
public boolean getGenerateRanking() {
291
return m_rankingRequested;
295
* Returns the tip text for this property
296
* @return tip text for this property suitable for
297
* displaying in the explorer/experimenter gui
299
public String startSetTipText() {
300
return "Set the start point for the search. This is specified as a comma "
301
+"seperated list off attribute indexes starting at 1. It can include "
302
+"ranges. Eg. 1,2,5-9,17.";
306
* Sets a starting set of attributes for the search. It is the
307
* search method's responsibility to report this start set (if any)
308
* in its toString() method.
309
* @param startSet a string containing a list of attributes (and or ranges),
311
* @throws Exception if start set can't be set.
313
public void setStartSet (String startSet) throws Exception {
314
m_startRange.setRanges(startSet);
318
* Returns a list of attributes (and or attribute ranges) as a String
319
* @return a list of attributes (and or attribute ranges)
321
public String getStartSet () {
322
return m_startRange.getRanges();
326
* Returns the tip text for this property
327
* @return tip text for this property suitable for
328
* displaying in the explorer/experimenter gui
330
public String conservativeForwardSelectionTipText() {
331
return "If true (and forward search is selected) then attributes "
332
+"will continue to be added to the best subset as long as merit does "
337
* Set whether attributes should continue to be added during
338
* a forward search as long as merit does not decrease
339
* @param c true if atts should continue to be atted
341
public void setConservativeForwardSelection(boolean c) {
342
m_conservativeSelection = c;
346
* Gets whether conservative selection has been enabled
347
* @return true if conservative forward selection is enabled
349
public boolean getConservativeForwardSelection() {
350
return m_conservativeSelection;
354
* Returns an enumeration describing the available options.
355
* @return an enumeration of all the available options.
357
public Enumeration listOptions () {
358
Vector newVector = new Vector(5);
360
newVector.addElement(new Option("\tUse conservative forward search"
363
newVector.addElement(new Option("\tUse a backward search instead of a"
367
.addElement(new Option("\tSpecify a starting set of attributes."
370
, "-P <start set>"));
372
newVector.addElement(new Option("\tProduce a ranked list of attributes."
375
.addElement(new Option("\tSpecify a theshold by which attributes"
376
+ "\n\tmay be discarded from the ranking."
377
+"\n\tUse in conjuction with -R","T",1
378
, "-T <threshold>"));
381
.addElement(new Option("\tSpecify number of attributes to select"
383
, "-N <num to select>"));
385
return newVector.elements();
390
* Parses a given list of options. <p/>
392
<!-- options-start -->
393
* Valid options are: <p/>
396
* Use conservative forward search</pre>
399
* Use a backward search instead of a
402
* <pre> -P <start set>
403
* Specify a starting set of attributes.
407
* Produce a ranked list of attributes.</pre>
409
* <pre> -T <threshold>
410
* Specify a threshold by which attributes
411
* may be discarded from the ranking.
412
* Use in conjunction with -R</pre>
414
* <pre> -N <num to select>
415
* Specify number of attributes to select</pre>
419
* @param options the list of options as an array of strings
420
* @throws Exception if an option is not supported
422
public void setOptions (String[] options)
427
setSearchBackwards(Utils.getFlag('B', options));
429
setConservativeForwardSelection(Utils.getFlag('C', options));
431
optionString = Utils.getOption('P', options);
432
if (optionString.length() != 0) {
433
setStartSet(optionString);
436
setGenerateRanking(Utils.getFlag('R', options));
438
optionString = Utils.getOption('T', options);
439
if (optionString.length() != 0) {
441
temp = Double.valueOf(optionString);
442
setThreshold(temp.doubleValue());
445
optionString = Utils.getOption('N', options);
446
if (optionString.length() != 0) {
447
setNumToSelect(Integer.parseInt(optionString));
452
* Gets the current settings of ReliefFAttributeEval.
454
* @return an array of strings suitable for passing to setOptions()
456
public String[] getOptions () {
457
String[] options = new String[9];
460
if (getSearchBackwards()) {
461
options[current++] = "-B";
464
if (getConservativeForwardSelection()) {
465
options[current++] = "-C";
468
if (!(getStartSet().equals(""))) {
469
options[current++] = "-P";
470
options[current++] = ""+startSetToString();
473
if (getGenerateRanking()) {
474
options[current++] = "-R";
476
options[current++] = "-T";
477
options[current++] = "" + getThreshold();
479
options[current++] = "-N";
480
options[current++] = ""+getNumToSelect();
482
while (current < options.length) {
483
options[current++] = "";
489
* converts the array of starting attributes to a string. This is
490
* used by getOptions to return the actual attributes specified
491
* as the starting set. This is better than using m_startRange.getRanges()
492
* as the same start set can be specified in different ways from the
493
* command line---eg 1,2,3 == 1-3. This is to ensure that stuff that
494
* is stored in a database is comparable.
495
* @return a comma separated list of individual attribute numbers as a String
497
protected String startSetToString() {
498
StringBuffer FString = new StringBuffer();
501
if (m_starting == null) {
502
return getStartSet();
504
for (int i = 0; i < m_starting.length; i++) {
507
if ((m_hasClass == false) ||
508
(m_hasClass == true && i != m_classIndex)) {
509
FString.append((m_starting[i] + 1));
513
if (i == (m_starting.length - 1)) {
523
return FString.toString();
527
* returns a description of the search.
528
* @return a description of the search as a String.
530
public String toString() {
531
StringBuffer FString = new StringBuffer();
532
FString.append("\tGreedy Stepwise ("
535
: "forwards)")+".\n\tStart set: ");
537
if (m_starting == null) {
539
FString.append("all attributes\n");
541
FString.append("no attributes\n");
545
FString.append(startSetToString()+"\n");
547
if (!m_doneRanking) {
548
FString.append("\tMerit of best subset found: "
549
+Utils.doubleToString(Math.abs(m_bestMerit),8,3)+"\n");
552
if ((m_threshold != -Double.MAX_VALUE) && (m_doneRanking)) {
553
FString.append("\tThreshold for discarding attributes: "
554
+ Utils.doubleToString(m_threshold,8,4)+"\n");
557
return FString.toString();
562
* Searches the attribute subset space by forward selection.
564
* @param ASEval the attribute evaluator to guide the search
565
* @param data the training instances.
566
* @return an array (not necessarily ordered) of selected attribute indexes
567
* @throws Exception if the search can't be completed
569
public int[] search (ASEvaluation ASEval, Instances data)
573
double best_merit = -Double.MAX_VALUE;
574
double temp_best,temp_merit;
578
if (data != null) { // this is a fresh run so reset
584
m_numAttribs = m_Instances.numAttributes();
586
if (m_best_group == null) {
587
m_best_group = new BitSet(m_numAttribs);
590
if (!(m_ASEval instanceof SubsetEvaluator)) {
591
throw new Exception(m_ASEval.getClass().getName()
593
+ "Subset evaluator!");
596
m_startRange.setUpper(m_numAttribs-1);
597
if (!(getStartSet().equals(""))) {
598
m_starting = m_startRange.getSelection();
601
if (m_ASEval instanceof UnsupervisedSubsetEvaluator) {
607
m_classIndex = m_Instances.classIndex();
610
SubsetEvaluator ASEvaluator = (SubsetEvaluator)m_ASEval;
612
if (m_rankedAtts == null) {
613
m_rankedAtts = new double[m_numAttribs][2];
617
// If a starting subset has been supplied, then initialise the bitset
618
if (m_starting != null && m_rankedSoFar <= 0) {
619
for (i = 0; i < m_starting.length; i++) {
620
if ((m_starting[i]) != m_classIndex) {
621
m_best_group.set(m_starting[i]);
625
if (m_backward && m_rankedSoFar <= 0) {
626
for (i = 0; i < m_numAttribs; i++) {
627
if (i != m_classIndex) {
634
// Evaluate the initial subset
635
best_merit = ASEvaluator.evaluateSubset(m_best_group);
638
boolean done = false;
639
boolean addone = false;
642
temp_group = (BitSet)m_best_group.clone();
643
temp_best = best_merit;
645
temp_best = -Double.MAX_VALUE;
649
for (i=0;i<m_numAttribs;i++) {
651
z = ((i != m_classIndex) && (temp_group.get(i)));
653
z = ((i != m_classIndex) && (!temp_group.get(i)));
662
temp_merit = ASEvaluator.evaluateSubset(temp_group);
664
z = (temp_merit >= temp_best);
666
if (m_conservativeSelection) {
667
z = (temp_merit >= temp_best);
669
z = (temp_merit > temp_best);
674
temp_best = temp_merit;
680
// unset this addition/deletion
693
m_best_group.clear(temp_index);
695
m_best_group.set(temp_index);
697
best_merit = temp_best;
698
m_rankedAtts[m_rankedSoFar][0] = temp_index;
699
m_rankedAtts[m_rankedSoFar][1] = best_merit;
703
m_bestMerit = best_merit;
704
return attributeList(m_best_group);
708
* Produces a ranked list of attributes. Search must have been performed
709
* prior to calling this function. Search is called by this function to
710
* complete the traversal of the search space. A list of
711
* attributes and merits are returned. The attributes are ranked by the
712
* order they are added to the subset during a forward selection search.
713
* Individual merit values reflect the merit associated with adding the
714
* corresponding attribute to the subset; because of this, merit values
715
* may initially increase but then decrease as the best subset is
716
* "passed by" on the way to the far side of the search space.
718
* @return an array of attribute indexes and associated merit values
719
* @throws Exception if something goes wrong.
721
public double [][] rankedAttributes() throws Exception {
723
if (m_rankedAtts == null || m_rankedSoFar == -1) {
724
throw new Exception("Search must be performed before attributes "
729
search (m_ASEval, null);
731
double [][] final_rank = new double [m_rankedSoFar][2];
732
for (int i=0;i<m_rankedSoFar;i++) {
733
final_rank[i][0] = m_rankedAtts[i][0];
734
final_rank[i][1] = m_rankedAtts[i][1];
738
m_doneRanking = true;
740
if (m_numToSelect > final_rank.length) {
741
throw new Exception("More attributes requested than exist in the data");
744
if (m_numToSelect <= 0) {
745
if (m_threshold == -Double.MAX_VALUE) {
746
m_calculatedNumToSelect = final_rank.length;
748
determineNumToSelectFromThreshold(final_rank);
755
private void determineNumToSelectFromThreshold(double [][] ranking) {
757
for (int i = 0; i < ranking.length; i++) {
758
if (ranking[i][1] > m_threshold) {
762
m_calculatedNumToSelect = count;
766
* converts a BitSet into a list of attribute indexes
767
* @param group the BitSet to convert
768
* @return an array of attribute indexes
770
protected int[] attributeList (BitSet group) {
773
// count how many were selected
774
for (int i = 0; i < m_numAttribs; i++) {
780
int[] list = new int[count];
783
for (int i = 0; i < m_numAttribs; i++) {
795
protected void resetOptions() {