2
* This program is free software; you can redistribute it and/or modify
3
* it under the terms of the GNU General Public License as published by
4
* the Free Software Foundation; either version 2 of the License, or
5
* (at your option) any later version.
7
* This program is distributed in the hope that it will be useful,
8
* but WITHOUT ANY WARRANTY; without even the implied warranty of
9
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10
* GNU General Public License for more details.
12
* You should have received a copy of the GNU General Public License
13
* along with this program; if not, write to the Free Software
14
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19
* Copyright (C) 1999 University of Waikato, Hamilton, New Zealand
23
package weka.attributeSelection;
25
import weka.core.Instances;
26
import weka.core.Option;
27
import weka.core.OptionHandler;
28
import weka.core.Range;
29
import weka.core.TechnicalInformation;
30
import weka.core.TechnicalInformation.Type;
31
import weka.core.TechnicalInformation.Field;
32
import weka.core.TechnicalInformationHandler;
33
import weka.core.Utils;
35
import java.util.BitSet;
36
import java.util.Enumeration;
37
import java.util.Random;
38
import java.util.Vector;
41
<!-- globalinfo-start -->
42
* RandomSearch : <br/>
44
* Performs a Random search in the space of attribute subsets. If no start set is supplied, Random search starts from a random point and reports the best subset found. If a start set is supplied, Random searches randomly for subsets that are as good or better than the start point with the same or or fewer attributes. Using RandomSearch in conjunction with a start set containing all attributes equates to the LVF algorithm of Liu and Setiono (ICML-96).<br/>
46
* For more information see:<br/>
48
* H. Liu, R. Setiono: A probabilistic approach to feature selection - A filter solution. In: 13th International Conference on Machine Learning, 319-327, 1996.
50
<!-- globalinfo-end -->
52
<!-- technical-bibtex-start -->
55
* @inproceedings{Liu1996,
56
* author = {H. Liu and R. Setiono},
57
* booktitle = {13th International Conference on Machine Learning},
59
* title = {A probabilistic approach to feature selection - A filter solution},
64
<!-- technical-bibtex-end -->
66
<!-- options-start -->
67
* Valid options are: <p/>
69
* <pre> -P <start set>
70
* Specify a starting set of attributes.
72
* If a start point is supplied,
73
* random search evaluates the start
74
* point and then randomly looks for
75
* subsets that are as good as or better
76
* than the start point with the same
77
* or lower cardinality.</pre>
79
* <pre> -F <percent>
80
* Percent of search space to consider.
81
* (default = 25%).</pre>
84
* Output subsets as the search progresses.
85
* (default = false).</pre>
89
* @author Mark Hall (mhall@cs.waikato.ac.nz)
90
* @version $Revision: 1.17 $
92
public class RandomSearch
94
implements StartSetHandler, OptionHandler, TechnicalInformationHandler {
96
/** for serialization */
97
static final long serialVersionUID = 7479392617377425484L;
100
* holds a starting set as an array of attributes.
102
private int[] m_starting;
104
/** holds the start set as a range */
105
private Range m_startRange;
107
/** the best feature set found during the search */
108
private BitSet m_bestGroup;
110
/** the merit of the best subset found */
111
private double m_bestMerit;
114
* only accept a feature set as being "better" than the best if its
115
* merit is better or equal to the best, and it contains fewer
116
* features than the best (this allows LVF to be implimented).
118
private boolean m_onlyConsiderBetterAndSmaller;
120
/** does the data have a class */
121
private boolean m_hasClass;
123
/** holds the class index */
124
private int m_classIndex;
126
/** number of attributes in the data */
127
private int m_numAttribs;
129
/** seed for random number generation */
132
/** percentage of the search space to consider */
133
private double m_searchSize;
135
/** the number of iterations performed */
136
private int m_iterations;
138
/** random number object */
139
private Random m_random;
141
/** output new best subsets as the search progresses */
142
private boolean m_verbose;
145
* Returns a string describing this search method
146
* @return a description of the search suitable for
147
* displaying in the explorer/experimenter gui
149
public String globalInfo() {
150
return "RandomSearch : \n\nPerforms a Random search in "
151
+"the space of attribute subsets. If no start set is supplied, Random "
152
+"search starts from a random point and reports the best subset found. "
153
+"If a start set is supplied, Random searches randomly for subsets "
154
+"that are as good or better than the start point with the same or "
155
+"or fewer attributes. Using RandomSearch in conjunction with a start "
156
+"set containing all attributes equates to the LVF algorithm of Liu "
157
+"and Setiono (ICML-96).\n\n"
158
+ "For more information see:\n\n"
159
+ getTechnicalInformation().toString();
163
* Returns an instance of a TechnicalInformation object, containing
164
* detailed information about the technical background of this class,
165
* e.g., paper reference or book this class is based on.
167
* @return the technical information about this class
169
public TechnicalInformation getTechnicalInformation() {
170
TechnicalInformation result;
172
result = new TechnicalInformation(Type.INPROCEEDINGS);
173
result.setValue(Field.AUTHOR, "H. Liu and R. Setiono");
174
result.setValue(Field.TITLE, "A probabilistic approach to feature selection - A filter solution");
175
result.setValue(Field.BOOKTITLE, "13th International Conference on Machine Learning");
176
result.setValue(Field.YEAR, "1996");
177
result.setValue(Field.PAGES, "319-327");
185
public RandomSearch () {
190
* Returns an enumeration describing the available options.
191
* @return an enumeration of all the available options.
193
public Enumeration listOptions () {
194
Vector newVector = new Vector(3);
196
newVector.addElement(new Option("\tSpecify a starting set of attributes."
198
+"\n\tIf a start point is supplied,"
199
+"\n\trandom search evaluates the start"
200
+"\n\tpoint and then randomly looks for"
201
+"\n\tsubsets that are as good as or better"
202
+"\n\tthan the start point with the same"
203
+"\n\tor lower cardinality."
205
, "-P <start set>"));
207
newVector.addElement(new Option("\tPercent of search space to consider."
208
+"\n\t(default = 25%)."
211
newVector.addElement(new Option("\tOutput subsets as the search progresses."
212
+"\n\t(default = false)."
215
return newVector.elements();
219
* Parses a given list of options. <p/>
221
<!-- options-start -->
222
* Valid options are: <p/>
224
* <pre> -P <start set>
225
* Specify a starting set of attributes.
227
* If a start point is supplied,
228
* random search evaluates the start
229
* point and then randomly looks for
230
* subsets that are as good as or better
231
* than the start point with the same
232
* or lower cardinality.</pre>
234
* <pre> -F <percent>
235
* Percent of search space to consider.
236
* (default = 25%).</pre>
239
* Output subsets as the search progresses.
240
* (default = false).</pre>
244
* @param options the list of options as an array of strings
245
* @throws Exception if an option is not supported
248
public void setOptions (String[] options)
253
optionString = Utils.getOption('P', options);
254
if (optionString.length() != 0) {
255
setStartSet(optionString);
258
optionString = Utils.getOption('F',options);
259
if (optionString.length() != 0) {
260
setSearchPercent((new Double(optionString)).doubleValue());
263
setVerbose(Utils.getFlag('V',options));
267
* Gets the current settings of RandomSearch.
268
* @return an array of strings suitable for passing to setOptions()
270
public String[] getOptions () {
271
String[] options = new String[5];
275
options[current++] = "-V";
278
if (!(getStartSet().equals(""))) {
279
options[current++] = "-P";
280
options[current++] = "" + startSetToString();
283
options[current++] = "-F";
284
options[current++] = "" + getSearchPercent();
286
while (current < options.length) {
287
options[current++] = "";
294
* Returns the tip text for this property
295
* @return tip text for this property suitable for
296
* displaying in the explorer/experimenter gui
298
public String startSetTipText() {
299
return "Set the start point for the search. This is specified as a comma "
300
+"seperated list off attribute indexes starting at 1. It can include "
301
+"ranges. Eg. 1,2,5-9,17. If specified, Random searches for subsets "
302
+"of attributes that are as good as or better than the start set with "
303
+"the same or lower cardinality.";
307
* Sets a starting set of attributes for the search. It is the
308
* search method's responsibility to report this start set (if any)
309
* in its toString() method.
310
* @param startSet a string containing a list of attributes (and or ranges),
311
* eg. 1,2,6,10-15. "" indicates no start point.
312
* If a start point is supplied, random search evaluates the
313
* start point and then looks for subsets that are as good as or better
314
* than the start point with the same or lower cardinality.
315
* @throws Exception if start set can't be set.
317
public void setStartSet (String startSet) throws Exception {
318
m_startRange.setRanges(startSet);
322
* Returns a list of attributes (and or attribute ranges) as a String
323
* @return a list of attributes (and or attribute ranges)
325
public String getStartSet () {
326
return m_startRange.getRanges();
330
* Returns the tip text for this property
331
* @return tip text for this property suitable for
332
* displaying in the explorer/experimenter gui
334
public String verboseTipText() {
335
return "Print progress information. Sends progress info to the terminal "
336
+"as the search progresses.";
340
* set whether or not to output new best subsets as the search proceeds
341
* @param v true if output is to be verbose
343
public void setVerbose(boolean v) {
348
* get whether or not output is verbose
349
* @return true if output is set to verbose
351
public boolean getVerbose() {
356
* Returns the tip text for this property
357
* @return tip text for this property suitable for
358
* displaying in the explorer/experimenter gui
360
public String searchPercentTipText() {
361
return "Percentage of the search space to explore.";
365
* set the percentage of the search space to consider
366
* @param p percent of the search space ( 0 < p <= 100)
368
public void setSearchPercent(double p) {
378
m_searchSize = (p/100.0);
382
* get the percentage of the search space to consider
383
* @return the percent of the search space explored
385
public double getSearchPercent() {
386
return m_searchSize * 100;
390
* converts the array of starting attributes to a string. This is
391
* used by getOptions to return the actual attributes specified
392
* as the starting set. This is better than using m_startRanges.getRanges()
393
* as the same start set can be specified in different ways from the
394
* command line---eg 1,2,3 == 1-3. This is to ensure that stuff that
395
* is stored in a database is comparable.
396
* @return a comma seperated list of individual attribute numbers as a String
398
private String startSetToString() {
399
StringBuffer FString = new StringBuffer();
402
if (m_starting == null) {
403
return getStartSet();
406
for (int i = 0; i < m_starting.length; i++) {
409
if ((m_hasClass == false) ||
410
(m_hasClass == true && i != m_classIndex)) {
411
FString.append((m_starting[i] + 1));
415
if (i == (m_starting.length - 1)) {
425
return FString.toString();
429
* prints a description of the search
430
* @return a description of the search as a string
432
public String toString() {
433
StringBuffer text = new StringBuffer();
435
text.append("\tRandom search.\n\tStart set: ");
436
if (m_starting == null) {
437
text.append("no attributes\n");
440
text.append(startSetToString()+"\n");
442
text.append("\tNumber of iterations: "+m_iterations+" ("
443
+(m_searchSize * 100.0)+"% of the search space)\n");
444
text.append("\tMerit of best subset found: "
445
+Utils.doubleToString(Math.abs(m_bestMerit),8,3)+"\n");
447
return text.toString();
451
* Searches the attribute subset space randomly.
453
* @param ASEval the attribute evaluator to guide the search
454
* @param data the training instances.
455
* @return an array (not necessarily ordered) of selected attribute indexes
456
* @throws Exception if the search can't be completed
458
public int[] search (ASEvaluation ASEval, Instances data)
461
int sizeOfBest = m_numAttribs;
463
m_bestGroup = new BitSet(m_numAttribs);
465
m_onlyConsiderBetterAndSmaller = false;
466
if (!(ASEval instanceof SubsetEvaluator)) {
467
throw new Exception(ASEval.getClass().getName()
469
+ "Subset evaluator!");
472
m_random = new Random(m_seed);
474
if (ASEval instanceof UnsupervisedSubsetEvaluator) {
479
m_classIndex = data.classIndex();
482
SubsetEvaluator ASEvaluator = (SubsetEvaluator)ASEval;
483
m_numAttribs = data.numAttributes();
485
m_startRange.setUpper(m_numAttribs-1);
486
if (!(getStartSet().equals(""))) {
487
m_starting = m_startRange.getSelection();
490
// If a starting subset has been supplied, then initialise the bitset
491
if (m_starting != null) {
492
for (int i = 0; i < m_starting.length; i++) {
493
if ((m_starting[i]) != m_classIndex) {
494
m_bestGroup.set(m_starting[i]);
497
m_onlyConsiderBetterAndSmaller = true;
498
best_merit = ASEvaluator.evaluateSubset(m_bestGroup);
499
sizeOfBest = countFeatures(m_bestGroup);
501
// do initial random subset
502
m_bestGroup = generateRandomSubset();
503
best_merit = ASEvaluator.evaluateSubset(m_bestGroup);
507
System.out.println("Initial subset ("
508
+Utils.doubleToString(Math.
510
+"): "+printSubset(m_bestGroup));
519
m_iterations = (int)((m_searchSize * Math.pow(2, i)));
524
for (i=0;i<m_iterations;i++) {
525
temp = generateRandomSubset();
526
if (m_onlyConsiderBetterAndSmaller) {
527
tempSize = countFeatures(temp);
528
if (tempSize <= sizeOfBest) {
529
tempMerit = ASEvaluator.evaluateSubset(temp);
530
if (tempMerit >= best_merit) {
531
sizeOfBest = tempSize;
533
best_merit = tempMerit;
535
System.out.print("New best subset ("
536
+Utils.doubleToString(Math.
538
+"): "+printSubset(m_bestGroup) + " :");
539
System.out.println(Utils.
540
doubleToString((((double)i)/
541
((double)m_iterations)*
548
tempMerit = ASEvaluator.evaluateSubset(temp);
549
if (tempMerit > best_merit) {
551
best_merit = tempMerit;
553
System.out.print("New best subset ("
554
+Utils.doubleToString(Math.abs(best_merit),8,5)
555
+"): "+printSubset(m_bestGroup) + " :");
556
System.out.println(Utils.
557
doubleToString((((double)i)/
558
((double)m_iterations)
565
m_bestMerit = best_merit;
566
return attributeList(m_bestGroup);
570
* prints a subset as a series of attribute numbers
571
* @param temp the subset to print
572
* @return a subset as a String of attribute numbers
574
private String printSubset(BitSet temp) {
575
StringBuffer text = new StringBuffer();
577
for (int j=0;j<m_numAttribs;j++) {
579
text.append((j+1)+" ");
582
return text.toString();
586
* converts a BitSet into a list of attribute indexes
587
* @param group the BitSet to convert
588
* @return an array of attribute indexes
590
private int[] attributeList (BitSet group) {
593
// count how many were selected
594
for (int i = 0; i < m_numAttribs; i++) {
600
int[] list = new int[count];
603
for (int i = 0; i < m_numAttribs; i++) {
613
* generates a random subset
614
* @return a random subset as a BitSet
616
private BitSet generateRandomSubset() {
617
BitSet temp = new BitSet(m_numAttribs);
620
for (int i=0;i<m_numAttribs;i++) {
621
r = m_random.nextDouble();
623
if (m_hasClass && i == m_classIndex) {
633
* counts the number of features in a subset
634
* @param featureSet the feature set for which to count the features
635
* @return the number of features in the subset
637
private int countFeatures(BitSet featureSet) {
639
for (int i=0;i<m_numAttribs;i++) {
640
if (featureSet.get(i)) {
650
private void resetOptions() {
652
m_startRange = new Range();
655
m_onlyConsiderBetterAndSmaller = false;