2
* This program is free software; you can redistribute it and/or modify
3
* it under the terms of the GNU General Public License as published by
4
* the Free Software Foundation; either version 2 of the License, or
5
* (at your option) any later version.
7
* This program is distributed in the hope that it will be useful,
8
* but WITHOUT ANY WARRANTY; without even the implied warranty of
9
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10
* GNU General Public License for more details.
12
* You should have received a copy of the GNU General Public License
13
* along with this program; if not, write to the Free Software
14
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18
* RELEASE INFORMATION (December 27, 2004)
21
* Template obtained from Weka
22
* Developed for Weka by Zheng Alan Zhao
25
* FCBF algorithm is a feature selection method based on Symmetrical Uncertainty Measurement for
26
* relevance redundancy analysis. The details of FCBF algorithm are in:
28
<!-- technical-plaintext-start -->
29
* Lei Yu, Huan Liu: Feature Selection for High-Dimensional Data: A Fast Correlation-Based Filter Solution. In: Proceedings of the Twentieth International Conference on Machine Learning, 856-863, 2003.
30
<!-- technical-plaintext-end -->
35
* For algorithm implementation:
36
* Zheng Zhao: zhaozheng at asu.edu
39
* Lei Yu: leiyu at asu.edu
40
* Huan Liu: hliu at asu.edu
42
* Data Mining and Machine Learning Lab
43
* Computer Science and Engineering Department
44
* Fulton School of Engineering
45
* Arizona State University
50
* Copyright (C) 2004 Data Mining and Machine Learning Lab,
51
* Computer Science and Engineering Department,
52
* Fulton School of Engineering,
53
* Arizona State University
58
package weka.attributeSelection;
60
import weka.core.Instances;
61
import weka.core.Option;
62
import weka.core.OptionHandler;
63
import weka.core.Range;
64
import weka.core.TechnicalInformation;
65
import weka.core.TechnicalInformation.Type;
66
import weka.core.TechnicalInformation.Field;
67
import weka.core.TechnicalInformationHandler;
68
import weka.core.Utils;
70
import java.util.Enumeration;
71
import java.util.Vector;
74
<!-- globalinfo-start -->
77
* Feature selection method based on correlation measure and relevance&redundancy analysis. Use in conjunction with an attribute set evaluator (SymmetricalUncertAttributeEval).<br/>
79
* For more information see:<br/>
81
* Lei Yu, Huan Liu: Feature Selection for High-Dimensional Data: A Fast Correlation-Based Filter Solution. In: Proceedings of the Twentieth International Conference on Machine Learning, 856-863, 2003.
83
<!-- globalinfo-end -->
85
<!-- technical-bibtex-start -->
88
* @inproceedings{Yu2003,
89
* author = {Lei Yu and Huan Liu},
90
* booktitle = {Proceedings of the Twentieth International Conference on Machine Learning},
92
* publisher = {AAAI Press},
93
* title = {Feature Selection for High-Dimensional Data: A Fast Correlation-Based Filter Solution},
98
<!-- technical-bibtex-end -->
100
<!-- options-start -->
101
* Valid options are: <p/>
103
* <pre> -D <create dataset>
104
* Specify Whether the selector generates a new dataset.</pre>
106
* <pre> -P <start set>
107
* Specify a starting set of attributes.
109
* Any starting attributes specified are
110
* ignored during the ranking.</pre>
112
* <pre> -T <threshold>
113
* Specify a theshold by which attributes
114
* may be discarded from the ranking.</pre>
116
* <pre> -N <num to select>
117
* Specify number of attributes to select</pre>
121
* @author Zheng Zhao: zhaozheng at asu.edu
122
* @version $Revision: 1.6 $
124
public class FCBFSearch
126
implements RankedOutputSearch, StartSetHandler, OptionHandler,
127
TechnicalInformationHandler {
129
/** for serialization */
130
static final long serialVersionUID = 8209699587428369942L;
132
/** Holds the starting set as an array of attributes */
133
private int[] m_starting;
135
/** Holds the start set for the search as a range */
136
private Range m_startRange;
138
/** Holds the ordered list of attributes */
139
private int[] m_attributeList;
141
/** Holds the list of attribute merit scores */
142
private double[] m_attributeMerit;
144
/** Data has class attribute---if unsupervised evaluator then no class */
145
private boolean m_hasClass;
147
/** Class index of the data if supervised evaluator */
148
private int m_classIndex;
150
/** The number of attributes */
151
private int m_numAttribs;
154
* A threshold by which to discard attributes---used by the
155
* AttributeSelection module
157
private double m_threshold;
159
/** The number of attributes to select. -1 indicates that all attributes
160
are to be retained. Has precedence over m_threshold */
161
private int m_numToSelect = -1;
163
/** Used to compute the number to select */
164
private int m_calculatedNumToSelect = -1;
166
/*-----------------add begin 2004-11-15 by alan-----------------*/
167
/** Used to determine whether we create a new dataset according to the selected features */
168
private boolean m_generateOutput = false;
170
/** Used to store the ref of the Evaluator we use*/
171
private ASEvaluation m_asEval;
173
/** Holds the list of attribute merit scores generated by FCBF */
174
private double[][] m_rankedFCBF;
176
/** Hold the list of selected features*/
177
private double[][] m_selectedFeatures;
178
/*-----------------add end 2004-11-15 by alan-----------------*/
181
* Returns a string describing this search method
182
* @return a description of the search suitable for
183
* displaying in the explorer/experimenter gui
185
public String globalInfo() {
187
"FCBF : \n\nFeature selection method based on correlation measure"
188
+ "and relevance&redundancy analysis. "
189
+ "Use in conjunction with an attribute set evaluator (SymmetricalUncertAttributeEval).\n\n"
190
+ "For more information see:\n\n"
191
+ getTechnicalInformation().toString();
195
* Returns an instance of a TechnicalInformation object, containing
196
* detailed information about the technical background of this class,
197
* e.g., paper reference or book this class is based on.
199
* @return the technical information about this class
201
public TechnicalInformation getTechnicalInformation() {
202
TechnicalInformation result;
204
result = new TechnicalInformation(Type.INPROCEEDINGS);
205
result.setValue(Field.AUTHOR, "Lei Yu and Huan Liu");
206
result.setValue(Field.TITLE, "Feature Selection for High-Dimensional Data: A Fast Correlation-Based Filter Solution");
207
result.setValue(Field.BOOKTITLE, "Proceedings of the Twentieth International Conference on Machine Learning");
208
result.setValue(Field.YEAR, "2003");
209
result.setValue(Field.PAGES, "856-863");
210
result.setValue(Field.PUBLISHER, "AAAI Press");
218
public FCBFSearch () {
223
* Returns the tip text for this property
224
* @return tip text for this property suitable for
225
* displaying in the explorer/experimenter gui
227
public String numToSelectTipText() {
228
return "Specify the number of attributes to retain. The default value "
229
+"(-1) indicates that all attributes are to be retained. Use either "
230
+"this option or a threshold to reduce the attribute set.";
234
* Specify the number of attributes to select from the ranked list. -1
235
* indicates that all attributes are to be retained.
236
* @param n the number of attributes to retain
238
public void setNumToSelect(int n) {
243
* Gets the number of attributes to be retained.
244
* @return the number of attributes to retain
246
public int getNumToSelect() {
247
return m_numToSelect;
251
* Gets the calculated number to select. This might be computed
252
* from a threshold, or if < 0 is set as the number to select then
253
* it is set to the number of attributes in the (transformed) data.
254
* @return the calculated number of attributes to select
256
public int getCalculatedNumToSelect() {
257
if (m_numToSelect >= 0) {
258
m_calculatedNumToSelect = m_numToSelect;
260
if (m_selectedFeatures.length>0
261
&& m_selectedFeatures.length<m_calculatedNumToSelect)
263
m_calculatedNumToSelect = m_selectedFeatures.length;
266
return m_calculatedNumToSelect;
270
* Returns the tip text for this property
271
* @return tip text for this property suitable for
272
* displaying in the explorer/experimenter gui
274
public String thresholdTipText() {
275
return "Set threshold by which attributes can be discarded. Default value "
276
+ "results in no attributes being discarded. Use either this option or "
277
+"numToSelect to reduce the attribute set.";
281
* Set the threshold by which the AttributeSelection module can discard
283
* @param threshold the threshold.
285
public void setThreshold(double threshold) {
286
m_threshold = threshold;
290
* Returns the threshold so that the AttributeSelection module can
291
* discard attributes from the ranking.
292
* @return the threshold
294
public double getThreshold() {
299
* Returns the tip text for this property
300
* @return tip text for this property suitable for
301
* displaying in the explorer/experimenter gui
303
public String generateRankingTipText() {
304
return "A constant option. FCBF is capable of generating"
305
+" attribute rankings.";
309
* This is a dummy set method---Ranker is ONLY capable of producing
310
* a ranked list of attributes for attribute evaluators.
311
* @param doRank this parameter is N/A and is ignored
313
public void setGenerateRanking(boolean doRank) {
317
* This is a dummy method. Ranker can ONLY be used with attribute
318
* evaluators and as such can only produce a ranked list of attributes
319
* @return true all the time.
321
public boolean getGenerateRanking() {
326
* Returns the tip text for this property
327
* @return tip text for this property suitable for
328
* displaying in the explorer/experimenter gui
331
public String generateDataOutputTipText() {
332
return "Generating new dataset according to the selected features."
337
* Sets the flag, by which the AttributeSelection module decide
338
* whether create a new dataset according to the selected features.
339
* @param doGenerate the flag, by which the AttributeSelection module
340
* decide whether create a new dataset according to the selected
343
public void setGenerateDataOutput(boolean doGenerate) {
344
this.m_generateOutput = doGenerate;
349
* Returns the flag, by which the AttributeSelection module decide
350
* whether create a new dataset according to the selected features.
351
* @return the flag, by which the AttributeSelection module decide
352
* whether create a new dataset according to the selected features.
354
public boolean getGenerateDataOutput() {
355
return this.m_generateOutput;
359
* Returns the tip text for this property
360
* @return tip text for this property suitable for
361
* displaying in the explorer/experimenter gui
363
public String startSetTipText() {
364
return "Specify a set of attributes to ignore. "
365
+" When generating the ranking, FCBF will not evaluate the attributes "
367
+"This is specified as a comma "
368
+"seperated list off attribute indexes starting at 1. It can include "
369
+"ranges. Eg. 1,2,5-9,17.";
373
* Sets a starting set of attributes for the search. It is the
374
* search method's responsibility to report this start set (if any)
375
* in its toString() method.
376
* @param startSet a string containing a list of attributes (and or ranges),
378
* @throws Exception if start set can't be set.
380
public void setStartSet (String startSet) throws Exception {
381
m_startRange.setRanges(startSet);
385
* Returns a list of attributes (and or attribute ranges) as a String
386
* @return a list of attributes (and or attribute ranges)
388
public String getStartSet () {
389
return m_startRange.getRanges();
393
* Returns an enumeration describing the available options.
394
* @return an enumeration of all the available options.
396
public Enumeration listOptions () {
397
Vector newVector = new Vector(4);
399
newVector.addElement(new Option(
400
"\tSpecify Whether the selector generates a new dataset.",
401
"D", 1, "-D <create dataset>"));
403
newVector.addElement(new Option(
404
"\tSpecify a starting set of attributes.\n"
405
+ "\t\tEg. 1,3,5-7.\n"
406
+ "\tAny starting attributes specified are\n"
407
+ "\tignored during the ranking.",
408
"P", 1 , "-P <start set>"));
410
newVector.addElement(new Option(
411
"\tSpecify a theshold by which attributes\n"
412
+ "\tmay be discarded from the ranking.",
413
"T", 1, "-T <threshold>"));
415
newVector.addElement(new Option(
416
"\tSpecify number of attributes to select",
417
"N", 1, "-N <num to select>"));
419
return newVector.elements();
424
* Parses a given list of options. <p/>
426
<!-- options-start -->
427
* Valid options are: <p/>
429
* <pre> -D <create dataset>
430
* Specify Whether the selector generates a new dataset.</pre>
432
* <pre> -P <start set>
433
* Specify a starting set of attributes.
435
* Any starting attributes specified are
436
* ignored during the ranking.</pre>
438
* <pre> -T <threshold>
439
* Specify a theshold by which attributes
440
* may be discarded from the ranking.</pre>
442
* <pre> -N <num to select>
443
* Specify number of attributes to select</pre>
447
* @param options the list of options as an array of strings
448
* @throws Exception if an option is not supported
451
public void setOptions (String[] options)
456
optionString = Utils.getOption('D', options);
457
if (optionString.length() != 0) {
458
setGenerateDataOutput(Boolean.getBoolean(optionString));
461
optionString = Utils.getOption('P', options);
462
if (optionString.length() != 0) {
463
setStartSet(optionString);
466
optionString = Utils.getOption('T', options);
467
if (optionString.length() != 0) {
469
temp = Double.valueOf(optionString);
470
setThreshold(temp.doubleValue());
473
optionString = Utils.getOption('N', options);
474
if (optionString.length() != 0) {
475
setNumToSelect(Integer.parseInt(optionString));
480
* Gets the current settings of ReliefFAttributeEval.
482
* @return an array of strings suitable for passing to setOptions()
484
public String[] getOptions () {
485
String[] options = new String[8];
488
options[current++] = "-D";
489
options[current++] = ""+getGenerateDataOutput();
491
if (!(getStartSet().equals(""))) {
492
options[current++] = "-P";
493
options[current++] = ""+startSetToString();
496
options[current++] = "-T";
497
options[current++] = "" + getThreshold();
499
options[current++] = "-N";
500
options[current++] = ""+getNumToSelect();
502
while (current < options.length) {
503
options[current++] = "";
509
* converts the array of starting attributes to a string. This is
510
* used by getOptions to return the actual attributes specified
511
* as the starting set. This is better than using m_startRanges.getRanges()
512
* as the same start set can be specified in different ways from the
513
* command line---eg 1,2,3 == 1-3. This is to ensure that stuff that
514
* is stored in a database is comparable.
515
* @return a comma seperated list of individual attribute numbers as a String
517
private String startSetToString() {
518
StringBuffer FString = new StringBuffer();
521
if (m_starting == null) {
522
return getStartSet();
525
for (int i = 0; i < m_starting.length; i++) {
528
if ((m_hasClass == false) ||
529
(m_hasClass == true && i != m_classIndex)) {
530
FString.append((m_starting[i] + 1));
534
if (i == (m_starting.length - 1)) {
544
return FString.toString();
548
* Kind of a dummy search algorithm. Calls a Attribute evaluator to
549
* evaluate each attribute not included in the startSet and then sorts
550
* them to produce a ranked list of attributes.
552
* @param ASEval the attribute evaluator to guide the search
553
* @param data the training instances.
554
* @return an array (not necessarily ordered) of selected attribute indexes
555
* @throws Exception if the search can't be completed
557
public int[] search (ASEvaluation ASEval, Instances data)
561
if (!(ASEval instanceof AttributeSetEvaluator)) {
562
throw new Exception(ASEval.getClass().getName()
564
+ "Attribute Set evaluator!");
567
m_numAttribs = data.numAttributes();
569
if (ASEval instanceof UnsupervisedAttributeEvaluator) {
573
m_classIndex = data.classIndex();
574
if (m_classIndex >= 0) {
581
// get the transformed data and check to see if the transformer
582
// preserves a class index
583
if (ASEval instanceof AttributeTransformer) {
584
data = ((AttributeTransformer)ASEval).transformedHeader();
585
if (m_classIndex >= 0 && data.classIndex() >= 0) {
586
m_classIndex = data.classIndex();
592
m_startRange.setUpper(m_numAttribs - 1);
593
if (!(getStartSet().equals(""))) {
594
m_starting = m_startRange.getSelection();
598
if (m_starting != null) {
599
sl = m_starting.length;
601
if ((m_starting != null) && (m_hasClass == true)) {
602
// see if the supplied list contains the class index
604
for (i = 0; i < sl; i++) {
605
if (m_starting[i] == m_classIndex) {
616
if (m_hasClass == true) {
622
m_attributeList = new int[m_numAttribs - sl];
623
m_attributeMerit = new double[m_numAttribs - sl];
625
// add in those attributes not in the starting (omit list)
626
for (i = 0, j = 0; i < m_numAttribs; i++) {
627
if (!inStarting(i)) {
628
m_attributeList[j++] = i;
632
this.m_asEval = ASEval;
633
AttributeSetEvaluator ASEvaluator = (AttributeSetEvaluator)ASEval;
635
for (i = 0; i < m_attributeList.length; i++) {
636
m_attributeMerit[i] = ASEvaluator.evaluateAttribute(m_attributeList[i]);
639
double[][] tempRanked = rankedAttributes();
640
int[] rankedAttributes = new int[m_selectedFeatures.length];
642
for (i = 0; i < m_selectedFeatures.length; i++) {
643
rankedAttributes[i] = (int)tempRanked[i][0];
645
return rankedAttributes;
651
* Sorts the evaluated attribute list
653
* @return an array of sorted (highest eval to lowest) attribute indexes
654
* @throws Exception of sorting can't be done.
656
public double[][] rankedAttributes ()
660
if (m_attributeList == null || m_attributeMerit == null) {
661
throw new Exception("Search must be performed before a ranked "
662
+ "attribute list can be obtained");
665
int[] ranked = Utils.sort(m_attributeMerit);
666
// reverse the order of the ranked indexes
667
double[][] bestToWorst = new double[ranked.length][2];
669
for (i = ranked.length - 1, j = 0; i >= 0; i--) {
670
bestToWorst[j++][0] = ranked[i];
671
//alan: means in the arrary ranked, varialbe is from ranked as from small to large
674
// convert the indexes to attribute indexes
675
for (i = 0; i < bestToWorst.length; i++) {
676
int temp = ((int)bestToWorst[i][0]);
677
bestToWorst[i][0] = m_attributeList[temp]; //for the index
678
bestToWorst[i][1] = m_attributeMerit[temp]; //for the value of the index
681
if (m_numToSelect > bestToWorst.length) {
682
throw new Exception("More attributes requested than exist in the data");
685
this.FCBFElimination(bestToWorst);
687
if (m_numToSelect <= 0) {
688
if (m_threshold == -Double.MAX_VALUE) {
689
m_calculatedNumToSelect = m_selectedFeatures.length;
691
determineNumToSelectFromThreshold(m_selectedFeatures);
694
/* if (m_numToSelect > 0) {
695
determineThreshFromNumToSelect(bestToWorst);
698
return m_selectedFeatures;
701
private void determineNumToSelectFromThreshold(double [][] ranking) {
703
for (int i = 0; i < ranking.length; i++) {
704
if (ranking[i][1] > m_threshold) {
708
m_calculatedNumToSelect = count;
711
private void determineThreshFromNumToSelect(double [][] ranking)
713
if (m_numToSelect > ranking.length) {
714
throw new Exception("More attributes requested than exist in the data");
717
if (m_numToSelect == ranking.length) {
721
m_threshold = (ranking[m_numToSelect-1][1] +
722
ranking[m_numToSelect][1]) / 2.0;
726
* returns a description of the search as a String
727
* @return a description of the search
729
public String toString () {
730
StringBuffer BfString = new StringBuffer();
731
BfString.append("\tAttribute ranking.\n");
733
if (m_starting != null) {
734
BfString.append("\tIgnored attributes: ");
736
BfString.append(startSetToString());
737
BfString.append("\n");
740
if (m_threshold != -Double.MAX_VALUE) {
741
BfString.append("\tThreshold for discarding attributes: "
742
+ Utils.doubleToString(m_threshold,8,4)+"\n");
745
BfString.append("\n\n");
747
BfString.append(" J || SU(j,Class) || I || SU(i,j). \n");
749
for (int i=0; i<m_rankedFCBF.length; i++)
751
BfString.append(Utils.doubleToString(m_rankedFCBF[i][0]+1,6,0)+" ; "
752
+Utils.doubleToString(m_rankedFCBF[i][1],12,7)+" ; ");
753
if (m_rankedFCBF[i][2] == m_rankedFCBF[i][0])
755
BfString.append(" *\n");
759
BfString.append(Utils.doubleToString(m_rankedFCBF[i][2] + 1,5,0) + " ; "
760
+ m_rankedFCBF[i][3] + "\n");
764
return BfString.toString();
769
* Resets stuff to default values
771
protected void resetOptions () {
773
m_startRange = new Range();
774
m_attributeList = null;
775
m_attributeMerit = null;
776
m_threshold = -Double.MAX_VALUE;
780
private boolean inStarting (int feat) {
781
// omit the class from the evaluation
782
if ((m_hasClass == true) && (feat == m_classIndex)) {
786
if (m_starting == null) {
790
for (int i = 0; i < m_starting.length; i++) {
791
if (m_starting[i] == feat) {
799
private void FCBFElimination(double[][]rankedFeatures)
804
m_rankedFCBF = new double[m_attributeList.length][4];
805
int[] attributes = new int[1];
806
int[] classAtrributes = new int[1];
808
int numSelectedAttributes = 0;
813
AttributeSetEvaluator ASEvaluator = (AttributeSetEvaluator)m_asEval;
815
for (i = 0; i < rankedFeatures.length; i++) {
816
m_rankedFCBF[i][0] = rankedFeatures[i][0];
817
m_rankedFCBF[i][1] = rankedFeatures[i][1];
818
m_rankedFCBF[i][2] = -1;
821
while (startPoint < rankedFeatures.length)
823
if (m_rankedFCBF[startPoint][2] != -1)
829
m_rankedFCBF[startPoint][2] = m_rankedFCBF[startPoint][0];
830
numSelectedAttributes++;
832
for (i = startPoint + 1; i < m_attributeList.length; i++)
834
if (m_rankedFCBF[i][2] != -1)
838
attributes[0] = (int) m_rankedFCBF[startPoint][0];
839
classAtrributes[0] = (int) m_rankedFCBF[i][0];
840
tempSUIJ = ASEvaluator.evaluateAttribute(attributes, classAtrributes);
841
if (m_rankedFCBF[i][1] < tempSUIJ || Math.abs(tempSUIJ-m_rankedFCBF[i][1])<1E-8)
843
m_rankedFCBF[i][2] = m_rankedFCBF[startPoint][0];
844
m_rankedFCBF[i][3] = tempSUIJ;
850
m_selectedFeatures = new double[numSelectedAttributes][2];
852
for (i = 0, j = 0; i < m_attributeList.length; i++)
854
if (m_rankedFCBF[i][2] == m_rankedFCBF[i][0])
856
m_selectedFeatures[j][0] = m_rankedFCBF[i][0];
857
m_selectedFeatures[j][1] = m_rankedFCBF[i][1];