2
* This program is free software; you can redistribute it and/or modify
3
* it under the terms of the GNU General Public License as published by
4
* the Free Software Foundation; either version 2 of the License, or
5
* (at your option) any later version.
7
* This program is distributed in the hope that it will be useful,
8
* but WITHOUT ANY WARRANTY; without even the implied warranty of
9
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10
* GNU General Public License for more details.
12
* You should have received a copy of the GNU General Public License
13
* along with this program; if not, write to the Free Software
14
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19
* Copyright (C) 2003 University of Waikato, Hamilton, New Zealand
23
package weka.filters.unsupervised.attribute;
25
import weka.core.Instances;
26
import weka.core.Option;
27
import weka.core.TechnicalInformation;
28
import weka.core.TechnicalInformationHandler;
29
import weka.core.Utils;
30
import weka.core.TechnicalInformation.Field;
31
import weka.core.TechnicalInformation.Type;
33
import java.util.Enumeration;
34
import java.util.Vector;
37
<!-- globalinfo-start -->
38
* Discretizes numeric attributes using equal frequency binning, where the number of bins is equal to the square root of the number of non-missing values.<br/>
40
* For more information, see:<br/>
42
* Ying Yang, Geoffrey I. Webb: Proportional k-Interval Discretization for Naive-Bayes Classifiers. In: 12th European Conference on Machine Learning, 564-575, 2001.
44
<!-- globalinfo-end -->
46
<!-- technical-bibtex-start -->
49
* @inproceedings{Yang2001,
50
* author = {Ying Yang and Geoffrey I. Webb},
51
* booktitle = {12th European Conference on Machine Learning},
53
* publisher = {Springer},
55
* title = {Proportional k-Interval Discretization for Naive-Bayes Classifiers},
61
<!-- technical-bibtex-end -->
63
<!-- options-start -->
64
* Valid options are: <p/>
66
* <pre> -unset-class-temporarily
67
* Unsets the class index temporarily before the filter is
68
* applied to the data.
71
* <pre> -R <col1,col2-col4,...>
72
* Specifies list of columns to Discretize. First and last are valid indexes.
73
* (default: first-last)</pre>
76
* Invert matching sense of column indexes.</pre>
79
* Output binary attributes for discretized attributes.</pre>
83
* @author Richard Kirkby (rkirkby@cs.waikato.ac.nz)
84
* @version $Revision: 1.8 $
86
public class PKIDiscretize
88
implements TechnicalInformationHandler {
90
/** for serialization */
91
static final long serialVersionUID = 6153101248977702675L;
94
* Sets the format of the input instances.
96
* @param instanceInfo an Instances object containing the input instance
97
* structure (any instances contained in the object are ignored - only the
98
* structure is required).
99
* @return true if the outputFormat may be collected immediately
100
* @throws Exception if the input format can't be set successfully
102
public boolean setInputFormat(Instances instanceInfo) throws Exception {
104
// alter child behaviour to do what we want
105
m_FindNumBins = true;
106
return super.setInputFormat(instanceInfo);
110
* Finds the number of bins to use and creates the cut points.
112
* @param index the attribute index
114
protected void findNumBins(int index) {
116
Instances toFilter = getInputFormat();
118
// Find number of instances for attribute where not missing
119
int numOfInstances = toFilter.numInstances();
120
for (int i = 0; i < toFilter.numInstances(); i++) {
121
if (toFilter.instance(i).isMissing(index))
125
m_NumBins = (int)(Math.sqrt(numOfInstances));
128
calculateCutPointsByEqualFrequencyBinning(index);
133
* Gets an enumeration describing the available options.
135
* @return an enumeration of all the available options.
137
public Enumeration listOptions() {
138
Vector result = new Vector();
140
result.addElement(new Option(
141
"\tUnsets the class index temporarily before the filter is\n"
142
+ "\tapplied to the data.\n"
144
"unset-class-temporarily", 1, "-unset-class-temporarily"));
146
result.addElement(new Option(
147
"\tSpecifies list of columns to Discretize. First"
148
+ " and last are valid indexes.\n"
149
+ "\t(default: first-last)",
150
"R", 1, "-R <col1,col2-col4,...>"));
152
result.addElement(new Option(
153
"\tInvert matching sense of column indexes.",
156
result.addElement(new Option(
157
"\tOutput binary attributes for discretized attributes.",
160
return result.elements();
165
* Parses a given list of options. <p/>
167
<!-- options-start -->
168
* Valid options are: <p/>
170
* <pre> -unset-class-temporarily
171
* Unsets the class index temporarily before the filter is
172
* applied to the data.
173
* (default: no)</pre>
175
* <pre> -R <col1,col2-col4,...>
176
* Specifies list of columns to Discretize. First and last are valid indexes.
177
* (default: first-last)</pre>
180
* Invert matching sense of column indexes.</pre>
183
* Output binary attributes for discretized attributes.</pre>
187
* @param options the list of options as an array of strings
188
* @throws Exception if an option is not supported
190
public void setOptions(String[] options) throws Exception {
192
setIgnoreClass(Utils.getFlag("unset-class-temporarily", options));
193
setMakeBinary(Utils.getFlag('D', options));
194
setInvertSelection(Utils.getFlag('V', options));
196
String convertList = Utils.getOption('R', options);
197
if (convertList.length() != 0) {
198
setAttributeIndices(convertList);
200
setAttributeIndices("first-last");
203
if (getInputFormat() != null) {
204
setInputFormat(getInputFormat());
208
* Gets the current settings of the filter.
210
* @return an array of strings suitable for passing to setOptions
212
public String[] getOptions() {
215
result = new Vector();
220
if (getInvertSelection())
223
if (!getAttributeIndices().equals("")) {
225
result.add(getAttributeIndices());
228
return (String[]) result.toArray(new String[result.size()]);
232
* Returns a string describing this filter
234
* @return a description of the filter suitable for
235
* displaying in the explorer/experimenter gui
237
public String globalInfo() {
239
return "Discretizes numeric attributes using equal frequency binning,"
240
+ " where the number of bins is equal to the square root of the"
241
+ " number of non-missing values.\n\n"
242
+ "For more information, see:\n\n"
243
+ getTechnicalInformation().toString();
247
* Returns an instance of a TechnicalInformation object, containing
248
* detailed information about the technical background of this class,
249
* e.g., paper reference or book this class is based on.
251
* @return the technical information about this class
253
public TechnicalInformation getTechnicalInformation() {
254
TechnicalInformation result;
256
result = new TechnicalInformation(Type.INPROCEEDINGS);
257
result.setValue(Field.AUTHOR, "Ying Yang and Geoffrey I. Webb");
258
result.setValue(Field.TITLE, "Proportional k-Interval Discretization for Naive-Bayes Classifiers");
259
result.setValue(Field.BOOKTITLE, "12th European Conference on Machine Learning");
260
result.setValue(Field.YEAR, "2001");
261
result.setValue(Field.PAGES, "564-575");
262
result.setValue(Field.PUBLISHER, "Springer");
263
result.setValue(Field.SERIES, "LNCS");
264
result.setValue(Field.VOLUME, "2167");
270
* Returns the tip text for this property
272
* @return tip text for this property suitable for
273
* displaying in the explorer/experimenter gui
275
public String findNumBinsTipText() {
281
* Get the value of FindNumBins.
283
* @return Value of FindNumBins.
285
public boolean getFindNumBins() {
291
* Set the value of FindNumBins.
293
* @param newFindNumBins Value to assign to FindNumBins.
295
public void setFindNumBins(boolean newFindNumBins) {
300
* Returns the tip text for this property
302
* @return tip text for this property suitable for
303
* displaying in the explorer/experimenter gui
305
public String useEqualFrequencyTipText() {
307
return "Always true.";
311
* Get the value of UseEqualFrequency.
313
* @return Value of UseEqualFrequency.
315
public boolean getUseEqualFrequency() {
321
* Set the value of UseEqualFrequency.
323
* @param newUseEqualFrequency Value to assign to UseEqualFrequency.
325
public void setUseEqualFrequency(boolean newUseEqualFrequency) {
330
* Returns the tip text for this property
332
* @return tip text for this property suitable for
333
* displaying in the explorer/experimenter gui
335
public String binsTipText() {
343
* @return the number of bins.
345
public int getBins() {
353
* @param numBins the number of bins
355
public void setBins(int numBins) {
360
* Main method for testing this class.
362
* @param argv should contain arguments to the filter: use -h for help
364
public static void main(String [] argv) {
365
runFilter(new PKIDiscretize(), argv);