/*
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
/*
 * NumericToNominal.java
 * Copyright (C) 2006 University of Waikato, Hamilton, New Zealand
 */
22
package weka.filters.unsupervised.attribute;
24
import weka.core.Attribute;
25
import weka.core.Capabilities;
26
import weka.core.FastVector;
27
import weka.core.Instance;
28
import weka.core.Instances;
29
import weka.core.Option;
30
import weka.core.Range;
31
import weka.core.SparseInstance;
32
import weka.core.Utils;
33
import weka.core.Capabilities.Capability;
34
import weka.filters.SimpleBatchFilter;
36
import java.util.Collections;
37
import java.util.Enumeration;
38
import java.util.HashSet;
39
import java.util.Vector;
/**
 <!-- globalinfo-start -->
 * A filter for turning numeric attributes into nominal ones. Unlike discretization, it just takes all numeric values and adds them to the list of nominal values of that attribute. Useful after CSV imports, to enforce certain attributes to become nominal, e.g., the class attribute, containing values from 1 to 5.
 * <p/>
 <!-- globalinfo-end -->
 *
 <!-- options-start -->
 * Valid options are: <p/>
 *
 * <pre> -R &lt;col1,col2-col4,...&gt;
 *  Specifies list of columns to Discretize. First and last are valid indexes.
 *  (default: first-last)</pre>
 *
 * <pre> -V
 *  Invert matching sense of column indexes.</pre>
 *
 <!-- options-end -->
 *
 * @author  fracpete (fracpete at waikato dot ac dot nz)
 * @version $Revision: 1.2 $
 */
62
public class NumericToNominal
63
extends SimpleBatchFilter {
65
/** for serialization */
66
private static final long serialVersionUID = -6614630932899796239L;
68
/** the maximum number of decimals to use */
69
protected final static int MAX_DECIMALS = 6;
71
/** Stores which columns to turn into nominals */
72
protected Range m_Cols = new Range("first-last");
74
/** The default columns to turn into nominals */
75
protected String m_DefaultCols = "first-last";
78
* Returns a string describing this filter
80
* @return a description of the filter suitable for
81
* displaying in the explorer/experimenter gui
83
public String globalInfo() {
85
"A filter for turning numeric attributes into nominal ones. Unlike "
86
+ "discretization, it just takes all numeric values and adds them to "
87
+ "the list of nominal values of that attribute. Useful after CSV "
88
+ "imports, to enforce certain attributes to become nominal, e.g., "
89
+ "the class attribute, containing values from 1 to 5.";
93
* Gets an enumeration describing the available options.
95
* @return an enumeration of all the available options.
97
public Enumeration listOptions() {
98
Vector result = new Vector();
100
result.addElement(new Option(
101
"\tSpecifies list of columns to Discretize. First"
102
+ " and last are valid indexes.\n"
103
+ "\t(default: first-last)",
104
"R", 1, "-R <col1,col2-col4,...>"));
106
result.addElement(new Option(
107
"\tInvert matching sense of column indexes.",
110
return result.elements();
114
* Parses a given list of options. <p/>
116
<!-- options-start -->
117
* Valid options are: <p/>
119
* <pre> -R <col1,col2-col4,...>
120
* Specifies list of columns to Discretize. First and last are valid indexes.
121
* (default: first-last)</pre>
124
* Invert matching sense of column indexes.</pre>
128
* @param options the list of options as an array of strings
129
* @throws Exception if an option is not supported
131
public void setOptions(String[] options) throws Exception {
134
super.setOptions(options);
136
setInvertSelection(Utils.getFlag('V', options));
138
tmpStr = Utils.getOption('R', options);
139
if (tmpStr.length() != 0)
140
setAttributeIndices(tmpStr);
142
setAttributeIndices(m_DefaultCols);
144
if (getInputFormat() != null)
145
setInputFormat(getInputFormat());
149
* Gets the current settings of the filter.
151
* @return an array of strings suitable for passing to setOptions
153
public String[] getOptions() {
158
result = new Vector();
159
options = super.getOptions();
160
for (i = 0; i < options.length; i++)
161
result.add(options[i]);
163
if (!getAttributeIndices().equals("")) {
165
result.add(getAttributeIndices());
168
if (getInvertSelection())
171
return (String[]) result.toArray(new String[result.size()]);
175
* Returns the tip text for this property
177
* @return tip text for this property suitable for
178
* displaying in the explorer/experimenter gui
180
public String invertSelectionTipText() {
182
"Set attribute selection mode. If false, only selected"
183
+ " (numeric) attributes in the range will be 'nominalized'; if"
184
+ " true, only non-selected attributes will be 'nominalized'.";
188
* Gets whether the supplied columns are to be worked on or the others.
190
* @return true if the supplied columns will be worked on
192
public boolean getInvertSelection() {
193
return m_Cols.getInvert();
197
* Sets whether selected columns should be worked on or all the others apart
198
* from these. If true all the other columns are considered for
201
* @param value the new invert setting
203
public void setInvertSelection(boolean value) {
204
m_Cols.setInvert(value);
208
* Returns the tip text for this property
210
* @return tip text for this property suitable for
211
* displaying in the explorer/experimenter gui
213
public String attributeIndicesTipText() {
214
return "Specify range of attributes to act on."
215
+ " This is a comma separated list of attribute indices, with"
216
+ " \"first\" and \"last\" valid values. Specify an inclusive"
217
+ " range with \"-\". E.g: \"first-3,5,6-10,last\".";
221
* Gets the current range selection
223
* @return a string containing a comma separated list of ranges
225
public String getAttributeIndices() {
226
return m_Cols.getRanges();
230
* Sets which attributes are to be "nominalized" (only numeric
231
* attributes among the selection will be transformed).
233
* @param value a string representing the list of attributes. Since
234
* the string will typically come from a user, attributes
235
* are indexed from 1. <br> eg: first-3,5,6-last
236
* @throws IllegalArgumentException if an invalid range list is supplied
238
public void setAttributeIndices(String value) {
239
m_Cols.setRanges(value);
243
* Sets which attributes are to be transoformed to nominal. (only numeric
244
* attributes among the selection will be transformed).
246
* @param value an array containing indexes of attributes to nominalize.
247
* Since the array will typically come from a program,
248
* attributes are indexed from 0.
249
* @throws IllegalArgumentException if an invalid set of ranges is supplied
251
public void setAttributeIndicesArray(int[] value) {
252
setAttributeIndices(Range.indicesToRangeList(value));
256
* Returns the Capabilities of this filter.
258
* @return the capabilities of this object
261
public Capabilities getCapabilities() {
262
Capabilities result = super.getCapabilities();
265
result.enableAllAttributes();
266
result.enable(Capability.MISSING_VALUES);
269
result.enableAllClasses();
270
result.enable(Capability.MISSING_CLASS_VALUES);
271
result.enable(Capability.NO_CLASS);
277
* Determines the output format based on the input format and returns
278
* this. In case the output format cannot be returned immediately, i.e.,
279
* immediateOutputFormat() returns false, then this method will be called
280
* from batchFinished().
282
* @param inputFormat the input format to base the output format on
283
* @return the output format
284
* @throws Exception in case the determination goes wrong
285
* @see #hasImmediateOutputFormat()
286
* @see #batchFinished()
288
protected Instances determineOutputFormat(Instances inputFormat)
302
m_Cols.setUpper(inputFormat.numAttributes() - 1);
303
data = new Instances(inputFormat);
304
atts = new FastVector();
305
for (i = 0; i < data.numAttributes(); i++) {
306
if (!m_Cols.isInRange(i) || !data.attribute(i).isNumeric()) {
307
atts.addElement(data.attribute(i));
312
isDate = (data.attribute(i).type() == Attribute.DATE);
314
// determine all available attribtues in dataset
315
hash = new HashSet();
316
for (n = 0; n < data.numInstances(); n++) {
317
inst = data.instance(n);
318
if (inst.isMissing(i))
322
hash.add(inst.stringValue(i));
324
hash.add(new Double(inst.value(i)));
328
sorted = new Vector();
331
Collections.sort(sorted);
333
// create attribute from sorted values
334
values = new FastVector();
335
for (Object o: sorted) {
341
Utils.doubleToString(((Double) o).doubleValue(), MAX_DECIMALS));
343
atts.addElement(new Attribute(data.attribute(i).name(), values));
346
result = new Instances(inputFormat.relationName(), atts, 0);
347
result.setClassIndex(inputFormat.classIndex());
353
* Processes the given data (may change the provided dataset) and returns
354
* the modified version. This method is called in batchFinished().
356
* @param instances the data to process
357
* @return the modified data
358
* @throws Exception in case the processing goes wrong
359
* @see #batchFinished()
361
protected Instances process(Instances instances) throws Exception {
370
// we need the complete input data!
371
if (!isFirstBatchDone())
372
setOutputFormat(determineOutputFormat(getInputFormat()));
374
result = new Instances(getOutputFormat());
376
for (i = 0; i < instances.numInstances(); i++) {
377
inst = instances.instance(i);
378
values = inst.toDoubleArray();
380
for (n = 0; n < values.length; n++) {
381
if ( !m_Cols.isInRange(n)
382
|| !instances.attribute(n).isNumeric()
383
|| inst.isMissing(n) )
386
// get index of value
387
if (instances.attribute(n).type() == Attribute.DATE)
388
value = inst.stringValue(n);
390
value = Utils.doubleToString(inst.value(n), MAX_DECIMALS);
392
values[n] = result.attribute(n).indexOfValue(value);
395
// generate new instance
396
if (inst instanceof SparseInstance)
397
newInst = new SparseInstance(inst.weight(), values);
399
newInst = new Instance(inst.weight(), values);
401
// copy possible string, relational values
402
newInst.setDataset(getOutputFormat());
403
copyValues(newInst, false, inst.dataset(), getOutputFormat());
/**
 * Runs the filter with the given parameters. Use -h to list options.
 *
 * @param args	the commandline options
 */
public static void main(String[] args) {
  runFilter(new NumericToNominal(), args);
}
}