/*
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 *    Copyright (C) 2002 University of Waikato, Hamilton, New Zealand
 */
23
package weka.filters.unsupervised.attribute;
25
import weka.clusterers.Clusterer;
26
import weka.core.Attribute;
27
import weka.core.Capabilities;
28
import weka.core.FastVector;
29
import weka.core.Instance;
30
import weka.core.Instances;
31
import weka.core.Option;
32
import weka.core.OptionHandler;
33
import weka.core.Range;
34
import weka.core.SparseInstance;
35
import weka.core.Utils;
36
import weka.filters.Filter;
37
import weka.filters.UnsupervisedFilter;
39
import java.util.Enumeration;
40
import java.util.Vector;
43
<!-- globalinfo-start -->
44
* A filter that adds a new nominal attribute representing the cluster assigned to each instance by the specified clustering algorithm.
46
<!-- globalinfo-end -->
48
<!-- options-start -->
49
* Valid options are: <p/>
51
* <pre> -W <clusterer specification>
52
* Full class name of clusterer to use, followed
53
* by scheme options. eg:
54
* "weka.clusterers.SimpleKMeans -N 3"
55
* (default: weka.clusterers.SimpleKMeans)</pre>
57
* <pre> -I <att1,att2-att4,...>
58
* The range of attributes the clusterer should ignore.
63
* @author Richard Kirkby (rkirkby@cs.waikato.ac.nz)
64
* @version $Revision: 1.10 $
66
public class AddCluster
68
implements UnsupervisedFilter, OptionHandler {
70
/** for serialization */
71
static final long serialVersionUID = 7414280611943807337L;
73
/** The clusterer used to do the cleansing */
74
protected Clusterer m_Clusterer = new weka.clusterers.SimpleKMeans();
76
/** Range of attributes to ignore */
77
protected Range m_IgnoreAttributesRange = null;
79
/** Filter for removing attributes */
80
protected Filter m_removeAttributes = new Remove();
83
* Returns the Capabilities of this filter, makes sure that the class is
84
* never set (for the clusterer).
86
* @param data the data to use for customization
87
* @return the capabilities of this object, based on the data
88
* @see #getCapabilities()
90
public Capabilities getCapabilities(Instances data) {
93
newData = new Instances(data, 0);
94
newData.setClassIndex(-1);
96
return super.getCapabilities(newData);
100
* Returns the Capabilities of this filter.
102
* @return the capabilities of this object
105
public Capabilities getCapabilities() {
106
Capabilities result = m_Clusterer.getCapabilities();
108
result.setMinimumNumberInstances(0);
114
* tests the data whether the filter can actually handle it
116
* @param instanceInfo the data to test
117
* @throws Exception if the test fails
119
protected void testInputFormat(Instances instanceInfo) throws Exception {
120
getCapabilities(instanceInfo).testWithFail(removeIgnored(instanceInfo));
124
* Sets the format of the input instances.
126
* @param instanceInfo an Instances object containing the input instance
127
* structure (any instances contained in the object are ignored - only the
128
* structure is required).
129
* @return true if the outputFormat may be collected immediately
130
* @throws Exception if the inputFormat can't be set successfully
132
public boolean setInputFormat(Instances instanceInfo) throws Exception {
134
super.setInputFormat(instanceInfo);
135
m_removeAttributes = null;
141
* filters all attributes that should be ignored
143
* @param data the data to filter
144
* @return the filtered data
145
* @throws Exception if filtering fails
147
protected Instances removeIgnored(Instances data) throws Exception {
148
Instances result = data;
150
if (m_IgnoreAttributesRange != null || data.classIndex() >= 0) {
151
m_removeAttributes = new Remove();
152
String rangeString = "";
153
if (m_IgnoreAttributesRange != null) {
154
rangeString += m_IgnoreAttributesRange.getRanges();
156
if (data.classIndex() >= 0) {
157
if (rangeString.length() > 0) {
158
rangeString += "," + (data.classIndex() + 1);
160
rangeString = "" + (data.classIndex() + 1);
163
((Remove) m_removeAttributes).setAttributeIndices(rangeString);
164
((Remove) m_removeAttributes).setInvertSelection(false);
165
m_removeAttributes.setInputFormat(data);
166
result = Filter.useFilter(data, m_removeAttributes);
173
* Signify that this batch of input to the filter is finished.
175
* @return true if there are instances pending output
176
* @throws IllegalStateException if no input structure has been defined
178
public boolean batchFinished() throws Exception {
180
if (getInputFormat() == null) {
181
throw new IllegalStateException("No input instance format defined");
184
Instances toFilter = getInputFormat();
186
if (!isFirstBatchDone()) {
187
// filter out attributes if necessary
188
Instances toFilterIgnoringAttributes = removeIgnored(toFilter);
190
// build the clusterer
191
m_Clusterer.buildClusterer(toFilterIgnoringAttributes);
193
// create output dataset with new attribute
194
Instances filtered = new Instances(toFilter, 0);
195
FastVector nominal_values = new FastVector(m_Clusterer.numberOfClusters());
196
for (int i=0; i<m_Clusterer.numberOfClusters(); i++) {
197
nominal_values.addElement("cluster" + (i+1));
199
filtered.insertAttributeAt(new Attribute("cluster", nominal_values),
200
filtered.numAttributes());
202
setOutputFormat(filtered);
206
for (int i=0; i<toFilter.numInstances(); i++) {
207
convertInstance(toFilter.instance(i));
212
m_FirstBatchDone = true;
214
return (numPendingOutput() != 0);
218
* Input an instance for filtering. Ordinarily the instance is processed
219
* and made available for output immediately. Some filters require all
220
* instances be read before producing output.
222
* @param instance the input instance
223
* @return true if the filtered instance may now be
224
* collected with output().
225
* @throws IllegalStateException if no input format has been defined.
227
public boolean input(Instance instance) throws Exception {
229
if (getInputFormat() == null) {
230
throw new IllegalStateException("No input instance format defined");
237
if (outputFormatPeek() != null) {
238
convertInstance(instance);
242
bufferInput(instance);
247
* Convert a single instance over. The converted instance is added to
248
* the end of the output queue.
250
* @param instance the instance to convert
251
* @throws Exception if something goes wrong
253
protected void convertInstance(Instance instance) throws Exception {
254
Instance original, processed;
258
double[] instanceVals = new double[instance.numAttributes()+1];
259
for(int j = 0; j < instance.numAttributes(); j++) {
260
instanceVals[j] = original.value(j);
262
Instance filteredI = null;
263
if (m_removeAttributes != null) {
264
m_removeAttributes.input(instance);
265
filteredI = m_removeAttributes.output();
267
filteredI = instance;
270
// add cluster to end
271
instanceVals[instance.numAttributes()]
272
= m_Clusterer.clusterInstance(filteredI);
274
// create new instance
275
if (original instanceof SparseInstance) {
276
processed = new SparseInstance(original.weight(), instanceVals);
278
processed = new Instance(original.weight(), instanceVals);
281
processed.setDataset(instance.dataset());
282
copyValues(processed, false, instance.dataset(), getOutputFormat());
283
processed.setDataset(getOutputFormat());
289
* Returns an enumeration describing the available options.
291
* @return an enumeration of all the available options.
293
public Enumeration listOptions() {
295
Vector newVector = new Vector(2);
297
newVector.addElement(new Option(
298
"\tFull class name of clusterer to use, followed\n"
299
+ "\tby scheme options. eg:\n"
300
+ "\t\t\"weka.clusterers.SimpleKMeans -N 3\"\n"
301
+ "\t(default: weka.clusterers.SimpleKMeans)",
302
"W", 1, "-W <clusterer specification>"));
304
newVector.addElement(new Option(
305
"\tThe range of attributes the clusterer should ignore.\n",
306
"I", 1,"-I <att1,att2-att4,...>"));
308
return newVector.elements();
313
* Parses a given list of options. <p/>
315
<!-- options-start -->
316
* Valid options are: <p/>
318
* <pre> -W <clusterer specification>
319
* Full class name of clusterer to use, followed
320
* by scheme options. eg:
321
* "weka.clusterers.SimpleKMeans -N 3"
322
* (default: weka.clusterers.SimpleKMeans)</pre>
324
* <pre> -I <att1,att2-att4,...>
325
* The range of attributes the clusterer should ignore.
330
* @param options the list of options as an array of strings
331
* @throws Exception if an option is not supported
333
public void setOptions(String[] options) throws Exception {
335
String clustererString = Utils.getOption('W', options);
336
if (clustererString.length() == 0)
337
clustererString = weka.clusterers.SimpleKMeans.class.getName();
338
String[] clustererSpec = Utils.splitOptions(clustererString);
339
if (clustererSpec.length == 0) {
340
throw new Exception("Invalid clusterer specification string");
342
String clustererName = clustererSpec[0];
343
clustererSpec[0] = "";
344
setClusterer(Clusterer.forName(clustererName, clustererSpec));
346
setIgnoredAttributeIndices(Utils.getOption('I', options));
348
Utils.checkForRemainingOptions(options);
352
* Gets the current settings of the filter.
354
* @return an array of strings suitable for passing to setOptions
356
public String [] getOptions() {
358
String [] options = new String [5];
361
options[current++] = "-W"; options[current++] = "" + getClustererSpec();
363
if (!getIgnoredAttributeIndices().equals("")) {
364
options[current++] = "-I"; options[current++] = getIgnoredAttributeIndices();
367
while (current < options.length) {
368
options[current++] = "";
374
* Returns a string describing this filter
376
* @return a description of the filter suitable for
377
* displaying in the explorer/experimenter gui
379
public String globalInfo() {
381
return "A filter that adds a new nominal attribute representing the cluster "
382
+ "assigned to each instance by the specified clustering algorithm.";
386
* Returns the tip text for this property
388
* @return tip text for this property suitable for
389
* displaying in the explorer/experimenter gui
391
public String clustererTipText() {
393
return "The clusterer to assign clusters with.";
397
* Sets the clusterer to assign clusters with.
399
* @param clusterer The clusterer to be used (with its options set).
401
public void setClusterer(Clusterer clusterer) {
403
m_Clusterer = clusterer;
407
* Gets the clusterer used by the filter.
409
* @return The clusterer being used.
411
public Clusterer getClusterer() {
417
* Gets the clusterer specification string, which contains the class name of
418
* the clusterer and any options to the clusterer.
420
* @return the clusterer string.
422
protected String getClustererSpec() {
424
Clusterer c = getClusterer();
425
if (c instanceof OptionHandler) {
426
return c.getClass().getName() + " "
427
+ Utils.joinOptions(((OptionHandler)c).getOptions());
429
return c.getClass().getName();
433
* Returns the tip text for this property
435
* @return tip text for this property suitable for
436
* displaying in the explorer/experimenter gui
438
public String ignoredAttributeIndicesTipText() {
440
return "The range of attributes to be ignored by the clusterer. eg: first-3,5,9-last";
444
* Gets ranges of attributes to be ignored.
446
* @return a string containing a comma-separated list of ranges
448
public String getIgnoredAttributeIndices() {
450
if (m_IgnoreAttributesRange == null) {
453
return m_IgnoreAttributesRange.getRanges();
458
* Sets the ranges of attributes to be ignored. If provided string
459
* is null, no attributes will be ignored.
461
* @param rangeList a string representing the list of attributes.
462
* eg: first-3,5,6-last
463
* @throws IllegalArgumentException if an invalid range list is supplied
465
public void setIgnoredAttributeIndices(String rangeList) {
467
if ((rangeList == null) || (rangeList.length() == 0)) {
468
m_IgnoreAttributesRange = null;
470
m_IgnoreAttributesRange = new Range();
471
m_IgnoreAttributesRange.setRanges(rangeList);
476
* Main method for testing this class.
478
* @param argv should contain arguments to the filter: use -h for help
480
public static void main(String [] argv) {
481
runFilter(new AddCluster(), argv);