2
* This program is free software; you can redistribute it and/or modify
3
* it under the terms of the GNU General Public License as published by
4
* the Free Software Foundation; either version 2 of the License, or
5
* (at your option) any later version.
7
* This program is distributed in the hope that it will be useful,
8
* but WITHOUT ANY WARRANTY; without even the implied warranty of
9
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10
* GNU General Public License for more details.
12
* You should have received a copy of the GNU General Public License
13
* along with this program; if not, write to the Free Software
14
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19
* Copyright (C) 1999 University of Waikato, Hamilton, New Zealand
23
package weka.classifiers.rules;
25
import weka.classifiers.Classifier;
26
import weka.classifiers.Sourcable;
27
import weka.core.Attribute;
28
import weka.core.Capabilities;
29
import weka.core.Instance;
30
import weka.core.Instances;
31
import weka.core.Option;
32
import weka.core.TechnicalInformation;
33
import weka.core.TechnicalInformationHandler;
34
import weka.core.Utils;
35
import weka.core.WekaException;
36
import weka.core.Capabilities.Capability;
37
import weka.core.TechnicalInformation.Field;
38
import weka.core.TechnicalInformation.Type;
40
import java.io.Serializable;
41
import java.util.Enumeration;
42
import java.util.Vector;
45
<!-- globalinfo-start -->
46
* Class for building and using a 1R classifier; in other words, uses the minimum-error attribute for prediction, discretizing numeric attributes. For more information, see:<br/>
48
* R.C. Holte (1993). Very simple classification rules perform well on most commonly used datasets. Machine Learning. 11:63-91.
50
<!-- globalinfo-end -->
52
<!-- technical-bibtex-start -->
55
* @article{Holte1993,
56
* author = {R.C. Holte},
57
* journal = {Machine Learning},
59
* title = {Very simple classification rules perform well on most commonly used datasets},
65
<!-- technical-bibtex-end -->
67
<!-- options-start -->
68
* Valid options are: <p/>
70
* <pre> -B <minimum bucket size>
71
* The minimum number of objects in a bucket (default: 6).</pre>
75
* @author Ian H. Witten (ihw@cs.waikato.ac.nz)
76
* @version $Revision: 1.25 $
80
implements TechnicalInformationHandler, Sourcable {
82
/** for serialization */
83
static final long serialVersionUID = -2459427002147861445L;
86
* Returns a string describing classifier
87
* @return a description suitable for
88
* displaying in the explorer/experimenter gui
90
public String globalInfo() {
92
return "Class for building and using a 1R classifier; in other words, uses "
93
+ "the minimum-error attribute for prediction, discretizing numeric "
94
+ "attributes. For more information, see:\n\n"
95
+ getTechnicalInformation().toString();
99
* Returns an instance of a TechnicalInformation object, containing
100
* detailed information about the technical background of this class,
101
* e.g., paper reference or book this class is based on.
103
* @return the technical information about this class
105
public TechnicalInformation getTechnicalInformation() {
106
TechnicalInformation result;
108
result = new TechnicalInformation(Type.ARTICLE);
109
result.setValue(Field.AUTHOR, "R.C. Holte");
110
result.setValue(Field.YEAR, "1993");
111
result.setValue(Field.TITLE, "Very simple classification rules perform well on most commonly used datasets");
112
result.setValue(Field.JOURNAL, "Machine Learning");
113
result.setValue(Field.VOLUME, "11");
114
result.setValue(Field.PAGES, "63-91");
120
* Class for storing store a 1R rule.
122
private class OneRRule
123
implements Serializable {
125
/** for serialization */
126
static final long serialVersionUID = 1152814630957092281L;
128
/** The class attribute. */
129
private Attribute m_class;
131
/** The number of instances used for building the rule. */
132
private int m_numInst;
134
/** Attribute to test */
135
private Attribute m_attr;
137
/** Training set examples this rule gets right */
138
private int m_correct;
140
/** Predicted class for each value of attr */
141
private int[] m_classifications;
143
/** Predicted class for missing values */
144
private int m_missingValueClass = -1;
146
/** Breakpoints (numeric attributes only) */
147
private double[] m_breakpoints;
150
* Constructor for nominal attribute.
152
* @param data the data to work with
153
* @param attribute the attribute to use
154
* @throws Exception if something goes wrong
156
public OneRRule(Instances data, Attribute attribute) throws Exception {
158
m_class = data.classAttribute();
159
m_numInst = data.numInstances();
162
m_classifications = new int[m_attr.numValues()];
166
* Constructor for numeric attribute.
168
* @param data the data to work with
169
* @param attribute the attribute to use
170
* @param nBreaks the break point
171
* @throws Exception if something goes wrong
173
public OneRRule(Instances data, Attribute attribute, int nBreaks) throws Exception {
175
m_class = data.classAttribute();
176
m_numInst = data.numInstances();
179
m_classifications = new int[nBreaks];
180
m_breakpoints = new double[nBreaks - 1]; // last breakpoint is infinity
184
* Returns a description of the rule.
186
* @return a string representation of the rule
188
public String toString() {
191
StringBuffer text = new StringBuffer();
192
text.append(m_attr.name() + ":\n");
193
for (int v = 0; v < m_classifications.length; v++) {
195
if (m_attr.isNominal()) {
196
text.append(m_attr.value(v));
197
} else if (v < m_breakpoints.length) {
198
text.append("< " + m_breakpoints[v]);
200
text.append(">= " + m_breakpoints[v - 1]);
202
text.append("not ?");
204
text.append("\t-> " + m_class.value(m_classifications[v]) + "\n");
206
if (m_missingValueClass != -1) {
207
text.append("\t?\t-> " + m_class.value(m_missingValueClass) + "\n");
209
text.append("(" + m_correct + "/" + m_numInst + " instances correct)\n");
210
return text.toString();
211
} catch (Exception e) {
212
return "Can't print OneR classifier!";
218
private OneRRule m_rule;
220
/** The minimum bucket size */
221
private int m_minBucketSize = 6;
223
/** a ZeroR model in case no model can be built from the data */
224
private Classifier m_ZeroR;
227
* Classifies a given instance.
229
* @param inst the instance to be classified
230
* @return the classification of the instance
232
public double classifyInstance(Instance inst) throws Exception {
235
if (m_ZeroR != null) {
236
return m_ZeroR.classifyInstance(inst);
240
if (inst.isMissing(m_rule.m_attr)) {
241
if (m_rule.m_missingValueClass != -1) {
242
return m_rule.m_missingValueClass;
244
return 0; // missing values occur in test but not training set
247
if (m_rule.m_attr.isNominal()) {
248
v = (int) inst.value(m_rule.m_attr);
250
while (v < m_rule.m_breakpoints.length &&
251
inst.value(m_rule.m_attr) >= m_rule.m_breakpoints[v]) {
255
return m_rule.m_classifications[v];
259
* Returns default capabilities of the classifier.
261
* @return the capabilities of this classifier
263
public Capabilities getCapabilities() {
264
Capabilities result = super.getCapabilities();
267
result.enable(Capability.NOMINAL_ATTRIBUTES);
268
result.enable(Capability.NUMERIC_ATTRIBUTES);
269
result.enable(Capability.DATE_ATTRIBUTES);
270
result.enable(Capability.MISSING_VALUES);
273
result.enable(Capability.NOMINAL_CLASS);
274
result.enable(Capability.MISSING_CLASS_VALUES);
280
* Generates the classifier.
282
* @param instances the instances to be used for building the classifier
283
* @throws Exception if the classifier can't be built successfully
285
public void buildClassifier(Instances instances)
288
boolean noRule = true;
290
// can classifier handle the data?
291
getCapabilities().testWithFail(instances);
293
// remove instances with missing class
294
Instances data = new Instances(instances);
295
data.deleteWithMissingClass();
297
// only class? -> build ZeroR model
298
if (data.numAttributes() == 1) {
300
"Cannot build model (only class attribute present in data!), "
301
+ "using ZeroR model instead!");
302
m_ZeroR = new weka.classifiers.rules.ZeroR();
303
m_ZeroR.buildClassifier(data);
310
// for each attribute ...
311
Enumeration enu = instances.enumerateAttributes();
312
while (enu.hasMoreElements()) {
314
OneRRule r = newRule((Attribute) enu.nextElement(), data);
316
// if this attribute is the best so far, replace the rule
317
if (noRule || r.m_correct > m_rule.m_correct) {
321
} catch (Exception ex) {
326
throw new WekaException("No attributes found to work with!");
330
* Create a rule branching on this attribute.
332
* @param attr the attribute to branch on
333
* @param data the data to be used for creating the rule
334
* @return the generated rule
335
* @throws Exception if the rule can't be built successfully
337
public OneRRule newRule(Attribute attr, Instances data) throws Exception {
341
// ... create array to hold the missing value counts
342
int[] missingValueCounts =
343
new int [data.classAttribute().numValues()];
345
if (attr.isNominal()) {
346
r = newNominalRule(attr, data, missingValueCounts);
348
r = newNumericRule(attr, data, missingValueCounts);
350
r.m_missingValueClass = Utils.maxIndex(missingValueCounts);
351
if (missingValueCounts[r.m_missingValueClass] == 0) {
352
r.m_missingValueClass = -1; // signal for no missing value class
354
r.m_correct += missingValueCounts[r.m_missingValueClass];
360
* Create a rule branching on this nominal attribute.
362
* @param attr the attribute to branch on
363
* @param data the data to be used for creating the rule
364
* @param missingValueCounts to be filled in
365
* @return the generated rule
366
* @throws Exception if the rule can't be built successfully
368
public OneRRule newNominalRule(Attribute attr, Instances data,
369
int[] missingValueCounts) throws Exception {
371
// ... create arrays to hold the counts
372
int[][] counts = new int [attr.numValues()]
373
[data.classAttribute().numValues()];
375
// ... calculate the counts
376
Enumeration enu = data.enumerateInstances();
377
while (enu.hasMoreElements()) {
378
Instance i = (Instance) enu.nextElement();
379
if (i.isMissing(attr)) {
380
missingValueCounts[(int) i.classValue()]++;
382
counts[(int) i.value(attr)][(int) i.classValue()]++;
386
OneRRule r = new OneRRule(data, attr); // create a new rule
387
for (int value = 0; value < attr.numValues(); value++) {
388
int best = Utils.maxIndex(counts[value]);
389
r.m_classifications[value] = best;
390
r.m_correct += counts[value][best];
396
* Create a rule branching on this numeric attribute
398
* @param attr the attribute to branch on
399
* @param data the data to be used for creating the rule
400
* @param missingValueCounts to be filled in
401
* @return the generated rule
402
* @throws Exception if the rule can't be built successfully
404
public OneRRule newNumericRule(Attribute attr, Instances data,
405
int[] missingValueCounts) throws Exception {
408
// ... can't be more than numInstances buckets
409
int [] classifications = new int[data.numInstances()];
410
double [] breakpoints = new double[data.numInstances()];
412
// create array to hold the counts
413
int [] counts = new int[data.classAttribute().numValues()];
415
int lastInstance = data.numInstances();
417
// missing values get sorted to the end of the instances
419
while (lastInstance > 0 &&
420
data.instance(lastInstance-1).isMissing(attr)) {
422
missingValueCounts[(int) data.instance(lastInstance).
426
int cl = 0; // index of next bucket to create
428
while (i < lastInstance) { // start a new bucket
429
for (int j = 0; j < counts.length; j++) counts[j] = 0;
430
do { // fill it until it has enough of the majority class
431
it = (int) data.instance(i++).classValue();
433
} while (counts[it] < m_minBucketSize && i < lastInstance);
435
// while class remains the same, keep on filling
436
while (i < lastInstance &&
437
(int) data.instance(i).classValue() == it) {
441
while (i < lastInstance && // keep on while attr value is the same
442
(data.instance(i - 1).value(attr)
443
== data.instance(i).value(attr))) {
444
counts[(int) data.instance(i++).classValue()]++;
446
for (int j = 0; j < counts.length; j++) {
447
if (counts[j] > counts[it]) {
451
if (cl > 0) { // can we coalesce with previous class?
452
if (counts[classifications[cl - 1]] == counts[it]) {
453
it = classifications[cl - 1];
455
if (it == classifications[cl - 1]) {
459
correct += counts[it];
460
classifications[cl] = it;
461
if (i < lastInstance) {
462
breakpoints[cl] = (data.instance(i - 1).value(attr)
463
+ data.instance(i).value(attr)) / 2;
468
throw new Exception("Only missing values in the training data!");
470
OneRRule r = new OneRRule(data, attr, cl); // new rule with cl branches
471
r.m_correct = correct;
472
for (int v = 0; v < cl; v++) {
473
r.m_classifications[v] = classifications[v];
475
r.m_breakpoints[v] = breakpoints[v];
483
* Returns an enumeration describing the available options..
485
* @return an enumeration of all the available options.
487
public Enumeration listOptions() {
489
String string = "\tThe minimum number of objects in a bucket (default: 6).";
491
Vector newVector = new Vector(1);
493
newVector.addElement(new Option(string, "B", 1,
494
"-B <minimum bucket size>"));
496
return newVector.elements();
500
* Parses a given list of options. <p/>
502
<!-- options-start -->
503
* Valid options are: <p/>
505
* <pre> -B <minimum bucket size>
506
* The minimum number of objects in a bucket (default: 6).</pre>
510
* @param options the list of options as an array of strings
511
* @throws Exception if an option is not supported
513
public void setOptions(String[] options) throws Exception {
515
String bucketSizeString = Utils.getOption('B', options);
516
if (bucketSizeString.length() != 0) {
517
m_minBucketSize = Integer.parseInt(bucketSizeString);
524
* Gets the current settings of the OneR classifier.
526
* @return an array of strings suitable for passing to setOptions
528
public String [] getOptions() {
530
String [] options = new String [2];
533
options[current++] = "-B"; options[current++] = "" + m_minBucketSize;
535
while (current < options.length) {
536
options[current++] = "";
542
* Returns a string that describes the classifier as source. The
543
* classifier will be contained in a class with the given name (there may
544
* be auxiliary classes),
545
* and will contain a method with the signature:
547
* public static double classify(Object[] i);
549
* where the array <code>i</code> contains elements that are either
550
* Double, String, with missing values represented as null. The generated
551
* code is public domain and comes with no warranty.
553
* @param className the name that should be given to the source class.
554
* @return the object source described by a string
555
* @throws Exception if the souce can't be computed
557
public String toSource(String className) throws Exception {
561
result = new StringBuffer();
563
if (m_ZeroR != null) {
564
result.append(((ZeroR) m_ZeroR).toSource(className));
567
result.append("class " + className + " {\n");
568
result.append(" public static double classify(Object[] i) {\n");
569
result.append(" // chosen attribute: " + m_rule.m_attr.name() + " (" + m_rule.m_attr.index() + ")\n");
572
result.append(" // missing value?\n");
573
result.append(" if (i[" + m_rule.m_attr.index() + "] == null)\n");
574
if (m_rule.m_missingValueClass != -1)
575
result.append(" return Double.NaN;\n");
577
result.append(" return 0;\n");
581
result.append(" // prediction\n");
582
result.append(" double v = 0;\n");
583
result.append(" double[] classifications = new double[]{" + Utils.arrayToString(m_rule.m_classifications) + "};");
584
result.append(" // ");
585
for (i = 0; i < m_rule.m_classifications.length; i++) {
588
result.append(m_rule.m_class.value(m_rule.m_classifications[i]));
591
if (m_rule.m_attr.isNominal()) {
592
for (i = 0; i < m_rule.m_attr.numValues(); i++) {
595
result.append("else ");
596
result.append("if (((String) i[" + m_rule.m_attr.index() + "]).equals(\"" + m_rule.m_attr.value(i) + "\"))\n");
597
result.append(" v = " + i + "; // " + m_rule.m_class.value(m_rule.m_classifications[i]) + "\n");
601
result.append(" double[] breakpoints = new double[]{" + Utils.arrayToString(m_rule.m_breakpoints) + "};\n");
602
result.append(" while (v < breakpoints.length && \n");
603
result.append(" ((Double) i[" + m_rule.m_attr.index() + "]) >= breakpoints[(int) v]) {\n");
604
result.append(" v++;\n");
605
result.append(" }\n");
607
result.append(" return classifications[(int) v];\n");
609
result.append(" }\n");
610
result.append("}\n");
613
return result.toString();
617
* Returns a description of the classifier
619
* @return a string representation of the classifier
621
public String toString() {
624
if (m_ZeroR != null) {
625
StringBuffer buf = new StringBuffer();
626
buf.append(this.getClass().getName().replaceAll(".*\\.", "") + "\n");
627
buf.append(this.getClass().getName().replaceAll(".*\\.", "").replaceAll(".", "=") + "\n\n");
628
buf.append("Warning: No model could be built, hence ZeroR model is used:\n\n");
629
buf.append(m_ZeroR.toString());
630
return buf.toString();
633
if (m_rule == null) {
634
return "OneR: No model built yet.";
636
return m_rule.toString();
640
* Returns the tip text for this property
641
* @return tip text for this property suitable for
642
* displaying in the explorer/experimenter gui
644
public String minBucketSizeTipText() {
645
return "The minimum bucket size used for discretizing numeric "
650
* Get the value of minBucketSize.
651
* @return Value of minBucketSize.
653
public int getMinBucketSize() {
655
return m_minBucketSize;
659
* Set the value of minBucketSize.
660
* @param v Value to assign to minBucketSize.
662
public void setMinBucketSize(int v) {
668
* Main method for testing this class
670
* @param argv the commandline options
672
public static void main(String [] argv) {
673
runClassifier(new OneR(), argv);