2
* This program is free software; you can redistribute it and/or modify
3
* it under the terms of the GNU General Public License as published by
4
* the Free Software Foundation; either version 2 of the License, or
5
* (at your option) any later version.
7
* This program is distributed in the hope that it will be useful,
8
* but WITHOUT ANY WARRANTY; without even the implied warranty of
9
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10
* GNU General Public License for more details.
12
* You should have received a copy of the GNU General Public License
13
* along with this program; if not, write to the Free Software
14
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19
* Copyright (C) 2006 University of Waikato, Hamilton, New Zealand
23
package weka.clusterers;
25
import weka.core.CheckScheme;
26
import weka.core.FastVector;
27
import weka.core.Instance;
28
import weka.core.Instances;
29
import weka.core.MultiInstanceCapabilitiesHandler;
30
import weka.core.Option;
31
import weka.core.OptionHandler;
32
import weka.core.SerializationHelper;
33
import weka.core.TestInstances;
34
import weka.core.Utils;
35
import weka.core.WeightedInstancesHandler;
37
import java.util.Enumeration;
38
import java.util.Random;
39
import java.util.Vector;
42
* Class for examining the capabilities and finding problems with
43
* clusterers. If you implement a clusterer using the WEKA libraries,
44
* you should run the checks on it to ensure robustness and correct
45
* operation. Passing all the tests of this object does not mean
46
* bugs in the clusterer don't exist, but this will help find some
50
* <code>java weka.clusterers.CheckClusterer -W clusterer_name
51
* -- clusterer_options </code><p/>
53
* CheckClusterer reports on the following:
55
* <li> Clusterer abilities
57
* <li> Possible command line options to the clusterer </li>
58
* <li> Whether the clusterer can predict nominal, numeric, string,
59
* date or relational class attributes.</li>
60
* <li> Whether the clusterer can handle numeric predictor attributes </li>
61
* <li> Whether the clusterer can handle nominal predictor attributes </li>
62
* <li> Whether the clusterer can handle string predictor attributes </li>
63
* <li> Whether the clusterer can handle date predictor attributes </li>
64
* <li> Whether the clusterer can handle relational predictor attributes </li>
65
* <li> Whether the clusterer can handle multi-instance data </li>
66
* <li> Whether the clusterer can handle missing predictor values </li>
67
* <li> Whether the clusterer can handle instance weights </li>
70
* <li> Correct functioning
72
* <li> Correct initialisation during buildClusterer (i.e. no result
73
* changes when buildClusterer called repeatedly) </li>
74
* <li> Whether the clusterer alters the data passed to it
75
* (number of instances, instance order, instance weights, etc) </li>
78
* <li> Degenerate cases
80
* <li> building clusterer with zero training instances </li>
81
* <li> all but one predictor attribute values missing </li>
82
* <li> all predictor attribute values missing </li>
83
* <li> all but one class values missing </li>
84
* <li> all class values missing </li>
88
* Running CheckClusterer with the debug option set will output the
89
* training dataset for any failed tests.<p/>
91
* The <code>weka.clusterers.AbstractClustererTest</code> uses this
92
* class to test all the clusterers. Any changes here have to be
93
* checked in that abstract test class, too. <p/>
95
<!-- options-start -->
96
* Valid options are: <p/>
99
* Turn on debugging output.</pre>
102
* Silent mode - prints nothing to stdout.</pre>
104
* <pre> -N <num>
105
* The number of instances in the datasets (default 20).</pre>
107
* <pre> -nominal <num>
108
* The number of nominal attributes (default 2).</pre>
110
* <pre> -nominal-values <num>
111
* The number of values for nominal attributes (default 1).</pre>
113
* <pre> -numeric <num>
114
* The number of numeric attributes (default 1).</pre>
116
* <pre> -string <num>
117
* The number of string attributes (default 1).</pre>
119
* <pre> -date <num>
120
* The number of date attributes (default 1).</pre>
122
* <pre> -relational <num>
123
* The number of relational attributes (default 1).</pre>
125
* <pre> -num-instances-relational <num>
126
* The number of instances in relational/bag attributes (default 10).</pre>
128
* <pre> -words <comma-separated-list>
129
* The words to use in string attributes.</pre>
131
* <pre> -word-separators <chars>
132
* The word separators to use in string attributes.</pre>
135
* Full name of the clusterer analyzed.
136
* eg: weka.clusterers.SimpleKMeans
137
* (default weka.clusterers.SimpleKMeans)</pre>
140
* Options specific to clusterer weka.clusterers.SimpleKMeans:
143
* <pre> -N <num>
144
* number of clusters. (default = 2).</pre>
146
* <pre> -S <num>
147
* random number seed.
152
* Options after -- are passed to the designated clusterer.<p/>
154
* @author Len Trigg (trigg@cs.waikato.ac.nz)
155
* @author FracPete (fracpete at waikato dot ac dot nz)
156
* @version $Revision: 1.8 $
159
public class CheckClusterer
160
extends CheckScheme {
163
* Note about test methods:
164
* - methods return array of booleans
165
* - first index: success or not
166
* - second index: acceptable or not (e.g., Exception is OK)
168
* FracPete (fracpete at waikato dot ac dot nz)
171
/** The clusterer to be examined */
172
protected Clusterer m_Clusterer = new SimpleKMeans();
175
* default constructor
177
public CheckClusterer() {
184
* Returns an enumeration describing the available options.
186
* @return an enumeration of all the available options.
188
public Enumeration listOptions() {
189
Vector result = new Vector();
191
Enumeration en = super.listOptions();
192
while (en.hasMoreElements())
193
result.addElement(en.nextElement());
195
result.addElement(new Option(
196
"\tFull name of the clusterer analyzed.\n"
197
+"\teg: weka.clusterers.SimpleKMeans\n"
198
+ "\t(default weka.clusterers.SimpleKMeans)",
201
if ((m_Clusterer != null)
202
&& (m_Clusterer instanceof OptionHandler)) {
203
result.addElement(new Option("", "", 0,
204
"\nOptions specific to clusterer "
205
+ m_Clusterer.getClass().getName()
207
Enumeration enu = ((OptionHandler)m_Clusterer).listOptions();
208
while (enu.hasMoreElements())
209
result.addElement(enu.nextElement());
212
return result.elements();
216
* Parses a given list of options. <p/>
218
<!-- options-start -->
219
* Valid options are: <p/>
222
* Turn on debugging output.</pre>
225
* Silent mode - prints nothing to stdout.</pre>
227
* <pre> -N <num>
228
* The number of instances in the datasets (default 20).</pre>
230
* <pre> -nominal <num>
231
* The number of nominal attributes (default 2).</pre>
233
* <pre> -nominal-values <num>
234
* The number of values for nominal attributes (default 1).</pre>
236
* <pre> -numeric <num>
237
* The number of numeric attributes (default 1).</pre>
239
* <pre> -string <num>
240
* The number of string attributes (default 1).</pre>
242
* <pre> -date <num>
243
* The number of date attributes (default 1).</pre>
245
* <pre> -relational <num>
246
* The number of relational attributes (default 1).</pre>
248
* <pre> -num-instances-relational <num>
249
* The number of instances in relational/bag attributes (default 10).</pre>
251
* <pre> -words <comma-separated-list>
252
* The words to use in string attributes.</pre>
254
* <pre> -word-separators <chars>
255
* The word separators to use in string attributes.</pre>
258
* Full name of the clusterer analyzed.
259
* eg: weka.clusterers.SimpleKMeans
260
* (default weka.clusterers.SimpleKMeans)</pre>
263
* Options specific to clusterer weka.clusterers.SimpleKMeans:
266
* <pre> -N <num>
267
* number of clusters. (default = 2).</pre>
269
* <pre> -S <num>
270
* random number seed.
275
* @param options the list of options as an array of strings
276
* @throws Exception if an option is not supported
278
public void setOptions(String[] options) throws Exception {
281
tmpStr = Utils.getOption('N', options);
283
super.setOptions(options);
285
if (tmpStr.length() != 0)
286
setNumInstances(Integer.parseInt(tmpStr));
290
tmpStr = Utils.getOption('W', options);
291
if (tmpStr.length() == 0)
292
tmpStr = weka.clusterers.SimpleKMeans.class.getName();
298
Utils.partitionOptions(options)));
302
* Gets the current settings of the CheckClusterer.
304
* @return an array of strings suitable for passing to setOptions
306
public String[] getOptions() {
311
result = new Vector();
313
options = super.getOptions();
314
for (i = 0; i < options.length; i++)
315
result.add(options[i]);
317
if (getClusterer() != null) {
319
result.add(getClusterer().getClass().getName());
322
if ((m_Clusterer != null) && (m_Clusterer instanceof OptionHandler))
323
options = ((OptionHandler) m_Clusterer).getOptions();
325
options = new String[0];
327
if (options.length > 0) {
329
for (i = 0; i < options.length; i++)
330
result.add(options[i]);
333
return (String[]) result.toArray(new String[result.size()]);
337
* Begin the tests, reporting results to System.out
339
public void doTests() {
341
if (getClusterer() == null) {
342
println("\n=== No clusterer set ===");
345
println("\n=== Check on Clusterer: "
346
+ getClusterer().getClass().getName()
350
println("--> Checking for interfaces");
352
boolean updateable = updateableClusterer()[0];
353
boolean weightedInstancesHandler = weightedInstancesHandler()[0];
354
boolean multiInstanceHandler = multiInstanceHandler()[0];
355
println("--> Clusterer tests");
356
declaresSerialVersionUID();
357
runTests(weightedInstancesHandler, multiInstanceHandler, updateable);
361
* Set the clusterer for testing.
363
* @param newClusterer the Clusterer to use.
365
public void setClusterer(Clusterer newClusterer) {
366
m_Clusterer = newClusterer;
370
* Get the clusterer to be examined
372
* @return the clusterer to be examined
374
public Clusterer getClusterer() {
379
* Run a battery of tests
381
* @param weighted true if the clusterer says it handles weights
382
* @param multiInstance true if the clusterer is a multi-instance clusterer
383
* @param updateable true if the clusterer is updateable
385
protected void runTests(boolean weighted, boolean multiInstance, boolean updateable) {
387
boolean PNom = canPredict(true, false, false, false, false, multiInstance)[0];
388
boolean PNum = canPredict(false, true, false, false, false, multiInstance)[0];
389
boolean PStr = canPredict(false, false, true, false, false, multiInstance)[0];
390
boolean PDat = canPredict(false, false, false, true, false, multiInstance)[0];
393
PRel = canPredict(false, false, false, false, true, multiInstance)[0];
397
if (PNom || PNum || PStr || PDat || PRel) {
399
instanceWeights(PNom, PNum, PStr, PDat, PRel, multiInstance);
401
canHandleZeroTraining(PNom, PNum, PStr, PDat, PRel, multiInstance);
402
boolean handleMissingPredictors = canHandleMissing(PNom, PNum, PStr, PDat, PRel,
403
multiInstance, true, 20)[0];
404
if (handleMissingPredictors)
405
canHandleMissing(PNom, PNum, PStr, PDat, PRel, multiInstance, true, 100);
407
correctBuildInitialisation(PNom, PNum, PStr, PDat, PRel, multiInstance);
408
datasetIntegrity(PNom, PNum, PStr, PDat, PRel, multiInstance, handleMissingPredictors);
410
updatingEquality(PNom, PNum, PStr, PDat, PRel, multiInstance);
415
* Checks whether the scheme can take command line options.
417
* @return index 0 is true if the clusterer can take options
419
protected boolean[] canTakeOptions() {
421
boolean[] result = new boolean[2];
424
if (m_Clusterer instanceof OptionHandler) {
427
println("\n=== Full report ===");
428
Enumeration enu = ((OptionHandler)m_Clusterer).listOptions();
429
while (enu.hasMoreElements()) {
430
Option option = (Option) enu.nextElement();
431
print(option.synopsis() + "\n"
432
+ option.description() + "\n");
447
* Checks whether the scheme can build models incrementally.
449
* @return index 0 is true if the clusterer can train incrementally
451
protected boolean[] updateableClusterer() {
453
boolean[] result = new boolean[2];
455
print("updateable clusterer...");
456
if (m_Clusterer instanceof UpdateableClusterer) {
469
* Checks whether the scheme says it can handle instance weights.
471
* @return index 0 is true if the clusterer handles instance weights
473
protected boolean[] weightedInstancesHandler() {
475
boolean[] result = new boolean[2];
477
print("weighted instances clusterer...");
478
if (m_Clusterer instanceof WeightedInstancesHandler) {
491
* Checks whether the scheme handles multi-instance data.
493
* @return index 0 is true if the clusterer handles multi-instance data
495
protected boolean[] multiInstanceHandler() {
496
boolean[] result = new boolean[2];
498
print("multi-instance clusterer...");
499
if (m_Clusterer instanceof MultiInstanceCapabilitiesHandler) {
512
* tests for a serialVersionUID. Fails in case the scheme doesn't declare
515
* @return index 0 is true if the scheme declares a UID
517
protected boolean[] declaresSerialVersionUID() {
518
boolean[] result = new boolean[2];
520
print("serialVersionUID...");
522
result[0] = !SerializationHelper.needsUID(m_Clusterer.getClass());
533
* Checks basic prediction of the scheme, for simple non-troublesome
536
* @param nominalPredictor if true use nominal predictor attributes
537
* @param numericPredictor if true use numeric predictor attributes
538
* @param stringPredictor if true use string predictor attributes
539
* @param datePredictor if true use date predictor attributes
540
* @param relationalPredictor if true use relational predictor attributes
541
* @param multiInstance whether multi-instance is needed
542
* @return index 0 is true if the test was passed, index 1 is true if test
545
protected boolean[] canPredict(
546
boolean nominalPredictor,
547
boolean numericPredictor,
548
boolean stringPredictor,
549
boolean datePredictor,
550
boolean relationalPredictor,
551
boolean multiInstance) {
553
print("basic predict");
554
printAttributeSummary(
555
nominalPredictor, numericPredictor, stringPredictor, datePredictor, relationalPredictor, multiInstance);
557
FastVector accepts = new FastVector();
558
accepts.addElement("unary");
559
accepts.addElement("binary");
560
accepts.addElement("nominal");
561
accepts.addElement("numeric");
562
accepts.addElement("string");
563
accepts.addElement("date");
564
accepts.addElement("relational");
565
accepts.addElement("multi-instance");
566
accepts.addElement("not in classpath");
567
int numTrain = getNumInstances(), missingLevel = 0;
568
boolean predictorMissing = false;
570
return runBasicTest(nominalPredictor, numericPredictor, stringPredictor,
571
datePredictor, relationalPredictor,
573
missingLevel, predictorMissing,
579
* Checks whether the scheme can handle zero training instances.
581
* @param nominalPredictor if true use nominal predictor attributes
582
* @param numericPredictor if true use numeric predictor attributes
583
* @param stringPredictor if true use string predictor attributes
584
* @param datePredictor if true use date predictor attributes
585
* @param relationalPredictor if true use relational predictor attributes
586
* @param multiInstance whether multi-instance is needed
587
* @return index 0 is true if the test was passed, index 1 is true if test
590
protected boolean[] canHandleZeroTraining(
591
boolean nominalPredictor,
592
boolean numericPredictor,
593
boolean stringPredictor,
594
boolean datePredictor,
595
boolean relationalPredictor,
596
boolean multiInstance) {
598
print("handle zero training instances");
599
printAttributeSummary(
600
nominalPredictor, numericPredictor, stringPredictor, datePredictor, relationalPredictor, multiInstance);
602
FastVector accepts = new FastVector();
603
accepts.addElement("train");
604
accepts.addElement("value");
605
int numTrain = 0, missingLevel = 0;
606
boolean predictorMissing = false;
609
nominalPredictor, numericPredictor, stringPredictor,
610
datePredictor, relationalPredictor,
612
missingLevel, predictorMissing,
618
* Checks whether the scheme correctly initialises models when
619
* buildClusterer is called. This test calls buildClusterer with
620
* one training dataset. buildClusterer is then called on a training set
621
* with different structure, and then again with the original training set.
622
* If the equals method of the ClusterEvaluation class returns
623
* false, this is noted as incorrect build initialisation.
625
* @param nominalPredictor if true use nominal predictor attributes
626
* @param numericPredictor if true use numeric predictor attributes
627
* @param stringPredictor if true use string predictor attributes
628
* @param datePredictor if true use date predictor attributes
629
* @param relationalPredictor if true use relational predictor attributes
630
* @param multiInstance whether multi-instance is needed
631
* @return index 0 is true if the test was passed
633
protected boolean[] correctBuildInitialisation(
634
boolean nominalPredictor,
635
boolean numericPredictor,
636
boolean stringPredictor,
637
boolean datePredictor,
638
boolean relationalPredictor,
639
boolean multiInstance) {
641
boolean[] result = new boolean[2];
643
print("correct initialisation during buildClusterer");
644
printAttributeSummary(
645
nominalPredictor, numericPredictor, stringPredictor, datePredictor, relationalPredictor, multiInstance);
647
int numTrain = getNumInstances(), missingLevel = 0;
648
boolean predictorMissing = false;
650
Instances train1 = null;
651
Instances train2 = null;
652
Clusterer clusterer = null;
653
ClusterEvaluation evaluation1A = null;
654
ClusterEvaluation evaluation1B = null;
655
ClusterEvaluation evaluation2 = null;
656
boolean built = false;
660
// Make two train sets with different numbers of attributes
661
train1 = makeTestDataset(42, numTrain,
662
nominalPredictor ? getNumNominal() : 0,
663
numericPredictor ? getNumNumeric() : 0,
664
stringPredictor ? getNumString() : 0,
665
datePredictor ? getNumDate() : 0,
666
relationalPredictor ? getNumRelational() : 0,
668
train2 = makeTestDataset(84, numTrain,
669
nominalPredictor ? getNumNominal() + 1 : 0,
670
numericPredictor ? getNumNumeric() + 1 : 0,
671
stringPredictor ? getNumString() : 0,
672
datePredictor ? getNumDate() : 0,
673
relationalPredictor ? getNumRelational() : 0,
675
if (nominalPredictor && !multiInstance) {
676
train1.deleteAttributeAt(0);
677
train2.deleteAttributeAt(0);
679
if (missingLevel > 0) {
680
addMissing(train1, missingLevel, predictorMissing);
681
addMissing(train2, missingLevel, predictorMissing);
684
clusterer = Clusterer.makeCopies(getClusterer(), 1)[0];
685
evaluation1A = new ClusterEvaluation();
686
evaluation1B = new ClusterEvaluation();
687
evaluation2 = new ClusterEvaluation();
688
} catch (Exception ex) {
689
throw new Error("Error setting up for tests: " + ex.getMessage());
693
clusterer.buildClusterer(train1);
695
evaluation1A.setClusterer(clusterer);
696
evaluation1A.evaluateClusterer(train1);
700
clusterer.buildClusterer(train2);
702
evaluation2.setClusterer(clusterer);
703
evaluation2.evaluateClusterer(train2);
707
clusterer.buildClusterer(train1);
709
evaluation1B.setClusterer(clusterer);
710
evaluation1B.evaluateClusterer(train1);
713
if (!evaluation1A.equals(evaluation1B)) {
715
println("\n=== Full report ===\n");
716
println("First buildClusterer()");
717
println(evaluation1A.clusterResultsToString() + "\n\n");
718
println("Second buildClusterer()");
719
println(evaluation1B.clusterResultsToString() + "\n\n");
721
throw new Exception("Results differ between buildClusterer calls");
726
if (false && m_Debug) {
727
println("\n=== Full report ===\n");
728
println("First buildClusterer()");
729
println(evaluation1A.clusterResultsToString() + "\n\n");
730
println("Second buildClusterer()");
731
println(evaluation1B.clusterResultsToString() + "\n\n");
734
catch (Exception ex) {
738
println("\n=== Full Report ===");
739
print("Problem during");
747
print(" of dataset 1");
750
print(" of dataset 2");
753
print(" of dataset 1 (2nd build)");
756
print(", comparing results from builds of dataset 1");
759
println(": " + ex.getMessage() + "\n");
760
println("here are the datasets:\n");
761
println("=== Train1 Dataset ===\n"
762
+ train1.toString() + "\n");
763
println("=== Train2 Dataset ===\n"
764
+ train2.toString() + "\n");
772
* Checks basic missing value handling of the scheme. If the missing
773
* values cause an exception to be thrown by the scheme, this will be
776
* @param nominalPredictor if true use nominal predictor attributes
777
* @param numericPredictor if true use numeric predictor attributes
778
* @param stringPredictor if true use string predictor attributes
779
* @param datePredictor if true use date predictor attributes
780
* @param relationalPredictor if true use relational predictor attributes
781
* @param multiInstance whether multi-instance is needed
782
* @param predictorMissing true if the missing values may be in
784
* @param missingLevel the percentage of missing values
785
* @return index 0 is true if the test was passed, index 1 is true if test
788
protected boolean[] canHandleMissing(
789
boolean nominalPredictor,
790
boolean numericPredictor,
791
boolean stringPredictor,
792
boolean datePredictor,
793
boolean relationalPredictor,
794
boolean multiInstance,
795
boolean predictorMissing,
798
if (missingLevel == 100)
801
if (predictorMissing) {
805
printAttributeSummary(
806
nominalPredictor, numericPredictor, stringPredictor, datePredictor, relationalPredictor, multiInstance);
808
FastVector accepts = new FastVector();
809
accepts.addElement("missing");
810
accepts.addElement("value");
811
accepts.addElement("train");
812
int numTrain = getNumInstances();
814
return runBasicTest(nominalPredictor, numericPredictor, stringPredictor,
815
datePredictor, relationalPredictor,
817
missingLevel, predictorMissing,
823
* Checks whether the clusterer can handle instance weights.
824
* This test compares the clusterer performance on two datasets
825
* that are identical except for the training weights. If the
826
* results change, then the clusterer must be using the weights. It
827
* may be possible to get a false positive from this test if the
828
* weight changes aren't significant enough to induce a change
829
* in clusterer performance (but the weights are chosen to minimize
830
* the likelihood of this).
832
* @param nominalPredictor if true use nominal predictor attributes
833
* @param numericPredictor if true use numeric predictor attributes
834
* @param stringPredictor if true use string predictor attributes
835
* @param datePredictor if true use date predictor attributes
836
* @param relationalPredictor if true use relational predictor attributes
837
* @param multiInstance whether multi-instance is needed
838
* @return index 0 true if the test was passed
840
protected boolean[] instanceWeights(
841
boolean nominalPredictor,
842
boolean numericPredictor,
843
boolean stringPredictor,
844
boolean datePredictor,
845
boolean relationalPredictor,
846
boolean multiInstance) {
848
print("clusterer uses instance weights");
849
printAttributeSummary(
850
nominalPredictor, numericPredictor, stringPredictor, datePredictor, relationalPredictor, multiInstance);
852
int numTrain = 2*getNumInstances(), missingLevel = 0;
853
boolean predictorMissing = false;
855
boolean[] result = new boolean[2];
856
Instances train = null;
857
Clusterer [] clusterers = null;
858
ClusterEvaluation evaluationB = null;
859
ClusterEvaluation evaluationI = null;
860
boolean built = false;
861
boolean evalFail = false;
863
train = makeTestDataset(42, numTrain,
864
nominalPredictor ? getNumNominal() + 1 : 0,
865
numericPredictor ? getNumNumeric() + 1 : 0,
866
stringPredictor ? getNumString() : 0,
867
datePredictor ? getNumDate() : 0,
868
relationalPredictor ? getNumRelational() : 0,
870
if (nominalPredictor && !multiInstance)
871
train.deleteAttributeAt(0);
872
if (missingLevel > 0)
873
addMissing(train, missingLevel, predictorMissing);
874
clusterers = Clusterer.makeCopies(getClusterer(), 2);
875
evaluationB = new ClusterEvaluation();
876
evaluationI = new ClusterEvaluation();
877
clusterers[0].buildClusterer(train);
878
evaluationB.setClusterer(clusterers[0]);
879
} catch (Exception ex) {
880
throw new Error("Error setting up for tests: " + ex.getMessage());
884
// Now modify instance weights and re-build/test
885
for (int i = 0; i < train.numInstances(); i++) {
886
train.instance(i).setWeight(0);
888
Random random = new Random(1);
889
for (int i = 0; i < train.numInstances() / 2; i++) {
890
int inst = Math.abs(random.nextInt()) % train.numInstances();
891
int weight = Math.abs(random.nextInt()) % 10 + 1;
892
train.instance(inst).setWeight(weight);
894
clusterers[1].buildClusterer(train);
896
evaluationI.setClusterer(clusterers[1]);
897
if (evaluationB.equals(evaluationI)) {
900
throw new Exception("evalFail");
905
} catch (Exception ex) {
910
println("\n=== Full Report ===");
913
println("Results don't differ between non-weighted and "
914
+ "weighted instance models.");
915
println("Here are the results:\n");
916
println("\nboth methods\n");
917
println(evaluationB.clusterResultsToString());
919
print("Problem during");
925
println(": " + ex.getMessage() + "\n");
927
println("Here is the dataset:\n");
928
println("=== Train Dataset ===\n"
929
+ train.toString() + "\n");
930
println("=== Train Weights ===\n");
931
for (int i = 0; i < train.numInstances(); i++) {
932
println(" " + (i + 1)
933
+ " " + train.instance(i).weight());
942
* Checks whether the scheme alters the training dataset during
943
* training. If the scheme needs to modify the training
944
* data it should take a copy of the training data. Currently checks
945
* for changes to header structure, number of instances, order of
946
* instances, instance weights.
948
* @param nominalPredictor if true use nominal predictor attributes
949
* @param numericPredictor if true use numeric predictor attributes
950
* @param stringPredictor if true use string predictor attributes
951
* @param datePredictor if true use date predictor attributes
952
* @param relationalPredictor if true use relational predictor attributes
953
* @param multiInstance whether multi-instance is needed
954
* @param predictorMissing true if we know the clusterer can handle
955
* (at least) moderate missing predictor values
956
* @return index 0 is true if the test was passed
958
protected boolean[] datasetIntegrity(
959
boolean nominalPredictor,
960
boolean numericPredictor,
961
boolean stringPredictor,
962
boolean datePredictor,
963
boolean relationalPredictor,
964
boolean multiInstance,
965
boolean predictorMissing) {
967
print("clusterer doesn't alter original datasets");
968
printAttributeSummary(
969
nominalPredictor, numericPredictor, stringPredictor, datePredictor, relationalPredictor, multiInstance);
971
int numTrain = getNumInstances(), missingLevel = 20;
973
boolean[] result = new boolean[2];
974
Instances train = null;
975
Clusterer clusterer = null;
977
train = makeTestDataset(42, numTrain,
978
nominalPredictor ? getNumNominal() : 0,
979
numericPredictor ? getNumNumeric() : 0,
980
stringPredictor ? getNumString() : 0,
981
datePredictor ? getNumDate() : 0,
982
relationalPredictor ? getNumRelational() : 0,
984
if (nominalPredictor && !multiInstance)
985
train.deleteAttributeAt(0);
986
if (missingLevel > 0)
987
addMissing(train, missingLevel, predictorMissing);
988
clusterer = Clusterer.makeCopies(getClusterer(), 1)[0];
989
} catch (Exception ex) {
990
throw new Error("Error setting up for tests: " + ex.getMessage());
993
Instances trainCopy = new Instances(train);
994
clusterer.buildClusterer(trainCopy);
995
compareDatasets(train, trainCopy);
999
} catch (Exception ex) {
1004
println("\n=== Full Report ===");
1005
print("Problem during training");
1006
println(": " + ex.getMessage() + "\n");
1007
println("Here is the dataset:\n");
1008
println("=== Train Dataset ===\n"
1009
+ train.toString() + "\n");
1017
* Checks whether an updateable scheme produces the same model when
1018
* trained incrementally as when batch trained. The model itself
1019
* cannot be compared, so we compare the evaluation on test data
1020
* for both models. It is possible to get a false positive on this
1021
* test (likelihood depends on the clusterer).
1023
* @param nominalPredictor if true use nominal predictor attributes
1024
* @param numericPredictor if true use numeric predictor attributes
1025
* @param stringPredictor if true use string predictor attributes
1026
* @param datePredictor if true use date predictor attributes
1027
* @param relationalPredictor if true use relational predictor attributes
1028
* @param multiInstance whether multi-instance is needed
1029
* @return index 0 is true if the test was passed
1031
protected boolean[] updatingEquality(
1032
boolean nominalPredictor,
1033
boolean numericPredictor,
1034
boolean stringPredictor,
1035
boolean datePredictor,
1036
boolean relationalPredictor,
1037
boolean multiInstance) {
1039
print("incremental training produces the same results"
1040
+ " as batch training");
1041
printAttributeSummary(
1042
nominalPredictor, numericPredictor, stringPredictor, datePredictor, relationalPredictor, multiInstance);
1044
int numTrain = getNumInstances(), missingLevel = 0;
1045
boolean predictorMissing = false, classMissing = false;
1047
boolean[] result = new boolean[2];
1048
Instances train = null;
1049
Clusterer[] clusterers = null;
1050
ClusterEvaluation evaluationB = null;
1051
ClusterEvaluation evaluationI = null;
1052
boolean built = false;
1054
train = makeTestDataset(42, numTrain,
1055
nominalPredictor ? getNumNominal() : 0,
1056
numericPredictor ? getNumNumeric() : 0,
1057
stringPredictor ? getNumString() : 0,
1058
datePredictor ? getNumDate() : 0,
1059
relationalPredictor ? getNumRelational() : 0,
1061
if (missingLevel > 0)
1062
addMissing(train, missingLevel, predictorMissing, classMissing);
1063
clusterers = Clusterer.makeCopies(getClusterer(), 2);
1064
evaluationB = new ClusterEvaluation();
1065
evaluationI = new ClusterEvaluation();
1066
clusterers[0].buildClusterer(train);
1067
evaluationB.setClusterer(clusterers[0]);
1068
} catch (Exception ex) {
1069
throw new Error("Error setting up for tests: " + ex.getMessage());
1072
clusterers[1].buildClusterer(new Instances(train, 0));
1073
for (int i = 0; i < train.numInstances(); i++) {
1074
((UpdateableClusterer)clusterers[1]).updateClusterer(
1078
evaluationI.setClusterer(clusterers[1]);
1079
if (!evaluationB.equals(evaluationI)) {
1084
println("\n=== Full Report ===");
1085
println("Results differ between batch and "
1086
+ "incrementally built models.\n"
1087
+ "Depending on the classifier, this may be OK");
1088
println("Here are the results:\n");
1089
println("\nbatch built results\n" + evaluationB.clusterResultsToString());
1090
println("\nincrementally built results\n" + evaluationI.clusterResultsToString());
1091
println("Here are the datasets:\n");
1092
println("=== Train Dataset ===\n"
1093
+ train.toString() + "\n");
1100
} catch (Exception ex) {
1103
print("Problem during");
1108
println(": " + ex.getMessage() + "\n");
1115
* Runs a test on the datasets with the given characteristics.
1117
* @param nominalPredictor if true use nominal predictor attributes
1118
* @param numericPredictor if true use numeric predictor attributes
1119
* @param stringPredictor if true use string predictor attributes
1120
* @param datePredictor if true use date predictor attributes
1121
* @param relationalPredictor if true use relational predictor attributes
1122
* @param multiInstance whether multi-instance is needed
1123
* @param missingLevel the percentage of missing values
1124
* @param predictorMissing true if the missing values may be in
1126
* @param numTrain the number of instances in the training set
1127
* @param accepts the acceptable string in an exception
1128
* @return index 0 is true if the test was passed, index 1 is true if test
1131
// Builds a test dataset with the requested attribute types, trains a copy of
// the configured clusterer on it and reports the outcome in a two-element
// boolean array.
// NOTE(review): this extract interleaves bare line numbers with the code and
// several original lines are not visible (e.g. the declarations of
// missingLevel and numTrain, the "try {" openers, closing braces and the
// final return), so the comments below only describe what is shown.
protected boolean[] runBasicTest(boolean nominalPredictor,
1132
boolean numericPredictor,
1133
boolean stringPredictor,
1134
boolean datePredictor,
1135
boolean relationalPredictor,
1136
boolean multiInstance,
1138
boolean predictorMissing,
1140
FastVector accepts) {
1142
boolean[] result = new boolean[2];
1143
Instances train = null;
1144
Clusterer clusterer = null;
1146
// 42 is passed as the dataset seed; an attribute type that was not requested
// gets a count of 0, otherwise the count comes from the matching getNum*()
// accessor.
train = makeTestDataset(42, numTrain,
1147
nominalPredictor ? getNumNominal() : 0,
1148
numericPredictor ? getNumNumeric() : 0,
1149
stringPredictor ? getNumString() : 0,
1150
datePredictor ? getNumDate() : 0,
1151
relationalPredictor ? getNumRelational() : 0,
1153
// NOTE(review): the first attribute is removed for nominal, non-multi-instance
// data; the reason is not visible in this extract - TODO confirm.
if (nominalPredictor && !multiInstance)
1154
train.deleteAttributeAt(0);
1155
// Missing values are only injected for a positive missingLevel.
if (missingLevel > 0)
1156
addMissing(train, missingLevel, predictorMissing);
1157
// Train on an object obtained from Clusterer.makeCopies rather than on the
// one returned by getClusterer().
clusterer = Clusterer.makeCopies(getClusterer(), 1)[0];
1158
} catch (Exception ex) {
1159
// A failure during setup is not a test result: print the stack trace and
// abort with an Error. Only the message is carried over; the cause is not
// chained.
ex.printStackTrace();
1160
throw new Error("Error setting up for tests: " + ex.getMessage());
1163
clusterer.buildClusterer(train);
1167
catch (Exception ex) {
1168
boolean acceptable = false;
1169
// NOTE(review): getMessage() may return null for an exception created
// without a message; toLowerCase() would then throw a NullPointerException.
String msg = ex.getMessage().toLowerCase();
1170
// Look for any of the accepted strings inside the lower-cased message.
// Presumably the entries of accepts are lower case themselves - TODO confirm.
// (The statement executed on a match is not visible in this extract.)
for (int i = 0; i < accepts.size(); i++) {
1171
if (msg.indexOf((String)accepts.elementAt(i)) >= 0) {
1176
println("no" + (acceptable ? " (OK error message)" : ""));
1177
// Index 1 records whether the training failure had an acceptable message.
result[1] = acceptable;
1180
// Detailed report: the exception message, the strings the message was
// expected to mention, and the training data that triggered the failure.
println("\n=== Full Report ===");
1181
print("Problem during training");
1182
println(": " + ex.getMessage() + "\n");
1184
if (accepts.size() > 0) {
1185
print("Error message doesn't mention ");
1186
for (int i = 0; i < accepts.size(); i++) {
1190
print('"' + (String)accepts.elementAt(i) + '"');
1193
println("here is the dataset:\n");
1194
println("=== Train Dataset ===\n"
1195
+ train.toString() + "\n");
1204
* Add missing values to a dataset.
1206
* @param data the instances to add missing values to
1207
* @param level the level of missing values to add (if positive, this
1208
* is the probability that a value will be set to missing, if negative
1209
* all but one value will be set to missing (not yet implemented))
1210
* @param predictorMissing if true, predictor attributes will be modified
1212
protected void addMissing(Instances data, int level, boolean predictorMissing) {
1214
Random random = new Random(1);
1215
for (int i = 0; i < data.numInstances(); i++) {
1216
Instance current = data.instance(i);
1217
for (int j = 0; j < data.numAttributes(); j++) {
1218
if (predictorMissing) {
1219
if (Math.abs(random.nextInt()) % 100 < level)
1220
current.setMissing(j);
1227
* Make a simple set of instances with variable position of the class
1228
* attribute, which can later be modified for use in specific tests.
1230
* @param seed the random number seed
1231
* @param numInstances the number of instances to generate
1232
* @param numNominal the number of nominal attributes
1233
* @param numNumeric the number of numeric attributes
1234
* @param numString the number of string attributes
1235
* @param numDate the number of date attributes
1236
* @param numRelational the number of relational attributes
1237
* @param multiInstance whether the dataset should a multi-instance dataset
1238
* @return the test dataset
1239
* @throws Exception if the dataset couldn't be generated
1240
* @see TestInstances#CLASS_IS_LAST
1242
protected Instances makeTestDataset(int seed, int numInstances,
1243
int numNominal, int numNumeric,
1244
int numString, int numDate,
1246
boolean multiInstance)
1249
TestInstances dataset = new TestInstances();
1251
dataset.setSeed(seed);
1252
dataset.setNumInstances(numInstances);
1253
dataset.setNumNominal(numNominal);
1254
dataset.setNumNumeric(numNumeric);
1255
dataset.setNumString(numString);
1256
dataset.setNumDate(numDate);
1257
dataset.setNumRelational(numRelational);
1258
dataset.setClassIndex(TestInstances.NO_CLASS);
1259
dataset.setMultiInstance(multiInstance);
1261
return dataset.generate();
1265
* Print out a short summary string for the dataset characteristics
1267
* @param nominalPredictor true if nominal predictor attributes are present
1268
* @param numericPredictor true if numeric predictor attributes are present
1269
* @param stringPredictor true if string predictor attributes are present
1270
* @param datePredictor true if date predictor attributes are present
1271
* @param relationalPredictor true if relational predictor attributes are present
1272
* @param multiInstance whether multi-instance is needed
1274
// Assembles a short description of the predictor attribute types that are
// switched on and wraps it as " (<types> predictors)".
// NOTE(review): this extract interleaves bare line numbers with the code and
// omits several original lines (the declaration of str, most of the append
// statements and whatever finally outputs the text), so only the visible
// statements are described.
protected void printAttributeSummary(boolean nominalPredictor,
1275
boolean numericPredictor,
1276
boolean stringPredictor,
1277
boolean datePredictor,
1278
boolean relationalPredictor,
1279
boolean multiInstance) {
1283
if (numericPredictor)
1286
// Each following attribute type first checks whether str already has content;
// presumably a separator is inserted in that case - the statement itself is
// not visible here, TODO confirm.
if (nominalPredictor) {
1287
if (str.length() > 0)
1292
if (stringPredictor) {
1293
if (str.length() > 0)
1298
if (datePredictor) {
1299
if (str.length() > 0)
1304
if (relationalPredictor) {
1305
if (str.length() > 0)
1307
str += "relational";
1310
// Surround the collected type names with the fixed prefix and suffix.
str = " (" + str + " predictors)";
1316
* Test method for this class
1318
* @param args the commandline options
1320
public static void main(String [] args) {
1321
runCheck(new CheckClusterer(), args);