2
* This program is free software; you can redistribute it and/or modify
3
* it under the terms of the GNU General Public License as published by
4
* the Free Software Foundation; either version 2 of the License, or
5
* (at your option) any later version.
7
* This program is distributed in the hope that it will be useful,
8
* but WITHOUT ANY WARRANTY; without even the implied warranty of
9
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10
* GNU General Public License for more details.
12
* You should have received a copy of the GNU General Public License
13
* along with this program; if not, write to the Free Software
14
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18
* MakeDensityBasedClusterer.java
19
* Copyright (C) 2002 University of Waikato, Hamilton, New Zealand
23
package weka.clusterers;
25
import weka.core.Capabilities;
26
import weka.core.Instance;
27
import weka.core.Instances;
28
import weka.core.Option;
29
import weka.core.OptionHandler;
30
import weka.core.Utils;
31
import weka.core.WeightedInstancesHandler;
32
import weka.estimators.DiscreteEstimator;
33
import weka.filters.unsupervised.attribute.ReplaceMissingValues;
35
import java.util.Enumeration;
36
import java.util.Vector;
39
<!-- globalinfo-start -->
40
* Class for wrapping a Clusterer to make it return a distribution and density. Fits normal distributions and discrete distributions within each cluster produced by the wrapped clusterer. Supports the NumberOfClustersRequestable interface only if the wrapped Clusterer does.
42
<!-- globalinfo-end -->
44
<!-- options-start -->
45
* Valid options are: <p/>
47
* <pre> -M <num>
48
* minimum allowable standard deviation for normal density computation
49
* (default 1e-6)</pre>
51
* <pre> -W <clusterer name>
53
* (default weka.clusterers.SimpleKMeans)</pre>
56
* Options specific to clusterer weka.clusterers.SimpleKMeans:
59
* <pre> -N <num>
60
* number of clusters. (default = 2).</pre>
62
* <pre> -S <num>
68
* Options after "--" are passed on to the base clusterer.
70
* @author Richard Kirkby (rkirkby@cs.waikato.ac.nz)
71
* @author Mark Hall (mhall@cs.waikato.ac.nz)
72
* @author Eibe Frank (eibe@cs.waikato.ac.nz)
73
* @version $Revision: 1.13 $
75
public class MakeDensityBasedClusterer
76
extends DensityBasedClusterer
77
implements NumberOfClustersRequestable,
79
WeightedInstancesHandler {
81
/** for serialization */
82
static final long serialVersionUID = -5643302427972186631L;
84
/** holds training instances header information */
85
private Instances m_theInstances;
86
/** prior probabilities for the fitted clusters */
87
private double [] m_priors;
88
/** normal distributions fitted to each numeric attribute in each cluster */
89
private double [][][] m_modelNormal;
90
/** discrete distributions fitted to each discrete attribute in each cluster */
91
private DiscreteEstimator [][] m_model;
92
/** default minimum standard deviation */
93
private double m_minStdDev = 1e-6;
94
/** The clusterer being wrapped */
95
private Clusterer m_wrappedClusterer = new weka.clusterers.SimpleKMeans();
96
/** globally replace missing values */
97
private ReplaceMissingValues m_replaceMissing;
100
* Default constructor.
103
public MakeDensityBasedClusterer() {
108
* Contructs a MakeDensityBasedClusterer wrapping a given Clusterer.
110
* @param toWrap the clusterer to wrap around
112
public MakeDensityBasedClusterer(Clusterer toWrap) {
114
setClusterer(toWrap);
118
* Returns a string describing classifier
119
* @return a description suitable for
120
* displaying in the explorer/experimenter gui
122
public String globalInfo() {
124
"Class for wrapping a Clusterer to make it return a distribution "
125
+ "and density. Fits normal distributions and discrete distributions "
126
+ "within each cluster produced by the wrapped clusterer. Supports the "
127
+ "NumberOfClustersRequestable interface only if the wrapped Clusterer "
132
* String describing default clusterer.
134
* @return the default clusterer classname
136
protected String defaultClustererString() {
137
return SimpleKMeans.class.getName();
141
* Set the number of clusters to generate.
143
* @param n the number of clusters to generate
144
* @throws Exception if the wrapped clusterer has not been set, or if
145
* the wrapped clusterer does not implement this facility.
147
public void setNumClusters(int n) throws Exception {
148
if (m_wrappedClusterer == null) {
149
throw new Exception("Can't set the number of clusters to generate - "
150
+"no clusterer has been set yet.");
152
if (!(m_wrappedClusterer instanceof NumberOfClustersRequestable)) {
153
throw new Exception("Can't set the number of clusters to generate - "
154
+"wrapped clusterer does not support this facility.");
157
((NumberOfClustersRequestable)m_wrappedClusterer).setNumClusters(n);
161
* Returns default capabilities of the clusterer (i.e., of the wrapper
164
* @return the capabilities of this clusterer
166
public Capabilities getCapabilities() {
167
if (m_wrappedClusterer != null)
168
return m_wrappedClusterer.getCapabilities();
170
return super.getCapabilities();
174
* Builds a clusterer for a set of instances.
176
* @param data the instances to train the clusterer with
177
* @throws Exception if the clusterer hasn't been set or something goes wrong
179
public void buildClusterer(Instances data) throws Exception {
180
// can clusterer handle the data?
181
getCapabilities().testWithFail(data);
183
m_replaceMissing = new ReplaceMissingValues();
184
m_replaceMissing.setInputFormat(data);
185
data = weka.filters.Filter.useFilter(data, m_replaceMissing);
187
m_theInstances = new Instances(data, 0);
188
if (m_wrappedClusterer == null) {
189
throw new Exception("No clusterer has been set");
191
m_wrappedClusterer.buildClusterer(data);
193
new DiscreteEstimator[m_wrappedClusterer.numberOfClusters()][data.numAttributes()];
195
new double[m_wrappedClusterer.numberOfClusters()][data.numAttributes()][2];
196
double[][] weights = new double[m_wrappedClusterer.numberOfClusters()][data.numAttributes()];
197
m_priors = new double[m_wrappedClusterer.numberOfClusters()];
198
for (int i = 0; i < m_wrappedClusterer.numberOfClusters(); i++) {
199
for (int j = 0; j < data.numAttributes(); j++) {
200
if (data.attribute(j).isNominal()) {
201
m_model[i][j] = new DiscreteEstimator(data.attribute(j).numValues(),
207
Instance inst = null;
209
// Compute mean, etc.
210
int[] clusterIndex = new int[data.numInstances()];
211
for (int i = 0; i < data.numInstances(); i++) {
212
inst = data.instance(i);
213
int cluster = m_wrappedClusterer.clusterInstance(inst);
214
m_priors[cluster] += inst.weight();
215
for (int j = 0; j < data.numAttributes(); j++) {
216
if (!inst.isMissing(j)) {
217
if (data.attribute(j).isNominal()) {
218
m_model[cluster][j].addValue(inst.value(j),inst.weight());
220
m_modelNormal[cluster][j][0] += inst.weight() * inst.value(j);
221
weights[cluster][j] += inst.weight();
225
clusterIndex[i] = cluster;
228
for (int j = 0; j < data.numAttributes(); j++) {
229
if (data.attribute(j).isNumeric()) {
230
for (int i = 0; i < m_wrappedClusterer.numberOfClusters(); i++) {
231
if (weights[i][j] > 0) {
232
m_modelNormal[i][j][0] /= weights[i][j];
238
// Compute standard deviations
239
for (int i = 0; i < data.numInstances(); i++) {
240
inst = data.instance(i);
241
for (int j = 0; j < data.numAttributes(); j++) {
242
if (!inst.isMissing(j)) {
243
if (data.attribute(j).isNumeric()) {
244
double diff = m_modelNormal[clusterIndex[i]][j][0] - inst.value(j);
245
m_modelNormal[clusterIndex[i]][j][1] += inst.weight() * diff * diff;
251
for (int j = 0; j < data.numAttributes(); j++) {
252
if (data.attribute(j).isNumeric()) {
253
for (int i = 0; i < m_wrappedClusterer.numberOfClusters(); i++) {
254
if (weights[i][j] > 0) {
255
m_modelNormal[i][j][1] =
256
Math.sqrt(m_modelNormal[i][j][1] / weights[i][j]);
257
} else if (weights[i][j] <= 0) {
258
m_modelNormal[i][j][1] = Double.MAX_VALUE;
260
if (m_modelNormal[i][j][1] <= m_minStdDev) {
261
m_modelNormal[i][j][1] = data.attributeStats(j).numericStats.stdDev;
262
if (m_modelNormal[i][j][1] <= m_minStdDev) {
263
m_modelNormal[i][j][1] = m_minStdDev;
270
Utils.normalize(m_priors);
274
* Returns the cluster priors.
276
* @return the cluster priors
278
public double[] clusterPriors() {
280
double[] n = new double[m_priors.length];
282
System.arraycopy(m_priors, 0, n, 0, n.length);
287
* Computes the log of the conditional density (per cluster) for a given instance.
289
* @param inst the instance to compute the density for
290
* @return an array containing the estimated densities
291
* @throws Exception if the density could not be computed
294
public double[] logDensityPerClusterForInstance(Instance inst) throws Exception {
298
double[] wghts = new double[m_wrappedClusterer.numberOfClusters()];
300
m_replaceMissing.input(inst);
301
inst = m_replaceMissing.output();
303
for (i = 0; i < m_wrappedClusterer.numberOfClusters(); i++) {
305
for (j = 0; j < inst.numAttributes(); j++) {
306
if (!inst.isMissing(j)) {
307
if (inst.attribute(j).isNominal()) {
308
logprob += Math.log(m_model[i][j].getProbability(inst.value(j)));
309
} else { // numeric attribute
310
logprob += logNormalDens(inst.value(j),
311
m_modelNormal[i][j][0],
312
m_modelNormal[i][j][1]);
321
/** Constant for normal distribution. */
322
private static double m_normConst = 0.5 * Math.log(2 * Math.PI);
325
* Density function of normal distribution.
326
* @param x input value
327
* @param mean mean of distribution
328
* @param stdDev standard deviation of distribution
329
* @return the density
331
private double logNormalDens (double x, double mean, double stdDev) {
333
double diff = x - mean;
335
return - (diff * diff / (2 * stdDev * stdDev)) - m_normConst - Math.log(stdDev);
339
* Returns the number of clusters.
341
* @return the number of clusters generated for a training dataset.
342
* @throws Exception if number of clusters could not be returned successfully
344
public int numberOfClusters() throws Exception {
346
return m_wrappedClusterer.numberOfClusters();
350
* Returns a description of the clusterer.
352
* @return a string containing a description of the clusterer
354
public String toString() {
355
StringBuffer text = new StringBuffer();
356
text.append("MakeDensityBasedClusterer: \n\nWrapped clusterer: "
357
+ m_wrappedClusterer.toString());
359
text.append("\nFitted estimators (with ML estimates of variance):\n");
361
for (int j = 0; j < m_priors.length; j++) {
362
text.append("\nCluster: " + j + " Prior probability: "
363
+ Utils.doubleToString(m_priors[j], 4) + "\n\n");
365
for (int i = 0; i < m_model[0].length; i++) {
366
text.append("Attribute: " + m_theInstances.attribute(i).name() + "\n");
368
if (m_theInstances.attribute(i).isNominal()) {
369
if (m_model[j][i] != null) {
370
text.append(m_model[j][i].toString());
374
text.append("Normal Distribution. Mean = "
375
+ Utils.doubleToString(m_modelNormal[j][i][0], 4)
377
+ Utils.doubleToString(m_modelNormal[j][i][1], 4)
383
return text.toString();
387
* Returns the tip text for this property
388
* @return tip text for this property suitable for
389
* displaying in the explorer/experimenter gui
391
public String clustererTipText() {
392
return "the clusterer to wrap";
396
* Sets the clusterer to wrap.
398
* @param toWrap the clusterer
400
public void setClusterer(Clusterer toWrap) {
402
m_wrappedClusterer = toWrap;
406
* Gets the clusterer being wrapped.
408
* @return the clusterer
410
public Clusterer getClusterer() {
412
return m_wrappedClusterer;
416
* Returns the tip text for this property
417
* @return tip text for this property suitable for
418
* displaying in the explorer/experimenter gui
420
public String minStdDevTipText() {
421
return "set minimum allowable standard deviation";
425
* Set the minimum value for standard deviation when calculating
426
* normal density. Reducing this value can help prevent arithmetic
427
* overflow resulting from multiplying large densities (arising from small
428
* standard deviations) when there are many singleton or near singleton
430
* @param m minimum value for standard deviation
432
public void setMinStdDev(double m) {
437
* Get the minimum allowable standard deviation.
438
* @return the minumum allowable standard deviation
440
public double getMinStdDev() {
445
* Returns an enumeration describing the available options..
447
* @return an enumeration of all the available options.
449
public Enumeration listOptions() {
450
Vector result = new Vector();
452
result.addElement(new Option(
453
"\tminimum allowable standard deviation for normal density computation "
454
+"\n\t(default 1e-6)"
457
result.addElement(new Option(
458
"\tClusterer to wrap.\n"
459
+ "\t(default " + defaultClustererString() + ")",
460
"W", 1,"-W <clusterer name>"));
462
if ((m_wrappedClusterer != null) &&
463
(m_wrappedClusterer instanceof OptionHandler)) {
464
result.addElement(new Option(
466
"", 0, "\nOptions specific to clusterer "
467
+ m_wrappedClusterer.getClass().getName() + ":"));
468
Enumeration enu = ((OptionHandler)m_wrappedClusterer).listOptions();
469
while (enu.hasMoreElements()) {
470
result.addElement(enu.nextElement());
474
return result.elements();
478
* Parses a given list of options. <p/>
480
<!-- options-start -->
481
* Valid options are: <p/>
483
* <pre> -M <num>
484
* minimum allowable standard deviation for normal density computation
485
* (default 1e-6)</pre>
487
* <pre> -W <clusterer name>
489
* (default weka.clusterers.SimpleKMeans)</pre>
492
* Options specific to clusterer weka.clusterers.SimpleKMeans:
495
* <pre> -N <num>
496
* number of clusters. (default = 2).</pre>
498
* <pre> -S <num>
499
* random number seed.
504
* @param options the list of options as an array of strings
505
* @throws Exception if an option is not supported
507
public void setOptions(String[] options) throws Exception {
509
String optionString = Utils.getOption('M', options);
510
if (optionString.length() != 0)
511
setMinStdDev((new Double(optionString)).doubleValue());
515
String wString = Utils.getOption('W', options);
516
if (wString.length() == 0)
517
wString = defaultClustererString();
518
setClusterer(Clusterer.forName(wString, Utils.partitionOptions(options)));
522
* Gets the current settings of the clusterer.
524
* @return an array of strings suitable for passing to setOptions()
526
public String[] getOptions() {
528
String [] clustererOptions = new String [0];
529
if ((m_wrappedClusterer != null) &&
530
(m_wrappedClusterer instanceof OptionHandler)) {
531
clustererOptions = ((OptionHandler)m_wrappedClusterer).getOptions();
533
String [] options = new String [clustererOptions.length + 5];
536
options[current++] = "-M";
537
options[current++] = ""+getMinStdDev();
539
if (getClusterer() != null) {
540
options[current++] = "-W";
541
options[current++] = getClusterer().getClass().getName();
543
options[current++] = "--";
545
System.arraycopy(clustererOptions, 0, options, current,
546
clustererOptions.length);
547
current += clustererOptions.length;
548
while (current < options.length) {
549
options[current++] = "";
555
* Main method for testing this class.
557
* @param argv the options
559
public static void main(String [] argv) {
560
runClusterer(new MakeDensityBasedClusterer(), argv);