2
* This program is free software; you can redistribute it and/or modify
3
* it under the terms of the GNU General Public License as published by
4
* the Free Software Foundation; either version 2 of the License, or
5
* (at your option) any later version.
7
* This program is distributed in the hope that it will be useful,
8
* but WITHOUT ANY WARRANTY; without even the implied warranty of
9
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10
* GNU General Public License for more details.
12
* You should have received a copy of the GNU General Public License
13
* along with this program; if not, write to the Free Software
14
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18
* HierarchicalClusterer.java
19
* Copyright (C) 2009 University of Waikato, Hamilton, New Zealand
22
package weka.clusterers;
24
import java.io.Serializable;
25
import java.text.DecimalFormat;
26
import java.util.Comparator;
27
import java.util.Enumeration;
28
import java.util.PriorityQueue;
29
import java.util.Vector;
31
import weka.core.Capabilities;
32
import weka.core.CapabilitiesHandler;
33
import weka.core.DistanceFunction;
34
import weka.core.Drawable;
35
import weka.core.EuclideanDistance;
36
import weka.core.Instance;
37
import weka.core.Instances;
38
import weka.core.Option;
39
import weka.core.OptionHandler;
40
import weka.core.RevisionUtils;
41
import weka.core.SelectedTag;
43
import weka.core.Utils;
44
import weka.core.Capabilities.Capability;
47
<!-- globalinfo-start -->
48
* Hierarchical clustering class.
49
* Implements a number of classic hierarchical clustering methods.
50
<!-- globalinfo-end -->
52
<!-- options-start -->
53
* Valid options are: <p/>
61
* Link type (Single, Complete, Average, Mean, Centroid, Ward, Adjusted complete, Neighbor Joining)
62
* [SINGLE|COMPLETE|AVERAGE|MEAN|CENTROID|WARD|ADJCOMLPETE|NEIGHBOR_JOINING]
66
* Distance function to use. (default: weka.core.EuclideanDistance)
70
* Print hierarchy in Newick format, which can be used for display in other programs.
74
* If set, classifier is run in debug mode and may output additional info to the console.
78
* If set, distance is interpreted as branch length, otherwise it is node height.
84
* @author Remco Bouckaert (rrb@xm.co.nz, remco@cs.waikato.ac.nz)
85
* @author Eibe Frank (eibe@cs.waikato.ac.nz)
86
* @version $Revision: 6592 $
88
public class HierarchicalClusterer extends AbstractClusterer implements OptionHandler, CapabilitiesHandler, Drawable {
89
private static final long serialVersionUID = 1L;
91
/** Whether the classifier is run in debug mode. */
92
protected boolean m_bDebug = false;
94
/** Whether the distance represent node height (if false) or branch length (if true). */
95
protected boolean m_bDistanceIsBranchLength = false;
98
Instances m_instances;
100
/** number of clusters desired in clustering **/
101
int m_nNumClusters = 2;
102
/**
 * Sets the desired number of clusters.
 * Values smaller than 1 are clamped to 1.
 *
 * @param nClusters the requested number of clusters
 */
public void setNumClusters(int nClusters) {
  if (nClusters < 1) {
    m_nNumClusters = 1;
  } else {
    m_nNumClusters = nClusters;
  }
}
103
/**
 * Returns the number of clusters to generate, as set via
 * {@link #setNumClusters(int)} (minimum 1).
 *
 * @return the desired number of clusters
 */
public int getNumClusters() {return m_nNumClusters;}
105
/** distance function used for comparing members of a cluster **/
106
protected DistanceFunction m_DistanceFunction = new EuclideanDistance();
107
/**
 * Returns the distance function used for comparing members of a cluster.
 *
 * @return the current distance function (default: weka.core.EuclideanDistance)
 */
public DistanceFunction getDistanceFunction() {return m_DistanceFunction;}
108
/**
 * Sets the distance function used for comparing members of a cluster.
 *
 * @param distanceFunction the distance function to use
 */
public void setDistanceFunction(DistanceFunction distanceFunction) {m_DistanceFunction = distanceFunction;}
110
/** used for priority queue for efficient retrieval of pair of clusters to merge**/
112
public Tuple(double d, int i, int j, int nSize1, int nSize2) {
116
m_nClusterSize1 = nSize1;
117
m_nClusterSize2 = nSize2;
125
/** comparator used by priority queue**/
126
class TupleComparator implements Comparator<Tuple> {
127
public int compare(Tuple o1, Tuple o2) {
128
if (o1.m_fDist < o2.m_fDist) {
130
} else if (o1.m_fDist == o2.m_fDist) {
137
/** the various link types */
138
final static int SINGLE = 0;
139
final static int COMPLETE = 1;
140
final static int AVERAGE = 2;
141
final static int MEAN = 3;
142
final static int CENTROID = 4;
143
final static int WARD = 5;
144
final static int ADJCOMLPETE = 6;
145
final static int NEIGHBOR_JOINING = 7;
146
public static final Tag[] TAGS_LINK_TYPE = {
147
new Tag(SINGLE, "SINGLE"),
148
new Tag(COMPLETE, "COMPLETE"),
149
new Tag(AVERAGE, "AVERAGE"),
150
new Tag(MEAN, "MEAN"),
151
new Tag(CENTROID, "CENTROID"),
152
new Tag(WARD, "WARD"),
153
new Tag(ADJCOMLPETE,"ADJCOMLPETE"),
154
new Tag(NEIGHBOR_JOINING,"NEIGHBOR_JOINING")
158
* Holds the Link type used to calculate the distance between clusters
160
int m_nLinkType = SINGLE;
162
/** Whether to print the hierarchy in Newick format (see {@link #setPrintNewick(boolean)}). */
boolean m_bPrintNewick = true;
163
/**
 * Gets whether the hierarchy is printed in Newick format.
 *
 * @return true if Newick output is enabled
 */
public boolean getPrintNewick() {return m_bPrintNewick;}
164
/**
 * Sets whether the hierarchy should be printed in Newick format, which
 * can be used for display in other programs.
 *
 * @param bPrintNewick true to enable Newick output
 */
public void setPrintNewick(boolean bPrintNewick) {m_bPrintNewick = bPrintNewick;}
166
public void setLinkType(SelectedTag newLinkType) {
167
if (newLinkType.getTags() == TAGS_LINK_TYPE) {
168
m_nLinkType = newLinkType.getSelectedTag().getID();
172
public SelectedTag getLinkType() {
173
return new SelectedTag(m_nLinkType, TAGS_LINK_TYPE);
176
/** class representing node in cluster hierarchy **/
177
class Node implements Serializable {
182
int m_iRightInstance;
183
double m_fLeftLength = 0;
184
double m_fRightLength = 0;
185
double m_fHeight = 0;
186
public String toString(int attIndex) {
187
DecimalFormat myFormatter = new DecimalFormat("#.#####");
189
if (m_left == null) {
190
if (m_right == null) {
191
return "(" + m_instances.instance(m_iLeftInstance).stringValue(attIndex) + ":" + myFormatter.format(m_fLeftLength) + "," +
192
m_instances.instance(m_iRightInstance).stringValue(attIndex) +":" + myFormatter.format(m_fRightLength) + ")";
194
return "(" + m_instances.instance(m_iLeftInstance).stringValue(attIndex) + ":" + myFormatter.format(m_fLeftLength) + "," +
195
m_right.toString(attIndex) + ":" + myFormatter.format(m_fRightLength) + ")";
198
if (m_right == null) {
199
return "(" + m_left.toString(attIndex) + ":" + myFormatter.format(m_fLeftLength) + "," +
200
m_instances.instance(m_iRightInstance).stringValue(attIndex) + ":" + myFormatter.format(m_fRightLength) + ")";
202
return "(" + m_left.toString(attIndex) + ":" + myFormatter.format(m_fLeftLength) + "," +m_right.toString(attIndex) + ":" + myFormatter.format(m_fRightLength) + ")";
206
public String toString2(int attIndex) {
207
DecimalFormat myFormatter = new DecimalFormat("#.#####");
209
if (m_left == null) {
210
if (m_right == null) {
211
return "(" + m_instances.instance(m_iLeftInstance).value(attIndex) + ":" + myFormatter.format(m_fLeftLength) + "," +
212
m_instances.instance(m_iRightInstance).value(attIndex) +":" + myFormatter.format(m_fRightLength) + ")";
214
return "(" + m_instances.instance(m_iLeftInstance).value(attIndex) + ":" + myFormatter.format(m_fLeftLength) + "," +
215
m_right.toString2(attIndex) + ":" + myFormatter.format(m_fRightLength) + ")";
218
if (m_right == null) {
219
return "(" + m_left.toString2(attIndex) + ":" + myFormatter.format(m_fLeftLength) + "," +
220
m_instances.instance(m_iRightInstance).value(attIndex) + ":" + myFormatter.format(m_fRightLength) + ")";
222
return "(" + m_left.toString2(attIndex) + ":" + myFormatter.format(m_fLeftLength) + "," +m_right.toString2(attIndex) + ":" + myFormatter.format(m_fRightLength) + ")";
226
void setHeight(double fHeight1, double fHeight2) {
227
m_fHeight = fHeight1;
228
if (m_left == null) {
229
m_fLeftLength = fHeight1;
231
m_fLeftLength = fHeight1 - m_left.m_fHeight;
233
if (m_right == null) {
234
m_fRightLength = fHeight2;
236
m_fRightLength = fHeight2 - m_right.m_fHeight;
239
void setLength(double fLength1, double fLength2) {
240
m_fLeftLength = fLength1;
241
m_fRightLength = fLength2;
242
m_fHeight = fLength1;
243
if (m_left != null) {
244
m_fHeight += m_left.m_fHeight;
253
public void buildClusterer(Instances data) throws Exception {
254
// /System.err.println("Method " + m_nLinkType);
256
int nInstances = m_instances.numInstances();
257
if (nInstances == 0) {
260
m_DistanceFunction.setInstances(m_instances);
261
// use array of integer vectors to store cluster indices,
262
// starting with one cluster per instance
263
Vector<Integer> [] nClusterID = new Vector[data.numInstances()];
264
for (int i = 0; i < data.numInstances(); i++) {
265
nClusterID[i] = new Vector<Integer>();
266
nClusterID[i].add(i);
268
// calculate distance matrix
269
int nClusters = data.numInstances();
271
// used for keeping track of hierarchy
272
Node [] clusterNodes = new Node[nInstances];
273
if (m_nLinkType == NEIGHBOR_JOINING) {
274
neighborJoining(nClusters, nClusterID, clusterNodes);
276
doLinkClustering(nClusters, nClusterID, clusterNodes);
279
// move all clusters in m_nClusterID array
280
// & collect hierarchy
282
m_clusters = new Node[m_nNumClusters];
283
m_nClusterNr = new int[nInstances];
284
for (int i = 0; i < nInstances; i++) {
285
if (nClusterID[i].size() > 0) {
286
for (int j = 0; j < nClusterID[i].size(); j++) {
287
m_nClusterNr[nClusterID[i].elementAt(j)] = iCurrent;
289
m_clusters[iCurrent] = clusterNodes[i];
296
/** use neighbor joining algorithm for clustering
297
* This is roughly based on the RapidNJ simple implementation and runs at O(n^3)
298
* More efficient implementations exist, see RapidNJ (or my GPU implementation :-))
301
* @param clusterNodes
303
void neighborJoining(int nClusters, Vector<Integer>[] nClusterID, Node [] clusterNodes) {
304
int n = m_instances.numInstances();
306
double [][] fDist = new double[nClusters][nClusters];
307
for (int i = 0; i < nClusters; i++) {
309
for (int j = i+1; j < nClusters; j++) {
310
fDist[i][j] = getDistance0(nClusterID[i], nClusterID[j]);
311
fDist[j][i] = fDist[i][j];
315
double [] fSeparationSums = new double [n];
316
double [] fSeparations = new double [n];
317
int [] nNextActive = new int[n];
319
//calculate initial separation rows
320
for(int i = 0; i < n; i++){
322
for(int j = 0; j < n; j++){
325
fSeparationSums[i] = fSum;
326
fSeparations[i] = fSum / (nClusters - 2);
327
nNextActive[i] = i +1;
330
while (nClusters > 2) {
334
double fMin = Double.MAX_VALUE;
336
for (int i = 0; i < n; i++) {
337
if(nClusterID[i].size() > 0){
338
double [] fRow = fDist[i];
339
double fSep1 = fSeparations[i];
340
for(int j = 0; j < n; j++){
341
if(nClusterID[j].size() > 0 && i != j){
342
double fSep2 = fSeparations[j];
343
double fVal = fRow[j] - fSep1 - fSep2;
358
double fSep1 = fSeparations[i];
359
double [] fRow = fDist[i];
360
int j = nNextActive[i];
362
double fSep2 = fSeparations[j];
363
double fVal = fRow[j] - fSep1 - fSep2;
376
double fMinDistance = fDist[iMin1][iMin2];
378
double fSep1 = fSeparations[iMin1];
379
double fSep2 = fSeparations[iMin2];
380
double fDist1 = (0.5 * fMinDistance) + (0.5 * (fSep1 - fSep2));
381
double fDist2 = (0.5 * fMinDistance) + (0.5 * (fSep2 - fSep1));
383
// update separations & distance
384
double fNewSeparationSum = 0;
385
double fMutualDistance = fDist[iMin1][iMin2];
386
double[] fRow1 = fDist[iMin1];
387
double[] fRow2 = fDist[iMin2];
388
for(int i = 0; i < n; i++) {
389
if(i == iMin1 || i == iMin2 || nClusterID[i].size() == 0) {
392
double fVal1 = fRow1[i];
393
double fVal2 = fRow2[i];
394
double fDistance = (fVal1 + fVal2 - fMutualDistance) / 2.0;
395
fNewSeparationSum += fDistance;
396
// update the separationsum of cluster i.
397
fSeparationSums[i] += (fDistance - fVal1 - fVal2);
398
fSeparations[i] = fSeparationSums[i] / (nClusters -2);
399
fRow1[i] = fDistance;
400
fDist[i][iMin1] = fDistance;
403
fSeparationSums[iMin1] = fNewSeparationSum;
404
fSeparations[iMin1] = fNewSeparationSum / (nClusters - 2);
405
fSeparationSums[iMin2] = 0;
406
merge(iMin1, iMin2, fDist1, fDist2, nClusterID, clusterNodes);
408
// since iMin1 < iMin2 we have nActiveRows[0] >= 0, so the next loop should be safe
409
while (nClusterID[iPrev].size() == 0) {
412
nNextActive[iPrev] = nNextActive[iMin2];
414
merge(iMin1, iMin2, fDist1, fDist2, nClusterID, clusterNodes);
419
for (int i = 0; i < n; i++) {
420
if (nClusterID[i].size() > 0) {
421
for (int j = i+1; j < n; j++) {
422
if (nClusterID[j].size() > 0) {
423
double fDist1 = fDist[i][j];
424
if(nClusterID[i].size() == 1) {
425
merge(i,j,fDist1,0,nClusterID, clusterNodes);
426
} else if (nClusterID[j].size() == 1) {
427
merge(i,j,0,fDist1,nClusterID, clusterNodes);
429
merge(i,j,fDist1/2.0,fDist1/2.0,nClusterID, clusterNodes);
438
/** Perform clustering using a link method
439
* This implementation uses a priority queue resulting in a O(n^2 log(n)) algorithm
440
* @param nClusters number of clusters
442
* @param clusterNodes
444
void doLinkClustering(int nClusters, Vector<Integer>[] nClusterID, Node [] clusterNodes) {
445
int nInstances = m_instances.numInstances();
446
PriorityQueue<Tuple> queue = new PriorityQueue<Tuple>(nClusters*nClusters/2, new TupleComparator());
447
double [][] fDistance0 = new double[nClusters][nClusters];
448
double [][] fClusterDistance = null;
450
fClusterDistance = new double[nClusters][nClusters];
452
for (int i = 0; i < nClusters; i++) {
453
fDistance0[i][i] = 0;
454
for (int j = i+1; j < nClusters; j++) {
455
fDistance0[i][j] = getDistance0(nClusterID[i], nClusterID[j]);
456
fDistance0[j][i] = fDistance0[i][j];
457
queue.add(new Tuple(fDistance0[i][j], i, j, 1, 1));
459
fClusterDistance[i][j] = fDistance0[i][j];
460
fClusterDistance[j][i] = fDistance0[i][j];
464
while (nClusters > m_nNumClusters) {
467
// find closest two clusters
469
/* simple but inefficient implementation */
470
double fMinDistance = Double.MAX_VALUE;
471
for (int i = 0; i < nInstances; i++) {
472
if (nClusterID[i].size()>0) {
473
for (int j = i+1; j < nInstances; j++) {
474
if (nClusterID[j].size()>0) {
475
double fDist = fClusterDistance[i][j];
476
if (fDist < fMinDistance) {
477
fMinDistance = fDist;
485
merge(iMin1, iMin2, fMinDistance, fMinDistance, nClusterID, clusterNodes);
487
// use priority queue to find next best pair to cluster
491
} while (t!=null && (nClusterID[t.m_iCluster1].size() != t.m_nClusterSize1 || nClusterID[t.m_iCluster2].size() != t.m_nClusterSize2));
492
iMin1 = t.m_iCluster1;
493
iMin2 = t.m_iCluster2;
494
merge(iMin1, iMin2, t.m_fDist, t.m_fDist, nClusterID, clusterNodes);
498
// update distances & queue
499
for (int i = 0; i < nInstances; i++) {
500
if (i != iMin1 && nClusterID[i].size()!=0) {
501
int i1 = Math.min(iMin1,i);
502
int i2 = Math.max(iMin1,i);
503
double fDistance = getDistance(fDistance0, nClusterID[i1], nClusterID[i2]);
505
fClusterDistance[i1][i2] = fDistance;
506
fClusterDistance[i2][i1] = fDistance;
508
queue.add(new Tuple(fDistance, i1, i2, nClusterID[i1].size(), nClusterID[i2].size()));
514
} // doLinkClustering
516
void merge(int iMin1, int iMin2, double fDist1, double fDist2, Vector<Integer>[] nClusterID, Node [] clusterNodes) {
518
System.err.println("Merging " + iMin1 + " " + iMin2 + " " + fDist1 + " " + fDist2);
521
int h = iMin1; iMin1 = iMin2; iMin2 = h;
522
double f = fDist1; fDist1 = fDist2; fDist2 = f;
524
nClusterID[iMin1].addAll(nClusterID[iMin2]);
525
nClusterID[iMin2].removeAllElements();
528
Node node = new Node();
529
if (clusterNodes[iMin1] == null) {
530
node.m_iLeftInstance = iMin1;
532
node.m_left = clusterNodes[iMin1];
533
clusterNodes[iMin1].m_parent = node;
535
if (clusterNodes[iMin2] == null) {
536
node.m_iRightInstance = iMin2;
538
node.m_right = clusterNodes[iMin2];
539
clusterNodes[iMin2].m_parent = node;
541
if (m_bDistanceIsBranchLength) {
542
node.setLength(fDist1, fDist2);
544
node.setHeight(fDist1, fDist2);
546
clusterNodes[iMin1] = node;
549
/** calculate distance the first time when setting up the distance matrix **/
550
double getDistance0(Vector<Integer> cluster1, Vector<Integer> cluster2) {
551
double fBestDist = Double.MAX_VALUE;
552
switch (m_nLinkType) {
554
case NEIGHBOR_JOINING:
560
// set up two instances for distance function
561
Instance instance1 = (Instance) m_instances.instance(cluster1.elementAt(0)).copy();
562
Instance instance2 = (Instance) m_instances.instance(cluster2.elementAt(0)).copy();
563
fBestDist = m_DistanceFunction.distance(instance1, instance2);
567
// finds the distance as the change (in error sum of squares) caused by merging the clusters.
568
// The information of a cluster is calculated as the error sum of squares of the
569
// centroids of the cluster and its members.
570
double ESS1 = calcESS(cluster1);
571
double ESS2 = calcESS(cluster2);
572
Vector<Integer> merged = new Vector<Integer>();
573
merged.addAll(cluster1);
574
merged.addAll(cluster2);
575
double ESS = calcESS(merged);
576
fBestDist = ESS * merged.size() - ESS1 * cluster1.size() - ESS2 * cluster2.size();
583
/** calculate the distance between two clusters
584
* @param cluster1 list of indices of instances in the first cluster
585
* @param cluster2 dito for second cluster
586
* @return distance between clusters based on link type
588
double getDistance(double [][] fDistance, Vector<Integer> cluster1, Vector<Integer> cluster2) {
589
double fBestDist = Double.MAX_VALUE;
590
switch (m_nLinkType) {
592
// find single link distance aka minimum link, which is the closest distance between
593
// any item in cluster1 and any item in cluster2
594
fBestDist = Double.MAX_VALUE;
595
for (int i = 0; i < cluster1.size(); i++) {
596
int i1 = cluster1.elementAt(i);
597
for (int j = 0; j < cluster2.size(); j++) {
598
int i2 = cluster2.elementAt(j);
599
double fDist = fDistance[i1][i2];
600
if (fBestDist > fDist) {
608
// find complete link distance aka maximum link, which is the largest distance between
609
// any item in cluster1 and any item in cluster2
611
for (int i = 0; i < cluster1.size(); i++) {
612
int i1 = cluster1.elementAt(i);
613
for (int j = 0; j < cluster2.size(); j++) {
614
int i2 = cluster2.elementAt(j);
615
double fDist = fDistance[i1][i2];
616
if (fBestDist < fDist) {
621
if (m_nLinkType == COMPLETE) {
624
// calculate adjustment, which is the largest within cluster distance
626
for (int i = 0; i < cluster1.size(); i++) {
627
int i1 = cluster1.elementAt(i);
628
for (int j = i+1; j < cluster1.size(); j++) {
629
int i2 = cluster1.elementAt(j);
630
double fDist = fDistance[i1][i2];
631
if (fMaxDist < fDist) {
636
for (int i = 0; i < cluster2.size(); i++) {
637
int i1 = cluster2.elementAt(i);
638
for (int j = i+1; j < cluster2.size(); j++) {
639
int i2 = cluster2.elementAt(j);
640
double fDist = fDistance[i1][i2];
641
if (fMaxDist < fDist) {
646
fBestDist -= fMaxDist;
649
// finds average distance between the elements of the two clusters
651
for (int i = 0; i < cluster1.size(); i++) {
652
int i1 = cluster1.elementAt(i);
653
for (int j = 0; j < cluster2.size(); j++) {
654
int i2 = cluster2.elementAt(j);
655
fBestDist += fDistance[i1][i2];
658
fBestDist /= (cluster1.size() * cluster2.size());
662
// calculates the mean distance of a merged cluster (aka Group-average agglomerative clustering)
663
Vector<Integer> merged = new Vector<Integer>();
664
merged.addAll(cluster1);
665
merged.addAll(cluster2);
667
for (int i = 0; i < merged.size(); i++) {
668
int i1 = merged.elementAt(i);
669
for (int j = i+1; j < merged.size(); j++) {
670
int i2 = merged.elementAt(j);
671
fBestDist += fDistance[i1][i2];
674
int n = merged.size();
675
fBestDist /= (n*(n-1.0)/2.0);
679
// finds the distance of the centroids of the clusters
680
double [] fValues1 = new double[m_instances.numAttributes()];
681
for (int i = 0; i < cluster1.size(); i++) {
682
Instance instance = m_instances.instance(cluster1.elementAt(i));
683
for (int j = 0; j < m_instances.numAttributes(); j++) {
684
fValues1[j] += instance.value(j);
687
double [] fValues2 = new double[m_instances.numAttributes()];
688
for (int i = 0; i < cluster2.size(); i++) {
689
Instance instance = m_instances.instance(cluster2.elementAt(i));
690
for (int j = 0; j < m_instances.numAttributes(); j++) {
691
fValues2[j] += instance.value(j);
694
for (int j = 0; j < m_instances.numAttributes(); j++) {
695
fValues1[j] /= cluster1.size();
696
fValues2[j] /= cluster2.size();
698
// set up two instances for distance function
699
Instance instance1 = (Instance) m_instances.instance(0).copy();
700
Instance instance2 = (Instance) m_instances.instance(0).copy();
701
for (int j = 0; j < m_instances.numAttributes(); j++) {
702
instance1.setValue(j, fValues1[j]);
703
instance2.setValue(j, fValues2[j]);
705
fBestDist = m_DistanceFunction.distance(instance1, instance2);
709
// finds the distance as the change (in error sum of squares) caused by merging the clusters.
710
// The information of a cluster is calculated as the error sum of squares of the
711
// centroids of the cluster and its members.
712
double ESS1 = calcESS(cluster1);
713
double ESS2 = calcESS(cluster2);
714
Vector<Integer> merged = new Vector<Integer>();
715
merged.addAll(cluster1);
716
merged.addAll(cluster2);
717
double ESS = calcESS(merged);
718
fBestDist = ESS * merged.size() - ESS1 * cluster1.size() - ESS2 * cluster2.size();
725
/** calculated error sum-of-squares for instances wrt centroid **/
726
double calcESS(Vector<Integer> cluster) {
727
double [] fValues1 = new double[m_instances.numAttributes()];
728
for (int i = 0; i < cluster.size(); i++) {
729
Instance instance = m_instances.instance(cluster.elementAt(i));
730
for (int j = 0; j < m_instances.numAttributes(); j++) {
731
fValues1[j] += instance.value(j);
734
for (int j = 0; j < m_instances.numAttributes(); j++) {
735
fValues1[j] /= cluster.size();
737
// set up two instances for distance function
738
Instance centroid = (Instance) m_instances.instance(cluster.elementAt(0)).copy();
739
for (int j = 0; j < m_instances.numAttributes(); j++) {
740
centroid.setValue(j, fValues1[j]);
743
for (int i = 0; i < cluster.size(); i++) {
744
Instance instance = m_instances.instance(cluster.elementAt(i));
745
fESS += m_DistanceFunction.distance(centroid, instance);
747
return fESS / cluster.size();
751
/** instances are assigned a cluster by finding the instance in the training data
752
* with the closest distance to the instance to be clustered. The cluster index of
753
* the training data point is taken as the cluster index.
755
public int clusterInstance(Instance instance) throws Exception {
756
if (m_instances.numInstances() == 0) {
759
double fBestDist = Double.MAX_VALUE;
760
int iBestInstance = -1;
761
for (int i = 0; i < m_instances.numInstances(); i++) {
762
double fDist = m_DistanceFunction.distance(instance, m_instances.instance(i));
763
if (fDist < fBestDist) {
768
return m_nClusterNr[iBestInstance];
772
/** create distribution with all clusters having zero probability, except the
773
* cluster the instance is assigned to.
775
public double[] distributionForInstance(Instance instance) throws Exception {
776
if (numberOfClusters() == 0) {
777
double [] p = new double[1];
781
double [] p = new double[numberOfClusters()];
782
p[clusterInstance(instance)] = 1.0;
787
public Capabilities getCapabilities() {
788
Capabilities result = new Capabilities(this);
790
result.enable(Capability.NO_CLASS);
793
result.enable(Capability.NOMINAL_ATTRIBUTES);
794
result.enable(Capability.NUMERIC_ATTRIBUTES);
795
result.enable(Capability.DATE_ATTRIBUTES);
796
result.enable(Capability.MISSING_VALUES);
797
result.enable(Capability.STRING_ATTRIBUTES);
800
result.setMinimumNumberInstances(0);
805
public int numberOfClusters() throws Exception {
806
return Math.min(m_nNumClusters, m_instances.numInstances());
810
* Returns an enumeration describing the available options.
812
* @return an enumeration of all the available options.
814
public Enumeration listOptions() {
816
Vector newVector = new Vector(8);
817
newVector.addElement(new Option(
818
"\tIf set, classifier is run in debug mode and\n"
819
+ "\tmay output additional info to the console",
821
newVector.addElement(new Option(
822
"\tIf set, distance is interpreted as branch length\n"
823
+ "\totherwise it is node height.",
826
newVector.addElement(new Option(
827
"\tnumber of clusters",
828
"N", 1,"-N <Nr Of Clusters>"));
829
newVector.addElement(new Option(
830
"\tFlag to indicate the cluster should be printed in Newick format.",
832
newVector.addElement(
834
"Link type (Single, Complete, Average, Mean, Centroid, Ward, Adjusted complete, Neighbor joining)", "L", 1,
835
"-L [SINGLE|COMPLETE|AVERAGE|MEAN|CENTROID|WARD|ADJCOMLPETE|NEIGHBOR_JOINING]"));
836
newVector.add(new Option(
837
"\tDistance function to use.\n"
838
+ "\t(default: weka.core.EuclideanDistance)",
839
"A", 1,"-A <classname and options>"));
840
return newVector.elements();
844
* Parses a given list of options. <p/>
846
<!-- options-start -->
847
* Valid options are: <p/>
851
* @param options the list of options as an array of strings
852
* @throws Exception if an option is not supported
854
public void setOptions(String[] options) throws Exception {
855
m_bPrintNewick = Utils.getFlag('P', options);
857
String optionString = Utils.getOption('N', options);
858
if (optionString.length() != 0) {
859
Integer temp = new Integer(optionString);
860
setNumClusters(temp);
866
setDebug(Utils.getFlag('D', options));
867
setDistanceIsBranchLength(Utils.getFlag('B', options));
869
String sLinkType = Utils.getOption('L', options);
872
if (sLinkType.compareTo("SINGLE") == 0) {setLinkType(new SelectedTag(SINGLE, TAGS_LINK_TYPE));}
873
if (sLinkType.compareTo("COMPLETE") == 0) {setLinkType(new SelectedTag(COMPLETE, TAGS_LINK_TYPE));}
874
if (sLinkType.compareTo("AVERAGE") == 0) {setLinkType(new SelectedTag(AVERAGE, TAGS_LINK_TYPE));}
875
if (sLinkType.compareTo("MEAN") == 0) {setLinkType(new SelectedTag(MEAN, TAGS_LINK_TYPE));}
876
if (sLinkType.compareTo("CENTROID") == 0) {setLinkType(new SelectedTag(CENTROID, TAGS_LINK_TYPE));}
877
if (sLinkType.compareTo("WARD") == 0) {setLinkType(new SelectedTag(WARD, TAGS_LINK_TYPE));}
878
if (sLinkType.compareTo("ADJCOMLPETE") == 0) {setLinkType(new SelectedTag(ADJCOMLPETE, TAGS_LINK_TYPE));}
879
if (sLinkType.compareTo("NEIGHBOR_JOINING") == 0) {setLinkType(new SelectedTag(NEIGHBOR_JOINING, TAGS_LINK_TYPE));}
881
String nnSearchClass = Utils.getOption('A', options);
882
if(nnSearchClass.length() != 0) {
883
String nnSearchClassSpec[] = Utils.splitOptions(nnSearchClass);
884
if(nnSearchClassSpec.length == 0) {
885
throw new Exception("Invalid DistanceFunction specification string.");
887
String className = nnSearchClassSpec[0];
888
nnSearchClassSpec[0] = "";
890
setDistanceFunction( (DistanceFunction)
891
Utils.forName( DistanceFunction.class,
892
className, nnSearchClassSpec) );
895
setDistanceFunction(new EuclideanDistance());
898
Utils.checkForRemainingOptions(options);
902
* Gets the current settings of the clusterer.
904
* @return an array of strings suitable for passing to setOptions()
906
public String [] getOptions() {
908
String [] options = new String [14];
911
options[current++] = "-N";
912
options[current++] = "" + getNumClusters();
914
options[current++] = "-L";
915
switch (m_nLinkType) {
916
case (SINGLE) :options[current++] = "SINGLE";break;
917
case (COMPLETE) :options[current++] = "COMPLETE";break;
918
case (AVERAGE) :options[current++] = "AVERAGE";break;
919
case (MEAN) :options[current++] = "MEAN";break;
920
case (CENTROID) :options[current++] = "CENTROID";break;
921
case (WARD) :options[current++] = "WARD";break;
922
case (ADJCOMLPETE) :options[current++] = "ADJCOMLPETE";break;
923
case (NEIGHBOR_JOINING) :options[current++] = "NEIGHBOR_JOINING";break;
925
if (m_bPrintNewick) {
926
options[current++] = "-P";
929
options[current++] = "-D";
931
if (getDistanceIsBranchLength()) {
932
options[current++] = "-B";
935
options[current++] = "-A";
936
options[current++] = (m_DistanceFunction.getClass().getName() + " " +
937
Utils.joinOptions(m_DistanceFunction.getOptions())).trim();
939
while (current < options.length) {
940
options[current++] = "";
945
public String toString() {
946
StringBuffer buf = new StringBuffer();
947
int attIndex = m_instances.classIndex();
949
// try to find a string attribute, or use the last attribute otherwise
951
while (attIndex < m_instances.numAttributes()-1) {
952
if (m_instances.attribute(attIndex).isString()) {
959
if (m_bPrintNewick && (numberOfClusters() > 0)) {
960
for (int i = 0; i < m_clusters.length; i++) {
961
if (m_clusters[i] != null) {
962
buf.append("Cluster " + i + "\n");
963
if (m_instances.attribute(attIndex).isString()) {
964
buf.append(m_clusters[i].toString(attIndex));
966
buf.append(m_clusters[i].toString2(attIndex));
972
} catch (Exception e) {
975
return buf.toString();
978
* Set debugging mode.
980
* @param debug true if debug output should be printed
982
public void setDebug(boolean debug) {
988
* Get whether debugging is turned on.
990
* @return true if debugging output is on
992
public boolean getDebug() {
997
/**
 * Gets whether the distance between clusters is interpreted as branch
 * length (true) rather than node height (false).
 *
 * @return true if distances represent branch lengths
 */
public boolean getDistanceIsBranchLength() {return m_bDistanceIsBranchLength;}
999
/**
 * Sets whether the distance between clusters should be interpreted as
 * branch length (true) rather than node height (false).
 *
 * @param bDistanceIsBranchLength true to treat distances as branch lengths
 */
public void setDistanceIsBranchLength(boolean bDistanceIsBranchLength) {
  m_bDistanceIsBranchLength = bDistanceIsBranchLength;
}
1001
public String distanceIsBranchLengthTipText() {
1002
return "If set to false, the distance between clusters is interpreted " +
1003
"as the height of the node linking the clusters. This is appropriate for " +
1004
"example for single link clustering. However, for neighbor joining, the " +
1005
"distance is better interpreted as branch length. Set this flag to " +
1006
"get the latter interpretation.";
1009
* Returns the tip text for this property
1010
* @return tip text for this property suitable for
1011
* displaying in the explorer/experimenter gui
1013
public String debugTipText() {
1014
return "If set to true, classifier may output additional info to " +
1018
* @return a string to describe the NumClusters
1020
public String numClustersTipText() {
1021
return "Sets the number of clusters. " +
1022
"If a single hierarchy is desired, set this to 1.";
1026
* @return a string to describe the print Newick flag
1028
public String printNewickTipText() {
1029
return "Flag to indicate whether the cluster should be print in Newick format." +
1030
" This can be useful for display in other programs. However, for large datasets" +
1031
" a lot of text may be produced, which may not be a nuisance when the Newick format" +
1036
* @return a string to describe the distance function
1038
public String distanceFunctionTipText() {
1039
return "Sets the distance function, which measures the distance between two individual. " +
1040
"instances (or possibly the distance between an instance and the centroid of a cluster" +
1041
"depending on the Link type).";
1045
* @return a string to describe the Link type
1047
public String linkTypeTipText() {
1048
return "Sets the method used to measure the distance between two clusters.\n" +
1050
" find single link distance aka minimum link, which is the closest distance between" +
1051
" any item in cluster1 and any item in cluster2\n" +
1053
" find complete link distance aka maximum link, which is the largest distance between" +
1054
" any item in cluster1 and any item in cluster2\n" +
1056
" as COMPLETE, but with adjustment, which is the largest within cluster distance\n" +
1058
" finds average distance between the elements of the two clusters\n" +
1060
" calculates the mean distance of a merged cluster (akak Group-average agglomerative clustering)\n" +
1062
" finds the distance of the centroids of the clusters\n" +
1064
" finds the distance of the change in caused by merging the cluster." +
1065
" The information of a cluster is calculated as the error sum of squares of the" +
1066
" centroids of the cluster and its members.\n" +
1067
"NEIGHBOR_JOINING\n" +
1068
" use neighbor joining algorithm."
1073
* This will return a string describing the clusterer.
1074
* @return The string.
1076
public String globalInfo() {
1078
"Hierarchical clustering class.\n" +
1079
"Implements a number of classic agglomorative (i.e. bottom up) hierarchical clustering methods" +
1083
public static void main(String [] argv) {
1084
runClusterer(new HierarchicalClusterer(), argv);
1087
public String graph() throws Exception {
1088
if (numberOfClusters() == 0) {
1089
return "Newick:(no,clusters)";
1091
int attIndex = m_instances.classIndex();
1093
// try to find a string attribute, or use the last attribute otherwise
1095
while (attIndex < m_instances.numAttributes()-1) {
1096
if (m_instances.attribute(attIndex).isString()) {
1102
String sNewick = null;
1103
if (m_instances.attribute(attIndex).isString()) {
1104
sNewick = m_clusters[0].toString(attIndex);
1106
sNewick = m_clusters[0].toString2(attIndex);
1108
return "Newick:" + sNewick;
1111
public int graphType() {
1112
return Drawable.Newick;
1116
* Returns the revision string.
1118
* @return the revision
1120
public String getRevision() {
1121
return RevisionUtils.extract("$Revision: 6592 $");
1123
} // class HierarchicalClusterer