69
69
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
70
70
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
71
71
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
72
import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
73
import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint;
72
import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
74
73
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ChainedParameterization;
75
74
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ListParameterization;
76
75
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
171
170
public Clustering<SubspaceModel<V>> run(Database database, Relation<V> relation) {
172
171
// Instantiate DiSH distance (and thus run the preprocessor)
173
if (LOG.isVerbose()) {
172
if(LOG.isVerbose()) {
174
173
LOG.verbose("*** Run DiSH preprocessor.");
176
175
DiSHDistanceFunction.Instance<V> dishDistanceQuery = dishDistance.instantiate(relation);
177
176
// Configure and run OPTICS.
178
if (LOG.isVerbose()) {
177
if(LOG.isVerbose()) {
179
178
LOG.verbose("*** Run OPTICS algorithm.");
181
180
ListParameterization opticsconfig = new ListParameterization(opticsAlgorithmParameters);
206
205
// extract clusters
207
206
Map<BitSet, List<Pair<BitSet, ArrayModifiableDBIDs>>> clustersMap = extractClusters(database, distFunc, clusterOrder);
209
if (LOG.isVerbose()) {
208
if(LOG.isVerbose()) {
210
209
StringBuilder msg = new StringBuilder("Step 1: extract clusters");
211
for (List<Pair<BitSet, ArrayModifiableDBIDs>> clusterList : clustersMap.values()) {
212
for (Pair<BitSet, ArrayModifiableDBIDs> c : clusterList) {
210
for(List<Pair<BitSet, ArrayModifiableDBIDs>> clusterList : clustersMap.values()) {
211
for(Pair<BitSet, ArrayModifiableDBIDs> c : clusterList) {
213
212
msg.append('\n').append(FormatUtil.format(dimensionality, c.first)).append(" ids ").append(c.second.size());
219
218
// check if there are clusters < minpts
220
219
checkClusters(database, distFunc, clustersMap, minpts);
221
if (LOG.isVerbose()) {
220
if(LOG.isVerbose()) {
222
221
StringBuilder msg = new StringBuilder("Step 2: check clusters");
223
for (List<Pair<BitSet, ArrayModifiableDBIDs>> clusterList : clustersMap.values()) {
224
for (Pair<BitSet, ArrayModifiableDBIDs> c : clusterList) {
222
for(List<Pair<BitSet, ArrayModifiableDBIDs>> clusterList : clustersMap.values()) {
223
for(Pair<BitSet, ArrayModifiableDBIDs> c : clusterList) {
225
224
msg.append('\n').append(FormatUtil.format(dimensionality, c.first)).append(" ids ").append(c.second.size());
231
230
// sort the clusters
232
231
List<Cluster<SubspaceModel<V>>> clusters = sortClusters(database, clustersMap);
233
if (LOG.isVerbose()) {
232
if(LOG.isVerbose()) {
234
233
StringBuilder msg = new StringBuilder("Step 3: sort clusters");
235
for (Cluster<SubspaceModel<V>> c : clusters) {
234
for(Cluster<SubspaceModel<V>> c : clusters) {
236
235
msg.append('\n').append(FormatUtil.format(dimensionality, c.getModel().getSubspace().getDimensions())).append(" ids ").append(c.size());
238
237
LOG.verbose(msg.toString());
241
240
// build the hierarchy
242
241
Clustering<SubspaceModel<V>> clustering = new Clustering<>("DiSH clustering", "dish-clustering");
243
242
buildHierarchy(database, distFunc, clustering, clusters, dimensionality);
244
if (LOG.isVerbose()) {
243
if(LOG.isVerbose()) {
245
244
StringBuilder msg = new StringBuilder("Step 4: build hierarchy");
246
for (Cluster<SubspaceModel<V>> c : clusters) {
245
for(Cluster<SubspaceModel<V>> c : clusters) {
247
246
msg.append('\n').append(FormatUtil.format(dimensionality, c.getModel().getDimensions())).append(" ids ").append(c.size());
248
for (Iter<Cluster<SubspaceModel<V>>> iter = clustering.getClusterHierarchy().iterParents(c); iter.valid(); iter.advance()) {
247
for(Iter<Cluster<SubspaceModel<V>>> iter = clustering.getClusterHierarchy().iterParents(c); iter.valid(); iter.advance()) {
249
248
msg.append("\n parent ").append(iter.get());
251
for (Iter<Cluster<SubspaceModel<V>>> iter = clustering.getClusterHierarchy().iterChildren(c); iter.valid(); iter.advance()) {
250
for(Iter<Cluster<SubspaceModel<V>>> iter = clustering.getClusterHierarchy().iterChildren(c); iter.valid(); iter.advance()) {
252
251
msg.append("\n child ").append(iter.get());
278
277
Map<BitSet, List<Pair<BitSet, ArrayModifiableDBIDs>>> clustersMap = new HashMap<>();
279
278
Map<DBID, ClusterOrderEntry<PreferenceVectorBasedCorrelationDistance>> entryMap = new HashMap<>();
280
279
Map<DBID, Pair<BitSet, ArrayModifiableDBIDs>> entryToClusterMap = new HashMap<>();
281
for (Iterator<ClusterOrderEntry<PreferenceVectorBasedCorrelationDistance>> it = clusterOrder.iterator(); it.hasNext();) {
280
for(Iterator<ClusterOrderEntry<PreferenceVectorBasedCorrelationDistance>> it = clusterOrder.iterator(); it.hasNext();) {
282
281
ClusterOrderEntry<PreferenceVectorBasedCorrelationDistance> entry = it.next();
283
282
entryMap.put(entry.getID(), entry);
288
287
// get the list of (parallel) clusters for the preference vector
289
288
List<Pair<BitSet, ArrayModifiableDBIDs>> parallelClusters = clustersMap.get(preferenceVector);
290
if (parallelClusters == null) {
289
if(parallelClusters == null) {
291
290
parallelClusters = new ArrayList<>();
292
291
clustersMap.put(preferenceVector, parallelClusters);
295
294
// look for the proper cluster
296
295
Pair<BitSet, ArrayModifiableDBIDs> cluster = null;
297
for (Pair<BitSet, ArrayModifiableDBIDs> c : parallelClusters) {
296
for(Pair<BitSet, ArrayModifiableDBIDs> c : parallelClusters) {
298
297
V c_centroid = ProjectedCentroid.make(c.first, database, c.second).toVector(database);
299
298
PreferenceVectorBasedCorrelationDistance dist = distFunc.correlationDistance(object, c_centroid, preferenceVector, preferenceVector);
300
if (dist.getCorrelationValue() == entry.getReachability().getCorrelationValue()) {
299
if(dist.getCorrelationValue() == entry.getReachability().getCorrelationValue()) {
301
300
double d = distFunc.weightedDistance(object, c_centroid, dist.getCommonPreferenceVector());
302
if (d <= 2 * epsilon) {
301
if(d <= 2 * epsilon) {
308
if (cluster == null) {
307
if(cluster == null) {
309
308
cluster = new Pair<>(preferenceVector, DBIDUtil.newArray());
310
309
parallelClusters.add(cluster);
312
311
cluster.second.add(entry.getID());
313
312
entryToClusterMap.put(entry.getID(), cluster);
315
if (progress != null) {
314
if(progress != null) {
316
315
progress.setProcessed(++processed, LOG);
319
if (progress != null) {
318
if(progress != null) {
320
319
progress.ensureCompleted(LOG);
323
if (LOG.isDebuggingFiner()) {
322
if(LOG.isDebuggingFiner()) {
324
323
StringBuilder msg = new StringBuilder("Step 0");
325
for (List<Pair<BitSet, ArrayModifiableDBIDs>> clusterList : clustersMap.values()) {
326
for (Pair<BitSet, ArrayModifiableDBIDs> c : clusterList) {
324
for(List<Pair<BitSet, ArrayModifiableDBIDs>> clusterList : clustersMap.values()) {
325
for(Pair<BitSet, ArrayModifiableDBIDs> c : clusterList) {
327
326
msg.append('\n').append(FormatUtil.format(RelationUtil.dimensionality(database), c.first)).append(" ids ").append(c.second.size());
333
332
// add the predecessor to the cluster
334
for (BitSet pv : clustersMap.keySet()) {
333
for(BitSet pv : clustersMap.keySet()) {
335
334
List<Pair<BitSet, ArrayModifiableDBIDs>> parallelClusters = clustersMap.get(pv);
336
for (Pair<BitSet, ArrayModifiableDBIDs> cluster : parallelClusters) {
337
if (cluster.second.isEmpty()) {
335
for(Pair<BitSet, ArrayModifiableDBIDs> cluster : parallelClusters) {
336
if(cluster.second.isEmpty()) {
340
339
DBID firstID = cluster.second.get(0);
341
340
ClusterOrderEntry<PreferenceVectorBasedCorrelationDistance> entry = entryMap.get(firstID);
342
341
DBID predecessorID = entry.getPredecessorID();
343
if (predecessorID == null) {
342
if(predecessorID == null) {
346
345
ClusterOrderEntry<PreferenceVectorBasedCorrelationDistance> predecessor = entryMap.get(predecessorID);
347
346
// parallel cluster
348
if (predecessor.getReachability().getCommonPreferenceVector().equals(entry.getReachability().getCommonPreferenceVector())) {
347
if(predecessor.getReachability().getCommonPreferenceVector().equals(entry.getReachability().getCommonPreferenceVector())) {
351
if (predecessor.getReachability().compareTo(entry.getReachability()) < 0) {
350
if(predecessor.getReachability().compareTo(entry.getReachability()) < 0) {
375
374
final int db_dim = RelationUtil.dimensionality(database);
377
376
List<Cluster<SubspaceModel<V>>> clusters = new ArrayList<>();
378
for (BitSet pv : clustersMap.keySet()) {
377
for(BitSet pv : clustersMap.keySet()) {
379
378
List<Pair<BitSet, ArrayModifiableDBIDs>> parallelClusters = clustersMap.get(pv);
380
for (int i = 0; i < parallelClusters.size(); i++) {
379
for(int i = 0; i < parallelClusters.size(); i++) {
381
380
Pair<BitSet, ArrayModifiableDBIDs> c = parallelClusters.get(i);
382
381
Cluster<SubspaceModel<V>> cluster = new Cluster<>(c.second);
383
382
cluster.setModel(new SubspaceModel<>(new Subspace(c.first), Centroid.make(database, c.second).toVector(database)));
384
383
String subspace = FormatUtil.format(cluster.getModel().getSubspace().getDimensions(), db_dim, "");
385
if (parallelClusters.size() > 1) {
384
if(parallelClusters.size() > 1) {
386
385
cluster.setName("Cluster_" + subspace + "_" + i);
388
388
cluster.setName("Cluster_" + subspace);
390
390
clusters.add(cluster);
417
417
List<Pair<BitSet, ArrayModifiableDBIDs>> notAssigned = new ArrayList<>();
418
418
Map<BitSet, List<Pair<BitSet, ArrayModifiableDBIDs>>> newClustersMap = new HashMap<>();
419
419
Pair<BitSet, ArrayModifiableDBIDs> noise = new Pair<>(new BitSet(), DBIDUtil.newArray());
420
for (BitSet pv : clustersMap.keySet()) {
420
for(BitSet pv : clustersMap.keySet()) {
422
if (pv.cardinality() == 0) {
422
if(pv.cardinality() == 0) {
423
423
List<Pair<BitSet, ArrayModifiableDBIDs>> parallelClusters = clustersMap.get(pv);
424
for (Pair<BitSet, ArrayModifiableDBIDs> c : parallelClusters) {
424
for(Pair<BitSet, ArrayModifiableDBIDs> c : parallelClusters) {
425
425
noise.second.addDBIDs(c.second);
430
430
List<Pair<BitSet, ArrayModifiableDBIDs>> parallelClusters = clustersMap.get(pv);
431
431
List<Pair<BitSet, ArrayModifiableDBIDs>> newParallelClusters = new ArrayList<>(parallelClusters.size());
432
for (Pair<BitSet, ArrayModifiableDBIDs> c : parallelClusters) {
433
if (!pv.equals(new BitSet()) && c.second.size() < minpts) {
432
for(Pair<BitSet, ArrayModifiableDBIDs> c : parallelClusters) {
433
if(!pv.equals(new BitSet()) && c.second.size() < minpts) {
434
434
notAssigned.add(c);
436
437
newParallelClusters.add(c);
443
444
clustersMap.clear();
444
445
clustersMap.putAll(newClustersMap);
446
for (Pair<BitSet, ArrayModifiableDBIDs> c : notAssigned) {
447
if (c.second.isEmpty()) {
447
for(Pair<BitSet, ArrayModifiableDBIDs> c : notAssigned) {
448
if(c.second.isEmpty()) {
450
451
Pair<BitSet, ArrayModifiableDBIDs> parent = findParent(database, distFunc, c, clustersMap);
451
if (parent != null) {
452
453
parent.second.addDBIDs(c.second);
454
456
noise.second.addDBIDs(c.second);
478
480
BitSet childPV = child.first;
479
481
int childCardinality = childPV.cardinality();
480
for (BitSet parentPV : clustersMap.keySet()) {
482
for(BitSet parentPV : clustersMap.keySet()) {
481
483
int parentCardinality = parentPV.cardinality();
482
if (parentCardinality >= childCardinality) {
484
if(parentCardinality >= childCardinality) {
485
if (resultCardinality != -1 && parentCardinality <= resultCardinality) {
487
if(resultCardinality != -1 && parentCardinality <= resultCardinality) {
489
491
BitSet pv = (BitSet) childPV.clone();
490
492
pv.and(parentPV);
491
if (pv.equals(parentPV)) {
493
if(pv.equals(parentPV)) {
492
494
List<Pair<BitSet, ArrayModifiableDBIDs>> parentList = clustersMap.get(parentPV);
493
for (Pair<BitSet, ArrayModifiableDBIDs> parent : parentList) {
495
for(Pair<BitSet, ArrayModifiableDBIDs> parent : parentList) {
494
496
V parent_centroid = ProjectedCentroid.make(parentPV, database, parent.second).toVector(database);
495
497
double d = distFunc.weightedDistance(child_centroid, parent_centroid, parentPV);
496
if (d <= 2 * epsilon) {
498
if(d <= 2 * epsilon) {
498
500
resultCardinality = parentCardinality;
519
521
final int db_dim = RelationUtil.dimensionality(database);
520
522
Hierarchy<Cluster<SubspaceModel<V>>> hier = clustering.getClusterHierarchy();
522
for (int i = 0; i < clusters.size() - 1; i++) {
524
for(int i = 0; i < clusters.size() - 1; i++) {
523
525
Cluster<SubspaceModel<V>> c_i = clusters.get(i);
524
526
int subspaceDim_i = dimensionality - c_i.getModel().getSubspace().dimensionality();
525
527
V ci_centroid = ProjectedCentroid.make(c_i.getModel().getDimensions(), database, c_i.getIDs()).toVector(database);
527
for (int j = i + 1; j < clusters.size(); j++) {
529
for(int j = i + 1; j < clusters.size(); j++) {
528
530
Cluster<SubspaceModel<V>> c_j = clusters.get(j);
529
531
int subspaceDim_j = dimensionality - c_j.getModel().getSubspace().dimensionality();
531
if (subspaceDim_i < subspaceDim_j) {
532
if (LOG.isDebugging()) {
533
if(subspaceDim_i < subspaceDim_j) {
534
if(LOG.isDebugging()) {
533
535
msg.append("\n l_i=").append(subspaceDim_i).append(" pv_i=[").append(FormatUtil.format(db_dim, c_i.getModel().getSubspace().getDimensions())).append(']');
534
536
msg.append("\n l_j=").append(subspaceDim_j).append(" pv_j=[").append(FormatUtil.format(db_dim, c_j.getModel().getSubspace().getDimensions())).append(']');
537
539
// noise level reached
538
if (c_j.getModel().getSubspace().dimensionality() == 0) {
540
if(c_j.getModel().getSubspace().dimensionality() == 0) {
539
541
// no parents exists -> parent is noise
540
if (hier.numParents(c_i) == 0) {
542
if(hier.numParents(c_i) == 0) {
541
543
clustering.addChildCluster(c_j, c_i);
542
if (LOG.isDebugging()) {
544
if(LOG.isDebugging()) {
543
545
msg.append("\n [").append(FormatUtil.format(db_dim, c_j.getModel().getSubspace().getDimensions()));
544
546
msg.append("] is parent of [").append(FormatUtil.format(db_dim, c_i.getModel().getSubspace().getDimensions()));
549
552
V cj_centroid = ProjectedCentroid.make(c_j.getModel().getDimensions(), database, c_j.getIDs()).toVector(database);
550
553
PreferenceVectorBasedCorrelationDistance distance = distFunc.correlationDistance(ci_centroid, cj_centroid, c_i.getModel().getSubspace().getDimensions(), c_j.getModel().getSubspace().getDimensions());
551
554
double d = distFunc.weightedDistance(ci_centroid, cj_centroid, distance.getCommonPreferenceVector());
552
if (LOG.isDebugging()) {
555
if(LOG.isDebugging()) {
553
556
msg.append("\n dist = ").append(distance.getCorrelationValue());
556
if (distance.getCorrelationValue() == subspaceDim_j) {
557
if (LOG.isDebugging()) {
559
if(distance.getCorrelationValue() == subspaceDim_j) {
560
if(LOG.isDebugging()) {
558
561
msg.append("\n d = ").append(d);
560
if (d <= 2 * epsilon) {
563
if(d <= 2 * epsilon) {
561
564
// no parent exists or c_j is not a parent of the already
562
565
// existing parents
563
if (hier.numParents(c_i) == 0 || !isParent(database, distFunc, c_j, hier.iterParents(c_i))) {
566
if(hier.numParents(c_i) == 0 || !isParent(database, distFunc, c_j, hier.iterParents(c_i))) {
564
567
clustering.addChildCluster(c_j, c_i);
565
if (LOG.isDebugging()) {
568
if(LOG.isDebugging()) {
566
569
msg.append("\n [").append(FormatUtil.format(db_dim, c_j.getModel().getSubspace().getDimensions()));
567
570
msg.append("] is parent of [");
568
571
msg.append(FormatUtil.format(db_dim, c_i.getModel().getSubspace().getDimensions()));
573
577
throw new RuntimeException("Should never happen: d = " + d);
599
603
int dimensionality = RelationUtil.dimensionality(database);
600
604
int subspaceDim_parent = dimensionality - parent.getModel().getSubspace().dimensionality();
602
for (; iter.valid(); iter.advance()) {
606
for(; iter.valid(); iter.advance()) {
603
607
Cluster<SubspaceModel<V>> child = iter.get();
604
608
V child_centroid = ProjectedCentroid.make(child.getModel().getDimensions(), database, child.getIDs()).toVector(database);
605
609
PreferenceVectorBasedCorrelationDistance distance = distFunc.correlationDistance(parent_centroid, child_centroid, parent.getModel().getSubspace().getDimensions(), child.getModel().getSubspace().getDimensions());
606
if (distance.getCorrelationValue() == subspaceDim_parent) {
610
if(distance.getCorrelationValue() == subspaceDim_parent) {
642
646
super.makeOptions(config);
644
648
DoubleParameter epsilonP = new DoubleParameter(EPSILON_ID, 0.001);
645
epsilonP.addConstraint(new GreaterEqualConstraint(0));
646
if (config.grab(epsilonP)) {
649
epsilonP.addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_DOUBLE);
650
if(config.grab(epsilonP)) {
647
651
epsilon = epsilonP.doubleValue();
650
654
IntParameter muP = new IntParameter(MU_ID, 1);
651
muP.addConstraint(new GreaterConstraint(0));
652
if (config.grab(muP)) {
655
muP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
656
if(config.grab(muP)) {
653
657
mu = muP.intValue();