1
package de.lmu.ifi.dbs.elki.algorithm.outlier;
4
This file is part of ELKI:
5
Environment for Developing KDD-Applications Supported by Index-Structures
8
Ludwig-Maximilians-Universität München
9
Lehr- und Forschungseinheit für Datenbanksysteme
12
This program is free software: you can redistribute it and/or modify
13
it under the terms of the GNU Affero General Public License as published by
14
the Free Software Foundation, either version 3 of the License, or
15
(at your option) any later version.
17
This program is distributed in the hope that it will be useful,
18
but WITHOUT ANY WARRANTY; without even the implied warranty of
19
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20
GNU Affero General Public License for more details.
22
You should have received a copy of the GNU Affero General Public License
23
along with this program. If not, see <http://www.gnu.org/licenses/>.
26
import de.lmu.ifi.dbs.elki.data.NumberVector;
27
import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
28
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
29
import de.lmu.ifi.dbs.elki.database.Database;
30
import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
31
import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
32
import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore;
33
import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
34
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
35
import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
36
import de.lmu.ifi.dbs.elki.database.ids.DoubleDBIDPair;
37
import de.lmu.ifi.dbs.elki.database.query.similarity.SimilarityQuery;
38
import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation;
39
import de.lmu.ifi.dbs.elki.database.relation.Relation;
40
import de.lmu.ifi.dbs.elki.distance.distancevalue.DoubleDistance;
41
import de.lmu.ifi.dbs.elki.distance.similarityfunction.SimilarityFunction;
42
import de.lmu.ifi.dbs.elki.distance.similarityfunction.kernel.KernelMatrix;
43
import de.lmu.ifi.dbs.elki.logging.Logging;
44
import de.lmu.ifi.dbs.elki.logging.Logging.Level;
45
import de.lmu.ifi.dbs.elki.logging.LoggingConfiguration;
46
import de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic;
47
import de.lmu.ifi.dbs.elki.math.DoubleMinMax;
48
import de.lmu.ifi.dbs.elki.math.MeanVariance;
49
import de.lmu.ifi.dbs.elki.result.outlier.InvertedOutlierScoreMeta;
50
import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult;
51
import de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta;
52
import de.lmu.ifi.dbs.elki.utilities.datastructures.heap.ComparableMaxHeap;
53
import de.lmu.ifi.dbs.elki.utilities.datastructures.heap.ComparableMinHeap;
54
import de.lmu.ifi.dbs.elki.utilities.datastructures.heap.DoubleMinHeap;
55
import de.lmu.ifi.dbs.elki.utilities.datastructures.heap.ObjectHeap;
56
import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
57
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
58
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
59
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
60
import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
61
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
62
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
65
* Angle-Based Outlier Detection / Angle-Based Outlier Factor.
67
* LB-ABOD (lower-bound) version. Exact on the top k outliers, approximate on
70
* Outlier detection using variance analysis on angles, especially for high
71
* dimensional data sets.
73
* H.-P. Kriegel, M. Schubert, and A. Zimek: Angle-Based Outlier Detection in
74
* High-dimensional Data. In: Proc. 14th ACM SIGKDD Int. Conf. on Knowledge
75
* Discovery and Data Mining (KDD '08), Las Vegas, NV, 2008.
77
* @author Matthias Schubert (Original Code)
78
* @author Erich Schubert (ELKIfication)
80
* @param <V> Vector type
82
@Title("LB-ABOD: Lower Bounded Angle-Based Outlier Detection")
83
@Description("Outlier detection using variance analysis on angles, especially for high dimensional data sets.")
84
@Reference(authors = "H.-P. Kriegel, M. Schubert, and A. Zimek", title = "Angle-Based Outlier Detection in High-dimensional Data", booktitle = "Proc. 14th ACM SIGKDD Int. Conf. on Knowledge Discovery and Data Mining (KDD '08), Las Vegas, NV, 2008", url = "http://dx.doi.org/10.1145/1401890.1401946")
85
public class LBABOD<V extends NumberVector<?>> extends FastABOD<V> {
87
* The logger for this class.
89
private static final Logging LOG = Logging.getLogger(LBABOD.class);
92
* Number of outliers to refine.
97
* Actual constructor, with parameters. Fast mode (sampling).
99
* @param kernelFunction Kernel function to use
100
* @param k k parameter
101
* @param l Number of outliers to find exact
103
public LBABOD(SimilarityFunction<? super V, DoubleDistance> kernelFunction, int k, int l) {
104
super(kernelFunction, k);
109
* Run LB-ABOD on the data set.
111
* @param relation Relation to process
112
* @return Outlier detection result
115
public OutlierResult run(Database db, Relation<V> relation) {
116
DBIDs ids = relation.getDBIDs();
117
SimilarityQuery<V, DoubleDistance> sq = relation.getDatabase().getSimilarityQuery(relation, kernelFunction);
118
KernelMatrix kernelMatrix = new KernelMatrix(sq, relation, ids);
121
WritableDoubleDataStore abodvalues = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_STATIC);
122
DoubleMinMax minmaxabod = new DoubleMinMax();
125
// Storage for squared distances (will be reused!)
126
WritableDoubleDataStore sqDists = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT);
127
// Nearest neighbor heap (will be reused!)
128
ComparableMaxHeap<DoubleDBIDPair> nn = new ComparableMaxHeap<>(k);
130
// Priority queue for candidates
131
ComparableMinHeap<DoubleDBIDPair> candidates = new ComparableMinHeap<>(relation.size());
132
// get Candidate Ranking
133
for(DBIDIter pA = relation.iterDBIDs(); pA.valid(); pA.advance()) {
134
// Compute nearest neighbors and distances.
136
double simAA = kernelMatrix.getSimilarity(pA, pA);
137
// Sum of 1./(|AB|) and 1./(|AB|^2); for computing R2.
138
double sumid = 0., sumisqd = 0.;
139
for(DBIDIter nB = relation.iterDBIDs(); nB.valid(); nB.advance()) {
140
if(DBIDUtil.equal(nB, pA)) {
143
double simBB = kernelMatrix.getSimilarity(nB, nB);
144
double simAB = kernelMatrix.getSimilarity(pA, nB);
145
double sqdAB = simAA + simBB - simAB - simAB;
146
sqDists.putDouble(nB, sqdAB);
150
sumid += 1. / Math.sqrt(sqdAB);
151
sumisqd += 1. / sqdAB;
154
nn.add(DBIDUtil.newPair(sqdAB, nB));
156
else if(sqdAB < nn.peek().doubleValue()) {
157
nn.replaceTopElement(DBIDUtil.newPair(sqdAB, nB));
161
// Compute FastABOD approximation, adjust for lower bound.
162
// LB-ABOF is defined via a numerically unstable formula.
163
// Variance as E(X^2)-E(X)^2 suffers from catastrophic cancellation!
164
// TODO: ensure numerical precision!
165
double nnsum = 0., nnsumsq = 0., nnsumisqd = 0.;
166
for(ObjectHeap.UnsortedIter<DoubleDBIDPair> iB = nn.unsortedIter(); iB.valid(); iB.advance()) {
167
DoubleDBIDPair nB = iB.get();
168
double sqdAB = nB.doubleValue();
169
double simAB = kernelMatrix.getSimilarity(pA, nB);
173
for(ObjectHeap.UnsortedIter<DoubleDBIDPair> iC = nn.unsortedIter(); iC.valid(); iC.advance()) {
174
DoubleDBIDPair nC = iC.get();
175
if(DBIDUtil.compare(nC, nB) < 0) {
178
double sqdAC = nC.doubleValue();
179
double simAC = kernelMatrix.getSimilarity(pA, nC);
183
// Exploit bilinearity of scalar product:
184
// <B-A, C-A> = <B, C-A> - <A,C-A>
185
// = <B,C> - <B,A> - <A,C> + <A,A>
186
double simBC = kernelMatrix.getSimilarity(nB, nC);
187
double numerator = simBC - simAB - simAC + simAA;
188
double sqweight = 1. / (sqdAB * sqdAC);
189
double weight = Math.sqrt(sqweight);
190
double val = numerator * sqweight;
191
nnsum += val * weight;
192
nnsumsq += val * val * weight;
193
nnsumisqd += sqweight;
196
// Remaining weight, term R2:
197
double r2 = sumisqd * sumisqd - 2. * nnsumisqd;
198
double tmp = (2. * nnsum + r2) / (sumid * sumid);
199
double lbabof = 2. * nnsumsq / (sumid * sumid) - tmp * tmp;
205
abodvalues.putDouble(pA, lbabof);
206
candidates.add(DBIDUtil.newPair(lbabof, pA));
208
minmaxabod.put(max); // Put maximum from approximate values.
212
DoubleMinHeap topscores = new DoubleMinHeap(l);
213
MeanVariance s = new MeanVariance();
214
while(!candidates.isEmpty()) {
216
if(topscores.size() >= k && candidates.peek().doubleValue() > topscores.peek()) {
219
DoubleDBIDPair pA = candidates.poll();
220
final double abof = computeABOF(relation, kernelMatrix, pA, s);
221
// Store refined score:
222
abodvalues.putDouble(pA, abof);
223
minmaxabod.put(abof);
224
// Update the heap tracking the top scores.
225
if(topscores.size() < k) {
229
if(topscores.peek() > abof) {
230
topscores.replaceTopElement(abof);
235
if(LOG.isStatistics()) {
236
LoggingConfiguration.setVerbose(Level.VERYVERBOSE);
237
LOG.statistics(new LongStatistic("lb-abod.refinements", refinements));
239
// Build result representation.
240
Relation<Double> scoreResult = new MaterializedRelation<>("Angle-based Outlier Detection", "abod-outlier", TypeUtil.DOUBLE, abodvalues, ids);
241
OutlierScoreMeta scoreMeta = new InvertedOutlierScoreMeta(minmaxabod.getMin(), minmaxabod.getMax(), 0.0, Double.POSITIVE_INFINITY);
242
return new OutlierResult(scoreMeta, scoreResult);
246
public TypeInformation[] getInputTypeRestriction() {
247
return TypeUtil.array(TypeUtil.NUMBER_VECTOR_FIELD);
251
protected Logging getLogger() {
256
* Parameterization class.
258
* @author Erich Schubert
262
public static class Parameterizer<V extends NumberVector<?>> extends FastABOD.Parameterizer<V> {
264
* Parameter to specify the number of outliers to compute exactly.
266
public static final OptionID L_ID = new OptionID("abod.l", "Number of top outliers to compute.");
269
* Number of outliers to find.
274
protected void makeOptions(Parameterization config) {
275
super.makeOptions(config);
276
final IntParameter lP = new IntParameter(L_ID);
277
lP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
278
if(config.grab(lP)) {
284
protected LBABOD<V> makeInstance() {
285
return new LBABOD<>(kernelFunction, k, l);