3
* $Date: 2007-01-04 18:46:10 +0100 (Thu, 04 Jan 2007) $
6
* Copyright (C) 1997-2007 The Chemistry Development Kit (CDK) project
8
* Contact: cdk-devel@lists.sourceforge.net
10
* This program is free software; you can redistribute it and/or
11
* modify it under the terms of the GNU Lesser General Public License
12
* as published by the Free Software Foundation; either version 2.1
13
* of the License, or (at your option) any later version.
14
* All we ask is that proper credit is given for our work, which includes
15
* - but is not limited to - adding the above copyright notice to the beginning
16
* of your source code files, and to any copyright notice that you may distribute
17
* with programs based on this work.
19
* This program is distributed in the hope that it will be useful,
20
* but WITHOUT ANY WARRANTY; without even the implied warranty of
21
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22
* GNU Lesser General Public License for more details.
24
* You should have received a copy of the GNU Lesser General Public License
25
* along with this program; if not, write to the Free Software
26
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
29
package org.openscience.cdk.similarity;
32
import java.util.BitSet;
33
import org.openscience.cdk.exception.CDKException;
36
* Calculates the Tanimoto coefficient for a given pair of two
37
* fingerprint bitsets or real valued feature vectors.
39
* The Tanimoto coefficient is one way to
40
* quantitatively measure the "distance" or similarity of
41
* two chemical structures.
43
* <p>You can use the Fingerprinter class to retrieve two fingerprint bitsets.
44
* We assume that you have two structures stored in cdk.Molecule objects.
45
* The Tanimoto coefficient can then be calculated like this:
47
* BitSet fingerprint1 = Fingerprinter.getFingerprint(molecule1);
48
* BitSet fingerprint2 = Fingerprinter.getFingerprint(molecule2);
49
* float tanimoto_coefficient = Tanimoto.calculate(fingerprint1, fingerprint2);
52
* <p>The Fingerprinter assumes that hydrogens are explicitly given, if this
54
* <p>Note that the continuous Tanimoto coefficient does not lead to a metric space
57
*@cdk.created 2005-10-19
59
*@cdk.keyword similarity, tanimoto
65
* Evaluates Tanimoto coefficient for two bit sets.
67
* @param bitset1 A bitset (such as a fingerprint) for the first molecule
68
* @param bitset2 A bitset (such as a fingerprint) for the second molecule
69
* @return The Tanimoto coefficient
71
public static float calculate(BitSet bitset1, BitSet bitset2) throws CDKException
73
float _bitset1_cardinality = bitset1.cardinality();
74
float _bitset2_cardinality = bitset2.cardinality();
75
if (bitset1.size() != bitset2.size()) {
76
throw new CDKException("Bisets must have the same bit length");
78
BitSet one_and_two = (BitSet)bitset1.clone();
79
one_and_two.and(bitset2);
80
float _common_bit_count = one_and_two.cardinality();
81
float _tanimoto_coefficient = _common_bit_count/(_bitset1_cardinality + _bitset2_cardinality - _common_bit_count);
82
return _tanimoto_coefficient;
86
* Evaluates the continuous Tanimoto coefficient for two real valued vectors.
88
* @param features1 The first feature vector
89
* @param features2 The second feature vector
90
* @return The continuous Tanimoto coefficient
92
public static float calculate(double[] features1, double[] features2) throws CDKException {
94
if (features1.length != features2.length) {
95
throw new CDKException("Features vectors must be of the same length");
98
int n = features1.length;
103
for (int i = 0; i < n; i++) {
104
ab += features1[i] * features2[i];
105
a2 += features1[i]*features1[i];
106
b2 += features2[i]*features2[i];
108
return (float)ab/(float)(a2+b2-ab);