4
* Miscellaneous functions to speed up the IMDbPY package.
8
* Function that implements the Ratcliff-Obershelp comparison
9
* amongst Python strings.
12
* Return a soundex code string, for the given string.
14
* Copyright 2004-2009 Davide Alberani <da@erlug.linux.it>
15
* Released under the GPL license.
17
* NOTE: The Ratcliff-Obershelp part was heavily based on code from the
18
* "simil" Python module.
19
* The "simil" module is copyright of Luca Montecchiani <cbm64 _at_ inwind.it>
20
* and can be found here: http://spazioinwind.libero.it/montecchiani/
21
* It was released under the GPL license; original comments are leaved
27
/*========== Ratcliff-Obershelp ==========*/
28
/*****************************************************************************
32
* [Python-Dev] Why is soundex marked obsolete?
33
* by Eric S. Raymond [4]esr@thyrsus.com
34
* on Sun, 14 Jan 2001 14:09:01 -0500
36
*****************************************************************************/
38
/*****************************************************************************
40
* Ratcliff-Obershelp common-subpattern similarity.
42
* This code first appeared in a letter to the editor in Doctor
43
* Dobbs's Journal, 11/1988. The original article on the algorithm,
44
* "Pattern Matching by Gestalt" by John Ratcliff, had appeared in the
45
* July 1988 issue (#181) but the algorithm was presented in assembly.
46
* The main drawback of the Ratcliff-Obershelp algorithm is the cost
47
* of the pairwise comparisons. It is significantly more expensive
48
* than stemming, Hamming distance, soundex, and the like.
50
* Running time quadratic in the data size, memory usage constant.
52
*****************************************************************************/
56
#define DONTCOMPARE_NULL 0.0
57
#define DONTCOMPARE_SAME 1.0
59
#define STRING_MAXLENDIFFER 0.7
61
/* As of 05 Mar 2008, the longest title is ~600 chars. */
62
#define MXLINELEN 1023
64
#define MAX(a,b) ((a) > (b) ? (a) : (b))
67
//*****************************************
68
// preliminary check....
69
//*****************************************
71
strings_check(char const *s, char const *t)
73
float threshold; // lenght difference
74
int s_len = strlen(s); // length of s
75
int t_len = strlen(t); // length of t
78
if ((t_len * s_len) == 0)
79
return (DONTCOMPARE_NULL);
82
if (strcmp(s, t) == 0)
83
return (DONTCOMPARE_SAME);
85
// string lenght difference threshold
86
// we don't want to compare too different lenght strings ;)
88
threshold = (float) s_len / (float) t_len;
90
threshold = (float) t_len / (float) s_len;
91
if (threshold < STRING_MAXLENDIFFER)
92
return (DONTCOMPARE_NULL);
100
RatcliffObershelp(char *st1, char *end1, char *st2, char *end2)
102
register char *a1, *a2;
104
char *s1 = st1, *s2 = st2; /* initializations are just to pacify GCC */
107
if (end1 <= st1 || end2 <= st2)
109
if (end1 == st1 + 1 && end2 == st2 + 1)
116
for (a1 = st1; a1 < b1; a1++) {
117
for (a2 = st2; a2 < b2; a2++) {
119
/* determine length of common substring */
120
for (i = 1; a1[i] && (a1[i] == a2[i]); i++)
134
max += RatcliffObershelp(s1 + max, end1, s2 + max, end2); /* rhs */
135
max += RatcliffObershelp(st1, s1, st2, s2); /* lhs */
141
ratcliff(char *s1, char *s2)
142
/* compute Ratcliff-Obershelp similarity of two strings */
148
res = strings_check(s1, s2);
155
return 2.0 * RatcliffObershelp(s1, s1 + l1, s2, s2 + l2) / (l1 + l2);
159
/* Change a string to lowercase. */
164
for (i=0; i < strlen(s1); i++) s1[i] = tolower(s1[i]);
168
/* Ratcliff-Obershelp for two python strings; returns a python float. */
170
pyratcliff(PyObject *self, PyObject *pArgs)
174
PyObject *discard = NULL;
175
char s1copy[MXLINELEN+1];
176
char s2copy[MXLINELEN+1];
178
/* The optional PyObject parameter is here to be compatible
179
* with the pure python implementation, which uses a
180
* difflib.SequenceMatcher object. */
181
if (!PyArg_ParseTuple(pArgs, "ss|O", &s1, &s2, &discard))
184
strncpy(s1copy, s1, MXLINELEN);
185
strncpy(s2copy, s2, MXLINELEN);
186
/* Work on copies. */
190
return Py_BuildValue("f", ratcliff(s1copy, s2copy));
194
/*========== soundex ==========*/
195
/* Max length of the soundex code to output (an uppercase char and
196
* _at most_ 4 digits). */
197
#define SOUNDEX_LEN 5
199
/* Group Number Lookup Table */
200
static char soundTable[26] =
201
{ 0 /* A */, '1' /* B */, '2' /* C */, '3' /* D */, 0 /* E */, '1' /* F */,
202
'2' /* G */, 0 /* H */, 0 /* I */, '2' /* J */, '2' /* K */, '4' /* L */,
203
'5' /* M */, '5' /* N */, 0 /* O */, '1' /* P */, '2' /* Q */, '6' /* R */,
204
'2' /* S */, '3' /* T */, 0 /* U */, '1' /* V */, 0 /* W */, '2' /* X */,
205
0 /* Y */, '2' /* Z */};
208
pysoundex(PyObject *self, PyObject *pArgs)
212
char word[MXLINELEN+1];
213
char soundCode[SOUNDEX_LEN+1];
216
if (!PyArg_ParseTuple(pArgs, "s", &s))
222
/* Convert to uppercase and exclude non-ascii chars. */
223
for (i = 0; i < n; i++) {
225
if (c < 91 && c > 64) {
234
/* If the string is empty, returns None. */
235
return Py_BuildValue("");
237
soundCode[0] = word[0];
239
/* Build the soundCode string. */
241
for (i = 1; j < SOUNDEX_LEN && i < n; i++) {
242
c = soundTable[(word[i]-65)];
243
/* Compact zeroes and equal consecutive digits ("12234112"->"123412") */
244
if (c != 0 && c != soundCode[j-1]) {
250
return Py_BuildValue("s", soundCode);
254
static PyMethodDef cutils_methods[] = {
255
{"ratcliff", pyratcliff,
256
METH_VARARGS, "Ratcliff-Obershelp similarity."},
257
{"soundex", pysoundex,
258
METH_VARARGS, "Soundex code for strings."},
266
Py_InitModule("cutils", cutils_methods);