2
Functions to calculate the distance between two strings.
4
Copyright 2011 Canonical Ltd.
7
Ted Gould <ted@canonical.com>
9
This program is free software: you can redistribute it and/or modify it
10
under the terms of the GNU General Public License version 3, as published
11
by the Free Software Foundation.
13
This program is distributed in the hope that it will be useful, but
14
WITHOUT ANY WARRANTY; without even the implied warranties of
15
MERCHANTABILITY, SATISFACTORY QUALITY, or FITNESS FOR A PARTICULAR
16
PURPOSE. See the GNU General Public License for more details.
18
You should have received a copy of the GNU General Public License along
19
with this program. If not, see <http://www.gnu.org/licenses/>.
23
#include <glib/gprintf.h>
24
#include <glib-object.h>
25
#include <glib/gi18n.h>
29
#define ADD_PENALTY 10
30
#define PRE_ADD_PENALTY 1
31
#define DROP_PENALTY 10
32
#define END_DROP_PENALTY 10
33
#define TRANSPOSE_PENALTY 10
34
#define DELETE_PENALTY 10
35
#define SWAP_PENALTY 10
38
ignore_character (gchar inchar)
40
static gchar * ignore = NULL;
42
/* TRANSLATORS: These are chacaters that should not be considered
43
mistakes in the comparison functions. Typically they are gramatical
44
characters that can be found in menus. */
49
for (i = 0; i < 4; i++) {
50
if (ignore[i] == inchar) {
58
swap_cost (gchar a, gchar b)
62
if (ignore_character(a) || ignore_character(b))
64
if (g_unichar_toupper(a) == g_unichar_toupper(b))
65
return SWAP_PENALTY / 10; /* Some penalty, but close */
69
#define MATRIX_VAL(needle_loc, haystack_loc) (penalty_matrix[(needle_loc + 1) + (haystack_loc + 1) * (len_needle + 1)])
72
dumpmatrix (const gchar * needle, guint len_needle, const gchar * haystack, guint len_haystack, guint * penalty_matrix)
80
/* Character Column */
84
for (i = 0; i < len_needle; i++) {
85
g_printf("%c ", needle[i]);
89
/* Character Column */
91
for (i = -1; i < (gint)len_needle; i++) {
92
g_printf("%u ", MATRIX_VAL(i, -1));
96
for (j = 0; j < len_haystack; j++) {
97
g_printf("%c ", haystack[j]);
98
for (i = -1; i < (gint)len_needle; i++) {
99
g_printf("%u ", MATRIX_VAL(i, j));
109
calculate_token_distance (const gchar * needle, const gchar * haystack)
111
// g_debug("Comparing token '%s' to token '%s'", needle, haystack);
113
guint len_needle = 0;
114
guint len_haystack = 0;
116
if (needle != NULL) {
117
len_needle = g_utf8_strlen(needle, -1);
120
if (haystack != NULL) {
121
len_haystack = g_utf8_strlen(haystack, -1);
124
/* Handle the cases of very short or NULL strings quickly */
125
if (len_needle == 0) {
126
return DROP_PENALTY * len_haystack;
129
if (len_haystack == 0) {
130
return ADD_PENALTY * len_needle;
133
/* Allocate the matrix of penalties */
134
guint * penalty_matrix = g_malloc0(sizeof(guint) * (len_needle + 1) * (len_haystack + 1));
137
/* Take the first row and first column and make them additional letter penalties */
138
for (i = 0; i < len_needle + 1; i++) {
139
MATRIX_VAL(i - 1, -1) = i * ADD_PENALTY;
142
for (i = 0; i < len_haystack + 1; i++) {
143
if (i < len_haystack - len_needle) {
144
MATRIX_VAL(-1, i - 1) = i * PRE_ADD_PENALTY;
146
MATRIX_VAL(-1, i - 1) = (len_haystack - len_needle) * PRE_ADD_PENALTY + (i - (len_haystack - len_needle)) * DROP_PENALTY;
150
/* Now go through the matrix building up the penalties */
151
int ineedle, ihaystack;
152
for (ineedle = 0; ineedle < len_needle; ineedle++) {
153
for (ihaystack = 0; ihaystack < len_haystack; ihaystack++) {
154
char needle_let = needle[ineedle];
155
char haystack_let = haystack[ihaystack];
157
guint subst_pen = MATRIX_VAL(ineedle - 1, ihaystack - 1) + swap_cost(needle_let, haystack_let);
158
guint drop_pen = MATRIX_VAL(ineedle - 1, ihaystack);
159
if (ineedle < ihaystack) {
160
drop_pen += DROP_PENALTY;
162
drop_pen += END_DROP_PENALTY;
165
guint add_pen = MATRIX_VAL(ineedle, ihaystack - 1);
166
if (len_haystack - len_needle - ineedle > 0) {
167
add_pen += PRE_ADD_PENALTY;
169
add_pen += ADD_PENALTY;
171
guint transpose_pen = drop_pen + 1; /* ensures won't be chosen */
173
if (ineedle > 0 && ihaystack > 0 && needle_let == haystack[ihaystack - 1] && haystack_let == needle[ineedle - 1]) {
174
transpose_pen = MATRIX_VAL(ineedle - 2, ihaystack - 2) + TRANSPOSE_PENALTY;
177
MATRIX_VAL(ineedle, ihaystack) = MIN(MIN(subst_pen, drop_pen), MIN(add_pen, transpose_pen));
181
dumpmatrix(needle, len_needle, haystack, len_haystack, penalty_matrix);
183
guint retval = penalty_matrix[(len_needle + 1) * (len_haystack + 1) - 1];
184
g_free(penalty_matrix);
189
/* Looks through the array of paths and tries to find a minimum path
190
looking up from the needle specified. This way we can look through
191
all the possible paths */
193
minimize_distance_recurse (guint needle, guint num_needles, guint haystack, guint num_haystacks, guint * distances, guint * matches)
197
/* Put where we are in the array so that we don't forget */
198
matches[needle] = haystack;
200
/* First check to see if we've already used this entry */
201
for (i = needle - 1; i >= 0; i--) {
202
if (matches[i] == haystack) {
207
/* If we're the last needle, we can return our distance */
208
if (needle + 1 >= num_needles) {
209
return distances[(needle * num_haystacks) + haystack];
212
guint * local_match = g_new0(guint, num_needles);
213
for (i = 0; i < num_needles && i < needle + 1; i++) {
214
local_match[i] = matches[i];
217
/* Now look where we can get the minimum with the other needles */
218
guint min = G_MAXUINT;
219
for (i = 0; i < num_haystacks; i++) {
220
guint local = minimize_distance_recurse(needle + 1, num_needles, i, num_haystacks, distances, local_match);
226
for (j = needle + 1; j < num_needles; j++) {
227
matches[j] = local_match[j];
234
/* Return the min of everyone else plus our distance */
235
if (min < G_MAXUINT) {
236
min += distances[(needle * num_haystacks) + haystack];
242
/* Figuring out the lowest path through the distance array
243
where we don't use the same haystack tokens */
245
minimize_distance (guint num_needles, guint num_haystacks, guint * distances, guint * matches)
247
guint final_distance = G_MAXUINT;
248
guint * local_matches = g_new0(guint, num_needles);
250
guint haystack_token;
251
for (haystack_token = 0; haystack_token < num_haystacks; haystack_token++) {
252
guint distance = minimize_distance_recurse(0, num_needles, haystack_token, num_haystacks, distances, local_matches);
254
if (distance < final_distance) {
255
final_distance = distance;
258
for (match_cnt = 0; match_cnt < num_needles; match_cnt++) {
259
matches[match_cnt] = local_matches[match_cnt];
264
g_free(local_matches);
266
return final_distance;
269
/* Dups a specific token in the array of strv arrays */
271
find_token (guint token_number, GStrv * haystacks, guint num_haystacks)
275
for (haystack = 0; haystack < num_haystacks; haystack++) {
276
guint strvlen = g_strv_length(haystacks[haystack]);
278
if (token_number < strvlen) {
279
return g_strdup(haystacks[haystack][token_number]);
282
token_number -= strvlen;
288
#define SEPARATORS " .->"
291
calculate_distance (const gchar * needle, GStrv haystacks, GStrv * matches)
293
g_return_val_if_fail(needle != NULL || haystacks != NULL, G_MAXUINT);
294
guint final_distance = G_MAXUINT;
296
if (needle == NULL) {
297
return DROP_PENALTY * g_utf8_strlen(haystacks[0], 1024);
299
if (haystacks == NULL) {
300
return ADD_PENALTY * g_utf8_strlen(needle, 1024);
303
/* Tokenize all the haystack strings */
305
guint num_haystacks = g_strv_length(haystacks);
306
guint num_haystack_tokens = 0;
307
GStrv * haystacks_array = g_new0(GStrv, num_haystacks);
308
for (i = 0; i < num_haystacks; i++) {
309
haystacks_array[i] = g_strsplit_set(haystacks[i], SEPARATORS, 0);
310
num_haystack_tokens += g_strv_length(haystacks_array[i]);
313
/* Tokenize our needles the same way */
314
GStrv needle_tokens = g_strsplit_set(needle, SEPARATORS, 0);
315
guint num_needle_tokens = g_strv_length(needle_tokens);
317
/* If we can't even find a set that works, let's just cut
319
if (num_needle_tokens > num_haystack_tokens) {
323
/* We need a place to store all the distances */
324
guint * distances = g_new0(guint, num_haystack_tokens * num_needle_tokens);
326
/* Calculate all the distance combinations */
328
for (needle_token = 0; needle_tokens[needle_token] != NULL; needle_token++) {
329
gchar * ineedle = needle_tokens[needle_token];
332
guint haystack_token_cnt = 0;
333
for (haystacks_cnt = 0; haystacks_cnt < num_haystacks; haystacks_cnt++) {
335
for (haystack_cnt = 0; haystacks_array[haystacks_cnt][haystack_cnt] != NULL; haystack_cnt++) {
336
gchar * ihaystack = haystacks_array[haystacks_cnt][haystack_cnt];
337
guint distance = calculate_token_distance(ineedle, ihaystack);
339
distances[(needle_token * num_haystack_tokens) + haystack_token_cnt] = distance;
340
haystack_token_cnt++;
345
/* Now, try to find a path through the array that results in the
346
lowest total value */
347
guint * final_matches = g_new0(guint, num_needle_tokens);
349
final_distance = minimize_distance(num_needle_tokens, num_haystack_tokens, distances, final_matches);
351
/* Set up an array for matches so that we can enter
352
the items as we go */
353
if (matches != NULL) {
354
GStrv match_array = NULL;
355
match_array = g_new0(gchar *, num_needle_tokens + 1);
356
match_array[num_needle_tokens] = NULL;
358
/* Copy the strings that we care about */
360
for (i = 0; i < num_needle_tokens; i++) {
361
match_array[i] = find_token(final_matches[i], haystacks_array, num_haystacks);
364
*matches = match_array;
367
g_free(final_matches);
371
g_strfreev(needle_tokens);
372
for (i = 0; i < num_haystacks; i++) {
373
g_strfreev(haystacks_array[i]);
375
g_free(haystacks_array);
377
if (final_distance != G_MAXUINT) {
378
return final_distance / num_needle_tokens;