5
/****************************************************************************
9
* The main method here is the fuzq_score method which scores a term against
10
* another term. The other methods all act in support.
12
****************************************************************************/
14
static INLINE int fuzq_calculate_max_distance(FuzzyQuery *fuzq, int m)
16
return (int)((1.0 - fuzq->min_sim) * (MIN(fuzq->text_len, m) + fuzq->pre_len));
19
static void fuzq_initialize_max_distances(FuzzyQuery *fuzq)
22
for (i = 0; i < TYPICAL_LONGEST_WORD; i++) {
23
fuzq->max_distances[i] = fuzq_calculate_max_distance(fuzq, i);
27
static INLINE int fuzq_get_max_distance(FuzzyQuery *fuzq, int m)
29
return (m < TYPICAL_LONGEST_WORD) ? fuzq->max_distances[m]
30
: fuzq_calculate_max_distance(fuzq, m);
34
* The following algorithm is taken from Bob Carpenter's FuzzyTermEnum
37
* http://mail-archives.apache.org/mod_mbox/lucene-java-dev/200606.mbox/%3c448F0E8C.3050901@alias-i.com%3e
39
float fuzq_score(FuzzyQuery *fuzq, const char *target)
41
const int m = (int)strlen(target);
42
const int n = fuzq->text_len;
45
/* we don't have anything to compare. That means if we just add
46
* the letters for m we get the new word */
47
return fuzq->pre_len == 0 ? 0.0f : 1.0f - ((float) m / fuzq->pre_len);
50
return fuzq->pre_len == 0 ? 0.0f : 1.0f - ((float) n / fuzq->pre_len);
55
const char *text = fuzq->text;
56
const int max_distance = fuzq_get_max_distance(fuzq, m);
59
printf("n%dm%dmd%ddiff%d<%s><%s>\n", n, m, max_distance, m-n,
62
if (max_distance < ((m > n) ? (m-n) : (n-m))) { /* abs */
63
/* Just adding the characters of m to n or vice-versa results in
64
* too many edits for example "pre" length is 3 and "prefixes"
65
* length is 8. We can see that given this optimal circumstance,
66
* the edit distance cannot be less than 5 which is 8-3 or more
67
* precisesly Math.abs(3-8). If our maximum edit distance is 4,
68
* then we can discard this word without looking at it. */
73
d_prev = d_curr + n + 1;
76
for (j = 0; j <= n; j++) {
80
/* start computing edit distance */
83
/* swap d_current into d_prev */
87
prune = (d_curr[0] = ++i) > max_distance;
89
for (j = 0; j < n; j++) {
90
d_curr[j + 1] = (s_i == text[j])
91
? min3(d_prev[j + 1] + 1, d_curr[j] + 1, d_prev[j])
92
: min3(d_prev[j + 1], d_curr[j], d_prev[j]) + 1;
93
if (prune && d_curr[j + 1] <= max_distance) {
103
printf("<%f, d_curr[n] = %d min_len = %d>",
104
1.0f - ((float)d_curr[m] / (float) (fuzq->pre_len + min2(n, m))),
105
d_curr[m], fuzq->pre_len + min2(n, m));
108
/* this will return less than 0.0 when the edit distance is greater
109
* than the number of characters in the shorter word. but this was
110
* the formula that was previously used in FuzzyTermEnum, so it has
111
* not been changed (even though min_sim must be greater than 0.0) */
112
return 1.0f - ((float)d_curr[n] / (float) (fuzq->pre_len + min2(n, m)));
116
/****************************************************************************
120
****************************************************************************/
122
#define FzQ(query) ((FuzzyQuery *)(query))
124
static char *fuzq_to_s(Query *self, const char *curr_field)
127
char *term = FzQ(self)->term;
128
char *field = FzQ(self)->field;
129
int tlen = (int)strlen(term);
130
int flen = (int)strlen(field);
131
bptr = buffer = ALLOC_N(char, tlen + flen + 70);
133
if (strcmp(curr_field, field) != 0) {
134
sprintf(bptr, "%s:", field);
138
sprintf(bptr, "%s~", term);
140
if (FzQ(self)->min_sim != 0.5) {
141
dbl_to_s(bptr, FzQ(self)->min_sim);
142
bptr += strlen(bptr);
145
if (self->boost != 1.0) {
147
dbl_to_s(++bptr, self->boost);
153
static Query *fuzq_rewrite(Query *self, IndexReader *ir)
156
FuzzyQuery *fuzq = FzQ(self);
158
const char *term = fuzq->term;
159
const char *field = fuzq->field;
160
const int field_num = fis_get_field_num(ir->fis, field);
165
else if (fuzq->pre_len >= (int)strlen(term)) {
166
q = tq_new(field, term);
171
int pre_len = fuzq->pre_len;
173
q = multi_tq_new_conf(fuzq->field, MTQMaxTerms(self), fuzq->min_sim);
176
prefix = ALLOC_N(char, pre_len + 1);
177
strncpy(prefix, term, pre_len);
178
prefix[pre_len] = '\0';
179
te = ir->terms_from(ir, field_num, prefix);
182
te = ir->terms(ir, field_num);
185
fuzq->scale_factor = (float)(1.0 / (1.0 - fuzq->min_sim));
186
fuzq->text = term + pre_len;
187
fuzq->text_len = (int)strlen(fuzq->text);
188
fuzq->da = REALLOC_N(fuzq->da, int, fuzq->text_len * 2 + 2);
189
fuzq_initialize_max_distances(fuzq);
192
const char *curr_term = te->curr_term;
193
const char *curr_suffix = curr_term + pre_len;
198
if ((prefix && strncmp(curr_term, prefix, pre_len) != 0)) {
202
score = fuzq_score(fuzq, curr_suffix);
204
printf("%s:%s:%f < %f\n", curr_term, term, score, min_score);
206
multi_tq_add_term_boost(q, curr_term, score);
208
} while (te->next(te) != NULL);
218
static void fuzq_destroy(Query *self)
220
free(FzQ(self)->term);
221
free(FzQ(self)->field);
226
static unsigned long fuzq_hash(Query *self)
228
return str_hash(FzQ(self)->term) ^ str_hash(FzQ(self)->field)
229
^ float2int(FzQ(self)->min_sim) ^ FzQ(self)->pre_len;
232
static int fuzq_eq(Query *self, Query *o)
234
FuzzyQuery *fq1 = FzQ(self);
235
FuzzyQuery *fq2 = FzQ(o);
237
return (strcmp(fq1->term, fq2->term) == 0)
238
&& (strcmp(fq1->field, fq2->field) == 0)
239
&& (fq1->pre_len == fq2->pre_len)
240
&& (fq1->min_sim == fq2->min_sim);
243
Query *fuzq_new_conf(const char *field, const char *term,
244
float min_sim, int pre_len, int max_terms)
246
Query *self = q_new(FuzzyQuery);
248
FzQ(self)->field = estrdup(field);
249
FzQ(self)->term = estrdup(term);
250
FzQ(self)->pre_len = pre_len ? pre_len : DEF_PRE_LEN;
251
FzQ(self)->min_sim = min_sim ? min_sim : DEF_MIN_SIM;
252
MTQMaxTerms(self) = max_terms ? max_terms : DEF_MAX_TERMS;
254
self->type = FUZZY_QUERY;
255
self->to_s = &fuzq_to_s;
256
self->hash = &fuzq_hash;
258
self->rewrite = &fuzq_rewrite;
259
self->destroy_i = &fuzq_destroy;
260
self->create_weight_i = &q_create_weight_unsup;
265
Query *fuzq_new(const char *field, const char *term)
267
return fuzq_new_conf(field, term, 0.0f, 0, 0);