1
package org.apache.lucene.search;
4
* Licensed to the Apache Software Foundation (ASF) under one or more
5
* contributor license agreements. See the NOTICE file distributed with
6
* this work for additional information regarding copyright ownership.
7
* The ASF licenses this file to You under the Apache License, Version 2.0
8
* (the "License"); you may not use this file except in compliance with
9
* the License. You may obtain a copy of the License at
11
* http://www.apache.org/licenses/LICENSE-2.0
13
* Unless required by applicable law or agreed to in writing, software
14
* distributed under the License is distributed on an "AS IS" BASIS,
15
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
* See the License for the specific language governing permissions and
17
* limitations under the License.
20
import java.util.List;
21
import java.util.Arrays;
22
import java.io.IOException;
24
import org.apache.lucene.analysis.MockAnalyzer;
25
import org.apache.lucene.analysis.standard.StandardAnalyzer;
26
import org.apache.lucene.util.LuceneTestCase;
27
import org.apache.lucene.document.Document;
28
import org.apache.lucene.document.Field;
29
import org.apache.lucene.index.IndexReader;
30
import org.apache.lucene.index.MultiReader;
31
import org.apache.lucene.index.RandomIndexWriter;
32
import org.apache.lucene.index.Term;
33
import org.apache.lucene.store.Directory;
34
import org.apache.lucene.queryParser.QueryParser;
37
* Tests {@link FuzzyQuery}.
40
public class TestFuzzyQuery extends LuceneTestCase {
42
/**
 * Indexes the terms "aaaaa", "aaaab", "aaabb", "aabbb", "abbbb", "bbbbb" and "ddddd" into
 * "field" and asserts how many documents {@link FuzzyQuery} returns, and in which order, for
 * several query terms at {@code FuzzyQuery.defaultMinSimilarity}.
 *
 * <p>Most queries are repeated with increasing values of the third constructor argument
 * (presumably the required common prefix length -- TODO confirm against FuzzyQuery).
 *
 * <p>NOTE(review): no closing brace or resource cleanup for this method is visible in this
 * listing, and the interleaved original line numbers skip values -- lines may have been
 * dropped; verify against the upstream file.
 */
public void testFuzziness() throws Exception {
43
Directory directory = newDirectory();
44
RandomIndexWriter writer = new RandomIndexWriter(random, directory);
45
addDoc("aaaaa", writer);
46
addDoc("aaaab", writer);
47
addDoc("aaabb", writer);
48
addDoc("aabbb", writer);
49
addDoc("abbbb", writer);
50
addDoc("bbbbb", writer);
51
addDoc("ddddd", writer);
53
IndexReader reader = writer.getReader();
54
IndexSearcher searcher = newSearcher(reader);
57
// Baseline for "aaaaa" with the third argument at 0: three documents are expected.
FuzzyQuery query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMinSimilarity, 0);
58
ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs;
59
assertEquals(3, hits.length);
62
// Same query with the third argument raised from 1 to 6: the expected hit count stays at 3
// up to a value of 3, drops to 2 at 4, and is 1 at 5 and 6.
query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMinSimilarity, 1);
63
hits = searcher.search(query, null, 1000).scoreDocs;
64
assertEquals(3, hits.length);
65
query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMinSimilarity, 2);
66
hits = searcher.search(query, null, 1000).scoreDocs;
67
assertEquals(3, hits.length);
68
query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMinSimilarity, 3);
69
hits = searcher.search(query, null, 1000).scoreDocs;
70
assertEquals(3, hits.length);
71
query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMinSimilarity, 4);
72
hits = searcher.search(query, null, 1000).scoreDocs;
73
assertEquals(2, hits.length);
74
query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMinSimilarity, 5);
75
hits = searcher.search(query, null, 1000).scoreDocs;
76
assertEquals(1, hits.length);
77
query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMinSimilarity, 6);
78
hits = searcher.search(query, null, 1000).scoreDocs;
79
assertEquals(1, hits.length);
82
// "bbbbb" is expected to match three documents, returned in exactly the order listed in
// 'order' below.
query = new FuzzyQuery(new Term("field", "bbbbb"), FuzzyQuery.defaultMinSimilarity, 0);
83
hits = searcher.search(query, null, 1000).scoreDocs;
84
assertEquals("3 documents should match", 3, hits.length);
85
List<String> order = Arrays.asList("bbbbb","abbbb","aabbb");
86
for (int i = 0; i < hits.length; i++) {
87
final String term = searcher.doc(hits[i].doc).get("field");
88
//System.out.println(hits[i].score);
89
assertEquals(order.get(i), term);
92
// test pq size by supplying maxExpansions=2
93
// This query would normally return 3 documents, because 3 terms match (see above):
94
query = new FuzzyQuery(new Term("field", "bbbbb"), FuzzyQuery.defaultMinSimilarity, 0, 2);
95
hits = searcher.search(query, null, 1000).scoreDocs;
96
assertEquals("only 2 documents should match", 2, hits.length);
97
order = Arrays.asList("bbbbb","abbbb");
98
for (int i = 0; i < hits.length; i++) {
99
final String term = searcher.doc(hits[i].doc).get("field");
100
//System.out.println(hits[i].score);
101
assertEquals(order.get(i), term);
104
// not similar enough:
105
query = new FuzzyQuery(new Term("field", "xxxxx"), FuzzyQuery.defaultMinSimilarity, 0);
106
hits = searcher.search(query, null, 1000).scoreDocs;
107
assertEquals(0, hits.length);
108
query = new FuzzyQuery(new Term("field", "aaccc"), FuzzyQuery.defaultMinSimilarity, 0); // edit distance to "aaaaa" = 3
109
hits = searcher.search(query, null, 1000).scoreDocs;
110
assertEquals(0, hits.length);
112
// query identical to a word in the index:
113
query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMinSimilarity, 0);
114
hits = searcher.search(query, null, 1000).scoreDocs;
115
assertEquals(3, hits.length);
116
// NOTE(review): assuming JUnit-style assertEquals(expected, actual), the term assertions
// from here on pass the actual value first; that would only affect the failure message,
// not the outcome -- confirm before swapping.
assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaa"));
117
// default allows for up to two edits:
118
assertEquals(searcher.doc(hits[1].doc).get("field"), ("aaaab"));
119
assertEquals(searcher.doc(hits[2].doc).get("field"), ("aaabb"));
121
// query similar to a word in the index:
122
query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMinSimilarity, 0);
123
hits = searcher.search(query, null, 1000).scoreDocs;
124
assertEquals(3, hits.length);
125
assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaa"));
126
assertEquals(searcher.doc(hits[1].doc).get("field"), ("aaaab"));
127
assertEquals(searcher.doc(hits[2].doc).get("field"), ("aaabb"));
130
// Repeat the "aaaac" query with the third argument raised from 1 to 5: three hits are
// expected up to a value of 3, two at 4, and none at 5.
query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMinSimilarity, 1);
131
hits = searcher.search(query, null, 1000).scoreDocs;
132
assertEquals(3, hits.length);
133
assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaa"));
134
assertEquals(searcher.doc(hits[1].doc).get("field"), ("aaaab"));
135
assertEquals(searcher.doc(hits[2].doc).get("field"), ("aaabb"));
136
query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMinSimilarity, 2);
137
hits = searcher.search(query, null, 1000).scoreDocs;
138
assertEquals(3, hits.length);
139
assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaa"));
140
assertEquals(searcher.doc(hits[1].doc).get("field"), ("aaaab"));
141
assertEquals(searcher.doc(hits[2].doc).get("field"), ("aaabb"));
142
query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMinSimilarity, 3);
143
hits = searcher.search(query, null, 1000).scoreDocs;
144
assertEquals(3, hits.length);
145
assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaa"));
146
assertEquals(searcher.doc(hits[1].doc).get("field"), ("aaaab"));
147
assertEquals(searcher.doc(hits[2].doc).get("field"), ("aaabb"));
148
query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMinSimilarity, 4);
149
hits = searcher.search(query, null, 1000).scoreDocs;
150
assertEquals(2, hits.length);
151
assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaa"));
152
assertEquals(searcher.doc(hits[1].doc).get("field"), ("aaaab"));
153
query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMinSimilarity, 5);
154
hits = searcher.search(query, null, 1000).scoreDocs;
155
assertEquals(0, hits.length);
158
// "ddddX" differs from the indexed "ddddd" only in its last character; it is expected to
// match that single document while the third argument is 0..4 and nothing at 5.
query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMinSimilarity, 0);
159
hits = searcher.search(query, null, 1000).scoreDocs;
160
assertEquals(1, hits.length);
161
assertEquals(searcher.doc(hits[0].doc).get("field"), ("ddddd"));
164
query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMinSimilarity, 1);
165
hits = searcher.search(query, null, 1000).scoreDocs;
166
assertEquals(1, hits.length);
167
assertEquals(searcher.doc(hits[0].doc).get("field"), ("ddddd"));
168
query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMinSimilarity, 2);
169
hits = searcher.search(query, null, 1000).scoreDocs;
170
assertEquals(1, hits.length);
171
assertEquals(searcher.doc(hits[0].doc).get("field"), ("ddddd"));
172
query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMinSimilarity, 3);
173
hits = searcher.search(query, null, 1000).scoreDocs;
174
assertEquals(1, hits.length);
175
assertEquals(searcher.doc(hits[0].doc).get("field"), ("ddddd"));
176
query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMinSimilarity, 4);
177
hits = searcher.search(query, null, 1000).scoreDocs;
178
assertEquals(1, hits.length);
179
assertEquals(searcher.doc(hits[0].doc).get("field"), ("ddddd"));
180
query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMinSimilarity, 5);
181
hits = searcher.search(query, null, 1000).scoreDocs;
182
assertEquals(0, hits.length);
185
// different field = no match:
186
query = new FuzzyQuery(new Term("anotherfield", "ddddX"), FuzzyQuery.defaultMinSimilarity, 0);
187
hits = searcher.search(query, null, 1000).scoreDocs;
188
assertEquals(0, hits.length);
195
/**
 * Indexes the seven-character terms "aaaaaaa" and "segment" and asserts the number of
 * {@link FuzzyQuery} hits for seven- and eight-character query terms, then checks that
 * constructing a query with a minimum similarity of 1.1f or -0.1f is expected to raise
 * {@link IllegalArgumentException}.
 *
 * <p>The third constructor argument is varied as in the shorter-term test (presumably the
 * required common prefix length -- TODO confirm against FuzzyQuery).
 *
 * <p>NOTE(review): in this listing 'query' is assigned without a visible declaration, the
 * '} catch' lines have no visible 'try {', and no closing brace or cleanup is shown. The
 * interleaved original line numbers skip values, so lines may have been dropped -- verify
 * against the upstream file.
 */
public void testFuzzinessLong() throws Exception {
196
Directory directory = newDirectory();
197
RandomIndexWriter writer = new RandomIndexWriter(random, directory);
198
addDoc("aaaaaaa", writer);
199
addDoc("segment", writer);
201
IndexReader reader = writer.getReader();
202
IndexSearcher searcher = newSearcher(reader);
206
// not similar enough:
207
query = new FuzzyQuery(new Term("field", "xxxxx"), FuzzyQuery.defaultMinSimilarity, 0);
208
ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs;
209
assertEquals(0, hits.length);
210
// edit distance to "aaaaaaa" = 3, this matches because the string is longer than
211
// in testFuzziness (where "aaccc" vs. "aaaaa" does not match), so a bigger difference is allowed:
212
query = new FuzzyQuery(new Term("field", "aaaaccc"), FuzzyQuery.defaultMinSimilarity, 0);
213
hits = searcher.search(query, null, 1000).scoreDocs;
214
assertEquals(1, hits.length);
215
// NOTE(review): assuming JUnit-style assertEquals(expected, actual), the arguments of the
// term assertions in this method are reversed (actual first) -- confirm before swapping.
assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaaaa"));
218
// "aaaaccc" is still expected to match with the third argument at 1 and 4, but not at 5.
query = new FuzzyQuery(new Term("field", "aaaaccc"), FuzzyQuery.defaultMinSimilarity, 1);
219
hits = searcher.search(query, null, 1000).scoreDocs;
220
assertEquals(1, hits.length);
221
assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaaaa"));
222
query = new FuzzyQuery(new Term("field", "aaaaccc"), FuzzyQuery.defaultMinSimilarity, 4);
223
hits = searcher.search(query, null, 1000).scoreDocs;
224
assertEquals(1, hits.length);
225
assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaaaa"));
226
query = new FuzzyQuery(new Term("field", "aaaaccc"), FuzzyQuery.defaultMinSimilarity, 5);
227
hits = searcher.search(query, null, 1000).scoreDocs;
228
assertEquals(0, hits.length);
230
// no match, more than half of the characters are wrong:
231
query = new FuzzyQuery(new Term("field", "aaacccc"), FuzzyQuery.defaultMinSimilarity, 0);
232
hits = searcher.search(query, null, 1000).scoreDocs;
233
assertEquals(0, hits.length);
236
// Still no match for "aaacccc" with the third argument at 2.
query = new FuzzyQuery(new Term("field", "aaacccc"), FuzzyQuery.defaultMinSimilarity, 2);
237
hits = searcher.search(query, null, 1000).scoreDocs;
238
assertEquals(0, hits.length);
240
// "student" and "stellent" are indeed similar to "segment" by default:
241
query = new FuzzyQuery(new Term("field", "student"), FuzzyQuery.defaultMinSimilarity, 0);
242
hits = searcher.search(query, null, 1000).scoreDocs;
243
assertEquals(1, hits.length);
244
query = new FuzzyQuery(new Term("field", "stellent"), FuzzyQuery.defaultMinSimilarity, 0);
245
hits = searcher.search(query, null, 1000).scoreDocs;
246
assertEquals(1, hits.length);
249
// Both terms are still expected to match with the third argument at 1 but not at 2; note
// that they share only their first character ("s") with the indexed "segment".
query = new FuzzyQuery(new Term("field", "student"), FuzzyQuery.defaultMinSimilarity, 1);
250
hits = searcher.search(query, null, 1000).scoreDocs;
251
assertEquals(1, hits.length);
252
query = new FuzzyQuery(new Term("field", "stellent"), FuzzyQuery.defaultMinSimilarity, 1);
253
hits = searcher.search(query, null, 1000).scoreDocs;
254
assertEquals(1, hits.length);
255
query = new FuzzyQuery(new Term("field", "student"), FuzzyQuery.defaultMinSimilarity, 2);
256
hits = searcher.search(query, null, 1000).scoreDocs;
257
assertEquals(0, hits.length);
258
query = new FuzzyQuery(new Term("field", "stellent"), FuzzyQuery.defaultMinSimilarity, 2);
259
hits = searcher.search(query, null, 1000).scoreDocs;
260
assertEquals(0, hits.length);
262
// "student" doesn't match anymore thanks to increased minimum similarity:
263
query = new FuzzyQuery(new Term("field", "student"), 0.6f, 0);
264
hits = searcher.search(query, null, 1000).scoreDocs;
265
assertEquals(0, hits.length);
268
// A minimum similarity of 1.1f is expected to be rejected by the constructor.
query = new FuzzyQuery(new Term("field", "student"), 1.1f);
269
fail("Expected IllegalArgumentException");
270
} catch (IllegalArgumentException e) {
271
// expecting exception
274
// A negative minimum similarity (-0.1f) is expected to be rejected as well.
query = new FuzzyQuery(new Term("field", "student"), -0.1f);
275
fail("Expected IllegalArgumentException");
276
} catch (IllegalArgumentException e) {
277
// expecting exception
285
/**
 * Indexes "12345678911" and "segment" and runs {@link FuzzyQuery} with a minimum similarity
 * of 0.9f against query terms of 7, 10, 11 and 12 characters. Only the 11-character term that
 * equals an indexed term is expected to produce a hit.
 *
 * <p>The "optimization" named in the inline comments is not part of this file; presumably it
 * is a length-based shortcut inside the fuzzy term matching -- TODO confirm.
 *
 * <p>NOTE(review): in this listing 'query' is assigned without a visible declaration and no
 * closing brace or cleanup is shown; lines may have been dropped -- verify against the
 * upstream file.
 *
 * @throws IOException declared by the method; the visible body does not throw it directly
 */
public void testTokenLengthOpt() throws IOException {
286
Directory directory = newDirectory();
287
RandomIndexWriter writer = new RandomIndexWriter(random, directory);
288
addDoc("12345678911", writer);
289
addDoc("segment", writer);
291
IndexReader reader = writer.getReader();
292
IndexSearcher searcher = newSearcher(reader);
296
// term not over 10 chars, so optimization shortcuts
297
query = new FuzzyQuery(new Term("field", "1234569"), 0.9f);
298
ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs;
299
assertEquals(0, hits.length);
301
// 10 chars, so no optimization
302
query = new FuzzyQuery(new Term("field", "1234567891"), 0.9f);
303
hits = searcher.search(query, null, 1000).scoreDocs;
304
assertEquals(0, hits.length);
306
// over 10 chars, so no optimization
307
// (this 11-character term is identical to an indexed term, hence the single hit)
query = new FuzzyQuery(new Term("field", "12345678911"), 0.9f);
308
hits = searcher.search(query, null, 1000).scoreDocs;
309
assertEquals(1, hits.length);
311
// over 10 chars, no match
312
query = new FuzzyQuery(new Term("field", "sdfsdfsdfsdf"), 0.9f);
313
hits = searcher.search(query, null, 1000).scoreDocs;
314
assertEquals(0, hits.length);
321
/**
 * Test the TopTermsBoostOnlyBooleanQueryRewrite rewrite method.
 *
 * <p>Indexes "Lucene" twice and "Lucenne" once, searches for "Lucene" with a
 * {@code MultiTermQuery.TopTermsBoostOnlyBooleanQueryRewrite} constructed with 50, and expects
 * all three documents with both exact matches ranked ahead of "Lucenne".
 *
 * <p>NOTE(review): no closing brace or resource cleanup for this method is visible in this
 * listing; lines may have been dropped -- verify against the upstream file.
 */
322
public void testBoostOnlyRewrite() throws Exception {
323
Directory directory = newDirectory();
324
RandomIndexWriter writer = new RandomIndexWriter(random, directory);
325
addDoc("Lucene", writer);
326
addDoc("Lucene", writer);
327
addDoc("Lucenne", writer);
329
IndexReader reader = writer.getReader();
330
IndexSearcher searcher = newSearcher(reader);
333
FuzzyQuery query = new FuzzyQuery(new Term("field", "Lucene"));
334
query.setRewriteMethod(new MultiTermQuery.TopTermsBoostOnlyBooleanQueryRewrite(50));
335
ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs;
336
assertEquals(3, hits.length);
337
// normally, 'Lucenne' would be the first result as IDF will skew the score.
338
// Stored values are read back through the reader rather than the searcher here.
assertEquals("Lucene", reader.document(hits[0].doc).get("field"));
339
assertEquals("Lucene", reader.document(hits[1].doc).get("field"));
340
assertEquals("Lucenne", reader.document(hits[2].doc).get("field"));
346
/**
 * Indexes a handful of titles and names, parses the query string "giga~0.9" with a
 * {@link QueryParser} built on a {@link MockAnalyzer}, and expects exactly one hit whose
 * stored field value is "Giga byte".
 *
 * <p>NOTE(review): no closing brace or resource cleanup for this method is visible in this
 * listing; lines may have been dropped -- verify against the upstream file.
 */
public void testGiga() throws Exception {
348
MockAnalyzer analyzer = new MockAnalyzer(random);
349
Directory index = newDirectory();
350
RandomIndexWriter w = new RandomIndexWriter(random, index);
352
addDoc("Lucene in Action", w);
353
addDoc("Lucene for Dummies", w);
356
addDoc("Giga byte", w);
358
addDoc("ManagingGigabytesManagingGigabyte", w);
359
addDoc("ManagingGigabytesManagingGigabytes", w);
361
addDoc("The Art of Computer Science", w);
362
addDoc("J. K. Rowling", w);
363
addDoc("JK Rowling", w);
364
addDoc("Joanne K Roling", w);
365
addDoc("Bruce Willis", w);
366
addDoc("Willis bruce", w);
367
addDoc("Brute willis", w);
368
addDoc("B. willis", w);
369
IndexReader r = w.getReader();
372
// The analyzer is only handed to the query parser here; the writer is created without it.
Query q = new QueryParser(TEST_VERSION_CURRENT, "field", analyzer).parse( "giga~0.9" );
375
IndexSearcher searcher = newSearcher(r);
376
ScoreDoc[] hits = searcher.search(q, 10).scoreDocs;
377
assertEquals(1, hits.length);
378
assertEquals("Giga byte", searcher.doc(hits[0].doc).get("field"));
384
/**
 * Adds one document to the given writer, holding {@code text} in a single stored, analyzed
 * field named "field".
 *
 * <p>NOTE(review): the closing brace of this method is not visible in this listing -- verify
 * against the upstream file.
 *
 * @param text value for the document's "field" field
 * @param writer index writer that receives the document
 * @throws IOException declared by the method; presumably propagated from
 *     {@code writer.addDocument} -- confirm
 */
private void addDoc(String text, RandomIndexWriter writer) throws IOException {
385
Document doc = new Document();
386
doc.add(newField("field", text, Field.Store.YES, Field.Index.ANALYZED));
387
writer.addDocument(doc);