1
package org.apache.lucene.search.spell;
4
* Licensed to the Apache Software Foundation (ASF) under one or more
5
* contributor license agreements. See the NOTICE file distributed with
6
* this work for additional information regarding copyright ownership.
7
* The ASF licenses this file to You under the Apache License, Version 2.0
8
* (the "License"); you may not use this file except in compliance with
9
* the License. You may obtain a copy of the License at
11
* http://www.apache.org/licenses/LICENSE-2.0
13
* Unless required by applicable law or agreed to in writing, software
14
* distributed under the License is distributed on an "AS IS" BASIS,
15
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
* See the License for the specific language governing permissions and
17
* limitations under the License.
20
import java.io.IOException;
21
import java.util.ArrayList;
22
import java.util.Collections;
23
import java.util.Comparator;
24
import java.util.List;
25
import java.util.concurrent.ExecutorService;
26
import java.util.concurrent.Executors;
27
import java.util.concurrent.TimeUnit;
29
import org.apache.lucene.analysis.MockAnalyzer;
30
import org.apache.lucene.document.Document;
31
import org.apache.lucene.document.Field;
32
import org.apache.lucene.index.CorruptIndexException;
33
import org.apache.lucene.index.IndexReader;
34
import org.apache.lucene.index.IndexWriter;
35
import org.apache.lucene.index.IndexWriterConfig;
36
import org.apache.lucene.search.IndexSearcher;
37
import org.apache.lucene.store.AlreadyClosedException;
38
import org.apache.lucene.store.Directory;
39
import org.apache.lucene.util.English;
40
import org.apache.lucene.util.LuceneTestCase;
43
* Spell checker test case
45
public class TestSpellChecker extends LuceneTestCase {
46
private SpellCheckerMock spellChecker;
47
private Directory userindex, spellindex;
48
private List<IndexSearcher> searchers;
51
public void setUp() throws Exception {
55
userindex = newDirectory();
56
IndexWriter writer = new IndexWriter(userindex, new IndexWriterConfig(
57
TEST_VERSION_CURRENT, new MockAnalyzer(random)));
59
for (int i = 0; i < 1000; i++) {
60
Document doc = new Document();
61
doc.add(newField("field1", English.intToEnglish(i), Field.Store.YES, Field.Index.ANALYZED));
62
doc.add(newField("field2", English.intToEnglish(i + 1), Field.Store.YES, Field.Index.ANALYZED)); // + word thousand
63
doc.add(newField("field3", "fvei" + (i % 2 == 0 ? " five" : ""), Field.Store.YES, Field.Index.ANALYZED)); // + word thousand
64
writer.addDocument(doc);
67
Document doc = new Document();
68
doc.add(newField("field1", "eight", Field.Index.ANALYZED)); // "eight" in
71
writer.addDocument(doc);
74
Document doc = new Document();
76
.add(newField("field1", "twenty-one twenty-one",
77
Field.Index.ANALYZED)); // "twenty-one" in the index thrice
78
writer.addDocument(doc);
81
Document doc = new Document();
82
doc.add(newField("field1", "twenty", Field.Index.ANALYZED)); // "twenty"
86
writer.addDocument(doc);
90
searchers = Collections.synchronizedList(new ArrayList<IndexSearcher>());
91
// create the spellChecker
92
spellindex = newDirectory();
93
spellChecker = new SpellCheckerMock(spellindex);
97
public void tearDown() throws Exception {
99
if (!spellChecker.isClosed())
100
spellChecker.close();
106
public void testBuild() throws CorruptIndexException, IOException {
107
IndexReader r = IndexReader.open(userindex, true);
109
spellChecker.clearIndex();
111
addwords(r, spellChecker, "field1");
112
int num_field1 = this.numdoc();
114
addwords(r, spellChecker, "field2");
115
int num_field2 = this.numdoc();
117
assertEquals(num_field2, num_field1 + 1);
119
assertLastSearcherOpen(4);
121
checkCommonSuggestions(r);
122
checkLevenshteinSuggestions(r);
124
spellChecker.setStringDistance(new JaroWinklerDistance());
125
spellChecker.setAccuracy(0.8f);
126
checkCommonSuggestions(r);
127
checkJaroWinklerSuggestions();
128
// the accuracy is set to 0.8 by default, but the best result has a score of 0.925
129
String[] similar = spellChecker.suggestSimilar("fvie", 2, 0.93f);
130
assertTrue(similar.length == 0);
131
similar = spellChecker.suggestSimilar("fvie", 2, 0.92f);
132
assertTrue(similar.length == 1);
134
similar = spellChecker.suggestSimilar("fiv", 2);
135
assertTrue(similar.length > 0);
136
assertEquals(similar[0], "five");
138
spellChecker.setStringDistance(new NGramDistance(2));
139
spellChecker.setAccuracy(0.5f);
140
checkCommonSuggestions(r);
141
checkNGramSuggestions();
146
public void testComparator() throws Exception {
147
IndexReader r = IndexReader.open(userindex, true);
148
Directory compIdx = newDirectory();
149
SpellChecker compareSP = new SpellCheckerMock(compIdx, new LevensteinDistance(), new SuggestWordFrequencyComparator());
150
addwords(r, compareSP, "field3");
152
String[] similar = compareSP.suggestSimilar("fvie", 2, r, "field3",
153
SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
154
assertEquals(2, similar.length);
155
//five and fvei have the same score, but different frequencies.
156
assertEquals("fvei", similar[0]);
157
assertEquals("five", similar[1]);
159
if (!compareSP.isClosed())
164
public void testBogusField() throws Exception {
165
IndexReader r = IndexReader.open(userindex, true);
166
Directory compIdx = newDirectory();
167
SpellChecker compareSP = new SpellCheckerMock(compIdx, new LevensteinDistance(), new SuggestWordFrequencyComparator());
168
addwords(r, compareSP, "field3");
170
String[] similar = compareSP.suggestSimilar("fvie", 2, r,
171
"bogusFieldBogusField", SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
172
assertEquals(0, similar.length);
174
if (!compareSP.isClosed())
179
public void testSuggestModes() throws Exception {
180
IndexReader r = IndexReader.open(userindex, true);
181
spellChecker.clearIndex();
182
addwords(r, spellChecker, "field1");
185
String[] similar = spellChecker.suggestSimilar("eighty", 2, r, "field1",
186
SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
187
assertEquals(1, similar.length);
188
assertEquals("eighty", similar[0]);
192
String[] similar = spellChecker.suggestSimilar("eight", 2, r, "field1",
193
SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
194
assertEquals(1, similar.length);
195
assertEquals("eight", similar[0]);
199
String[] similar = spellChecker.suggestSimilar("eighty", 5, r, "field1",
200
SuggestMode.SUGGEST_MORE_POPULAR);
201
assertEquals(5, similar.length);
202
assertEquals("eight", similar[0]);
206
String[] similar = spellChecker.suggestSimilar("twenty", 5, r, "field1",
207
SuggestMode.SUGGEST_MORE_POPULAR);
208
assertEquals(1, similar.length);
209
assertEquals("twenty-one", similar[0]);
213
String[] similar = spellChecker.suggestSimilar("eight", 5, r, "field1",
214
SuggestMode.SUGGEST_MORE_POPULAR);
215
assertEquals(0, similar.length);
219
String[] similar = spellChecker.suggestSimilar("eighty", 5, r, "field1",
220
SuggestMode.SUGGEST_ALWAYS);
221
assertEquals(5, similar.length);
222
assertEquals("eight", similar[0]);
226
String[] similar = spellChecker.suggestSimilar("eight", 5, r, "field1",
227
SuggestMode.SUGGEST_ALWAYS);
228
assertEquals(5, similar.length);
229
assertEquals("eighty", similar[0]);
233
private void checkCommonSuggestions(IndexReader r) throws IOException {
234
String[] similar = spellChecker.suggestSimilar("fvie", 2);
235
assertTrue(similar.length > 0);
236
assertEquals(similar[0], "five");
238
similar = spellChecker.suggestSimilar("five", 2);
239
if (similar.length > 0) {
240
assertFalse(similar[0].equals("five")); // don't suggest a word for itself
243
similar = spellChecker.suggestSimilar("fiv", 2);
244
assertTrue(similar.length > 0);
245
assertEquals(similar[0], "five");
247
similar = spellChecker.suggestSimilar("fives", 2);
248
assertTrue(similar.length > 0);
249
assertEquals(similar[0], "five");
251
assertTrue(similar.length > 0);
252
similar = spellChecker.suggestSimilar("fie", 2);
253
assertEquals(similar[0], "five");
255
// test restraint to a field
256
similar = spellChecker.suggestSimilar("tousand", 10, r, "field1",
257
SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
258
assertEquals(0, similar.length); // there isn't the term thousand in the field field1
260
similar = spellChecker.suggestSimilar("tousand", 10, r, "field2",
261
SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
262
assertEquals(1, similar.length); // there is the term thousand in the field field2
265
private void checkLevenshteinSuggestions(IndexReader r) throws IOException {
267
String[] similar = spellChecker.suggestSimilar("fvie", 2);
268
assertEquals(1, similar.length);
269
assertEquals(similar[0], "five");
271
similar = spellChecker.suggestSimilar("five", 2);
272
assertEquals(1, similar.length);
273
assertEquals(similar[0], "nine"); // don't suggest a word for itself
275
similar = spellChecker.suggestSimilar("fiv", 2);
276
assertEquals(1, similar.length);
277
assertEquals(similar[0], "five");
279
similar = spellChecker.suggestSimilar("ive", 2);
280
assertEquals(2, similar.length);
281
assertEquals(similar[0], "five");
282
assertEquals(similar[1], "nine");
284
similar = spellChecker.suggestSimilar("fives", 2);
285
assertEquals(1, similar.length);
286
assertEquals(similar[0], "five");
288
similar = spellChecker.suggestSimilar("fie", 2);
289
assertEquals(2, similar.length);
290
assertEquals(similar[0], "five");
291
assertEquals(similar[1], "nine");
293
similar = spellChecker.suggestSimilar("fi", 2);
294
assertEquals(1, similar.length);
295
assertEquals(similar[0], "five");
297
// test restraint to a field
298
similar = spellChecker.suggestSimilar("tousand", 10, r, "field1",
299
SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
300
assertEquals(0, similar.length); // there isn't the term thousand in the field field1
302
similar = spellChecker.suggestSimilar("tousand", 10, r, "field2",
303
SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
304
assertEquals(1, similar.length); // there is the term thousand in the field field2
306
similar = spellChecker.suggestSimilar("onety", 2);
307
assertEquals(2, similar.length);
308
assertEquals(similar[0], "ninety");
309
assertEquals(similar[1], "one");
311
similar = spellChecker.suggestSimilar("tousand", 10, r, null,
312
SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
313
} catch (NullPointerException e) {
314
assertTrue("threw an NPE, and it shouldn't have", false);
318
private void checkJaroWinklerSuggestions() throws IOException {
319
String[] similar = spellChecker.suggestSimilar("onety", 2);
320
assertEquals(2, similar.length);
321
assertEquals(similar[0], "one");
322
assertEquals(similar[1], "ninety");
325
private void checkNGramSuggestions() throws IOException {
326
String[] similar = spellChecker.suggestSimilar("onety", 2);
327
assertEquals(2, similar.length);
328
assertEquals(similar[0], "one");
329
assertEquals(similar[1], "ninety");
332
private void addwords(IndexReader r, SpellChecker sc, String field) throws IOException {
333
long time = System.currentTimeMillis();
334
sc.indexDictionary(new LuceneDictionary(r, field), newIndexWriterConfig(TEST_VERSION_CURRENT, null), false);
335
time = System.currentTimeMillis() - time;
336
//System.out.println("time to build " + field + ": " + time);
339
private int numdoc() throws IOException {
340
IndexReader rs = IndexReader.open(spellindex, true);
341
int num = rs.numDocs();
342
assertTrue(num != 0);
343
//System.out.println("num docs: " + num);
348
public void testClose() throws IOException {
349
IndexReader r = IndexReader.open(userindex, true);
350
spellChecker.clearIndex();
351
String field = "field1";
352
addwords(r, spellChecker, "field1");
353
int num_field1 = this.numdoc();
354
addwords(r, spellChecker, "field2");
355
int num_field2 = this.numdoc();
356
assertEquals(num_field2, num_field1 + 1);
357
checkCommonSuggestions(r);
358
assertLastSearcherOpen(4);
359
spellChecker.close();
360
assertSearchersClosed();
362
spellChecker.close();
363
fail("spellchecker was already closed");
364
} catch (AlreadyClosedException e) {
368
checkCommonSuggestions(r);
369
fail("spellchecker was already closed");
370
} catch (AlreadyClosedException e) {
375
spellChecker.clearIndex();
376
fail("spellchecker was already closed");
377
} catch (AlreadyClosedException e) {
382
spellChecker.indexDictionary(new LuceneDictionary(r, field), newIndexWriterConfig(TEST_VERSION_CURRENT, null), false);
383
fail("spellchecker was already closed");
384
} catch (AlreadyClosedException e) {
389
spellChecker.setSpellIndex(spellindex);
390
fail("spellchecker was already closed");
391
} catch (AlreadyClosedException e) {
394
assertEquals(4, searchers.size());
395
assertSearchersClosed();
400
* tests if the internally shared indexsearcher is correctly closed
401
* when the spellchecker is concurrently accessed and closed.
403
public void testConcurrentAccess() throws IOException, InterruptedException {
404
assertEquals(1, searchers.size());
405
final IndexReader r = IndexReader.open(userindex, true);
406
spellChecker.clearIndex();
407
assertEquals(2, searchers.size());
408
addwords(r, spellChecker, "field1");
409
assertEquals(3, searchers.size());
410
int num_field1 = this.numdoc();
411
addwords(r, spellChecker, "field2");
412
assertEquals(4, searchers.size());
413
int num_field2 = this.numdoc();
414
assertEquals(num_field2, num_field1 + 1);
415
int numThreads = 5 + this.random.nextInt(5);
416
ExecutorService executor = Executors.newFixedThreadPool(numThreads);
417
SpellCheckWorker[] workers = new SpellCheckWorker[numThreads];
418
for (int i = 0; i < numThreads; i++) {
419
SpellCheckWorker spellCheckWorker = new SpellCheckWorker(r);
420
executor.execute(spellCheckWorker);
421
workers[i] = spellCheckWorker;
424
int iterations = 5 + random.nextInt(5);
425
for (int i = 0; i < iterations; i++) {
427
// concurrently reset the spell index
428
spellChecker.setSpellIndex(this.spellindex);
429
// for debug - prints the internal open searchers
430
// showSearchersOpen();
433
spellChecker.close();
435
// wait for 60 seconds - usually this is very fast but coverage runs could take quite long
436
executor.awaitTermination(60L, TimeUnit.SECONDS);
438
for (int i = 0; i < workers.length; i++) {
439
assertFalse(String.format("worker thread %d failed", i), workers[i].failed);
440
assertTrue(String.format("worker thread %d is still running but should be terminated", i), workers[i].terminated);
442
// 4 searchers more than iterations
445
// 2. and 3. during addwords
446
assertEquals(iterations + 4, searchers.size());
447
assertSearchersClosed();
451
private void assertLastSearcherOpen(int numSearchers) {
452
assertEquals(numSearchers, searchers.size());
453
IndexSearcher[] searcherArray = searchers.toArray(new IndexSearcher[0]);
454
for (int i = 0; i < searcherArray.length; i++) {
455
if (i == searcherArray.length - 1) {
456
assertTrue("expected last searcher open but was closed",
457
searcherArray[i].getIndexReader().getRefCount() > 0);
459
assertFalse("expected closed searcher but was open - Index: " + i,
460
searcherArray[i].getIndexReader().getRefCount() > 0);
465
private void assertSearchersClosed() {
466
for (IndexSearcher searcher : searchers) {
467
assertEquals(0, searcher.getIndexReader().getRefCount());
472
// private void showSearchersOpen() {
474
// for (IndexSearcher searcher : searchers) {
475
// if(searcher.getIndexReader().getRefCount() > 0)
478
// System.out.println(count);
482
private class SpellCheckWorker implements Runnable {
483
private final IndexReader reader;
484
volatile boolean terminated = false;
485
volatile boolean failed = false;
487
SpellCheckWorker(IndexReader reader) {
489
this.reader = reader;
496
checkCommonSuggestions(reader);
497
} catch (AlreadyClosedException e) {
500
} catch (Throwable e) {
514
class SpellCheckerMock extends SpellChecker {
515
public SpellCheckerMock(Directory spellIndex) throws IOException {
519
public SpellCheckerMock(Directory spellIndex, StringDistance sd)
521
super(spellIndex, sd);
524
public SpellCheckerMock(Directory spellIndex, StringDistance sd, Comparator<SuggestWord> comparator) throws IOException {
525
super(spellIndex, sd, comparator);
529
IndexSearcher createSearcher(Directory dir) throws IOException {
530
IndexSearcher searcher = super.createSearcher(dir);
531
TestSpellChecker.this.searchers.add(searcher);