1
package org.apache.solr.spelling;
3
import org.apache.lucene.search.spell.StringDistance;
5
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
22
import java.io.IOException;
23
import java.util.Arrays;
24
import java.util.Collection;
25
import java.util.Collections;
26
import java.util.Comparator;
27
import java.util.List;
29
import org.apache.lucene.search.spell.SuggestMode;
30
import org.apache.lucene.search.spell.SuggestWord;
31
import org.apache.lucene.search.spell.SuggestWordFrequencyComparator;
32
import org.apache.lucene.search.spell.SuggestWordQueue;
33
import org.slf4j.Logger;
34
import org.slf4j.LoggerFactory;
36
import org.apache.lucene.analysis.Token;
37
import org.apache.lucene.analysis.WhitespaceAnalyzer;
38
import org.apache.lucene.index.IndexReader;
39
import org.apache.lucene.index.Term;
40
import org.apache.lucene.search.spell.Dictionary;
41
import org.apache.lucene.search.spell.LevensteinDistance;
42
import org.apache.lucene.search.spell.SpellChecker;
43
import org.apache.lucene.store.Directory;
44
import org.apache.lucene.store.FSDirectory;
45
import org.apache.lucene.store.RAMDirectory;
46
import org.apache.solr.common.params.ShardParams;
47
import org.apache.solr.common.params.SolrParams;
48
import org.apache.solr.common.util.NamedList;
49
import org.apache.solr.core.SolrCore;
50
import org.apache.solr.schema.FieldType;
51
import org.apache.solr.search.SolrIndexSearcher;
55
/**
 * Abstract base class for all Lucene-based spell checking implementations.
 *
 * <p>Refer to <a href="http://wiki.apache.org/solr/SpellCheckComponent">SpellCheckComponent</a>
 * for more details.</p>
 */
64
public abstract class AbstractLuceneSpellChecker extends SolrSpellChecker {
65
public static final Logger log = LoggerFactory.getLogger(AbstractLuceneSpellChecker.class);
67
public static final String SPELLCHECKER_ARG_NAME = "spellchecker";
68
public static final String LOCATION = "sourceLocation";
69
public static final String INDEX_DIR = "spellcheckIndexDir";
70
public static final String ACCURACY = "accuracy";
71
public static final String STRING_DISTANCE = "distanceMeasure";
72
public static final String COMPARATOR_CLASS = "comparatorClass";
74
public static final String SCORE_COMP = "score";
75
public static final String FREQ_COMP = "freq";
77
protected org.apache.lucene.search.spell.SpellChecker spellChecker;
79
protected String sourceLocation;
81
* The Directory containing the Spell checking index
83
protected Directory index;
84
protected Dictionary dictionary;
86
public static final int DEFAULT_SUGGESTION_COUNT = 5;
87
protected String indexDir;
88
protected float accuracy = 0.5f;
89
public static final String FIELD = "field";
91
protected StringDistance sd;
94
public String init(NamedList config, SolrCore core) {
95
super.init(config, core);
96
indexDir = (String) config.get(INDEX_DIR);
97
String accuracy = (String) config.get(ACCURACY);
98
//If indexDir is relative then create index inside core.getDataDir()
99
if (indexDir != null) {
100
if (!new File(indexDir).isAbsolute()) {
101
indexDir = core.getDataDir() + File.separator + indexDir;
104
sourceLocation = (String) config.get(LOCATION);
105
String compClass = (String) config.get(COMPARATOR_CLASS);
106
Comparator<SuggestWord> comp = null;
107
if (compClass != null){
108
if (compClass.equalsIgnoreCase(SCORE_COMP)){
109
comp = SuggestWordQueue.DEFAULT_COMPARATOR;
110
} else if (compClass.equalsIgnoreCase(FREQ_COMP)){
111
comp = new SuggestWordFrequencyComparator();
112
} else{//must be a FQCN
113
comp = (Comparator<SuggestWord>) core.getResourceLoader().newInstance(compClass);
116
comp = SuggestWordQueue.DEFAULT_COMPARATOR;
118
String strDistanceName = (String)config.get(STRING_DISTANCE);
119
if (strDistanceName != null) {
120
sd = (StringDistance) core.getResourceLoader().newInstance(strDistanceName);
121
//TODO: Figure out how to configure options. Where's Spring when you need it? Or at least BeanUtils...
123
sd = new LevensteinDistance();
127
spellChecker = new SpellChecker(index, sd, comp);
128
} catch (IOException e) {
129
throw new RuntimeException(e);
131
if (accuracy != null) {
133
this.accuracy = Float.parseFloat(accuracy);
134
spellChecker.setAccuracy(this.accuracy);
135
} catch (NumberFormatException e) {
136
throw new RuntimeException(
137
"Unparseable accuracy given for dictionary: " + name, e);
144
* Kept around for back compatibility purposes.
146
* @param tokens The Tokens to be spell checked.
147
* @param reader The (optional) IndexReader. If there is not IndexReader, than extendedResults are not possible
148
* @param count The maximum number of suggestions to return
149
* @param onlyMorePopular TODO
150
* @param extendedResults TODO
151
* @throws IOException
154
public SpellingResult getSuggestions(Collection<Token> tokens, IndexReader reader, int count, boolean onlyMorePopular, boolean extendedResults) throws IOException {
155
return getSuggestions(new SpellingOptions(tokens, reader, count, onlyMorePopular, extendedResults, spellChecker.getAccuracy(), null));
159
public SpellingResult getSuggestions(SpellingOptions options) throws IOException {
160
boolean shardRequest = false;
161
SolrParams params = options.customParams;
164
shardRequest = "true".equals(params.get(ShardParams.IS_SHARD));
166
SpellingResult result = new SpellingResult(options.tokens);
167
IndexReader reader = determineReader(options.reader);
168
Term term = field != null ? new Term(field, "") : null;
169
float theAccuracy = (options.accuracy == Float.MIN_VALUE) ? spellChecker.getAccuracy() : options.accuracy;
171
int count = Math.max(options.count, AbstractLuceneSpellChecker.DEFAULT_SUGGESTION_COUNT);
172
SuggestMode mode = options.onlyMorePopular ? SuggestMode.SUGGEST_MORE_POPULAR : SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX;
173
for (Token token : options.tokens) {
174
String tokenText = new String(token.buffer(), 0, token.length());
175
String[] suggestions = spellChecker.suggestSimilar(tokenText,
177
field != null ? reader : null, //workaround LUCENE-1295
180
if (suggestions.length == 1 && suggestions[0].equals(tokenText)) {
181
//These are spelled the same, continue on
185
if (options.extendedResults == true && reader != null && field != null) {
186
term = term.createTerm(tokenText);
187
result.addFrequency(token, reader.docFreq(term));
188
int countLimit = Math.min(options.count, suggestions.length);
191
for (int i = 0; i < countLimit; i++) {
192
term = term.createTerm(suggestions[i]);
193
result.add(token, suggestions[i], reader.docFreq(term));
195
} else if(shardRequest) {
196
List<String> suggList = Collections.emptyList();
197
result.add(token, suggList);
200
if (suggestions.length > 0) {
201
List<String> suggList = Arrays.asList(suggestions);
202
if (suggestions.length > options.count) {
203
suggList = suggList.subList(0, options.count);
205
result.add(token, suggList);
206
} else if(shardRequest) {
207
List<String> suggList = Collections.emptyList();
208
result.add(token, suggList);
215
protected IndexReader determineReader(IndexReader reader) {
220
public void reload(SolrCore core, SolrIndexSearcher searcher) throws IOException {
221
spellChecker.setSpellIndex(index);
226
* Initialize the {@link #index} variable based on the {@link #indexDir}. Does not actually create the spelling index.
228
* @throws IOException
230
protected void initIndex() throws IOException {
231
if (indexDir != null) {
232
index = FSDirectory.open(new File(indexDir));
234
index = new RAMDirectory();
239
* @return the Accuracy used for the Spellchecker
241
public float getAccuracy() {
246
* @return the Field used
249
public String getField() {
255
* @return the FieldType name.
257
public String getFieldTypeName() {
258
return fieldTypeName;
263
* @return the Index directory
265
public String getIndexDir() {
270
* @return the location of the source
272
public String getSourceLocation() {
273
return sourceLocation;
276
public StringDistance getStringDistance() {
280
public SpellChecker getSpellChecker() {