2
* Licensed to the Apache Software Foundation (ASF) under one or more
3
* contributor license agreements. See the NOTICE file distributed with
4
* this work for additional information regarding copyright ownership.
5
* The ASF licenses this file to You under the Apache License, Version 2.0
6
* (the "License"); you may not use this file except in compliance with
7
* the License. You may obtain a copy of the License at
9
* http://www.apache.org/licenses/LICENSE-2.0
11
* Unless required by applicable law or agreed to in writing, software
12
* distributed under the License is distributed on an "AS IS" BASIS,
13
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
* See the License for the specific language governing permissions and
15
* limitations under the License.
18
package org.apache.solr.handler;
20
import org.apache.lucene.index.IndexReader;
21
import org.apache.lucene.index.IndexWriterConfig;
22
import org.apache.lucene.index.Term;
23
import org.apache.lucene.search.IndexSearcher;
24
import org.apache.lucene.search.spell.Dictionary;
25
import org.apache.lucene.search.spell.SpellChecker;
26
import org.apache.lucene.search.spell.HighFrequencyDictionary;
27
import org.apache.lucene.store.Directory;
28
import org.apache.lucene.store.FSDirectory;
29
import org.apache.lucene.store.RAMDirectory;
30
import org.apache.solr.request.SolrQueryRequest;
31
import org.apache.solr.response.SolrQueryResponse;
32
import org.apache.solr.common.SolrException;
33
import org.apache.solr.common.params.SolrParams;
34
import org.apache.solr.common.util.NamedList;
35
import org.apache.solr.common.util.SimpleOrderedMap;
36
import org.apache.solr.core.SolrCore;
37
import org.apache.solr.util.plugin.SolrCoreAware;
40
import java.io.IOException;
42
import java.util.Arrays;
43
import org.slf4j.Logger;
44
import org.slf4j.LoggerFactory;
47
* Takes a string (e.g. a query string) as the value of the "q" parameter
48
* and looks up alternative spelling suggestions in the spellchecker.
49
* The spellchecker used by this handler is the Lucene contrib SpellChecker.
54
border: 1pt solid #AEBDCC;
55
background-color: #F3F5F7;
57
font-family: courier, monospace;
59
// begin css 3 or browser specific rules - do not remove!
60
//see: http://forums.techguy.org/archive/index.php/t-249849.html
61
white-space: pre-wrap;
62
word-wrap: break-word;
63
white-space: -moz-pre-wrap;
64
white-space: -pre-wrap;
65
white-space: -o-pre-wrap;
66
// end css 3 or browser specific rules
71
* <p>The result identifies the original words, echoing them as an entry with the
72
* name of "words" and original word value. It
73
* also identifies if the requested "words" is contained in the index through
74
* the use of the exist true/false name value. Examples of these output
75
* parameters in the standard output format are as follows:</p>
77
<str name="words">facial</str>
78
<str name="exist">true</str> </pre>
80
* <p>If a query string parameter of "extendedResults" is used, then each word within the
81
* "q" parameter (separated by a space or +) will
82
* be iterated through the spell checker and will be wrapped in an
83
* NamedList. Each word will then get its own set of results: words, exists, and
85
* <P><b>NOTE</b>: Query terms are simply split on whitespace when using extendedResults mode. This may not be adequate.
86
* See the {@link org.apache.solr.handler.component.SpellCheckComponent} for alternatives.
88
* <p>Also note that multiword queries will be treated as a single term if extendedResults is false. This may or may not make sense
89
* depending on how the spelling field was indexed.</p>
91
* <p>Examples of the use of the standard output (XML) without and with the
92
* use of the "extendedResults" parameter are as follows.</p>
94
* <p> The following URL
95
* examples were configured with the solr.SpellCheckerRequestHandler
96
* named as "/spellchecker".</p>
98
* <p>Without the use of "extendedResults" and one word
99
* spelled correctly: facial </p>
100
* <pre class="code">http://.../spellchecker?indent=on&onlyMorePopular=true&accuracy=.6&suggestionCount=20&q=facial</pre>
102
<?xml version="1.0" encoding="UTF-8"?>
105
<lst name="responseHeader">
106
<int name="status">0</int>
107
<int name="QTime">6</int>
109
<str name="words">facial</str>
110
<str name="exist">true</str>
111
<arr name="suggestions">
112
<str>faciale</str>
113
<str>faucial</str>
114
<str>fascial</str>
115
<str>facing</str>
116
<str>faciei</str>
117
<str>facialis</str>
118
<str>social</str>
119
<str>facile</str>
120
<str>spacial</str>
121
<str>glacial</str>
122
<str>marcial</str>
123
<str>facies</str>
124
<str>facio</str>
126
</response> </pre>
128
* <p>Without the use of "extendedResults" and two words,
129
* one spelled correctly and one misspelled: facial salophosphoprotein </p>
130
* <pre class="code">http://.../spellchecker?indent=on&onlyMorePopular=true&accuracy=.6&suggestionCount=20&q=facial+salophosphoprotein</pre>
132
<?xml version="1.0" encoding="UTF-8"?>
135
<lst name="responseHeader">
136
<int name="status">0</int>
137
<int name="QTime">18</int>
139
<str name="words">facial salophosphoprotein</str>
140
<str name="exist">false</str>
141
<arr name="suggestions">
142
<str>sialophosphoprotein</str>
144
</response> </pre>
147
* <p>With the use of "extendedResults" and two words,
148
* one spelled correctly and one misspelled: facial salophosphoprotein </p>
149
* <pre class="code">http://.../spellchecker?indent=on&onlyMorePopular=true&accuracy=.6&suggestionCount=20&extendedResults=true&q=facial+salophosphoprotein</pre>
151
<?xml version="1.0" encoding="UTF-8"?>
154
<lst name="responseHeader">
155
<int name="status">0</int>
156
<int name="QTime">23</int>
158
<lst name="result">
159
<lst name="facial">
160
<int name="frequency">1</int>
161
<lst name="suggestions">
162
<lst name="faciale"><int name="frequency">1</int></lst>
163
<lst name="faucial"><int name="frequency">1</int></lst>
164
<lst name="fascial"><int name="frequency">1</int></lst>
165
<lst name="facing"><int name="frequency">1</int></lst>
166
<lst name="faciei"><int name="frequency">1</int></lst>
167
<lst name="facialis"><int name="frequency">1</int></lst>
168
<lst name="social"><int name="frequency">1</int></lst>
169
<lst name="facile"><int name="frequency">1</int></lst>
170
<lst name="spacial"><int name="frequency">1</int></lst>
171
<lst name="glacial"><int name="frequency">1</int></lst>
172
<lst name="marcial"><int name="frequency">1</int></lst>
173
<lst name="facies"><int name="frequency">1</int></lst>
174
<lst name="facio"><int name="frequency">1</int></lst>
177
<lst name="salophosphoprotein">
178
<int name="frequency">0</int>
179
<lst name="suggestions">
180
<lst name="sialophosphoprotein"><int name="frequency">1</int></lst>
181
<lst name="phosphoprotein"><int name="frequency">1</int></lst>
182
<lst name="phosphoproteins"><int name="frequency">1</int></lst>
183
<lst name="alphalipoprotein"><int name="frequency">1</int></lst>
187
</response> </pre>
190
* @see <a href="http://wiki.apache.org/jakarta-lucene/SpellChecker">The Lucene Spellchecker documentation</a>
193
* @deprecated Use {@link org.apache.solr.handler.component.SpellCheckComponent} instead.
195
* See also https://issues.apache.org/jira/browse/SOLR-474 and https://issues.apache.org/jira/browse/SOLR-485
199
public class SpellCheckerRequestHandler extends RequestHandlerBase implements SolrCoreAware {
201
// Class-wide SLF4J logger; declared final so the shared static reference can never be reassigned.
private static final Logger log = LoggerFactory.getLogger(SpellCheckerRequestHandler.class);
203
private SpellChecker spellChecker;
206
* From http://wiki.apache.org/jakarta-lucene/SpellChecker
207
* If reader and restrictToField are both not null:
208
* 1. The returned words are restricted only to the words present in the field
209
* "restrictToField" of the Lucene Index "reader".
211
* 2. The list is also sorted with a second criterion: the popularity (the
212
* frequency) of the word in the user field.
214
* 3. If "onlyMorePopular" is true and the misspelled word exists in the user field,
215
* return only the words more frequent than this.
219
protected Directory spellcheckerIndexDir = new RAMDirectory();
220
protected String dirDescription = "(ramdir)";
221
protected String termSourceField;
223
protected static final String PREFIX = "sp.";
224
protected static final String QUERY_PREFIX = PREFIX + "query.";
225
protected static final String DICTIONARY_PREFIX = PREFIX + "dictionary.";
227
protected static final String SOURCE_FIELD = DICTIONARY_PREFIX + "termSourceField";
228
protected static final String INDEX_DIR = DICTIONARY_PREFIX + "indexDir";
229
protected static final String THRESHOLD = DICTIONARY_PREFIX + "threshold";
231
protected static final String ACCURACY = QUERY_PREFIX + "accuracy";
232
protected static final String SUGGESTIONS = QUERY_PREFIX + "suggestionCount";
233
protected static final String POPULAR = QUERY_PREFIX + "onlyMorePopular";
234
protected static final String EXTENDED = QUERY_PREFIX + "extendedResults";
236
protected static final float DEFAULT_ACCURACY = 0.5f;
237
protected static final int DEFAULT_SUGGESTION_COUNT = 1;
238
protected static final boolean DEFAULT_MORE_POPULAR = false;
239
protected static final boolean DEFAULT_EXTENDED_RESULTS = false;
240
protected static final float DEFAULT_DICTIONARY_THRESHOLD = 0.0f;
242
protected SolrParams args = null;
245
public void init(NamedList args) {
247
this.args = SolrParams.toSolrParams(args);
250
public void inform(SolrCore core)
252
termSourceField = args.get(SOURCE_FIELD, args.get("termSourceField"));
254
String dir = args.get(INDEX_DIR, args.get("spellcheckerIndexDir"));
256
File f = new File(dir);
257
if ( ! f.isAbsolute() ) {
258
f = new File(core.getDataDir(), dir);
260
dirDescription = f.getAbsolutePath();
261
log.info("using spell directory: " + dirDescription);
262
spellcheckerIndexDir = FSDirectory.open(f);
264
log.info("using RAM based spell directory");
266
spellChecker = new SpellChecker(spellcheckerIndexDir);
267
} catch (IOException e) {
268
throw new RuntimeException("Cannot open SpellChecker index", e);
273
* Processes the following query string parameters: q, extendedResults, cmd rebuild,
274
* cmd reopen, accuracy, suggestionCount, restrictToField, and onlyMorePopular.
277
public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp)
279
SolrParams p = req.getParams();
280
String words = p.get("q");
281
String cmd = p.get("cmd");
284
if (cmd.equals("rebuild")) {
286
rsp.add("cmdExecuted","rebuild");
287
} else if (cmd.equals("reopen")) {
289
rsp.add("cmdExecuted","reopen");
291
throw new SolrException( SolrException.ErrorCode.BAD_REQUEST, "Unrecognized Command: " + cmd);
295
// empty query string
296
if (null == words || "".equals(words.trim())) {
300
IndexReader indexReader = null;
301
String suggestionField = null;
304
boolean onlyMorePopular;
305
boolean extendedResults;
307
accuracy = p.getFloat(ACCURACY, p.getFloat("accuracy", DEFAULT_ACCURACY));
308
spellChecker.setAccuracy(accuracy);
309
} catch (NumberFormatException e) {
310
throw new RuntimeException("Accuracy must be a valid positive float", e);
313
numSug = p.getInt(SUGGESTIONS, p.getInt("suggestionCount", DEFAULT_SUGGESTION_COUNT));
314
} catch (NumberFormatException e) {
315
throw new RuntimeException("Spelling suggestion count must be a valid positive integer", e);
318
onlyMorePopular = p.getBool(POPULAR, DEFAULT_MORE_POPULAR);
319
} catch (SolrException e) {
320
throw new RuntimeException("'Only more popular' must be a valid boolean", e);
323
extendedResults = p.getBool(EXTENDED, DEFAULT_EXTENDED_RESULTS);
324
} catch (SolrException e) {
325
throw new RuntimeException("'Extended results' must be a valid boolean", e);
328
// when searching for more popular, a non null index-reader and
329
// restricted-field are required
330
if (onlyMorePopular || extendedResults) {
331
indexReader = req.getSearcher().getReader();
332
suggestionField = termSourceField;
335
if (extendedResults) {
337
rsp.add("numDocs", indexReader.numDocs());
339
SimpleOrderedMap<Object> results = new SimpleOrderedMap<Object>();
340
String[] wordz = words.split(" ");
341
for (String word : wordz)
343
SimpleOrderedMap<Object> nl = new SimpleOrderedMap<Object>();
344
nl.add("frequency", indexReader.docFreq(new Term(suggestionField, word)));
345
String[] suggestions =
346
spellChecker.suggestSimilar(word, numSug,
347
indexReader, suggestionField, onlyMorePopular);
350
NamedList<Object> sa = new NamedList<Object>();
351
for (int i=0; i<suggestions.length; i++) {
353
SimpleOrderedMap<Object> si = new SimpleOrderedMap<Object>();
354
si.add("frequency", indexReader.docFreq(new Term(termSourceField, suggestions[i])));
355
sa.add(suggestions[i], si);
357
nl.add("suggestions", sa);
358
results.add(word, nl);
360
rsp.add( "result", results );
363
rsp.add("words", words);
364
if (spellChecker.exist(words)) {
365
rsp.add("exist","true");
367
rsp.add("exist","false");
369
String[] suggestions =
370
spellChecker.suggestSimilar(words, numSug,
371
indexReader, suggestionField,
374
rsp.add("suggestions", Arrays.asList(suggestions));
378
/** Returns a dictionary to be used when building the spell-checker index.
379
* Override this method to supply a custom dictionary.
381
protected Dictionary getDictionary(SolrQueryRequest req) {
384
threshold = req.getParams().getFloat(THRESHOLD, DEFAULT_DICTIONARY_THRESHOLD);
385
} catch (NumberFormatException e) {
386
throw new RuntimeException("Threshold must be a valid positive float", e);
388
IndexReader indexReader = req.getSearcher().getReader();
389
return new HighFrequencyDictionary(indexReader, termSourceField, threshold);
392
/** Rebuilds the SpellChecker index using values from the <code>termSourceField</code> from the
393
* index pointed to by the current {@link IndexSearcher}.
394
* Any word appearing in fewer than thresh documents will not be added to the spellcheck index.
396
private void rebuild(SolrQueryRequest req) throws IOException, SolrException {
397
if (null == termSourceField) {
398
throw new SolrException
399
(SolrException.ErrorCode.SERVER_ERROR, "can't rebuild spellchecker index without termSourceField configured");
402
Dictionary dictionary = getDictionary(req);
403
spellChecker.clearIndex();
404
spellChecker.indexDictionary(dictionary, new IndexWriterConfig(req.getCore().getSolrConfig().luceneMatchVersion, null), false);
409
* Reopens the SpellChecker index directory.
410
* Useful if an external process is responsible for building
411
* the spell checker index.
413
private void reopen() throws IOException {
414
spellChecker.setSpellIndex(spellcheckerIndexDir);
417
//////////////////////// SolrInfoMBeans methods //////////////////////
420
public String getVersion() {
421
return "$Revision: 1197478 $";
425
public String getDescription() {
426
return "The SpellChecker Solr request handler for SpellChecker index: " + dirDescription;
430
public String getSourceId() {
431
return "$Id: SpellCheckerRequestHandler.java 1197478 2011-11-04 10:10:03Z rmuir $";
435
public String getSource() {
436
return "$URL: http://svn.apache.org/repos/asf/lucene/dev/tags/lucene_solr_3_5_0/solr/core/src/java/org/apache/solr/handler/SpellCheckerRequestHandler.java $";
440
public URL[] getDocs() {