package org.apache.lucene.analysis.query;
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.Version;

import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;

/**
 * An {@link Analyzer} used primarily at query time to wrap another analyzer and provide a layer of protection
 * which prevents very common words from being passed into queries.
 * <p>
 * For very large indexes the cost
 * of reading TermDocs for a very common word can be high. This analyzer was created after experience with
 * a 38 million doc index which had a term in around 50% of docs and was causing TermQueries for
 * this term to take 2 seconds.
 * </p>
 * <p>
 * Use the various "addStopWords" methods in this class to automate the identification and addition of
 * stop words found in an already existing index.
 * </p>
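 * <p>
 * A minimal query-time setup sketch (illustrative only; the directory variable,
 * the StandardAnalyzer delegate and the "description" field below are assumptions,
 * not part of this class):
 * </p>
 * <pre>
 *   IndexReader reader = IndexReader.open(directory, true); // read-only reader
 *   // wrap the analyzer normally used at query time; terms appearing in more than
 *   // the default 40% of documents in any indexed field become stop words
 *   Analyzer queryAnalyzer = new QueryAutoStopWordAnalyzer(
 *       Version.LUCENE_CURRENT, new StandardAnalyzer(Version.LUCENE_CURRENT), reader);
 *   TokenStream ts = queryAnalyzer.tokenStream("description", new StringReader("a common query"));
 * </pre>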
 */
public final class QueryAutoStopWordAnalyzer extends Analyzer {

  private final Analyzer delegate;
  private final Map<String, Set<String>> stopWordsPerField = new HashMap<String, Set<String>>();
  //The default maximum percentage (40%) of index documents which
  //can contain a term, after which the term is considered to be a stop word.
  public static final float defaultMaxDocFreqPercent = 0.4f;
  private final Version matchVersion;

  /**
   * Initializes this analyzer with the Analyzer object that actually produces the tokens
   *
   * @param delegate The choice of {@link Analyzer} that is used to produce the token stream which needs filtering
   * @deprecated Stopwords should be calculated at instantiation using one of the other constructors
   */
  @Deprecated
  public QueryAutoStopWordAnalyzer(Version matchVersion, Analyzer delegate) {
    this.delegate = delegate;
    this.matchVersion = matchVersion;
  }

  /**
   * Creates a new QueryAutoStopWordAnalyzer with stopwords calculated for all
   * indexed fields from terms with a document frequency percentage greater than
   * {@link #defaultMaxDocFreqPercent}
   *
   * @param matchVersion Version to be used in {@link StopFilter}
   * @param delegate Analyzer whose TokenStream will be filtered
   * @param indexReader IndexReader to identify the stopwords from
   * @throws IOException Can be thrown while reading from the IndexReader
   */
  public QueryAutoStopWordAnalyzer(
      Version matchVersion,
      Analyzer delegate,
      IndexReader indexReader) throws IOException {
    this(matchVersion, delegate, indexReader, defaultMaxDocFreqPercent);
  }

  /**
   * Creates a new QueryAutoStopWordAnalyzer with stopwords calculated for all
   * indexed fields from terms with a document frequency greater than the given
   * maxDocFreq
   *
   * @param matchVersion Version to be used in {@link StopFilter}
   * @param delegate Analyzer whose TokenStream will be filtered
   * @param indexReader IndexReader to identify the stopwords from
   * @param maxDocFreq Document frequency terms should be above in order to be stopwords
   * @throws IOException Can be thrown while reading from the IndexReader
   */
  public QueryAutoStopWordAnalyzer(
      Version matchVersion,
      Analyzer delegate,
      IndexReader indexReader,
      int maxDocFreq) throws IOException {
    this(matchVersion, delegate, indexReader, indexReader.getFieldNames(IndexReader.FieldOption.INDEXED), maxDocFreq);
  }

  /**
   * Creates a new QueryAutoStopWordAnalyzer with stopwords calculated for all
   * indexed fields from terms with a document frequency percentage greater than
   * the given maxPercentDocs
   *
   * @param matchVersion Version to be used in {@link StopFilter}
   * @param delegate Analyzer whose TokenStream will be filtered
   * @param indexReader IndexReader to identify the stopwords from
   * @param maxPercentDocs The maximum percentage (between 0.0 and 1.0) of index documents which
   *                       contain a term, after which the word is considered to be a stop word
   * @throws IOException Can be thrown while reading from the IndexReader
   */
  public QueryAutoStopWordAnalyzer(
      Version matchVersion,
      Analyzer delegate,
      IndexReader indexReader,
      float maxPercentDocs) throws IOException {
    this(matchVersion, delegate, indexReader, indexReader.getFieldNames(IndexReader.FieldOption.INDEXED), maxPercentDocs);
  }

  /**
   * Creates a new QueryAutoStopWordAnalyzer with stopwords calculated for the
   * given selection of fields from terms with a document frequency percentage
   * greater than the given maxPercentDocs
   *
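   * <p>
   * Illustrative sketch only (the indexReader and delegate variables and the
   * field names below are assumptions, not part of this API): restrict stop word
   * detection to two large text fields and use a 25% threshold.
   * </p>
   * <pre>
   *   Analyzer wrapped = new QueryAutoStopWordAnalyzer(
   *       Version.LUCENE_CURRENT, delegate, indexReader,
   *       Arrays.asList("title", "body"), 0.25f);
   * </pre>
   *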
   * @param matchVersion Version to be used in {@link StopFilter}
   * @param delegate Analyzer whose TokenStream will be filtered
   * @param indexReader IndexReader to identify the stopwords from
   * @param fields Selection of fields to calculate stopwords for
   * @param maxPercentDocs The maximum percentage (between 0.0 and 1.0) of index documents which
   *                       contain a term, after which the word is considered to be a stop word
   * @throws IOException Can be thrown while reading from the IndexReader
   */
  public QueryAutoStopWordAnalyzer(
      Version matchVersion,
      Analyzer delegate,
      IndexReader indexReader,
      Collection<String> fields,
      float maxPercentDocs) throws IOException {
    this(matchVersion, delegate, indexReader, fields, (int) (indexReader.numDocs() * maxPercentDocs));
  }

  /**
   * Creates a new QueryAutoStopWordAnalyzer with stopwords calculated for the
   * given selection of fields from terms with a document frequency greater than
   * the given maxDocFreq
   *
   * @param matchVersion Version to be used in {@link StopFilter}
   * @param delegate Analyzer whose TokenStream will be filtered
   * @param indexReader IndexReader to identify the stopwords from
   * @param fields Selection of fields to calculate stopwords for
   * @param maxDocFreq Document frequency terms should be above in order to be stopwords
   * @throws IOException Can be thrown while reading from the IndexReader
   */
  public QueryAutoStopWordAnalyzer(
      Version matchVersion,
      Analyzer delegate,
      IndexReader indexReader,
      Collection<String> fields,
      int maxDocFreq) throws IOException {
    this.matchVersion = matchVersion;
    this.delegate = delegate;

    for (String field : fields) {
      Set<String> stopWords = new HashSet<String>();
      String internedFieldName = StringHelper.intern(field);
      // terms(new Term(field)) positions the enumeration at the first term of this field
      TermEnum te = indexReader.terms(new Term(field));
      Term term = te.term();
      while (term != null) {
        if (term.field() != internedFieldName) {
          // the enumeration has moved past this field's terms; field names are
          // interned, so reference comparison is sufficient
          break;
        }
        if (te.docFreq() > maxDocFreq) {
          stopWords.add(term.text());
        }
        if (!te.next()) {
          break;
        }
        term = te.term();
      }
      stopWordsPerField.put(field, stopWords);
    }
  }

  /**
   * Automatically adds stop words for all fields with terms exceeding the defaultMaxDocFreqPercent
   *
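   * <p>
   * Sketch of this older, deprecated usage path (the reader and delegate
   * variables below are assumptions):
   * </p>
   * <pre>
   *   QueryAutoStopWordAnalyzer a =
   *       new QueryAutoStopWordAnalyzer(Version.LUCENE_CURRENT, delegate);
   *   int numStopWords = a.addStopWords(reader);
   * </pre>
   *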
   * @param reader The {@link IndexReader} which will be consulted to identify potential stop words that
   *               exceed the required document frequency
   * @return The number of stop words identified.
   * @throws IOException Can be thrown while reading from the IndexReader
   * @deprecated Stopwords should be calculated at instantiation using
   *             {@link #QueryAutoStopWordAnalyzer(Version, Analyzer, IndexReader)}
   */
  @Deprecated
  public int addStopWords(IndexReader reader) throws IOException {
    return addStopWords(reader, defaultMaxDocFreqPercent);
  }

  /**
   * Automatically adds stop words for all fields with terms exceeding the given maxDocFreq
   *
   * @param reader The {@link IndexReader} which will be consulted to identify potential stop words that
   *               exceed the required document frequency
   * @param maxDocFreq The maximum number of index documents which can contain a term, after which
   *                   the term is considered to be a stop word
   * @return The number of stop words identified.
   * @throws IOException Can be thrown while reading from the IndexReader
   * @deprecated Stopwords should be calculated at instantiation using
   *             {@link #QueryAutoStopWordAnalyzer(Version, Analyzer, IndexReader, int)}
   */
  @Deprecated
  public int addStopWords(IndexReader reader, int maxDocFreq) throws IOException {
    int numStopWords = 0;
    Collection<String> fieldNames = reader.getFieldNames(IndexReader.FieldOption.INDEXED);
    for (Iterator<String> iter = fieldNames.iterator(); iter.hasNext();) {
      String fieldName = iter.next();
      numStopWords += addStopWords(reader, fieldName, maxDocFreq);
    }
    return numStopWords;
  }

  /**
   * Automatically adds stop words for all fields with terms exceeding the given maxPercentDocs
   *
   * @param reader The {@link IndexReader} which will be consulted to identify potential stop words that
   *               exceed the required document frequency
   * @param maxPercentDocs The maximum percentage (between 0.0 and 1.0) of index documents which
   *                       contain a term, after which the word is considered to be a stop word.
   * @return The number of stop words identified.
   * @throws IOException Can be thrown while reading from the IndexReader
   * @deprecated Stopwords should be calculated at instantiation using
   *             {@link #QueryAutoStopWordAnalyzer(Version, Analyzer, IndexReader, float)}
   */
  @Deprecated
  public int addStopWords(IndexReader reader, float maxPercentDocs) throws IOException {
    int numStopWords = 0;
    Collection<String> fieldNames = reader.getFieldNames(IndexReader.FieldOption.INDEXED);
    for (Iterator<String> iter = fieldNames.iterator(); iter.hasNext();) {
      String fieldName = iter.next();
      numStopWords += addStopWords(reader, fieldName, maxPercentDocs);
    }
    return numStopWords;
  }

  /**
   * Automatically adds stop words for the given field with terms exceeding the given maxPercentDocs
   *
   * @param reader The {@link IndexReader} which will be consulted to identify potential stop words that
   *               exceed the required document frequency
   * @param fieldName The field for which stopwords will be added
   * @param maxPercentDocs The maximum percentage (between 0.0 and 1.0) of index documents which
   *                       contain a term, after which the word is considered to be a stop word.
   * @return The number of stop words identified.
   * @throws IOException Can be thrown while reading from the IndexReader
   * @deprecated Stopwords should be calculated at instantiation using
   *             {@link #QueryAutoStopWordAnalyzer(Version, Analyzer, IndexReader, Collection, float)}
   */
  @Deprecated
  public int addStopWords(IndexReader reader, String fieldName, float maxPercentDocs) throws IOException {
    return addStopWords(reader, fieldName, (int) (reader.numDocs() * maxPercentDocs));
  }

  /**
   * Automatically adds stop words for the given field with terms exceeding the given maxDocFreq
   *
   * @param reader The {@link IndexReader} which will be consulted to identify potential stop words that
   *               exceed the required document frequency
   * @param fieldName The field for which stopwords will be added
   * @param maxDocFreq The maximum number of index documents which
   *                   can contain a term, after which the term is considered to be a stop word.
   * @return The number of stop words identified.
   * @throws IOException Can be thrown while reading from the IndexReader
   * @deprecated Stopwords should be calculated at instantiation using
   *             {@link #QueryAutoStopWordAnalyzer(Version, Analyzer, IndexReader, Collection, int)}
   */
  @Deprecated
  public int addStopWords(IndexReader reader, String fieldName, int maxDocFreq) throws IOException {
    HashSet<String> stopWords = new HashSet<String>();
    String internedFieldName = StringHelper.intern(fieldName);
    TermEnum te = reader.terms(new Term(fieldName));
    Term term = te.term();
    while (term != null) {
      if (term.field() != internedFieldName) {
        // the enumeration has moved past this field's terms
        break;
      }
      if (te.docFreq() > maxDocFreq) {
        stopWords.add(term.text());
      }
      if (!te.next()) {
        break;
      }
      term = te.term();
    }
    stopWordsPerField.put(fieldName, stopWords);

    /* if the stopwords for a field are changed,
     * then saved streams for that field are erased.
     */
    @SuppressWarnings("unchecked")
    Map<String,SavedStreams> streamMap = (Map<String,SavedStreams>) getPreviousTokenStream();
    if (streamMap != null)
      streamMap.remove(fieldName);

    return stopWords.size();
  }

  @Override
  public TokenStream tokenStream(String fieldName, Reader reader) {
    TokenStream result;
    try {
      result = delegate.reusableTokenStream(fieldName, reader);
    } catch (IOException e) {
      result = delegate.tokenStream(fieldName, reader);
    }
    Set<String> stopWords = stopWordsPerField.get(fieldName);
    if (stopWords != null) {
      result = new StopFilter(matchVersion, result, stopWords);
    }
    return result;
  }

  private class SavedStreams {
    /* the underlying stream */
    TokenStream wrapped;

    /*
     * when there are no stopwords for the field, refers to wrapped.
     * if there are stopwords, it is a StopFilter around wrapped.
     */
    TokenStream withStopFilter;
  }

  @SuppressWarnings("unchecked")
  @Override
  public TokenStream reusableTokenStream(String fieldName, Reader reader)
      throws IOException {
    /* map of SavedStreams for each field */
    Map<String,SavedStreams> streamMap = (Map<String,SavedStreams>) getPreviousTokenStream();
    if (streamMap == null) {
      streamMap = new HashMap<String, SavedStreams>();
      setPreviousTokenStream(streamMap);
    }

    SavedStreams streams = streamMap.get(fieldName);
    if (streams == null) {
      /* an entry for this field does not exist, create one */
      streams = new SavedStreams();
      streamMap.put(fieldName, streams);
      streams.wrapped = delegate.reusableTokenStream(fieldName, reader);

      /* if there are any stopwords for the field, save the stopfilter */
      Set<String> stopWords = stopWordsPerField.get(fieldName);
      if (stopWords != null) {
        streams.withStopFilter = new StopFilter(matchVersion, streams.wrapped, stopWords);
      } else {
        streams.withStopFilter = streams.wrapped;
      }
    } else {
      /*
       * an entry for this field exists, verify the wrapped stream has not
       * changed. if it has not, reuse it, otherwise wrap the new stream.
       */
      TokenStream result = delegate.reusableTokenStream(fieldName, reader);
      if (result == streams.wrapped) {
        /* the wrapped analyzer reused the stream */
      } else {
        /*
         * the wrapped analyzer did not. if there are any stopwords for the
         * field, create a new StopFilter around the new stream
         */
        streams.wrapped = result;
        Set<String> stopWords = stopWordsPerField.get(fieldName);
        if (stopWords != null) {
          streams.withStopFilter = new StopFilter(matchVersion, streams.wrapped, stopWords);
        } else {
          streams.withStopFilter = streams.wrapped;
        }
      }
    }

    return streams.withStopFilter;
  }

  /**
   * Provides information on which stop words have been identified for a field
   *
   * @param fieldName The field for which stop words identified in "addStopWords"
   *                  method calls will be returned
   * @return the stop words identified for a field
   */
  public String[] getStopWords(String fieldName) {
    Set<String> stopWords = stopWordsPerField.get(fieldName);
    return stopWords != null ? stopWords.toArray(new String[stopWords.size()]) : new String[0];
  }

  /**
   * Provides information on which stop words have been identified for all fields
   *
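   * <p>
   * A minimal inspection sketch (the analyzer variable below is assumed to be a
   * QueryAutoStopWordAnalyzer that has already computed its stop words; the
   * fields and terms printed depend entirely on the index):
   * </p>
   * <pre>
   *   for (Term stopTerm : analyzer.getStopWords()) {
   *     System.out.println(stopTerm.field() + ": " + stopTerm.text());
   *   }
   * </pre>
   *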
   * @return the stop words (as terms)
   */
  public Term[] getStopWords() {
    List<Term> allStopWords = new ArrayList<Term>();
    for (String fieldName : stopWordsPerField.keySet()) {
      Set<String> stopWords = stopWordsPerField.get(fieldName);
      for (String text : stopWords) {
        allStopWords.add(new Term(fieldName, text));
      }
    }
    return allStopWords.toArray(new Term[allStopWords.size()]);
  }
}