2
* Licensed under the Apache License,
3
* Version 2.0 (the "License"); you may not use this file except in compliance with the License.
4
* You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
5
* Unless required by applicable law or agreed to in writing, software distributed under the License
6
* is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
7
* See the License for the specific language governing permissions and limitations under the License.
10
package org.apache.solr.analysis;
12
import java.io.IOException;
13
import java.util.Arrays;
16
import org.apache.lucene.analysis.CharArraySet;
17
import org.apache.lucene.analysis.TokenFilter;
18
import org.apache.lucene.analysis.TokenStream;
19
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
20
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
21
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
22
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
23
import org.apache.lucene.util.Version;
26
* TODO: Consider implementing https://issues.apache.org/jira/browse/LUCENE-1688 changes to stop list and associated constructors
30
* Construct bigrams for frequently occurring terms while indexing. Single terms
31
* are still indexed too, with bigrams overlaid. This is achieved through the
32
* use of {@link PositionIncrementAttribute#setPositionIncrement(int)}. Bigrams have a type
33
* of {@link #GRAM_TYPE} Example:
35
* <li>input:"the quick brown fox"</li>
36
* <li>output:|"the","the_quick"|"quick"|"brown"|"fox"|</li>
37
* <li>"the_quick" has a position increment of 0 so it is in the same position
38
* as "the"; "the_quick" has a token type of "gram"</li>
44
* Constructors and makeCommonSet based on similar code in StopFilter
46
public final class CommonGramsFilter extends TokenFilter {
48
// Token type set on every bigram produced by gramToken().
static final String GRAM_TYPE = "gram";
49
// Character placed between the two terms of a bigram.
private static final char SEPARATOR = '_';
51
// Terms treated as "common"; consulted by isCommon().
private final CharArraySet commonWords;
53
// Scratch space: saveTermBuffer() stores "<term><SEPARATOR>" here and gramToken() appends the following term.
private final StringBuilder buffer = new StringBuilder();
55
// Attributes of the token currently flowing through this filter.
private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
56
private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
57
private final TypeAttribute typeAttribute = addAttribute(TypeAttribute.class);
58
private final PositionIncrementAttribute posIncAttribute = addAttribute(PositionIncrementAttribute.class);
60
// Start offset of the term last stored by saveTermBuffer(); used as the bigram's start offset.
private int lastStartOffset;
61
// Whether the term last stored by saveTermBuffer() was a common word.
private boolean lastWasCommon;
62
// Token state captured in incrementToken() and restored at the start of a later call.
private State savedState;
64
/** @deprecated Use {@link #CommonGramsFilter(Version, TokenStream, Set)} instead */
66
public CommonGramsFilter(TokenStream input, Set<?> commonWords) {
67
// Legacy overload: pins the match version to LUCENE_29 and delegates.
this(Version.LUCENE_29, input, commonWords);
70
/** @deprecated Use {@link #CommonGramsFilter(Version, TokenStream, Set, boolean)} instead */
72
public CommonGramsFilter(TokenStream input, Set<?> commonWords, boolean ignoreCase) {
73
// Legacy overload: pins the match version to LUCENE_29 and forwards ignoreCase unchanged.
this(Version.LUCENE_29, input, commonWords, ignoreCase);
77
* Construct a token stream filtering the given input using a Set of common
78
* words to create bigrams. Outputs both unigrams with position increment and
79
* bigrams with position increment 0 type=gram where one or both of the words
80
* in a potential bigram are in the set of common words.
82
* @param input TokenStream input in filter chain
83
* @param commonWords The set of common words.
85
public CommonGramsFilter(Version matchVersion, TokenStream input, Set<?> commonWords) {
86
// Defaults ignoreCase to false and delegates to the four-argument constructor.
this(matchVersion, input, commonWords, false);
90
* Construct a token stream filtering the given input using a Set of common
91
* words to create bigrams, case-sensitive if ignoreCase is false (unless Set
92
* is CharArraySet). If <code>commonWords</code> is an instance of
93
* {@link CharArraySet} (true if <code>makeCommonSet()</code> was used to
94
* construct the set) it will be directly used and <code>ignoreCase</code>
95
* will be ignored since <code>CharArraySet</code> directly controls case
98
* If <code>commonWords</code> is not an instance of {@link CharArraySet}, a
99
* new CharArraySet will be constructed and <code>ignoreCase</code> will be
100
* used to specify the case sensitivity of that set.
102
* @param input TokenStream input in filter chain.
103
* @param commonWords The set of common words.
104
* @param ignoreCase Ignore case when constructing bigrams for common words.
106
public CommonGramsFilter(Version matchVersion, TokenStream input, Set<?> commonWords, boolean ignoreCase) {
108
// A CharArraySet is used as-is (no copy); ignoreCase is not consulted on this path.
if (commonWords instanceof CharArraySet) {
109
this.commonWords = (CharArraySet) commonWords;
111
// Any other Set is copied into a new CharArraySet built with matchVersion and the requested case sensitivity.
this.commonWords = new CharArraySet(matchVersion, commonWords.size(), ignoreCase);
112
this.commonWords.addAll(commonWords);
117
* Construct a token stream filtering the given input using an Array of common
118
* words to create bigrams.
120
* @param input TokenStream in filter chain
121
* @param commonWords words to be used in constructing bigrams
122
* @deprecated Use {@link #CommonGramsFilter(Version, TokenStream, Set)} instead.
125
public CommonGramsFilter(TokenStream input, String[] commonWords) {
126
// Delegates to the String[] overload with ignoreCase = false.
this(input, commonWords, false);
130
* Construct a token stream filtering the given input using an Array of common
131
* words to create bigrams and is case-sensitive if ignoreCase is false.
133
* @param input TokenStream in filter chain
134
* @param commonWords words to be used in constructing bigrams
135
* @param ignoreCase -Ignore case when constructing bigrams for common words.
136
* @deprecated Use {@link #CommonGramsFilter(Version, TokenStream, Set, boolean)} instead.
139
public CommonGramsFilter(TokenStream input, String[] commonWords, boolean ignoreCase) {
141
// The array is converted to a CharArraySet by makeCommonSet, which applies ignoreCase.
this.commonWords = makeCommonSet(commonWords, ignoreCase);
145
* Build a CharArraySet from an array of common words, appropriate for passing
146
* into the CommonGramsFilter constructor. This permits this commonWords
147
* construction to be cached once when an Analyzer is constructed.
149
* @param commonWords Array of common words which will be converted into the CharArraySet
150
* @return CharArraySet of the given words, appropriate for passing into the CommonGramsFilter constructor
151
* @see #makeCommonSet(java.lang.String[], boolean) passing false to ignoreCase
152
* @deprecated Construct a {@link CharArraySet} directly instead.
155
public static CharArraySet makeCommonSet(String[] commonWords) {
156
// Case-sensitive variant: delegates with ignoreCase = false.
return makeCommonSet(commonWords, false);
160
* Build a CharArraySet from an array of common words, appropriate for passing
161
* into the CommonGramsFilter constructor, case-sensitive if ignoreCase is
164
* @param commonWords Array of common words which will be converted into the CharArraySet
165
* @param ignoreCase If true, all words are lower cased first.
166
* @return a Set containing the words
167
* @deprecated Construct a {@link CharArraySet} directly instead.
170
public static CharArraySet makeCommonSet(String[] commonWords, boolean ignoreCase) {
171
// Size the set to the number of words up front; ignoreCase is handed to the set itself.
CharArraySet commonSet = new CharArraySet(commonWords.length, ignoreCase);
172
// Add every word from the array in one call.
commonSet.addAll(Arrays.asList(commonWords));
177
* Inserts bigrams for common words into a token stream. For each input token,
178
* output the token. If the token and/or the following token are in the list
179
* of common words also output a bigram with position increment 0 and
182
* TODO: Consider adding an option to not emit unigram stopwords
183
* as in CDL XTF BigramStopFilter, CommonGramsQueryFilter would need to be
184
* changed to work with this.
186
* TODO: Consider optimizing for the case of three
187
* commongrams i.e. "man of the year" normally produces 3 bigrams: "man_of",
188
* "of_the", "the_year" but with proper management of positions we could
189
* eliminate the middle bigram "of_the" and save a disk seek and a whole set of
193
public boolean incrementToken() throws IOException {
194
// get the next piece of input
195
// A state captured by an earlier call takes precedence over pulling a new token.
if (savedState != null) {
196
restoreState(savedState);
200
// Nothing saved: advance the wrapped stream instead.
} else if (!input.incrementToken()) {
204
// The check below fires when the previously saved term was common, or when the
// current term is common and a previous term is buffered. The current token state
// is captured first — presumably so it can be replayed through the savedState
// branch above on a later call; confirm against the full method.
/* We build n-grams before and after stopwords.
205
* When valid, the buffer always contains at least the separator.
206
* If it's empty, there is nothing before this stopword.
208
if (lastWasCommon || (isCommon() && buffer.length() > 0)) {
209
savedState = captureState();
222
public void reset() throws IOException {
224
// Forget whether the previously seen term was a common word.
lastWasCommon = false;
229
// ================================================= Helper Methods ================================================
232
* Determines if the current token is a common term
234
* @return {@code true} if the current token is a common term, {@code false} otherwise
236
private boolean isCommon() {
237
// Looks up the current term's characters (buffer plus length) in the common-word set;
// a null set means no term is considered common.
return commonWords != null && commonWords.contains(termAttribute.buffer(), 0, termAttribute.length());
241
* Saves this information to form the left part of a gram
243
private void saveTermBuffer() {
245
// Store "<current term><SEPARATOR>" so a following term can be appended to form a gram.
buffer.append(termAttribute.buffer(), 0, termAttribute.length());
246
buffer.append(SEPARATOR);
247
// Remember where this term starts and whether it is common, for use when the gram is built.
lastStartOffset = offsetAttribute.startOffset();
248
lastWasCommon = isCommon();
252
* Constructs a compound token.
254
private void gramToken() {
255
// Append the current term; buffer is expected to already hold "<previous term><SEPARATOR>"
// from saveTermBuffer().
buffer.append(termAttribute.buffer(), 0, termAttribute.length());
256
// Read the current token's end offset before the attributes are overwritten below.
int endOffset = offsetAttribute.endOffset();
260
int length = buffer.length();
261
char termText[] = termAttribute.buffer();
262
// Grow the term attribute's buffer when the gram text does not fit.
if (length > termText.length) {
263
termText = termAttribute.resizeBuffer(length);
266
// Copy the gram text into the term attribute and publish its length.
buffer.getChars(0, length, termText, 0);
267
termAttribute.setLength(length);
268
// Position increment 0 overlays the gram on the preceding token's position.
posIncAttribute.setPositionIncrement(0);
269
// The gram spans from the saved term's start to the current term's end.
offsetAttribute.setOffset(lastStartOffset, endOffset);
270
typeAttribute.setType(GRAM_TYPE);