package org.apache.lucene.queryParser.analyzing;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.Version;
/**
 * Overrides Lucene's default QueryParser so that Fuzzy-, Prefix-, Range-, and WildcardQuerys
 * are also passed through the given analyzer, but wild card characters (like <code>*</code>)
 * don't get removed from the search terms.
 *
 * <p><b>Warning:</b> This class should only be used with analyzers that do not use stopwords
 * or that add tokens. Also, several stemming analyzers are inappropriate: for example, GermanAnalyzer
 * will turn <code>H&auml;user</code> into <code>hau</code>, but <code>H?user</code> will
 * become <code>h?user</code> when using this parser and thus no match would be found (i.e.
 * using this parser will be no improvement over QueryParser in such cases).
 *
 * @version $Revision$, $Date$
 */
45
public class AnalyzingQueryParser extends org.apache.lucene.queryParser.QueryParser {
48
* Constructs a query parser.
49
* @param field the default field for query terms.
50
* @param analyzer used to find terms in the query text.
52
public AnalyzingQueryParser(Version matchVersion, String field, Analyzer analyzer) {
53
super(matchVersion, field, analyzer);
58
* parses an input term token that contains one or more wildcard
59
* characters (like <code>*</code>), but is not a prefix term token (one
60
* that has just a single * character at the end).
62
* Example: will be called for <code>H?user</code> or for <code>H*user</code>
63
* but not for <code>*user</code>.
65
* Depending on analyzer and settings, a wildcard term may (most probably will)
66
* be lower-cased automatically. It <b>will</b> go through the default Analyzer.
68
* Overrides super class, by passing terms through analyzer.
70
* @param field Name of the field query will use.
71
* @param termStr Term token that contains one or more wild card
72
* characters (? or *), but is not simple prefix term
74
* @return Resulting {@link Query} built for the term
75
* @throws ParseException
78
protected Query getWildcardQuery(String field, String termStr) throws ParseException {
79
List<String> tlist = new ArrayList<String>();
80
List<String> wlist = new ArrayList<String>();
81
/* somewhat a hack: find/store wildcard chars
82
* in order to put them back after analyzing */
83
boolean isWithinToken = (!termStr.startsWith("?") && !termStr.startsWith("*"));
84
StringBuilder tmpBuffer = new StringBuilder();
85
char[] chars = termStr.toCharArray();
86
for (int i = 0; i < termStr.length(); i++) {
87
if (chars[i] == '?' || chars[i] == '*') {
89
tlist.add(tmpBuffer.toString());
90
tmpBuffer.setLength(0);
92
isWithinToken = false;
95
wlist.add(tmpBuffer.toString());
96
tmpBuffer.setLength(0);
100
tmpBuffer.append(chars[i]);
103
tlist.add(tmpBuffer.toString());
105
wlist.add(tmpBuffer.toString());
108
// get Analyzer from superclass and tokenize the term
113
source = getAnalyzer().reusableTokenStream(field, new StringReader(termStr));
115
} catch (IOException e1) {
116
throw new RuntimeException(e1);
118
CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);
121
if (!source.incrementToken()) break;
122
} catch (IOException e) {
125
String term = termAtt.toString();
126
if (!"".equals(term)) {
128
tlist.set(countTokens++, term);
129
} catch (IndexOutOfBoundsException ioobe) {
137
} catch (IOException e) {
141
if (countTokens != tlist.size()) {
142
/* this means that the analyzer used either added or consumed
143
* (common for a stemmer) tokens, and we can't build a WildcardQuery */
144
throw new ParseException("Cannot build WildcardQuery with analyzer "
145
+ getAnalyzer().getClass() + " - tokens added or lost");
148
if (tlist.size() == 0) {
150
} else if (tlist.size() == 1) {
151
if (wlist != null && wlist.size() == 1) {
152
/* if wlist contains one wildcard, it must be at the end, because:
153
* 1) wildcards are not allowed in 1st position of a term by QueryParser
154
* 2) if wildcard was *not* in end, there would be *two* or more tokens */
155
return super.getWildcardQuery(field, tlist.get(0)
156
+ wlist.get(0).toString());
158
/* we should never get here! if so, this method was called
159
* with a termStr containing no wildcard ... */
160
throw new IllegalArgumentException("getWildcardQuery called without wildcard");
163
/* the term was tokenized, let's rebuild to one token
164
* with wildcards put back in postion */
165
StringBuilder sb = new StringBuilder();
166
for (int i = 0; i < tlist.size(); i++) {
167
sb.append( tlist.get(i));
168
if (wlist != null && wlist.size() > i) {
169
sb.append(wlist.get(i));
172
return super.getWildcardQuery(field, sb.toString());
177
* Called when parser parses an input term
178
* token that uses prefix notation; that is, contains a single '*' wildcard
179
* character as its last character. Since this is a special case
180
* of generic wildcard term, and such a query can be optimized easily,
181
* this usually results in a different query object.
183
* Depending on analyzer and settings, a prefix term may (most probably will)
184
* be lower-cased automatically. It <b>will</b> go through the default Analyzer.
186
* Overrides super class, by passing terms through analyzer.
188
* @param field Name of the field query will use.
189
* @param termStr Term token to use for building term for the query
190
* (<b>without</b> trailing '*' character!)
192
* @return Resulting {@link Query} built for the term
193
* @throws ParseException
196
protected Query getPrefixQuery(String field, String termStr) throws ParseException {
197
// get Analyzer from superclass and tokenize the term
199
List<String> tlist = new ArrayList<String>();
201
source = getAnalyzer().reusableTokenStream(field, new StringReader(termStr));
203
} catch (IOException e1) {
204
throw new RuntimeException(e1);
206
CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);
209
if (!source.incrementToken()) break;
210
} catch (IOException e) {
213
tlist.add(termAtt.toString());
219
} catch (IOException e) {
223
if (tlist.size() == 1) {
224
return super.getPrefixQuery(field, tlist.get(0));
226
/* this means that the analyzer used either added or consumed
227
* (common for a stemmer) tokens, and we can't build a PrefixQuery */
228
throw new ParseException("Cannot build PrefixQuery with analyzer "
229
+ getAnalyzer().getClass()
230
+ (tlist.size() > 1 ? " - token(s) added" : " - token consumed"));
235
* Called when parser parses an input term token that has the fuzzy suffix (~) appended.
237
* Depending on analyzer and settings, a fuzzy term may (most probably will)
238
* be lower-cased automatically. It <b>will</b> go through the default Analyzer.
240
* Overrides super class, by passing terms through analyzer.
242
* @param field Name of the field query will use.
243
* @param termStr Term token to use for building term for the query
245
* @return Resulting {@link Query} built for the term
246
* @exception ParseException
249
protected Query getFuzzyQuery(String field, String termStr, float minSimilarity)
250
throws ParseException {
251
// get Analyzer from superclass and tokenize the term
252
TokenStream source = null;
253
String nextToken = null;
254
boolean multipleTokens = false;
257
source = getAnalyzer().reusableTokenStream(field, new StringReader(termStr));
258
CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);
260
if (source.incrementToken()) {
261
nextToken = termAtt.toString();
263
multipleTokens = source.incrementToken();
264
} catch (IOException e) {
271
} catch (IOException e) {
275
if (multipleTokens) {
276
throw new ParseException("Cannot build FuzzyQuery with analyzer " + getAnalyzer().getClass()
277
+ " - tokens were added");
280
return (nextToken == null) ? null : super.getFuzzyQuery(field, nextToken, minSimilarity);
284
* Overrides super class, by passing terms through analyzer.
285
* @exception ParseException
288
protected Query getRangeQuery(String field, String part1, String part2, boolean inclusive)
289
throws ParseException {
290
// get Analyzer from superclass and tokenize the terms
291
TokenStream source = null;
292
CharTermAttribute termAtt = null;
293
boolean multipleTokens = false;
298
source = getAnalyzer().reusableTokenStream(field, new StringReader(part1));
299
termAtt = source.addAttribute(CharTermAttribute.class);
301
multipleTokens = false;
304
if (source.incrementToken()) {
305
part1 = termAtt.toString();
307
multipleTokens = source.incrementToken();
308
} catch (IOException e) {
315
} catch (IOException e) {
318
if (multipleTokens) {
319
throw new ParseException("Cannot build RangeQuery with analyzer " + getAnalyzer().getClass()
320
+ " - tokens were added to part1");
325
} catch (IOException e) {
328
if (multipleTokens) {
329
throw new ParseException("Cannot build RangeQuery with analyzer " + getAnalyzer().getClass()
330
+ " - tokens were added to part1");
336
source = getAnalyzer().reusableTokenStream(field, new StringReader(part2));
337
termAtt = source.addAttribute(CharTermAttribute.class);
339
if (source.incrementToken()) {
340
part2 = termAtt.toString();
342
multipleTokens = source.incrementToken();
343
} catch (IOException e) {
349
} catch (IOException e) {
352
if (multipleTokens) {
353
throw new ParseException("Cannot build RangeQuery with analyzer " + getAnalyzer().getClass()
354
+ " - tokens were added to part2");
359
} catch (IOException e) {
362
if (multipleTokens) {
363
throw new ParseException("Cannot build RangeQuery with analyzer " + getAnalyzer().getClass()
364
+ " - tokens were added to part2");
366
return super.getRangeQuery(field, part1, part2, inclusive);