package org.apache.lucene.analysis.miscellaneous;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.Arrays;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.util.Version;
39
/**
 * Efficient Lucene analyzer/tokenizer that preferably operates on a String rather than a
 * {@link java.io.Reader}, that can flexibly separate text into terms via a regular expression {@link Pattern}
 * (with behaviour identical to {@link String#split(String)}),
 * and that combines the functionality of
 * {@link org.apache.lucene.analysis.LetterTokenizer},
 * {@link org.apache.lucene.analysis.LowerCaseTokenizer},
 * {@link org.apache.lucene.analysis.WhitespaceTokenizer},
 * {@link org.apache.lucene.analysis.StopFilter} into a single efficient
 * multi-purpose class.
 * <p>
 * If you are unsure how exactly a regular expression should look like, consider
 * prototyping by simply trying various expressions on some test texts via
 * {@link String#split(String)}. Once you are satisfied, give that regex to
 * PatternAnalyzer. Also see <a target="_blank"
 * href="http://java.sun.com/docs/books/tutorial/extra/regex/">Java Regular Expression Tutorial</a>.
 * <p>
 * This class can be considerably faster than the "normal" Lucene tokenizers.
 * It can also serve as a building block in a compound Lucene
 * {@link org.apache.lucene.analysis.TokenFilter} chain. For example as in this
 * stemming example:
 * <pre>
 * PatternAnalyzer pat = ...
 * TokenStream tokenStream = new SnowballFilter(
 *     pat.tokenStream("content", "James is running round in the woods"),
 *     "English"));
 * </pre>
 */
public final class PatternAnalyzer extends Analyzer {
69
/** <code>"\\W+"</code>; Divides text at non-letters (NOT Character.isLetter(c)) */
70
public static final Pattern NON_WORD_PATTERN = Pattern.compile("\\W+");
72
/** <code>"\\s+"</code>; Divides text at whitespaces (Character.isWhitespace(c)) */
73
public static final Pattern WHITESPACE_PATTERN = Pattern.compile("\\s+");
75
private static final CharArraySet EXTENDED_ENGLISH_STOP_WORDS =
76
CharArraySet.unmodifiableSet(new CharArraySet(Version.LUCENE_CURRENT,
78
"a", "about", "above", "across", "adj", "after", "afterwards",
79
"again", "against", "albeit", "all", "almost", "alone", "along",
80
"already", "also", "although", "always", "among", "amongst", "an",
81
"and", "another", "any", "anyhow", "anyone", "anything",
82
"anywhere", "are", "around", "as", "at", "be", "became", "because",
83
"become", "becomes", "becoming", "been", "before", "beforehand",
84
"behind", "being", "below", "beside", "besides", "between",
85
"beyond", "both", "but", "by", "can", "cannot", "co", "could",
86
"down", "during", "each", "eg", "either", "else", "elsewhere",
87
"enough", "etc", "even", "ever", "every", "everyone", "everything",
88
"everywhere", "except", "few", "first", "for", "former",
89
"formerly", "from", "further", "had", "has", "have", "he", "hence",
90
"her", "here", "hereafter", "hereby", "herein", "hereupon", "hers",
91
"herself", "him", "himself", "his", "how", "however", "i", "ie", "if",
92
"in", "inc", "indeed", "into", "is", "it", "its", "itself", "last",
93
"latter", "latterly", "least", "less", "ltd", "many", "may", "me",
94
"meanwhile", "might", "more", "moreover", "most", "mostly", "much",
95
"must", "my", "myself", "namely", "neither", "never",
96
"nevertheless", "next", "no", "nobody", "none", "noone", "nor",
97
"not", "nothing", "now", "nowhere", "of", "off", "often", "on",
98
"once one", "only", "onto", "or", "other", "others", "otherwise",
99
"our", "ours", "ourselves", "out", "over", "own", "per", "perhaps",
100
"rather", "s", "same", "seem", "seemed", "seeming", "seems",
101
"several", "she", "should", "since", "so", "some", "somehow",
102
"someone", "something", "sometime", "sometimes", "somewhere",
103
"still", "such", "t", "than", "that", "the", "their", "them",
104
"themselves", "then", "thence", "there", "thereafter", "thereby",
105
"therefor", "therein", "thereupon", "these", "they", "this",
106
"those", "though", "through", "throughout", "thru", "thus", "to",
107
"together", "too", "toward", "towards", "under", "until", "up",
108
"upon", "us", "very", "via", "was", "we", "well", "were", "what",
109
"whatever", "whatsoever", "when", "whence", "whenever",
110
"whensoever", "where", "whereafter", "whereas", "whereat",
111
"whereby", "wherefrom", "wherein", "whereinto", "whereof",
112
"whereon", "whereto", "whereunto", "whereupon", "wherever",
113
"wherewith", "whether", "which", "whichever", "whichsoever",
114
"while", "whilst", "whither", "who", "whoever", "whole", "whom",
115
"whomever", "whomsoever", "whose", "whosoever", "why", "will",
116
"with", "within", "without", "would", "xsubj", "xcal", "xauthor",
117
"xother ", "xnote", "yet", "you", "your", "yours", "yourself",
122
* A lower-casing word analyzer with English stop words (can be shared
123
* freely across threads without harm); global per class loader.
125
public static final PatternAnalyzer DEFAULT_ANALYZER = new PatternAnalyzer(
126
Version.LUCENE_CURRENT, NON_WORD_PATTERN, true, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
129
* A lower-casing word analyzer with <b>extended </b> English stop words
130
* (can be shared freely across threads without harm); global per class
131
* loader. The stop words are borrowed from
132
* http://thomas.loc.gov/home/stopwords.html, see
133
* http://thomas.loc.gov/home/all.about.inquery.html
135
public static final PatternAnalyzer EXTENDED_ANALYZER = new PatternAnalyzer(
136
Version.LUCENE_CURRENT, NON_WORD_PATTERN, true, EXTENDED_ENGLISH_STOP_WORDS);
138
private final Pattern pattern;
139
private final boolean toLowerCase;
140
private final Set<?> stopWords;
142
private final Version matchVersion;
145
* Constructs a new instance with the given parameters.
147
* @param matchVersion If >= {@link Version#LUCENE_29}, StopFilter.enablePositionIncrement is set to true
149
* a regular expression delimiting tokens
151
* if <code>true</code> returns tokens after applying
152
* String.toLowerCase()
154
* if non-null, ignores all tokens that are contained in the
155
* given stop set (after previously having applied toLowerCase()
156
* if applicable). For example, created via
157
* {@link StopFilter#makeStopSet(Version, String[])}and/or
158
* {@link org.apache.lucene.analysis.WordlistLoader}as in
159
* <code>WordlistLoader.getWordSet(new File("samples/fulltext/stopwords.txt")</code>
160
* or <a href="http://www.unine.ch/info/clef/">other stop words
163
public PatternAnalyzer(Version matchVersion, Pattern pattern, boolean toLowerCase, Set<?> stopWords) {
165
throw new IllegalArgumentException("pattern must not be null");
167
if (eqPattern(NON_WORD_PATTERN, pattern)) pattern = NON_WORD_PATTERN;
168
else if (eqPattern(WHITESPACE_PATTERN, pattern)) pattern = WHITESPACE_PATTERN;
170
if (stopWords != null && stopWords.size() == 0) stopWords = null;
172
this.pattern = pattern;
173
this.toLowerCase = toLowerCase;
174
this.stopWords = stopWords;
175
this.matchVersion = matchVersion;
179
* Creates a token stream that tokenizes the given string into token terms
183
* the name of the field to tokenize (currently ignored).
185
* the string to tokenize
186
* @return a new token stream
188
public TokenStream tokenStream(String fieldName, String text) {
189
// Ideally the Analyzer superclass should have a method with the same signature,
190
// with a default impl that simply delegates to the StringReader flavour.
192
throw new IllegalArgumentException("text must not be null");
195
if (pattern == NON_WORD_PATTERN) { // fast path
196
stream = new FastStringTokenizer(text, true, toLowerCase, stopWords);
198
else if (pattern == WHITESPACE_PATTERN) { // fast path
199
stream = new FastStringTokenizer(text, false, toLowerCase, stopWords);
202
stream = new PatternTokenizer(text, pattern, toLowerCase);
203
if (stopWords != null) stream = new StopFilter(matchVersion, stream, stopWords);
210
* Creates a token stream that tokenizes all the text in the given Reader;
211
* This implementation forwards to <code>tokenStream(String, String)</code> and is
212
* less efficient than <code>tokenStream(String, String)</code>.
215
* the name of the field to tokenize (currently ignored).
217
* the reader delivering the text
218
* @return a new token stream
221
public TokenStream tokenStream(String fieldName, Reader reader) {
222
if (reader instanceof FastStringReader) { // fast path
223
return tokenStream(fieldName, ((FastStringReader)reader).getString());
227
String text = toString(reader);
228
return tokenStream(fieldName, text);
229
} catch (IOException e) {
230
throw new RuntimeException(e);
235
* Indicates whether some other object is "equal to" this one.
238
* the reference object with which to compare.
239
* @return true if equal, false otherwise
242
public boolean equals(Object other) {
243
if (this == other) return true;
244
if (this == DEFAULT_ANALYZER && other == EXTENDED_ANALYZER) return false;
245
if (other == DEFAULT_ANALYZER && this == EXTENDED_ANALYZER) return false;
247
if (other instanceof PatternAnalyzer) {
248
PatternAnalyzer p2 = (PatternAnalyzer) other;
250
toLowerCase == p2.toLowerCase &&
251
eqPattern(pattern, p2.pattern) &&
252
eq(stopWords, p2.stopWords);
258
* Returns a hash code value for the object.
260
* @return the hash code.
263
public int hashCode() {
264
if (this == DEFAULT_ANALYZER) return -1218418418; // fast path
265
if (this == EXTENDED_ANALYZER) return 1303507063; // fast path
268
h = 31*h + pattern.pattern().hashCode();
269
h = 31*h + pattern.flags();
270
h = 31*h + (toLowerCase ? 1231 : 1237);
271
h = 31*h + (stopWords != null ? stopWords.hashCode() : 0);
275
/** equality where o1 and/or o2 can be null */
276
private static boolean eq(Object o1, Object o2) {
277
return (o1 == o2) || (o1 != null ? o1.equals(o2) : false);
280
/** assumes p1 and p2 are not null */
281
private static boolean eqPattern(Pattern p1, Pattern p2) {
282
return p1 == p2 || (p1.flags() == p2.flags() && p1.pattern().equals(p2.pattern()));
286
* Reads until end-of-stream and returns all read chars, finally closes the stream.
288
* @param input the input stream
289
* @throws IOException if an I/O error occurs while reading the stream
291
private static String toString(Reader input) throws IOException {
294
char[] buffer = new char[len];
295
char[] output = new char[len];
299
while ((n = input.read(buffer)) >= 0) {
300
if (len + n > output.length) { // grow capacity
301
char[] tmp = new char[Math.max(output.length << 1, len + n)];
302
System.arraycopy(output, 0, tmp, 0, len);
303
System.arraycopy(buffer, 0, tmp, len, n);
304
buffer = output; // use larger buffer for future larger bulk reads
307
System.arraycopy(buffer, 0, output, len, n);
312
return new String(output, 0, len);
319
///////////////////////////////////////////////////////////////////////////////
321
///////////////////////////////////////////////////////////////////////////////
323
* The work horse; performance isn't fantastic, but it's not nearly as bad
324
* as one might think - kudos to the Sun regex developers.
326
private static final class PatternTokenizer extends TokenStream {
328
private final String str;
329
private final boolean toLowerCase;
330
private Matcher matcher;
332
private static final Locale locale = Locale.getDefault();
333
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
334
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
336
public PatternTokenizer(String str, Pattern pattern, boolean toLowerCase) {
338
this.matcher = pattern.matcher(str);
339
this.toLowerCase = toLowerCase;
343
public final boolean incrementToken() {
344
if (matcher == null) return false;
346
while (true) { // loop takes care of leading and trailing boundary cases
349
boolean isMatch = matcher.find();
351
end = matcher.start();
355
matcher = null; // we're finished
358
if (start != end) { // non-empty match (header/trailer)
359
String text = str.substring(start, end);
360
if (toLowerCase) text = text.toLowerCase(locale);
361
termAtt.setEmpty().append(text);
362
offsetAtt.setOffset(start, end);
365
if (!isMatch) return false;
370
public final void end() {
372
final int finalOffset = str.length();
373
this.offsetAtt.setOffset(finalOffset, finalOffset);
378
///////////////////////////////////////////////////////////////////////////////
380
///////////////////////////////////////////////////////////////////////////////
382
* Special-case class for best performance in common cases; this class is
383
* otherwise unnecessary.
385
private static final class FastStringTokenizer extends TokenStream {
387
private final String str;
389
private final boolean isLetter;
390
private final boolean toLowerCase;
391
private final Set<?> stopWords;
392
private static final Locale locale = Locale.getDefault();
393
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
394
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
396
public FastStringTokenizer(String str, boolean isLetter, boolean toLowerCase, Set<?> stopWords) {
398
this.isLetter = isLetter;
399
this.toLowerCase = toLowerCase;
400
this.stopWords = stopWords;
404
public boolean incrementToken() {
406
// cache loop instance vars (performance)
408
int len = s.length();
410
boolean letter = isLetter;
415
// find beginning of token
417
while (i < len && !isTokenChar(s.charAt(i), letter)) {
421
if (i < len) { // found beginning; now find end of token
423
while (i < len && isTokenChar(s.charAt(i), letter)) {
427
text = s.substring(start, i);
428
if (toLowerCase) text = text.toLowerCase(locale);
429
// if (toLowerCase) {
430
//// use next line once JDK 1.5 String.toLowerCase() performance regression is fixed
431
//// see http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=6265809
432
// text = s.substring(start, i).toLowerCase();
433
//// char[] chars = new char[i-start];
434
//// for (int j=start; j < i; j++) chars[j-start] = Character.toLowerCase(s.charAt(j));
435
//// text = new String(chars);
437
// text = s.substring(start, i);
440
} while (text != null && isStopWord(text));
447
termAtt.setEmpty().append(text);
448
offsetAtt.setOffset(start, i);
453
public final void end() {
455
final int finalOffset = str.length();
456
this.offsetAtt.setOffset(finalOffset, finalOffset);
459
private boolean isTokenChar(char c, boolean isLetter) {
460
return isLetter ? Character.isLetter(c) : !Character.isWhitespace(c);
463
private boolean isStopWord(String text) {
464
return stopWords != null && stopWords.contains(text);
470
  ///////////////////////////////////////////////////////////////////////////////
  // Nested classes:
  ///////////////////////////////////////////////////////////////////////////////
  /**
   * A StringReader that exposes its contained string for fast direct access.
   * Might make sense to generalize this to CharSequence and make it public?
   */
static final class FastStringReader extends StringReader {
479
private final String s;
481
FastStringReader(String s) {