2
* Licensed to the Apache Software Foundation (ASF) under one or more
3
* contributor license agreements. See the NOTICE file distributed with
4
* this work for additional information regarding copyright ownership.
5
* The ASF licenses this file to You under the Apache License, Version 2.0
6
* (the "License"); you may not use this file except in compliance with
7
* the License. You may obtain a copy of the License at
9
* http://www.apache.org/licenses/LICENSE-2.0
11
* Unless required by applicable law or agreed to in writing, software
12
* distributed under the License is distributed on an "AS IS" BASIS,
13
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
* See the License for the specific language governing permissions and
15
* limitations under the License.
18
package org.apache.solr.analysis;
20
import java.io.IOException;
21
import java.io.Reader;
22
import java.util.regex.Matcher;
23
import java.util.regex.Pattern;
24
import org.apache.lucene.analysis.Tokenizer;
25
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
26
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
27
import org.apache.commons.io.IOUtils;
30
* This tokenizer uses regex pattern matching to construct distinct tokens
31
* for the input stream. It takes two arguments: "pattern" and "group".
34
* <li>"pattern" is the regular expression.</li>
35
* <li>"group" says which group to extract into tokens.</li>
38
* group=-1 (the default) is equivalent to "split". In this case, the tokens will
39
* be equivalent to the output (with empty tokens removed) of:
40
* {@link String#split(java.lang.String)}
43
* Using group >= 0 selects the matching group as the token. For example, if you have:<br/>
45
* pattern = \'([^\']+)\'
47
* input = aaa 'bbb' 'ccc'
49
* with group=0 (the whole match) the output will be two tokens: 'bbb' and 'ccc' (including the ' marks). With the same input
50
* but using group=1, the output would be: bbb and ccc (no ' marks)
52
* <p>NOTE: This Tokenizer does not output tokens that are of zero length.</p>
54
* @version $Id: PatternTokenizer.java 940806 2010-05-04 11:18:46Z uschindler $
57
public final class PatternTokenizer extends Tokenizer {
59
// Term text of the current token; incrementToken() clears and refills it for each token.
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
60
// Offsets of the current token; incrementToken() and end() store values run through correctOffset().
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
65
// Regular expression from which the matcher over the buffered input is created.
private final Pattern pattern;
66
// Capture group handed to matcher.group()/start()/end(); per the constructor comment, -1 means split functionality.
private final int group;
67
// Matcher created in the constructor over the buffered input string; advanced by incrementToken().
private final Matcher matcher;
69
/**
 * Creates a new PatternTokenizer returning tokens from group (-1 for split functionality).
 * <p>
 * The content of {@code input} is read into a string up front with
 * {@code IOUtils.toString(Reader)} and a single {@link Matcher} is created over that string.
 *
 * @param input   reader supplying the text to tokenize
 * @param pattern regular expression used to locate tokens (or, for split functionality, the
 *                delimiters between tokens)
 * @param group   capture group to emit as the token, or -1 for split functionality
 * @throws IOException declared by the signature; presumably raised when reading {@code input} fails
 */
70
public PatternTokenizer(Reader input, Pattern pattern, int group) throws IOException {
72
this.pattern = pattern;
74
// Buffer the input as a string: the matcher below works on this string, not on the Reader.
str = IOUtils.toString(input);
75
matcher = pattern.matcher(str);
80
/**
 * Advances to the next token, filling in the term text and the offset attribute.
 * <p>
 * Two strategies appear in the body: emitting a capture group of each regex match, and
 * String.split()-style emission of the text lying between matches. Neither strategy
 * emits a zero-length token.
 *
 * @return {@code false} once {@code index} has reached the end of the buffered input
 * @throws IOException declared by the signature
 */
public boolean incrementToken() throws IOException {
81
// Nothing left to tokenize; also true after index was set to Integer.MAX_VALUE below.
if (index >= str.length()) return false;
85
// match a specific group
86
while (matcher.find()) {
87
// NOTE(review): Matcher.group(int) returns null when the group did not take part in the
// match, so match.length() below would throw NullPointerException for patterns with
// optional or alternative groups. Consider comparing matcher.start(group) with
// matcher.end(group) instead.
final String match = matcher.group(group);
88
// Skip empty captures: zero-length tokens are never emitted.
if (match.length() == 0) continue;
89
termAtt.setEmpty().append(match);
90
// index records where the emitted group starts in the buffered string.
index = matcher.start(group);
91
offsetAtt.setOffset(correctOffset(index), correctOffset(matcher.end(group)));
95
index = Integer.MAX_VALUE; // mark exhausted
100
// String.split() functionality
101
// Each match is treated as a delimiter; index marks where the pending token begins.
while (matcher.find()) {
102
if (matcher.start() - index > 0) {
103
// found a non-zero-length token
104
// The token is the text from index up to the start of this delimiter match.
termAtt.setEmpty().append(str, index, matcher.start());
105
offsetAtt.setOffset(correctOffset(index), correctOffset(matcher.start()));
106
// The next token, if any, begins right after this delimiter.
index = matcher.end();
110
// No text in front of this delimiter: just move past it.
index = matcher.end();
113
// No text remains after the last delimiter, so there is no trailing token.
if (str.length() - index == 0) {
114
index = Integer.MAX_VALUE; // mark exhausted
118
// Trailing token: everything from index to the end of the buffered string.
termAtt.setEmpty().append(str, index, str.length());
119
offsetAtt.setOffset(correctOffset(index), correctOffset(str.length()));
120
index = Integer.MAX_VALUE; // mark exhausted
126
/**
 * Sets the final offset: start and end are both set to the corrected length of the
 * buffered input string.
 *
 * @throws IOException declared by the signature
 */
public void end() throws IOException {
127
// The length of the buffered string, run through correctOffset(), is the end-of-stream offset.
final int ofs = correctOffset(str.length());
128
offsetAtt.setOffset(ofs, ofs);
132
public void reset(Reader input) throws IOException {
134
str = IOUtils.toString(input);