1
package org.apache.lucene.analysis.ngram;
4
* Licensed to the Apache Software Foundation (ASF) under one or more
5
* contributor license agreements. See the NOTICE file distributed with
6
* this work for additional information regarding copyright ownership.
7
* The ASF licenses this file to You under the Apache License, Version 2.0
8
* (the "License"); you may not use this file except in compliance with
9
* the License. You may obtain a copy of the License at
11
* http://www.apache.org/licenses/LICENSE-2.0
13
* Unless required by applicable law or agreed to in writing, software
14
* distributed under the License is distributed on an "AS IS" BASIS,
15
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
* See the License for the specific language governing permissions and
17
* limitations under the License.
20
import org.apache.lucene.analysis.TokenFilter;
21
import org.apache.lucene.analysis.TokenStream;
22
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
23
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
25
import java.io.IOException;
28
* Tokenizes the given token into n-grams of given size(s).
30
* This {@link TokenFilter} creates n-grams from the beginning edge or ending edge of an input token.
33
public final class EdgeNGramTokenFilter extends TokenFilter {
34
public static final Side DEFAULT_SIDE = Side.FRONT;
35
public static final int DEFAULT_MAX_GRAM_SIZE = 1;
36
public static final int DEFAULT_MIN_GRAM_SIZE = 1;
38
/** Specifies which side of the input the n-gram should be generated from */
39
public static enum Side {
41
/** Get the n-gram from the front of the input */
44
public String getLabel() { return "front"; }
47
/** Get the n-gram from the end of the input */
50
public String getLabel() { return "back"; }
53
public abstract String getLabel();
55
// Get the appropriate Side from a string
56
public static Side getSide(String sideName) {
57
if (FRONT.getLabel().equals(sideName)) {
60
if (BACK.getLabel().equals(sideName)) {
67
private final int minGram;
68
private final int maxGram;
70
private char[] curTermBuffer;
71
private int curTermLength;
72
private int curGramSize;
75
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
76
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
79
* Creates an EdgeNGramTokenFilter that generates n-grams whose sizes fall within the given range
81
* @param input {@link TokenStream} holding the input to be tokenized
82
* @param side the {@link Side} from which to chop off an n-gram
83
* @param minGram the smallest n-gram to generate
84
* @param maxGram the largest n-gram to generate
86
public EdgeNGramTokenFilter(TokenStream input, Side side, int minGram, int maxGram) {
90
throw new IllegalArgumentException("sideLabel must be either front or back");
94
throw new IllegalArgumentException("minGram must be greater than zero");
97
if (minGram > maxGram) {
98
throw new IllegalArgumentException("minGram must not be greater than maxGram");
101
this.minGram = minGram;
102
this.maxGram = maxGram;
107
* Creates an EdgeNGramTokenFilter that generates n-grams whose sizes fall within the given range
109
* @param input {@link TokenStream} holding the input to be tokenized
110
* @param sideLabel the name of the {@link Side} from which to chop off an n-gram
111
* @param minGram the smallest n-gram to generate
112
* @param maxGram the largest n-gram to generate
114
public EdgeNGramTokenFilter(TokenStream input, String sideLabel, int minGram, int maxGram) {
115
this(input, Side.getSide(sideLabel), minGram, maxGram);
119
public final boolean incrementToken() throws IOException {
121
if (curTermBuffer == null) {
122
if (!input.incrementToken()) {
125
curTermBuffer = termAtt.buffer().clone();
126
curTermLength = termAtt.length();
127
curGramSize = minGram;
128
tokStart = offsetAtt.startOffset();
131
if (curGramSize <= maxGram) {
132
if (! (curGramSize > curTermLength // if the remaining input is too short, we can't generate any n-grams
133
|| curGramSize > maxGram)) { // if we have hit the end of our n-gram size range, quit
134
// grab gramSize chars from front or back
135
int start = side == Side.FRONT ? 0 : curTermLength - curGramSize;
136
int end = start + curGramSize;
138
offsetAtt.setOffset(tokStart + start, tokStart + end);
139
termAtt.copyBuffer(curTermBuffer, start, curGramSize);
144
curTermBuffer = null;
149
public void reset() throws IOException {
151
curTermBuffer = null;