package org.apache.lucene.analysis;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import java.io.Closeable;
import java.io.IOException;
import java.lang.reflect.Modifier;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.util.Attribute;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.AttributeSource;
/**
 * A <code>TokenStream</code> enumerates the sequence of tokens, either from
 * {@link Field}s of a {@link Document} or from query text.
 * <p>
 * This is an abstract class; concrete subclasses are:
 * <ul>
 * <li>{@link Tokenizer}, a <code>TokenStream</code> whose input is a Reader; and
 * <li>{@link TokenFilter}, a <code>TokenStream</code> whose input is another
 * <code>TokenStream</code>.
 * </ul>
 * A new <code>TokenStream</code> API has been introduced with Lucene 2.9. This API
 * has moved from being {@link Token}-based to {@link Attribute}-based. While
 * {@link Token} still exists in 2.9 as a convenience class, the preferred way
 * to store the information of a {@link Token} is to use {@link AttributeImpl}s.
 * <p>
 * <code>TokenStream</code> now extends {@link AttributeSource}, which provides
 * access to all of the token {@link Attribute}s for the <code>TokenStream</code>.
 * Note that only one instance per {@link AttributeImpl} is created and reused
 * for every token. This approach reduces object creation and allows local
 * caching of references to the {@link AttributeImpl}s. See
 * {@link #incrementToken()} for further details.
 * <p>
 * <b>The workflow of the new <code>TokenStream</code> API is as follows:</b>
 * <ol>
 * <li>Instantiation of <code>TokenStream</code>/{@link TokenFilter}s which add/get
 * attributes to/from the {@link AttributeSource}.
 * <li>The consumer calls {@link TokenStream#reset()}.
 * <li>The consumer retrieves attributes from the stream and stores local
 * references to all attributes it wants to access.
 * <li>The consumer calls {@link #incrementToken()} until it returns false
 * consuming the attributes after each call.
 * <li>The consumer calls {@link #end()} so that any end-of-stream operations
 * can be performed.
 * <li>The consumer calls {@link #close()} to release any resource when finished
 * using the <code>TokenStream</code>.
 * </ol>
 * To make sure that filters and consumers know which attributes are available,
 * the attributes must be added during instantiation. Filters and consumers are
 * not required to check for availability of attributes in
 * {@link #incrementToken()}.
 * <p>
 * You can find some example code for the new API in the analysis package level
 * Javadoc.
 * <p>
 * Sometimes it is desirable to capture a current state of a <code>TokenStream</code>,
 * e.g., for buffering purposes (see {@link CachingTokenFilter},
 * {@link TeeSinkTokenFilter}). For this usecase
 * {@link AttributeSource#captureState} and {@link AttributeSource#restoreState}
 * can be used.
 * <p>The {@code TokenStream}-API in Lucene is based on the decorator pattern.
 * Therefore all non-abstract subclasses must be final or have at least a final
 * implementation of {@link #incrementToken}! This is checked when Java
 * assertions are enabled.
 */
public abstract class TokenStream extends AttributeSource implements Closeable {
88
* A TokenStream using the default attribute factory.
90
protected TokenStream() {
96
* A TokenStream that uses the same attributes as the supplied one.
98
protected TokenStream(AttributeSource input) {
100
assert assertFinal();
104
* A TokenStream using the supplied AttributeFactory for creating new {@link Attribute} instances.
106
protected TokenStream(AttributeFactory factory) {
108
assert assertFinal();
111
private boolean assertFinal() {
113
final Class<?> clazz = getClass();
114
if (!clazz.desiredAssertionStatus())
116
assert clazz.isAnonymousClass() ||
117
(clazz.getModifiers() & (Modifier.FINAL | Modifier.PRIVATE)) != 0 ||
118
Modifier.isFinal(clazz.getMethod("incrementToken").getModifiers()) :
119
"TokenStream implementation classes or at least their incrementToken() implementation must be final";
121
} catch (NoSuchMethodException nsme) {
127
* Consumers (i.e., {@link IndexWriter}) use this method to advance the stream to
128
* the next token. Implementing classes must implement this method and update
129
* the appropriate {@link AttributeImpl}s with the attributes of the next
132
* The producer must make no assumptions about the attributes after the method
133
* has been returned: the caller may arbitrarily change it. If the producer
134
* needs to preserve the state for subsequent calls, it can use
135
* {@link #captureState} to create a copy of the current attribute state.
137
* This method is called for every token of a document, so an efficient
138
* implementation is crucial for good performance. To avoid calls to
139
* {@link #addAttribute(Class)} and {@link #getAttribute(Class)},
140
* references to all {@link AttributeImpl}s that this stream uses should be
141
* retrieved during instantiation.
143
* To ensure that filters and consumers know which attributes are available,
144
* the attributes must be added during instantiation. Filters and consumers
145
* are not required to check for availability of attributes in
146
* {@link #incrementToken()}.
148
* @return false for end of stream; true otherwise
150
public abstract boolean incrementToken() throws IOException;
153
* This method is called by the consumer after the last token has been
154
* consumed, after {@link #incrementToken()} returned <code>false</code>
155
* (using the new <code>TokenStream</code> API). Streams implementing the old API
156
* should upgrade to use this feature.
158
* This method can be used to perform any end-of-stream operations, such as
159
* setting the final offset of a stream. The final offset of a stream might
160
* differ from the offset of the last token eg in case one or more whitespaces
161
* followed after the last token, but a {@link WhitespaceTokenizer} was used.
163
* @throws IOException
165
public void end() throws IOException {
166
// do nothing by default
170
* Resets this stream to the beginning. This is an optional operation, so
171
* subclasses may or may not implement this method. {@link #reset()} is not needed for
172
* the standard indexing process. However, if the tokens of a
173
* <code>TokenStream</code> are intended to be consumed more than once, it is
174
* necessary to implement {@link #reset()}. Note that if your TokenStream
175
* caches tokens and feeds them back again after a reset, it is imperative
176
* that you clone the tokens when you store them away (on the first pass) as
177
* well as when you return them (on future passes after {@link #reset()}).
179
public void reset() throws IOException {}
181
/** Releases resources associated with this stream. */
182
public void close() throws IOException {}