1
package org.apache.lucene.analysis;
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
20
import org.apache.lucene.analysis.tokenattributes.TermAttributeImpl;
21
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
22
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
23
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
24
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
25
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
26
import org.apache.lucene.index.Payload;
27
import org.apache.lucene.index.TermPositions; // for javadoc
28
import org.apache.lucene.util.Attribute;
29
import org.apache.lucene.util.AttributeSource;
30
import org.apache.lucene.util.AttributeImpl;
31
import org.apache.lucene.util.AttributeReflector;
/**
  A Token is an occurrence of a term from the text of a field.  It consists of
  a term's text, the start and end offset of the term in the text of the field,
  and a type string.
  <p>
  The start and end offsets permit applications to re-associate a token with
  its source text, e.g., to display highlighted query terms in a document
  browser, or to show matching text fragments in a <abbr title="KeyWord In Context">KWIC</abbr>
  display, etc.
  <p>
  The type is a string, assigned by a lexical analyzer
  (a.k.a. tokenizer), naming the lexical or syntactic class that the token
  belongs to.  For example an end of sentence marker token might be implemented
  with type "eos".  The default token type is "word".
  <p>
  A Token can optionally have metadata (a.k.a. Payload) in the form of a variable
  length byte array. Use {@link TermPositions#getPayloadLength()} and
  {@link TermPositions#getPayload(byte[], int)} to retrieve the payloads from the index.
  <p>
  <b>NOTE:</b> As of 2.9, Token implements all {@link Attribute} interfaces
  that are part of core Lucene and can be found in the {@code tokenattributes} subpackage.
  Even though it is not necessary to use Token anymore, with the new TokenStream API it can
  be used as convenience class that implements all {@link Attribute}s, which is especially useful
  to easily switch from the old to the new TokenStream API.
  <p>
  Tokenizers and TokenFilters should try to re-use a Token
  instance when possible for best performance, by
  implementing the {@link TokenStream#incrementToken()} API.
  Failing that, to create a new Token you should first use
  one of the constructors that starts with null text.  To load
  the token from a char[] use {@link #copyBuffer(char[], int, int)}.
  To load from a String use {@link #setEmpty} followed by {@link #append(CharSequence)} or {@link #append(CharSequence, int, int)}.
  Alternatively you can get the Token's termBuffer by calling either {@link #buffer()},
  if you know that your text is shorter than the capacity of the termBuffer
  or {@link #resizeBuffer(int)}, if there is any possibility
  that you may need to grow the buffer. Fill in the characters of your term into this
  buffer, with {@link String#getChars(int, int, char[], int)} if loading from a string,
  or with {@link System#arraycopy(Object, int, Object, int, int)}, and finally call {@link #setLength(int)} to
  set the length of the term text.  See <a target="_top"
  href="https://issues.apache.org/jira/browse/LUCENE-969">LUCENE-969</a>
  for details.
  <p>Typical Token reuse patterns:
  <ul>
  <li> Copying text from a string (type is reset to {@link #DEFAULT_TYPE} if not specified):<br/>
  <pre>
    return reusableToken.reinit(string, startOffset, endOffset[, type]);
  </pre>
  </li>
  <li> Copying some text from a string (type is reset to {@link #DEFAULT_TYPE} if not specified):<br/>
  <pre>
    return reusableToken.reinit(string, 0, string.length(), startOffset, endOffset[, type]);
  </pre>
  </li>
  <li> Copying text from char[] buffer (type is reset to {@link #DEFAULT_TYPE} if not specified):<br/>
  <pre>
    return reusableToken.reinit(buffer, 0, buffer.length, startOffset, endOffset[, type]);
  </pre>
  </li>
  <li> Copying some text from a char[] buffer (type is reset to {@link #DEFAULT_TYPE} if not specified):<br/>
  <pre>
    return reusableToken.reinit(buffer, start, end - start, startOffset, endOffset[, type]);
  </pre>
  </li>
  <li> Copying from one Token to another (type is reset to {@link #DEFAULT_TYPE} if not specified):<br/>
  <pre>
    return reusableToken.reinit(source.buffer(), 0, source.length(), source.startOffset(), source.endOffset()[, source.type()]);
  </pre>
  </li>
  </ul>
  A few things to note:
  <ul>
  <li>clear() initializes all of the fields to default values. This was changed in contrast to Lucene 2.4, but should affect no one.</li>
  <li>Because <code>TokenStreams</code> can be chained, one cannot assume that the <code>Token's</code> current type is correct.</li>
  <li>The startOffset and endOffset represent the start and end offset in the source text, so be careful in adjusting them.</li>
  <li>When caching a reusable token, clone it. When injecting a cached token into a stream that can be reset, clone it again.</li>
  </ul>
  <p>
  <b>Please note:</b> With Lucene 3.1, the <code>{@linkplain #toString toString()}</code> method had to be changed to match the
  {@link CharSequence} interface introduced by the interface {@link org.apache.lucene.analysis.tokenattributes.CharTermAttribute}.
  This method now only prints the term text, no additional information anymore.
  </p>
  @see org.apache.lucene.index.Payload
*/
// TODO: change superclass to CharTermAttribute in 4.0! Maybe deprecate the whole class?
123
public class Token extends TermAttributeImpl
124
implements TypeAttribute, PositionIncrementAttribute,
125
FlagsAttribute, OffsetAttribute, PayloadAttribute {
127
private int startOffset,endOffset;
128
private String type = DEFAULT_TYPE;
130
private Payload payload;
131
private int positionIncrement = 1;
133
/** Constructs a Token will null text. */
137
/** Constructs a Token with null text and start & end
139
* @param start start offset in the source text
140
* @param end end offset in the source text */
141
public Token(int start, int end) {
146
/** Constructs a Token with null text and start & end
147
* offsets plus the Token type.
148
* @param start start offset in the source text
149
* @param end end offset in the source text
150
* @param typ the lexical type of this Token */
151
public Token(int start, int end, String typ) {
158
* Constructs a Token with null text and start & end
159
* offsets plus flags. NOTE: flags is EXPERIMENTAL.
160
* @param start start offset in the source text
161
* @param end end offset in the source text
162
* @param flags The bits to set for this token
164
public Token(int start, int end, int flags) {
170
/** Constructs a Token with the given term text, and start
171
* & end offsets. The type defaults to "word."
172
* <b>NOTE:</b> for better indexing speed you should
173
* instead use the char[] termBuffer methods to set the
175
* @param text term text
176
* @param start start offset
177
* @param end end offset
179
public Token(String text, int start, int end) {
185
/** Constructs a Token with the given text, start and end
186
* offsets, & type. <b>NOTE:</b> for better indexing
187
* speed you should instead use the char[] termBuffer
188
* methods to set the term text.
189
* @param text term text
190
* @param start start offset
191
* @param end end offset
192
* @param typ token type
194
public Token(String text, int start, int end, String typ) {
202
* Constructs a Token with the given text, start and end
203
* offsets, & type. <b>NOTE:</b> for better indexing
204
* speed you should instead use the char[] termBuffer
205
* methods to set the term text.
209
* @param flags token type bits
211
public Token(String text, int start, int end, int flags) {
219
* Constructs a Token with the given term buffer (offset
220
* & length), start and end
222
* @param startTermBuffer
223
* @param termBufferOffset
224
* @param termBufferLength
228
public Token(char[] startTermBuffer, int termBufferOffset, int termBufferLength, int start, int end) {
229
copyBuffer(startTermBuffer, termBufferOffset, termBufferLength);
234
/** Set the position increment. This determines the position of this token
235
* relative to the previous Token in a {@link TokenStream}, used in phrase
238
* <p>The default value is one.
240
* <p>Some common uses for this are:<ul>
242
* <li>Set it to zero to put multiple terms in the same position. This is
243
* useful if, e.g., a word has multiple stems. Searches for phrases
244
* including either stem will match. In this case, all but the first stem's
245
* increment should be set to zero: the increment of the first instance
246
* should be one. Repeating a token with an increment of zero can also be
247
* used to boost the scores of matches on that token.
249
* <li>Set it to values greater than one to inhibit exact phrase matches.
250
* If, for example, one does not want phrases to match across removed stop
251
* words, then one could build a stop word filter that removes stop words and
252
* also sets the increment to the number of stop words removed before each
253
* non-stop word. Then exact phrase queries will only match when the terms
254
* occur with no intervening stop words.
257
* @param positionIncrement the distance from the prior term
258
* @see org.apache.lucene.index.TermPositions
260
public void setPositionIncrement(int positionIncrement) {
261
if (positionIncrement < 0)
262
throw new IllegalArgumentException
263
("Increment must be zero or greater: " + positionIncrement);
264
this.positionIncrement = positionIncrement;
267
/** Returns the position increment of this Token.
268
* @see #setPositionIncrement
270
public int getPositionIncrement() {
271
return positionIncrement;
274
/** Returns this Token's starting offset, the position of the first character
275
corresponding to this token in the source text.
277
Note that the difference between endOffset() and startOffset() may not be
278
equal to {@link #length}, as the term text may have been altered by a
279
stemmer or some other filter. */
280
public final int startOffset() {
284
/** Set the starting offset.
285
@see #startOffset() */
286
public void setStartOffset(int offset) {
287
this.startOffset = offset;
290
/** Returns this Token's ending offset, one greater than the position of the
291
last character corresponding to this token in the source text. The length
292
of the token in the source text is (endOffset - startOffset). */
293
public final int endOffset() {
297
/** Set the ending offset.
299
public void setEndOffset(int offset) {
300
this.endOffset = offset;
303
/** Set the starting and ending offset.
304
@see #startOffset() and #endOffset()*/
305
public void setOffset(int startOffset, int endOffset) {
306
this.startOffset = startOffset;
307
this.endOffset = endOffset;
310
/** Returns this Token's lexical type. Defaults to "word". */
311
public final String type() {
315
/** Set the lexical type.
317
public final void setType(String type) {
324
* Get the bitset for any bits that have been set. This is completely distinct from {@link #type()}, although they do share similar purposes.
325
* The flags can be used to encode information about the token for use by other {@link org.apache.lucene.analysis.TokenFilter}s.
329
* @lucene.experimental While we think this is here to stay, we may want to change it to be a long.
331
public int getFlags() {
338
public void setFlags(int flags) {
343
* Returns this Token's payload.
345
public Payload getPayload() {
350
* Sets this Token's payload.
352
public void setPayload(Payload payload) {
353
this.payload = payload;
356
/** Resets the term text, payload, flags, and positionIncrement,
357
* startOffset, endOffset and token type to default.
360
public void clear() {
363
positionIncrement = 1;
365
startOffset = endOffset = 0;
370
public Object clone() {
371
Token t = (Token)super.clone();
373
if (payload != null) {
374
t.payload = (Payload) payload.clone();
379
/** Makes a clone, but replaces the term buffer &
380
* start/end offset in the process. This is more
381
* efficient than doing a full clone (and then calling
382
* {@link #copyBuffer}) because it saves a wasted copy of the old
384
public Token clone(char[] newTermBuffer, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset) {
385
final Token t = new Token(newTermBuffer, newTermOffset, newTermLength, newStartOffset, newEndOffset);
386
t.positionIncrement = positionIncrement;
390
t.payload = (Payload) payload.clone();
395
public boolean equals(Object obj) {
399
if (obj instanceof Token) {
400
final Token other = (Token) obj;
401
return (startOffset == other.startOffset &&
402
endOffset == other.endOffset &&
403
flags == other.flags &&
404
positionIncrement == other.positionIncrement &&
405
(type == null ? other.type == null : type.equals(other.type)) &&
406
(payload == null ? other.payload == null : payload.equals(other.payload)) &&
414
public int hashCode() {
415
int code = super.hashCode();
416
code = code * 31 + startOffset;
417
code = code * 31 + endOffset;
418
code = code * 31 + flags;
419
code = code * 31 + positionIncrement;
421
code = code * 31 + type.hashCode();
423
code = code * 31 + payload.hashCode();
427
// like clear() but doesn't clear termBuffer/text
428
private void clearNoTermBuffer() {
430
positionIncrement = 1;
432
startOffset = endOffset = 0;
436
/** Shorthand for calling {@link #clear},
437
* {@link #copyBuffer(char[], int, int)},
438
* {@link #setStartOffset},
439
* {@link #setEndOffset},
441
* @return this Token instance */
442
public Token reinit(char[] newTermBuffer, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset, String newType) {
444
copyBuffer(newTermBuffer, newTermOffset, newTermLength);
446
positionIncrement = 1;
447
startOffset = newStartOffset;
448
endOffset = newEndOffset;
453
/** Shorthand for calling {@link #clear},
454
* {@link #copyBuffer(char[], int, int)},
455
* {@link #setStartOffset},
456
* {@link #setEndOffset}
457
* {@link #setType} on Token.DEFAULT_TYPE
458
* @return this Token instance */
459
public Token reinit(char[] newTermBuffer, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset) {
461
copyBuffer(newTermBuffer, newTermOffset, newTermLength);
462
startOffset = newStartOffset;
463
endOffset = newEndOffset;
468
/** Shorthand for calling {@link #clear},
469
* {@link #append(CharSequence)},
470
* {@link #setStartOffset},
471
* {@link #setEndOffset}
473
* @return this Token instance */
474
public Token reinit(String newTerm, int newStartOffset, int newEndOffset, String newType) {
477
startOffset = newStartOffset;
478
endOffset = newEndOffset;
483
/** Shorthand for calling {@link #clear},
484
* {@link #append(CharSequence, int, int)},
485
* {@link #setStartOffset},
486
* {@link #setEndOffset}
488
* @return this Token instance */
489
public Token reinit(String newTerm, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset, String newType) {
491
append(newTerm, newTermOffset, newTermOffset + newTermLength);
492
startOffset = newStartOffset;
493
endOffset = newEndOffset;
498
/** Shorthand for calling {@link #clear},
499
* {@link #append(CharSequence)},
500
* {@link #setStartOffset},
501
* {@link #setEndOffset}
502
* {@link #setType} on Token.DEFAULT_TYPE
503
* @return this Token instance */
504
public Token reinit(String newTerm, int newStartOffset, int newEndOffset) {
507
startOffset = newStartOffset;
508
endOffset = newEndOffset;
513
/** Shorthand for calling {@link #clear},
514
* {@link #append(CharSequence, int, int)},
515
* {@link #setStartOffset},
516
* {@link #setEndOffset}
517
* {@link #setType} on Token.DEFAULT_TYPE
518
* @return this Token instance */
519
public Token reinit(String newTerm, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset) {
521
append(newTerm, newTermOffset, newTermOffset + newTermLength);
522
startOffset = newStartOffset;
523
endOffset = newEndOffset;
529
* Copy the prototype token's fields into this one. Note: Payloads are shared.
532
public void reinit(Token prototype) {
533
copyBuffer(prototype.buffer(), 0, prototype.length());
534
positionIncrement = prototype.positionIncrement;
535
flags = prototype.flags;
536
startOffset = prototype.startOffset;
537
endOffset = prototype.endOffset;
538
type = prototype.type;
539
payload = prototype.payload;
543
* Copy the prototype token's fields into this one, with a different term. Note: Payloads are shared.
547
public void reinit(Token prototype, String newTerm) {
548
setEmpty().append(newTerm);
549
positionIncrement = prototype.positionIncrement;
550
flags = prototype.flags;
551
startOffset = prototype.startOffset;
552
endOffset = prototype.endOffset;
553
type = prototype.type;
554
payload = prototype.payload;
558
* Copy the prototype token's fields into this one, with a different term. Note: Payloads are shared.
560
* @param newTermBuffer
564
public void reinit(Token prototype, char[] newTermBuffer, int offset, int length) {
565
copyBuffer(newTermBuffer, offset, length);
566
positionIncrement = prototype.positionIncrement;
567
flags = prototype.flags;
568
startOffset = prototype.startOffset;
569
endOffset = prototype.endOffset;
570
type = prototype.type;
571
payload = prototype.payload;
575
public void copyTo(AttributeImpl target) {
576
if (target instanceof Token) {
577
final Token to = (Token) target;
579
// reinit shares the payload, so clone it:
580
if (payload !=null) {
581
to.payload = (Payload) payload.clone();
584
super.copyTo(target);
585
((OffsetAttribute) target).setOffset(startOffset, endOffset);
586
((PositionIncrementAttribute) target).setPositionIncrement(positionIncrement);
587
((PayloadAttribute) target).setPayload((payload == null) ? null : (Payload) payload.clone());
588
((FlagsAttribute) target).setFlags(flags);
589
((TypeAttribute) target).setType(type);
594
public void reflectWith(AttributeReflector reflector) {
595
super.reflectWith(reflector);
596
reflector.reflect(OffsetAttribute.class, "startOffset", startOffset);
597
reflector.reflect(OffsetAttribute.class, "endOffset", endOffset);
598
reflector.reflect(PositionIncrementAttribute.class, "positionIncrement", positionIncrement);
599
reflector.reflect(PayloadAttribute.class, "payload", payload);
600
reflector.reflect(FlagsAttribute.class, "flags", flags);
601
reflector.reflect(TypeAttribute.class, "type", type);
604
/** Convenience factory that returns <code>Token</code> as implementation for the basic
605
* attributes and return the default impl (with "Impl" appended) for all other
609
public static final AttributeSource.AttributeFactory TOKEN_ATTRIBUTE_FACTORY =
610
new TokenAttributeFactory(AttributeSource.AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
612
/** <b>Expert:</b> Creates a TokenAttributeFactory returning {@link Token} as instance for the basic attributes
613
* and for all other attributes calls the given delegate factory.
616
public static final class TokenAttributeFactory extends AttributeSource.AttributeFactory {
618
private final AttributeSource.AttributeFactory delegate;
620
/** <b>Expert</b>: Creates an AttributeFactory returning {@link Token} as instance for the basic attributes
621
* and for all other attributes calls the given delegate factory. */
622
public TokenAttributeFactory(AttributeSource.AttributeFactory delegate) {
623
this.delegate = delegate;
627
public AttributeImpl createAttributeInstance(Class<? extends Attribute> attClass) {
628
return attClass.isAssignableFrom(Token.class)
629
? new Token() : delegate.createAttributeInstance(attClass);
633
public boolean equals(Object other) {
634
if (this == other) return true;
635
if (other instanceof TokenAttributeFactory) {
636
final TokenAttributeFactory af = (TokenAttributeFactory) other;
637
return this.delegate.equals(af.delegate);
643
public int hashCode() {
644
return delegate.hashCode() ^ 0x0a45aa31;