1
package org.apache.lucene.analysis;
4
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
20
import java.io.IOException;
21
import java.io.StringReader;
22
import java.io.Reader;
24
import org.apache.lucene.analysis.standard.StandardTokenizer;
25
import org.apache.lucene.analysis.standard.StandardAnalyzer;
26
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
27
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
28
import org.apache.lucene.index.Payload;
30
public class TestAnalyzers extends BaseTokenStreamTestCase {
32
public void testSimple() throws Exception {
33
Analyzer a = new SimpleAnalyzer(TEST_VERSION_CURRENT);
34
assertAnalyzesTo(a, "foo bar FOO BAR",
35
new String[] { "foo", "bar", "foo", "bar" });
36
assertAnalyzesTo(a, "foo bar . FOO <> BAR",
37
new String[] { "foo", "bar", "foo", "bar" });
38
assertAnalyzesTo(a, "foo.bar.FOO.BAR",
39
new String[] { "foo", "bar", "foo", "bar" });
40
assertAnalyzesTo(a, "U.S.A.",
41
new String[] { "u", "s", "a" });
42
assertAnalyzesTo(a, "C++",
43
new String[] { "c" });
44
assertAnalyzesTo(a, "B2B",
45
new String[] { "b", "b" });
46
assertAnalyzesTo(a, "2B",
47
new String[] { "b" });
48
assertAnalyzesTo(a, "\"QUOTED\" word",
49
new String[] { "quoted", "word" });
52
public void testNull() throws Exception {
53
Analyzer a = new WhitespaceAnalyzer(TEST_VERSION_CURRENT);
54
assertAnalyzesTo(a, "foo bar FOO BAR",
55
new String[] { "foo", "bar", "FOO", "BAR" });
56
assertAnalyzesTo(a, "foo bar . FOO <> BAR",
57
new String[] { "foo", "bar", ".", "FOO", "<>", "BAR" });
58
assertAnalyzesTo(a, "foo.bar.FOO.BAR",
59
new String[] { "foo.bar.FOO.BAR" });
60
assertAnalyzesTo(a, "U.S.A.",
61
new String[] { "U.S.A." });
62
assertAnalyzesTo(a, "C++",
63
new String[] { "C++" });
64
assertAnalyzesTo(a, "B2B",
65
new String[] { "B2B" });
66
assertAnalyzesTo(a, "2B",
67
new String[] { "2B" });
68
assertAnalyzesTo(a, "\"QUOTED\" word",
69
new String[] { "\"QUOTED\"", "word" });
72
public void testStop() throws Exception {
73
Analyzer a = new StopAnalyzer(TEST_VERSION_CURRENT);
74
assertAnalyzesTo(a, "foo bar FOO BAR",
75
new String[] { "foo", "bar", "foo", "bar" });
76
assertAnalyzesTo(a, "foo a bar such FOO THESE BAR",
77
new String[] { "foo", "bar", "foo", "bar" });
80
void verifyPayload(TokenStream ts) throws IOException {
81
PayloadAttribute payloadAtt = ts.getAttribute(PayloadAttribute.class);
83
boolean hasNext = ts.incrementToken();
85
// System.out.println("id="+System.identityHashCode(nextToken) + " " + t);
86
// System.out.println("payload=" + (int)nextToken.getPayload().toByteArray()[0]);
87
assertEquals(b, payloadAtt.getPayload().toByteArray()[0]);
91
// Make sure old style next() calls result in a new copy of payloads
92
public void testPayloadCopy() throws IOException {
93
String s = "how now brown cow";
95
ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(s));
96
ts = new PayloadSetter(ts);
99
ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(s));
100
ts = new PayloadSetter(ts);
104
// LUCENE-1150: Just a compile time test, to ensure the
105
// StandardAnalyzer constants remain publicly accessible
106
@SuppressWarnings("unused")
107
public void _testStandardConstants() {
108
int x = StandardTokenizer.ALPHANUM;
109
x = StandardTokenizer.APOSTROPHE;
110
x = StandardTokenizer.ACRONYM;
111
x = StandardTokenizer.COMPANY;
112
x = StandardTokenizer.EMAIL;
113
x = StandardTokenizer.HOST;
114
x = StandardTokenizer.NUM;
115
x = StandardTokenizer.CJ;
116
String[] y = StandardTokenizer.TOKEN_TYPES;
119
private static class LowerCaseWhitespaceAnalyzer extends Analyzer {
122
public TokenStream tokenStream(String fieldName, Reader reader) {
123
return new LowerCaseFilter(TEST_VERSION_CURRENT,
124
new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader));
130
* @deprecated remove this when lucene 3.0 "broken unicode 4" support
131
* is no longer needed.
134
private static class LowerCaseWhitespaceAnalyzerBWComp extends Analyzer {
137
public TokenStream tokenStream(String fieldName, Reader reader) {
138
return new LowerCaseFilter(new WhitespaceTokenizer(reader));
144
* Test that LowercaseFilter handles entire unicode range correctly
146
public void testLowerCaseFilter() throws IOException {
147
Analyzer a = new LowerCaseWhitespaceAnalyzer();
149
assertAnalyzesTo(a, "AbaCaDabA", new String[] { "abacadaba" });
151
assertAnalyzesTo(a, "\ud801\udc16\ud801\udc16\ud801\udc16\ud801\udc16",
152
new String[] {"\ud801\udc3e\ud801\udc3e\ud801\udc3e\ud801\udc3e"});
153
assertAnalyzesTo(a, "AbaCa\ud801\udc16DabA",
154
new String[] { "abaca\ud801\udc3edaba" });
155
// unpaired lead surrogate
156
assertAnalyzesTo(a, "AbaC\uD801AdaBa",
157
new String [] { "abac\uD801adaba" });
158
// unpaired trail surrogate
159
assertAnalyzesTo(a, "AbaC\uDC16AdaBa",
160
new String [] { "abac\uDC16adaba" });
164
* Test that LowercaseFilter handles the lowercasing correctly if the term
165
* buffer has a trailing surrogate character leftover and the current term in
166
* the buffer ends with a corresponding leading surrogate.
168
public void testLowerCaseFilterLowSurrogateLeftover() throws IOException {
169
// test if the limit of the termbuffer is correctly used with supplementary
171
WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT,
172
new StringReader("BogustermBogusterm\udc16"));
173
LowerCaseFilter filter = new LowerCaseFilter(TEST_VERSION_CURRENT,
175
assertTokenStreamContents(filter, new String[] {"bogustermbogusterm\udc16"});
177
String highSurEndingUpper = "BogustermBoguster\ud801";
178
String highSurEndingLower = "bogustermboguster\ud801";
179
tokenizer.reset(new StringReader(highSurEndingUpper));
180
assertTokenStreamContents(filter, new String[] {highSurEndingLower});
181
assertTrue(filter.hasAttribute(CharTermAttribute.class));
182
char[] termBuffer = filter.getAttribute(CharTermAttribute.class).buffer();
183
int length = highSurEndingLower.length();
184
assertEquals('\ud801', termBuffer[length - 1]);
185
assertEquals('\udc3e', termBuffer[length]);
189
public void testLimitTokenCountAnalyzer() throws IOException {
190
Analyzer a = new LimitTokenCountAnalyzer(new WhitespaceAnalyzer(TEST_VERSION_CURRENT), 2);
191
// dont use assertAnalyzesTo here, as the end offset is not the end of the string!
192
assertTokenStreamContents(a.tokenStream("dummy", new StringReader("1 2 3 4 5")), new String[] { "1", "2" }, new int[] { 0, 3 }, new int[] { 1, 4 }, 4);
193
assertTokenStreamContents(a.reusableTokenStream("dummy", new StringReader("1 2 3 4 5")), new String[] { "1", "2" }, new int[] { 0, 2 }, new int[] { 1, 3 }, 3);
195
a = new LimitTokenCountAnalyzer(new StandardAnalyzer(TEST_VERSION_CURRENT), 2);
196
// dont use assertAnalyzesTo here, as the end offset is not the end of the string!
197
assertTokenStreamContents(a.tokenStream("dummy", new StringReader("1 2 3 4 5")), new String[] { "1", "2" }, new int[] { 0, 2 }, new int[] { 1, 3 }, 3);
198
assertTokenStreamContents(a.reusableTokenStream("dummy", new StringReader("1 2 3 4 5")), new String[] { "1", "2" }, new int[] { 0, 2 }, new int[] { 1, 3 }, 3);
202
* Test that LowercaseFilter only works on BMP for back compat,
203
* depending upon version
204
* @deprecated remove this test when lucene 3.0 "broken unicode 4" support
205
* is no longer needed.
208
public void testLowerCaseFilterBWComp() throws IOException {
209
Analyzer a = new LowerCaseWhitespaceAnalyzerBWComp();
211
assertAnalyzesTo(a, "AbaCaDabA", new String[] { "abacadaba" });
212
// supplementary, no-op
213
assertAnalyzesTo(a, "\ud801\udc16\ud801\udc16\ud801\udc16\ud801\udc16",
214
new String[] {"\ud801\udc16\ud801\udc16\ud801\udc16\ud801\udc16"});
215
assertAnalyzesTo(a, "AbaCa\ud801\udc16DabA",
216
new String[] { "abaca\ud801\udc16daba" });
217
// unpaired lead surrogate
218
assertAnalyzesTo(a, "AbaC\uD801AdaBa",
219
new String [] { "abac\uD801adaba" });
220
// unpaired trail surrogate
221
assertAnalyzesTo(a, "AbaC\uDC16AdaBa",
222
new String [] { "abac\uDC16adaba" });
225
/** blast some random strings through the analyzer */
226
public void testRandomStrings() throws Exception {
227
checkRandomData(random, new WhitespaceAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
228
checkRandomData(random, new SimpleAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
229
checkRandomData(random, new StopAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
233
final class PayloadSetter extends TokenFilter {
234
PayloadAttribute payloadAtt;
235
public PayloadSetter(TokenStream input) {
237
payloadAtt = addAttribute(PayloadAttribute.class);
240
byte[] data = new byte[1];
241
Payload p = new Payload(data,0,1);
244
public boolean incrementToken() throws IOException {
245
boolean hasNext = input.incrementToken();
246
if (!hasNext) return false;
247
payloadAtt.setPayload(p); // reuse the payload / byte[]
b'\\ No newline at end of file'