1
package org.apache.lucene.analysis;
4
* Licensed to the Apache Software Foundation (ASF) under one or more
5
* contributor license agreements. See the NOTICE file distributed with
6
* this work for additional information regarding copyright ownership.
7
* The ASF licenses this file to You under the Apache License, Version 2.0
8
* (the "License"); you may not use this file except in compliance with
9
* the License. You may obtain a copy of the License at
11
* http://www.apache.org/licenses/LICENSE-2.0
13
* Unless required by applicable law or agreed to in writing, software
14
* distributed under the License is distributed on an "AS IS" BASIS,
15
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
* See the License for the specific language governing permissions and
17
* limitations under the License.
20
import java.io.IOException;
21
import java.io.Reader;
22
import java.io.StringReader;
24
import org.apache.lucene.util.Version;
27
* Test case for {@link CharTokenizer} subclasses
29
public class TestCharTokenizers extends BaseTokenStreamTestCase {
32
* test to read surrogate pairs without losing the pairing
33
* if the surrogate pair is at the border of the internal IO buffer
35
// Builds a long input out of repeated "<surrogate pair>abc" chunks, places an extra
// surrogate pair across char offsets 1023/1024, and checks that LowerCaseTokenizer
// yields the lower-cased input split on single spaces.
public void testReadSupplementaryChars() throws IOException {
36
StringBuilder builder = new StringBuilder();
37
// create random-length input; the count is scaled by RANDOM_MULTIPLIER
38
int num = 1024 + random.nextInt(1024);
39
num *= RANDOM_MULTIPLIER;
40
// starts at 1, so the loop runs num - 1 times
for (int i = 1; i < num; i++) {
41
builder.append("\ud801\udc1cabc");
45
// internal buffer size is 1024; make sure we have a surrogate pair right at the border
// (inserting at 1023 puts the high surrogate at index 1023 and the low surrogate at 1024)
46
builder.insert(1023, "\ud801\udc1c");
47
LowerCaseTokenizer tokenizer = new LowerCaseTokenizer(
48
TEST_VERSION_CURRENT, new StringReader(builder.toString()));
49
// expected tokens: the lower-cased input split on single spaces
assertTokenStreamContents(tokenizer, builder.toString().toLowerCase().split(" "));
53
* test that the TermAttribute buffer is extended correctly internally. If the internal
54
* algorithm that extends the size of the char array only extends by 1 char and the
55
* next char to be filled in is a supplementary codepoint (using 2 chars) an
56
* index-out-of-bounds exception is triggered.
58
// Runs 40 rounds with inputs built by an inner loop of growing length and expects
// LowerCaseTokenizer to return each lower-cased input as one single token.
public void testExtendCharBuffer() throws IOException {
59
for (int i = 0; i < 40; i++) {
60
StringBuilder builder = new StringBuilder();
61
// inner loop bound grows by one each round (1 iteration for i = 0, 40 for i = 39)
for (int j = 0; j < 1+i; j++) {
64
// the input carries a surrogate pair, i.e. one code point that occupies two chars
builder.append("\ud801\udc1cabc");
65
LowerCaseTokenizer tokenizer = new LowerCaseTokenizer(
66
TEST_VERSION_CURRENT, new StringReader(builder.toString()));
67
// the whole lower-cased input is expected back as a single token
assertTokenStreamContents(tokenizer, new String[] {builder.toString().toLowerCase()});
72
* tests the max word length of 255 - the tokenizer will split at the 255th char no matter what happens
74
// Feeds the builder content twice, back-to-back, to LowerCaseTokenizer and expects
// exactly two tokens, each equal to one lower-cased copy of that content.
// NOTE(review): what the 255-iteration loop appends is not shown here -- confirm.
public void testMaxWordLength() throws IOException {
75
StringBuilder builder = new StringBuilder();
77
for (int i = 0; i < 255; i++) {
80
LowerCaseTokenizer tokenizer = new LowerCaseTokenizer(
81
// the reader sees the same content twice with no separator in between
TEST_VERSION_CURRENT, new StringReader(builder.toString() + builder.toString()));
82
// two tokens expected, one per copy of the content
assertTokenStreamContents(tokenizer, new String[] {builder.toString().toLowerCase(), builder.toString().toLowerCase()});
86
* tests the max word length of 255 with a surrogate pair at position 255
88
// Variant of the max-word-length check whose input contains a surrogate pair: the
// builder content is fed twice, back-to-back, and two tokens are expected, each equal
// to one lower-cased copy of that content.
// NOTE(review): what the 254-iteration loop appends is not shown here -- confirm.
public void testMaxWordLengthWithSupplementary() throws IOException {
89
StringBuilder builder = new StringBuilder();
91
for (int i = 0; i < 254; i++) {
94
// surrogate pair: one code point (U+1041C) occupying two chars
builder.append("\ud801\udc1c");
95
LowerCaseTokenizer tokenizer = new LowerCaseTokenizer(
96
// the reader sees the same content twice with no separator in between
TEST_VERSION_CURRENT, new StringReader(builder.toString() + builder.toString()));
97
// two tokens expected, one per copy of the content
assertTokenStreamContents(tokenizer, new String[] {builder.toString().toLowerCase(), builder.toString().toLowerCase()});
100
// Tokenizes "Tokenizer <U+1041C>test" with TEST_VERSION_CURRENT and expects two
// lower-cased tokens; in the second one the supplementary character U+1041C is kept
// and appears as U+10444.
public void testLowerCaseTokenizer() throws IOException {
101
StringReader reader = new StringReader("Tokenizer \ud801\udc1ctest");
102
LowerCaseTokenizer tokenizer = new LowerCaseTokenizer(TEST_VERSION_CURRENT,
104
assertTokenStreamContents(tokenizer, new String[] { "tokenizer",
105
// second token starts with the surrogate pair for U+10444
"\ud801\udc44test" });
108
// Tokenizes "Tokenizer <U+1041C>test" with Version.LUCENE_30 and expects the tokens
// "tokenizer" and "test"; the supplementary character is absent from the expected
// tokens under this version.
public void testLowerCaseTokenizerBWCompat() throws IOException {
109
StringReader reader = new StringReader("Tokenizer \ud801\udc1ctest");
110
LowerCaseTokenizer tokenizer = new LowerCaseTokenizer(Version.LUCENE_30,
112
assertTokenStreamContents(tokenizer, new String[] { "tokenizer", "test" });
115
// Tokenizes "Tokenizer <U+1041C>test" with WhitespaceTokenizer under
// TEST_VERSION_CURRENT and expects the two space-separated pieces back unchanged,
// surrogate pair included.
public void testWhitespaceTokenizer() throws IOException {
116
StringReader reader = new StringReader("Tokenizer \ud801\udc1ctest");
117
WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT,
119
assertTokenStreamContents(tokenizer, new String[] { "Tokenizer",
120
// second token keeps the original surrogate pair (no case change)
"\ud801\udc1ctest" });
123
// Tokenizes "Tokenizer <U+1041C>test" with WhitespaceTokenizer under
// Version.LUCENE_30 and expects the two space-separated pieces back unchanged,
// surrogate pair included.
public void testWhitespaceTokenizerBWCompat() throws IOException {
124
StringReader reader = new StringReader("Tokenizer \ud801\udc1ctest");
125
WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_30,
127
assertTokenStreamContents(tokenizer, new String[] { "Tokenizer",
128
// second token keeps the original surrogate pair
"\ud801\udc1ctest" });
131
// Constructs TestingCharTokenizer with Version.LUCENE_30, then expects an
// IllegalArgumentException when constructing it with TEST_VERSION_CURRENT.
public void testIsTokenCharCharInSubclass() {
132
new TestingCharTokenizer(Version.LUCENE_30, new StringReader(""));
134
new TestingCharTokenizer(TEST_VERSION_CURRENT, new StringReader(""));
135
// only reached when the constructor call above did not throw
fail("version 3.1 is not permitted if char based method is implemented");
136
// the IllegalArgumentException is the outcome this test is looking for
} catch (IllegalArgumentException e) {
141
// Constructs TestingCharTokenizerNormalize with Version.LUCENE_30, then expects an
// IllegalArgumentException when constructing it with TEST_VERSION_CURRENT.
public void testNormalizeCharInSubclass() {
142
new TestingCharTokenizerNormalize(Version.LUCENE_30, new StringReader(""));
144
new TestingCharTokenizerNormalize(TEST_VERSION_CURRENT,
145
new StringReader(""));
146
// only reached when the constructor call above did not throw
fail("version 3.1 is not permitted if char based method is implemented");
147
// the IllegalArgumentException is the outcome this test is looking for
} catch (IllegalArgumentException e) {
152
// Constructs TestingCharTokenizerNormalizeIsTokenChar with Version.LUCENE_30, then
// expects an IllegalArgumentException when constructing it with TEST_VERSION_CURRENT.
public void testNormalizeAndIsTokenCharCharInSubclass() {
153
new TestingCharTokenizerNormalizeIsTokenChar(Version.LUCENE_30,
154
new StringReader(""));
156
new TestingCharTokenizerNormalizeIsTokenChar(TEST_VERSION_CURRENT,
157
new StringReader(""));
158
// only reached when the constructor call above did not throw
fail("version 3.1 is not permitted if char based method is implemented");
159
// the IllegalArgumentException is the outcome this test is looking for
} catch (IllegalArgumentException e) {
164
// CharTokenizer subclass used by testIsTokenCharCharInSubclass. It implements both the
// int-based isTokenChar and the deprecated char-based isTokenChar; both accept letters
// only, as decided by Character.isLetter.
static final class TestingCharTokenizer extends CharTokenizer {
165
public TestingCharTokenizer(Version matchVersion, Reader input) {
166
super(matchVersion, input);
170
// code-point (int) based variant
protected boolean isTokenChar(int c) {
171
return Character.isLetter(c);
174
// deprecated char based variant
@Deprecated @Override
175
protected boolean isTokenChar(char c) {
176
return Character.isLetter(c);
180
// CharTokenizer subclass used by testNormalizeCharInSubclass. It declares both the
// deprecated char-based normalize and the int-based normalize.
// NOTE(review): the bodies of both normalize methods are not shown here -- confirm
// what they return.
static final class TestingCharTokenizerNormalize extends CharTokenizer {
181
public TestingCharTokenizerNormalize(Version matchVersion, Reader input) {
182
super(matchVersion, input);
185
// deprecated char based variant
@Deprecated @Override
186
protected char normalize(char c) {
191
// code-point (int) based variant
protected int normalize(int c) {
196
// CharTokenizer subclass used by testNormalizeAndIsTokenCharCharInSubclass. It declares
// the deprecated char-based and the int-based variants of both normalize and
// isTokenChar; the isTokenChar variants accept letters only via Character.isLetter.
// NOTE(review): the bodies of both normalize methods are not shown here -- confirm
// what they return.
static final class TestingCharTokenizerNormalizeIsTokenChar extends CharTokenizer {
197
public TestingCharTokenizerNormalizeIsTokenChar(Version matchVersion,
199
super(matchVersion, input);
202
// deprecated char based normalize
@Deprecated @Override
203
protected char normalize(char c) {
208
// code-point (int) based normalize
protected int normalize(int c) {
213
// code-point (int) based isTokenChar
protected boolean isTokenChar(int c) {
214
return Character.isLetter(c);
217
// deprecated char based isTokenChar
@Deprecated @Override
218
protected boolean isTokenChar(char c) {
219
return Character.isLetter(c);