1
package org.apache.lucene.analysis.compound;
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import java.io.IOException;
21
import java.io.StringReader;
23
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
24
import org.apache.lucene.analysis.MockTokenizer;
25
import org.apache.lucene.analysis.TokenFilter;
26
import org.apache.lucene.analysis.TokenStream;
27
import org.apache.lucene.analysis.Tokenizer;
28
import org.apache.lucene.analysis.WhitespaceTokenizer;
29
import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
30
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
31
import org.apache.lucene.util.Attribute;
32
import org.apache.lucene.util.AttributeImpl;
33
import org.xml.sax.InputSource;
35
public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
36
public void testHyphenationCompoundWordsDA() throws Exception {
37
String[] dict = { "læse", "hest" };
39
InputSource is = new InputSource(getClass().getResource("da_UTF8.xml").toExternalForm());
40
HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter
41
.getHyphenationTree(is);
43
HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(TEST_VERSION_CURRENT,
44
new MockTokenizer(new StringReader("min veninde som er lidt af en læsehest"), MockTokenizer.WHITESPACE, false),
46
dict, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
47
CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
48
CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
49
assertTokenStreamContents(tf,
50
new String[] { "min", "veninde", "som", "er", "lidt", "af", "en", "læsehest", "læse", "hest" },
51
new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0 }
55
public void testHyphenationCompoundWordsDELongestMatch() throws Exception {
56
String[] dict = { "basketball", "basket", "ball", "kurv" };
58
InputSource is = new InputSource(getClass().getResource("da_UTF8.xml").toExternalForm());
59
HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter
60
.getHyphenationTree(is);
62
// the word basket will not be added due to the longest match option
63
HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(TEST_VERSION_CURRENT,
64
new MockTokenizer(new StringReader("basketballkurv"), MockTokenizer.WHITESPACE, false),
66
CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
67
CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, 40, true);
68
assertTokenStreamContents(tf,
69
new String[] { "basketballkurv", "basketball", "ball", "kurv" },
70
new int[] { 1, 0, 0, 0 }
76
* With hyphenation-only, you can get a lot of nonsense tokens.
77
* This can be controlled with the min/max subword size.
79
public void testHyphenationOnly() throws Exception {
80
InputSource is = new InputSource(getClass().getResource("da_UTF8.xml").toExternalForm());
81
HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter
82
.getHyphenationTree(is);
84
HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(
86
new MockTokenizer(new StringReader("basketballkurv"), MockTokenizer.WHITESPACE, false),
88
CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
92
assertTokenStreamContents(tf,
93
new String[] { "basketballkurv", "ba", "sket", "bal", "ball", "kurv" }
96
tf = new HyphenationCompoundWordTokenFilter(
98
new MockTokenizer(new StringReader("basketballkurv"), MockTokenizer.WHITESPACE, false),
100
CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
104
assertTokenStreamContents(tf,
105
new String[] { "basketballkurv", "basket", "sket", "ball", "lkurv", "kurv" }
108
tf = new HyphenationCompoundWordTokenFilter(
109
TEST_VERSION_CURRENT,
110
new MockTokenizer(new StringReader("basketballkurv"), MockTokenizer.WHITESPACE, false),
112
CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
116
assertTokenStreamContents(tf,
117
new String[] { "basketballkurv", "basket", "basketbal", "basketball", "sket",
118
"sketbal", "sketball", "ball", "ballkurv", "lkurv", "kurv" }
123
public void testDumbCompoundWordsSE() throws Exception {
124
String[] dict = { "Bil", "Dörr", "Motor", "Tak", "Borr", "Slag", "Hammar",
125
"Pelar", "Glas", "Ögon", "Fodral", "Bas", "Fiol", "Makare", "Gesäll",
126
"Sko", "Vind", "Rute", "Torkare", "Blad" };
128
DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT,
131
"Bildörr Bilmotor Biltak Slagborr Hammarborr Pelarborr Glasögonfodral Basfiolsfodral Basfiolsfodralmakaregesäll Skomakare Vindrutetorkare Vindrutetorkarblad abba"),
132
MockTokenizer.WHITESPACE, false),
135
assertTokenStreamContents(tf, new String[] { "Bildörr", "Bil", "dörr", "Bilmotor",
136
"Bil", "motor", "Biltak", "Bil", "tak", "Slagborr", "Slag", "borr",
137
"Hammarborr", "Hammar", "borr", "Pelarborr", "Pelar", "borr",
138
"Glasögonfodral", "Glas", "ögon", "fodral", "Basfiolsfodral", "Bas",
139
"fiol", "fodral", "Basfiolsfodralmakaregesäll", "Bas", "fiol",
140
"fodral", "makare", "gesäll", "Skomakare", "Sko", "makare",
141
"Vindrutetorkare", "Vind", "rute", "torkare", "Vindrutetorkarblad",
142
"Vind", "rute", "blad", "abba" }, new int[] { 0, 0, 3, 8, 8, 11, 17,
143
17, 20, 24, 24, 28, 33, 33, 39, 44, 44, 49, 54, 54, 58, 62, 69, 69, 72,
144
77, 84, 84, 87, 92, 98, 104, 111, 111, 114, 121, 121, 125, 129, 137,
145
137, 141, 151, 156 }, new int[] { 7, 3, 7, 16, 11, 16, 23, 20, 23, 32,
146
28, 32, 43, 39, 43, 53, 49, 53, 68, 58, 62, 68, 83, 72, 76, 83, 110,
147
87, 91, 98, 104, 110, 120, 114, 120, 136, 125, 129, 136, 155, 141, 145,
148
155, 160 }, new int[] { 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1,
149
0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1,
153
public void testDumbCompoundWordsSELongestMatch() throws Exception {
154
String[] dict = { "Bil", "Dörr", "Motor", "Tak", "Borr", "Slag", "Hammar",
155
"Pelar", "Glas", "Ögon", "Fodral", "Bas", "Fiols", "Makare", "Gesäll",
156
"Sko", "Vind", "Rute", "Torkare", "Blad", "Fiolsfodral" };
158
DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT,
159
new MockTokenizer(new StringReader("Basfiolsfodralmakaregesäll"), MockTokenizer.WHITESPACE, false),
160
dict, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
161
CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
162
CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, true);
164
assertTokenStreamContents(tf, new String[] { "Basfiolsfodralmakaregesäll", "Bas",
165
"fiolsfodral", "fodral", "makare", "gesäll" }, new int[] { 0, 0, 3, 8,
166
14, 20 }, new int[] { 26, 3, 14, 14, 20, 26 }, new int[] { 1, 0, 0, 0,
170
public void testTokenEndingWithWordComponentOfMinimumLength() throws Exception {
171
String[] dict = {"ab", "cd", "ef"};
173
DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT,
174
new WhitespaceTokenizer(TEST_VERSION_CURRENT,
179
CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
180
CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
181
CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
183
assertTokenStreamContents(tf,
184
new String[] { "abcdef", "ab", "cd", "ef" },
185
new int[] { 0, 0, 2, 4},
186
new int[] { 6, 2, 4, 6},
187
new int[] { 1, 0, 0, 0}
191
public void testWordComponentWithLessThanMinimumLength() throws Exception {
192
String[] dict = {"abc", "d", "efg"};
194
DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT,
195
new WhitespaceTokenizer(TEST_VERSION_CURRENT,
200
CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
201
CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
202
CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
204
// since "d" is shorter than the minimum subword size, it should not be added to the token stream
205
assertTokenStreamContents(tf,
206
new String[] { "abcdefg", "abc", "efg" },
207
new int[] { 0, 0, 4},
208
new int[] { 7, 3, 7},
213
public void testReset() throws Exception {
214
String[] dict = { "Rind", "Fleisch", "Draht", "Schere", "Gesetz",
215
"Aufgabe", "Überwachung" };
217
Tokenizer wsTokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(
218
"Rindfleischüberwachungsgesetz"));
219
DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT,
221
CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
222
CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
223
CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
225
CharTermAttribute termAtt = tf.getAttribute(CharTermAttribute.class);
226
assertTrue(tf.incrementToken());
227
assertEquals("Rindfleischüberwachungsgesetz", termAtt.toString());
228
assertTrue(tf.incrementToken());
229
assertEquals("Rind", termAtt.toString());
230
wsTokenizer.reset(new StringReader("Rindfleischüberwachungsgesetz"));
232
assertTrue(tf.incrementToken());
233
assertEquals("Rindfleischüberwachungsgesetz", termAtt.toString());
236
public void testRetainMockAttribute() throws Exception {
237
String[] dict = { "abc", "d", "efg" };
238
Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT,
239
new StringReader("abcdefg"));
240
TokenStream stream = new MockRetainAttributeFilter(tokenizer);
241
stream = new DictionaryCompoundWordTokenFilter(
242
TEST_VERSION_CURRENT, stream, dict,
243
CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
244
CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
245
CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
246
MockRetainAttribute retAtt = stream.addAttribute(MockRetainAttribute.class);
247
while (stream.incrementToken()) {
248
assertTrue("Custom attribute value was lost", retAtt.getRetain());
253
public static interface MockRetainAttribute extends Attribute {
254
void setRetain(boolean attr);
258
public static final class MockRetainAttributeImpl extends AttributeImpl implements MockRetainAttribute {
259
private boolean retain = false;
261
public void clear() {
264
public boolean getRetain() {
267
public void setRetain(boolean retain) {
268
this.retain = retain;
271
public void copyTo(AttributeImpl target) {
272
MockRetainAttribute t = (MockRetainAttribute) target;
277
private static class MockRetainAttributeFilter extends TokenFilter {
279
MockRetainAttribute retainAtt = addAttribute(MockRetainAttribute.class);
281
MockRetainAttributeFilter(TokenStream input) {
286
public boolean incrementToken() throws IOException {
287
if (input.incrementToken()){
288
retainAtt.setRetain(true);