1
package org.apache.lucene.analysis.nl;
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
21
import java.io.IOException;
23
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
24
import org.apache.lucene.analysis.Analyzer;
25
import org.apache.lucene.analysis.CharArraySet;
26
import org.apache.lucene.util.Version;
29
* Test the Dutch Stem Filter, which only modifies the term text.
31
* The code states that it uses the snowball algorithm, but tests reveal some differences.
34
public class TestDutchStemmer extends BaseTokenStreamTestCase {
36
public void testWithSnowballExamples() throws Exception {
37
check("lichaamsziek", "lichaamsziek");
38
check("lichamelijk", "licham");
39
check("lichamelijke", "licham");
40
check("lichamelijkheden", "licham");
41
check("lichamen", "licham");
42
check("lichere", "licher");
43
check("licht", "licht");
44
check("lichtbeeld", "lichtbeeld");
45
check("lichtbruin", "lichtbruin");
46
check("lichtdoorlatende", "lichtdoorlat");
47
check("lichte", "licht");
48
check("lichten", "licht");
49
check("lichtende", "lichtend");
50
check("lichtenvoorde", "lichtenvoord");
51
check("lichter", "lichter");
52
check("lichtere", "lichter");
53
check("lichters", "lichter");
54
check("lichtgevoeligheid", "lichtgevoel");
55
check("lichtgewicht", "lichtgewicht");
56
check("lichtgrijs", "lichtgrijs");
57
check("lichthoeveelheid", "lichthoevel");
58
check("lichtintensiteit", "lichtintensiteit");
59
check("lichtje", "lichtj");
60
check("lichtjes", "lichtjes");
61
check("lichtkranten", "lichtkrant");
62
check("lichtkring", "lichtkring");
63
check("lichtkringen", "lichtkring");
64
check("lichtregelsystemen", "lichtregelsystem");
65
check("lichtste", "lichtst");
66
check("lichtstromende", "lichtstrom");
67
check("lichtte", "licht");
68
check("lichtten", "licht");
69
check("lichttoetreding", "lichttoetred");
70
check("lichtverontreinigde", "lichtverontreinigd");
71
check("lichtzinnige", "lichtzinn");
73
check("lidia", "lidia");
74
check("lidmaatschap", "lidmaatschap");
75
check("lidstaten", "lidstat");
76
check("lidvereniging", "lidveren");
77
check("opgingen", "opging");
78
check("opglanzing", "opglanz");
79
check("opglanzingen", "opglanz");
80
check("opglimlachten", "opglimlacht");
81
check("opglimpen", "opglimp");
82
check("opglimpende", "opglimp");
83
check("opglimping", "opglimp");
84
check("opglimpingen", "opglimp");
85
check("opgraven", "opgrav");
86
check("opgrijnzen", "opgrijnz");
87
check("opgrijzende", "opgrijz");
88
check("opgroeien", "opgroei");
89
check("opgroeiende", "opgroei");
90
check("opgroeiplaats", "opgroeiplat");
91
check("ophaal", "ophal");
92
check("ophaaldienst", "ophaaldienst");
93
check("ophaalkosten", "ophaalkost");
94
check("ophaalsystemen", "ophaalsystem");
95
check("ophaalt", "ophaalt");
96
check("ophaaltruck", "ophaaltruck");
97
check("ophalen", "ophal");
98
check("ophalend", "ophal");
99
check("ophalers", "ophaler");
100
check("ophef", "ophef");
101
check("opheldering", "ophelder");
102
check("ophemelde", "ophemeld");
103
check("ophemelen", "ophemel");
104
check("opheusden", "opheusd");
105
check("ophief", "ophief");
106
check("ophield", "ophield");
107
check("ophieven", "ophiev");
108
check("ophoepelt", "ophoepelt");
109
check("ophoog", "ophog");
110
check("ophoogzand", "ophoogzand");
111
check("ophopen", "ophop");
112
check("ophoping", "ophop");
113
check("ophouden", "ophoud");
117
* @deprecated remove this test in Lucene 4.0
120
public void testOldBuggyStemmer() throws Exception {
121
Analyzer a = new DutchAnalyzer(Version.LUCENE_30);
122
checkOneTermReuse(a, "opheffen", "ophef"); // versus snowball 'opheff'
123
checkOneTermReuse(a, "opheffende", "ophef"); // versus snowball 'opheff'
124
checkOneTermReuse(a, "opheffing", "ophef"); // versus snowball 'opheff'
127
public void testSnowballCorrectness() throws Exception {
128
Analyzer a = new DutchAnalyzer(TEST_VERSION_CURRENT);
129
checkOneTermReuse(a, "opheffen", "opheff");
130
checkOneTermReuse(a, "opheffende", "opheff");
131
checkOneTermReuse(a, "opheffing", "opheff");
134
public void testReusableTokenStream() throws Exception {
135
Analyzer a = new DutchAnalyzer(TEST_VERSION_CURRENT);
136
checkOneTermReuse(a, "lichaamsziek", "lichaamsziek");
137
checkOneTermReuse(a, "lichamelijk", "licham");
138
checkOneTermReuse(a, "lichamelijke", "licham");
139
checkOneTermReuse(a, "lichamelijkheden", "licham");
143
* Test that changes to the exclusion table are applied immediately
144
* when using reusable token streams.
146
public void testExclusionTableReuse() throws Exception {
147
DutchAnalyzer a = new DutchAnalyzer(TEST_VERSION_CURRENT);
148
checkOneTermReuse(a, "lichamelijk", "licham");
149
a.setStemExclusionTable(new String[] { "lichamelijk" });
150
checkOneTermReuse(a, "lichamelijk", "lichamelijk");
155
public void testExclusionTableViaCtor() throws IOException {
156
CharArraySet set = new CharArraySet(Version.LUCENE_30, 1, true);
157
set.add("lichamelijk");
158
DutchAnalyzer a = new DutchAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET, set);
159
assertAnalyzesToReuse(a, "lichamelijk lichamelijke", new String[] { "lichamelijk", "licham" });
161
a = new DutchAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET, set);
162
assertAnalyzesTo(a, "lichamelijk lichamelijke", new String[] { "lichamelijk", "licham" });
167
* Test that changes to the dictionary stemming table are applied immediately
168
* when using reusable token streams.
170
public void testStemDictionaryReuse() throws Exception {
171
DutchAnalyzer a = new DutchAnalyzer(TEST_VERSION_CURRENT);
172
checkOneTermReuse(a, "lichamelijk", "licham");
173
File customDictFile = getDataFile("customStemDict.txt");
174
a.setStemDictionary(customDictFile);
175
checkOneTermReuse(a, "lichamelijk", "somethingentirelydifferent");
179
* Prior to 3.1, this analyzer had no lowercase filter.
180
* stopwords were case sensitive. Preserve this for back compat.
181
* @deprecated Remove this test in Lucene 4.0
184
public void testBuggyStopwordsCasing() throws IOException {
185
DutchAnalyzer a = new DutchAnalyzer(Version.LUCENE_30);
186
assertAnalyzesTo(a, "Zelf", new String[] { "zelf" });
190
* Test that stopwords are not case sensitive
192
public void testStopwordsCasing() throws IOException {
193
DutchAnalyzer a = new DutchAnalyzer(Version.LUCENE_31);
194
assertAnalyzesTo(a, "Zelf", new String[] { });
197
private void check(final String input, final String expected) throws Exception {
198
checkOneTerm(new DutchAnalyzer(TEST_VERSION_CURRENT), input, expected);
201
/** blast some random strings through the analyzer */
202
public void testRandomStrings() throws Exception {
203
checkRandomData(random, new DutchAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
b'\\ No newline at end of file'