/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.solr.analysis;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.KeywordTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
public class TestCapitalizationFilter extends BaseTokenTestCase {
35
public void testCapitalization() throws Exception
37
Map<String,String> args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);
38
args.put( CapitalizationFilterFactory.KEEP, "and the it BIG" );
39
args.put( CapitalizationFilterFactory.ONLY_FIRST_WORD, "true" );
41
CapitalizationFilterFactory factory = new CapitalizationFilterFactory();
44
termBuffer = "kiTTEN".toCharArray();
45
factory.processWord(termBuffer, 0, termBuffer.length, 0 );
46
assertEquals( "Kitten", new String(termBuffer, 0, termBuffer.length));
48
factory.forceFirstLetter = true;
50
termBuffer = "and".toCharArray();
51
factory.processWord(termBuffer, 0, termBuffer.length, 0 );
52
assertEquals( "And", new String(termBuffer, 0, termBuffer.length));//first is forced
54
termBuffer = "AnD".toCharArray();
55
factory.processWord(termBuffer, 0, termBuffer.length, 0 );
56
assertEquals( "And", new String(termBuffer, 0, termBuffer.length));//first is forced, but it's not a keep word, either
58
factory.forceFirstLetter = false;
59
termBuffer = "AnD".toCharArray();
60
factory.processWord(termBuffer, 0, termBuffer.length, 0 );
61
assertEquals( "And", new String(termBuffer, 0, termBuffer.length)); //first is not forced, but it's not a keep word, either
63
factory.forceFirstLetter = true;
64
termBuffer = "big".toCharArray();
65
factory.processWord(termBuffer, 0, termBuffer.length, 0 );
66
assertEquals( "Big", new String(termBuffer, 0, termBuffer.length));
67
termBuffer = "BIG".toCharArray();
68
factory.processWord(termBuffer, 0, termBuffer.length, 0 );
69
assertEquals( "BIG", new String(termBuffer, 0, termBuffer.length));
71
Tokenizer tokenizer = new KeywordTokenizer(new StringReader("Hello thEre my Name is Ryan"));
72
TokenStream stream = factory.create(tokenizer);
73
assertTokenStreamContents(stream, new String[] { "Hello there my name is ryan" });
76
factory.onlyFirstWord = false;
77
tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("Hello thEre my Name is Ryan"));
78
stream = factory.create(tokenizer);
79
assertTokenStreamContents(stream, new String[] { "Hello", "There", "My", "Name", "Is", "Ryan" });
81
// now only the long words
82
factory.minWordLength = 3;
83
tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("Hello thEre my Name is Ryan" ));
84
stream = factory.create(tokenizer);
85
assertTokenStreamContents(stream, new String[] { "Hello", "There", "my", "Name", "is", "Ryan" });
88
tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("McKinley" ));
89
stream = factory.create(tokenizer);
90
assertTokenStreamContents(stream, new String[] { "Mckinley" });
92
// Now try some prefixes
93
factory = new CapitalizationFilterFactory();
94
args.put( "okPrefix", "McK" ); // all words
96
tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("McKinley" ));
97
stream = factory.create(tokenizer);
98
assertTokenStreamContents(stream, new String[] { "McKinley" });
100
// now try some stuff with numbers
101
factory.forceFirstLetter = false;
102
factory.onlyFirstWord = false;
103
tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("1st 2nd third" ));
104
stream = factory.create(tokenizer);
105
assertTokenStreamContents(stream, new String[] { "1st", "2nd", "Third" });
107
factory.forceFirstLetter = true;
108
tokenizer = new KeywordTokenizer(new StringReader("the The the" ));
109
stream = factory.create(tokenizer);
110
assertTokenStreamContents(stream, new String[] { "The The the" });
113
public void testKeepIgnoreCase() throws Exception {
114
Map<String,String> args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);
115
args.put( CapitalizationFilterFactory.KEEP, "kitten" );
116
args.put( CapitalizationFilterFactory.KEEP_IGNORE_CASE, "true" );
117
args.put( CapitalizationFilterFactory.ONLY_FIRST_WORD, "true" );
119
CapitalizationFilterFactory factory = new CapitalizationFilterFactory();
120
factory.init( args );
122
termBuffer = "kiTTEN".toCharArray();
123
factory.forceFirstLetter = true;
124
factory.processWord(termBuffer, 0, termBuffer.length, 0 );
125
assertEquals( "KiTTEN", new String(termBuffer, 0, termBuffer.length));
127
factory.forceFirstLetter = false;
128
termBuffer = "kiTTEN".toCharArray();
129
factory.processWord(termBuffer, 0, termBuffer.length, 0 );
130
assertEquals( "kiTTEN", new String(termBuffer, 0, termBuffer.length));
133
termBuffer = "kiTTEN".toCharArray();
134
factory.processWord(termBuffer, 0, termBuffer.length, 0 );
135
assertEquals( "Kitten", new String(termBuffer, 0, termBuffer.length));
139
* Test CapitalizationFilterFactory's minWordLength option.
141
* This is very weird when combined with ONLY_FIRST_WORD!!!
143
public void testMinWordLength() throws Exception {
144
Map<String,String> args = new HashMap<String,String>(DEFAULT_VERSION_PARAM);
145
args.put(CapitalizationFilterFactory.ONLY_FIRST_WORD, "true");
146
args.put(CapitalizationFilterFactory.MIN_WORD_LENGTH, "5");
147
CapitalizationFilterFactory factory = new CapitalizationFilterFactory();
149
Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(
151
TokenStream ts = factory.create(tokenizer);
152
assertTokenStreamContents(ts, new String[] {"helo", "Testing"});
156
* Test CapitalizationFilterFactory's maxWordCount option with only words of 1
157
* in each token (it should do nothing)
159
public void testMaxWordCount() throws Exception {
160
Map<String,String> args = new HashMap<String,String>(DEFAULT_VERSION_PARAM);
161
args.put(CapitalizationFilterFactory.MAX_WORD_COUNT, "2");
162
CapitalizationFilterFactory factory = new CapitalizationFilterFactory();
164
Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(
165
"one two three four"));
166
TokenStream ts = factory.create(tokenizer);
167
assertTokenStreamContents(ts, new String[] {"One", "Two", "Three", "Four"});
171
* Test CapitalizationFilterFactory's maxWordCount option when exceeded
173
public void testMaxWordCount2() throws Exception {
174
Map<String,String> args = new HashMap<String,String>(DEFAULT_VERSION_PARAM);
175
args.put(CapitalizationFilterFactory.MAX_WORD_COUNT, "2");
176
CapitalizationFilterFactory factory = new CapitalizationFilterFactory();
178
Tokenizer tokenizer = new KeywordTokenizer(new StringReader(
179
"one two three four"));
180
TokenStream ts = factory.create(tokenizer);
181
assertTokenStreamContents(ts, new String[] {"one two three four"});
185
* Test CapitalizationFilterFactory's maxTokenLength option when exceeded
187
* This is weird, it is not really a max, but inclusive (look at 'is')
189
public void testMaxTokenLength() throws Exception {
190
Map<String,String> args = new HashMap<String,String>(DEFAULT_VERSION_PARAM);
191
args.put(CapitalizationFilterFactory.MAX_TOKEN_LENGTH, "2");
192
CapitalizationFilterFactory factory = new CapitalizationFilterFactory();
194
Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(
196
TokenStream ts = factory.create(tokenizer);
197
assertTokenStreamContents(ts, new String[] {"this", "is", "A", "test"});
201
* Test CapitalizationFilterFactory's forceFirstLetter option
203
public void testForceFirstLetter() throws Exception {
204
Map<String,String> args = new HashMap<String,String>(DEFAULT_VERSION_PARAM);
205
args.put(CapitalizationFilterFactory.KEEP, "kitten");
206
args.put(CapitalizationFilterFactory.FORCE_FIRST_LETTER, "true");
207
CapitalizationFilterFactory factory = new CapitalizationFilterFactory();
209
Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("kitten"));
210
TokenStream ts = factory.create(tokenizer);
211
assertTokenStreamContents(ts, new String[] {"Kitten"});