package org.apache.solr.analysis;
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.text.Collator;
import java.text.RuleBasedCollator;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;

import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.solr.common.ResourceLoader;
public class TestCollationKeyFilterFactory extends BaseTokenTestCase {
39
* Turkish has some funny casing.
40
* This test shows how you can solve this kind of thing easily with collation.
41
* Instead of using LowerCaseFilter, use a turkish collator with primary strength.
42
* Then things will sort and match correctly.
44
public void testBasicUsage() throws IOException {
45
String turkishUpperCase = "I WİLL USE TURKİSH CASING";
46
String turkishLowerCase = "ı will use turkish casıng";
47
CollationKeyFilterFactory factory = new CollationKeyFilterFactory();
48
Map<String,String> args = new HashMap<String,String>();
49
args.put("language", "tr");
50
args.put("strength", "primary");
52
factory.inform(new StringMockSolrResourceLoader(""));
53
TokenStream tsUpper = factory.create(
54
new MockTokenizer(new StringReader(turkishUpperCase), MockTokenizer.KEYWORD, false));
55
TokenStream tsLower = factory.create(
56
new MockTokenizer(new StringReader(turkishLowerCase), MockTokenizer.KEYWORD, false));
57
assertCollatesToSame(tsUpper, tsLower);
61
* Test usage of the decomposition option for unicode normalization.
63
public void testNormalization() throws IOException {
64
String turkishUpperCase = "I W\u0049\u0307LL USE TURKİSH CASING";
65
String turkishLowerCase = "ı will use turkish casıng";
66
CollationKeyFilterFactory factory = new CollationKeyFilterFactory();
67
Map<String,String> args = new HashMap<String,String>();
68
args.put("language", "tr");
69
args.put("strength", "primary");
70
args.put("decomposition", "canonical");
72
factory.inform(new StringMockSolrResourceLoader(""));
73
TokenStream tsUpper = factory.create(
74
new MockTokenizer(new StringReader(turkishUpperCase), MockTokenizer.KEYWORD, false));
75
TokenStream tsLower = factory.create(
76
new MockTokenizer(new StringReader(turkishLowerCase), MockTokenizer.KEYWORD, false));
77
assertCollatesToSame(tsUpper, tsLower);
81
* Test usage of the K decomposition option for unicode normalization.
82
* This works even with identical strength.
84
public void testFullDecomposition() throws IOException {
85
String fullWidth = "Testing";
86
String halfWidth = "Testing";
87
CollationKeyFilterFactory factory = new CollationKeyFilterFactory();
88
Map<String,String> args = new HashMap<String,String>();
89
args.put("language", "zh");
90
args.put("strength", "identical");
91
args.put("decomposition", "full");
93
factory.inform(new StringMockSolrResourceLoader(""));
94
TokenStream tsFull = factory.create(
95
new MockTokenizer(new StringReader(fullWidth), MockTokenizer.KEYWORD, false));
96
TokenStream tsHalf = factory.create(
97
new MockTokenizer(new StringReader(halfWidth), MockTokenizer.KEYWORD, false));
98
assertCollatesToSame(tsFull, tsHalf);
102
* Test secondary strength, for english case is not significant.
104
public void testSecondaryStrength() throws IOException {
105
String upperCase = "TESTING";
106
String lowerCase = "testing";
107
CollationKeyFilterFactory factory = new CollationKeyFilterFactory();
108
Map<String,String> args = new HashMap<String,String>();
109
args.put("language", "en");
110
args.put("strength", "secondary");
111
args.put("decomposition", "no");
113
factory.inform(new StringMockSolrResourceLoader(""));
114
TokenStream tsUpper = factory.create(
115
new MockTokenizer(new StringReader(upperCase), MockTokenizer.KEYWORD, false));
116
TokenStream tsLower = factory.create(
117
new MockTokenizer(new StringReader(lowerCase), MockTokenizer.KEYWORD, false));
118
assertCollatesToSame(tsUpper, tsLower);
122
* For german, you might want oe to sort and match with o umlaut.
123
* This is not the default, but you can make a customized ruleset to do this.
125
* The default is DIN 5007-1, this shows how to tailor a collator to get DIN 5007-2 behavior.
126
* http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=4423383
128
public void testCustomRules() throws Exception {
129
RuleBasedCollator baseCollator = (RuleBasedCollator) Collator.getInstance(new Locale("de", "DE"));
131
String DIN5007_2_tailorings =
132
"& ae , a\u0308 & AE , A\u0308"+
133
"& oe , o\u0308 & OE , O\u0308"+
134
"& ue , u\u0308 & UE , u\u0308";
136
RuleBasedCollator tailoredCollator = new RuleBasedCollator(baseCollator.getRules() + DIN5007_2_tailorings);
137
String tailoredRules = tailoredCollator.getRules();
139
// at this point, you would save these tailoredRules to a file,
140
// and use the custom parameter.
142
String germanUmlaut = "Töne";
143
String germanOE = "Toene";
144
CollationKeyFilterFactory factory = new CollationKeyFilterFactory();
145
Map<String,String> args = new HashMap<String,String>();
146
args.put("custom", "rules.txt");
147
args.put("strength", "primary");
149
factory.inform(new StringMockSolrResourceLoader(tailoredRules));
150
TokenStream tsUmlaut = factory.create(
151
new MockTokenizer(new StringReader(germanUmlaut), MockTokenizer.KEYWORD, false));
152
TokenStream tsOE = factory.create(
153
new MockTokenizer(new StringReader(germanOE), MockTokenizer.KEYWORD, false));
155
assertCollatesToSame(tsUmlaut, tsOE);
158
private class StringMockSolrResourceLoader implements ResourceLoader {
161
StringMockSolrResourceLoader(String text) {
165
public List<String> getLines(String resource) throws IOException {
169
public Object newInstance(String cname, String... subpackages) {
173
public InputStream openResource(String resource) throws IOException {
174
return new ByteArrayInputStream(text.getBytes("UTF-8"));
178
private void assertCollatesToSame(TokenStream stream1, TokenStream stream2)
182
CharTermAttribute term1 = stream1
183
.addAttribute(CharTermAttribute.class);
184
CharTermAttribute term2 = stream2
185
.addAttribute(CharTermAttribute.class);
186
assertTrue(stream1.incrementToken());
187
assertTrue(stream2.incrementToken());
188
assertEquals(term1.toString(), term2.toString());
189
assertFalse(stream1.incrementToken());
190
assertFalse(stream2.incrementToken());