package org.apache.solr.analysis;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import java.io.Reader;
import java.io.StringReader;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.Tokenizer;
/**
 * A few tests based on org.apache.lucene.analysis.TestUAX29URLEmailTokenizer
 */
public class TestUAX29URLEmailTokenizerFactory extends BaseTokenTestCase {
34
public void testUAX29URLEmailTokenizer() throws Exception {
35
Reader reader = new StringReader("Wha\u0301t's this thing do?");
36
UAX29URLEmailTokenizerFactory factory = new UAX29URLEmailTokenizerFactory();
37
factory.init(DEFAULT_VERSION_PARAM);
38
Tokenizer stream = factory.create(reader);
39
assertTokenStreamContents(stream,
40
new String[] {"Wha\u0301t's", "this", "thing", "do" });
43
public void testArabic() throws Exception {
44
Reader reader = new StringReader("الفيلم الوثائقي الأول عن ويكيبيديا يسمى \"الحقيقة بالأرقام: قصة ويكيبيديا\" (بالإنجليزية: Truth in Numbers: The Wikipedia Story)، سيتم إطلاقه في 2008.");
45
UAX29URLEmailTokenizerFactory factory = new UAX29URLEmailTokenizerFactory();
46
factory.init(DEFAULT_VERSION_PARAM);
47
Tokenizer stream = factory.create(reader);
48
assertTokenStreamContents(stream,
49
new String[] {"الفيلم", "الوثائقي", "الأول", "عن", "ويكيبيديا", "يسمى", "الحقيقة", "بالأرقام", "قصة", "ويكيبيديا",
50
"بالإنجليزية", "Truth", "in", "Numbers", "The", "Wikipedia", "Story", "سيتم", "إطلاقه", "في", "2008" });
53
public void testChinese() throws Exception {
54
Reader reader = new StringReader("我是中国人。 1234 Tests ");
55
UAX29URLEmailTokenizerFactory factory = new UAX29URLEmailTokenizerFactory();
56
factory.init(DEFAULT_VERSION_PARAM);
57
Tokenizer stream = factory.create(reader);
58
assertTokenStreamContents(stream,
59
new String[] {"我", "是", "中", "国", "人", "1234", "Tests"});
62
public void testKorean() throws Exception {
63
Reader reader = new StringReader("안녕하세요 한글입니다");
64
UAX29URLEmailTokenizerFactory factory = new UAX29URLEmailTokenizerFactory();
65
factory.init(DEFAULT_VERSION_PARAM);
66
Tokenizer stream = factory.create(reader);
67
assertTokenStreamContents(stream,
68
new String[] {"안녕하세요", "한글입니다"});
71
public void testHyphen() throws Exception {
72
Reader reader = new StringReader("some-dashed-phrase");
73
UAX29URLEmailTokenizerFactory factory = new UAX29URLEmailTokenizerFactory();
74
factory.init(DEFAULT_VERSION_PARAM);
75
Tokenizer stream = factory.create(reader);
76
assertTokenStreamContents(stream,
77
new String[] {"some", "dashed", "phrase"});
80
// Test with some URLs from TestUAX29URLEmailTokenizer's
81
// urls.from.random.text.with.urls.txt
82
public void testURLs() throws Exception {
84
= "http://johno.jsmf.net/knowhow/ngrams/index.php?table=en-dickens-word-2gram¶graphs=50&length=200&no-ads=on\n"
85
+ " some extra\nWords thrown in here. "
86
+ "http://c5-3486.bisynxu.FR/aI.YnNms/"
87
+ " samba Halta gamba "
88
+ "ftp://119.220.152.185/JgJgdZ/31aW5c/viWlfQSTs5/1c8U5T/ih5rXx/YfUJ/xBW1uHrQo6.R\n"
89
+ "M19nq.0URV4A.Me.CC/mj0kgt6hue/dRXv8YVLOw9v/CIOqb\n"
90
+ "Https://yu7v33rbt.vC6U3.XN--JXALPDLP/y%4fMSzkGFlm/wbDF4m"
92
+ "[c2d4::]/%471j5l/j3KFN%AAAn/Fip-NisKH/\n"
93
+ "file:///aXvSZS34is/eIgM8s~U5dU4Ifd%c7"
94
+ " blah Sirrah woof "
95
+ "http://[a42:a7b6::]/qSmxSUU4z/%52qVl4\n";
96
Reader reader = new StringReader(textWithURLs);
97
UAX29URLEmailTokenizerFactory factory = new UAX29URLEmailTokenizerFactory();
98
factory.init(DEFAULT_VERSION_PARAM);
99
Tokenizer stream = factory.create(reader);
100
assertTokenStreamContents(stream,
102
"http://johno.jsmf.net/knowhow/ngrams/index.php?table=en-dickens-word-2gram¶graphs=50&length=200&no-ads=on",
103
"some", "extra", "Words", "thrown", "in", "here",
104
"http://c5-3486.bisynxu.FR/aI.YnNms/",
105
"samba", "Halta", "gamba",
106
"ftp://119.220.152.185/JgJgdZ/31aW5c/viWlfQSTs5/1c8U5T/ih5rXx/YfUJ/xBW1uHrQo6.R",
107
"M19nq.0URV4A.Me.CC/mj0kgt6hue/dRXv8YVLOw9v/CIOqb",
108
"Https://yu7v33rbt.vC6U3.XN--JXALPDLP/y%4fMSzkGFlm/wbDF4m",
110
"[c2d4::]/%471j5l/j3KFN%AAAn/Fip-NisKH/",
111
"file:///aXvSZS34is/eIgM8s~U5dU4Ifd%c7",
112
"blah", "Sirrah", "woof",
113
"http://[a42:a7b6::]/qSmxSUU4z/%52qVl4"
118
// Test with some emails from TestUAX29URLEmailTokenizer's
119
// email.addresses.from.random.text.with.email.addresses.txt
120
public void testEmails() throws Exception {
121
String textWithEmails
122
= " some extra\nWords thrown in here. "
123
+ "dJ8ngFi@avz13m.CC\n"
124
+ "kU-l6DS@[082.015.228.189]\n"
125
+ "\"%U\u0012@?\\B\"@Fl2d.md"
126
+ " samba Halta gamba "
128
+ "SBMm0Nm.oyk70.rMNdd8k.#ru3LI.gMMLBI.0dZRD4d.RVK2nY@au58t.B13albgy4u.mt\n"
129
+ "~+Kdz@3mousnl.SE\n"
131
+ "C'ts`@Vh4zk.uoafcft-dr753x4odt04q.UY\n"
132
+ "}0tzWYDBuy@cSRQAABB9B.7c8xawf75-cyo.PM"
133
+ " blah Sirrah woof "
134
+ "lMahAA.j/5.RqUjS745.DtkcYdi@d2-4gb-l6.ae\n"
135
+ "lv'p@tqk.vj5s0tgl.0dlu7su3iyiaz.dqso.494.3hb76.XN--MGBAAM7A8H\n";
136
Reader reader = new StringReader(textWithEmails);
137
UAX29URLEmailTokenizerFactory factory = new UAX29URLEmailTokenizerFactory();
138
factory.init(DEFAULT_VERSION_PARAM);
139
Tokenizer stream = factory.create(reader);
140
assertTokenStreamContents(stream,
142
"some", "extra", "Words", "thrown", "in", "here",
144
"kU-l6DS@[082.015.228.189]",
145
"\"%U\u0012@?\\B\"@Fl2d.md",
146
"samba", "Halta", "gamba",
148
"SBMm0Nm.oyk70.rMNdd8k.#ru3LI.gMMLBI.0dZRD4d.RVK2nY@au58t.B13albgy4u.mt",
151
"C'ts`@Vh4zk.uoafcft-dr753x4odt04q.UY",
152
"}0tzWYDBuy@cSRQAABB9B.7c8xawf75-cyo.PM",
153
"blah", "Sirrah", "woof",
154
"lMahAA.j/5.RqUjS745.DtkcYdi@d2-4gb-l6.ae",
155
"lv'p@tqk.vj5s0tgl.0dlu7su3iyiaz.dqso.494.3hb76.XN--MGBAAM7A8H"
160
public void testMaxTokenLength() throws Exception {
161
StringBuilder builder = new StringBuilder();
162
for (int i = 0 ; i < 100 ; ++i) {
163
builder.append("abcdefg"); // 7 * 100 = 700 char "word"
165
String longWord = builder.toString();
166
String content = "one two three " + longWord + " four five six";
167
Reader reader = new StringReader(content);
168
Map<String,String> args = new HashMap<String,String>();
169
args.put("luceneMatchVersion", DEFAULT_VERSION_PARAM.get("luceneMatchVersion"));
170
args.put("maxTokenLength", "1000");
171
UAX29URLEmailTokenizerFactory factory = new UAX29URLEmailTokenizerFactory();
173
Tokenizer stream = factory.create(reader);
174
assertTokenStreamContents(stream,
175
new String[] {"one", "two", "three", longWord, "four", "five", "six" });
178
/** @deprecated nuke this test in lucene 5.0 */
180
public void testMatchVersion() throws Exception {
181
Reader reader = new StringReader("ざ");
182
UAX29URLEmailTokenizerFactory factory = new UAX29URLEmailTokenizerFactory();
183
factory.init(DEFAULT_VERSION_PARAM);
184
Tokenizer stream = factory.create(reader);
185
assertTokenStreamContents(stream,
186
new String[] {"ざ"});
188
reader = new StringReader("ざ");
189
factory = new UAX29URLEmailTokenizerFactory();
190
factory.init(Collections.singletonMap("luceneMatchVersion", "3.1"));
191
stream = factory.create(reader);
192
assertTokenStreamContents(stream,
193
new String[] {"さ"}); // old broken behavior