1
package org.apache.lucene.analysis.th;
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import java.io.StringReader;
22
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
23
import org.apache.lucene.analysis.TokenStream;
24
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
25
import org.apache.lucene.util.Version;
/**
 * Test case for ThaiAnalyzer, modified from TestFrenchAnalyzer
 */
public class TestThaiAnalyzer extends BaseTokenStreamTestCase {
36
* testcase for offsets
38
public void testOffsets() throws Exception {
39
assumeTrue("JRE does not support Thai dictionary-based BreakIterator", ThaiWordFilter.DBBI_AVAILABLE);
40
assertAnalyzesTo(new ThaiAnalyzer(TEST_VERSION_CURRENT), "การที่ได้ต้องแสดงว่างานดี",
41
new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี" },
42
new int[] { 0, 3, 6, 9, 13, 17, 20, 23 },
43
new int[] { 3, 6, 9, 13, 17, 20, 23, 25 });
46
public void testTokenType() throws Exception {
47
assumeTrue("JRE does not support Thai dictionary-based BreakIterator", ThaiWordFilter.DBBI_AVAILABLE);
48
assertAnalyzesTo(new ThaiAnalyzer(TEST_VERSION_CURRENT), "การที่ได้ต้องแสดงว่างานดี ๑๒๓",
49
new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี", "๑๒๓" },
50
new String[] { "<SOUTHEAST_ASIAN>", "<SOUTHEAST_ASIAN>",
51
"<SOUTHEAST_ASIAN>", "<SOUTHEAST_ASIAN>",
52
"<SOUTHEAST_ASIAN>", "<SOUTHEAST_ASIAN>",
53
"<SOUTHEAST_ASIAN>", "<SOUTHEAST_ASIAN>",
58
* Thai numeric tokens were typed as <ALPHANUM> instead of <NUM>.
59
* @deprecated testing backwards behavior
62
public void testBuggyTokenType30() throws Exception {
63
assumeTrue("JRE does not support Thai dictionary-based BreakIterator", ThaiWordFilter.DBBI_AVAILABLE);
64
assertAnalyzesTo(new ThaiAnalyzer(Version.LUCENE_30), "การที่ได้ต้องแสดงว่างานดี ๑๒๓",
65
new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี", "๑๒๓" },
66
new String[] { "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>",
67
"<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>",
68
"<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>" });
71
/** @deprecated testing backwards behavior */
73
public void testAnalyzer30() throws Exception {
74
assumeTrue("JRE does not support Thai dictionary-based BreakIterator", ThaiWordFilter.DBBI_AVAILABLE);
75
ThaiAnalyzer analyzer = new ThaiAnalyzer(Version.LUCENE_30);
77
assertAnalyzesTo(analyzer, "", new String[] {});
81
"การที่ได้ต้องแสดงว่างานดี",
82
new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี"});
86
"บริษัทชื่อ XY&Z - คุยกับ xyz@demo.com",
87
new String[] { "บริษัท", "ชื่อ", "xy&z", "คุย", "กับ", "xyz@demo.com" });
92
"ประโยคว่า The quick brown fox jumped over the lazy dogs",
93
new String[] { "ประโยค", "ว่า", "quick", "brown", "fox", "jumped", "over", "lazy", "dogs" });
97
* Test that position increments are adjusted correctly for stopwords.
99
public void testPositionIncrements() throws Exception {
100
assumeTrue("JRE does not support Thai dictionary-based BreakIterator", ThaiWordFilter.DBBI_AVAILABLE);
101
ThaiAnalyzer analyzer = new ThaiAnalyzer(TEST_VERSION_CURRENT);
103
assertAnalyzesTo(new ThaiAnalyzer(TEST_VERSION_CURRENT), "การที่ได้ต้อง the แสดงว่างานดี",
104
new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี" },
105
new int[] { 0, 3, 6, 9, 18, 22, 25, 28 },
106
new int[] { 3, 6, 9, 13, 22, 25, 28, 30 },
107
new int[] { 1, 1, 1, 1, 2, 1, 1, 1 });
109
// case that a stopword is adjacent to thai text, with no whitespace
110
assertAnalyzesTo(new ThaiAnalyzer(TEST_VERSION_CURRENT), "การที่ได้ต้องthe แสดงว่างานดี",
111
new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี" },
112
new int[] { 0, 3, 6, 9, 17, 21, 24, 27 },
113
new int[] { 3, 6, 9, 13, 21, 24, 27, 29 },
114
new int[] { 1, 1, 1, 1, 2, 1, 1, 1 });
117
public void testReusableTokenStream() throws Exception {
118
assumeTrue("JRE does not support Thai dictionary-based BreakIterator", ThaiWordFilter.DBBI_AVAILABLE);
119
ThaiAnalyzer analyzer = new ThaiAnalyzer(TEST_VERSION_CURRENT);
120
assertAnalyzesToReuse(analyzer, "", new String[] {});
122
assertAnalyzesToReuse(
124
"การที่ได้ต้องแสดงว่างานดี",
125
new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี"});
127
assertAnalyzesToReuse(
129
"บริษัทชื่อ XY&Z - คุยกับ xyz@demo.com",
130
new String[] { "บริษัท", "ชื่อ", "xy", "z", "คุย", "กับ", "xyz", "demo.com" });
133
/** @deprecated, for version back compat */
135
public void testReusableTokenStream30() throws Exception {
136
assumeTrue("JRE does not support Thai dictionary-based BreakIterator", ThaiWordFilter.DBBI_AVAILABLE);
137
ThaiAnalyzer analyzer = new ThaiAnalyzer(Version.LUCENE_30);
138
assertAnalyzesToReuse(analyzer, "", new String[] {});
140
assertAnalyzesToReuse(
142
"การที่ได้ต้องแสดงว่างานดี",
143
new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี"});
145
assertAnalyzesToReuse(
147
"บริษัทชื่อ XY&Z - คุยกับ xyz@demo.com",
148
new String[] { "บริษัท", "ชื่อ", "xy&z", "คุย", "กับ", "xyz@demo.com" });
151
/** blast some random strings through the analyzer */
152
public void testRandomStrings() throws Exception {
153
checkRandomData(random, new ThaiAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
157
public void testAttributeReuse() throws Exception {
158
assumeTrue("JRE does not support Thai dictionary-based BreakIterator", ThaiWordFilter.DBBI_AVAILABLE);
159
ThaiAnalyzer analyzer = new ThaiAnalyzer(Version.LUCENE_30);
161
TokenStream ts = analyzer.reusableTokenStream("dummy", new StringReader("ภาษาไทย"));
162
assertTokenStreamContents(ts, new String[] { "ภาษา", "ไทย" });
163
// this consumer adds flagsAtt, which this analyzer does not use.
164
ts = analyzer.reusableTokenStream("dummy", new StringReader("ภาษาไทย"));
165
ts.addAttribute(FlagsAttribute.class);
166
assertTokenStreamContents(ts, new String[] { "ภาษา", "ไทย" });