/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.analysis;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;

import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.*;
/**
 * @deprecated Remove this test in Lucene 5.0
 */
@Deprecated
public class TestSlowSynonymFilter extends BaseTokenStreamTestCase {
static List<String> strings(String str) {
41
String[] arr = str.split(" ");
42
return Arrays.asList(arr);
45
static void assertTokenizesTo(SlowSynonymMap dict, String input,
46
String expected[]) throws IOException {
47
Tokenizer tokenizer = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
48
SlowSynonymFilter stream = new SlowSynonymFilter(tokenizer, dict);
49
assertTokenStreamContents(stream, expected);
52
static void assertTokenizesTo(SlowSynonymMap dict, String input,
53
String expected[], int posIncs[]) throws IOException {
54
Tokenizer tokenizer = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
55
SlowSynonymFilter stream = new SlowSynonymFilter(tokenizer, dict);
56
assertTokenStreamContents(stream, expected, posIncs);
59
static void assertTokenizesTo(SlowSynonymMap dict, List<Token> input,
60
String expected[], int posIncs[])
62
TokenStream tokenizer = new IterTokenStream(input);
63
SlowSynonymFilter stream = new SlowSynonymFilter(tokenizer, dict);
64
assertTokenStreamContents(stream, expected, posIncs);
67
static void assertTokenizesTo(SlowSynonymMap dict, List<Token> input,
68
String expected[], int startOffsets[], int endOffsets[], int posIncs[])
70
TokenStream tokenizer = new IterTokenStream(input);
71
SlowSynonymFilter stream = new SlowSynonymFilter(tokenizer, dict);
72
assertTokenStreamContents(stream, expected, startOffsets, endOffsets,
76
public void testMatching() throws IOException {
77
SlowSynonymMap map = new SlowSynonymMap();
81
map.add(strings("a b"), tokens("ab"), orig, merge);
82
map.add(strings("a c"), tokens("ac"), orig, merge);
83
map.add(strings("a"), tokens("aa"), orig, merge);
84
map.add(strings("b"), tokens("bb"), orig, merge);
85
map.add(strings("z x c v"), tokens("zxcv"), orig, merge);
86
map.add(strings("x c"), tokens("xc"), orig, merge);
88
assertTokenizesTo(map, "$", new String[] { "$" });
89
assertTokenizesTo(map, "a", new String[] { "aa" });
90
assertTokenizesTo(map, "a $", new String[] { "aa", "$" });
91
assertTokenizesTo(map, "$ a", new String[] { "$", "aa" });
92
assertTokenizesTo(map, "a a", new String[] { "aa", "aa" });
93
assertTokenizesTo(map, "b", new String[] { "bb" });
94
assertTokenizesTo(map, "z x c v", new String[] { "zxcv" });
95
assertTokenizesTo(map, "z x c $", new String[] { "z", "xc", "$" });
98
map.add(strings("a b"), tokens("ab"), orig, merge);
99
map.add(strings("a b"), tokens("ab"), orig, merge);
101
// FIXME: the below test intended to be { "ab" }
102
assertTokenizesTo(map, "a b", new String[] { "ab", "ab", "ab" });
104
// check for lack of recursion
105
map.add(strings("zoo"), tokens("zoo"), orig, merge);
106
assertTokenizesTo(map, "zoo zoo $ zoo", new String[] { "zoo", "zoo", "$", "zoo" });
107
map.add(strings("zoo"), tokens("zoo zoo"), orig, merge);
108
// FIXME: the below test intended to be { "zoo", "zoo", "zoo", "zoo", "$", "zoo", "zoo" }
109
// maybe this was just a typo in the old test????
110
assertTokenizesTo(map, "zoo zoo $ zoo", new String[] { "zoo", "zoo", "zoo", "zoo", "zoo", "zoo", "$", "zoo", "zoo", "zoo" });
113
public void testIncludeOrig() throws IOException {
114
SlowSynonymMap map = new SlowSynonymMap();
117
boolean merge = true;
118
map.add(strings("a b"), tokens("ab"), orig, merge);
119
map.add(strings("a c"), tokens("ac"), orig, merge);
120
map.add(strings("a"), tokens("aa"), orig, merge);
121
map.add(strings("b"), tokens("bb"), orig, merge);
122
map.add(strings("z x c v"), tokens("zxcv"), orig, merge);
123
map.add(strings("x c"), tokens("xc"), orig, merge);
125
assertTokenizesTo(map, "$",
126
new String[] { "$" },
128
assertTokenizesTo(map, "a",
129
new String[] { "a", "aa" },
131
assertTokenizesTo(map, "a",
132
new String[] { "a", "aa" },
134
assertTokenizesTo(map, "$ a",
135
new String[] { "$", "a", "aa" },
136
new int[] { 1, 1, 0 });
137
assertTokenizesTo(map, "a $",
138
new String[] { "a", "aa", "$" },
139
new int[] { 1, 0, 1 });
140
assertTokenizesTo(map, "$ a !",
141
new String[] { "$", "a", "aa", "!" },
142
new int[] { 1, 1, 0, 1 });
143
assertTokenizesTo(map, "a a",
144
new String[] { "a", "aa", "a", "aa" },
145
new int[] { 1, 0, 1, 0 });
146
assertTokenizesTo(map, "b",
147
new String[] { "b", "bb" },
149
assertTokenizesTo(map, "z x c v",
150
new String[] { "z", "zxcv", "x", "c", "v" },
151
new int[] { 1, 0, 1, 1, 1 });
152
assertTokenizesTo(map, "z x c $",
153
new String[] { "z", "x", "xc", "c", "$" },
154
new int[] { 1, 1, 0, 1, 1 });
156
// check for lack of recursion
157
map.add(strings("zoo zoo"), tokens("zoo"), orig, merge);
158
// CHECKME: I think the previous test (with 4 zoo's), was just a typo.
159
assertTokenizesTo(map, "zoo zoo $ zoo",
160
new String[] { "zoo", "zoo", "zoo", "$", "zoo" },
161
new int[] { 1, 0, 1, 1, 1 });
163
map.add(strings("zoo"), tokens("zoo zoo"), orig, merge);
164
assertTokenizesTo(map, "zoo zoo $ zoo",
165
new String[] { "zoo", "zoo", "zoo", "$", "zoo", "zoo", "zoo" },
166
new int[] { 1, 0, 1, 1, 1, 0, 1 });
170
public void testMapMerge() throws IOException {
171
SlowSynonymMap map = new SlowSynonymMap();
173
boolean orig = false;
174
boolean merge = true;
175
map.add(strings("a"), tokens("a5,5"), orig, merge);
176
map.add(strings("a"), tokens("a3,3"), orig, merge);
178
assertTokenizesTo(map, "a",
179
new String[] { "a3", "a5" },
182
map.add(strings("b"), tokens("b3,3"), orig, merge);
183
map.add(strings("b"), tokens("b5,5"), orig, merge);
185
assertTokenizesTo(map, "b",
186
new String[] { "b3", "b5" },
189
map.add(strings("a"), tokens("A3,3"), orig, merge);
190
map.add(strings("a"), tokens("A5,5"), orig, merge);
192
assertTokenizesTo(map, "a",
193
new String[] { "a3", "A3", "a5", "A5" },
194
new int[] { 1, 0, 2, 0 });
196
map.add(strings("a"), tokens("a1"), orig, merge);
197
assertTokenizesTo(map, "a",
198
new String[] { "a1", "a3", "A3", "a5", "A5" },
199
new int[] { 1, 2, 0, 2, 0 });
201
map.add(strings("a"), tokens("a2,2"), orig, merge);
202
map.add(strings("a"), tokens("a4,4 a6,2"), orig, merge);
203
assertTokenizesTo(map, "a",
204
new String[] { "a1", "a2", "a3", "A3", "a4", "a5", "A5", "a6" },
205
new int[] { 1, 1, 1, 0, 1, 1, 0, 1 });
209
public void testOverlap() throws IOException {
210
SlowSynonymMap map = new SlowSynonymMap();
212
boolean orig = false;
213
boolean merge = true;
214
map.add(strings("qwe"), tokens("qq/ww/ee"), orig, merge);
215
map.add(strings("qwe"), tokens("xx"), orig, merge);
216
map.add(strings("qwe"), tokens("yy"), orig, merge);
217
map.add(strings("qwe"), tokens("zz"), orig, merge);
218
assertTokenizesTo(map, "$", new String[] { "$" });
219
assertTokenizesTo(map, "qwe",
220
new String[] { "qq", "ww", "ee", "xx", "yy", "zz" },
221
new int[] { 1, 0, 0, 0, 0, 0 });
223
// test merging within the map
225
map.add(strings("a"), tokens("a5,5 a8,3 a10,2"), orig, merge);
226
map.add(strings("a"), tokens("a3,3 a7,4 a9,2 a11,2 a111,100"), orig, merge);
227
assertTokenizesTo(map, "a",
228
new String[] { "a3", "a5", "a7", "a8", "a9", "a10", "a11", "a111" },
229
new int[] { 1, 2, 2, 1, 1, 1, 1, 100 });
232
public void testPositionIncrements() throws IOException {
233
SlowSynonymMap map = new SlowSynonymMap();
235
boolean orig = false;
236
boolean merge = true;
238
// test that generated tokens start at the same posInc as the original
239
map.add(strings("a"), tokens("aa"), orig, merge);
240
assertTokenizesTo(map, tokens("a,5"),
241
new String[] { "aa" },
243
assertTokenizesTo(map, tokens("a,0"),
244
new String[] { "aa" },
247
// test that offset of first replacement is ignored (always takes the orig offset)
248
map.add(strings("b"), tokens("bb,100"), orig, merge);
249
assertTokenizesTo(map, tokens("b,5"),
250
new String[] { "bb" },
252
assertTokenizesTo(map, tokens("b,0"),
253
new String[] { "bb" },
256
// test that subsequent tokens are adjusted accordingly
257
map.add(strings("c"), tokens("cc,100 c2,2"), orig, merge);
258
assertTokenizesTo(map, tokens("c,5"),
259
new String[] { "cc", "c2" },
261
assertTokenizesTo(map, tokens("c,0"),
262
new String[] { "cc", "c2" },
267
public void testPositionIncrementsWithOrig() throws IOException {
268
SlowSynonymMap map = new SlowSynonymMap();
271
boolean merge = true;
273
// test that generated tokens start at the same offset as the original
274
map.add(strings("a"), tokens("aa"), orig, merge);
275
assertTokenizesTo(map, tokens("a,5"),
276
new String[] { "a", "aa" },
278
assertTokenizesTo(map, tokens("a,0"),
279
new String[] { "a", "aa" },
282
// test that offset of first replacement is ignored (always takes the orig offset)
283
map.add(strings("b"), tokens("bb,100"), orig, merge);
284
assertTokenizesTo(map, tokens("b,5"),
285
new String[] { "b", "bb" },
287
assertTokenizesTo(map, tokens("b,0"),
288
new String[] { "b", "bb" },
291
// test that subsequent tokens are adjusted accordingly
292
map.add(strings("c"), tokens("cc,100 c2,2"), orig, merge);
293
assertTokenizesTo(map, tokens("c,5"),
294
new String[] { "c", "cc", "c2" },
295
new int[] { 5, 0, 2 });
296
assertTokenizesTo(map, tokens("c,0"),
297
new String[] { "c", "cc", "c2" },
298
new int[] { 0, 0, 2 });
302
public void testOffsetBug() throws IOException {
303
// With the following rules:
306
// analysing "a x" causes "y" to have a bad offset (end less than start)
308
SlowSynonymMap map = new SlowSynonymMap();
310
boolean orig = false;
311
boolean merge = true;
313
map.add(strings("a a"), tokens("b"), orig, merge);
314
map.add(strings("x"), tokens("y"), orig, merge);
317
assertTokenizesTo(map, tokens("a,1,0,1 a,1,2,3 x,1,4,5"),
318
new String[] { "b", "y" },
326
* Return a list of tokens according to a test string format:
327
* a b c => returns List<Token> [a,b,c]
328
* a/b => tokens a and b share the same spot (b.positionIncrement=0)
329
* a,3/b/c => a,b,c all share same position (a.positionIncrement=3, b.positionIncrement=0, c.positionIncrement=0)
330
* a,1,10,11 => "a" with positionIncrement=1, startOffset=10, endOffset=11
331
* @deprecated (3.0) does not support attributes api
334
private List<Token> tokens(String str) {
335
String[] arr = str.split(" ");
336
List<Token> result = new ArrayList<Token>();
337
for (int i=0; i<arr.length; i++) {
338
String[] toks = arr[i].split("/");
339
String[] params = toks[0].split(",");
345
if (params.length > 1) {
346
posInc = Integer.parseInt(params[1]);
351
if (params.length > 2) {
352
start = Integer.parseInt(params[2]);
357
if (params.length > 3) {
358
end = Integer.parseInt(params[3]);
360
end = start + params[0].length();
363
Token t = new Token(params[0],start,end,"TEST");
364
t.setPositionIncrement(posInc);
367
for (int j=1; j<toks.length; j++) {
368
t = new Token(toks[j],0,0,"TEST");
369
t.setPositionIncrement(0);
377
* @deprecated (3.0) does not support custom attributes
380
private static class IterTokenStream extends TokenStream {
381
final Token tokens[];
383
CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
384
OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
385
PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
386
FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
387
TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
388
PayloadAttribute payloadAtt = addAttribute(PayloadAttribute.class);
390
public IterTokenStream(Token... tokens) {
392
this.tokens = tokens;
395
public IterTokenStream(Collection<Token> tokens) {
396
this(tokens.toArray(new Token[tokens.size()]));
400
public boolean incrementToken() throws IOException {
401
if (index >= tokens.length)
405
Token token = tokens[index++];
406
termAtt.setEmpty().append(token);
407
offsetAtt.setOffset(token.startOffset(), token.endOffset());
408
posIncAtt.setPositionIncrement(token.getPositionIncrement());
409
flagsAtt.setFlags(token.getFlags());
410
typeAtt.setType(token.type());
411
payloadAtt.setPayload(token.getPayload());