1
package org.apache.lucene.analysis.shingle;
4
* Licensed to the Apache Software Foundation (ASF) under one or more
5
* contributor license agreements. See the NOTICE file distributed with
6
* this work for additional information regarding copyright ownership.
7
* The ASF licenses this file to You under the Apache License, Version 2.0
8
* (the "License"); you may not use this file except in compliance with
9
* the License. You may obtain a copy of the License at
11
* http://www.apache.org/licenses/LICENSE-2.0
13
* Unless required by applicable law or agreed to in writing, software
14
* distributed under the License is distributed on an "AS IS" BASIS,
15
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
* See the License for the specific language governing permissions and
17
* limitations under the License.
20
import java.io.IOException;
21
import java.io.StringReader;
22
import java.util.Collection;
23
import java.util.Iterator;
24
import java.util.LinkedList;
26
import org.apache.lucene.analysis.*;
27
import org.apache.lucene.analysis.miscellaneous.EmptyTokenStream;
28
import org.apache.lucene.analysis.miscellaneous.PrefixAndSuffixAwareTokenFilter;
29
import org.apache.lucene.analysis.miscellaneous.SingleTokenTokenStream;
30
import org.apache.lucene.analysis.payloads.PayloadHelper;
31
import org.apache.lucene.analysis.shingle.ShingleMatrixFilter.Matrix;
32
import org.apache.lucene.analysis.shingle.ShingleMatrixFilter.Matrix.Column;
33
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
34
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
35
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
36
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
37
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
38
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
41
public class TestShingleMatrixFilter extends BaseTokenStreamTestCase {
43
public void testIterator() throws IOException {
45
WhitespaceTokenizer wst = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("one two three four five"));
46
ShingleMatrixFilter smf = new ShingleMatrixFilter(wst, 2, 2, '_', false, new ShingleMatrixFilter.OneDimensionalNonWeightedTokenSettingsCodec());
49
for(i=0; smf.incrementToken(); i++) {}
52
// call next once more. this should return false again rather than throwing an exception (LUCENE-1939)
53
assertFalse(smf.incrementToken());
55
System.currentTimeMillis();
59
public void testBehavingAsShingleFilter() throws IOException {
61
ShingleMatrixFilter.defaultSettingsCodec = null;
65
ts = new ShingleMatrixFilter(new EmptyTokenStream(), 1, 2, new Character(' '), false, new ShingleMatrixFilter.OneDimensionalNonWeightedTokenSettingsCodec());
66
assertFalse(ts.incrementToken());
69
LinkedList<Token> tokens;
71
// test a plain old token stream with synonyms translated to rows.
73
tokens = new LinkedList<Token>();
74
tokens.add(createToken("please", 0, 6));
75
tokens.add(createToken("divide", 7, 13));
76
tokens.add(createToken("this", 14, 18));
77
tokens.add(createToken("sentence", 19, 27));
78
tokens.add(createToken("into", 28, 32));
79
tokens.add(createToken("shingles", 33, 39));
81
tls = new TokenListStream(tokens);
85
ts = new ShingleMatrixFilter(tls, 1, 2, new Character(' '), false, new ShingleMatrixFilter.OneDimensionalNonWeightedTokenSettingsCodec());
87
assertTokenStreamContents(ts,
88
new String[] { "please", "please divide", "divide", "divide this",
89
"this", "this sentence", "sentence", "sentence into", "into",
90
"into shingles", "shingles" },
91
new int[] { 0, 0, 7, 7, 14, 14, 19, 19, 28, 28, 33 },
92
new int[] { 6, 13, 13, 18, 18, 27, 27, 32, 32, 39, 39 });
/**
 * Extracts a matrix from a token stream.
 */
99
public void testTokenStream() throws IOException {
101
ShingleMatrixFilter.defaultSettingsCodec = null;//new ShingleMatrixFilter.SimpleThreeDimensionalTokenSettingsCodec();
105
LinkedList<Token> tokens;
107
// test a plain old token stream with synonyms tranlated to rows.
109
tokens = new LinkedList<Token>();
110
tokens.add(tokenFactory("hello", 1, 0, 4));
111
tokens.add(tokenFactory("greetings", 0, 0, 4));
112
tokens.add(tokenFactory("world", 1, 5, 10));
113
tokens.add(tokenFactory("earth", 0, 5, 10));
114
tokens.add(tokenFactory("tellus", 0, 5, 10));
116
tls = new TokenListStream(tokens);
120
ts = new ShingleMatrixFilter(tls, 2, 2, new Character('_'), false, new ShingleMatrixFilter.TwoDimensionalNonWeightedSynonymTokenSettingsCodec());
122
assertNext(ts, "hello_world");
123
assertNext(ts, "greetings_world");
124
assertNext(ts, "hello_earth");
125
assertNext(ts, "greetings_earth");
126
assertNext(ts, "hello_tellus");
127
assertNext(ts, "greetings_tellus");
128
assertFalse(ts.incrementToken());
130
// bi-grams with no spacer character, start offset, end offset
133
ts = new ShingleMatrixFilter(tls, 2, 2, null, false, new ShingleMatrixFilter.TwoDimensionalNonWeightedSynonymTokenSettingsCodec());
134
assertNext(ts, "helloworld", 0, 10);
135
assertNext(ts, "greetingsworld", 0, 10);
136
assertNext(ts, "helloearth", 0, 10);
137
assertNext(ts, "greetingsearth", 0, 10);
138
assertNext(ts, "hellotellus", 0, 10);
139
assertNext(ts, "greetingstellus", 0, 10);
140
assertFalse(ts.incrementToken());
143
// add ^_prefix_and_suffix_$
145
// using 3d codec as it supports weights
147
ShingleMatrixFilter.defaultSettingsCodec = new ShingleMatrixFilter.SimpleThreeDimensionalTokenSettingsCodec();
149
tokens = new LinkedList<Token>();
150
tokens.add(tokenFactory("hello", 1, 1f, 0, 4, ShingleMatrixFilter.TokenPositioner.newColumn));
151
tokens.add(tokenFactory("greetings", 0, 1f, 0, 4, ShingleMatrixFilter.TokenPositioner.newRow));
152
tokens.add(tokenFactory("world", 1, 1f, 5, 10, ShingleMatrixFilter.TokenPositioner.newColumn));
153
tokens.add(tokenFactory("earth", 0, 1f, 5, 10, ShingleMatrixFilter.TokenPositioner.newRow));
154
tokens.add(tokenFactory("tellus", 0, 1f, 5, 10, ShingleMatrixFilter.TokenPositioner.newRow));
156
tls = new TokenListStream(tokens);
158
ts = new PrefixAndSuffixAwareTokenFilter(new SingleTokenTokenStream(tokenFactory("^", 1, 100f, 0, 0)), tls, new SingleTokenTokenStream(tokenFactory("$", 1, 50f, 0, 0)));
159
tls = new CachingTokenFilter(ts);
161
// bi-grams, position incrememnt, weight, start offset, end offset
163
ts = new ShingleMatrixFilter(tls, 2, 2, new Character('_'), false);
165
// for (Token token = ts.next(new Token()); token != null; token = ts.next(token)) {
166
// System.out.println("assertNext(ts, \"" + token.term() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");");
170
assertNext(ts, "^_hello", 1, 10.049875f, 0, 4);
171
assertNext(ts, "^_greetings", 1, 10.049875f, 0, 4);
172
assertNext(ts, "hello_world", 1, 1.4142135f, 0, 10);
173
assertNext(ts, "greetings_world", 1, 1.4142135f, 0, 10);
174
assertNext(ts, "hello_earth", 1, 1.4142135f, 0, 10);
175
assertNext(ts, "greetings_earth", 1, 1.4142135f, 0, 10);
176
assertNext(ts, "hello_tellus", 1, 1.4142135f, 0, 10);
177
assertNext(ts, "greetings_tellus", 1, 1.4142135f, 0, 10);
178
assertNext(ts, "world_$", 1, 7.1414285f, 5, 10);
179
assertNext(ts, "earth_$", 1, 7.1414285f, 5, 10);
180
assertNext(ts, "tellus_$", 1, 7.1414285f, 5, 10);
181
assertFalse(ts.incrementToken());
183
// test unlimited size and allow single boundary token as shingle
185
ts = new ShingleMatrixFilter(tls, 1, Integer.MAX_VALUE, new Character('_'), false);
188
// for (Token token = ts.next(new Token()); token != null; token = ts.next(token)) {
189
// System.out.println("assertNext(ts, \"" + token.term() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");");
193
assertNext(ts, "^", 1, 10.0f, 0, 0);
194
assertNext(ts, "^_hello", 1, 10.049875f, 0, 4);
195
assertNext(ts, "^_hello_world", 1, 10.099504f, 0, 10);
196
assertNext(ts, "^_hello_world_$", 1, 12.328828f, 0, 10);
197
assertNext(ts, "hello", 1, 1.0f, 0, 4);
198
assertNext(ts, "hello_world", 1, 1.4142135f, 0, 10);
199
assertNext(ts, "hello_world_$", 1, 7.2111025f, 0, 10);
200
assertNext(ts, "world", 1, 1.0f, 5, 10);
201
assertNext(ts, "world_$", 1, 7.1414285f, 5, 10);
202
assertNext(ts, "$", 1, 7.071068f, 10, 10);
203
assertNext(ts, "^_greetings", 1, 10.049875f, 0, 4);
204
assertNext(ts, "^_greetings_world", 1, 10.099504f, 0, 10);
205
assertNext(ts, "^_greetings_world_$", 1, 12.328828f, 0, 10);
206
assertNext(ts, "greetings", 1, 1.0f, 0, 4);
207
assertNext(ts, "greetings_world", 1, 1.4142135f, 0, 10);
208
assertNext(ts, "greetings_world_$", 1, 7.2111025f, 0, 10);
209
assertNext(ts, "^_hello_earth", 1, 10.099504f, 0, 10);
210
assertNext(ts, "^_hello_earth_$", 1, 12.328828f, 0, 10);
211
assertNext(ts, "hello_earth", 1, 1.4142135f, 0, 10);
212
assertNext(ts, "hello_earth_$", 1, 7.2111025f, 0, 10);
213
assertNext(ts, "earth", 1, 1.0f, 5, 10);
214
assertNext(ts, "earth_$", 1, 7.1414285f, 5, 10);
215
assertNext(ts, "^_greetings_earth", 1, 10.099504f, 0, 10);
216
assertNext(ts, "^_greetings_earth_$", 1, 12.328828f, 0, 10);
217
assertNext(ts, "greetings_earth", 1, 1.4142135f, 0, 10);
218
assertNext(ts, "greetings_earth_$", 1, 7.2111025f, 0, 10);
219
assertNext(ts, "^_hello_tellus", 1, 10.099504f, 0, 10);
220
assertNext(ts, "^_hello_tellus_$", 1, 12.328828f, 0, 10);
221
assertNext(ts, "hello_tellus", 1, 1.4142135f, 0, 10);
222
assertNext(ts, "hello_tellus_$", 1, 7.2111025f, 0, 10);
223
assertNext(ts, "tellus", 1, 1.0f, 5, 10);
224
assertNext(ts, "tellus_$", 1, 7.1414285f, 5, 10);
225
assertNext(ts, "^_greetings_tellus", 1, 10.099504f, 0, 10);
226
assertNext(ts, "^_greetings_tellus_$", 1, 12.328828f, 0, 10);
227
assertNext(ts, "greetings_tellus", 1, 1.4142135f, 0, 10);
228
assertNext(ts, "greetings_tellus_$", 1, 7.2111025f, 0, 10);
230
assertFalse(ts.incrementToken());
232
// test unlimited size but don't allow single boundary token as shingle
235
ts = new ShingleMatrixFilter(tls, 1, Integer.MAX_VALUE, new Character('_'), true);
236
// for (Token token = ts.next(new Token()); token != null; token = ts.next(token)) {
237
// System.out.println("assertNext(ts, \"" + token.term() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");");
241
assertNext(ts, "^_hello", 1, 10.049875f, 0, 4);
242
assertNext(ts, "^_hello_world", 1, 10.099504f, 0, 10);
243
assertNext(ts, "^_hello_world_$", 1, 12.328828f, 0, 10);
244
assertNext(ts, "hello", 1, 1.0f, 0, 4);
245
assertNext(ts, "hello_world", 1, 1.4142135f, 0, 10);
246
assertNext(ts, "hello_world_$", 1, 7.2111025f, 0, 10);
247
assertNext(ts, "world", 1, 1.0f, 5, 10);
248
assertNext(ts, "world_$", 1, 7.1414285f, 5, 10);
249
assertNext(ts, "^_greetings", 1, 10.049875f, 0, 4);
250
assertNext(ts, "^_greetings_world", 1, 10.099504f, 0, 10);
251
assertNext(ts, "^_greetings_world_$", 1, 12.328828f, 0, 10);
252
assertNext(ts, "greetings", 1, 1.0f, 0, 4);
253
assertNext(ts, "greetings_world", 1, 1.4142135f, 0, 10);
254
assertNext(ts, "greetings_world_$", 1, 7.2111025f, 0, 10);
255
assertNext(ts, "^_hello_earth", 1, 10.099504f, 0, 10);
256
assertNext(ts, "^_hello_earth_$", 1, 12.328828f, 0, 10);
257
assertNext(ts, "hello_earth", 1, 1.4142135f, 0, 10);
258
assertNext(ts, "hello_earth_$", 1, 7.2111025f, 0, 10);
259
assertNext(ts, "earth", 1, 1.0f, 5, 10);
260
assertNext(ts, "earth_$", 1, 7.1414285f, 5, 10);
261
assertNext(ts, "^_greetings_earth", 1, 10.099504f, 0, 10);
262
assertNext(ts, "^_greetings_earth_$", 1, 12.328828f, 0, 10);
263
assertNext(ts, "greetings_earth", 1, 1.4142135f, 0, 10);
264
assertNext(ts, "greetings_earth_$", 1, 7.2111025f, 0, 10);
265
assertNext(ts, "^_hello_tellus", 1, 10.099504f, 0, 10);
266
assertNext(ts, "^_hello_tellus_$", 1, 12.328828f, 0, 10);
267
assertNext(ts, "hello_tellus", 1, 1.4142135f, 0, 10);
268
assertNext(ts, "hello_tellus_$", 1, 7.2111025f, 0, 10);
269
assertNext(ts, "tellus", 1, 1.0f, 5, 10);
270
assertNext(ts, "tellus_$", 1, 7.1414285f, 5, 10);
271
assertNext(ts, "^_greetings_tellus", 1, 10.099504f, 0, 10);
272
assertNext(ts, "^_greetings_tellus_$", 1, 12.328828f, 0, 10);
273
assertNext(ts, "greetings_tellus", 1, 1.4142135f, 0, 10);
274
assertNext(ts, "greetings_tellus_$", 1, 7.2111025f, 0, 10);
277
assertFalse(ts.incrementToken());
279
System.currentTimeMillis();
281
// multi-token synonyms
284
// {{hello}, {greetings, and, salutations},
285
// {{world}, {earth}, {tellus}}
290
tokens = new LinkedList<Token>();
291
tokens.add(tokenFactory("hello", 1, 1f, 0, 4, ShingleMatrixFilter.TokenPositioner.newColumn));
292
tokens.add(tokenFactory("greetings", 1, 1f, 0, 4, ShingleMatrixFilter.TokenPositioner.newRow));
293
tokens.add(tokenFactory("and", 1, 1f, 0, 4, ShingleMatrixFilter.TokenPositioner.sameRow));
294
tokens.add(tokenFactory("salutations", 1, 1f, 0, 4, ShingleMatrixFilter.TokenPositioner.sameRow));
295
tokens.add(tokenFactory("world", 1, 1f, 5, 10, ShingleMatrixFilter.TokenPositioner.newColumn));
296
tokens.add(tokenFactory("earth", 1, 1f, 5, 10, ShingleMatrixFilter.TokenPositioner.newRow));
297
tokens.add(tokenFactory("tellus", 1, 1f, 5, 10, ShingleMatrixFilter.TokenPositioner.newRow));
299
tls = new TokenListStream(tokens);
303
ts = new ShingleMatrixFilter(tls, 2, 3, new Character('_'), false);
305
// for (Token token = ts.next(new Token()); token != null; token = ts.next(token)) {
306
// System.out.println("assertNext(ts, \"" + token.term() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");");
310
// shingle, position increment, weight, start offset, end offset
312
assertNext(ts, "hello_world", 1, 1.4142135f, 0, 10);
313
assertNext(ts, "greetings_and", 1, 1.4142135f, 0, 4);
314
assertNext(ts, "greetings_and_salutations", 1, 1.7320508f, 0, 4);
315
assertNext(ts, "and_salutations", 1, 1.4142135f, 0, 4);
316
assertNext(ts, "and_salutations_world", 1, 1.7320508f, 0, 10);
317
assertNext(ts, "salutations_world", 1, 1.4142135f, 0, 10);
318
assertNext(ts, "hello_earth", 1, 1.4142135f, 0, 10);
319
assertNext(ts, "and_salutations_earth", 1, 1.7320508f, 0, 10);
320
assertNext(ts, "salutations_earth", 1, 1.4142135f, 0, 10);
321
assertNext(ts, "hello_tellus", 1, 1.4142135f, 0, 10);
322
assertNext(ts, "and_salutations_tellus", 1, 1.7320508f, 0, 10);
323
assertNext(ts, "salutations_tellus", 1, 1.4142135f, 0, 10);
325
assertFalse(ts.incrementToken());
327
System.currentTimeMillis();
/**
 * Tests creating shingles from a pre-assembled matrix.
 *
 * Tests the row token z-axis, multi token synonyms.
 *
 * @throws IOException
 */
public void testMatrix() throws IOException {
340
// some other tests set this to null.
341
// set it here in case tests are run out of the usual order.
342
ShingleMatrixFilter.defaultSettingsCodec = new ShingleMatrixFilter.SimpleThreeDimensionalTokenSettingsCodec();
343
Matrix matrix = new Matrix();
345
matrix.new Column(tokenFactory("no", 1));
346
matrix.new Column(tokenFactory("surprise", 1));
347
matrix.new Column(tokenFactory("to", 1));
348
matrix.new Column(tokenFactory("see", 1));
349
matrix.new Column(tokenFactory("england", 1));
350
matrix.new Column(tokenFactory("manager", 1));
352
Column col = matrix.new Column();
354
// sven göran eriksson is a multi token synonym to svennis
355
col.new Row().getTokens().add(tokenFactory("svennis", 1));
357
Column.Row row = col.new Row();
358
row.getTokens().add(tokenFactory("sven", 1));
359
row.getTokens().add(tokenFactory("göran", 1));
360
row.getTokens().add(tokenFactory("eriksson", 1));
362
matrix.new Column(tokenFactory("in", 1));
363
matrix.new Column(tokenFactory("the", 1));
364
matrix.new Column(tokenFactory("croud", 1));
366
TokenStream ts = new ShingleMatrixFilter(matrix, 2, 4, new Character('_'), true, new ShingleMatrixFilter.SimpleThreeDimensionalTokenSettingsCodec());
368
// for (Token token = ts.next(new Token()); token != null; token = ts.next(token)) {
369
// System.out.println("assertNext(ts, \"" + token.term() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");");
373
assertNext(ts, "no_surprise", 1, 1.4142135f, 0, 0);
374
assertNext(ts, "no_surprise_to", 1, 1.7320508f, 0, 0);
375
assertNext(ts, "no_surprise_to_see", 1, 2.0f, 0, 0);
376
assertNext(ts, "surprise_to", 1, 1.4142135f, 0, 0);
377
assertNext(ts, "surprise_to_see", 1, 1.7320508f, 0, 0);
378
assertNext(ts, "surprise_to_see_england", 1, 2.0f, 0, 0);
379
assertNext(ts, "to_see", 1, 1.4142135f, 0, 0);
380
assertNext(ts, "to_see_england", 1, 1.7320508f, 0, 0);
381
assertNext(ts, "to_see_england_manager", 1, 2.0f, 0, 0);
382
assertNext(ts, "see_england", 1, 1.4142135f, 0, 0);
383
assertNext(ts, "see_england_manager", 1, 1.7320508f, 0, 0);
384
assertNext(ts, "see_england_manager_svennis", 1, 2.0f, 0, 0);
385
assertNext(ts, "england_manager", 1, 1.4142135f, 0, 0);
386
assertNext(ts, "england_manager_svennis", 1, 1.7320508f, 0, 0);
387
assertNext(ts, "england_manager_svennis_in", 1, 2.0f, 0, 0);
388
assertNext(ts, "manager_svennis", 1, 1.4142135f, 0, 0);
389
assertNext(ts, "manager_svennis_in", 1, 1.7320508f, 0, 0);
390
assertNext(ts, "manager_svennis_in_the", 1, 2.0f, 0, 0);
391
assertNext(ts, "svennis_in", 1, 1.4142135f, 0, 0);
392
assertNext(ts, "svennis_in_the", 1, 1.7320508f, 0, 0);
393
assertNext(ts, "svennis_in_the_croud", 1, 2.0f, 0, 0);
394
assertNext(ts, "in_the", 1, 1.4142135f, 0, 0);
395
assertNext(ts, "in_the_croud", 1, 1.7320508f, 0, 0);
396
assertNext(ts, "the_croud", 1, 1.4142135f, 0, 0);
397
assertNext(ts, "see_england_manager_sven", 1, 2.0f, 0, 0);
398
assertNext(ts, "england_manager_sven", 1, 1.7320508f, 0, 0);
399
assertNext(ts, "england_manager_sven_göran", 1, 2.0f, 0, 0);
400
assertNext(ts, "manager_sven", 1, 1.4142135f, 0, 0);
401
assertNext(ts, "manager_sven_göran", 1, 1.7320508f, 0, 0);
402
assertNext(ts, "manager_sven_göran_eriksson", 1, 2.0f, 0, 0);
403
assertNext(ts, "sven_göran", 1, 1.4142135f, 0, 0);
404
assertNext(ts, "sven_göran_eriksson", 1, 1.7320508f, 0, 0);
405
assertNext(ts, "sven_göran_eriksson_in", 1, 2.0f, 0, 0);
406
assertNext(ts, "göran_eriksson", 1, 1.4142135f, 0, 0);
407
assertNext(ts, "göran_eriksson_in", 1, 1.7320508f, 0, 0);
408
assertNext(ts, "göran_eriksson_in_the", 1, 2.0f, 0, 0);
409
assertNext(ts, "eriksson_in", 1, 1.4142135f, 0, 0);
410
assertNext(ts, "eriksson_in_the", 1, 1.7320508f, 0, 0);
411
assertNext(ts, "eriksson_in_the_croud", 1, 2.0f, 0, 0);
413
assertFalse(ts.incrementToken());
417
private Token tokenFactory(String text, int posIncr, int startOffset, int endOffset) {
418
Token token = new Token(startOffset, endOffset);
419
token.setEmpty().append(text);
420
token.setPositionIncrement(posIncr);
425
private Token tokenFactory(String text, int posIncr) {
426
return tokenFactory(text, posIncr, 1f, 0, 0);
429
private Token tokenFactory(String text, int posIncr, float weight, int startOffset, int endOffset) {
430
Token token = new Token(startOffset, endOffset);
431
token.setEmpty().append(text);
432
token.setPositionIncrement(posIncr);
433
ShingleMatrixFilter.defaultSettingsCodec.setWeight(token, weight);
437
private Token tokenFactory(String text, int posIncr, float weight, int startOffset, int endOffset, ShingleMatrixFilter.TokenPositioner positioner) {
438
Token token = new Token(startOffset, endOffset);
439
token.setEmpty().append(text);
440
token.setPositionIncrement(posIncr);
441
ShingleMatrixFilter.defaultSettingsCodec.setWeight(token, weight);
442
ShingleMatrixFilter.defaultSettingsCodec.setTokenPositioner(token, positioner);
446
// assert-methods start here
448
private void assertNext(TokenStream ts, String text) throws IOException {
449
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
451
assertTrue(ts.incrementToken());
452
assertEquals(text, termAtt.toString());
455
private void assertNext(TokenStream ts, String text, int positionIncrement, float boost, int startOffset, int endOffset) throws IOException {
456
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
457
PositionIncrementAttribute posIncrAtt = ts.addAttribute(PositionIncrementAttribute.class);
458
PayloadAttribute payloadAtt = ts.addAttribute(PayloadAttribute.class);
459
OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
461
assertTrue(ts.incrementToken());
462
assertEquals(text, termAtt.toString());
463
assertEquals(positionIncrement, posIncrAtt.getPositionIncrement());
464
assertEquals(boost, payloadAtt.getPayload() == null ? 1f : PayloadHelper.decodeFloat(payloadAtt.getPayload().getData()), 0);
465
assertEquals(startOffset, offsetAtt.startOffset());
466
assertEquals(endOffset, offsetAtt.endOffset());
469
private void assertNext(TokenStream ts, String text, int startOffset, int endOffset) throws IOException {
470
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
471
OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
473
assertTrue(ts.incrementToken());
474
assertEquals(text, termAtt.toString());
475
assertEquals(startOffset, offsetAtt.startOffset());
476
assertEquals(endOffset, offsetAtt.endOffset());
479
private static Token createToken(String term, int start, int offset)
481
Token token = new Token(start, offset);
482
token.setEmpty().append(term);
487
public final static class TokenListStream extends TokenStream {
489
private Collection<Token> tokens;
490
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
491
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
492
private final PayloadAttribute payloadAtt = addAttribute(PayloadAttribute.class);
493
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
494
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
495
private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
497
public TokenListStream(Collection<Token> tokens) {
498
this.tokens = tokens;
501
private Iterator<Token> iterator;
504
public boolean incrementToken() throws IOException {
505
if (iterator == null) {
506
iterator = tokens.iterator();
508
if (!iterator.hasNext()) {
511
Token prototype = iterator.next();
513
termAtt.copyBuffer(prototype.buffer(), 0, prototype.length());
514
posIncrAtt.setPositionIncrement(prototype.getPositionIncrement());
515
flagsAtt.setFlags(prototype.getFlags());
516
offsetAtt.setOffset(prototype.startOffset(), prototype.endOffset());
517
typeAtt.setType(prototype.type());
518
payloadAtt.setPayload(prototype.getPayload());
525
public void reset() throws IOException {