1
package org.apache.lucene.search.vectorhighlight;
4
* Licensed to the Apache Software Foundation (ASF) under one or more
5
* contributor license agreements. See the NOTICE file distributed with
6
* this work for additional information regarding copyright ownership.
7
* The ASF licenses this file to You under the Apache License, Version 2.0
8
* (the "License"); you may not use this file except in compliance with
9
* the License. You may obtain a copy of the License at
11
* http://www.apache.org/licenses/LICENSE-2.0
13
* Unless required by applicable law or agreed to in writing, software
14
* distributed under the License is distributed on an "AS IS" BASIS,
15
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
* See the License for the specific language governing permissions and
17
* limitations under the License.
20
import java.io.IOException;
21
import java.io.Reader;
22
import java.util.Collection;
24
import org.apache.lucene.analysis.Analyzer;
25
import org.apache.lucene.analysis.MockAnalyzer;
26
import org.apache.lucene.analysis.MockTokenizer;
27
import org.apache.lucene.analysis.TokenStream;
28
import org.apache.lucene.analysis.Tokenizer;
29
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
30
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
31
import org.apache.lucene.document.Document;
32
import org.apache.lucene.document.Field;
33
import org.apache.lucene.document.Field.Index;
34
import org.apache.lucene.document.Field.Store;
35
import org.apache.lucene.document.Field.TermVector;
36
import org.apache.lucene.index.IndexReader;
37
import org.apache.lucene.index.IndexWriter;
38
import org.apache.lucene.index.IndexWriterConfig;
39
import org.apache.lucene.index.Term;
40
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
41
import org.apache.lucene.queryParser.QueryParser;
42
import org.apache.lucene.search.DisjunctionMaxQuery;
43
import org.apache.lucene.search.PhraseQuery;
44
import org.apache.lucene.search.Query;
45
import org.apache.lucene.search.TermQuery;
46
import org.apache.lucene.store.Directory;
47
import org.apache.lucene.util.LuceneTestCase;
49
public abstract class AbstractTestCase extends LuceneTestCase {
51
protected final String F = "f";
52
protected final String F1 = "f1";
53
protected final String F2 = "f2";
54
protected Directory dir;
55
protected Analyzer analyzerW;
56
protected Analyzer analyzerB;
57
protected Analyzer analyzerK;
58
protected IndexReader reader;
59
protected QueryParser paW;
60
protected QueryParser paB;
62
// NOTE(review): fixture for the multi-valued-field tests. This extraction
// dropped the array's other elements and its closing "};" (original lines
// 63-65 and 67-69) — restore them from the upstream file before compiling.
protected static final String[] shortMVValues = {
66
"", // empty data in multi valued field
70
// NOTE(review): long multi-valued fixture. Additional content and the
// closing "};" (original lines 73-74) were lost in extraction — restore
// from the upstream file.
protected static final String[] longMVValues = {
71
"Followings are the examples of customizable parameters and actual examples of customization:",
72
"The most search engines use only one of these methods. Even the search engines that says they can use the both methods basically"
75
// test data for LUCENE-1448 bug
76
// NOTE(review): the remaining array content and closing "};" (original
// lines 79-80) were lost in extraction — restore from the upstream file.
protected static final String[] biMVValues = {
77
"\nLucene/Solr does not require such additional hardware.",
78
"\nWhen you talk about processing speed, the"
81
// NOTE(review): not-analyzed multi-valued fixture (used by makeIndexStrMV).
// The array body and closing "};" (original lines 82-86) were lost in
// extraction — restore from the upstream file.
protected static final String[] strMVValues = {
88
public void setUp() throws Exception {
90
analyzerW = new MockAnalyzer(random, MockTokenizer.WHITESPACE, false);
91
analyzerB = new BigramAnalyzer();
92
analyzerK = new MockAnalyzer(random, MockTokenizer.KEYWORD, false);
93
paW = new QueryParser(TEST_VERSION_CURRENT, F, analyzerW );
94
paB = new QueryParser(TEST_VERSION_CURRENT, F, analyzerB );
99
public void tearDown() throws Exception {
100
if( reader != null ){
108
protected Query tq( String text ){
109
return tq( 1F, text );
112
protected Query tq( float boost, String text ){
113
return tq( boost, F, text );
116
protected Query tq( String field, String text ){
117
return tq( 1F, field, text );
120
protected Query tq( float boost, String field, String text ){
121
Query query = new TermQuery( new Term( field, text ) );
122
query.setBoost( boost );
126
protected Query pqF( String... texts ){
127
return pqF( 1F, texts );
130
protected Query pqF( float boost, String... texts ){
131
return pqF( boost, 0, texts );
134
protected Query pqF( float boost, int slop, String... texts ){
135
return pq( boost, slop, F, texts );
138
protected Query pq( String field, String... texts ){
139
return pq( 1F, 0, field, texts );
142
protected Query pq( float boost, String field, String... texts ){
143
return pq( boost, 0, field, texts );
146
protected Query pq( float boost, int slop, String field, String... texts ){
147
PhraseQuery query = new PhraseQuery();
148
for( String text : texts ){
149
query.add( new Term( field, text ) );
151
query.setBoost( boost );
152
query.setSlop( slop );
156
protected Query dmq( Query... queries ){
157
return dmq( 0.0F, queries );
160
protected Query dmq( float tieBreakerMultiplier, Query... queries ){
161
DisjunctionMaxQuery query = new DisjunctionMaxQuery( tieBreakerMultiplier );
162
for( Query q : queries ){
168
protected void assertCollectionQueries( Collection<Query> actual, Query... expected ){
169
assertEquals( expected.length, actual.size() );
170
for( Query query : expected ){
171
assertTrue( actual.contains( query ) );
175
static final class BigramAnalyzer extends Analyzer {
177
public TokenStream tokenStream(String fieldName, Reader reader) {
178
return new BasicNGramTokenizer( reader );
182
static final class BasicNGramTokenizer extends Tokenizer {
184
public static final int DEFAULT_N_SIZE = 2;
185
public static final String DEFAULT_DELIMITERS = " \t\n.,";
187
private final String delimiters;
188
private int startTerm;
190
private int startOffset;
191
private int nextStartOffset;
193
private String snippet;
194
private StringBuilder snippetBuffer;
195
private static final int BUFFER_SIZE = 4096;
196
private char[] charBuffer;
197
private int charBufferIndex;
198
private int charBufferLen;
200
public BasicNGramTokenizer( Reader in ){
201
this( in, DEFAULT_N_SIZE );
204
public BasicNGramTokenizer( Reader in, int n ){
205
this( in, n, DEFAULT_DELIMITERS );
208
public BasicNGramTokenizer( Reader in, String delimiters ){
209
this( in, DEFAULT_N_SIZE, delimiters );
212
public BasicNGramTokenizer( Reader in, int n, String delimiters ){
215
this.delimiters = delimiters;
219
snippetBuffer = new StringBuilder();
220
charBuffer = new char[BUFFER_SIZE];
221
charBufferIndex = BUFFER_SIZE;
226
CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
227
OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
229
public boolean incrementToken() throws IOException {
230
if( !getNextPartialSnippet() )
233
termAtt.setEmpty().append(snippet, startTerm, startTerm + lenTerm);
234
offsetAtt.setOffset(correctOffset(startOffset), correctOffset(startOffset + lenTerm));
238
private int getFinalOffset() {
239
return nextStartOffset;
243
public final void end(){
244
offsetAtt.setOffset(getFinalOffset(),getFinalOffset());
247
protected boolean getNextPartialSnippet() throws IOException {
248
if( snippet != null && snippet.length() >= startTerm + 1 + n ){
254
return getNextSnippet();
257
protected boolean getNextSnippet() throws IOException {
259
startOffset = nextStartOffset;
260
snippetBuffer.delete( 0, snippetBuffer.length() );
263
ch = readCharFromBuffer();
264
if( ch == -1 ) break;
265
else if( !isDelimiter( ch ) )
266
snippetBuffer.append( (char)ch );
267
else if( snippetBuffer.length() > 0 )
272
if( snippetBuffer.length() == 0 )
274
snippet = snippetBuffer.toString();
275
lenTerm = snippet.length() >= n ? n : snippet.length();
279
protected int readCharFromBuffer() throws IOException {
280
if( charBufferIndex >= charBufferLen ){
281
charBufferLen = input.read( charBuffer );
282
if( charBufferLen == -1 ){
287
int c = charBuffer[charBufferIndex++];
292
protected boolean isDelimiter( int c ){
293
return delimiters.indexOf( c ) >= 0;
297
public void reset( Reader input ) throws IOException {
298
super.reset( input );
303
public void reset() throws IOException {
307
snippetBuffer.setLength( 0 );
308
charBufferIndex = BUFFER_SIZE;
314
protected void make1d1fIndex( String value ) throws Exception {
315
make1dmfIndex( value );
318
protected void make1d1fIndexB( String value ) throws Exception {
319
make1dmfIndexB( value );
322
protected void make1dmfIndex( String... values ) throws Exception {
323
make1dmfIndex( analyzerW, values );
326
protected void make1dmfIndexB( String... values ) throws Exception {
327
make1dmfIndex( analyzerB, values );
330
// make 1 doc with multi valued field
331
protected void make1dmfIndex( Analyzer analyzer, String... values ) throws Exception {
332
IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(
333
TEST_VERSION_CURRENT, analyzer).setOpenMode(OpenMode.CREATE));
334
Document doc = new Document();
335
for( String value: values )
336
doc.add( new Field( F, value, Store.YES, Index.ANALYZED, TermVector.WITH_POSITIONS_OFFSETS ) );
337
writer.addDocument( doc );
339
if (reader != null) reader.close();
340
reader = IndexReader.open( dir, true );
343
// make 1 doc with multi valued & not analyzed field
344
protected void make1dmfIndexNA( String... values ) throws Exception {
345
IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(
346
TEST_VERSION_CURRENT, analyzerK).setOpenMode(OpenMode.CREATE));
347
Document doc = new Document();
348
for( String value: values )
349
doc.add( new Field( F, value, Store.YES, Index.NOT_ANALYZED, TermVector.WITH_POSITIONS_OFFSETS ) );
350
writer.addDocument( doc );
352
if (reader != null) reader.close();
353
reader = IndexReader.open( dir, true );
356
protected void makeIndexShortMV() throws Exception {
374
make1dmfIndex( shortMVValues );
377
protected void makeIndexLongMV() throws Exception {
378
// 11111111112222222222333333333344444444445555555555666666666677777777778888888888999
379
// 012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012
380
// Followings are the examples of customizable parameters and actual examples of customization:
381
// 0 1 2 3 4 5 6 7 8 9 10 11
384
// 999999900000000001111111111222222222233333333334444444444555555555566666666667777777777888888888899999999990000000000111111111122
385
// 345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901
386
// The most search engines use only one of these methods. Even the search engines that says they can use the both methods basically
387
// 12 13 (14) (15) 16 17 18 19 20 21 22 23 (24) (25) 26 27 28 29 30 31 32 33 34
389
make1dmfIndex( longMVValues );
392
protected void makeIndexLongMVB() throws Exception {
395
// 1111111111222222222233333333334444444444555555
396
// 01234567890123456789012345678901234567890123456789012345
397
// *Lucene/Solr does not require such additional hardware.
398
// Lu 0 do 10 re 15 su 21 na 31
399
// uc 1 oe 11 eq 16 uc 22 al 32
400
// ce 2 es 12 qu 17 ch 23 ha 33
401
// en 3 no 13 ui 18 ad 24 ar 34
402
// ne 4 ot 14 ir 19 dd 25 rd 35
403
// e/ 5 re 20 di 26 dw 36
409
// 5555666666666677777777778888888888999999999
410
// 6789012345678901234567890123456789012345678
411
// *When you talk about processing speed, the
412
// Wh 40 ab 48 es 56 th 65
413
// he 41 bo 49 ss 57 he 66
422
make1dmfIndexB( biMVValues );
425
protected void makeIndexStrMV() throws Exception {
436
make1dmfIndexNA( strMVValues );