1
package org.apache.lucene.search.highlight;
4
* Licensed to the Apache Software Foundation (ASF) under one or more
5
* contributor license agreements. See the NOTICE file distributed with
6
* this work for additional information regarding copyright ownership.
7
* The ASF licenses this file to You under the Apache License, Version 2.0
8
* (the "License"); you may not use this file except in compliance with
9
* the License. You may obtain a copy of the License at
11
* http://www.apache.org/licenses/LICENSE-2.0
13
* Unless required by applicable law or agreed to in writing, software
14
* distributed under the License is distributed on an "AS IS" BASIS,
15
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
* See the License for the specific language governing permissions and
17
* limitations under the License.
20
import java.io.IOException;
21
import java.io.Reader;
23
import org.apache.lucene.analysis.Analyzer;
24
import org.apache.lucene.analysis.Token;
25
import org.apache.lucene.analysis.TokenStream;
26
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
27
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
28
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
29
import org.apache.lucene.document.Document;
30
import org.apache.lucene.document.Field;
31
import org.apache.lucene.document.Field.TermVector;
32
import org.apache.lucene.index.CorruptIndexException;
33
import org.apache.lucene.index.IndexReader;
34
import org.apache.lucene.index.IndexWriter;
35
import org.apache.lucene.index.Term;
36
import org.apache.lucene.index.TermPositionVector;
37
import org.apache.lucene.search.DisjunctionMaxQuery;
38
import org.apache.lucene.search.IndexSearcher;
39
import org.apache.lucene.search.Query;
40
import org.apache.lucene.search.TopDocs;
41
import org.apache.lucene.search.spans.SpanNearQuery;
42
import org.apache.lucene.search.spans.SpanQuery;
43
import org.apache.lucene.search.spans.SpanTermQuery;
44
import org.apache.lucene.store.Directory;
45
import org.apache.lucene.store.LockObtainFailedException;
46
import org.apache.lucene.util.LuceneTestCase;
49
// Test case exercising the highlighter on token streams obtained through
// TokenSources from a document's term vector, where the indexed tokens
// include one that overlaps its neighbours.
public class TokenSourcesTest extends LuceneTestCase {
50
// Name of the single field that the tests in this class index and query.
private static final String FIELD = "text";
52
// Analyzer handed to the IndexWriter configuration in the tests below.
// Its tokenStream method always produces a fresh TokenStreamOverlap.
private static final class OverlapAnalyzer extends Analyzer {
55
// Neither fieldName nor reader is used by this implementation.
public TokenStream tokenStream(String fieldName, Reader reader) {
56
return new TokenStreamOverlap();
60
// Token stream that emits a hard-coded array of tokens instead of analysing
// any input. The array holds the words "the", "fox", "did", "not", "jump"
// plus an extra "{fox}" token.
// NOTE(review): several lines of this class are absent from this listing
// (for example the declaration of the index `i` used below and the method
// boundaries) - confirm against the complete file before editing.
private static final class TokenStreamOverlap extends TokenStream {
61
// Tokens replayed by incrementToken(); assigned further down.
private Token[] tokens;
65
// Attributes filled in from the current token on each incrementToken() call.
private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
66
private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
67
private final PositionIncrementAttribute positionIncrementAttribute = addAttribute(PositionIncrementAttribute.class);
69
public TokenStreamOverlap() {
74
public boolean incrementToken() throws IOException {
76
// Guard for having run past the last token in the array.
// NOTE(review): the statements that advance `i` and the body of this branch
// are not visible here.
if (this.i >= this.tokens.length) {
80
// Copy the current token's term text, offsets and position increment into
// the stream's attributes.
termAttribute.setEmpty().append(this.tokens[i]);
81
offsetAttribute.setOffset(this.tokens[i].startOffset(),
82
this.tokens[i].endOffset());
83
positionIncrementAttribute.setPositionIncrement(this.tokens[i]
84
.getPositionIncrement());
91
// Fixed token data. The last two constructor arguments look like start/end
// character offsets into "the fox did not jump" - TODO confirm against the
// Token constructor. On that reading "{fox}" covers 0-7, i.e. the same range
// as "the" (0-3) and "fox" (4-7) together.
this.tokens = new Token[] {
92
new Token(new char[] {'t', 'h', 'e'}, 0, 3, 0, 3),
93
new Token(new char[] {'{', 'f', 'o', 'x', '}'}, 0, 5, 0, 7),
94
new Token(new char[] {'f', 'o', 'x'}, 0, 3, 4, 7),
95
new Token(new char[] {'d', 'i', 'd'}, 0, 3, 8, 11),
96
new Token(new char[] {'n', 'o', 't'}, 0, 3, 12, 15),
97
new Token(new char[] {'j', 'u', 'm', 'p'}, 0, 4, 16, 20)};
98
// Give the "{fox}" token (index 1) a position increment of 0.
this.tokens[1].setPositionIncrement(0);
102
// Indexes one document whose FIELD is fed from a TokenStreamOverlap and stored
// with TermVector.WITH_OFFSETS, searches it with a DisjunctionMaxQuery over
// the terms "{fox}" and "fox", and asserts that highlighting TEXT yields
// "<B>the fox</B> did not jump".
// NOTE(review): some lines of this method are missing from this listing
// (e.g. the TokenSources method call and any cleanup other than
// indexSearcher.close()) - confirm against the complete file.
public void testOverlapWithOffset() throws CorruptIndexException,
103
LockObtainFailedException, IOException, InvalidTokenOffsetsException {
104
final String TEXT = "the fox did not jump";
105
final Directory directory = newDirectory();
106
final IndexWriter indexWriter = new IndexWriter(directory,
107
newIndexWriterConfig(TEST_VERSION_CURRENT, new OverlapAnalyzer()));
109
// The field value is supplied as a ready-made token stream, not as text.
final Document document = new Document();
110
document.add(new Field(FIELD, new TokenStreamOverlap(),
111
TermVector.WITH_OFFSETS));
112
indexWriter.addDocument(document);
116
final IndexReader indexReader = IndexReader.open(directory, true);
118
// Exactly one document was added above.
assertEquals(1, indexReader.numDocs());
119
final IndexSearcher indexSearcher = newSearcher(indexReader);
121
// Query on either the overlapping "{fox}" token or the plain "fox" token.
final DisjunctionMaxQuery query = new DisjunctionMaxQuery(1);
122
query.add(new SpanTermQuery(new Term(FIELD, "{fox}")));
123
query.add(new SpanTermQuery(new Term(FIELD, "fox")));
124
// final Query phraseQuery = new SpanNearQuery(new SpanQuery[] {
125
// new SpanTermQuery(new Term(FIELD, "{fox}")),
126
// new SpanTermQuery(new Term(FIELD, "fox")) }, 0, true);
128
TopDocs hits = indexSearcher.search(query, 1);
129
assertEquals(1, hits.totalHits);
130
final Highlighter highlighter = new Highlighter(
131
new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
132
new QueryScorer(query));
133
// Token stream for highlighting comes from TokenSources, which is passed the
// term vector of document 0 for FIELD.
final TokenStream tokenStream = TokenSources
135
(TermPositionVector) indexReader.getTermFreqVector(0, FIELD),
137
// The whole "the fox" range is expected to be wrapped in a single <B> tag.
assertEquals("<B>the fox</B> did not jump",
138
highlighter.getBestFragment(tokenStream, TEXT));
140
indexSearcher.close();
148
// Same scenario as the offsets-only DisjunctionMaxQuery test, except that the
// field is stored with TermVector.WITH_POSITIONS_OFFSETS. Asserts that
// highlighting TEXT yields "<B>the fox</B> did not jump".
// NOTE(review): some lines of this method are missing from this listing
// (e.g. the TokenSources method call and any cleanup other than
// indexSearcher.close()) - confirm against the complete file.
public void testOverlapWithPositionsAndOffset() throws CorruptIndexException,
149
LockObtainFailedException, IOException, InvalidTokenOffsetsException {
150
final String TEXT = "the fox did not jump";
151
final Directory directory = newDirectory();
152
final IndexWriter indexWriter = new IndexWriter(directory,
153
newIndexWriterConfig(TEST_VERSION_CURRENT, new OverlapAnalyzer()));
155
// The field value is supplied as a ready-made token stream, not as text.
final Document document = new Document();
156
document.add(new Field(FIELD, new TokenStreamOverlap(),
157
TermVector.WITH_POSITIONS_OFFSETS));
158
indexWriter.addDocument(document);
162
final IndexReader indexReader = IndexReader.open(directory, true);
164
// Exactly one document was added above.
assertEquals(1, indexReader.numDocs());
165
final IndexSearcher indexSearcher = newSearcher(indexReader);
167
// Query on either the overlapping "{fox}" token or the plain "fox" token.
final DisjunctionMaxQuery query = new DisjunctionMaxQuery(1);
168
query.add(new SpanTermQuery(new Term(FIELD, "{fox}")));
169
query.add(new SpanTermQuery(new Term(FIELD, "fox")));
170
// final Query phraseQuery = new SpanNearQuery(new SpanQuery[] {
171
// new SpanTermQuery(new Term(FIELD, "{fox}")),
172
// new SpanTermQuery(new Term(FIELD, "fox")) }, 0, true);
174
TopDocs hits = indexSearcher.search(query, 1);
175
assertEquals(1, hits.totalHits);
176
final Highlighter highlighter = new Highlighter(
177
new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
178
new QueryScorer(query));
179
// Token stream for highlighting comes from TokenSources, which is passed the
// term vector of document 0 for FIELD.
final TokenStream tokenStream = TokenSources
181
(TermPositionVector) indexReader.getTermFreqVector(0, FIELD),
183
// The whole "the fox" range is expected to be wrapped in a single <B> tag.
assertEquals("<B>the fox</B> did not jump",
184
highlighter.getBestFragment(tokenStream, TEXT));
186
indexSearcher.close();
194
// Indexes one document whose FIELD is fed from a TokenStreamOverlap and stored
// with TermVector.WITH_OFFSETS, searches it with a SpanNearQuery over the
// terms "the" and "fox", and asserts that highlighting TEXT yields
// "<B>the fox</B> did not jump".
// NOTE(review): some lines of this method are missing from this listing
// (e.g. the TokenSources method call and any cleanup other than
// indexSearcher.close()) - confirm against the complete file.
public void testOverlapWithOffsetExactPhrase() throws CorruptIndexException,
195
LockObtainFailedException, IOException, InvalidTokenOffsetsException {
196
final String TEXT = "the fox did not jump";
197
final Directory directory = newDirectory();
198
final IndexWriter indexWriter = new IndexWriter(directory,
199
newIndexWriterConfig(TEST_VERSION_CURRENT, new OverlapAnalyzer()));
201
// The field value is supplied as a ready-made token stream, not as text.
final Document document = new Document();
202
document.add(new Field(FIELD, new TokenStreamOverlap(),
203
TermVector.WITH_OFFSETS));
204
indexWriter.addDocument(document);
208
final IndexReader indexReader = IndexReader.open(directory, true);
210
// Exactly one document was added above.
assertEquals(1, indexReader.numDocs());
211
final IndexSearcher indexSearcher = newSearcher(indexReader);
213
// final DisjunctionMaxQuery query = new DisjunctionMaxQuery(1);
214
// query.add(new SpanTermQuery(new Term(FIELD, "{fox}")));
215
// query.add(new SpanTermQuery(new Term(FIELD, "fox")));
216
// Span query over "the" then "fox"; the trailing arguments 0 and true are
// presumably slop and in-order - TODO confirm against SpanNearQuery.
final Query phraseQuery = new SpanNearQuery(new SpanQuery[] {
217
new SpanTermQuery(new Term(FIELD, "the")),
218
new SpanTermQuery(new Term(FIELD, "fox"))}, 0, true);
220
TopDocs hits = indexSearcher.search(phraseQuery, 1);
221
assertEquals(1, hits.totalHits);
222
final Highlighter highlighter = new Highlighter(
223
new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
224
new QueryScorer(phraseQuery));
225
// Token stream for highlighting comes from TokenSources, which is passed the
// term vector of document 0 for FIELD.
final TokenStream tokenStream = TokenSources
227
(TermPositionVector) indexReader.getTermFreqVector(0, FIELD),
229
// The whole "the fox" range is expected to be wrapped in a single <B> tag.
assertEquals("<B>the fox</B> did not jump",
230
highlighter.getBestFragment(tokenStream, TEXT));
232
indexSearcher.close();
240
// Same scenario as the offsets-only SpanNearQuery test, except that the field
// is stored with TermVector.WITH_POSITIONS_OFFSETS. Asserts that highlighting
// TEXT yields "<B>the fox</B> did not jump".
// NOTE(review): some lines of this method are missing from this listing
// (e.g. the TokenSources method call and any cleanup other than
// indexSearcher.close()) - confirm against the complete file.
public void testOverlapWithPositionsAndOffsetExactPhrase()
241
throws CorruptIndexException, LockObtainFailedException, IOException,
242
InvalidTokenOffsetsException {
243
final String TEXT = "the fox did not jump";
244
final Directory directory = newDirectory();
245
final IndexWriter indexWriter = new IndexWriter(directory,
246
newIndexWriterConfig(TEST_VERSION_CURRENT, new OverlapAnalyzer()));
248
// The field value is supplied as a ready-made token stream, not as text.
final Document document = new Document();
249
document.add(new Field(FIELD, new TokenStreamOverlap(),
250
TermVector.WITH_POSITIONS_OFFSETS));
251
indexWriter.addDocument(document);
255
final IndexReader indexReader = IndexReader.open(directory, true);
257
// Exactly one document was added above.
assertEquals(1, indexReader.numDocs());
258
final IndexSearcher indexSearcher = newSearcher(indexReader);
260
// final DisjunctionMaxQuery query = new DisjunctionMaxQuery(1);
261
// query.add(new SpanTermQuery(new Term(FIELD, "the")));
262
// query.add(new SpanTermQuery(new Term(FIELD, "fox")));
263
// Span query over "the" then "fox"; the trailing arguments 0 and true are
// presumably slop and in-order - TODO confirm against SpanNearQuery.
final Query phraseQuery = new SpanNearQuery(new SpanQuery[] {
264
new SpanTermQuery(new Term(FIELD, "the")),
265
new SpanTermQuery(new Term(FIELD, "fox"))}, 0, true);
267
TopDocs hits = indexSearcher.search(phraseQuery, 1);
268
assertEquals(1, hits.totalHits);
269
final Highlighter highlighter = new Highlighter(
270
new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
271
new QueryScorer(phraseQuery));
272
// Token stream for highlighting comes from TokenSources, which is passed the
// term vector of document 0 for FIELD.
final TokenStream tokenStream = TokenSources
274
(TermPositionVector) indexReader.getTermFreqVector(0, FIELD),
276
// The whole "the fox" range is expected to be wrapped in a single <B> tag.
assertEquals("<B>the fox</B> did not jump",
277
highlighter.getBestFragment(tokenStream, TEXT));
279
indexSearcher.close();