package org.apache.lucene.index;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import java.io.IOException;
import java.io.Reader;
import java.util.Arrays;
import java.util.Iterator;
import java.util.Map;
import java.util.SortedSet;

import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
public class TestTermVectorsReader extends LuceneTestCase {
37
//Must be lexicographically sorted, will do in setup, versus trying to maintain here
38
private String[] testFields = {"f1", "f2", "f3", "f4"};
39
private boolean[] testFieldsStorePos = {true, false, true, false};
40
private boolean[] testFieldsStoreOff = {true, false, false, true};
41
private String[] testTerms = {"this", "is", "a", "test"};
42
private int[][] positions = new int[testTerms.length][];
43
private TermVectorOffsetInfo[][] offsets = new TermVectorOffsetInfo[testTerms.length][];
44
private Directory dir;
46
private FieldInfos fieldInfos = new FieldInfos();
47
private static int TERM_FREQ = 3;
49
private class TestToken implements Comparable<TestToken> {
54
public int compareTo(TestToken other) {
55
return pos - other.pos;
59
TestToken[] tokens = new TestToken[testTerms.length * TERM_FREQ];
62
public void setUp() throws Exception {
65
for (int i = 0; i < testFields.length; i++) {
66
fieldInfos.add(testFields[i], true, true, testFieldsStorePos[i], testFieldsStoreOff[i]);
70
Arrays.sort(testTerms);
72
for (int i = 0; i < testTerms.length; i++) {
73
positions[i] = new int[TERM_FREQ];
74
offsets[i] = new TermVectorOffsetInfo[TERM_FREQ];
75
// first position must be 0
76
for (int j = 0; j < TERM_FREQ; j++) {
77
// positions are always sorted in increasing order
78
positions[i][j] = (int) (j * 10 + Math.random() * 10);
79
// offsets are always sorted in increasing order
80
offsets[i][j] = new TermVectorOffsetInfo(j * 10, j * 10 + testTerms[i].length());
81
TestToken token = tokens[tokenUpto++] = new TestToken();
82
token.text = testTerms[i];
83
token.pos = positions[i][j];
84
token.startOffset = offsets[i][j].getStartOffset();
85
token.endOffset = offsets[i][j].getEndOffset();
91
IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig( TEST_VERSION_CURRENT, new MyAnalyzer()).setMaxBufferedDocs(-1).setMergePolicy(newLogMergePolicy(false, 10)));
93
Document doc = new Document();
94
for(int i=0;i<testFields.length;i++) {
95
final Field.TermVector tv;
96
if (testFieldsStorePos[i] && testFieldsStoreOff[i])
97
tv = Field.TermVector.WITH_POSITIONS_OFFSETS;
98
else if (testFieldsStorePos[i] && !testFieldsStoreOff[i])
99
tv = Field.TermVector.WITH_POSITIONS;
100
else if (!testFieldsStorePos[i] && testFieldsStoreOff[i])
101
tv = Field.TermVector.WITH_OFFSETS;
103
tv = Field.TermVector.YES;
104
doc.add(new Field(testFields[i], "", Field.Store.NO, Field.Index.ANALYZED, tv));
107
//Create 5 documents for testing, they all have the same
110
writer.addDocument(doc);
112
seg = writer.newestSegment().name;
115
fieldInfos = new FieldInfos(dir, IndexFileNames.segmentFileName(seg, IndexFileNames.FIELD_INFOS_EXTENSION));
119
public void tearDown() throws Exception {
124
private class MyTokenStream extends TokenStream {
125
private int tokenUpto;
127
private final CharTermAttribute termAtt;
128
private final PositionIncrementAttribute posIncrAtt;
129
private final OffsetAttribute offsetAtt;
131
public MyTokenStream() {
132
termAtt = addAttribute(CharTermAttribute.class);
133
posIncrAtt = addAttribute(PositionIncrementAttribute.class);
134
offsetAtt = addAttribute(OffsetAttribute.class);
138
public boolean incrementToken() {
139
if (tokenUpto >= tokens.length)
142
final TestToken testToken = tokens[tokenUpto++];
144
termAtt.append(testToken.text);
145
offsetAtt.setOffset(testToken.startOffset, testToken.endOffset);
147
posIncrAtt.setPositionIncrement(testToken.pos - tokens[tokenUpto-2].pos);
149
posIncrAtt.setPositionIncrement(testToken.pos+1);
156
public void reset() throws IOException {
162
private class MyAnalyzer extends Analyzer {
164
public TokenStream tokenStream(String fieldName, Reader reader) {
165
return new MyTokenStream();
169
public void test() throws IOException {
170
//Check to see the files were created properly in setup
171
assertTrue(dir.fileExists(IndexFileNames.segmentFileName(seg, IndexFileNames.VECTORS_DOCUMENTS_EXTENSION)));
172
assertTrue(dir.fileExists(IndexFileNames.segmentFileName(seg, IndexFileNames.VECTORS_INDEX_EXTENSION)));
175
public void testReader() throws IOException {
176
TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos);
177
for (int j = 0; j < 5; j++) {
178
TermFreqVector vector = reader.get(j, testFields[0]);
179
assertTrue(vector != null);
180
String[] terms = vector.getTerms();
181
assertTrue(terms != null);
182
assertTrue(terms.length == testTerms.length);
183
for (int i = 0; i < terms.length; i++) {
184
String term = terms[i];
185
//System.out.println("Term: " + term);
186
assertTrue(term.equals(testTerms[i]));
192
public void testPositionReader() throws IOException {
193
TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos);
194
TermPositionVector vector;
196
vector = (TermPositionVector) reader.get(0, testFields[0]);
197
assertTrue(vector != null);
198
terms = vector.getTerms();
199
assertTrue(terms != null);
200
assertTrue(terms.length == testTerms.length);
201
for (int i = 0; i < terms.length; i++) {
202
String term = terms[i];
203
//System.out.println("Term: " + term);
204
assertTrue(term.equals(testTerms[i]));
205
int[] positions = vector.getTermPositions(i);
206
assertTrue(positions != null);
207
assertTrue(positions.length == this.positions[i].length);
208
for (int j = 0; j < positions.length; j++) {
209
int position = positions[j];
210
assertTrue(position == this.positions[i][j]);
212
TermVectorOffsetInfo[] offset = vector.getOffsets(i);
213
assertTrue(offset != null);
214
assertTrue(offset.length == this.offsets[i].length);
215
for (int j = 0; j < offset.length; j++) {
216
TermVectorOffsetInfo termVectorOffsetInfo = offset[j];
217
assertTrue(termVectorOffsetInfo.equals(offsets[i][j]));
221
TermFreqVector freqVector = reader.get(0, testFields[1]); //no pos, no offset
222
assertTrue(freqVector != null);
223
assertTrue(freqVector instanceof TermPositionVector == false);
224
terms = freqVector.getTerms();
225
assertTrue(terms != null);
226
assertTrue(terms.length == testTerms.length);
227
for (int i = 0; i < terms.length; i++) {
228
String term = terms[i];
229
//System.out.println("Term: " + term);
230
assertTrue(term.equals(testTerms[i]));
235
public void testOffsetReader() throws IOException {
236
TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos);
237
TermPositionVector vector = (TermPositionVector) reader.get(0, testFields[0]);
238
assertTrue(vector != null);
239
String[] terms = vector.getTerms();
240
assertTrue(terms != null);
241
assertTrue(terms.length == testTerms.length);
242
for (int i = 0; i < terms.length; i++) {
243
String term = terms[i];
244
//System.out.println("Term: " + term);
245
assertTrue(term.equals(testTerms[i]));
246
int[] positions = vector.getTermPositions(i);
247
assertTrue(positions != null);
248
assertTrue(positions.length == this.positions[i].length);
249
for (int j = 0; j < positions.length; j++) {
250
int position = positions[j];
251
assertTrue(position == this.positions[i][j]);
253
TermVectorOffsetInfo[] offset = vector.getOffsets(i);
254
assertTrue(offset != null);
255
assertTrue(offset.length == this.offsets[i].length);
256
for (int j = 0; j < offset.length; j++) {
257
TermVectorOffsetInfo termVectorOffsetInfo = offset[j];
258
assertTrue(termVectorOffsetInfo.equals(offsets[i][j]));
264
public void testMapper() throws IOException {
265
TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos);
266
SortedTermVectorMapper mapper = new SortedTermVectorMapper(new TermVectorEntryFreqSortedComparator());
267
reader.get(0, mapper);
268
SortedSet<TermVectorEntry> set = mapper.getTermVectorEntrySet();
269
assertTrue("set is null and it shouldn't be", set != null);
270
//three fields, 4 terms, all terms are the same
271
assertTrue("set Size: " + set.size() + " is not: " + 4, set.size() == 4);
272
//Check offsets and positions
273
for (Iterator<TermVectorEntry> iterator = set.iterator(); iterator.hasNext();) {
274
TermVectorEntry tve = iterator.next();
275
assertTrue("tve is null and it shouldn't be", tve != null);
276
assertTrue("tve.getOffsets() is null and it shouldn't be", tve.getOffsets() != null);
277
assertTrue("tve.getPositions() is null and it shouldn't be", tve.getPositions() != null);
281
mapper = new SortedTermVectorMapper(new TermVectorEntryFreqSortedComparator());
282
reader.get(1, mapper);
283
set = mapper.getTermVectorEntrySet();
284
assertTrue("set is null and it shouldn't be", set != null);
285
//three fields, 4 terms, all terms are the same
286
assertTrue("set Size: " + set.size() + " is not: " + 4, set.size() == 4);
287
//Should have offsets and positions b/c we are munging all the fields together
288
for (Iterator<TermVectorEntry> iterator = set.iterator(); iterator.hasNext();) {
289
TermVectorEntry tve = iterator.next();
290
assertTrue("tve is null and it shouldn't be", tve != null);
291
assertTrue("tve.getOffsets() is null and it shouldn't be", tve.getOffsets() != null);
292
assertTrue("tve.getPositions() is null and it shouldn't be", tve.getPositions() != null);
297
FieldSortedTermVectorMapper fsMapper = new FieldSortedTermVectorMapper(new TermVectorEntryFreqSortedComparator());
298
reader.get(0, fsMapper);
299
Map<String,SortedSet<TermVectorEntry>> map = fsMapper.getFieldToTerms();
300
assertTrue("map Size: " + map.size() + " is not: " + testFields.length, map.size() == testFields.length);
301
for (Map.Entry<String,SortedSet<TermVectorEntry>> entry : map.entrySet()) {
302
SortedSet<TermVectorEntry> sortedSet = entry.getValue();
303
assertTrue("sortedSet Size: " + sortedSet.size() + " is not: " + 4, sortedSet.size() == 4);
304
for (final TermVectorEntry tve : sortedSet) {
305
assertTrue("tve is null and it shouldn't be", tve != null);
306
//Check offsets and positions.
307
assertTrue("tve is null and it shouldn't be", tve != null);
308
String field = tve.getField();
309
if (field.equals(testFields[0])) {
310
//should have offsets
312
assertTrue("tve.getOffsets() is null and it shouldn't be", tve.getOffsets() != null);
313
assertTrue("tve.getPositions() is null and it shouldn't be", tve.getPositions() != null);
315
else if (field.equals(testFields[1])) {
316
//should not have offsets
318
assertTrue("tve.getOffsets() is not null and it shouldn't be", tve.getOffsets() == null);
319
assertTrue("tve.getPositions() is not null and it shouldn't be", tve.getPositions() == null);
323
//Try mapper that ignores offs and positions
324
fsMapper = new FieldSortedTermVectorMapper(true, true, new TermVectorEntryFreqSortedComparator());
325
reader.get(0, fsMapper);
326
map = fsMapper.getFieldToTerms();
327
assertTrue("map Size: " + map.size() + " is not: " + testFields.length, map.size() == testFields.length);
328
for (final Map.Entry<String,SortedSet<TermVectorEntry>> entry : map.entrySet()) {
329
SortedSet<TermVectorEntry> sortedSet = entry.getValue();
330
assertTrue("sortedSet Size: " + sortedSet.size() + " is not: " + 4, sortedSet.size() == 4);
331
for (final TermVectorEntry tve : sortedSet) {
332
assertTrue("tve is null and it shouldn't be", tve != null);
333
//Check offsets and positions.
334
assertTrue("tve is null and it shouldn't be", tve != null);
335
String field = tve.getField();
336
if (field.equals(testFields[0])) {
337
//should have offsets
339
assertTrue("tve.getOffsets() is null and it shouldn't be", tve.getOffsets() == null);
340
assertTrue("tve.getPositions() is null and it shouldn't be", tve.getPositions() == null);
342
else if (field.equals(testFields[1])) {
343
//should not have offsets
345
assertTrue("tve.getOffsets() is not null and it shouldn't be", tve.getOffsets() == null);
346
assertTrue("tve.getPositions() is not null and it shouldn't be", tve.getPositions() == null);
351
// test setDocumentNumber()
352
IndexReader ir = IndexReader.open(dir, true);
353
DocNumAwareMapper docNumAwareMapper = new DocNumAwareMapper();
354
assertEquals(-1, docNumAwareMapper.getDocumentNumber());
356
ir.getTermFreqVector(0, docNumAwareMapper);
357
assertEquals(0, docNumAwareMapper.getDocumentNumber());
358
docNumAwareMapper.setDocumentNumber(-1);
360
ir.getTermFreqVector(1, docNumAwareMapper);
361
assertEquals(1, docNumAwareMapper.getDocumentNumber());
362
docNumAwareMapper.setDocumentNumber(-1);
364
ir.getTermFreqVector(0, "f1", docNumAwareMapper);
365
assertEquals(0, docNumAwareMapper.getDocumentNumber());
366
docNumAwareMapper.setDocumentNumber(-1);
368
ir.getTermFreqVector(1, "f2", docNumAwareMapper);
369
assertEquals(1, docNumAwareMapper.getDocumentNumber());
370
docNumAwareMapper.setDocumentNumber(-1);
372
ir.getTermFreqVector(0, "f1", docNumAwareMapper);
373
assertEquals(0, docNumAwareMapper.getDocumentNumber());
381
* Make sure exceptions and bad params are handled appropriately
383
public void testBadParams() throws IOException {
384
TermVectorsReader reader = null;
386
reader = new TermVectorsReader(dir, seg, fieldInfos);
387
//Bad document number, good field number
388
reader.get(50, testFields[0]);
390
} catch (IOException e) {
391
// expected exception
396
reader = new TermVectorsReader(dir, seg, fieldInfos);
397
//Bad document number, no field
400
} catch (IOException e) {
401
// expected exception
406
reader = new TermVectorsReader(dir, seg, fieldInfos);
407
//good document number, bad field number
408
TermFreqVector vector = reader.get(0, "f50");
409
assertTrue(vector == null);
411
} catch (IOException e) {
419
public static class DocNumAwareMapper extends TermVectorMapper {
421
public DocNumAwareMapper() {
424
private int documentNumber = -1;
427
public void setExpectations(String field, int numTerms, boolean storeOffsets, boolean storePositions) {
428
if (documentNumber == -1) {
429
throw new RuntimeException("Documentnumber should be set at this point!");
434
public void map(String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions) {
435
if (documentNumber == -1) {
436
throw new RuntimeException("Documentnumber should be set at this point!");
440
public int getDocumentNumber() {
441
return documentNumber;
445
public void setDocumentNumber(int documentNumber) {
446
this.documentNumber = documentNumber;