1
/*------------------------------------------------------------------------------
2
* Copyright (C) 2003-2010 Ben van Klinken and the CLucene Team
4
* Distributable under the terms of either the Apache License (Version 2.0) or
5
* the GNU Lesser General Public License, as specified in the COPYING file.
6
------------------------------------------------------------------------------*/
8
#include "CLucene/_SharedHeader.h"
9
#include "../store/MockRAMDirectory.h"
10
#include "CLucene/index/_FieldInfos.h"
11
#include "CLucene/index/_SegmentInfos.h"
12
#include "CLucene/index/_IndexFileNames.h"
13
#include "CLucene/index/_TermVector.h"
14
#include "CLucene/util/_Arrays.h"
23
//Must be lexicographically sorted, will do in setup, versus trying to maintain here
24
const TCHAR* testFields_values[] = {_T("f1"), _T("f2"), _T("f3"), _T("f4")};
25
const bool testFieldsStorePos_values[] = {true, false, true, false};
26
const bool testFieldsStoreOff_values[] = {true, false, false, true};
27
const TCHAR* testTerms_values[] = {_T("this"), _T("is"), _T("a"), _T("test")};
29
CL_NS(store)::MockRAMDirectory dir;
31
FieldInfos *fieldInfos = NULL;
32
const int TERM_FREQ = 3;
41
std::vector<const TCHAR*> testFields(testFields_values, testFields_values + sizeof(testFields_values) / sizeof(TCHAR*));
42
std::vector<const TCHAR*> testTerms(testTerms_values, testTerms_values + sizeof(testTerms_values) / sizeof(TCHAR*));
43
std::vector<bool> testFieldsStorePos(testFieldsStorePos_values, testFieldsStorePos_values + sizeof(testFieldsStorePos_values) / sizeof(bool));
44
std::vector<bool> testFieldsStoreOff(testFieldsStoreOff_values, testFieldsStoreOff_values + sizeof(testFieldsStoreOff_values) / sizeof(bool));
45
std::vector<const TestToken*> tokens;
46
std::vector< std::vector<int> > positions(4);
47
std::vector< std::vector<TermVectorOffsetInfo*> > offsets(4);
49
class MyTokenStream : public TokenStream {
51
std::vector<const TestToken*>::size_type tokenUpto;
57
virtual Token* next(Token *token) {
58
if (tokenUpto >= tokens.size())
62
token = _CLNEW Token();
64
const TestToken* testToken = tokens[tokenUpto++];
65
token->setText(testToken->text);
67
token->setPositionIncrement(testToken->pos - tokens[tokenUpto-2]->pos);
69
token->setPositionIncrement(testToken->pos+1);
70
token->setStartOffset(testToken->startOffset);
71
token->setEndOffset(testToken->endOffset);
75
virtual void close() {
79
/** Analyzer that ignores its input and always yields the canned MyTokenStream. */
class MyAnalyzer : public CL_NS(analysis)::Analyzer {
public:
    virtual TokenStream* tokenStream(const TCHAR* fieldName, Reader* reader) {
        // The field name and reader are irrelevant: every field gets the
        // same pre-built token sequence.
        return _CLNEW MyTokenStream();
    }
};
86
class MyIndexWriter : public IndexWriter {
88
MyIndexWriter(CL_NS(store)::Directory* d, CL_NS(analysis)::Analyzer* a, const bool create) :
89
IndexWriter(d, a, create) {
91
virtual SegmentInfo* newestSegment() {
92
return IndexWriter::newestSegment();
96
struct MyTCharCompare :
97
public std::binary_function<const TCHAR*, const TCHAR*, bool>
99
bool operator () (const TCHAR* v1, const TCHAR* v2) const {
100
return _tcscmp(v1, v2) < 0;
104
struct TestTokenCompare :
105
public std::binary_function<const TestToken*, const TestToken*, bool>
107
bool operator () (const TestToken* t1, const TestToken* t2) const {
108
return t1->pos < t2->pos;
115
// --- setup body -------------------------------------------------------------
// NOTE(review): the enclosing function's header and several structural lines
// (loop-closing braces, the MyAnalyzer instance feeding `writer`) are missing
// from this extraction — the embedded original line numbers jump. Code below
// is kept byte-identical; only comments were added.
// Terms must be lexicographically sorted for the reader's expectations.
std::sort(testTerms.begin(), testTerms.end(), MyTCharCompare());
117
// Build TERM_FREQ positions/offsets per term, then one TestToken per
// (term, occurrence) pair.
for (std::vector<const TCHAR*>::size_type i = 0; i < testTerms.size(); i++) {
118
positions[i] = std::vector<int>(TERM_FREQ);
119
offsets[i] = std::vector<TermVectorOffsetInfo*>(TERM_FREQ);
120
// first position must be 0
121
for (int j = 0; j < TERM_FREQ; j++) {
122
// positions are always sorted in increasing order
123
// j*10 base plus a random jitter < 10 keeps the sequence strictly increasing.
positions[i][j] = (int) (j * 10 + (rand() % 10));
124
// offsets are always sorted in increasing order
125
offsets[i][j] = _CLNEW TermVectorOffsetInfo(j * 10, j * 10 + _tcslen(testTerms[i]));
126
TestToken* token = _CLNEW TestToken();
127
tokens.push_back(token);
129
token->text = testTerms[i];
130
token->pos = positions[i][j];
131
token->startOffset = offsets[i][j]->getStartOffset();
132
token->endOffset = offsets[i][j]->getEndOffset();
136
// MyTokenStream emits tokens in this (position-sorted) order.
std::sort(tokens.begin(), tokens.end(), TestTokenCompare());
139
// Write the test index; no compound file so the .tvd/.tvx files are visible.
MyIndexWriter writer(&dir, &analyzer, true);
140
writer.setUseCompoundFile(false);
141
Document* doc = _CLNEW Document();
142
// One field per testFields entry, with the term-vector flavor dictated by
// the storePos/storeOff tables.
for(std::vector<const TCHAR*>::size_type i=0;i<testFields.size();i++) {
143
Field::TermVector tv;
144
if (testFieldsStorePos[i] && testFieldsStoreOff[i])
145
tv = Field::TERMVECTOR_WITH_POSITIONS_OFFSETS;
146
else if (testFieldsStorePos[i] && !testFieldsStoreOff[i])
147
tv = Field::TERMVECTOR_WITH_POSITIONS;
148
else if (!testFieldsStorePos[i] && testFieldsStoreOff[i])
149
tv = Field::TERMVECTOR_WITH_OFFSETS;
151
tv = Field::TERMVECTOR_YES;
152
doc->add(* _CLNEW Field(testFields[i], _T(""), Field::STORE_NO | Field::INDEX_TOKENIZED | tv));
155
//Create 5 documents for testing, they all have the same
158
writer.addDocument(doc);
161
// Remember the segment name so tests can open its term-vector files.
seg = writer.newestSegment()->name;
164
std::string tmp = seg;
166
tmp.append(IndexFileNames::FIELD_INFOS_EXTENSION);
167
fieldInfos = _CLNEW FieldInfos(&dir, tmp.c_str());
171
/**
 * Sanity check: verify that setup actually produced the per-segment
 * term-vector files (documents and index extensions) in the directory.
 * @param tc CuTest context used for assertions.
 */
void test(CuTest* tc) {
    //Check to see the files were created properly in setup
    std::string tmp = seg;
    tmp.append(IndexFileNames::VECTORS_DOCUMENTS_EXTENSION);
    CuAssertTrue(tc, dir.fileExists(tmp.c_str()), _T("Missing file!"));

    // BUG FIX: re-seed tmp with the bare segment name before appending the
    // second extension; otherwise this checked "<seg><doc-ext><index-ext>".
    tmp = seg;
    tmp.append(IndexFileNames::VECTORS_INDEX_EXTENSION);
    CuAssertTrue(tc, dir.fileExists(tmp.c_str()), _T("Missing file!"));
}
184
// Reads the frequency vector of field f1 for each of the 5 documents and
// checks the term list matches the sorted test terms.
// NOTE(review): the closing braces of both loops were lost in this
// extraction (original line numbers jump past 194); code kept byte-identical.
void testTermVectorsReader(CuTest* tc) {
185
TermVectorsReader reader(&dir, seg.c_str(), fieldInfos);
186
// 5 documents were added in setup; each must expose a term-frequency vector.
for (int j = 0; j < 5; j++) {
187
TermFreqVector* vector = reader.get(j, testFields[0]);
188
CuAssertTrue(tc, vector != NULL, _T("Expected term frequency vector!"));
189
const ArrayBase<const TCHAR*>* terms = vector->getTerms();
190
CuAssertTrue(tc, terms != NULL, _T("Array of terms expected!"));
191
CuAssertTrue(tc, terms->length == testTerms.size());
192
// Terms come back sorted, matching testTerms (sorted in setup).
for (int i = 0; i < terms->length; i++) {
193
const TCHAR* term = (*terms)[i];
194
CuAssertStrEquals(tc, _T(""), testTerms[i], (TCHAR*)term, false);
199
// Verifies positions and offsets round-trip through the reader for a field
// stored with positions+offsets (f1), and that a field stored without them
// (f2) yields a plain TermFreqVector rather than a TermPositionVector.
// NOTE(review): several brace-closing lines are missing from this extraction;
// code kept byte-identical, comments only.
void testPositionReader(CuTest* tc) {
200
TermVectorsReader reader(&dir, seg.c_str(), fieldInfos);
201
TermPositionVector* vector;
202
const ArrayBase<const TCHAR*>* terms;
203
// f1 was indexed TERMVECTOR_WITH_POSITIONS_OFFSETS, so the downcast must succeed.
vector = dynamic_cast<TermPositionVector*>(reader.get(0, testFields[0]));
204
CuAssertTrue(tc, vector != NULL, _T("Term position vector expected!"));
205
terms = vector->getTerms();
206
CuAssertTrue(tc, terms != NULL, _T("Terms expected!"));
207
CuAssertTrue(tc, terms->length == testTerms.size(), _T("Unexpected number of terms!"));
208
for (int i = 0; i < terms->length; i++) {
209
const TCHAR* term = (*terms)[i];
210
CuAssertStrEquals(tc, _T(""), testTerms[i], (TCHAR*)term, false);
211
// Positions must match exactly what setup generated for this term.
const ArrayBase<int32_t>* termPositions = vector->getTermPositions(i);
212
CuAssertTrue(tc, termPositions != NULL, _T("Term positions expected!"));
213
CuAssertTrue(tc, termPositions->length == positions[i].size(), _T("Unexpected number of term positions!"));
214
for (int j = 0; j < termPositions->length; j++) {
215
int position = (*termPositions)[j];
216
CuAssertTrue(tc, position == positions[i][j], _T("Postion not equal!"));
218
// Offsets likewise must equal the TermVectorOffsetInfo objects from setup.
const ArrayBase<TermVectorOffsetInfo*>* termOffset = vector->getOffsets(i);
219
CuAssertTrue(tc, termOffset != NULL, _T("Term vector offset info expected!"));
220
CuAssertTrue(tc, termOffset->length == offsets[i].size(), _T("Unexpected length of term positions!"));
221
for (int j = 0; j < termOffset->length; j++) {
222
TermVectorOffsetInfo* termVectorOffsetInfo = (*termOffset)[j];
223
CuAssertTrue(tc, termVectorOffsetInfo->equals(offsets[i][j]), _T("Term vector offset info not equal!"));
227
// f2 stores neither positions nor offsets: only a frequency vector exists.
TermFreqVector* freqVector = reader.get(0, testFields[1]); //no pos, no offset
228
CuAssertTrue(tc, freqVector != NULL, _T("Term frequency vector expected!"));
229
CuAssertTrue(tc, dynamic_cast<TermPositionVector*>(freqVector) == NULL, _T("Unepexcted term position vector!"));
230
terms = freqVector->getTerms();
231
CuAssertTrue(tc, terms != NULL, _T("Terms expected!"));
232
CuAssertTrue(tc, terms->length == testTerms.size(), _T("Unexpected length of term positions!"));
233
for (int i = 0; i < terms->length; i++) {
234
const TCHAR* term = (*terms)[i];
235
CuAssertStrEquals(tc, _T(""), testTerms[i], (TCHAR*)term, false);
239
class DocNumAwareMapper : public TermVectorMapper {
246
DocNumAwareMapper() : documentNumber(-1) {
249
virtual ~DocNumAwareMapper() {
252
virtual void setExpectations(const TCHAR* _field, const int32_t numTerms, const bool storeOffsets, const bool storePositions) {
253
if (documentNumber == -1) {
254
_CLTHROWA(CL_ERR_Runtime, "Documentnumber should be set at this point!");
258
virtual void map(const TCHAR* term, const int32_t termLen, const int32_t frequency, CL_NS(util)::ArrayBase<TermVectorOffsetInfo*>* offsets, CL_NS(util)::ArrayBase<int32_t>* positions) {
259
if (documentNumber == -1) {
260
_CLTHROWA(CL_ERR_Runtime, "Documentnumber should be set at this point!");
264
int getDocumentNumber() const {
265
return documentNumber;
268
virtual void setDocumentNumber(const int32_t documentNumber) {
269
this->documentNumber = documentNumber;
273
// Same round-trip check as testPositionReader, but focused on document 0 of
// field f1 only (positions and offsets both verified).
// NOTE(review): loop-closing braces are missing from this extraction; code
// kept byte-identical, comments only.
void testOffsetReader(CuTest* tc) {
274
TermVectorsReader reader(&dir, seg.c_str(), fieldInfos);
275
TermPositionVector* vector = dynamic_cast<TermPositionVector*>(reader.get(0, testFields[0]));
276
CuAssertTrue(tc, vector != NULL, _T("Term position vector expected!"));
277
const CL_NS(util)::ArrayBase<const TCHAR*>* terms = vector->getTerms();
278
CuAssertTrue(tc, terms != NULL, _T("Terms expected"));
279
CuAssertTrue(tc, terms->length == testTerms.size(), _T("Unexpected number of terms!"));
280
for (int i = 0; i < terms->length; i++) {
281
const TCHAR* term = (*terms)[i];
282
CuAssertStrEquals(tc, _T(""), testTerms[i], (TCHAR*)term, false);
283
// Positions must equal the ones generated in setup for this term.
const ArrayBase<int32_t>* termPositions = vector->getTermPositions(i);
284
CuAssertTrue(tc, termPositions != NULL, _T("Term positions expected!"));
285
CuAssertTrue(tc, termPositions->length == positions[i].size());
286
for (int j = 0; j < termPositions->length; j++) {
287
int position = (*termPositions)[j];
288
CuAssertTrue(tc, position == positions[i][j], _T("Unexpected position!"));
290
// Offsets must equal the TermVectorOffsetInfo objects from setup.
const ArrayBase<TermVectorOffsetInfo*>* termOffset = vector->getOffsets(i);
291
CuAssertTrue(tc, termOffset != NULL, _T("Term vector offset info expected!"));
292
CuAssertTrue(tc, termOffset->length == offsets[i].size(), _T("Unexpected number of term positions!"));
293
for (int j = 0; j < termOffset->length; j++) {
294
TermVectorOffsetInfo* termVectorOffsetInfo = (*termOffset)[j];
295
CuAssertTrue(tc, termVectorOffsetInfo->equals(offsets[i][j]), _T("Term vector offset info not equal!"));
300
//void testMapper(CuTest* tc) {
301
// TermVectorsReader reader(&dir, seg, fieldInfos);
302
// SortedTermVectorMapper mapper = new SortedTermVectorMapper(new TermVectorEntryFreqSortedComparator());
303
// reader.get(0, mapper);
304
// SortedSet set = mapper.getTermVectorEntrySet();
305
// CuAssertTrue(tc, set != NULL, "set is null and it shouldn't be");
306
// //three fields, 4 terms, all terms are the same
307
// CuAssertTrue(set.size() == 4, _T("set Size: " + set.size() + " is not: 4");
308
// //Check offsets and positions
309
// for (Iterator iterator = set.iterator(); iterator.hasNext();) {
310
// TermVectorEntry* tve = (TermVectorEntry) iterator.next();
311
// CuAssertTrue(tc, tve != NULL, _T("tve is null and it shouldn't be"));
312
// CuAssertTrue(tc, tve->getOffsets() != NULL, _T("tve.getOffsets() is null and it shouldn't be"));
313
// CuAssertTrue(tc, tve->getPositions() != NULL, _T("tve.getPositions() is null and it shouldn't be"));
316
// mapper = new SortedTermVectorMapper(new TermVectorEntryFreqSortedComparator());
317
// reader.get(1, mapper);
318
// set = mapper.getTermVectorEntrySet();
319
// assertTrue("set is null and it shouldn't be", set != null);
320
// //three fields, 4 terms, all terms are the same
321
// assertTrue("set Size: " + set.size() + " is not: " + 4, set.size() == 4);
322
// //Should have offsets and positions b/c we are munging all the fields together
323
// for (Iterator iterator = set.iterator(); iterator.hasNext();) {
324
// TermVectorEntry tve = (TermVectorEntry) iterator.next();
325
// assertTrue("tve is null and it shouldn't be", tve != null);
326
// assertTrue("tve.getOffsets() is null and it shouldn't be", tve.getOffsets() != null);
327
// assertTrue("tve.getPositions() is null and it shouldn't be", tve.getPositions() != null);
332
// FieldSortedTermVectorMapper fsMapper = new FieldSortedTermVectorMapper(new TermVectorEntryFreqSortedComparator());
333
// reader.get(0, fsMapper);
334
// Map map = fsMapper.getFieldToTerms();
335
// assertTrue("map Size: " + map.size() + " is not: " + testFields.length, map.size() == testFields.length);
336
// for (Iterator iterator = map.entrySet().iterator(); iterator.hasNext();) {
337
// Map.Entry entry = (Map.Entry) iterator.next();
338
// SortedSet sortedSet = (SortedSet) entry.getValue();
339
// assertTrue("sortedSet Size: " + sortedSet.size() + " is not: " + 4, sortedSet.size() == 4);
340
// for (Iterator inner = sortedSet.iterator(); inner.hasNext();) {
341
// TermVectorEntry tve = (TermVectorEntry) inner.next();
342
// assertTrue("tve is null and it shouldn't be", tve != null);
343
// //Check offsets and positions.
344
// assertTrue("tve is null and it shouldn't be", tve != null);
345
// String field = tve.getField();
346
// if (field.equals(testFields[0])) {
347
// //should have offsets
349
// assertTrue("tve.getOffsets() is null and it shouldn't be", tve.getOffsets() != null);
350
// assertTrue("tve.getPositions() is null and it shouldn't be", tve.getPositions() != null);
352
// else if (field.equals(testFields[1])) {
353
// //should not have offsets
355
// assertTrue("tve.getOffsets() is not null and it shouldn't be", tve.getOffsets() == null);
356
// assertTrue("tve.getPositions() is not null and it shouldn't be", tve.getPositions() == null);
360
// //Try mapper that ignores offs and positions
361
// fsMapper = new FieldSortedTermVectorMapper(true, true, new TermVectorEntryFreqSortedComparator());
362
// reader.get(0, fsMapper);
363
// map = fsMapper.getFieldToTerms();
364
// assertTrue("map Size: " + map.size() + " is not: " + testFields.length, map.size() == testFields.length);
365
// for (Iterator iterator = map.entrySet().iterator(); iterator.hasNext();) {
366
// Map.Entry entry = (Map.Entry) iterator.next();
367
// SortedSet sortedSet = (SortedSet) entry.getValue();
368
// assertTrue("sortedSet Size: " + sortedSet.size() + " is not: " + 4, sortedSet.size() == 4);
369
// for (Iterator inner = sortedSet.iterator(); inner.hasNext();) {
370
// TermVectorEntry tve = (TermVectorEntry) inner.next();
371
// assertTrue("tve is null and it shouldn't be", tve != null);
372
// //Check offsets and positions.
373
// assertTrue("tve is null and it shouldn't be", tve != null);
374
// String field = tve.getField();
375
// if (field.equals(testFields[0])) {
376
// //should have offsets
378
// assertTrue("tve.getOffsets() is null and it shouldn't be", tve.getOffsets() == null);
379
// assertTrue("tve.getPositions() is null and it shouldn't be", tve.getPositions() == null);
381
// else if (field.equals(testFields[1])) {
382
// //should not have offsets
384
// assertTrue("tve.getOffsets() is not null and it shouldn't be", tve.getOffsets() == null);
385
// assertTrue("tve.getPositions() is not null and it shouldn't be", tve.getPositions() == null);
390
// // test setDocumentNumber()
391
// IndexReader ir = IndexReader.open(&dir);
392
// DocNumAwareMapper docNumAwareMapper = new DocNumAwareMapper();
393
// assertEquals(-1, docNumAwareMapper.getDocumentNumber());
395
// ir.getTermFreqVector(0, docNumAwareMapper);
396
// assertEquals(0, docNumAwareMapper.getDocumentNumber());
397
// docNumAwareMapper.setDocumentNumber(-1);
399
// ir.getTermFreqVector(1, docNumAwareMapper);
400
// assertEquals(1, docNumAwareMapper.getDocumentNumber());
401
// docNumAwareMapper.setDocumentNumber(-1);
403
// ir.getTermFreqVector(0, "f1", docNumAwareMapper);
404
// assertEquals(0, docNumAwareMapper.getDocumentNumber());
405
// docNumAwareMapper.setDocumentNumber(-1);
407
// ir.getTermFreqVector(1, "f2", docNumAwareMapper);
408
// assertEquals(1, docNumAwareMapper.getDocumentNumber());
409
// docNumAwareMapper.setDocumentNumber(-1);
411
// ir.getTermFreqVector(0, "f1", docNumAwareMapper);
412
// assertEquals(0, docNumAwareMapper.getDocumentNumber());
419
/** Make sure exceptions and bad params are handled appropriately. */
421
// Exercises the reader's error paths: an out-of-range document number must
// raise an IO error, and an unknown field must yield a NULL vector.
// NOTE(review): the `try {` openers and scope braces for these three
// sections were lost in this extraction (original line numbers jump); code
// kept byte-identical, comments only.
void testBadParams(CuTest* tc) {
423
TermVectorsReader reader(&dir, seg.c_str(), fieldInfos);
424
//Bad document number, good field number
425
// Only 5 documents exist; doc 50 must throw CL_ERR_IO.
reader.get(50, testFields[0]);
426
CuFail(tc, _T("Expected an IO exception!"));
427
} catch (CLuceneError& e) {
428
if (e.number() != CL_ERR_IO) {
429
CuFail(tc, e.twhat());
433
TermVectorsReader reader(&dir, seg.c_str(), fieldInfos);
434
//Bad document number, no field
437
} catch (CLuceneError& e) {
438
if (e.number() != CL_ERR_IO) {
439
CuFail(tc, e.twhat());
443
TermVectorsReader reader(&dir, seg.c_str(), fieldInfos);
444
//good document number, bad field number
445
// An unknown field is not an error: the reader returns NULL.
TermFreqVector* vector = reader.get(0, _T("f50"));
446
CuAssertTrue(tc, vector == NULL, _T(""));
447
} catch (CLuceneError& e) {
448
CuFail(tc, e.twhat());
453
// Suite factory: registers every test above with CuTest.
// NOTE(review): the tail of this function (presumably `return suite; }`)
// lies beyond the last visible line of this chunk.
CuSuite *testTermVectorsReader(void) {
454
CuSuite *suite = CuSuiteNew(_T("CLucene TermVectorsReader Test"));
458
SUITE_ADD_TEST(suite, test);
459
SUITE_ADD_TEST(suite, testTermVectorsReader);
460
SUITE_ADD_TEST(suite, testPositionReader);
461
SUITE_ADD_TEST(suite, testOffsetReader);
462
//SUITE_ADD_TEST(suite, testMapper);
463
SUITE_ADD_TEST(suite, testBadParams);