1
/*------------------------------------------------------------------------------
2
* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
4
* Distributable under the terms of either the Apache License (Version 2.0) or
5
* the GNU Lesser General Public License, as specified in the COPYING file.
6
------------------------------------------------------------------------------*/
9
// Searcher shared by the term-vector tests in this file. Starts as NULL; it is
// assigned in testTVSetup and deleted in testTVCleanup.
IndexSearcher* tv_searcher = NULL;
10
// Directory backing the shared searcher. Starts as NULL; it is assigned in
// testTVSetup and closed/deleted in testTVCleanup.
RAMDirectory* tv_directory = NULL;
12
// Queries the shared searcher for the term "fifty" in "field" and checks that
// 100 documents match and that each matching document yields exactly one
// term frequency vector.
// @param tc  CuTest context passed to the CuAssert calls below.
void testTermPositionVectors(CuTest *tc) {
13
// The shared searcher must already have been created (see testTVSetup).
CLUCENE_ASSERT(tv_searcher!=NULL);
15
Term* term = _CLNEW Term(_T("field"), _T("fifty"));
16
TermQuery query(term);
19
Hits* hits = tv_searcher->search(&query);
20
// Exactly 100 hits are expected for this term.
CuAssert (tc,_T("hits.length != 100"), 100 == hits->length());
22
for (size_t i = 0; i < hits->length(); i++)
24
// Retrieve the array of term frequency vectors for the i-th hit's document.
ArrayBase<TermFreqVector*>* vector = tv_searcher->getReader()->getTermFreqVectors(hits->id(i));
25
CLUCENE_ASSERT(vector != NULL);
26
// Exactly one vector is expected per document.
CLUCENE_ASSERT(vector->length== 1);
27
// Presumably frees the TermFreqVector objects held by the array -- TODO confirm
// against ArrayBase::deleteValues.
vector->deleteValues();
32
} catch (CLuceneError& e) {
33
// An error numbered CL_ERR_IO is reported as an explicit test failure.
if ( e.number() == CL_ERR_IO )
34
CuAssert(tc, _T("IO Error"),false);
39
// Queries the shared searcher for the term "seventy" in "field", checks that
// 100 documents match and that each matching document yields exactly one
// term frequency vector, then runs explain() on one hit.
// @param tc  CuTest context passed to the CuAssert* calls below.
void testTermVectors(CuTest *tc) {
40
// The shared searcher must already have been created (see testTVSetup).
CLUCENE_ASSERT(tv_searcher!=NULL);
42
Term* term = _CLNEW Term(_T("field"), _T("seventy"));
43
TermQuery query(term);
47
Hits* hits = tv_searcher->search(&query);
48
// Exactly 100 hits are expected for this term.
CuAssertIntEquals(tc,_T("hits!=100"), 100, hits->length());
50
for (size_t i = 0; i < hits->length(); i++)
52
// Retrieve the array of term frequency vectors for the i-th hit's document.
ArrayBase<TermFreqVector*>* vector = tv_searcher->getReader()->getTermFreqVectors(hits->id(i));
53
CLUCENE_ASSERT(vector != NULL);
54
// Exactly one vector is expected per document.
CLUCENE_ASSERT(vector->length == 1);
55
// Presumably frees the TermFreqVector objects held by the array -- TODO confirm
// against ArrayBase::deleteValues.
vector->deleteValues();
59
//test mem leaks with vectors
60
CL_NS(search)::Explanation expl;
61
// Explain the score of the hit at index 50; the result is written into expl.
tv_searcher->explain(&query, hits->id(50), &expl);
62
// The string returned by toString() is released right away with
// _CLDELETE_CARRAY; its content is not inspected.
TCHAR* tmp = expl.toString();
63
_CLDELETE_CARRAY(tmp);
67
} catch (CLuceneError& e) {
68
// An error numbered CL_ERR_IO is reported as an explicit test failure.
if ( e.number() == CL_ERR_IO )
69
CuAssert(tc,_T("IO Exception"),false);
75
// Builds the fixture shared by the term-vector tests: creates tv_directory,
// writes documents into it through an IndexWriter inside a 1000-iteration
// loop, and finally opens tv_searcher on the directory.
// The CuTest parameter is unused (its name is commented out).
void testTVSetup(CuTest* /*tc*/) {
77
tv_directory = _CLNEW RAMDirectory();
78
// NOTE(review): the third argument `true` presumably requests creation of a
// new index -- confirm against the IndexWriter constructor.
IndexWriter writer(tv_directory, &a, true);
79
writer.setUseCompoundFile(false);
82
for (int32_t i = 0; i < 1000; i++) {
84
// Render i as English words into buf; 200 is passed as the third argument
// (presumably the capacity of buf -- TODO confirm).
English::IntToEnglish(i,buf,200);
89
// Choose which Field::TERMVECTOR_* flag the field added below is indexed with.
if (mod2 == 0 && mod3 == 0)
90
termVector = Field::TERMVECTOR_WITH_POSITIONS_OFFSETS;
92
termVector = Field::TERMVECTOR_WITH_POSITIONS;
94
termVector = Field::TERMVECTOR_WITH_OFFSETS;
96
termVector = Field::TERMVECTOR_YES;
98
// The field named "field" is stored, tokenized, and carries the selected
// term-vector flag. The Field is heap-allocated and passed by reference;
// presumably the document takes ownership -- TODO confirm.
doc.add(*new Field(_T("field"), buf, Field::STORE_YES | Field::INDEX_TOKENIZED | termVector ));
99
writer.addDocument(&doc);
102
// Open the shared searcher on the directory populated above.
tv_searcher = _CLNEW IndexSearcher(tv_directory);
104
// Tears down the shared fixture: deletes tv_searcher, then closes and deletes
// tv_directory. The CuTest parameter is unused (its name is commented out).
void testTVCleanup(CuTest* /*tc*/) {
105
// The searcher is deleted before the directory it was opened on.
_CLDELETE(tv_searcher);
106
tv_directory->close();
107
_CLDELETE(tv_directory);
110
// Adds one field named "field" holding `text` to `doc`. The field is stored,
// tokenized, and indexed with Field::TERMVECTOR_YES.
// @param doc   Document that receives the new field.
// @param text  Text used as the field's value.
void setupDoc(Document& doc, const TCHAR* text)
112
// The Field is heap-allocated and passed by reference; presumably the
// document takes ownership -- TODO confirm.
doc.add(*new Field(_T("field"), text, Field::STORE_YES |
113
Field::INDEX_TOKENIZED | Field::TERMVECTOR_YES));
115
// Comparison functor that orders TCHAR strings by their content (via _tcscmp)
// instead of by pointer value. Used as the comparator of the std::map in
// testKnownSetOfDocuments.
struct __TCharCompare
117
// Returns true when _tcscmp reports that s1 sorts before s2.
bool operator()(const TCHAR* s1, const TCHAR* s2) const
119
return _tcscmp(s1, s2) < 0;
123
// Indexes four small documents with known text and verifies, for every
// term/document posting, that the frequency recorded in the document's term
// frequency vector matches the posting frequency. It then searches for
// "chocolate" and checks the hit order and the per-term frequencies of one hit
// against a hand-written table.
// @param tc  CuTest context passed to CuAssert in the catch handler.
void testKnownSetOfDocuments(CuTest *tc) {
124
const TCHAR* test1 = _T("eating chocolate in a computer lab"); //6 terms
125
const TCHAR* test2 = _T("computer in a computer lab"); //5 terms
126
const TCHAR* test3 = _T("a chocolate lab grows old"); //5 terms
127
const TCHAR* test4 = _T("eating chocolate with a chocolate lab in an old chocolate colored computer lab"); //13 terms
129
// Expected term -> frequency table for test4, keyed by string content
// through __TCharCompare.
typedef std::map<const TCHAR*, int32_t, __TCharCompare> test4MapType;
130
test4MapType test4Map;
131
test4Map.insert( std::pair<const TCHAR*,int32_t>(_T("chocolate"), 3) );
132
test4Map.insert( std::pair<const TCHAR*,int32_t>(_T("lab"), 2) );
133
test4Map.insert( std::pair<const TCHAR*,int32_t>(_T("eating"), 1) );
134
test4Map.insert( std::pair<const TCHAR*,int32_t>(_T("computer"), 1) );
135
test4Map.insert( std::pair<const TCHAR*,int32_t>(_T("with"), 1) );
136
test4Map.insert( std::pair<const TCHAR*,int32_t>(_T("a"), 1) );
137
test4Map.insert( std::pair<const TCHAR*,int32_t>(_T("colored"), 1) );
138
test4Map.insert( std::pair<const TCHAR*,int32_t>(_T("in"), 1) );
139
test4Map.insert( std::pair<const TCHAR*,int32_t>(_T("an"), 1) );
140
// NOTE(review): "computer" was already inserted above; std::map::insert does
// not overwrite an existing key, so this call has no effect.
test4Map.insert( std::pair<const TCHAR*,int32_t>(_T("computer"), 1) );
141
test4Map.insert( std::pair<const TCHAR*,int32_t>(_T("old"), 1) );
144
// Each text becomes the single "field" of its own document (see setupDoc).
setupDoc(testDoc1, test1);
146
setupDoc(testDoc2, test2);
148
setupDoc(testDoc3, test3);
150
setupDoc(testDoc4, test4);
156
// Write the four documents, in order, into the index held by dir.
IndexWriter writer(&dir, &a, true);
158
writer.addDocument(&testDoc1);
159
writer.addDocument(&testDoc2);
160
writer.addDocument(&testDoc3);
161
writer.addDocument(&testDoc4);
164
// Open a searcher on the same directory and obtain term/posting enumerators
// from its reader.
IndexSearcher knownSearcher(&dir);
165
TermEnum* termEnum = knownSearcher.getReader()->terms();
166
TermDocs* termDocs = knownSearcher.getReader()->termDocs();
168
CL_NS(search)::Similarity* sim = knownSearcher.getSimilarity();
169
// Outer loop: step through the terms returned by the reader's term enumerator.
while (termEnum->next() == true)
171
Term* term = termEnum->term(true);
172
//System.out.println("Term: " + term);
173
// Position the postings enumerator on the current term.
termDocs->seek(term);
175
// Inner loop: step through the documents listed for the current term.
while (termDocs->next())
177
int32_t docId = termDocs->doc();
178
int32_t freq = termDocs->freq();
179
//System.out.println("Doc Id: " + docId + " freq " + freq);
180
TermFreqVector* vector = knownSearcher.getReader()->getTermFreqVector(docId, _T("field"));
181
// tf/idf are computed only to exercise the Similarity calls; their values
// are not asserted.
float_t tf = sim->tf(freq);
182
float_t idf = sim->idf(term, &knownSearcher);
183
//float_t qNorm = sim.queryNorm()
184
idf += tf; //remove warning
186
const ArrayBase<const TCHAR*>* terms = vector->getTerms();
187
// NOTE(review): vector has already been dereferenced by the getTerms() call
// on the previous statement, so this null check comes too late to guard it.
CLUCENE_ASSERT(vector != NULL);
188
int termsCount=terms != NULL ? terms->length : 0;
190
//This is fine since we don't have stop words
191
float_t lNorm = sim->lengthNorm(_T("field"), termsCount);
192
lNorm ++;//remove warning
194
//float_t coord = sim.coord()
195
//System.out.println("TF: " + tf + " IDF: " + idf + " LenNorm: " + lNorm);
196
const ArrayBase<const TCHAR*>* vTerms = vector->getTerms();
197
const ArrayBase<int32_t>* freqs = vector->getTermFrequencies();
199
// Scan the vector's terms for the current term's text.
while ( vTerms && i < vTerms->length )
201
if ( _tcscmp(term->text(), vTerms->values[i]) == 0 )
203
// The frequency stored in the vector must equal the posting frequency.
CLUCENE_ASSERT((*freqs)[i] == freq);
211
//System.out.println("--------");
217
// Second part: search for "chocolate" and verify the ranking.
Term* tqTerm = _CLNEW Term(_T("field"), _T("chocolate"));
218
TermQuery query(tqTerm);
219
_CLDECDELETE(tqTerm);
221
Hits* hits = knownSearcher.search(&query);
222
//doc 3 should be the first hit b/c it is the shortest match
223
// Three of the four texts contain "chocolate" (test2 does not).
CLUCENE_ASSERT(hits->length() == 3);
224
float_t score = hits->score(0);
227
// Expected order of document ids: 2, 3, 0. Presumably ids follow insertion
// order, i.e. test3, test4, test1 -- TODO confirm.
CLUCENE_ASSERT(2==hits->id(0) );
228
CLUCENE_ASSERT(3==hits->id(1) );
229
CLUCENE_ASSERT(0==hits->id(2) );
231
// Inspect the term vector of the second-ranked hit (document id 3 per the
// assertion above).
TermFreqVector* vector = knownSearcher.getReader()->getTermFreqVector(hits->id(1), _T("field"));
232
CLUCENE_ASSERT(vector != NULL);
233
//_tprintf(_T("Vector: %s\n"),vector);
234
const ArrayBase<const TCHAR*>* terms = vector->getTerms();
235
const ArrayBase<int32_t>* freqs = vector->getTermFrequencies();
236
CLUCENE_ASSERT(terms != NULL);
238
int termsLength = terms->length;
239
// test4Map holds 10 distinct terms, and the vector is expected to as well.
CLUCENE_ASSERT(termsLength == 10);
241
// Every term in the vector must occur in the test4 text and carry the
// frequency recorded for it in test4Map.
for (int32_t i = 0; i < termsLength; i++) {
242
const TCHAR* term = terms->values[i];
243
//_tprintf(_T("Term: %s, test4map.size()=%d\n"),term, test4Map.size());
244
int32_t freq = (*freqs)[i];
245
CLUCENE_ASSERT( _tcsstr(test4,term) != NULL );
246
test4MapType::const_iterator itr = test4Map.find(term);
247
CLUCENE_ASSERT( itr != test4Map.end() );
248
int32_t freqInt = itr->second;
249
CLUCENE_ASSERT(freqInt == freq);
253
knownSearcher.close();
255
} catch (CLuceneError& e) {
256
// Any CLuceneError fails the test, using the error's text as the message.
CuAssert(tc, e.twhat(),false);
260
// Creates the "CLucene Term Vector Test" suite and registers this file's
// tests on it. testTVSetup is registered first and testTVCleanup last, which
// matches their roles of creating and destroying the shared
// tv_searcher/tv_directory fixture used by the tests in between.
CuSuite *testtermvector(void)
263
CuSuite *suite = CuSuiteNew(_T("CLucene Term Vector Test"));
264
SUITE_ADD_TEST(suite, testTVSetup);
265
SUITE_ADD_TEST(suite, testKnownSetOfDocuments);
266
SUITE_ADD_TEST(suite, testTermVectors);
267
SUITE_ADD_TEST(suite, testTermPositionVectors);
268
SUITE_ADD_TEST(suite, testTVCleanup);