1
/*------------------------------------------------------------------------------
2
* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
4
* Distributable under the terms of either the Apache License (Version 2.0) or
5
* the GNU Lesser General Public License, as specified in the COPYING file.
6
------------------------------------------------------------------------------*/
7
#ifndef _lucene_index_internal_termvector_h
8
#define _lucene_index_internal_termvector_h
10
#include "CLucene/util/Array.h"
11
#include "_FieldInfos.h"
12
#include "TermVector.h"
13
//#include "FieldInfos.h"
17
// Writer side of the term vector storage. Holds three output streams
// (tvx, tvd, tvf) and a FieldInfos pointer. Only declarations are visible
// here; the behaviour lives in the implementation file.
class TermVectorsWriter:LUCENE_BASE {
19
// The three output streams. Ownership is not stated in this header; close()
// below is documented as closing all streams.
CL_NS(store)::IndexOutput* tvx, *tvd, *tvf;
20
// Field metadata; presumably the pointer handed to the constructor - confirm.
FieldInfos* fieldInfos;
23
// Parameters: a directory, a segment name and the field metadata.
// NOTE(review): how the stream names are derived from segment is not
// visible here.
TermVectorsWriter(CL_NS(store)::Directory* directory, const char* segment,
24
FieldInfos* fieldInfos);
28
* Add a complete document specified by all its term vectors. If document has no
29
* term vectors, add value for tvx.
34
// vectors presumably holds one TermFreqVector pointer per field of the document.
// NOTE(review): whether a NULL array is accepted for a document without term
// vectors cannot be told from this declaration - confirm in the implementation.
void addAllDocVectors(CL_NS(util)::ArrayBase<TermFreqVector*>* vectors);
36
// NOTE(review): the doc comment opened on the next line has no visible
// terminator before close(); confirm against the canonical header.
/** Close all streams.
37
* to suppress exceptions from being thrown, pass an error object to be filled in
39
void close(CLuceneError* err = NULL);
44
// Term frequency vector backed by two arrays, terms and termFreqs, which the
// constructor note below requires to be the same length.
class SegmentTermVector: public /*virtual*/ TermFreqVector {
47
// Term texts held by this vector.
CL_NS(util)::ArrayBase<TCHAR*>* terms;
48
// Frequencies; presumably termFreqs[i] belongs to terms[i] - confirm.
CL_NS(util)::ArrayBase<int32_t>* termFreqs;
50
// Looks key up in array. The name implies array is sorted; the return
// convention for a missing key is not visible here - confirm.
int32_t binarySearch(const CL_NS(util)::ArrayBase<TCHAR*>& array, const TCHAR* key) const;
52
//note: termFreqs must be the same length as terms
53
// NOTE(review): ownership of the two arrays after construction is not stated
// in this header; check the destructor in the implementation file.
SegmentTermVector(const TCHAR* field, CL_NS(util)::ArrayBase<TCHAR*>* terms,
54
CL_NS(util)::ArrayBase<int32_t>* termFreqs);
55
virtual ~SegmentTermVector();
59
* @return The name of the field this vector is associated with
61
// Accessors of SegmentTermVector. None except toString() is declared const,
// so they cannot be called through a const reference.
// Returns a const TCHAR pointer identifying the field; presumably the value
// passed as field to the constructor - confirm.
const TCHAR* getField();
62
// Returns a mutable TCHAR pointer. NOTE(review): this suggests a freshly
// allocated string that the caller releases - confirm in the implementation.
TCHAR* toString() const;
64
// The element type here is const TCHAR* while the terms member is declared
// with TCHAR*, so the implementation has to convert or cast between the two.
const CL_NS(util)::ArrayBase<const TCHAR*>* getTerms();
65
const CL_NS(util)::ArrayBase<int32_t>* getTermFrequencies();
66
// Index of termText; presumably resolved through binarySearch over terms,
// with the not-found value defined there - confirm.
int32_t indexOf(const TCHAR* termText);
67
// Batch form taking an array plus a start and len window. The returned array
// is non-const; NOTE(review): presumably owned by the caller - confirm.
CL_NS(util)::ArrayBase<int32_t>* indexesOf(const CL_NS(util)::ArrayBase<TCHAR*>& termNumbers, const int32_t start, const int32_t len);
69
// Virtual hook that SegmentTermPositionVector redeclares; presumably returns
// NULL here because this class is not a TermPositionVector - confirm.
virtual TermPositionVector* __asTermPositionVector();
77
class TermVectorMapper; // Forward declaration: TermVectorsReader takes TermVectorMapper* arguments before the class itself is defined further down
79
// Reader side of the term vector storage. Holds three input streams
// (tvx, tvd, tvf) and a FieldInfos pointer, and offers lookups by document
// number and, optionally, field.
class CLUCENE_EXPORT TermVectorsReader:LUCENE_BASE {
81
// Format version constant; presumably what checkValidFormat compares
// against - confirm.
LUCENE_STATIC_CONSTANT(int32_t, FORMAT_VERSION = 2);
82
// Two single-bit values (0x1 and 0x2) of type uint8_t, so both can be set
// in one byte. Where that byte is read is not visible in this header.
LUCENE_STATIC_CONSTANT(uint8_t, STORE_POSITIONS_WITH_TERMVECTOR = 0x1);
83
LUCENE_STATIC_CONSTANT(uint8_t, STORE_OFFSET_WITH_TERMVECTOR = 0x2);
86
//The size in bytes that the FORMAT_VERSION will take up at the beginning of each file
87
LUCENE_STATIC_CONSTANT(int32_t, FORMAT_SIZE = 4);
89
FieldInfos* fieldInfos;
91
// The three input streams read by this class.
CL_NS(store)::IndexInput* tvx;
92
CL_NS(store)::IndexInput* tvd;
93
CL_NS(store)::IndexInput* tvf;
94
// Backs size() below, which is documented as the number of documents.
int64_t _size; // TODO: size_t ?
96
// The docID offset where our docs begin in the index
97
// file. This will be 0 if we have our own private file.
98
int32_t docStoreOffset;
104
// NOTE(review): the docStoreOffset argument defaults to -1 although the member
// comment above says 0 means a private file; presumably -1 is a sentinel that
// the constructor normalises - confirm. The size argument is int32_t while
// the _size member and size() use int64_t.
TermVectorsReader(CL_NS(store)::Directory* d, const char* segment, FieldInfos* fieldInfos,
105
int32_t readBufferSize=LUCENE_STREAM_BUFFER_SIZE, int32_t docStoreOffset=-1, int32_t size=0);
106
~TermVectorsReader();
109
// Takes one input stream and returns an int32_t; presumably the format
// version read from it after validation - confirm.
int32_t checkValidFormat(CL_NS(store)::IndexInput* in);
116
* @return The number of documents in the reader
118
int64_t size() const;
121
// Mapper variant of the lookup documented below: returns nothing and takes a
// TermVectorMapper instead.
void get(const int32_t docNum, const TCHAR* field, TermVectorMapper* mapper);
124
* Retrieve the term vector for the given document and field
125
* @param docNum The document number to retrieve the vector for
126
* @param field The field within the document to retrieve
127
* @return The TermFreqVector for the document and field or null if there is no termVector for this field.
128
* @throws IOException if there is an error reading the term vector files
130
TermFreqVector* get(const int32_t docNum, const TCHAR* field);
133
* Return all term vectors stored for this document or null if they could not be read in.
135
* @param docNum The document number to retrieve the vector for
136
* @return All term frequency vectors
137
* @throws IOException if there is an error reading the term vector files
139
// Whole-document lookup: returns an array of TermFreqVector pointers for docNum.
CL_NS(util)::ArrayBase<TermFreqVector*>* get(const int32_t docNum);
140
//bool get(int32_t docNum, CL_NS(util)::ObjectArray<TermFreqVector*>& result);
142
// Mapper variant of the whole-document lookup: returns nothing and takes a
// TermVectorMapper instead.
void get(const int32_t docNumber, TermVectorMapper* mapper);
145
// fields and tvfPointers are passed with a shared count len; presumably
// fields[i] pairs with tvfPointers[i] - confirm.
CL_NS(util)::ObjectArray<SegmentTermVector>* readTermVectors(const int32_t docNum,
146
const TCHAR** fields, const int64_t* tvfPointers, const int32_t len);
148
// Same inputs without docNum; output goes through mapper rather than a
// returned array.
void readTermVectors(const TCHAR** fields, const int64_t* tvfPointers,
149
const int32_t len, TermVectorMapper* mapper);
153
* @param field The field to read in
154
* @param tvfPointer The pointer within the tvf file where we should start reading
155
* @param mapper The mapper used to map the TermVector
156
* @return The TermVector located at that position
157
* @throws IOException
159
// NOTE(review): despite the @return line above, this function is declared
// void; the only output channel in the signature is mapper.
void readTermVector(const TCHAR* field, const int64_t tvfPointer, TermVectorMapper* mapper);
162
// Declares a mutex named THIS_LOCK through the project macro; what it guards
// is not visible in this header.
DEFINE_MUTEX(THIS_LOCK)
163
// Copy constructor and clone(); presumably clone() is implemented with the
// copy constructor - confirm. The caller receives a raw pointer from clone().
TermVectorsReader(const TermVectorsReader& copy);
166
TermVectorsReader* clone() const;
170
// SegmentTermVector extended with positions and offsets. Both additions are
// arrays of arrays; getOffsets and getTermPositions select one inner array
// by index.
class SegmentTermPositionVector: public SegmentTermVector, public TermPositionVector {
172
CL_NS(util)::ArrayBase< CL_NS(util)::ArrayBase<int32_t>* >* positions;
173
CL_NS(util)::ArrayBase< CL_NS(util)::ArrayBase<TermVectorOffsetInfo*>* >* offsets;
174
// Static array shared by all instances; presumably what getTermPositions
// returns when nothing is stored - confirm.
static CL_NS(util)::ValueArray<int32_t> EMPTY_TERM_POS;
176
// The first three arguments match the SegmentTermVector constructor; the last
// two presumably initialise positions and offsets. NOTE(review): ownership
// of the arrays is not stated in this header.
SegmentTermPositionVector(const TCHAR* field,
177
CL_NS(util)::ArrayBase<TCHAR*>* terms,
178
CL_NS(util)::ArrayBase<int32_t>* termFreqs,
179
CL_NS(util)::ArrayBase< CL_NS(util)::ArrayBase<int32_t>* >* _positions,
180
CL_NS(util)::ArrayBase< CL_NS(util)::ArrayBase<TermVectorOffsetInfo*>* >* _offsets);
181
~SegmentTermPositionVector();
184
* Returns an array of TermVectorOffsetInfo in which the term is found.
186
* @param index The position in the array to get the offsets from
187
* @return An array of TermVectorOffsetInfo objects or the empty list
188
* @see org.apache.lucene.analysis.Token
190
const CL_NS(util)::ArrayBase<TermVectorOffsetInfo*>* getOffsets(const size_t index);
193
* Returns an array of positions in which the term is found.
194
* Terms are identified by the index at which its number appears in the
195
* term String array obtained from the <code>indexOf</code> method.
197
const CL_NS(util)::ArrayBase<int32_t>* getTermPositions(const size_t index);
200
// The one-line members below forward to the SegmentTermVector implementation.
// The class has two bases, so presumably these exist to settle which base
// supplies each member - confirm against TermVector.h.
const TCHAR* getField(){ return SegmentTermVector::getField(); }
201
TCHAR* toString() const{ return SegmentTermVector::toString(); }
202
// NOTE(review): no size() declaration is visible in SegmentTermVector in this
// view; confirm that the base class declares it.
int32_t size(){ return SegmentTermVector::size(); }
203
const CL_NS(util)::ArrayBase<const TCHAR*>* getTerms(){ return SegmentTermVector::getTerms(); }
204
const CL_NS(util)::ArrayBase<int32_t>* getTermFrequencies(){ return SegmentTermVector::getTermFrequencies(); }
205
int32_t indexOf(const TCHAR* termText){ return SegmentTermVector::indexOf(termText); }
206
// Unlike its neighbours this member is only declared here; its body is not
// in this header.
CL_NS(util)::ArrayBase<int32_t>* indexesOf(const CL_NS(util)::ArrayBase<TCHAR*>& termNumbers, const int32_t start, const int32_t len);
208
// Redeclares the hook from SegmentTermVector; presumably returns this object
// as a TermPositionVector - confirm.
virtual TermPositionVector* __asTermPositionVector();
212
* The TermVectorMapper can be used to map Term Vectors into your own
213
* structure instead of the parallel array structure used by
214
* {@link org.apache.lucene.index.IndexReader#getTermFreqVector(int,String)}.
216
* It is up to the implementation to make sure it is thread-safe.
220
// Abstract callback interface for receiving term vector data.
// setExpectations and map are pure virtual, so every concrete mapper must
// implement both.
class CLUCENE_EXPORT TermVectorMapper : LUCENE_BASE{
222
// Flags reported by isIgnoringPositions and isIgnoringOffsets; presumably
// set from the two constructor arguments - confirm.
bool ignoringPositions;
223
bool ignoringOffsets;
227
// Virtual destructor with an empty inline body. The semicolon after the body
// is redundant but harmless.
virtual ~TermVectorMapper(){};
231
* @param ignoringPositions true if this mapper should tell Lucene to ignore positions even if they are stored
232
* @param ignoringOffsets similar to ignoringPositions
234
TermVectorMapper(const bool _ignoringPositions, const bool _ignoringOffsets);
238
* Tell the mapper what to expect in regards to field, number of terms, offset and position storage.
239
* This method will be called once before retrieving the vector for a field.
241
* This method will be called before {@link #map(String,int,TermVectorOffsetInfo[],int[])}.
242
* @param field The field the vector is for
243
* @param numTerms The number of terms that need to be mapped
244
* @param storeOffsets true if the mapper should expect offset information
245
* @param storePositions true if the mapper should expect positions info
247
virtual void setExpectations(const TCHAR* _field, const int32_t numTerms, const bool storeOffsets,
248
const bool storePositions) = 0;
251
* Map the Term Vector information into your own structure
252
* @param term The term to add to the vector
253
* @param frequency The frequency of the term in the document
254
* @param offsets null if the offset is not specified, otherwise the offset into the field of the term
255
* @param positions null if the position is not specified, otherwise the position in the field of the term
256
* @memory offset and position objects must be cleaned up by implementing class
258
// termLen is not covered by the parameter notes above; presumably the length
// of term in characters - confirm.
virtual void map(const TCHAR* term, const int32_t termLen, const int32_t frequency,
259
CL_NS(util)::ArrayBase<TermVectorOffsetInfo*>* _offsets,
260
CL_NS(util)::ArrayBase<int32_t>* _positions) = 0;
263
* Indicate to Lucene that even if there are positions stored, this mapper is not interested in them and they
264
* can be skipped over. Derived classes should set this to true if they want to ignore positions. The default
265
* is false, meaning positions will be loaded if they are stored.
268
// Non-virtual const accessor; presumably returns the ignoringPositions
// member - confirm.
bool isIgnoringPositions() const;
272
* @see #isIgnoringPositions() Same principle as {@link #isIgnoringPositions()}, but applied to offsets. false by default.
275
// Non-virtual const accessor, counterpart of isIgnoringPositions; presumably
// returns the ignoringOffsets member - confirm.
bool isIgnoringOffsets() const;
278
* Passes down the index of the document whose term vector is currently being mapped,
279
* once for each top level call to a term vector reader.
281
* Default implementation IGNORES the document number. Override if your implementation needs the document number.
283
* NOTE: Document numbers are internal to Lucene and subject to change depending on indexing operations.
285
* @param documentNumber index of document currently being mapped
287
// Virtual but not pure, unlike setExpectations and map: overriding is
// optional because a default implementation exists.
virtual void setDocumentNumber(const int32_t documentNumber);
291
* Models the existing parallel array structure
293
// Concrete TermVectorMapper that collects mapped data into arrays and turns
// them into a TermFreqVector through materializeVector().
// NOTE(review): neither the opening brace of the class body nor the end of the
// class is visible in this view; confirm against the canonical header.
class ParallelArrayTermVectorMapper : public TermVectorMapper
296
// These four arrays have the same types as the arguments of the
// SegmentTermPositionVector constructor.
CL_NS(util)::ArrayBase<TCHAR*>* terms;
297
CL_NS(util)::ArrayBase<int32_t>* termFreqs;
298
CL_NS(util)::ArrayBase< CL_NS(util)::ArrayBase<int32_t>* >* positions;
299
CL_NS(util)::ArrayBase< CL_NS(util)::ArrayBase<TermVectorOffsetInfo*>* >* offsets;
300
// Presumably the next slot of the arrays to fill when map() is called - confirm.
int32_t currentPosition;
302
// Presumably records the storePositions value given to setExpectations - confirm.
bool storingPositions;
306
ParallelArrayTermVectorMapper();
307
virtual ~ParallelArrayTermVectorMapper();
309
// Implements the pure virtual setExpectations of TermVectorMapper with the
// same parameter list.
void setExpectations(const TCHAR* _field, const int32_t numTerms,
310
const bool storeOffsets, const bool storePositions);
312
// Implements the pure virtual map of TermVectorMapper with the same
// parameter list.
void map(const TCHAR* term, const int32_t termLen, const int32_t frequency,
313
CL_NS(util)::ArrayBase<TermVectorOffsetInfo*>* _offsets,
314
CL_NS(util)::ArrayBase<int32_t>* _positions);
317
* Construct the vector
318
* @return The {@link TermFreqVector} based on the mappings.
319
* @memory Caller is responsible for freeing up the returned object
321
// The concrete type returned is not visible here; presumably a
// SegmentTermVector or SegmentTermPositionVector - confirm.
TermFreqVector* materializeVector();