1
/*------------------------------------------------------------------------------
2
* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
4
* Distributable under the terms of either the Apache License (Version 2.0) or
5
* the GNU Lesser General Public License, as specified in the COPYING file.
6
------------------------------------------------------------------------------*/
7
#ifndef _lucene_index_SegmentHeader_
8
#define _lucene_index_SegmentHeader_
10
#include "_SegmentInfos.h"
11
#include "CLucene/util/BitSet.h"
12
//#include "CLucene/util/VoidMap.h"
13
#include "CLucene/store/IndexInput.h"
14
#include "CLucene/store/IndexOutput.h"
15
#include "CLucene/index/IndexReader.h"
18
#include "_TermInfo.h"
19
//#include "FieldInfos.h"
20
#include "_FieldsReader.h"
21
#include "_TermVector.h"
22
//#include "IndexReader.h"
23
#include "_TermInfosReader.h"
24
#include "_CompoundFile.h"
25
#include "DirectoryIndexReader.h"
26
#include "_SkipListReader.h"
27
#include "CLucene/util/_ThreadLocal.h"
32
// TermDocs implementation (virtually inherited) that is constructed from a
// SegmentReader. It declares the seek/doc/freq/read/skipTo operations plus two
// empty hook methods (skippingDoc, skipProx).
// NOTE(review): this listing appears to have lost lines (access specifiers,
// some members and the closing brace) - confirm against the full header.
class SegmentTermDocs:public virtual TermDocs {
34
// presumably the reader passed to the constructor as Parent - TODO confirm
const SegmentReader* parent;
35
// presumably an input on the frequency data (SegmentReader documents its own
// freqStream member as "an IndexInput to the frequency file") - TODO confirm
CL_NS(store)::IndexInput* freqStream;
38
// presumably the deleted-documents bit set of the parent reader - TODO confirm
CL_NS(util)::BitSet* deletedDocs;
44
int32_t maxSkipLevels;
45
DefaultSkipListReader* skipListReader;
47
// base file pointers for the freq and prox data (as the names suggest;
// how they are computed is not visible in this header)
int64_t freqBasePointer;
48
int64_t proxBasePointer;
54
bool currentFieldStoresPayloads;
57
///\param Parent must be a segment reader
58
SegmentTermDocs( const SegmentReader* Parent);
59
virtual ~SegmentTermDocs();
61
// Three seek overloads: by Term, by TermEnum, and by an explicit
// TermInfo/Term pair.
virtual void seek(Term* term);
62
virtual void seek(TermEnum* termEnum);
63
virtual void seek(const TermInfo* ti,Term* term);
66
virtual int32_t doc()const;
67
virtual int32_t freq()const;
71
/** Optimized implementation. */
72
virtual int32_t read(int32_t* docs, int32_t* freqs, int32_t length);
74
/** Optimized implementation. */
75
virtual bool skipTo(const int32_t target);
77
virtual TermPositions* __asTermPositions();
80
// Hook with an empty body in this class.
virtual void skippingDoc(){}
81
// Hook with an empty body in this class; both parameters are unnamed and
// unused here. SegmentTermPositions declares its own skipProx.
virtual void skipProx(const int64_t /*proxPointer*/, const int32_t /*payloadLength*/){}
85
// Combines SegmentTermDocs with the TermPositions interface: adds position
// and payload accessors and forwards the calls that both bases declare to
// the SegmentTermDocs implementation.
// NOTE(review): this listing appears to have lost lines (access specifiers,
// some members and the closing brace) - confirm against the full header.
class SegmentTermPositions: public SegmentTermDocs, public TermPositions {
87
// input the payload/position data is read from (see needToLoadPayload below)
CL_NS(store)::IndexInput* proxStream;
91
// the current payload length
92
int32_t payloadLength;
93
// indicates whether the payload of the current position has
94
// been read from the proxStream yet
95
bool needToLoadPayload;
97
// these variables are being used to remember information for a lazy skip
// (see the note on lazily moving the prox pointer further down)
99
int64_t lazySkipPointer;
100
int32_t lazySkipProxCount;
103
///\param Parent must be a segment reader
104
SegmentTermPositions(const SegmentReader* Parent);
105
virtual ~SegmentTermPositions();
108
// Redeclares the TermInfo-based seek of SegmentTermDocs.
void seek(const TermInfo* ti, Term* term);
113
int32_t nextPosition();
115
int32_t readDeltaPosition();
122
// Redeclares read() of SegmentTermDocs with the same signature.
int32_t read(int32_t* docs, int32_t* freqs, int32_t length);
125
/** Called by super.skipTo(). */
126
void skipProx(const int64_t proxPointer, const int32_t _payloadLength);
129
void skipPositions( int32_t n );
132
// It is not always necessary to move the prox pointer
133
// to a new document after the freq pointer has been moved.
134
// Consider for example a phrase query with two terms:
135
// the freq pointer for term 1 has to move to document x
136
// to answer the question if the term occurs in that document. But
137
// only if term 2 also matches document x, the positions have to be
138
// read to figure out if term 1 and term 2 appear next
139
// to each other in document x and thus satisfy the query.
140
// So we move the prox pointer lazily to the document
141
// as soon as positions are requested.
145
// Payload accessors.
// NOTE(review): ownership and required size of the data buffer passed to
// getPayload are not visible from this declaration - confirm in the .cpp.
int32_t getPayloadLength() const;
147
uint8_t* getPayload(uint8_t* data);
149
bool isPayloadAvailable() const;
152
virtual TermDocs* __asTermDocs();
153
virtual TermPositions* __asTermPositions();
155
//resolve SegmentTermDocs/TermPositions ambiguity:
//each of the following simply forwards to the SegmentTermDocs implementation
156
void seek(Term* term){ SegmentTermDocs::seek(term); }
157
void seek(TermEnum* termEnum){ SegmentTermDocs::seek(termEnum); }
158
int32_t doc() const{ return SegmentTermDocs::doc(); }
159
int32_t freq() const{ return SegmentTermDocs::freq(); }
160
bool skipTo(const int32_t target){ return SegmentTermDocs::skipTo(target); }
167
* An IndexReader responsible for reading 1 segment of an index
169
class SegmentReader: public DirectoryIndexReader {
171
* The class Norm represents the normalizations for a field.
172
* These normalizations are read from an IndexInput into an array of bytes called bytes
174
class Norm :LUCENE_BASE{
177
SegmentReader* _this;
178
const char* segment; ///< pointer to segment name
179
volatile int32_t refCount;
180
bool useSingleNormStream;
184
/** Closes the underlying IndexInput for this norm.
185
* It is still valid to access all other norm properties after close is called.
186
* @throws IOException
190
DEFINE_MUTEX(THIS_LOCK)
192
CL_NS(store)::IndexInput* in;
196
Norm(CL_NS(store)::IndexInput* instrm, bool useSingleNormStream, int32_t number, int64_t normSeek, SegmentReader* reader, const char* segment);
200
void reWrite(SegmentInfo* si);
204
friend class SegmentReader;
206
static void doDelete(Norm* norm);
208
friend class SegmentReader::Norm;
210
//Holds the name of the segment that is being read
213
int32_t readBufferSize;
215
//Indicates if there are documents marked as deleted
216
bool deletedDocsDirty;
220
bool rollbackDeletedDocsDirty;
221
bool rollbackNormsDirty;
222
bool rollbackUndeleteAll;
225
//Holds all norms for all fields in the segment
226
typedef CL_NS(util)::CLHashtable<const TCHAR*,Norm*,
227
CL_NS(util)::Compare::TChar, CL_NS(util)::Equals::TChar,
228
CL_NS(util)::Deletor::Dummy,
233
uint8_t* fakeNorms();
235
// optionally used for the .nrm file shared by multiple norms
236
CL_NS(store)::IndexInput* singleNormStream;
238
// Compound File Reader when based on a compound file segment
239
CompoundFileReader* cfsReader;
240
CompoundFileReader* storeCFSReader;
242
///Reads the Field Info file
243
FieldsReader* fieldsReader;
244
TermVectorsReader* termVectorsReaderOrig;
245
CL_NS(util)::ThreadLocal<TermVectorsReader*,
246
CL_NS(util)::Deletor::Object<TermVectorsReader> >termVectorsLocal;
248
void initialize(SegmentInfo* si, int32_t readBufferSize, bool doOpenStores, bool doingReopen);
251
* Create a clone from the initial TermVectorsReader and store it in the ThreadLocal.
252
* @return TermVectorsReader
254
TermVectorsReader* getTermVectorsReader();
256
FieldsReader* getFieldsReader();
257
FieldInfos* getFieldInfos();
260
///Marks document docNum as deleted
261
void doDelete(const int32_t docNum);
262
void doUndeleteAll();
263
void commitChanges();
264
void doSetNorm(int32_t doc, const TCHAR* field, uint8_t value);
266
// can return null if norms aren't stored
267
uint8_t* getNorms(const TCHAR* field);
270
* Decrements the RC of the norms this reader is using
275
DirectoryIndexReader* doReopen(SegmentInfos* infos);
279
* @throws CorruptIndexException if the index is corrupt
280
* @throws IOException if there is a low-level IO error
282
static SegmentReader* get(SegmentInfo* si, bool doOpenStores=true);
285
* @throws CorruptIndexException if the index is corrupt
286
* @throws IOException if there is a low-level IO error
288
static SegmentReader* get(SegmentInfo* si, int32_t readBufferSize, bool doOpenStores=true);
291
* @throws CorruptIndexException if the index is corrupt
292
* @throws IOException if there is a low-level IO error
294
static SegmentReader* get(SegmentInfos* sis, SegmentInfo* si, bool closeDir);
297
* @throws CorruptIndexException if the index is corrupt
298
* @throws IOException if there is a low-level IO error
299
* @param readBufferSize defaults to BufferedIndexInput::BUFFER_SIZE
301
static SegmentReader* get(CL_NS(store)::Directory* dir, SegmentInfo* si,
303
bool closeDir, bool ownDir,
304
int32_t readBufferSize=-1,
305
bool doOpenStores=true);
311
virtual ~SegmentReader();
313
///Closes all streams to the files of a single segment
316
///Checks if a segment managed by SegmentInfo si has deletions
317
static bool hasDeletions(const SegmentInfo* si);
318
bool hasDeletions() const;
319
bool hasNorms(const TCHAR* field);
321
///Returns all file names managed by this SegmentReader
322
void files(std::vector<std::string>& retarray);
323
///Returns an enumeration of all the Terms and TermInfos in the set.
325
///Returns an enumeration of terms starting at or after the named term t
326
TermEnum* terms(const Term* t);
328
///Gets the document identified by n
329
bool document(int32_t n, CL_NS(document)::Document& doc, const CL_NS(document)::FieldSelector* fieldSelector);
331
///Checks if the n-th document has been marked deleted
332
bool isDeleted(const int32_t n);
334
///Returns an unpositioned TermDocs enumerator.
335
TermDocs* termDocs();
336
///Returns an unpositioned TermPositions enumerator.
337
TermPositions* termPositions();
339
///Returns the number of documents which contain the term t
340
int32_t docFreq(const Term* t);
342
///Returns the actual number of documents in the segment
344
///Returns the number of all the documents in the segment including the ones that have
345
///been marked deleted
346
int32_t maxDoc() const;
349
void setTermInfosIndexDivisor(int32_t indexDivisor);
351
int32_t getTermInfosIndexDivisor();
353
///Returns the bytes array that holds the norms of a named field.
354
///Returns fake norms if norms aren't available
355
uint8_t* norms(const TCHAR* field);
357
///Reads the Norms for field from disk
358
void norms(const TCHAR* field, uint8_t* bytes);
360
///Returns a filename built by concatenating segment with ext and x
361
std::string SegmentName(const char* ext, const int32_t x=-1);
362
///Creates a filename in buffer by concatenating segment with ext and x
363
void SegmentName(char* buffer,int32_t bufferLen,const char* ext, const int32_t x=-1 );
366
* @see IndexReader#getFieldNames(IndexReader.FieldOption fldOption)
368
void getFieldNames(FieldOption fldOption, StringArrayWithDeletor& retarray);
370
static bool usesCompoundFile(SegmentInfo* si);
372
/** Return a term frequency vector for the specified document and field. The
373
* vector returned contains term numbers and frequencies for all terms in
374
* the specified field of this document, if the field had storeTermVector
375
* flag set. If the flag was not set, the method returns null.
376
* @throws IOException
378
TermFreqVector* getTermFreqVector(int32_t docNumber, const TCHAR* field=NULL);
380
void getTermFreqVector(int32_t docNumber, const TCHAR* field, TermVectorMapper* mapper);
381
void getTermFreqVector(int32_t docNumber, TermVectorMapper* mapper);
383
/** Return an array of term frequency vectors for the specified document.
384
* The array contains a vector for each vectorized field in the document.
385
* Each vector contains term numbers and frequencies for all terms
386
* in a given vectorized field.
387
* If no such fields existed, the method returns null.
388
* @throws IOException
390
CL_NS(util)::ArrayBase<TermFreqVector*>* getTermFreqVectors(int32_t docNumber);
392
static const char* getClassName();
393
const char* getObjectName() const;
399
//Open all norms files for all fields
400
void openNorms(CL_NS(store)::Directory* cfsDir, int32_t readBufferSize);
402
///a bitVector that manages which documents have been deleted
403
CL_NS(util)::BitSet* deletedDocs;
404
///an IndexInput to the frequency file
405
CL_NS(store)::IndexInput* freqStream;
406
///For reading the fieldInfos file
407
FieldInfos* _fieldInfos;
408
///For reading the Term Dictionary .tis file
409
TermInfosReader* tis;
410
///an IndexInput to the prox file
411
CL_NS(store)::IndexInput* proxStream;
413
static bool hasSeparateNorms(SegmentInfo* si);
414
static uint8_t* createFakeNorms(int32_t size);
416
void loadDeletedDocs();
417
SegmentReader* reopenSegment(SegmentInfo* si);
419
/** Returns the field infos of this segment */
420
FieldInfos* fieldInfos();
423
* Return the name of the segment this reader is reading.
425
const char* getSegmentName();
428
* Return the SegmentInfo of the segment this reader is reading.
430
SegmentInfo* getSegmentInfo();
431
void setSegmentInfo(SegmentInfo* info);
433
void rollbackCommit();
435
//allow various classes to access the internals of this. this allows us to have
436
//a more tight idea of the package
437
friend class IndexReader;
438
friend class IndexWriter;
439
friend class SegmentTermDocs;
440
friend class SegmentTermPositions;
441
friend class MultiReader;
442
friend class MultiSegmentReader;
443
friend class SegmentMerger;