1
/*------------------------------------------------------------------------------
2
* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
4
* Distributable under the terms of either the Apache License (Version 2.0) or
5
* the GNU Lesser General Public License, as specified in the COPYING file.
6
------------------------------------------------------------------------------*/
7
#ifndef _lucene_index_IndexWriter_
8
#define _lucene_index_IndexWriter_
10
#include "CLucene/util/VoidList.h"
11
#include "CLucene/util/Array.h"
12
CL_CLASS_DEF(search,Similarity)
13
CL_CLASS_DEF(store,Lock)
14
CL_CLASS_DEF(analysis,Analyzer)
15
CL_CLASS_DEF(store,Directory)
16
CL_CLASS_DEF(store,LuceneLock)
17
CL_CLASS_DEF(document,Document)
19
#include "MergePolicy.h"
20
#include "CLucene/LuceneThreads.h"
29
class DocumentsWriter;
30
class IndexFileDeleter;
32
class IndexDeletionPolicy;
36
An <code>IndexWriter</code> creates and maintains an index.
38
<p>The <code>create</code> argument to the
39
<a href="#IndexWriter(org.apache.lucene.store.Directory, org.apache.lucene.analysis.Analyzer, boolean)"><b>constructor</b></a>
40
determines whether a new index is created, or whether an existing index is
opened.  Note that you
42
can open an index with <code>create=true</code> even while readers are
43
using the index. The old readers will continue to search
44
the "point in time" snapshot they had opened, and won't
45
see the newly created index until they re-open. There are
46
also <a href="#IndexWriter(org.apache.lucene.store.Directory, org.apache.lucene.analysis.Analyzer)"><b>constructors</b></a>
47
with no <code>create</code> argument which
48
will create a new index if there is not already an index at the
49
provided path and otherwise open the existing index.</p>
51
<p>In either case, documents are added with <a
52
href="#addDocument(org.apache.lucene.document.Document)"><b>addDocument</b></a>
and removed with <a
54
href="#deleteDocuments(org.apache.lucene.index.Term)"><b>deleteDocuments</b></a>.
55
A document can be updated with <a href="#updateDocument(org.apache.lucene.index.Term, org.apache.lucene.document.Document)"><b>updateDocument</b></a>
56
(which just deletes and then adds the entire document).
57
When finished adding, deleting and updating documents, <a href="#close()"><b>close</b></a> should be called.</p>
59
<p>These changes are buffered in memory and periodically
60
flushed to the {@link Directory} (during the above method
61
calls). A flush is triggered when there are enough
62
buffered deletes (see {@link #setMaxBufferedDeleteTerms})
63
or enough added documents since the last flush, whichever
64
is sooner. For the added documents, flushing is triggered
65
either by RAM usage of the documents (see {@link
66
#setRAMBufferSizeMB}) or the number of added documents.
67
The default is to flush when RAM usage hits 16 MB. For
68
best indexing speed you should flush by RAM usage with a
69
large RAM buffer. You can also force a flush by calling
70
{@link #flush}. When a flush occurs, both pending deletes
71
and added documents are flushed to the index. A flush may
72
also trigger one or more segment merges which by default
73
run with a background thread so as not to block the
74
addDocument calls (see <a href="#mergePolicy">below</a>
75
for changing the {@link MergeScheduler}).</p>
77
<a name="autoCommit"></a>
78
<p>The optional <code>autoCommit</code> argument to the
79
<a href="#IndexWriter(org.apache.lucene.store.Directory, boolean, org.apache.lucene.analysis.Analyzer)"><b>constructors</b></a>
80
controls visibility of the changes to {@link IndexReader} instances reading the same index.
81
When this is <code>false</code>, changes are not
82
visible until {@link #close()} is called.
83
Note that changes will still be flushed to the
84
{@link org.apache.lucene.store.Directory} as new files,
85
but are not committed (no new <code>segments_N</code> file
86
is written referencing the new files) until {@link #close} is
87
called. If something goes terribly wrong (for example the
88
JVM crashes) before {@link #close()}, then
89
the index will reflect none of the changes made (it will
90
remain in its starting state).
91
You can also call {@link #abort()}, which closes the writer without committing any
92
changes, and removes any index
93
files that had been flushed but are now unreferenced.
94
This mode is useful for preventing readers from refreshing
95
at a bad time (for example after you've done all your
96
deletes but before you've done your adds).
97
It can also be used to implement simple single-writer
98
transactional semantics ("all or none").</p>
100
<p>When <code>autoCommit</code> is <code>true</code> then
101
every flush is also a commit ({@link IndexReader}
102
instances will see each flush as changes to the index).
103
This is the default, to match the behavior before 2.2.
104
When running in this mode, be careful not to refresh your
105
readers while optimize or segment merges are taking place
106
as this can tie up substantial disk space.</p>
108
<p>Regardless of <code>autoCommit</code>, an {@link
109
IndexReader} or {@link org.apache.lucene.search.IndexSearcher} will only see the
110
index as of the "point in time" that it was opened. Any
111
changes committed to the index after the reader was opened
112
are not visible until the reader is re-opened.</p>
114
<p>If an index will not have more documents added for a while and optimal search
115
performance is desired, then the <a href="#optimize()"><b>optimize</b></a>
116
method should be called before the index is closed.</p>
118
<p>Opening an <code>IndexWriter</code> creates a lock file for the directory in use. Trying to open
119
another <code>IndexWriter</code> on the same directory will lead to a
120
{@link LockObtainFailedException}. The {@link LockObtainFailedException}
121
is also thrown if an IndexReader on the same directory is used to delete documents
from the index.</p>
124
<a name="deletionPolicy"></a>
125
<p>Expert: <code>IndexWriter</code> allows an optional
126
{@link IndexDeletionPolicy} implementation to be
127
specified. You can use this to control when prior commits
128
are deleted from the index. The default policy is {@link
129
KeepOnlyLastCommitDeletionPolicy} which removes all prior
130
commits as soon as a new commit is done (this matches
131
behavior before 2.2). Creating your own policy can allow
132
you to explicitly keep previous "point in time" commits
133
alive in the index for some time, to allow readers to
134
refresh to the new commit without having the old commit
135
deleted out from under them. This is necessary on
136
filesystems like NFS that do not support "delete on last
137
close" semantics, which Lucene's "point in time" search
138
normally relies on. </p>
140
<a name="mergePolicy"></a> <p>Expert:
141
<code>IndexWriter</code> allows you to separately change
142
the {@link MergePolicy} and the {@link MergeScheduler}.
143
The {@link MergePolicy} is invoked whenever there are
144
changes to the segments in the index. Its role is to
145
select which merges to do, if any, and return a {@link
146
MergePolicy.MergeSpecification} describing the merges. It
147
also selects merges to do for optimize(). (The default is
148
{@link LogByteSizeMergePolicy}. Then, the {@link
149
MergeScheduler} is invoked with the requested merges and
150
it decides when and how to run the merges. The default is
151
{@link ConcurrentMergeScheduler}. </p>
154
* Clarification: Check Points (and commits)
155
* Being able to set autoCommit=false allows IndexWriter to flush and
156
* write new index files to the directory without writing a new segments_N
157
* file which references these new files. It also means that the state of
158
* the in memory SegmentInfos object is different than the most recent
159
* segments_N file written to the directory.
161
* Each time the SegmentInfos is changed, and matches the (possibly
162
* modified) directory files, we have a new "check point".
163
* If the modified/new SegmentInfos is written to disk - as a new
164
* (generation of) segments_N file - this check point is also an
167
* With autoCommit=true, every checkPoint is also a CommitPoint.
168
* With autoCommit=false, some checkPoints may not be commits.
170
* A new checkpoint always replaces the previous checkpoint and
171
* becomes the new "front" of the index. This allows the IndexFileDeleter
172
* to delete files that are referenced only by stale checkpoints.
173
* (files that were created since the last commit, but are no longer
174
* referenced by the "front" of the index). For this, IndexFileDeleter
175
* keeps track of the last non commit checkpoint.
177
class CLUCENE_EXPORT IndexWriter:LUCENE_BASE {
178
bool isOpen; //indicates if the writers is open - this way close can be called multiple times
180
// how to analyze text
181
CL_NS(analysis)::Analyzer* analyzer;
183
CL_NS(search)::Similarity* similarity; // how to normalize
189
// Holds all SegmentInfo instances currently involved in
191
typedef CL_NS(util)::CLHashSet<SegmentInfo*, CL_NS(util)::Compare::Void<SegmentInfo> > MergingSegmentsType;
192
MergingSegmentsType* mergingSegments;
193
MergePolicy* mergePolicy;
194
MergeScheduler* mergeScheduler;
196
typedef CL_NS(util)::CLLinkedList<MergePolicy::OneMerge*,
197
CL_NS(util)::Deletor::Object<MergePolicy::OneMerge> > PendingMergesType;
198
PendingMergesType* pendingMerges;
200
typedef CL_NS(util)::CLHashSet<MergePolicy::OneMerge*,
201
CL_NS(util)::Compare::Void<MergePolicy::OneMerge>,
202
CL_NS(util)::Deletor::Object<MergePolicy::OneMerge> > RunningMergesType;
203
RunningMergesType* runningMerges;
205
typedef CL_NS(util)::CLArrayList<MergePolicy::OneMerge*> MergeExceptionsType;
206
MergeExceptionsType* mergeExceptions;
211
/** If non-null, information about merges will be printed to this.
213
std::ostream* infoStream;
214
static std::ostream* defaultInfoStream;
218
bool commitPending; // true if segmentInfos has changes not yet committed
219
SegmentInfos* rollbackSegmentInfos; // segmentInfos we will fallback to if the commit fails
221
SegmentInfos* localRollbackSegmentInfos; // segmentInfos we will fallback to if the commit fails
222
bool localAutoCommit; // saved autoCommit during local transaction
223
bool autoCommit; // false if we should commit only on close
225
DocumentsWriter* docWriter;
226
IndexFileDeleter* deleter;
228
typedef std::vector<SegmentInfo*> SegmentsToOptimizeType;
229
SegmentsToOptimizeType* segmentsToOptimize; // used by optimize to note those needing optimization
232
CL_NS(store)::LuceneLock* writeLock;
234
void init(CL_NS(store)::Directory* d, CL_NS(analysis)::Analyzer* a, bool closeDir, IndexDeletionPolicy* deletionPolicy, bool autoCommit);
235
void init(CL_NS(store)::Directory* d, CL_NS(analysis)::Analyzer* a, bool create, bool closeDir, IndexDeletionPolicy* deletionPolicy, bool autoCommit);
236
void deinit(bool releaseWriteLock = true) throw();
238
// where this index resides
239
CL_NS(store)::Directory* directory;
243
int32_t getSegmentsCounter();
244
int32_t maxFieldLength;
246
int32_t minMergeDocs;
247
int32_t maxMergeDocs;
248
int32_t termIndexInterval;
250
int64_t writeLockTimeout;
251
int64_t commitLockTimeout;
253
// The normal read buffer size defaults to 1024, but
254
// increasing this during merging seems to yield
255
// performance gains. However we don't want to increase
256
// it too much because there are quite a few
257
// BufferedIndexInputs created during merging. See
258
// LUCENE-888 for details.
259
static const int32_t MERGE_READ_BUFFER_SIZE;
261
// Used for printing messages
262
STATIC_DEFINE_MUTEX(MESSAGE_ID_LOCK)
263
static int32_t MESSAGE_ID;
268
DEFINE_MUTEX(THIS_LOCK)
269
DEFINE_CONDITION(THIS_WAIT_CONDITION)
271
// Release the write lock, if needed.
272
SegmentInfos* segmentInfos;
274
// Release the write lock, if needed.
275
virtual ~IndexWriter();
278
* The Java implementation of Lucene silently truncates any tokenized
279
* field if the number of tokens exceeds a certain threshold. Although
280
* that threshold is adjustable, it is easy for the client programmer
281
* to be unaware that such a threshold exists, and to become its
283
* CLucene implements a less insidious truncation policy. Up to
284
* DEFAULT_MAX_FIELD_LENGTH tokens, CLucene behaves just as JLucene
285
* does. If the number of tokens exceeds that threshold without any
286
* indication of a truncation preference by the client programmer,
287
* CLucene raises an exception, prompting the client programmer to
288
* explicitly set a truncation policy by adjusting maxFieldLength.
290
LUCENE_STATIC_CONSTANT(int32_t, DEFAULT_MAX_FIELD_LENGTH = 10000);
291
LUCENE_STATIC_CONSTANT(int32_t, FIELD_TRUNC_POLICY__WARN = -1);
294
* Returns the maximum number of terms that will be
295
* indexed for a single field in a document.
296
* @see #setMaxFieldLength
298
int32_t getMaxFieldLength();
300
* The maximum number of terms that will be indexed for a single field in a
301
* document. This limits the amount of memory required for indexing, so that
302
* collections with very large files will not crash the indexing process by
303
* running out of memory. This setting refers to the number of running terms,
304
* not to the number of different terms.<p/>
305
* <strong>Note:</strong> see {@link DEFAULT_MAX_FIELD_LENGTH} for an important
306
* note regarding field lengths.
307
* @see #DEFAULT_MAX_FIELD_LENGTH
309
void setMaxFieldLength(int32_t val);
311
/** Determines the minimal number of documents required before the buffered
312
* in-memory documents are merged and a new Segment is created.
313
* Since Documents are merged in a {@link RAMDirectory},
314
* large value gives faster indexing. At the same time, mergeFactor limits
315
* the number of files open in a FSDirectory.
317
* <p> The default value is DEFAULT_MAX_BUFFERED_DOCS.*/
318
void setMaxBufferedDocs(int32_t val);
320
* @see #setMaxBufferedDocs
322
int32_t getMaxBufferedDocs();
325
* Default value for the write lock timeout (1,000).
326
* @see #setDefaultWriteLockTimeout
328
static int64_t WRITE_LOCK_TIMEOUT;
330
* Sets the maximum time to wait for a write lock (in milliseconds).
332
void setWriteLockTimeout(int64_t writeLockTimeout);
334
* @see #setWriteLockTimeout
336
int64_t getWriteLockTimeout();
339
* Sets the maximum time to wait for a commit lock (in milliseconds).
341
void setCommitLockTimeout(int64_t commitLockTimeout);
343
* @see #setCommitLockTimeout
345
int64_t getCommitLockTimeout();
348
* Name of the write lock in the index.
350
static const char* WRITE_LOCK_NAME; //"write.lock";
354
* @see LogMergePolicy#DEFAULT_MERGE_FACTOR
356
static const int32_t DEFAULT_MERGE_FACTOR ;
359
* Value to denote a flush trigger is disabled
361
static const int32_t DISABLE_AUTO_FLUSH;
364
* Disabled by default (because IndexWriter flushes by RAM usage
365
* by default). Change using {@link #setMaxBufferedDocs(int)}.
367
static const int32_t DEFAULT_MAX_BUFFERED_DOCS;
370
* Default value is 16 MB (which means flush when buffered
371
* docs consume 16 MB RAM). Change using {@link #setRAMBufferSizeMB}.
373
static const float_t DEFAULT_RAM_BUFFER_SIZE_MB;
376
* Disabled by default (because IndexWriter flushes by RAM usage
377
* by default). Change using {@link #setMaxBufferedDeleteTerms(int)}.
379
static const int32_t DEFAULT_MAX_BUFFERED_DELETE_TERMS;
383
* @see LogDocMergePolicy#DEFAULT_MAX_MERGE_DOCS
385
static const int32_t DEFAULT_MAX_MERGE_DOCS;
388
* Absolute hard maximum length for a term. If a term
389
* arrives from the analyzer longer than this length, it
390
* is skipped and a message is printed to infoStream, if
391
* set (see {@link #setInfoStream}).
393
static const int32_t MAX_TERM_LENGTH;
396
/* Determines how often segment indices are merged by addDocument(). With
397
* smaller values, less RAM is used while indexing, and searches on
398
* unoptimized indices are faster, but indexing speed is slower. With larger
399
* values more RAM is used while indexing and searches on unoptimized indices
400
* are slower, but indexing is faster. Thus larger values (> 10) are best
401
* for batched index creation, and smaller values (< 10) for indices that are
402
* interactively maintained.
404
* <p>This must never be less than 2. The default value is 10.
406
int32_t getMergeFactor() const;
407
void setMergeFactor(int32_t val);
410
/** Expert: The fraction of terms in the "dictionary" which should be stored
411
* in RAM. Smaller values use more memory, but make searching slightly
412
* faster, while larger values use less memory and make searching slightly
413
* slower. Searching is typically not dominated by dictionary lookup, so
414
* tweaking this is rarely useful.
416
LUCENE_STATIC_CONSTANT(int32_t, DEFAULT_TERM_INDEX_INTERVAL = 128);
417
/** Expert: Set the interval between indexed terms. Large values cause less
418
* memory to be used by IndexReader, but slow random-access to terms. Small
419
* values cause more memory to be used by an IndexReader, and speed
420
* random-access to terms.
422
* This parameter determines the amount of computation required per query
423
* term, regardless of the number of documents that contain that term. In
424
* particular, it is the maximum number of other terms that must be
425
* scanned before a term is located and its frequency and position information
426
* may be processed. In a large index with user-entered query terms, query
427
* processing time is likely to be dominated not by term lookup but rather
428
* by the processing of frequency and positional data. In a small index
429
* or when many uncommon query terms are generated (e.g., by wildcard
430
* queries) term lookup may become a dominant cost.
432
* In particular, <code>numUniqueTerms/interval</code> terms are read into
433
* memory by an IndexReader, and, on average, <code>interval/2</code> terms
434
* must be scanned for each random term access.
436
* @see #DEFAULT_TERM_INDEX_INTERVAL
438
void setTermIndexInterval(int32_t interval);
439
/** Expert: Return the interval between indexed terms.
441
* @see #setTermIndexInterval(int)
443
int32_t getTermIndexInterval();
445
/**Determines the largest number of documents ever merged by addDocument().
446
* Small values (e.g., less than 10,000) are best for interactive indexing,
447
* as this limits the length of pauses while indexing to a few seconds.
448
* Larger values are best for batched indexing and speedier searches.
450
* <p>The default value is {@link Integer#MAX_VALUE}.
452
int32_t getMaxMergeDocs() const;
453
void setMaxMergeDocs(int32_t val);
456
* Constructs an IndexWriter for the index in <code>path</code>.
457
* Text will be analyzed with <code>a</code>. If <code>create</code>
458
* is true, then a new, empty index will be created in
459
* <code>path</code>, replacing the index already there, if any.
461
* @param path the path to the index directory
462
* @param a the analyzer to use
463
* @param create <code>true</code> to create the index or overwrite
464
* the existing one; <code>false</code> to append to the existing
466
* @throws CorruptIndexException if the index is corrupt
467
* @throws LockObtainFailedException if another writer
468
* has this index open (<code>write.lock</code> could not
470
* @throws IOException if the directory cannot be read/written to, or
471
* if it does not exist and <code>create</code> is
472
* <code>false</code> or if there is any other low-level
475
explicit IndexWriter(const char* path, CL_NS(analysis)::Analyzer* a, const bool create);
478
* Constructs an IndexWriter for the index in <code>d</code>.
479
* Text will be analyzed with <code>a</code>. If <code>create</code>
480
* is true, then a new, empty index will be created in
481
* <code>d</code>, replacing the index already there, if any.
483
* @param d the index directory
484
* @param a the analyzer to use
485
* @param create <code>true</code> to create the index or overwrite
486
* the existing one; <code>false</code> to append to the existing
488
* @throws CorruptIndexException if the index is corrupt
489
* @throws LockObtainFailedException if another writer
490
* has this index open (<code>write.lock</code> could not
492
* @throws IOException if the directory cannot be read/written to, or
493
* if it does not exist and <code>create</code> is
494
* <code>false</code> or if there is any other low-level
497
explicit IndexWriter(CL_NS(store)::Directory* d, CL_NS(analysis)::Analyzer* a, const bool create, const bool closeDirOnShutdown=false);
500
* Expert: constructs an IndexWriter with a custom {@link
501
* IndexDeletionPolicy}, for the index in <code>d</code>,
502
* first creating it if it does not already exist. Text
503
* will be analyzed with <code>a</code>.
505
* @param d the index directory
506
* @param autoCommit see <a href="#autoCommit">above</a>
507
* @param a the analyzer to use
508
* @param deletionPolicy see <a href="#deletionPolicy">above</a>
509
* @throws CorruptIndexException if the index is corrupt
510
* @throws LockObtainFailedException if another writer
511
* has this index open (<code>write.lock</code> could not
513
* @throws IOException if the directory cannot be
514
* read/written to or if there is any other low-level
517
explicit IndexWriter(CL_NS(store)::Directory* d, const bool autoCommit, CL_NS(analysis)::Analyzer* a, IndexDeletionPolicy* deletionPolicy = NULL, const bool closeDirOnShutdown=false);
520
* Expert: constructs an IndexWriter with a custom {@link
521
* IndexDeletionPolicy}, for the index in <code>d</code>.
522
* Text will be analyzed with <code>a</code>. If
523
* <code>create</code> is true, then a new, empty index
524
* will be created in <code>d</code>, replacing the index
525
* already there, if any.
527
* @param d the index directory
528
* @param autoCommit see <a href="#autoCommit">above</a>
529
* @param a the analyzer to use
530
* @param create <code>true</code> to create the index or overwrite
531
* the existing one; <code>false</code> to append to the existing
533
* @param deletionPolicy see <a href="#deletionPolicy">above</a>
534
* @throws CorruptIndexException if the index is corrupt
535
* @throws LockObtainFailedException if another writer
536
* has this index open (<code>write.lock</code> could not
538
* @throws IOException if the directory cannot be read/written to, or
539
* if it does not exist and <code>create</code> is
540
* <code>false</code> or if there is any other low-level
543
explicit IndexWriter(CL_NS(store)::Directory* d, const bool autoCommit, CL_NS(analysis)::Analyzer* a, const bool create, IndexDeletionPolicy* deletionPolicy = NULL, const bool closeDirOnShutdown=false);
545
/**Returns the number of documents currently in this index.
551
/** Returns the directory this index resides in. */
552
CL_NS(store)::Directory* getDirectory();
554
/** Get the current setting of whether to use the compound file format.
555
* Note that this just returns the value you set with setUseCompoundFile(boolean)
556
* or the default. You cannot use this to query the status of an existing index.
557
* @see #setUseCompoundFile(boolean)
559
bool getUseCompoundFile();
561
/** Setting to turn on usage of a compound file. When on, multiple files
562
* for each segment are merged into a single file once the segment creation
563
* is finished. This is done regardless of what directory is in use.
565
void setUseCompoundFile(bool value);
568
/** Expert: Set the Similarity implementation used by this IndexWriter.
570
* @see Similarity#setDefault(Similarity)
572
void setSimilarity(CL_NS(search)::Similarity* similarity);
574
/** Expert: Return the Similarity implementation used by this IndexWriter.
576
* <p>This defaults to the current value of {@link Similarity#getDefault()}.
578
CL_NS(search)::Similarity* getSimilarity();
580
/** Returns the analyzer used by this index. */
581
CL_NS(analysis)::Analyzer* getAnalyzer();
584
std::string newSegmentName();
587
* Prints a message to the infoStream (if non-null),
588
* prefixed with the identifying information for this
589
* writer and the thread that's calling it.
591
void message(std::string message);
594
* Returns the current default infoStream for newly
595
* instantiated IndexWriters.
596
* @see #setDefaultInfoStream
598
static std::ostream* getDefaultInfoStream();
601
* Returns the current infoStream in use by this writer.
602
* @see #setInfoStream
604
std::ostream* getInfoStream();
607
* Returns the number of buffered deleted terms that will
608
* trigger a flush if enabled.
609
* @see #setMaxBufferedDeleteTerms
611
int32_t getMaxBufferedDeleteTerms();
614
* Expert: returns the current MergePolicy in use by this writer.
615
* @see #setMergePolicy
617
MergePolicy* getMergePolicy();
620
* Expert: returns the current MergePolicy in use by this
622
* @see #setMergePolicy
624
MergeScheduler* getMergeScheduler();
627
* Returns the value set by {@link #setRAMBufferSizeMB} if enabled.
629
float_t getRAMBufferSizeMB();
631
/** If non-null, this will be the default infoStream used
 * by a newly instantiated IndexWriter.
 * @param infoStream the stream future writers will log to; pass NULL to disable
 * @see #setInfoStream
 */
static void setDefaultInfoStream(std::ostream* infoStream);
637
/** If non-null, information about merges, deletes and a
638
* message when maxFieldLength is reached will be printed
641
void setInfoStream(std::ostream* infoStream);
644
* <p>Determines the minimal number of delete terms required before the buffered
645
* in-memory delete terms are applied and flushed. If there are documents
646
* buffered in memory at the time, they are merged and a new segment is
649
* <p>Disabled by default (writer flushes by RAM usage).</p>
651
* @throws IllegalArgumentException if maxBufferedDeleteTerms
652
* is enabled but smaller than 1
653
* @see #setRAMBufferSizeMB
655
void setMaxBufferedDeleteTerms(int32_t maxBufferedDeleteTerms);
658
* Expert: set the merge policy used by this writer.
660
void setMergePolicy(MergePolicy* mp);
663
* Expert: set the merge scheduler used by this writer.
665
void setMergeScheduler(MergeScheduler* mergeScheduler);
667
/** Determines the amount of RAM that may be used for
668
* buffering added documents before they are flushed as a
669
* new Segment. Generally for faster indexing performance
670
* it's best to flush by RAM usage instead of document
671
* count and use as large a RAM buffer as you can.
673
* <p>When this is set, the writer will flush whenever
674
* buffered documents use this much RAM. Pass in {@link
675
* #DISABLE_AUTO_FLUSH} to prevent triggering a flush due
676
* to RAM usage. Note that if flushing by document count
677
* is also enabled, then the flush will be triggered by
678
* whichever comes first.</p>
680
* <p> The default value is {@link #DEFAULT_RAM_BUFFER_SIZE_MB}.</p>
682
* @throws IllegalArgumentException if ramBufferSize is
683
* enabled but non-positive, or it disables ramBufferSize
684
* when maxBufferedDocs is already disabled
686
void setRAMBufferSizeMB(float_t mb);
689
/** Expert: the {@link MergeScheduler} calls this method
690
* to retrieve the next merge requested by the
692
MergePolicy::OneMerge* getNextMerge();
695
* Merges the indicated segments, replacing them in the stack with a
698
void merge(MergePolicy::OneMerge* merge);
701
* Deletes the document(s) containing <code>term</code>.
702
* @param term the term to identify the documents to be deleted
703
* @throws CorruptIndexException if the index is corrupt
704
* @throws IOException if there is a low-level IO error
706
void deleteDocuments(Term* term);
709
* Deletes the document(s) containing any of the
710
* terms. All deletes are flushed at the same time.
711
* @param terms array of terms to identify the documents
713
* @throws CorruptIndexException if the index is corrupt
714
* @throws IOException if there is a low-level IO error
716
void deleteDocuments(const CL_NS(util)::ArrayBase<Term*>* terms);
719
* Updates a document by first deleting the document(s)
720
* containing <code>term</code> and then adding the new
721
* document. The delete and then add are atomic as seen
722
* by a reader on the same index (flush may happen only after
724
* @param term the term to identify the document(s) to be
726
* @param doc the document to be added
727
* @throws CorruptIndexException if the index is corrupt
728
* @throws IOException if there is a low-level IO error
730
void updateDocument(Term* term, CL_NS(document)::Document* doc);
733
* Updates a document by first deleting the document(s)
734
* containing <code>term</code> and then adding the new
735
* document. The delete and then add are atomic as seen
736
* by a reader on the same index (flush may happen only after
738
* @param term the term to identify the document(s) to be
740
* @param doc the document to be added
741
* @param analyzer the analyzer to use when analyzing the document
742
* @throws CorruptIndexException if the index is corrupt
743
* @throws IOException if there is a low-level IO error
745
void updateDocument(Term* term, CL_NS(document)::Document* doc, CL_NS(analysis)::Analyzer* analyzer);
748
* Returns default write lock timeout for newly
749
* instantiated IndexWriters.
750
* @see #setDefaultWriteLockTimeout
752
int64_t getDefaultWriteLockTimeout();
755
* Sets the default (for any instance of IndexWriter) maximum time to wait for a write lock (in
758
void setDefaultWriteLockTimeout(int64_t writeLockTimeout);
760
std::string segString();
763
* Closes the index with or without waiting for currently
764
* running merges to finish. This is only meaningful when
765
* using a MergeScheduler that runs merges in background
767
* @param waitForMerges if true, this call will block
768
* until all merges complete; else, it will ask all
769
* running merges to abort, wait until those merges have
770
* finished (which should be at most a few seconds), and
773
* <p> If an Exception is hit during close, eg due to disk
774
* full or some other reason, then both the on-disk index
775
* and the internal state of the IndexWriter instance will
776
* be consistent. However, the close will not be complete
777
* even though part of it (flushing buffered documents)
778
* may have succeeded, so the write lock will still be
781
* <p> If you can correct the underlying cause (eg free up
782
* some disk space) then you can call close() again.
783
* Failing that, if you want to force the write lock to be
784
* released (dangerous, because you may then lose buffered
785
* docs in the IndexWriter instance) then you can do
786
* something like this:</p>
792
* if (IndexReader.isLocked(directory)) {
793
* IndexReader.unlock(directory);
798
* after which, you must be certain not to use the writer
799
* instance anymore.</p>
800
* @throws CorruptIndexException if the index is corrupt
801
* @throws IOException if there is a low-level IO error
803
void close(bool waitForMerges=true);
/**
* Requests an "optimize" operation on an index, priming the index
* for the fastest available search. Traditionally this has meant
* merging all segments into a single segment as is done in the
* default merge policy, but individual merge policies may implement
* optimize in different ways.
*
* @see LogMergePolicy#findMergesForOptimize
*
* <p>It is recommended that this method be called upon completion of indexing. In
* environments with frequent updates, optimize is best done during low volume times, if at all.
* </p>
* <p>See http://www.gossamer-threads.com/lists/lucene/java-dev/47895 for more discussion. </p>
*
* <p>Note that this can require substantial temporary free
* space in the Directory (see <a target="_top"
* href="http://issues.apache.org/jira/browse/LUCENE-764">LUCENE-764</a>
* for details).</p>
*
* <p>If no readers/searchers are open against the index,
* then free space required is up to 1X the total size of
* the starting index. For example, if the starting
* index is 10 GB, then you must have up to 10 GB of free
* space before calling optimize.</p>
*
* <p>If readers/searchers are using the index, then free
* space required is up to 2X the size of the starting
* index. This is because in addition to the 1X used by
* optimize, the original 1X of the starting index is
* still consuming space in the Directory as the readers
* are holding the segments files open. Even on Unix,
* where it will appear as if the files are gone ("ls"
* won't list them), they still consume storage due to
* "delete on last close" semantics.</p>
*
* <p>Furthermore, if some but not all readers re-open
* while the optimize is underway, this will cause > 2X
* temporary space to be consumed as those new readers
* will then hold open the partially optimized segments at
* that time. It is best not to re-open readers while
* optimize is running.</p>
*
* <p>The actual temporary usage could be much less than
* these figures (it depends on many factors).</p>
*
* <p>In general, once the optimize completes, the total size of the
* index will be less than the size of the starting index.
* It could be quite a bit smaller (if there were many
* pending deletes) or just slightly smaller.</p>
*
* <p>If an Exception is hit during optimize(), for example
* due to disk full, the index will not be corrupt and no
* documents will have been lost. However, it may have
* been partially optimized (some segments were merged but
* not all), and it's possible that one of the segments in
* the index will be in non-compound format even when
* using compound file format. This will occur when the
* Exception is hit during conversion of the segment into
* compound format.</p>
*
* <p>This call will optimize those segments present in
* the index when the call started. If other threads are
* still adding documents and flushing segments, those
* newly created segments will not be optimized unless you
* call optimize again.</p>
*
* @param doWait Specifies whether the call should block
* until the optimize completes. This is only meaningful
* with a {@link MergeScheduler} that is able to run merges
* in background threads.
* @throws CorruptIndexException if the index is corrupt
* @throws IOException if there is a low-level IO error
*/
void optimize(bool doWait=true);
/**
* Optimize the index down to <= maxNumSegments. If
* maxNumSegments==1 then this is the same as {@link
* #optimize()}.
* @param maxNumSegments maximum number of segments left
* in the index after optimization finishes
* @param doWait Specifies whether the call should block
* until the optimize completes. This is only meaningful
* with a {@link MergeScheduler} that is able to run merges
* in background threads.
*/
void optimize(int32_t maxNumSegments, bool doWait=true);
/**
* Flush all in-memory buffered updates (adds and deletes)
* to the Directory.
* <p>Note: if <code>autoCommit=false</code>, flushed data would still
* not be visible to readers, until {@link #close} is called.
* @throws CorruptIndexException if the index is corrupt
* @throws IOException if there is a low-level IO error
*/
/**
* Adds a document to this index. If the document contains more than
* {@link #setMaxFieldLength(int)} terms for a given field, the remainder are
* discarded (depending on the policy, see #FIELD_TRUNC_POLICY__WARN)
*
* <p> Note that if an Exception is hit (for example disk full)
* then the index will be consistent, but this document
* may not have been added. Furthermore, it's possible
* the index will have one segment in non-compound format
* even when using compound files (when a merge has
* partially succeeded).</p>
*
* <p> This method periodically flushes pending documents
* to the Directory (every {@link #setMaxBufferedDocs}),
* and also periodically merges segments in the index
* (every {@link #setMergeFactor} flushes). When this
* occurs, the method will take more time to run (possibly
* a long time if the index is large), and will require
* free temporary space in the Directory to do the
* merging.</p>
*
* <p>The amount of free space required when a merge is triggered is
* up to 1X the size of all segments being merged, when no
* readers/searchers are open against the index, and up to 2X the
* size of all segments being merged when readers/searchers are open
* against the index (see {@link #optimize()} for details). The
* sequence of primitive merge operations performed is governed by
* the merge policy.</p>
*
* <p>Note that each term in the document can be no longer
* than 16383 characters, otherwise an
* IllegalArgumentException will be thrown.</p>
*
* @throws CorruptIndexException if the index is corrupt
* @throws IOException if there is a low-level IO error
* @param analyzer use the provided analyzer instead of the
* value of {@link #getAnalyzer()}
*/
void addDocument(CL_NS(document)::Document* doc, CL_NS(analysis)::Analyzer* analyzer=NULL);
/**
* Expert: asks the mergePolicy whether any merges are
* necessary now and if so, runs the requested merges and
* then iterate (test again if merges are needed) until no
* more merges are returned by the mergePolicy.
*
* Explicit calls to maybeMerge() are usually not
* necessary. The most common case is when merge policy
* parameters have changed.
*/
/**
* Close the <code>IndexWriter</code> without committing
* any of the changes that have occurred since it was
* opened. This removes any temporary files that had been
* created, after which the state of the index will be the
* same as it was when this writer was first opened. This
* can only be called when this IndexWriter was opened
* with <code>autoCommit=false</code>.
* @throws IllegalStateException if this is called when
* the writer was opened with <code>autoCommit=true</code>.
* @throws IOException if there is a low-level IO error
*/
/**
* Merges all segments from an array of indexes into this index.
*
* This is similar to addIndexes(Directory[]). However, no optimize()
* is called either at the beginning or at the end. Instead, merges
* are carried out as necessary.
*
* <p><b>NOTE:</b> the index in each Directory must not be
* changed (opened by a writer) while this method is
* running. This method does not acquire a write lock in
* each input Directory, so it is up to the caller to
* enforce this.
*
* <p><b>NOTE:</b> while this is running, any attempts to
* add or delete documents (with another thread) will be
* paused until this method completes.
*
* This requires this index not be among those to be added, and the
* upper bound* of those segment doc counts not exceed maxMergeDocs.
*
* <p>See {@link #addIndexes(Directory[])} for
* details on transactional semantics, temporary free
* space required in the Directory, and non-CFS segments
* on an Exception.</p>
* @throws CorruptIndexException if the index is corrupt
* @throws IOException if there is a low-level IO error
*/
void addIndexesNoOptimize(CL_NS(util)::ArrayBase<CL_NS(store)::Directory*>& dirs);
/** Merges the provided indexes into this index.
* <p>After this completes, the index is optimized. </p>
* <p>The provided IndexReaders are not closed.</p>
*
* <p><b>NOTE:</b> the index in each Directory must not be
* changed (opened by a writer) while this method is
* running. This method does not acquire a write lock in
* each input Directory, so it is up to the caller to
* enforce this.
*
* <p><b>NOTE:</b> while this is running, any attempts to
* add or delete documents (with another thread) will be
* paused until this method completes.
*
* <p>See {@link #addIndexes(Directory[])} for
* details on transactional semantics, temporary free
* space required in the Directory, and non-CFS segments
* on an Exception.</p>
* @throws CorruptIndexException if the index is corrupt
* @throws IOException if there is a low-level IO error
*/
//NOT IMPLEMENTED YET: void addIndexes(CL_NS(util)::ArrayBase<IndexReader*>& readers);
/** Merges all segments from an array of indexes into this index.
*
* <p>This may be used to parallelize batch indexing. A large document
* collection can be broken into sub-collections. Each sub-collection can be
* indexed in parallel, on a different thread, process or machine. The
* complete index can then be created by merging sub-collection indexes
* with this method.
*
* <p><b>NOTE:</b> the index in each Directory must not be
* changed (opened by a writer) while this method is
* running. This method does not acquire a write lock in
* each input Directory, so it is up to the caller to
* enforce this.
*
* <p><b>NOTE:</b> while this is running, any attempts to
* add or delete documents (with another thread) will be
* paused until this method completes.
*
* <p>After this completes, the index is optimized.
*
* <p>This method is transactional in how Exceptions are
* handled: it does not commit a new segments_N file until
* all indexes are added. This means if an Exception
* occurs (for example disk full), then either no indexes
* will have been added or they all will have been.</p>
*
* <p>If an Exception is hit, it's still possible that all
* indexes were successfully added. This happens when the
* Exception is hit when trying to build a CFS file. In
* this case, one segment in the index will be in non-CFS
* format, even when using compound file format.</p>
*
* <p>Also note that on an Exception, the index may still
* have been partially or fully optimized even though none
* of the input indexes were added. </p>
*
* <p>Note that this requires temporary free space in the
* Directory up to 2X the sum of all input indexes
* (including the starting index). If readers/searchers
* are open against the starting index, then temporary
* free space required will be higher by the size of the
* starting index (see {@link #optimize()} for details).
* </p>
*
* <p>Once this completes, the final size of the index
* will be less than the sum of all input index sizes
* (including the starting index). It could be quite a
* bit smaller (if there were many pending deletes) or
* just slightly smaller.</p>
*
* <p>See <a target="_top"
* href="http://issues.apache.org/jira/browse/LUCENE-702">LUCENE-702</a>
* for details.</p>
*
* @throws CorruptIndexException if the index is corrupt
* @throws IOException if there is a low-level IO error
*/
void addIndexes(CL_NS(util)::ArrayBase<CL_NS(store)::Directory*>& dirs);
/** Expert: Return the total size of all index files currently cached in memory.
* Useful for size management with flushRamDocs()
*/
int64_t ramSizeInBytes();
/** Expert: Return the number of documents whose segments are currently cached in memory.
* Useful when calling flush()
*/
int32_t numRamDocs();
/** for testing only */
virtual bool testPoint(const char* name);
friend class LockWith2;
friend class LockWithCFS;
friend class DocumentsWriter;
/** Merges all RAM-resident segments. */
void flushRamSegments();
/** Incremental segment merger. */
void maybeMergeSegments();
/** Pops segments off of segmentInfos stack down to minSegment, merges them,
* and pushes the merged index onto the top of the segmentInfos stack.
*/
void mergeSegments(const uint32_t minSegment);
/** Merges the named range of segments, replacing them in the stack with a
* single segment. */
void mergeSegments(const uint32_t minSegment, const uint32_t end);
void deleteFiles(std::vector<std::string>& files);
void readDeleteableFiles(std::vector<std::string>& files);
void writeDeleteableFiles(std::vector<std::string>& files);
/**
* Some operating systems (e.g. Windows) don't permit a file to be deleted
* while it is opened for read (e.g. by another process or thread). So we
* assume that when a delete fails it is because the file is open in another
* process, and queue the file for subsequent deletion.
*/
void deleteSegments(CL_NS(util)::CLVector<SegmentReader*>* segments);
void deleteFiles(std::vector<std::string>& files, CL_NS(store)::Directory* directory);
void deleteFiles(std::vector<std::string>& files, std::vector<std::string>& deletable);
/**
* Casts current mergePolicy to LogMergePolicy, and throws
* an exception if the mergePolicy is not a LogMergePolicy.
*/
LogMergePolicy* getLogMergePolicy() const;
void setMessageID();
void closeInternal(bool waitForMerges);
void messageState();
/**
* If we are flushing by doc count (not by RAM usage), and
* using LogDocMergePolicy then push maxBufferedDocs down
* as its minMergeDocs, to keep backwards compatibility.
*/
void pushMaxBufferedDocs();
void finishMerges(bool waitForMerges);
/** Tells the docWriter to close its currently open shared
* doc stores (stored fields & vectors files).
* Return value specifies whether new doc store files are compound or not.
*/
bool flushDocStores();
int32_t getDocCount(int32_t i);
int32_t getNumBufferedDocuments();
int32_t getSegmentCount();
int32_t getBufferedDeleteTermsSize();
int32_t getNumBufferedDeleteTerms();
virtual SegmentInfo* newestSegment();
void waitForClose();
void deletePartialSegmentsFile();
/** Returns true if any merges in pendingMerges or
* runningMerges are optimization merges. */
bool optimizeMergesPending();
void resetMergeExceptions();
void updatePendingMerges(int32_t maxNumSegmentsOptimize, bool optimize);
/**
* Begin a transaction. During a transaction, any segment
* merges that happen (or ram segments flushed) will not
* write a new segments file and will not remove any files
* that were present at the start of the transaction. You
* must make a matched (try/finally) call to
* commitTransaction() or rollbackTransaction() to finish
* the transaction.
*
* Note that buffered documents and delete terms are not handled
* within the transactions, so they must be flushed before the
* transaction is started.
*/
void startTransaction();
/**
* Rolls back the transaction and restores state to where
* we were at the start.
*/
void rollbackTransaction();
/**
* Commits the transaction. This will write the new
* segments file and remove any pending deletions we have
* accumulated during the transaction
*/
void commitTransaction();
void maybeMerge(bool optimize);
void maybeMerge(int32_t maxNumSegmentsOptimize, bool optimize);
/** Does initial setup for a merge, which is fast but holds
* the synchronized lock on IndexWriter instance. */
void mergeInit(MergePolicy::OneMerge* _merge);
void _mergeInit(MergePolicy::OneMerge* _merge);
/* If any of our segments are using a directory != ours
* then copy them over. Currently this is only used by
* addIndexesNoOptimize(). */
void copyExternalSegments();
/**
* Called whenever the SegmentInfos has been updated and
* the index files referenced exist (correctly) in the
* index directory. If we are in autoCommit mode, we
* commit the change immediately. Else, we mark
* the change as pending, to be committed later (e.g. when the writer is closed).
*/
bool doFlush(bool flushDocStores);
/* FIXME if we want to support non-contiguous segment merges */
bool commitMerge(MergePolicy::OneMerge* merge);
int32_t ensureContiguousMerge(MergePolicy::OneMerge* merge);
void decrefMergeSegments(MergePolicy::OneMerge* _merge);
/** Does finishing for a merge, which is fast but holds
* the synchronized lock on IndexWriter instance. */
void mergeFinish(MergePolicy::OneMerge* _merge);
/** Does the actual (time-consuming) work of the merge,
* but without holding synchronized lock on IndexWriter
* instance. */
int32_t mergeMiddle(MergePolicy::OneMerge* _merge);
void addMergeException(MergePolicy::OneMerge* _merge);
/** Checks whether this merge involves any segments
* already participating in a merge. If not, this merge
* is "registered", meaning we record that its segments
* are now participating in a merge, and true is
* returned. Else (the merge conflicts) false is
* returned. */
bool registerMerge(MergePolicy::OneMerge* _merge);
// Called during flush to apply any buffered deletes. If
// flushedNewSegment is true then a new segment was just
// created and flushed from the ram segments, so we will
// selectively apply the deletes to that new segment.
void applyDeletes(bool flushedNewSegment);
Internal* _internal;
// This is called after pending added and deleted
// documents have been flushed to the Directory but before
// the change is committed (_CLNEW segments_N file written).
virtual void doAfterFlush();
/**
* Used internally to throw an {@link
* AlreadyClosedException} if this IndexWriter has been
* closed.
* @throws AlreadyClosedException if this IndexWriter is
* closed
*/
/**
* Flush all in-memory buffered updates (adds and deletes)
* to the Directory.
* @param triggerMerge if true, we may merge segments (if
* deletes or docs were flushed) if necessary
* @param flushDocStores if false we are allowed to keep
* doc stores open to share with the next segment
*/
void flush(bool triggerMerge, bool flushDocStores);