1
package org.apache.lucene.index;
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
20
import java.io.IOException;

import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Map;
26
/** <p>This class implements a {@link MergePolicy} that tries
 *  to merge segments into levels of exponentially
 *  increasing size, where each level has fewer segments than
 *  the value of the merge factor. Whenever extra segments
 *  (beyond the merge factor upper bound) are encountered,
 *  all segments within the level are merged. You can get or
 *  set the merge factor using {@link #getMergeFactor()} and
 *  {@link #setMergeFactor(int)} respectively.</p>
 *
 *  <p>This class is abstract and requires a subclass to
 *  define the {@link #size} method which specifies how a
 *  segment's size is determined. {@link LogDocMergePolicy}
 *  is one subclass that measures size by document count in
 *  the segment. {@link LogByteSizeMergePolicy} is another
 *  subclass that measures size as the total byte size of the
 *  file(s) for the segment.</p>
 */
44
public abstract class LogMergePolicy extends MergePolicy {
46
/** Defines the allowed range of log(size) for each
47
* level. A level is computed by taking the max segment
48
* log size, minus LEVEL_LOG_SPAN, and finding all
49
* segments falling within that range. */
50
public static final double LEVEL_LOG_SPAN = 0.75;
52
/** Default merge factor, which is how many segments are
54
public static final int DEFAULT_MERGE_FACTOR = 10;
56
/** Default maximum segment size. A segment of this size
57
* or larger will never be merged. @see setMaxMergeDocs */
58
public static final int DEFAULT_MAX_MERGE_DOCS = Integer.MAX_VALUE;
60
/** Default noCFSRatio. If a merge's size is >= 10% of
61
* the index, then we disable compound file for it.
62
* @see #setNoCFSRatio */
63
public static final double DEFAULT_NO_CFS_RATIO = 0.1;
65
protected int mergeFactor = DEFAULT_MERGE_FACTOR;
67
protected long minMergeSize;
68
protected long maxMergeSize;
69
// Although the core MPs set it explicitly, we must default in case someone
70
// out there wrote his own LMP ...
71
protected long maxMergeSizeForForcedMerge = Long.MAX_VALUE;
72
protected int maxMergeDocs = DEFAULT_MAX_MERGE_DOCS;
74
protected double noCFSRatio = DEFAULT_NO_CFS_RATIO;
76
protected boolean calibrateSizeByDeletes = true;
78
protected boolean useCompoundFile = true;
80
/** Sole constructor.  (For invocation by subclass constructors.) */
public LogMergePolicy() {
  super();
}
protected boolean verbose() {
85
IndexWriter w = writer.get();
86
return w != null && w.verbose();
89
/** @see #setNoCFSRatio */
90
public double getNoCFSRatio() {
94
/** If a merged segment will be more than this percentage
95
* of the total size of the index, leave the segment as
96
* non-compound file even if compound file is enabled.
97
* Set to 1.0 to always use CFS regardless of merge
99
public void setNoCFSRatio(double noCFSRatio) {
100
if (noCFSRatio < 0.0 || noCFSRatio > 1.0) {
101
throw new IllegalArgumentException("noCFSRatio must be 0.0 to 1.0 inclusive; got " + noCFSRatio);
103
this.noCFSRatio = noCFSRatio;
106
protected void message(String message) {
108
writer.get().message("LMP: " + message);
111
/** <p>Returns the number of segments that are merged at
112
* once and also controls the total number of segments
113
* allowed to accumulate in the index.</p> */
114
public int getMergeFactor() {
118
/** Determines how often segment indices are merged by
119
* addDocument(). With smaller values, less RAM is used
120
* while indexing, and searches are
121
* faster, but indexing speed is slower. With larger
122
* values, more RAM is used during indexing, and while
123
* searches is slower, indexing is
124
* faster. Thus larger values (> 10) are best for batch
125
* index creation, and smaller values (< 10) for indices
126
* that are interactively maintained. */
127
public void setMergeFactor(int mergeFactor) {
129
throw new IllegalArgumentException("mergeFactor cannot be less than 2");
130
this.mergeFactor = mergeFactor;
135
public boolean useCompoundFile(SegmentInfos infos, SegmentInfo mergedInfo) throws IOException {
138
if (!useCompoundFile) {
140
} else if (noCFSRatio == 1.0) {
144
for (SegmentInfo info : infos)
145
totalSize += size(info);
147
doCFS = size(mergedInfo) <= noCFSRatio * totalSize;
152
/** Sets whether compound file format should be used for
153
* newly flushed and newly merged segments. */
154
public void setUseCompoundFile(boolean useCompoundFile) {
155
this.useCompoundFile = useCompoundFile;
158
/** Returns true if newly flushed and newly merge segments
159
* are written in compound file format. @see
160
* #setUseCompoundFile */
161
public boolean getUseCompoundFile() {
162
return useCompoundFile;
165
/** Sets whether the segment size should be calibrated by
166
* the number of deletes when choosing segments for merge. */
167
public void setCalibrateSizeByDeletes(boolean calibrateSizeByDeletes) {
168
this.calibrateSizeByDeletes = calibrateSizeByDeletes;
171
/** Returns true if the segment size should be calibrated
172
* by the number of deletes when choosing segments for merge. */
173
public boolean getCalibrateSizeByDeletes() {
174
return calibrateSizeByDeletes;
178
/** Closes this merge policy; this implementation holds no
 *  resources, so there is nothing to release. */
public void close() {}
180
/** Returns the size of the provided {@link SegmentInfo} as used
 *  for level computation; subclasses define the unit (e.g.
 *  document count or byte size). */
abstract protected long size(SegmentInfo info) throws IOException;
182
protected long sizeDocs(SegmentInfo info) throws IOException {
183
if (calibrateSizeByDeletes) {
184
int delCount = writer.get().numDeletedDocs(info);
185
assert delCount <= info.docCount;
186
return (info.docCount - (long)delCount);
188
return info.docCount;
192
protected long sizeBytes(SegmentInfo info) throws IOException {
193
long byteSize = info.sizeInBytes(true);
194
if (calibrateSizeByDeletes) {
195
int delCount = writer.get().numDeletedDocs(info);
196
double delRatio = (info.docCount <= 0 ? 0.0f : ((float)delCount / (float)info.docCount));
197
assert delRatio <= 1.0;
198
return (info.docCount <= 0 ? byteSize : (long)(byteSize * (1.0 - delRatio)));
204
protected boolean isMerged(SegmentInfos infos, int maxNumSegments, Map<SegmentInfo,Boolean> segmentsToMerge) throws IOException {
205
final int numSegments = infos.size();
207
SegmentInfo mergeInfo = null;
208
boolean segmentIsOriginal = false;
209
for(int i=0;i<numSegments && numToMerge <= maxNumSegments;i++) {
210
final SegmentInfo info = infos.info(i);
211
final Boolean isOriginal = segmentsToMerge.get(info);
212
if (isOriginal != null) {
213
segmentIsOriginal = isOriginal;
219
return numToMerge <= maxNumSegments &&
220
(numToMerge != 1 || !segmentIsOriginal || isMerged(mergeInfo));
223
/** Returns true if this single info is already fully merged (has no
224
* pending norms or deletes, is in the same dir as the
225
* writer, and matches the current compound file setting */
226
protected boolean isMerged(SegmentInfo info)
228
IndexWriter w = writer.get();
230
boolean hasDeletions = w.numDeletedDocs(info) > 0;
231
return !hasDeletions &&
232
!info.hasSeparateNorms() &&
233
info.dir == w.getDirectory() &&
234
(info.getUseCompoundFile() == useCompoundFile || noCFSRatio < 1.0);
238
* Returns the merges necessary to merge the index, taking the max merge
239
* size or max merge docs into consideration. This method attempts to respect
240
* the {@code maxNumSegments} parameter, however it might be, due to size
241
* constraints, that more than that number of segments will remain in the
242
* index. Also, this method does not guarantee that exactly {@code
243
* maxNumSegments} will remain, but <= that number.
245
private MergeSpecification findForcedMergesSizeLimit(
246
SegmentInfos infos, int maxNumSegments, int last) throws IOException {
247
MergeSpecification spec = new MergeSpecification();
248
final List<SegmentInfo> segments = infos.asList();
250
int start = last - 1;
252
SegmentInfo info = infos.info(start);
253
if (size(info) > maxMergeSizeForForcedMerge || sizeDocs(info) > maxMergeDocs) {
255
message("findForcedMergesSizeLimit: skip segment=" + info + ": size is > maxMergeSize (" + maxMergeSizeForForcedMerge + ") or sizeDocs is > maxMergeDocs (" + maxMergeDocs + ")");
257
// need to skip that segment + add a merge for the 'right' segments,
258
// unless there is only 1 which is merged.
259
if (last - start - 1 > 1 || (start != last - 1 && !isMerged(infos.info(start + 1)))) {
260
// there is more than 1 segment to the right of
261
// this one, or a mergeable single segment.
262
spec.add(new OneMerge(segments.subList(start + 1, last)));
265
} else if (last - start == mergeFactor) {
266
// mergeFactor eligible segments were found, add them as a merge.
267
spec.add(new OneMerge(segments.subList(start, last)));
273
// Add any left-over segments, unless there is just 1
274
// already fully merged
275
if (last > 0 && (++start + 1 < last || !isMerged(infos.info(start)))) {
276
spec.add(new OneMerge(segments.subList(start, last)));
279
return spec.merges.size() == 0 ? null : spec;
283
* Returns the merges necessary to forceMerge the index. This method constraints
284
* the returned merges only by the {@code maxNumSegments} parameter, and
285
* guaranteed that exactly that number of segments will remain in the index.
287
private MergeSpecification findForcedMergesMaxNumSegments(SegmentInfos infos, int maxNumSegments, int last) throws IOException {
288
MergeSpecification spec = new MergeSpecification();
289
final List<SegmentInfo> segments = infos.asList();
291
// First, enroll all "full" merges (size
292
// mergeFactor) to potentially be run concurrently:
293
while (last - maxNumSegments + 1 >= mergeFactor) {
294
spec.add(new OneMerge(segments.subList(last - mergeFactor, last)));
298
// Only if there are no full merges pending do we
299
// add a final partial (< mergeFactor segments) merge:
300
if (0 == spec.merges.size()) {
301
if (maxNumSegments == 1) {
303
// Since we must merge down to 1 segment, the
305
if (last > 1 || !isMerged(infos.info(0))) {
306
spec.add(new OneMerge(segments.subList(0, last)));
308
} else if (last > maxNumSegments) {
310
// Take care to pick a partial merge that is
311
// least cost, but does not make the index too
312
// lopsided. If we always just picked the
313
// partial tail then we could produce a highly
314
// lopsided index over time:
316
// We must merge this many segments to leave
317
// maxNumSegments in the index (from when
318
// forceMerge was first kicked off):
319
final int finalMergeSize = last - maxNumSegments + 1;
321
// Consider all possible starting points:
325
for(int i=0;i<last-finalMergeSize+1;i++) {
327
for(int j=0;j<finalMergeSize;j++)
328
sumSize += size(infos.info(j+i));
329
if (i == 0 || (sumSize < 2*size(infos.info(i-1)) && sumSize < bestSize)) {
335
spec.add(new OneMerge(segments.subList(bestStart, bestStart + finalMergeSize)));
338
return spec.merges.size() == 0 ? null : spec;
341
/** Returns the merges necessary to merge the index down
342
* to a specified number of segments.
343
* This respects the {@link #maxMergeSizeForForcedMerge} setting.
344
* By default, and assuming {@code maxNumSegments=1}, only
345
* one segment will be left in the index, where that segment
346
* has no deletions pending nor separate norms, and it is in
347
* compound file format if the current useCompoundFile
348
* setting is true. This method returns multiple merges
349
* (mergeFactor at a time) so the {@link MergeScheduler}
350
* in use may make use of concurrency. */
352
public MergeSpecification findForcedMerges(SegmentInfos infos,
353
int maxNumSegments, Map<SegmentInfo,Boolean> segmentsToMerge) throws IOException {
355
assert maxNumSegments > 0;
357
message("findForcedMerges: maxNumSegs=" + maxNumSegments + " segsToMerge="+ segmentsToMerge);
360
// If the segments are already merged (e.g. there's only 1 segment), or
361
// there are <maxNumSegements:.
362
if (isMerged(infos, maxNumSegments, segmentsToMerge)) {
364
message("already merged; skip");
369
// Find the newest (rightmost) segment that needs to
370
// be merged (other segments may have been flushed
371
// since merging started):
372
int last = infos.size();
374
final SegmentInfo info = infos.info(--last);
375
if (segmentsToMerge.get(info) != null) {
381
if (last == 0) return null;
383
// There is only one segment already, and it is merged
384
if (maxNumSegments == 1 && last == 1 && isMerged(infos.info(0))) {
386
message("already 1 seg; skip");
391
// Check if there are any segments above the threshold
392
boolean anyTooLarge = false;
393
for (int i = 0; i < last; i++) {
394
SegmentInfo info = infos.info(i);
395
if (size(info) > maxMergeSizeForForcedMerge || sizeDocs(info) > maxMergeDocs) {
402
return findForcedMergesSizeLimit(infos, maxNumSegments, last);
404
return findForcedMergesMaxNumSegments(infos, maxNumSegments, last);
409
* Finds merges necessary to force-merge all deletes from the
410
* index. We simply merge adjacent segments that have
411
* deletes, up to mergeFactor at a time.
414
public MergeSpecification findForcedDeletesMerges(SegmentInfos segmentInfos)
415
throws CorruptIndexException, IOException {
416
final List<SegmentInfo> segments = segmentInfos.asList();
417
final int numSegments = segments.size();
420
message("findForcedDeleteMerges: " + numSegments + " segments");
422
MergeSpecification spec = new MergeSpecification();
423
int firstSegmentWithDeletions = -1;
424
IndexWriter w = writer.get();
426
for(int i=0;i<numSegments;i++) {
427
final SegmentInfo info = segmentInfos.info(i);
428
int delCount = w.numDeletedDocs(info);
431
message(" segment " + info.name + " has deletions");
432
if (firstSegmentWithDeletions == -1)
433
firstSegmentWithDeletions = i;
434
else if (i - firstSegmentWithDeletions == mergeFactor) {
435
// We've seen mergeFactor segments in a row with
436
// deletions, so force a merge now:
438
message(" add merge " + firstSegmentWithDeletions + " to " + (i-1) + " inclusive");
439
spec.add(new OneMerge(segments.subList(firstSegmentWithDeletions, i)));
440
firstSegmentWithDeletions = i;
442
} else if (firstSegmentWithDeletions != -1) {
443
// End of a sequence of segments with deletions, so,
444
// merge those past segments even if it's fewer than
445
// mergeFactor segments
447
message(" add merge " + firstSegmentWithDeletions + " to " + (i-1) + " inclusive");
448
spec.add(new OneMerge(segments.subList(firstSegmentWithDeletions, i)));
449
firstSegmentWithDeletions = -1;
453
if (firstSegmentWithDeletions != -1) {
455
message(" add merge " + firstSegmentWithDeletions + " to " + (numSegments-1) + " inclusive");
456
spec.add(new OneMerge(segments.subList(firstSegmentWithDeletions, numSegments)));
462
private static class SegmentInfoAndLevel implements Comparable<SegmentInfoAndLevel> {
467
public SegmentInfoAndLevel(SegmentInfo info, float level, int index) {
473
// Sorts largest to smallest
474
public int compareTo(SegmentInfoAndLevel other) {
475
if (level < other.level)
477
else if (level > other.level)
484
/** Checks if any merges are now necessary and returns a
485
* {@link MergePolicy.MergeSpecification} if so. A merge
486
* is necessary when there are more than {@link
487
* #setMergeFactor} segments at a given level. When
488
* multiple levels have too many segments, this method
489
* will return multiple merges, allowing the {@link
490
* MergeScheduler} to use concurrency. */
492
public MergeSpecification findMerges(SegmentInfos infos) throws IOException {
494
final int numSegments = infos.size();
496
message("findMerges: " + numSegments + " segments");
498
// Compute levels, which is just log (base mergeFactor)
499
// of the size of each segment
500
final List<SegmentInfoAndLevel> levels = new ArrayList<SegmentInfoAndLevel>();
501
final float norm = (float) Math.log(mergeFactor);
503
final Collection<SegmentInfo> mergingSegments = writer.get().getMergingSegments();
505
for(int i=0;i<numSegments;i++) {
506
final SegmentInfo info = infos.info(i);
507
long size = size(info);
509
// Floor tiny segments
514
final SegmentInfoAndLevel infoLevel = new SegmentInfoAndLevel(info, (float) Math.log(size)/norm, i);
515
levels.add(infoLevel);
518
final long segBytes = sizeBytes(info);
519
String extra = mergingSegments.contains(info) ? " [merging]" : "";
520
if (size >= maxMergeSize) {
521
extra += " [skip: too large]";
523
message("seg=" + writer.get().segString(info) + " level=" + infoLevel.level + " size=" + String.format("%.3f MB", segBytes/1024/1024.) + extra);
527
final float levelFloor;
528
if (minMergeSize <= 0)
529
levelFloor = (float) 0.0;
531
levelFloor = (float) (Math.log(minMergeSize)/norm);
533
// Now, we quantize the log values into levels. The
534
// first level is any segment whose log size is within
535
// LEVEL_LOG_SPAN of the max size, or, who has such as
536
// segment "to the right". Then, we find the max of all
537
// other segments and use that to define the next level
540
MergeSpecification spec = null;
542
final int numMergeableSegments = levels.size();
545
while(start < numMergeableSegments) {
547
// Find max level of all segments not already
549
float maxLevel = levels.get(start).level;
550
for(int i=1+start;i<numMergeableSegments;i++) {
551
final float level = levels.get(i).level;
552
if (level > maxLevel)
556
// Now search backwards for the rightmost segment that
557
// falls into this level:
559
if (maxLevel <= levelFloor)
560
// All remaining segments fall into the min level
563
levelBottom = (float) (maxLevel - LEVEL_LOG_SPAN);
565
// Force a boundary at the level floor
566
if (levelBottom < levelFloor && maxLevel >= levelFloor)
567
levelBottom = levelFloor;
570
int upto = numMergeableSegments-1;
571
while(upto >= start) {
572
if (levels.get(upto).level >= levelBottom) {
578
message(" level " + levelBottom + " to " + maxLevel + ": " + (1+upto-start) + " segments");
580
// Finally, record all merges that are viable at this level:
581
int end = start + mergeFactor;
582
while(end <= 1+upto) {
583
boolean anyTooLarge = false;
584
boolean anyMerging = false;
585
for(int i=start;i<end;i++) {
586
final SegmentInfo info = levels.get(i).info;
587
anyTooLarge |= (size(info) >= maxMergeSize || sizeDocs(info) >= maxMergeDocs);
588
if (mergingSegments.contains(info)) {
596
} else if (!anyTooLarge) {
598
spec = new MergeSpecification();
599
final List<SegmentInfo> mergeInfos = new ArrayList<SegmentInfo>();
600
for(int i=start;i<end;i++) {
601
mergeInfos.add(levels.get(i).info);
602
assert infos.contains(levels.get(i).info);
605
message(" add merge=" + writer.get().segString(mergeInfos) + " start=" + start + " end=" + end);
607
spec.add(new OneMerge(mergeInfos));
608
} else if (verbose()) {
609
message(" " + start + " to " + end + ": contains segment over maxMergeSize or maxMergeDocs; skipping");
613
end = start + mergeFactor;
622
/** <p>Determines the largest segment (measured by
623
* document count) that may be merged with other segments.
624
* Small values (e.g., less than 10,000) are best for
625
* interactive indexing, as this limits the length of
626
* pauses while indexing to a few seconds. Larger values
627
* are best for batched indexing and speedier
630
* <p>The default value is {@link Integer#MAX_VALUE}.</p>
632
* <p>The default merge policy ({@link
633
* LogByteSizeMergePolicy}) also allows you to set this
634
* limit by net size (in MB) of the segment, using {@link
635
* LogByteSizeMergePolicy#setMaxMergeMB}.</p>
637
public void setMaxMergeDocs(int maxMergeDocs) {
638
this.maxMergeDocs = maxMergeDocs;
641
/** Returns the largest segment (measured by document
642
* count) that may be merged with other segments.
643
* @see #setMaxMergeDocs */
644
public int getMaxMergeDocs() {
649
public String toString() {
650
StringBuilder sb = new StringBuilder("[" + getClass().getSimpleName() + ": ");
651
sb.append("minMergeSize=").append(minMergeSize).append(", ");
652
sb.append("mergeFactor=").append(mergeFactor).append(", ");
653
sb.append("maxMergeSize=").append(maxMergeSize).append(", ");
654
sb.append("maxMergeSizeForForcedMerge=").append(maxMergeSizeForForcedMerge).append(", ");
655
sb.append("calibrateSizeByDeletes=").append(calibrateSizeByDeletes).append(", ");
656
sb.append("maxMergeDocs=").append(maxMergeDocs).append(", ");
657
sb.append("useCompoundFile=").append(useCompoundFile).append(", ");
658
sb.append("noCFSRatio=").append(noCFSRatio);
660
return sb.toString();