package org.apache.lucene.index;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.Arrays;

import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.SorterTemplate;
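
/** Collects the terms of one field into an in-memory postings hash: term text
 *  is interned into the shared char pool, per-term stream data goes into the
 *  shared int/byte pools, and the hash maps term text to a dense termID.
 *  Instances are chained via nextPerField so a secondary TermsHash consumer
 *  (e.g. term vectors) sees the same terms. */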
final class TermsHashPerField extends InvertedDocConsumerPerField {

  final TermsHashConsumerPerField consumer;

  final TermsHashPerField nextPerField;
  final TermsHashPerThread perThread;
  final DocumentsWriter.DocState docState;
  final FieldInvertState fieldState;
  CharTermAttribute termAtt;

  // Copied from our perThread
  final CharBlockPool charPool;
  final IntBlockPool intPool;
  final ByteBlockPool bytePool;

  final int streamCount;
  final int numPostingInt;

  final FieldInfo fieldInfo;

  boolean postingsCompacted;
  int numPostings;

  private int postingsHashSize = 4;
  private int postingsHashHalfSize = postingsHashSize/2;
  private int postingsHashMask = postingsHashSize-1;

  // Maps a hash slot to a termID, or -1 if the slot is empty
  private int[] postingsHash;

  // Parallel arrays holding per-term state (textStart, intStart, byteStart, ...)
  ParallelPostingsArray postingsArray;

  public TermsHashPerField(DocInverterPerField docInverterPerField, final TermsHashPerThread perThread, final TermsHashPerThread nextPerThread, final FieldInfo fieldInfo) {
    this.perThread = perThread;
    intPool = perThread.intPool;
    charPool = perThread.charPool;
    bytePool = perThread.bytePool;
    docState = perThread.docState;

    postingsHash = new int[postingsHashSize];
    Arrays.fill(postingsHash, -1);
    bytesUsed(postingsHashSize * RamUsageEstimator.NUM_BYTES_INT);

    fieldState = docInverterPerField.fieldState;
    this.consumer = perThread.consumer.addField(this, fieldInfo);

    streamCount = consumer.getStreamCount();
    numPostingInt = 2*streamCount;
    this.fieldInfo = fieldInfo;
    if (nextPerThread != null)
      nextPerField = (TermsHashPerField) nextPerThread.addField(docInverterPerField, fieldInfo);
    else
      nextPerField = null;
  }

  private void initPostingsArray() {
    postingsArray = consumer.createPostingsArray(2);
    bytesUsed(postingsArray.size * postingsArray.bytesPerPosting());
  }

  // sugar: just forwards to DW
  private void bytesUsed(long size) {
    if (perThread.termsHash.trackAllocations) {
      perThread.termsHash.docWriter.bytesUsed(size);
    }
  }
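
  // Shrink the hash back to its initial size and drop the postings array so
  // the memory can be reclaimed once this field's postings have been flushed.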
  void shrinkHash(int targetSize) {
    assert postingsCompacted || numPostings == 0;

    final int newSize = 4;
    if (newSize != postingsHash.length) {
      final long previousSize = postingsHash.length;
      postingsHash = new int[newSize];
      bytesUsed((newSize-previousSize)*RamUsageEstimator.NUM_BYTES_INT);
      Arrays.fill(postingsHash, -1);
      postingsHashSize = newSize;
      postingsHashHalfSize = newSize/2;
      postingsHashMask = newSize-1;
    }

    // Fully free the postings array on each flush:
    if (postingsArray != null) {
      bytesUsed(-postingsArray.bytesPerPosting() * postingsArray.size);
      postingsArray = null;
    }
  }

  public void reset() {
    if (!postingsCompacted)
      compactPostings();
    assert numPostings <= postingsHash.length;
    if (numPostings > 0) {
      Arrays.fill(postingsHash, 0, numPostings, -1);
      numPostings = 0;
    }
    postingsCompacted = false;
    if (nextPerField != null)
      nextPerField.reset();
  }

  synchronized public void abort() {
    reset();
    if (nextPerField != null)
      nextPerField.abort();
  }

  private final void growParallelPostingsArray() {
    int oldSize = postingsArray.size;
    this.postingsArray = this.postingsArray.grow();
    bytesUsed(postingsArray.bytesPerPosting() * (postingsArray.size - oldSize));
  }
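
  // Position reader over the byte slices previously written for this term's
  // stream; the stream's current write location marks the end.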
  public void initReader(ByteSliceReader reader, int termID, int stream) {
    assert stream < streamCount;
    int intStart = postingsArray.intStarts[termID];
    final int[] ints = intPool.buffers[intStart >> DocumentsWriter.INT_BLOCK_SHIFT];
    final int upto = intStart & DocumentsWriter.INT_BLOCK_MASK;
    reader.init(bytePool,
                postingsArray.byteStarts[termID]+stream*ByteBlockPool.FIRST_LEVEL_SIZE,
                ints[upto+stream]);
  }
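
  // Move all live termIDs to the front of postingsHash so the first
  // numPostings slots hold every term and the remaining slots are -1.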
  private void compactPostings() {
    int upto = 0;
    for(int i=0;i<postingsHashSize;i++) {
      if (postingsHash[i] != -1) {
        if (upto < i) {
          postingsHash[upto] = postingsHash[i];
          postingsHash[i] = -1;
        }
        upto++;
      }
    }

    assert upto == numPostings: "upto=" + upto + " numPostings=" + numPostings;

    postingsCompacted = true;
  }

  /** Collapse the hash table & sort in-place. */
  public int[] sortPostings() {
    compactPostings();
    final int[] postingsHash = this.postingsHash;
    new SorterTemplate() {
      protected void swap(int i, int j) {
        final int o = postingsHash[i];
        postingsHash[i] = postingsHash[j];
        postingsHash[j] = o;
      }

      protected int compare(int i, int j) {
        final int term1 = postingsHash[i], term2 = postingsHash[j];
        final int textStart1 = postingsArray.textStarts[term1],
          textStart2 = postingsArray.textStarts[term2];
        final char[] text1 = charPool.buffers[textStart1 >> DocumentsWriter.CHAR_BLOCK_SHIFT];
        final int pos1 = textStart1 & DocumentsWriter.CHAR_BLOCK_MASK;
        final char[] text2 = charPool.buffers[textStart2 >> DocumentsWriter.CHAR_BLOCK_SHIFT];
        final int pos2 = textStart2 & DocumentsWriter.CHAR_BLOCK_MASK;
        return comparePostings(text1, pos1, text2, pos2);
      }

      protected void setPivot(int i) {
        pivotTerm = postingsHash[i];
        final int textStart = postingsArray.textStarts[pivotTerm];
        pivotBuf = charPool.buffers[textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
        pivotBufPos = textStart & DocumentsWriter.CHAR_BLOCK_MASK;
      }

      protected int comparePivot(int j) {
        final int term = postingsHash[j];
        if (pivotTerm == term)
          return 0;
        final int textStart = postingsArray.textStarts[term];
        final char[] text = charPool.buffers[textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
        final int pos = textStart & DocumentsWriter.CHAR_BLOCK_MASK;
        return comparePostings(pivotBuf, pivotBufPos, text, pos);
      }

      private int pivotTerm, pivotBufPos;
      private char[] pivotBuf;

      /** Compares term text for two Posting instances and
       *  returns -1 if p1 < p2; 1 if p1 > p2; else 0. */
      private int comparePostings(final char[] text1, int pos1, final char[] text2, int pos2) {
        assert text1 != text2 || pos1 != pos2;

        while(true) {
          final char c1 = text1[pos1++];
          final char c2 = text2[pos2++];
          if (c1 != c2) {
            if (0xffff == c2)
              return 1;
            else if (0xffff == c1)
              return -1;
            else
              return c1-c2;
          } else
            // This method should never compare equal postings
            // unless the pos are the same
            assert c1 != 0xffff;
        }
      }
    }.quickSort(0, numPostings-1);
    return postingsHash;
  }

  /** Test whether the text for current RawPostingList p equals
   *  current tokenText. */
  private boolean postingEquals(final int termID, final char[] tokenText, final int tokenTextLen) {
    final int textStart = postingsArray.textStarts[termID];

    final char[] text = perThread.charPool.buffers[textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
    assert text != null;
    int pos = textStart & DocumentsWriter.CHAR_BLOCK_MASK;

    int tokenPos = 0;
    for(;tokenPos<tokenTextLen;pos++,tokenPos++)
      if (tokenText[tokenPos] != text[pos])
        return false;
    // Stored terms end with 0xffff; equal only if we also hit the sentinel
    return 0xffff == text[pos];
  }
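
  // Set by start(): whether our consumer (and the next field in the chain)
  // want to see the tokens for the current fields.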
  private boolean doCall;
  private boolean doNextCall;

  void start(Fieldable f) {
    termAtt = fieldState.attributeSource.addAttribute(CharTermAttribute.class);
    consumer.start(f);
    if (nextPerField != null) {
      nextPerField.start(f);
    }
  }

  boolean start(Fieldable[] fields, int count) throws IOException {
    doCall = consumer.start(fields, count);
    if (postingsArray == null) {
      initPostingsArray();
    }
    if (nextPerField != null)
      doNextCall = nextPerField.start(fields, count);
    return doCall || doNextCall;
  }

  // Secondary entry point (for 2nd & subsequent TermsHash),
  // because token text has already been "interned" into
  // textStart, so we hash by textStart
  public void add(int textStart) throws IOException {
    int code = textStart;

    int hashPos = code & postingsHashMask;

    assert !postingsCompacted;

    // Locate RawPostingList in hash
    int termID = postingsHash[hashPos];

    if (termID != -1 && postingsArray.textStarts[termID] != textStart) {
      // Conflict: keep searching different locations in
      // the hash table.
      final int inc = ((code>>8)+code)|1;
      do {
        code += inc;
        hashPos = code & postingsHashMask;
        termID = postingsHash[hashPos];
      } while (termID != -1 && postingsArray.textStarts[termID] != textStart);
    }

    if (termID == -1) {

      // First time we are seeing this token since we last
      // flushed the hash.

      // New posting
      termID = numPostings++;
      if (termID >= postingsArray.size) {
        growParallelPostingsArray();
      }

      postingsArray.textStarts[termID] = textStart;

      assert postingsHash[hashPos] == -1;
      postingsHash[hashPos] = termID;

      if (numPostings == postingsHashHalfSize)
        rehashPostings(2*postingsHashSize);

      // Init stream slices
      if (numPostingInt + intPool.intUpto > DocumentsWriter.INT_BLOCK_SIZE)
        intPool.nextBuffer();

      if (DocumentsWriter.BYTE_BLOCK_SIZE - bytePool.byteUpto < numPostingInt*ByteBlockPool.FIRST_LEVEL_SIZE)
        bytePool.nextBuffer();

      intUptos = intPool.buffer;
      intUptoStart = intPool.intUpto;
      intPool.intUpto += streamCount;

      postingsArray.intStarts[termID] = intUptoStart + intPool.intOffset;

      for(int i=0;i<streamCount;i++) {
        final int upto = bytePool.newSlice(ByteBlockPool.FIRST_LEVEL_SIZE);
        intUptos[intUptoStart+i] = upto + bytePool.byteOffset;
      }
      postingsArray.byteStarts[termID] = intUptos[intUptoStart];

      consumer.newTerm(termID);

    } else {
      int intStart = postingsArray.intStarts[termID];
      intUptos = intPool.buffers[intStart >> DocumentsWriter.INT_BLOCK_SHIFT];
      intUptoStart = intStart & DocumentsWriter.INT_BLOCK_MASK;
      consumer.addTerm(termID);
    }
  }

  // Primary entry point (for first TermsHash)
  void add() throws IOException {

    assert !postingsCompacted;

    // We are first in the chain so we must "intern" the
    // term text into textStart address

    // Get the text of this term.
    final char[] tokenText = termAtt.buffer();
    final int tokenTextLen = termAtt.length();

    // Compute hashcode & replace any invalid UTF16 sequences
    int downto = tokenTextLen;
    int code = 0;
    while (downto > 0) {
      char ch = tokenText[--downto];

      if (ch >= UnicodeUtil.UNI_SUR_LOW_START && ch <= UnicodeUtil.UNI_SUR_LOW_END) {
        if (0 == downto) {
          // Unpaired
          ch = tokenText[downto] = UnicodeUtil.UNI_REPLACEMENT_CHAR;
        } else {
          final char ch2 = tokenText[downto-1];
          if (ch2 >= UnicodeUtil.UNI_SUR_HIGH_START && ch2 <= UnicodeUtil.UNI_SUR_HIGH_END) {
            // OK: high followed by low.  This is a valid
            // surrogate pair.
            code = ((code*31) + ch)*31+ch2;
            downto--;
            continue;
          } else {
            // Unpaired
            ch = tokenText[downto] = UnicodeUtil.UNI_REPLACEMENT_CHAR;
          }
        }
      } else if (ch >= UnicodeUtil.UNI_SUR_HIGH_START && (ch <= UnicodeUtil.UNI_SUR_HIGH_END ||
                                                          ch == 0xffff)) {
        // Unpaired or 0xffff
        ch = tokenText[downto] = UnicodeUtil.UNI_REPLACEMENT_CHAR;
      }

      code = (code*31) + ch;
    }

    int hashPos = code & postingsHashMask;

    // Locate RawPostingList in hash
    int termID = postingsHash[hashPos];

    if (termID != -1 && !postingEquals(termID, tokenText, tokenTextLen)) {
      // Conflict: keep searching different locations in
      // the hash table.
      final int inc = ((code>>8)+code)|1;
      do {
        code += inc;
        hashPos = code & postingsHashMask;
        termID = postingsHash[hashPos];
      } while (termID != -1 && !postingEquals(termID, tokenText, tokenTextLen));
    }

    if (termID == -1) {

      // First time we are seeing this token since we last
      // flushed the hash.
      final int textLen1 = 1+tokenTextLen;
      if (textLen1 + charPool.charUpto > DocumentsWriter.CHAR_BLOCK_SIZE) {
        if (textLen1 > DocumentsWriter.CHAR_BLOCK_SIZE) {
          // Just skip this term, to remain as robust as
          // possible during indexing.  A TokenFilter
          // can be inserted into the analyzer chain if
          // other behavior is wanted (pruning the term
          // to a prefix, throwing an exception, etc).

          if (docState.maxTermPrefix == null)
            docState.maxTermPrefix = new String(tokenText, 0, 30);

          consumer.skippingLongTerm();
          return;
        }
        charPool.nextBuffer();
      }

      // New posting
      termID = numPostings++;
      if (termID >= postingsArray.size) {
        growParallelPostingsArray();
      }

      final char[] text = charPool.buffer;
      final int textUpto = charPool.charUpto;
      postingsArray.textStarts[termID] = textUpto + charPool.charOffset;
      charPool.charUpto += textLen1;
      System.arraycopy(tokenText, 0, text, textUpto, tokenTextLen);
      text[textUpto+tokenTextLen] = 0xffff;

      assert postingsHash[hashPos] == -1;
      postingsHash[hashPos] = termID;

      if (numPostings == postingsHashHalfSize) {
        rehashPostings(2*postingsHashSize);
        bytesUsed(2*numPostings * RamUsageEstimator.NUM_BYTES_INT);
      }

      // Init stream slices
      if (numPostingInt + intPool.intUpto > DocumentsWriter.INT_BLOCK_SIZE)
        intPool.nextBuffer();

      if (DocumentsWriter.BYTE_BLOCK_SIZE - bytePool.byteUpto < numPostingInt*ByteBlockPool.FIRST_LEVEL_SIZE)
        bytePool.nextBuffer();

      intUptos = intPool.buffer;
      intUptoStart = intPool.intUpto;
      intPool.intUpto += streamCount;

      postingsArray.intStarts[termID] = intUptoStart + intPool.intOffset;

      for(int i=0;i<streamCount;i++) {
        final int upto = bytePool.newSlice(ByteBlockPool.FIRST_LEVEL_SIZE);
        intUptos[intUptoStart+i] = upto + bytePool.byteOffset;
      }
      postingsArray.byteStarts[termID] = intUptos[intUptoStart];

      consumer.newTerm(termID);

    } else {
      final int intStart = postingsArray.intStarts[termID];
      intUptos = intPool.buffers[intStart >> DocumentsWriter.INT_BLOCK_SHIFT];
      intUptoStart = intStart & DocumentsWriter.INT_BLOCK_MASK;
      consumer.addTerm(termID);
    }

    if (doNextCall)
      nextPerField.add(postingsArray.textStarts[termID]);
  }

  // Where the current term's per-stream write addresses live in the int pool;
  // used by writeByte/writeVInt below.
  int[] intUptos;
  int intUptoStart;
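
  // Append one byte to the given stream of the current term; when the current
  // slice is full (non-zero byte at the write position) a larger slice is
  // allocated and the stream's write cursor is moved there.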
  void writeByte(int stream, byte b) {
    int upto = intUptos[intUptoStart+stream];
    byte[] bytes = bytePool.buffers[upto >> DocumentsWriter.BYTE_BLOCK_SHIFT];
    assert bytes != null;
    int offset = upto & DocumentsWriter.BYTE_BLOCK_MASK;
    if (bytes[offset] != 0) {
      // End of slice; allocate a new one
      offset = bytePool.allocSlice(bytes, offset);
      bytes = bytePool.buffer;
      intUptos[intUptoStart+stream] = offset + bytePool.byteOffset;
    }
    bytes[offset] = b;
    (intUptos[intUptoStart+stream])++;
  }
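
  // Bulk variant of writeByte: appends len bytes from b starting at offset.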
  public void writeBytes(int stream, byte[] b, int offset, int len) {
    final int end = offset + len;
    for(int i=offset;i<end;i++)
      writeByte(stream, b[i]);
  }

  // Writes i as a VInt (7 bits per byte; high bit set means more bytes follow)
  void writeVInt(int stream, int i) {
    assert stream < streamCount;
    while ((i & ~0x7F) != 0) {
      writeByte(stream, (byte)((i & 0x7f) | 0x80));
      i >>>= 7;
    }
    writeByte(stream, (byte) i);
  }
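
  // Called once the current field's tokens have all been added; lets the
  // consumer chain finish the field.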
  void finish() throws IOException {
    try {
      consumer.finish();
    } finally {
      if (nextPerField != null) {
        nextPerField.finish();
      }
    }
  }

  /** Called when postings hash is too small (> 50%
   *  occupied) or too large (< 20% occupied). */
  void rehashPostings(final int newSize) {

    final int newMask = newSize-1;

    int[] newHash = new int[newSize];
    Arrays.fill(newHash, -1);
    for(int i=0;i<postingsHashSize;i++) {
      int termID = postingsHash[i];
      if (termID != -1) {
        int code;
        if (perThread.primary) {
          // Recompute the hash from the term text stored in the char pool
          final int textStart = postingsArray.textStarts[termID];
          final int start = textStart & DocumentsWriter.CHAR_BLOCK_MASK;
          final char[] text = charPool.buffers[textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
          int pos = start;
          while(text[pos] != 0xffff)
            pos++;
          code = 0;
          while (pos > start)
            code = (code*31) + text[--pos];
        } else
          // Secondary TermsHash hashes by textStart (see add(int))
          code = postingsArray.textStarts[termID];

        int hashPos = code & newMask;
        assert hashPos >= 0;
        if (newHash[hashPos] != -1) {
          final int inc = ((code>>8)+code)|1;
          do {
            code += inc;
            hashPos = code & newMask;
          } while (newHash[hashPos] != -1);
        }
        newHash[hashPos] = termID;
      }
    }

    postingsHashMask = newMask;
    postingsHash = newHash;

    postingsHashSize = newSize;
    postingsHashHalfSize = newSize >> 1;
  }
}