1
package org.apache.lucene.index;
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
20
import org.apache.lucene.store.Directory;
21
import org.apache.lucene.store.IndexOutput;
22
import org.apache.lucene.util.IOUtils;
23
import org.apache.lucene.util.StringHelper;
24
import org.apache.lucene.util.UnicodeUtil;
26
import java.io.IOException;
28
final class TermVectorsWriter {
30
private IndexOutput tvx = null, tvd = null, tvf = null;
31
private FieldInfos fieldInfos;
32
final UnicodeUtil.UTF8Result[] utf8Results = new UnicodeUtil.UTF8Result[] {new UnicodeUtil.UTF8Result(),
33
new UnicodeUtil.UTF8Result()};
/**
 * Opens the term-vector index, documents and fields outputs for a segment and
 * writes {@code TermVectorsReader.FORMAT_CURRENT} as the first int of each.
 *
 * @param directory  directory in which the three outputs are created
 * @param segment    segment name used to derive the three file names
 * @param fieldInfos field infos later used to resolve field names to numbers
 * @throws IOException if creating or writing any output fails; outputs that
 *         were already opened are handed to
 *         {@code IOUtils.closeWhileHandlingException} before it propagates
 */
public TermVectorsWriter(Directory directory, String segment,
                         FieldInfos fieldInfos) throws IOException {
  boolean success = false;
  try {
    // Open files for TermVector storage
    tvx = directory.createOutput(IndexFileNames.segmentFileName(segment, IndexFileNames.VECTORS_INDEX_EXTENSION));
    tvx.writeInt(TermVectorsReader.FORMAT_CURRENT);
    tvd = directory.createOutput(IndexFileNames.segmentFileName(segment, IndexFileNames.VECTORS_DOCUMENTS_EXTENSION));
    tvd.writeInt(TermVectorsReader.FORMAT_CURRENT);
    tvf = directory.createOutput(IndexFileNames.segmentFileName(segment, IndexFileNames.VECTORS_FIELDS_EXTENSION));
    tvf.writeInt(TermVectorsReader.FORMAT_CURRENT);
    success = true;
  } finally {
    // Only clean up on failure: on success the streams must stay open for
    // the add* methods and are released later by close().
    if (!success) {
      IOUtils.closeWhileHandlingException(tvx, tvd, tvf);
    }
  }

  this.fieldInfos = fieldInfos;
}
57
* Add a complete document specified by all its term vectors. If document has no
58
* term vectors, add value for tvx.
63
public final void addAllDocVectors(TermFreqVector[] vectors) throws IOException {
65
tvx.writeLong(tvd.getFilePointer());
66
tvx.writeLong(tvf.getFilePointer());
68
if (vectors != null) {
69
final int numFields = vectors.length;
70
tvd.writeVInt(numFields);
72
long[] fieldPointers = new long[numFields];
74
for (int i=0; i<numFields; i++) {
75
fieldPointers[i] = tvf.getFilePointer();
77
final int fieldNumber = fieldInfos.fieldNumber(vectors[i].getField());
79
// 1st pass: write field numbers to tvd
80
tvd.writeVInt(fieldNumber);
82
final int numTerms = vectors[i].size();
83
tvf.writeVInt(numTerms);
85
final TermPositionVector tpVector;
88
final boolean storePositions;
89
final boolean storeOffsets;
91
if (vectors[i] instanceof TermPositionVector) {
92
// May have positions & offsets
93
tpVector = (TermPositionVector) vectors[i];
94
storePositions = tpVector.size() > 0 && tpVector.getTermPositions(0) != null;
95
storeOffsets = tpVector.size() > 0 && tpVector.getOffsets(0) != null;
96
bits = (byte) ((storePositions ? TermVectorsReader.STORE_POSITIONS_WITH_TERMVECTOR : 0) +
97
(storeOffsets ? TermVectorsReader.STORE_OFFSET_WITH_TERMVECTOR : 0));
101
storePositions = false;
102
storeOffsets = false;
107
final String[] terms = vectors[i].getTerms();
108
final int[] freqs = vectors[i].getTermFrequencies();
111
utf8Results[1].length = 0;
113
for (int j=0; j<numTerms; j++) {
115
UnicodeUtil.UTF16toUTF8(terms[j], 0, terms[j].length(), utf8Results[utf8Upto]);
117
int start = StringHelper.bytesDifference(utf8Results[1-utf8Upto].result,
118
utf8Results[1-utf8Upto].length,
119
utf8Results[utf8Upto].result,
120
utf8Results[utf8Upto].length);
121
int length = utf8Results[utf8Upto].length - start;
122
tvf.writeVInt(start); // write shared prefix length
123
tvf.writeVInt(length); // write delta length
124
tvf.writeBytes(utf8Results[utf8Upto].result, start, length); // write delta bytes
125
utf8Upto = 1-utf8Upto;
127
final int termFreq = freqs[j];
129
tvf.writeVInt(termFreq);
131
if (storePositions) {
132
final int[] positions = tpVector.getTermPositions(j);
133
if (positions == null)
134
throw new IllegalStateException("Trying to write positions that are null!");
135
assert positions.length == termFreq;
137
// use delta encoding for positions
138
int lastPosition = 0;
139
for(int k=0;k<positions.length;k++) {
140
final int position = positions[k];
141
tvf.writeVInt(position-lastPosition);
142
lastPosition = position;
147
final TermVectorOffsetInfo[] offsets = tpVector.getOffsets(j);
149
throw new IllegalStateException("Trying to write offsets that are null!");
150
assert offsets.length == termFreq;
152
// use delta encoding for offsets
153
int lastEndOffset = 0;
154
for(int k=0;k<offsets.length;k++) {
155
final int startOffset = offsets[k].getStartOffset();
156
final int endOffset = offsets[k].getEndOffset();
157
tvf.writeVInt(startOffset-lastEndOffset);
158
tvf.writeVInt(endOffset-startOffset);
159
lastEndOffset = endOffset;
165
// 2nd pass: write field pointers to tvd
167
long lastFieldPointer = fieldPointers[0];
168
for (int i=1; i<numFields; i++) {
169
final long fieldPointer = fieldPointers[i];
170
tvd.writeVLong(fieldPointer-lastFieldPointer);
171
lastFieldPointer = fieldPointer;
179
* Do a bulk copy of numDocs documents from reader to our
180
* streams. This is used to expedite merging, if the
181
* field numbers are congruent.
183
final void addRawDocuments(TermVectorsReader reader, int[] tvdLengths, int[] tvfLengths, int numDocs) throws IOException {
184
long tvdPosition = tvd.getFilePointer();
185
long tvfPosition = tvf.getFilePointer();
186
long tvdStart = tvdPosition;
187
long tvfStart = tvfPosition;
188
for(int i=0;i<numDocs;i++) {
189
tvx.writeLong(tvdPosition);
190
tvdPosition += tvdLengths[i];
191
tvx.writeLong(tvfPosition);
192
tvfPosition += tvfLengths[i];
194
tvd.copyBytes(reader.getTvdStream(), tvdPosition-tvdStart);
195
tvf.copyBytes(reader.getTvfStream(), tvfPosition-tvfStart);
196
assert tvd.getFilePointer() == tvdPosition;
197
assert tvf.getFilePointer() == tvfPosition;
200
/** Close all streams. */
201
final void close() throws IOException {
202
// make an effort to close all streams we can but remember and re-throw
203
// the first exception encountered in this process
204
IOUtils.close(tvx, tvd, tvf);