package org.apache.lucene.index;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil;
public class TestPayloads extends LuceneTestCase {

    // Simple tests to test the Payload class
    public void testPayload() throws Exception {
        byte[] testData = "This is a test!".getBytes();
        Payload payload = new Payload(testData);
        assertEquals("Wrong payload length.", testData.length, payload.length());

        // test copyTo(): a target array that is too small must trigger an exception
        byte[] target = new byte[testData.length - 1];
        try {
            payload.copyTo(target, 0);
            fail("Expected exception not thrown");
        } catch (Exception expected) {
            // expected exception
        }

        // copyTo() with a sufficiently large target and a non-zero start offset
        target = new byte[testData.length + 3];
        payload.copyTo(target, 3);

        for (int i = 0; i < testData.length; i++) {
            assertEquals(testData[i], target[i + 3]);
        }

        // test toByteArray()
        target = payload.toByteArray();
        assertByteArrayEquals(testData, target);

        // test byteAt()
        for (int i = 0; i < testData.length; i++) {
            assertEquals(payload.byteAt(i), testData[i]);
        }

        // reading beyond the payload bounds must trigger an exception
        try {
            payload.byteAt(testData.length + 1);
            fail("Expected exception not thrown");
        } catch (Exception expected) {
            // expected exception
        }

        Payload clone = (Payload) payload.clone();
        assertEquals(payload.length(), clone.length());
        for (int i = 0; i < payload.length(); i++) {
            assertEquals(payload.byteAt(i), clone.byteAt(i));
        }
    }
    // Tests whether the DocumentWriter and SegmentMerger correctly enable the
    // payload bit in the FieldInfo
    public void testPayloadFieldBit() throws Exception {
        Directory ram = newDirectory();
        PayloadAnalyzer analyzer = new PayloadAnalyzer();
        IndexWriter writer = new IndexWriter(ram, newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer));
        Document d = new Document();
        // this field won't have any payloads
        d.add(newField("f1", "This field has no payloads", Field.Store.NO, Field.Index.ANALYZED));
        // this field will have payloads in all docs, however not for all term positions,
        // so this field is used to check if the DocumentWriter correctly enables the payloads bit
        // even if only some term positions have payloads
        d.add(newField("f2", "This field has payloads in all docs", Field.Store.NO, Field.Index.ANALYZED));
        d.add(newField("f2", "This field has payloads in all docs", Field.Store.NO, Field.Index.ANALYZED));
        // this field is used to verify if the SegmentMerger enables payloads for a field if it has payloads
        // enabled in only some documents
        d.add(newField("f3", "This field has payloads in some docs", Field.Store.NO, Field.Index.ANALYZED));
        // only add payload data for field f2
        analyzer.setPayloadData("f2", 1, "somedata".getBytes(), 0, 1);
        writer.addDocument(d);
        // flush
        writer.close();

        SegmentReader reader = SegmentReader.getOnlySegmentReader(ram);
        FieldInfos fi = reader.fieldInfos();
        assertFalse("Payload field bit should not be set.", fi.fieldInfo("f1").storePayloads);
        assertTrue("Payload field bit should be set.", fi.fieldInfo("f2").storePayloads);
        assertFalse("Payload field bit should not be set.", fi.fieldInfo("f3").storePayloads);
        reader.close();

        // now we add another document which has payloads for field f3 and verify if the SegmentMerger
        // enabled payloads for that field
        writer = new IndexWriter(ram, newIndexWriterConfig(TEST_VERSION_CURRENT,
            analyzer).setOpenMode(OpenMode.CREATE));
        d = new Document();
        d.add(newField("f1", "This field has no payloads", Field.Store.NO, Field.Index.ANALYZED));
        d.add(newField("f2", "This field has payloads in all docs", Field.Store.NO, Field.Index.ANALYZED));
        d.add(newField("f2", "This field has payloads in all docs", Field.Store.NO, Field.Index.ANALYZED));
        d.add(newField("f3", "This field has payloads in some docs", Field.Store.NO, Field.Index.ANALYZED));
        // add payload data for fields f2 and f3
        analyzer.setPayloadData("f2", "somedata".getBytes(), 0, 1);
        analyzer.setPayloadData("f3", "somedata".getBytes(), 0, 3);
        writer.addDocument(d);

        // force merge
        writer.forceMerge(1);
        // flush
        writer.close();

        reader = SegmentReader.getOnlySegmentReader(ram);
        fi = reader.fieldInfos();
        assertFalse("Payload field bit should not be set.", fi.fieldInfo("f1").storePayloads);
        assertTrue("Payload field bit should be set.", fi.fieldInfo("f2").storePayloads);
        assertTrue("Payload field bit should be set.", fi.fieldInfo("f3").storePayloads);
        reader.close();
        ram.close();
    }
    // Tests if payloads are correctly stored and loaded using both RAMDirectory and FSDirectory
    public void testPayloadsEncoding() throws Exception {
        // first perform the test using a RAMDirectory
        Directory dir = newDirectory();
        performTest(dir);
        dir.close();
        // now use an FSDirectory and repeat the same test
        File dirName = _TestUtil.getTempDir("test_payloads");
        dir = newFSDirectory(dirName);
        performTest(dir);
        dir.close();
        _TestUtil.rmDir(dirName);
    }
    // builds an index with payloads in the given Directory and performs
    // different tests to verify the payload encoding
    private void performTest(Directory dir) throws Exception {
        PayloadAnalyzer analyzer = new PayloadAnalyzer();
        IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(
            TEST_VERSION_CURRENT, analyzer)
            .setOpenMode(OpenMode.CREATE)
            .setMergePolicy(newLogMergePolicy()));

        // should be in sync with value in TermInfosWriter
        final int skipInterval = 16;

        final int numTerms = 5;
        final String fieldName = "f1";

        int numDocs = skipInterval + 1;
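        // numDocs is deliberately one more than skipInterval, so that each batch
        // of documents written below crosses a skip-list boundary; the
        // payload-length checks at the skip points further down depend on this
        // relationship.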
        // create content for the test documents with just a few terms
        Term[] terms = generateTerms(fieldName, numTerms);
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < terms.length; i++) {
            sb.append(terms[i].text);
            sb.append(" ");
        }
        String content = sb.toString();

        int payloadDataLength = numTerms * numDocs * 2 + numTerms * numDocs * (numDocs - 1) / 2;
        byte[] payloadData = generateRandomData(payloadDataLength);
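        // payloadDataLength adds up as follows: the first indexing loop below
        // writes 2 * numDocs docs whose numTerms terms each carry a 1-byte
        // payload (numTerms * numDocs * 2 bytes), and the second loop writes
        // numDocs docs where the i-th doc carries i-byte payloads per term
        // (numTerms * (0 + 1 + ... + (numDocs - 1)) = numTerms * numDocs * (numDocs - 1) / 2 bytes).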
        Document d = new Document();
        d.add(newField(fieldName, content, Field.Store.NO, Field.Index.ANALYZED));
        // add the same document multiple times to have the same payload lengths for all
        // occurrences within two consecutive skip intervals
        int offset = 0;
        for (int i = 0; i < 2 * numDocs; i++) {
            analyzer.setPayloadData(fieldName, payloadData, offset, 1);
            offset += numTerms;
            writer.addDocument(d);
        }

        // make sure we create more than one segment to test merging
        writer.commit();

        // now we make sure to have different payload lengths at the next skip point
        for (int i = 0; i < numDocs; i++) {
            analyzer.setPayloadData(fieldName, payloadData, offset, i);
            offset += i * numTerms;
            writer.addDocument(d);
        }

        writer.forceMerge(1);
        // flush
        writer.close();
        /*
         * Verify the index:
         * first we test if all payloads are stored correctly
         */
        IndexReader reader = IndexReader.open(dir, true);

        byte[] verifyPayloadData = new byte[payloadDataLength];
        offset = 0;
        TermPositions[] tps = new TermPositions[numTerms];
        for (int i = 0; i < numTerms; i++) {
            tps[i] = reader.termPositions(terms[i]);
        }

        while (tps[0].next()) {
            // advance all enums in lockstep, one document at a time
            for (int i = 1; i < numTerms; i++) {
                tps[i].next();
            }
            int freq = tps[0].freq();

            for (int i = 0; i < freq; i++) {
                for (int j = 0; j < numTerms; j++) {
                    tps[j].nextPosition();
                    if (tps[j].isPayloadAvailable()) {
                        tps[j].getPayload(verifyPayloadData, offset);
                        offset += tps[j].getPayloadLength();
                    }
                }
            }
        }

        for (int i = 0; i < numTerms; i++) {
            tps[i].close();
        }

        assertByteArrayEquals(payloadData, verifyPayloadData);
        /*
         * test lazy skipping
         */
        TermPositions tp = reader.termPositions(terms[0]);
        tp.next();
        tp.nextPosition();
        // now we don't read this payload
        tp.next();
        tp.nextPosition();
        assertEquals("Wrong payload length.", 1, tp.getPayloadLength());
        byte[] payload = tp.getPayload(null, 0);
        assertEquals(payload[0], payloadData[numTerms]);
        tp.next();
        tp.nextPosition();
        // we don't read this payload and skip to a different document
        tp.skipTo(5);
        tp.nextPosition();
        assertEquals("Wrong payload length.", 1, tp.getPayloadLength());
        payload = tp.getPayload(null, 0);
        assertEquals(payload[0], payloadData[5 * numTerms]);
        /*
         * Test different lengths at skip points
         */
        tp.seek(terms[1]);
        tp.next();
        tp.nextPosition();
        assertEquals("Wrong payload length.", 1, tp.getPayloadLength());
        tp.skipTo(skipInterval - 1);
        tp.nextPosition();
        assertEquals("Wrong payload length.", 1, tp.getPayloadLength());
        tp.skipTo(2 * skipInterval - 1);
        tp.nextPosition();
        assertEquals("Wrong payload length.", 1, tp.getPayloadLength());
        tp.skipTo(3 * skipInterval - 1);
        tp.nextPosition();
        assertEquals("Wrong payload length.", 3 * skipInterval - 2 * numDocs - 1, tp.getPayloadLength());
        /*
         * Test multiple calls of getPayload()
         */
        tp.getPayload(null, 0);
        try {
            // it is forbidden to call getPayload() more than once
            // without calling nextPosition()
            tp.getPayload(null, 0);
            fail("Expected exception not thrown");
        } catch (Exception expected) {
            // expected exception
        }

        reader.close();
        // test long payload
        analyzer = new PayloadAnalyzer();
        writer = new IndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT,
            analyzer).setOpenMode(OpenMode.CREATE));
        String singleTerm = "lucene";

        d = new Document();
        d.add(newField(fieldName, singleTerm, Field.Store.NO, Field.Index.ANALYZED));
        // add a payload whose length is greater than the buffer size of BufferedIndexOutput
        payloadData = generateRandomData(2000);
        analyzer.setPayloadData(fieldName, payloadData, 100, 1500);
        writer.addDocument(d);

        // force merge
        writer.forceMerge(1);
        // flush
        writer.close();

        reader = IndexReader.open(dir, true);
        tp = reader.termPositions(new Term(fieldName, singleTerm));
        tp.next();
        tp.nextPosition();

        verifyPayloadData = new byte[tp.getPayloadLength()];
        tp.getPayload(verifyPayloadData, 0);
        byte[] portion = new byte[1500];
        System.arraycopy(payloadData, 100, portion, 0, 1500);

        assertByteArrayEquals(portion, verifyPayloadData);
        reader.close();
    }
    private void generateRandomData(byte[] data) {
        // this test needs the random data to be valid unicode
        String s = _TestUtil.randomFixedByteLengthUnicodeString(random, data.length);
        byte b[];
        try {
            b = s.getBytes("UTF-8");
        } catch (UnsupportedEncodingException e) {
            throw new RuntimeException(e);
        }
        assert b.length == data.length;
        System.arraycopy(b, 0, data, 0, b.length);
    }
    private byte[] generateRandomData(int n) {
        byte[] data = new byte[n];
        generateRandomData(data);
        return data;
    }
    private Term[] generateTerms(String fieldName, int n) {
        int maxDigits = (int) (Math.log(n) / Math.log(10));
        Term[] terms = new Term[n];
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < n; i++) {
            sb.setLength(0);
            sb.append("t");
            // left-pad the number with zeros so lexicographic term order matches numeric order
            int zeros = maxDigits - (int) (Math.log(i) / Math.log(10));
            for (int j = 0; j < zeros; j++) {
                sb.append("0");
            }
            sb.append(i);
            terms[i] = new Term(fieldName, sb.toString());
        }
        return terms;
    }
    void assertByteArrayEquals(byte[] b1, byte[] b2) {
        if (b1.length != b2.length) {
            fail("Byte arrays have different lengths: " + b1.length + ", " + b2.length);
        }

        for (int i = 0; i < b1.length; i++) {
            if (b1[i] != b2[i]) {
                fail("Byte arrays different at index " + i + ": " + b1[i] + ", " + b2[i]);
            }
        }
    }
    /**
     * This Analyzer uses a WhitespaceTokenizer and a PayloadFilter.
     */
    private static class PayloadAnalyzer extends Analyzer {
        Map<String,PayloadData> fieldToData = new HashMap<String,PayloadData>();

        void setPayloadData(String field, byte[] data, int offset, int length) {
            fieldToData.put(field, new PayloadData(0, data, offset, length));
        }

        void setPayloadData(String field, int numFieldInstancesToSkip, byte[] data, int offset, int length) {
            fieldToData.put(field, new PayloadData(numFieldInstancesToSkip, data, offset, length));
        }

        @Override
        public TokenStream tokenStream(String fieldName, Reader reader) {
            PayloadData payload = fieldToData.get(fieldName);
            TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
            if (payload != null) {
                if (payload.numFieldInstancesToSkip == 0) {
                    ts = new PayloadFilter(ts, payload.data, payload.offset, payload.length);
                } else {
                    payload.numFieldInstancesToSkip--;
                }
            }
            return ts;
        }
    }
    private static class PayloadData {
        byte[] data;
        int offset;
        int length;
        int numFieldInstancesToSkip;

        PayloadData(int skip, byte[] data, int offset, int length) {
            numFieldInstancesToSkip = skip;
            this.data = data;
            this.offset = offset;
            this.length = length;
        }
    }
    /**
     * This Filter adds payloads to the tokens.
     */
    private static class PayloadFilter extends TokenFilter {
        private byte[] data;
        private int length;
        private int offset;
        private int startOffset;
        PayloadAttribute payloadAtt;

        public PayloadFilter(TokenStream in, byte[] data, int offset, int length) {
            super(in);
            this.data = data;
            this.length = length;
            this.offset = offset;
            this.startOffset = offset;
            payloadAtt = addAttribute(PayloadAttribute.class);
        }

        @Override
        public boolean incrementToken() throws IOException {
            boolean hasNext = input.incrementToken();
            if (hasNext) {
                if (offset + length <= data.length) {
                    // hand each token the next consecutive slice of the shared data array
                    Payload p = new Payload();
                    payloadAtt.setPayload(p);
                    p.setData(data, offset, length);
                    offset += length;
                } else {
                    payloadAtt.setPayload(null);
                }
            }
            return hasNext;
        }

        @Override
        public void reset() throws IOException {
            super.reset();
            this.offset = startOffset;
        }
    }
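    // Verifies that payloads survive concurrent indexing: each thread adds
    // documents whose TokenStreams draw their payload buffers from a shared
    // pool, and close() returns a buffer to the pool only once its token has
    // been consumed. The final pool.size() assertion checks that every buffer
    // was released again.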
    public void testThreadSafety() throws Exception {
        final int numThreads = 5;
        final int numDocs = atLeast(50);
        final ByteArrayPool pool = new ByteArrayPool(numThreads, 5);

        Directory dir = newDirectory();
        final IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(
            TEST_VERSION_CURRENT, new MockAnalyzer(random)));
        final String field = "test";

        Thread[] ingesters = new Thread[numThreads];
        for (int i = 0; i < numThreads; i++) {
            ingesters[i] = new Thread() {
                @Override
                public void run() {
                    try {
                        for (int j = 0; j < numDocs; j++) {
                            Document d = new Document();
                            d.add(new Field(field, new PoolingPayloadTokenStream(pool)));
                            writer.addDocument(d);
                        }
                    } catch (Exception e) {
                        e.printStackTrace();
                        fail(e.toString());
                    }
                }
            };
            ingesters[i].start();
        }

        for (int i = 0; i < numThreads; i++) {
            ingesters[i].join();
        }
        writer.close();

        IndexReader reader = IndexReader.open(dir, true);
        TermEnum terms = reader.terms();
        while (terms.next()) {
            TermPositions tp = reader.termPositions(terms.term());
            while (tp.next()) {
                int freq = tp.freq();
                for (int i = 0; i < freq; i++) {
                    tp.nextPosition();
                    byte[] payload = new byte[5];
                    tp.getPayload(payload, 0);
                    assertEquals(terms.term().text, new String(payload, 0, payload.length, "UTF-8"));
                }
            }
            tp.close();
        }
        terms.close();
        reader.close();

        assertEquals(pool.size(), numThreads);
        dir.close();
    }
    private class PoolingPayloadTokenStream extends TokenStream {
        private byte[] payload;
        private boolean first;
        private ByteArrayPool pool;
        private String term;

        CharTermAttribute termAtt;
        PayloadAttribute payloadAtt;

        PoolingPayloadTokenStream(ByteArrayPool pool) {
            this.pool = pool;
            payload = pool.get();
            generateRandomData(payload);
            try {
                term = new String(payload, 0, payload.length, "UTF-8");
            } catch (UnsupportedEncodingException e) {
                throw new RuntimeException(e);
            }
            first = true;
            payloadAtt = addAttribute(PayloadAttribute.class);
            termAtt = addAttribute(CharTermAttribute.class);
        }

        @Override
        public boolean incrementToken() throws IOException {
            // emits exactly one token whose text equals its payload bytes
            if (!first) return false;
            first = false;
            clearAttributes();
            termAtt.append(term);
            payloadAtt.setPayload(new Payload(payload));
            return true;
        }

        @Override
        public void close() throws IOException {
            pool.release(payload);
        }
    }
    private static class ByteArrayPool {
        private List<byte[]> pool;

        ByteArrayPool(int capacity, int size) {
            pool = new ArrayList<byte[]>();
            for (int i = 0; i < capacity; i++) {
                pool.add(new byte[size]);
            }
        }

        synchronized byte[] get() {
            return pool.remove(0);
        }

        synchronized void release(byte[] b) {
            pool.add(b);
        }

        synchronized int size() {
            return pool.size();
        }
    }
    public void testAcrossFields() throws Exception {
        Directory dir = newDirectory();
        RandomIndexWriter writer = new RandomIndexWriter(random, dir,
            new MockAnalyzer(random, MockTokenizer.WHITESPACE, true));
        Document doc = new Document();
        doc.add(new Field("hasMaybepayload", "here we go", Field.Store.YES, Field.Index.ANALYZED));
        writer.addDocument(doc);
        writer.close();

        writer = new RandomIndexWriter(random, dir,
            new MockAnalyzer(random, MockTokenizer.WHITESPACE, true));
        doc = new Document();
        doc.add(new Field("hasMaybepayload2", "here we go", Field.Store.YES, Field.Index.ANALYZED));
        writer.addDocument(doc);
        writer.addDocument(doc);
        writer.forceMerge(1);
        writer.close();

        dir.close();
    }
}