1
package org.apache.lucene.index;
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import java.io.IOException;
21
import java.util.ArrayList;
22
import java.util.Arrays;
23
import java.util.Collections;
24
import java.util.HashSet;
25
import java.util.List;
28
import org.apache.lucene.analysis.MockAnalyzer;
29
import org.apache.lucene.document.Document;
30
import org.apache.lucene.document.Field;
31
import org.apache.lucene.store.Directory;
32
import org.apache.lucene.util.BytesRef;
33
import org.apache.lucene.util.LineFileDocs;
34
import org.apache.lucene.util.LuceneTestCase;
35
import org.apache.lucene.util._TestUtil;
37
public class TestTermsEnum extends LuceneTestCase {
39
/**
 * Cross-checks TermEnum stepping and seeking on the "body" field against a
 * list of terms gathered up front.
 *
 * Adds atLeast(10) documents from LineFileDocs through a RandomIndexWriter,
 * walks r.terms(new Term("body")) while the enum stays inside "body", then
 * runs atLeast(200) iterations. Each iteration either advances the current
 * enum (logged as "iter next") or opens a new enum with r.terms(target)
 * (logged as "iter seekCeil"), and compares the enum's term with the entry
 * of the terms list at the expected position.
 *
 * NOTE(review): several statements of this method (declarations of upto,
 * isEnd, target and exists, and the calls that fill the terms list) are not
 * present in the lines shown here; confirm details against the full file.
 *
 * @throws Exception if indexing or reading fails
 */
public void test() throws Exception {
40
final LineFileDocs docs = new LineFileDocs(random);
41
final Directory d = newDirectory();
42
final RandomIndexWriter w = new RandomIndexWriter(random, d);
43
final int numDocs = atLeast(10);
44
// Index numDocs documents taken from the line-file source.
for(int docCount=0;docCount<numDocs;docCount++) {
45
w.addDocument(docs.nextDoc());
47
// Reader obtained from the writer; every check below goes through it.
final IndexReader r = w.getReader();
50
// Expected terms of the "body" field, used as the reference for all asserts.
final List<Term> terms = new ArrayList<Term>();
51
TermEnum termEnum = r.terms(new Term("body"));
53
Term term = termEnum.term();
54
// An exhausted enum, or one that has moved to another field, marks the end
// of the "body" terms (presumably this ends the collection loop).
if (term == null || !"body".equals(term.field())) {
58
} while (termEnum.next());
61
System.out.println("TEST: " + terms.size() + " terms");
65
final int iters = atLeast(200);
66
for(int iter=0;iter<iters;iter++) {
68
// Stepping is only possible once a position exists (upto != -1); otherwise,
// or on the other coin flip, the iteration seeks instead.
if (upto != -1 && random.nextBoolean()) {
71
System.out.println("TEST: iter next");
74
// The enum is at its end when it has no term or has left the "body" field.
isEnd = termEnum.term() == null || !"body".equals(termEnum.term().field());
78
System.out.println(" end");
80
// At the end, the expected position must be one past the last known term.
assertEquals(upto, terms.size());
84
System.out.println(" got term=" + termEnum.term() + " expected=" + terms.get(upto));
86
assertTrue(upto < terms.size());
87
assertEquals(terms.get(upto), termEnum.term());
93
// Seek path: choose either a random string (simple or realistic unicode)
// that is probably not indexed, or a term known to exist.
if (random.nextBoolean()) {
95
if (random.nextBoolean()) {
96
target = new Term("body",
97
_TestUtil.randomSimpleString(random));
99
target = new Term("body",
100
_TestUtil.randomRealisticUnicodeString(random));
102
exists = "likely not";
105
target = terms.get(random.nextInt(terms.size()));
109
// Position of target in the sorted reference list. Collections.binarySearch
// yields -(insertion point) - 1 when target is absent; the conversion of
// that value is not visible here.
upto = Collections.binarySearch(terms, target);
112
System.out.println("TEST: iter seekCeil target=" + target + " exists=" + exists);
114
// Opening a new enum at target plays the role of a seek.
termEnum = r.terms(target);
115
final Term actualTerm = termEnum.term();
118
System.out.println(" got term=" + actualTerm);
123
// Target sorts after every known term: the enum must be exhausted or must
// already be outside the "body" field.
if (upto >= terms.size()) {
124
assertTrue(actualTerm == null || !"body".equals(actualTerm.field()));
127
// Otherwise the enum must sit on the expected "body" term.
assertTrue(actualTerm != null && "body".equals(actualTerm.field()));
128
assertEquals(terms.get(upto), actualTerm);
131
assertEquals(terms.get(upto), actualTerm);
141
// Reader shared by the block-oriented tests; assigned from makeIndex(...) in
// testEasy, testFloorBlocks and testRandomTerms, and read by close().
private IndexReader r;
143
// Name of the field that makeIndex populates and that docFreq and
// testRandomSeeks query.
private final String FIELD = "field";
145
/**
 * Builds a test index with one document per entry of terms.
 *
 * Visible steps: create an IndexWriterConfig with a MockAnalyzer, install a
 * CoreCodecProvider whose "Standard" codec is replaced by a StandardCodec
 * built from minTermsInBlock/maxTermsInBlock, open a RandomIndexWriter on d
 * with that config, and create an un-analyzed, unstored FIELD value for each
 * term.
 *
 * NOTE(review): d, minTermsInBlock and maxTermsInBlock are not declared in
 * the lines shown here, and neither the addDocument call nor the return
 * statement is visible; confirm against the full file.
 *
 * @param terms term texts; a Document is created for each one
 * @return presumably a reader over the finished index (callers assign the
 *         result to the field r) - TODO confirm
 * @throws Exception if index creation fails
 */
private IndexReader makeIndex(String... terms) throws Exception {
147
IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random));
150
// Swap the registered "Standard" codec for one with explicit block sizes and
// make it the default for all fields.
CoreCodecProvider cp = new CoreCodecProvider();
151
cp.unregister(cp.lookup("Standard"));
152
cp.register(new StandardCodec(minTermsInBlock, maxTermsInBlock));
153
cp.setDefaultFieldCodec("Standard");
154
iwc.setCodecProvider(cp);
157
final RandomIndexWriter w = new RandomIndexWriter(random, d, iwc);
158
// Writer diagnostics go to stdout only in verbose runs.
w.w.setInfoStream(VERBOSE ? System.out : null);
159
for(String term : terms) {
160
Document doc = new Document();
161
// NOT_ANALYZED keeps the whole string as a single term; nothing is stored.
Field f = newField(FIELD, term, Field.Store.NO, Field.Index.NOT_ANALYZED);
173
/**
 * Releases the resources behind the shared reader r.
 *
 * Only the first statement is visible here: it recovers the Directory from
 * the first sequential sub-reader of r, which is cast to SegmentReader.
 * NOTE(review): presumably r and that directory are closed afterwards -
 * confirm against the full file.
 *
 * @throws Exception if releasing the resources fails
 */
private void close() throws Exception {
174
final Directory d = ((SegmentReader) r.getSequentialSubReaders()[0]).directory();
179
/**
 * Looks up the document frequency of a term of FIELD.
 *
 * @param r    reader to query
 * @param term term text, looked up in the field named by FIELD
 * @return the value of r.docFreq for that term
 * @throws Exception if the lookup fails
 */
private int docFreq(IndexReader r, String term) throws Exception {
180
return r.docFreq(new Term(FIELD, term));
183
/**
 * Exercises docFreq lookups in varying order over a small index holding the
 * terms aa0..aa3, bb0..bb3 and aa.
 *
 * Every indexed term must report a docFreq of 1 and every absent term
 * (aa5, bb5, b0) a docFreq of 0, whatever lookup preceded it. The same
 * sequence is run first for the "aa" terms and then for the "bb" terms.
 *
 * @throws Exception if index creation or a lookup fails
 */
public void testEasy() throws Exception {
185
r = makeIndex("aa0", "aa1", "aa2", "aa3", "bb0", "bb1", "bb2", "bb3", "aa");
187
// First term in block:
188
assertEquals(1, docFreq(r, "aa0"));
190
// Scan forward to another term in same block
191
assertEquals(1, docFreq(r, "aa2"));
193
// "aa" is both an indexed term and the prefix of the terms looked up above.
assertEquals(1, docFreq(r, "aa"));
195
// Reset same block then scan forwards
196
assertEquals(1, docFreq(r, "aa1"));
198
// Not found, in same block
199
assertEquals(0, docFreq(r, "aa5"));
201
// Found, in same block
202
assertEquals(1, docFreq(r, "aa2"));
204
// Not found in index:
205
assertEquals(0, docFreq(r, "b0"));
208
// Lookups of existing terms must still succeed after the miss above.
assertEquals(1, docFreq(r, "aa2"));
211
assertEquals(1, docFreq(r, "aa0"));
214
// Same sequence again, this time over the "bb" terms.
// First term in block:
215
assertEquals(1, docFreq(r, "bb0"));
217
// Scan forward to another term in same block
218
assertEquals(1, docFreq(r, "bb2"));
220
// Reset same block then scan forwards
221
assertEquals(1, docFreq(r, "bb1"));
223
// Not found, in same block
224
assertEquals(0, docFreq(r, "bb5"));
226
// Found, in same block
227
assertEquals(1, docFreq(r, "bb2"));
229
// Not found in index:
230
assertEquals(0, docFreq(r, "b0"));
233
assertEquals(1, docFreq(r, "bb2"));
236
assertEquals(1, docFreq(r, "bb0"));
242
// - test same prefix has non-floor block and floor block (i.e., has 2 long outputs on same term prefix)
243
// - term that's entirely in the index
245
/**
 * Exercises docFreq lookups over an index holding aa0..aa9, aa and xx,
 * jumping between terms in both directions, and finishes with randomized
 * seeks over the same term set.
 *
 * Indexed terms must report a docFreq of 1; the absent terms bb0, a and yy
 * must report 0.
 *
 * @throws Exception if index creation or a lookup fails
 */
public void testFloorBlocks() throws Exception {
246
final String[] terms = new String[] {"aa0", "aa1", "aa2", "aa3", "aa4", "aa5", "aa6", "aa7", "aa8", "aa9", "aa", "xx"};
247
r = makeIndex(terms);
248
//r = makeIndex("aa0", "aa1", "aa2", "aa3", "aa4", "aa5", "aa6", "aa7", "aa8", "aa9");
250
// First term in first block:
251
assertEquals(1, docFreq(r, "aa0"));
252
assertEquals(1, docFreq(r, "aa4"));
255
// Absent term, then the previously found term again.
assertEquals(0, docFreq(r, "bb0"));
258
assertEquals(1, docFreq(r, "aa4"));
260
// Backwards to prior floor block:
261
assertEquals(1, docFreq(r, "aa0"));
263
// Forwards to last floor block:
264
assertEquals(1, docFreq(r, "aa9"));
266
// Alternate between the absent prefix "a" and the indexed term "aa".
assertEquals(0, docFreq(r, "a"));
267
assertEquals(1, docFreq(r, "aa"));
268
assertEquals(0, docFreq(r, "a"));
269
assertEquals(1, docFreq(r, "aa"));
271
// Forwards to last floor block:
272
assertEquals(1, docFreq(r, "xx"));
273
assertEquals(1, docFreq(r, "aa1"));
274
assertEquals(0, docFreq(r, "yy"));
276
// Jump back and forth between "xx" and the "aa" terms.
assertEquals(1, docFreq(r, "xx"));
277
assertEquals(1, docFreq(r, "aa9"));
279
assertEquals(1, docFreq(r, "xx"));
280
assertEquals(1, docFreq(r, "aa4"));
282
// Enum opened at the start of FIELD; the statements that use it are not
// visible in these lines.
final TermEnum te = r.terms(new Term(FIELD));
284
//System.out.println("TEST: next term=" + te.term().utf8ToString());
287
testRandomSeeks(r, terms);
291
/**
 * Checks that enumerating "field" yields no terms of that field when the
 * reader holds exactly one document.
 *
 * After writing and deleting by the term ("field", "one"), the reader must
 * report numDocs == 1 and maxDoc == 1, and an enum opened at the start of
 * "field" must either have no next term or have a next term that belongs to
 * a different field.
 *
 * NOTE(review): d is not declared in the lines shown here, and the
 * addDocument call(s) are not visible; confirm against the full file.
 *
 * @throws Exception if indexing or reading fails
 */
public void testZeroTerms() throws Exception {
293
final RandomIndexWriter w = new RandomIndexWriter(random, d);
294
w.w.setInfoStream(VERBOSE ? System.out : null);
295
Document doc = new Document();
296
doc.add(newField("field", "one two three", Field.Store.NO, Field.Index.ANALYZED));
297
// NOTE(review): doc is reassigned here before any visible addDocument call,
// so the document carrying "field" appears never to reach the writer and
// the delete below would then match nothing - confirm whether an
// addDocument is missing between these two documents.
doc = new Document();
298
doc.add(newField("field2", "one two three", Field.Store.NO, Field.Index.ANALYZED));
301
w.deleteDocuments(new Term("field", "one"));
303
IndexReader r = w.getReader();
305
assertEquals(1, r.numDocs());
306
assertEquals(1, r.maxDoc());
307
TermEnum terms = r.terms(new Term("field"));
309
// Either there is no further term, or the next term is outside "field".
assertTrue(!terms.next() || !"field".equals(terms.term().field()));
315
/**
 * Produces the random term text used by the randomized tests.
 *
 * @return the result of _TestUtil.randomRealisticUnicodeString(random); the
 *         simple-string variant is kept below as a commented-out alternative
 */
private String getRandomString() {
316
//return _TestUtil.randomSimpleString(random);
317
return _TestUtil.randomRealisticUnicodeString(random);
320
/**
 * Indexes a random set of distinct terms and runs randomized seeks over it.
 *
 * The array length is drawn by _TestUtil.nextInt(random, 1, atLeast(1000)).
 * In roughly one run out of ten (and only when there are more than two
 * slots) a leading group of terms shares a common random prefix; the
 * remaining slots are filled with independent random strings. Whether the
 * empty string may be used as a term is decided by a coin flip.
 *
 * NOTE(review): the declaration of prefix and the statements that record a
 * term in seen are not visible in these lines; confirm against the full file.
 *
 * @throws Exception if index creation or seeking fails
 */
public void testRandomTerms() throws Exception {
321
final String[] terms = new String[_TestUtil.nextInt(random, 1, atLeast(1000))];
322
// Terms chosen so far; its size is also used as the next free index in terms.
final Set<String> seen = new HashSet<String>();
324
final boolean allowEmptyString = random.nextBoolean();
326
if (random.nextInt(10) == 7 && terms.length > 2) {
327
// Sometimes add a bunch of terms sharing a longish common prefix:
328
final int numTermsSamePrefix = random.nextInt(terms.length/2);
329
if (numTermsSamePrefix > 0) {
332
prefix = getRandomString();
333
// Prefixes shorter than 5 chars get special handling (body not visible here).
if (prefix.length() < 5) {
339
while(seen.size() < numTermsSamePrefix) {
340
final String t = prefix + getRandomString();
341
if (!seen.contains(t)) {
342
terms[seen.size()] = t;
349
// Fill whatever slots remain with unrelated random terms, skipping
// duplicates and, unless allowed, the empty string.
while(seen.size() < terms.length) {
350
final String t = getRandomString();
351
if (!seen.contains(t) && (allowEmptyString || t.length() != 0)) {
352
terms[seen.size()] = t;
356
r = makeIndex(terms);
357
testRandomSeeks(r, terms);
361
/**
 * Draws random strings until one is found that is not among terms.
 *
 * NOTE(review): the binary search below uses no explicit comparator, while
 * testRandomSeeks sorts and searches the same array with
 * BytesRef.getUTF8SortedAsUTF16Comparator(). If the two orders differ for
 * some input, this search could report an indexed term as absent - confirm
 * whether the comparator should be passed here as well.
 * NOTE(review): the declaration of t, the enclosing loop and the return
 * statement are not visible in these lines.
 *
 * @param terms sorted array of the terms present in the index
 * @return presumably the first candidate for which the search is negative
 */
private BytesRef getNonExistTerm(BytesRef[] terms) {
364
final String ts = getRandomString();
365
t = new BytesRef(ts);
366
// A negative result means the candidate was not found in terms.
if (Arrays.binarySearch(terms, t) < 0) {
372
/**
 * Performs randomized seeks, each followed by a run of next() calls, against
 * r and checks every position against the sorted list of valid terms.
 *
 * The valid terms are converted to BytesRef and sorted with
 * BytesRef.getUTF8SortedAsUTF16Comparator(). Each of the
 * 100*RANDOM_MULTIPLIER iterations picks either a term that is absent
 * (about one time in six) or a random valid term, opens a TermEnum at it via
 * r.terms(targetTerm), verifies the term the enum lands on, and then steps
 * the enum a random number of times, verifying each step.
 *
 * NOTE(review): the declarations of t and loc and several branch headers are
 * not visible in these lines; confirm details against the full file.
 *
 * @param r                reader over an index whose FIELD holds exactly the valid terms
 * @param validTermStrings texts of the terms present in the index
 * @throws IOException if reading from the index fails
 */
private void testRandomSeeks(IndexReader r, String... validTermStrings) throws IOException {
373
final BytesRef[] validTerms = new BytesRef[validTermStrings.length];
374
for(int termIDX=0;termIDX<validTermStrings.length;termIDX++) {
375
validTerms[termIDX] = new BytesRef(validTermStrings[termIDX]);
377
// The same comparator is used for the binary search further down.
Arrays.sort(validTerms, BytesRef.getUTF8SortedAsUTF16Comparator());
379
System.out.println("TEST: " + validTerms.length + " terms:");
380
for(int idx=0;idx<validTerms.length;idx++) {
381
System.out.println(" " + idx + ": " + validTerms[idx]);
385
// Value Arrays.binarySearch returns for a key that sorts after every valid
// term: -(insertion point) - 1 with insertion point == validTerms.length.
final int END_LOC = -validTerms.length-1;
387
for(int iter=0;iter<100*RANDOM_MULTIPLIER;iter++) {
391
if (random.nextInt(6) == 4) {
392
// pick term that doesn't exist:
393
t = getNonExistTerm(validTerms);
395
System.out.println("\nTEST: invalid term=" + t.utf8ToString());
397
// For an absent key this is negative and encodes the insertion point.
loc = Arrays.binarySearch(validTerms, t, BytesRef.getUTF8SortedAsUTF16Comparator());
400
// Otherwise seek to a randomly chosen valid term (t is a copy of it).
loc = random.nextInt(validTerms.length);
401
t = new BytesRef(validTerms[loc]);
403
System.out.println("\nTEST: valid term=" + t.utf8ToString());
406
final Term targetTerm = new Term(FIELD, t.utf8ToString());
409
System.out.println(" seek term=" + targetTerm);
412
// Opening a new enum at targetTerm is how this test performs a seek.
final TermEnum te = r.terms(targetTerm);
413
Term actualTerm = te.term();
415
System.out.println(" got " + actualTerm);
419
// assertEquals(TermsEnum.SeekStatus.FOUND, result);
420
} else if (loc == END_LOC) {
421
// Target sorts after every valid term: nothing may be left in FIELD.
assertTrue(actualTerm == null || !FIELD.equals(actualTerm.field()));
423
// Target is absent but sorts before some valid term: the enum must sit on
// a term of FIELD.
assert loc >= -validTerms.length;
424
assertTrue(actualTerm != null && FIELD.equals(actualTerm.field()));
425
//assertEquals(TermsEnum.SeekStatus.NOT_FOUND, result);
429
assertEquals(targetTerm, actualTerm);
430
} else if (loc == END_LOC) {
434
assertEquals(new Term(FIELD, validTerms[loc].utf8ToString()), actualTerm);
437
// Do a bunch of next's after the seek
438
final int numNext = random.nextInt(validTerms.length);
441
System.out.println("\nTEST: numNext=" + numNext);
444
for(int nextCount=0;nextCount<numNext;nextCount++) {
446
System.out.println("\nTEST: next loc=" + loc + " of " + validTerms.length);
448
boolean result = te.next();
449
actualTerm = te.term();
452
// Stepped past the last valid term: the enum must be exhausted or must have
// left FIELD.
if (loc == validTerms.length) {
454
System.out.println(" actual=null");
457
assertTrue(actualTerm == null || !FIELD.equals(actualTerm.field()));
461
System.out.println(" actual=" + new BytesRef(actualTerm.text()));
464
// Otherwise the enum must be on the valid term at index loc.
assertTrue(actualTerm != null && FIELD.equals(actualTerm.field()));
465
assertEquals(validTerms[loc], new BytesRef(actualTerm.text()));