2
* Licensed to the Apache Software Foundation (ASF) under one or more
3
* contributor license agreements. See the NOTICE file distributed with
4
* this work for additional information regarding copyright ownership.
5
* The ASF licenses this file to You under the Apache License, Version 2.0
6
* (the "License"); you may not use this file except in compliance with
7
* the License. You may obtain a copy of the License at
9
* http://www.apache.org/licenses/LICENSE-2.0
11
* Unless required by applicable law or agreed to in writing, software
12
* distributed under the License is distributed on an "AS IS" BASIS,
13
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
* See the License for the specific language governing permissions and
15
* limitations under the License.
18
package org.apache.lucene.search.spell;
20
import java.io.IOException;
21
import java.util.Iterator;
23
import org.apache.lucene.index.IndexReader;
24
import org.apache.lucene.index.Term;
25
import org.apache.lucene.index.TermEnum;
26
import org.apache.lucene.search.spell.Dictionary;
27
import org.apache.lucene.util.StringHelper;
30
* HighFrequencyDictionary: terms taken from the given field
31
* of a Lucene index, which appear in a number of documents
32
* above a given threshold.
34
* When using IndexReader.terms(Term) the code must not call next() on TermEnum
35
* as the first call to TermEnum, see: http://issues.apache.org/jira/browse/LUCENE-6
37
* Threshold is a value in [0..1] representing the minimum
38
* fraction of documents (of the total) in which a term should appear.
40
* Based on LuceneDictionary.
42
public class HighFrequencyDictionary implements Dictionary {
43
private IndexReader reader;
47
/**
 * Creates a dictionary over the terms of one field of an index.
 *
 * NOTE(review): only the assignment of the field name is visible in this
 * view; the assignments of reader and thresh are presumably on lines that
 * are missing here -- confirm against the complete file.
 *
 * @param reader index reader; the iterator calls terms(), numDocs() and docFreq() on a reader
 * @param field name of the field whose terms are enumerated
 * @param thresh the iterator multiplies a thresh value by reader.numDocs() to get the minimum document frequency
 */
public HighFrequencyDictionary(IndexReader reader, String field, float thresh) {
49
// Interned so that hasNext() can compare field names by reference (!=) instead of equals().
this.field = StringHelper.intern(field);
53
/**
 * Returns an iterator over the words of this dictionary.
 *
 * @return a newly constructed HighFrequencyIterator on every call
 */
public final Iterator<String> getWordsIterator() {
54
return new HighFrequencyIterator();
57
/**
 * Iterator over the terms of the enclosing dictionary's field. hasNext()
 * tests each term with isFrequent() before accepting it.
 */
final class HighFrequencyIterator implements TermFreqIterator {
58
// Term enumeration obtained from reader.terms(...) in the constructor.
private TermEnum termEnum;
59
// Term most recently read from termEnum by hasNext(); null means there is no current term.
private Term actualTerm;
60
// Document frequency read from termEnum alongside actualTerm.
private int actualFreq;
61
// Set to true by hasNext() and reset to false by next().
private boolean hasNextCalled;
62
// Minimum document frequency: thresh * reader.numDocs(), truncated to int in the constructor.
private int minNumDocs;
64
/**
 * Opens the term enumeration for the dictionary field and computes the
 * minimum document frequency.
 *
 * NOTE(review): the opening "try {" for the catch clause below is not
 * visible in this view.
 *
 * @throws RuntimeException wrapping any IOException caught below
 */
HighFrequencyIterator() {
66
// Requested with an empty term text for the field. Per the class comment,
// next() must not be the first call made on the returned TermEnum (LUCENE-6).
termEnum = reader.terms(new Term(field, ""));
67
// The (int) cast truncates the product towards zero.
minNumDocs = (int)(thresh * (float)reader.numDocs());
68
} catch (IOException e) {
69
// Rethrown unchecked, keeping the original exception as the cause.
throw new RuntimeException(e);
73
/**
 * Tells whether a term reaches the minimum document frequency.
 *
 * NOTE(review): the opening "try {" for the catch clause below is not
 * visible in this view.
 *
 * @param term term to look up in the reader
 * @return true when reader.docFreq(term) is at least minNumDocs
 * @throws RuntimeException wrapping any IOException caught below
 */
private boolean isFrequent(Term term) {
75
// Queries the reader for the frequency; the actualFreq field is not consulted here.
return reader.docFreq(term) >= minNumDocs;
76
} catch (IOException e) {
77
throw new RuntimeException(e);
81
/**
 * Returns the text of the current term.
 *
 * NOTE(review): several lines of this method are not visible in this view,
 * including whatever runs before the flag reset and the call guarded by the
 * catch clause -- confirm against the complete file.
 *
 * @return the text of actualTerm, or null when actualTerm is null
 * @throws RuntimeException wrapping any IOException caught below
 */
public String next() {
85
// Clears the flag that hasNext() sets.
hasNextCalled = false;
89
} catch (IOException e) {
90
throw new RuntimeException(e);
93
// A null actualTerm yields a null result rather than an exception.
return (actualTerm != null) ? actualTerm.text() : null;
101
/**
 * Reads the current term from the enumeration and decides whether it can be
 * returned by next().
 *
 * NOTE(review): the conditions, loop structure and several return
 * statements of this method are not visible in this view; the comments
 * below describe only the statements that are shown.
 *
 * @return whether a term is available (actualTerm != null on the first visible return path)
 * @throws RuntimeException wrapping any IOException caught below
 */
public boolean hasNext() {
103
// NOTE(review): presumably guarded by a check of hasNextCalled on a line not shown -- confirm.
return actualTerm != null;
105
// Records that hasNext() has run; next() resets this flag.
hasNextCalled = true;
108
// Snapshot the enumeration's current term and its document frequency.
actualTerm = termEnum.term();
109
actualFreq = termEnum.docFreq();
111
// if there are no words return false
112
if (actualTerm == null) {
116
String currentField = actualTerm.field();
118
// if the next word doesn't have the same field return false
119
// Reference comparison is intentional: the dictionary's field name is interned in the outer constructor.
if (currentField != field) { // intern'd comparison
124
// got a valid term, does it pass the threshold?
125
if (isFrequent(actualTerm)) {
129
// term not up to threshold
132
} catch (IOException e) {
133
// Rethrown unchecked, keeping the original exception as the cause.
throw new RuntimeException(e);
139
/**
 * Removal is not supported by this iterator.
 *
 * @throws UnsupportedOperationException always
 */
public void remove() {
140
throw new UnsupportedOperationException();