1
package org.apache.lucene.misc;
4
* Licensed to the Apache Software Foundation (ASF) under one or more
5
* contributor license agreements. See the NOTICE file distributed with
6
* this work for additional information regarding copyright ownership.
7
* The ASF licenses this file to You under the Apache License, Version 2.0
8
* (the "License"); you may not use this file except in compliance with
9
* the License. You may obtain a copy of the License at
11
* http://www.apache.org/licenses/LICENSE-2.0
13
* Unless required by applicable law or agreed to in writing, software
14
* distributed under the License is distributed on an "AS IS" BASIS,
15
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
* See the License for the specific language governing permissions and
17
* limitations under the License.
20
import org.apache.lucene.index.IndexReader;
21
import org.apache.lucene.index.Term;
22
import org.apache.lucene.index.TermDocs;
23
import org.apache.lucene.index.TermEnum;
24
import org.apache.lucene.store.FSDirectory;
25
import org.apache.lucene.util.PriorityQueue;
26
import java.util.Arrays;
27
import java.util.Comparator;
32
* <code>HighFreqTerms</code> class extracts the top n most frequent terms
33
* (by document frequency) from an existing Lucene index and reports their
34
* document frequency. If used with the -t flag it also reports their
35
* total tf (total number of occurrences) in order of highest total tf
37
public class HighFreqTerms {
39
// Default number of top terms to display; used as the initial value of numTerms.
40
public static final int DEFAULTnumTerms = 100;
41
// Number of top terms to display. main() overwrites it when a command-line
// argument parses as an integer.
public static int numTerms = DEFAULTnumTerms;
43
// Command-line entry point. Opens the index directory named by args[0], scans the
// remaining arguments for the "-t" flag and an integer term count, collects the
// terms via getHighFreqTerms and prints them with System.out.printf. When "-t" was
// given, the terms are first passed through sortByTotalTermFreq and the total term
// frequency is printed alongside the document frequency.
public static void main(String[] args) throws Exception {
44
IndexReader reader = null;
45
FSDirectory dir = null;
47
// Becomes true when "-t" appears among the arguments; selects the total-tf report.
boolean IncludeTermFreqs = false;
49
// Argument counts of zero or more than four are singled out here.
// NOTE(review): the body of this branch is not visible - presumably it shows usage; confirm.
if (args.length == 0 || args.length > 4) {
54
if (args.length > 0) {
55
// The first argument is the path of the index directory.
dir = FSDirectory.open(new File(args[0]));
58
// Examine every argument after the index path.
for (int i = 1; i < args.length; i++) {
59
if (args[i].equals("-t")) {
60
IncludeTermFreqs = true;
64
// Try the argument as the number of terms to report.
numTerms = Integer.parseInt(args[i]);
65
// NOTE(review): the handler body is not visible - presumably a non-numeric
// argument is taken as the field name; confirm.
} catch (NumberFormatException e) {
72
// NOTE(review): the second argument (true) presumably requests a read-only
// reader - confirm against the IndexReader API in use.
reader = IndexReader.open(dir, true);
73
// NOTE(review): the declaration of `field` is not among the visible lines.
TermStats[] terms = getHighFreqTerms(reader, numTerms, field);
75
* Insert logic so it will only lookup totaltf if right arg
76
* also change names as in flex
78
if (!IncludeTermFreqs) {
79
// Default HighFreqTerms behavior: print each term with its document frequency.
80
for (int i = 0; i < terms.length; i++) {
81
System.out.printf("%s %,d \n",
82
terms[i].term, terms[i].docFreq);
86
// Total-tf report: compute total term frequencies for the same terms, then
// print term, total tf and document frequency for each entry.
TermStats[] termsWithTF = sortByTotalTermFreq(reader, terms);
87
for (int i = 0; i < termsWithTF.length; i++) {
88
System.out.printf("%s \t total_tf = %,d \t doc freq = %,d \n",
89
termsWithTF[i].term, termsWithTF[i].totalTermFreq, termsWithTF[i].docFreq);
96
// Command-line usage text: index directory, optional -t flag, optional number of
// terms and optional field name.
// NOTE(review): the statement that outputs this text is not among the visible lines.
private static void usage() {
99
+ "java org.apache.lucene.misc.HighFreqTerms <index dir> [-t] [number_terms] [field]\n\t -t: include totalTermFreq\n\n");
107
* @return TermStats[] ordered by terms with highest docFreq first.
110
// Collects term statistics from the reader. Every enumerated term is wrapped in a
// TermStats (term plus docFreq) and offered to a TermInfoWiTFQueue constructed with
// numTerms; the queue is then drained into the result array from the last slot
// backwards.
public static TermStats[] getHighFreqTerms(IndexReader reader,
111
int numTerms, String field) throws Exception {
113
// Queue that orders its entries by docFreq (see TermInfoWiTFQueue.lessThan).
TermInfoWiTFQueue tiq = new TermInfoWiTFQueue(numTerms);
115
// Field-specific path: start the enumeration at a Term built from the field name alone.
// NOTE(review): the condition guarding this path is not visible - presumably field != null.
TermEnum terms = reader.terms(new Term(field));
116
if (terms != null && terms.term() != null) {
118
// A term belonging to a different field is detected here.
// NOTE(review): the branch body is not visible - presumably it ends the scan; confirm.
if (!terms.term().field().equals(field)) {
121
tiq.insertWithOverflow(new TermStats(terms.term(), terms.docFreq()));
122
} while (terms.next());
124
// Reported when the enumeration offers no term for the requested field.
System.out.println("No terms for field \"" + field + "\"");
127
// Other path: enumerate all terms of the reader without a starting term.
TermEnum terms = reader.terms();
128
while (terms.next()) {
129
tiq.insertWithOverflow(new TermStats(terms.term(), terms.docFreq()));
133
// The result holds exactly as many entries as the queue currently contains.
TermStats[] result = new TermStats[tiq.size()];
135
// We want the highest first, so we read the queue and populate the array
136
// starting at the end and working backwards.
137
int count = tiq.size() - 1;
138
while (tiq.size() != 0) {
139
result[count] = tiq.pop();
146
* Takes array of TermStats. For each term looks up the tf for each doc
147
* containing the term and stores the total in the output array of TermStats.
148
* Output array is sorted by highest total tf.
153
* @return TermStats[]
157
// Builds a new array with one fresh TermStats per input entry, carrying the same
// term and docFreq plus the total term frequency obtained from getTotalTermFreq.
// The input TermStats objects are not modified by the visible lines.
// NOTE(review): the sort call and the return statement are not among the visible
// lines - presumably the new array is sorted with the comparator created below.
public static TermStats[] sortByTotalTermFreq(IndexReader reader, TermStats[] terms) throws Exception {
158
TermStats[] ts = new TermStats[terms.length]; // array for sorting
160
for (int i = 0; i < terms.length; i++) {
161
// One getTotalTermFreq lookup per term.
totalTF = getTotalTermFreq(reader, terms[i].term);
162
ts[i] = new TermStats( terms[i].term, terms[i].docFreq, totalTF);
165
// Comparator intended for descending totalTermFreq order (per its class name).
Comparator<TermStats> c = new TotalTermFreqComparatorSortDescending();
171
// Accumulates td.freq() from the TermDocs obtained for the given term into totalTF.
// NOTE(review): the loop header and the return statement are not among the visible
// lines - presumably the sum runs over every document returned by td.next().
// NOTE(review): no td.close() appears in the visible lines - confirm whether the
// TermDocs should be closed once the sum is complete.
public static long getTotalTermFreq(IndexReader reader, Term term) throws Exception {
173
TermDocs td = reader.termDocs(term);
175
totalTF += td.freq();
182
// Holder for the statistics of one term. Callers in this file read the fields
// term, docFreq and totalTermFreq.
// NOTE(review): only the totalTermFreq declaration is among the visible lines.
final class TermStats {
185
// Total term frequency; assigned by the three-argument constructor.
public long totalTermFreq;
187
// Creates an entry from a term and its document frequency.
// NOTE(review): the constructor body is not visible here.
public TermStats(Term t, int df) {
192
// Creates an entry from a term, its document frequency and its total term frequency.
public TermStats(Term t, int df, long tf) {
195
this.totalTermFreq = tf;
201
* Priority queue for TermStats objects ordered by TermStats.docFreq
203
// PriorityQueue of TermStats whose ordering is defined by docFreq.
final class TermInfoWiTFQueue extends PriorityQueue<TermStats> {
204
// NOTE(review): the constructor body is not visible - presumably it initializes
// the queue with the given size; confirm.
TermInfoWiTFQueue(int size) {
209
// An entry is "less than" another when its docFreq is strictly smaller.
protected boolean lessThan(TermStats termInfoA,
210
TermStats termInfoB) {
211
return termInfoA.docFreq < termInfoB.docFreq;
218
* Reverse of the normal Comparator ordering, i.e. returns 1 if a.totalTermFreq is less than
219
* b.totalTermFreq So we can sort in descending order of totalTermFreq
221
final class TotalTermFreqComparatorSortDescending implements Comparator<TermStats> {
223
public int compare(TermStats a, TermStats b) {
224
if (a.totalTermFreq < b.totalTermFreq) {
226
} else if (a.totalTermFreq > b.totalTermFreq) {