package org.apache.lucene.search.highlight;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Iterator;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.PriorityQueue;
32
* Class used to markup highlighted terms found in the best sections of a
33
* text, using configurable {@link Fragmenter}, {@link Scorer}, {@link Formatter},
34
* {@link Encoder} and tokenizers.
36
public class Highlighter
38
public static final int DEFAULT_MAX_CHARS_TO_ANALYZE = 50*1024;
40
private int maxDocCharsToAnalyze = DEFAULT_MAX_CHARS_TO_ANALYZE;
41
private Formatter formatter;
42
private Encoder encoder;
43
private Fragmenter textFragmenter=new SimpleFragmenter();
44
private Scorer fragmentScorer=null;
46
public Highlighter(Scorer fragmentScorer)
48
this(new SimpleHTMLFormatter(),fragmentScorer);
52
public Highlighter(Formatter formatter, Scorer fragmentScorer)
54
this(formatter,new DefaultEncoder(),fragmentScorer);
58
public Highlighter(Formatter formatter, Encoder encoder, Scorer fragmentScorer)
60
this.formatter = formatter;
61
this.encoder = encoder;
62
this.fragmentScorer = fragmentScorer;
66
* Highlights chosen terms in a text, extracting the most relevant section.
67
* This is a convenience method that calls
68
* {@link #getBestFragment(TokenStream, String)}
70
* @param analyzer the analyzer that will be used to split <code>text</code>
72
* @param text text to highlight terms in
73
* @param fieldName Name of field used to influence analyzer's tokenization policy
75
* @return highlighted text fragment or null if no terms found
76
* @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
78
public final String getBestFragment(Analyzer analyzer, String fieldName,String text)
79
throws IOException, InvalidTokenOffsetsException
81
TokenStream tokenStream = analyzer.reusableTokenStream(fieldName, new StringReader(text));
82
return getBestFragment(tokenStream, text);
86
* Highlights chosen terms in a text, extracting the most relevant section.
87
* The document text is analysed in chunks to record hit statistics
88
* across the document. After accumulating stats, the fragment with the highest score
91
* @param tokenStream a stream of tokens identified in the text parameter, including offset information.
92
* This is typically produced by an analyzer re-parsing a document's
93
* text. Some work may be done on retrieving TokenStreams more efficiently
94
* by adding support for storing original text position data in the Lucene
95
* index but this support is not currently available (as of Lucene 1.4 rc2).
96
* @param text text to highlight terms in
98
* @return highlighted text fragment or null if no terms found
99
* @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
101
public final String getBestFragment(TokenStream tokenStream, String text)
102
throws IOException, InvalidTokenOffsetsException
104
String[] results = getBestFragments(tokenStream,text, 1);
105
if (results.length > 0)
113
* Highlights chosen terms in a text, extracting the most relevant sections.
114
* This is a convenience method that calls
115
* {@link #getBestFragments(TokenStream, String, int)}
117
* @param analyzer the analyzer that will be used to split <code>text</code>
119
* @param fieldName the name of the field being highlighted (used by analyzer)
120
* @param text text to highlight terms in
121
* @param maxNumFragments the maximum number of fragments.
123
* @return highlighted text fragments (between 0 and maxNumFragments number of fragments)
124
* @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
126
public final String[] getBestFragments(
131
throws IOException, InvalidTokenOffsetsException
133
TokenStream tokenStream = analyzer.reusableTokenStream(fieldName, new StringReader(text));
134
return getBestFragments(tokenStream, text, maxNumFragments);
138
* Highlights chosen terms in a text, extracting the most relevant sections.
139
* The document text is analysed in chunks to record hit statistics
140
* across the document. After accumulating stats, the fragments with the highest scores
141
* are returned as an array of strings in order of score (contiguous fragments are merged into
142
* one in their original order to improve readability)
144
* @param text text to highlight terms in
145
* @param maxNumFragments the maximum number of fragments.
147
* @return highlighted text fragments (between 0 and maxNumFragments number of fragments)
148
* @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
150
public final String[] getBestFragments(
151
TokenStream tokenStream,
154
throws IOException, InvalidTokenOffsetsException
156
maxNumFragments = Math.max(1, maxNumFragments); //sanity check
158
TextFragment[] frag =getBestTextFragments(tokenStream,text, true,maxNumFragments);
161
ArrayList<String> fragTexts = new ArrayList<String>();
162
for (int i = 0; i < frag.length; i++)
164
if ((frag[i] != null) && (frag[i].getScore() > 0))
166
fragTexts.add(frag[i].toString());
169
return fragTexts.toArray(new String[0]);
174
* Low level api to get the most relevant (formatted) sections of the document.
175
* This method has been made public to allow visibility of score information held in TextFragment objects.
176
* Thanks to Jason Calabrese for help in redefining the interface.
179
* @param maxNumFragments
180
* @param mergeContiguousFragments
181
* @throws IOException
182
* @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
184
public final TextFragment[] getBestTextFragments(
185
TokenStream tokenStream,
187
boolean mergeContiguousFragments,
189
throws IOException, InvalidTokenOffsetsException
191
ArrayList<TextFragment> docFrags = new ArrayList<TextFragment>();
192
StringBuilder newText=new StringBuilder();
194
CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
195
OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
196
tokenStream.addAttribute(PositionIncrementAttribute.class);
199
TextFragment currentFrag = new TextFragment(newText,newText.length(), docFrags.size());
201
if (fragmentScorer instanceof QueryScorer) {
202
((QueryScorer) fragmentScorer).setMaxDocCharsToAnalyze(maxDocCharsToAnalyze);
205
TokenStream newStream = fragmentScorer.init(tokenStream);
206
if(newStream != null) {
207
tokenStream = newStream;
209
fragmentScorer.startFragment(currentFrag);
210
docFrags.add(currentFrag);
212
FragmentQueue fragQueue = new FragmentQueue(maxNumFragments);
220
int lastEndOffset = 0;
221
textFragmenter.start(text, tokenStream);
223
TokenGroup tokenGroup=new TokenGroup(tokenStream);
225
for (boolean next = tokenStream.incrementToken(); next && (offsetAtt.startOffset()< maxDocCharsToAnalyze);
226
next = tokenStream.incrementToken())
228
if( (offsetAtt.endOffset()>text.length())
230
(offsetAtt.startOffset()>text.length())
233
throw new InvalidTokenOffsetsException("Token "+ termAtt.toString()
234
+" exceeds length of provided text sized "+text.length());
236
if((tokenGroup.numTokens>0)&&(tokenGroup.isDistinct()))
238
//the current token is distinct from previous tokens -
239
// markup the cached token group info
240
startOffset = tokenGroup.matchStartOffset;
241
endOffset = tokenGroup.matchEndOffset;
242
tokenText = text.substring(startOffset, endOffset);
243
String markedUpText=formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
244
//store any whitespace etc from between this and last group
245
if (startOffset > lastEndOffset)
246
newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));
247
newText.append(markedUpText);
248
lastEndOffset=Math.max(endOffset, lastEndOffset);
251
//check if current token marks the start of a new fragment
252
if(textFragmenter.isNewFragment())
254
currentFrag.setScore(fragmentScorer.getFragmentScore());
255
//record stats for a new fragment
256
currentFrag.textEndPos = newText.length();
257
currentFrag =new TextFragment(newText, newText.length(), docFrags.size());
258
fragmentScorer.startFragment(currentFrag);
259
docFrags.add(currentFrag);
263
tokenGroup.addToken(fragmentScorer.getTokenScore());
265
// if(lastEndOffset>maxDocBytesToAnalyze)
270
currentFrag.setScore(fragmentScorer.getFragmentScore());
272
if(tokenGroup.numTokens>0)
274
//flush the accumulated text (same code as in above loop)
275
startOffset = tokenGroup.matchStartOffset;
276
endOffset = tokenGroup.matchEndOffset;
277
tokenText = text.substring(startOffset, endOffset);
278
String markedUpText=formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
279
//store any whitespace etc from between this and last group
280
if (startOffset > lastEndOffset)
281
newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));
282
newText.append(markedUpText);
283
lastEndOffset=Math.max(lastEndOffset,endOffset);
286
//Test what remains of the original text beyond the point where we stopped analyzing
288
// if there is text beyond the last token considered..
289
(lastEndOffset < text.length())
291
// and that text is not too large...
292
(text.length()<= maxDocCharsToAnalyze)
295
//append it to the last fragment
296
newText.append(encoder.encodeText(text.substring(lastEndOffset)));
299
currentFrag.textEndPos = newText.length();
301
//sort the most relevant sections of the text
302
for (Iterator<TextFragment> i = docFrags.iterator(); i.hasNext();)
304
currentFrag = i.next();
306
//If you are running with a version of Lucene before 11th Sept 03
307
// you do not have PriorityQueue.insert() - so uncomment the code below
309
if (currentFrag.getScore() >= minScore)
311
fragQueue.put(currentFrag);
312
if (fragQueue.size() > maxNumFragments)
313
{ // if hit queue overfull
314
fragQueue.pop(); // remove lowest in hit queue
315
minScore = ((TextFragment) fragQueue.top()).getScore(); // reset minScore
321
//The above code caused a problem as a result of Christoph Goller's 11th Sept 03
322
//fix to PriorityQueue. The correct method to use here is the new "insert" method
323
// USE ABOVE CODE IF THIS DOES NOT COMPILE!
324
fragQueue.insertWithOverflow(currentFrag);
327
//return the most relevant fragments
328
TextFragment frag[] = new TextFragment[fragQueue.size()];
329
for (int i = frag.length - 1; i >= 0; i--)
331
frag[i] = fragQueue.pop();
334
//merge any contiguous fragments to improve readability
335
if(mergeContiguousFragments)
337
mergeContiguousFragments(frag);
338
ArrayList<TextFragment> fragTexts = new ArrayList<TextFragment>();
339
for (int i = 0; i < frag.length; i++)
341
if ((frag[i] != null) && (frag[i].getScore() > 0))
343
fragTexts.add(frag[i]);
346
frag= fragTexts.toArray(new TextFragment[0]);
354
if (tokenStream != null)
369
/** Improves readability of a score-sorted list of TextFragments by merging any fragments
370
* that were contiguous in the original text into one larger fragment with the correct order.
371
* This will leave a "null" in the array entry for the lesser scored fragment.
373
* @param frag An array of document fragments in descending score
375
private void mergeContiguousFragments(TextFragment[] frag)
377
boolean mergingStillBeingDone;
381
mergingStillBeingDone = false; //initialise loop control flag
382
//for each fragment, scan other frags looking for contiguous blocks
383
for (int i = 0; i < frag.length; i++)
389
//merge any contiguous blocks
390
for (int x = 0; x < frag.length; x++)
400
TextFragment frag1 = null;
401
TextFragment frag2 = null;
404
int bestScoringFragNum;
405
int worstScoringFragNum;
406
//if blocks are contiguous....
407
if (frag[i].follows(frag[x]))
415
if (frag[x].follows(frag[i]))
425
if (frag1.getScore() > frag2.getScore())
427
bestScoringFragNum = frag1Num;
428
worstScoringFragNum = frag2Num;
432
bestScoringFragNum = frag2Num;
433
worstScoringFragNum = frag1Num;
436
frag[worstScoringFragNum] = null;
437
mergingStillBeingDone = true;
438
frag[bestScoringFragNum] = frag1;
443
while (mergingStillBeingDone);
448
* Highlights terms in the text , extracting the most relevant sections
449
* and concatenating the chosen fragments with a separator (typically "...").
450
* The document text is analysed in chunks to record hit statistics
451
* across the document. After accumulating stats, the fragments with the highest scores
452
* are returned in order as "separator" delimited strings.
454
* @param text text to highlight terms in
455
* @param maxNumFragments the maximum number of fragments.
456
* @param separator the separator used to intersperse the document fragments (typically "...")
458
* @return highlighted text
459
* @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
461
public final String getBestFragments(
462
TokenStream tokenStream,
466
throws IOException, InvalidTokenOffsetsException
468
String sections[] = getBestFragments(tokenStream,text, maxNumFragments);
469
StringBuilder result = new StringBuilder();
470
for (int i = 0; i < sections.length; i++)
474
result.append(separator);
476
result.append(sections[i]);
478
return result.toString();
481
public int getMaxDocCharsToAnalyze() {
482
return maxDocCharsToAnalyze;
485
public void setMaxDocCharsToAnalyze(int maxDocCharsToAnalyze) {
486
this.maxDocCharsToAnalyze = maxDocCharsToAnalyze;
490
public Fragmenter getTextFragmenter()
492
return textFragmenter;
498
public void setTextFragmenter(Fragmenter fragmenter)
500
textFragmenter = fragmenter;
504
* @return Object used to score each text fragment
506
public Scorer getFragmentScorer()
508
return fragmentScorer;
515
public void setFragmentScorer(Scorer scorer)
517
fragmentScorer = scorer;
520
public Encoder getEncoder()
524
public void setEncoder(Encoder encoder)
526
this.encoder = encoder;
529
class FragmentQueue extends PriorityQueue<TextFragment>
531
public FragmentQueue(int size)
537
public final boolean lessThan(TextFragment fragA, TextFragment fragB)
539
if (fragA.getScore() == fragB.getScore())
540
return fragA.fragNum > fragB.fragNum;
542
return fragA.getScore() < fragB.getScore();