~slub.team/goobi-indexserver/3.x

« back to all changes in this revision

Viewing changes to lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java

  • Committer: Sebastian Meyer
  • Date: 2012-08-03 09:12:40 UTC
  • Revision ID: sebastian.meyer@slub-dresden.de-20120803091240-x6861b0vabq1xror
Remove Lucene and Solr source code and add patches instead
Fix Bug #985487: Auto-suggestion for the search interface

Show diffs side-by-side

added

removed

Lines of Context:
1
 
package org.apache.lucene.search.highlight;
2
 
/**
3
 
 * Licensed to the Apache Software Foundation (ASF) under one or more
4
 
 * contributor license agreements.  See the NOTICE file distributed with
5
 
 * this work for additional information regarding copyright ownership.
6
 
 * The ASF licenses this file to You under the Apache License, Version 2.0
7
 
 * (the "License"); you may not use this file except in compliance with
8
 
 * the License.  You may obtain a copy of the License at
9
 
 *
10
 
 *     http://www.apache.org/licenses/LICENSE-2.0
11
 
 *
12
 
 * Unless required by applicable law or agreed to in writing, software
13
 
 * distributed under the License is distributed on an "AS IS" BASIS,
14
 
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
 
 * See the License for the specific language governing permissions and
16
 
 * limitations under the License.
17
 
 */
18
 
 
19
 
import java.io.IOException;
20
 
import java.io.StringReader;
21
 
import java.util.ArrayList;
22
 
import java.util.Iterator;
23
 
 
24
 
import org.apache.lucene.analysis.Analyzer;
25
 
import org.apache.lucene.analysis.TokenStream;
26
 
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
27
 
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
28
 
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
29
 
import org.apache.lucene.util.PriorityQueue;
30
 
 
31
 
/**
32
 
 * Class used to markup highlighted terms found in the best sections of a
33
 
 * text, using configurable {@link Fragmenter}, {@link Scorer}, {@link Formatter},
34
 
 * {@link Encoder} and tokenizers.
35
 
 */
36
 
public class Highlighter
37
 
{
38
 
  public static final int DEFAULT_MAX_CHARS_TO_ANALYZE = 50*1024;
39
 
 
40
 
  private int maxDocCharsToAnalyze = DEFAULT_MAX_CHARS_TO_ANALYZE;
41
 
        private Formatter formatter;
42
 
        private Encoder encoder;
43
 
        private Fragmenter textFragmenter=new SimpleFragmenter();
44
 
        private Scorer fragmentScorer=null;
45
 
 
46
 
        public Highlighter(Scorer fragmentScorer)
47
 
        {
48
 
                this(new SimpleHTMLFormatter(),fragmentScorer);
49
 
        }
50
 
 
51
 
 
52
 
        public Highlighter(Formatter formatter, Scorer fragmentScorer)
53
 
        {
54
 
                this(formatter,new DefaultEncoder(),fragmentScorer);
55
 
        }
56
 
 
57
 
 
58
 
        public Highlighter(Formatter formatter, Encoder encoder, Scorer fragmentScorer)
59
 
        {
60
 
                this.formatter = formatter;
61
 
                this.encoder = encoder;
62
 
                this.fragmentScorer = fragmentScorer;
63
 
        }
64
 
 
65
 
        /**
66
 
         * Highlights chosen terms in a text, extracting the most relevant section.
67
 
         * This is a convenience method that calls
68
 
         * {@link #getBestFragment(TokenStream, String)}
69
 
         *
70
 
         * @param analyzer   the analyzer that will be used to split <code>text</code>
71
 
         * into chunks
72
 
         * @param text text to highlight terms in
73
 
         * @param fieldName Name of field used to influence analyzer's tokenization policy
74
 
         *
75
 
         * @return highlighted text fragment or null if no terms found
76
 
         * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
77
 
         */
78
 
        public final String getBestFragment(Analyzer analyzer, String fieldName,String text)
79
 
                throws IOException, InvalidTokenOffsetsException
80
 
        {
81
 
                TokenStream tokenStream = analyzer.reusableTokenStream(fieldName, new StringReader(text));
82
 
                return getBestFragment(tokenStream, text);
83
 
        }
84
 
 
85
 
        /**
86
 
         * Highlights chosen terms in a text, extracting the most relevant section.
87
 
         * The document text is analysed in chunks to record hit statistics
88
 
         * across the document. After accumulating stats, the fragment with the highest score
89
 
         * is returned
90
 
         *
91
 
         * @param tokenStream   a stream of tokens identified in the text parameter, including offset information.
92
 
         * This is typically produced by an analyzer re-parsing a document's
93
 
         * text. Some work may be done on retrieving TokenStreams more efficiently
94
 
         * by adding support for storing original text position data in the Lucene
95
 
         * index but this support is not currently available (as of Lucene 1.4 rc2).
96
 
         * @param text text to highlight terms in
97
 
         *
98
 
         * @return highlighted text fragment or null if no terms found
99
 
         * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
100
 
         */
101
 
        public final String getBestFragment(TokenStream tokenStream, String text)
102
 
                throws IOException, InvalidTokenOffsetsException
103
 
        {
104
 
                String[] results = getBestFragments(tokenStream,text, 1);
105
 
                if (results.length > 0)
106
 
                {
107
 
                        return results[0];
108
 
                }
109
 
                return null;
110
 
        }
111
 
 
112
 
        /**
113
 
         * Highlights chosen terms in a text, extracting the most relevant sections.
114
 
         * This is a convenience method that calls
115
 
         * {@link #getBestFragments(TokenStream, String, int)}
116
 
         *
117
 
         * @param analyzer   the analyzer that will be used to split <code>text</code>
118
 
         * into chunks
119
 
         * @param fieldName     the name of the field being highlighted (used by analyzer)
120
 
         * @param text          text to highlight terms in
121
 
         * @param maxNumFragments  the maximum number of fragments.
122
 
         *
123
 
         * @return highlighted text fragments (between 0 and maxNumFragments number of fragments)
124
 
         * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
125
 
         */
126
 
        public final String[] getBestFragments(
127
 
                Analyzer analyzer,
128
 
                String fieldName,
129
 
                String text,
130
 
                int maxNumFragments)
131
 
                throws IOException, InvalidTokenOffsetsException
132
 
        {
133
 
                TokenStream tokenStream = analyzer.reusableTokenStream(fieldName, new StringReader(text));
134
 
                return getBestFragments(tokenStream, text, maxNumFragments);
135
 
        }
136
 
 
137
 
        /**
138
 
         * Highlights chosen terms in a text, extracting the most relevant sections.
139
 
         * The document text is analysed in chunks to record hit statistics
140
 
         * across the document. After accumulating stats, the fragments with the highest scores
141
 
         * are returned as an array of strings in order of score (contiguous fragments are merged into
142
 
         * one in their original order to improve readability)
143
 
         *
144
 
         * @param text          text to highlight terms in
145
 
         * @param maxNumFragments  the maximum number of fragments.
146
 
         *
147
 
         * @return highlighted text fragments (between 0 and maxNumFragments number of fragments)
148
 
         * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
149
 
         */
150
 
        public final String[] getBestFragments(
151
 
                TokenStream tokenStream,
152
 
                String text,
153
 
                int maxNumFragments)
154
 
                throws IOException, InvalidTokenOffsetsException
155
 
        {
156
 
                maxNumFragments = Math.max(1, maxNumFragments); //sanity check
157
 
 
158
 
                TextFragment[] frag =getBestTextFragments(tokenStream,text, true,maxNumFragments);
159
 
 
160
 
                //Get text
161
 
                ArrayList<String> fragTexts = new ArrayList<String>();
162
 
                for (int i = 0; i < frag.length; i++)
163
 
                {
164
 
                        if ((frag[i] != null) && (frag[i].getScore() > 0))
165
 
                        {
166
 
                                fragTexts.add(frag[i].toString());
167
 
                        }
168
 
                }
169
 
                return fragTexts.toArray(new String[0]);
170
 
        }
171
 
 
172
 
 
173
 
        /**
174
 
         * Low level api to get the most relevant (formatted) sections of the document.
175
 
         * This method has been made public to allow visibility of score information held in TextFragment objects.
176
 
         * Thanks to Jason Calabrese for help in redefining the interface.
177
 
         * @param tokenStream
178
 
         * @param text
179
 
         * @param maxNumFragments
180
 
         * @param mergeContiguousFragments
181
 
         * @throws IOException
182
 
         * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
183
 
         */
184
 
        public final TextFragment[] getBestTextFragments(
185
 
                TokenStream tokenStream,
186
 
                String text,
187
 
                boolean mergeContiguousFragments,
188
 
                int maxNumFragments)
189
 
                throws IOException, InvalidTokenOffsetsException
190
 
        {
191
 
                ArrayList<TextFragment> docFrags = new ArrayList<TextFragment>();
192
 
                StringBuilder newText=new StringBuilder();
193
 
                
194
 
            CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
195
 
            OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
196
 
            tokenStream.addAttribute(PositionIncrementAttribute.class);
197
 
            tokenStream.reset();
198
 
            
199
 
                TextFragment currentFrag =      new TextFragment(newText,newText.length(), docFrags.size());
200
 
                
201
 
    if (fragmentScorer instanceof QueryScorer) {
202
 
      ((QueryScorer) fragmentScorer).setMaxDocCharsToAnalyze(maxDocCharsToAnalyze);
203
 
    }
204
 
    
205
 
                TokenStream newStream = fragmentScorer.init(tokenStream);
206
 
                if(newStream != null) {
207
 
                  tokenStream = newStream;
208
 
                }
209
 
                fragmentScorer.startFragment(currentFrag);
210
 
                docFrags.add(currentFrag);
211
 
 
212
 
                FragmentQueue fragQueue = new FragmentQueue(maxNumFragments);
213
 
 
214
 
                try
215
 
                {
216
 
 
217
 
                        String tokenText;
218
 
                        int startOffset;
219
 
                        int endOffset;
220
 
                        int lastEndOffset = 0;
221
 
                        textFragmenter.start(text, tokenStream);
222
 
 
223
 
                        TokenGroup tokenGroup=new TokenGroup(tokenStream);
224
 
 
225
 
                        for (boolean next = tokenStream.incrementToken(); next && (offsetAtt.startOffset()< maxDocCharsToAnalyze);
226
 
                              next = tokenStream.incrementToken())
227
 
                        {
228
 
                                if(     (offsetAtt.endOffset()>text.length())
229
 
                                        ||
230
 
                                        (offsetAtt.startOffset()>text.length())
231
 
                                        )                                               
232
 
                                {
233
 
                                        throw new InvalidTokenOffsetsException("Token "+ termAtt.toString()
234
 
                                                        +" exceeds length of provided text sized "+text.length());
235
 
                                }
236
 
                                if((tokenGroup.numTokens>0)&&(tokenGroup.isDistinct()))
237
 
                                {
238
 
                                        //the current token is distinct from previous tokens -
239
 
                                        // markup the cached token group info
240
 
                                        startOffset = tokenGroup.matchStartOffset;
241
 
                                        endOffset = tokenGroup.matchEndOffset;
242
 
                                        tokenText = text.substring(startOffset, endOffset);
243
 
                                        String markedUpText=formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
244
 
                                        //store any whitespace etc from between this and last group
245
 
                                        if (startOffset > lastEndOffset)
246
 
                                                newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));
247
 
                                        newText.append(markedUpText);
248
 
                                        lastEndOffset=Math.max(endOffset, lastEndOffset);
249
 
                                        tokenGroup.clear();
250
 
 
251
 
                                        //check if current token marks the start of a new fragment
252
 
                                        if(textFragmenter.isNewFragment())
253
 
                                        {
254
 
                                                currentFrag.setScore(fragmentScorer.getFragmentScore());
255
 
                                                //record stats for a new fragment
256
 
                                                currentFrag.textEndPos = newText.length();
257
 
                                                currentFrag =new TextFragment(newText, newText.length(), docFrags.size());
258
 
                                                fragmentScorer.startFragment(currentFrag);
259
 
                                                docFrags.add(currentFrag);
260
 
                                        }
261
 
                                }
262
 
 
263
 
                                tokenGroup.addToken(fragmentScorer.getTokenScore());
264
 
 
265
 
//                              if(lastEndOffset>maxDocBytesToAnalyze)
266
 
//                              {
267
 
//                                      break;
268
 
//                              }
269
 
                        }
270
 
                        currentFrag.setScore(fragmentScorer.getFragmentScore());
271
 
 
272
 
                        if(tokenGroup.numTokens>0)
273
 
                        {
274
 
                                //flush the accumulated text (same code as in above loop)
275
 
                                startOffset = tokenGroup.matchStartOffset;
276
 
                                endOffset = tokenGroup.matchEndOffset;
277
 
                                tokenText = text.substring(startOffset, endOffset);
278
 
                                String markedUpText=formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
279
 
                                //store any whitespace etc from between this and last group
280
 
                                if (startOffset > lastEndOffset)
281
 
                                        newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));
282
 
                                newText.append(markedUpText);
283
 
                                lastEndOffset=Math.max(lastEndOffset,endOffset);
284
 
                        }
285
 
 
286
 
                        //Test what remains of the original text beyond the point where we stopped analyzing 
287
 
                        if (
288
 
//                                      if there is text beyond the last token considered..
289
 
                                        (lastEndOffset < text.length()) 
290
 
                                        &&
291
 
//                                      and that text is not too large...
292
 
                                        (text.length()<= maxDocCharsToAnalyze)
293
 
                                )                               
294
 
                        {
295
 
                                //append it to the last fragment
296
 
                                newText.append(encoder.encodeText(text.substring(lastEndOffset)));
297
 
                        }
298
 
 
299
 
                        currentFrag.textEndPos = newText.length();
300
 
 
301
 
                        //sort the most relevant sections of the text
302
 
                        for (Iterator<TextFragment> i = docFrags.iterator(); i.hasNext();)
303
 
                        {
304
 
                                currentFrag = i.next();
305
 
 
306
 
                                //If you are running with a version of Lucene before 11th Sept 03
307
 
                                // you do not have PriorityQueue.insert() - so uncomment the code below
308
 
                                /*
309
 
                                                                        if (currentFrag.getScore() >= minScore)
310
 
                                                                        {
311
 
                                                                                fragQueue.put(currentFrag);
312
 
                                                                                if (fragQueue.size() > maxNumFragments)
313
 
                                                                                { // if hit queue overfull
314
 
                                                                                        fragQueue.pop(); // remove lowest in hit queue
315
 
                                                                                        minScore = ((TextFragment) fragQueue.top()).getScore(); // reset minScore
316
 
                                                                                }
317
 
 
318
 
 
319
 
                                                                        }
320
 
                                */
321
 
                                //The above code caused a problem as a result of Christoph Goller's 11th Sept 03
322
 
                                //fix to PriorityQueue. The correct method to use here is the new "insert" method
323
 
                                // USE ABOVE CODE IF THIS DOES NOT COMPILE!
324
 
                                fragQueue.insertWithOverflow(currentFrag);
325
 
                        }
326
 
 
327
 
                        //return the most relevant fragments
328
 
                        TextFragment frag[] = new TextFragment[fragQueue.size()];
329
 
                        for (int i = frag.length - 1; i >= 0; i--)
330
 
                        {
331
 
                                frag[i] = fragQueue.pop();
332
 
                        }
333
 
 
334
 
                        //merge any contiguous fragments to improve readability
335
 
                        if(mergeContiguousFragments)
336
 
                        {
337
 
                                mergeContiguousFragments(frag);
338
 
                                ArrayList<TextFragment> fragTexts = new ArrayList<TextFragment>();
339
 
                                for (int i = 0; i < frag.length; i++)
340
 
                                {
341
 
                                        if ((frag[i] != null) && (frag[i].getScore() > 0))
342
 
                                        {
343
 
                                                fragTexts.add(frag[i]);
344
 
                                        }
345
 
                                }
346
 
                                frag= fragTexts.toArray(new TextFragment[0]);
347
 
                        }
348
 
 
349
 
                        return frag;
350
 
 
351
 
                }
352
 
                finally
353
 
                {
354
 
                        if (tokenStream != null)
355
 
                        {
356
 
                                try
357
 
                                {
358
 
                                  tokenStream.end();
359
 
                                        tokenStream.close();
360
 
                                }
361
 
                                catch (Exception e)
362
 
                                {
363
 
                                }
364
 
                        }
365
 
                }
366
 
        }
367
 
 
368
 
 
369
 
        /** Improves readability of a score-sorted list of TextFragments by merging any fragments
370
 
         * that were contiguous in the original text into one larger fragment with the correct order.
371
 
         * This will leave a "null" in the array entry for the lesser scored fragment. 
372
 
         * 
373
 
         * @param frag An array of document fragments in descending score
374
 
         */
375
 
        private void mergeContiguousFragments(TextFragment[] frag)
376
 
        {
377
 
                boolean mergingStillBeingDone;
378
 
                if (frag.length > 1)
379
 
                        do
380
 
                        {
381
 
                                mergingStillBeingDone = false; //initialise loop control flag
382
 
                                //for each fragment, scan other frags looking for contiguous blocks
383
 
                                for (int i = 0; i < frag.length; i++)
384
 
                                {
385
 
                                        if (frag[i] == null)
386
 
                                        {
387
 
                                                continue;
388
 
                                        }
389
 
                                        //merge any contiguous blocks 
390
 
                                        for (int x = 0; x < frag.length; x++)
391
 
                                        {
392
 
                                                if (frag[x] == null)
393
 
                                                {
394
 
                                                        continue;
395
 
                                                }
396
 
                                                if (frag[i] == null)
397
 
                                                {
398
 
                                                        break;
399
 
                                                }
400
 
                                                TextFragment frag1 = null;
401
 
                                                TextFragment frag2 = null;
402
 
                                                int frag1Num = 0;
403
 
                                                int frag2Num = 0;
404
 
                                                int bestScoringFragNum;
405
 
                                                int worstScoringFragNum;
406
 
                                                //if blocks are contiguous....
407
 
                                                if (frag[i].follows(frag[x]))
408
 
                                                {
409
 
                                                        frag1 = frag[x];
410
 
                                                        frag1Num = x;
411
 
                                                        frag2 = frag[i];
412
 
                                                        frag2Num = i;
413
 
                                                }
414
 
                                                else
415
 
                                                        if (frag[x].follows(frag[i]))
416
 
                                                        {
417
 
                                                                frag1 = frag[i];
418
 
                                                                frag1Num = i;
419
 
                                                                frag2 = frag[x];
420
 
                                                                frag2Num = x;
421
 
                                                        }
422
 
                                                //merging required..
423
 
                                                if (frag1 != null)
424
 
                                                {
425
 
                                                        if (frag1.getScore() > frag2.getScore())
426
 
                                                        {
427
 
                                                                bestScoringFragNum = frag1Num;
428
 
                                                                worstScoringFragNum = frag2Num;
429
 
                                                        }
430
 
                                                        else
431
 
                                                        {
432
 
                                                                bestScoringFragNum = frag2Num;
433
 
                                                                worstScoringFragNum = frag1Num;
434
 
                                                        }
435
 
                                                        frag1.merge(frag2);
436
 
                                                        frag[worstScoringFragNum] = null;
437
 
                                                        mergingStillBeingDone = true;
438
 
                                                        frag[bestScoringFragNum] = frag1;
439
 
                                                }
440
 
                                        }
441
 
                                }
442
 
                        }
443
 
                        while (mergingStillBeingDone);
444
 
        }
445
 
        
446
 
        
447
 
        /**
448
 
         * Highlights terms in the  text , extracting the most relevant sections
449
 
         * and concatenating the chosen fragments with a separator (typically "...").
450
 
         * The document text is analysed in chunks to record hit statistics
451
 
         * across the document. After accumulating stats, the fragments with the highest scores
452
 
         * are returned in order as "separator" delimited strings.
453
 
         *
454
 
         * @param text        text to highlight terms in
455
 
         * @param maxNumFragments  the maximum number of fragments.
456
 
         * @param separator  the separator used to intersperse the document fragments (typically "...")
457
 
         *
458
 
         * @return highlighted text
459
 
         * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
460
 
         */
461
 
        public final String getBestFragments(
462
 
                TokenStream tokenStream,        
463
 
                String text,
464
 
                int maxNumFragments,
465
 
                String separator)
466
 
                throws IOException, InvalidTokenOffsetsException
467
 
        {
468
 
                String sections[] =     getBestFragments(tokenStream,text, maxNumFragments);
469
 
                StringBuilder result = new StringBuilder();
470
 
                for (int i = 0; i < sections.length; i++)
471
 
                {
472
 
                        if (i > 0)
473
 
                        {
474
 
                                result.append(separator);
475
 
                        }
476
 
                        result.append(sections[i]);
477
 
                }
478
 
                return result.toString();
479
 
        }
480
 
 
481
 
  public int getMaxDocCharsToAnalyze() {
482
 
    return maxDocCharsToAnalyze;
483
 
  }
484
 
 
485
 
  public void setMaxDocCharsToAnalyze(int maxDocCharsToAnalyze) {
486
 
    this.maxDocCharsToAnalyze = maxDocCharsToAnalyze;
487
 
  }
488
 
 
489
 
  
490
 
        public Fragmenter getTextFragmenter()
491
 
        {
492
 
                return textFragmenter;
493
 
        }
494
 
 
495
 
        /**
496
 
         * @param fragmenter
497
 
         */
498
 
        public void setTextFragmenter(Fragmenter fragmenter)
499
 
        {
500
 
                textFragmenter = fragmenter;
501
 
        }
502
 
 
503
 
        /**
504
 
         * @return Object used to score each text fragment 
505
 
         */
506
 
        public Scorer getFragmentScorer()
507
 
        {
508
 
                return fragmentScorer;
509
 
        }
510
 
 
511
 
 
512
 
        /**
513
 
         * @param scorer
514
 
         */
515
 
        public void setFragmentScorer(Scorer scorer)
516
 
        {
517
 
                fragmentScorer = scorer;
518
 
        }
519
 
 
520
 
    public Encoder getEncoder()
521
 
    {
522
 
        return encoder;
523
 
    }
524
 
    public void setEncoder(Encoder encoder)
525
 
    {
526
 
        this.encoder = encoder;
527
 
    }
528
 
}
529
 
class FragmentQueue extends PriorityQueue<TextFragment>
530
 
{
531
 
        public FragmentQueue(int size)
532
 
        {
533
 
                initialize(size);
534
 
        }
535
 
 
536
 
        @Override
537
 
        public final boolean lessThan(TextFragment fragA, TextFragment fragB)
538
 
        {
539
 
                if (fragA.getScore() == fragB.getScore())
540
 
                        return fragA.fragNum > fragB.fragNum;
541
 
                else
542
 
                        return fragA.getScore() < fragB.getScore();
543
 
        }
544
 
}