1
package org.apache.lucene.search.vectorhighlight;
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

20
import java.io.IOException;
21
import java.util.ArrayList;
22
import java.util.List;
24
import org.apache.lucene.document.Document;
25
import org.apache.lucene.document.Field;
26
import org.apache.lucene.document.MapFieldSelector;
27
import org.apache.lucene.index.IndexReader;
28
import org.apache.lucene.search.highlight.DefaultEncoder;
29
import org.apache.lucene.search.highlight.Encoder;
30
import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo;
31
import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo.SubInfo;
32
import org.apache.lucene.search.vectorhighlight.FieldPhraseList.WeightedPhraseInfo.Toffs;
34
public abstract class BaseFragmentsBuilder implements FragmentsBuilder {
/** Tags written before and after each highlighted term by the overloads that take no explicit tags. */
protected String[] preTags, postTags;

/** Ready-made opening tags: bold elements, each with a different background colour. */
public static final String[] COLORED_PRE_TAGS = {
  "<b style=\"background:yellow\">", "<b style=\"background:lawngreen\">", "<b style=\"background:aquamarine\">",
  "<b style=\"background:magenta\">", "<b style=\"background:palegreen\">", "<b style=\"background:coral\">",
  "<b style=\"background:wheat\">", "<b style=\"background:khaki\">", "<b style=\"background:lime\">",
  "<b style=\"background:deepskyblue\">", "<b style=\"background:deeppink\">", "<b style=\"background:salmon\">",
  "<b style=\"background:peachpuff\">", "<b style=\"background:violet\">", "<b style=\"background:mediumpurple\">",
  "<b style=\"background:palegoldenrod\">", "<b style=\"background:darkkhaki\">", "<b style=\"background:springgreen\">",
  "<b style=\"background:turquoise\">", "<b style=\"background:powderblue\">"
};

/** Closing tag that matches every entry of {@link #COLORED_PRE_TAGS}. */
public static final String[] COLORED_POST_TAGS = { "</b>" };

/** Character appended after a field value when several values are concatenated; a single space by default. */
private char multiValuedSeparator = ' ';

/** Scanner consulted by {@code getFragmentSourceMSO} to adjust the start and end offsets of a fragment. */
private final BoundaryScanner boundaryScanner;

/** Creates a builder that wraps highlighted terms in {@code <b>} ... {@code </b>}. */
protected BaseFragmentsBuilder(){
  this( new String[]{ "<b>" }, new String[]{ "</b>" } );
}

/**
 * Creates a builder with caller-supplied tags and a new {@code SimpleBoundaryScanner}.
 *
 * @param preTags  tags written before a highlighted term
 * @param postTags tags written after a highlighted term
 */
protected BaseFragmentsBuilder( String[] preTags, String[] postTags ){
  this(preTags, postTags, new SimpleBoundaryScanner());
}

/**
 * Creates a builder with the default {@code <b>} ... {@code </b>} tags and the given scanner.
 *
 * @param boundaryScanner scanner used to adjust fragment boundaries
 */
protected BaseFragmentsBuilder(BoundaryScanner boundaryScanner){
  this( new String[]{ "<b>" }, new String[]{ "</b>" }, boundaryScanner );
}

/**
 * Creates a builder from explicit tags and an explicit scanner. The arrays are stored
 * as given, without copying.
 *
 * @param preTags         tags written before a highlighted term
 * @param postTags        tags written after a highlighted term
 * @param boundaryScanner scanner used to adjust fragment boundaries
 */
protected BaseFragmentsBuilder( String[] preTags, String[] postTags, BoundaryScanner boundaryScanner ){
  this.preTags = preTags;
  this.postTags = postTags;
  this.boundaryScanner = boundaryScanner;
}

68
static Object checkTagsArgument( Object tags ){
69
if( tags instanceof String ) return tags;
70
else if( tags instanceof String[] ) return tags;
71
throw new IllegalArgumentException( "type of preTags/postTags must be a String or String[]" );
74
public abstract List<WeightedFragInfo> getWeightedFragInfoList( List<WeightedFragInfo> src );
76
private static final Encoder NULL_ENCODER = new DefaultEncoder();
78
public String createFragment( IndexReader reader, int docId,
79
String fieldName, FieldFragList fieldFragList ) throws IOException {
80
return createFragment( reader, docId, fieldName, fieldFragList,
81
preTags, postTags, NULL_ENCODER );
84
public String[] createFragments( IndexReader reader, int docId,
85
String fieldName, FieldFragList fieldFragList, int maxNumFragments )
87
return createFragments( reader, docId, fieldName, fieldFragList, maxNumFragments,
88
preTags, postTags, NULL_ENCODER );
91
public String createFragment( IndexReader reader, int docId,
92
String fieldName, FieldFragList fieldFragList, String[] preTags, String[] postTags,
93
Encoder encoder ) throws IOException {
94
String[] fragments = createFragments( reader, docId, fieldName, fieldFragList, 1,
95
preTags, postTags, encoder );
96
if( fragments == null || fragments.length == 0 ) return null;
100
public String[] createFragments( IndexReader reader, int docId,
101
String fieldName, FieldFragList fieldFragList, int maxNumFragments,
102
String[] preTags, String[] postTags, Encoder encoder ) throws IOException {
103
if( maxNumFragments < 0 )
104
throw new IllegalArgumentException( "maxNumFragments(" + maxNumFragments + ") must be positive number." );
106
List<WeightedFragInfo> fragInfos = getWeightedFragInfoList( fieldFragList.getFragInfos() );
108
List<String> fragments = new ArrayList<String>( maxNumFragments );
109
Field[] values = getFields( reader, docId, fieldName );
110
if( values.length == 0 ) return null;
111
StringBuilder buffer = new StringBuilder();
112
int[] nextValueIndex = { 0 };
113
for( int n = 0; n < maxNumFragments && n < fragInfos.size(); n++ ){
114
WeightedFragInfo fragInfo = fragInfos.get( n );
115
fragments.add( makeFragment( buffer, nextValueIndex, values, fragInfo, preTags, postTags, encoder ) );
117
return fragments.toArray( new String[fragments.size()] );
121
protected String[] getFieldValues( IndexReader reader, int docId, String fieldName) throws IOException {
122
Document doc = reader.document( docId, new MapFieldSelector( new String[]{ fieldName } ) );
123
return doc.getValues( fieldName ); // according to Document class javadoc, this never returns null
126
protected Field[] getFields( IndexReader reader, int docId, String fieldName) throws IOException {
127
// according to javadoc, doc.getFields(fieldName) cannot be used with lazy loaded field???
128
Document doc = reader.document( docId, new MapFieldSelector( new String[]{ fieldName } ) );
129
return doc.getFields( fieldName ); // according to Document class javadoc, this never returns null
133
protected String makeFragment( StringBuilder buffer, int[] index, String[] values, WeightedFragInfo fragInfo ){
134
final int s = fragInfo.startOffset;
135
return makeFragment( fragInfo, getFragmentSource( buffer, index, values, s, fragInfo.endOffset ), s,
136
preTags, postTags, NULL_ENCODER );
139
private String makeFragment( WeightedFragInfo fragInfo, String src, int s,
140
String[] preTags, String[] postTags, Encoder encoder ){
141
StringBuilder fragment = new StringBuilder();
143
for( SubInfo subInfo : fragInfo.subInfos ){
144
for( Toffs to : subInfo.termsOffsets ){
146
.append( encoder.encodeText( src.substring( srcIndex, to.startOffset - s ) ) )
147
.append( getPreTag( preTags, subInfo.seqnum ) )
148
.append( encoder.encodeText( src.substring( to.startOffset - s, to.endOffset - s ) ) )
149
.append( getPostTag( postTags, subInfo.seqnum ) );
150
srcIndex = to.endOffset - s;
153
fragment.append( encoder.encodeText( src.substring( srcIndex ) ) );
154
return fragment.toString();
157
protected String makeFragment( StringBuilder buffer, int[] index, Field[] values, WeightedFragInfo fragInfo,
158
String[] preTags, String[] postTags, Encoder encoder ){
159
StringBuilder fragment = new StringBuilder();
160
final int s = fragInfo.getStartOffset();
161
int[] modifiedStartOffset = { s };
162
String src = getFragmentSourceMSO( buffer, index, values, s, fragInfo.getEndOffset(), modifiedStartOffset );
164
for( SubInfo subInfo : fragInfo.getSubInfos() ){
165
for( Toffs to : subInfo.getTermsOffsets() ){
167
.append( encoder.encodeText( src.substring( srcIndex, to.getStartOffset() - modifiedStartOffset[0] ) ) )
168
.append( getPreTag( preTags, subInfo.getSeqnum() ) )
169
.append( encoder.encodeText( src.substring( to.getStartOffset() - modifiedStartOffset[0], to.getEndOffset() - modifiedStartOffset[0] ) ) )
170
.append( getPostTag( postTags, subInfo.getSeqnum() ) );
171
srcIndex = to.getEndOffset() - modifiedStartOffset[0];
174
fragment.append( encoder.encodeText( src.substring( srcIndex ) ) );
175
return fragment.toString();
178
protected String getFragmentSourceMSO( StringBuilder buffer, int[] index, Field[] values,
179
int startOffset, int endOffset, int[] modifiedStartOffset ){
180
while( buffer.length() < endOffset && index[0] < values.length ){
181
buffer.append( values[index[0]].stringValue() );
182
if( values[index[0]].isTokenized() )
183
buffer.append( getMultiValuedSeparator() );
186
int eo = buffer.length() < endOffset ? buffer.length() : boundaryScanner.findEndOffset( buffer, endOffset );
187
modifiedStartOffset[0] = boundaryScanner.findStartOffset( buffer, startOffset );
188
return buffer.substring( modifiedStartOffset[0], eo );
192
protected String getFragmentSource( StringBuilder buffer, int[] index, String[] values,
193
int startOffset, int endOffset ){
194
while( buffer.length() < endOffset && index[0] < values.length ){
195
buffer.append( values[index[0]] );
196
buffer.append( multiValuedSeparator );
199
int eo = buffer.length() < endOffset ? buffer.length() : endOffset;
200
return buffer.substring( startOffset, eo );
203
protected String getFragmentSource( StringBuilder buffer, int[] index, Field[] values,
204
int startOffset, int endOffset ){
205
while( buffer.length() < endOffset && index[0] < values.length ){
206
buffer.append( values[index[0]].stringValue() );
207
if( values[index[0]].isTokenized() )
208
buffer.append( multiValuedSeparator );
211
int eo = buffer.length() < endOffset ? buffer.length() : endOffset;
212
return buffer.substring( startOffset, eo );
215
public void setMultiValuedSeparator( char separator ){
216
multiValuedSeparator = separator;
219
public char getMultiValuedSeparator(){
220
return multiValuedSeparator;
223
protected String getPreTag( int num ){
224
return getPreTag( preTags, num );
227
protected String getPostTag( int num ){
228
return getPostTag( postTags, num );
231
protected String getPreTag( String[] preTags, int num ){
232
int n = num % preTags.length;
236
protected String getPostTag( String[] postTags, int num ){
237
int n = num % postTags.length;