1
package org.apache.lucene.search.vectorhighlight;
3
* Licensed to the Apache Software Foundation (ASF) under one or more
4
* contributor license agreements. See the NOTICE file distributed with
5
* this work for additional information regarding copyright ownership.
6
* The ASF licenses this file to You under the Apache License, Version 2.0
7
* (the "License"); you may not use this file except in compliance with
8
* the License. You may obtain a copy of the License at
10
* http://www.apache.org/licenses/LICENSE-2.0
12
* Unless required by applicable law or agreed to in writing, software
13
* distributed under the License is distributed on an "AS IS" BASIS,
14
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
* See the License for the specific language governing permissions and
16
* limitations under the License.
19
import java.io.IOException;
20
import java.util.ArrayList;
21
import java.util.Collection;
22
import java.util.HashMap;
23
import java.util.HashSet;
24
import java.util.Iterator;
25
import java.util.List;
29
import org.apache.lucene.index.IndexReader;
30
import org.apache.lucene.index.Term;
31
import org.apache.lucene.search.BooleanClause;
32
import org.apache.lucene.search.BooleanQuery;
33
import org.apache.lucene.search.DisjunctionMaxQuery;
34
import org.apache.lucene.search.FuzzyQuery;
35
import org.apache.lucene.search.MultiTermQuery;
36
import org.apache.lucene.search.PhraseQuery;
37
import org.apache.lucene.search.PrefixQuery;
38
import org.apache.lucene.search.Query;
39
import org.apache.lucene.search.TermQuery;
40
import org.apache.lucene.search.TermRangeQuery;
41
import org.apache.lucene.search.WildcardQuery;
42
import org.apache.lucene.search.regex.RegexQuery;
43
import org.apache.lucene.search.vectorhighlight.FieldTermStack.TermInfo;
46
* FieldQuery breaks down query object into terms/phrases and keep
47
* them in QueryPhraseMap structure.
49
public class FieldQuery {
51
final boolean fieldMatch;
53
// fieldMatch==true, Map<fieldName,QueryPhraseMap>
54
// fieldMatch==false, Map<null,QueryPhraseMap>
55
Map<String, QueryPhraseMap> rootMaps = new HashMap<String, QueryPhraseMap>();
57
// fieldMatch==true, Map<fieldName,setOfTermsInQueries>
58
// fieldMatch==false, Map<null,setOfTermsInQueries>
59
Map<String, Set<String>> termSetMap = new HashMap<String, Set<String>>();
61
int termOrPhraseNumber; // used for colored tag support
63
// The maximum number of different matching terms accumulated from any one MultiTermQuery
64
private static final int MAX_MTQ_TERMS = 1024;
66
FieldQuery( Query query, IndexReader reader, boolean phraseHighlight, boolean fieldMatch ) throws IOException {
67
this.fieldMatch = fieldMatch;
68
List<Query> flatQueries = new ArrayList<Query>();
69
flatten( query, reader, flatQueries );
70
saveTerms( flatQueries, reader );
71
Collection<Query> expandQueries = expand( flatQueries );
73
for( Query flatQuery : expandQueries ){
74
QueryPhraseMap rootMap = getRootMap( flatQuery );
75
rootMap.add( flatQuery, reader );
76
if( !phraseHighlight && flatQuery instanceof PhraseQuery ){
77
PhraseQuery pq = (PhraseQuery)flatQuery;
78
if( pq.getTerms().length > 1 ){
79
for( Term term : pq.getTerms() )
80
rootMap.addTerm( term, flatQuery.getBoost() );
86
/** For backwards compatibility you can initialize FieldQuery without
87
* an IndexReader, which is only required to support MultiTermQuery
89
FieldQuery( Query query, boolean phraseHighlight, boolean fieldMatch ) throws IOException {
90
this (query, null, phraseHighlight, fieldMatch);
93
void flatten( Query sourceQuery, IndexReader reader, Collection<Query> flatQueries ) throws IOException{
94
if( sourceQuery instanceof BooleanQuery ){
95
BooleanQuery bq = (BooleanQuery)sourceQuery;
96
for( BooleanClause clause : bq.getClauses() ){
97
if( !clause.isProhibited() )
98
flatten( clause.getQuery(), reader, flatQueries );
101
else if( sourceQuery instanceof DisjunctionMaxQuery ){
102
DisjunctionMaxQuery dmq = (DisjunctionMaxQuery)sourceQuery;
103
for( Query query : dmq ){
104
flatten( query, reader, flatQueries );
107
else if( sourceQuery instanceof TermQuery ){
108
if( !flatQueries.contains( sourceQuery ) )
109
flatQueries.add( sourceQuery );
111
else if (sourceQuery instanceof MultiTermQuery && reader != null) {
112
MultiTermQuery copy = (MultiTermQuery) sourceQuery.clone();
113
copy.setRewriteMethod(new MultiTermQuery.TopTermsScoringBooleanQueryRewrite(MAX_MTQ_TERMS));
114
BooleanQuery mtqTerms = (BooleanQuery) copy.rewrite(reader);
115
flatten(mtqTerms, reader, flatQueries);
117
else if( sourceQuery instanceof PhraseQuery ){
118
if( !flatQueries.contains( sourceQuery ) ){
119
PhraseQuery pq = (PhraseQuery)sourceQuery;
120
if( pq.getTerms().length > 1 )
121
flatQueries.add( pq );
122
else if( pq.getTerms().length == 1 ){
123
flatQueries.add( new TermQuery( pq.getTerms()[0] ) );
127
// else discard queries
131
* Create expandQueries from flatQueries.
133
* expandQueries := flatQueries + overlapped phrase queries
135
* ex1) flatQueries={a,b,c}
136
* => expandQueries={a,b,c}
137
* ex2) flatQueries={a,"b c","c d"}
138
* => expandQueries={a,"b c","c d","b c d"}
140
Collection<Query> expand( Collection<Query> flatQueries ){
141
List<Query> expandQueries = new ArrayList<Query>();
142
for( Iterator<Query> i = flatQueries.iterator(); i.hasNext(); ){
143
Query query = i.next();
145
expandQueries.add( query );
146
if( !( query instanceof PhraseQuery ) ) continue;
147
for( Iterator<Query> j = flatQueries.iterator(); j.hasNext(); ){
149
if( !( qj instanceof PhraseQuery ) ) continue;
150
checkOverlap( expandQueries, (PhraseQuery)query, (PhraseQuery)qj );
153
return expandQueries;
157
* Check if PhraseQuery A and B have overlapped part.
159
* ex1) A="a b", B="b c" => overlap; expandQueries={"a b c"}
160
* ex2) A="b c", B="a b" => overlap; expandQueries={"a b c"}
161
* ex3) A="a b", B="c d" => no overlap; expandQueries={}
163
private void checkOverlap( Collection<Query> expandQueries, PhraseQuery a, PhraseQuery b ){
164
if( a.getSlop() != b.getSlop() ) return;
165
Term[] ats = a.getTerms();
166
Term[] bts = b.getTerms();
167
if( fieldMatch && !ats[0].field().equals( bts[0].field() ) ) return;
168
checkOverlap( expandQueries, ats, bts, a.getSlop(), a.getBoost() );
169
checkOverlap( expandQueries, bts, ats, b.getSlop(), b.getBoost() );
173
* Check if src and dest have overlapped part and if it is, create PhraseQueries and add expandQueries.
175
* ex1) src="a b", dest="c d" => no overlap
176
* ex2) src="a b", dest="a b c" => no overlap
177
* ex3) src="a b", dest="b c" => overlap; expandQueries={"a b c"}
178
* ex4) src="a b c", dest="b c d" => overlap; expandQueries={"a b c d"}
179
* ex5) src="a b c", dest="b c" => no overlap
180
* ex6) src="a b c", dest="b" => no overlap
181
* ex7) src="a a a a", dest="a a a" => overlap;
182
* expandQueries={"a a a a a","a a a a a a"}
183
* ex8) src="a b c d", dest="b c" => no overlap
185
private void checkOverlap( Collection<Query> expandQueries, Term[] src, Term[] dest, int slop, float boost ){
186
// beginning from 1 (not 0) is safe because that the PhraseQuery has multiple terms
187
// is guaranteed in flatten() method (if PhraseQuery has only one term, flatten()
188
// converts PhraseQuery to TermQuery)
189
for( int i = 1; i < src.length; i++ ){
190
boolean overlap = true;
191
for( int j = i; j < src.length; j++ ){
192
if( ( j - i ) < dest.length && !src[j].text().equals( dest[j-i].text() ) ){
197
if( overlap && src.length - i < dest.length ){
198
PhraseQuery pq = new PhraseQuery();
199
for( Term srcTerm : src )
201
for( int k = src.length - i; k < dest.length; k++ ){
202
pq.add( new Term( src[0].field(), dest[k].text() ) );
205
pq.setBoost( boost );
206
if(!expandQueries.contains( pq ) )
207
expandQueries.add( pq );
212
QueryPhraseMap getRootMap( Query query ){
213
String key = getKey( query );
214
QueryPhraseMap map = rootMaps.get( key );
216
map = new QueryPhraseMap( this );
217
rootMaps.put( key, map );
223
* Return 'key' string. 'key' is the field name of the Query.
224
* If not fieldMatch, 'key' will be null.
226
private String getKey( Query query ){
227
if( !fieldMatch ) return null;
228
if( query instanceof TermQuery )
229
return ((TermQuery)query).getTerm().field();
230
else if ( query instanceof PhraseQuery ){
231
PhraseQuery pq = (PhraseQuery)query;
232
Term[] terms = pq.getTerms();
233
return terms[0].field();
235
else if (query instanceof FuzzyQuery) {
236
return ((FuzzyQuery)query).getTerm().field();
238
else if (query instanceof PrefixQuery) {
239
return ((PrefixQuery)query).getPrefix().field();
241
else if (query instanceof RegexQuery) {
242
return ((RegexQuery)query).getTerm().field();
244
else if (query instanceof TermRangeQuery) {
245
return ((TermRangeQuery)query).getField();
247
else if (query instanceof WildcardQuery) {
248
return ((WildcardQuery)query).getTerm().field();
251
throw new RuntimeException( "query \"" + query.toString() + "\" must be flatten first." );
255
* Save the set of terms in the queries to termSetMap.
259
* termSetMap=Map<"name",Set<"john">>
260
* - fieldMatch==false
261
* termSetMap=Map<null,Set<"john">>
263
* ex2) q=name:john title:manager
265
* termSetMap=Map<"name",Set<"john">,
266
* "title",Set<"manager">>
267
* - fieldMatch==false
268
* termSetMap=Map<null,Set<"john","manager">>
270
* ex3) q=name:"john lennon"
272
* termSetMap=Map<"name",Set<"john","lennon">>
273
* - fieldMatch==false
274
* termSetMap=Map<null,Set<"john","lennon">>
276
void saveTerms( Collection<Query> flatQueries, IndexReader reader ) throws IOException{
277
for( Query query : flatQueries ){
278
Set<String> termSet = getTermSet( query );
279
if( query instanceof TermQuery )
280
termSet.add( ((TermQuery)query).getTerm().text() );
281
else if( query instanceof PhraseQuery ){
282
for( Term term : ((PhraseQuery)query).getTerms() )
283
termSet.add( term.text() );
285
else if (query instanceof MultiTermQuery && reader != null) {
286
BooleanQuery mtqTerms = (BooleanQuery) query.rewrite(reader);
287
for (BooleanClause clause : mtqTerms.getClauses()) {
288
termSet.add (((TermQuery) clause.getQuery()).getTerm().text());
292
throw new RuntimeException( "query \"" + query.toString() + "\" must be flatten first." );
296
private Set<String> getTermSet( Query query ){
297
String key = getKey( query );
298
Set<String> set = termSetMap.get( key );
300
set = new HashSet<String>();
301
termSetMap.put( key, set );
306
Set<String> getTermSet( String field ){
307
return termSetMap.get( fieldMatch ? field : null );
314
* @return QueryPhraseMap
316
public QueryPhraseMap getFieldTermMap( String fieldName, String term ){
317
QueryPhraseMap rootMap = getRootMap( fieldName );
318
return rootMap == null ? null : rootMap.subMap.get( term );
324
* @param phraseCandidate
325
* @return QueryPhraseMap
327
public QueryPhraseMap searchPhrase( String fieldName, final List<TermInfo> phraseCandidate ){
328
QueryPhraseMap root = getRootMap( fieldName );
329
if( root == null ) return null;
330
return root.searchPhrase( phraseCandidate );
333
private QueryPhraseMap getRootMap( String fieldName ){
334
return rootMaps.get( fieldMatch ? fieldName : null );
337
int nextTermOrPhraseNumber(){
338
return termOrPhraseNumber++;
341
public static class QueryPhraseMap {
344
int slop; // valid if terminal == true and phraseHighlight == true
345
float boost; // valid if terminal == true
346
int termOrPhraseNumber; // valid if terminal == true
347
FieldQuery fieldQuery;
348
Map<String, QueryPhraseMap> subMap = new HashMap<String, QueryPhraseMap>();
350
public QueryPhraseMap( FieldQuery fieldQuery ){
351
this.fieldQuery = fieldQuery;
354
void addTerm( Term term, float boost ){
355
QueryPhraseMap map = getOrNewMap( subMap, term.text() );
356
map.markTerminal( boost );
359
private QueryPhraseMap getOrNewMap( Map<String, QueryPhraseMap> subMap, String term ){
360
QueryPhraseMap map = subMap.get( term );
362
map = new QueryPhraseMap( fieldQuery );
363
subMap.put( term, map );
368
void add( Query query, IndexReader reader ) throws IOException {
369
if( query instanceof TermQuery ){
370
addTerm( ((TermQuery)query).getTerm(), query.getBoost() );
372
else if( query instanceof PhraseQuery ){
373
PhraseQuery pq = (PhraseQuery)query;
374
Term[] terms = pq.getTerms();
375
Map<String, QueryPhraseMap> map = subMap;
376
QueryPhraseMap qpm = null;
377
for( Term term : terms ){
378
qpm = getOrNewMap( map, term.text() );
381
qpm.markTerminal( pq.getSlop(), pq.getBoost() );
384
throw new RuntimeException( "query \"" + query.toString() + "\" must be flatten first." );
387
public QueryPhraseMap getTermMap( String term ){
388
return subMap.get( term );
391
private void markTerminal( float boost ){
392
markTerminal( 0, boost );
395
private void markTerminal( int slop, float boost ){
396
this.terminal = true;
399
this.termOrPhraseNumber = fieldQuery.nextTermOrPhraseNumber();
402
public boolean isTerminal(){
406
public int getSlop(){
410
public float getBoost(){
414
public int getTermOrPhraseNumber(){
415
return termOrPhraseNumber;
418
public QueryPhraseMap searchPhrase( final List<TermInfo> phraseCandidate ){
419
QueryPhraseMap currMap = this;
420
for( TermInfo ti : phraseCandidate ){
421
currMap = currMap.subMap.get( ti.getText() );
422
if( currMap == null ) return null;
424
return currMap.isValidTermOrPhrase( phraseCandidate ) ? currMap : null;
427
public boolean isValidTermOrPhrase( final List<TermInfo> phraseCandidate ){
429
if( !terminal ) return false;
431
// if the candidate is a term, it is valid
432
if( phraseCandidate.size() == 1 ) return true;
434
// else check whether the candidate is valid phrase
435
// compare position-gaps between terms to slop
436
int pos = phraseCandidate.get( 0 ).getPosition();
437
for( int i = 1; i < phraseCandidate.size(); i++ ){
438
int nextPos = phraseCandidate.get( i ).getPosition();
439
if( Math.abs( nextPos - pos - 1 ) > slop ) return false;