2
* Licensed to the Apache Software Foundation (ASF) under one or more
3
* contributor license agreements. See the NOTICE file distributed with
4
* this work for additional information regarding copyright ownership.
5
* The ASF licenses this file to You under the Apache License, Version 2.0
6
* (the "License"); you may not use this file except in compliance with
7
* the License. You may obtain a copy of the License at
9
* http://www.apache.org/licenses/LICENSE-2.0
11
* Unless required by applicable law or agreed to in writing, software
12
* distributed under the License is distributed on an "AS IS" BASIS,
13
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
* See the License for the specific language governing permissions and
15
* limitations under the License.
18
package org.apache.solr.handler;
20
import java.io.IOException;
21
import java.io.Reader;
22
import java.net.MalformedURLException;
24
import java.util.ArrayList;
25
import java.util.Comparator;
27
import java.util.Iterator;
28
import java.util.List;
30
import java.util.regex.Pattern;
32
import org.apache.lucene.document.Document;
33
import org.apache.lucene.index.IndexReader;
34
import org.apache.lucene.index.Term;
35
import org.apache.lucene.search.BooleanClause;
36
import org.apache.lucene.search.BooleanQuery;
37
import org.apache.lucene.search.Query;
38
import org.apache.lucene.search.TermQuery;
39
import org.apache.lucene.search.similar.MoreLikeThis;
40
import org.apache.solr.common.SolrException;
41
import org.apache.solr.common.params.CommonParams;
42
import org.apache.solr.common.params.FacetParams;
43
import org.apache.solr.common.params.MoreLikeThisParams;
44
import org.apache.solr.common.params.SolrParams;
45
import org.apache.solr.common.params.MoreLikeThisParams.TermStyle;
46
import org.apache.solr.common.util.ContentStream;
47
import org.apache.solr.common.util.NamedList;
48
import org.apache.solr.common.util.SimpleOrderedMap;
49
import org.apache.solr.core.SolrCore;
50
import org.apache.solr.request.SimpleFacets;
51
import org.apache.solr.request.SolrQueryRequest;
52
import org.apache.solr.response.SolrQueryResponse;
53
import org.apache.solr.schema.IndexSchema;
54
import org.apache.solr.schema.SchemaField;
55
import org.apache.solr.search.DocIterator;
56
import org.apache.solr.search.DocList;
57
import org.apache.solr.search.DocListAndSet;
58
import org.apache.solr.search.QueryParsing;
59
import org.apache.solr.search.SolrIndexSearcher;
60
import org.apache.solr.util.SolrPluginUtils;
63
* Solr MoreLikeThis --
65
* Return similar documents either based on a single document or based on posted text.
69
// Request handler that returns documents similar to either a single matched
// document (selected with the q parameter) or to text posted with the request.
// NOTE(review): this listing interleaves original line numbers with the code and
// omits some original lines (braces and a few statements); comments below only
// describe what is visible.
public class MoreLikeThisHandler extends RequestHandlerBase
71
// Pattern is thread safe -- TODO? share this with general 'fl' param
// Matches exactly one separator (a comma or a single space) per split point.
72
private static final Pattern splitList = Pattern.compile(",| ");
75
// Initialisation hook receiving the handler's configured arguments.
// NOTE(review): the method body is not visible in this listing.
public void init(NamedList args) {
80
// Handles one MoreLikeThis request.
//
// Visible flow:
//   1. Build a MoreLikeThisHelper from the request params and searcher, and parse
//      the filter queries.
//   2. Pick the source of similarity: a posted content stream when q is missing
//      or blank, otherwise the first document matched by q (at MATCH_OFFSET).
//   3. Add "response", and optionally "match", "interestingTerms",
//      "facet_counts" and "debug" sections to rsp.
//
// req - the incoming request (params, searcher, schema, content streams)
// rsp - the response that the result sections are added to
// Throws SolrException(BAD_REQUEST) when more than one content stream is posted;
// the visible code also contains a BAD_REQUEST for "neither query nor text".
// NOTE(review): `reader` and `flags` are declared on lines that are not part of
// this listing, as are the try/finally structure and several continuation lines.
public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp) throws Exception
82
SolrParams params = req.getParams();
83
SolrIndexSearcher searcher = req.getSearcher();
86
MoreLikeThisHelper mlt = new MoreLikeThisHelper( params, searcher );
87
List<Query> filters = SolrPluginUtils.parseFilterQueries(req);
89
// Hold on to the interesting terms if relevant
// (null means "do not collect"; otherwise the list is pre-sized to the helper's
// max query terms).
90
TermStyle termStyle = TermStyle.get( params.get( MoreLikeThisParams.INTERESTING_TERMS ) );
91
List<InterestingTerm> interesting = (termStyle == TermStyle.NONE )
92
? null : new ArrayList<InterestingTerm>( mlt.mlt.getMaxQueryTerms() );
94
DocListAndSet mltDocs = null;
95
String q = params.get( CommonParams.Q );
97
// Parse Required Params
98
// This will either have a single Reader or valid query
101
// No usable q: fall back to the posted content stream. Exactly one stream is
// accepted; a second one is rejected with BAD_REQUEST.
if (q == null || q.trim().length() < 1) {
102
Iterable<ContentStream> streams = req.getContentStreams();
103
if (streams != null) {
104
Iterator<ContentStream> iter = streams.iterator();
105
if (iter.hasNext()) {
106
reader = iter.next().getReader();
108
if (iter.hasNext()) {
109
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
110
"MoreLikeThis does not support multiple ContentStreams");
115
// What fields do we need to return
116
String fl = params.get(CommonParams.FL);
119
// setReturnFields contributes search flags that are OR-ed into `flags`.
flags |= SolrPluginUtils.setReturnFields(fl, rsp);
122
// Paging of the similar-document list: defaults are start=0, rows=10.
int start = params.getInt(CommonParams.START, 0);
123
int rows = params.getInt(CommonParams.ROWS, 10);
125
// Find documents MoreLikeThis - either with a reader or a query
126
// --------------------------------------------------------------------------------
127
if (reader != null) {
128
mltDocs = mlt.getMoreLikeThis(reader, start, rows, filters,
130
} else if (q != null) {
132
boolean includeMatch = params.getBool(MoreLikeThisParams.MATCH_INCLUDE,
134
int matchOffset = params.getInt(MoreLikeThisParams.MATCH_OFFSET, 0);
135
// Find the base match
136
Query query = QueryParsing.parseQuery(q, params.get(CommonParams.DF),
137
params, req.getSchema());
138
DocList match = searcher.getDocList(query, null, null, matchOffset, 1,
139
flags); // only get the first one...
141
// NOTE(review): presumably guarded by includeMatch on a line not shown - confirm.
rsp.add("match", match);
144
// This is an iterator, but we only handle the first match
145
DocIterator iterator = match.iterator();
146
if (iterator.hasNext()) {
147
// run the MoreLikeThis query for the first (and only) matched document
148
int id = iterator.nextDoc();
149
mltDocs = mlt.getMoreLikeThis(id, start, rows, filters, interesting,
153
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
154
"MoreLikeThis requires either a query (?q=) or text to find similar documents.");
158
// NOTE(review): the body of this check is not visible; presumably it closes the
// content-stream reader - confirm it runs on every exit path.
if (reader != null) {
163
// When q matched no document, mltDocs is still null here; substitute an empty
// result so the dereferences below are safe.
if( mltDocs == null ) {
164
mltDocs = new DocListAndSet(); // avoid NPE
166
rsp.add( "response", mltDocs.docList );
169
// Interesting terms are reported either as term -> boost (DETAILS) or as a
// plain list of term texts (the other branch visible below).
if( interesting != null ) {
170
if( termStyle == TermStyle.DETAILS ) {
171
NamedList<Float> it = new NamedList<Float>();
172
for( InterestingTerm t : interesting ) {
173
it.add( t.term.toString(), t.boost );
175
rsp.add( "interestingTerms", it );
178
List<String> it = new ArrayList<String>( interesting.size() );
179
for( InterestingTerm t : interesting ) {
180
it.add( t.term.text());
182
rsp.add( "interestingTerms", it );
186
// maybe facet the results
187
if (params.getBool(FacetParams.FACET,false)) {
188
// Without a DocSet there is nothing to facet over; the key is still added,
// with a null value.
if( mltDocs.docSet == null ) {
189
rsp.add( "facet_counts", null );
192
SimpleFacets f = new SimpleFacets(req, mltDocs.docSet, params );
193
rsp.add( "facet_counts", f.getFacetCounts() );
197
// Optional debug section, built from the raw (unboosted) MLT query.
boolean dbg = req.getParams().getBool(CommonParams.DEBUG_QUERY, false);
198
// Copied from StandardRequestHandler... perhaps it should be added to doStandardDebug?
201
NamedList<Object> dbgInfo = SolrPluginUtils.doStandardDebug(req, q, mlt.getRawMLTQuery(), mltDocs.docList);
202
if (null != dbgInfo) {
203
if (null != filters) {
204
dbgInfo.add("filter_queries",req.getParams().getParams(CommonParams.FQ));
205
List<String> fqs = new ArrayList<String>(filters.size());
206
for (Query fq : filters) {
207
fqs.add(QueryParsing.toString(fq, req.getSchema()));
209
dbgInfo.add("parsed_filter_queries",fqs);
211
rsp.add("debug", dbgInfo);
213
// Debug output is best-effort: a failure is logged and reported in the
// response instead of failing the request.
} catch (Exception e) {
214
SolrException.logOnce(SolrCore.log, "Exception during debug", e);
215
rsp.add("exception_during_debug", SolrException.toStr(e));
220
// Holder for one "interesting" term; other code in this file reads and writes
// its `term` and `boost` members (their declarations are not visible here).
public static class InterestingTerm
225
// Comparator over InterestingTerm driven by the difference of the two boosts.
// NOTE(review): the remainder of compare() is not visible. The subtraction
// yields NaN when either boost is NaN - confirm how that case is ordered
// (Float.compare(t1.boost, t2.boost) would give a total order).
public static Comparator<InterestingTerm> BOOST_ORDER = new Comparator<InterestingTerm>() {
226
public int compare(InterestingTerm t1, InterestingTerm t2) {
227
float d = t1.boost - t2.boost;
237
* Helper class for MoreLikeThis that can be called from other request handlers
239
// Wraps a Lucene MoreLikeThis instance together with the searcher state needed
// to run similarity searches.
// NOTE(review): the three query fields declared further down are overwritten by
// every getMoreLikeThis call, so an instance carries per-call state.
public static class MoreLikeThisHelper
241
// Searcher used to execute the generated queries.
final SolrIndexSearcher searcher;
242
// Configured Lucene MoreLikeThis query generator.
final MoreLikeThis mlt;
243
// Index reader obtained from the searcher; used to load stored documents.
final IndexReader reader;
244
// Schema unique-key field, used to exclude the source document from results.
final SchemaField uniqueKeyField;
245
// True when faceting is requested, in which case a DocSet is collected too.
final boolean needDocSet;
246
// Per-field boost multipliers parsed from the QF parameter.
Map<String,Float> boostFields;
248
// Builds a helper bound to the given searcher and configures the underlying
// MoreLikeThis from request parameters.
//
// params   - request parameters; SIMILARITY_FIELDS is read through
//            params.required(), the remaining MLT settings fall back to the
//            Lucene MoreLikeThis defaults
// searcher - searcher supplying the index reader, schema and analyzer
// Throws SolrException(BAD_REQUEST) when the similarity field list splits into
// zero entries.
public MoreLikeThisHelper( SolrParams params, SolrIndexSearcher searcher )
250
this.searcher = searcher;
251
this.reader = searcher.getReader();
252
this.uniqueKeyField = searcher.getSchema().getUniqueKeyField();
253
// A DocSet is only needed when the caller is going to facet over the results.
this.needDocSet = params.getBool(FacetParams.FACET,false);
255
SolrParams required = params.required();
// NOTE(review): splitList matches one separator at a time, so consecutive
// separators such as "a, b" produce an empty entry between the two names, and
// a non-empty value always yields at least one entry unless it consists only
// of separators. Confirm that empty field names are acceptable to
// setFieldNames.
256
String[] fields = splitList.split( required.get(MoreLikeThisParams.SIMILARITY_FIELDS) );
257
if( fields.length < 1 ) {
258
throw new SolrException( SolrException.ErrorCode.BAD_REQUEST,
259
"MoreLikeThis requires at least one similarity field: "+MoreLikeThisParams.SIMILARITY_FIELDS );
262
this.mlt = new MoreLikeThis( reader ); // TODO -- after LUCENE-896, we can use , searcher.getSimilarity() );
263
mlt.setFieldNames(fields);
264
mlt.setAnalyzer( searcher.getSchema().getAnalyzer() );
266
// configurable params
267
mlt.setMinTermFreq( params.getInt(MoreLikeThisParams.MIN_TERM_FREQ, MoreLikeThis.DEFAULT_MIN_TERM_FREQ));
268
mlt.setMinDocFreq( params.getInt(MoreLikeThisParams.MIN_DOC_FREQ, MoreLikeThis.DEFAULT_MIN_DOC_FREQ));
269
mlt.setMinWordLen( params.getInt(MoreLikeThisParams.MIN_WORD_LEN, MoreLikeThis.DEFAULT_MIN_WORD_LENGTH));
270
mlt.setMaxWordLen( params.getInt(MoreLikeThisParams.MAX_WORD_LEN, MoreLikeThis.DEFAULT_MAX_WORD_LENGTH));
271
mlt.setMaxQueryTerms( params.getInt(MoreLikeThisParams.MAX_QUERY_TERMS, MoreLikeThis.DEFAULT_MAX_QUERY_TERMS));
272
mlt.setMaxNumTokensParsed(params.getInt(MoreLikeThisParams.MAX_NUM_TOKENS_PARSED, MoreLikeThis.DEFAULT_MAX_NUM_TOKENS_PARSED));
273
mlt.setBoost( params.getBool(MoreLikeThisParams.BOOST, false ) );
274
// Field boosts come from the QF parameter values.
boostFields = SolrPluginUtils.parseFieldBoosts(params.getParams(MoreLikeThisParams.QF));
277
// Queries produced by the most recent getMoreLikeThis call:
// the query as returned by MoreLikeThis.like(...),
private Query rawMLTQuery;
278
// the result of getBoostedQuery(rawMLTQuery),
private Query boostedMLTQuery;
279
// and the boosted query combined with an exclusion of the source document
// (only assigned by the document-id variant of getMoreLikeThis).
private BooleanQuery realMLTQuery;
281
// Accessor for the raw query. NOTE(review): body not visible in this listing.
public Query getRawMLTQuery(){
285
// Returns the boosted query from the most recent getMoreLikeThis call.
public Query getBoostedMLTQuery(){
286
return boostedMLTQuery;
289
// Accessor for the "real" query. NOTE(review): body not visible in this listing.
public Query getRealMLTQuery(){
293
// Produces a clone of the given query in which each clause's boost is multiplied
// by the boost configured for that clause's field in boostFields.
//
// mltquery - must be a BooleanQuery whose clauses all wrap TermQuery instances;
//            the casts below fail with ClassCastException otherwise
// NOTE(review): whether clone() copies the clause queries or shares them with
// mltquery is not visible here; if they are shared, setBoost also alters the
// query that was passed in - confirm against BooleanQuery.clone().
private Query getBoostedQuery(Query mltquery) {
294
BooleanQuery boostedQuery = (BooleanQuery)mltquery.clone();
295
if (boostFields.size() > 0) {
296
List clauses = boostedQuery.clauses();
297
for( Object o : clauses ) {
298
TermQuery q = (TermQuery)((BooleanClause)o).getQuery();
299
Float b = this.boostFields.get(q.getTerm().field());
301
// NOTE(review): original line 300 is missing from this listing. `b` is unboxed
// here, so a field without an entry in boostFields needs a null guard - confirm
// that the missing line provides it.
q.setBoost(b*q.getBoost());
308
// Finds documents similar to the indexed document with the given internal id,
// excluding that document itself from the results.
//
// id      - internal Lucene document id of the source document
// start   - offset into the result list
// rows    - maximum number of results
// filters - filter queries applied to the search (may be null, as passed by
//           getMoreLikeThese)
// terms   - when non-null, receives the interesting terms of the generated query
// flags   - search flags forwarded to the searcher
// Throws IOException from the index reader, MoreLikeThis or the searcher.
// NOTE(review): interesting terms are taken from rawMLTQuery here, while the
// Reader variant uses boostedMLTQuery - confirm which boosts callers expect.
public DocListAndSet getMoreLikeThis( int id, int start, int rows, List<Query> filters, List<InterestingTerm> terms, int flags ) throws IOException
310
Document doc = reader.document(id);
311
rawMLTQuery = mlt.like(id);
312
boostedMLTQuery = getBoostedQuery( rawMLTQuery );
313
if( terms != null ) {
314
fillInterestingTermsFromMLTQuery( rawMLTQuery, terms );
317
// exclude current document from results
318
realMLTQuery = new BooleanQuery();
319
realMLTQuery.add(boostedMLTQuery, BooleanClause.Occur.MUST);
321
// MUST_NOT clause on the source document's unique key (indexed form).
// NOTE(review): uniqueKeyField is dereferenced unconditionally; confirm the
// schema always defines a unique key when this path is used.
new TermQuery(new Term(uniqueKeyField.getName(), uniqueKeyField.getType().storedToIndexed(doc.getFieldable(uniqueKeyField.getName())))),
322
BooleanClause.Occur.MUST_NOT);
324
// Collect list + set when faceting is requested, otherwise only the list.
DocListAndSet results = new DocListAndSet();
325
if (this.needDocSet) {
326
results = searcher.getDocListAndSet(realMLTQuery, filters, null, start, rows, flags);
328
results.docList = searcher.getDocList(realMLTQuery, filters, null, start, rows, flags);
333
// Finds documents similar to free text supplied through a Reader.
//
// reader  - source of the text to analyse; note that this parameter shadows the
//           IndexReader field of the same name inside this method
// start   - offset into the result list
// rows    - maximum number of results
// filters - filter queries applied to the search
// terms   - when non-null, receives the interesting terms of the boosted query
// flags   - search flags forwarded to the searcher
// Throws IOException from MoreLikeThis or the searcher.
// Unlike the document-id variant, no document is excluded from the results and
// realMLTQuery is not assigned.
public DocListAndSet getMoreLikeThis( Reader reader, int start, int rows, List<Query> filters, List<InterestingTerm> terms, int flags ) throws IOException
335
// analyzing with the first field: previous (stupid) behavior
// (only the first configured similarity field is passed to like()).
336
rawMLTQuery = mlt.like(reader, mlt.getFieldNames()[0]);
337
boostedMLTQuery = getBoostedQuery( rawMLTQuery );
338
if( terms != null ) {
339
fillInterestingTermsFromMLTQuery( boostedMLTQuery, terms );
341
// Collect list + set when faceting is requested, otherwise only the list.
DocListAndSet results = new DocListAndSet();
342
if (this.needDocSet) {
343
results = searcher.getDocListAndSet( boostedMLTQuery, filters, null, start, rows, flags);
345
results.docList = searcher.getDocList( boostedMLTQuery, filters, null, start, rows, flags);
351
// Runs a MoreLikeThis search for every document in the given list and collects
// the results keyed by each document's printable unique key.
//
// docs  - documents to find similar documents for
// rows  - maximum number of similar documents per source document
// flags - search flags forwarded to getMoreLikeThis
// Each per-document search starts at offset 0 with no filters and without
// collecting interesting terms.
// Throws IOException from the index reader or the underlying search.
public NamedList<DocList> getMoreLikeThese( DocList docs, int rows, int flags ) throws IOException
353
IndexSchema schema = searcher.getSchema();
354
// Note: this local shadows the MoreLikeThis field that is also named mlt.
NamedList<DocList> mlt = new SimpleOrderedMap<DocList>();
355
DocIterator iterator = docs.iterator();
356
while( iterator.hasNext() ) {
357
int id = iterator.nextDoc();
359
DocListAndSet sim = getMoreLikeThis( id, 0, rows, null, null, flags );
360
String name = schema.printableUniqueKey( reader.document( id ) );
362
mlt.add(name, sim.docList);
367
// Converts each clause of an MLT query into an InterestingTerm carrying the
// clause's term and boost.
//
// query - must be a BooleanQuery whose clauses all wrap TermQuery instances;
//         the casts below fail with ClassCastException otherwise
// terms - destination list
// NOTE(review): the statement that stores `it` (presumably terms.add(it)) is on
// a line missing from this listing - confirm.
private void fillInterestingTermsFromMLTQuery( Query query, List<InterestingTerm> terms )
369
List clauses = ((BooleanQuery)query).clauses();
370
for( Object o : clauses ) {
371
TermQuery q = (TermQuery)((BooleanClause)o).getQuery();
372
InterestingTerm it = new InterestingTerm();
373
it.boost = q.getBoost();
374
it.term = q.getTerm();
377
// alternatively we could use
378
// mltquery.extractTerms( terms );
381
// No-argument accessor returning a MoreLikeThis instance.
// NOTE(review): the body is not visible; presumably it returns the mlt field.
public MoreLikeThis getMoreLikeThis()
388
//////////////////////// SolrInfoMBeans methods //////////////////////
391
// Returns the handler's version as a source-control revision keyword string.
public String getVersion() {
392
return "$Revision: 1164331 $";
396
// Returns a short human-readable description of this handler.
public String getDescription() {
397
return "Solr MoreLikeThis";
401
// Returns the source-control id keyword string for this file (file name,
// revision, timestamp and committer).
public String getSourceId() {
402
return "$Id: MoreLikeThisHandler.java 1164331 2011-09-02 02:14:28Z koji $";
406
// Returns the source-control URL keyword string identifying where this file
// lives in the repository.
public String getSource() {
407
return "$URL: http://svn.apache.org/repos/asf/lucene/dev/tags/lucene_solr_3_5_0/solr/core/src/java/org/apache/solr/handler/MoreLikeThisHandler.java $";
411
// Returns a one-element array holding the wiki documentation URL for this
// handler, or null if that URL cannot be constructed.
public URL[] getDocs() {
413
return new URL[] { new URL("http://wiki.apache.org/solr/MoreLikeThis") };
415
// The URL is a constant literal, so this branch is not expected to run;
// callers must still tolerate a null return.
catch( MalformedURLException ex ) { return null; }