1
package org.apache.lucene.search;
4
* Licensed to the Apache Software Foundation (ASF) under one or more
5
* contributor license agreements. See the NOTICE file distributed with
6
* this work for additional information regarding copyright ownership.
7
* The ASF licenses this file to You under the Apache License, Version 2.0
8
* (the "License"); you may not use this file except in compliance with
9
* the License. You may obtain a copy of the License at
11
* http://www.apache.org/licenses/LICENSE-2.0
13
* Unless required by applicable law or agreed to in writing, software
14
* distributed under the License is distributed on an "AS IS" BASIS,
15
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
* See the License for the specific language governing permissions and
17
* limitations under the License.
20
import org.apache.lucene.index.Term;
21
import org.apache.lucene.util.SmallFloat;
23
import java.io.IOException;
24
import java.io.Serializable;
25
import java.util.Collection;
26
import java.util.Iterator;
28
/** Expert: Scoring API.
29
* <p>Subclasses implement search scoring.
31
* <p>The score of query <code>q</code> for document <code>d</code> correlates to the
32
* cosine-distance or dot-product between document and query vectors in a
33
* <a href="http://en.wikipedia.org/wiki/Vector_Space_Model">
34
* Vector Space Model (VSM) of Information Retrieval</a>.
35
* A document whose vector is closer to the query vector in that model is scored higher.
37
* The score is computed as follows:
40
* <table cellpadding="1" cellspacing="0" border="1" align="center">
42
* <table cellpadding="1" cellspacing="0" border="0" align="center">
44
* <td valign="middle" align="right" rowspan="1">
45
* score(q,d) =
46
* <A HREF="#formula_coord">coord(q,d)</A> ·
47
* <A HREF="#formula_queryNorm">queryNorm(q)</A> ·
49
* <td valign="bottom" align="center" rowspan="1">
50
* <big><big><big>∑</big></big></big>
52
* <td valign="middle" align="right" rowspan="1">
53
* <big><big>(</big></big>
54
* <A HREF="#formula_tf">tf(t in d)</A> ·
55
* <A HREF="#formula_idf">idf(t)</A><sup>2</sup> ·
56
* <A HREF="#formula_termBoost">t.getBoost()</A> ·
57
* <A HREF="#formula_norm">norm(t,d)</A>
58
* <big><big>)</big></big>
63
* <td align="center"><small>t in q</small></td>
73
* <A NAME="formula_tf"></A>
75
* correlates to the term's <i>frequency</i>,
76
* defined as the number of times term <i>t</i> appears in the currently scored document <i>d</i>.
77
* Documents that have more occurrences of a given term receive a higher score.
78
* The default computation for <i>tf(t in d)</i> in
79
* {@link org.apache.lucene.search.DefaultSimilarity#tf(float) DefaultSimilarity} is:
82
* <table cellpadding="2" cellspacing="2" border="0" align="center">
84
* <td valign="middle" align="right" rowspan="1">
85
* {@link org.apache.lucene.search.DefaultSimilarity#tf(float) tf(t in d)} =
87
* <td valign="top" align="center" rowspan="1">
88
* frequency<sup><big>½</big></sup>
96
* <A NAME="formula_idf"></A>
97
* <b>idf(t)</b> stands for Inverse Document Frequency. This value
98
* correlates to the inverse of <i>docFreq</i>
99
* (the number of documents in which the term <i>t</i> appears).
100
* This means rarer terms give higher contribution to the total score.
101
* The default computation for <i>idf(t)</i> in
102
* {@link org.apache.lucene.search.DefaultSimilarity#idf(int, int) DefaultSimilarity} is:
105
* <table cellpadding="2" cellspacing="2" border="0" align="center">
107
* <td valign="middle" align="right">
108
* {@link org.apache.lucene.search.DefaultSimilarity#idf(int, int) idf(t)} =
110
* <td valign="middle" align="center">
111
* 1 + log <big>(</big>
113
* <td valign="middle" align="center">
115
* <tr><td align="center"><small>numDocs</small></td></tr>
116
* <tr><td align="center">–––––––––</td></tr>
117
* <tr><td align="center"><small>docFreq+1</small></td></tr>
120
* <td valign="middle" align="center">
129
* <A NAME="formula_coord"></A>
131
* is a score factor based on how many of the query terms are found in the specified document.
132
* Typically, a document that contains more of the query's terms will receive a higher score
133
* than another document with fewer query terms.
134
* This is a search time factor computed in
135
* {@link #coord(int, int) coord(q,d)}
136
* by the Similarity in effect at search time.
141
* <A NAME="formula_queryNorm"></A>
144
* is a normalizing factor used to make scores between queries comparable.
145
* This factor does not affect document ranking (since all ranked documents are multiplied by the same factor),
146
* but rather just attempts to make scores from different queries (or even different indexes) comparable.
147
* This is a search time factor computed by the Similarity in effect at search time.
149
* The default computation in
150
* {@link org.apache.lucene.search.DefaultSimilarity#queryNorm(float) DefaultSimilarity}
153
* <table cellpadding="1" cellspacing="0" border="0" align="center">
155
* <td valign="middle" align="right" rowspan="1">
156
* queryNorm(q) =
157
* {@link org.apache.lucene.search.DefaultSimilarity#queryNorm(float) queryNorm(sumOfSquaredWeights)}
160
* <td valign="middle" align="center" rowspan="1">
162
* <tr><td align="center"><big>1</big></td></tr>
163
* <tr><td align="center"><big>
164
* ––––––––––––––
166
* <tr><td align="center">sumOfSquaredWeights<sup><big>½</big></sup></td></tr>
173
* The sum of squared weights (of the query terms) is
174
* computed by the query {@link org.apache.lucene.search.Weight} object.
175
* For example, a {@link org.apache.lucene.search.BooleanQuery boolean query}
176
* computes this value as:
179
* <table cellpadding="1" cellspacing="0" border="0"n align="center">
181
* <td valign="middle" align="right" rowspan="1">
182
* {@link org.apache.lucene.search.Weight#sumOfSquaredWeights() sumOfSquaredWeights} =
183
* {@link org.apache.lucene.search.Query#getBoost() q.getBoost()} <sup><big>2</big></sup>
184
* ·
186
* <td valign="bottom" align="center" rowspan="1">
187
* <big><big><big>∑</big></big></big>
189
* <td valign="middle" align="right" rowspan="1">
190
* <big><big>(</big></big>
191
* <A HREF="#formula_idf">idf(t)</A> ·
192
* <A HREF="#formula_termBoost">t.getBoost()</A>
193
* <big><big>) <sup>2</sup> </big></big>
198
* <td align="center"><small>t in q</small></td>
207
* <A NAME="formula_termBoost"></A>
208
* <b>t.getBoost()</b>
209
* is a search time boost of term <i>t</i> in the query <i>q</i> as
210
* specified in the query text
211
* (see <A HREF="../../../../../queryparsersyntax.html#Boosting a Term">query syntax</A>),
212
* or as set by application calls to
213
* {@link org.apache.lucene.search.Query#setBoost(float) setBoost()}.
214
* Notice that there is really no direct API for accessing a boost of one term in a multi term query,
215
* but rather multi terms are represented in a query as multi
216
* {@link org.apache.lucene.search.TermQuery TermQuery} objects,
217
* and so the boost of a term in the query is accessible by calling the sub-query
218
* {@link org.apache.lucene.search.Query#getBoost() getBoost()}.
223
* <A NAME="formula_norm"></A>
224
* <b>norm(t,d)</b> encapsulates a few (indexing time) boost and length factors:
227
* <li><b>Document boost</b> - set by calling
228
* {@link org.apache.lucene.document.Document#setBoost(float) doc.setBoost()}
229
* before adding the document to the index.
231
* <li><b>Field boost</b> - set by calling
232
* {@link org.apache.lucene.document.Fieldable#setBoost(float) field.setBoost()}
233
* before adding the field to a document.
235
* <li>{@link #lengthNorm(String, int) <b>lengthNorm</b>(field)} - computed
236
* when the document is added to the index in accordance with the number of tokens
237
* of this field in the document, so that shorter fields contribute more to the score.
238
* LengthNorm is computed by the Similarity class in effect at indexing.
243
* When a document is added to the index, all the above factors are multiplied.
244
* If the document has multiple fields with the same name, all their boosts are multiplied together:
247
* <table cellpadding="1" cellspacing="0" border="0"n align="center">
249
* <td valign="middle" align="right" rowspan="1">
250
* norm(t,d) =
251
* {@link org.apache.lucene.document.Document#getBoost() doc.getBoost()}
252
* ·
253
* {@link #lengthNorm(String, int) lengthNorm(field)}
254
* ·
256
* <td valign="bottom" align="center" rowspan="1">
257
* <big><big><big>∏</big></big></big>
259
* <td valign="middle" align="right" rowspan="1">
260
* {@link org.apache.lucene.document.Fieldable#getBoost() f.getBoost}()
265
* <td align="center"><small>field <i><b>f</b></i> in <i>d</i> named as <i><b>t</b></i></small></td>
270
* However the resulted <i>norm</i> value is {@link #encodeNorm(float) encoded} as a single byte
271
* before being stored.
272
* At search time, the norm byte value is read from the index
273
* {@link org.apache.lucene.store.Directory directory} and
274
* {@link #decodeNorm(byte) decoded} back to a float <i>norm</i> value.
275
* This encoding/decoding, while reducing index size, comes with the price of
276
* precision loss - it is not guaranteed that decode(encode(x)) = x.
277
* For instance, decode(encode(0.89)) = 0.75.
278
* Also notice that search time is too late to modify this <i>norm</i> part of scoring, e.g. by
279
* using a different {@link Similarity} for search.
284
* @see #setDefault(Similarity)
285
* @see org.apache.lucene.index.IndexWriter#setSimilarity(Similarity)
286
* @see Searcher#setSimilarity(Similarity)
288
public abstract class Similarity implements Serializable {
289
/** The Similarity implementation used by default. */
290
private static Similarity defaultImpl = new DefaultSimilarity();
292
/** Set the default Similarity implementation used by indexing and search
295
* @see Searcher#setSimilarity(Similarity)
296
* @see org.apache.lucene.index.IndexWriter#setSimilarity(Similarity)
298
public static void setDefault(Similarity similarity) {
299
Similarity.defaultImpl = similarity;
302
/** Return the default Similarity implementation used by indexing and search
305
* <p>This is initially an instance of {@link DefaultSimilarity}.
307
* @see Searcher#setSimilarity(Similarity)
308
* @see org.apache.lucene.index.IndexWriter#setSimilarity(Similarity)
310
public static Similarity getDefault() {
311
return Similarity.defaultImpl;
314
/** Cache of decoded bytes. */
315
private static final float[] NORM_TABLE = new float[256];
318
for (int i = 0; i < 256; i++)
319
NORM_TABLE[i] = SmallFloat.byte315ToFloat((byte)i);
322
/** Decodes a normalization factor stored in an index.
323
* @see #encodeNorm(float)
325
public static float decodeNorm(byte b) {
326
return NORM_TABLE[b & 0xFF]; // & 0xFF maps negative bytes to positive above 127
329
/** Returns a table for decoding normalization bytes.
330
* @see #encodeNorm(float)
332
public static float[] getNormDecoder() {
336
/** Computes the normalization value for a field given the total number of
337
* terms contained in a field. These values, together with field boosts, are
338
* stored in an index and multipled into scores for hits on each field by the
341
* <p>Matches in longer fields are less precise, so implementations of this
342
* method usually return smaller values when <code>numTokens</code> is large,
343
* and larger values when <code>numTokens</code> is small.
345
* <p>That these values are computed under
346
* {@link org.apache.lucene.index.IndexWriter#addDocument(org.apache.lucene.document.Document)}
347
* and stored then using
348
* {@link #encodeNorm(float)}.
349
* Thus they have limited precision, and documents
350
* must be re-indexed if this method is altered.
352
* @param fieldName the name of the field
353
* @param numTokens the total number of tokens contained in fields named
354
* <i>fieldName</i> of <i>doc</i>.
355
* @return a normalization factor for hits on this field of this document
357
* @see org.apache.lucene.document.Field#setBoost(float)
359
public abstract float lengthNorm(String fieldName, int numTokens);
361
/** Computes the normalization value for a query given the sum of the squared
362
* weights of each of the query terms. This value is then multipled into the
363
* weight of each query term.
365
* <p>This does not affect ranking, but rather just attempts to make scores
366
* from different queries comparable.
368
* @param sumOfSquaredWeights the sum of the squares of query term weights
369
* @return a normalization factor for query weights
371
public abstract float queryNorm(float sumOfSquaredWeights);
373
/** Encodes a normalization factor for storage in an index.
375
* <p>The encoding uses a three-bit mantissa, a five-bit exponent, and
376
* the zero-exponent point at 15, thus
377
* representing values from around 7x10^9 to 2x10^-9 with about one
378
* significant decimal digit of accuracy. Zero is also represented.
379
* Negative numbers are rounded up to zero. Values too large to represent
380
* are rounded down to the largest representable value. Positive values too
381
* small to represent are rounded up to the smallest positive representable
384
* @see org.apache.lucene.document.Field#setBoost(float)
385
* @see org.apache.lucene.util.SmallFloat
387
public static byte encodeNorm(float f) {
388
return SmallFloat.floatToByte315(f);
392
/** Computes a score factor based on a term or phrase's frequency in a
393
* document. This value is multiplied by the {@link #idf(Term, Searcher)}
394
* factor for each term in the query and these products are then summed to
395
* form the initial score for a document.
397
* <p>Terms and phrases repeated in a document indicate the topic of the
398
* document, so implementations of this method usually return larger values
399
* when <code>freq</code> is large, and smaller values when <code>freq</code>
402
* <p>The default implementation calls {@link #tf(float)}.
404
* @param freq the frequency of a term within a document
405
* @return a score factor based on a term's within-document frequency
407
public float tf(int freq) {
408
return tf((float)freq);
411
/** Computes the amount of a sloppy phrase match, based on an edit distance.
412
* This value is summed for each sloppy phrase match in a document to form
413
* the frequency that is passed to {@link #tf(float)}.
415
* <p>A phrase match with a small edit distance to a document passage more
416
* closely matches the document, so implementations of this method usually
417
* return larger values when the edit distance is small and smaller values
420
* @see PhraseQuery#setSlop(int)
421
* @param distance the edit distance of this sloppy phrase match
422
* @return the frequency increment for this match
424
public abstract float sloppyFreq(int distance);
426
/** Computes a score factor based on a term or phrase's frequency in a
427
* document. This value is multiplied by the {@link #idf(Term, Searcher)}
428
* factor for each term in the query and these products are then summed to
429
* form the initial score for a document.
431
* <p>Terms and phrases repeated in a document indicate the topic of the
432
* document, so implementations of this method usually return larger values
433
* when <code>freq</code> is large, and smaller values when <code>freq</code>
436
* @param freq the frequency of a term within a document
437
* @return a score factor based on a term's within-document frequency
439
public abstract float tf(float freq);
441
/** Computes a score factor for a simple term.
443
* <p>The default implementation is:<pre>
444
* return idf(searcher.docFreq(term), searcher.maxDoc());
447
* Note that {@link Searcher#maxDoc()} is used instead of
448
* {@link org.apache.lucene.index.IndexReader#numDocs()} because it is proportional to
449
* {@link Searcher#docFreq(Term)} , i.e., when one is inaccurate,
450
* so is the other, and in the same direction.
452
* @param term the term in question
453
* @param searcher the document collection being searched
454
* @return a score factor for the term
456
public float idf(Term term, Searcher searcher) throws IOException {
457
return idf(searcher.docFreq(term), searcher.maxDoc());
460
/** Computes a score factor for a phrase.
462
* <p>The default implementation sums the {@link #idf(Term,Searcher)} factor
463
* for each term in the phrase.
465
* @param terms the terms in the phrase
466
* @param searcher the document collection being searched
467
* @return a score factor for the phrase
469
public float idf(Collection terms, Searcher searcher) throws IOException {
471
Iterator i = terms.iterator();
472
while (i.hasNext()) {
473
idf += idf((Term)i.next(), searcher);
478
/** Computes a score factor based on a term's document frequency (the number
479
* of documents which contain the term). This value is multiplied by the
480
* {@link #tf(int)} factor for each term in the query and these products are
481
* then summed to form the initial score for a document.
483
* <p>Terms that occur in fewer documents are better indicators of topic, so
484
* implementations of this method usually return larger values for rare terms,
485
* and smaller values for common terms.
487
* @param docFreq the number of documents which contain the term
488
* @param numDocs the total number of documents in the collection
489
* @return a score factor based on the term's document frequency
491
public abstract float idf(int docFreq, int numDocs);
493
/** Computes a score factor based on the fraction of all query terms that a
494
* document contains. This value is multiplied into scores.
496
* <p>The presence of a large portion of the query terms indicates a better
497
* match with the query, so implementations of this method usually return
498
* larger values when the ratio between these parameters is large and smaller
499
* values when the ratio between them is small.
501
* @param overlap the number of query terms matched in the document
502
* @param maxOverlap the total number of terms in the query
503
* @return a score factor based on term overlap with the query
505
public abstract float coord(int overlap, int maxOverlap);
509
* Calculate a scoring factor based on the data in the payload. Overriding implementations
510
* are responsible for interpreting what is in the payload. Lucene makes no assumptions about
511
* what is in the byte array.
513
* The default implementation returns 1.
515
* @param fieldName The fieldName of the term this payload belongs to
516
* @param payload The payload byte array to be scored
517
* @param offset The offset into the payload array
518
* @param length The length in the array
519
* @return An implementation dependent float to be used as a scoring factor
521
public float scorePayload(String fieldName, byte [] payload, int offset, int length)