7
* This source file is subject to the new BSD license that is bundled
8
* with this package in the file LICENSE.txt.
9
* It is also available through the world-wide-web at this URL:
10
* http://framework.zend.com/license/new-bsd
11
* If you did not receive a copy of the license and are unable to
12
* obtain it through the world-wide-web, please send an email
13
* to license@zend.com so we can send you a copy immediately.
16
* @package Zend_Search_Lucene
18
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
19
* @license http://framework.zend.com/license/new-bsd New BSD License
23
/** Zend_Search_Lucene_Search_Similarity_Default */
24
require_once 'Zend/Search/Lucene/Search/Similarity/Default.php';
29
* @package Zend_Search_Lucene
31
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
32
* @license http://framework.zend.com/license/new-bsd New BSD License
34
abstract class Zend_Search_Lucene_Search_Similarity
37
* The Similarity implementation used by default.
39
* @var Zend_Search_Lucene_Search_Similarity
41
private static $_defaultImpl;
44
* Cache of decoded bytes.
49
private static $_normTable = array( 0 => 0.0,
304
255 => 7.5161928E9 );
308
/**
 * Register the Similarity implementation that indexing and search use
 * by default.
 *
 * @param Zend_Search_Lucene_Search_Similarity $similarity
 */
public static function setDefault(Zend_Search_Lucene_Search_Similarity $similarity)
{
    // Overwrites any previously registered default instance.
    self::$_defaultImpl = $similarity;
}
320
/**
 * Fetch the Similarity implementation that indexing and search use by default.
 *
 * When no instance has been registered yet, a
 * Zend_Search_Lucene_Search_Similarity_Default object is created, remembered
 * and returned.
 *
 * @return Zend_Search_Lucene_Search_Similarity
 */
public static function getDefault()
{
    // Fast path: a default has already been registered or created.
    if (self::$_defaultImpl instanceof Zend_Search_Lucene_Search_Similarity) {
        return self::$_defaultImpl;
    }

    // Nothing usable stored yet: create the stock implementation once.
    self::$_defaultImpl = new Zend_Search_Lucene_Search_Similarity_Default();

    return self::$_defaultImpl;
}
336
* Computes the normalization value for a field given the total number of
337
* terms contained in a field. These values, together with field boosts, are
338
* stored in an index and multiplied into scores for hits on each field by the
341
* Matches in longer fields are less precise, so implementations of this
342
* method usually return smaller values when 'numTokens' is large,
343
* and larger values when 'numTokens' is small.
345
* Note that these values are computed under
346
* IndexWriter::addDocument(Document) and stored then using
347
* encodeNorm(float). Thus they have limited precision, and documents
348
* must be re-indexed if this method is altered.
350
* fieldName - name of field
351
* numTokens - the total number of tokens contained in fields named
352
* 'fieldName' of 'doc'.
353
* Returns a normalization factor for hits on this field of this document
355
* @param string $fieldName
356
* @param integer $numTokens
359
// Abstract: this class provides no formula; each concrete Similarity must implement it.
abstract public function lengthNorm($fieldName, $numTokens);
362
* Computes the normalization value for a query given the sum of the squared
363
* weights of each of the query terms. This value is then multiplied into the
364
* weight of each query term.
366
* This does not affect ranking, but rather just attempts to make scores
367
* from different queries comparable.
369
* sumOfSquaredWeights - the sum of the squares of query term weights
370
* Returns a normalization factor for query weights
372
* @param float $sumOfSquaredWeights
375
// Abstract: the query normalization formula is left to concrete subclasses.
abstract public function queryNorm($sumOfSquaredWeights);
379
/**
 * Decodes a normalization factor stored in an index.
 *
 * The stored byte is used as an index into the table of pre-decoded values.
 *
 * @param integer $byte
 * @return float
 */
public static function decodeNorm($byte)
{
    // Keep only the low eight bits so the lookup index is always within 0..255.
    $index = $byte & 0xFF;

    return self::$_normTable[$index];
}
391
/**
 * Encodes a normalization factor for storage in an index.
 *
 * The encoding uses a five-bit exponent and three-bit mantissa, thus
 * representing values from around 7x10^9 to 2x10^-9 with about one
 * significant decimal digit of accuracy. Zero is also represented.
 * Negative numbers are rounded up to zero. Values too large to represent
 * are rounded down to the largest representable value. Positive values too
 * small to represent are rounded up to the smallest positive representable
 * value.
 *
 * @param float $f
 * @return integer
 */
public static function encodeNorm($f)
{
    // Visibility is spelled out (it was implicitly public) to match the
    // other methods of this class; the conversion itself is delegated.
    return self::_floatToByte($f);
}
410
/**
 * Float to byte conversion.
 *
 * Binary-searches the norm table for $f. An exact match returns its index;
 * otherwise the index of the nearer neighbouring entry is returned. The
 * search relies on the table being sorted in ascending order.
 *
 * @param float $f
 * @return integer index into the norm table (0..255)
 */
private static function _floatToByte($f)
{
    // round negatives up to zero
    if ($f <= 0.0) {
        return 0;
    }

    // search for appropriate value
    $lowIndex  = 0;
    $highIndex = 255;
    while ($highIndex >= $lowIndex) {
        $mid   = ($highIndex + $lowIndex) >> 1;
        $delta = $f - self::$_normTable[$mid];

        if ($delta < 0) {
            $highIndex = $mid - 1;
        } elseif ($delta > 0) {
            $lowIndex = $mid + 1;
        } else {
            return $mid; // We got it!
        }
    }

    // No exact match: $highIndex is the last index whose entry is below $f.
    // A positive value lying under the smallest positive entry must not
    // collapse to zero; it is rounded up to the smallest positive value.
    if ($highIndex < 1) {
        return 1;
    }

    // round to closest value
    if ($highIndex != 255 &&
        $f - self::$_normTable[$highIndex] > self::$_normTable[$highIndex + 1] - $f) {
        return $highIndex + 1;
    }

    return $highIndex;
}
450
* Computes a score factor based on a term or phrase's frequency in a
451
* document. This value is multiplied by the idf(Term, Searcher)
452
* factor for each term in the query and these products are then summed to
453
* form the initial score for a document.
455
* Terms and phrases repeated in a document indicate the topic of the
456
* document, so implementations of this method usually return larger values
457
* when 'freq' is large, and smaller values when 'freq'
460
* freq - the frequency of a term within a document
461
* Returns a score factor based on a term's within-document frequency
466
// Abstract: the term-frequency factor is defined by concrete subclasses.
abstract public function tf($freq);
469
* Computes the amount of a sloppy phrase match, based on an edit distance.
470
* This value is summed for each sloppy phrase match in a document to form
471
* the frequency that is passed to tf(float).
473
* A phrase match with a small edit distance to a document passage more
474
* closely matches the document, so implementations of this method usually
475
* return larger values when the edit distance is small and smaller values
478
* distance - the edit distance of this sloppy phrase match
479
* Returns the frequency increment for this match
481
* @param integer $distance
484
// Abstract: the frequency increment per sloppy match is defined by concrete subclasses.
abstract public function sloppyFreq($distance);
488
/**
 * Computes a score factor for a simple term or a phrase.
 *
 * For a single term the result is
 *   idfFreq($reader->docFreq($term), $reader->count());
 * for an array of terms it is the sum of that value over every term.
 *
 * @param mixed $input  the term in question or array of terms
 * @param Zend_Search_Lucene_Interface $reader  reader for the document collection being searched
 * @return float a score factor for the term
 */
public function idf($input, Zend_Search_Lucene_Interface $reader)
{
    // Read the collection size once instead of once per term.
    $numDocs = $reader->count();

    if (!is_array($input)) {
        return $this->idfFreq($reader->docFreq($input), $numDocs);
    }

    // Phrase: accumulate the per-term factors.
    $idf = 0.0;
    foreach ($input as $term) {
        $idf += $this->idfFreq($reader->docFreq($term), $numDocs);
    }

    return $idf;
}
515
* Computes a score factor based on a term's document frequency (the number
516
* of documents which contain the term). This value is multiplied by the
517
* tf(int) factor for each term in the query and these products are
518
* then summed to form the initial score for a document.
520
* Terms that occur in fewer documents are better indicators of topic, so
521
* implementations of this method usually return larger values for rare terms,
522
* and smaller values for common terms.
524
* docFreq - the number of documents which contain the term
525
* numDocs - the total number of documents in the collection
526
* Returns a score factor based on the term's document frequency
528
* @param integer $docFreq
529
* @param integer $numDocs
532
// Abstract: idf() in this class calls this once per term with the term's document frequency.
abstract public function idfFreq($docFreq, $numDocs);
535
* Computes a score factor based on the fraction of all query terms that a
536
* document contains. This value is multiplied into scores.
538
* The presence of a large portion of the query terms indicates a better
539
* match with the query, so implementations of this method usually return
540
* larger values when the ratio between these parameters is large and smaller
541
* values when the ratio between them is small.
543
* overlap - the number of query terms matched in the document
544
* maxOverlap - the total number of terms in the query
545
* Returns a score factor based on term overlap with the query
547
* @param integer $overlap
548
* @param integer $maxOverlap
551
// Abstract: the overlap-based factor is defined by concrete subclasses.
abstract public function coord($overlap, $maxOverlap);