~registry/clubdistro/anoochit-clubdistro

« back to all changes in this revision

Viewing changes to concrete5-5.2ubuntu1/concrete5.2.0RC2/concrete/libraries/3rdparty/Zend/Search/Lucene/Search/Similarity.php

  • Committer: Anuchit Chalothorn
  • Date: 2009-01-19 08:26:13 UTC
  • Revision ID: anoochit@gmail.com-20090119082613-jyxv9tam9ktfa73t
add concrete5 package

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
<?php 
 
2
/**
 
3
 * Zend Framework
 
4
 *
 
5
 * LICENSE
 
6
 *
 
7
 * This source file is subject to the new BSD license that is bundled
 
8
 * with this package in the file LICENSE.txt.
 
9
 * It is also available through the world-wide-web at this URL:
 
10
 * http://framework.zend.com/license/new-bsd
 
11
 * If you did not receive a copy of the license and are unable to
 
12
 * obtain it through the world-wide-web, please send an email
 
13
 * to license@zend.com so we can send you a copy immediately.
 
14
 *
 
15
 * @category   Zend
 
16
 * @package    Zend_Search_Lucene
 
17
 * @subpackage Search
 
18
 * @copyright  Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
 
19
 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 
20
 */
 
21
 
 
22
 
 
23
/** Zend_Search_Lucene_Search_Similarity_Default */
 
24
require_once 'Zend/Search/Lucene/Search/Similarity/Default.php';
 
25
 
 
26
 
 
27
/**
 * Abstract base class for the scoring ("similarity") strategy.
 *
 * The class itself provides:
 *  - a process-wide default implementation (setDefault() / getDefault());
 *  - encoding and decoding of normalization factors to and from a single
 *    byte, backed by a fixed 256-entry lookup table;
 *  - idf(), which combines idfFreq() values for one term or a list of terms.
 *
 * Concrete subclasses supply the individual scoring factors: lengthNorm(),
 * queryNorm(), tf(), sloppyFreq(), idfFreq() and coord().
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Search
 * @copyright  Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 */
abstract class Zend_Search_Lucene_Search_Similarity
{
    /**
     * The Similarity implementation used by default.
     *
     * Stays unset until setDefault() or getDefault() is called.
     *
     * @var Zend_Search_Lucene_Search_Similarity
     */
    private static $_defaultImpl;

    /**
     * Cache of decoded bytes.
     *
     * Maps every byte value (0..255) to the float it represents. The values
     * are strictly increasing, which _floatToByte() relies on for its binary
     * search.
     *
     * @var array
     */
    private static $_normTable = array( 0   => 0.0,
                                        1   => 5.820766E-10,
                                        2   => 6.9849193E-10,
                                        3   => 8.1490725E-10,
                                        4   => 9.313226E-10,
                                        5   => 1.1641532E-9,
                                        6   => 1.3969839E-9,
                                        7   => 1.6298145E-9,
                                        8   => 1.8626451E-9,
                                        9   => 2.3283064E-9,
                                        10  => 2.7939677E-9,
                                        11  => 3.259629E-9,
                                        12  => 3.7252903E-9,
                                        13  => 4.656613E-9,
                                        14  => 5.5879354E-9,
                                        15  => 6.519258E-9,
                                        16  => 7.4505806E-9,
                                        17  => 9.313226E-9,
                                        18  => 1.1175871E-8,
                                        19  => 1.3038516E-8,
                                        20  => 1.4901161E-8,
                                        21  => 1.8626451E-8,
                                        22  => 2.2351742E-8,
                                        23  => 2.6077032E-8,
                                        24  => 2.9802322E-8,
                                        25  => 3.7252903E-8,
                                        26  => 4.4703484E-8,
                                        27  => 5.2154064E-8,
                                        28  => 5.9604645E-8,
                                        29  => 7.4505806E-8,
                                        30  => 8.940697E-8,
                                        31  => 1.0430813E-7,
                                        32  => 1.1920929E-7,
                                        33  => 1.4901161E-7,
                                        34  => 1.7881393E-7,
                                        35  => 2.0861626E-7,
                                        36  => 2.3841858E-7,
                                        37  => 2.9802322E-7,
                                        38  => 3.5762787E-7,
                                        39  => 4.172325E-7,
                                        40  => 4.7683716E-7,
                                        41  => 5.9604645E-7,
                                        42  => 7.1525574E-7,
                                        43  => 8.34465E-7,
                                        44  => 9.536743E-7,
                                        45  => 1.1920929E-6,
                                        46  => 1.4305115E-6,
                                        47  => 1.66893E-6,
                                        48  => 1.9073486E-6,
                                        49  => 2.3841858E-6,
                                        50  => 2.861023E-6,
                                        51  => 3.33786E-6,
                                        52  => 3.8146973E-6,
                                        53  => 4.7683716E-6,
                                        54  => 5.722046E-6,
                                        55  => 6.67572E-6,
                                        56  => 7.6293945E-6,
                                        57  => 9.536743E-6,
                                        58  => 1.1444092E-5,
                                        59  => 1.335144E-5,
                                        60  => 1.5258789E-5,
                                        61  => 1.9073486E-5,
                                        62  => 2.2888184E-5,
                                        63  => 2.670288E-5,
                                        64  => 3.0517578E-5,
                                        65  => 3.8146973E-5,
                                        66  => 4.5776367E-5,
                                        67  => 5.340576E-5,
                                        68  => 6.1035156E-5,
                                        69  => 7.6293945E-5,
                                        70  => 9.1552734E-5,
                                        71  => 1.0681152E-4,
                                        72  => 1.2207031E-4,
                                        73  => 1.5258789E-4,
                                        74  => 1.8310547E-4,
                                        75  => 2.1362305E-4,
                                        76  => 2.4414062E-4,
                                        77  => 3.0517578E-4,
                                        78  => 3.6621094E-4,
                                        79  => 4.272461E-4,
                                        80  => 4.8828125E-4,
                                        81  => 6.1035156E-4,
                                        82  => 7.324219E-4,
                                        83  => 8.544922E-4,
                                        84  => 9.765625E-4,
                                        85  => 0.0012207031,
                                        86  => 0.0014648438,
                                        87  => 0.0017089844,
                                        88  => 0.001953125,
                                        89  => 0.0024414062,
                                        90  => 0.0029296875,
                                        91  => 0.0034179688,
                                        92  => 0.00390625,
                                        93  => 0.0048828125,
                                        94  => 0.005859375,
                                        95  => 0.0068359375,
                                        96  => 0.0078125,
                                        97  => 0.009765625,
                                        98  => 0.01171875,
                                        99  => 0.013671875,
                                        100 => 0.015625,
                                        101 => 0.01953125,
                                        102 => 0.0234375,
                                        103 => 0.02734375,
                                        104 => 0.03125,
                                        105 => 0.0390625,
                                        106 => 0.046875,
                                        107 => 0.0546875,
                                        108 => 0.0625,
                                        109 => 0.078125,
                                        110 => 0.09375,
                                        111 => 0.109375,
                                        112 => 0.125,
                                        113 => 0.15625,
                                        114 => 0.1875,
                                        115 => 0.21875,
                                        116 => 0.25,
                                        117 => 0.3125,
                                        118 => 0.375,
                                        119 => 0.4375,
                                        120 => 0.5,
                                        121 => 0.625,
                                        122 => 0.75,
                                        123 => 0.875,
                                        124 => 1.0,
                                        125 => 1.25,
                                        126 => 1.5,
                                        127 => 1.75,
                                        128 => 2.0,
                                        129 => 2.5,
                                        130 => 3.0,
                                        131 => 3.5,
                                        132 => 4.0,
                                        133 => 5.0,
                                        134 => 6.0,
                                        135 => 7.0,
                                        136 => 8.0,
                                        137 => 10.0,
                                        138 => 12.0,
                                        139 => 14.0,
                                        140 => 16.0,
                                        141 => 20.0,
                                        142 => 24.0,
                                        143 => 28.0,
                                        144 => 32.0,
                                        145 => 40.0,
                                        146 => 48.0,
                                        147 => 56.0,
                                        148 => 64.0,
                                        149 => 80.0,
                                        150 => 96.0,
                                        151 => 112.0,
                                        152 => 128.0,
                                        153 => 160.0,
                                        154 => 192.0,
                                        155 => 224.0,
                                        156 => 256.0,
                                        157 => 320.0,
                                        158 => 384.0,
                                        159 => 448.0,
                                        160 => 512.0,
                                        161 => 640.0,
                                        162 => 768.0,
                                        163 => 896.0,
                                        164 => 1024.0,
                                        165 => 1280.0,
                                        166 => 1536.0,
                                        167 => 1792.0,
                                        168 => 2048.0,
                                        169 => 2560.0,
                                        170 => 3072.0,
                                        171 => 3584.0,
                                        172 => 4096.0,
                                        173 => 5120.0,
                                        174 => 6144.0,
                                        175 => 7168.0,
                                        176 => 8192.0,
                                        177 => 10240.0,
                                        178 => 12288.0,
                                        179 => 14336.0,
                                        180 => 16384.0,
                                        181 => 20480.0,
                                        182 => 24576.0,
                                        183 => 28672.0,
                                        184 => 32768.0,
                                        185 => 40960.0,
                                        186 => 49152.0,
                                        187 => 57344.0,
                                        188 => 65536.0,
                                        189 => 81920.0,
                                        190 => 98304.0,
                                        191 => 114688.0,
                                        192 => 131072.0,
                                        193 => 163840.0,
                                        194 => 196608.0,
                                        195 => 229376.0,
                                        196 => 262144.0,
                                        197 => 327680.0,
                                        198 => 393216.0,
                                        199 => 458752.0,
                                        200 => 524288.0,
                                        201 => 655360.0,
                                        202 => 786432.0,
                                        203 => 917504.0,
                                        204 => 1048576.0,
                                        205 => 1310720.0,
                                        206 => 1572864.0,
                                        207 => 1835008.0,
                                        208 => 2097152.0,
                                        209 => 2621440.0,
                                        210 => 3145728.0,
                                        211 => 3670016.0,
                                        212 => 4194304.0,
                                        213 => 5242880.0,
                                        214 => 6291456.0,
                                        215 => 7340032.0,
                                        216 => 8388608.0,
                                        217 => 1.048576E7,
                                        218 => 1.2582912E7,
                                        219 => 1.4680064E7,
                                        220 => 1.6777216E7,
                                        221 => 2.097152E7,
                                        222 => 2.5165824E7,
                                        223 => 2.9360128E7,
                                        224 => 3.3554432E7,
                                        225 => 4.194304E7,
                                        226 => 5.0331648E7,
                                        227 => 5.8720256E7,
                                        228 => 6.7108864E7,
                                        229 => 8.388608E7,
                                        230 => 1.00663296E8,
                                        231 => 1.17440512E8,
                                        232 => 1.34217728E8,
                                        233 => 1.6777216E8,
                                        234 => 2.01326592E8,
                                        235 => 2.34881024E8,
                                        236 => 2.68435456E8,
                                        237 => 3.3554432E8,
                                        238 => 4.02653184E8,
                                        239 => 4.69762048E8,
                                        240 => 5.3687091E8,
                                        241 => 6.7108864E8,
                                        242 => 8.0530637E8,
                                        243 => 9.395241E8,
                                        244 => 1.07374182E9,
                                        245 => 1.34217728E9,
                                        246 => 1.61061274E9,
                                        247 => 1.87904819E9,
                                        248 => 2.14748365E9,
                                        249 => 2.68435456E9,
                                        250 => 3.22122547E9,
                                        251 => 3.75809638E9,
                                        252 => 4.2949673E9,
                                        253 => 5.3687091E9,
                                        254 => 6.4424509E9,
                                        255 => 7.5161928E9 );


    /**
     * Set the default Similarity implementation used by indexing and search
     * code.
     *
     * @param Zend_Search_Lucene_Search_Similarity $similarity
     */
    public static function setDefault(Zend_Search_Lucene_Search_Similarity $similarity)
    {
        self::$_defaultImpl = $similarity;
    }


    /**
     * Return the default Similarity implementation used by indexing and search
     * code.
     *
     * If no implementation has been set yet, a
     * Zend_Search_Lucene_Search_Similarity_Default instance is created,
     * remembered and returned.
     *
     * @return Zend_Search_Lucene_Search_Similarity
     */
    public static function getDefault()
    {
        if (!self::$_defaultImpl instanceof Zend_Search_Lucene_Search_Similarity) {
            self::$_defaultImpl = new Zend_Search_Lucene_Search_Similarity_Default();
        }

        return self::$_defaultImpl;
    }


    /**
     * Computes the normalization value for a field given the total number of
     * terms contained in a field.  These values, together with field boosts, are
     * stored in an index and multiplied into scores for hits on each field by the
     * search code.
     *
     * Matches in longer fields are less precise, so implementations of this
     * method usually return smaller values when 'numTokens' is large,
     * and larger values when 'numTokens' is small.
     *
     * Note that these values are computed when a document is added to the
     * index and are then stored using encodeNorm(float).  Thus they have
     * limited precision, and documents must be re-indexed if this method is
     * altered.
     *
     * fieldName - name of field
     * numTokens - the total number of tokens contained in fields named
     *             'fieldName' of 'doc'.
     * Returns a normalization factor for hits on this field of this document
     *
     * @param string $fieldName
     * @param integer $numTokens
     * @return float
     */
    abstract public function lengthNorm($fieldName, $numTokens);

    /**
     * Computes the normalization value for a query given the sum of the squared
     * weights of each of the query terms.  This value is then multiplied into the
     * weight of each query term.
     *
     * This does not affect ranking, but rather just attempts to make scores
     * from different queries comparable.
     *
     * sumOfSquaredWeights - the sum of the squares of query term weights
     * Returns a normalization factor for query weights
     *
     * @param float $sumOfSquaredWeights
     * @return float
     */
    abstract public function queryNorm($sumOfSquaredWeights);


    /**
     * Decodes a normalization factor stored in an index.
     *
     * Only the low 8 bits of $byte are used, so any integer maps onto one of
     * the 256 table entries.
     *
     * @param integer $byte
     * @return float
     */
    public static function decodeNorm($byte)
    {
        return self::$_normTable[$byte & 0xFF];
    }


    /**
     * Encodes a normalization factor for storage in an index.
     *
     * The result is the index (0..255) of the entry of the decode table that
     * is closest to $f.  Positive representable values range from about
     * 5.8x10^-10 to about 7.5x10^9 with about one significant decimal digit
     * of accuracy.  Zero is also represented.
     * Negative numbers are rounded up to zero.  Values too large to represent
     * are rounded down to the largest representable value.  Positive values too
     * small to represent are rounded up to the smallest positive representable
     * value.
     *
     * @param float $f
     * @return integer
     */
    public static function encodeNorm($f)
    {
        return self::_floatToByte($f);
    }

    /**
     * Float to byte conversion
     *
     * Finds the table index whose decoded value is nearest to $f.
     *
     * @param float $f
     * @return integer  value in the range 0..255
     */
    private static function _floatToByte($f)
    {
        // zero stays zero; negatives are rounded up to zero
        if ($f <= 0.0) {
            return 0;
        }

        // binary search over the (strictly increasing) decode table
        $lowIndex  = 0;
        $highIndex = 255;
        while ($highIndex >= $lowIndex) {
            $mid   = ($highIndex + $lowIndex) >> 1;
            $delta = $f - self::$_normTable[$mid];

            if ($delta < 0) {
                $highIndex = $mid - 1;
            } elseif ($delta > 0) {
                $lowIndex  = $mid + 1;
            } else {
                return $mid; // exact match
            }
        }

        // No exact match: $highIndex is now the last entry that is smaller
        // than $f.  It cannot be negative because $f > 0 = $_normTable[0].

        if ($highIndex === 0) {
            // Positive underflow.  Nearest-value rounding would pick byte 0,
            // which decodes to exactly 0.0; the documented behaviour is to
            // round up to the smallest positive representable value instead.
            return 1;
        }

        if ($highIndex === 255) {
            // Overflow: clamp to the largest representable value.
            return 255;
        }

        // round to the closest neighbour; a tie goes to the lower entry
        $distanceBelow = $f - self::$_normTable[$highIndex];
        $distanceAbove = self::$_normTable[$highIndex + 1] - $f;

        if ($distanceBelow > $distanceAbove) {
            return $highIndex + 1;
        }

        return $highIndex;
    }


    /**
     * Computes a score factor based on a term or phrase's frequency in a
     * document.  This value is multiplied by the idf(Term, Searcher)
     * factor for each term in the query and these products are then summed to
     * form the initial score for a document.
     *
     * Terms and phrases repeated in a document indicate the topic of the
     * document, so implementations of this method usually return larger values
     * when 'freq' is large, and smaller values when 'freq'
     * is small.
     *
     * freq - the frequency of a term within a document
     * Returns a score factor based on a term's within-document frequency
     *
     * @param float $freq
     * @return float
     */
    abstract public function tf($freq);

    /**
     * Computes the amount of a sloppy phrase match, based on an edit distance.
     * This value is summed for each sloppy phrase match in a document to form
     * the frequency that is passed to tf(float).
     *
     * A phrase match with a small edit distance to a document passage more
     * closely matches the document, so implementations of this method usually
     * return larger values when the edit distance is small and smaller values
     * when it is large.
     *
     * distance - the edit distance of this sloppy phrase match
     * Returns the frequency increment for this match
     *
     * @param integer $distance
     * @return float
     */
    abstract public function sloppyFreq($distance);


    /**
     * Computes a score factor for a simple term or a phrase.
     *
     * For a single term the result is:
     *   idfFreq($reader->docFreq($input), $reader->count())
     * For an array of terms the result is the sum of that expression over
     * every term in the array (0.0 for an empty array).
     *
     * input - the term in question or array of terms
     * reader - the document collection being searched
     * Returns a score factor for the term
     *
     * @param mixed $input
     * @param Zend_Search_Lucene_Interface $reader
     * @return float
     */
    public function idf($input, Zend_Search_Lucene_Interface $reader)
    {
        if (!is_array($input)) {
            return $this->idfFreq($reader->docFreq($input), $reader->count());
        } else {
            $idf = 0.0;
            foreach ($input as $term) {
                $idf += $this->idfFreq($reader->docFreq($term), $reader->count());
            }
            return $idf;
        }
    }

    /**
     * Computes a score factor based on a term's document frequency (the number
     * of documents which contain the term).  This value is multiplied by the
     * tf(int) factor for each term in the query and these products are
     * then summed to form the initial score for a document.
     *
     * Terms that occur in fewer documents are better indicators of topic, so
     * implementations of this method usually return larger values for rare terms,
     * and smaller values for common terms.
     *
     * docFreq - the number of documents which contain the term
     * numDocs - the total number of documents in the collection
     * Returns a score factor based on the term's document frequency
     *
     * @param integer $docFreq
     * @param integer $numDocs
     * @return float
     */
    abstract public function idfFreq($docFreq, $numDocs);

    /**
     * Computes a score factor based on the fraction of all query terms that a
     * document contains.  This value is multiplied into scores.
     *
     * The presence of a large portion of the query terms indicates a better
     * match with the query, so implementations of this method usually return
     * larger values when the ratio between these parameters is large and smaller
     * values when the ratio between them is small.
     *
     * overlap - the number of query terms matched in the document
     * maxOverlap - the total number of terms in the query
     * Returns a score factor based on term overlap with the query
     *
     * @param integer $overlap
     * @param integer $maxOverlap
     * @return float
     */
    abstract public function coord($overlap, $maxOverlap);
}
 
553