7
* This source file is subject to the new BSD license that is bundled
8
* with this package in the file LICENSE.txt.
9
* It is also available through the world-wide-web at this URL:
10
* http://framework.zend.com/license/new-bsd
11
* If you did not receive a copy of the license and are unable to
12
* obtain it through the world-wide-web, please send an email
13
* to license@zend.com so we can send you a copy immediately.
16
* @package Zend_Search_Lucene
18
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
19
* @license http://framework.zend.com/license/new-bsd New BSD License
23
/** Zend_Search_Lucene_Exception */
24
require_once 'Zend/Search/Lucene/Exception.php';
30
* It's a dummy class created to encapsulate poorly structured code.
31
* Manual "method inlining" is performed to speed up the dictionary index loading operation,
32
* which is a major bottleneck for search performance.
36
* @package Zend_Search_Lucene
38
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
39
* @license http://framework.zend.com/license/new-bsd New BSD License
41
class Zend_Search_Lucene_Index_DictionaryLoader
{
    /**
     * Dictionary index loader.
     *
     * It takes a string which is actually <segment_name>.tii index file data and
     * returns two arrays - term and termInfo lists.
     *
     * See Zend_Search_Lucene_Index_SegmentInfo class for details
     *
     * All reads are inlined on purpose: $pos is a byte cursor into $data and
     * every "readInt/readLong/readVInt/readString" step is expanded in place.
     *
     * @param string $data Raw bytes of the term info index (.tii) file
     * @return array Two-element array: list of array(fieldNum, termText) and
     *               list of array(docFreq, freqPointer, proxPointer, skipDelta, indexPointer)
     * @throws Zend_Search_Lucene_Exception
     */
    public static function load($data)
    {
        $termDictionary = array();
        $termInfos      = array();
        $pos            = 0;

        // $tiVersion = $tiiFile->readInt();
        $tiVersion = ord($data[0]) << 24 | ord($data[1]) << 16 | ord($data[2]) << 8 | ord($data[3]);
        $pos += 4;
        if ($tiVersion != (int)0xFFFFFFFE /* pre-2.1 format */ &&
            $tiVersion != (int)0xFFFFFFFD /* 2.1+ format    */) {
            throw new Zend_Search_Lucene_Exception('Wrong TermInfoIndexFile file format');
        }

        // $indexTermCount = $tiiFile->readLong();
        if (PHP_INT_SIZE > 4) {
            $indexTermCount = ord($data[$pos])   << 56 |
                              ord($data[$pos+1]) << 48 |
                              ord($data[$pos+2]) << 40 |
                              ord($data[$pos+3]) << 32 |
                              ord($data[$pos+4]) << 24 |
                              ord($data[$pos+5]) << 16 |
                              ord($data[$pos+6]) << 8  |
                              ord($data[$pos+7]);
        } else {
            // 32-bit integers: the high four bytes and the sign bit must be clear.
            if ((ord($data[$pos])            != 0) ||
                (ord($data[$pos+1])          != 0) ||
                (ord($data[$pos+2])          != 0) ||
                (ord($data[$pos+3])          != 0) ||
                ((ord($data[$pos+4]) & 0x80) != 0)) {
                throw new Zend_Search_Lucene_Exception('Largest supported segment size (for 32-bit mode) is 2Gb');
            }

            $indexTermCount = ord($data[$pos+4]) << 24 |
                              ord($data[$pos+5]) << 16 |
                              ord($data[$pos+6]) << 8  |
                              ord($data[$pos+7]);
        }
        $pos += 8;

        //                  $tiiFile->readInt();  // IndexInterval
        $pos += 4;

        // $skipInterval = $tiiFile->readInt();
        $skipInterval = ord($data[$pos]) << 24 | ord($data[$pos+1]) << 16 | ord($data[$pos+2]) << 8 | ord($data[$pos+3]);
        $pos += 4;
        if ($indexTermCount < 1) {
            throw new Zend_Search_Lucene_Exception('Wrong number of terms in a term dictionary index');
        }

        if ($tiVersion == (int)0xFFFFFFFD /* 2.1+ format */) {
            /* Skip MaxSkipLevels value */
            $pos += 4;
        }

        // Terms are prefix-compressed against the previous term and the
        // pointers are stored as deltas, so these accumulate across entries.
        $prevTerm     = '';
        $freqPointer  = 0;
        $proxPointer  = 0;
        $indexPointer = 0;
        for ($count = 0; $count < $indexTermCount; $count++) {
            //$termPrefixLength = $tiiFile->readVInt();
            $nbyte = ord($data[$pos++]);
            $termPrefixLength = $nbyte & 0x7F;
            for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
                $nbyte = ord($data[$pos++]);
                $termPrefixLength |= ($nbyte & 0x7F) << $shift;
            }

            // $termSuffix = $tiiFile->readString();
            $nbyte = ord($data[$pos++]);
            $len = $nbyte & 0x7F;
            for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
                $nbyte = ord($data[$pos++]);
                $len |= ($nbyte & 0x7F) << $shift;
            }
            if ($len == 0) {
                $termSuffix = '';
            } else {
                // $len counts characters; start with one byte per character and
                // pull in the continuation bytes of each multi-byte sequence.
                $termSuffix = substr($data, $pos, $len);
                $pos += $len;
                for ($count1 = 0; $count1 < $len; $count1++ ) {
                    if (( ord($termSuffix[$count1]) & 0xC0 ) == 0xC0) {
                        $addBytes = 1;
                        if (ord($termSuffix[$count1]) & 0x20 ) {
                            $addBytes++;

                            // Never used for Java Lucene created index.
                            // Java2 doesn't encode strings in four bytes
                            if (ord($termSuffix[$count1]) & 0x10 ) {
                                $addBytes++;
                            }
                        }
                        $termSuffix .= substr($data, $pos, $addBytes);
                        $pos += $addBytes;
                        $len += $addBytes;

                        // Check for null character. Java2 encodes null character
                        // in two bytes.
                        // NOTE(review): assigning integer 0 to a string offset stores
                        // the character '0', not a NUL byte. Left unchanged here;
                        // confirm against the other index readers before altering.
                        if (ord($termSuffix[$count1]) == 0xC0 &&
                            ord($termSuffix[$count1+1]) == 0x80 ) {
                            $termSuffix[$count1] = 0;
                            $termSuffix = substr($termSuffix,0,$count1+1)
                                        . substr($termSuffix,$count1+2);
                        }
                        $count1 += $addBytes;
                    }
                }
            }

            // $termValue = Zend_Search_Lucene_Index_Term::getPrefix($prevTerm, $termPrefixLength) . $termSuffix;
            // $pb counts prefix bytes, $pc counts prefix characters.
            $pb = 0; $pc = 0;
            $prevTermLength = strlen($prevTerm);
            while ($pb < $prevTermLength && $pc < $termPrefixLength) {
                $charBytes = 1;
                if ((ord($prevTerm[$pb]) & 0xC0) == 0xC0) {
                    $charBytes++;
                    if (ord($prevTerm[$pb]) & 0x20 ) {
                        $charBytes++;
                        if (ord($prevTerm[$pb]) & 0x10 ) {
                            $charBytes++;
                        }
                    }
                }

                // The truncation check must be made against the string being
                // scanned ($prevTerm), not against the whole index data.
                if ($pb + $charBytes > $prevTermLength) {
                    // wrong character
                    break;
                }

                $pc++;
                $pb += $charBytes;
            }
            $termValue = substr($prevTerm, 0, $pb) . $termSuffix;

            // $termFieldNum = $tiiFile->readVInt();
            $nbyte = ord($data[$pos++]);
            $termFieldNum = $nbyte & 0x7F;
            for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
                $nbyte = ord($data[$pos++]);
                $termFieldNum |= ($nbyte & 0x7F) << $shift;
            }

            // $docFreq = $tiiFile->readVInt();
            $nbyte = ord($data[$pos++]);
            $docFreq = $nbyte & 0x7F;
            for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
                $nbyte = ord($data[$pos++]);
                $docFreq |= ($nbyte & 0x7F) << $shift;
            }

            // $freqPointer += $tiiFile->readVInt();
            $nbyte = ord($data[$pos++]);
            $vint = $nbyte & 0x7F;
            for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
                $nbyte = ord($data[$pos++]);
                $vint |= ($nbyte & 0x7F) << $shift;
            }
            $freqPointer += $vint;

            // $proxPointer += $tiiFile->readVInt();
            $nbyte = ord($data[$pos++]);
            $vint = $nbyte & 0x7F;
            for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
                $nbyte = ord($data[$pos++]);
                $vint |= ($nbyte & 0x7F) << $shift;
            }
            $proxPointer += $vint;

            // The skip delta is only stored when docFreq reaches the skip interval.
            if( $docFreq >= $skipInterval ) {
                // $skipDelta = $tiiFile->readVInt();
                $nbyte = ord($data[$pos++]);
                $vint = $nbyte & 0x7F;
                for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
                    $nbyte = ord($data[$pos++]);
                    $vint |= ($nbyte & 0x7F) << $shift;
                }
                $skipDelta = $vint;
            } else {
                $skipDelta = 0;
            }

            // $indexPointer += $tiiFile->readVInt();
            $nbyte = ord($data[$pos++]);
            $vint = $nbyte & 0x7F;
            for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
                $nbyte = ord($data[$pos++]);
                $vint |= ($nbyte & 0x7F) << $shift;
            }
            $indexPointer += $vint;

            // $this->_termDictionary[] = new Zend_Search_Lucene_Index_Term($termValue, $termFieldNum);
            $termDictionary[] = array($termFieldNum, $termValue);

            $termInfos[] =
                 // new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqPointer, $proxPointer, $skipDelta, $indexPointer);
                 array($docFreq, $freqPointer, $proxPointer, $skipDelta, $indexPointer);

            $prevTerm = $termValue;
        }

        // Check special index entry mark
        if ($termDictionary[0][0] != (int)0xFFFFFFFF) {
            throw new Zend_Search_Lucene_Exception('Wrong TermInfoIndexFile file format');
        } else if (PHP_INT_SIZE > 4){
            // Treat 64-bit 0xFFFFFFFF as -1
            $termDictionary[0][0] = -1;
        }

        return array(&$termDictionary, &$termInfos);
    }
}