~chroot64bit/zivios/gentoo-experimental

« back to all changes in this revision

Viewing changes to application/library/Zend/Search/Lucene/Search/QueryLexer.php

  • Committer: Mustafa A. Hashmi
  • Date: 2008-12-04 13:32:21 UTC
  • Revision ID: mhashmi@zivios.org-20081204133221-0nd1trunwevijj38
Inclusion of new installation framework with ties to zend layout and dojo layout

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
<?php
 
2
/**
 
3
 * Zend Framework
 
4
 *
 
5
 * LICENSE
 
6
 *
 
7
 * This source file is subject to the new BSD license that is bundled
 
8
 * with this package in the file LICENSE.txt.
 
9
 * It is also available through the world-wide-web at this URL:
 
10
 * http://framework.zend.com/license/new-bsd
 
11
 * If you did not receive a copy of the license and are unable to
 
12
 * obtain it through the world-wide-web, please send an email
 
13
 * to license@zend.com so we can send you a copy immediately.
 
14
 *
 
15
 * @category   Zend
 
16
 * @package    Zend_Search_Lucene
 
17
 * @subpackage Search
 
18
 * @copyright  Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
 
19
 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 
20
 */
 
21
 
 
22
 
 
23
/** Zend_Search_Lucene_FSM */
 
24
require_once 'Zend/Search/Lucene/FSM.php';
 
25
 
 
26
/** Zend_Search_Lucene_Search_QueryToken */
 
27
require_once 'Zend/Search/Lucene/Search/QueryToken.php';
 
28
 
 
29
/** Zend_Search_Lucene_Exception */
 
30
require_once 'Zend/Search/Lucene/Exception.php';
 
31
 
 
32
/** Zend_Search_Lucene_Search_QueryParserException */
 
33
require_once 'Zend/Search/Lucene/Search/QueryParserException.php';
 
34
 
 
35
 
 
36
/**
 
37
 * @category   Zend
 
38
 * @package    Zend_Search_Lucene
 
39
 * @subpackage Search
 
40
 * @copyright  Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
 
41
 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 
42
 */
 
43
class Zend_Search_Lucene_Search_QueryLexer extends Zend_Search_Lucene_FSM
 
44
{
 
45
    /** State Machine states */
 
46
    const ST_WHITE_SPACE     = 0;
 
47
    const ST_SYNT_LEXEME     = 1;
 
48
    const ST_LEXEME          = 2;
 
49
    const ST_QUOTED_LEXEME   = 3;
 
50
    const ST_ESCAPED_CHAR    = 4;
 
51
    const ST_ESCAPED_QCHAR   = 5;
 
52
    const ST_LEXEME_MODIFIER = 6;
 
53
    const ST_NUMBER          = 7;
 
54
    const ST_MANTISSA        = 8;
 
55
    const ST_ERROR           = 9;
 
56
 
 
57
    /** Input symbols */
 
58
    const IN_WHITE_SPACE     = 0;
 
59
    const IN_SYNT_CHAR       = 1;
 
60
    const IN_LEXEME_MODIFIER = 2;
 
61
    const IN_ESCAPE_CHAR     = 3;
 
62
    const IN_QUOTE           = 4;
 
63
    const IN_DECIMAL_POINT   = 5;
 
64
    const IN_ASCII_DIGIT     = 6;
 
65
    const IN_CHAR            = 7;
 
66
    const IN_MUTABLE_CHAR    = 8;
 
67
 
 
68
    const QUERY_WHITE_SPACE_CHARS      = " \n\r\t";
 
69
    const QUERY_SYNT_CHARS             = ':()[]{}!|&';
 
70
    const QUERY_MUTABLE_CHARS          = '+-';
 
71
    const QUERY_DOUBLECHARLEXEME_CHARS = '|&';
 
72
    const QUERY_LEXEMEMODIFIER_CHARS   = '~^';
 
73
    const QUERY_ASCIIDIGITS_CHARS      = '0123456789';
 
74
 
 
75
    /**
 
76
     * List of recognized lexemes
 
77
     *
 
78
     * @var array
 
79
     */
 
80
    private $_lexemes;
 
81
 
 
82
    /**
 
83
     * Query string (array of single- or non single-byte characters)
 
84
     *
 
85
     * @var array
 
86
     */
 
87
    private $_queryString;
 
88
 
 
89
    /**
 
90
     * Current position within a query string
 
91
     * Used to create appropriate error messages
 
92
     *
 
93
     * @var integer
 
94
     */
 
95
    private $_queryStringPosition;
 
96
 
 
97
    /**
 
98
     * Recognized part of current lexeme
 
99
     *
 
100
     * @var string
 
101
     */
 
102
    private $_currentLexeme;
 
103
 
 
104
    public function __construct()
 
105
    {
 
106
        parent::__construct( array(self::ST_WHITE_SPACE,
 
107
                                   self::ST_SYNT_LEXEME,
 
108
                                   self::ST_LEXEME,
 
109
                                   self::ST_QUOTED_LEXEME,
 
110
                                   self::ST_ESCAPED_CHAR,
 
111
                                   self::ST_ESCAPED_QCHAR,
 
112
                                   self::ST_LEXEME_MODIFIER,
 
113
                                   self::ST_NUMBER,
 
114
                                   self::ST_MANTISSA,
 
115
                                   self::ST_ERROR),
 
116
                             array(self::IN_WHITE_SPACE,
 
117
                                   self::IN_SYNT_CHAR,
 
118
                                   self::IN_MUTABLE_CHAR,
 
119
                                   self::IN_LEXEME_MODIFIER,
 
120
                                   self::IN_ESCAPE_CHAR,
 
121
                                   self::IN_QUOTE,
 
122
                                   self::IN_DECIMAL_POINT,
 
123
                                   self::IN_ASCII_DIGIT,
 
124
                                   self::IN_CHAR));
 
125
 
 
126
 
 
127
        $lexemeModifierErrorAction    = new Zend_Search_Lucene_FSMAction($this, 'lexModifierErrException');
 
128
        $quoteWithinLexemeErrorAction = new Zend_Search_Lucene_FSMAction($this, 'quoteWithinLexemeErrException');
 
129
        $wrongNumberErrorAction       = new Zend_Search_Lucene_FSMAction($this, 'wrongNumberErrException');
 
130
 
 
131
 
 
132
 
 
133
        $this->addRules(array( array(self::ST_WHITE_SPACE,   self::IN_WHITE_SPACE,     self::ST_WHITE_SPACE),
 
134
                               array(self::ST_WHITE_SPACE,   self::IN_SYNT_CHAR,       self::ST_SYNT_LEXEME),
 
135
                               array(self::ST_WHITE_SPACE,   self::IN_MUTABLE_CHAR,    self::ST_SYNT_LEXEME),
 
136
                               array(self::ST_WHITE_SPACE,   self::IN_LEXEME_MODIFIER, self::ST_LEXEME_MODIFIER),
 
137
                               array(self::ST_WHITE_SPACE,   self::IN_ESCAPE_CHAR,     self::ST_ESCAPED_CHAR),
 
138
                               array(self::ST_WHITE_SPACE,   self::IN_QUOTE,           self::ST_QUOTED_LEXEME),
 
139
                               array(self::ST_WHITE_SPACE,   self::IN_DECIMAL_POINT,   self::ST_LEXEME),
 
140
                               array(self::ST_WHITE_SPACE,   self::IN_ASCII_DIGIT,     self::ST_LEXEME),
 
141
                               array(self::ST_WHITE_SPACE,   self::IN_CHAR,            self::ST_LEXEME)
 
142
                             ));
 
143
        $this->addRules(array( array(self::ST_SYNT_LEXEME,   self::IN_WHITE_SPACE,     self::ST_WHITE_SPACE),
 
144
                               array(self::ST_SYNT_LEXEME,   self::IN_SYNT_CHAR,       self::ST_SYNT_LEXEME),
 
145
                               array(self::ST_SYNT_LEXEME,   self::IN_MUTABLE_CHAR,    self::ST_SYNT_LEXEME),
 
146
                               array(self::ST_SYNT_LEXEME,   self::IN_LEXEME_MODIFIER, self::ST_LEXEME_MODIFIER),
 
147
                               array(self::ST_SYNT_LEXEME,   self::IN_ESCAPE_CHAR,     self::ST_ESCAPED_CHAR),
 
148
                               array(self::ST_SYNT_LEXEME,   self::IN_QUOTE,           self::ST_QUOTED_LEXEME),
 
149
                               array(self::ST_SYNT_LEXEME,   self::IN_DECIMAL_POINT,   self::ST_LEXEME),
 
150
                               array(self::ST_SYNT_LEXEME,   self::IN_ASCII_DIGIT,     self::ST_LEXEME),
 
151
                               array(self::ST_SYNT_LEXEME,   self::IN_CHAR,            self::ST_LEXEME)
 
152
                             ));
 
153
        $this->addRules(array( array(self::ST_LEXEME,        self::IN_WHITE_SPACE,     self::ST_WHITE_SPACE),
 
154
                               array(self::ST_LEXEME,        self::IN_SYNT_CHAR,       self::ST_SYNT_LEXEME),
 
155
                               array(self::ST_LEXEME,        self::IN_MUTABLE_CHAR,    self::ST_LEXEME),
 
156
                               array(self::ST_LEXEME,        self::IN_LEXEME_MODIFIER, self::ST_LEXEME_MODIFIER),
 
157
                               array(self::ST_LEXEME,        self::IN_ESCAPE_CHAR,     self::ST_ESCAPED_CHAR),
 
158
 
 
159
                               // IN_QUOTE     not allowed
 
160
                               array(self::ST_LEXEME,        self::IN_QUOTE,           self::ST_ERROR, $quoteWithinLexemeErrorAction),
 
161
 
 
162
                               array(self::ST_LEXEME,        self::IN_DECIMAL_POINT,   self::ST_LEXEME),
 
163
                               array(self::ST_LEXEME,        self::IN_ASCII_DIGIT,     self::ST_LEXEME),
 
164
                               array(self::ST_LEXEME,        self::IN_CHAR,            self::ST_LEXEME)
 
165
                             ));
 
166
        $this->addRules(array( array(self::ST_QUOTED_LEXEME, self::IN_WHITE_SPACE,     self::ST_QUOTED_LEXEME),
 
167
                               array(self::ST_QUOTED_LEXEME, self::IN_SYNT_CHAR,       self::ST_QUOTED_LEXEME),
 
168
                               array(self::ST_QUOTED_LEXEME, self::IN_MUTABLE_CHAR,    self::ST_QUOTED_LEXEME),
 
169
                               array(self::ST_QUOTED_LEXEME, self::IN_LEXEME_MODIFIER, self::ST_QUOTED_LEXEME),
 
170
                               array(self::ST_QUOTED_LEXEME, self::IN_ESCAPE_CHAR,     self::ST_ESCAPED_QCHAR),
 
171
                               array(self::ST_QUOTED_LEXEME, self::IN_QUOTE,           self::ST_WHITE_SPACE),
 
172
                               array(self::ST_QUOTED_LEXEME, self::IN_DECIMAL_POINT,   self::ST_QUOTED_LEXEME),
 
173
                               array(self::ST_QUOTED_LEXEME, self::IN_ASCII_DIGIT,     self::ST_QUOTED_LEXEME),
 
174
                               array(self::ST_QUOTED_LEXEME, self::IN_CHAR,            self::ST_QUOTED_LEXEME)
 
175
                             ));
 
176
        $this->addRules(array( array(self::ST_ESCAPED_CHAR,  self::IN_WHITE_SPACE,     self::ST_LEXEME),
 
177
                               array(self::ST_ESCAPED_CHAR,  self::IN_SYNT_CHAR,       self::ST_LEXEME),
 
178
                               array(self::ST_ESCAPED_CHAR,  self::IN_MUTABLE_CHAR,    self::ST_LEXEME),
 
179
                               array(self::ST_ESCAPED_CHAR,  self::IN_LEXEME_MODIFIER, self::ST_LEXEME),
 
180
                               array(self::ST_ESCAPED_CHAR,  self::IN_ESCAPE_CHAR,     self::ST_LEXEME),
 
181
                               array(self::ST_ESCAPED_CHAR,  self::IN_QUOTE,           self::ST_LEXEME),
 
182
                               array(self::ST_ESCAPED_CHAR,  self::IN_DECIMAL_POINT,   self::ST_LEXEME),
 
183
                               array(self::ST_ESCAPED_CHAR,  self::IN_ASCII_DIGIT,     self::ST_LEXEME),
 
184
                               array(self::ST_ESCAPED_CHAR,  self::IN_CHAR,            self::ST_LEXEME)
 
185
                             ));
 
186
        $this->addRules(array( array(self::ST_ESCAPED_QCHAR, self::IN_WHITE_SPACE,     self::ST_QUOTED_LEXEME),
 
187
                               array(self::ST_ESCAPED_QCHAR, self::IN_SYNT_CHAR,       self::ST_QUOTED_LEXEME),
 
188
                               array(self::ST_ESCAPED_QCHAR, self::IN_MUTABLE_CHAR,    self::ST_QUOTED_LEXEME),
 
189
                               array(self::ST_ESCAPED_QCHAR, self::IN_LEXEME_MODIFIER, self::ST_QUOTED_LEXEME),
 
190
                               array(self::ST_ESCAPED_QCHAR, self::IN_ESCAPE_CHAR,     self::ST_QUOTED_LEXEME),
 
191
                               array(self::ST_ESCAPED_QCHAR, self::IN_QUOTE,           self::ST_QUOTED_LEXEME),
 
192
                               array(self::ST_ESCAPED_QCHAR, self::IN_DECIMAL_POINT,   self::ST_QUOTED_LEXEME),
 
193
                               array(self::ST_ESCAPED_QCHAR, self::IN_ASCII_DIGIT,     self::ST_QUOTED_LEXEME),
 
194
                               array(self::ST_ESCAPED_QCHAR, self::IN_CHAR,            self::ST_QUOTED_LEXEME)
 
195
                             ));
 
196
        $this->addRules(array( array(self::ST_LEXEME_MODIFIER, self::IN_WHITE_SPACE,     self::ST_WHITE_SPACE),
 
197
                               array(self::ST_LEXEME_MODIFIER, self::IN_SYNT_CHAR,       self::ST_SYNT_LEXEME),
 
198
                               array(self::ST_LEXEME_MODIFIER, self::IN_MUTABLE_CHAR,    self::ST_SYNT_LEXEME),
 
199
                               array(self::ST_LEXEME_MODIFIER, self::IN_LEXEME_MODIFIER, self::ST_LEXEME_MODIFIER),
 
200
 
 
201
                               // IN_ESCAPE_CHAR       not allowed
 
202
                               array(self::ST_LEXEME_MODIFIER, self::IN_ESCAPE_CHAR,     self::ST_ERROR, $lexemeModifierErrorAction),
 
203
 
 
204
                               // IN_QUOTE             not allowed
 
205
                               array(self::ST_LEXEME_MODIFIER, self::IN_QUOTE,           self::ST_ERROR, $lexemeModifierErrorAction),
 
206
 
 
207
 
 
208
                               array(self::ST_LEXEME_MODIFIER, self::IN_DECIMAL_POINT,   self::ST_MANTISSA),
 
209
                               array(self::ST_LEXEME_MODIFIER, self::IN_ASCII_DIGIT,     self::ST_NUMBER),
 
210
 
 
211
                               // IN_CHAR              not allowed
 
212
                               array(self::ST_LEXEME_MODIFIER, self::IN_CHAR,            self::ST_ERROR, $lexemeModifierErrorAction),
 
213
                             ));
 
214
        $this->addRules(array( array(self::ST_NUMBER, self::IN_WHITE_SPACE,     self::ST_WHITE_SPACE),
 
215
                               array(self::ST_NUMBER, self::IN_SYNT_CHAR,       self::ST_SYNT_LEXEME),
 
216
                               array(self::ST_NUMBER, self::IN_MUTABLE_CHAR,    self::ST_SYNT_LEXEME),
 
217
                               array(self::ST_NUMBER, self::IN_LEXEME_MODIFIER, self::ST_LEXEME_MODIFIER),
 
218
 
 
219
                               // IN_ESCAPE_CHAR       not allowed
 
220
                               array(self::ST_NUMBER, self::IN_ESCAPE_CHAR,     self::ST_ERROR, $wrongNumberErrorAction),
 
221
 
 
222
                               // IN_QUOTE             not allowed
 
223
                               array(self::ST_NUMBER, self::IN_QUOTE,           self::ST_ERROR, $wrongNumberErrorAction),
 
224
 
 
225
                               array(self::ST_NUMBER, self::IN_DECIMAL_POINT,   self::ST_MANTISSA),
 
226
                               array(self::ST_NUMBER, self::IN_ASCII_DIGIT,     self::ST_NUMBER),
 
227
 
 
228
                               // IN_CHAR              not allowed
 
229
                               array(self::ST_NUMBER, self::IN_CHAR,            self::ST_ERROR, $wrongNumberErrorAction),
 
230
                             ));
 
231
        $this->addRules(array( array(self::ST_MANTISSA, self::IN_WHITE_SPACE,     self::ST_WHITE_SPACE),
 
232
                               array(self::ST_MANTISSA, self::IN_SYNT_CHAR,       self::ST_SYNT_LEXEME),
 
233
                               array(self::ST_MANTISSA, self::IN_MUTABLE_CHAR,    self::ST_SYNT_LEXEME),
 
234
                               array(self::ST_MANTISSA, self::IN_LEXEME_MODIFIER, self::ST_LEXEME_MODIFIER),
 
235
 
 
236
                               // IN_ESCAPE_CHAR       not allowed
 
237
                               array(self::ST_MANTISSA, self::IN_ESCAPE_CHAR,     self::ST_ERROR, $wrongNumberErrorAction),
 
238
 
 
239
                               // IN_QUOTE             not allowed
 
240
                               array(self::ST_MANTISSA, self::IN_QUOTE,           self::ST_ERROR, $wrongNumberErrorAction),
 
241
 
 
242
                               // IN_DECIMAL_POINT     not allowed
 
243
                               array(self::ST_MANTISSA, self::IN_DECIMAL_POINT,   self::ST_ERROR, $wrongNumberErrorAction),
 
244
 
 
245
                               array(self::ST_MANTISSA, self::IN_ASCII_DIGIT,     self::ST_MANTISSA),
 
246
 
 
247
                               // IN_CHAR              not allowed
 
248
                               array(self::ST_MANTISSA, self::IN_CHAR,            self::ST_ERROR, $wrongNumberErrorAction),
 
249
                             ));
 
250
 
 
251
 
 
252
        /** Actions */
 
253
        $syntaxLexemeAction    = new Zend_Search_Lucene_FSMAction($this, 'addQuerySyntaxLexeme');
 
254
        $lexemeModifierAction  = new Zend_Search_Lucene_FSMAction($this, 'addLexemeModifier');
 
255
        $addLexemeAction       = new Zend_Search_Lucene_FSMAction($this, 'addLexeme');
 
256
        $addQuotedLexemeAction = new Zend_Search_Lucene_FSMAction($this, 'addQuotedLexeme');
 
257
        $addNumberLexemeAction = new Zend_Search_Lucene_FSMAction($this, 'addNumberLexeme');
 
258
        $addLexemeCharAction   = new Zend_Search_Lucene_FSMAction($this, 'addLexemeChar');
 
259
 
 
260
 
 
261
        /** Syntax lexeme */
 
262
        $this->addEntryAction(self::ST_SYNT_LEXEME,  $syntaxLexemeAction);
 
263
        // Two lexemes in succession
 
264
        $this->addTransitionAction(self::ST_SYNT_LEXEME, self::ST_SYNT_LEXEME, $syntaxLexemeAction);
 
265
 
 
266
 
 
267
        /** Lexeme */
 
268
        $this->addEntryAction(self::ST_LEXEME,                       $addLexemeCharAction);
 
269
        $this->addTransitionAction(self::ST_LEXEME, self::ST_LEXEME, $addLexemeCharAction);
 
270
        // ST_ESCAPED_CHAR => ST_LEXEME transition is covered by ST_LEXEME entry action
 
271
 
 
272
        $this->addTransitionAction(self::ST_LEXEME, self::ST_WHITE_SPACE,     $addLexemeAction);
 
273
        $this->addTransitionAction(self::ST_LEXEME, self::ST_SYNT_LEXEME,     $addLexemeAction);
 
274
        $this->addTransitionAction(self::ST_LEXEME, self::ST_QUOTED_LEXEME,   $addLexemeAction);
 
275
        $this->addTransitionAction(self::ST_LEXEME, self::ST_LEXEME_MODIFIER, $addLexemeAction);
 
276
        $this->addTransitionAction(self::ST_LEXEME, self::ST_NUMBER,          $addLexemeAction);
 
277
        $this->addTransitionAction(self::ST_LEXEME, self::ST_MANTISSA,        $addLexemeAction);
 
278
 
 
279
 
 
280
        /** Quoted lexeme */
 
281
        // We don't need entry action (skeep quote)
 
282
        $this->addTransitionAction(self::ST_QUOTED_LEXEME, self::ST_QUOTED_LEXEME, $addLexemeCharAction);
 
283
        $this->addTransitionAction(self::ST_ESCAPED_QCHAR, self::ST_QUOTED_LEXEME, $addLexemeCharAction);
 
284
        // Closing quote changes state to the ST_WHITE_SPACE   other states are not used
 
285
        $this->addTransitionAction(self::ST_QUOTED_LEXEME, self::ST_WHITE_SPACE,   $addQuotedLexemeAction);
 
286
 
 
287
 
 
288
        /** Lexeme modifier */
 
289
        $this->addEntryAction(self::ST_LEXEME_MODIFIER, $lexemeModifierAction);
 
290
 
 
291
 
 
292
        /** Number */
 
293
        $this->addEntryAction(self::ST_NUMBER,                           $addLexemeCharAction);
 
294
        $this->addEntryAction(self::ST_MANTISSA,                         $addLexemeCharAction);
 
295
        $this->addTransitionAction(self::ST_NUMBER,   self::ST_NUMBER,   $addLexemeCharAction);
 
296
        // ST_NUMBER => ST_MANTISSA transition is covered by ST_MANTISSA entry action
 
297
        $this->addTransitionAction(self::ST_MANTISSA, self::ST_MANTISSA, $addLexemeCharAction);
 
298
 
 
299
        $this->addTransitionAction(self::ST_NUMBER,   self::ST_WHITE_SPACE,     $addNumberLexemeAction);
 
300
        $this->addTransitionAction(self::ST_NUMBER,   self::ST_SYNT_LEXEME,     $addNumberLexemeAction);
 
301
        $this->addTransitionAction(self::ST_NUMBER,   self::ST_LEXEME_MODIFIER, $addNumberLexemeAction);
 
302
        $this->addTransitionAction(self::ST_MANTISSA, self::ST_WHITE_SPACE,     $addNumberLexemeAction);
 
303
        $this->addTransitionAction(self::ST_MANTISSA, self::ST_SYNT_LEXEME,     $addNumberLexemeAction);
 
304
        $this->addTransitionAction(self::ST_MANTISSA, self::ST_LEXEME_MODIFIER, $addNumberLexemeAction);
 
305
    }
 
306
 
 
307
 
 
308
 
 
309
 
 
310
    /**
 
311
     * Translate input char to an input symbol of state machine
 
312
     *
 
313
     * @param string $char
 
314
     * @return integer
 
315
     */
 
316
    private function _translateInput($char)
 
317
    {
 
318
        if        (strpos(self::QUERY_WHITE_SPACE_CHARS,    $char) !== false) { return self::IN_WHITE_SPACE;
 
319
        } else if (strpos(self::QUERY_SYNT_CHARS,           $char) !== false) { return self::IN_SYNT_CHAR;
 
320
        } else if (strpos(self::QUERY_MUTABLE_CHARS,        $char) !== false) { return self::IN_MUTABLE_CHAR;
 
321
        } else if (strpos(self::QUERY_LEXEMEMODIFIER_CHARS, $char) !== false) { return self::IN_LEXEME_MODIFIER;
 
322
        } else if (strpos(self::QUERY_ASCIIDIGITS_CHARS,    $char) !== false) { return self::IN_ASCII_DIGIT;
 
323
        } else if ($char === '"' )                                            { return self::IN_QUOTE;
 
324
        } else if ($char === '.' )                                            { return self::IN_DECIMAL_POINT;
 
325
        } else if ($char === '\\')                                            { return self::IN_ESCAPE_CHAR;
 
326
        } else                                                                { return self::IN_CHAR;
 
327
        }
 
328
    }
 
329
 
 
330
 
 
331
    /**
 
332
     * This method is used to tokenize query string into lexemes
 
333
     *
 
334
     * @param string $inputString
 
335
     * @param string $encoding
 
336
     * @return array
 
337
     * @throws Zend_Search_Lucene_Search_QueryParserException
 
338
     */
 
339
    public function tokenize($inputString, $encoding)
 
340
    {
 
341
        $this->reset();
 
342
 
 
343
        $this->_lexemes     = array();
 
344
        $this->_queryString = array();
 
345
 
 
346
        if (PHP_OS == 'AIX' && $encoding == '') {
 
347
            $encoding = 'ISO8859-1';
 
348
        }
 
349
        $strLength = iconv_strlen($inputString, $encoding);
 
350
 
 
351
        // Workaround for iconv_substr bug
 
352
        $inputString .= ' ';
 
353
 
 
354
        for ($count = 0; $count < $strLength; $count++) {
 
355
            $this->_queryString[$count] = iconv_substr($inputString, $count, 1, $encoding);
 
356
        }
 
357
 
 
358
        for ($this->_queryStringPosition = 0;
 
359
             $this->_queryStringPosition < count($this->_queryString);
 
360
             $this->_queryStringPosition++) {
 
361
            $this->process($this->_translateInput($this->_queryString[$this->_queryStringPosition]));
 
362
        }
 
363
 
 
364
        $this->process(self::IN_WHITE_SPACE);
 
365
 
 
366
        if ($this->getState() != self::ST_WHITE_SPACE) {
 
367
            throw new Zend_Search_Lucene_Search_QueryParserException('Unexpected end of query');
 
368
        }
 
369
 
 
370
        $this->_queryString = null;
 
371
 
 
372
        return $this->_lexemes;
 
373
    }
 
374
 
 
375
 
 
376
 
 
377
    /*********************************************************************
 
378
     * Actions implementation
 
379
     *
 
380
     * Actions modify the list of recognized lexemes
 
381
     *********************************************************************/
 
382
 
 
383
    /**
 
384
     * Add query syntax lexeme
 
385
     *
 
386
     * @throws Zend_Search_Lucene_Search_QueryParserException
 
387
     */
 
388
    public function addQuerySyntaxLexeme()
 
389
    {
 
390
        $lexeme = $this->_queryString[$this->_queryStringPosition];
 
391
 
 
392
        // Process two char lexemes
 
393
        if (strpos(self::QUERY_DOUBLECHARLEXEME_CHARS, $lexeme) !== false) {
 
394
            // increase current position in a query string
 
395
            $this->_queryStringPosition++;
 
396
 
 
397
            // check,
 
398
            if ($this->_queryStringPosition == count($this->_queryString)  ||
 
399
                $this->_queryString[$this->_queryStringPosition] != $lexeme) {
 
400
                    throw new Zend_Search_Lucene_Search_QueryParserException('Two chars lexeme expected. ' . $this->_positionMsg());
 
401
                }
 
402
 
 
403
            // duplicate character
 
404
            $lexeme .= $lexeme;
 
405
        }
 
406
 
 
407
        $token = new Zend_Search_Lucene_Search_QueryToken(
 
408
                                Zend_Search_Lucene_Search_QueryToken::TC_SYNTAX_ELEMENT,
 
409
                                $lexeme,
 
410
                                $this->_queryStringPosition);
 
411
 
 
412
        // Skip this lexeme if it's a field indicator ':' and treat previous as 'field' instead of 'word'
 
413
        if ($token->type == Zend_Search_Lucene_Search_QueryToken::TT_FIELD_INDICATOR) {
 
414
            $token = array_pop($this->_lexemes);
 
415
            if ($token === null  ||  $token->type != Zend_Search_Lucene_Search_QueryToken::TT_WORD) {
 
416
                throw new Zend_Search_Lucene_Search_QueryParserException('Field mark \':\' must follow field name. ' . $this->_positionMsg());
 
417
            }
 
418
 
 
419
            $token->type = Zend_Search_Lucene_Search_QueryToken::TT_FIELD;
 
420
        }
 
421
 
 
422
        $this->_lexemes[] = $token;
 
423
    }
 
424
 
 
425
    /**
 
426
     * Add lexeme modifier
 
427
     */
 
428
    public function addLexemeModifier()
 
429
    {
 
430
        $this->_lexemes[] = new Zend_Search_Lucene_Search_QueryToken(
 
431
                                    Zend_Search_Lucene_Search_QueryToken::TC_SYNTAX_ELEMENT,
 
432
                                    $this->_queryString[$this->_queryStringPosition],
 
433
                                    $this->_queryStringPosition);
 
434
    }
 
435
 
 
436
 
 
437
    /**
 
438
     * Add lexeme
 
439
     */
 
440
    public function addLexeme()
 
441
    {
 
442
        $this->_lexemes[] = new Zend_Search_Lucene_Search_QueryToken(
 
443
                                    Zend_Search_Lucene_Search_QueryToken::TC_WORD,
 
444
                                    $this->_currentLexeme,
 
445
                                    $this->_queryStringPosition - 1);
 
446
 
 
447
        $this->_currentLexeme = '';
 
448
    }
 
449
 
 
450
    /**
 
451
     * Add quoted lexeme
 
452
     */
 
453
    public function addQuotedLexeme()
 
454
    {
 
455
        $this->_lexemes[] = new Zend_Search_Lucene_Search_QueryToken(
 
456
                                    Zend_Search_Lucene_Search_QueryToken::TC_PHRASE,
 
457
                                    $this->_currentLexeme,
 
458
                                    $this->_queryStringPosition);
 
459
 
 
460
        $this->_currentLexeme = '';
 
461
    }
 
462
 
 
463
    /**
 
464
     * Add number lexeme
 
465
     */
 
466
    public function addNumberLexeme()
 
467
    {
 
468
        $this->_lexemes[] = new Zend_Search_Lucene_Search_QueryToken(
 
469
                                    Zend_Search_Lucene_Search_QueryToken::TC_NUMBER,
 
470
                                    $this->_currentLexeme,
 
471
                                    $this->_queryStringPosition - 1);
 
472
        $this->_currentLexeme = '';
 
473
    }
 
474
 
 
475
    /**
 
476
     * Extend lexeme by one char
 
477
     */
 
478
    public function addLexemeChar()
 
479
    {
 
480
        $this->_currentLexeme .= $this->_queryString[$this->_queryStringPosition];
 
481
    }
 
482
 
 
483
 
 
484
    /**
 
485
     * Position message
 
486
     *
 
487
     * @return string
 
488
     */
 
489
    private function _positionMsg()
 
490
    {
 
491
        return 'Position is ' . $this->_queryStringPosition . '.';
 
492
    }
 
493
 
 
494
 
 
495
    /*********************************************************************
 
496
     * Syntax errors actions
 
497
     *********************************************************************/
 
498
    public function lexModifierErrException()
 
499
    {
 
500
        throw new Zend_Search_Lucene_Search_QueryParserException('Lexeme modifier character can be followed only by number, white space or query syntax element. ' . $this->_positionMsg());
 
501
    }
 
502
    public function quoteWithinLexemeErrException()
 
503
    {
 
504
        throw new Zend_Search_Lucene_Search_QueryParserException('Quote within lexeme must be escaped by \'\\\' char. ' . $this->_positionMsg());
 
505
    }
 
506
    public function wrongNumberErrException()
 
507
    {
 
508
        throw new Zend_Search_Lucene_Search_QueryParserException('Wrong number syntax.' . $this->_positionMsg());
 
509
    }
 
510
}
 
511