7
* This source file is subject to the new BSD license that is bundled
8
* with this package in the file LICENSE.txt.
9
* It is also available through the world-wide-web at this URL:
10
* http://framework.zend.com/license/new-bsd
11
* If you did not receive a copy of the license and are unable to
12
* obtain it through the world-wide-web, please send an email
13
* to license@zend.com so we can send you a copy immediately.
16
* @package Zend_Search_Lucene
18
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
19
* @license http://framework.zend.com/license/new-bsd New BSD License
23
/** Zend_Search_Lucene_FSM */
24
require_once 'Zend/Search/Lucene/FSM.php';
26
/** Zend_Search_Lucene_Search_QueryToken */
27
require_once 'Zend/Search/Lucene/Search/QueryToken.php';
29
/** Zend_Search_Lucene_Exception */
30
require_once 'Zend/Search/Lucene/Exception.php';
32
/** Zend_Search_Lucene_Search_QueryParserException */
33
require_once 'Zend/Search/Lucene/Search/QueryParserException.php';
38
* @package Zend_Search_Lucene
40
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
41
* @license http://framework.zend.com/license/new-bsd New BSD License
43
class Zend_Search_Lucene_Search_QueryLexer extends Zend_Search_Lucene_FSM
45
/**
 * State Machine states.
 *
 * Every state referenced by the transition rules and actions of this class
 * must be declared here.
 */
const ST_WHITE_SPACE     = 0;
const ST_SYNT_LEXEME     = 1;
const ST_LEXEME          = 2;
const ST_QUOTED_LEXEME   = 3;
const ST_ESCAPED_CHAR    = 4;
const ST_ESCAPED_QCHAR   = 5;
const ST_LEXEME_MODIFIER = 6;
const ST_NUMBER          = 7;
const ST_MANTISSA        = 8;
// NOTE(review): 9 is simply the next free state id - confirm no other code relies on a different value.
const ST_ERROR           = 9;

/**
 * Input symbols.
 *
 * _translateInput() maps every query character to exactly one of these.
 */
const IN_WHITE_SPACE     = 0;
const IN_SYNT_CHAR       = 1;
const IN_LEXEME_MODIFIER = 2;
const IN_ESCAPE_CHAR     = 3;
const IN_QUOTE           = 4;
const IN_DECIMAL_POINT   = 5;
const IN_ASCII_DIGIT     = 6;
const IN_CHAR            = 7;
const IN_MUTABLE_CHAR    = 8;

/**
 * Character classes used to classify query characters.
 *
 * QUERY_DOUBLECHARLEXEME_CHARS lists the syntax characters that are only
 * accepted when doubled (see addQuerySyntaxLexeme()).
 */
const QUERY_WHITE_SPACE_CHARS      = " \n\r\t";
const QUERY_SYNT_CHARS             = ':()[]{}!|&';
const QUERY_MUTABLE_CHARS          = '+-';
const QUERY_DOUBLECHARLEXEME_CHARS = '|&';
const QUERY_LEXEMEMODIFIER_CHARS   = '~^';
const QUERY_ASCIIDIGITS_CHARS      = '0123456789';
76
/**
 * List of recognized lexemes
 *
 * Declared explicitly so that tokenize() and the action methods do not
 * create it as a dynamic property.
 *
 * @var array
 */
private $_lexemes = array();

/**
 * Query string (array of single- or non single-byte characters)
 *
 * @var array
 */
private $_queryString;

/**
 * Current position within a query string
 * Used to create appropriate error messages
 *
 * @var integer
 */
private $_queryStringPosition;

/**
 * Recognized part of current lexeme
 *
 * @var string
 */
private $_currentLexeme = '';
104
/**
 * Build the lexer state machine.
 *
 * Registers the states and input symbols with the parent FSM, then adds one
 * transition rule per (state, input symbol) pair and finally binds the
 * lexeme-building actions to state entries and state transitions.
 *
 * Every state and input symbol used by a rule below is included in the lists
 * handed to the parent constructor.
 * NOTE(review): ST_WHITE_SPACE is listed first on the assumption that the
 * parent FSM starts in the first registered state - confirm against
 * Zend_Search_Lucene_FSM.
 */
public function __construct()
{
    parent::__construct(
        array(
            self::ST_WHITE_SPACE,
            self::ST_SYNT_LEXEME,
            self::ST_LEXEME,
            self::ST_QUOTED_LEXEME,
            self::ST_ESCAPED_CHAR,
            self::ST_ESCAPED_QCHAR,
            self::ST_LEXEME_MODIFIER,
            self::ST_NUMBER,
            self::ST_MANTISSA,
            self::ST_ERROR,
        ),
        array(
            self::IN_WHITE_SPACE,
            self::IN_SYNT_CHAR,
            self::IN_MUTABLE_CHAR,
            self::IN_LEXEME_MODIFIER,
            self::IN_ESCAPE_CHAR,
            self::IN_QUOTE,
            self::IN_DECIMAL_POINT,
            self::IN_ASCII_DIGIT,
            self::IN_CHAR,
        )
    );

    // Actions attached to rules that lead into ST_ERROR; each one throws.
    $lexemeModifierErrorAction    = new Zend_Search_Lucene_FSMAction($this, 'lexModifierErrException');
    $quoteWithinLexemeErrorAction = new Zend_Search_Lucene_FSMAction($this, 'quoteWithinLexemeErrException');
    $wrongNumberErrorAction       = new Zend_Search_Lucene_FSMAction($this, 'wrongNumberErrException');

    /** Transition rules: array(source state, input symbol, target state [, input action]) */

    $this->addRules(array(
        array(self::ST_WHITE_SPACE, self::IN_WHITE_SPACE,     self::ST_WHITE_SPACE),
        array(self::ST_WHITE_SPACE, self::IN_SYNT_CHAR,       self::ST_SYNT_LEXEME),
        array(self::ST_WHITE_SPACE, self::IN_MUTABLE_CHAR,    self::ST_SYNT_LEXEME),
        array(self::ST_WHITE_SPACE, self::IN_LEXEME_MODIFIER, self::ST_LEXEME_MODIFIER),
        array(self::ST_WHITE_SPACE, self::IN_ESCAPE_CHAR,     self::ST_ESCAPED_CHAR),
        array(self::ST_WHITE_SPACE, self::IN_QUOTE,           self::ST_QUOTED_LEXEME),
        array(self::ST_WHITE_SPACE, self::IN_DECIMAL_POINT,   self::ST_LEXEME),
        array(self::ST_WHITE_SPACE, self::IN_ASCII_DIGIT,     self::ST_LEXEME),
        array(self::ST_WHITE_SPACE, self::IN_CHAR,            self::ST_LEXEME),
    ));

    $this->addRules(array(
        array(self::ST_SYNT_LEXEME, self::IN_WHITE_SPACE,     self::ST_WHITE_SPACE),
        array(self::ST_SYNT_LEXEME, self::IN_SYNT_CHAR,       self::ST_SYNT_LEXEME),
        array(self::ST_SYNT_LEXEME, self::IN_MUTABLE_CHAR,    self::ST_SYNT_LEXEME),
        array(self::ST_SYNT_LEXEME, self::IN_LEXEME_MODIFIER, self::ST_LEXEME_MODIFIER),
        array(self::ST_SYNT_LEXEME, self::IN_ESCAPE_CHAR,     self::ST_ESCAPED_CHAR),
        array(self::ST_SYNT_LEXEME, self::IN_QUOTE,           self::ST_QUOTED_LEXEME),
        array(self::ST_SYNT_LEXEME, self::IN_DECIMAL_POINT,   self::ST_LEXEME),
        array(self::ST_SYNT_LEXEME, self::IN_ASCII_DIGIT,     self::ST_LEXEME),
        array(self::ST_SYNT_LEXEME, self::IN_CHAR,            self::ST_LEXEME),
    ));

    $this->addRules(array(
        array(self::ST_LEXEME, self::IN_WHITE_SPACE,     self::ST_WHITE_SPACE),
        array(self::ST_LEXEME, self::IN_SYNT_CHAR,       self::ST_SYNT_LEXEME),
        array(self::ST_LEXEME, self::IN_MUTABLE_CHAR,    self::ST_LEXEME),
        array(self::ST_LEXEME, self::IN_LEXEME_MODIFIER, self::ST_LEXEME_MODIFIER),
        array(self::ST_LEXEME, self::IN_ESCAPE_CHAR,     self::ST_ESCAPED_CHAR),

        // IN_QUOTE not allowed
        array(self::ST_LEXEME, self::IN_QUOTE,           self::ST_ERROR, $quoteWithinLexemeErrorAction),

        array(self::ST_LEXEME, self::IN_DECIMAL_POINT,   self::ST_LEXEME),
        array(self::ST_LEXEME, self::IN_ASCII_DIGIT,     self::ST_LEXEME),
        array(self::ST_LEXEME, self::IN_CHAR,            self::ST_LEXEME),
    ));

    $this->addRules(array(
        array(self::ST_QUOTED_LEXEME, self::IN_WHITE_SPACE,     self::ST_QUOTED_LEXEME),
        array(self::ST_QUOTED_LEXEME, self::IN_SYNT_CHAR,       self::ST_QUOTED_LEXEME),
        array(self::ST_QUOTED_LEXEME, self::IN_MUTABLE_CHAR,    self::ST_QUOTED_LEXEME),
        array(self::ST_QUOTED_LEXEME, self::IN_LEXEME_MODIFIER, self::ST_QUOTED_LEXEME),
        array(self::ST_QUOTED_LEXEME, self::IN_ESCAPE_CHAR,     self::ST_ESCAPED_QCHAR),
        array(self::ST_QUOTED_LEXEME, self::IN_QUOTE,           self::ST_WHITE_SPACE),
        array(self::ST_QUOTED_LEXEME, self::IN_DECIMAL_POINT,   self::ST_QUOTED_LEXEME),
        array(self::ST_QUOTED_LEXEME, self::IN_ASCII_DIGIT,     self::ST_QUOTED_LEXEME),
        array(self::ST_QUOTED_LEXEME, self::IN_CHAR,            self::ST_QUOTED_LEXEME),
    ));

    $this->addRules(array(
        array(self::ST_ESCAPED_CHAR, self::IN_WHITE_SPACE,     self::ST_LEXEME),
        array(self::ST_ESCAPED_CHAR, self::IN_SYNT_CHAR,       self::ST_LEXEME),
        array(self::ST_ESCAPED_CHAR, self::IN_MUTABLE_CHAR,    self::ST_LEXEME),
        array(self::ST_ESCAPED_CHAR, self::IN_LEXEME_MODIFIER, self::ST_LEXEME),
        array(self::ST_ESCAPED_CHAR, self::IN_ESCAPE_CHAR,     self::ST_LEXEME),
        array(self::ST_ESCAPED_CHAR, self::IN_QUOTE,           self::ST_LEXEME),
        array(self::ST_ESCAPED_CHAR, self::IN_DECIMAL_POINT,   self::ST_LEXEME),
        array(self::ST_ESCAPED_CHAR, self::IN_ASCII_DIGIT,     self::ST_LEXEME),
        array(self::ST_ESCAPED_CHAR, self::IN_CHAR,            self::ST_LEXEME),
    ));

    $this->addRules(array(
        array(self::ST_ESCAPED_QCHAR, self::IN_WHITE_SPACE,     self::ST_QUOTED_LEXEME),
        array(self::ST_ESCAPED_QCHAR, self::IN_SYNT_CHAR,       self::ST_QUOTED_LEXEME),
        array(self::ST_ESCAPED_QCHAR, self::IN_MUTABLE_CHAR,    self::ST_QUOTED_LEXEME),
        array(self::ST_ESCAPED_QCHAR, self::IN_LEXEME_MODIFIER, self::ST_QUOTED_LEXEME),
        array(self::ST_ESCAPED_QCHAR, self::IN_ESCAPE_CHAR,     self::ST_QUOTED_LEXEME),
        array(self::ST_ESCAPED_QCHAR, self::IN_QUOTE,           self::ST_QUOTED_LEXEME),
        array(self::ST_ESCAPED_QCHAR, self::IN_DECIMAL_POINT,   self::ST_QUOTED_LEXEME),
        array(self::ST_ESCAPED_QCHAR, self::IN_ASCII_DIGIT,     self::ST_QUOTED_LEXEME),
        array(self::ST_ESCAPED_QCHAR, self::IN_CHAR,            self::ST_QUOTED_LEXEME),
    ));

    $this->addRules(array(
        array(self::ST_LEXEME_MODIFIER, self::IN_WHITE_SPACE,     self::ST_WHITE_SPACE),
        array(self::ST_LEXEME_MODIFIER, self::IN_SYNT_CHAR,       self::ST_SYNT_LEXEME),
        array(self::ST_LEXEME_MODIFIER, self::IN_MUTABLE_CHAR,    self::ST_SYNT_LEXEME),
        array(self::ST_LEXEME_MODIFIER, self::IN_LEXEME_MODIFIER, self::ST_LEXEME_MODIFIER),

        // IN_ESCAPE_CHAR not allowed
        array(self::ST_LEXEME_MODIFIER, self::IN_ESCAPE_CHAR,     self::ST_ERROR, $lexemeModifierErrorAction),

        // IN_QUOTE not allowed
        array(self::ST_LEXEME_MODIFIER, self::IN_QUOTE,           self::ST_ERROR, $lexemeModifierErrorAction),

        array(self::ST_LEXEME_MODIFIER, self::IN_DECIMAL_POINT,   self::ST_MANTISSA),
        array(self::ST_LEXEME_MODIFIER, self::IN_ASCII_DIGIT,     self::ST_NUMBER),

        // IN_CHAR not allowed
        array(self::ST_LEXEME_MODIFIER, self::IN_CHAR,            self::ST_ERROR, $lexemeModifierErrorAction),
    ));

    $this->addRules(array(
        array(self::ST_NUMBER, self::IN_WHITE_SPACE,     self::ST_WHITE_SPACE),
        array(self::ST_NUMBER, self::IN_SYNT_CHAR,       self::ST_SYNT_LEXEME),
        array(self::ST_NUMBER, self::IN_MUTABLE_CHAR,    self::ST_SYNT_LEXEME),
        array(self::ST_NUMBER, self::IN_LEXEME_MODIFIER, self::ST_LEXEME_MODIFIER),

        // IN_ESCAPE_CHAR not allowed
        array(self::ST_NUMBER, self::IN_ESCAPE_CHAR,     self::ST_ERROR, $wrongNumberErrorAction),

        // IN_QUOTE not allowed
        array(self::ST_NUMBER, self::IN_QUOTE,           self::ST_ERROR, $wrongNumberErrorAction),

        array(self::ST_NUMBER, self::IN_DECIMAL_POINT,   self::ST_MANTISSA),
        array(self::ST_NUMBER, self::IN_ASCII_DIGIT,     self::ST_NUMBER),

        // IN_CHAR not allowed
        array(self::ST_NUMBER, self::IN_CHAR,            self::ST_ERROR, $wrongNumberErrorAction),
    ));

    $this->addRules(array(
        array(self::ST_MANTISSA, self::IN_WHITE_SPACE,     self::ST_WHITE_SPACE),
        array(self::ST_MANTISSA, self::IN_SYNT_CHAR,       self::ST_SYNT_LEXEME),
        array(self::ST_MANTISSA, self::IN_MUTABLE_CHAR,    self::ST_SYNT_LEXEME),
        array(self::ST_MANTISSA, self::IN_LEXEME_MODIFIER, self::ST_LEXEME_MODIFIER),

        // IN_ESCAPE_CHAR not allowed
        array(self::ST_MANTISSA, self::IN_ESCAPE_CHAR,     self::ST_ERROR, $wrongNumberErrorAction),

        // IN_QUOTE not allowed
        array(self::ST_MANTISSA, self::IN_QUOTE,           self::ST_ERROR, $wrongNumberErrorAction),

        // IN_DECIMAL_POINT not allowed
        array(self::ST_MANTISSA, self::IN_DECIMAL_POINT,   self::ST_ERROR, $wrongNumberErrorAction),

        array(self::ST_MANTISSA, self::IN_ASCII_DIGIT,     self::ST_MANTISSA),

        // IN_CHAR not allowed
        array(self::ST_MANTISSA, self::IN_CHAR,            self::ST_ERROR, $wrongNumberErrorAction),
    ));

    /** Actions */
    $syntaxLexemeAction    = new Zend_Search_Lucene_FSMAction($this, 'addQuerySyntaxLexeme');
    $lexemeModifierAction  = new Zend_Search_Lucene_FSMAction($this, 'addLexemeModifier');
    $addLexemeAction       = new Zend_Search_Lucene_FSMAction($this, 'addLexeme');
    $addQuotedLexemeAction = new Zend_Search_Lucene_FSMAction($this, 'addQuotedLexeme');
    $addNumberLexemeAction = new Zend_Search_Lucene_FSMAction($this, 'addNumberLexeme');
    $addLexemeCharAction   = new Zend_Search_Lucene_FSMAction($this, 'addLexemeChar');

    /** Syntax lexeme */
    $this->addEntryAction(self::ST_SYNT_LEXEME, $syntaxLexemeAction);
    // Two lexemes in succession
    $this->addTransitionAction(self::ST_SYNT_LEXEME, self::ST_SYNT_LEXEME, $syntaxLexemeAction);

    /** Lexeme */
    $this->addEntryAction(self::ST_LEXEME, $addLexemeCharAction);
    $this->addTransitionAction(self::ST_LEXEME, self::ST_LEXEME, $addLexemeCharAction);
    // ST_ESCAPED_CHAR => ST_LEXEME transition is covered by ST_LEXEME entry action

    $this->addTransitionAction(self::ST_LEXEME, self::ST_WHITE_SPACE,     $addLexemeAction);
    $this->addTransitionAction(self::ST_LEXEME, self::ST_SYNT_LEXEME,     $addLexemeAction);
    $this->addTransitionAction(self::ST_LEXEME, self::ST_QUOTED_LEXEME,   $addLexemeAction);
    $this->addTransitionAction(self::ST_LEXEME, self::ST_LEXEME_MODIFIER, $addLexemeAction);
    $this->addTransitionAction(self::ST_LEXEME, self::ST_NUMBER,          $addLexemeAction);
    $this->addTransitionAction(self::ST_LEXEME, self::ST_MANTISSA,        $addLexemeAction);

    /** Quoted lexeme */
    // We don't need entry action (skip quote)
    $this->addTransitionAction(self::ST_QUOTED_LEXEME, self::ST_QUOTED_LEXEME, $addLexemeCharAction);
    $this->addTransitionAction(self::ST_ESCAPED_QCHAR, self::ST_QUOTED_LEXEME, $addLexemeCharAction);
    // Closing quote changes state to the ST_WHITE_SPACE other states are not used
    $this->addTransitionAction(self::ST_QUOTED_LEXEME, self::ST_WHITE_SPACE, $addQuotedLexemeAction);

    /** Lexeme modifier */
    $this->addEntryAction(self::ST_LEXEME_MODIFIER, $lexemeModifierAction);

    /** Number */
    $this->addEntryAction(self::ST_NUMBER,   $addLexemeCharAction);
    $this->addEntryAction(self::ST_MANTISSA, $addLexemeCharAction);
    $this->addTransitionAction(self::ST_NUMBER, self::ST_NUMBER, $addLexemeCharAction);
    // ST_NUMBER => ST_MANTISSA transition is covered by ST_MANTISSA entry action
    $this->addTransitionAction(self::ST_MANTISSA, self::ST_MANTISSA, $addLexemeCharAction);

    $this->addTransitionAction(self::ST_NUMBER,   self::ST_WHITE_SPACE,     $addNumberLexemeAction);
    $this->addTransitionAction(self::ST_NUMBER,   self::ST_SYNT_LEXEME,     $addNumberLexemeAction);
    $this->addTransitionAction(self::ST_NUMBER,   self::ST_LEXEME_MODIFIER, $addNumberLexemeAction);
    $this->addTransitionAction(self::ST_MANTISSA, self::ST_WHITE_SPACE,     $addNumberLexemeAction);
    $this->addTransitionAction(self::ST_MANTISSA, self::ST_SYNT_LEXEME,     $addNumberLexemeAction);
    $this->addTransitionAction(self::ST_MANTISSA, self::ST_LEXEME_MODIFIER, $addNumberLexemeAction);
}
311
/**
 * Translate input char to an input symbol of state machine
 *
 * The character is first checked against the character-class constants, in
 * a fixed order, and then against the three single-character symbols.
 * Anything that matches nothing is reported as IN_CHAR.
 *
 * @param string $char
 * @return integer one of the IN_* constants
 */
private function _translateInput($char)
{
    // (character class, input symbol) pairs, tested in this order
    $characterClasses = array(
        array(self::QUERY_WHITE_SPACE_CHARS,    self::IN_WHITE_SPACE),
        array(self::QUERY_SYNT_CHARS,           self::IN_SYNT_CHAR),
        array(self::QUERY_MUTABLE_CHARS,        self::IN_MUTABLE_CHAR),
        array(self::QUERY_LEXEMEMODIFIER_CHARS, self::IN_LEXEME_MODIFIER),
        array(self::QUERY_ASCIIDIGITS_CHARS,    self::IN_ASCII_DIGIT),
    );

    foreach ($characterClasses as $classAndSymbol) {
        if (strpos($classAndSymbol[0], $char) !== false) {
            return $classAndSymbol[1];
        }
    }

    if ($char === '"') {
        return self::IN_QUOTE;
    }

    if ($char === '.') {
        return self::IN_DECIMAL_POINT;
    }

    if ($char === '\\') {
        return self::IN_ESCAPE_CHAR;
    }

    return self::IN_CHAR;
}
332
/**
 * This method is used to tokenize query string into lexemes
 *
 * The input is split into an array of characters using iconv (so multi-byte
 * encodings are handled per character), each character is translated into
 * an input symbol and fed to the state machine; the registered actions
 * collect the resulting tokens.
 *
 * @param string $inputString
 * @param string $encoding
 * @return array list of recognized lexemes
 * @throws Zend_Search_Lucene_Search_QueryParserException
 */
public function tokenize($inputString, $encoding)
{
    // Start from a clean slate: a previous run that ended with an exception
    // would otherwise leave the machine in its last state and a partially
    // collected lexeme behind.
    $this->reset();

    $this->_lexemes       = array();
    $this->_queryString   = array();
    $this->_currentLexeme = '';

    if (PHP_OS == 'AIX' && $encoding == '') {
        $encoding = 'ISO8859-1';
    }

    $strLength = iconv_strlen($inputString, $encoding);
    if ($strLength === false) {
        // iconv could not decode the input; report it instead of silently
        // returning an empty lexeme list.
        throw new Zend_Search_Lucene_Search_QueryParserException('Query string can not be decoded using specified encoding.');
    }

    // Split the query into separate characters
    for ($count = 0; $count < $strLength; $count++) {
        $this->_queryString[$count] = iconv_substr($inputString, $count, 1, $encoding);
    }

    // The position is an object property because actions read it and
    // addQuerySyntaxLexeme() may advance it to consume a second character.
    $queryLength = count($this->_queryString);
    for ($this->_queryStringPosition = 0;
         $this->_queryStringPosition < $queryLength;
         $this->_queryStringPosition++) {
        $this->process($this->_translateInput($this->_queryString[$this->_queryStringPosition]));
    }

    // Feed a trailing white space so that a lexeme still being collected is flushed
    $this->process(self::IN_WHITE_SPACE);

    if ($this->getState() != self::ST_WHITE_SPACE) {
        throw new Zend_Search_Lucene_Search_QueryParserException('Unexpected end of query');
    }

    $this->_queryString = null;

    return $this->_lexemes;
}
377
/*********************************************************************
378
* Actions implementation
380
* Actions operate on the list of recognized lexemes
381
*********************************************************************/
384
/**
 * Add query syntax lexeme
 *
 * Builds a syntax-element token from the character at the current position.
 * Characters listed in QUERY_DOUBLECHARLEXEME_CHARS must be immediately
 * followed by the same character; the pair is consumed and stored as one
 * two-character lexeme. A field indicator is not stored itself: the
 * preceding word token is re-typed as a field instead.
 *
 * @throws Zend_Search_Lucene_Search_QueryParserException
 */
public function addQuerySyntaxLexeme()
{
    $lexeme = $this->_queryString[$this->_queryStringPosition];

    // Process two char lexemes
    if (strpos(self::QUERY_DOUBLECHARLEXEME_CHARS, $lexeme) !== false) {
        // increase current position in a query string
        $this->_queryStringPosition++;

        // the second character must exist and repeat the first one
        if ($this->_queryStringPosition == count($this->_queryString) ||
            $this->_queryString[$this->_queryStringPosition] !== $lexeme) {
            throw new Zend_Search_Lucene_Search_QueryParserException('Two chars lexeme expected. ' . $this->_positionMsg());
        }

        // duplicate character
        $lexeme .= $lexeme;
    }

    // Same (category, text, position) argument order as the other token-building actions
    $token = new Zend_Search_Lucene_Search_QueryToken(
        Zend_Search_Lucene_Search_QueryToken::TC_SYNTAX_ELEMENT,
        $lexeme,
        $this->_queryStringPosition);

    // Skip this lexeme if it's a field indicator ':' and treat previous as 'field' instead of 'word'
    if ($token->type == Zend_Search_Lucene_Search_QueryToken::TT_FIELD_INDICATOR) {
        // array_pop() yields null when no lexeme has been recognized yet
        $token = array_pop($this->_lexemes);
        if ($token === null || $token->type != Zend_Search_Lucene_Search_QueryToken::TT_WORD) {
            throw new Zend_Search_Lucene_Search_QueryParserException('Field mark \':\' must follow field name. ' . $this->_positionMsg());
        }

        $token->type = Zend_Search_Lucene_Search_QueryToken::TT_FIELD;
    }

    $this->_lexemes[] = $token;
}
426
/**
 * Add lexeme modifier
 *
 * Stores the character at the current position as a syntax-element token.
 */
public function addLexemeModifier()
{
    $position = $this->_queryStringPosition;
    $modifier = $this->_queryString[$position];

    $token = new Zend_Search_Lucene_Search_QueryToken(
        Zend_Search_Lucene_Search_QueryToken::TC_SYNTAX_ELEMENT,
        $modifier,
        $position);

    $this->_lexemes[] = $token;
}
440
/**
 * Add lexeme
 *
 * Turns the characters collected so far into a word token and starts a new,
 * empty lexeme. The token position is the one just before the current
 * position.
 */
public function addLexeme()
{
    $wordPosition = $this->_queryStringPosition - 1;

    $wordToken = new Zend_Search_Lucene_Search_QueryToken(
        Zend_Search_Lucene_Search_QueryToken::TC_WORD,
        $this->_currentLexeme,
        $wordPosition);

    $this->_lexemes[]     = $wordToken;
    $this->_currentLexeme = '';
}
453
/**
 * Add quoted lexeme
 *
 * Turns the characters collected so far into a phrase token positioned at
 * the current position and starts a new, empty lexeme.
 */
public function addQuotedLexeme()
{
    $phraseToken = new Zend_Search_Lucene_Search_QueryToken(
        Zend_Search_Lucene_Search_QueryToken::TC_PHRASE,
        $this->_currentLexeme,
        $this->_queryStringPosition);

    $this->_lexemes[]     = $phraseToken;
    $this->_currentLexeme = '';
}
466
/**
 * Add number lexeme
 *
 * Turns the characters collected so far into a number token and starts a
 * new, empty lexeme. The token position is the one just before the current
 * position.
 */
public function addNumberLexeme()
{
    $numberPosition = $this->_queryStringPosition - 1;

    $numberToken = new Zend_Search_Lucene_Search_QueryToken(
        Zend_Search_Lucene_Search_QueryToken::TC_NUMBER,
        $this->_currentLexeme,
        $numberPosition);

    $this->_lexemes[]     = $numberToken;
    $this->_currentLexeme = '';
}
476
/**
 * Extend lexeme by one char
 *
 * Appends the character at the current query position to the lexeme that is
 * being collected.
 */
public function addLexemeChar()
{
    $currentChar = $this->_queryString[$this->_queryStringPosition];

    $this->_currentLexeme = $this->_currentLexeme . $currentChar;
}
489
/**
 * Position message
 *
 * Builds the "Position is N." suffix appended to parser error messages,
 * where N is the current position within the query string.
 *
 * @return string
 */
private function _positionMsg()
{
    return "Position is {$this->_queryStringPosition}.";
}
495
/*********************************************************************
496
* Syntax errors actions
497
*********************************************************************/
498
/**
 * Error action: a lexeme modifier is followed by a character that is not allowed there.
 *
 * @throws Zend_Search_Lucene_Search_QueryParserException always
 */
public function lexModifierErrException()
{
    $message = 'Lexeme modifier character can be followed only by number, white space or query syntax element. '
             . $this->_positionMsg();

    throw new Zend_Search_Lucene_Search_QueryParserException($message);
}
502
/**
 * Error action: an unescaped quote was found inside a lexeme.
 *
 * @throws Zend_Search_Lucene_Search_QueryParserException always
 */
public function quoteWithinLexemeErrException()
{
    $message = 'Quote within lexeme must be escaped by \'\\\' char. '
             . $this->_positionMsg();

    throw new Zend_Search_Lucene_Search_QueryParserException($message);
}
506
/**
 * Error action: a number is followed by a character that is not allowed there.
 *
 * The message ends with a space before the position suffix, like the other
 * error actions, so the two sentences do not run together.
 *
 * @throws Zend_Search_Lucene_Search_QueryParserException always
 */
public function wrongNumberErrException()
{
    throw new Zend_Search_Lucene_Search_QueryParserException('Wrong number syntax. ' . $this->_positionMsg());
}