# -*- test-case-name: twisted.words.test -*-
# Copyright (c) 2001-2005 Twisted Matrix Laboratories.
# See LICENSE for details.

# NOTE! Actual grammar is at the end of the file

# To regenerate the parser:
# 1.) Grab a copy of yapps2: http://theory.stanford.edu/~amitp/Yapps/
# 2.) Hack it to not add a "import yappsrt" in the output file
# 3.) Generate the grammar as usual
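#
# For example (illustrative only; this assumes the grammar source for this
# module is named xpathparser.g and the patched yapps2.py is on your path --
# the exact invocation may differ between yapps versions):
#
#   python yapps2.py xpathparser.g xpathparser.py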

"""Run time libraries needed to run parsers generated by Yapps.

This module defines parse-time exception classes, a scanner class, a
base class for parsers produced by Yapps, and a context class that
keeps track of the parse stack.
"""

# TODO: it should be possible to embed yappsrt into the generated
# grammar to make a standalone module.

import sys, re

class SyntaxError(Exception):
    """When we run into an unexpected token, this is the exception to use"""
    def __init__(self, charpos=-1, msg="Bad Token", context=None):
        Exception.__init__(self)
        self.charpos = charpos
        self.msg = msg
        self.context = context

    def __str__(self):
        if self.charpos < 0: return 'SyntaxError'
        else: return 'SyntaxError@char%s(%s)' % (repr(self.charpos), self.msg)

class NoMoreTokens(Exception):
    """Another exception object, for when we run out of tokens"""

class Scanner:
    """Yapps scanner.

    The Yapps scanner can work in context sensitive or context
    insensitive modes. The token(i) method is used to retrieve the
    i-th token. It takes a restrict set that limits the set of tokens
    it is allowed to return. In context sensitive mode, this restrict
    set guides the scanner. In context insensitive mode, there is no
    restriction (the set is always the full set of tokens).
    """

def __init__(self, patterns, ignore, input):
        """ Initialize the scanner.

        @param patterns: [(terminal, uncompiled regex), ...] or C{None}
        @param ignore: [terminal,...]
        @param input: string to be scanned

        If patterns is C{None}, we assume that the subclass has defined
        C{self.patterns} : [(terminal, compiled regex), ...]. Note that the
        patterns parameter expects uncompiled regexes, whereas the
        C{self.patterns} field expects compiled regexes.
        """
        self.tokens = [] # [(begin char pos, end char pos, token name, matched text), ...]
        self.restrictions = []
        self.input = input
        self.pos = 0
        self.ignore = ignore
        self.first_line_number = 1

        if patterns is not None:
            # Compile the regex strings into regex objects
            self.patterns = []
            for terminal, regex in patterns:
                self.patterns.append( (terminal, re.compile(regex)) )

def get_token_pos(self):
        """Get the current token position in the input text."""
        return len(self.tokens)

    def get_char_pos(self):
        """Get the current char position in the input text."""
        return self.pos

def get_prev_char_pos(self, i=None):
        """Get the previous position (one token back) in the input text."""
        if self.pos == 0: return 0
        if i is None: i = -1
        return self.tokens[i][0]

def get_line_number(self):
        """Get the line number of the current position in the input text."""
        # TODO: make this work at any token/char position
        return self.first_line_number + self.get_input_scanned().count('\n')

def get_column_number(self):
        """Get the column number of the current position in the input text."""
        s = self.get_input_scanned()
        i = s.rfind('\n') # may be -1, but that's okay in this case
        return len(s) - (i+1)

def get_input_scanned(self):
        """Get the portion of the input that has been tokenized."""
        return self.input[:self.pos]

def get_input_unscanned(self):
        """Get the portion of the input that has not yet been tokenized."""
        return self.input[self.pos:]

def token(self, i, restrict=None):
        """Get the i'th token in the input.

        If L{i} is one past the end, then scan for another token.

        @param i: token index
        @param restrict: [token, ...] or C{None}; if restrict is C{None},
            then any token is allowed. You may call token(i) more
            than once. However, the restrict set may never be
            larger than what was passed in on the first call to
            token(i).
        """
        if i == len(self.tokens):
            self.scan(restrict)
        if i < len(self.tokens):
            # Make sure the restriction is more restricted. This
            # invariant is needed to avoid ruining tokenization at
            # position i+1 and higher.
            if restrict and self.restrictions[i]:
                for r in restrict:
                    if r not in self.restrictions[i]:
                        raise NotImplementedError("Unimplemented: restriction set changed")
            return self.tokens[i]
        raise NoMoreTokens()

    def __repr__(self):
        """Print the last 10 tokens that have been scanned in"""
        output = ''
        for t in self.tokens[-10:]:
            output = '%s\n (@%s) %s = %s' % (output,t[0],t[2],repr(t[3]))
        return output

def scan(self, restrict):
        """Should scan another token and add it to the list, self.tokens,
        and add the restriction to self.restrictions"""
        # Keep looking for a token, ignoring any in self.ignore
        while 1:
            # Search the patterns for the longest match, with earlier
            # tokens in the list having preference
            best_match = -1
            best_pat = '(error)'
            for p, regexp in self.patterns:
                # First check to see if we're ignoring this token
                if restrict and p not in restrict and p not in self.ignore:
                    continue
                m = regexp.match(self.input, self.pos)
                if m and len(m.group(0)) > best_match:
                    # We got a match that's better than the previous one
                    best_pat = p
                    best_match = len(m.group(0))

            # If we didn't find anything, raise an error
            if best_pat == '(error)' and best_match < 0:
                msg = 'Bad Token'
                if restrict:
                    msg = 'Trying to find one of '+', '.join(restrict)
                raise SyntaxError(self.pos, msg)

            # If we found something that isn't to be ignored, return it
            if best_pat not in self.ignore:
                # Create a token with this data
                token = (self.pos, self.pos+best_match, best_pat,
                         self.input[self.pos:self.pos+best_match])
                self.pos = self.pos + best_match
                # Only add this token if it's not in the list
                # (to prevent looping)
                if not self.tokens or token != self.tokens[-1]:
                    self.tokens.append(token)
                    self.restrictions.append(restrict)
                return
            else:
                # This token should be ignored ..
                self.pos = self.pos + best_match
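
# A minimal sketch of driving the Scanner on its own (illustrative only; the
# token names and input below are hypothetical, not part of this module):
#
#   s = Scanner([('\\s+', '\\s+'), ('NUM', '[0-9]+'), ('PLUS', '\\+')],
#               ['\\s+'], '1 + 2')
#   s.token(0)            # -> (0, 1, 'NUM', '1')
#   s.token(1, ['PLUS'])  # context-sensitive: only PLUS may match here;
#                         # whitespace in the ignore list is skipped,
#                         # so this returns (2, 3, 'PLUS', '+')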

class Parser:
    """Base class for Yapps-generated parsers.
    """

    def __init__(self, scanner):
        self._scanner = scanner
        self._pos = 0

def _peek(self, *types):
        """Returns the token type for lookahead; if there are any args
        then the list of args is the set of token types to allow"""
        tok = self._scanner.token(self._pos, types)
        return tok[2]

def _scan(self, type):
        """Returns the matched text, and moves to the next token"""
        tok = self._scanner.token(self._pos, [type])
        if tok[2] != type:
            raise SyntaxError(tok[0], 'Trying to find '+type+' :'+ ' ,'.join(self._scanner.restrictions[self._pos]))
        self._pos = 1 + self._pos
        return tok[3]

class Context:
    """Class to represent the parser's call stack.

    Every rule creates a Context that links to its parent rule. The
    contexts can be used for debugging.
    """

def __init__(self, parent, scanner, tokenpos, rule, args=()):
        """Create a new context.

        @param parent: Context object or C{None}
        @param scanner: Scanner object
        @param tokenpos: scanner token position
        @type tokenpos: L{int}
        @param rule: name of the rule
        @type rule: L{str}
        @param args: tuple listing parameters to the rule
        """
        self.parent = parent
        self.scanner = scanner
        self.tokenpos = tokenpos
        self.rule = rule
        self.args = args

    def __str__(self):
        output = ''
        if self.parent: output = str(self.parent) + ' > '
        output += self.rule
        return output

def print_line_with_pointer(text, p):
    """Print the line of 'text' that includes position 'p',
    along with a second line with a single caret (^) at position p"""

    # TODO: separate out the logic for determining the line/character
    # location from the logic for determining how to display an
    # 80-column line to stderr.

    # Now try printing part of the line
    text = text[max(p-80, 0):p+80]
    p = p - max(p-80, 0)

    # Strip to the left of the line containing position p
    i = text[:p].rfind('\n')
    j = text[:p].rfind('\r')
    if i < 0 or (0 <= j < i): i = j
    if 0 <= i < p:
        p = p - i - 1
        text = text[i+1:]

    # Strip to the right of the line containing position p
    i = text.find('\n', p)
    j = text.find('\r', p)
    if i < 0 or (0 <= j < i): i = j
    if i >= 0:
        text = text[:i]

    # Now shorten the text
    while len(text) > 70 and p > 60:
        # Cut off 10 characters
        text = "..." + text[10:]
        p = p - 7

    # Now print the string, along with an indicator
    print >>sys.stderr, '> ',text
    print >>sys.stderr, '> ',' '*p + '^'
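
# For example, print_line_with_pointer("ab\ncdef\ng", 5) writes roughly the
# following to stderr (the caret marks character position 5, the 'e'):
#
#   >  cdef
#   >    ^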

def print_error(input, err, scanner):
    """Print error messages, the parser stack, and the input text -- for human-readable error messages."""
    # NOTE: this function assumes 80 columns :-(
    # Figure out the line number
    line_number = scanner.get_line_number()
    column_number = scanner.get_column_number()
    print >>sys.stderr, '%d:%d: %s' % (line_number, column_number, err.msg)

    context = err.context
    if not context:
        print_line_with_pointer(input, err.charpos)

    while context:
        # TODO: add line number
        print >>sys.stderr, 'while parsing %s%s:' % (context.rule, tuple(context.args))
        print_line_with_pointer(input, context.scanner.get_prev_char_pos(context.tokenpos))
        context = context.parent

def wrap_error_reporter(parser, rule):
    try:
        return getattr(parser, rule)()
    except SyntaxError, e:
        input = parser._scanner.input
        print_error(input, e, parser._scanner)
    except NoMoreTokens:
        print >>sys.stderr, 'Could not complete parsing; stopped around here:'
        print >>sys.stderr, parser._scanner

from twisted.words.xish.xpath import _Location, _AnyLocation, IndexValue, CompareValue, AttribValue, LiteralValue, Function

# Begin -- grammar generated by Yapps

class XPathParserScanner(Scanner):
    patterns = [
        ('"\\)"', re.compile('\\)')),
        ('","', re.compile(',')),
        ('"\\("', re.compile('\\(')),
        ('"@"', re.compile('@')),
        ('"\\]"', re.compile('\\]')),
        ('"\\["', re.compile('\\[')),
        ('"//"', re.compile('//')),
        ('"/"', re.compile('/')),
        ('\\s+', re.compile('\\s+')),
        ('INDEX', re.compile('[0-9]+')),
        ('WILDCARD', re.compile('\\*')),
        ('IDENTIFIER', re.compile('[a-zA-Z][a-zA-Z0-9_\\-]*')),
        ('ATTRIBUTE', re.compile('\\@[a-zA-Z][a-zA-Z0-9_\\-]*')),
        ('FUNCNAME', re.compile('[a-zA-Z][a-zA-Z0-9_]*')),
        ('CMP_EQ', re.compile('\\=')),
        ('CMP_NE', re.compile('\\!\\=')),
        ('STR_DQ', re.compile('"([^"]|(\\"))*?"')),
        ('STR_SQ', re.compile("'([^']|(\\'))*?'")),
        ('END', re.compile('$')),
    ]
    def __init__(self, str):
        Scanner.__init__(self,None,['\\s+'],str)

class XPathParser(Parser):
    Context = Context
def XPATH(self, _parent=None):
        _context = self.Context(_parent, self._scanner, self._pos, 'XPATH', [])
        PATH = self.PATH(_context)
        result = PATH; current = result
        while self._peek('END', '"/"', '"//"') != 'END':
            PATH = self.PATH(_context)
            current.childLocation = PATH; current = current.childLocation
        if self._peek() not in ['END', '"/"', '"//"']:
            raise SyntaxError(charpos=self._scanner.get_prev_char_pos(), context=_context, msg='Need one of ' + ', '.join(['END', '"/"', '"//"']))
        END = self._scan('END')
        return result

def PATH(self, _parent=None):
        _context = self.Context(_parent, self._scanner, self._pos, 'PATH', [])
        _token = self._peek('"/"', '"//"')
        if _token == '"/"':
            self._scan('"/"')
            result = _Location()
        else: # == '"//"'
            self._scan('"//"')
            result = _AnyLocation()
        _token = self._peek('IDENTIFIER', 'WILDCARD')
        if _token == 'IDENTIFIER':
            IDENTIFIER = self._scan('IDENTIFIER')
            result.elementName = IDENTIFIER
        else: # == 'WILDCARD'
            WILDCARD = self._scan('WILDCARD')
            result.elementName = None
        while self._peek('"\\["', 'END', '"/"', '"//"') == '"\\["':
            self._scan('"\\["')
            PREDICATE = self.PREDICATE(_context)
            result.predicates.append(PREDICATE)
            self._scan('"\\]"')
        if self._peek() not in ['"\\["', 'END', '"/"', '"//"']:
            raise SyntaxError(charpos=self._scanner.get_prev_char_pos(), context=_context, msg='Need one of ' + ', '.join(['"\\["', 'END', '"/"', '"//"']))
        return result

def PREDICATE(self, _parent=None):
        _context = self.Context(_parent, self._scanner, self._pos, 'PREDICATE', [])
        _token = self._peek('INDEX', '"@"', 'FUNCNAME', 'STR_DQ', 'STR_SQ')
        if _token != 'INDEX':
            EXPR = self.EXPR(_context)
            return EXPR
        else: # == 'INDEX'
            INDEX = self._scan('INDEX')
            return IndexValue(INDEX)

def EXPR(self, _parent=None):
        _context = self.Context(_parent, self._scanner, self._pos, 'EXPR', [])
        VALUE = self.VALUE(_context)
        e = VALUE
        if self._peek('CMP_EQ', 'CMP_NE', '"\\]"') != '"\\]"':
            CMP = self.CMP(_context)
            VALUE = self.VALUE(_context)
            e = CompareValue(e, CMP, VALUE)
        return e

def VALUE(self, _parent=None):
        _context = self.Context(_parent, self._scanner, self._pos, 'VALUE', [])
        _token = self._peek('"@"', 'FUNCNAME', 'STR_DQ', 'STR_SQ')
        if _token == '"@"':
            self._scan('"@"')
            IDENTIFIER = self._scan('IDENTIFIER')
            return AttribValue(IDENTIFIER)
        elif _token == 'FUNCNAME':
            FUNCNAME = self._scan('FUNCNAME')
            f = Function(FUNCNAME); args = []
            self._scan('"\\("')
            if self._peek('"\\)"', '"@"', 'FUNCNAME', '","', 'STR_DQ', 'STR_SQ') not in ['"\\)"', '","']:
                VALUE = self.VALUE(_context)
                args.append(VALUE)
                while self._peek('","', '"\\)"') == '","':
                    self._scan('","')
                    VALUE = self.VALUE(_context)
                    args.append(VALUE)
                if self._peek() not in ['","', '"\\)"']:
                    raise SyntaxError(charpos=self._scanner.get_prev_char_pos(), context=_context, msg='Need one of ' + ', '.join(['","', '"\\)"']))
            self._scan('"\\)"')
            f.setParams(*args); return f
        else: # in ['STR_DQ', 'STR_SQ']
            STR = self.STR(_context)
            return LiteralValue(STR[1:len(STR)-1])

def CMP(self, _parent=None):
        _context = self.Context(_parent, self._scanner, self._pos, 'CMP', [])
        _token = self._peek('CMP_EQ', 'CMP_NE')
        if _token == 'CMP_EQ':
            CMP_EQ = self._scan('CMP_EQ')
            return CMP_EQ
        else: # == 'CMP_NE'
            CMP_NE = self._scan('CMP_NE')
            return CMP_NE

def STR(self, _parent=None):
        _context = self.Context(_parent, self._scanner, self._pos, 'STR', [])
        _token = self._peek('STR_DQ', 'STR_SQ')
        if _token == 'STR_DQ':
            STR_DQ = self._scan('STR_DQ')
            return STR_DQ
        else: # == 'STR_SQ'
            STR_SQ = self._scan('STR_SQ')
            return STR_SQ


def parse(rule, text):
    P = XPathParser(XPathParserScanner(text))
    return wrap_error_reporter(P, rule)
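
# A minimal usage sketch (illustrative; the query string below is hypothetical):
#
#   location = parse('XPATH', '/message/body[@type="chat"]')
#
# 'XPATH' names the start rule defined above; on success the return value is
# the _Location chain built by that rule, and on a parse error
# wrap_error_reporter prints a diagnostic to stderr and the call returns None.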

if __name__ == '__main__':
    from sys import argv, stdin
    if len(argv) >= 2:
        if len(argv) >= 3:
            f = open(argv[2],'r')
        else:
            f = stdin
        print parse(argv[1], f.read())
    else: print >>sys.stderr, 'Args: <rule> [<filename>]'
# End -- grammar generated by Yapps