# Copyright (c) 2001-2007 Twisted Matrix Laboratories.
# See LICENSE for details.

# DO NOT EDIT xpathparser.py!
# It is generated from xpathparser.g using Yapps. Make needed changes there.
# This also means that the generated Python may not conform to Twisted's coding

# 1.) Grab a copy of yapps2, version 2.1.1:
#       http://theory.stanford.edu/~amitp/Yapps/
#     Note: Do NOT use the package in debian/ubuntu as it has incompatible

# 2.) Generate the grammar:
#       yapps2 xpathparser.g xpathparser.py.proto

# 3.) Edit the output to depend on the embedded runtime, not yappsrt.
#       sed -e '/^import yapps/d' -e '/^[^#]/s/yappsrt\.//g' \
#           xpathparser.py.proto > xpathparser.py
30
Besides the parser code produced by Yapps, this module also defines the
parse-time exception classes, a scanner class, a base class for parsers
produced by Yapps, and a context class that keeps track of the parse stack.
These have been copied from the Yapps runtime.
38
class SyntaxError(Exception):
39
"""When we run into an unexpected token, this is the exception to use"""
40
def __init__(self, charpos=-1, msg="Bad Token", context=None):
41
Exception.__init__(self)
42
self.charpos = charpos
44
self.context = context
47
if self.charpos < 0: return 'SyntaxError'
48
else: return 'SyntaxError@char%s(%s)' % (repr(self.charpos), self.msg)
50
class NoMoreTokens(Exception):
    """Another exception object, for when we run out of tokens"""
57
The Yapps scanner can work in context sensitive or context
58
insensitive modes. The token(i) method is used to retrieve the
59
i-th token. It takes a restrict set that limits the set of tokens
60
it is allowed to return. In context sensitive mode, this restrict
61
set guides the scanner. In context insensitive mode, there is no
62
restriction (the set is always the full set of tokens).
66
def __init__(self, patterns, ignore, input):
67
"""Initialize the scanner.
69
@param patterns: [(terminal, uncompiled regex), ...] or C{None}
70
@param ignore: [terminal,...]
73
If patterns is C{None}, we assume that the subclass has defined
74
C{self.patterns} : [(terminal, compiled regex), ...]. Note that the
75
patterns parameter expects uncompiled regexes, whereas the
76
C{self.patterns} field expects compiled regexes.
78
self.tokens = [] # [(begin char pos, end char pos, token name, matched text), ...]
79
self.restrictions = []
83
self.first_line_number = 1
85
if patterns is not None:
86
# Compile the regex strings into regex objects
88
for terminal, regex in patterns:
89
self.patterns.append( (terminal, re.compile(regex)) )
91
def get_token_pos(self):
    """Get the current token position in the input text.

    Returns the number of tokens scanned so far, which is also the
    index of the next token to be produced.
    """
    return len(self.tokens)
95
def get_char_pos(self):
    """Get the current char position in the input text.

    Returns self.pos, the character offset of the scan front —
    the same offset get_input_scanned()/get_input_unscanned() split on.
    """
    # NOTE(review): the original body was truncated by extraction; the
    # return of self.pos is inferred from the docstring and from the
    # sibling accessors that treat self.pos as the current char offset.
    return self.pos
99
def get_prev_char_pos(self, i=None):
100
"""Get the previous position (one token back) in the input text."""
101
if self.pos == 0: return 0
103
return self.tokens[i][0]
105
def get_line_number(self):
    """Get the line number of the current position in the input text.

    Counts newlines in the already-scanned input, offset by
    self.first_line_number, so it is only valid for the current
    position.
    """
    # TODO: make this work at any token/char position
    scanned = self.get_input_scanned()
    return self.first_line_number + scanned.count('\n')
110
def get_column_number(self):
    """Get the column number of the current position in the input text.

    The column is zero-based: the number of characters since the last
    newline in the scanned input.
    """
    scanned = self.get_input_scanned()
    last_newline = scanned.rfind('\n')  # may be -1, but that's okay in this case
    return len(scanned) - (last_newline + 1)
116
def get_input_scanned(self):
    """Get the portion of the input that has been tokenized.

    self.pos is the char offset of the scan front; everything before
    it has already been consumed.
    """
    return self.input[:self.pos]
120
def get_input_unscanned(self):
    """Get the portion of the input that has not yet been tokenized.

    Complement of get_input_scanned(): the suffix of self.input from
    the scan front (self.pos) onward.
    """
    return self.input[self.pos:]
124
def token(self, i, restrict=None):
125
"""Get the i'th token in the input.
127
If C{i} is one past the end, then scan for another token.
129
@param i: token index
131
@param restrict: [token, ...] or C{None}; if restrict is
132
C{None}, then any token is allowed. You may call
133
token(i) more than once. However, the restrict set
134
may never be larger than what was passed in on the
135
first call to token(i).
137
if i == len(self.tokens):
139
if i < len(self.tokens):
140
# Make sure the restriction is more restricted. This
141
# invariant is needed to avoid ruining tokenization at
142
# position i+1 and higher.
143
if restrict and self.restrictions[i]:
145
if r not in self.restrictions[i]:
146
raise NotImplementedError("Unimplemented: restriction set changed")
147
return self.tokens[i]
151
"""Print the last 10 tokens that have been scanned in"""
153
for t in self.tokens[-10:]:
154
output = '%s\n (@%s) %s = %s' % (output,t[0],t[2],repr(t[3]))
157
def scan(self, restrict):
158
"""Should scan another token and add it to the list, self.tokens,
159
and add the restriction to self.restrictions"""
160
# Keep looking for a token, ignoring any in self.ignore
162
# Search the patterns for the longest match, with earlier
163
# tokens in the list having preference
166
for p, regexp in self.patterns:
167
# First check to see if we're ignoring this token
168
if restrict and p not in restrict and p not in self.ignore:
170
m = regexp.match(self.input, self.pos)
171
if m and len(m.group(0)) > best_match:
172
# We got a match that's better than the previous one
174
best_match = len(m.group(0))
176
# If we didn't find anything, raise an error
177
if best_pat == '(error)' and best_match < 0:
180
msg = 'Trying to find one of '+', '.join(restrict)
181
raise SyntaxError(self.pos, msg)
183
# If we found something that isn't to be ignored, return it
184
if best_pat not in self.ignore:
185
# Create a token with this data
186
token = (self.pos, self.pos+best_match, best_pat,
187
self.input[self.pos:self.pos+best_match])
188
self.pos = self.pos + best_match
189
# Only add this token if it's not in the list
190
# (to prevent looping)
191
if not self.tokens or token != self.tokens[-1]:
192
self.tokens.append(token)
193
self.restrictions.append(restrict)
196
# This token should be ignored ..
197
self.pos = self.pos + best_match
200
"""Base class for Yapps-generated parsers.
204
def __init__(self, scanner):
205
self._scanner = scanner
208
def _peek(self, *types):
209
"""Returns the token type for lookahead; if there are any args
210
then the list of args is the set of token types to allow"""
211
tok = self._scanner.token(self._pos, types)
214
def _scan(self, type):
215
"""Returns the matched text, and moves to the next token"""
216
tok = self._scanner.token(self._pos, [type])
218
raise SyntaxError(tok[0], 'Trying to find '+type+' :'+ ' ,'.join(self._scanner.restrictions[self._pos]))
219
self._pos = 1 + self._pos
223
"""Class to represent the parser's call stack.
225
Every rule creates a Context that links to its parent rule. The
226
contexts can be used for debugging.
230
def __init__(self, parent, scanner, tokenpos, rule, args=()):
231
"""Create a new context.
233
@param parent: Context object or C{None}
234
@param scanner: Scanner object
235
@param tokenpos: scanner token position
236
@type tokenpos: L{int}
237
@param rule: name of the rule
239
@param args: tuple listing parameters to the rule
243
self.scanner = scanner
244
self.tokenpos = tokenpos
250
if self.parent: output = str(self.parent) + ' > '
254
def print_line_with_pointer(text, p):
255
"""Print the line of 'text' that includes position 'p',
256
along with a second line with a single caret (^) at position p"""
258
# TODO: separate out the logic for determining the line/character
259
# location from the logic for determining how to display an
260
# 80-column line to stderr.
262
# Now try printing part of the line
263
text = text[max(p-80, 0):p+80]
267
i = text[:p].rfind('\n')
268
j = text[:p].rfind('\r')
269
if i < 0 or (0 <= j < i): i = j
275
i = text.find('\n', p)
276
j = text.find('\r', p)
277
if i < 0 or (0 <= j < i): i = j
281
# Now shorten the text
282
while len(text) > 70 and p > 60:
284
text = "..." + text[10:]
287
# Now print the string, along with an indicator
288
print >>sys.stderr, '> ',text
289
print >>sys.stderr, '> ',' '*p + '^'
291
def print_error(input, err, scanner):
292
"""Print error messages, the parser stack, and the input text -- for human-readable error messages."""
293
# NOTE: this function assumes 80 columns :-(
294
# Figure out the line number
295
line_number = scanner.get_line_number()
296
column_number = scanner.get_column_number()
297
print >>sys.stderr, '%d:%d: %s' % (line_number, column_number, err.msg)
299
context = err.context
301
print_line_with_pointer(input, err.charpos)
304
# TODO: add line number
305
print >>sys.stderr, 'while parsing %s%s:' % (context.rule, tuple(context.args))
306
print_line_with_pointer(input, context.scanner.get_prev_char_pos(context.tokenpos))
307
context = context.parent
309
def wrap_error_reporter(parser, rule):
311
return getattr(parser, rule)()
312
except SyntaxError, e:
313
input = parser._scanner.input
314
print_error(input, e, parser._scanner)
316
print >>sys.stderr, 'Could not complete parsing; stopped around here:'
317
print >>sys.stderr, parser._scanner
320
from twisted.words.xish.xpath import AttribValue, BooleanValue, CompareValue
321
from twisted.words.xish.xpath import Function, IndexValue, LiteralValue
322
from twisted.words.xish.xpath import _AnyLocation, _Location
327
token INDEX: "[0-9]+"
329
token IDENTIFIER: "[a-zA-Z][a-zA-Z0-9_\-]*"
330
token ATTRIBUTE: "\@[a-zA-Z][a-zA-Z0-9_\-]*"
331
token FUNCNAME: "[a-zA-Z][a-zA-Z0-9_]*"
334
token STR_DQ: '"([^"]|(\\"))*?"'
335
token STR_SQ: "'([^']|(\\'))*?'"
340
rule XPATH: PATH {{ result = PATH; current = result }}
341
( PATH {{ current.childLocation = PATH; current = current.childLocation }} ) * END
344
rule PATH: ("/" {{ result = _Location() }} | "//" {{ result = _AnyLocation() }} )
345
( IDENTIFIER {{ result.elementName = IDENTIFIER }} | WILDCARD {{ result.elementName = None }} )
346
( "\[" PREDICATE {{ result.predicates.append(PREDICATE) }} "\]")*
349
rule PREDICATE: EXPR {{ return EXPR }} |
350
INDEX {{ return IndexValue(INDEX) }}
352
rule EXPR: FACTOR {{ e = FACTOR }}
353
( BOOLOP FACTOR {{ e = BooleanValue(e, BOOLOP, FACTOR) }} )*
356
rule BOOLOP: ( OP_AND {{ return OP_AND }} | OP_OR {{ return OP_OR }} )
358
rule FACTOR: TERM {{ return TERM }}
359
| "\(" EXPR "\)" {{ return EXPR }}
361
rule TERM: VALUE {{ t = VALUE }}
362
[ CMP VALUE {{ t = CompareValue(t, CMP, VALUE) }} ]
365
rule VALUE: "@" IDENTIFIER {{ return AttribValue(IDENTIFIER) }} |
366
FUNCNAME {{ f = Function(FUNCNAME); args = [] }}
367
"\(" [ VALUE {{ args.append(VALUE) }}
369
"," VALUE {{ args.append(VALUE) }}
371
] "\)" {{ f.setParams(*args); return f }} |
372
STR {{ return LiteralValue(STR[1:len(STR)-1]) }}
374
rule CMP: (CMP_EQ {{ return CMP_EQ }} | CMP_NE {{ return CMP_NE }})
375
rule STR: (STR_DQ {{ return STR_DQ }} | STR_SQ {{ return STR_SQ }})