1
by Matthias Klose
Import upstream version 2.7~a2 |
1 |
#
|
2 |
# Secret Labs' Regular Expression Engine
|
|
3 |
#
|
|
4 |
# convert re-style regular expression to sre pattern
|
|
5 |
#
|
|
6 |
# Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved.
|
|
7 |
#
|
|
8 |
# See the sre.py file for information on usage and redistribution.
|
|
9 |
#
|
|
10 |
||
11 |
"""Internal support module for sre"""
|
|
12 |
||
13 |
# XXX: show string offset and offending character for all errors
|
|
14 |
||
15 |
import sys |
|
16 |
||
17 |
from sre_constants import * |
|
18 |
||
19 |
SPECIAL_CHARS = ".\\[{()*+?^$|" |
|
20 |
REPEAT_CHARS = "*+?{" |
|
21 |
||
22 |
DIGITS = set("0123456789") |
|
23 |
||
24 |
OCTDIGITS = set("01234567") |
|
25 |
HEXDIGITS = set("0123456789abcdefABCDEF") |
|
26 |
||
27 |
WHITESPACE = set(" \t\n\r\v\f") |
|
28 |
||
29 |
ESCAPES = { |
|
30 |
r"\a": (LITERAL, ord("\a")), |
|
31 |
r"\b": (LITERAL, ord("\b")), |
|
32 |
r"\f": (LITERAL, ord("\f")), |
|
33 |
r"\n": (LITERAL, ord("\n")), |
|
34 |
r"\r": (LITERAL, ord("\r")), |
|
35 |
r"\t": (LITERAL, ord("\t")), |
|
36 |
r"\v": (LITERAL, ord("\v")), |
|
37 |
r"\\": (LITERAL, ord("\\")) |
|
38 |
}
|
|
39 |
||
40 |
CATEGORIES = { |
|
41 |
r"\A": (AT, AT_BEGINNING_STRING), # start of string |
|
42 |
r"\b": (AT, AT_BOUNDARY), |
|
43 |
r"\B": (AT, AT_NON_BOUNDARY), |
|
44 |
r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]), |
|
45 |
r"\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]), |
|
46 |
r"\s": (IN, [(CATEGORY, CATEGORY_SPACE)]), |
|
47 |
r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]), |
|
48 |
r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]), |
|
49 |
r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]), |
|
50 |
r"\Z": (AT, AT_END_STRING), # end of string |
|
51 |
}
|
|
52 |
||
53 |
FLAGS = { |
|
54 |
# standard flags
|
|
55 |
"i": SRE_FLAG_IGNORECASE, |
|
56 |
"L": SRE_FLAG_LOCALE, |
|
57 |
"m": SRE_FLAG_MULTILINE, |
|
58 |
"s": SRE_FLAG_DOTALL, |
|
59 |
"x": SRE_FLAG_VERBOSE, |
|
60 |
# extensions
|
|
61 |
"t": SRE_FLAG_TEMPLATE, |
|
62 |
"u": SRE_FLAG_UNICODE, |
|
63 |
}
|
|
64 |
||
65 |
class Pattern: |
|
66 |
# master pattern object. keeps track of global attributes
|
|
67 |
def __init__(self): |
|
68 |
self.flags = 0 |
|
69 |
self.open = [] |
|
70 |
self.groups = 1 |
|
71 |
self.groupdict = {} |
|
72 |
def opengroup(self, name=None): |
|
73 |
gid = self.groups |
|
74 |
self.groups = gid + 1 |
|
75 |
if name is not None: |
|
76 |
ogid = self.groupdict.get(name, None) |
|
77 |
if ogid is not None: |
|
78 |
raise error, ("redefinition of group name %s as group %d; " |
|
79 |
"was group %d" % (repr(name), gid, ogid)) |
|
80 |
self.groupdict[name] = gid |
|
81 |
self.open.append(gid) |
|
82 |
return gid |
|
83 |
def closegroup(self, gid): |
|
84 |
self.open.remove(gid) |
|
85 |
def checkgroup(self, gid): |
|
86 |
return gid < self.groups and gid not in self.open |
|
87 |
||
88 |
class SubPattern: |
|
89 |
# a subpattern, in intermediate form
|
|
90 |
def __init__(self, pattern, data=None): |
|
91 |
self.pattern = pattern |
|
92 |
if data is None: |
|
93 |
data = [] |
|
94 |
self.data = data |
|
95 |
self.width = None |
|
96 |
def dump(self, level=0): |
|
97 |
nl = 1 |
|
98 |
seqtypes = type(()), type([]) |
|
99 |
for op, av in self.data: |
|
100 |
print level*" " + op,; nl = 0 |
|
101 |
if op == "in": |
|
102 |
# member sublanguage
|
|
103 |
print; nl = 1 |
|
104 |
for op, a in av: |
|
105 |
print (level+1)*" " + op, a |
|
106 |
elif op == "branch": |
|
107 |
print; nl = 1 |
|
108 |
i = 0 |
|
109 |
for a in av[1]: |
|
110 |
if i > 0: |
|
111 |
print level*" " + "or" |
|
112 |
a.dump(level+1); nl = 1 |
|
113 |
i = i + 1 |
|
114 |
elif type(av) in seqtypes: |
|
115 |
for a in av: |
|
116 |
if isinstance(a, SubPattern): |
|
117 |
if not nl: print |
|
118 |
a.dump(level+1); nl = 1 |
|
119 |
else: |
|
120 |
print a, ; nl = 0 |
|
121 |
else: |
|
122 |
print av, ; nl = 0 |
|
123 |
if not nl: print |
|
124 |
def __repr__(self): |
|
125 |
return repr(self.data) |
|
126 |
def __len__(self): |
|
127 |
return len(self.data) |
|
128 |
def __delitem__(self, index): |
|
129 |
del self.data[index] |
|
130 |
def __getitem__(self, index): |
|
131 |
if isinstance(index, slice): |
|
132 |
return SubPattern(self.pattern, self.data[index]) |
|
133 |
return self.data[index] |
|
134 |
def __setitem__(self, index, code): |
|
135 |
self.data[index] = code |
|
136 |
def insert(self, index, code): |
|
137 |
self.data.insert(index, code) |
|
138 |
def append(self, code): |
|
139 |
self.data.append(code) |
|
140 |
def getwidth(self): |
|
141 |
# determine the width (min, max) for this subpattern
|
|
142 |
if self.width: |
|
143 |
return self.width |
|
144 |
lo = hi = 0L |
|
145 |
UNITCODES = (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY) |
|
146 |
REPEATCODES = (MIN_REPEAT, MAX_REPEAT) |
|
147 |
for op, av in self.data: |
|
148 |
if op is BRANCH: |
|
149 |
i = sys.maxint |
|
150 |
j = 0 |
|
151 |
for av in av[1]: |
|
152 |
l, h = av.getwidth() |
|
153 |
i = min(i, l) |
|
154 |
j = max(j, h) |
|
155 |
lo = lo + i |
|
156 |
hi = hi + j |
|
157 |
elif op is CALL: |
|
158 |
i, j = av.getwidth() |
|
159 |
lo = lo + i |
|
160 |
hi = hi + j |
|
161 |
elif op is SUBPATTERN: |
|
162 |
i, j = av[1].getwidth() |
|
163 |
lo = lo + i |
|
164 |
hi = hi + j |
|
165 |
elif op in REPEATCODES: |
|
166 |
i, j = av[2].getwidth() |
|
167 |
lo = lo + long(i) * av[0] |
|
168 |
hi = hi + long(j) * av[1] |
|
169 |
elif op in UNITCODES: |
|
170 |
lo = lo + 1 |
|
171 |
hi = hi + 1 |
|
172 |
elif op == SUCCESS: |
|
173 |
break
|
|
174 |
self.width = int(min(lo, sys.maxint)), int(min(hi, sys.maxint)) |
|
175 |
return self.width |
|
176 |
||
177 |
class Tokenizer: |
|
178 |
def __init__(self, string): |
|
179 |
self.string = string |
|
180 |
self.index = 0 |
|
181 |
self.__next() |
|
182 |
def __next(self): |
|
183 |
if self.index >= len(self.string): |
|
184 |
self.next = None |
|
185 |
return
|
|
186 |
char = self.string[self.index] |
|
187 |
if char[0] == "\\": |
|
188 |
try: |
|
189 |
c = self.string[self.index + 1] |
|
190 |
except IndexError: |
|
191 |
raise error, "bogus escape (end of line)" |
|
192 |
char = char + c |
|
193 |
self.index = self.index + len(char) |
|
194 |
self.next = char |
|
195 |
def match(self, char, skip=1): |
|
196 |
if char == self.next: |
|
197 |
if skip: |
|
198 |
self.__next() |
|
199 |
return 1 |
|
200 |
return 0 |
|
201 |
def get(self): |
|
202 |
this = self.next |
|
203 |
self.__next() |
|
204 |
return this |
|
205 |
def tell(self): |
|
206 |
return self.index, self.next |
|
207 |
def seek(self, index): |
|
208 |
self.index, self.next = index |
|
209 |
||
210 |
def isident(char): |
|
211 |
return "a" <= char <= "z" or "A" <= char <= "Z" or char == "_" |
|
212 |
||
213 |
def isdigit(char): |
|
214 |
return "0" <= char <= "9" |
|
215 |
||
216 |
def isname(name): |
|
217 |
# check that group name is a valid string
|
|
218 |
if not isident(name[0]): |
|
219 |
return False |
|
220 |
for char in name[1:]: |
|
221 |
if not isident(char) and not isdigit(char): |
|
222 |
return False |
|
223 |
return True |
|
224 |
||
225 |
def _class_escape(source, escape): |
|
226 |
# handle escape code inside character class
|
|
227 |
code = ESCAPES.get(escape) |
|
228 |
if code: |
|
229 |
return code |
|
230 |
code = CATEGORIES.get(escape) |
|
231 |
if code: |
|
232 |
return code |
|
233 |
try: |
|
234 |
c = escape[1:2] |
|
235 |
if c == "x": |
|
236 |
# hexadecimal escape (exactly two digits)
|
|
237 |
while source.next in HEXDIGITS and len(escape) < 4: |
|
238 |
escape = escape + source.get() |
|
239 |
escape = escape[2:] |
|
240 |
if len(escape) != 2: |
|
241 |
raise error, "bogus escape: %s" % repr("\\" + escape) |
|
242 |
return LITERAL, int(escape, 16) & 0xff |
|
243 |
elif c in OCTDIGITS: |
|
244 |
# octal escape (up to three digits)
|
|
245 |
while source.next in OCTDIGITS and len(escape) < 4: |
|
246 |
escape = escape + source.get() |
|
247 |
escape = escape[1:] |
|
248 |
return LITERAL, int(escape, 8) & 0xff |
|
249 |
elif c in DIGITS: |
|
250 |
raise error, "bogus escape: %s" % repr(escape) |
|
251 |
if len(escape) == 2: |
|
252 |
return LITERAL, ord(escape[1]) |
|
253 |
except ValueError: |
|
254 |
pass
|
|
255 |
raise error, "bogus escape: %s" % repr(escape) |
|
256 |
||
257 |
def _escape(source, escape, state): |
|
258 |
# handle escape code in expression
|
|
259 |
code = CATEGORIES.get(escape) |
|
260 |
if code: |
|
261 |
return code |
|
262 |
code = ESCAPES.get(escape) |
|
263 |
if code: |
|
264 |
return code |
|
265 |
try: |
|
266 |
c = escape[1:2] |
|
267 |
if c == "x": |
|
268 |
# hexadecimal escape
|
|
269 |
while source.next in HEXDIGITS and len(escape) < 4: |
|
270 |
escape = escape + source.get() |
|
271 |
if len(escape) != 4: |
|
272 |
raise ValueError |
|
273 |
return LITERAL, int(escape[2:], 16) & 0xff |
|
274 |
elif c == "0": |
|
275 |
# octal escape
|
|
276 |
while source.next in OCTDIGITS and len(escape) < 4: |
|
277 |
escape = escape + source.get() |
|
278 |
return LITERAL, int(escape[1:], 8) & 0xff |
|
279 |
elif c in DIGITS: |
|
280 |
# octal escape *or* decimal group reference (sigh)
|
|
281 |
if source.next in DIGITS: |
|
282 |
escape = escape + source.get() |
|
283 |
if (escape[1] in OCTDIGITS and escape[2] in OCTDIGITS and |
|
284 |
source.next in OCTDIGITS): |
|
285 |
# got three octal digits; this is an octal escape
|
|
286 |
escape = escape + source.get() |
|
287 |
return LITERAL, int(escape[1:], 8) & 0xff |
|
288 |
# not an octal escape, so this is a group reference
|
|
289 |
group = int(escape[1:]) |
|
290 |
if group < state.groups: |
|
291 |
if not state.checkgroup(group): |
|
292 |
raise error, "cannot refer to open group" |
|
293 |
return GROUPREF, group |
|
294 |
raise ValueError |
|
295 |
if len(escape) == 2: |
|
296 |
return LITERAL, ord(escape[1]) |
|
297 |
except ValueError: |
|
298 |
pass
|
|
299 |
raise error, "bogus escape: %s" % repr(escape) |
|
300 |
||
301 |
def _parse_sub(source, state, nested=1): |
|
302 |
# parse an alternation: a|b|c
|
|
303 |
||
304 |
items = [] |
|
305 |
itemsappend = items.append |
|
306 |
sourcematch = source.match |
|
307 |
while 1: |
|
308 |
itemsappend(_parse(source, state)) |
|
309 |
if sourcematch("|"): |
|
310 |
continue
|
|
311 |
if not nested: |
|
312 |
break
|
|
313 |
if not source.next or sourcematch(")", 0): |
|
314 |
break
|
|
315 |
else: |
|
316 |
raise error, "pattern not properly closed" |
|
317 |
||
318 |
if len(items) == 1: |
|
319 |
return items[0] |
|
320 |
||
321 |
subpattern = SubPattern(state) |
|
322 |
subpatternappend = subpattern.append |
|
323 |
||
324 |
# check if all items share a common prefix
|
|
325 |
while 1: |
|
326 |
prefix = None |
|
327 |
for item in items: |
|
328 |
if not item: |
|
329 |
break
|
|
330 |
if prefix is None: |
|
331 |
prefix = item[0] |
|
332 |
elif item[0] != prefix: |
|
333 |
break
|
|
334 |
else: |
|
335 |
# all subitems start with a common "prefix".
|
|
336 |
# move it out of the branch
|
|
337 |
for item in items: |
|
338 |
del item[0] |
|
339 |
subpatternappend(prefix) |
|
340 |
continue # check next one |
|
341 |
break
|
|
342 |
||
343 |
# check if the branch can be replaced by a character set
|
|
344 |
for item in items: |
|
345 |
if len(item) != 1 or item[0][0] != LITERAL: |
|
346 |
break
|
|
347 |
else: |
|
348 |
# we can store this as a character set instead of a
|
|
349 |
# branch (the compiler may optimize this even more)
|
|
350 |
set = [] |
|
351 |
setappend = set.append |
|
352 |
for item in items: |
|
353 |
setappend(item[0]) |
|
354 |
subpatternappend((IN, set)) |
|
355 |
return subpattern |
|
356 |
||
357 |
subpattern.append((BRANCH, (None, items))) |
|
358 |
return subpattern |
|
359 |
||
360 |
def _parse_sub_cond(source, state, condgroup): |
|
361 |
item_yes = _parse(source, state) |
|
362 |
if source.match("|"): |
|
363 |
item_no = _parse(source, state) |
|
364 |
if source.match("|"): |
|
365 |
raise error, "conditional backref with more than two branches" |
|
366 |
else: |
|
367 |
item_no = None |
|
368 |
if source.next and not source.match(")", 0): |
|
369 |
raise error, "pattern not properly closed" |
|
370 |
subpattern = SubPattern(state) |
|
371 |
subpattern.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no))) |
|
372 |
return subpattern |
|
373 |
||
374 |
_PATTERNENDERS = set("|)") |
|
375 |
_ASSERTCHARS = set("=!<") |
|
376 |
_LOOKBEHINDASSERTCHARS = set("=!") |
|
377 |
_REPEATCODES = set([MIN_REPEAT, MAX_REPEAT]) |
|
378 |
||
379 |
def _parse(source, state): |
|
380 |
# parse a simple pattern
|
|
381 |
subpattern = SubPattern(state) |
|
382 |
||
383 |
# precompute constants into local variables
|
|
384 |
subpatternappend = subpattern.append |
|
385 |
sourceget = source.get |
|
386 |
sourcematch = source.match |
|
387 |
_len = len |
|
388 |
PATTERNENDERS = _PATTERNENDERS |
|
389 |
ASSERTCHARS = _ASSERTCHARS |
|
390 |
LOOKBEHINDASSERTCHARS = _LOOKBEHINDASSERTCHARS |
|
391 |
REPEATCODES = _REPEATCODES |
|
392 |
||
393 |
while 1: |
|
394 |
||
395 |
if source.next in PATTERNENDERS: |
|
396 |
break # end of subpattern |
|
397 |
this = sourceget() |
|
398 |
if this is None: |
|
399 |
break # end of pattern |
|
400 |
||
401 |
if state.flags & SRE_FLAG_VERBOSE: |
|
402 |
# skip whitespace and comments
|
|
403 |
if this in WHITESPACE: |
|
404 |
continue
|
|
405 |
if this == "#": |
|
406 |
while 1: |
|
407 |
this = sourceget() |
|
408 |
if this in (None, "\n"): |
|
409 |
break
|
|
410 |
continue
|
|
411 |
||
412 |
if this and this[0] not in SPECIAL_CHARS: |
|
413 |
subpatternappend((LITERAL, ord(this))) |
|
414 |
||
415 |
elif this == "[": |
|
416 |
# character set
|
|
417 |
set = [] |
|
418 |
setappend = set.append |
|
419 |
## if sourcematch(":"):
|
|
420 |
## pass # handle character classes
|
|
421 |
if sourcematch("^"): |
|
422 |
setappend((NEGATE, None)) |
|
423 |
# check remaining characters
|
|
424 |
start = set[:] |
|
425 |
while 1: |
|
426 |
this = sourceget() |
|
427 |
if this == "]" and set != start: |
|
428 |
break
|
|
429 |
elif this and this[0] == "\\": |
|
430 |
code1 = _class_escape(source, this) |
|
431 |
elif this: |
|
432 |
code1 = LITERAL, ord(this) |
|
433 |
else: |
|
434 |
raise error, "unexpected end of regular expression" |
|
435 |
if sourcematch("-"): |
|
436 |
# potential range
|
|
437 |
this = sourceget() |
|
438 |
if this == "]": |
|
439 |
if code1[0] is IN: |
|
440 |
code1 = code1[1][0] |
|
441 |
setappend(code1) |
|
442 |
setappend((LITERAL, ord("-"))) |
|
443 |
break
|
|
444 |
elif this: |
|
445 |
if this[0] == "\\": |
|
446 |
code2 = _class_escape(source, this) |
|
447 |
else: |
|
448 |
code2 = LITERAL, ord(this) |
|
449 |
if code1[0] != LITERAL or code2[0] != LITERAL: |
|
450 |
raise error, "bad character range" |
|
451 |
lo = code1[1] |
|
452 |
hi = code2[1] |
|
453 |
if hi < lo: |
|
454 |
raise error, "bad character range" |
|
455 |
setappend((RANGE, (lo, hi))) |
|
456 |
else: |
|
457 |
raise error, "unexpected end of regular expression" |
|
458 |
else: |
|
459 |
if code1[0] is IN: |
|
460 |
code1 = code1[1][0] |
|
461 |
setappend(code1) |
|
462 |
||
463 |
# XXX: <fl> should move set optimization to compiler!
|
|
464 |
if _len(set)==1 and set[0][0] is LITERAL: |
|
465 |
subpatternappend(set[0]) # optimization |
|
466 |
elif _len(set)==2 and set[0][0] is NEGATE and set[1][0] is LITERAL: |
|
467 |
subpatternappend((NOT_LITERAL, set[1][1])) # optimization |
|
468 |
else: |
|
469 |
# XXX: <fl> should add charmap optimization here
|
|
470 |
subpatternappend((IN, set)) |
|
471 |
||
472 |
elif this and this[0] in REPEAT_CHARS: |
|
473 |
# repeat previous item
|
|
474 |
if this == "?": |
|
475 |
min, max = 0, 1 |
|
476 |
elif this == "*": |
|
477 |
min, max = 0, MAXREPEAT |
|
478 |
||
479 |
elif this == "+": |
|
480 |
min, max = 1, MAXREPEAT |
|
481 |
elif this == "{": |
|
482 |
if source.next == "}": |
|
483 |
subpatternappend((LITERAL, ord(this))) |
|
484 |
continue
|
|
485 |
here = source.tell() |
|
486 |
min, max = 0, MAXREPEAT |
|
487 |
lo = hi = "" |
|
488 |
while source.next in DIGITS: |
|
489 |
lo = lo + source.get() |
|
490 |
if sourcematch(","): |
|
491 |
while source.next in DIGITS: |
|
492 |
hi = hi + sourceget() |
|
493 |
else: |
|
494 |
hi = lo |
|
495 |
if not sourcematch("}"): |
|
496 |
subpatternappend((LITERAL, ord(this))) |
|
497 |
source.seek(here) |
|
498 |
continue
|
|
499 |
if lo: |
|
500 |
min = int(lo) |
|
501 |
if hi: |
|
502 |
max = int(hi) |
|
503 |
if max < min: |
|
504 |
raise error, "bad repeat interval" |
|
505 |
else: |
|
506 |
raise error, "not supported" |
|
507 |
# figure out which item to repeat
|
|
508 |
if subpattern: |
|
509 |
item = subpattern[-1:] |
|
510 |
else: |
|
511 |
item = None |
|
512 |
if not item or (_len(item) == 1 and item[0][0] == AT): |
|
513 |
raise error, "nothing to repeat" |
|
514 |
if item[0][0] in REPEATCODES: |
|
515 |
raise error, "multiple repeat" |
|
516 |
if sourcematch("?"): |
|
517 |
subpattern[-1] = (MIN_REPEAT, (min, max, item)) |
|
518 |
else: |
|
519 |
subpattern[-1] = (MAX_REPEAT, (min, max, item)) |
|
520 |
||
521 |
elif this == ".": |
|
522 |
subpatternappend((ANY, None)) |
|
523 |
||
524 |
elif this == "(": |
|
525 |
group = 1 |
|
526 |
name = None |
|
527 |
condgroup = None |
|
528 |
if sourcematch("?"): |
|
529 |
group = 0 |
|
530 |
# options
|
|
531 |
if sourcematch("P"): |
|
532 |
# python extensions
|
|
533 |
if sourcematch("<"): |
|
534 |
# named group: skip forward to end of name
|
|
535 |
name = "" |
|
536 |
while 1: |
|
537 |
char = sourceget() |
|
538 |
if char is None: |
|
539 |
raise error, "unterminated name" |
|
540 |
if char == ">": |
|
541 |
break
|
|
542 |
name = name + char |
|
543 |
group = 1 |
|
544 |
if not isname(name): |
|
545 |
raise error, "bad character in group name" |
|
546 |
elif sourcematch("="): |
|
547 |
# named backreference
|
|
548 |
name = "" |
|
549 |
while 1: |
|
550 |
char = sourceget() |
|
551 |
if char is None: |
|
552 |
raise error, "unterminated name" |
|
553 |
if char == ")": |
|
554 |
break
|
|
555 |
name = name + char |
|
556 |
if not isname(name): |
|
557 |
raise error, "bad character in group name" |
|
558 |
gid = state.groupdict.get(name) |
|
559 |
if gid is None: |
|
560 |
raise error, "unknown group name" |
|
561 |
subpatternappend((GROUPREF, gid)) |
|
562 |
continue
|
|
563 |
else: |
|
564 |
char = sourceget() |
|
565 |
if char is None: |
|
566 |
raise error, "unexpected end of pattern" |
|
567 |
raise error, "unknown specifier: ?P%s" % char |
|
568 |
elif sourcematch(":"): |
|
569 |
# non-capturing group
|
|
570 |
group = 2 |
|
571 |
elif sourcematch("#"): |
|
572 |
# comment
|
|
573 |
while 1: |
|
574 |
if source.next is None or source.next == ")": |
|
575 |
break
|
|
576 |
sourceget() |
|
577 |
if not sourcematch(")"): |
|
578 |
raise error, "unbalanced parenthesis" |
|
579 |
continue
|
|
580 |
elif source.next in ASSERTCHARS: |
|
581 |
# lookahead assertions
|
|
582 |
char = sourceget() |
|
583 |
dir = 1 |
|
584 |
if char == "<": |
|
585 |
if source.next not in LOOKBEHINDASSERTCHARS: |
|
586 |
raise error, "syntax error" |
|
587 |
dir = -1 # lookbehind |
|
588 |
char = sourceget() |
|
589 |
p = _parse_sub(source, state) |
|
590 |
if not sourcematch(")"): |
|
591 |
raise error, "unbalanced parenthesis" |
|
592 |
if char == "=": |
|
593 |
subpatternappend((ASSERT, (dir, p))) |
|
594 |
else: |
|
595 |
subpatternappend((ASSERT_NOT, (dir, p))) |
|
596 |
continue
|
|
597 |
elif sourcematch("("): |
|
598 |
# conditional backreference group
|
|
599 |
condname = "" |
|
600 |
while 1: |
|
601 |
char = sourceget() |
|
602 |
if char is None: |
|
603 |
raise error, "unterminated name" |
|
604 |
if char == ")": |
|
605 |
break
|
|
606 |
condname = condname + char |
|
607 |
group = 2 |
|
608 |
if isname(condname): |
|
609 |
condgroup = state.groupdict.get(condname) |
|
610 |
if condgroup is None: |
|
611 |
raise error, "unknown group name" |
|
612 |
else: |
|
613 |
try: |
|
614 |
condgroup = int(condname) |
|
615 |
except ValueError: |
|
616 |
raise error, "bad character in group name" |
|
617 |
else: |
|
618 |
# flags
|
|
619 |
if not source.next in FLAGS: |
|
620 |
raise error, "unexpected end of pattern" |
|
621 |
while source.next in FLAGS: |
|
622 |
state.flags = state.flags | FLAGS[sourceget()] |
|
623 |
if group: |
|
624 |
# parse group contents
|
|
625 |
if group == 2: |
|
626 |
# anonymous group
|
|
627 |
group = None |
|
628 |
else: |
|
629 |
group = state.opengroup(name) |
|
630 |
if condgroup: |
|
631 |
p = _parse_sub_cond(source, state, condgroup) |
|
632 |
else: |
|
633 |
p = _parse_sub(source, state) |
|
634 |
if not sourcematch(")"): |
|
635 |
raise error, "unbalanced parenthesis" |
|
636 |
if group is not None: |
|
637 |
state.closegroup(group) |
|
638 |
subpatternappend((SUBPATTERN, (group, p))) |
|
639 |
else: |
|
640 |
while 1: |
|
641 |
char = sourceget() |
|
642 |
if char is None: |
|
643 |
raise error, "unexpected end of pattern" |
|
644 |
if char == ")": |
|
645 |
break
|
|
646 |
raise error, "unknown extension" |
|
647 |
||
648 |
elif this == "^": |
|
649 |
subpatternappend((AT, AT_BEGINNING)) |
|
650 |
||
651 |
elif this == "$": |
|
652 |
subpattern.append((AT, AT_END)) |
|
653 |
||
654 |
elif this and this[0] == "\\": |
|
655 |
code = _escape(source, this, state) |
|
656 |
subpatternappend(code) |
|
657 |
||
658 |
else: |
|
659 |
raise error, "parser error" |
|
660 |
||
661 |
return subpattern |
|
662 |
||
663 |
def parse(str, flags=0, pattern=None): |
|
664 |
# parse 're' pattern into list of (opcode, argument) tuples
|
|
665 |
||
666 |
source = Tokenizer(str) |
|
667 |
||
668 |
if pattern is None: |
|
669 |
pattern = Pattern() |
|
670 |
pattern.flags = flags |
|
671 |
pattern.str = str |
|
672 |
||
673 |
p = _parse_sub(source, pattern, 0) |
|
674 |
||
675 |
tail = source.get() |
|
676 |
if tail == ")": |
|
677 |
raise error, "unbalanced parenthesis" |
|
678 |
elif tail: |
|
679 |
raise error, "bogus characters at end of regular expression" |
|
680 |
||
681 |
if flags & SRE_FLAG_DEBUG: |
|
682 |
p.dump() |
|
683 |
||
684 |
if not (flags & SRE_FLAG_VERBOSE) and p.pattern.flags & SRE_FLAG_VERBOSE: |
|
685 |
# the VERBOSE flag was switched on inside the pattern. to be
|
|
686 |
# on the safe side, we'll parse the whole thing again...
|
|
687 |
return parse(str, p.pattern.flags) |
|
688 |
||
689 |
return p |
|
690 |
||
691 |
def parse_template(source, pattern): |
|
692 |
# parse 're' replacement string into list of literals and
|
|
693 |
# group references
|
|
694 |
s = Tokenizer(source) |
|
695 |
sget = s.get |
|
696 |
p = [] |
|
697 |
a = p.append |
|
698 |
def literal(literal, p=p, pappend=a): |
|
699 |
if p and p[-1][0] is LITERAL: |
|
700 |
p[-1] = LITERAL, p[-1][1] + literal |
|
701 |
else: |
|
702 |
pappend((LITERAL, literal)) |
|
703 |
sep = source[:0] |
|
704 |
if type(sep) is type(""): |
|
705 |
makechar = chr |
|
706 |
else: |
|
707 |
makechar = unichr |
|
708 |
while 1: |
|
709 |
this = sget() |
|
710 |
if this is None: |
|
711 |
break # end of replacement string |
|
712 |
if this and this[0] == "\\": |
|
713 |
# group
|
|
714 |
c = this[1:2] |
|
715 |
if c == "g": |
|
716 |
name = "" |
|
717 |
if s.match("<"): |
|
718 |
while 1: |
|
719 |
char = sget() |
|
720 |
if char is None: |
|
721 |
raise error, "unterminated group name" |
|
722 |
if char == ">": |
|
723 |
break
|
|
724 |
name = name + char |
|
725 |
if not name: |
|
726 |
raise error, "bad group name" |
|
727 |
try: |
|
728 |
index = int(name) |
|
729 |
if index < 0: |
|
730 |
raise error, "negative group number" |
|
731 |
except ValueError: |
|
732 |
if not isname(name): |
|
733 |
raise error, "bad character in group name" |
|
734 |
try: |
|
735 |
index = pattern.groupindex[name] |
|
736 |
except KeyError: |
|
737 |
raise IndexError, "unknown group name" |
|
738 |
a((MARK, index)) |
|
739 |
elif c == "0": |
|
740 |
if s.next in OCTDIGITS: |
|
741 |
this = this + sget() |
|
742 |
if s.next in OCTDIGITS: |
|
743 |
this = this + sget() |
|
744 |
literal(makechar(int(this[1:], 8) & 0xff)) |
|
745 |
elif c in DIGITS: |
|
746 |
isoctal = False |
|
747 |
if s.next in DIGITS: |
|
748 |
this = this + sget() |
|
749 |
if (c in OCTDIGITS and this[2] in OCTDIGITS and |
|
750 |
s.next in OCTDIGITS): |
|
751 |
this = this + sget() |
|
752 |
isoctal = True |
|
753 |
literal(makechar(int(this[1:], 8) & 0xff)) |
|
754 |
if not isoctal: |
|
755 |
a((MARK, int(this[1:]))) |
|
756 |
else: |
|
757 |
try: |
|
758 |
this = makechar(ESCAPES[this][1]) |
|
759 |
except KeyError: |
|
760 |
pass
|
|
761 |
literal(this) |
|
762 |
else: |
|
763 |
literal(this) |
|
764 |
# convert template to groups and literals lists
|
|
765 |
i = 0 |
|
766 |
groups = [] |
|
767 |
groupsappend = groups.append |
|
768 |
literals = [None] * len(p) |
|
769 |
for c, s in p: |
|
770 |
if c is MARK: |
|
771 |
groupsappend((i, s)) |
|
772 |
# literal[i] is already None
|
|
773 |
else: |
|
774 |
literals[i] = s |
|
775 |
i = i + 1 |
|
776 |
return groups, literals |
|
777 |
||
778 |
def expand_template(template, match): |
|
779 |
g = match.group |
|
780 |
sep = match.string[:0] |
|
781 |
groups, literals = template |
|
782 |
literals = literals[:] |
|
783 |
try: |
|
784 |
for index, group in groups: |
|
785 |
literals[index] = s = g(group) |
|
786 |
if s is None: |
|
787 |
raise error, "unmatched group" |
|
788 |
except IndexError: |
|
789 |
raise error, "invalid group reference" |
|
790 |
return sep.join(literals) |