2
# Secret Labs' Regular Expression Engine
4
# convert re-style regular expression to sre pattern
6
# Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved.
8
# See the sre.py file for information on usage and redistribution.
11
# XXX: show string offset and offending character for all errors
13
# this module works under 1.5.2 and later. don't use string methods
16
from sre_constants import *
18
SPECIAL_CHARS = ".\\[{()*+?^$|"
21
DIGITS = tuple("0123456789")
23
OCTDIGITS = tuple("01234567")
24
HEXDIGITS = tuple("0123456789abcdefABCDEF")
26
WHITESPACE = tuple(" \t\n\r\v\f")
29
r"\a": (LITERAL, ord("\a")),
30
r"\b": (LITERAL, ord("\b")),
31
r"\f": (LITERAL, ord("\f")),
32
r"\n": (LITERAL, ord("\n")),
33
r"\r": (LITERAL, ord("\r")),
34
r"\t": (LITERAL, ord("\t")),
35
r"\v": (LITERAL, ord("\v")),
36
r"\\": (LITERAL, ord("\\"))
40
r"\A": (AT, AT_BEGINNING_STRING), # start of string
41
r"\b": (AT, AT_BOUNDARY),
42
r"\B": (AT, AT_NON_BOUNDARY),
43
r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]),
44
r"\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]),
45
r"\s": (IN, [(CATEGORY, CATEGORY_SPACE)]),
46
r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]),
47
r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]),
48
r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]),
49
r"\Z": (AT, AT_END_STRING), # end of string
54
"i": SRE_FLAG_IGNORECASE,
56
"m": SRE_FLAG_MULTILINE,
58
"x": SRE_FLAG_VERBOSE,
60
"t": SRE_FLAG_TEMPLATE,
61
"u": SRE_FLAG_UNICODE,
64
# figure out best way to convert hex/octal numbers to integers
67
atoi = int # 2.0 and later
69
atoi = string.atoi # 1.5.2
72
# master pattern object. keeps track of global attributes
78
def opengroup(self, name=None):
82
self.groupdict[name] = gid
85
def closegroup(self, gid):
87
def checkgroup(self, gid):
88
return gid < self.groups and gid not in self.open
91
# a subpattern, in intermediate form
92
def __init__(self, pattern, data=None):
93
self.pattern = pattern
98
def dump(self, level=0):
100
for op, av in self.data:
101
print level*" " + op,; nl = 0
106
print (level+1)*" " + op, a
112
print level*" " + "or"
113
a.dump(level+1); nl = 1
115
elif type(av) in (type(()), type([])):
117
if isinstance(a, SubPattern):
119
a.dump(level+1); nl = 1
126
return repr(self.data)
128
return len(self.data)
129
def __delitem__(self, index):
131
def __getitem__(self, index):
132
return self.data[index]
133
def __setitem__(self, index, code):
134
self.data[index] = code
135
def __getslice__(self, start, stop):
136
return SubPattern(self.pattern, self.data[start:stop])
137
def insert(self, index, code):
138
self.data.insert(index, code)
139
def append(self, code):
140
self.data.append(code)
142
# determine the width (min, max) for this subpattern
146
for op, av in self.data:
160
elif op is SUBPATTERN:
161
i, j = av[1].getwidth()
164
elif op in (MIN_REPEAT, MAX_REPEAT):
165
i, j = av[2].getwidth()
166
lo = lo + long(i) * av[0]
167
hi = hi + long(j) * av[1]
168
elif op in (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY):
173
self.width = int(min(lo, sys.maxint)), int(min(hi, sys.maxint))
177
def __init__(self, string):
182
if self.index >= len(self.string):
185
char = self.string[self.index]
188
c = self.string[self.index + 1]
190
raise error, "bogus escape"
192
self.index = self.index + len(char)
194
def match(self, char, skip=1):
195
if char == self.next:
205
return self.index, self.next
206
def seek(self, index):
207
self.index, self.next = index
210
return "a" <= char <= "z" or "A" <= char <= "Z" or char == "_"
213
return "0" <= char <= "9"
216
# check that group name is a valid string
217
if not isident(name[0]):
220
if not isident(char) and not isdigit(char):
224
def _group(escape, groups):
225
# check if the escape string represents a valid group
227
gid = atoi(escape[1:])
228
if gid and gid < groups:
232
return None # not a valid group
234
def _class_escape(source, escape):
235
# handle escape code inside character class
236
code = ESCAPES.get(escape)
239
code = CATEGORIES.get(escape)
243
if escape[1:2] == "x":
244
# hexadecimal escape (exactly two digits)
245
while source.next in HEXDIGITS and len(escape) < 4:
246
escape = escape + source.get()
249
raise error, "bogus escape: %s" % repr("\\" + escape)
250
return LITERAL, atoi(escape, 16) & 0xff
251
elif str(escape[1:2]) in OCTDIGITS:
252
# octal escape (up to three digits)
253
while source.next in OCTDIGITS and len(escape) < 5:
254
escape = escape + source.get()
256
return LITERAL, atoi(escape, 8) & 0xff
258
return LITERAL, ord(escape[1])
261
raise error, "bogus escape: %s" % repr(escape)
263
def _escape(source, escape, state):
264
# handle escape code in expression
265
code = CATEGORIES.get(escape)
268
code = ESCAPES.get(escape)
272
if escape[1:2] == "x":
274
while source.next in HEXDIGITS and len(escape) < 4:
275
escape = escape + source.get()
278
return LITERAL, atoi(escape[2:], 16) & 0xff
279
elif escape[1:2] == "0":
281
while source.next in OCTDIGITS and len(escape) < 4:
282
escape = escape + source.get()
283
return LITERAL, atoi(escape[1:], 8) & 0xff
284
elif escape[1:2] in DIGITS:
285
# octal escape *or* decimal group reference (sigh)
287
if source.next in DIGITS:
288
escape = escape + source.get()
289
if (escape[1] in OCTDIGITS and escape[2] in OCTDIGITS and
290
source.next in OCTDIGITS):
291
# got three octal digits; this is an octal escape
292
escape = escape + source.get()
293
return LITERAL, atoi(escape[1:], 8) & 0xff
294
# got at least one decimal digit; this is a group reference
295
group = _group(escape, state.groups)
297
if not state.checkgroup(group):
298
raise error, "cannot refer to open group"
299
return GROUPREF, group
302
return LITERAL, ord(escape[1])
305
raise error, "bogus escape: %s" % repr(escape)
307
def _parse_sub(source, state, nested=1):
308
# parse an alternation: a|b|c
312
items.append(_parse(source, state))
313
if source.match("|"):
317
if not source.next or source.match(")", 0):
320
raise error, "pattern not properly closed"
325
subpattern = SubPattern(state)
327
# check if all items share a common prefix
335
elif item[0] != prefix:
338
# all subitems start with a common "prefix".
339
# move it out of the branch
342
subpattern.append(prefix)
343
continue # check next one
346
# check if the branch can be replaced by a character set
348
if len(item) != 1 or item[0][0] != LITERAL:
351
# we can store this as a character set instead of a
352
# branch (the compiler may optimize this even more)
356
subpattern.append((IN, set))
359
subpattern.append((BRANCH, (None, items)))
362
def _parse(source, state):
363
# parse a simple pattern
365
subpattern = SubPattern(state)
369
if source.next in ("|", ")"):
370
break # end of subpattern
373
break # end of pattern
375
if state.flags & SRE_FLAG_VERBOSE:
376
# skip whitespace and comments
377
if this in WHITESPACE:
382
if this in (None, "\n"):
386
if this and this[0] not in SPECIAL_CHARS:
387
subpattern.append((LITERAL, ord(this)))
392
## if source.match(":"):
393
## pass # handle character classes
394
if source.match("^"):
395
set.append((NEGATE, None))
396
# check remaining characters
400
if this == "]" and set != start:
402
elif this and this[0] == "\\":
403
code1 = _class_escape(source, this)
405
code1 = LITERAL, ord(this)
407
raise error, "unexpected end of regular expression"
408
if source.match("-"):
415
set.append((LITERAL, ord("-")))
419
code2 = _class_escape(source, this)
421
code2 = LITERAL, ord(this)
422
if code1[0] != LITERAL or code2[0] != LITERAL:
423
raise error, "bad character range"
427
raise error, "bad character range"
428
set.append((RANGE, (lo, hi)))
434
# XXX: <fl> should move set optimization to compiler!
435
if len(set)==1 and set[0][0] is LITERAL:
436
subpattern.append(set[0]) # optimization
437
elif len(set)==2 and set[0][0] is NEGATE and set[1][0] is LITERAL:
438
subpattern.append((NOT_LITERAL, set[1][1])) # optimization
440
# XXX: <fl> should add charmap optimization here
441
subpattern.append((IN, set))
443
elif this and this[0] in REPEAT_CHARS:
444
# repeat previous item
448
min, max = 0, MAXREPEAT
451
min, max = 1, MAXREPEAT
454
min, max = 0, MAXREPEAT
456
while source.next in DIGITS:
457
lo = lo + source.get()
458
if source.match(","):
459
while source.next in DIGITS:
460
hi = hi + source.get()
463
if not source.match("}"):
464
subpattern.append((LITERAL, ord(this)))
472
raise error, "bad repeat interval"
474
raise error, "not supported"
475
# figure out which item to repeat
477
item = subpattern[-1:]
480
if not item or (len(item) == 1 and item[0][0] == AT):
481
raise error, "nothing to repeat"
482
if item[0][0] in (MIN_REPEAT, MAX_REPEAT):
483
raise error, "multiple repeat"
484
if source.match("?"):
485
subpattern[-1] = (MIN_REPEAT, (min, max, item))
487
subpattern[-1] = (MAX_REPEAT, (min, max, item))
490
subpattern.append((ANY, None))
495
if source.match("?"):
498
if source.match("P"):
500
if source.match("<"):
501
# named group: skip forward to end of name
506
raise error, "unterminated name"
512
raise error, "bad character in group name"
513
elif source.match("="):
514
# named backreference
519
raise error, "unterminated name"
524
raise error, "bad character in group name"
525
gid = state.groupdict.get(name)
527
raise error, "unknown group name"
528
subpattern.append((GROUPREF, gid))
533
raise error, "unexpected end of pattern"
534
raise error, "unknown specifier: ?P%s" % char
535
elif source.match(":"):
536
# non-capturing group
538
elif source.match("#"):
541
if source.next is None or source.next == ")":
544
if not source.match(")"):
545
raise error, "unbalanced parenthesis"
547
elif source.next in ("=", "!", "<"):
548
# lookahead assertions
552
if source.next not in ("=", "!"):
553
raise error, "syntax error"
554
dir = -1 # lookbehind
556
p = _parse_sub(source, state)
557
if not source.match(")"):
558
raise error, "unbalanced parenthesis"
560
subpattern.append((ASSERT, (dir, p)))
562
subpattern.append((ASSERT_NOT, (dir, p)))
566
if not FLAGS.has_key(source.next):
567
raise error, "unexpected end of pattern"
568
while FLAGS.has_key(source.next):
569
state.flags = state.flags | FLAGS[source.get()]
571
# parse group contents
576
group = state.opengroup(name)
577
p = _parse_sub(source, state)
578
if not source.match(")"):
579
raise error, "unbalanced parenthesis"
580
if group is not None:
581
state.closegroup(group)
582
subpattern.append((SUBPATTERN, (group, p)))
587
raise error, "unexpected end of pattern"
590
raise error, "unknown extension"
593
subpattern.append((AT, AT_BEGINNING))
596
subpattern.append((AT, AT_END))
598
elif this and this[0] == "\\":
599
code = _escape(source, this, state)
600
subpattern.append(code)
603
raise error, "parser error"
607
def parse(str, flags=0, pattern=None):
608
# parse 're' pattern into list of (opcode, argument) tuples
610
source = Tokenizer(str)
614
pattern.flags = flags
617
p = _parse_sub(source, pattern, 0)
621
raise error, "unbalanced parenthesis"
623
raise error, "bogus characters at end of regular expression"
625
if flags & SRE_FLAG_DEBUG:
628
if not (flags & SRE_FLAG_VERBOSE) and p.pattern.flags & SRE_FLAG_VERBOSE:
629
# the VERBOSE flag was switched on inside the pattern. to be
630
# on the safe side, we'll parse the whole thing again...
631
return parse(str, p.pattern.flags)
635
def parse_template(source, pattern):
636
# parse 're' replacement string into list of literals and
638
s = Tokenizer(source)
641
def literal(literal, p=p):
642
if p and p[-1][0] is LITERAL:
643
p[-1] = LITERAL, p[-1][1] + literal
645
p.append((LITERAL, literal))
647
if type(sep) is type(""):
654
break # end of replacement string
655
if this and this[0] == "\\":
663
raise error, "unterminated group name"
668
raise error, "bad group name"
673
raise error, "bad character in group name"
675
index = pattern.groupindex[name]
677
raise IndexError, "unknown group name"
679
elif len(this) > 1 and this[1] in DIGITS:
682
group = _group(this, pattern.groups+1)
684
if (s.next not in DIGITS or
685
not _group(this + s.next, pattern.groups+1)):
688
elif s.next in OCTDIGITS:
689
this = this + s.get()
694
code = LITERAL, char(atoi(this[-6:], 8) & 0xff)
695
if code[0] is LITERAL:
701
this = char(ESCAPES[this][1])
707
# convert template to groups and literals lists
713
groups.append((i, s))
714
literals.append(None)
718
return groups, literals
720
def expand_template(template, match):
722
sep = match.string[:0]
723
groups, literals = template
724
literals = literals[:]
726
for index, group in groups:
727
literals[index] = s = g(group)
731
raise error, "empty group"
732
return string.join(literals, sep)