2
# Secret Labs' Regular Expression Engine
4
# convert re-style regular expression to sre pattern
6
# Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved.
8
# See the sre.py file for information on usage and redistribution.
11
"""Internal support module for sre"""
13
# XXX: show string offset and offending character for all errors
17
from sre_constants import *
19
SPECIAL_CHARS = ".\\[{()*+?^$|"
22
DIGITS = set("0123456789")
24
OCTDIGITS = set("01234567")
25
HEXDIGITS = set("0123456789abcdefABCDEF")
27
WHITESPACE = set(" \t\n\r\v\f")
30
r"\a": (LITERAL, ord("\a")),
31
r"\b": (LITERAL, ord("\b")),
32
r"\f": (LITERAL, ord("\f")),
33
r"\n": (LITERAL, ord("\n")),
34
r"\r": (LITERAL, ord("\r")),
35
r"\t": (LITERAL, ord("\t")),
36
r"\v": (LITERAL, ord("\v")),
37
r"\\": (LITERAL, ord("\\"))
41
r"\A": (AT, AT_BEGINNING_STRING), # start of string
42
r"\b": (AT, AT_BOUNDARY),
43
r"\B": (AT, AT_NON_BOUNDARY),
44
r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]),
45
r"\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]),
46
r"\s": (IN, [(CATEGORY, CATEGORY_SPACE)]),
47
r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]),
48
r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]),
49
r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]),
50
r"\Z": (AT, AT_END_STRING), # end of string
55
"i": SRE_FLAG_IGNORECASE,
57
"m": SRE_FLAG_MULTILINE,
59
"x": SRE_FLAG_VERBOSE,
62
"t": SRE_FLAG_TEMPLATE,
63
"u": SRE_FLAG_UNICODE,
67
# master pattern object. keeps track of global attributes
73
def opengroup(self, name=None):
77
ogid = self.groupdict.get(name, None)
79
raise error("redefinition of group name %s as group %d; "
80
"was group %d" % (repr(name), gid, ogid))
81
self.groupdict[name] = gid
84
def closegroup(self, gid):
86
def checkgroup(self, gid):
87
return gid < self.groups and gid not in self.open
90
# a subpattern, in intermediate form
91
def __init__(self, pattern, data=None):
92
self.pattern = pattern
97
def dump(self, level=0):
99
seqtypes = (tuple, list)
100
for op, av in self.data:
101
print(level*" " + op, end=' '); nl = 0
106
print((level+1)*" " + op, a)
112
print(level*" " + "or")
113
a.dump(level+1); nl = 1
115
elif isinstance(av, seqtypes):
117
if isinstance(a, SubPattern):
119
a.dump(level+1); nl = 1
121
print(a, end=' ') ; nl = 0
123
print(av, end=' ') ; nl = 0
126
return repr(self.data)
128
return len(self.data)
129
def __delitem__(self, index):
131
def __getitem__(self, index):
132
if isinstance(index, slice):
133
return SubPattern(self.pattern, self.data[index])
134
return self.data[index]
135
def __setitem__(self, index, code):
136
self.data[index] = code
137
def insert(self, index, code):
138
self.data.insert(index, code)
139
def append(self, code):
140
self.data.append(code)
142
# determine the width (min, max) for this subpattern
146
UNITCODES = (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY)
147
REPEATCODES = (MIN_REPEAT, MAX_REPEAT)
148
for op, av in self.data:
162
elif op is SUBPATTERN:
163
i, j = av[1].getwidth()
166
elif op in REPEATCODES:
167
i, j = av[2].getwidth()
168
lo = lo + int(i) * av[0]
169
hi = hi + int(j) * av[1]
170
elif op in UNITCODES:
175
self.width = int(min(lo, sys.maxsize)), int(min(hi, sys.maxsize))
179
def __init__(self, string):
184
if self.index >= len(self.string):
187
char = self.string[self.index:self.index+1]
188
# Special case for the str8, since indexing returns a integer
189
# XXX This is only needed for test_bug_926075 in test_re.py
190
if char and isinstance(char, bytes):
194
c = self.string[self.index + 1]
196
raise error("bogus escape (end of line)")
197
if isinstance(self.string, bytes):
200
self.index = self.index + len(char)
202
def match(self, char, skip=1):
203
if char == self.next:
213
return self.index, self.next
214
def seek(self, index):
215
self.index, self.next = index
218
return "a" <= char <= "z" or "A" <= char <= "Z" or char == "_"
221
return "0" <= char <= "9"
224
# check that group name is a valid string
225
if not isident(name[0]):
227
for char in name[1:]:
228
if not isident(char) and not isdigit(char):
232
def _class_escape(source, escape):
233
# handle escape code inside character class
234
code = ESCAPES.get(escape)
237
code = CATEGORIES.get(escape)
243
# hexadecimal escape (exactly two digits)
244
while source.next in HEXDIGITS and len(escape) < 4:
245
escape = escape + source.get()
248
raise error("bogus escape: %s" % repr("\\" + escape))
249
return LITERAL, int(escape, 16) & 0xff
251
# octal escape (up to three digits)
252
while source.next in OCTDIGITS and len(escape) < 4:
253
escape = escape + source.get()
255
return LITERAL, int(escape, 8) & 0xff
257
raise error("bogus escape: %s" % repr(escape))
259
return LITERAL, ord(escape[1])
262
raise error("bogus escape: %s" % repr(escape))
264
def _escape(source, escape, state):
265
# handle escape code in expression
266
code = CATEGORIES.get(escape)
269
code = ESCAPES.get(escape)
276
while source.next in HEXDIGITS and len(escape) < 4:
277
escape = escape + source.get()
280
return LITERAL, int(escape[2:], 16) & 0xff
283
while source.next in OCTDIGITS and len(escape) < 4:
284
escape = escape + source.get()
285
return LITERAL, int(escape[1:], 8) & 0xff
287
# octal escape *or* decimal group reference (sigh)
288
if source.next in DIGITS:
289
escape = escape + source.get()
290
if (escape[1] in OCTDIGITS and escape[2] in OCTDIGITS and
291
source.next in OCTDIGITS):
292
# got three octal digits; this is an octal escape
293
escape = escape + source.get()
294
return LITERAL, int(escape[1:], 8) & 0xff
295
# not an octal escape, so this is a group reference
296
group = int(escape[1:])
297
if group < state.groups:
298
if not state.checkgroup(group):
299
raise error("cannot refer to open group")
300
return GROUPREF, group
303
return LITERAL, ord(escape[1])
306
raise error("bogus escape: %s" % repr(escape))
308
def _parse_sub(source, state, nested=1):
309
# parse an alternation: a|b|c
312
itemsappend = items.append
313
sourcematch = source.match
315
itemsappend(_parse(source, state))
320
if not source.next or sourcematch(")", 0):
323
raise error("pattern not properly closed")
328
subpattern = SubPattern(state)
329
subpatternappend = subpattern.append
331
# check if all items share a common prefix
339
elif item[0] != prefix:
342
# all subitems start with a common "prefix".
343
# move it out of the branch
346
subpatternappend(prefix)
347
continue # check next one
350
# check if the branch can be replaced by a character set
352
if len(item) != 1 or item[0][0] != LITERAL:
355
# we can store this as a character set instead of a
356
# branch (the compiler may optimize this even more)
358
setappend = set.append
361
subpatternappend((IN, set))
364
subpattern.append((BRANCH, (None, items)))
367
def _parse_sub_cond(source, state, condgroup):
368
item_yes = _parse(source, state)
369
if source.match("|"):
370
item_no = _parse(source, state)
371
if source.match("|"):
372
raise error("conditional backref with more than two branches")
375
if source.next and not source.match(")", 0):
376
raise error("pattern not properly closed")
377
subpattern = SubPattern(state)
378
subpattern.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no)))
381
_PATTERNENDERS = set("|)")
382
_ASSERTCHARS = set("=!<")
383
_LOOKBEHINDASSERTCHARS = set("=!")
384
_REPEATCODES = set([MIN_REPEAT, MAX_REPEAT])
386
def _parse(source, state):
387
# parse a simple pattern
388
subpattern = SubPattern(state)
390
# precompute constants into local variables
391
subpatternappend = subpattern.append
392
sourceget = source.get
393
sourcematch = source.match
395
PATTERNENDERS = _PATTERNENDERS
396
ASSERTCHARS = _ASSERTCHARS
397
LOOKBEHINDASSERTCHARS = _LOOKBEHINDASSERTCHARS
398
REPEATCODES = _REPEATCODES
402
if source.next in PATTERNENDERS:
403
break # end of subpattern
406
break # end of pattern
408
if state.flags & SRE_FLAG_VERBOSE:
409
# skip whitespace and comments
410
if this in WHITESPACE:
415
if this in (None, "\n"):
419
if this and this[0] not in SPECIAL_CHARS:
420
subpatternappend((LITERAL, ord(this)))
425
setappend = set.append
426
## if sourcematch(":"):
427
## pass # handle character classes
429
setappend((NEGATE, None))
430
# check remaining characters
434
if this == "]" and set != start:
436
elif this and this[0] == "\\":
437
code1 = _class_escape(source, this)
439
code1 = LITERAL, ord(this)
441
raise error("unexpected end of regular expression")
449
setappend((LITERAL, ord("-")))
453
code2 = _class_escape(source, this)
455
code2 = LITERAL, ord(this)
456
if code1[0] != LITERAL or code2[0] != LITERAL:
457
raise error("bad character range")
461
raise error("bad character range")
462
setappend((RANGE, (lo, hi)))
464
raise error("unexpected end of regular expression")
470
# XXX: <fl> should move set optimization to compiler!
471
if _len(set)==1 and set[0][0] is LITERAL:
472
subpatternappend(set[0]) # optimization
473
elif _len(set)==2 and set[0][0] is NEGATE and set[1][0] is LITERAL:
474
subpatternappend((NOT_LITERAL, set[1][1])) # optimization
476
# XXX: <fl> should add charmap optimization here
477
subpatternappend((IN, set))
479
elif this and this[0] in REPEAT_CHARS:
480
# repeat previous item
484
min, max = 0, MAXREPEAT
487
min, max = 1, MAXREPEAT
489
if source.next == "}":
490
subpatternappend((LITERAL, ord(this)))
493
min, max = 0, MAXREPEAT
495
while source.next in DIGITS:
496
lo = lo + source.get()
498
while source.next in DIGITS:
499
hi = hi + sourceget()
502
if not sourcematch("}"):
503
subpatternappend((LITERAL, ord(this)))
511
raise error("bad repeat interval")
513
raise error("not supported")
514
# figure out which item to repeat
516
item = subpattern[-1:]
519
if not item or (_len(item) == 1 and item[0][0] == AT):
520
raise error("nothing to repeat")
521
if item[0][0] in REPEATCODES:
522
raise error("multiple repeat")
524
subpattern[-1] = (MIN_REPEAT, (min, max, item))
526
subpattern[-1] = (MAX_REPEAT, (min, max, item))
529
subpatternappend((ANY, None))
541
# named group: skip forward to end of name
546
raise error("unterminated name")
552
raise error("bad character in group name")
553
elif sourcematch("="):
554
# named backreference
559
raise error("unterminated name")
564
raise error("bad character in group name")
565
gid = state.groupdict.get(name)
567
raise error("unknown group name")
568
subpatternappend((GROUPREF, gid))
573
raise error("unexpected end of pattern")
574
raise error("unknown specifier: ?P%s" % char)
575
elif sourcematch(":"):
576
# non-capturing group
578
elif sourcematch("#"):
581
if source.next is None or source.next == ")":
584
if not sourcematch(")"):
585
raise error("unbalanced parenthesis")
587
elif source.next in ASSERTCHARS:
588
# lookahead assertions
592
if source.next not in LOOKBEHINDASSERTCHARS:
593
raise error("syntax error")
594
dir = -1 # lookbehind
596
p = _parse_sub(source, state)
597
if not sourcematch(")"):
598
raise error("unbalanced parenthesis")
600
subpatternappend((ASSERT, (dir, p)))
602
subpatternappend((ASSERT_NOT, (dir, p)))
604
elif sourcematch("("):
605
# conditional backreference group
610
raise error("unterminated name")
613
condname = condname + char
616
condgroup = state.groupdict.get(condname)
617
if condgroup is None:
618
raise error("unknown group name")
621
condgroup = int(condname)
623
raise error("bad character in group name")
626
if not source.next in FLAGS:
627
raise error("unexpected end of pattern")
628
while source.next in FLAGS:
629
state.flags = state.flags | FLAGS[sourceget()]
631
# parse group contents
636
group = state.opengroup(name)
638
p = _parse_sub_cond(source, state, condgroup)
640
p = _parse_sub(source, state)
641
if not sourcematch(")"):
642
raise error("unbalanced parenthesis")
643
if group is not None:
644
state.closegroup(group)
645
subpatternappend((SUBPATTERN, (group, p)))
650
raise error("unexpected end of pattern")
653
raise error("unknown extension")
656
subpatternappend((AT, AT_BEGINNING))
659
subpattern.append((AT, AT_END))
661
elif this and this[0] == "\\":
662
code = _escape(source, this, state)
663
subpatternappend(code)
666
raise error("parser error")
670
def fix_flags(src, flags):
671
# Check and fix flags according to the type of pattern (str or bytes)
672
if isinstance(src, str):
673
if not flags & SRE_FLAG_ASCII:
674
flags |= SRE_FLAG_UNICODE
675
elif flags & SRE_FLAG_UNICODE:
676
raise ValueError("ASCII and UNICODE flags are incompatible")
678
if flags & SRE_FLAG_UNICODE:
679
raise ValueError("can't use UNICODE flag with a bytes pattern")
682
def parse(str, flags=0, pattern=None):
683
# parse 're' pattern into list of (opcode, argument) tuples
685
source = Tokenizer(str)
689
pattern.flags = flags
692
p = _parse_sub(source, pattern, 0)
693
p.pattern.flags = fix_flags(str, p.pattern.flags)
697
raise error("unbalanced parenthesis")
699
raise error("bogus characters at end of regular expression")
701
if flags & SRE_FLAG_DEBUG:
704
if not (flags & SRE_FLAG_VERBOSE) and p.pattern.flags & SRE_FLAG_VERBOSE:
705
# the VERBOSE flag was switched on inside the pattern. to be
706
# on the safe side, we'll parse the whole thing again...
707
return parse(str, p.pattern.flags)
711
def parse_template(source, pattern):
712
# parse 're' replacement string into list of literals and
714
s = Tokenizer(source)
718
def literal(literal, p=p, pappend=a):
719
if p and p[-1][0] is LITERAL:
720
p[-1] = LITERAL, p[-1][1] + literal
722
pappend((LITERAL, literal))
724
if isinstance(sep, str):
731
break # end of replacement string
732
if this and this[0] == "\\":
741
raise error("unterminated group name")
746
raise error("bad group name")
750
raise error("negative group number")
753
raise error("bad character in group name")
755
index = pattern.groupindex[name]
757
raise IndexError("unknown group name")
760
if s.next in OCTDIGITS:
762
if s.next in OCTDIGITS:
764
literal(makechar(int(this[1:], 8) & 0xff))
769
if (c in OCTDIGITS and this[2] in OCTDIGITS and
770
s.next in OCTDIGITS):
773
literal(makechar(int(this[1:], 8) & 0xff))
775
a((MARK, int(this[1:])))
778
this = makechar(ESCAPES[this][1])
784
# convert template to groups and literals lists
787
groupsappend = groups.append
788
literals = [None] * len(p)
792
# literal[i] is already None
796
return groups, literals
798
def expand_template(template, match):
800
sep = match.string[:0]
801
groups, literals = template
802
literals = literals[:]
804
for index, group in groups:
805
literals[index] = s = g(group)
807
raise error("unmatched group")
809
raise error("invalid group reference")
810
return sep.join(literals)