# Yapps 3.0 - yet another python parser system
# Amit J Patel, January 1999
# German M. Bravo, December 2011
# See http://theory.stanford.edu/~amitp/Yapps/ for documentation and updates
# v3.0.0 changes (December 2011)
# * Optimizations in the scanning (added cache and cleanup() for it)
# v2.0.1 changes (October 2001):
# * The exceptions inherit the standard Exception class (thanks Rich Salz)
# * The scanner can use either a different set of regular expressions
# per instance, or allows the subclass to define class fields with
# the patterns. This improves performance when many Scanner objects
# are being created, because the regular expressions don't have to
# be recompiled each time. (thanks Amaury Forgeot d'Arc)
# v2.0.2 changes (April 2002)
# * Bug fix: generating the 'else' clause when the comment was too
# long. v2.0.1 was missing a newline. (thanks Steven Engelhardt)
# v2.0.3 changes (August 2002)
# * Bug fix: inline tokens using the r"" syntax.
# v.2.0.4 changes (July 2003)
# * Style change: Replaced `expr` with repr(expr)
# * Style change: Changed (b >= a and b < c) into (a <= b < c)
# * Bug fix: identifiers in grammar rules that had digits in them were
# not accessible in the {{python code}} section
# * Bug fix: made the SyntaxError exception class call
# Exception.__init__ (thanks Alex Verstak)
# * Style change: replaced raise "string exception" with raise
# ClassException(...) (thanks Alex Verstak)
42
def __init__(self, name, options, tokens, rules):
45
self.options = options
47
self.postparser = None
49
self.tokens = {} # Map from tokens to regexps
50
self.sets = {} # Map for restriction sets
51
self.ignore = [] # List of token names to ignore in parsing
52
self.terminals = [] # List of token names (to maintain ordering)
58
if n in self.tokens.keys() and self.tokens[n] != t:
59
if n not in self.ignore:
60
print 'Warning: token', n, 'multiply defined.'
62
self.terminals.append(n)
65
self.rules = {} # Map from rule names to parser nodes
66
self.params = {} # Map from rule names to parameters
67
self.goals = [] # List of rule names (to maintain ordering)
73
self.output = sys.stdout
75
def __getitem__(self, name):
77
return self.options.get(name, 0)
79
def non_ignored_tokens(self):
80
return filter(lambda x, i=self.ignore: x not in i, self.terminals)
83
self.change_count = 1 + self.change_count
85
def subset(self, a, b):
86
"See if all elements of a are inside b"
92
def equal_set(self, a, b):
93
"See if a and b have the same elements"
98
return self.subset(a, b) and self.subset(b, a)
100
def add_to(self, parent, additions):
101
"Modify parent to include all elements in additions"
107
def equate(self, a, b):
111
def write(self, *args):
115
def in_test(self, r, x, full, b):
119
return '%s == %s' % (x, repr(b[0]))
120
if full and len(b) > len(full) / 2:
121
# Reverse the sense of the test.
122
not_b = filter(lambda x, b=b:
124
return self.not_in_test(r, x, full, not_b)
126
for k, v in self.sets.items():
131
while n in self.sets:
134
b_set = 'self.%s' % n
135
return '%s in %s' % (x, b_set)
137
def not_in_test(self, r, x, full, b):
141
return '%s != %s' % (x, repr(b[0]))
143
for k, v in self.sets.items():
148
while n in self.sets:
151
b_set = 'self.%s' % n
152
return '%s not in %s' % (x, b_set)
154
def peek_call(self, r, a):
156
for k, v in self.sets.items():
161
while n in self.sets:
164
a_set = 'self.%s' % n
165
if self.equal_set(a, self.non_ignored_tokens()):
167
if self['context-insensitive-scanner']:
169
return 'self._peek(%s)' % a_set
171
def peek_test(self, r, a, b):
172
if self.subset(a, b):
174
if self['context-insensitive-scanner']:
175
a = self.non_ignored_tokens()
176
return self.in_test(r, self.peek_call(r, a), a, b)
178
def not_peek_test(self, r, a, b):
179
if self.subset(a, b):
181
return self.not_in_test(r, self.peek_call(r, a), a, b)
186
self.rules[r].setup(self, r)
187
if self.change_count == 0:
189
self.change_count = 0
193
self.rules[r].update(self)
194
if self.change_count == 0:
196
self.change_count = 0
198
def dump_information(self):
201
print ' _____' + '_' * len(r)
202
print ('___/Rule ' + r + '\\' + '_' * 80)[:79]
203
queue = [self.rules[r]]
212
if top.accepts_epsilon:
214
print ' FIRST:', join(top.first + eps, ', ')
215
print ' FOLLOW:', join(top.follow, ', ')
216
for x in top.get_children():
219
def generate_output(self):
222
self.write(self.preparser)
223
# TODO: remove "import *" construct
224
self.write("import re\n")
225
self.write("from string import *\n")
226
self.write("from yappsrt import *\n")
228
self.write("class ", self.name, "Scanner(Scanner):\n")
229
self.write(" patterns = None\n")
230
self.write(" _patterns = [\n")
231
for p in self.terminals:
232
self.write(" (%s, %s),\n" % (
233
repr(p), repr(self.tokens[p])))
235
self.write(" def __init__(self, input=None):\n")
236
self.write(" if hasattr(self, 'setup_patterns'):\n")
237
self.write(" self.setup_patterns(self._patterns)\n")
238
self.write(" elif self.patterns is None:\n")
239
self.write(" self.__class__.patterns = []\n")
240
self.write(" for t, p in self._patterns:\n")
241
self.write(" self.patterns.append((t, re.compile(p)))\n")
242
self.write(" super(", self.name, "Scanner, self).__init__(None, %s, input)\n" %
246
self.write("class ", self.name, "(Parser):\n")
248
self.write(INDENT, "def ", r, "(self")
250
self.write(", ", self.params[r])
252
self.rules[r].output(self, INDENT + INDENT)
255
for n, s in self.sets.items():
256
self.write(" %s = %s\n" % (n, set(s)))
258
if self.postparser is not None:
259
self.write(self.postparser)
262
self.write("P = ", self.name, "(", self.name, "Scanner())\n")
263
self.write("def parse(rule, text, *args):\n")
264
self.write(" P.reset(text)\n")
265
self.write(" return wrap_error_reporter(P, rule, *args)\n")
268
self.write("if __name__ == '__main__':\n")
269
self.write(INDENT, "from sys import argv, stdin\n")
270
self.write(INDENT, "if len(argv) >= 2:\n")
271
self.write(INDENT * 2, "if len(argv) >= 3:\n")
272
self.write(INDENT * 3, "f = open(argv[2],'r')\n")
273
self.write(INDENT * 2, "else:\n")
274
self.write(INDENT * 3, "f = stdin\n")
275
self.write(INDENT * 2, "print parse(argv[1], f.read())\n")
276
self.write(INDENT, "else: print 'Args: <rule> [<filename>]'\n")
279
######################################################################
286
self.accepts_epsilon = 0
289
def setup(self, gen, rule):
290
# Setup will change accepts_epsilon,
291
# sometimes from 0 to 1 but never 1 to 0.
292
# It will take a finite number of steps to set things up
295
def used(self, vars):
296
"Return two lists: one of vars used, and the other of vars assigned"
299
def get_children(self):
300
"Return a list of sub-nodes"
306
def update(self, gen):
307
if self.accepts_epsilon:
308
gen.add_to(self.first, self.follow)
310
def output(self, gen, indent):
311
"Write out code to _gen_ with _indent_:string indentation"
312
gen.write(indent, "assert 0 # Invalid parser node\n")
315
class Terminal(Node):
316
def __init__(self, token):
319
self.accepts_epsilon = 0
324
def update(self, gen):
325
Node.update(self, gen)
326
if self.first != [self.token]:
327
self.first = [self.token]
330
def output(self, gen, indent):
332
if re.match('[a-zA-Z_][a-zA-Z_0-9]*$', self.token):
333
gen.write(self.token, " = ")
334
gen.write("self._scan(%s)\n" % repr(self.token))
338
def __init__(self, expr):
342
def setup(self, gen, rule):
343
Node.setup(self, gen, rule)
344
if not self.accepts_epsilon:
345
self.accepts_epsilon = 1
349
return '{{ %s }}' % self.expr.strip()
351
def output(self, gen, indent):
352
gen.write(indent, self.expr.strip(), '\n')
355
class NonTerminal(Node):
356
def __init__(self, name, args):
361
def setup(self, gen, rule):
362
Node.setup(self, gen, rule)
364
self.target = gen.rules[self.name]
365
if self.accepts_epsilon != self.target.accepts_epsilon:
366
self.accepts_epsilon = self.target.accepts_epsilon
368
except KeyError: # Oops, it's nonexistent
369
print 'Error: no rule <%s>' % self.name
373
return '<%s>' % self.name
375
def update(self, gen):
376
Node.update(self, gen)
377
gen.equate(self.first, self.target.first)
378
gen.equate(self.follow, self.target.follow)
380
def output(self, gen, indent):
382
gen.write(self.name, " = ")
383
gen.write("self.", self.name, "(", self.args, ")\n")
386
class Sequence(Node):
387
def __init__(self, *children):
389
self.children = children
391
def setup(self, gen, rule):
392
Node.setup(self, gen, rule)
393
for c in self.children:
396
if not self.accepts_epsilon:
397
# If it's not already accepting epsilon, it might now do so.
398
for c in self.children:
399
# any non-epsilon means all is non-epsilon
400
if not c.accepts_epsilon:
403
self.accepts_epsilon = 1
406
def get_children(self):
410
return '( %s )' % join(map(lambda x: str(x), self.children))
412
def update(self, gen):
413
Node.update(self, gen)
414
for g in self.children:
418
for g_i in range(len(self.children)):
419
g = self.children[g_i]
422
gen.add_to(self.first, g.first)
423
if not g.accepts_epsilon:
426
if g_i == len(self.children) - 1:
429
next = self.children[1 + g_i].first
430
gen.add_to(g.follow, next)
433
gen.add_to(self.follow, self.children[-1].follow)
435
def output(self, gen, indent):
437
for c in self.children:
438
c.output(gen, indent)
440
# Placeholder for empty sequences, just in case
441
gen.write(indent, 'pass\n')
444
def __init__(self, *children):
446
self.children = children
448
def setup(self, gen, rule):
449
Node.setup(self, gen, rule)
450
for c in self.children:
453
if not self.accepts_epsilon:
454
for c in self.children:
455
if c.accepts_epsilon:
456
self.accepts_epsilon = 1
459
def get_children(self):
463
return '( %s )' % join(map(lambda x: str(x), self.children), ' | ')
465
def update(self, gen):
466
Node.update(self, gen)
467
for g in self.children:
470
for g in self.children:
471
gen.add_to(self.first, g.first)
472
gen.add_to(self.follow, g.follow)
473
for g in self.children:
474
gen.add_to(g.follow, self.follow)
475
if self.accepts_epsilon:
476
gen.add_to(self.first, self.follow)
478
def output(self, gen, indent):
480
gen.write(indent, "_token_ = ", gen.peek_call(self.rule, self.first), "\n")
482
tokens_unseen = self.first[:]
483
if gen['context-insensitive-scanner']:
484
# Context insensitive scanners can return ANY token,
485
# not only the ones in first.
486
tokens_unseen = gen.non_ignored_tokens()
487
for c in self.children:
494
if x in tokens_unseen:
495
tokens_unseen.remove(x)
496
tokens_seen = tokens_seen + testset
499
print 'Error in rule', self.rule + ':', c, 'never matches.'
501
print 'Warning:', self
502
print ' * These tokens are being ignored:', join(removed, ', ')
503
print ' due to previous choices using them.'
506
if not tokens_unseen: # context sensitive scanners only!
508
# if it's the first AND last test, then
509
# we can simply put the code without an if/else
510
c.output(gen, indent)
512
gen.write(indent, "else:")
513
t = gen.in_test(self.rule, '', [], testset)
514
if len(t) < 70 - len(indent):
517
c.output(gen, indent + INDENT)
519
gen.write(indent, test, " ",
520
gen.in_test(self.rule, '_token_', tokens_unseen, testset),
522
c.output(gen, indent + INDENT)
525
if gen['context-insensitive-scanner'] and tokens_unseen:
526
gen.write(indent, "else:\n")
527
gen.write(indent, INDENT, "raise SyntaxError(self._pos, ")
528
gen.write("'Could not match ", self.rule, "')\n")
532
def __init__(self, child):
536
def setup(self, gen, rule):
537
Node.setup(self, gen, rule)
538
self.child.setup(gen, rule)
540
def get_children(self):
543
def update(self, gen):
544
Node.update(self, gen)
545
self.child.update(gen)
546
gen.add_to(self.first, self.child.first)
547
gen.equate(self.follow, self.child.follow)
550
class Option(Wrapper):
551
def setup(self, gen, rule):
552
Wrapper.setup(self, gen, rule)
553
if not self.accepts_epsilon:
554
self.accepts_epsilon = 1
558
return '[ %s ]' % str(self.child)
560
def output(self, gen, indent):
561
if self.child.accepts_epsilon:
562
print 'Warning in rule', self.rule + ': contents may be empty.'
563
gen.write(indent, "if %s:\n" %
564
gen.peek_test(self.rule, self.first, self.child.first))
565
self.child.output(gen, indent + INDENT)
569
def setup(self, gen, rule):
570
Wrapper.setup(self, gen, rule)
571
if self.accepts_epsilon != self.child.accepts_epsilon:
572
self.accepts_epsilon = self.child.accepts_epsilon
576
return '%s+' % str(self.child)
578
def update(self, gen):
579
Wrapper.update(self, gen)
580
gen.add_to(self.follow, self.first)
582
def output(self, gen, indent):
583
if self.child.accepts_epsilon:
584
print 'Warning in rule', self.rule + ':'
585
print ' * The repeated pattern could be empty. The resulting'
586
print ' parser may not work properly.'
587
gen.write(indent, "while 1:\n")
588
self.child.output(gen, indent + INDENT)
589
union = self.first[:]
590
gen.add_to(union, self.follow)
591
gen.write(indent + INDENT, "if %s:\n" %
592
gen.not_peek_test(self.rule, union, self.child.first))
593
gen.write(indent + INDENT * 2, "break\n")
597
def setup(self, gen, rule):
598
Wrapper.setup(self, gen, rule)
599
if not self.accepts_epsilon:
600
self.accepts_epsilon = 1
604
return '%s*' % str(self.child)
606
def output(self, gen, indent):
607
if self.child.accepts_epsilon:
608
print 'Warning in rule', self.rule + ':'
609
print ' * The repeated pattern could be empty. The resulting'
610
print ' parser probably will not work properly.'
611
gen.write(indent, "while %s:\n" %
612
gen.peek_test(self.rule, self.follow, self.child.first))
613
self.child.output(gen, indent + INDENT)
615
######################################################################
616
# The remainder of this file is from parsedesc.{g,py}
625
def add_inline_token(tokens, str):
626
tokens.insert(0, (str, eval(str, {}, {})))
630
def cleanup_choice(lst):
635
return apply(Choice, tuple(lst))
638
def cleanup_sequence(lst):
641
return apply(Sequence, tuple(lst))
644
def cleanup_rep(node, rep):
653
def resolve_name(tokens, id, args):
654
if id in map(lambda x: x[0], tokens):
657
print 'Warning: ignoring parameters on TOKEN %s<<%s>>' % (id, args)
660
# It's a name, so assume it's a nonterminal
661
return NonTerminal(id, args)
665
from yappsrt import *
668
class ParserDescriptionScanner(Scanner):
669
def __init__(self, str):
670
Scanner.__init__(self, [
672
('"ignore"', 'ignore'),
673
('"token"', 'token'),
674
('"option"', 'option'),
676
('"parser"', 'parser'),
677
('[ \011\015\012]+', '[ \011\015\012]+'),
678
('#.*?\015?\012', '#.*?\015?\012'),
682
('ID', '[a-zA-Z_][a-zA-Z_0-9]*'),
683
('STR', '[rR]?\'([^\\n\'\\\\]|\\\\.)*\'|[rR]?"([^\\n"\\\\]|\\\\.)*"'),
691
], ['[ \011\015\012]+', '#.*?\015?\012'], str)
694
class ParserDescription(Parser):
696
self._scan('"parser"')
697
ID = self._scan('ID')
699
Options = self.Options()
700
Tokens = self.Tokens()
701
Rules = self.Rules(Tokens)
702
END = self._scan('END')
703
return Generator(ID, Options, Tokens, Rules)
707
while self._peek(set(['"option"', '"token"', '"ignore"', 'END', '"rule"'])) == '"option"':
708
self._scan('"option"')
716
while self._peek(set(['"token"', '"ignore"', 'END', '"rule"'])) in ['"token"', '"ignore"']:
717
_token_ = self._peek(set(['"token"', '"ignore"']))
718
if _token_ == '"token"':
719
self._scan('"token"')
720
ID = self._scan('ID')
723
tok.append((ID, Str))
724
else: # == '"ignore"'
725
self._scan('"ignore"')
728
tok.append(('#ignore', Str))
731
def Rules(self, tokens):
733
while self._peek(set(['"rule"', 'END'])) == '"rule"':
735
ID = self._scan('ID')
736
OptParam = self.OptParam()
738
ClauseA = self.ClauseA(tokens)
739
rul.append((ID, OptParam, ClauseA))
742
def ClauseA(self, tokens):
743
ClauseB = self.ClauseB(tokens)
745
while self._peek(set(['OR', 'RP', 'RB', '"rule"', 'END'])) == 'OR':
746
OR = self._scan('OR')
747
ClauseB = self.ClauseB(tokens)
749
return cleanup_choice(v)
751
def ClauseB(self, tokens):
753
while self._peek(set(['STR', 'ID', 'LP', 'LB', 'STMT', 'OR', 'RP', 'RB', '"rule"', 'END'])) in ['STR', 'ID', 'LP', 'LB', 'STMT']:
754
ClauseC = self.ClauseC(tokens)
756
return cleanup_sequence(v)
758
def ClauseC(self, tokens):
759
ClauseD = self.ClauseD(tokens)
760
_token_ = self._peek(set(['PLUS', 'STAR', 'STR', 'ID', 'LP', 'LB', 'STMT', 'OR', 'RP', 'RB', '"rule"', 'END']))
761
if _token_ == 'PLUS':
762
PLUS = self._scan('PLUS')
764
elif _token_ == 'STAR':
765
STAR = self._scan('STAR')
770
def ClauseD(self, tokens):
771
_token_ = self._peek(set(['STR', 'ID', 'LP', 'LB', 'STMT']))
773
STR = self._scan('STR')
774
t = (STR, eval(STR, {}, {}))
778
elif _token_ == 'ID':
779
ID = self._scan('ID')
780
OptParam = self.OptParam()
781
return resolve_name(tokens, ID, OptParam)
782
elif _token_ == 'LP':
783
LP = self._scan('LP')
784
ClauseA = self.ClauseA(tokens)
785
RP = self._scan('RP')
787
elif _token_ == 'LB':
788
LB = self._scan('LB')
789
ClauseA = self.ClauseA(tokens)
790
RB = self._scan('RB')
791
return Option(ClauseA)
793
STMT = self._scan('STMT')
794
return Eval(STMT[2:-2])
797
if self._peek(set(['ATTR', '":"', 'PLUS', 'STAR', 'STR', 'ID', 'LP', 'LB', 'STMT', 'OR', 'RP', 'RB', '"rule"', 'END'])) == 'ATTR':
798
ATTR = self._scan('ATTR')
803
STR = self._scan('STR')
804
return eval(STR, {}, {})
807
# This replaces the default main routine
811
('context-insensitive-scanner', 'context-insensitive-scanner',
812
'Scan all tokens (see docs)')
816
def generate(inputfilename, outputfilename='', dump=0, **flags):
817
"""Generate a grammar, given an input filename (X.g)
818
and an output filename (defaulting to X.py)."""
820
if not outputfilename:
821
if inputfilename[-2:] == '.g':
822
outputfilename = inputfilename[:-2] + '.py'
824
raise Exception("Missing output filename")
826
print 'Input Grammar:', inputfilename
827
print 'Output File:', outputfilename
829
DIVIDER = '\n%%\n' # This pattern separates the pre/post parsers
830
preparser, postparser = None, None # Code before and after the parser desc
832
# Read the entire file
833
s = open(inputfilename, 'r').read()
835
# See if there's a separation between the pre-parser and parser
838
preparser, s = s[:f] + '\n\n', s[f + len(DIVIDER):]
840
# See if there's a separation between the parser and post-parser
843
s, postparser = s[:f], '\n\n' + s[f + len(DIVIDER):]
845
# Create the parser and scanner
846
p = ParserDescription(ParserDescriptionScanner(s))
851
t = wrap_error_reporter(p, 'Parser')
854
if preparser is not None:
855
t.preparser = preparser
856
if postparser is not None:
857
t.postparser = postparser
860
for f in t.options.keys():
861
for opt, _, _ in yapps_options:
865
print 'Warning: unrecognized option', f
866
# Add command line options to the set
867
for f in flags.keys():
868
t.options[f] = flags[f]
870
# Generate the output
874
t.output = open(outputfilename, 'w')
877
if __name__ == '__main__':
879
optlist, args = getopt.getopt(sys.argv[1:], 'f:', ['dump'])
880
if not args or len(args) > 2:
882
print ' python', sys.argv[0], '[flags] input.g [output.py]'
884
print (' --dump' + ' ' * 40)[:35] + 'Dump out grammar information'
885
for flag, _, doc in yapps_options:
886
print (' -f' + flag + ' ' * 40)[:35] + doc
888
# Read in the options and create a list of flags
891
for flag, name, _ in yapps_options:
892
if opt == ('-f', flag):
896
if opt == ('--dump', ''):
899
print 'Warning: unrecognized option', opt[0], opt[1]
901
apply(generate, tuple(args), flags)