# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.

"""Convert graminit.[ch] spit out by pgen to Python code.

Pgen is the Python parser generator. It is useful to quickly create a
parser from a grammar file in Python's grammar notation. But I don't
want my parsers to be written in C (yet), so I'm translating the
parsing tables to Python data structures and writing a Python parse
engine.

Note that the token numbers are constants determined by the standard
Python tokenizer. The standard token module defines these numbers and
their names (the names are not used much). The token numbers are
hardcoded into the Python tokenizer and into pgen. A Python
implementation of the Python tokenizer is also available, in the
standard tokenize module.

On the other hand, symbol numbers (representing the grammar's
non-terminals) are assigned by pgen based on the actual grammar
input.

Note: this module is pretty much obsolete; the pgen module generates
equivalent grammar tables directly from the Grammar.txt input file
without having to invoke the Python pgen C program.

"""

# Python imports
import re

# Local imports
from pgen2 import grammar, token


class Converter(grammar.Grammar):
    """Grammar subclass that reads classic pgen output files.

    The run() method reads the tables as produced by the pgen parser
    generator, typically contained in two C files, graminit.h and
    graminit.c. The other methods are for internal use only.

    See the base class for more documentation.

    """

    def run(self, graminit_h, graminit_c):
        """Load the grammar tables from the text files written by pgen.

        graminit_h and graminit_c are the paths of the header and the C
        source produced by pgen. The header is parsed first because the
        dfa table in the C file is cross-checked against the symbol
        tables built from the header.
        """
        self.parse_graminit_h(graminit_h)
        self.parse_graminit_c(graminit_c)
        self.finish_off()

    def parse_graminit_h(self, filename):
        """Parse the .h file written by pgen. (Internal)

        This file is a sequence of #define statements defining the
        nonterminals of the grammar as numbers. We build two tables
        mapping the numbers to names and back: self.symbol2number and
        self.number2symbol.

        Blank lines are ignored; any other line that is not a #define
        is reported on stdout and skipped.

        Returns False if the file cannot be opened (a message is
        printed), True otherwise.
        """
        try:
            f = open(filename)
        except OSError as err:
            print("Can't open %s: %s" % (filename, err))
            return False
        self.symbol2number = {}
        self.number2symbol = {}
        with f:
            for lineno, line in enumerate(f, 1):
                mo = re.match(r"^#define\s+(\w+)\s+(\d+)$", line)
                if mo is None:
                    # Complain about junk, but stay silent on blank lines.
                    if line.strip():
                        print("%s(%s): can't parse %s" % (filename, lineno,
                                                          line.strip()))
                    continue
                symbol, number = mo.groups()
                # Keep numbers as ints: parse_graminit_c() compares them
                # with ints parsed from the dfa table.
                number = int(number)
                assert symbol not in self.symbol2number
                assert number not in self.number2symbol
                self.symbol2number[symbol] = number
                self.number2symbol[number] = symbol
        return True

    def parse_graminit_c(self, filename):
        """Parse the .c file written by pgen. (Internal)

        The file looks as follows. The first two lines are always this:

        #include "pgenheaders.h"
        #include "grammar.h"

        After that come four blocks:

        1) one or more state definitions
        2) a table defining dfas
        3) a table defining labels
        4) a struct defining the grammar

        A state definition has the following form:
        - one or more arc arrays, each of the form:
          static arc arcs_<n>_<m>[<k>] = {
                  {<i>, <j>},
                  ...
          };
        - followed by a state array, of the form:
          static state states_<s>[<t>] = {
                  {<k>, arcs_<n>_<m>},
                  ...
          };

        The results are stored in self.states, self.dfas, self.labels
        and self.start. The layout is verified with assert statements,
        whose message is the (lineno, line) pair being examined.

        Returns False if the file cannot be opened (a message is
        printed), True once the whole file has been consumed.
        """
        try:
            f = open(filename)
        except OSError as err:
            print("Can't open %s: %s" % (filename, err))
            return False
        with f:
            # The code below essentially uses f's iterator-ness!
            lineno = 0

            # Expect the two #include lines
            lineno, line = lineno + 1, next(f)
            assert line == '#include "pgenheaders.h"\n', (lineno, line)
            lineno, line = lineno + 1, next(f)
            assert line == '#include "grammar.h"\n', (lineno, line)

            # Parse the state definitions
            lineno, line = lineno + 1, next(f)
            allarcs = {}    # (n, m) -> list of (label, target state) arcs
            states = []     # states[s] -> list of arc lists
            while line.startswith("static arc "):
                # All arc arrays belonging to the next state array.
                while line.startswith("static arc "):
                    mo = re.match(r"static arc arcs_(\d+)_(\d+)\[(\d+)\] = {$",
                                  line)
                    assert mo, (lineno, line)
                    n, m, k = map(int, mo.groups())
                    arcs = []
                    for _ in range(k):
                        lineno, line = lineno + 1, next(f)
                        mo = re.match(r"\s+{(\d+), (\d+)},$", line)
                        assert mo, (lineno, line)
                        i, j = map(int, mo.groups())
                        arcs.append((i, j))
                    lineno, line = lineno + 1, next(f)
                    assert line == "};\n", (lineno, line)
                    allarcs[(n, m)] = arcs
                    lineno, line = lineno + 1, next(f)
                mo = re.match(r"static state states_(\d+)\[(\d+)\] = {$", line)
                assert mo, (lineno, line)
                s, t = map(int, mo.groups())
                # State arrays must appear in order, without gaps.
                assert s == len(states), (lineno, line)
                state = []
                for _ in range(t):
                    lineno, line = lineno + 1, next(f)
                    mo = re.match(r"\s+{(\d+), arcs_(\d+)_(\d+)},$", line)
                    assert mo, (lineno, line)
                    k, n, m = map(int, mo.groups())
                    arcs = allarcs[n, m]
                    assert k == len(arcs), (lineno, line)
                    state.append(arcs)
                states.append(state)
                lineno, line = lineno + 1, next(f)
                assert line == "};\n", (lineno, line)
                lineno, line = lineno + 1, next(f)
            self.states = states

            # Parse the dfas
            dfas = {}
            mo = re.match(r"static dfa dfas\[(\d+)\] = {$", line)
            assert mo, (lineno, line)
            ndfas = int(mo.group(1))
            for _ in range(ndfas):
                lineno, line = lineno + 1, next(f)
                mo = re.match(r'\s+{(\d+), "(\w+)", (\d+), (\d+), states_(\d+),$',
                              line)
                assert mo, (lineno, line)
                symbol = mo.group(2)
                number, x, y, z = map(int, mo.group(1, 3, 4, 5))
                # Cross-check against the tables built from graminit.h.
                assert self.symbol2number[symbol] == number, (lineno, line)
                assert self.number2symbol[number] == symbol, (lineno, line)
                assert x == 0, (lineno, line)
                state = states[z]
                assert y == len(state), (lineno, line)
                lineno, line = lineno + 1, next(f)
                mo = re.match(r'\s+("(?:\\\d\d\d)*")},$', line)
                assert mo, (lineno, line)
                first = {}
                # NOTE(review): eval() on file content. The regex above
                # only admits a double-quoted run of \ddd escapes, so no
                # arbitrary expression can reach eval() here.
                rawbitset = eval(mo.group(1))
                # Expand the bitset: bit j of byte i marks label i*8 + j.
                for i, c in enumerate(rawbitset):
                    byte = ord(c)
                    for j in range(8):
                        if byte & (1 << j):
                            first[i * 8 + j] = 1
                dfas[number] = (state, first)
            lineno, line = lineno + 1, next(f)
            assert line == "};\n", (lineno, line)
            self.dfas = dfas

            # Parse the labels
            labels = []
            lineno, line = lineno + 1, next(f)
            mo = re.match(r"static label labels\[(\d+)\] = {$", line)
            assert mo, (lineno, line)
            nlabels = int(mo.group(1))
            for _ in range(nlabels):
                lineno, line = lineno + 1, next(f)
                mo = re.match(r'\s+{(\d+), (0|"\w+")},$', line)
                assert mo, (lineno, line)
                x, y = mo.groups()
                x = int(x)
                if y == "0":
                    y = None
                else:
                    # NOTE(review): eval() on file content. The regex
                    # restricts y to a double-quoted run of \w characters.
                    y = eval(y)
                labels.append((x, y))
            lineno, line = lineno + 1, next(f)
            assert line == "};\n", (lineno, line)
            self.labels = labels

            # Parse the grammar struct
            lineno, line = lineno + 1, next(f)
            assert line == "grammar _PyParser_Grammar = {\n", (lineno, line)
            lineno, line = lineno + 1, next(f)
            mo = re.match(r"\s+(\d+),$", line)
            assert mo, (lineno, line)
            ndfas = int(mo.group(1))
            assert ndfas == len(self.dfas)
            lineno, line = lineno + 1, next(f)
            assert line == "\tdfas,\n", (lineno, line)
            lineno, line = lineno + 1, next(f)
            mo = re.match(r"\s+{(\d+), labels},$", line)
            assert mo, (lineno, line)
            nlabels = int(mo.group(1))
            assert nlabels == len(self.labels), (lineno, line)
            lineno, line = lineno + 1, next(f)
            mo = re.match(r"\s+(\d+)$", line)
            assert mo, (lineno, line)
            start = int(mo.group(1))
            assert start in self.number2symbol, (lineno, line)
            self.start = start
            lineno, line = lineno + 1, next(f)
            assert line == "};\n", (lineno, line)

            # Nothing may follow the grammar struct.
            try:
                lineno, line = lineno + 1, next(f)
            except StopIteration:
                pass
            else:
                assert 0, (lineno, line)
        return True

    def finish_off(self):
        """Create additional useful structures. (Internal)."""
        self.keywords = {}  # map from keyword strings to arc labels
        self.tokens = {}    # map from numeric token values to arc labels
        for ilabel, (toktype, value) in enumerate(self.labels):
            if toktype == token.NAME and value is not None:
                self.keywords[value] = ilabel
            elif value is None:
                self.tokens[toktype] = ilabel