6
from sets import Set as set, FrozenSet as frozenset
8
def compress_char_set(chars):
11
result = [chars[0], 1]
12
for a, b in zip(chars[:-1], chars[1:]):
13
if ord(a) == ord(b) - 1:
14
result.append(result.pop() + 1)
19
for i in range(len(result) // 2):
20
real_result.append((result[i * 2], result[i * 2 + 1]))
23
class LexerError(Exception):
24
def __init__(self, input, state, source_pos):
27
self.source_pos = source_pos
28
self.args = (input, state, source_pos)
30
def nice_error_message(self, filename="<unknown>"):
31
result = [" File %s, line %s" % (filename, self.source_pos.lineno)]
32
result.append(self.input.split("\n")[self.source_pos.lineno])
33
result.append(" " * self.source_pos.columnno + "^")
34
result.append("LexerError")
35
return "\n".join(result)
37
def make_nice_charset_repr(chars):
38
charranges = compress_char_set(chars)
40
for a, num in charranges:
46
result.append("%s-%s" % (repr(a)[1:-1], repr(chr(ord(a) + num - 1))[1:-1]))
47
return "".join(result)
50
def __init__(self, num_states=0, transitions=None, final_states=None,
51
unmergeable_states=None, names=None):
53
if transitions is None:
55
if final_states is None:
57
if unmergeable_states is None:
58
unmergeable_states = set()
61
self.transitions = transitions
62
self.final_states = final_states
63
self.unmergeable_states = unmergeable_states
67
from pprint import pformat
68
return "DFA%s" % (pformat((
69
self.num_states, self.transitions, self.final_states,
70
self.unmergeable_states, self.names)), )
72
def add_state(self, name=None, final=False, unmergeable=False):
73
state = self.num_states
76
self.final_states.add(state)
78
self.unmergeable_states.add(state)
81
self.names.append(name)
82
return self.num_states - 1
84
def __setitem__(self, (state, input), next_state):
85
self.transitions[state, input] = next_state
87
def __getitem__(self, (state, input)):
88
return self.transitions[state, input]
90
def __contains__(self, (state, input)):
91
return (state, input) in self.transitions
93
def get_all_chars(self):
95
for state, input in self.transitions:
100
all_chars = self.get_all_chars()
102
non_final = frozenset(set(range(self.num_states)) - self.final_states -
103
self.unmergeable_states)
104
final = frozenset(self.final_states - self.unmergeable_states)
106
equivalence_sets = set()
108
equivalence_sets.add(non_final)
110
equivalence_sets.add(final)
111
for state in range(self.num_states):
113
state_to_set[state] = final
114
elif state in self.unmergeable_states:
115
singleset = frozenset([state])
116
state_to_set[state] = singleset
117
equivalence_sets.add(singleset)
119
state_to_set[state] = non_final
120
assert len(equivalence_sets) <= self.num_states
121
while len(equivalence_sets) < self.num_states:
122
new_equivalence_sets = set()
124
for equivalent in equivalence_sets:
125
#print "checking", equivalent
126
for char in all_chars:
128
for state in equivalent:
129
if (state, char) in self:
130
nextstate = self[state, char]
131
target = frozenset(state_to_set[nextstate])
135
targets.setdefault(target, set()).add(state)
136
if len(targets) != 1:
137
#print "\nsplitting %s with %r\ninto %s" % (equivalent, char, targets.values())
138
for target, newequivalent in targets.iteritems():
139
#print " ", newequivalent
140
newequivalent = frozenset(newequivalent)
141
new_equivalence_sets.add(newequivalent)
142
for state in newequivalent:
143
state_to_set[state] = newequivalent
144
#print " ", new_equivalence_sets
148
new_equivalence_sets.add(equivalent)
151
#print "end", equivalence_sets
152
#print new_equivalence_sets
153
equivalence_sets = new_equivalence_sets
154
if len(equivalence_sets) == self.num_states:
156
#print equivalence_sets
160
newnum_states = len(equivalence_sets)
161
newstates = list(equivalence_sets)
162
newstate_to_index = {}
163
newfinal_states = set()
164
newunmergeable_states = set()
165
for i, newstate in enumerate(newstates):
166
newstate_to_index[newstate] = i
167
# bring startstate into first slot
168
startstateindex = newstate_to_index[state_to_set[0]]
169
newstates[0], newstates[startstateindex] = newstates[startstateindex], newstates[0]
170
newstate_to_index[newstates[0]] = 0
171
newstate_to_index[newstates[startstateindex]] = startstateindex
172
for i, newstate in enumerate(newstates):
173
name = ", ".join([self.names[s] for s in newstate])
174
for state in newstate:
175
if state in self.unmergeable_states:
176
newunmergeable_states.add(i)
177
name = self.names[state]
178
if state in self.final_states:
179
newfinal_states.add(i)
180
newnames.append(name)
181
for (state, char), nextstate in self.transitions.iteritems():
182
newstate = newstate_to_index[state_to_set[state]]
183
newnextstate = newstate_to_index[state_to_set[nextstate]]
184
newtransitions[newstate, char] = newnextstate
185
self.names = newnames
186
self.transitions = newtransitions
187
self.num_states = newnum_states
188
self.final_states = newfinal_states
189
self.unmergeable_states = newunmergeable_states
194
def recognize(input):
200
for (state, char), nextstate in self.transitions.iteritems():
201
state_to_chars.setdefault(state, {}).setdefault(nextstate, set()).add(char)
203
for state, nextstates in state_to_chars.iteritems():
210
else:""" % (state, ))
211
if state in self.final_states:
212
result.append(" return True")
214
result.append(" break")
216
for nextstate, chars in nextstates.iteritems():
217
final = nextstate in self.final_states
218
compressed = compress_char_set(chars)
219
if nextstate in above:
220
continue_prefix = "\n" + " " * 16 + "continue"
223
for i, (a, num) in enumerate(compressed):
225
for charord in range(ord(a), ord(a) + num):
228
state = %s%s""" % (elif_prefix, chr(charord), nextstate, continue_prefix))
233
%sif %r <= char <= %r:
234
state = %s%s""" % (elif_prefix, a, chr(ord(a) + num - 1), nextstate, continue_prefix))
238
result.append(" " * 12 + "else:")
239
result.append(" " * 16 + "break")
240
#print state_to_chars.keys()
241
for state in range(self.num_states):
242
if state in state_to_chars:
249
break""" % (state, ))
250
result.append(" break")
251
result.append(" raise LexerError(input, state, i)")
252
result = "\n".join(result)
253
while "\n\n" in result:
254
result = result.replace("\n\n", "\n")
256
exec py.code.Source(result).compile()
259
def make_lexing_code(self):
261
def recognize(runner, i):
268
for (state, char), nextstate in self.transitions.iteritems():
269
state_to_chars.setdefault(state, {}).setdefault(nextstate, set()).add(char)
271
for state, nextstates in state_to_chars.iteritems():
273
result.append(" if state == %s:" % (state, ))
274
if state in self.final_states:
275
result.append(" runner.last_matched_index = i - 1")
276
result.append(" runner.last_matched_state = state")
282
runner.state = %s""" % (state, ))
283
if state in self.final_states:
284
result.append(" return i")
286
result.append(" return ~i")
288
for nextstate, chars in nextstates.iteritems():
289
final = nextstate in self.final_states
290
compressed = compress_char_set(chars)
291
if nextstate in above:
292
continue_prefix = "\n" + " " * 16 + "continue"
295
for i, (a, num) in enumerate(compressed):
297
for charord in range(ord(a), ord(a) + num):
300
state = %s%s""" % (elif_prefix, chr(charord), nextstate, continue_prefix))
305
%sif %r <= char <= %r:
306
state = %s%s""" % (elif_prefix, a, chr(ord(a) + num - 1), nextstate, continue_prefix))
310
result.append(" " * 12 + "else:")
311
result.append(" " * 16 + "break")
312
#print state_to_chars.keys()
313
for state in range(self.num_states):
314
if state in state_to_chars:
316
assert state in self.final_states
318
runner.last_matched_state = state
319
runner.last_matched_index = i - 1
328
result = "\n".join(result)
329
while "\n\n" in result:
330
result = result.replace("\n\n", "\n")
332
exec py.code.Source(result).compile()
335
def get_runner(self):
336
return DFARunner(self)
338
def make_nondeterministic(self):
340
result.num_states = self.num_states
341
result.names = self.names
342
result.start_states = set([0])
343
result.final_states = self.final_states.copy()
344
for (state, input), nextstate in self.transitions.iteritems():
345
result.add_transition(state, nextstate, input)
349
result = ["graph G {"]
350
for i in range(self.num_states):
352
extra = ", color=red"
355
if i in self.final_states:
360
'state%s [label="%s", shape=%s%s];' %
361
(i, repr(self.names[i]).replace("\\", "\\\\"), shape, extra))
363
for (state, input), next_state in self.transitions.iteritems():
364
edges.setdefault((state, next_state), set()).add(input)
365
for (state, next_state), inputs in edges.iteritems():
366
inputs = make_nice_charset_repr(inputs)
367
result.append('state%s -- state%s [label="%s", arrowhead=normal];' %
368
(state, next_state, repr(inputs).replace("\\", "\\\\")))
370
return "\n".join(result)
373
from pypy.translator.tool.pygame import graphclient
374
p = py.test.ensuretemp("automaton").join("temp.dot")
377
plainpath = p.new(ext="plain")
379
py.process.cmdexec("neato -Tplain %s > %s" % (p, plainpath))
380
except py.error.Error:
381
py.process.cmdexec("fdp -Tplain %s > %s" % (p, plainpath))
382
graphclient.display_dot_file(str(plainpath))
384
class DFARunner(object):
385
def __init__(self, automaton):
386
self.automaton = automaton
389
def nextstate(self, char):
390
self.state = self.automaton[self.state, char]
393
def recognize(self, s):
400
return self.state in self.automaton.final_states
406
self.transitions = {}
407
self.start_states = set()
408
self.final_states = set()
409
self.unmergeable_states = set()
411
def add_state(self, name=None, start=False, final=False,
413
new_state = self.num_states
416
name = str(new_state)
417
self.names.append(name)
419
self.start_states.add(new_state)
421
self.final_states.add(new_state)
423
self.unmergeable_states.add(new_state)
426
def add_transition(self, state, next_state, input=None):
427
subtransitions = self.transitions.setdefault(state, {})
428
subtransitions.setdefault(input, set()).add(next_state)
430
def get_next_states(self, state, char):
432
sub_transitions = self.transitions.get(state, {})
433
for e_state in self.epsilon_closure([state]):
434
result.update(self.transitions.get(e_state, {}).get(char, set()))
437
def epsilon_closure(self, states):
442
for next_state in self.transitions.get(state, {}).get(None, set()):
443
if next_state not in result:
444
result.add(next_state)
445
stack.append(next_state)
448
def make_deterministic(self, name_precedence=None):
452
def get_state(states):
453
states = self.epsilon_closure(states)
454
frozenstates = frozenset(states)
455
if frozenstates in set_to_state:
456
return set_to_state[frozenstates]
457
if states == self.start_states:
458
assert not set_to_state
460
filter(None, [state in self.final_states for state in states]))
461
name = ", ".join([self.names[state] for state in states])
462
if name_precedence is not None:
463
name_index = len(name_precedence)
467
if state in self.unmergeable_states:
468
new_name = self.names[state]
469
if name_precedence is not None:
471
index = name_precedence.index(new_name)
474
#print new_name, index, name_precedence
475
if index < name_index:
481
result = set_to_state[frozenstates] = fda.add_state(
482
name, final, unmergeable)
483
stack.append((result, states))
485
startstate = get_state(self.start_states)
487
fdastate, ndastates = stack.pop()
489
for state in ndastates:
490
sub_transitions = self.transitions.get(state, {})
491
for char, next_states in sub_transitions.iteritems():
492
chars_to_states.setdefault(char, set()).update(next_states)
493
for char, states in chars_to_states.iteritems():
496
fda[fdastate, char] = get_state(states)
499
def update(self, other):
501
for i, name in enumerate(other.names):
502
new_state = self.add_state(name)
503
mapping[i] = new_state
504
for state, subtransitions in other.transitions.iteritems():
505
new_state = mapping[state]
506
new_subtransitions = self.transitions.setdefault(new_state, {})
507
for input, next_states in subtransitions.iteritems():
508
next_states = [mapping[i] for i in next_states]
509
new_subtransitions.setdefault(input, set()).update(next_states)
513
from pypy.translator.tool.pygame import graphclient
514
p = py.test.ensuretemp("automaton").join("temp.dot")
517
plainpath = p.new(ext="plain")
520
py.process.cmdexec("neato -Tplain %s > %s" % (p, plainpath))
521
except py.error.Error:
522
py.process.cmdexec("fdp -Tplain %s > %s" % (p, plainpath))
523
except py.error.Error:
525
dot.replace("graph G {", "digraph G {").replace(" -- ", " -> "))
526
py.process.cmdexec("dot -Tplain %s > %s" % (p, plainpath))
527
graphclient.display_dot_file(str(plainpath))
530
result = ["graph G {"]
531
for i in range(self.num_states):
532
if i in self.start_states:
533
extra = ", color=red"
536
if i in self.final_states:
538
extra += ", shape=octagon"
542
'state%s [label="%s", peripheries=%s%s];' %
543
(i, self.names[i], peripheries, extra))
544
for state, sub_transitions in self.transitions.iteritems():
545
for input, next_states in sub_transitions.iteritems():
546
for next_state in next_states:
548
'state%s -- state%s [label="%s", arrowhead=normal];' %
549
(state, next_state, repr(input).replace("\\", "\\\\")))
551
return "\n".join(result)
553
class SetNFARunner(object):
554
def __init__(self, automaton):
555
self.automaton = automaton
557
def next_state(self, char):
559
for state in self.states:
560
nextstates.update(self.automaton.get_next_states(state, char))
563
def recognize(self, s):
564
self.states = self.automaton.start_states.copy()
566
nextstates = self.next_state(char)
569
self.states = nextstates
570
for state in self.states:
571
if state in self.automaton.final_states:
575
class BacktrackingNFARunner(object):
576
def __init__(self, automaton):
577
self.automaton = automaton
579
def recognize(self, s):
580
def recurse(i, state):
582
return state in self.automaton.final_states
583
for next_state in self.automaton.get_next_states(state, s[i]):
584
if recurse(i + 1, next_state):
587
for state in self.automaton.start_states:
588
if recurse(0, state):