~ubuntu-branches/ubuntu/karmic/pypy/karmic

« back to all changes in this revision

Viewing changes to pypy/rlib/parsing/deterministic.py

  • Committer: Bazaar Package Importer
  • Author(s): Alexandre Fayolle
  • Date: 2007-04-13 09:33:09 UTC
  • Revision ID: james.westby@ubuntu.com-20070413093309-yoojh4jcoocu2krz
Tags: upstream-1.0.0
ImportĀ upstreamĀ versionĀ 1.0.0

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
import py
 
2
 
 
3
try:
 
4
    set
 
5
except NameError:
 
6
    from sets import Set as set, FrozenSet as frozenset
 
7
 
 
8
def compress_char_set(chars):
 
9
    chars = list(chars)
 
10
    chars.sort()
 
11
    result = [chars[0], 1]
 
12
    for a, b in zip(chars[:-1], chars[1:]):
 
13
        if ord(a) == ord(b) - 1:
 
14
            result.append(result.pop() + 1)
 
15
        else:
 
16
            result.append(b)
 
17
            result.append(1)
 
18
    real_result = []
 
19
    for i in range(len(result) // 2):
 
20
        real_result.append((result[i * 2], result[i * 2 + 1]))
 
21
    return real_result
 
22
 
 
23
class LexerError(Exception):
 
24
    def __init__(self, input, state, source_pos):
 
25
        self.input = input
 
26
        self.state = state
 
27
        self.source_pos = source_pos
 
28
        self.args = (input, state, source_pos)
 
29
 
 
30
    def nice_error_message(self, filename="<unknown>"):
 
31
        result = ["  File %s, line %s" % (filename, self.source_pos.lineno)]
 
32
        result.append(self.input.split("\n")[self.source_pos.lineno])
 
33
        result.append(" " * self.source_pos.columnno + "^")
 
34
        result.append("LexerError")
 
35
        return "\n".join(result)
 
36
 
 
37
def make_nice_charset_repr(chars):
 
38
    charranges = compress_char_set(chars)
 
39
    result = []
 
40
    for a, num in charranges:
 
41
        if num == 1:
 
42
            result.append(a)
 
43
            if a == "-":
 
44
                result.append("\\-")
 
45
        else:
 
46
            result.append("%s-%s" % (repr(a)[1:-1], repr(chr(ord(a) + num - 1))[1:-1]))
 
47
    return "".join(result)
 
48
 
 
49
class DFA(object):
 
50
    def __init__(self, num_states=0, transitions=None, final_states=None,
 
51
                 unmergeable_states=None, names=None):
 
52
        self.num_states = 0
 
53
        if transitions is None:
 
54
            transitions = {}
 
55
        if final_states is None:
 
56
            final_states = set()
 
57
        if unmergeable_states is None:
 
58
            unmergeable_states = set()
 
59
        if names is None:
 
60
            names = []
 
61
        self.transitions = transitions
 
62
        self.final_states = final_states
 
63
        self.unmergeable_states = unmergeable_states
 
64
        self.names = names
 
65
 
 
66
    def __repr__(self):
 
67
        from pprint import pformat
 
68
        return "DFA%s" % (pformat((
 
69
            self.num_states, self.transitions, self.final_states,
 
70
            self.unmergeable_states, self.names)), )
 
71
 
 
72
    def add_state(self, name=None, final=False, unmergeable=False):
 
73
        state = self.num_states
 
74
        self.num_states += 1
 
75
        if final:
 
76
            self.final_states.add(state)
 
77
        if unmergeable:
 
78
            self.unmergeable_states.add(state)
 
79
        if name is None:
 
80
            name = str(state)
 
81
        self.names.append(name)
 
82
        return self.num_states - 1
 
83
 
 
84
    def __setitem__(self, (state, input), next_state):
 
85
        self.transitions[state, input] = next_state
 
86
 
 
87
    def __getitem__(self, (state, input)):
 
88
        return self.transitions[state, input]
 
89
 
 
90
    def __contains__(self, (state, input)):
 
91
        return (state, input) in self.transitions
 
92
 
 
93
    def get_all_chars(self):
 
94
        all_chars = set()
 
95
        for state, input in self.transitions:
 
96
            all_chars.add(input)
 
97
        return all_chars
 
98
 
 
99
    def optimize(self):
 
100
        all_chars = self.get_all_chars()
 
101
        # find mergeable
 
102
        non_final = frozenset(set(range(self.num_states)) - self.final_states -
 
103
                              self.unmergeable_states)
 
104
        final = frozenset(self.final_states - self.unmergeable_states)
 
105
        state_to_set = {}
 
106
        equivalence_sets = set()
 
107
        if non_final:
 
108
            equivalence_sets.add(non_final)
 
109
        if final:
 
110
            equivalence_sets.add(final)
 
111
        for state in range(self.num_states):
 
112
            if state in final:
 
113
                state_to_set[state] = final
 
114
            elif state in self.unmergeable_states:
 
115
                singleset = frozenset([state])
 
116
                state_to_set[state] = singleset
 
117
                equivalence_sets.add(singleset)
 
118
            else:
 
119
                state_to_set[state] = non_final
 
120
        assert len(equivalence_sets) <= self.num_states
 
121
        while len(equivalence_sets) < self.num_states:
 
122
            new_equivalence_sets = set()
 
123
            changed = False
 
124
            for equivalent in equivalence_sets:
 
125
                #print "checking", equivalent
 
126
                for char in all_chars:
 
127
                    targets = {}
 
128
                    for state in equivalent:
 
129
                        if (state, char) in self:
 
130
                            nextstate = self[state, char]
 
131
                            target = frozenset(state_to_set[nextstate])
 
132
                        else:
 
133
                            nextstate = None
 
134
                            target = None
 
135
                        targets.setdefault(target, set()).add(state)
 
136
                    if len(targets) != 1:
 
137
                        #print "\nsplitting %s with %r\ninto %s" % (equivalent, char, targets.values())
 
138
                        for target, newequivalent in targets.iteritems():
 
139
                            #print "   ", newequivalent
 
140
                            newequivalent = frozenset(newequivalent)
 
141
                            new_equivalence_sets.add(newequivalent)
 
142
                            for state in newequivalent:
 
143
                                state_to_set[state] = newequivalent
 
144
                            #print "   ", new_equivalence_sets
 
145
                        changed = True
 
146
                        break
 
147
                else:
 
148
                    new_equivalence_sets.add(equivalent)
 
149
            if not changed:
 
150
                break
 
151
            #print "end", equivalence_sets
 
152
            #print new_equivalence_sets
 
153
            equivalence_sets = new_equivalence_sets
 
154
        if len(equivalence_sets) == self.num_states:
 
155
            return False
 
156
        #print equivalence_sets
 
157
        # merging the states
 
158
        newnames = []
 
159
        newtransitions = {}
 
160
        newnum_states = len(equivalence_sets)
 
161
        newstates = list(equivalence_sets)
 
162
        newstate_to_index = {}
 
163
        newfinal_states = set()
 
164
        newunmergeable_states = set()
 
165
        for i, newstate in enumerate(newstates):
 
166
            newstate_to_index[newstate] = i
 
167
        # bring startstate into first slot
 
168
        startstateindex = newstate_to_index[state_to_set[0]]
 
169
        newstates[0], newstates[startstateindex] = newstates[startstateindex], newstates[0]
 
170
        newstate_to_index[newstates[0]] = 0
 
171
        newstate_to_index[newstates[startstateindex]] = startstateindex
 
172
        for i, newstate in enumerate(newstates):
 
173
            name = ", ".join([self.names[s] for s in newstate])
 
174
            for state in newstate:
 
175
                if state in self.unmergeable_states:
 
176
                    newunmergeable_states.add(i)
 
177
                    name = self.names[state]
 
178
                if state in self.final_states:
 
179
                    newfinal_states.add(i)
 
180
            newnames.append(name)
 
181
        for (state, char), nextstate in self.transitions.iteritems():
 
182
            newstate = newstate_to_index[state_to_set[state]]
 
183
            newnextstate = newstate_to_index[state_to_set[nextstate]]
 
184
            newtransitions[newstate, char] = newnextstate
 
185
        self.names = newnames
 
186
        self.transitions = newtransitions
 
187
        self.num_states = newnum_states
 
188
        self.final_states = newfinal_states
 
189
        self.unmergeable_states = newunmergeable_states
 
190
        return True
 
191
 
 
192
    def make_code(self):
 
193
        result = ["""
 
194
def recognize(input):
 
195
    i = 0
 
196
    state = 0
 
197
    while 1:
 
198
"""]
 
199
        state_to_chars = {}
 
200
        for (state, char), nextstate in self.transitions.iteritems():
 
201
            state_to_chars.setdefault(state, {}).setdefault(nextstate, set()).add(char)
 
202
        above = set()
 
203
        for state, nextstates in state_to_chars.iteritems():
 
204
            above.add(state)
 
205
            result.append("""\
 
206
        if state == %s:
 
207
            if i < len(input):
 
208
                char = input[i]
 
209
                i += 1
 
210
            else:""" % (state, ))
 
211
            if state in self.final_states:
 
212
                result.append("                return True")
 
213
            else:
 
214
                result.append("                break")
 
215
            elif_prefix = ""
 
216
            for nextstate, chars in nextstates.iteritems():
 
217
                final = nextstate in self.final_states
 
218
                compressed = compress_char_set(chars)
 
219
                if nextstate in above:
 
220
                    continue_prefix = "\n" + " " * 16 + "continue"
 
221
                else:
 
222
                    continue_prefix = ""
 
223
                for i, (a, num) in enumerate(compressed):
 
224
                    if num < 5:
 
225
                        for charord in range(ord(a), ord(a) + num):
 
226
                            result.append("""
 
227
            %sif char == %r:
 
228
                state = %s%s""" % (elif_prefix, chr(charord), nextstate, continue_prefix))
 
229
                            if not elif_prefix:
 
230
                                elif_prefix = "el"
 
231
                    else:
 
232
                        result.append("""
 
233
            %sif %r <= char <= %r:
 
234
                state = %s%s""" % (elif_prefix, a, chr(ord(a) + num - 1), nextstate, continue_prefix))
 
235
                        if not elif_prefix:
 
236
                            elif_prefix = "el"
 
237
            
 
238
            result.append(" " * 12 + "else:")
 
239
            result.append(" " * 16 + "break")
 
240
        #print state_to_chars.keys()
 
241
        for state in range(self.num_states):
 
242
            if state in state_to_chars:
 
243
                continue
 
244
            result.append("""\
 
245
        if state == %s:
 
246
            if i == len(input):
 
247
                return True
 
248
            else:
 
249
                break""" % (state, ))
 
250
        result.append("        break")
 
251
        result.append("    raise LexerError(input, state, i)")
 
252
        result = "\n".join(result)
 
253
        while "\n\n" in result:
 
254
            result = result.replace("\n\n", "\n")
 
255
        #print result
 
256
        exec py.code.Source(result).compile()
 
257
        return recognize
 
258
        
 
259
    def make_lexing_code(self):
 
260
        result = ["""
 
261
def recognize(runner, i):
 
262
    assert i >= 0
 
263
    input = runner.text
 
264
    state = 0
 
265
    while 1:
 
266
"""]
 
267
        state_to_chars = {}
 
268
        for (state, char), nextstate in self.transitions.iteritems():
 
269
            state_to_chars.setdefault(state, {}).setdefault(nextstate, set()).add(char)
 
270
        above = set()
 
271
        for state, nextstates in state_to_chars.iteritems():
 
272
            above.add(state)
 
273
            result.append("        if state == %s:" % (state, ))
 
274
            if state in self.final_states:
 
275
                result.append("            runner.last_matched_index = i - 1")
 
276
                result.append("            runner.last_matched_state = state")
 
277
            result.append("""\
 
278
            if i < len(input):
 
279
                char = input[i]
 
280
                i += 1
 
281
            else:
 
282
                runner.state = %s""" % (state, ))
 
283
            if state in self.final_states:
 
284
                result.append("                return i")
 
285
            else:
 
286
                result.append("                return ~i")
 
287
            elif_prefix = ""
 
288
            for nextstate, chars in nextstates.iteritems():
 
289
                final = nextstate in self.final_states
 
290
                compressed = compress_char_set(chars)
 
291
                if nextstate in above:
 
292
                    continue_prefix = "\n" + " " * 16 + "continue"
 
293
                else:
 
294
                    continue_prefix = ""
 
295
                for i, (a, num) in enumerate(compressed):
 
296
                    if num < 3:
 
297
                        for charord in range(ord(a), ord(a) + num):
 
298
                            result.append("""
 
299
            %sif char == %r:
 
300
                state = %s%s""" % (elif_prefix, chr(charord), nextstate, continue_prefix))
 
301
                            if not elif_prefix:
 
302
                                elif_prefix = "el"
 
303
                    else:
 
304
                        result.append("""
 
305
            %sif %r <= char <= %r:
 
306
                state = %s%s""" % (elif_prefix, a, chr(ord(a) + num - 1), nextstate, continue_prefix))
 
307
                        if not elif_prefix:
 
308
                            elif_prefix = "el"
 
309
            
 
310
            result.append(" " * 12 + "else:")
 
311
            result.append(" " * 16 + "break")
 
312
        #print state_to_chars.keys()
 
313
        for state in range(self.num_states):
 
314
            if state in state_to_chars:
 
315
                continue
 
316
            assert state in self.final_states
 
317
        result.append("""\
 
318
        runner.last_matched_state = state
 
319
        runner.last_matched_index = i - 1
 
320
        runner.state = state
 
321
        if i == len(input):
 
322
            return i
 
323
        else:
 
324
            return ~i
 
325
        break
 
326
    runner.state = state
 
327
    return ~i""")
 
328
        result = "\n".join(result)
 
329
        while "\n\n" in result:
 
330
            result = result.replace("\n\n", "\n")
 
331
        #print result
 
332
        exec py.code.Source(result).compile()
 
333
        return recognize
 
334
 
 
335
    def get_runner(self):
 
336
        return DFARunner(self)
 
337
 
 
338
    def make_nondeterministic(self):
 
339
        result = NFA()
 
340
        result.num_states = self.num_states
 
341
        result.names = self.names
 
342
        result.start_states = set([0])
 
343
        result.final_states = self.final_states.copy()
 
344
        for (state, input), nextstate in self.transitions.iteritems():
 
345
            result.add_transition(state, nextstate, input)
 
346
        return result
 
347
 
 
348
    def dot(self):
 
349
        result = ["graph G {"]
 
350
        for i in range(self.num_states):
 
351
            if i == 0:
 
352
                extra = ", color=red"
 
353
            else:
 
354
                extra = ""
 
355
            if i in self.final_states:
 
356
                shape = "octagon"
 
357
            else:
 
358
                shape = "box"
 
359
            result.append(
 
360
                'state%s [label="%s", shape=%s%s];' %
 
361
                    (i, repr(self.names[i]).replace("\\", "\\\\"), shape, extra))
 
362
        edges = {}
 
363
        for (state, input), next_state in self.transitions.iteritems():
 
364
            edges.setdefault((state, next_state), set()).add(input)
 
365
        for (state, next_state), inputs in edges.iteritems():
 
366
            inputs = make_nice_charset_repr(inputs)
 
367
            result.append('state%s -- state%s [label="%s", arrowhead=normal];' %
 
368
                          (state, next_state, repr(inputs).replace("\\", "\\\\")))
 
369
        result.append("}")
 
370
        return "\n".join(result)
 
371
 
 
372
    def view(self):
 
373
        from pypy.translator.tool.pygame import graphclient
 
374
        p = py.test.ensuretemp("automaton").join("temp.dot")
 
375
        dot = self.dot()
 
376
        p.write(dot)
 
377
        plainpath = p.new(ext="plain")
 
378
        try:
 
379
            py.process.cmdexec("neato -Tplain %s > %s" % (p, plainpath))
 
380
        except py.error.Error:
 
381
            py.process.cmdexec("fdp -Tplain %s > %s" % (p, plainpath))
 
382
        graphclient.display_dot_file(str(plainpath))
 
383
 
 
384
class DFARunner(object):
 
385
    def __init__(self, automaton):
 
386
        self.automaton = automaton
 
387
        self.state = 0
 
388
 
 
389
    def nextstate(self, char):
 
390
        self.state = self.automaton[self.state, char]
 
391
        return self.state
 
392
        
 
393
    def recognize(self, s):
 
394
        self.state = 0
 
395
        try:
 
396
            for char in s:
 
397
                self.nextstate(char)
 
398
        except KeyError:
 
399
            return False
 
400
        return self.state in self.automaton.final_states
 
401
 
 
402
class NFA(object):
 
403
    def __init__(self):
 
404
        self.num_states = 0
 
405
        self.names = []
 
406
        self.transitions = {}
 
407
        self.start_states = set()
 
408
        self.final_states = set()
 
409
        self.unmergeable_states = set()
 
410
 
 
411
    def add_state(self, name=None, start=False, final=False,
 
412
                  unmergeable=False):
 
413
        new_state = self.num_states
 
414
        self.num_states += 1
 
415
        if name is None:
 
416
            name = str(new_state)
 
417
        self.names.append(name)
 
418
        if start:
 
419
            self.start_states.add(new_state)
 
420
        if final:
 
421
            self.final_states.add(new_state)
 
422
        if unmergeable:
 
423
            self.unmergeable_states.add(new_state)
 
424
        return new_state
 
425
 
 
426
    def add_transition(self, state, next_state, input=None):
 
427
        subtransitions = self.transitions.setdefault(state, {})
 
428
        subtransitions.setdefault(input, set()).add(next_state)
 
429
 
 
430
    def get_next_states(self, state, char):
 
431
        result = set()
 
432
        sub_transitions = self.transitions.get(state, {})
 
433
        for e_state in self.epsilon_closure([state]):
 
434
            result.update(self.transitions.get(e_state, {}).get(char, set()))
 
435
        return result
 
436
 
 
437
    def epsilon_closure(self, states):
 
438
        result = set(states)
 
439
        stack = list(states)
 
440
        while stack:
 
441
            state = stack.pop()
 
442
            for next_state in self.transitions.get(state, {}).get(None, set()):
 
443
                if next_state not in result:
 
444
                    result.add(next_state)
 
445
                    stack.append(next_state)
 
446
        return result
 
447
 
 
448
    def make_deterministic(self, name_precedence=None):
 
449
        fda = DFA()
 
450
        set_to_state = {}
 
451
        stack = []
 
452
        def get_state(states):
 
453
            states = self.epsilon_closure(states)
 
454
            frozenstates = frozenset(states)
 
455
            if frozenstates in set_to_state:
 
456
                return set_to_state[frozenstates]
 
457
            if states == self.start_states:
 
458
                assert not set_to_state
 
459
            final = bool(
 
460
                filter(None, [state in self.final_states for state in states]))
 
461
            name = ", ".join([self.names[state] for state in states])
 
462
            if name_precedence is not None:
 
463
                name_index = len(name_precedence)
 
464
            unmergeable = False
 
465
            for state in states:
 
466
                #print state
 
467
                if state in self.unmergeable_states:
 
468
                    new_name = self.names[state]
 
469
                    if name_precedence is not None:
 
470
                        try:
 
471
                            index = name_precedence.index(new_name)
 
472
                        except ValueError:
 
473
                            index = name_index
 
474
                        #print new_name, index, name_precedence
 
475
                        if index < name_index:
 
476
                            name_index = index
 
477
                            name = new_name
 
478
                    else:
 
479
                        name = new_name
 
480
                    unmergeable = True
 
481
            result = set_to_state[frozenstates] = fda.add_state(
 
482
                name, final, unmergeable)
 
483
            stack.append((result, states))
 
484
            return result
 
485
        startstate = get_state(self.start_states)
 
486
        while stack:
 
487
            fdastate, ndastates = stack.pop()
 
488
            chars_to_states = {}
 
489
            for state in ndastates:
 
490
                sub_transitions = self.transitions.get(state, {})
 
491
                for char, next_states in sub_transitions.iteritems():
 
492
                    chars_to_states.setdefault(char, set()).update(next_states)
 
493
            for char, states in chars_to_states.iteritems():
 
494
                if char is None:
 
495
                    continue
 
496
                fda[fdastate, char] = get_state(states)
 
497
        return fda
 
498
 
 
499
    def update(self, other):
 
500
        mapping = {}
 
501
        for i, name in enumerate(other.names):
 
502
            new_state = self.add_state(name)
 
503
            mapping[i] = new_state
 
504
        for state, subtransitions in other.transitions.iteritems():
 
505
            new_state = mapping[state]
 
506
            new_subtransitions = self.transitions.setdefault(new_state, {})
 
507
            for input, next_states in subtransitions.iteritems():
 
508
                next_states = [mapping[i] for i in next_states]
 
509
                new_subtransitions.setdefault(input, set()).update(next_states)
 
510
        return mapping
 
511
 
 
512
    def view(self):
 
513
        from pypy.translator.tool.pygame import graphclient
 
514
        p = py.test.ensuretemp("automaton").join("temp.dot")
 
515
        dot = self.dot()
 
516
        p.write(dot)
 
517
        plainpath = p.new(ext="plain")
 
518
        try:
 
519
            try:
 
520
                py.process.cmdexec("neato -Tplain %s > %s" % (p, plainpath))
 
521
            except py.error.Error:
 
522
                py.process.cmdexec("fdp -Tplain %s > %s" % (p, plainpath))
 
523
        except py.error.Error:
 
524
            p.write(
 
525
                dot.replace("graph G {", "digraph G {").replace(" -- ", " -> "))
 
526
            py.process.cmdexec("dot -Tplain %s > %s" % (p, plainpath))
 
527
        graphclient.display_dot_file(str(plainpath))
 
528
 
 
529
    def dot(self):
 
530
        result = ["graph G {"]
 
531
        for i in range(self.num_states):
 
532
            if i in self.start_states:
 
533
                extra = ", color=red"
 
534
            else:
 
535
                extra = ""
 
536
            if i in self.final_states:
 
537
                peripheries = 2
 
538
                extra += ", shape=octagon"
 
539
            else:
 
540
                peripheries = 1
 
541
            result.append(
 
542
                'state%s [label="%s", peripheries=%s%s];' %
 
543
                    (i, self.names[i], peripheries, extra))
 
544
        for state, sub_transitions in self.transitions.iteritems():
 
545
            for input, next_states in sub_transitions.iteritems():
 
546
                for next_state in next_states:
 
547
                    result.append(
 
548
                        'state%s -- state%s [label="%s", arrowhead=normal];' %
 
549
                            (state, next_state, repr(input).replace("\\", "\\\\")))
 
550
        result.append("}")
 
551
        return "\n".join(result)
 
552
 
 
553
class SetNFARunner(object):
 
554
    def __init__(self, automaton):
 
555
        self.automaton = automaton
 
556
    
 
557
    def next_state(self, char):
 
558
        nextstates = set()
 
559
        for state in self.states:
 
560
            nextstates.update(self.automaton.get_next_states(state, char))
 
561
        return nextstates
 
562
 
 
563
    def recognize(self, s):
 
564
        self.states = self.automaton.start_states.copy()
 
565
        for char in s:
 
566
            nextstates = self.next_state(char)
 
567
            if not nextstates:
 
568
                return False
 
569
            self.states = nextstates
 
570
        for state in self.states:
 
571
            if state in self.automaton.final_states:
 
572
                return True
 
573
        return False
 
574
 
 
575
class BacktrackingNFARunner(object):
 
576
    def __init__(self, automaton):
 
577
        self.automaton = automaton
 
578
   
 
579
    def recognize(self, s):
 
580
        def recurse(i, state):
 
581
            if i == len(s):
 
582
                return state in self.automaton.final_states
 
583
            for next_state in self.automaton.get_next_states(state, s[i]):
 
584
                if recurse(i + 1, next_state):
 
585
                    return True
 
586
            return False
 
587
        for state in self.automaton.start_states:
 
588
            if recurse(0, state):
 
589
                return True
 
590
        return False