"""
    vellum parser
    ~~~~~~~~~~~~~

    this is a simple re.Scanner-based recursive descent parser

    :license: GNU GPL 2 or later
    :copyright: 2009 by Ronny Pfannschmidt

    current issues
    --------------

    * parse errors are reported through assertions instead of proper exceptions
    * unbalanced expressions cause StopIteration to pass through
    * the input parser silently ignores unbalanced expressions in file sections
"""

import re
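
# Rough sketch of the surface syntax the scanner below accepts (illustrative,
# inferred from the token table; real vellum files may differ in detail):
#
#   build [
#       > make all
#       $ echo done
#   ]
#   options (
#       verbose yes
#       jobs 4
#       name 'vellum'
#   )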

# helper types

class ODict(dict):
    """dummy type for saving the order of dictmaker items 
    for later usage in ui's"""
    def __init__(self):
        dict.__init__(self)
        self.order = []

    def add(self, name, expr):
        self[name] = expr
        self.order.append(name)


class ShMarked(str):
    """A shell command string tagged with the marker character
    (>, $ or |) that introduced it; the marker is stored as .flavor."""
    def __new__(cls, data, mark):
        result = str.__new__(cls, data)
        result.flavor = mark
        return result

    # convenience for tools: shell strings expose the same name/expr
    # interface as Reference, so both can be treated the same way
    name = 'sh'

    @property
    def expr(self):
        return self
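
    # e.g. ShMarked(' make all\n', '>') compares and prints like the plain
    # command text while exposing name == 'sh', expr == itself and
    # flavor == '>' (the values here are purely illustrative)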


class Reference(object):
    def __init__(self, name, expr):
        self.name = name
        self.expr = expr
    def __repr__(self):
        return "%s -> %r"%(self.name, self.expr)


# tokenizer

def t(name):
    """Build a scanner callback that tags each match with `name`,
    the matched text and the column where the match starts."""
    def _token(scanner, text):
        return name, text, scanner.match.start()
    return _token

scanner = re.Scanner([
    (r'[\r\n\ \t]+', None),             # whitespace is skipped
    (r'\#.*\n?', None),                 # as are "#" comments
    (r'yes\b', t('true')),
    (r'no\b', t('false')),
    (r'[a-zA-Z][\w.]+', t('name')),
    (r'\b[a-zA-Z]\b', t('shortname')),
    (r'\.[0-9]+|[0-9]+(\.[0-9\.]+)?', t('number')),
    (r"'[^\n']*'", t('string')),
    (r'"[^\n\"]*"', t('string')),
    (r'\(', t('dict_start')),
    (r'\)', t('dict_end')),
    (r'\[', t('list_start')),
    (r'\]', t('list_end')),
    (r'[>$|][^\r\n]+\n', t('shell')),   # whole shell command lines, e.g. "> make all"
])

def tokenize(line):
    return scanner.scan(line)[0]
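
# Illustrative example of the (type, text, column) triples tokenize()
# produces, assuming the scanner table above:
#
#   tokenize("foo [ yes 1.5 ]")
#   -> [('name', 'foo', 0), ('list_start', '[', 4),
#       ('true', 'yes', 6), ('number', '1.5', 10), ('list_end', ']', 14)]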


def tokenize_lines(lines):
    for index, line in enumerate(lines):
        for token_type, token, start in tokenize(line):
            yield token_type, token, index + 1, start  # 1-based line numbers

# parser

def assert_read(iter, *types):
    token_type, token, line, pos = iter.next()
    #XXX: raise a proper parse error instead of an AssertionError
    assert token_type in types, "%s not in %s, line %s pos %d"%(token_type, types, line, pos)
    return token_type, token, line, pos

def parse_reference(iter):
    token_type, token, line, pos = assert_read(iter, 'name')
    return Reference(token, parse_expr(iter))


def parse_list(iter):
    data = []
    while True:
        next = parse_expr(iter, list)
        if next is None:
            break
        data.append(next)

    return data

def parse_dict(iter):
    data = ODict()
    while True:
        token_type, token, line, pos = assert_read(iter, 'name', 'dict_end')
        if token_type == 'dict_end':
            break
        data.add(token, parse_expr(iter))
    return data



def parse_expr(iter, nesting=None):
    #XXX: ugly
    token_type, token, line, pos = assert_read(iter,
                                               'shortname',
                                               'true', 'false',
                                               'string', 'number',
                                               'shell', 'name',
                                               'list_start',
                                               'list_end',
                                               'dict_start',
                                              )
    if token_type=='shortname':
        #XXX: better errors
        raise SyntaxError("Name '%s' too short - line %d pos %d"%(token, line, pos))
    elif token_type=='true':
        return True
    elif token_type=='false':
        return False
    elif token_type=='number':
        if '.' in token:
            return float(token)
        else:
            return int(token)
    elif token_type=='string':
        return token[1:-1].decode('string_escape')
    elif token_type=='shell':
        return ShMarked(token[1:], token[0]) #XXX: better type
    elif token_type=='name':
        return Reference(token, parse_expr(iter))
    elif token_type=='list_start':
        return parse_list(iter)
    elif token_type == 'dict_start':
        return parse_dict(iter)
    elif token_type == 'list_end':
        # falling through returns None, which signals end-of-list to parse_list
        assert nesting is list


def parse_input(iter):
    """Parse a whole file: a sequence of named top-level sections,
    each of which is a list or a dict."""
    data = ODict()
    while True:
        try:
            token_type, name, line, pos = assert_read(iter, 'name')
            token_type, token, line, pos = assert_read(iter, 'list_start', 'dict_start')
            if token_type=='list_start':
                expr = parse_list(iter)
            else:
                expr = parse_dict(iter)
            data.add(name, expr)
        except StopIteration:
            # XXX: this ignores unbalanced expressions at the end
            return data

def parse(type, content):
    """Tokenize `content` and feed the token stream to the parse_*
    function selected by `type` (e.g. 'expr' or 'input')."""
    lines = content.splitlines(True)
    token_stream = tokenize_lines(lines)
    token_list = list(token_stream)
    parser = globals()['parse_'+type]
    return parser(iter(token_list))
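

# Minimal usage sketch (assumes the parse_* functions above; the inputs are
# made up for illustration):
if __name__ == '__main__':
    print parse('expr', "[ yes no 1 ]")            # -> [True, False, 1]
    print parse('input', "build [\n > make\n]\n")  # -> {'build': [' make\n']}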