1
#Copyright ReportLab Europe Ltd. 2000-2004
2
#see license.txt for license details
3
#history http://www.reportlab.co.uk/cgi-bin/viewcvs.cgi/public/reportlab/trunk/reportlab/tools/docco/t_parse.py
5
Template parsing module inspired by REXX (with thanks to Donn Cave for discussion).
7
Template initialization has the form:
8
T = Template(template_string, wild_card_marker, single_char_marker,
9
x = regex_x, y = regex_y, ...)
11
([match1, match2, ..., matchn], lastindex) = T.PARSE(string)
13
Only the first argument is mandatory.
15
The resultant object efficiently parses strings that match the template_string,
16
giving a list of substrings that correspond to each "directive" of the template.
21
The template may be initialized with a wildcard that matches any string
22
up to the string matching the next directive (which may not be a wild
23
card or single character marker) or the next literal sequence of characters
24
of the template. The character that represents a wildcard is specified
25
by the wild_card_marker parameter, which has no default.
27
For example, using X as the wildcard:
30
>>> T = Template("prefixXinteriorX", "X")
31
>>> T.PARSE("prefix this is before interior and this is after")
32
([' this is before ', ' and this is after'], 48)
33
>>> T = Template("<X>X<X>", "X")
34
>>> T.PARSE('<A HREF="index.html">go to index</A>')
35
(['A HREF="index.html"', 'go to index', '/A'], 36)
37
Obviously the character used to represent the wildcard must be distinct
38
from the characters used to represent literals or other directives.
40
Fixed length character sequences:
41
The template may have a marker character which indicates a fixed
42
length field. All adjacent instances of this marker will be matched
43
by a substring of the same length in the parsed string. For example:
45
>>> T = Template("NNN-NN-NNNN", single_char_marker="N")
46
>>> T.PARSE("1-2-34-5-12")
47
(['1-2', '34', '5-12'], 11)
48
>>> T.PARSE("111-22-3333")
49
(['111', '22', '3333'], 11)
50
>>> T.PARSE("1111-22-3333")
51
ValueError: literal not found at (3, '-')
53
A template may have multiple fixed length markers, which allows fixed
54
length fields to be adjacent, but recognized separately. For example:
56
>>> T = Template("MMDDYYX", "X", "MDY")
57
>>> T.PARSE("112489 Somebody's birthday!")
58
(['11', '24', '89', " Somebody's birthday!"], 27)
60
Regular expression markers:
61
The template may have markers associated with regular expressions.
62
The regular expressions may be given either as pattern strings or as compiled pattern objects.
64
>>> T = Template("v: s i", v=id, s=str, i=int)
65
>>> T.PARSE("this_is_an_identifier: 'a string' 12344")
66
(['this_is_an_identifier', "'a string'", '12344'], 39)
68
Here id, str, and int are regular expression conveniences provided by this module.
71
Directive markers may be mixed and matched, except that wildcards cannot precede
72
wildcards or single character markers.
74
>>> T = Template("ssnum: NNN-NN-NNNN, fn=X, ln=X, age=I, quote=Q", "X", "N", I=int, Q=str)
75
>>> T.PARSE("ssnum: 123-45-6789, fn=Aaron, ln=Watters, age=13, quote='do be do be do'")
76
(['123', '45', '6789', 'Aaron', 'Watters', '13', "'do be do be do'"], 72)
82
from types import StringType
83
from string import find
88
# EG: T = Template("(NNN)NNN-NNNN X X", "X", "N")
89
# ([area, exch, ext, fn, ln], index) = T.PARSE("(908)949-2726 Aaron Watters")
95
wild_card_marker=None,
96
single_char_marker=None,
97
**marker_to_regex_dict):
98
self.template = template
99
self.wild_card = wild_card_marker
100
self.char = single_char_marker
101
# determine the set of markers for this template
102
markers = marker_to_regex_dict.keys()
104
markers.append(wild_card_marker)
105
if single_char_marker:
106
for ch in single_char_marker: # allow multiple scm's
108
self.char = single_char_primary = single_char_marker[0]
109
self.markers = markers
112
raise ValueError, "Marks must be single characters: "+`mark`
113
# compile the regular expressions if needed
114
self.marker_dict = marker_dict = {}
115
for (mark, rgex) in marker_to_regex_dict.items():
116
if type(rgex) == StringType:
117
rgex = re.compile(rgex)
118
marker_dict[mark] = rgex
119
# determine the parse sequence
125
# count the number of directives encountered
129
thischar = template[index]
131
if thischar == wild_card_marker:
132
if lastchar == wild_card_marker:
133
raise ValueError, "two wild cards in sequence is not allowed"
134
parse_seq.append( (wild_card_marker, None) )
136
ndirectives = ndirectives+1
137
# is it a sequence of single character markers?
138
elif single_char_marker and thischar in single_char_marker:
139
if lastchar == wild_card_marker:
140
raise ValueError, "wild card cannot precede single char marker"
141
while index<last and template[index] == thischar:
143
parse_seq.append( (single_char_primary, index-start) )
144
ndirectives = ndirectives+1
145
# is it a literal sequence?
146
elif not thischar in markers:
147
while index<last and not template[index] in markers:
149
parse_seq.append( (None, template[start:index]) )
150
# otherwise it must be a re marker
152
rgex = marker_dict[thischar]
153
parse_seq.append( (thischar, rgex) )
154
ndirectives = ndirectives+1
156
lastchar = template[index-1]
157
self.parse_seq = parse_seq
158
self.ndirectives = ndirectives
160
def PARSE(self, str, start=0):
    """Match ``str`` against the template, beginning at offset ``start``.

    Walks ``self.parse_seq`` from left to right.  Each entry is an
    ``(indicator, data)`` pair:

    * ``(None, literal)``       -- ``literal`` must occur exactly at the
      current position.
    * ``(self.wild_card, None)`` -- consumes text up to the next literal or
      regex match, or to the end of ``str`` when it is the last entry.
    * ``(self.char, length)``   -- consumes exactly ``length`` characters.
    * ``(marker, pattern)``     -- ``pattern`` (a compiled ``re`` pattern)
      must match at the current position.

    Returns ``(matches, index)`` where ``matches`` holds one substring per
    directive, in template order, and ``index`` is the offset in ``str``
    just past the last consumed character.

    Raises ``ValueError`` when a literal, a wildcard terminator or a regex
    cannot be matched, and ``SystemError`` if the number of directives
    filled in disagrees with ``self.ndirectives``.
    """
    ndirectives = self.ndirectives
    wild_card = self.wild_card
    single_char = self.char
    parse_seq = self.parse_seq
    final_index = len(parse_seq) - 1
    # one slot per directive; literals do not produce a result entry
    result = [None] * ndirectives
    current_directive_index = 0
    currentindex = start
    for parse_index, (indicator, data) in enumerate(parse_seq):
        # literal: must sit exactly at the current position
        if indicator is None:
            if str.find(data, currentindex) != currentindex:
                raise ValueError(
                    "literal not found at " + repr((currentindex, data)))
            currentindex = currentindex + len(data)
            continue
        # anything else is a directive; work out where its text ends
        if indicator == wild_card:
            if parse_index == final_index:
                # a trailing wildcard swallows the rest of the string
                last = len(str)
            else:
                # the wildcard ends where the following literal/regex begins
                (nextindicator, nextdata) = parse_seq[parse_index + 1]
                if nextindicator is None:
                    last = str.find(nextdata, currentindex)
                    if last < currentindex:
                        raise ValueError(
                            "couldn't terminate wild with lit "
                            + repr(currentindex))
                else:
                    # re patterns return a match object (or None), not an
                    # offset, so the start position is taken from the match
                    found = nextdata.search(str, currentindex)
                    if found is None:
                        raise ValueError(
                            "couldn't terminate wild with re "
                            + repr(currentindex))
                    last = found.start()
        elif indicator == single_char:
            # data is the number of characters to consume
            last = currentindex + data
        else:
            # every other directive is a compiled regular expression that
            # has to match right here; the match object gives the end offset
            matched = data.match(str, currentindex)
            if matched is None:
                raise ValueError(
                    "couldn't match re at " + repr(currentindex))
            last = matched.end()
        result[current_directive_index] = str[currentindex:last]
        current_directive_index = current_directive_index + 1
        currentindex = last
    # sanity check: every directive slot must have been filled
    if current_directive_index != ndirectives:
        raise SystemError("not enough directives found?")
    return (result, currentindex)
218
# some useful regular expressions
220
"["+string.letters+"]["+string.letters+string.digits+"_]*"
221
STRINGLITREGEX = "'[^\n']*'"
222
SIMPLEINTREGEX = "["+string.digits+"]+"
223
id = re.compile(USERNAMEREGEX)
224
str = re.compile(STRINGLITREGEX)
225
int = re.compile(SIMPLEINTREGEX)
230
T = Template("(NNN)NNN-NNNN X X", "X", "N")
231
print T.PARSE("(908)949-2726 Aaron Watters")
233
T1 = Template("s --> s blah", s=str)
234
s = "' <-- a string --> ' --> 'blah blah another string blah' blah"
237
T2 = Template("s --> NNNiX", "X", "N", s=str, i=int)
238
print T2.PARSE("'A STRING' --> 15964653alpha beta gamma")
240
T3 = Template("XsXi", "X", "N", s=str, i=int)
241
print T3.PARSE("prefix'string'interior1234junk not parsed")
243
T4 = Template("MMDDYYX", "X", "MDY")
244
print T4.PARSE("122961 Somebody's birthday!")
247
if __name__=="__main__": test()
b'\\ No newline at end of file'