1
# A parser for XML, using the derived class as static DTD.
2
# Author: Sjoerd Mullender.
4
# sgmlop support added by fredrik@pythonware.com (May 19, 1998)
10
from _xmlplus.parsers import sgmlop
11
#import sgmlop # this works for both builtin on the path or relative
15
# standard entity defs
25
# XML parser base class -- find tags and call handler functions.
26
# Usage: p = XMLParser(); p.feed(data); ...; p.close().
27
# The dtd is defined by deriving a class which defines methods with
28
# special names to handle tags: start_foo and end_foo to handle <foo>
29
# and </foo>, respectively. The data between tags is passed to the
30
# parser by calling self.handle_data() with some data as argument (the
31
# data may be split up in arbitrary chunks). Entity references are
32
# passed by calling self.handle_entityref() with the entity reference
35
# --------------------------------------------------------------------
36
# original re-based XML parser
40
_Name = '[a-zA-Z_:][-a-zA-Z0-9._:]*'
41
interesting = re.compile('[&<]')
42
incomplete = re.compile('&(' + _Name + '|#[0-9]*|#x[0-9a-fA-F]*)?|'
44
'/([a-zA-Z_:][^<>]*)?|'
48
ref = re.compile('&(' + _Name + '|#[0-9]+|#x[0-9a-fA-F]+);?')
49
entityref = re.compile('&(?P<name>' + _Name + ')[^-a-zA-Z0-9._:]')
50
charref = re.compile('&#(?P<char>[0-9]+[^0-9]|x[0-9a-fA-F]+[^0-9a-fA-F])')
51
space = re.compile(_S)
52
newline = re.compile('\n')
54
starttagopen = re.compile('<' + _Name)
55
endtagopen = re.compile('</')
56
starttagend = re.compile(_opS + '(?P<slash>/?)>')
57
endbracket = re.compile('>')
58
tagfind = re.compile(_Name)
59
cdataopen = re.compile('<!\[CDATA\[')
60
cdataclose = re.compile('\]\]>')
61
special = re.compile('<!(?P<special>[^<>]*)>')
62
procopen = re.compile('<\?(?P<proc>' + _Name + ')' + _S)
63
procclose = re.compile('\?>')
64
commentopen = re.compile('<!--')
65
commentclose = re.compile('-->')
66
doubledash = re.compile('--')
67
attrfind = re.compile(
68
_opS + '(?P<name>' + _Name + ')'
69
'(' + _opS + '=' + _opS +
70
'(?P<value>\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9.:+*%?!()_#=~]+))')
74
# Interface -- initialize and reset this instance
75
def __init__(self, verbose=0):
76
self.verbose = verbose
79
# Interface -- reset this instance. Loses all unprocessed data
88
# For derived classes only -- enter literal mode (CDATA) till EOF
89
def setnomoretags(self):
    """Switch the instance into literal mode for the rest of the input.

    Sets both the ``nomoretags`` and ``literal`` flags to 1. Intended for
    use by derived classes only.
    """
    self.nomoretags = 1
    self.literal = 1
92
# For derived classes only -- enter literal mode (CDATA)
93
def setliteral(self, *args):
96
# Interface -- feed some data to the parser. Call this as
97
# often as you want, with as little or as much text as you
98
# want (may include '\n'). (This just saves the text, all the
99
# processing is done by goahead().)
100
def feed(self, data):
101
self.rawdata = self.rawdata + data
104
# Interface -- handle the remaining data
108
# Interface -- translate references
109
def translate_references(self, data):
113
res = ref.search(data, i)
115
newdata.append(data[i:])
116
return string.join(newdata, '')
117
if data[res.end(0) - 1] != ';':
118
self.syntax_error(self.lineno,
119
'; missing after entity/char reference')
120
newdata.append(data[i:res.start(0)])
124
newdata.append(chr(string.atoi(str[2:], 16)))
126
newdata.append(chr(string.atoi(str[1:])))
129
newdata.append(self.entitydefs[str])
131
# can't do it, so keep the entity ref in
132
newdata.append('&' + str + ';')
135
# Internal -- handle data as far as reasonable. May leave state
136
# and data to be processed by a subsequent call. If 'end' is
137
# true, force handling all data as if followed by EOF marker.
138
def goahead(self, end):
139
rawdata = self.rawdata
145
self.handle_data(data)
146
self.lineno = self.lineno + string.count(data, '\n')
149
res = interesting.search(rawdata, i)
156
self.handle_data(data)
157
self.lineno = self.lineno + string.count(data, '\n')
160
if rawdata[i] == '<':
161
if starttagopen.match(rawdata, i):
164
self.handle_data(data)
165
self.lineno = self.lineno + string.count(data, '\n')
168
k = self.parse_starttag(i)
170
self.lineno = self.lineno + string.count(rawdata[i:k], '\n')
173
if endtagopen.match(rawdata, i):
174
k = self.parse_endtag(i)
176
self.lineno = self.lineno + string.count(rawdata[i:k], '\n')
180
if commentopen.match(rawdata, i):
183
self.handle_data(data)
184
self.lineno = self.lineno + string.count(data, '\n')
187
k = self.parse_comment(i)
189
self.lineno = self.lineno + string.count(rawdata[i:k], '\n')
192
if cdataopen.match(rawdata, i):
193
k = self.parse_cdata(i)
195
# Count the newlines in the CDATA section just consumed (i up to k), as
# the sibling branches do; the former empty slice rawdata[i:i] always
# counted zero, so lineno was never advanced here.
self.lineno = self.lineno + string.count(rawdata[i:k], '\n')
198
res = procopen.match(rawdata, i)
200
k = self.parse_proc(i, res)
202
self.lineno = self.lineno + string.count(rawdata[i:k], '\n')
205
res = special.match(rawdata, i)
209
self.handle_data(data)
210
self.lineno = self.lineno + string.count(data, '\n')
213
self.handle_special(res.group('special'))
214
self.lineno = self.lineno + string.count(res.group(0), '\n')
217
elif rawdata[i] == '&':
218
res = charref.match(rawdata, i)
221
if rawdata[i-1] != ';':
222
self.syntax_error(self.lineno, '; missing in charref')
224
self.handle_charref(res.group('char')[:-1])
225
self.lineno = self.lineno + string.count(res.group(0), '\n')
227
res = entityref.match(rawdata, i)
230
if rawdata[i-1] != ';':
231
self.syntax_error(self.lineno, '; missing in entityref')
233
self.handle_entityref(res.group('name'))
234
self.lineno = self.lineno + string.count(res.group(0), '\n')
237
raise RuntimeError, 'neither < nor & ??'
238
# We get here only if incomplete matches but
240
res = incomplete.match(rawdata, i)
243
self.handle_data(data)
244
self.lineno = self.lineno + string.count(data, '\n')
249
break # Really incomplete
250
self.syntax_error(self.lineno, 'bogus < or &')
252
self.handle_data(data)
253
self.lineno = self.lineno + string.count(data, '\n')
258
self.handle_data(data)
259
self.lineno = self.lineno + string.count(data, '\n')
261
self.rawdata = rawdata[i:]
262
# XXX if end: check for empty stack
264
# Internal -- parse comment, return length or -1 if not terminated
265
def parse_comment(self, i):
266
rawdata = self.rawdata
267
if rawdata[i:i+4] <> '<!--':
268
raise RuntimeError, 'unexpected call to handle_comment'
269
res = commentclose.search(rawdata, i+4)
272
# doubledash search will succeed because it's a subset of commentclose
273
if doubledash.search(rawdata, i+4).start(0) < res.start(0):
274
self.syntax_error(self.lineno, "`--' inside comment")
275
self.handle_comment(rawdata[i+4: res.start(0)])
278
# Internal -- handle CDATA tag, return length or -1 if not terminated
279
def parse_cdata(self, i):
280
rawdata = self.rawdata
281
if rawdata[i:i+9] <> '<![CDATA[':
282
raise RuntimeError, 'unexpected call to handle_cdata'
283
res = cdataclose.search(rawdata, i+9)
286
self.handle_cdata(rawdata[i+9:res.start(0)])
289
def parse_proc(self, i, res):
290
rawdata = self.rawdata
292
raise RuntimeError, 'unexpected call to parse_proc'
293
name = res.group('proc')
294
res = procclose.search(rawdata, res.end(0))
297
self.handle_proc(name, rawdata[res.pos:res.start(0)])
300
# Internal -- handle starttag, return length or -1 if not terminated
301
def parse_starttag(self, i):
302
rawdata = self.rawdata
303
# i points to start of tag
304
end = endbracket.search(rawdata, i+1)
308
# Now parse the data between i+1 and j into a tag and attrs
310
res = tagfind.match(rawdata, i+1)
312
raise RuntimeError, 'unexpected call to parse_starttag'
315
if hasattr(self, tag + '_attributes'):
316
attrlist = getattr(self, tag + '_attributes')
321
res = attrfind.match(rawdata, k)
323
attrname, attrvalue = res.group('name', 'value')
324
if attrvalue is None:
325
self.syntax_error(self.lineno, 'no attribute value specified')
327
elif attrvalue[:1] == "'" == attrvalue[-1:] or \
328
attrvalue[:1] == '"' == attrvalue[-1:]:
329
attrvalue = attrvalue[1:-1]
331
self.syntax_error(self.lineno, 'attribute value not quoted')
332
if attrlist is not None and attrname not in attrlist:
333
self.syntax_error(self.lineno,
334
'unknown attribute %s of element %s' %
336
if attrdict.has_key(attrname):
337
self.syntax_error(self.lineno, 'attribute specified twice')
338
attrdict[attrname] = self.translate_references(attrvalue)
340
res = starttagend.match(rawdata, k)
342
self.syntax_error(self.lineno, 'garbage in start tag')
343
self.finish_starttag(tag, attrdict)
344
if res and res.group('slash') == '/':
345
self.finish_endtag(tag)
348
# Internal -- parse endtag
349
def parse_endtag(self, i):
350
rawdata = self.rawdata
351
end = endbracket.search(rawdata, i+1)
354
res = tagfind.match(rawdata, i+2)
356
self.syntax_error(self.lineno, 'no name specified in end tag')
362
if k != end.start(0):
363
# check that there is only white space at end of tag
364
res = space.match(rawdata, k)
365
if res is None or res.end(0) != end.start(0):
366
self.syntax_error(self.lineno, 'garbage in end tag')
367
self.finish_endtag(tag)
370
# Internal -- finish processing of start tag
371
# Return -1 for unknown tag, 1 for balanced tag
372
def finish_starttag(self, tag, attrs):
373
self.stack.append(tag)
375
method = getattr(self, 'start_' + tag)
376
except AttributeError:
377
self.unknown_starttag(tag, attrs)
380
self.handle_starttag(tag, method, attrs)
383
# Internal -- finish processing of end tag
384
def finish_endtag(self, tag):
386
found = len(self.stack) - 1
388
self.unknown_endtag(tag)
391
if tag not in self.stack:
393
method = getattr(self, 'end_' + tag)
394
except AttributeError:
395
self.unknown_endtag(tag)
397
found = len(self.stack)
398
for i in range(found):
399
if self.stack[i] == tag: found = i
400
while len(self.stack) > found:
403
method = getattr(self, 'end_' + tag)
404
except AttributeError:
407
self.handle_endtag(tag, method)
409
self.unknown_endtag(tag)
412
# Overridable -- handle start tag
413
def handle_starttag(self, tag, method, attrs):
416
# Overridable -- handle end tag
417
def handle_endtag(self, tag, method):
420
# Example -- handle character reference, no need to override
421
def handle_charref(self, name):
424
n = string.atoi(name[1:], 16)
426
n = string.atoi(name)
427
except string.atoi_error:
428
self.unknown_charref(name)
430
if not 0 <= n <= 255:
431
self.unknown_charref(name)
433
self.handle_data(chr(n))
435
# Definition of entities -- derived classes may override
436
entitydefs = ENTITYDEFS
438
# Example -- handle entity reference, no need to override
439
def handle_entityref(self, name):
440
table = self.entitydefs
441
if table.has_key(name):
442
self.handle_data(table[name])
444
self.unknown_entityref(name)
447
# Example -- handle data, should be overridden
448
def handle_data(self, data):
451
# Example -- handle cdata, could be overridden
452
def handle_cdata(self, data):
455
# Example -- handle comment, could be overridden
456
def handle_comment(self, data):
459
# Example -- handle processing instructions, could be overridden
460
def handle_proc(self, name, data):
463
# Example -- handle special instructions, could be overridden
464
def handle_special(self, data):
467
# Example -- handle relatively harmless syntax errors, could be overridden
468
def syntax_error(self, lineno, message):
    """Report a syntax error by raising RuntimeError.

    lineno  -- line number embedded in the error text
    message -- description appended after the line number

    Derived classes may override this to tolerate the error instead.
    """
    # Call form of raise: same exception and message as the old
    # "raise Class, value" statement, but accepted by every Python version.
    raise RuntimeError('Syntax error at line %d: %s' % (lineno, message))
471
# To be overridden -- handlers for unknown objects
472
def unknown_starttag(self, tag, attrs):
    """Hook called by finish_starttag when no ``start_<tag>`` method exists.

    The default does nothing; override it in a derived class.
    """
    return None
473
def unknown_endtag(self, tag):
    """Hook called by finish_endtag for end tags it cannot dispatch.

    The default does nothing; override it in a derived class.
    """
    return None
474
def unknown_charref(self, ref):
    """Hook called by handle_charref for a reference it cannot convert.

    handle_charref calls this when the number fails to parse or lies
    outside 0..255. The default does nothing; override it in a derived
    class.
    """
    return None
475
def unknown_entityref(self, ref):
    """Hook invoked from handle_entityref with the entity name.

    The default does nothing; override it in a derived class.
    """
    return None
478
# --------------------------------------------------------------------
479
# accelerated XML parser
483
# Interface -- initialize and reset this instance
484
def __init__(self, verbose=0):
485
self.verbose = verbose
488
# Interface -- reset this instance. Loses all unprocessed data
496
self.parser = sgmlop.XMLParser()
497
self.feed = self.parser.feed
498
self.parser.register(self)
500
# For derived classes only -- enter literal mode (CDATA) till EOF
501
def setnomoretags(self):
    """Raise both literal-mode flags on this instance.

    ``nomoretags`` and ``literal`` are each set to 1. Intended for use by
    derived classes only.
    """
    self.nomoretags = 1
    self.literal = 1
504
# For derived classes only -- enter literal mode (CDATA)
505
def setliteral(self, *args):
508
# Interface -- feed some data to the parser. Call this as
509
# often as you want, with as little or as much text as you
510
# want (may include '\n'). (This just saves the text, all the
511
# processing is done by goahead().)
512
def feed(self, data):  # overridden by reset
    """Hand *data* to the underlying parser object held in ``self.parser``.

    Per the original note, reset replaces this attribute on the instance
    with the parser's own bound ``feed``.
    """
    backend = self.parser
    backend.feed(data)
515
# Interface -- handle the remaining data
522
# Interface -- translate references
523
def translate_references(self, data):
527
res = ref.search(data, i)
529
newdata.append(data[i:])
530
return string.join(newdata, '')
531
if data[res.end(0) - 1] != ';':
532
self.syntax_error(self.lineno,
533
'; missing after entity/char reference')
534
newdata.append(data[i:res.start(0)])
538
newdata.append(chr(string.atoi(str[2:], 16)))
540
newdata.append(chr(string.atoi(str[1:])))
543
newdata.append(self.entitydefs[str])
545
# can't do it, so keep the entity ref in
546
newdata.append('&' + str + ';')
549
# Internal -- finish processing of start tag
550
# Return -1 for unknown tag, 1 for balanced tag
551
def finish_starttag(self, tag, attrs):
552
self.stack.append(tag)
554
method = getattr(self, 'start_' + tag)
555
except AttributeError:
556
self.unknown_starttag(tag, attrs)
559
self.handle_starttag(tag, method, attrs)
562
# Internal -- finish processing of end tag
563
def finish_endtag(self, tag):
565
found = len(self.stack) - 1
567
self.unknown_endtag(tag)
570
if tag not in self.stack:
572
method = getattr(self, 'end_' + tag)
573
except AttributeError:
574
self.unknown_endtag(tag)
576
found = len(self.stack)
577
for i in range(found):
578
if self.stack[i] == tag: found = i
579
while len(self.stack) > found:
582
method = getattr(self, 'end_' + tag)
583
except AttributeError:
586
self.handle_endtag(tag, method)
588
self.unknown_endtag(tag)
591
# Overridable -- handle start tag
592
def handle_starttag(self, tag, method, attrs):
595
# Overridable -- handle end tag
596
def handle_endtag(self, tag, method):
599
# Example -- handle character reference, no need to override
600
def handle_charref(self, name):
603
n = string.atoi(name[1:], 16)
605
n = string.atoi(name)
606
except string.atoi_error:
607
self.unknown_charref(name)
609
if not 0 <= n <= 255:
610
self.unknown_charref(name)
612
self.handle_data(chr(n))
614
# Definition of entities -- derived classes may override
615
entitydefs = ENTITYDEFS
617
# Example -- handle entity reference, no need to override
618
def handle_entityref(self, name):
619
table = self.entitydefs
620
if table.has_key(name):
621
self.handle_data(table[name])
623
self.unknown_entityref(name)
626
# Example -- handle data, should be overridden
627
def handle_data(self, data):
630
# Example -- handle cdata, could be overridden
631
def handle_cdata(self, data):
634
# Example -- handle comment, could be overridden
635
def handle_comment(self, data):
638
# Example -- handle processing instructions, could be overridden
639
def handle_proc(self, name, data):
642
# Example -- handle special instructions, could be overridden
643
def handle_special(self, data):
646
# Example -- handle relatively harmless syntax errors, could be overridden
647
def syntax_error(self, lineno, message):
    """Signal a syntax error by raising RuntimeError.

    lineno  -- line number embedded in the error text
    message -- description appended after the line number

    Derived classes may override this to tolerate the error instead.
    """
    # Instantiate the exception explicitly; the "raise Class, value"
    # statement form is rejected by newer interpreters, while this form
    # yields the same exception and message everywhere.
    raise RuntimeError('Syntax error at line %d: %s' % (lineno, message))
650
# To be overridden -- handlers for unknown objects
651
def unknown_starttag(self, tag, attrs):
    """No-op default for start tags lacking a ``start_<tag>`` handler.

    finish_starttag calls this when the handler lookup fails; derived
    classes override it.
    """
    return None
652
def unknown_endtag(self, tag):
    """No-op default for end tags that finish_endtag cannot dispatch.

    Derived classes override it.
    """
    return None
653
def unknown_charref(self, ref):
    """No-op default for character references handle_charref rejects.

    handle_charref calls this when the number fails to parse or lies
    outside 0..255; derived classes override it.
    """
    return None
654
def unknown_entityref(self, ref):
    """No-op default invoked from handle_entityref with the entity name.

    Derived classes override it.
    """
    return None
659
# pick a suitable parser
661
XMLParser = FastXMLParser
663
XMLParser = SlowXMLParser
665
# --------------------------------------------------------------------
668
class TestXMLParser(XMLParser):
670
def __init__(self, verbose=0):
672
XMLParser.__init__(self, verbose)
674
def handle_data(self, data):
675
self.testdata = self.testdata + data
676
if len(`self.testdata`) >= 70:
683
print 'data:', `data`
685
def handle_cdata(self, data):
687
print 'cdata:', `data`
689
def handle_proc(self, name, data):
691
print 'processing:',name,`data`
693
def handle_special(self, data):
695
print 'special:',`data`
697
def handle_comment(self, data):
701
r = r[:32] + '...' + r[-32:]
704
def syntax_error(self, lineno, message):
    """Print the syntax error to standard output instead of raising.

    lineno  -- line number shown in the output
    message -- description printed after the line number
    """
    # A single parenthesised argument prints the same text under the
    # Python 2 print statement and the print() function.
    print('error at line %d: %s' % (lineno, message))
707
def unknown_starttag(self, tag, attrs):
710
print 'start tag: <' + tag + '>'
712
print 'start tag: <' + tag,
713
for name, value in attrs.items():
714
print name + '=' + '"' + value + '"',
717
def unknown_endtag(self, tag):
719
print 'end tag: </' + tag + '>'
721
def unknown_entityref(self, ref):
723
print '*** unknown entity ref: &' + ref + ';'
725
def unknown_charref(self, ref):
727
print '*** unknown char ref: &#' + ref + ';'
730
XMLParser.close(self)
733
def test(args = None):
739
if args and args[0] == '-s':
743
klass = TestXMLParser
760
if f is not sys.stdin:
769
if __name__ == '__main__': #NO_REPORTLAB_TEST