1
"""A parser for SGML, using the derived class as a static DTD."""
3
# XXX This only supports those SGML features used by HTML.
5
# XXX There should be a way to distinguish between PCDATA (parsed
6
# character data -- the normal case), RCDATA (replaceable character
7
# data -- only char and entity references and end tags are special)
8
# and CDATA (character data -- only end tags are special). RCDATA is
9
# not supported at all.
12
from warnings import warnpy3k
13
warnpy3k("the sgmllib module has been removed in Python 3.0",
20
__all__ = ["SGMLParser", "SGMLParseError"]
22
# Regular expressions used for parsing
24
interesting = re.compile('[&<]')
25
incomplete = re.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|'
30
# "&name" followed by one non-alphanumeric character; group 1 is the name.
entityref = re.compile(r'&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
# "&#digits" followed by one non-digit character; group 1 is the digits.
charref = re.compile(r'&#([0-9]+)[^0-9]')

# "<" immediately followed by ">" or an ASCII letter.
starttagopen = re.compile(r'<[>a-zA-Z]')
# "<name/" -- the opening of the SGML short-tag form.
shorttagopen = re.compile(r'<[a-zA-Z][-.a-zA-Z0-9]*/')
# "<name/data/"; group 1 is the name, group 2 the data (which has no "/").
shorttag = re.compile(r'<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
# A single ">" character.
piclose = re.compile(r'>')
# Either angle bracket, "<" or ">".
endbracket = re.compile(r'[<>]')
# A name: a letter followed by letters, digits, "-", "_" or ".".
tagfind = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*')
39
# One attribute inside a start tag.  Group 1 is the attribute name, group 2
# the optional "=value" part (including the "=" and any whitespace around
# it), and group 3 the value alone: single-quoted or double-quoted (quotes
# kept), or a run of the unquoted characters listed in the final set.
attrfind = re.compile(
    r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?')
44
class SGMLParseError(RuntimeError):
    """Exception raised for all parse errors."""
49
# SGML parser base class -- find tags and call handler functions.
50
# Usage: p = SGMLParser(); p.feed(data); ...; p.close().
51
# The dtd is defined by deriving a class which defines methods
52
# with special names to handle tags: start_foo and end_foo to handle
53
# <foo> and </foo>, respectively, or do_foo to handle <foo> by itself.
54
# (Tags are converted to lower case for this purpose.) The data
55
# between tags is passed to the parser by calling self.handle_data()
56
# with some data as argument (the data may be split up in arbitrary
57
# chunks). Entity references are passed by calling
58
# self.handle_entityref() with the entity reference as argument.
60
class SGMLParser(markupbase.ParserBase):
61
# Definition of entities -- derived classes may override
62
entity_or_charref = re.compile('&(?:'
63
'([a-zA-Z][-.a-zA-Z0-9]*)|#([0-9]+)'
66
def __init__(self, verbose=0):
67
"""Initialize and reset this instance."""
68
self.verbose = verbose
72
"""Reset this instance. Loses all unprocessed data."""
73
self.__starttag_text = None
79
markupbase.ParserBase.reset(self)
81
def setnomoretags(self):
82
"""Enter literal mode (CDATA) till EOF.
84
Intended for derived classes only.
86
self.nomoretags = self.literal = 1
88
def setliteral(self, *args):
89
"""Enter literal mode (CDATA).
91
Intended for derived classes only.
96
"""Feed some data to the parser.
98
Call this as often as you want, with as little or as much text
99
as you want (may include '\n'). (This just saves the text,
100
all the processing is done by goahead().)
103
self.rawdata = self.rawdata + data
107
"""Handle the remaining data."""
110
def error(self, message):
    """Signal a parse error by raising SGMLParseError(message)."""
    raise SGMLParseError(message)
113
# Internal -- handle data as far as reasonable. May leave state
114
# and data to be processed by a subsequent call. If 'end' is
115
# true, force handling all data as if followed by EOF marker.
116
def goahead(self, end):
117
rawdata = self.rawdata
122
self.handle_data(rawdata[i:n])
125
match = interesting.search(rawdata, i)
126
if match: j = match.start()
129
self.handle_data(rawdata[i:j])
132
if rawdata[i] == '<':
133
if starttagopen.match(rawdata, i):
135
self.handle_data(rawdata[i])
138
k = self.parse_starttag(i)
142
if rawdata.startswith("</", i):
143
k = self.parse_endtag(i)
150
self.handle_data("<")
156
if rawdata.startswith("<!--", i):
157
# Strictly speaking, a comment is --.*--
158
# within a declaration tag <!...>.
159
# This should be removed,
160
# and comments handled only in parse_declaration.
161
k = self.parse_comment(i)
165
if rawdata.startswith("<?", i):
170
if rawdata.startswith("<!", i):
171
# This is some sort of declaration; in "HTML as
172
# deployed," this should only be the document type
173
# declaration ("<!DOCTYPE html...>").
174
k = self.parse_declaration(i)
178
elif rawdata[i] == '&':
180
self.handle_data(rawdata[i])
183
match = charref.match(rawdata, i)
185
name = match.group(1)
186
self.handle_charref(name)
188
if rawdata[i-1] != ';': i = i-1
190
match = entityref.match(rawdata, i)
192
name = match.group(1)
193
self.handle_entityref(name)
195
if rawdata[i-1] != ';': i = i-1
198
self.error('neither < nor & ??')
199
# We get here only if incomplete matches but
201
match = incomplete.match(rawdata, i)
203
self.handle_data(rawdata[i])
208
break # Really incomplete
209
self.handle_data(rawdata[i:j])
213
self.handle_data(rawdata[i:n])
215
self.rawdata = rawdata[i:]
216
# XXX if end: check for empty stack
218
# Extensions for the DOCTYPE scanner:
219
_decl_otherchars = '='
221
# Internal -- parse processing instr, return length or -1 if not terminated
222
def parse_pi(self, i):
223
rawdata = self.rawdata
224
if rawdata[i:i+2] != '<?':
225
self.error('unexpected call to parse_pi()')
226
match = piclose.search(rawdata, i+2)
230
self.handle_pi(rawdata[i+2: j])
234
def get_starttag_text(self):
    """Return the current value of the private __starttag_text attribute.

    The value is a string holding start-tag text, or None when no such
    text is recorded.
    """
    return self.__starttag_text
237
# Internal -- handle starttag, return length or -1 if not terminated
238
def parse_starttag(self, i):
239
self.__starttag_text = None
241
rawdata = self.rawdata
242
if shorttagopen.match(rawdata, i):
243
# SGML shorthand: <tag/data/ == <tag>data</tag>
244
# XXX Can data contain &... (entity or char refs)?
245
# XXX Can data contain < or > (tag characters)?
246
# XXX Can there be whitespace before the first /?
247
match = shorttag.match(rawdata, i)
250
tag, data = match.group(1, 2)
251
self.__starttag_text = '<%s/' % tag
254
self.finish_shorttag(tag, data)
255
self.__starttag_text = rawdata[start_pos:match.end(1) + 1]
257
# XXX The following should skip matching quotes (' or ")
258
# As a shortcut way to exit, this isn't so bad, but shouldn't
259
# be used to locate the actual end of the start tag since the
260
# < or > characters may be embedded in an attribute value.
261
match = endbracket.search(rawdata, i+1)
265
# Now parse the data between i+1 and j into a tag and attrs
267
if rawdata[i:i+2] == '<>':
268
# SGML shorthand: <> == <last open tag seen>
272
match = tagfind.match(rawdata, i+1)
274
self.error('unexpected call to parse_starttag')
276
tag = rawdata[i+1:k].lower()
279
match = attrfind.match(rawdata, k)
281
attrname, rest, attrvalue = match.group(1, 2, 3)
285
if (attrvalue[:1] == "'" == attrvalue[-1:] or
286
attrvalue[:1] == '"' == attrvalue[-1:]):
288
attrvalue = attrvalue[1:-1]
289
attrvalue = self.entity_or_charref.sub(
290
self._convert_ref, attrvalue)
291
attrs.append((attrname.lower(), attrvalue))
293
if rawdata[j] == '>':
295
self.__starttag_text = rawdata[start_pos:j]
296
self.finish_starttag(tag, attrs)
299
# Internal -- convert entity or character reference
300
def _convert_ref(self, match):
302
return self.convert_charref(match.group(2)) or \
303
'&#%s%s' % match.groups()[1:]
305
return self.convert_entityref(match.group(1)) or \
306
'&%s;' % match.group(1)
308
return '&%s' % match.group(1)
310
# Internal -- parse endtag
311
def parse_endtag(self, i):
312
rawdata = self.rawdata
313
match = endbracket.search(rawdata, i+1)
317
tag = rawdata[i+2:j].strip().lower()
318
if rawdata[j] == '>':
320
self.finish_endtag(tag)
323
# Internal -- finish parsing of <tag/data/ (same as <tag>data</tag>)
324
def finish_shorttag(self, tag, data):
    """Process the short form <tag/data/ exactly like <tag>data</tag>.

    Issues, in order: the start tag with an empty attribute list, the
    enclosed data, and the matching end tag.
    """
    self.finish_starttag(tag, [])
    self.handle_data(data)
    self.finish_endtag(tag)
329
# Internal -- finish processing of start tag
330
# Return -1 for unknown tag, 0 for open-only tag, 1 for balanced tag
331
def finish_starttag(self, tag, attrs):
333
method = getattr(self, 'start_' + tag)
334
except AttributeError:
336
method = getattr(self, 'do_' + tag)
337
except AttributeError:
338
self.unknown_starttag(tag, attrs)
341
self.handle_starttag(tag, method, attrs)
344
self.stack.append(tag)
345
self.handle_starttag(tag, method, attrs)
348
# Internal -- finish processing of end tag
349
def finish_endtag(self, tag):
351
found = len(self.stack) - 1
353
self.unknown_endtag(tag)
356
if tag not in self.stack:
358
method = getattr(self, 'end_' + tag)
359
except AttributeError:
360
self.unknown_endtag(tag)
362
self.report_unbalanced(tag)
364
found = len(self.stack)
365
for i in range(found):
366
if self.stack[i] == tag: found = i
367
while len(self.stack) > found:
370
method = getattr(self, 'end_' + tag)
371
except AttributeError:
374
self.handle_endtag(tag, method)
376
self.unknown_endtag(tag)
379
# Overridable -- handle start tag
380
def handle_starttag(self, tag, method, attrs):
383
# Overridable -- handle end tag
384
def handle_endtag(self, tag, method):
387
# Example -- report an unbalanced </...> tag.
388
def report_unbalanced(self, tag):
390
print '*** Unbalanced </' + tag + '>'
391
print '*** Stack:', self.stack
393
def convert_charref(self, name):
394
"""Convert character reference, may be overridden."""
399
if not 0 <= n <= 127:
401
return self.convert_codepoint(n)
403
def convert_codepoint(self, codepoint):
    """Return the character for the integer codepoint, via chr()."""
    return chr(codepoint)
406
def handle_charref(self, name):
407
"""Handle character reference, no need to override."""
408
replacement = self.convert_charref(name)
409
if replacement is None:
410
self.unknown_charref(name)
412
self.handle_data(replacement)
414
# Definition of entities -- derived classes may override
416
{'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}
418
def convert_entityref(self, name):
419
"""Convert entity references.
421
As an alternative to overriding this method; one can tailor the
422
results by setting up the self.entitydefs mapping appropriately.
424
table = self.entitydefs
430
def handle_entityref(self, name):
431
"""Handle entity references, no need to override."""
432
replacement = self.convert_entityref(name)
433
if replacement is None:
434
self.unknown_entityref(name)
436
self.handle_data(replacement)
438
# Example -- handle data, should be overridden
439
def handle_data(self, data):
442
# Example -- handle comment, could be overridden
443
def handle_comment(self, data):
446
# Example -- handle declaration, could be overridden
447
def handle_decl(self, decl):
450
# Example -- handle processing instruction, could be overridden
451
def handle_pi(self, data):
454
# To be overridden -- handlers for unknown objects
455
def unknown_starttag(self, tag, attrs):
    """Default handler for an unknown start tag: do nothing (override me)."""

def unknown_endtag(self, tag):
    """Default handler for an unknown end tag: do nothing (override me)."""

def unknown_charref(self, ref):
    """Default handler for an unknown char reference: do nothing."""

def unknown_entityref(self, ref):
    """Default handler for an unknown entity reference: do nothing."""
461
class TestSGMLParser(SGMLParser):
463
def __init__(self, verbose=0):
465
SGMLParser.__init__(self, verbose)
467
def handle_data(self, data):
468
self.testdata = self.testdata + data
469
if len(repr(self.testdata)) >= 70:
476
print 'data:', repr(data)
478
def handle_comment(self, data):
482
r = r[:32] + '...' + r[-32:]
485
def unknown_starttag(self, tag, attrs):
488
print 'start tag: <' + tag + '>'
490
print 'start tag: <' + tag,
491
for name, value in attrs:
492
print name + '=' + '"' + value + '"',
495
def unknown_endtag(self, tag):
497
print 'end tag: </' + tag + '>'
499
def unknown_entityref(self, ref):
501
print '*** unknown entity ref: &' + ref + ';'
503
def unknown_charref(self, ref):
505
print '*** unknown char ref: &#' + ref + ';'
507
def unknown_decl(self, data):
509
print '*** unknown decl: [' + data + ']'
512
SGMLParser.close(self)
516
def test(args = None):
522
if args and args[0] == '-s':
526
klass = TestSGMLParser
543
if f is not sys.stdin:
552
if __name__ == '__main__':
1
"""A parser for SGML, using the derived class as a static DTD."""
3
# XXX This only supports those SGML features used by HTML.
5
# XXX There should be a way to distinguish between PCDATA (parsed
6
# character data -- the normal case), RCDATA (replaceable character
7
# data -- only char and entity references and end tags are special)
8
# and CDATA (character data -- only end tags are special). RCDATA is
9
# not supported at all.
12
from warnings import warnpy3k
13
warnpy3k("the sgmllib module has been removed in Python 3.0",
20
__all__ = ["SGMLParser", "SGMLParseError"]
22
# Regular expressions used for parsing
24
interesting = re.compile('[&<]')
25
incomplete = re.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|'
30
# "&name" followed by one non-alphanumeric character; group 1 is the name.
entityref = re.compile(r'&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
# "&#digits" followed by one non-digit character; group 1 is the digits.
charref = re.compile(r'&#([0-9]+)[^0-9]')

# "<" immediately followed by ">" or an ASCII letter.
starttagopen = re.compile(r'<[>a-zA-Z]')
# "<name/" -- the opening of the SGML short-tag form.
shorttagopen = re.compile(r'<[a-zA-Z][-.a-zA-Z0-9]*/')
# "<name/data/"; group 1 is the name, group 2 the data (which has no "/").
shorttag = re.compile(r'<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
# A single ">" character.
piclose = re.compile(r'>')
# Either angle bracket, "<" or ">".
endbracket = re.compile(r'[<>]')
# A name: a letter followed by letters, digits, "-", "_" or ".".
tagfind = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*')
39
# One attribute inside a start tag.  Group 1 is the attribute name, group 2
# the optional "=value" part (including the "=" and any whitespace around
# it), and group 3 the value alone: single-quoted or double-quoted (quotes
# kept), or a run of the unquoted characters listed in the final set.
attrfind = re.compile(
    r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?')
44
class SGMLParseError(RuntimeError):
    """Exception raised for all parse errors."""
49
# SGML parser base class -- find tags and call handler functions.
50
# Usage: p = SGMLParser(); p.feed(data); ...; p.close().
51
# The dtd is defined by deriving a class which defines methods
52
# with special names to handle tags: start_foo and end_foo to handle
53
# <foo> and </foo>, respectively, or do_foo to handle <foo> by itself.
54
# (Tags are converted to lower case for this purpose.) The data
55
# between tags is passed to the parser by calling self.handle_data()
56
# with some data as argument (the data may be split up in arbitrary
57
# chunks). Entity references are passed by calling
58
# self.handle_entityref() with the entity reference as argument.
60
class SGMLParser(markupbase.ParserBase):
61
# Definition of entities -- derived classes may override
62
entity_or_charref = re.compile('&(?:'
63
'([a-zA-Z][-.a-zA-Z0-9]*)|#([0-9]+)'
66
def __init__(self, verbose=0):
67
"""Initialize and reset this instance."""
68
self.verbose = verbose
72
"""Reset this instance. Loses all unprocessed data."""
73
self.__starttag_text = None
79
markupbase.ParserBase.reset(self)
81
def setnomoretags(self):
82
"""Enter literal mode (CDATA) till EOF.
84
Intended for derived classes only.
86
self.nomoretags = self.literal = 1
88
def setliteral(self, *args):
89
"""Enter literal mode (CDATA).
91
Intended for derived classes only.
96
"""Feed some data to the parser.
98
Call this as often as you want, with as little or as much text
99
as you want (may include '\n'). (This just saves the text,
100
all the processing is done by goahead().)
103
self.rawdata = self.rawdata + data
107
"""Handle the remaining data."""
110
def error(self, message):
    """Signal a parse error by raising SGMLParseError(message)."""
    raise SGMLParseError(message)
113
# Internal -- handle data as far as reasonable. May leave state
114
# and data to be processed by a subsequent call. If 'end' is
115
# true, force handling all data as if followed by EOF marker.
116
def goahead(self, end):
117
rawdata = self.rawdata
122
self.handle_data(rawdata[i:n])
125
match = interesting.search(rawdata, i)
126
if match: j = match.start()
129
self.handle_data(rawdata[i:j])
132
if rawdata[i] == '<':
133
if starttagopen.match(rawdata, i):
135
self.handle_data(rawdata[i])
138
k = self.parse_starttag(i)
142
if rawdata.startswith("</", i):
143
k = self.parse_endtag(i)
150
self.handle_data("<")
156
if rawdata.startswith("<!--", i):
157
# Strictly speaking, a comment is --.*--
158
# within a declaration tag <!...>.
159
# This should be removed,
160
# and comments handled only in parse_declaration.
161
k = self.parse_comment(i)
165
if rawdata.startswith("<?", i):
170
if rawdata.startswith("<!", i):
171
# This is some sort of declaration; in "HTML as
172
# deployed," this should only be the document type
173
# declaration ("<!DOCTYPE html...>").
174
k = self.parse_declaration(i)
178
elif rawdata[i] == '&':
180
self.handle_data(rawdata[i])
183
match = charref.match(rawdata, i)
185
name = match.group(1)
186
self.handle_charref(name)
188
if rawdata[i-1] != ';': i = i-1
190
match = entityref.match(rawdata, i)
192
name = match.group(1)
193
self.handle_entityref(name)
195
if rawdata[i-1] != ';': i = i-1
198
self.error('neither < nor & ??')
199
# We get here only if incomplete matches but
201
match = incomplete.match(rawdata, i)
203
self.handle_data(rawdata[i])
208
break # Really incomplete
209
self.handle_data(rawdata[i:j])
213
self.handle_data(rawdata[i:n])
215
self.rawdata = rawdata[i:]
216
# XXX if end: check for empty stack
218
# Extensions for the DOCTYPE scanner:
219
_decl_otherchars = '='
221
# Internal -- parse processing instr, return length or -1 if not terminated
222
def parse_pi(self, i):
223
rawdata = self.rawdata
224
if rawdata[i:i+2] != '<?':
225
self.error('unexpected call to parse_pi()')
226
match = piclose.search(rawdata, i+2)
230
self.handle_pi(rawdata[i+2: j])
234
def get_starttag_text(self):
    """Return the current value of the private __starttag_text attribute.

    The value is a string holding start-tag text, or None when no such
    text is recorded.
    """
    return self.__starttag_text
237
# Internal -- handle starttag, return length or -1 if not terminated
238
def parse_starttag(self, i):
239
self.__starttag_text = None
241
rawdata = self.rawdata
242
if shorttagopen.match(rawdata, i):
243
# SGML shorthand: <tag/data/ == <tag>data</tag>
244
# XXX Can data contain &... (entity or char refs)?
245
# XXX Can data contain < or > (tag characters)?
246
# XXX Can there be whitespace before the first /?
247
match = shorttag.match(rawdata, i)
250
tag, data = match.group(1, 2)
251
self.__starttag_text = '<%s/' % tag
254
self.finish_shorttag(tag, data)
255
self.__starttag_text = rawdata[start_pos:match.end(1) + 1]
257
# XXX The following should skip matching quotes (' or ")
258
# As a shortcut way to exit, this isn't so bad, but shouldn't
259
# be used to locate the actual end of the start tag since the
260
# < or > characters may be embedded in an attribute value.
261
match = endbracket.search(rawdata, i+1)
265
# Now parse the data between i+1 and j into a tag and attrs
267
if rawdata[i:i+2] == '<>':
268
# SGML shorthand: <> == <last open tag seen>
272
match = tagfind.match(rawdata, i+1)
274
self.error('unexpected call to parse_starttag')
276
tag = rawdata[i+1:k].lower()
279
match = attrfind.match(rawdata, k)
281
attrname, rest, attrvalue = match.group(1, 2, 3)
285
if (attrvalue[:1] == "'" == attrvalue[-1:] or
286
attrvalue[:1] == '"' == attrvalue[-1:]):
288
attrvalue = attrvalue[1:-1]
289
attrvalue = self.entity_or_charref.sub(
290
self._convert_ref, attrvalue)
291
attrs.append((attrname.lower(), attrvalue))
293
if rawdata[j] == '>':
295
self.__starttag_text = rawdata[start_pos:j]
296
self.finish_starttag(tag, attrs)
299
# Internal -- convert entity or character reference
300
def _convert_ref(self, match):
302
return self.convert_charref(match.group(2)) or \
303
'&#%s%s' % match.groups()[1:]
305
return self.convert_entityref(match.group(1)) or \
306
'&%s;' % match.group(1)
308
return '&%s' % match.group(1)
310
# Internal -- parse endtag
311
def parse_endtag(self, i):
312
rawdata = self.rawdata
313
match = endbracket.search(rawdata, i+1)
317
tag = rawdata[i+2:j].strip().lower()
318
if rawdata[j] == '>':
320
self.finish_endtag(tag)
323
# Internal -- finish parsing of <tag/data/ (same as <tag>data</tag>)
324
def finish_shorttag(self, tag, data):
    """Process the short form <tag/data/ exactly like <tag>data</tag>.

    Issues, in order: the start tag with an empty attribute list, the
    enclosed data, and the matching end tag.
    """
    self.finish_starttag(tag, [])
    self.handle_data(data)
    self.finish_endtag(tag)
329
# Internal -- finish processing of start tag
330
# Return -1 for unknown tag, 0 for open-only tag, 1 for balanced tag
331
def finish_starttag(self, tag, attrs):
333
method = getattr(self, 'start_' + tag)
334
except AttributeError:
336
method = getattr(self, 'do_' + tag)
337
except AttributeError:
338
self.unknown_starttag(tag, attrs)
341
self.handle_starttag(tag, method, attrs)
344
self.stack.append(tag)
345
self.handle_starttag(tag, method, attrs)
348
# Internal -- finish processing of end tag
349
def finish_endtag(self, tag):
351
found = len(self.stack) - 1
353
self.unknown_endtag(tag)
356
if tag not in self.stack:
358
method = getattr(self, 'end_' + tag)
359
except AttributeError:
360
self.unknown_endtag(tag)
362
self.report_unbalanced(tag)
364
found = len(self.stack)
365
for i in range(found):
366
if self.stack[i] == tag: found = i
367
while len(self.stack) > found:
370
method = getattr(self, 'end_' + tag)
371
except AttributeError:
374
self.handle_endtag(tag, method)
376
self.unknown_endtag(tag)
379
# Overridable -- handle start tag
380
def handle_starttag(self, tag, method, attrs):
383
# Overridable -- handle end tag
384
def handle_endtag(self, tag, method):
387
# Example -- report an unbalanced </...> tag.
388
def report_unbalanced(self, tag):
390
print '*** Unbalanced </' + tag + '>'
391
print '*** Stack:', self.stack
393
def convert_charref(self, name):
394
"""Convert character reference, may be overridden."""
399
if not 0 <= n <= 127:
401
return self.convert_codepoint(n)
403
def convert_codepoint(self, codepoint):
    """Return the character for the integer codepoint, via chr()."""
    return chr(codepoint)
406
def handle_charref(self, name):
407
"""Handle character reference, no need to override."""
408
replacement = self.convert_charref(name)
409
if replacement is None:
410
self.unknown_charref(name)
412
self.handle_data(replacement)
414
# Definition of entities -- derived classes may override
416
{'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}
418
def convert_entityref(self, name):
419
"""Convert entity references.
421
As an alternative to overriding this method; one can tailor the
422
results by setting up the self.entitydefs mapping appropriately.
424
table = self.entitydefs
430
def handle_entityref(self, name):
431
"""Handle entity references, no need to override."""
432
replacement = self.convert_entityref(name)
433
if replacement is None:
434
self.unknown_entityref(name)
436
self.handle_data(replacement)
438
# Example -- handle data, should be overridden
439
def handle_data(self, data):
442
# Example -- handle comment, could be overridden
443
def handle_comment(self, data):
446
# Example -- handle declaration, could be overridden
447
def handle_decl(self, decl):
450
# Example -- handle processing instruction, could be overridden
451
def handle_pi(self, data):
454
# To be overridden -- handlers for unknown objects
455
def unknown_starttag(self, tag, attrs):
    """Default handler for an unknown start tag: do nothing (override me)."""

def unknown_endtag(self, tag):
    """Default handler for an unknown end tag: do nothing (override me)."""

def unknown_charref(self, ref):
    """Default handler for an unknown char reference: do nothing."""

def unknown_entityref(self, ref):
    """Default handler for an unknown entity reference: do nothing."""
461
class TestSGMLParser(SGMLParser):
463
def __init__(self, verbose=0):
465
SGMLParser.__init__(self, verbose)
467
def handle_data(self, data):
468
self.testdata = self.testdata + data
469
if len(repr(self.testdata)) >= 70:
476
print 'data:', repr(data)
478
def handle_comment(self, data):
482
r = r[:32] + '...' + r[-32:]
485
def unknown_starttag(self, tag, attrs):
488
print 'start tag: <' + tag + '>'
490
print 'start tag: <' + tag,
491
for name, value in attrs:
492
print name + '=' + '"' + value + '"',
495
def unknown_endtag(self, tag):
497
print 'end tag: </' + tag + '>'
499
def unknown_entityref(self, ref):
501
print '*** unknown entity ref: &' + ref + ';'
503
def unknown_charref(self, ref):
505
print '*** unknown char ref: &#' + ref + ';'
507
def unknown_decl(self, data):
509
print '*** unknown decl: [' + data + ']'
512
SGMLParser.close(self)
516
def test(args = None):
522
if args and args[0] == '-s':
526
klass = TestSGMLParser
543
if f is not sys.stdin:
552
if __name__ == '__main__':