1
# -*- test-case-name: twisted.web.test.test_xml -*-
3
# Copyright (c) 2001-2004 Twisted Matrix Laboratories.
4
# See LICENSE for details.
8
*S*mall, *U*ncomplicated *X*ML.
10
This is a very simple implementation of XML/HTML as a network
11
protocol. It is not at all clever. Its main features are that it
15
- mung mnemonic entity references
17
- perform *any* external actions (such as fetching URLs or writing files)
18
under *any* circumstances
19
- has lots and lots of horrible hacks for supporting broken HTML (as an
20
option, they're not on by default).
23
from twisted.internet.protocol import Protocol, FileWrapper
24
from twisted.python.reflect import prefixedMethodNames
28
# Elements of the three-tuples in the state table.
34
lenientIdentChars = identChars + ';+#/%~'
44
d = dict([(x, 1) for x in l])
48
def zipfndict(*args, **kw):
# Combine several dictionaries key-by-key: every key produced by unionlist
# maps to a tuple holding, for each dictionary passed in (in argument
# order), that dictionary's value for the key, or the default when the
# dictionary has no such key.
# NOTE(review): the initialisation of d and the function's return are not
# visible in this excerpt -- confirm against the full file.
49
# The fallback comes from the optional 'default' keyword argument; when it
# is not supplied, the module-level name nop is used.
default = kw.get('default', nop)
51
# presumably unionlist yields each key found in any of the dictionaries
# once -- its definition is not fully visible here; verify.
for key in unionlist(*[fndict.keys() for fndict in args]):
52
# One tuple element per input dictionary, in the order they were passed.
d[key] = tuple([x.get(key, default) for x in args])
56
def prefixedMethodClassDict(clazz, prefix):
    '''Map method names to the matching attributes of a class.

    The keys are the names reported by prefixedMethodNames(clazz, prefix);
    each value is the attribute called prefix + name, looked up on clazz
    itself.'''
    methods = {}
    for name in prefixedMethodNames(clazz, prefix):
        methods[name] = getattr(clazz, prefix + name)
    return methods
60
def prefixedMethodObjDict(obj, prefix):
    '''Map method names to the matching attributes of an object.

    The keys are the names reported by prefixedMethodNames for the class of
    obj; each value is the attribute called prefix + name, looked up on obj
    (the instance), not on its class.'''
    methods = {}
    for name in prefixedMethodNames(obj.__class__, prefix):
        methods[name] = getattr(obj, prefix + name)
    return methods
64
class ParseError(Exception):
66
def __init__(self, filename, line, col, message):
67
self.filename = filename
70
self.message = message
73
return "%s:%s:%s: %s" % (self.filename, self.line, self.col,
76
class XMLParser(Protocol):
81
beExtremelyLenient = 0
84
# _leadingBodyData will sometimes be set before switching to the
85
# 'bodydata' state, when we "accidentally" read a byte of bodydata
86
# in a different state.
87
_leadingBodyData = None
89
def connectionMade(self):
95
'''Get the line number and column of the last character parsed'''
96
# This gets replaced during dataReceived, restored afterwards
97
return (self.lineno, self.colno)
99
def _parseError(self, message):
    '''Raise a ParseError describing the current parse position.

    The error is built from self.filename, the tuple returned by
    self.saveMark() and the given message, in that order.  This method
    never returns normally.'''
    location = (self.filename,) + self.saveMark()
    raise ParseError(*(location + (message,)))
102
def _buildStateTable(self):
103
'''Return a dictionary of begin, do, end state function tuples'''
104
# _buildStateTable leaves something to be desired but it does what it
105
# does.. probably slowly, so I'm doing some evil caching so it doesn't
106
# get called more than once per class.
#
# NOTE(review): assuming this method is defined inside a class body, the
# attribute assignment below is subject to private-name mangling (it is
# stored as _<ClassName>__stateTable), whereas the getattr() lookup uses
# the literal string '__stateTable', which is not mangled.  The lookup
# would then never find the stored table, so it is rebuilt on each call.
# Do not simply align the two names: the table is built with
# prefixedMethodObjDict(self, ...), which looks the handlers up on this
# instance, so a table genuinely shared through the class would hand one
# instance's handlers to every other instance -- TODO confirm and decide.
107
stateTable = getattr(self.__class__, '__stateTable', None)
108
if stateTable is None:
# Merge the begin_*, do_* and end_* handler dictionaries so that each
# state name maps to a single tuple of handlers.
109
stateTable = self.__class__.__stateTable = zipfndict(
110
*[prefixedMethodObjDict(self, prefix)
111
for prefix in ('begin_', 'do_', 'end_')])
114
def _decode(self, data):
# Decode a chunk of raw input by passing it through unicode() once for each
# codec listed in self.encodings, feeding each result into the next pass.
# NOTE(review): some lines of this method are not visible in this excerpt
# (including whatever it returns) -- confirm against the full file.
115
if 'UTF-16' in self.encodings or 'UCS-2' in self.encodings:
116
# NOTE(review): this length check is an assert, so it is removed when
# Python runs with -O; an odd-length chunk would then reach the decoding
# loop below unchecked.
assert not len(data) & 1, 'UTF-16 must come in pairs for now'
118
# dataReceived stores the leading '\xff\xfe' / '\xfe\xff' marker in
# _prepend when a chunk starts with one; it is put in front of the data
# here before decoding.
data = self._prepend + data
119
# NOTE(review): if self.encodings ever holds more than one entry, the
# second pass hands an already-decoded unicode object to
# unicode(data, encoding), which Python 2 rejects with a TypeError --
# confirm the list is kept to a single codec.
for encoding in self.encodings:
120
data = unicode(data, encoding)
123
def maybeBodyData(self):
127
# Get ready for fun! We're going to allow
128
# <script>if (foo < bar)</script> to work!
129
# We do this by making everything between <script> and
131
# BUT <script src="foo"> will be special-cased to do regular,
132
# lenient behavior, because those may not have </script>
135
if (self.tagName == 'script'
136
and not self.tagAttributes.has_key('src')):
137
# we do this ourselves rather than having begin_waitforendscript
138
# because that can get called multiple times and we don't want
139
# bodydata to get reset other than the first time.
140
self.begin_bodydata(None)
141
return 'waitforendscript'
146
def dataReceived(self, data):
147
stateTable = self._buildStateTable()
149
# all UTF-16 starts with this string
150
if data.startswith('\xff\xfe'):
151
self._prepend = '\xff\xfe'
152
self.encodings.append('UTF-16')
154
elif data.startswith('\xfe\xff'):
155
self._prepend = '\xfe\xff'
156
self.encodings.append('UTF-16')
160
data = self._decode(data)
161
# bring state, lineno, colno into local scope
162
lineno, colno = self.lineno, self.colno
163
curState = self.state
164
# replace saveMark with a nested scope function
165
_saveMark = self.saveMark
167
return (lineno, colno)
168
self.saveMark = saveMark
169
# fetch functions from the stateTable
170
beginFn, doFn, endFn = stateTable[curState]
179
newState = doFn(byte)
180
if newState is not None and newState != curState:
181
# this is the endFn from the previous state
184
beginFn, doFn, endFn = stateTable[curState]
187
self.saveMark = _saveMark
188
self.lineno, self.colno = lineno, colno
189
# state doesn't make sense if there's an exception..
190
self.state = curState
193
def connectionLost(self, reason):
195
End the last state we were in.
197
stateTable = self._buildStateTable()
198
stateTable[self.state][END_HANDLER]()
203
def do_begin(self, byte):
207
if self.beExtremelyLenient:
208
self._leadingBodyData = byte
210
self._parseError("First char of document [%r] wasn't <" % (byte,))
213
def begin_comment(self, byte):
216
def do_comment(self, byte):
217
self.commentbuf += byte
218
if self.commentbuf.endswith('-->'):
219
self.gotComment(self.commentbuf[:-3])
222
def begin_tagstart(self, byte):
223
self.tagName = '' # name of the tag
224
self.tagAttributes = {} # attributes of the tag
225
self.termtag = 0 # is the tag self-terminating
228
def do_tagstart(self, byte):
229
if byte.isalnum() or byte in identChars:
231
if self.tagName == '!--':
236
# properly strict thing to do here is probably to only
241
self._parseError("Whitespace before tag-name")
244
self.gotTagEnd(self.tagName)
247
self.gotTagStart(self.tagName, {})
248
return (not self.beExtremelyLenient) and 'bodydata' or self.maybeBodyData()
256
if not self.beExtremelyLenient:
257
self._parseError("Invalid character in tag-name")
262
if self.tagName == '!':
265
self._parseError("Invalid '[' in tag-name")
267
if self.beExtremelyLenient:
270
self._parseError('Invalid tag character: %r'% byte)
272
def begin_unentity(self, byte):
    '''Enter the 'unentity' state: append byte to the pending body data.'''
    self.bodydata = self.bodydata + byte
275
def do_unentity(self, byte):
276
self.bodydata += byte
279
def end_unentity(self):
    '''Leave the 'unentity' state: pass the pending body data to gotText.'''
    deliver = self.gotText
    deliver(self.bodydata)
282
def begin_expectcdata(self, byte):
285
def do_expectcdata(self, byte):
286
self.cdatabuf += byte
289
if len(cd) > len(cdb):
290
if cd.startswith(cdb):
292
elif self.beExtremelyLenient:
293
## WHAT THE CRAP!? MSWord9 generates HTML that includes these
294
## bizarre <![if !foo]> <![endif]> chunks, so I've gotta ignore
295
## 'em as best I can. this should really be a separate parse
296
## state but I don't even have any idea what these _are_.
299
self._parseError("Mal-formed CDATA header")
303
self._parseError("Mal-formed CDATA header")
305
def do_cdata(self, byte):
306
self.cdatabuf += byte
307
if self.cdatabuf.endswith("]]>"):
308
self.cdatabuf = self.cdatabuf[:-3]
312
self.gotCData(self.cdatabuf)
315
def do_attrs(self, byte):
316
if byte.isalnum() or byte in identChars:
317
# XXX FIXME really handle !DOCTYPE at some point
318
if self.tagName == '!DOCTYPE':
320
if self.tagName[0] in '!?':
326
self.gotTagStart(self.tagName, self.tagAttributes)
327
return (not self.beExtremelyLenient) and 'bodydata' or self.maybeBodyData()
330
elif self.beExtremelyLenient:
331
# discard and move on? Only case I've seen of this so far was:
334
self._parseError("Unexpected character: %r" % byte)
336
def begin_doctype(self, byte):
339
def do_doctype(self, byte):
344
def end_doctype(self):
345
self.gotDoctype(self.doctype)
348
def do_waitforgt(self, byte):
350
if self.endtag or not self.beExtremelyLenient:
352
return self.maybeBodyData()
354
def begin_attrname(self, byte):
356
self._attrname_termtag = 0
358
def do_attrname(self, byte):
359
if byte.isalnum() or byte in identChars:
360
self.attrname += byte
363
return 'beforeattrval'
366
elif self.beExtremelyLenient:
369
if byte in lenientIdentChars or byte.isalnum():
370
self.attrname += byte
373
self._attrname_termtag = 1
376
self.attrval = 'True'
377
self.tagAttributes[self.attrname] = self.attrval
378
self.gotTagStart(self.tagName, self.tagAttributes)
379
if self._attrname_termtag:
380
self.gotTagEnd(self.tagName)
382
return self.maybeBodyData()
383
# something is really broken. let's leave this attribute where it
384
# is and move on to the next thing
386
self._parseError("Invalid attribute name: %r %r" % (self.attrname, byte))
388
def do_beforeattrval(self, byte):
393
elif self.beExtremelyLenient:
394
if byte in lenientIdentChars or byte.isalnum():
397
self.attrval = 'True'
398
self.tagAttributes[self.attrname] = self.attrval
399
self.gotTagStart(self.tagName, self.tagAttributes)
400
return self.maybeBodyData()
402
# I saw this in actual HTML once:
403
# <font size=\"3\"><sup>SM</sup></font>
405
self._parseError("Invalid initial attribute value: %r; Attribute values must be quoted." % byte)
410
def begin_beforeeq(self, byte):
    '''Enter the 'beforeeq' state.

    Clears the _beforeeq_termtag flag, which do_beforeeq consults to decide
    whether gotTagEnd should be reported as well.  The byte argument is not
    used.'''
    self._beforeeq_termtag = 0
413
def do_beforeeq(self, byte):
415
return 'beforeattrval'
418
elif self.beExtremelyLenient:
419
if byte.isalnum() or byte in identChars:
420
self.attrval = 'True'
421
self.tagAttributes[self.attrname] = self.attrval
424
self.attrval = 'True'
425
self.tagAttributes[self.attrname] = self.attrval
426
self.gotTagStart(self.tagName, self.tagAttributes)
427
if self._beforeeq_termtag:
428
self.gotTagEnd(self.tagName)
430
return self.maybeBodyData()
432
self._beforeeq_termtag = 1
434
self._parseError("Invalid attribute")
436
def begin_attrval(self, byte):
437
self.quotetype = byte
440
def do_attrval(self, byte):
441
if byte == self.quotetype:
445
def end_attrval(self):
    '''Leave the 'attrval' state.

    Stores the collected value in tagAttributes under the current attribute
    name, then resets both the name and the value buffer to empty strings.'''
    self.tagAttributes[self.attrname] = self.attrval
    # Same order as the chained form: attrname is cleared first, then attrval.
    self.attrname = ''
    self.attrval = ''
449
def begin_messyattr(self, byte):
452
def do_messyattr(self, byte):
457
if self.attrval.endswith('/'):
459
self.attrval = self.attrval[:-1]
460
self.tagAttributes[self.attrname] = self.attrval
461
self.gotTagStart(self.tagName, self.tagAttributes)
463
self.gotTagEnd(self.tagName)
465
return self.maybeBodyData()
469
def end_messyattr(self):
471
self.tagAttributes[self.attrname] = self.attrval
473
def begin_afterslash(self, byte):
# Enter the 'afterslash' state by clearing _after_slash_closed.
# do_afterslash checks this flag first and reports a "Mal-formed" parse
# error when it is already set; it sets the flag to 1 just before calling
# gotTagStart and gotTagEnd.  The byte argument is not used here.
474
self._after_slash_closed = 0
476
def do_afterslash(self, byte):
477
# this state is only after a self-terminating slash, e.g. <foo/>
478
if self._after_slash_closed:
479
self._parseError("Mal-formed")#XXX When does this happen??
481
if self.beExtremelyLenient:
484
self._parseError("No data allowed after '/'")
485
self._after_slash_closed = 1
486
self.gotTagStart(self.tagName, self.tagAttributes)
487
self.gotTagEnd(self.tagName)
488
# don't need maybeBodyData here because there better not be
489
# any javascript code after a <script/>... we'll see :(
492
def begin_bodydata(self, byte):
493
if self._leadingBodyData:
494
self.bodydata = self._leadingBodyData
495
del self._leadingBodyData
499
def do_bodydata(self, byte):
504
self.bodydata += byte
506
def end_bodydata(self):
507
self.gotText(self.bodydata)
510
def do_waitforendscript(self, byte):
512
return 'waitscriptendtag'
513
self.bodydata += byte
515
def begin_waitscriptendtag(self, byte):
516
self.temptagdata = ''
520
def do_waitscriptendtag(self, byte):
521
# 1 enforce / as first byte read
522
# 2 enforce following bytes to be subset of "script" until
523
# tagName == "script"
524
# 2a when that happens, gotText(self.bodydata) and gotTagEnd(self.tagName)
525
# 3 spaces can happen anywhere, they're ignored
527
# 4 anything else causes all data I've read to be moved to the
528
# bodydata, and switch back to waitforendscript state
530
# If it turns out this _isn't_ a </script>, we need to
531
# remember all the data we've been through so we can append it
533
self.temptagdata += byte
538
elif not self.endtag:
539
self.bodydata += "<" + self.temptagdata
540
return 'waitforendscript'
542
elif byte.isalnum() or byte in identChars:
544
if not 'script'.startswith(self.tagName):
545
self.bodydata += "<" + self.temptagdata
546
return 'waitforendscript'
547
elif self.tagName == 'script':
548
self.gotText(self.bodydata)
549
self.gotTagEnd(self.tagName)
553
return 'waitscriptendtag'
556
self.bodydata += "<" + self.temptagdata
557
return 'waitforendscript'
560
def begin_entityref(self, byte):
562
self.erefextra = '' # extra bit for lenient mode
564
def do_entityref(self, byte):
565
if byte.isspace() or byte == "<":
566
if self.beExtremelyLenient:
567
# '&foo' probably was '&amp;foo'
568
if self.erefbuf and self.erefbuf != "amp":
569
self.erefextra = self.erefbuf
574
self.erefextra += byte
575
return 'spacebodydata'
576
self._parseError("Bad entity reference")
582
def end_entityref(self):
    '''Leave the 'entityref' state: report erefbuf via gotEntityReference.'''
    report = self.gotEntityReference
    report(self.erefbuf)
585
# hacky support for space after & in entityref in beExtremelyLenient
586
# state should only happen in that case
587
def begin_spacebodydata(self, byte):
    '''Enter the 'spacebodydata' state.

    The text held in erefextra becomes the initial body data, and erefextra
    is then set to None.  The byte argument is not used.'''
    # The right-hand side is evaluated first; bodydata and erefextra are then
    # assigned in that order, exactly as two separate statements would do.
    self.bodydata, self.erefextra = self.erefextra, None
590
do_spacebodydata = do_bodydata
591
end_spacebodydata = end_bodydata
595
def gotTagStart(self, name, attributes):
596
'''Encountered an opening tag.

name is the tag name and attributes is the attribute mapping collected
for the tag (the parser passes its tagName and tagAttributes, or an empty
dict when the tag had no attributes).

Default behaviour is to print.'''
599
# Python 2 print statement: the three items are written space-separated.
print 'begin', name, attributes
601
def gotText(self, data):
604
Default behaviour is to print.'''
605
print 'text:', repr(data)
607
def gotEntityReference(self, entityRef):
608
'''Encountered mnemonic entity reference

entityRef is the value handed over by end_entityref (its erefbuf buffer),
presumably the reference name without the surrounding '&' and ';', since
this default implementation adds them back when printing.

Default behaviour is to print.'''
611
print 'entityRef: &%s;' % entityRef
613
def gotComment(self, comment):
614
'''Encountered comment.
616
Default behaviour is to ignore.'''
619
def gotCData(self, cdata):
622
Default behaviour is to call the gotText method'''
625
def gotDoctype(self, doctype):
626
"""Encountered DOCTYPE
628
This is really grotty: it basically just gives you everything between
629
'<!DOCTYPE' and '>' as an argument.
631
print '!DOCTYPE', repr(doctype)
633
def gotTagEnd(self, name):
634
'''Encountered closing tag
636
Default behaviour is to print.'''
639
if __name__ == '__main__':
640
from cStringIO import StringIO
643
<!DOCTYPE ignore all this shit, hah its malformed!!!!@$>
644
<?xml version="suck it"?>
648
<baz boz="buz">boz &zop;</baz>
649
<![CDATA[ foo bar baz ]]>
653
x.makeConnection(FileWrapper(StringIO()))
654
# fn = "/home/glyph/Projects/Twisted/doc/howto/ipc10paper.html"
655
fn = "/home/glyph/gruesome.xml"
656
# testDocument = open(fn).read()
657
x.dataReceived(testDocument)