1
# -*- test-case-name: twisted.web.test.test_xml -*-
2
# Copyright (c) 2001-2008 Twisted Matrix Laboratories.
3
# See LICENSE for details.
6
Micro Document Object Model: a partial DOM implementation with SUX.
8
This is an implementation of what we consider to be the useful subset of the
9
DOM. The chief advantage of this library is that, not being burdened with
10
standards compliance, it can remain very stable between versions. We can also
11
implement utility 'pythonic' ways to access and mutate the XML tree.
13
Since this has not been subjected to a serious trial by fire, it is not recommended
14
to use this outside of Twisted applications. However, it seems to work just
15
fine for the documentation generator, which parses a fairly representative
18
Microdom mainly focuses on working with HTML and XHTML.
23
from cStringIO import StringIO
25
# create NodeList class
26
from types import ListType as NodeList
27
from types import StringTypes, UnicodeType
30
from twisted.web.sux import XMLParser, ParseError
31
from twisted.python.util import InsensitiveDict
34
def getElementsByTagName(iNode, name):
36
Return a list of all child elements of C{iNode} with a name matching
39
Note that this implementation does not conform to the DOM Level 1 Core
40
specification because it may return C{iNode}.
42
@param iNode: An element at which to begin searching. If C{iNode} has a
43
name matching C{name}, it will be included in the result.
45
@param name: A C{str} giving the name of the elements to return.
47
@return: A C{list} of direct or indirect child elements of C{iNode} with
48
the name C{name}. This may include C{iNode}.
51
matches_append = matches.append # faster lookup. don't do this at home
55
if c.nodeName == name:
57
slice[:0] = c.childNodes
62
def getElementsByTagNameNoCase(iNode, name):
65
matches_append = matches.append
69
if c.nodeName.lower() == name:
71
slice[:0] = c.childNodes
75
# Ordered (character, entity) pairs.  '&' has to be escaped first, otherwise
# the ampersands introduced by the other entities would be escaped again.
# NOTE(review): the '<', '>' and '"' rows, the loop headers and the return
# statements below were missing from the damaged text and have been
# reconstructed -- confirm against the pristine file.
HTML_ESCAPE_CHARS = (('&', '&amp;'), # don't add any entities before this one
                     ('<', '&lt;'),
                     ('>', '&gt;'),
                     ('"', '&quot;'))
# Reversed copy used by unescape(), so that '&amp;' is decoded last.
REV_HTML_ESCAPE_CHARS = list(HTML_ESCAPE_CHARS)
REV_HTML_ESCAPE_CHARS.reverse()

# XML additionally predefines an entity for the apostrophe.
XML_ESCAPE_CHARS = HTML_ESCAPE_CHARS + (("'", '&apos;'),)
REV_XML_ESCAPE_CHARS = list(XML_ESCAPE_CHARS)
REV_XML_ESCAPE_CHARS.reverse()

def unescape(text, chars=REV_HTML_ESCAPE_CHARS):
    """
    Perform the exact opposite of 'escape'.

    @param text: The string in which entities are replaced.
    @param chars: A sequence of C{(character, entity)} pairs, applied in
        order; every occurrence of the entity becomes the character.

    @return: C{text} with the replacements applied.
    """
    for s, h in chars:
        text = text.replace(h, s)
    return text

def escape(text, chars=HTML_ESCAPE_CHARS):
    """
    Escape a few XML special chars with XML entities.

    @param text: The string in which special characters are replaced.
    @param chars: A sequence of C{(character, entity)} pairs, applied in
        order; every occurrence of the character becomes the entity.

    @return: C{text} with the replacements applied.
    """
    for s, h in chars:
        text = text.replace(s, h)
    return text
99
class MismatchedTags(Exception):
101
def __init__(self, filename, expect, got, endLine, endCol, begLine, begCol):
102
(self.filename, self.expect, self.got, self.begLine, self.begCol, self.endLine,
103
self.endCol) = filename, expect, got, begLine, begCol, endLine, endCol
106
return ("expected </%s>, got </%s> line: %s col: %s, began line: %s col: %s"
107
% (self.expect, self.got, self.endLine, self.endCol, self.begLine,
114
def __init__(self, parentNode=None):
115
self.parentNode = parentNode
118
def isEqualToNode(self, other):
120
Compare this node to C{other}. If the nodes have the same number of
121
children and corresponding children are equal to each other, return
122
C{True}, otherwise return C{False}.
127
if len(self.childNodes) != len(other.childNodes):
129
for a, b in zip(self.childNodes, other.childNodes):
130
if not a.isEqualToNode(b):
134
def writexml(self, stream, indent='', addindent='', newl='', strip=0,
             nsprefixes={}, namespace=''):
    """
    Serialize this node to C{stream}.

    This base implementation does nothing but raise; it only fixes the
    signature.

    @raise NotImplementedError: Always.
    """
    # The {} default is kept for signature compatibility; it is never
    # mutated here.
    raise NotImplementedError()
138
def toxml(self, indent='', addindent='', newl='', strip=0, nsprefixes={},
141
self.writexml(s, indent, addindent, newl, strip, nsprefixes, namespace)
145
def writeprettyxml(self, stream, indent='', addindent=' ', newl='\n', strip=0):
    """
    Call C{self.writexml} with pretty-printing defaults: a one-space
    C{addindent} and a newline as C{newl}.

    @return: Whatever C{self.writexml} returns.
    """
    return self.writexml(stream, indent, addindent, newl, strip)
148
def toprettyxml(self, indent='', addindent=' ', newl='\n', strip=0):
    """
    Call C{self.toxml} with pretty-printing defaults: a one-space
    C{addindent} and a newline as C{newl}.

    @return: Whatever C{self.toxml} returns.
    """
    return self.toxml(indent, addindent, newl, strip)
151
def cloneNode(self, deep=0, parent=None):
    """
    Copy this node.  This base implementation only fixes the signature.

    @raise NotImplementedError: Always.
    """
    raise NotImplementedError()
154
def hasChildNodes(self):
161
def appendChild(self, child):
    """
    Make the given L{Node} the last child of this node.

    @param child: The L{Node} which will become a child of this node.

    @raise TypeError: If C{child} is not a C{Node} instance.
    """
    if not isinstance(child, Node):
        raise TypeError("expected Node instance")
    self.childNodes.append(child)
    # Keep the back-reference in step with the child list.
    child.parentNode = self
175
def insertBefore(self, new, ref):
177
Make the given L{Node} C{new} a child of this node which comes before
180
@param new: A L{Node} which will become a child of this node.
182
@param ref: A L{Node} which is already a child of this node which
183
C{new} will be inserted before.
185
@raise TypeError: If C{new} or C{ref} is not a C{Node} instance.
189
if not isinstance(new, Node) or not isinstance(ref, Node):
190
raise TypeError("expected Node instance")
191
i = self.childNodes.index(ref)
192
new.parentNode = self
193
self.childNodes.insert(i, new)
197
def removeChild(self, child):
199
Remove the given L{Node} from this node's children.
201
@param child: A L{Node} which is a child of this node which will no
202
longer be a child of this node after this method is called.
204
@raise TypeError: If C{child} is not a C{Node} instance.
208
if not isinstance(child, Node):
209
raise TypeError("expected Node instance")
210
if child in self.childNodes:
211
self.childNodes.remove(child)
212
child.parentNode = None
215
def replaceChild(self, newChild, oldChild):
    """
    Replace a L{Node} which is already a child of this node with a
    different node.

    @param newChild: A L{Node} which will be made a child of this node.

    @param oldChild: A L{Node} which is a child of this node which will
        give up its position to C{newChild}.

    @raise TypeError: If C{newChild} or C{oldChild} is not a C{Node}
        instance.

    @raise ValueError: If C{oldChild} is not a child of this C{Node}.
    """
    if not isinstance(newChild, Node) or not isinstance(oldChild, Node):
        raise TypeError("expected Node instance")
    if oldChild.parentNode is not self:
        raise ValueError("oldChild is not a child of this node")
    position = self.childNodes.index(oldChild)
    self.childNodes[position] = newChild
    # Detach the old node and adopt the new one.
    oldChild.parentNode = None
    newChild.parentNode = self
240
return self.childNodes[-1]
243
def firstChild(self):
    """
    Return the first entry of C{self.childNodes}, or C{None} when there are
    no children.
    """
    if self.childNodes:
        return self.childNodes[0]
    return None
248
#def get_ownerDocument(self):
249
# """This doesn't really get the owner document; microdom nodes
250
# don't even have one necessarily. This gets the root node,
251
# which is usually what you really meant.
252
# *NOT DOM COMPLIANT.*
255
# while (node.parentNode): node=node.parentNode
257
#ownerDocument=node.get_ownerDocument()
258
# leaving commented for discussion; see also domhelpers.getParents(node)
260
class Document(Node):
262
def __init__(self, documentElement=None):
265
self.appendChild(documentElement)
267
def cloneNode(self, deep=0, parent=None):
269
d.doctype = self.doctype
271
newEl = self.documentElement.cloneNode(1, self)
273
newEl = self.documentElement
279
def isEqualToDocument(self, n):
    """
    Compare this document to C{n}: the C{doctype} attributes must be equal
    and the generic L{Node} comparison must succeed.

    @return: A false value if the doctypes differ, otherwise the result of
        C{Node.isEqualToNode(self, n)}.
    """
    # The doctype check comes first so the child comparison is skipped when
    # it already fails.
    return (self.doctype == n.doctype) and Node.isEqualToNode(self, n)
isEqualToNode = isEqualToDocument
283
def get_documentElement(self):
    """
    Return the first entry of C{self.childNodes}.

    @raise IndexError: If C{self.childNodes} is empty.
    """
    return self.childNodes[0]
# Read-only attribute access to the same value.
documentElement = property(get_documentElement)
287
def appendChild(self, child):
289
Make the given L{Node} the I{document element} of this L{Document}.
291
@param child: The L{Node} to make into this L{Document}'s document
294
@raise ValueError: If this document already has a document element.
297
raise ValueError("Only one element per document.")
298
Node.appendChild(self, child)
300
def writexml(self, stream, indent='', addindent='', newl='', strip=0,
301
nsprefixes={}, namespace=''):
302
stream.write('<?xml version="1.0"?>' + newl)
304
stream.write("<!DOCTYPE "+self.doctype+">" + newl)
305
self.documentElement.writexml(stream, indent, addindent, newl, strip,
306
nsprefixes, namespace)
308
# of dubious utility (?)
309
def createElement(self, name, **kw):
    """
    Build a new L{Element} called C{name}; keyword arguments are passed
    through to the L{Element} constructor.  The new element is not attached
    to this document.
    """
    return Element(name, **kw)
312
def createTextNode(self, text):
315
def createComment(self, text):
318
def getElementsByTagName(self, name):
    """
    Search this document for elements called C{name}.

    The case-insensitive search function is used when the document element's
    C{caseInsensitive} flag is set, the case-sensitive one otherwise.

    @return: The result of the chosen module-level search function.
    """
    if self.documentElement.caseInsensitive:
        return getElementsByTagNameNoCase(self, name)
    return getElementsByTagName(self, name)
323
def getElementById(self, id):
324
childNodes = self.childNodes[:]
326
node = childNodes.pop(0)
328
childNodes.extend(node.childNodes)
329
if hasattr(node, 'getAttribute') and node.getAttribute("id") == id:
333
class EntityReference(Node):
335
def __init__(self, eref, parentNode=None):
336
Node.__init__(self, parentNode)
338
self.nodeValue = self.data = "&" + eref + ";"
340
def isEqualToEntityReference(self, n):
341
if not isinstance(n, EntityReference):
343
return (self.eref == n.eref) and (self.nodeValue == n.nodeValue)
344
isEqualToNode = isEqualToEntityReference
346
def writexml(self, stream, indent='', addindent='', newl='', strip=0,
             nsprefixes={}, namespace=''):
    """
    Write this node's C{nodeValue} to C{stream} unchanged.

    All formatting and namespace arguments are accepted for signature
    compatibility and ignored.
    """
    stream.write(self.nodeValue)
350
def cloneNode(self, deep=0, parent=None):
    """
    Return a new L{EntityReference} built from this node's C{eref}, with
    C{parent} as its parent node.  C{deep} is ignored.
    """
    return EntityReference(self.eref, parent)
354
class CharacterData(Node):
    """
    A L{Node} that carries a single piece of data.
    """

    def __init__(self, data, parentNode=None):
        """
        @param data: The payload; stored under the names C{value}, C{data}
            and C{nodeValue}.
        @param parentNode: Passed on to L{Node.__init__}.
        """
        Node.__init__(self, parentNode)
        self.value = self.data = self.nodeValue = data

    def isEqualToCharacterData(self, n):
        """
        Return the result of comparing this node's C{value} with C{n.value}.
        """
        return self.value == n.value

    # Node comparison for character data is the value comparison above.
    isEqualToNode = isEqualToCharacterData
365
class Comment(CharacterData):
366
"""A comment node."""
368
def writexml(self, stream, indent='', addindent='', newl='', strip=0,
369
nsprefixes={}, namespace=''):
371
if isinstance(val, UnicodeType):
372
val=val.encode('utf8')
373
stream.write("<!--%s-->" % val)
375
def cloneNode(self, deep=0, parent=None):
    """
    Return a new L{Comment} with this node's C{nodeValue} and C{parent} as
    its parent node.  C{deep} is ignored.
    """
    return Comment(self.nodeValue, parent)
379
class Text(CharacterData):
381
def __init__(self, data, parentNode=None, raw=0):
382
CharacterData.__init__(self, data, parentNode)
386
def isEqualToNode(self, other):
388
Compare this text to C{text}. If the underlying values and the C{raw}
389
flag are the same, return C{True}, otherwise return C{False}.
392
CharacterData.isEqualToNode(self, other) and
393
self.raw == other.raw)
396
def cloneNode(self, deep=0, parent=None):
    """
    Return a new L{Text} with this node's C{nodeValue} and C{raw} flag and
    C{parent} as its parent node.  C{deep} is ignored.
    """
    return Text(self.nodeValue, parent, self.raw)
399
def writexml(self, stream, indent='', addindent='', newl='', strip=0,
400
nsprefixes={}, namespace=''):
403
if not isinstance(val, StringTypes):
404
val = str(self.nodeValue)
407
if not isinstance(v, StringTypes):
410
v = ' '.join(v.split())
412
if isinstance(val, UnicodeType):
413
val = val.encode('utf8')
417
return "Text(%s" % repr(self.nodeValue) + ')'
420
class CDATASection(CharacterData):
421
def cloneNode(self, deep=0, parent=None):
    """
    Return a new L{CDATASection} with this node's C{nodeValue} and C{parent}
    as its parent node.  C{deep} is ignored.
    """
    return CDATASection(self.nodeValue, parent)
424
def writexml(self, stream, indent='', addindent='', newl='', strip=0,
425
nsprefixes={}, namespace=''):
426
stream.write("<![CDATA[")
427
stream.write(self.nodeValue)
435
genprefix = _genprefix().next
437
class _Attr(CharacterData):
    """
    Support class for getAttributeNode.
    """
446
def __init__(self, tagName, attributes=None, parentNode=None,
447
filename=None, markpos=None,
448
caseInsensitive=1, preserveCase=0,
450
Node.__init__(self, parentNode)
451
self.preserveCase = preserveCase or not caseInsensitive
452
self.caseInsensitive = caseInsensitive
454
tagName = tagName.lower()
455
if attributes is None:
458
self.attributes = attributes
459
for k, v in self.attributes.items():
460
self.attributes[k] = unescape(v)
463
self.attributes = InsensitiveDict(self.attributes,
464
preserve=preserveCase)
466
self.endTagName = self.nodeName = self.tagName = tagName
467
self._filename = filename
468
self._markpos = markpos
469
self.namespace = namespace
471
def addPrefixes(self, pfxs):
472
if self.nsprefixes is None:
473
self.nsprefixes = pfxs
475
self.nsprefixes.update(pfxs)
477
def endTag(self, endTagName):
    """
    Record the name used by the closing tag.

    @param endTagName: The closing tag's name; it is lower-cased first
        unless C{self.preserveCase} is set.
    """
    if not self.preserveCase:
        endTagName = endTagName.lower()
    self.endTagName = endTagName
482
def isEqualToElement(self, n):
    """
    Compare this element's attributes and name with those of C{n}.

    Names are compared without regard to case when C{self.caseInsensitive}
    is set.  Child nodes are not looked at.

    @return: A true value if attributes and names match, a false value
        otherwise.
    """
    # Deliberately "not ==" rather than "!=": a mapping class that defines
    # only __eq__ gives a meaningless answer to != on Python 2.
    if not (self.attributes == n.attributes):
        return False
    if self.caseInsensitive:
        return self.nodeName.lower() == n.nodeName.lower()
    return self.nodeName == n.nodeName
489
def isEqualToNode(self, other):
491
Compare this element to C{other}. If the C{nodeName}, C{namespace},
492
C{attributes}, and C{childNodes} are all the same, return C{True},
493
otherwise return C{False}.
496
self.nodeName.lower() == other.nodeName.lower() and
497
self.namespace == other.namespace and
498
self.attributes == other.attributes and
499
Node.isEqualToNode(self, other))
502
def cloneNode(self, deep=0, parent=None):
504
self.tagName, parentNode=parent, namespace=self.namespace,
505
preserveCase=self.preserveCase, caseInsensitive=self.caseInsensitive)
506
clone.attributes.update(self.attributes)
508
clone.childNodes = [child.cloneNode(1, clone) for child in self.childNodes]
510
clone.childNodes = []
513
def getElementsByTagName(self, name):
    """
    Search from this element for elements called C{name}.

    The case-insensitive search function is used when
    C{self.caseInsensitive} is set, the case-sensitive one otherwise.

    @return: The result of the chosen module-level search function.
    """
    if self.caseInsensitive:
        return getElementsByTagNameNoCase(self, name)
    return getElementsByTagName(self, name)
518
def hasAttributes(self):
521
def getAttribute(self, name, default=None):
    """
    Look up the attribute C{name}.

    @return: The stored value, or C{default} if there is no such attribute.
    """
    return self.attributes.get(name, default)
524
def getAttributeNS(self, ns, name, default=None):
526
if self.attributes.has_key(nsk):
527
return self.attributes[nsk]
528
if ns == self.namespace:
529
return self.attributes.get(name, default)
532
def getAttributeNode(self, name):
    """
    Return a new L{_Attr} wrapping the value of the attribute C{name}, with
    this element as its parent node.
    """
    return _Attr(self.getAttribute(name), self)
535
def setAttribute(self, name, attr):
    """
    Store C{attr} as the value of the attribute C{name}, replacing any
    previous value.
    """
    self.attributes[name] = attr
538
def removeAttribute(self, name):
    """
    Delete the attribute C{name} if it is present; do nothing otherwise.
    """
    if name in self.attributes:
        del self.attributes[name]
542
def hasAttribute(self, name):
    """
    Report whether the attribute C{name} is present on this element.
    """
    return name in self.attributes
546
def writexml(self, stream, indent='', addindent='', newl='', strip=0,
547
nsprefixes={}, namespace=''):
549
Serialize this L{Element} to the given stream.
551
@param stream: A file-like object to which this L{Element} will be
554
@param nsprefixes: A C{dict} mapping namespace URIs as C{str} to
555
prefixes as C{str}. This defines the prefixes which are already in
556
scope in the document at the point at which this L{Element} exists.
557
This is essentially an implementation detail for namespace support.
558
Applications should not try to use it.
560
@param namespace: The namespace URI as a C{str} which is the default at
561
the point in the document at which this L{Element} exists. This is
562
essentially an implementation detail for namespace support.
563
Applications should not try to use it.
566
ALLOWSINGLETON = ('img', 'br', 'hr', 'base', 'meta', 'link', 'param',
567
'area', 'input', 'col', 'basefont', 'isindex',
569
BLOCKELEMENTS = ('html', 'head', 'body', 'noscript', 'ins', 'del',
570
'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'script',
571
'ul', 'ol', 'dl', 'pre', 'hr', 'blockquote',
572
'address', 'p', 'div', 'fieldset', 'table', 'tr',
573
'form', 'object', 'fieldset', 'applet', 'map')
574
FORMATNICELY = ('tr', 'ul', 'ol', 'head')
576
# this should never be necessary unless people start
577
# changing .tagName on the fly(?)
578
if not self.preserveCase:
579
self.endTagName = self.tagName
582
newprefixes = self.nsprefixes.copy()
583
for ns in nsprefixes.keys():
584
if ns in newprefixes:
590
if self.tagName in BLOCKELEMENTS:
591
begin = [newl, indent] + begin
593
writeattr = lambda _atr, _val: bext((' ', _atr, '="', escape(_val), '"'))
595
# Make a local for tracking what end tag will be used. If namespace
596
# prefixes are involved, this will be changed to account for that
597
# before it's actually used.
598
endTagName = self.endTagName
600
if namespace != self.namespace and self.namespace is not None:
601
# If the current default namespace is not the namespace of this tag
602
# (and this tag has a namespace at all) then we'll write out
603
# something related to namespaces.
604
if self.namespace in nsprefixes:
605
# This tag's namespace already has a prefix bound to it. Use
607
prefix = nsprefixes[self.namespace]
608
bext(prefix + ':' + self.tagName)
609
# Also make sure we use it for the end tag.
610
endTagName = prefix + ':' + self.endTagName
612
# This tag's namespace has no prefix bound to it. Change the
613
# default namespace to this tag's namespace so we don't need
614
# prefixes. Alternatively, we could add a new prefix binding.
615
# I'm not sure why the code was written one way rather than the
618
writeattr("xmlns", self.namespace)
619
# The default namespace just changed. Make sure any children
621
namespace = self.namespace
623
# This tag has no namespace or its namespace is already the default
624
# namespace. Nothing extra to do here.
628
for attr, val in self.attributes.iteritems():
629
if isinstance(attr, tuple):
631
if nsprefixes.has_key(ns):
632
prefix = nsprefixes[ns]
635
newprefixes[ns] = prefix
636
assert val is not None
637
writeattr(prefix+':'+key,val)
639
assert val is not None
642
for ns, prefix in newprefixes.iteritems():
644
writeattr('xmlns:'+prefix, ns)
645
newprefixes.update(nsprefixes)
646
downprefixes = newprefixes
648
downprefixes = nsprefixes
652
newindent = indent + addindent
653
for child in self.childNodes:
654
if self.tagName in BLOCKELEMENTS and \
655
self.tagName in FORMATNICELY:
656
w(j((newl, newindent)))
657
child.writexml(stream, newindent, addindent, newl, strip,
658
downprefixes, namespace)
659
if self.tagName in BLOCKELEMENTS:
661
w(j(('</', endTagName, '>')))
662
elif self.tagName.lower() not in ALLOWSINGLETON:
663
w(j(('></', endTagName, '>')))
669
rep = "Element(%s" % repr(self.nodeName)
671
rep += ", attributes=%r" % (self.attributes,)
673
rep += ", filename=%r" % (self._filename,)
675
rep += ", markpos=%r" % (self._markpos,)
679
rep = "<" + self.nodeName
680
if self._filename or self._markpos:
683
rep += repr(self._filename)
685
rep += " line %s column %s" % self._markpos
686
if self._filename or self._markpos:
688
for item in self.attributes.items():
689
rep += " %s=%r" % item
690
if self.hasChildNodes():
691
rep += " >...</%s>" % self.nodeName
696
def _unescapeDict(d):
698
for k, v in d.items():
704
for k, v in d.items():
708
class MicroDOMParser(XMLParser):
710
# <dash> glyph: a quick scan thru the DTD says BODY, AREA, LINK, IMG, HR,
711
# P, DT, DD, LI, INPUT, OPTION, THEAD, TFOOT, TBODY, COLGROUP, COL, TR, TH,
712
# TD, HEAD, BASE, META, HTML all have optional closing tags
714
soonClosers = 'area link br img hr input base meta'.split()
715
laterClosers = {'p': ['p', 'dt'],
719
'tbody': ['thead', 'tfoot', 'tbody'],
720
'thead': ['thead', 'tfoot', 'tbody'],
721
'tfoot': ['thead', 'tfoot', 'tbody'],
722
'colgroup': ['colgroup'],
728
'title': ['head', 'body'], # this looks wrong...
729
'option': ['option'],
733
def __init__(self, beExtremelyLenient=0, caseInsensitive=1, preserveCase=0,
734
soonClosers=soonClosers, laterClosers=laterClosers):
735
self.elementstack = []
736
d = {'xmlns': 'xmlns', '': None}
738
self.nsstack = [(d,None,dr)]
740
self._mddoctype = None
741
self.beExtremelyLenient = beExtremelyLenient
742
self.caseInsensitive = caseInsensitive
743
self.preserveCase = preserveCase or not caseInsensitive
744
self.soonClosers = soonClosers
745
self.laterClosers = laterClosers
746
# self.indentlevel = 0
748
def shouldPreserveSpace(self):
749
for edx in xrange(len(self.elementstack)):
750
el = self.elementstack[-edx]
751
if el.tagName == 'pre' or el.getAttribute("xml:space", '') == 'preserve':
755
def _getparent(self):
756
if self.elementstack:
757
return self.elementstack[-1]
761
COMMENT = re.compile(r"\s*/[/*]\s*")
763
def _fixScriptElement(self, el):
764
# this deals with case where there is comment or CDATA inside
765
# <script> tag and we want to do the right thing with it
766
if not self.beExtremelyLenient or not len(el.childNodes) == 1:
769
if isinstance(c, Text):
770
# deal with nasty people who do stuff like:
774
# tidy does this, for example.
777
match = self.COMMENT.match(oldvalue)
779
prefix = match.group()
780
oldvalue = oldvalue[len(prefix):]
782
# now see if contents are actual node and comment or CDATA
784
e = parseString("<a>%s</a>" % oldvalue).childNodes[0]
785
except (ParseError, MismatchedTags):
787
if len(e.childNodes) != 1:
790
if isinstance(e, (CDATASection, Comment)):
793
el.childNodes.append(Text(prefix))
794
el.childNodes.append(e)
796
def gotDoctype(self, doctype):
    """
    Remember C{doctype} on the parser as C{_mddoctype}.
    """
    self._mddoctype = doctype
799
def gotTagStart(self, name, attributes):
800
# print ' '*self.indentlevel, 'start tag',name
801
# self.indentlevel += 1
802
parent = self._getparent()
803
if (self.beExtremelyLenient and isinstance(parent, Element)):
804
parentName = parent.tagName
806
if self.caseInsensitive:
807
parentName = parentName.lower()
808
myName = myName.lower()
809
if myName in self.laterClosers.get(parentName, []):
810
self.gotTagEnd(parent.tagName)
811
parent = self._getparent()
812
attributes = _unescapeDict(attributes)
813
namespaces = self.nsstack[-1][0]
815
for k, v in attributes.items():
816
if k.startswith('xmlns'):
817
spacenames = k.split(':',1)
818
if len(spacenames) == 2:
819
newspaces[spacenames[1]] = v
824
namespaces = namespaces.copy()
825
namespaces.update(newspaces)
826
for k, v in attributes.items():
827
ksplit = k.split(':', 1)
830
if pfx != 'xml' and namespaces.has_key(pfx):
831
attributes[namespaces[pfx], tv] = v
833
el = Element(name, attributes, parent,
834
self.filename, self.saveMark(),
835
caseInsensitive=self.caseInsensitive,
836
preserveCase=self.preserveCase,
837
namespace=namespaces.get(''))
838
revspaces = _reverseDict(newspaces)
839
el.addPrefixes(revspaces)
842
rscopy = self.nsstack[-1][2].copy()
843
rscopy.update(revspaces)
844
self.nsstack.append((namespaces, el, rscopy))
845
self.elementstack.append(el)
847
parent.appendChild(el)
848
if (self.beExtremelyLenient and el.tagName in self.soonClosers):
851
def _gotStandalone(self, factory, data):
852
parent = self._getparent()
853
te = factory(data, parent)
855
parent.appendChild(te)
856
elif self.beExtremelyLenient:
857
self.documents.append(te)
859
def gotText(self, data):
    """
    Turn C{data} into a L{Text} node via C{_gotStandalone}.

    Whitespace-only data is dropped unless C{self.shouldPreserveSpace()}
    returns a true value.
    """
    if data.strip() or self.shouldPreserveSpace():
        self._gotStandalone(Text, data)
863
def gotComment(self, data):
    """
    Turn C{data} into a L{Comment} node via C{_gotStandalone}.
    """
    self._gotStandalone(Comment, data)
866
def gotEntityReference(self, entityRef):
    """
    Turn C{entityRef} into an L{EntityReference} node via C{_gotStandalone}.
    """
    self._gotStandalone(EntityReference, entityRef)
869
def gotCData(self, cdata):
    """
    Turn C{cdata} into a L{CDATASection} node via C{_gotStandalone}.
    """
    self._gotStandalone(CDATASection, cdata)
872
def gotTagEnd(self, name):
873
# print ' '*self.indentlevel, 'end tag',name
874
# self.indentlevel -= 1
875
if not self.elementstack:
876
if self.beExtremelyLenient:
878
raise MismatchedTags(*((self.filename, "NOTHING", name)
879
+self.saveMark()+(0,0)))
880
el = self.elementstack.pop()
881
pfxdix = self.nsstack[-1][2]
882
if self.nsstack[-1][1] is el:
883
nstuple = self.nsstack.pop()
886
if self.caseInsensitive:
887
tn = el.tagName.lower()
893
nsplit = name.split(':',1)
895
pfx, newname = nsplit
896
ns = pfxdix.get(pfx,None)
898
if el.namespace != ns:
899
if not self.beExtremelyLenient:
900
raise MismatchedTags(*((self.filename, el.tagName, name)
901
+self.saveMark()+el._markpos))
902
if not (tn == cname):
903
if self.beExtremelyLenient:
904
if self.elementstack:
905
lastEl = self.elementstack[0]
906
for idx in xrange(len(self.elementstack)):
907
if self.elementstack[-(idx+1)].tagName == cname:
908
self.elementstack[-(idx+1)].endTag(name)
911
# this was a garbage close tag; wait for a real one
912
self.elementstack.append(el)
913
if nstuple is not None:
914
self.nsstack.append(nstuple)
916
del self.elementstack[-(idx+1):]
917
if not self.elementstack:
918
self.documents.append(lastEl)
921
raise MismatchedTags(*((self.filename, el.tagName, name)
922
+self.saveMark()+el._markpos))
924
if not self.elementstack:
925
self.documents.append(el)
926
if self.beExtremelyLenient and el.tagName == "script":
927
self._fixScriptElement(el)
929
def connectionLost(self, reason):
930
XMLParser.connectionLost(self, reason) # This can cause more events!
931
if self.elementstack:
932
if self.beExtremelyLenient:
933
self.documents.append(self.elementstack[0])
935
raise MismatchedTags(*((self.filename, self.elementstack[-1],
938
+self.elementstack[-1]._markpos))
941
def parse(readable, *args, **kwargs):
942
"""Parse HTML or XML readable."""
943
if not hasattr(readable, "read"):
944
readable = open(readable, "rb")
945
mdp = MicroDOMParser(*args, **kwargs)
946
mdp.filename = getattr(readable, "name", "<xmlfile />")
947
mdp.makeConnection(None)
948
if hasattr(readable,"getvalue"):
949
mdp.dataReceived(readable.getvalue())
951
r = readable.read(1024)
954
r = readable.read(1024)
955
mdp.connectionLost(None)
957
if not mdp.documents:
958
raise ParseError(mdp.filename, 0, 0, "No top-level Nodes in document")
960
if mdp.beExtremelyLenient:
961
if len(mdp.documents) == 1:
963
if not isinstance(d, Element):
969
for child in mdp.documents:
974
doc.doctype = mdp._mddoctype
977
def parseString(st, *args, **kw):
    """
    Parse the string C{st} by wrapping it in a C{StringIO} and calling
    L{parse}; extra arguments are passed through to L{parse}.

    @param st: The markup to parse.  A unicode string is encoded as UTF-16
        before it is parsed.

    @return: Whatever L{parse} returns.
    """
    if isinstance(st, UnicodeType):
        # this isn't particularly ideal, but it does work.
        return parse(StringIO(st.encode('UTF-16')), *args, **kw)
    return parse(StringIO(st), *args, **kw)
984
def parseXML(readable):
    """
    Parse an XML readable object.

    Calls L{parse} with C{caseInsensitive=0} and C{preserveCase=1}.
    """
    return parse(readable, caseInsensitive=0, preserveCase=1)
989
def parseXMLString(st):
    """
    Parse an XML string.

    Calls L{parseString} with C{caseInsensitive=0} and C{preserveCase=1}.
    """
    return parseString(st, caseInsensitive=0, preserveCase=1)
997
"""Easy creation of XML."""
999
def __init__(self, node='div'):
1000
if isinstance(node, StringTypes):
1001
node = Element(node)
1004
def __getattr__(self, name):
1006
raise AttributeError("no private attrs")
1007
return lambda **kw: self.add(name,**kw)
1009
def __setitem__(self, key, val):
1010
self.node.setAttribute(key, val)
1012
def __getitem__(self, key):
1013
return self.node.getAttribute(key)
1015
def text(self, txt, raw=0):
1016
nn = Text(txt, raw=raw)
1017
self.node.appendChild(nn)
1020
def add(self, tagName, **kw):
1021
newNode = Element(tagName, caseInsensitive=0, preserveCase=0)
1022
self.node.appendChild(newNode)
1024
for k, v in kw.items():