6
# Types that parse() treats as "a filename to open" rather than as an
# already-open stream.  Prefer both legacy string aliases; when the
# interpreter does not provide them, degrade gracefully instead of
# unconditionally discarding the unicode type (or failing outright).
try:
    _StringTypes = [types.StringType, types.UnicodeType]
except AttributeError:
    # Either alias may be absent: fall back to the byte-string alias if it
    # exists, else to the built-in str.
    _StringTypes = [getattr(types, "StringType", str)]
10
# Event-type tags stored as the first item of every (event, node) pair that
# the handler appends to its event chain.
START_ELEMENT = "START_ELEMENT"
END_ELEMENT = "END_ELEMENT"
# COMMENT is referenced by the comment() and buildDocument() handlers, so it
# has to exist alongside the other tags.
COMMENT = "COMMENT"
START_DOCUMENT = "START_DOCUMENT"
END_DOCUMENT = "END_DOCUMENT"
PROCESSING_INSTRUCTION = "PROCESSING_INSTRUCTION"
IGNORABLE_WHITESPACE = "IGNORABLE_WHITESPACE"
CHARACTERS = "CHARACTERS"
19
# SAX content handler whose callbacks create DOM nodes and record them as a
# singly linked chain of cells shaped [(event_type, node), next_cell];
# firstEvent is the chain's head cell and lastEvent its current tail.
# NOTE(review): in this listing the class body is not indented and a bare
# line number precedes every statement, so the methods below are not
# syntactically attached to this header -- confirm against the original file.
class PullDOM(xml.sax.ContentHandler):
23
def __init__(self, documentFactory=None):
    """Set up an empty event chain, element stack and namespace context.

    documentFactory -- object used later to create the document (via its
    createDocument method); None defers the choice to startDocument().
    """
    from xml.dom import XML_NAMESPACE
    self.documentFactory = documentFactory
    # Head cell of the linked event chain; each cell is [payload, next].
    self.firstEvent = [None, None]
    self.lastEvent = self.firstEvent
    self.elementStack = []
    # Bind the list methods directly so push/pop avoid an extra call level.
    self.push = self.elementStack.append
    try:
        self.pop = self.elementStack.pop
    except AttributeError:
        # use class' pop instead
        pass
    self._ns_contexts = [{XML_NAMESPACE:'xml'}] # contains uri -> prefix dicts
    self._current_context = self._ns_contexts[-1]
    # Events seen before a document exists; buildDocument() drains them.
    self.pending_events = []
40
# NOTE(review): these two statements read the top of elementStack and then
# delete it.  They look like the body of the class-level pop() fallback that
# the "use class' pop instead" comment in __init__ refers to, but neither a
# `def` header nor a return statement is present in this listing -- confirm
# against the original file.
result = self.elementStack[-1]
41
del self.elementStack[-1]
44
def setDocumentLocator(self, locator):
    """SAX callback: remember the locator object supplied by the parser."""
    self._locator = locator
47
def startPrefixMapping(self, prefix, uri):
    """SAX callback: a namespace prefix comes into scope.

    prefix -- the declared prefix, or a false value for the default
              namespace
    uri    -- the namespace URI being bound

    The declaration is queued in _xmlns_attrs (under the name 'xmlns' when
    there is no prefix) so startElementNS() can attach it as an attribute,
    and the uri -> prefix table is updated.
    """
    # _xmlns_attrs is created lazily; __init__ does not set it.
    if not hasattr(self, '_xmlns_attrs'):
        self._xmlns_attrs = []
    self._xmlns_attrs.append((prefix or 'xmlns', uri))
    # Save a snapshot of the table as it was before this mapping so that
    # endPrefixMapping() can restore it.
    self._ns_contexts.append(self._current_context.copy())
    self._current_context[uri] = prefix or None
54
def endPrefixMapping(self, prefix):
    """SAX callback: a prefix mapping goes out of scope.

    Restores the most recently saved uri -> prefix table; `prefix` itself
    is not consulted.
    """
    self._current_context = self._ns_contexts.pop()
57
# SAX callback for a namespace-aware element start: creates the element
# node, creates an attribute node per entry of `attrs`, and appends a
# START_ELEMENT cell to the event chain.
# NOTE(review): this listing is missing lines (the embedded numbering jumps,
# e.g. 64 -> 67, 71 -> 73, 73 -> 77).  `uri` and `localname` are used below
# without a visible assignment, and statement pairs that look like
# alternatives (document.createElementNS vs. buildDocument, and so on) have
# lost the conditionals that choose between them.  No statement assigning
# the attribute value or pushing the node on the element stack is visible
# either.  Confirm against the original file before relying on this block.
def startElementNS(self, name, tagName , attrs):
58
# Retrieve xml namespace declaration attributes.
59
xmlns_uri = 'http://www.w3.org/2000/xmlns/'
60
xmlns_attrs = getattr(self, '_xmlns_attrs', None)
61
if xmlns_attrs is not None:
62
for aname, value in xmlns_attrs:
63
# Writes into the attribute object's private _attrs mapping so that the
# declarations queued by startPrefixMapping() show up as attributes.
attrs._attrs[(xmlns_uri, aname)] = value
64
self._xmlns_attrs = []
67
# When using namespaces, the reader may or may not
68
# provide us with the original name. If not, create
69
# *a* valid tagName from the current context.
71
prefix = self._current_context[uri]
73
tagName = prefix + ":" + localname
77
node = self.document.createElementNS(uri, tagName)
79
node = self.buildDocument(uri, tagName)
81
# When the tagname is not prefixed, it just appears as
84
node = self.document.createElement(localname)
86
node = self.buildDocument(None, localname)
88
for aname,value in attrs.items():
89
# Attribute names arrive as (namespace_uri, local_name) pairs.
a_uri, a_localname = aname
90
if a_uri == xmlns_uri:
91
if a_localname == 'xmlns':
94
qname = 'xmlns:' + a_localname
95
attr = self.document.createAttributeNS(a_uri, qname)
96
node.setAttributeNodeNS(attr)
98
prefix = self._current_context[a_uri]
100
qname = prefix + ":" + a_localname
103
attr = self.document.createAttributeNS(a_uri, qname)
104
node.setAttributeNodeNS(attr)
106
attr = self.document.createAttribute(a_localname)
107
node.setAttributeNode(attr)
110
# Link a new cell after the current tail, then advance the tail pointer.
self.lastEvent[1] = [(START_ELEMENT, node), None]
111
self.lastEvent = self.lastEvent[1]
114
def endElementNS(self, name, tagName):
    """SAX callback: a namespace-aware element ends.

    Pops the top of the element stack and appends an END_ELEMENT cell
    carrying it to the event chain.  `name` and `tagName` are part of the
    SAX signature but are not consulted.
    """
    # Link a new cell after the current tail, then advance the tail pointer.
    self.lastEvent[1] = [(END_ELEMENT, self.pop()), None]
    self.lastEvent = self.lastEvent[1]
118
# SAX callback for a non-namespace element start: creates the element node,
# creates an attribute node per entry of `attrs`, and appends a
# START_ELEMENT cell to the event chain.
# NOTE(review): lines are missing from this listing (embedded numbering
# jumps 118 -> 120 -> 122 -> 124 and 125 -> 127).  The two `node = ...`
# assignments below look like alternative branches whose condition is not
# shown, and no statement setting the attribute value or pushing the node on
# the element stack is visible.  Confirm against the original file.
def startElement(self, name, attrs):
120
node = self.document.createElement(name)
122
node = self.buildDocument(None, name)
124
for aname,value in attrs.items():
125
attr = self.document.createAttribute(aname)
127
node.setAttributeNode(attr)
129
# Link a new cell after the current tail, then advance the tail pointer.
self.lastEvent[1] = [(START_ELEMENT, node), None]
130
self.lastEvent = self.lastEvent[1]
133
def endElement(self, name):
    """SAX callback: a non-namespace element ends.

    Pops the top of the element stack and appends an END_ELEMENT cell
    carrying it to the event chain.  `name` is not consulted.
    """
    # Link a new cell after the current tail, then advance the tail pointer.
    self.lastEvent[1] = [(END_ELEMENT, self.pop()), None]
    self.lastEvent = self.lastEvent[1]
137
# Handler for a comment with text `s`.
# NOTE(review): two different treatments are visible below -- creating a
# comment node through self.document and chaining a COMMENT event, and
# queueing a raw (COMMENT, s) entry on pending_events.  The embedded
# numbering jumps (137 -> 139, 141 -> 143), so the condition selecting
# between them is presumably among the missing lines (likely "is there a
# document yet?") -- confirm against the original file.
def comment(self, s):
139
node = self.document.createComment(s)
140
self.lastEvent[1] = [(COMMENT, node), None]
141
self.lastEvent = self.lastEvent[1]
143
# Pending entries keep the raw text; buildDocument() turns them into nodes.
event = [(COMMENT, s), None]
144
self.pending_events.append(event)
146
# SAX callback for a processing instruction with the given target and data.
# NOTE(review): two different treatments are visible below -- creating a PI
# node through self.document and chaining a PROCESSING_INSTRUCTION event,
# and queueing a raw (PROCESSING_INSTRUCTION, target, data) entry on
# pending_events.  The embedded numbering jumps (146 -> 148, 150 -> 152), so
# the condition selecting between them is presumably among the missing
# lines -- confirm against the original file.
def processingInstruction(self, target, data):
148
node = self.document.createProcessingInstruction(target, data)
149
self.lastEvent[1] = [(PROCESSING_INSTRUCTION, node), None]
150
self.lastEvent = self.lastEvent[1]
152
# Pending entries are 3-tuples here (tag, target, data), unlike the
# 2-tuples used for chained events.
event = [(PROCESSING_INSTRUCTION, target, data), None]
153
self.pending_events.append(event)
155
def ignorableWhitespace(self, chars):
    """SAX callback: ignorable whitespace was read.

    Wraps `chars` in a text node created by the current document and
    appends an IGNORABLE_WHITESPACE cell carrying it to the event chain.
    """
    node = self.document.createTextNode(chars)
    # Link a new cell after the current tail, then advance the tail pointer.
    self.lastEvent[1] = [(IGNORABLE_WHITESPACE, node), None]
    self.lastEvent = self.lastEvent[1]
160
def characters(self, chars):
    """SAX callback: character data was read.

    Wraps `chars` in a text node created by the current document and
    appends a CHARACTERS cell carrying it to the event chain.
    """
    node = self.document.createTextNode(chars)
    # Link a new cell after the current tail, then advance the tail pointer.
    self.lastEvent[1] = [(CHARACTERS, node), None]
    self.lastEvent = self.lastEvent[1]
165
def startDocument(self):
    """SAX callback: parsing starts.

    Only selects the document factory: when none was supplied, minidom's
    DOM implementation object is used.  No document is created here.
    """
    if self.documentFactory is None:
        # Imported lazily so minidom is only loaded when it is needed.
        import xml.dom.minidom
        self.documentFactory = xml.dom.minidom.Document.implementation
170
# Creates the document through documentFactory.createDocument(uri, tagname,
# None), chains a START_DOCUMENT event for it, converts the entries queued
# on pending_events into nodes, and returns the new document's first child.
# NOTE(review): lines are missing from this listing (embedded numbering
# jumps 173 -> 175, 180 -> 182, 185 -> 188, 189 -> 191).  In particular no
# assignment to self.document is visible although it is used below, `target`
# and `data` are used without a visible binding, and the branch that leads
# to the AssertionError has lost its `else:`.  Confirm against the original.
def buildDocument(self, uri, tagname):
171
# Can't do that in startDocument, since we need the tagname
172
# XXX: obtain DocumentType
173
node = self.documentFactory.createDocument(uri, tagname, None)
175
self.lastEvent[1] = [(START_DOCUMENT, node), None]
176
self.lastEvent = self.lastEvent[1]
178
# Put everything we have seen so far into the document
179
for e in self.pending_events:
180
# e is a cell [payload, next]; e[0][0] is the event-type tag.
if e[0][0] == PROCESSING_INSTRUCTION:
182
n = self.document.createProcessingInstruction(target, data)
183
# Replace the raw payload with the (tag, node) form used by chained events.
e[0] = (PROCESSING_INSTRUCTION, n)
184
elif e[0][0] == COMMENT:
185
n = self.document.createComment(e[0][1])
188
raise AssertionError("Unknown pending event ",e[0][0])
189
self.lastEvent[1] = e
191
# NOTE(review): set to None rather than [], so any later append to
# pending_events would fail -- presumably nothing is queued once the
# document exists; verify.
self.pending_events = None
192
return node.firstChild
194
# SAX callback: parsing ends.  Links an END_DOCUMENT cell carrying
# self.document after the current tail of the event chain; the visible code
# does not advance lastEvent afterwards.
# NOTE(review): the embedded numbering jumps 195 -> 199, so this method may
# have further statements that are not shown.
def endDocument(self):
195
self.lastEvent[1] = [(END_DOCUMENT, self.document), None]
199
# NOTE(review): orphaned docstring -- it names a clear() method whose `def`
# header and body are not present in this listing.
"clear(): Explicitly release parsing structures"
203
# NOTE(review): three error-reporting callbacks (warning, error, fatalError),
# each taking the exception object.  Only their headers survive in this
# listing; the bodies (embedded lines 204, 206, 208) and any enclosing class
# header are missing, so their behaviour cannot be stated from here.
def warning(self, exception):
205
def error(self, exception):
207
def fatalError(self, exception):
210
# Couples an input stream, a SAX parser and a PullDOM handler; events are
# obtained through self.getEvent.
# NOTE(review): lines are missing from this listing.  self.parser is read
# below without a visible assignment (embedded lines 212-213 are absent),
# and the numbering jumps 216 -> 220, so the statements from
# `self.pulldom = PullDOM()` onwards may belong to a separate method whose
# header is not shown.  Confirm against the original file.
class DOMEventStream:
211
def __init__(self, stream, parser, bufsize):
214
self.bufsize = bufsize
215
# Parsers without a feed() method cannot be driven incrementally, so the
# instance's getEvent is rebound to the _slurp fallback.
if not hasattr(self.parser, 'feed'):
216
self.getEvent = self._slurp
220
# Fresh handler, installed on the parser with namespace processing enabled.
self.pulldom = PullDOM()
221
# This content handler relies on namespace support
222
self.parser.setFeature(xml.sax.handler.feature_namespaces, 1)
223
self.parser.setContentHandler(self.pulldom)
225
# NOTE(review): only the header of __getitem__ survives in this listing
# (embedded lines 226-239 are missing), so its behaviour cannot be stated
# from here.
def __getitem__(self, pos):
240
# Consumes events from getEvent() and attaches the nodes they carry beneath
# the node on top of a `parents` stack: every node except one carried by an
# END_ELEMENT event is appended to parents[-1], and START_ELEMENT nodes
# become the new top of the stack.
# NOTE(review): lines are missing from this listing (embedded numbering
# jumps 241 -> 244 -> 247 and 251 -> 253).  `parents` is used without a
# visible initialisation (presumably seeded with `node`), the loop construct
# around the event handling is not shown, and the body of the
# `elif token == END_ELEMENT:` branch is absent.  Confirm against the
# original file.
def expandNode(self, node):
241
event = self.getEvent()
244
token, cur_node = event
247
if token != END_ELEMENT:
248
parents[-1].appendChild(cur_node)
249
if token == START_ELEMENT:
250
parents.append(cur_node)
251
elif token == END_ELEMENT:
253
event = self.getEvent()
256
# NOTE(review): everything from here to the clear() docstring is a run of
# method fragments whose `def` headers are missing from this listing.
# Judging by the surviving docstring text and by the `self._slurp` /
# `self._emit` references, they are (1) the incremental getEvent(),
# (2) _slurp() and (3) _emit(), followed by the docstring of a clear()
# method.  The closing quotes of the _slurp and _emit docstrings are also
# missing, so as written the string literals do not terminate where
# intended.  Return statements are not visible in any of the fragments.
# Confirm against the original file.
#
# Fragment 1 (incremental path): if the head cell has no successor, the
# handler's tail pointer is reset to the head; the stream is then read in
# bufsize chunks and fed to the parser until a cell is available; finally
# the first cell's payload is taken into rc and the cell is unlinked.
# NOTE(review): the numbering jumps 261 -> 265, so handling of an empty
# read (end of input) is presumably among the missing lines.
#
# Fragment 2 (_slurp): runs parser.parse() over the whole stream and rebinds
# getEvent to _emit.
#
# Fragment 3 (_emit): takes the first queued cell's payload into rc and
# unlinks the cell.
# use IncrementalParser interface, so we get the desired
258
if not self.pulldom.firstEvent[1]:
259
self.pulldom.lastEvent = self.pulldom.firstEvent
260
while not self.pulldom.firstEvent[1]:
261
buf = self.stream.read(self.bufsize)
265
self.parser.feed(buf)
266
rc = self.pulldom.firstEvent[1][0]
267
self.pulldom.firstEvent[1] = self.pulldom.firstEvent[1][1]
271
""" Fallback replacement for getEvent() using the
272
standard SAX2 interface, which means we slurp the
273
SAX events into memory (no performance gain, but
274
we are compatible to all SAX parsers).
276
self.parser.parse(self.stream)
277
self.getEvent = self._emit
281
""" Fallback replacement for getEvent() that emits
282
the events that _slurp() read previously.
284
rc = self.pulldom.firstEvent[1][0]
285
self.pulldom.firstEvent[1] = self.pulldom.firstEvent[1][1]
289
"""clear(): Explicitly release parsing objects"""
295
class SAX2DOM(PullDOM):
    """PullDOM variant that also links each new node under its parent.

    Every override first lets PullDOM create the node and record the event,
    then appends that node to a parent taken from the element stack.
    """

    def startElementNS(self, name, tagName , attrs):
        """Create the element via PullDOM, then append it to its parent."""
        PullDOM.startElementNS(self, name, tagName, attrs)
        # The new element is expected on top of the stack, with its parent
        # directly beneath it.
        curNode = self.elementStack[-1]
        parentNode = self.elementStack[-2]
        parentNode.appendChild(curNode)

    def startElement(self, name, attrs):
        """Create the element via PullDOM, then append it to its parent."""
        PullDOM.startElement(self, name, attrs)
        curNode = self.elementStack[-1]
        parentNode = self.elementStack[-2]
        parentNode.appendChild(curNode)

    def processingInstruction(self, target, data):
        """Record the PI via PullDOM, then append its node to the open element."""
        PullDOM.processingInstruction(self, target, data)
        # lastEvent is the cell just chained; its payload is (tag, node).
        node = self.lastEvent[0][1]
        parentNode = self.elementStack[-1]
        parentNode.appendChild(node)

    def ignorableWhitespace(self, chars):
        """Record the whitespace via PullDOM, then append its text node."""
        PullDOM.ignorableWhitespace(self, chars)
        node = self.lastEvent[0][1]
        parentNode = self.elementStack[-1]
        parentNode.appendChild(node)

    def characters(self, chars):
        """Record the text via PullDOM, then append its text node."""
        PullDOM.characters(self, chars)
        node = self.lastEvent[0][1]
        parentNode = self.elementStack[-1]
        parentNode.appendChild(node)
328
# Read size that parse() hands to DOMEventStream: 2**14 minus 20 (16364).
default_bufsize = 2 ** 14 - 20
330
def parse(stream_or_string, parser=None, bufsize=None):
    """Return a DOMEventStream over a file name or an open stream.

    stream_or_string -- a file name (any of the _StringTypes), which is
                        opened here, or an already-open stream object
    parser           -- SAX parser to use; None creates one with
                        xml.sax.make_parser()
    bufsize          -- read size for the event stream; None selects
                        default_bufsize

    A stream opened here is handed to the returned DOMEventStream and is
    not closed by this function.
    """
    # Only fall back to the defaults when the caller did not supply a value;
    # caller-provided arguments must not be overwritten.
    if bufsize is None:
        bufsize = default_bufsize
    if isinstance(stream_or_string, tuple(_StringTypes)):
        stream = open(stream_or_string)
    else:
        stream = stream_or_string
    if parser is None:
        parser = xml.sax.make_parser()
    return DOMEventStream(stream, parser, bufsize)
341
def parseString(string, parser=None):
    """Return a DOMEventStream over XML held in memory.

    string -- the document text; it is wrapped in a StringIO buffer and its
              length is used as the read size, so the whole document is
              read in one chunk
    parser -- SAX parser to use; None creates one with
              xml.sax.make_parser()
    """
    # Prefer the C implementation and fall back only when it is unavailable,
    # rather than letting a second import always shadow the first.
    try:
        from cStringIO import StringIO
    except ImportError:
        try:
            from StringIO import StringIO
        except ImportError:
            # Neither legacy module exists on this interpreter.
            from io import StringIO

    bufsize = len(string)
    buf = StringIO(string)
    # Keep a caller-supplied parser; only create one when none was given.
    if parser is None:
        parser = xml.sax.make_parser()
    return DOMEventStream(buf, parser, bufsize)