"""A collection of modules for iterating through different kinds of
tree, generating tokens identical to those produced by the tokenizer
module.

To create a tree walker for a new type of tree, you need to
implement a tree walker object (called TreeWalker by convention) that
implements a 'serialize' method taking a tree as sole argument and
returning an iterator generating tokens.
"""

from __future__ import absolute_import, division, unicode_literals
13
from .. import constants
14
from .._utils import default_etree
16
__all__ = ["getTreeWalker", "pprint", "dom", "etree", "genshi", "etree_lxml"]
# Maps a lower-cased tree type name to its TreeWalker class so that each
# walker submodule is imported at most once.  The "etree" type is never
# stored here (see the note inside getTreeWalker).
treeWalkerCache = {}


def getTreeWalker(treeType, implementation=None, **kwargs):
    """Get a TreeWalker class for various types of tree with built-in support

    Args:
        treeType (str): the name of the tree type required (case-insensitive).
            Supported values are:

            - "dom": The xml.dom.minidom DOM implementation
            - "etree": A generic walker for tree implementations exposing an
              elementtree-like interface (known to work with
              ElementTree, cElementTree and lxml.etree).
            - "lxml": Optimized walker for lxml.etree
            - "genshi": a Genshi stream

        implementation: A module implementing the tree type e.g.
            xml.etree.ElementTree or cElementTree (Currently applies to the
            "etree" tree type only).

        **kwargs: Extra keyword arguments forwarded to
            ``etree.getETreeModule`` (used for the "etree" tree type only).

    Returns:
        The TreeWalker class for the requested tree type, or None if
        ``treeType`` is not one of the supported names.
    """

    treeType = treeType.lower()
    if treeType not in treeWalkerCache:
        # Walker submodules are imported lazily, the first time their tree
        # type is requested.
        if treeType == "dom":
            from . import dom
            treeWalkerCache[treeType] = dom.TreeWalker
        elif treeType == "genshi":
            from . import genshi
            treeWalkerCache[treeType] = genshi.TreeWalker
        elif treeType == "lxml":
            from . import etree_lxml
            treeWalkerCache[treeType] = etree_lxml.TreeWalker
        elif treeType == "etree":
            from . import etree
            if implementation is None:
                implementation = default_etree
            # XXX: NEVER cache here, caching is done in the etree submodule
            return etree.getETreeModule(implementation, **kwargs).TreeWalker
    # Unknown tree types fall through to here and yield None.
    return treeWalkerCache.get(treeType)
def concatenateCharacterTokens(tokens):
    """Merge runs of adjacent character tokens in a token stream.

    Consecutive "Characters" and "SpaceCharacters" tokens are coalesced into
    a single "Characters" token whose data is the concatenation of their
    data.  Every other token is yielded unchanged, in order.

    Args:
        tokens: an iterable of token dicts, each with a "type" key (and a
            "data" key for character tokens).

    Yields:
        Token dicts, with no "SpaceCharacters" tokens and no two
        "Characters" tokens in a row.
    """
    pendingCharacters = []
    for token in tokens:
        tokenType = token["type"]
        if tokenType in ("Characters", "SpaceCharacters"):
            pendingCharacters.append(token["data"])
        else:
            # Flush buffered text before emitting the non-character token so
            # the original ordering is preserved.
            if pendingCharacters:
                yield {"type": "Characters", "data": "".join(pendingCharacters)}
                pendingCharacters = []
            yield token
    # Flush any text left over at the end of the stream.
    if pendingCharacters:
        yield {"type": "Characters", "data": "".join(pendingCharacters)}
def pprint(walker):
    """Pretty printer for tree walkers

    Renders the token stream as an indented, line-per-node text dump:
    start/empty tags as ``<name>``, attributes as ``name="value"`` one level
    deeper, comments, doctypes and quoted character data.

    Args:
        walker: an iterable of token dicts as produced by a tree walker.

    Returns:
        The dump as a single string with lines joined by newlines.

    Raises:
        ValueError: if a token has an unrecognised "type".
    """
    output = []
    indent = 0
    for token in concatenateCharacterTokens(walker):
        tokenType = token["type"]
        if tokenType in ("StartTag", "EmptyTag"):
            # tag name (prefixed with its namespace unless it is HTML)
            if token["namespace"] and token["namespace"] != constants.namespaces["html"]:
                if token["namespace"] in constants.prefixes:
                    ns = constants.prefixes[token["namespace"]]
                else:
                    ns = token["namespace"]
                name = "%s %s" % (ns, token["name"])
            else:
                name = token["name"]
            output.append("%s<%s>" % (" " * indent, name))
            indent += 2
            # attributes (sorted for consistent ordering)
            attrs = token["data"]
            for (namespace, localname), value in sorted(attrs.items()):
                if namespace:
                    if namespace in constants.prefixes:
                        ns = constants.prefixes[namespace]
                    else:
                        ns = namespace
                    name = "%s %s" % (ns, localname)
                else:
                    name = localname
                output.append("%s%s=\"%s\"" % (" " * indent, name, value))
            # self-closing: no matching EndTag will arrive, so dedent now
            if tokenType == "EmptyTag":
                indent -= 2

        elif tokenType == "EndTag":
            indent -= 2

        elif tokenType == "Comment":
            output.append("%s<!-- %s -->" % (" " * indent, token["data"]))

        elif tokenType == "Doctype":
            if token["name"]:
                if token["publicId"]:
                    output.append("""%s<!DOCTYPE %s "%s" "%s">""" %
                                  (" " * indent,
                                   token["name"],
                                   token["publicId"],
                                   token["systemId"] if token["systemId"] else ""))
                elif token["systemId"]:
                    output.append("""%s<!DOCTYPE %s "" "%s">""" %
                                  (" " * indent,
                                   token["name"],
                                   token["systemId"]))
                else:
                    output.append("%s<!DOCTYPE %s>" % (" " * indent,
                                                       token["name"]))
            else:
                output.append("%s<!DOCTYPE >" % (" " * indent,))

        elif tokenType == "Characters":
            output.append("%s\"%s\"" % (" " * indent, token["data"]))

        elif tokenType == "SpaceCharacters":
            assert False, "concatenateCharacterTokens should have got rid of all Space tokens"

        else:
            raise ValueError("Unknown token type, %s" % tokenType)

    return "\n".join(output)