from __future__ import absolute_import, division, unicode_literals

import re
from codecs import register_error, xmlcharrefreplace_errors
from xml.sax.saxutils import escape

from pip._vendor.six import text_type

from .constants import voidElements, booleanAttributes, spaceCharacters
from .constants import rcdataElements, entities, xmlEntities
from . import treewalkers, _utils
# Characters whose presence forces an attribute value to be quoted when
# quote_attr_values == "spec".
_quoteAttributeSpecChars = "".join(spaceCharacters) + "\"'=<>`"
_quoteAttributeSpec = re.compile("[" + _quoteAttributeSpecChars + "]")
# Wider character class used when quote_attr_values == "legacy": the spec
# characters plus C0 controls, "/", "`" and assorted Unicode space/separator
# characters.
# NOTE(review): the closing fragment of this pattern was missing from the
# extracted text; "\u3000]" is restored to match upstream html5lib -- confirm.
_quoteAttributeLegacy = re.compile("[" + _quoteAttributeSpecChars +
                                   "\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n"
                                   "\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15"
                                   "\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"
                                   "\x20\x2f\x60\xa0\u1680\u180e\u180f\u2000"
                                   "\u2001\u2002\u2003\u2004\u2005\u2006\u2007"
                                   "\u2008\u2009\u200a\u2028\u2029\u202f\u205f"
                                   "\u3000]")
# Reverse lookup used by htmlentityreplace_errors: integer code point ->
# entity name (without the leading "&").
_encode_entity_map = {}
# True when a non-BMP character is a single code unit (wide build / Python 3);
# on narrow builds such a character is a two-unit surrogate pair.
_is_ucs4 = len("\U0010FFFF") == 1
for k, v in entities.items():
    # skip multi-character entities
    if ((_is_ucs4 and len(v) > 1) or
            (not _is_ucs4 and len(v) > 2)):
        continue
    if v != "&":
        if len(v) == 2:
            # narrow build: two code units forming one surrogate pair
            v = _utils.surrogatePairToCodepoint(v)
        else:
            v = ord(v)
        if v not in _encode_entity_map or k.islower():
            # prefer &lt; over &LT; and similarly for &amp;, &gt;, etc.
            _encode_entity_map[v] = k
def htmlentityreplace_errors(exc):
    """Codec error handler that substitutes HTML character references.

    For encode/translate errors, each character in
    ``exc.object[exc.start:exc.end]`` is replaced by the named entity found
    in ``_encode_entity_map`` when there is one, otherwise by a hexadecimal
    numeric reference (``&#x...;``).  Any other exception type is delegated
    to ``xmlcharrefreplace_errors``.

    :arg exc: the exception instance handed over by the codec machinery
    :returns: ``(replacement_text, position_to_resume_at)``
    """
    if not isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
        return xmlcharrefreplace_errors(exc)

    # First pass: turn the offending slice into integer code points,
    # folding surrogate pairs into a single code point.
    codepoints = []
    skip = False
    for i, c in enumerate(exc.object[exc.start:exc.end]):
        if skip:
            # second half of a pair already consumed on the previous pass
            skip = False
            continue
        index = i + exc.start
        if _utils.isSurrogatePair(exc.object[index:min([exc.end, index + 2])]):
            codepoint = _utils.surrogatePairToCodepoint(exc.object[index:index + 2])
            skip = True
        else:
            codepoint = ord(c)
        codepoints.append(codepoint)

    # Second pass: emit a named entity when known, else a numeric reference.
    res = []
    for cp in codepoints:
        e = _encode_entity_map.get(cp)
        if e:
            res.append("&")
            res.append(e)
            if not e.endswith(";"):
                res.append(";")
        else:
            res.append("&#x%s;" % (hex(cp)[2:]))
    return ("".join(res), exc.end)
# Make the handler available to str.encode(..., "htmlentityreplace").
register_error("htmlentityreplace", htmlentityreplace_errors)
def serialize(input, tree="etree", encoding=None, **serializer_opts):
    """Serialize *input* to HTML via the tree walker registered for *tree*.

    :arg input: the tree to serialize
    :arg tree: tree-walker name passed to ``treewalkers.getTreeWalker``
    :arg encoding: forwarded unchanged to ``HTMLSerializer.render``
    :arg serializer_opts: keyword options forwarded to ``HTMLSerializer``
    :returns: the result of ``HTMLSerializer.render``
    """
    # XXX: Should we cache this?
    walker = treewalkers.getTreeWalker(tree)
    s = HTMLSerializer(**serializer_opts)
    return s.render(walker(input), encoding)
class HTMLSerializer(object):
    """Serialize a stream of tree-walker tokens into HTML text or bytes.

    The class attributes below are the option defaults; ``__init__``
    overrides them per instance from keyword arguments named in ``options``.
    """

    # attribute quoting options
    quote_attr_values = "legacy"  # be secure by default
    quote_char = '"'
    use_best_quote_char = True

    # tag syntax options
    omit_optional_tags = True
    minimize_boolean_attributes = True
    use_trailing_solidus = False
    space_before_trailing_solidus = True

    # escaping options
    escape_lt_in_attrs = False
    escape_rcdata = False
    resolve_entities = True

    # miscellaneous options
    alphabetical_attributes = False
    inject_meta_charset = True
    strip_whitespace = False
    sanitize = False

    # Every name here must also exist as a class attribute above, because
    # __init__ falls back to getattr(self, name) for options not supplied.
    options = ("quote_attr_values", "quote_char", "use_best_quote_char",
               "omit_optional_tags", "minimize_boolean_attributes",
               "use_trailing_solidus", "space_before_trailing_solidus",
               "escape_lt_in_attrs", "escape_rcdata", "resolve_entities",
               "alphabetical_attributes", "inject_meta_charset",
               "strip_whitespace", "sanitize")

    def __init__(self, **kwargs):
        """Initialize HTMLSerializer.

        Keyword options (default given first unless specified) include:

        inject_meta_charset=True|False
          Whether to insert a meta element to define the character set of the
          document.
        quote_attr_values="legacy"|"spec"|"always"
          Whether to quote attribute values that don't require quoting
          per legacy browser behaviour, when required by the standard, or always.
        quote_char=u'"'|u"'"
          Use given quote character for attribute quoting. Default is to
          use double quote unless attribute value contains a double quote,
          in which case single quotes are used instead.
        escape_lt_in_attrs=False|True
          Whether to escape < in attribute values.
        escape_rcdata=False|True
          Whether to escape characters that need to be escaped within normal
          elements within rcdata elements such as style.
        resolve_entities=True|False
          Whether to resolve named character entities that appear in the
          source tree. The XML predefined entities &lt; &gt; &amp; &quot; &apos;
          are unaffected by this setting.
        strip_whitespace=False|True
          Whether to remove semantically meaningless whitespace. (This
          compresses all whitespace to a single space except within pre.)
        minimize_boolean_attributes=True|False
          Shortens boolean attributes to give just the attribute value,
          for example <input disabled="disabled"> becomes <input disabled>.
        use_trailing_solidus=False|True
          Includes a close-tag slash at the end of the start tag of void
          elements (empty elements whose end tag is forbidden). E.g. <hr/>.
        space_before_trailing_solidus=True|False
          Places a space immediately before the closing slash in a tag
          using a trailing solidus. E.g. <hr />. Requires use_trailing_solidus.
        sanitize=False|True
          Strip all unsafe or unknown constructs from output.
          See `html5lib user documentation`_
        omit_optional_tags=True|False
          Omit start/end tags that are optional.
        alphabetical_attributes=False|True
          Reorder attributes to be in alphabetical order.

        Raises TypeError for any keyword that is not listed in ``options``.

        .. _html5lib user documentation: http://code.google.com/p/html5lib/wiki/UserDocumentation
        """
        unexpected_args = frozenset(kwargs) - frozenset(self.options)
        if unexpected_args:
            raise TypeError("__init__() got an unexpected keyword argument '%s'" % next(iter(unexpected_args)))
        # An explicit quote_char disables automatic quote selection.
        if 'quote_char' in kwargs:
            self.use_best_quote_char = False
        for attr in self.options:
            setattr(self, attr, kwargs.get(attr, getattr(self, attr)))
        self.errors = []
        self.strict = False

    def encode(self, string):
        """Encode *string* with ``self.encoding``, replacing unencodable
        characters through the "htmlentityreplace" error handler; return
        *string* unchanged when no encoding is set."""
        assert isinstance(string, text_type)
        if self.encoding:
            return string.encode(self.encoding, "htmlentityreplace")
        return string

    def encodeStrict(self, string):
        """Encode *string* with ``self.encoding`` using strict error
        handling; return *string* unchanged when no encoding is set."""
        assert isinstance(string, text_type)
        if self.encoding:
            return string.encode(self.encoding, "strict")
        return string

    def serialize(self, treewalker, encoding=None):
        """Yield the serialized output chunk by chunk.

        :arg treewalker: iterable of token dicts, each with a ``"type"`` key
        :arg encoding: if truthy, chunks are bytes in this encoding;
            otherwise chunks are text
        :returns: generator of text or bytes chunks

        Problems are reported through ``serializeError`` and collected in
        ``self.errors``, which is reset at the start of every run.
        """
        # pylint:disable=too-many-nested-blocks
        self.encoding = encoding
        in_cdata = False
        self.errors = []

        if encoding and self.inject_meta_charset:
            from .filters.inject_meta_charset import Filter
            treewalker = Filter(treewalker, encoding)
        # Alphabetical attributes is here under the assumption that none of
        # the later filters add or change order of attributes; it needs to be
        # before the sanitizer so escaped elements come out correctly
        if self.alphabetical_attributes:
            from .filters.alphabeticalattributes import Filter
            treewalker = Filter(treewalker)
        # WhitespaceFilter should be used before OptionalTagFilter
        # for maximum efficiency of this latter filter
        if self.strip_whitespace:
            from .filters.whitespace import Filter
            treewalker = Filter(treewalker)
        if self.sanitize:
            from .filters.sanitizer import Filter
            treewalker = Filter(treewalker)
        if self.omit_optional_tags:
            from .filters.optionaltags import Filter
            treewalker = Filter(treewalker)

        for token in treewalker:
            token_type = token["type"]
            if token_type == "Doctype":
                doctype = "<!DOCTYPE %s" % token["name"]

                if token["publicId"]:
                    doctype += ' PUBLIC "%s"' % token["publicId"]
                elif token["systemId"]:
                    doctype += " SYSTEM"
                if token["systemId"]:
                    # Pick whichever quote does not occur in the identifier.
                    if '"' in token["systemId"]:
                        if "'" in token["systemId"]:
                            self.serializeError("System identifer contains both single and double quote characters")
                        quote_char = "'"
                    else:
                        quote_char = '"'
                    doctype += " %s%s%s" % (quote_char, token["systemId"], quote_char)

                doctype += ">"
                yield self.encodeStrict(doctype)

            elif token_type in ("Characters", "SpaceCharacters"):
                if token_type == "SpaceCharacters" or in_cdata:
                    # Whitespace and rcdata content are emitted unescaped.
                    if in_cdata and "</" in token["data"]:
                        self.serializeError("Unexpected </ in CDATA")
                    yield self.encode(token["data"])
                else:
                    yield self.encode(escape(token["data"]))

            elif token_type in ("StartTag", "EmptyTag"):
                name = token["name"]
                yield self.encodeStrict("<%s" % name)
                if name in rcdataElements and not self.escape_rcdata:
                    in_cdata = True
                elif in_cdata:
                    self.serializeError("Unexpected child element of a CDATA element")
                for (_, attr_name), attr_value in token["data"].items():
                    # TODO: Add namespace support here
                    k = attr_name
                    v = attr_value
                    yield self.encodeStrict(' ')

                    yield self.encodeStrict(k)
                    # Boolean attributes are written bare when minimizing.
                    if not self.minimize_boolean_attributes or \
                        (k not in booleanAttributes.get(name, tuple()) and
                         k not in booleanAttributes.get("", tuple())):
                        yield self.encodeStrict("=")
                        if self.quote_attr_values == "always" or len(v) == 0:
                            quote_attr = True
                        elif self.quote_attr_values == "spec":
                            quote_attr = _quoteAttributeSpec.search(v) is not None
                        elif self.quote_attr_values == "legacy":
                            quote_attr = _quoteAttributeLegacy.search(v) is not None
                        else:
                            raise ValueError("quote_attr_values must be one of: "
                                             "'always', 'spec', or 'legacy'")
                        v = v.replace("&", "&amp;")
                        if self.escape_lt_in_attrs:
                            v = v.replace("<", "&lt;")
                        if quote_attr:
                            quote_char = self.quote_char
                            if self.use_best_quote_char:
                                # Prefer the quote that needs no escaping.
                                if "'" in v and '"' not in v:
                                    quote_char = '"'
                                elif '"' in v and "'" not in v:
                                    quote_char = "'"
                            if quote_char == "'":
                                v = v.replace("'", "&#39;")
                            else:
                                v = v.replace('"', "&quot;")
                            yield self.encodeStrict(quote_char)
                            yield self.encode(v)
                            yield self.encodeStrict(quote_char)
                        else:
                            yield self.encode(v)
                if name in voidElements and self.use_trailing_solidus:
                    if self.space_before_trailing_solidus:
                        yield self.encodeStrict(" /")
                    else:
                        yield self.encodeStrict("/")
                yield self.encode(">")

            elif token_type == "EndTag":
                name = token["name"]
                if name in rcdataElements:
                    in_cdata = False
                elif in_cdata:
                    self.serializeError("Unexpected child element of a CDATA element")
                yield self.encodeStrict("</%s>" % name)

            elif token_type == "Comment":
                data = token["data"]
                if "--" in data:
                    self.serializeError("Comment contains --")
                yield self.encodeStrict("<!--%s-->" % token["data"])

            elif token_type == "Entity":
                name = token["name"]
                key = name + ";"
                if key not in entities:
                    self.serializeError("Entity %s not recognized" % name)
                if self.resolve_entities and key not in xmlEntities:
                    data = entities[key]
                else:
                    data = "&%s;" % name
                yield self.encodeStrict(data)

            else:
                # Unknown token type: report its payload as the error.
                self.serializeError(token["data"])

    def render(self, treewalker, encoding=None):
        """Serialize *treewalker* and return the joined result: bytes when
        *encoding* is truthy, text otherwise."""
        if encoding:
            return b"".join(self.serialize(treewalker, encoding))
        return "".join(self.serialize(treewalker))

    def serializeError(self, data="XXX ERROR MESSAGE NEEDED"):
        """Record *data* in ``self.errors``; raise ``SerializeError`` when
        ``self.strict`` is set."""
        # XXX The idea is to make data mandatory.
        self.errors.append(data)
        if self.strict:
            raise SerializeError
class SerializeError(Exception):
    """Error in serialized tree"""