~ibmcharmers/charms/trusty/ibm-dsm-enterprise/devel

« back to all changes in this revision

Viewing changes to .tox/py35/lib/python3.5/site-packages/pip/_vendor/html5lib/serializer.py

  • Committer: anita nayak
  • Date: 2016-12-08 14:10:42 UTC
  • Revision ID: anitanayak@in.ibm.com-20161208141042-jyicg7udd6liy6s3
Initial Check in for IBM DSM for trusty

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
from __future__ import absolute_import, division, unicode_literals
 
2
from pip._vendor.six import text_type
 
3
 
 
4
import re
 
5
 
 
6
from codecs import register_error, xmlcharrefreplace_errors
 
7
 
 
8
from .constants import voidElements, booleanAttributes, spaceCharacters
 
9
from .constants import rcdataElements, entities, xmlEntities
 
10
from . import treewalkers, _utils
 
11
from xml.sax.saxutils import escape
 
12
 
 
13
# Characters that force an attribute value to be quoted under the "spec"
# policy: the HTML space characters plus the quote, "=", "<", ">" and "`"
# characters.
_quoteAttributeSpecChars = "".join(spaceCharacters) + "\"'=<>`"
_quoteAttributeSpec = re.compile("[%s]" % _quoteAttributeSpecChars)

# The "legacy" policy quotes on everything the "spec" policy does, plus the
# control characters, "/", "`" and the additional Unicode space and
# separator characters listed below.
_quoteAttributeLegacy = re.compile(
    "[%s%s]" % (
        _quoteAttributeSpecChars,
        "\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n"
        "\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15"
        "\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"
        "\x20\x2f\x60\xa0\u1680\u180e\u180f\u2000"
        "\u2001\u2002\u2003\u2004\u2005\u2006\u2007"
        "\u2008\u2009\u200a\u2028\u2029\u202f\u205f"
        "\u3000",
    )
)
 
23
 
 
24
 
 
25
# Reverse entity table used by the "htmlentityreplace" codec error handler:
# maps a code point (int) to the name of an entity that expands to exactly
# that one character.
_encode_entity_map = {}
# True when the interpreter represents a non-BMP character as a single code
# unit, i.e. it does not expose such characters as surrogate pairs.
_is_ucs4 = len("\U0010FFFF") == 1
for k, v in entities.items():
    if v == "&":
        # The ampersand is never mapped back to an entity name here.
        continue
    if len(v) == 1:
        v = ord(v)
    elif not _is_ucs4 and _utils.isSurrogatePair(v):
        # Narrow build: one astral character stored as two code units.
        # Checking that it really is a surrogate pair keeps genuine
        # two-character entities out of the map.
        v = _utils.surrogatePairToCodepoint(v)
    else:
        # skip multi-character entities
        continue
    if v not in _encode_entity_map or k.islower():
        # prefer &lt; over &LT; and similarly for &amp;, &gt;, etc.
        _encode_entity_map[v] = k
 
40
 
 
41
 
 
42
def htmlentityreplace_errors(exc):
    """Codec error handler that substitutes HTML entity references.

    For ``UnicodeEncodeError`` / ``UnicodeTranslateError`` every character in
    the failing range ``exc.object[exc.start:exc.end]`` is replaced by
    ``&name;`` when ``_encode_entity_map`` has a name for its code point, and
    by a hexadecimal character reference (``&#x...;``) otherwise. A surrogate
    pair is treated as a single code point.

    Any other exception type is delegated to ``xmlcharrefreplace_errors``.

    Returns the ``(replacement, resume_position)`` tuple for the codec.
    """
    if not isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
        return xmlcharrefreplace_errors(exc)

    source = exc.object
    failed = source[exc.start:exc.end]
    pieces = []
    offset = 0
    while offset < len(failed):
        index = exc.start + offset
        if _utils.isSurrogatePair(source[index:min([exc.end, index + 2])]):
            codepoint = _utils.surrogatePairToCodepoint(source[index:index + 2])
            # Both halves of the pair are consumed at once.
            offset += 2
        else:
            codepoint = ord(failed[offset])
            offset += 1

        entity = _encode_entity_map.get(codepoint)
        if not entity:
            pieces.append("&#x%s;" % (hex(codepoint)[2:]))
        elif entity.endswith(";"):
            pieces.append("&" + entity)
        else:
            # Some entity names are stored without the trailing semicolon.
            pieces.append("&" + entity + ";")
    return ("".join(pieces), exc.end)


register_error("htmlentityreplace", htmlentityreplace_errors)
 
72
 
 
73
 
 
74
def serialize(input, tree="etree", encoding=None, **serializer_opts):
    """Serialize the tree ``input`` in one call.

    ``tree`` names the tree walker to look up via
    ``treewalkers.getTreeWalker``; ``serializer_opts`` are passed straight to
    the ``HTMLSerializer`` constructor. Returns whatever
    ``HTMLSerializer.render`` returns for the walked tree and ``encoding``.
    """
    # XXX: Should we cache this?
    walker_factory = treewalkers.getTreeWalker(tree)
    serializer = HTMLSerializer(**serializer_opts)
    token_stream = walker_factory(input)
    return serializer.render(token_stream, encoding)
 
79
 
 
80
 
 
81
class HTMLSerializer(object):
    """Turn a stream of tree-walker tokens into HTML text or bytes.

    Behaviour is controlled by the class-level option attributes below, each
    of which can be overridden per instance through a keyword argument to
    ``__init__``. Problems found while serializing are collected in
    ``self.errors``; when ``self.strict`` is true they raise instead.
    """

    # attribute quoting options
    quote_attr_values = "legacy"  # be secure by default
    quote_char = '"'
    use_best_quote_char = True

    # tag syntax options
    omit_optional_tags = True
    minimize_boolean_attributes = True
    use_trailing_solidus = False
    space_before_trailing_solidus = True

    # escaping options
    escape_lt_in_attrs = False
    escape_rcdata = False
    resolve_entities = True

    # miscellaneous options
    alphabetical_attributes = False
    inject_meta_charset = True
    strip_whitespace = False
    sanitize = False

    options = ("quote_attr_values", "quote_char", "use_best_quote_char",
               "omit_optional_tags", "minimize_boolean_attributes",
               "use_trailing_solidus", "space_before_trailing_solidus",
               "escape_lt_in_attrs", "escape_rcdata", "resolve_entities",
               "alphabetical_attributes", "inject_meta_charset",
               "strip_whitespace", "sanitize")

    # Output encoding; overwritten by every serialize() call. Defined here so
    # encode()/encodeStrict() also work before the first serialize() call.
    encoding = None

    def __init__(self, **kwargs):
        """Initialize HTMLSerializer.

        Keyword options (default given first unless specified) include:

        inject_meta_charset=True|False
          Whether to insert a meta element to define the character set of the
          document.
        quote_attr_values="legacy"|"spec"|"always"
          Whether to quote attribute values that don't require quoting
          per legacy browser behaviour, when required by the standard, or always.
        quote_char=u'"'|u"'"
          Use given quote character for attribute quoting. Default is to
          use double quote unless attribute value contains a double quote,
          in which case single quotes are used instead.
        escape_lt_in_attrs=False|True
          Whether to escape < in attribute values.
        escape_rcdata=False|True
          Whether to escape characters that need to be escaped within normal
          elements within rcdata elements such as style.
        resolve_entities=True|False
          Whether to resolve named character entities that appear in the
          source tree. The XML predefined entities &lt; &gt; &amp; &quot; &apos;
          are unaffected by this setting.
        strip_whitespace=False|True
          Whether to remove semantically meaningless whitespace. (This
          compresses all whitespace to a single space except within pre.)
        minimize_boolean_attributes=True|False
          Shortens boolean attributes to give just the attribute value,
          for example <input disabled="disabled"> becomes <input disabled>.
        use_trailing_solidus=False|True
          Includes a close-tag slash at the end of the start tag of void
          elements (empty elements whose end tag is forbidden). E.g. <hr/>.
        space_before_trailing_solidus=True|False
          Places a space immediately before the closing slash in a tag
          using a trailing solidus. E.g. <hr />. Requires use_trailing_solidus.
        sanitize=False|True
          Strip all unsafe or unknown constructs from output.
          See `html5lib user documentation`_
        omit_optional_tags=True|False
          Omit start/end tags that are optional.
        alphabetical_attributes=False|True
          Reorder attributes to be in alphabetical order.

        Raises TypeError for any keyword that is not listed in ``options``.

        .. _html5lib user documentation: http://code.google.com/p/html5lib/wiki/UserDocumentation
        """
        unexpected_args = frozenset(kwargs) - frozenset(self.options)
        if unexpected_args:
            raise TypeError("__init__() got an unexpected keyword argument '%s'" % next(iter(unexpected_args)))
        # An explicitly chosen quote character disables automatic selection,
        # unless use_best_quote_char is also passed explicitly (handled by
        # the loop below).
        if 'quote_char' in kwargs:
            self.use_best_quote_char = False
        for attr in self.options:
            setattr(self, attr, kwargs.get(attr, getattr(self, attr)))
        self.errors = []
        self.strict = False

    def encode(self, string):
        """Encode ``string`` with the current encoding, if any.

        Characters the encoding cannot represent are handled by the
        "htmlentityreplace" error handler. Without an encoding the text is
        returned unchanged.
        """
        assert isinstance(string, text_type)
        if self.encoding:
            return string.encode(self.encoding, "htmlentityreplace")
        return string

    def encodeStrict(self, string):
        """Encode ``string`` with the current encoding using "strict" errors.

        Without an encoding the text is returned unchanged.
        """
        assert isinstance(string, text_type)
        if self.encoding:
            return string.encode(self.encoding, "strict")
        return string

    def serialize(self, treewalker, encoding=None):
        """Generate the serialization of ``treewalker`` piece by piece.

        ``treewalker`` is an iterable of token dicts. Each yielded piece is
        text when ``encoding`` is falsy and encoded bytes otherwise.
        ``self.errors`` is reset at the start and filled with the problems
        found; in strict mode the first problem raises SerializeError.
        """
        # pylint:disable=too-many-nested-blocks
        self.encoding = encoding
        in_cdata = False
        self.errors = []

        if encoding and self.inject_meta_charset:
            from .filters.inject_meta_charset import Filter
            treewalker = Filter(treewalker, encoding)
        # Alphabetical attributes is here under the assumption that none of
        # the later filters add or change order of attributes; it needs to be
        # before the sanitizer so escaped elements come out correctly
        if self.alphabetical_attributes:
            from .filters.alphabeticalattributes import Filter
            treewalker = Filter(treewalker)
        # WhitespaceFilter should be used before OptionalTagFilter
        # for maximum efficiency of this latter filter
        if self.strip_whitespace:
            from .filters.whitespace import Filter
            treewalker = Filter(treewalker)
        if self.sanitize:
            from .filters.sanitizer import Filter
            treewalker = Filter(treewalker)
        if self.omit_optional_tags:
            from .filters.optionaltags import Filter
            treewalker = Filter(treewalker)

        for token in treewalker:
            token_type = token["type"]
            if token_type == "Doctype":
                doctype = "<!DOCTYPE %s" % token["name"]

                if token["publicId"]:
                    doctype += ' PUBLIC "%s"' % token["publicId"]
                elif token["systemId"]:
                    doctype += " SYSTEM"
                if token["systemId"]:
                    if '"' in token["systemId"]:
                        if "'" in token["systemId"]:
                            self.serializeError("System identifer contains both single and double quote characters")
                        quote_char = "'"
                    else:
                        quote_char = '"'
                    doctype += " %s%s%s" % (quote_char, token["systemId"], quote_char)

                doctype += ">"
                yield self.encodeStrict(doctype)

            elif token_type in ("Characters", "SpaceCharacters"):
                if token_type == "SpaceCharacters" or in_cdata:
                    # Emitted verbatim: whitespace needs no escaping and
                    # rcdata content must not be escaped.
                    if in_cdata and "</" in token["data"]:
                        self.serializeError("Unexpected </ in CDATA")
                    yield self.encode(token["data"])
                else:
                    yield self.encode(escape(token["data"]))

            elif token_type in ("StartTag", "EmptyTag"):
                name = token["name"]
                yield self.encodeStrict("<%s" % name)
                if name in rcdataElements and not self.escape_rcdata:
                    in_cdata = True
                elif in_cdata:
                    self.serializeError("Unexpected child element of a CDATA element")
                for (_, attr_name), attr_value in token["data"].items():
                    # TODO: Add namespace support here
                    k = attr_name
                    v = attr_value
                    yield self.encodeStrict(' ')

                    yield self.encodeStrict(k)
                    # A boolean attribute is written as a bare name when
                    # minimization is enabled; everything else gets a value.
                    if not self.minimize_boolean_attributes or \
                        (k not in booleanAttributes.get(name, tuple()) and
                         k not in booleanAttributes.get("", tuple())):
                        yield self.encodeStrict("=")
                        if self.quote_attr_values == "always" or len(v) == 0:
                            quote_attr = True
                        elif self.quote_attr_values == "spec":
                            quote_attr = _quoteAttributeSpec.search(v) is not None
                        elif self.quote_attr_values == "legacy":
                            quote_attr = _quoteAttributeLegacy.search(v) is not None
                        else:
                            raise ValueError("quote_attr_values must be one of: "
                                             "'always', 'spec', or 'legacy'")
                        v = v.replace("&", "&amp;")
                        if self.escape_lt_in_attrs:
                            v = v.replace("<", "&lt;")
                        if quote_attr:
                            quote_char = self.quote_char
                            if self.use_best_quote_char:
                                # Pick the quote that avoids escaping when the
                                # value contains only one kind of quote.
                                if "'" in v and '"' not in v:
                                    quote_char = '"'
                                elif '"' in v and "'" not in v:
                                    quote_char = "'"
                            if quote_char == "'":
                                v = v.replace("'", "&#39;")
                            else:
                                v = v.replace('"', "&quot;")
                            yield self.encodeStrict(quote_char)
                            yield self.encode(v)
                            yield self.encodeStrict(quote_char)
                        else:
                            yield self.encode(v)
                if name in voidElements and self.use_trailing_solidus:
                    if self.space_before_trailing_solidus:
                        yield self.encodeStrict(" /")
                    else:
                        yield self.encodeStrict("/")
                yield self.encode(">")

            elif token_type == "EndTag":
                name = token["name"]
                if name in rcdataElements:
                    in_cdata = False
                elif in_cdata:
                    self.serializeError("Unexpected child element of a CDATA element")
                yield self.encodeStrict("</%s>" % name)

            elif token_type == "Comment":
                data = token["data"]
                if "--" in data:
                    self.serializeError("Comment contains --")
                yield self.encodeStrict("<!--%s-->" % data)

            elif token_type == "Entity":
                name = token["name"]
                key = name + ";"
                known = key in entities
                if not known:
                    self.serializeError("Entity %s not recognized" % name)
                # Only a known entity can be resolved. An unknown one is
                # reported above and then written back as a reference
                # instead of failing with KeyError on the lookup.
                if known and self.resolve_entities and key not in xmlEntities:
                    data = entities[key]
                else:
                    data = "&%s;" % name
                yield self.encodeStrict(data)

            else:
                self.serializeError(token["data"])

    def render(self, treewalker, encoding=None):
        """Serialize ``treewalker`` and return the result as one object.

        Returns bytes when ``encoding`` is truthy, otherwise text.
        """
        if encoding:
            return b"".join(self.serialize(treewalker, encoding))
        return "".join(self.serialize(treewalker))

    def serializeError(self, data="XXX ERROR MESSAGE NEEDED"):
        """Record the problem ``data``; raise SerializeError in strict mode."""
        # XXX The idea is to make data mandatory.
        self.errors.append(data)
        if self.strict:
            # Carry the message so the failure is self-explanatory.
            raise SerializeError(data)
 
330
 
 
331
 
 
332
class SerializeError(Exception):
    """Error in serialized tree.

    Raised by ``HTMLSerializer.serializeError`` when the serializer is in
    strict mode.
    """