1
# -*- coding: utf-8 -*-
2
# copied from trac.util.html, revision 3609, merged on 2006-08-20
4
# Copyright (C) 2003-2006 Edgewall Software
5
# Copyright 2006 MoinMoin:AlexanderSchremmer
8
# This software is licensed as described in the file COPYING, which
9
# you should have received as part of this distribution. The terms
10
# are also available at http://trac.edgewall.com/license.html.
12
# This software consists of voluntary contributions made by many
13
# individuals. For exact contribution history, see the revision
14
# history and logs, available at http://projects.edgewall.com/trac/.
17
from HTMLParser import HTMLParser, HTMLParseError
22
from sets import ImmutableSet as frozenset
23
from StringIO import StringIO
25
__all__ = ['escape', 'unescape', 'html']
27
# Elements written in minimized form (`<br />`); `Element.append` refuses to
# add content to them.
_EMPTY_TAGS = frozenset(['br', 'hr', 'img', 'input'])

# HTML boolean attributes: omitted from the output when the value is false,
# otherwise rendered with the attribute name as the value
# (e.g. selected="selected").  'readonly' belongs to this set as well;
# without it a false value would still be emitted as an attribute.
_BOOLEAN_ATTRS = frozenset(['selected', 'checked', 'compact', 'declare',
                            'defer', 'disabled', 'ismap', 'multiple',
                            'nohref', 'noresize', 'noshade', 'nowrap',
                            'readonly'])
33
class Markup(unicode):
34
"""Marks a string as being safe for inclusion in XML output without needing
37
Strings are normally automatically escaped when added to the HDF.
38
`Markup`-strings are however an exception. Use with care.
42
def __new__(self, text='', *args):
44
text %= tuple([escape(arg) for arg in args])
45
return unicode.__new__(self, text)
47
def __add__(self, other):
    """Concatenate `other` onto this markup.

    The right-hand operand is passed through `Markup.escape` before it is
    joined to the text, and the combined text is wrapped in a new `Markup`.
    """
    own_text = unicode(self)
    return Markup(own_text + Markup.escape(other))
50
def __mod__(self, args):
51
if not isinstance(args, (list, tuple)):
53
return Markup(unicode.__mod__(self,
54
tuple([escape(arg) for arg in args])))
56
def __mul__(self, num):
    """Repeat the text `num` times and wrap the result in a new `Markup`."""
    repeated = unicode(self) * num
    return Markup(repeated)
60
return Markup(unicode(self).join([Markup.escape(item) for item in seq]))
62
def stripentities(self, keepxmlentities=False):
63
"""Return a copy of the text with any character or numeric entities
64
replaced by the equivalent UTF-8 characters.
66
If the `keepxmlentities` parameter is provided and evaluates to `True`,
67
the core XML entities (&, ', >, < and ").
71
def _replace_entity(match):
72
if match.group(1): # numeric entity
74
if ref.startswith('x'):
75
ref = int(ref[1:], 16)
79
else: # character entity
81
if keepxmlentities and ref in ('amp', 'apos', 'gt', 'lt', 'quot'):
84
codepoint = htmlentitydefs.name2codepoint[ref]
85
return unichr(codepoint)
88
return '&%s;' % ref
91
return Markup(re.sub(r'&(?:#((?:\d+)|(?:[xX][0-9a-fA-F]+));?|(\w+);)',
92
_replace_entity, self))
95
"""Return a copy of the text with all XML/HTML tags removed."""
96
return Markup(re.sub(r'<[^>]*?>', '', self))
98
def escape(cls, text, quotes=True):
99
"""Create a Markup instance from a string and escape special characters
100
it may contain (<, >, & and \").
102
If the `quotes` parameter is set to `False`, the \" character is left
103
as is. Escaping quotes is generally only required for strings that are
104
to be used in attribute values.
106
if isinstance(text, (cls, Element)):
111
text = text.replace('&', '&') \
112
.replace('<', '<') \
113
.replace('>', '>')
115
text = text.replace('"', '"')
117
escape = classmethod(escape)
120
"""Reverse-escapes &, <, > and \" and returns a `unicode` object."""
123
return unicode(self).replace('"', '"') \
124
.replace('>', '>') \
125
.replace('<', '<') \
126
.replace('&', '&')
128
def plaintext(self, keeplinebreaks=True):
129
"""Returns the text as a `unicode`with all entities and tags removed."""
130
text = unicode(self.striptags().stripentities())
131
if not keeplinebreaks:
132
text = text.replace('\n', ' ')
136
"""Parse the text as HTML and return a cleaned up XHTML representation.
138
This will remove any javascript code or other potentially dangerous
141
If the HTML cannot be parsed, an `HTMLParseError` will be raised by the
142
underlying `HTMLParser` module, which should be handled by the caller of
146
sanitizer = HTMLSanitizer(buf)
147
sanitizer.feed(self.stripentities(keepxmlentities=True))
148
return Markup(buf.getvalue())
151
escape = Markup.escape
154
"""Reverse-escapes &, <, > and \" and returns a `unicode` object."""
155
if not isinstance(text, Markup):
157
return text.unescape()
160
class Deuglifier(object):
163
self = object.__new__(cls)
164
if not hasattr(cls, '_compiled_rules'):
165
cls._compiled_rules = re.compile('(?:' + '|'.join(cls.rules()) + ')')
166
self._compiled_rules = cls._compiled_rules
169
def format(self, indata):
    """Apply the compiled rules to `indata`.

    Every match of `self._compiled_rules` is replaced by whatever
    `self.replace` returns for that match object; the substituted text is
    returned.
    """
    pattern = self._compiled_rules
    handler = self.replace
    return re.sub(pattern, handler, indata)
172
def replace(self, fullmatch):
173
for mtype, match in fullmatch.groupdict().items():
177
elif mtype == 'endfont':
179
return '<span class="code-%s">' % mtype
182
class HTMLSanitizer(HTMLParser):
184
safe_tags = frozenset(['a', 'abbr', 'acronym', 'address', 'area',
185
'b', 'big', 'blockquote', 'br', 'button', 'caption', 'center',
186
'cite', 'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir',
187
'div', 'dl', 'dt', 'em', 'fieldset', 'font', 'form', 'h1', 'h2',
188
'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input', 'ins', 'kbd',
189
'label', 'legend', 'li', 'map', 'menu', 'ol', 'optgroup',
190
'option', 'p', 'pre', 'q', 's', 'samp', 'select', 'small',
191
'span', 'strike', 'strong', 'sub', 'sup', 'table', 'tbody',
192
'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'tt', 'u', 'ul',
194
safe_attrs = frozenset(['abbr', 'accept', 'accept-charset',
195
'accesskey', 'action', 'align', 'alt', 'axis', 'border', 'bgcolor',
196
'cellpadding', 'cellspacing', 'char', 'charoff', 'charset',
197
'checked', 'cite', 'class', 'clear', 'cols', 'colspan', 'color',
198
'compact', 'coords', 'datetime', 'dir', 'disabled', 'enctype',
199
'for', 'frame', 'headers', 'height', 'href', 'hreflang',
200
'hspace', 'id', 'ismap', 'label', 'lang', 'longdesc',
201
'maxlength', 'media', 'method', 'multiple', 'name', 'nohref',
202
'noshade', 'nowrap', 'prompt', 'readonly', 'rel', 'rev', 'rows',
203
'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
204
'span', 'src', 'start', 'style', 'summary', 'tabindex',
205
'target', 'title', 'type', 'usemap', 'valign', 'value',
207
ignore_tags = frozenset(['html', 'body'])
209
uri_attrs = frozenset(['action', 'background', 'dynsrc', 'href',
211
safe_schemes = frozenset(['file', 'ftp', 'http', 'https', 'mailto',
214
def __init__(self, out):
215
HTMLParser.__init__(self)
217
self.waiting_for = None
219
def handle_starttag(self, tag, attrs):
222
if tag in self.ignore_tags:
225
if tag not in self.safe_tags:
226
self.waiting_for = tag
228
self.out.write('<' + tag)
230
def _get_scheme(text):
233
chars = [char for char in text.split(':', 1)[0]
235
return ''.join(chars).lower()
237
for attrname, attrval in attrs:
238
if attrname not in self.safe_attrs:
240
elif attrname in self.uri_attrs:
241
# Don't allow URI schemes such as "javascript:"
242
if _get_scheme(attrval) not in self.safe_schemes:
244
elif attrname == 'style':
245
# Remove dangerous CSS declarations from inline styles
247
for decl in filter(None, attrval.split(';')):
249
if 'expression' in decl:
251
for m in re.finditer(r'url\s*\(([^)]+)', decl):
252
if _get_scheme(m.group(1)) not in self.safe_schemes:
256
decls.append(decl.strip())
259
attrval = '; '.join(decls)
260
self.out.write(' ' + attrname + '="' + escape(attrval) + '"')
262
if tag in _EMPTY_TAGS:
263
self.out.write(' />')
267
def handle_entityref(self, name):
    """Write the named entity reference back out as `&name;`.

    Nothing is written while `self.waiting_for` is set.
    """
    if self.waiting_for:
        return
    self.out.write('&%s;' % name)
271
def handle_data(self, data):
    """Write character data to the output in escaped form.

    The data is escaped with `quotes=False`.  Nothing is written while
    `self.waiting_for` is set.
    """
    if self.waiting_for:
        return
    self.out.write(escape(data, quotes=False))
275
def handle_endtag(self, tag):
276
if tag in self.ignore_tags:
280
if self.waiting_for == tag:
281
self.waiting_for = None
283
if tag not in _EMPTY_TAGS:
284
self.out.write('</' + tag + '>')
287
class Fragment(object):
288
__slots__ = ['children']
293
def append(self, node):
294
"""Append an element or string as child node."""
295
if isinstance(node, (Element, Markup, basestring, int, float, long)):
296
# For objects of a known/primitive type, we avoid the check for
297
# whether it is iterable for better performance
298
self.children.append(node)
299
elif isinstance(node, Fragment):
300
self.children += node.children
301
elif node is not None:
306
self.children.append(node)
308
def __call__(self, *args):
314
"""Generator that yield tags and text nodes as strings."""
315
for child in self.children:
316
if isinstance(child, Fragment):
319
yield escape(child, quotes=False)
321
def __unicode__(self):
    """Join everything produced by `self.serialize()` into one unicode
    string."""
    pieces = self.serialize()
    return u''.join(pieces)
325
return ''.join(self.serialize())
327
def __add__(self, other):
    """Create a fresh `Fragment`, call it with this node and `other`, and
    return the result of that call."""
    combined = Fragment()
    return combined(self, other)
331
class Element(Fragment):
332
"""Simple XHTML output generator based on the builder pattern.
334
Construct XHTML elements by passing the tag name to the constructor:
336
>>> print Element('strong')
339
Attributes can be specified using keyword arguments. The values of the
340
arguments will be converted to strings and any special XML characters
343
>>> print Element('textarea', rows=10, cols=60)
344
<textarea rows="10" cols="60"></textarea>
345
>>> print Element('span', title='1 < 2')
346
<span title="1 < 2"></span>
347
>>> print Element('span', title='"baz"')
348
<span title=""baz""></span>
350
The " character is escaped using a numerical entity.
351
The order in which attributes are rendered is undefined.
353
If an attribute value evaluates to `None`, that attribute is not included
356
>>> print Element('a', name=None)
359
Attribute names that conflict with Python keywords can be specified by
360
appending an underscore:
362
>>> print Element('div', class_='warning')
363
<div class="warning"></div>
365
While the tag names and attributes are not restricted to the XHTML language,
366
some HTML characteristics such as boolean (minimized) attributes and empty
367
elements get special treatment.
369
For compatibility with HTML user agents, some XHTML elements need to be
370
closed using a separate closing tag even if they are empty. For this, the
371
close tag is only ommitted for a small set of elements which are known be
372
be safe for use as empty elements:
374
>>> print Element('br')
377
Trying to add nested elements to such an element will cause an
380
>>> Element('br')('Oops')
381
Traceback (most recent call last):
383
AssertionError: 'br' elements must not have content
385
Furthermore, boolean attributes such as "selected" or "checked" are omitted
386
if the value evaluates to `False`. Otherwise, the name of the attribute is
389
>>> print Element('option', value=0, selected=False)
390
<option value="0"></option>
391
>>> print Element('option', selected='yeah')
392
<option selected="selected"></option>
395
Nested elements can be added to an element by calling the instance using
396
positional arguments. The same technique can also be used for adding
397
attributes using keyword arguments, as one would do in the constructor:
399
>>> print Element('ul')(Element('li'), Element('li'))
400
<ul><li></li><li></li></ul>
401
>>> print Element('a')('Label')
403
>>> print Element('a')('Label', href="target")
404
<a href="target">Label</a>
406
Text nodes can be nested in an element by adding strings instead of
407
elements. Any special characters in the strings are escaped automatically:
409
>>> print Element('em')('Hello world')
411
>>> print Element('em')(42)
413
>>> print Element('em')('1 < 2')
416
This technique also allows mixed content:
418
>>> print Element('p')('Hello ', Element('b')('world'))
419
<p>Hello <b>world</b></p>
421
Elements can also be combined with other elements or strings using the
422
addition operator, which results in a `Fragment` object that contains the
425
>>> print Element('br') + 'some text' + Element('br')
426
<br />some text<br />
428
__slots__ = ['tagname', 'attr']
430
def __init__(self, tagname_=None, **attr):
431
Fragment.__init__(self)
433
self.tagname = tagname_
437
def __call__(self, *args, **attr):
    """Merge the keyword arguments into this element's attributes, then
    pass the positional arguments on to `Fragment.__call__` and return its
    result."""
    attributes = self.attr
    attributes.update(attr)
    return Fragment.__call__(self, *args)
441
def append(self, node):
    """Append an element or string as child node.

    Raises `AssertionError` when this element's tag name is listed in
    `_EMPTY_TAGS`; otherwise the node is passed to `Fragment.append`.
    """
    # Raise explicitly instead of using an `assert` statement: asserts are
    # removed when Python runs with -O, which would silently disable this
    # check.  Exception type and message are the same as before.
    if self.tagname in _EMPTY_TAGS:
        raise AssertionError("'%s' elements must not have content"
                             % self.tagname)
    Fragment.append(self, node)
448
"""Generator that yield tags and text nodes as strings."""
449
starttag = ['<', self.tagname]
450
for name, value in self.attr.items():
453
if name in _BOOLEAN_ATTRS:
458
name = name.rstrip('_').replace('_', '-')
459
starttag.append(' %s="%s"' % (name.lower(), escape(value)))
461
if self.children or self.tagname not in _EMPTY_TAGS:
463
yield Markup(''.join(starttag))
464
for part in Fragment.serialize(self):
466
yield Markup('</%s>', self.tagname)
469
starttag.append(' />')
470
yield Markup(''.join(starttag))
475
def __getattribute__(self, name):
    """Return a new `Element` whose tag name is the lower-cased name of the
    attribute being looked up."""
    tagname = name.lower()
    return Element(tagname)