3
3
from __future__ import unicode_literals
7
from django.utils.safestring import SafeData, mark_safe
8
from django.utils.deprecation import RemovedInDjango18Warning
8
9
from django.utils.encoding import force_text, force_str
9
10
from django.utils.functional import allow_lazy
11
from django.utils.safestring import SafeData, mark_safe
10
12
from django.utils import six
11
13
from django.utils.six.moves.urllib.parse import quote, unquote, urlsplit, urlunsplit
12
14
from django.utils.text import normalize_newlines
17
19
# Configuration for urlize() function.
# Punctuation that may trail a link without being treated as part of it.
TRAILING_PUNCTUATION = ['.', ',', ':', ';', '.)', '"', '\'']
# (opening, closing) pairs that may wrap a link. The entity-escaped angle
# brackets cover input whose "<" and ">" have already been HTML-escaped.
WRAPPING_PUNCTUATION = [('(', ')'), ('<', '>'), ('[', ']'), ('&lt;', '&gt;'), ('"', '"'), ('\'', '\'')]
21
23
# List of possible strings used for bullets in bulleted lists.
# Entity and numeric-reference spellings are listed explicitly because these
# strings are matched against markup text as written. The literal middle dot
# and bullet characters are kept as well.
DOTS = ['&middot;', '\u00b7', '*', '\u2022', '&#149;', '&bull;', '&#8226;']
28
30
# Loose e-mail shape: non-whitespace text containing an "@" that is followed
# later by a ".".
simple_email_re = re.compile(r'^\S+@\S+\.\S+$')
# Captures the start of an <a ...> tag up to (not including) a target
# attribute, so the attribute can be dropped by substituting group 1.
link_target_attribute_re = re.compile(r'(<a [^>]*?)target=[^\s>]+')
# Empty or presentational markup, matched case-insensitively.
html_gunk_re = re.compile(
    r'(?:<br clear="all">|<i><\/i>|<b><\/b>|<em><\/em>|<strong><\/strong>|<\/?smallcaps>|<\/?uppercase>)',
    flags=re.IGNORECASE)
31
# One or more consecutive <p> paragraphs that each start with one of the DOTS
# bullet strings and contain at least one ASCII letter. DOTALL lets a
# paragraph span several lines.
hard_coded_bullets_re = re.compile(
    r'((?:<p>(?:%s).*?[a-zA-Z].*?</p>\s*)+)' % '|'.join(re.escape(x) for x in DOTS),
    re.DOTALL)
32
34
# A run of paragraphs at the very end of the text that contain nothing but
# &nbsp; entities, whitespace or <br /> tags.
trailing_empty_content_re = re.compile(r'(?:<p>(?:&nbsp;|\s|<br \/>)*?</p>\s*)+\Z')
56
58
# Escape every ASCII character with a value less than 32.
_js_escapes.update((code, '\\u%04X' % code) for code in range(32))
59
62
def escapejs(value):
    """
    Hex encodes characters for use in JavaScript strings.

    The value is coerced with force_text(), translated through the
    module-level _js_escapes table, and the result is marked safe.
    """
    return mark_safe(force_text(value).translate(_js_escapes))
escapejs = allow_lazy(escapejs, six.text_type)
64
68
def conditional_escape(text):
    """
    Similar to escape(), except that it doesn't operate on pre-escaped strings.

    Objects that expose an ``__html__`` method are rendered through it;
    anything else is passed to escape().
    """
    if hasattr(text, '__html__'):
        return text.__html__()
    return escape(text)
73
78
def format_html(format_string, *args, **kwargs):
    """
    Similar to str.format, but passes all arguments through conditional_escape,
    and marks the formatted result as safe. Use it instead
    of str.format or % interpolation to build up small HTML fragments.

    ``format_string`` itself is not escaped; only the positional and keyword
    arguments are.
    """
    args_safe = map(conditional_escape, args)
    kwargs_safe = dict((k, conditional_escape(v)) for (k, v) in six.iteritems(kwargs))
    return mark_safe(format_string.format(*args_safe, **kwargs_safe))
84
89
def format_html_join(sep, format_string, args_generator):
    """
    A wrapper of format_html, for the common case of a group of arguments that
    are each formatted with the same format string and then joined with
    ``sep``.

    ``sep`` is passed through conditional_escape. ``args_generator`` must
    yield one sequence of positional arguments per fragment; each sequence is
    unpacked into a format_html call. The joined result is marked safe.
    """
    return mark_safe(conditional_escape(sep).join(
        format_html(format_string, *tuple(args))
        for args in args_generator))
104
109
def linebreaks(value, autoescape=False):
121
126
HTMLParser.__init__(self, strict=False)
124
130
def handle_data(self, d):
    """Collect a run of text data by appending it to ``self.fed``."""
    self.fed.append(d)
126
133
def handle_entityref(self, name):
    """Re-emit a named entity reference as ``&name;`` into ``self.fed``."""
    self.fed.append('&%s;' % name)
128
136
def handle_charref(self, name):
    """Re-emit a numeric character reference as ``&#name;`` into ``self.fed``."""
    self.fed.append('&#%s;' % name)
130
139
def get_data(self):
    """Return everything collected in ``self.fed`` joined into one string."""
    return ''.join(self.fed)
145
except (HTMLParseError, UnboundLocalError) as err:
154
except (HTMLParseError, UnboundLocalError):
146
155
# UnboundLocalError because of http://bugs.python.org/issue17802
147
156
# on Python 3.2, triggered by strict=False mode of HTMLParser
148
157
return s.get_data() + s.rawdata
153
162
def strip_tags(value):
154
163
"""Returns the given HTML with all tags stripped."""
156
if not ('<' in value or '>' in value):
164
# Note: in typical case this loop executes _strip_once once. Loop condition
165
# is redundant, but helps to reduce number of executions of _strip_once.
166
while '<' in value and '>' in value:
158
167
new_value = _strip_once(value)
159
168
if new_value == value:
160
169
# _strip_once was not able to detect more tags
164
173
strip_tags = allow_lazy(strip_tags)
166
176
def remove_tags(html, tags):
167
177
"""Returns the given HTML with given tags removed."""
168
178
tags = [re.escape(tag) for tag in tags.split()]
175
185
remove_tags = allow_lazy(remove_tags, six.text_type)
177
188
def strip_spaces_between_tags(value):
    """
    Returns the given HTML with spaces between tags removed.

    Any run of whitespace that sits directly between a ``>`` and the next
    ``<`` is dropped; whitespace inside text content is left alone.
    """
    return re.sub(r'>\s+<', '><', force_text(value))
strip_spaces_between_tags = allow_lazy(strip_spaces_between_tags, six.text_type)
182
194
def strip_entities(value):
    """
    Returns the given HTML with all entities (&something;) stripped.

    Both named entities (``&word;``) and decimal numeric references
    (``&#123;``) are removed; hexadecimal references are not matched by the
    pattern.
    """
    return re.sub(r'&(?:\w+|#\d+);', '', force_text(value))
strip_entities = allow_lazy(strip_entities, six.text_type)
187
200
def fix_ampersands(value):
    """
    Returns the given HTML with all unencoded ampersands encoded correctly.

    Deprecated: emits RemovedInDjango18Warning on every call.
    """
    # As fix_ampersands is wrapped in allow_lazy, stacklevel 3 is more useful than 2.
    warnings.warn("The fix_ampersands function is deprecated and will be removed in Django 1.8.",
                  RemovedInDjango18Warning, stacklevel=3)
    # The replacement must be the entity, not a bare "&"; substituting "&"
    # for "&" would leave the text unchanged.
    return unencoded_ampersands_re.sub('&amp;', force_text(value))
fix_ampersands = allow_lazy(fix_ampersands, six.text_type)
192
209
def smart_urlquote(url):
193
210
"Quotes a URL if it isn't already quoted."
194
211
# Handle IDN before quoting.
196
213
scheme, netloc, path, query, fragment = urlsplit(url)
198
netloc = netloc.encode('idna').decode('ascii') # IDN -> ACE
199
except UnicodeError: # invalid domain part
215
netloc = netloc.encode('idna').decode('ascii') # IDN -> ACE
216
except UnicodeError: # invalid domain part
202
219
url = urlunsplit((scheme, netloc, path, query, fragment))
219
237
Links can have trailing punctuation (periods, commas, close-parens) and
220
238
leading punctuation (opening parens) and it'll still do the right thing.
222
If trim_url_limit is not None, the URLs in link text longer than this limit
223
will be truncated to trim_url_limit-3 characters and appended with an ellipsis.
225
If nofollow is True, the URLs in link text will get a rel="nofollow"
228
If autoescape is True, the link text and URLs will get autoescaped.
240
If trim_url_limit is not None, the URLs in the link text longer than this
241
limit will be truncated to trim_url_limit-3 characters and appended with
244
If nofollow is True, the links will get a rel="nofollow" attribute.
246
If autoescape is True, the link text and URLs will be autoescaped.
230
248
def trim_url(x, limit=trim_url_limit):
231
249
if limit is None or len(x) <= limit:
234
252
safe_input = isinstance(text, SafeData)
235
253
words = word_split_re.split(force_text(text))
236
254
for i, word in enumerate(words):
238
255
if '.' in word or '@' in word or ':' in word:
239
256
# Deal with punctuation.
240
257
lead, middle, trail = '', word, ''
248
265
lead = lead + opening
249
266
# Keep parentheses at the end only if they're balanced.
250
267
if (middle.endswith(closing)
251
and middle.count(closing) == middle.count(opening) + 1):
268
and middle.count(closing) == middle.count(opening) + 1):
252
269
middle = middle[:-len(closing)]
253
270
trail = closing + trail
259
276
url = smart_urlquote(middle)
260
277
elif simple_url_2_re.match(middle):
261
278
url = smart_urlquote('http://%s' % middle)
262
elif not ':' in middle and simple_email_re.match(middle):
279
elif ':' not in middle and simple_email_re.match(middle):
263
280
local, domain = middle.rsplit('@', 1)
265
282
domain = domain.encode('idna').decode('ascii')
300
318
* Remove stuff like "<p>&nbsp;&nbsp;</p>", but only if it's at the
301
319
bottom of the text.
303
from django.utils.text import normalize_newlines
304
text = normalize_newlines(force_text(text))
321
# As clean_html is wrapped in allow_lazy, stacklevel 3 is more useful than 2.
322
warnings.warn("The clean_html function is deprecated and will be removed in Django 1.8.",
323
RemovedInDjango18Warning, stacklevel=3)
324
text = normalize_newlines(text)
305
325
text = re.sub(r'<(/?)\s*b\s*>', '<\\1strong>', text)
306
326
text = re.sub(r'<(/?)\s*i\s*>', '<\\1em>', text)
307
327
text = fix_ampersands(text)
310
330
# Trim stupid HTML such as <br clear="all">.
311
331
text = html_gunk_re.sub('', text)
312
332
# Convert hard-coded bullets into HTML unordered lists.
313
334
def replace_p_tags(match):
314
335
s = match.group().replace('</p>', '</li>')