7
This is a Python implementation of the `WHATWG Encoding standard
8
<http://encoding.spec.whatwg.org/>`_. See README for details.
10
:copyright: Copyright 2012 by Simon Sapin
11
:license: BSD, see LICENSE for details.
15
from __future__ import unicode_literals
19
from .labels import LABELS
25
# Some names in Encoding are not valid Python aliases. Remap these.
# Keys are canonical encoding names, values are the codec names Python knows.
PYTHON_NAMES = {
    'iso-8859-8-i': 'iso-8859-8',
    'x-mac-cyrillic': 'mac-cyrillic',
    'macintosh': 'mac-roman',
    'windows-874': 'cp874',
}
35
def ascii_lower(string):
    r"""Transform (only) ASCII letters to lower case: A-Z is mapped to a-z.

    :param string: A Unicode string.
    :returns: A new Unicode string.

    This is used for `ASCII case-insensitive
    <http://encoding.spec.whatwg.org/#ascii-case-insensitive>`_
    matching of encoding labels.
    The same matching is also used, among other things,
    for `CSS keywords <http://dev.w3.org/csswg/css-values/#keywords>`_.

    This is different from the :meth:`~py:str.lower` method of Unicode strings
    which also affects non-ASCII characters,
    sometimes mapping them into the ASCII range:

        >>> keyword = u'Bac\N{KELVIN SIGN}ground'
        >>> assert keyword.lower() == u'background'
        >>> assert ascii_lower(keyword) != keyword.lower()
        >>> assert ascii_lower(keyword) == u'bac\N{KELVIN SIGN}ground'

    """
    # This turns out to be faster than unicode.translate()
    # The lower-casing is done on the UTF-8 bytes rather than on the text,
    # which is what keeps non-ASCII characters untouched (see the doctest).
    return string.encode('utf8').lower().decode('utf8')
63
def lookup(label):
    """
    Look for an encoding by its label.
    This is the spec’s `get an encoding
    <http://encoding.spec.whatwg.org/#concept-encoding-get>`_ algorithm.
    Supported labels are listed there.

    :param label: A string.
    :returns:
        An :class:`Encoding` object, or :obj:`None` for an unknown label.

    """
    # Only strip ASCII whitespace: U+0009, U+000A, U+000C, U+000D, and U+0020.
    label = ascii_lower(label.strip('\t\n\f\r '))
    name = LABELS.get(label)
    if name is None:
        # Unknown label: report it with None instead of failing further down.
        return None
    encoding = CACHE.get(name)
    if encoding is None:
        # First request for this name: build the Encoding and remember it.
        if name == 'x-user-defined':
            # This codec is provided by a sibling module of this package.
            from .x_user_defined import codec_info
        else:
            python_name = PYTHON_NAMES.get(name, name)
            # Any python_name value that gets to here should be valid.
            codec_info = codecs.lookup(python_name)
        encoding = Encoding(name, codec_info)
        CACHE[name] = encoding
    return encoding
91
def _get_encoding(encoding_or_label):
    """
    Accept either an encoding object or label.

    :param encoding_or_label:
        An :class:`Encoding` object or a label string.
    :returns: An :class:`Encoding` object.
    :raises: :exc:`~exceptions.LookupError` for an unknown label.

    """
    # Anything that exposes ``codec_info`` is passed through unchanged.
    if hasattr(encoding_or_label, 'codec_info'):
        return encoding_or_label

    encoding = lookup(encoding_or_label)
    if encoding is None:
        raise LookupError('Unknown encoding label: %r' % encoding_or_label)
    return encoding
109
class Encoding(object):
    """Represents a character encoding such as UTF-8,
    that can be used for decoding or encoding.

    .. attribute:: name

        Canonical name of the encoding

    .. attribute:: codec_info

        The actual implementation of the encoding,
        a stdlib :class:`~codecs.CodecInfo` object.
        See :func:`codecs.register`.

    """
    def __init__(self, name, codec_info):
        self.name = name
        self.codec_info = codec_info

    def __repr__(self):
        return '<Encoding %s>' % self.name
132
#: The UTF-8 encoding. Should be used for new content and formats.
133
UTF8 = lookup('utf-8')
135
# UTF-16 encodings, looked up once when the module is loaded.
# `_detect_bom` returns them when the input starts with the matching BOM.
_UTF16LE = lookup('utf-16le')
136
_UTF16BE = lookup('utf-16be')
139
def decode(input, fallback_encoding, errors='replace'):
    """
    Decode a single string.

    :param input: A byte string
    :param fallback_encoding:
        An :class:`Encoding` object or a label string.
        The encoding to use if :obj:`input` does not have a BOM.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
    :return:
        A ``(output, encoding)`` tuple of a Unicode string
        and an :obj:`Encoding`.

    """
    # Fail early if `fallback_encoding` is an invalid label.
    fallback_encoding = _get_encoding(fallback_encoding)
    bom_encoding, input = _detect_bom(input)
    # A BOM, when present, takes precedence over the caller's fallback.
    encoding = bom_encoding or fallback_encoding
    return encoding.codec_info.decode(input, errors)[0], encoding
161
def _detect_bom(input):
    """Return (bom_encoding, input), with any BOM removed from the input.

    ``bom_encoding`` is :obj:`None` when the input does not start with a
    UTF-16 (little or big endian) or UTF-8 byte order mark; the input is
    then returned unchanged.
    """
    if input.startswith(b'\xFF\xFE'):
        return _UTF16LE, input[2:]
    if input.startswith(b'\xFE\xFF'):
        return _UTF16BE, input[2:]
    if input.startswith(b'\xEF\xBB\xBF'):
        return UTF8, input[3:]
    # No BOM: callers unpack the result, so a 2-tuple is still required.
    return None, input
172
def encode(input, encoding=UTF8, errors='strict'):
    """
    Encode a single string.

    :param input: A Unicode string.
    :param encoding: An :class:`Encoding` object or a label string.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
    :return: A byte string.

    """
    # The codec returns an (output, length consumed) pair; keep the output.
    return _get_encoding(encoding).codec_info.encode(input, errors)[0]
186
def iter_decode(input, fallback_encoding, errors='replace'):
    """
    "Pull"-based decoder.

    :param input:
        An iterable of byte strings.

        The input is first consumed just enough to determine the encoding
        based on the presence of a BOM,
        then consumed on demand when the return value is.
    :param fallback_encoding:
        An :class:`Encoding` object or a label string.
        The encoding to use if :obj:`input` does not have a BOM.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
    :returns:
        An ``(output, encoding)`` tuple.
        :obj:`output` is an iterable of Unicode strings,
        :obj:`encoding` is the :obj:`Encoding` that is being used.

    """
    decoder = IncrementalDecoder(fallback_encoding, errors)
    generator = _iter_decode_generator(input, decoder)
    # The generator's first item is the encoding; the rest is the output.
    encoding = next(generator)
    return generator, encoding
214
def _iter_decode_generator(input, decoder):
    """Return a generator that first yields the :obj:`Encoding`,
    then yields output chunks as Unicode strings.

    :param input: An iterable of byte strings.
    :param decoder:
        An object with a ``decode(input, final=False)`` method and an
        ``encoding`` attribute, such as :class:`IncrementalDecoder`.

    """
    decode = decoder.decode
    # Share one iterator so the second loop resumes where the first stopped.
    input = iter(input)
    for chunk in input:
        output = decode(chunk)
        if output:
            # The encoding is expected to be known once output is produced.
            assert decoder.encoding is not None
            yield decoder.encoding
            yield output
            break
    else:
        # Input exhausted without determining the encoding
        output = decode(b'', final=True)
        assert decoder.encoding is not None
        yield decoder.encoding
        if output:
            yield output
        return

    for chunk in input:
        output = decode(chunk)
        if output:
            yield output
    # Flush whatever the decoder is still holding back.
    output = decode(b'', final=True)
    if output:
        yield output
246
def iter_encode(input, encoding=UTF8, errors='strict'):
    """
    “Pull”-based encoder.

    :param input: An iterable of Unicode strings.
    :param encoding: An :class:`Encoding` object or a label string.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
    :returns: An iterable of byte strings.

    """
    # Fail early if `encoding` is an invalid label.
    encode = IncrementalEncoder(encoding, errors).encode
    return _iter_encode_generator(input, encode)
262
def _iter_encode_generator(input, encode):
    """Yield the non-empty byte strings produced by encoding each chunk.

    :param input: An iterable of Unicode strings.
    :param encode:
        A callable taking ``(input, final=False)``, such as the ``encode``
        method of an incremental encoder.

    """
    for chunk in input:
        output = encode(chunk)
        if output:
            yield output
    # Flush whatever the encoder is still holding back.
    output = encode('', final=True)
    if output:
        yield output
272
class IncrementalDecoder(object):
    """
    “Push”-based decoder.

    :param fallback_encoding:
        An :class:`Encoding` object or a label string.
        The encoding to use if :obj:`input` does not have a BOM.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.

    """
    def __init__(self, fallback_encoding, errors='replace'):
        # Fail early if `fallback_encoding` is an invalid label.
        self._fallback_encoding = _get_encoding(fallback_encoding)
        self._errors = errors
        # Bytes held back while it is still undecided whether a BOM is there.
        self._buffer = b''
        # Bound ``decode`` of the codec's incremental decoder, once chosen.
        self._decoder = None
        #: The actual :class:`Encoding` that is being used,
        #: or :obj:`None` if that is not determined yet.
        #: (I.e. if there is not enough input yet to determine
        #: if there is a BOM.)
        self.encoding = None  # Not known yet.

    def decode(self, input, final=False):
        """Decode one chunk of the input.

        :param input: A byte string.
        :param final:
            Indicate that no more input is available.
            Must be :obj:`True` if this is the last call.
        :returns:
            A Unicode string. It is empty while the input seen so far is
            too short to tell whether it starts with a BOM.

        """
        decoder = self._decoder
        if decoder is not None:
            # Encoding already chosen: hand the chunk straight to the codec.
            return decoder(input, final)

        input = self._buffer + input
        encoding, input = _detect_bom(input)
        if encoding is None:
            if len(input) < 3 and not final:  # Not enough data yet.
                self._buffer = input
                return ''
            else:  # No BOM
                encoding = self._fallback_encoding
        decoder = encoding.codec_info.incrementaldecoder(self._errors).decode
        self._decoder = decoder
        self.encoding = encoding
        return decoder(input, final)
323
class IncrementalEncoder(object):
    """
    “Push”-based encoder.

    :param encoding: An :class:`Encoding` object or a label string.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.

    .. method:: encode(input, final=False)

        :param input: A Unicode string.
        :param final:
            Indicate that no more input is available.
            Must be :obj:`True` if this is the last call.
        :returns: A byte string.

    """
    def __init__(self, encoding=UTF8, errors='strict'):
        encoding = _get_encoding(encoding)
        # ``encode`` is the bound method of the codec's incremental encoder.
        self.encode = encoding.codec_info.incrementalencoder(errors).encode