__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'

import re
from codecs import BOM_UTF8, lookup
from lib2to3.pgen2.token import *

from . import token
class StopTokenizing(Exception): pass

def printtoken(type, token, start, end, line): # for testing
    (srow, scol) = start
    (erow, ecol) = end
    print("%d,%d-%d,%d:\t%s\t%s" %
          (srow, scol, erow, ecol, tok_name[type], repr(token)))

            # Fragment of the untokenize() compatibility loop that re-emits
            # each token's string value; only these two lines of it appear
            # in this excerpt.
                    startline = False
            toks_append(tokval)

cookie_re = re.compile(r"coding[:=]\s*([-\w.]+)")
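# Illustration (added, not part of the original module): lines that cookie_re
# recognizes as PEP 263 encoding declarations, with the captured group shown.
#   "# -*- coding: utf-8 -*-"                 -> "utf-8"
#   "# vim: set fileencoding=iso-8859-1 :"    -> "iso-8859-1"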

def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc

def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file. It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read in.

    It detects the encoding from the presence of a utf-8 bom or an encoding
    cookie as specified in pep-0263. If both a bom and a cookie are present,
    but disagree, a SyntaxError will be raised. If the encoding cookie is an
    invalid charset, raise a SyntaxError.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    bom_found = False
    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return bytes()

    def find_cookie(line):
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            return None

        matches = cookie_re.findall(line_string)
        if not matches:
            return None
        encoding = _get_normal_name(matches[0])
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            if codec.name != 'utf-8':
                # This behaviour mimics the Python interpreter
                raise SyntaxError('encoding problem: utf-8')
            else:
                # Allow it to be properly encoded and decoded.
                encoding = 'utf-8-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
    if not first:
        return 'utf-8', []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]

    second = read_or_stop()
    if not second:
        return 'utf-8', [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return 'utf-8', [first, second]
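
# Usage sketch (added for illustration; the helper name below is ours, not
# part of this module). detect_encoding() takes the readline method of a
# binary stream, exactly as tokenize() would, and reports the encoding plus
# the raw lines it had to consume to find it.
def _demo_detect_encoding():
    import io
    source = b"# -*- coding: iso-8859-1 -*-\nname = 'caf\xe9'\n"
    encoding, consumed = detect_encoding(io.BytesIO(source).readline)
    assert encoding == 'iso-8859-1'
    # The consumed lines are returned so a caller can re-feed them before
    # continuing to read from the same stream.
    assert consumed == [b"# -*- coding: iso-8859-1 -*-\n"]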

def untokenize(iterable):
    """Transform tokens back into Python source code.