# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
# All rights reserved.

"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens.  It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF).  It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'

import string, re
from codecs import BOM_UTF8, lookup
from lib2to3.pgen2.token import *

from . import token
__all__ = [x for x in dir(token) if x[0] != '_'] + ["tokenize",
           "generate_tokens", "untokenize"]
del token

try:
    bytes
except NameError:
    # Support bytes type in Python <= 2.5, so 2to3 turns itself into
    # valid Python 3 code.
    bytes = str

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
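
# Illustrative note (not in the original module): these helpers build
# regex alternations, so the token patterns below stay readable, e.g.:
#
#     group('a', 'b') == '(a|b)'
#     any('a', 'b')   == '(a|b)*'
#     maybe('a', 'b') == '(a|b)?'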

Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Binnumber = r'0[bB][01]*'
Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
Octnumber = r'0[oO]?[0-7]*[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[ubUB]?[rR]?'''", '[ubUB]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken
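
# Illustrative note (not in the original module): given the definitions
# above, the longest-operator-first ordering means a compound operator is
# matched whole, e.g.:
#
#     re.match(Operator, "**=").group() == '**='   # not '*', '*', '='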

# First (or only) line of ' or " string.
ContStr = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog, pseudoprog, single3prog, double3prog = list(map(
    re.compile, (Token, PseudoToken, Single3, Double3)))
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            'r': None, 'R': None,
            'u': None, 'U': None,
            'b': None, 'B': None}
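
# Illustrative note (not in the original module): endprogs maps an opening
# quote (with optional string prefix) to a pattern matching the *rest* of
# the string, e.g.:
#
#     endprogs["'''"].match("abc''' tail").group() == "abc'''"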

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""',):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "b'", 'b"', "B'", 'B"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"', ):
    single_quoted[t] = t

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, xxx_todo_changeme, xxx_todo_changeme1, line): # for testing
    (srow, scol) = xxx_todo_changeme
    (erow, ecol) = xxx_todo_changeme1
    print("%d,%d-%d,%d:\t%s\t%s" % \
        (srow, scol, erow, ecol, tok_name[type], repr(token)))

def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass

# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)
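
# Illustrative sketch (not part of the original module): driving the older
# callback interface with a custom tokeneater; `count_names` and the sample
# source are hypothetical.
#
#     import io
#     counts = {}
#     def count_names(type, token, start, end, line):
#         if type == NAME:
#             counts[token] = counts.get(token, 0) + 1
#     tokenize(io.StringIO("x = x + 1\n").readline, count_names)
#     # counts == {'x': 2}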

class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, start):
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token
        if toknum in (NAME, NUMBER):
            tokval += ' '
        if toknum in (NEWLINE, NL):
            startline = True
        for tok in iterable:
            toknum, tokval = tok[:2]

            if toknum in (NAME, NUMBER):
                tokval += ' '

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)

cookie_re = re.compile(r'^[ \t\f]*#.*coding[:=][ \t]*([-\w.]+)', re.ASCII)

def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc
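
# Illustrative note (not in the original module): examples of the
# normalization performed above:
#
#     _get_normal_name("UTF_8")   == "utf-8"
#     _get_normal_name("Latin-1") == "iso-8859-1"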

def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file. It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read
    in.

    It detects the encoding from the presence of a utf-8 bom or an encoding
    cookie as specified in pep-0263. If both a bom and a cookie are present, but
    disagree, a SyntaxError will be raised. If the encoding cookie is an invalid
    charset, raise a SyntaxError.  Note that if a utf-8 bom is found,
    'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    bom_found = False
    encoding = None
    default = 'utf-8'
    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return bytes()

    def find_cookie(line):
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            return None
        match = cookie_re.match(line_string)
        if not match:
            return None
        encoding = _get_normal_name(match.group(1))
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            if codec.name != 'utf-8':
                # This behaviour mimics the Python interpreter
                raise SyntaxError('encoding problem: utf-8')
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]
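
# Illustrative sketch (not part of the original module): a typical caller
# opens the file in binary mode to detect, then reopens it as text with the
# detected encoding; the path 'example.py' is hypothetical.
#
#     with open('example.py', 'rb') as f:
#         encoding, first_lines = detect_encoding(f.readline)
#     with open('example.py', encoding=encoding) as f:
#         source = f.read()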

def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).next
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)
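
# Illustrative sketch (not part of the original module): the limited
# (two-element) round trip from the docstring, written out for Python 3;
# the sample source is hypothetical.
#
#     import io
#     source = "x = 1\nif x:\n    x += 2\n"
#     t1 = [tok[:2] for tok in generate_tokens(io.StringIO(source).readline)]
#     newcode = untokenize(t1)
#     readline = iter(newcode.splitlines(1)).__next__
#     t2 = [tok[:2] for tok in generate_tokens(readline)]
#     assert t1 == t2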

def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string.  Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).next    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    newline = NEWLINE
                    if parenlev > 0:
                        newline = NL
                    yield (newline, token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    # This yield is new; needed for better idempotency:
                    yield (NL, token, spos, (lnum, pos), line)
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')
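
# Illustrative sketch (not part of the original module): consuming the
# generator over an in-memory source string; the sample source is
# hypothetical.
#
#     import io
#     source = "def f():\n    return 1\n"
#     for type, token, start, end, line in generate_tokens(
#             io.StringIO(source).readline):
#         print(tok_name[type], repr(token), start, end)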

if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
    else: tokenize(sys.stdin.readline)