# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
# All rights reserved.
"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens.  It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF).  It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""
28
__author__ = 'Ka-Ping Yee <ping@lfw.org>'
30
'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'
import re
import string

from lib2to3.pgen2 import token
from lib2to3.pgen2.token import *
__all__ = [x for x in dir(token) if x[0] != '_'] + ["tokenize",
37
"generate_tokens", "untokenize"]
40
def group(*choices): return '(' + '|'.join(choices) + ')'
41
def any(*choices): return group(*choices) + '*'
42
def maybe(*choices): return group(*choices) + '?'
44
Whitespace = r'[ \f\t]*'
45
Comment = r'#[^\r\n]*'
46
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
47
Name = r'[a-zA-Z_]\w*'
49
Binnumber = r'0[bB][01]*'
50
Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
51
Octnumber = r'0[oO]?[0-7]*[lL]?'
52
Decnumber = r'[1-9]\d*[lL]?'
53
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
54
Exponent = r'[eE][-+]?\d+'
55
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
56
Expfloat = r'\d+' + Exponent
57
Floatnumber = group(Pointfloat, Expfloat)
58
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
59
Number = group(Imagnumber, Floatnumber, Intnumber)
61
# Tail end of ' string.
62
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
63
# Tail end of " string.
64
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
65
# Tail end of ''' string.
66
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
67
# Tail end of """ string.
68
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
69
Triple = group("[ubUB]?[rR]?'''", '[ubUB]?[rR]?"""')
70
# Single-line ' or " string.
71
String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
72
r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
74
# Because of leftmost-then-longest match semantics, be sure to put the
75
# longest operators first (e.g., if = came before ==, == would get
76
# recognized as two instances of =).
77
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
83
Special = group(r'\r?\n', r'[:;.,`@]')
84
Funny = group(Operator, Bracket, Special)
86
PlainToken = group(Number, Funny, String, Name)
87
Token = Ignore + PlainToken
89
# First (or only) line of ' or " string.
90
ContStr = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
91
group("'", r'\\\r?\n'),
92
r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
93
group('"', r'\\\r?\n'))
94
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
95
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
97
tokenprog, pseudoprog, single3prog, double3prog = map(
98
re.compile, (Token, PseudoToken, Single3, Double3))
99
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
100
"'''": single3prog, '"""': double3prog,
101
"r'''": single3prog, 'r"""': double3prog,
102
"u'''": single3prog, 'u"""': double3prog,
103
"b'''": single3prog, 'b"""': double3prog,
104
"ur'''": single3prog, 'ur"""': double3prog,
105
"br'''": single3prog, 'br"""': double3prog,
106
"R'''": single3prog, 'R"""': double3prog,
107
"U'''": single3prog, 'U"""': double3prog,
108
"B'''": single3prog, 'B"""': double3prog,
109
"uR'''": single3prog, 'uR"""': double3prog,
110
"Ur'''": single3prog, 'Ur"""': double3prog,
111
"UR'''": single3prog, 'UR"""': double3prog,
112
"bR'''": single3prog, 'bR"""': double3prog,
113
"Br'''": single3prog, 'Br"""': double3prog,
114
"BR'''": single3prog, 'BR"""': double3prog,
115
'r': None, 'R': None,
116
'u': None, 'U': None,
117
'b': None, 'B': None}
120
for t in ("'''", '"""',
121
"r'''", 'r"""', "R'''", 'R"""',
122
"u'''", 'u"""', "U'''", 'U"""',
123
"b'''", 'b"""', "B'''", 'B"""',
124
"ur'''", 'ur"""', "Ur'''", 'Ur"""',
125
"uR'''", 'uR"""', "UR'''", 'UR"""',
126
"br'''", 'br"""', "Br'''", 'Br"""',
127
"bR'''", 'bR"""', "BR'''", 'BR"""',):
131
"r'", 'r"', "R'", 'R"',
132
"u'", 'u"', "U'", 'U"',
133
"b'", 'b"', "B'", 'B"',
134
"ur'", 'ur"', "Ur'", 'Ur"',
135
"uR'", 'uR"', "UR'", 'UR"',
136
"br'", 'br"', "Br'", 'Br"',
137
"bR'", 'bR"', "BR'", 'BR"', ):
142
class TokenError(Exception): pass
144
class StopTokenizing(Exception): pass
146
def printtoken(type, token, (srow, scol), (erow, ecol), line): # for testing
147
print "%d,%d-%d,%d:\t%s\t%s" % \
148
(srow, scol, erow, ecol, tok_name[type], repr(token))
150
def tokenize(readline, tokeneater=printtoken):
152
The tokenize() function accepts two parameters: one representing the
153
input stream, and one providing an output mechanism for tokenize().
155
The first parameter, readline, must be a callable object which provides
156
the same interface as the readline() method of built-in file objects.
157
Each call to the function should return one line of input as a string.
159
The second parameter, tokeneater, must also be a callable object. It is
160
called once for each token, with five arguments, corresponding to the
161
tuples generated by generate_tokens().
164
tokenize_loop(readline, tokeneater)
165
except StopTokenizing:
168
# backwards compatible interface
169
def tokenize_loop(readline, tokeneater):
170
for token_info in generate_tokens(readline):
171
tokeneater(*token_info)
180
def add_whitespace(self, start):
182
assert row <= self.prev_row
183
col_offset = col - self.prev_col
185
self.tokens.append(" " * col_offset)
187
def untokenize(self, iterable):
190
self.compat(t, iterable)
192
tok_type, token, start, end, line = t
193
self.add_whitespace(start)
194
self.tokens.append(token)
195
self.prev_row, self.prev_col = end
196
if tok_type in (NEWLINE, NL):
199
return "".join(self.tokens)
201
def compat(self, token, iterable):
204
toks_append = self.tokens.append
205
toknum, tokval = token
206
if toknum in (NAME, NUMBER):
208
if toknum in (NEWLINE, NL):
211
toknum, tokval = tok[:2]
213
if toknum in (NAME, NUMBER):
217
indents.append(tokval)
219
elif toknum == DEDENT:
222
elif toknum in (NEWLINE, NL):
224
elif startline and indents:
225
toks_append(indents[-1])
229
def untokenize(iterable):
230
"""Transform tokens back into Python source code.
232
Each element returned by the iterable must be a token sequence
233
with at least two elements, a token number and token value. If
234
only two tokens are passed, the resulting output is poor.
236
Round-trip invariant for full input:
237
Untokenized source will match input source exactly
239
Round-trip invariant for limited intput:
240
# Output text will tokenize the back to the input
241
t1 = [tok[:2] for tok in generate_tokens(f.readline)]
242
newcode = untokenize(t1)
243
readline = iter(newcode.splitlines(1)).next
244
t2 = [tok[:2] for tokin generate_tokens(readline)]
248
return ut.untokenize(iterable)
250
def generate_tokens(readline):
252
The generate_tokens() generator requires one argment, readline, which
253
must be a callable object which provides the same interface as the
254
readline() method of built-in file objects. Each call to the function
255
should return one line of input as a string. Alternately, readline
256
can be a callable function terminating with StopIteration:
257
readline = open(myfile).next # Example of alternate readline
259
The generator produces 5-tuples with these members: the token type; the
260
token string; a 2-tuple (srow, scol) of ints specifying the row and
261
column where the token begins in the source; a 2-tuple (erow, ecol) of
262
ints specifying the row and column where the token ends in the source;
263
and the line on which the token was found. The line passed is the
264
logical line; continuation lines are included.
266
lnum = parenlev = continued = 0
267
namechars, numchars = string.ascii_letters + '_', '0123456789'
268
contstr, needcont = '', 0
272
while 1: # loop over lines in stream
275
except StopIteration:
278
pos, max = 0, len(line)
280
if contstr: # continued string
282
raise TokenError, ("EOF in multi-line string", strstart)
283
endmatch = endprog.match(line)
285
pos = end = endmatch.end(0)
286
yield (STRING, contstr + line[:end],
287
strstart, (lnum, end), contline + line)
288
contstr, needcont = '', 0
290
elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
291
yield (ERRORTOKEN, contstr + line,
292
strstart, (lnum, len(line)), contline)
297
contstr = contstr + line
298
contline = contline + line
301
elif parenlev == 0 and not continued: # new statement
304
while pos < max: # measure leading whitespace
305
if line[pos] == ' ': column = column + 1
306
elif line[pos] == '\t': column = (column/tabsize + 1)*tabsize
307
elif line[pos] == '\f': column = 0
312
if line[pos] in '#\r\n': # skip comments or blank lines
314
comment_token = line[pos:].rstrip('\r\n')
315
nl_pos = pos + len(comment_token)
316
yield (COMMENT, comment_token,
317
(lnum, pos), (lnum, pos + len(comment_token)), line)
318
yield (NL, line[nl_pos:],
319
(lnum, nl_pos), (lnum, len(line)), line)
321
yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
322
(lnum, pos), (lnum, len(line)), line)
325
if column > indents[-1]: # count indents or dedents
326
indents.append(column)
327
yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
328
while column < indents[-1]:
329
if column not in indents:
330
raise IndentationError(
331
"unindent does not match any outer indentation level",
332
("<tokenize>", lnum, pos, line))
333
indents = indents[:-1]
334
yield (DEDENT, '', (lnum, pos), (lnum, pos), line)
336
else: # continued statement
338
raise TokenError, ("EOF in multi-line statement", (lnum, 0))
342
pseudomatch = pseudoprog.match(line, pos)
343
if pseudomatch: # scan for tokens
344
start, end = pseudomatch.span(1)
345
spos, epos, pos = (lnum, start), (lnum, end), end
346
token, initial = line[start:end], line[start]
348
if initial in numchars or \
349
(initial == '.' and token != '.'): # ordinary number
350
yield (NUMBER, token, spos, epos, line)
351
elif initial in '\r\n':
355
yield (newline, token, spos, epos, line)
357
assert not token.endswith("\n")
358
yield (COMMENT, token, spos, epos, line)
359
elif token in triple_quoted:
360
endprog = endprogs[token]
361
endmatch = endprog.match(line, pos)
362
if endmatch: # all on one line
363
pos = endmatch.end(0)
364
token = line[start:pos]
365
yield (STRING, token, spos, (lnum, pos), line)
367
strstart = (lnum, start) # multiple lines
368
contstr = line[start:]
371
elif initial in single_quoted or \
372
token[:2] in single_quoted or \
373
token[:3] in single_quoted:
374
if token[-1] == '\n': # continued string
375
strstart = (lnum, start)
376
endprog = (endprogs[initial] or endprogs[token[1]] or
378
contstr, needcont = line[start:], 1
381
else: # ordinary string
382
yield (STRING, token, spos, epos, line)
383
elif initial in namechars: # ordinary name
384
yield (NAME, token, spos, epos, line)
385
elif initial == '\\': # continued stmt
386
# This yield is new; needed for better idempotency:
387
yield (NL, token, spos, (lnum, pos), line)
390
if initial in '([{': parenlev = parenlev + 1
391
elif initial in ')]}': parenlev = parenlev - 1
392
yield (OP, token, spos, epos, line)
394
yield (ERRORTOKEN, line[pos],
395
(lnum, pos), (lnum, pos+1), line)
398
for indent in indents[1:]: # pop remaining indent levels
399
yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
400
yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')
402
if __name__ == '__main__': # testing
404
if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
405
else: tokenize(sys.stdin.readline)