"""Tokenization help for Python programs.

tokenize(readline) is a generator that breaks a stream of bytes into
Python tokens.  It decodes the bytes according to PEP-0263 for
determining source file encoding.

It accepts a readline-like method which is called repeatedly to get the
next line of input (or b"" for EOF).  It generates 5-tuples with these
members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.  Additionally, all token lists start with an ENCODING token
which tells you which encoding was used to decode the bytes stream.
"""
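
# Illustrative usage sketch (not part of the original module): unpacking the
# 5-tuples described above; "example.py" is a placeholder filename.
#
#     import tokenize
#     with open("example.py", "rb") as f:
#         for tok_type, tok_string, start, end, line in tokenize.tokenize(f.readline):
#             print(tokenize.tok_name[tok_type], repr(tok_string), start, end)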

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
               'Skip Montanaro, Raymond Hettinger, Trent Nelson, '
               'Michael Foord')

import builtins
import collections
import re
import sys
from codecs import lookup, BOM_UTF8
from io import TextIOWrapper
from token import *

cookie_re = re.compile(r'^[ \t\f]*#.*coding[:=][ \t]*([-\w.]+)', re.ASCII)

import token
__all__ = token.__all__ + ["COMMENT", "tokenize", "detect_encoding",
                           "NL", "untokenize", "ENCODING", "TokenInfo"]
del token

COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'
ENCODING = N_TOKENS + 2
tok_name[ENCODING] = 'ENCODING'
N_TOKENS += 3

EXACT_TOKEN_TYPES = {
    '^=': CIRCUMFLEXEQUAL,
    '<<=': LEFTSHIFTEQUAL,
    '>>=': RIGHTSHIFTEQUAL,
    '**=': DOUBLESTAREQUAL,
    '//=': DOUBLESLASHEQUAL,
}

class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')):
    def __repr__(self):
        annotated_type = '%d (%s)' % (self.type, tok_name[self.type])
        return ('TokenInfo(type=%s, string=%r, start=%r, end=%r, line=%r)' %
                self._replace(type=annotated_type))

    @property
    def exact_type(self):
        if self.type == OP and self.string in EXACT_TOKEN_TYPES:
            return EXACT_TOKEN_TYPES[self.string]
        else:
            return self.type
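
# For example (illustrative): an OP token whose string is '**=' reports
# DOUBLESTAREQUAL from .exact_type, while .type remains OP.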

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
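
# For example (illustrative): group('a', 'b') yields '(a|b)', any('x') yields
# '(x)*', and maybe('x') yields '(x)?'.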

# Note: we use unicode matching for names ("\w") but ascii matching for
# number literals.
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'\w+'

Hexnumber = r'0[xX][0-9a-fA-F]+'
Binnumber = r'0[bB][01]+'
Octnumber = r'0[oO][0-7]+'
Decnumber = r'(?:0+|[1-9][0-9]*)'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?[0-9]+'
Pointfloat = group(r'[0-9]+\.[0-9]*', r'\.[0-9]+') + maybe(Exponent)
Expfloat = r'[0-9]+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'[0-9]+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

StringPrefix = r'(?:[bB][rR]?|[rR][bB]?|[uU])?'

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group(StringPrefix + "'''", StringPrefix + '"""')
# Single-line ' or " string.
String = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

def _compile(expr):
    return re.compile(expr, re.UNICODE)

endpats = {"'": Single, '"': Double,
           "'''": Single3, '"""': Double3,
           "r'''": Single3, 'r"""': Double3,
           "b'''": Single3, 'b"""': Double3,
           "R'''": Single3, 'R"""': Double3,
           "B'''": Single3, 'B"""': Double3,
           "br'''": Single3, 'br"""': Double3,
           "bR'''": Single3, 'bR"""': Double3,
           "Br'''": Single3, 'Br"""': Double3,
           "BR'''": Single3, 'BR"""': Double3,
           "rb'''": Single3, 'rb"""': Double3,
           "Rb'''": Single3, 'Rb"""': Double3,
           "rB'''": Single3, 'rB"""': Double3,
           "RB'''": Single3, 'RB"""': Double3,
           "u'''": Single3, 'u"""': Double3,
           "U'''": Single3, 'U"""': Double3,
           'r': None, 'R': None, 'b': None, 'B': None,
           'u': None, 'U': None}

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""',
          "rb'''", 'rb"""', "rB'''", 'rB"""',
          "Rb'''", 'Rb"""', "RB'''", 'RB"""',
          "u'''", 'u"""', "U'''", 'U"""',
          ):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "b'", 'b"', "B'", 'B"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"',
          "rb'", 'rb"', "rB'", 'rB"',
          "Rb'", 'Rb"', "RB'", 'RB"',
          "u'", 'u"', "U'", 'U"',
          ):
    single_quoted[t] = t

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass


class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0
        self.encoding = None

    def add_whitespace(self, start):
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            if tok_type == ENCODING:
                self.encoding = token
                continue
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token
        if toknum in (NAME, NUMBER):
            tokval += ' '
        if toknum in (NEWLINE, NL):
            startline = True
        prevstring = False
        for tok in iterable:
            toknum, tokval = tok[:2]
            if toknum == ENCODING:
                self.encoding = tokval
                continue
            if toknum in (NAME, NUMBER):
                tokval += ' '
            # Insert a space between two consecutive strings
            if toknum == STRING:
                if prevstring:
                    tokval = ' ' + tokval
                prevstring = True
            else:
                prevstring = False
            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)


def untokenize(iterable):
    """Transform tokens back into Python source code.

    It returns a bytes object, encoded using the ENCODING
    token, which is the first token sequence output by tokenize.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output bytes will tokenize back to the input
        t1 = [tok[:2] for tok in tokenize(f.readline)]
        newcode = untokenize(t1)
        readline = BytesIO(newcode).readline
        t2 = [tok[:2] for tok in tokenize(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    out = ut.untokenize(iterable)
    if ut.encoding is not None:
        out = out.encode(ut.encoding)
    return out
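
# Illustrative round-trip sketch (not from the original source); "example.py"
# is a placeholder filename:
#
#     with builtins.open("example.py", "rb") as f:
#         toks = list(tokenize(f.readline))
#     source_bytes = untokenize(toks)   # bytes, encoded per the ENCODING token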


def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc
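
# For example (illustrative): _get_normal_name("UTF_8") returns "utf-8",
# _get_normal_name("Latin-1") returns "iso-8859-1", and unrecognized names are
# returned unchanged.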


def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file.  It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read in.

    It detects the encoding from the presence of a UTF-8 BOM or an encoding
    cookie as specified in PEP-0263.  If both a BOM and a cookie are present,
    but disagree, a SyntaxError will be raised.  If the encoding cookie is an
    invalid charset, raise a SyntaxError.  Note that if a UTF-8 BOM is found,
    'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    try:
        filename = readline.__self__.name
    except AttributeError:
        filename = None
    bom_found = False
    encoding = None
    default = 'utf-8'

    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return b''

    def find_cookie(line):
        try:
            # Decode as UTF-8. Either the line is an encoding declaration,
            # in which case it should be pure ASCII, or it must be UTF-8
            # per default encoding.
            line_string = line.decode('utf-8')
        except UnicodeDecodeError:
            msg = "invalid or missing encoding declaration"
            if filename is not None:
                msg = '{} for {!r}'.format(msg, filename)
            raise SyntaxError(msg)

        match = cookie_re.match(line_string)
        if not match:
            return None
        encoding = _get_normal_name(match.group(1))
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            if filename is None:
                msg = "unknown encoding: " + encoding
            else:
                msg = "unknown encoding for {!r}: {}".format(filename,
                                                             encoding)
            raise SyntaxError(msg)

        if bom_found:
            if encoding != 'utf-8':
                # This behaviour mimics the Python interpreter
                if filename is None:
                    msg = 'encoding problem: utf-8'
                else:
                    msg = 'encoding problem for {!r}: utf-8'.format(filename)
                raise SyntaxError(msg)
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]
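
# Illustrative sketch (not from the original source): detecting the encoding of
# an in-memory source buffer.
#
#     from io import BytesIO
#     buf = BytesIO(b"# -*- coding: latin-1 -*-\nx = 1\n")
#     enc, lines = detect_encoding(buf.readline)   # enc == 'iso-8859-1'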


def open(filename):
    """Open a file in read only mode using the encoding detected by
    detect_encoding().
    """
    buffer = builtins.open(filename, 'rb')
    encoding, lines = detect_encoding(buffer.readline)
    buffer.seek(0)
    text = TextIOWrapper(buffer, encoding, line_buffering=True)
    text.mode = 'r'
    return text
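
# For example (illustrative): tokenize.open("setup.py") returns a text stream
# whose .encoding matches the file's declared source encoding; "setup.py" is a
# placeholder filename.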


def tokenize(readline):
    """
    The tokenize() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects.  Each call to the function
    should return one line of input as bytes.  Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile, 'rb').__next__  # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found.  The line passed is the
    logical line; continuation lines are included.

    The first token sequence will always be an ENCODING token
    which tells you which encoding was used to decode the bytes stream.
    """
    # This import is here to avoid problems when the itertools module is not
    # built yet and tokenize is imported.
    from itertools import chain, repeat
    encoding, consumed = detect_encoding(readline)
    rl_gen = iter(readline, b"")
    empty = repeat(b"")
    return _tokenize(chain(consumed, rl_gen, empty).__next__, encoding)
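
# Illustrative sketch (not from the original source): tokenizing an in-memory
# bytes buffer.
#
#     from io import BytesIO
#     for tok in tokenize(BytesIO(b"x = 1\n").readline):
#         print(tok)
#
# The first tuple produced is the ENCODING token ('utf-8' here), followed by
# NAME, OP, NUMBER, NEWLINE and ENDMARKER tokens.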


def _tokenize(readline, encoding):
    lnum = parenlev = continued = 0
    numchars = '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    if encoding is not None:
        if encoding == "utf-8-sig":
            # BOM will already have been stripped.
            encoding = "utf-8"
        yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
    while True:                                # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = b''

        if encoding is not None:
            line = line.decode(encoding)
        lnum += 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield TokenInfo(STRING, contstr + line[:end],
                                strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield TokenInfo(ERRORTOKEN, contstr + line,
                                strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ':
                    column += 1
                elif line[pos] == '\t':
                    column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f':
                    column = 0
                else:
                    break
                pos += 1
            if pos == max:
                break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield TokenInfo(COMMENT, comment_token,
                                    (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield TokenInfo(NL, line[nl_pos:],
                                    (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield TokenInfo((NL, COMMENT)[line[pos] == '#'], line[pos:],
                                    (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = _compile(PseudoToken).match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                if start == end:
                    continue
                token, initial = line[start:end], line[start]

                if (initial in numchars or                 # ordinary number
                    (initial == '.' and token != '.' and token != '...')):
                    yield TokenInfo(NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    yield TokenInfo(NL if parenlev > 0 else NEWLINE,
                                    token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield TokenInfo(COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = _compile(endpats[token])
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield TokenInfo(STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                        token[:2] in single_quoted or \
                        token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = _compile(endpats[initial] or
                                           endpats[token[1]] or
                                           endpats[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield TokenInfo(STRING, token, spos, epos, line)
                elif initial.isidentifier():               # ordinary name
                    yield TokenInfo(NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{':
                        parenlev += 1
                    elif initial in ')]}':
                        parenlev -= 1
                    yield TokenInfo(OP, token, spos, epos, line)
            else:
                yield TokenInfo(ERRORTOKEN, line[pos],
                                (lnum, pos), (lnum, pos + 1), line)
                pos += 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')


# An undocumented, backwards compatible, API for all the places in the standard
# library that expect to be able to use tokenize with strings
def generate_tokens(readline):
    return _tokenize(readline, None)
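
# Illustrative sketch (not from the original source): generate_tokens() accepts
# str (not bytes) input and emits no ENCODING token, e.g.:
#
#     import io
#     for tok in generate_tokens(io.StringIO("x = 1\n").readline):
#         print(tok)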


def main():
    import argparse

    # Helper error handling routines
    def perror(message):
        print(message, file=sys.stderr)

    def error(message, filename=None, location=None):
        if location:
            args = (filename,) + location + (message,)
            perror("%s:%d:%d: error: %s" % args)
        elif filename:
            perror("%s: error: %s" % (filename, message))
        else:
            perror("error: %s" % message)

    # Parse the arguments and options
    parser = argparse.ArgumentParser(prog='python -m tokenize')
    parser.add_argument(dest='filename', nargs='?',
                        metavar='filename.py',
                        help='the file to tokenize; defaults to stdin')
    parser.add_argument('-e', '--exact', dest='exact', action='store_true',
                        help='display token names using the exact type')
    args = parser.parse_args()
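
    # Example invocation (illustrative): "python -m tokenize -e hello.py"
    # prints one line per token giving its start-end range, token name (exact
    # operator names with -e) and string; "hello.py" is a placeholder filename.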

    try:
        # Tokenize the input
        if args.filename:
            filename = args.filename
            with builtins.open(filename, 'rb') as f:
                tokens = list(tokenize(f.readline))
        else:
            filename = "<stdin>"
            tokens = _tokenize(sys.stdin.readline, None)

        # Output the tokenization
        for token in tokens:
            token_type = token.type
            if args.exact:
                token_type = token.exact_type
            token_range = "%d,%d-%d,%d:" % (token.start + token.end)
            print("%-20s%-15s%-15r" %
                  (token_range, tok_name[token_type], token.string))
    except IndentationError as err:
        line, column = err.args[1][1:3]
        error(err.args[0], filename, (line, column))
    except TokenError as err:
        line, column = err.args[1]
        error(err.args[0], filename, (line, column))
    except SyntaxError as err:
        error(err, filename)
    except OSError as err:
        error(err)
    except KeyboardInterrupt:
        print("interrupted\n")
    except Exception as err:
        perror("unexpected error: %s" % err)
        raise


if __name__ == "__main__":
    main()