2
/* Tokenizer implementation */
5
#include "pgenheaders.h"
10
#include "tokenizer.h"
14
#include "unicodeobject.h"
15
#include "bytesobject.h"
16
#include "fileobject.h"
21
#define is_potential_identifier_start(c) (\
22
(c >= 'a' && c <= 'z')\
23
|| (c >= 'A' && c <= 'Z')\
27
#define is_potential_identifier_char(c) (\
28
(c >= 'a' && c <= 'z')\
29
|| (c >= 'A' && c <= 'Z')\
30
|| (c >= '0' && c <= '9')\
34
extern char *PyOS_Readline(FILE *, FILE *, const char *);
35
/* Return malloc'ed string including trailing \n;
36
empty malloc'ed string for EOF;
37
NULL if interrupted */
39
/* Don't ever change this -- it would break the portability of Python code */
43
static struct tok_state *tok_new(void);
44
static int tok_nextc(struct tok_state *tok);
45
static void tok_backup(struct tok_state *tok, int c);
50
const char *_PyParser_TokenNames[] = {
103
/* This table must match the #defines in token.h! */
110
/* Create and initialize a new tok_state structure */
112
static struct tok_state *
115
struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
116
sizeof(struct tok_state));
119
tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
123
tok->tabsize = TABSIZE;
125
tok->indstack[0] = 0;
128
tok->prompt = tok->nextprompt = NULL;
134
tok->altindstack[0] = 0;
135
tok->decoding_state = STATE_INIT;
136
tok->decoding_erred = 0;
137
tok->read_coding_spec = 0;
139
tok->encoding = NULL;
142
tok->filename = NULL;
143
tok->decoding_readline = NULL;
144
tok->decoding_buffer = NULL;
150
new_string(const char *s, Py_ssize_t len, struct tok_state *tok)
152
char* result = (char *)PyMem_MALLOC(len + 1);
157
memcpy(result, s, len);
165
decoding_fgets(char *s, int size, struct tok_state *tok)
167
return fgets(s, size, tok->fp);
171
decoding_feof(struct tok_state *tok)
173
return feof(tok->fp);
177
decode_str(const char *str, int exec_input, struct tok_state *tok)
179
return new_string(str, strlen(str), tok);
185
error_ret(struct tok_state *tok) /* XXX */
187
tok->decoding_erred = 1;
188
if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
189
PyMem_FREE(tok->buf);
191
return NULL; /* as if it were EOF */
196
get_normal_name(char *s) /* for utf-8 and latin-1 */
200
for (i = 0; i < 12; i++) {
210
if (strcmp(buf, "utf-8") == 0 ||
211
strncmp(buf, "utf-8-", 6) == 0)
213
else if (strcmp(buf, "latin-1") == 0 ||
214
strcmp(buf, "iso-8859-1") == 0 ||
215
strcmp(buf, "iso-latin-1") == 0 ||
216
strncmp(buf, "latin-1-", 8) == 0 ||
217
strncmp(buf, "iso-8859-1-", 11) == 0 ||
218
strncmp(buf, "iso-latin-1-", 12) == 0)
224
/* Find the coding spec in S and pass it back through *SPEC, or NULL if none is found. */
227
get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *tok)
231
/* Coding spec must be in a comment, and that comment must be
232
* the only statement on the source code line. */
233
for (i = 0; i < size - 6; i++) {
236
if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
239
for (; i < size - 6; i++) { /* XXX inefficient search */
240
const char* t = s + i;
241
if (strncmp(t, "coding", 6) == 0) {
242
const char* begin = NULL;
244
if (t[0] != ':' && t[0] != '=')
248
} while (t[0] == '\x20' || t[0] == '\t');
251
while (Py_ISALNUM(t[0]) ||
252
t[0] == '-' || t[0] == '_' || t[0] == '.')
256
char* r = new_string(begin, t - begin, tok);
260
q = get_normal_name(r);
263
r = new_string(q, strlen(q), tok);
274
/* Check whether the line contains a coding spec. If it does,
275
invoke the set_readline function for the new encoding.
276
This function receives the tok_state and the new encoding.
277
Return 1 on success, 0 on failure. */
280
check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
281
int set_readline(struct tok_state *, const char *))
287
/* It's a continuation line, so it can't be a coding spec. */
289
if (!get_coding_spec(line, &cs, size, tok))
293
tok->read_coding_spec = 1;
294
if (tok->encoding == NULL) {
295
assert(tok->decoding_state == STATE_RAW);
296
if (strcmp(cs, "utf-8") == 0) {
299
r = set_readline(tok, cs);
302
tok->decoding_state = STATE_NORMAL;
305
PyErr_Format(PyExc_SyntaxError,
306
"encoding problem: %s", cs);
310
} else { /* then, compare cs with BOM */
311
r = (strcmp(tok->encoding, cs) == 0);
313
PyErr_Format(PyExc_SyntaxError,
314
"encoding problem: %s with BOM", cs);
320
/* See whether the file starts with a BOM. If it does,
321
invoke the set_readline function with the new encoding.
322
Return 1 on success, 0 on failure. */
325
check_bom(int get_char(struct tok_state *),
326
void unget_char(int, struct tok_state *),
327
int set_readline(struct tok_state *, const char *),
328
struct tok_state *tok)
332
tok->decoding_state = STATE_RAW;
335
} else if (ch1 == 0xEF) {
338
unget_char(ch2, tok);
339
unget_char(ch1, tok);
344
unget_char(ch3, tok);
345
unget_char(ch2, tok);
346
unget_char(ch1, tok);
350
/* Disable support for UTF-16 BOMs until a decision
351
is made whether this needs to be supported. */
352
} else if (ch1 == 0xFE) {
355
unget_char(ch2, tok);
356
unget_char(ch1, tok);
359
if (!set_readline(tok, "utf-16-be"))
361
tok->decoding_state = STATE_NORMAL;
362
} else if (ch1 == 0xFF) {
365
unget_char(ch2, tok);
366
unget_char(ch1, tok);
369
if (!set_readline(tok, "utf-16-le"))
371
tok->decoding_state = STATE_NORMAL;
374
unget_char(ch1, tok);
377
if (tok->encoding != NULL)
378
PyMem_FREE(tok->encoding);
379
tok->encoding = new_string("utf-8", 5, tok);
382
/* No need to set_readline: input is already utf-8 */
386
/* Read a line of text from TOK into S, using the stream in TOK.
387
Return NULL on failure, else S.
389
On entry, tok->decoding_buffer will be one of:
390
1) NULL: need to call tok->decoding_readline to get a new line
391
2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
392
stored the result in tok->decoding_buffer
393
3) PyByteArrayObject *: previous call to fp_readl did not have enough room
394
(in the s buffer) to copy entire contents of the line read
395
by tok->decoding_readline. tok->decoding_buffer has the overflow.
396
In this case, fp_readl is called in a loop (with an expanded buffer)
397
until the buffer ends with a '\n' (or until the end of the file is
398
reached): see tok_nextc and its calls to decoding_fgets.
402
fp_readl(char *s, int size, struct tok_state *tok)
408
/* Ask for one less byte so we can terminate it */
412
if (tok->decoding_buffer) {
413
bufobj = tok->decoding_buffer;
418
bufobj = PyObject_CallObject(tok->decoding_readline, NULL);
422
if (PyUnicode_CheckExact(bufobj))
424
buf = _PyUnicode_AsStringAndSize(bufobj, &buflen);
431
buf = PyByteArray_AsString(bufobj);
435
buflen = PyByteArray_GET_SIZE(bufobj);
438
Py_XDECREF(tok->decoding_buffer);
440
/* Too many chars, the rest goes into tok->decoding_buffer */
441
tok->decoding_buffer = PyByteArray_FromStringAndSize(buf+size,
443
if (tok->decoding_buffer == NULL)
448
tok->decoding_buffer = NULL;
450
memcpy(s, buf, buflen);
452
if (buflen == 0) /* EOF */
459
return error_ret(tok);
462
/* Set the readline function for TOK to a StreamReader's
463
readline function. The StreamReader is named ENC.
465
This function is called from check_bom and check_coding_spec.
467
ENC is usually identical to the future value of tok->encoding,
468
except for the (currently unsupported) case of UTF-16.
470
Return 1 on success, 0 on failure. */
473
fp_setreadl(struct tok_state *tok, const char* enc)
475
PyObject *readline = NULL, *stream = NULL, *io = NULL;
476
_Py_IDENTIFIER(open);
477
_Py_IDENTIFIER(readline);
480
io = PyImport_ImportModuleNoBlock("io");
484
fd = fileno(tok->fp);
485
if (lseek(fd, 0, SEEK_SET) == (off_t)-1) {
486
PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
490
stream = _PyObject_CallMethodId(io, &PyId_open, "isisOOO",
491
fd, "r", -1, enc, Py_None, Py_None, Py_False);
495
Py_XDECREF(tok->decoding_readline);
496
readline = _PyObject_GetAttrId(stream, &PyId_readline);
497
tok->decoding_readline = readline;
499
/* The file has been reopened; parsing will restart from
500
* the beginning of the file, we have to reset the line number.
501
* But this function has been called from inside tok_nextc() which
502
* will increment lineno before it returns. So we set it to -1 so that
503
* the next call to tok_nextc() will start with tok->lineno == 0.
510
return readline != NULL;
513
/* Fetch the next byte from TOK. */
515
static int fp_getc(struct tok_state *tok) {
516
return getc(tok->fp);
519
/* Unfetch the last byte back into TOK. */
521
static void fp_ungetc(int c, struct tok_state *tok) {
525
/* Check whether the bytes at s start a valid
526
UTF-8 sequence. Return the number of bytes forming
527
the sequence if yes, 0 if not. */
528
static int valid_utf8(const unsigned char* s)
533
/* single-byte code */
546
length = expected + 1;
547
for (; expected; expected--)
548
if (s[expected] < 0x80 || s[expected] >= 0xC0)
553
/* Read a line of input from TOK. Determine encoding
557
decoding_fgets(char *s, int size, struct tok_state *tok)
562
if (tok->decoding_state == STATE_NORMAL) {
563
/* We already have a codec associated with
565
line = fp_readl(s, size, tok);
567
} else if (tok->decoding_state == STATE_RAW) {
568
/* We want a 'raw' read. */
569
line = Py_UniversalNewlineFgets(s, size,
573
/* We have not yet determined the encoding.
574
If an encoding is found, use the file-pointer
575
reader functions from now on. */
576
if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
577
return error_ret(tok);
578
assert(tok->decoding_state != STATE_INIT);
581
if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
582
if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
583
return error_ret(tok);
587
/* The default encoding is UTF-8, so make sure we don't have any
588
non-UTF-8 sequences in it. */
589
if (line && !tok->encoding) {
592
for (c = (unsigned char *)line; *c; c += length)
593
if (!(length = valid_utf8(c))) {
599
/* Need to add 1 to the line number, since this line
600
has not been counted, yet. */
601
PyErr_Format(PyExc_SyntaxError,
602
"Non-UTF-8 code starting with '\\x%.2x' "
603
"in file %U on line %i, "
604
"but no encoding declared; "
605
"see http://python.org/dev/peps/pep-0263/ for details",
606
badchar, tok->filename, tok->lineno + 1);
607
return error_ret(tok);
614
decoding_feof(struct tok_state *tok)
616
if (tok->decoding_state != STATE_NORMAL) {
617
return feof(tok->fp);
619
PyObject* buf = tok->decoding_buffer;
621
buf = PyObject_CallObject(tok->decoding_readline, NULL);
626
tok->decoding_buffer = buf;
629
return PyObject_Length(buf) == 0;
633
/* Fetch a byte from TOK, using the string buffer. */
636
buf_getc(struct tok_state *tok) {
637
return Py_CHARMASK(*tok->str++);
640
/* Unfetch a byte from TOK, using the string buffer. */
643
buf_ungetc(int c, struct tok_state *tok) {
645
assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
648
/* Set the readline function for TOK to ENC. For the string-based
649
tokenizer, this means to just record the encoding. */
652
buf_setreadl(struct tok_state *tok, const char* enc) {
657
/* Return a UTF-8 encoded Python bytes object from the
658
C byte string STR, which is encoded with ENC. */
661
translate_into_utf8(const char* str, const char* enc) {
663
PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
666
utf8 = PyUnicode_AsUTF8String(buf);
673
translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
674
int skip_next_lf = 0;
675
size_t needed_length = strlen(s) + 2, final_length;
678
buf = PyMem_MALLOC(needed_length);
683
for (current = buf; *s; s++, current++) {
699
/* If this is exec input, add a newline to the end of the string if
700
there isn't one already. */
701
if (exec_input && c != '\n') {
706
final_length = current - buf + 1;
707
if (final_length < needed_length && final_length)
708
/* should never fail */
709
buf = PyMem_REALLOC(buf, final_length);
713
/* Decode a byte string STR for use as the buffer of TOK.
714
Look for encoding declarations inside STR, and record them
718
decode_str(const char *input, int single, struct tok_state *tok)
720
PyObject* utf8 = NULL;
723
const char *newl[2] = {NULL, NULL};
725
tok->input = str = translate_newlines(input, single, tok);
730
if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
731
return error_ret(tok);
732
str = tok->str; /* string after BOM if any */
734
if (tok->enc != NULL) {
735
utf8 = translate_into_utf8(str, tok->enc);
737
return error_ret(tok);
738
str = PyBytes_AsString(utf8);
740
for (s = str;; s++) {
741
if (*s == '\0') break;
742
else if (*s == '\n') {
746
if (lineno == 2) break;
750
/* need to check line 1 and 2 separately since check_coding_spec
751
assumes a single line as input */
753
if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
754
return error_ret(tok);
755
if (tok->enc == NULL && newl[1]) {
756
if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
758
return error_ret(tok);
761
if (tok->enc != NULL) {
762
assert(utf8 == NULL);
763
utf8 = translate_into_utf8(str, tok->enc);
765
return error_ret(tok);
766
str = PyBytes_AS_STRING(utf8);
768
assert(tok->decoding_buffer == NULL);
769
tok->decoding_buffer = utf8; /* CAUTION */
775
/* Set up tokenizer for string */
778
PyTokenizer_FromString(const char *str, int exec_input)
780
struct tok_state *tok = tok_new();
783
str = decode_str(str, exec_input, tok);
785
PyTokenizer_Free(tok);
789
/* XXX: constify members. */
790
tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
795
PyTokenizer_FromUTF8(const char *str, int exec_input)
797
struct tok_state *tok = tok_new();
801
tok->input = str = translate_newlines(str, exec_input, tok);
804
PyTokenizer_Free(tok);
807
tok->decoding_state = STATE_RAW;
808
tok->read_coding_spec = 1;
811
tok->encoding = (char *)PyMem_MALLOC(6);
812
if (!tok->encoding) {
813
PyTokenizer_Free(tok);
816
strcpy(tok->encoding, "utf-8");
818
/* XXX: constify members. */
819
tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
823
/* Set up tokenizer for file */
826
PyTokenizer_FromFile(FILE *fp, const char* enc,
827
const char *ps1, const char *ps2)
829
struct tok_state *tok = tok_new();
832
if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
833
PyTokenizer_Free(tok);
836
tok->cur = tok->inp = tok->buf;
837
tok->end = tok->buf + BUFSIZ;
840
tok->nextprompt = ps2;
842
/* Must copy encoding declaration since it
843
gets copied into the parse tree. */
844
tok->encoding = PyMem_MALLOC(strlen(enc)+1);
845
if (!tok->encoding) {
846
PyTokenizer_Free(tok);
849
strcpy(tok->encoding, enc);
850
tok->decoding_state = STATE_NORMAL;
856
/* Free a tok_state structure */
859
PyTokenizer_Free(struct tok_state *tok)
861
if (tok->encoding != NULL)
862
PyMem_FREE(tok->encoding);
864
Py_XDECREF(tok->decoding_readline);
865
Py_XDECREF(tok->decoding_buffer);
866
Py_XDECREF(tok->filename);
868
if (tok->fp != NULL && tok->buf != NULL)
869
PyMem_FREE(tok->buf);
871
PyMem_FREE((char *)tok->input);
875
/* Get next char, updating state; error code goes into tok->done */
878
tok_nextc(struct tok_state *tok)
881
if (tok->cur != tok->inp) {
882
return Py_CHARMASK(*tok->cur++); /* Fast path */
884
if (tok->done != E_OK)
886
if (tok->fp == NULL) {
887
char *end = strchr(tok->inp, '\n');
891
end = strchr(tok->inp, '\0');
892
if (end == tok->inp) {
897
if (tok->start == NULL)
899
tok->line_start = tok->cur;
902
return Py_CHARMASK(*tok->cur++);
904
if (tok->prompt != NULL) {
905
char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
907
if (newtok != NULL) {
908
char *translated = translate_newlines(newtok, 0, tok);
910
if (translated == NULL)
914
if (tok->encoding && newtok && *newtok) {
915
/* Recode to UTF-8 */
918
PyObject *u = translate_into_utf8(newtok, tok->encoding);
921
tok->done = E_DECODE;
924
buflen = PyBytes_GET_SIZE(u);
925
buf = PyBytes_AS_STRING(u);
928
tok->done = E_DECODE;
931
newtok = PyMem_MALLOC(buflen+1);
936
if (tok->nextprompt != NULL)
937
tok->prompt = tok->nextprompt;
940
else if (*newtok == '\0') {
944
else if (tok->start != NULL) {
945
size_t start = tok->start - tok->buf;
946
size_t oldlen = tok->cur - tok->buf;
947
size_t newlen = oldlen + strlen(newtok);
948
char *buf = tok->buf;
949
buf = (char *)PyMem_REALLOC(buf, newlen+1);
952
PyMem_FREE(tok->buf);
959
tok->cur = tok->buf + oldlen;
960
tok->line_start = tok->cur;
961
strcpy(tok->buf + oldlen, newtok);
963
tok->inp = tok->buf + newlen;
964
tok->end = tok->inp + 1;
965
tok->start = tok->buf + start;
969
if (tok->buf != NULL)
970
PyMem_FREE(tok->buf);
972
tok->line_start = tok->buf;
974
tok->line_start = tok->buf;
975
tok->inp = strchr(tok->buf, '\0');
976
tok->end = tok->inp + 1;
983
if (tok->start == NULL) {
984
if (tok->buf == NULL) {
986
PyMem_MALLOC(BUFSIZ);
987
if (tok->buf == NULL) {
991
tok->end = tok->buf + BUFSIZ;
993
if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
1000
tok->inp = strchr(tok->buf, '\0');
1001
done = tok->inp[-1] == '\n';
1005
cur = tok->cur - tok->buf;
1006
if (decoding_feof(tok)) {
1014
/* Read until '\n' or EOF */
1016
Py_ssize_t curstart = tok->start == NULL ? -1 :
1017
tok->start - tok->buf;
1018
Py_ssize_t curvalid = tok->inp - tok->buf;
1019
Py_ssize_t newsize = curvalid + BUFSIZ;
1020
char *newbuf = tok->buf;
1021
newbuf = (char *)PyMem_REALLOC(newbuf,
1023
if (newbuf == NULL) {
1024
tok->done = E_NOMEM;
1025
tok->cur = tok->inp;
1029
tok->inp = tok->buf + curvalid;
1030
tok->end = tok->buf + newsize;
1031
tok->start = curstart < 0 ? NULL :
1032
tok->buf + curstart;
1033
if (decoding_fgets(tok->inp,
1034
(int)(tok->end - tok->inp),
1036
/* Break out early on decoding
1037
errors, as tok->buf will be NULL
1039
if (tok->decoding_erred)
1041
/* Last line does not end in \n,
1043
strcpy(tok->inp, "\n");
1045
tok->inp = strchr(tok->inp, '\0');
1046
done = tok->inp[-1] == '\n';
1048
if (tok->buf != NULL) {
1049
tok->cur = tok->buf + cur;
1050
tok->line_start = tok->cur;
1051
/* replace "\r\n" with "\n" */
1052
/* For Mac leave the \r, giving a syntax error */
1054
if (pt >= tok->buf && *pt == '\r') {
1061
if (tok->done != E_OK) {
1062
if (tok->prompt != NULL)
1063
PySys_WriteStderr("\n");
1064
tok->cur = tok->inp;
1072
/* Back-up one character */
1075
tok_backup(struct tok_state *tok, int c)
1078
if (--tok->cur < tok->buf)
1079
Py_FatalError("tok_backup: beginning of buffer");
1086
/* Return the token corresponding to a single character */
1089
PyToken_OneChar(int c)
1092
case '(': return LPAR;
1093
case ')': return RPAR;
1094
case '[': return LSQB;
1095
case ']': return RSQB;
1096
case ':': return COLON;
1097
case ',': return COMMA;
1098
case ';': return SEMI;
1099
case '+': return PLUS;
1100
case '-': return MINUS;
1101
case '*': return STAR;
1102
case '/': return SLASH;
1103
case '|': return VBAR;
1104
case '&': return AMPER;
1105
case '<': return LESS;
1106
case '>': return GREATER;
1107
case '=': return EQUAL;
1108
case '.': return DOT;
1109
case '%': return PERCENT;
1110
case '{': return LBRACE;
1111
case '}': return RBRACE;
1112
case '^': return CIRCUMFLEX;
1113
case '~': return TILDE;
1114
case '@': return AT;
1121
PyToken_TwoChars(int c1, int c2)
1126
case '=': return EQEQUAL;
1131
case '=': return NOTEQUAL;
1136
case '>': return NOTEQUAL;
1137
case '=': return LESSEQUAL;
1138
case '<': return LEFTSHIFT;
1143
case '=': return GREATEREQUAL;
1144
case '>': return RIGHTSHIFT;
1149
case '=': return PLUSEQUAL;
1154
case '=': return MINEQUAL;
1155
case '>': return RARROW;
1160
case '*': return DOUBLESTAR;
1161
case '=': return STAREQUAL;
1166
case '/': return DOUBLESLASH;
1167
case '=': return SLASHEQUAL;
1172
case '=': return VBAREQUAL;
1177
case '=': return PERCENTEQUAL;
1182
case '=': return AMPEREQUAL;
1187
case '=': return CIRCUMFLEXEQUAL;
1195
PyToken_ThreeChars(int c1, int c2, int c3)
1203
return LEFTSHIFTEQUAL;
1213
return RIGHTSHIFTEQUAL;
1223
return DOUBLESTAREQUAL;
1233
return DOUBLESLASHEQUAL;
1253
indenterror(struct tok_state *tok)
1255
if (tok->alterror) {
1256
tok->done = E_TABSPACE;
1257
tok->cur = tok->inp;
1260
if (tok->altwarning) {
1262
PySys_WriteStderr("inconsistent use of tabs and spaces "
1263
"in indentation\n");
1265
PySys_FormatStderr("%U: inconsistent use of tabs and spaces "
1266
"in indentation\n", tok->filename);
1268
tok->altwarning = 0;
1274
#define verify_identifier(tok) 1
1276
/* Verify that the identifier follows PEP 3131.
1277
All identifier strings are guaranteed to be "ready" unicode objects.
1280
verify_identifier(struct tok_state *tok)
1284
s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
1285
if (s == NULL || PyUnicode_READY(s) == -1) {
1286
if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
1288
tok->done = E_IDENTIFIER;
1290
tok->done = E_ERROR;
1294
result = PyUnicode_IsIdentifier(s);
1297
tok->done = E_IDENTIFIER;
1302
/* Get next token, after space stripping etc. */
1305
tok_get(struct tok_state *tok, char **p_start, char **p_end)
1308
int blankline, nonascii;
1310
*p_start = *p_end = NULL;
1315
/* Get indentation level */
1324
else if (c == '\t') {
1325
col = (col/tok->tabsize + 1) * tok->tabsize;
1326
altcol = (altcol/tok->alttabsize + 1)
1329
else if (c == '\014') /* Control-L (formfeed) */
1330
col = altcol = 0; /* For Emacs users */
1335
if (c == '#' || c == '\n') {
1336
/* Lines with only whitespace and/or comments
1337
shouldn't affect the indentation and are
1338
not passed to the parser as NEWLINE tokens,
1339
except *totally* empty lines in interactive
1340
mode, which signal the end of a command group. */
1341
if (col == 0 && c == '\n' && tok->prompt != NULL)
1342
blankline = 0; /* Let it through */
1344
blankline = 1; /* Ignore completely */
1345
/* We can't jump back right here since we still
1346
may need to skip to the end of a comment */
1348
if (!blankline && tok->level == 0) {
1349
if (col == tok->indstack[tok->indent]) {
1351
if (altcol != tok->altindstack[tok->indent]) {
1352
if (indenterror(tok))
1356
else if (col > tok->indstack[tok->indent]) {
1357
/* Indent -- always one */
1358
if (tok->indent+1 >= MAXINDENT) {
1359
tok->done = E_TOODEEP;
1360
tok->cur = tok->inp;
1363
if (altcol <= tok->altindstack[tok->indent]) {
1364
if (indenterror(tok))
1368
tok->indstack[++tok->indent] = col;
1369
tok->altindstack[tok->indent] = altcol;
1371
else /* col < tok->indstack[tok->indent] */ {
1372
/* Dedent -- any number, must be consistent */
1373
while (tok->indent > 0 &&
1374
col < tok->indstack[tok->indent]) {
1378
if (col != tok->indstack[tok->indent]) {
1379
tok->done = E_DEDENT;
1380
tok->cur = tok->inp;
1383
if (altcol != tok->altindstack[tok->indent]) {
1384
if (indenterror(tok))
1391
tok->start = tok->cur;
1393
/* Return pending indents/dedents */
1394
if (tok->pendin != 0) {
1395
if (tok->pendin < 0) {
1410
} while (c == ' ' || c == '\t' || c == '\014');
1412
/* Set start of current token */
1413
tok->start = tok->cur - 1;
1417
while (c != EOF && c != '\n')
1420
/* Check for EOF and errors now */
1422
return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1425
/* Identifier (most frequent token!) */
1427
if (is_potential_identifier_start(c)) {
1428
/* Process b"", r"", u"", br"" and rb"" */
1429
int saw_b = 0, saw_r = 0, saw_u = 0;
1431
if (!(saw_b || saw_u) && (c == 'b' || c == 'B'))
1433
/* Since this is a backwards compatibility support literal we don't
1434
want to support it in arbitrary order like byte literals. */
1435
else if (!(saw_b || saw_u || saw_r) && (c == 'u' || c == 'U'))
1437
/* ur"" and ru"" are not supported */
1438
else if (!(saw_r || saw_u) && (c == 'r' || c == 'R'))
1443
if (c == '"' || c == '\'')
1446
while (is_potential_identifier_char(c)) {
1453
!verify_identifier(tok)) {
1454
tok->done = E_IDENTIFIER;
1457
*p_start = tok->start;
1465
if (blankline || tok->level > 0)
1467
*p_start = tok->start;
1468
*p_end = tok->cur - 1; /* Leave '\n' out of the string */
1473
/* Period or number starting with period? */
1478
} else if (c == '.') {
1481
*p_start = tok->start;
1487
tok_backup(tok, '.');
1491
*p_start = tok->start;
1499
/* Hex, octal or binary -- maybe. */
1503
if (c == 'j' || c == 'J')
1505
if (c == 'x' || c == 'X') {
1510
tok->done = E_TOKEN;
1516
} while (isxdigit(c));
1518
else if (c == 'o' || c == 'O') {
1521
if (c < '0' || c >= '8') {
1522
tok->done = E_TOKEN;
1528
} while ('0' <= c && c < '8');
1530
else if (c == 'b' || c == 'B') {
1533
if (c != '0' && c != '1') {
1534
tok->done = E_TOKEN;
1540
} while (c == '0' || c == '1');
1544
/* maybe old-style octal; c is first char of it */
1545
/* in any case, allow '0' as a literal */
1548
while (isdigit(c)) {
1554
else if (c == 'e' || c == 'E')
1556
else if (c == 'j' || c == 'J')
1559
tok->done = E_TOKEN;
1569
} while (isdigit(c));
1571
/* Accept floating point numbers. */
1577
} while (isdigit(c));
1579
if (c == 'e' || c == 'E') {
1583
if (c == '+' || c == '-')
1586
tok->done = E_TOKEN;
1592
} while (isdigit(c));
1594
if (c == 'j' || c == 'J')
1595
/* Imaginary part */
1601
*p_start = tok->start;
1608
if (c == '\'' || c == '"') {
1610
int quote_size = 1; /* 1 or 3 */
1611
int end_quote_size = 0;
1613
/* Find the quote size and start of string */
1620
end_quote_size = 1; /* empty string found */
1625
/* Get rest of string */
1626
while (end_quote_size != quote_size) {
1629
if (quote_size == 3)
1633
tok->cur = tok->inp;
1636
if (quote_size == 1 && c == '\n') {
1638
tok->cur = tok->inp;
1642
end_quote_size += 1;
1646
c = tok_nextc(tok); /* skip escaped char */
1650
*p_start = tok->start;
1655
/* Line continuation */
1659
tok->done = E_LINECONT;
1660
tok->cur = tok->inp;
1664
goto again; /* Read next line */
1667
/* Check for two-character token */
1669
int c2 = tok_nextc(tok);
1670
int token = PyToken_TwoChars(c, c2);
1672
int c3 = tok_nextc(tok);
1673
int token3 = PyToken_ThreeChars(c, c2, c3);
1677
tok_backup(tok, c3);
1679
*p_start = tok->start;
1683
tok_backup(tok, c2);
1686
/* Keep track of parentheses nesting level */
1700
/* Punctuation character */
1701
*p_start = tok->start;
1703
return PyToken_OneChar(c);
1707
PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1709
int result = tok_get(tok, p_start, p_end);
1710
if (tok->decoding_erred) {
1711
result = ERRORTOKEN;
1712
tok->done = E_DECODE;
1717
/* Get the encoding of a Python file. Check for the coding cookie and check if
1718
the file starts with a BOM.
1720
PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
1721
encoding in the first or second line of the file (in which case the encoding
1722
should be assumed to be UTF-8).
1724
The char* returned is malloc'ed via PyMem_MALLOC() and thus must be freed
1728
PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
1730
struct tok_state *tok;
1732
char *p_start =NULL , *p_end =NULL , *encoding = NULL;
1743
fp = fdopen(fd, "r");
1747
tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL);
1753
if (filename != NULL) {
1754
Py_INCREF(filename);
1755
tok->filename = filename;
1758
tok->filename = PyUnicode_FromString("<string>");
1759
if (tok->filename == NULL) {
1761
PyTokenizer_Free(tok);
1766
while (tok->lineno < 2 && tok->done == E_OK) {
1767
PyTokenizer_Get(tok, &p_start, &p_end);
1770
if (tok->encoding) {
1771
encoding = (char *)PyMem_MALLOC(strlen(tok->encoding) + 1);
1773
strcpy(encoding, tok->encoding);
1775
PyTokenizer_Free(tok);
1780
PyTokenizer_FindEncoding(int fd)
1782
return PyTokenizer_FindEncodingFilename(fd, NULL);
1788
tok_dump(int type, char *start, char *end)
1790
printf("%s", _PyParser_TokenNames[type]);
1791
if (type == NAME || type == NUMBER || type == STRING || type == OP)
1792
printf("(%.*s)", (int)(end - start), start);