2
/* Tokenizer implementation */
5
#include "pgenheaders.h"
10
#include "tokenizer.h"
14
#include "unicodeobject.h"
15
#include "bytesobject.h"
16
#include "fileobject.h"
21
#define is_potential_identifier_start(c) (\
22
(c >= 'a' && c <= 'z')\
23
|| (c >= 'A' && c <= 'Z')\
27
#define is_potential_identifier_char(c) (\
28
(c >= 'a' && c <= 'z')\
29
|| (c >= 'A' && c <= 'Z')\
30
|| (c >= '0' && c <= '9')\
34
extern char *PyOS_Readline(FILE *, FILE *, char *);
35
/* Return malloc'ed string including trailing \n;
36
empty malloc'ed string for EOF;
37
NULL if interrupted */
39
/* Don't ever change this -- it would break the portability of Python code */
43
static struct tok_state *tok_new(void);
44
static int tok_nextc(struct tok_state *tok);
45
static void tok_backup(struct tok_state *tok, int c);
50
char *_PyParser_TokenNames[] = {
103
/* This table must match the #defines in token.h! */
110
/* Create and initialize a new tok_state structure */
/* The structure is obtained from PyMem_MALLOC; the assignments visible
   below reset the buffer cursors, prompts, filename, indentation stacks
   and encoding/decoding fields to their empty defaults. */
112
static struct tok_state *
115
struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
116
sizeof(struct tok_state));
119
/* No input buffer yet: all buffer cursors start out NULL. */
tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
122
tok->tabsize = TABSIZE;
124
/* The indentation stack starts with a single entry at column 0. */
tok->indstack[0] = 0;
127
tok->prompt = tok->nextprompt = NULL;
130
tok->filename = NULL;
134
/* Same starting entry for the alternate-tab-size indentation stack. */
tok->altindstack[0] = 0;
135
/* Encoding detection has not run yet. */
tok->decoding_state = STATE_INIT;
136
tok->decoding_erred = 0;
137
tok->read_coding_spec = 0;
139
tok->encoding = NULL;
142
tok->decoding_readline = NULL;
143
tok->decoding_buffer = NULL;
151
/* Read one line from tok->fp into S (at most SIZE bytes, as for fgets());
   this variant performs no decoding and returns whatever fgets() returns. */
decoding_fgets(char *s, int size, struct tok_state *tok)
153
return fgets(s, size, tok->fp);
157
/* Report end-of-file on tok->fp by delegating directly to feof(). */
decoding_feof(struct tok_state *tok)
159
return feof(tok->fp);
163
decode_str(const char *str, struct tok_state *tok)
171
/* Flag a decoding error on TOK and return NULL.
   Sets tok->decoding_erred and, when TOK reads from a file and owns a
   buffer, releases tok->buf.  Callers use the NULL result as their own
   return value. */
error_ret(struct tok_state *tok) /* XXX */
173
tok->decoding_erred = 1;
174
if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
175
PyMem_FREE(tok->buf);
177
return NULL; /* as if it were EOF */
181
/* Allocate LEN + 1 bytes with PyMem_MALLOC and copy the first LEN bytes
   of S into them.  The copy is skipped when the allocation fails.
   NOTE(review): the extra byte presumably holds a terminating NUL; the
   line that would write it is not visible in this excerpt. */
new_string(const char *s, Py_ssize_t len)
183
char* result = (char *)PyMem_MALLOC(len + 1);
184
if (result != NULL) {
185
memcpy(result, s, len);
192
/* Normalize an encoding name for the two special-cased encodings.
   Up to the first 12 characters of S are copied into a scratch buffer with
   '_' replaced by '-' and letters lower-cased; the result is then matched
   against the known spellings of UTF-8 and Latin-1.  A match returns the
   canonical literal "utf-8" or "iso-8859-1". */
get_normal_name(char *s) /* for utf-8 and latin-1 */
196
for (i = 0; i < 12; i++) {
198
if (c == '\0') break;
199
else if (c == '_') buf[i] = '-';
200
else buf[i] = tolower(c);
203
/* "utf-8" itself, or any name starting with "utf-8-". */
if (strcmp(buf, "utf-8") == 0 ||
204
strncmp(buf, "utf-8-", 6) == 0) return "utf-8";
205
/* All Latin-1 aliases (and their "-suffix" forms) map to "iso-8859-1". */
else if (strcmp(buf, "latin-1") == 0 ||
206
strcmp(buf, "iso-8859-1") == 0 ||
207
strcmp(buf, "iso-latin-1") == 0 ||
208
strncmp(buf, "latin-1-", 8) == 0 ||
209
strncmp(buf, "iso-8859-1-", 11) == 0 ||
210
strncmp(buf, "iso-latin-1-", 12) == 0) return "iso-8859-1";
214
/* Return the coding spec in S, or NULL if none is found. */
/* S holds SIZE bytes of one source line.  The search looks for the literal
   "coding" followed by ':' or '=', optional blanks/tabs, and then an
   encoding name made of alphanumerics, '-', '_' and '.'. */
217
get_coding_spec(const char *s, Py_ssize_t size)
220
/* Coding spec must be in a comment, and that comment must be
221
* the only statement on the source code line. */
222
/* The bound size - 6 leaves room for the 6 characters of "coding". */
for (i = 0; i < size - 6; i++) {
225
/* Space, tab and form feed are the only characters skipped here. */
if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
228
for (; i < size - 6; i++) { /* XXX inefficient search */
229
const char* t = s + i;
230
if (strncmp(t, "coding", 6) == 0) {
231
const char* begin = NULL;
233
/* The keyword must be followed by ':' or '='. */
if (t[0] != ':' && t[0] != '=')
237
} while (t[0] == '\x20' || t[0] == '\t');
240
/* Scan over the characters allowed in an encoding name. */
while (isalnum(Py_CHARMASK(t[0])) ||
241
t[0] == '-' || t[0] == '_' || t[0] == '.')
245
/* Copy the matched name [begin, t) and look up its normalized form. */
char* r = new_string(begin, t - begin);
246
char* q = get_normal_name(r);
249
/* NOTE(review): r is replaced by a copy of the normalized name; the
   condition guarding this line is not visible in this excerpt. */
r = new_string(q, strlen(q));
258
/* LINE / SIZE give the bytes of one source line; SET_READLINE is the
   callback used to switch TOK over to a newly declared encoding. */
/* Check whether the line contains a coding spec. If it does,
259
invoke the set_readline function for the new encoding.
260
This function receives the tok_state and the new encoding.
261
Return 1 on success, 0 on failure. */
264
check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
265
int set_readline(struct tok_state *, const char *))
271
/* It's a continuation line, so it can't be a coding spec. */
273
cs = get_coding_spec(line, size);
275
/* Remember that a coding spec has been processed. */
tok->read_coding_spec = 1;
276
/* No encoding recorded yet, i.e. none was derived from a BOM. */
if (tok->encoding == NULL) {
277
assert(tok->decoding_state == STATE_RAW);
278
if (strcmp(cs, "utf-8") == 0) {
281
/* Hand the declared encoding to the caller-supplied callback. */
r = set_readline(tok, cs);
284
tok->decoding_state = STATE_NORMAL;
289
} else { /* then, compare cs with BOM */
290
r = (strcmp(tok->encoding, cs) == 0);
298
/* Report the offending encoding name as a SyntaxError. */
PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
303
/* GET_CHAR and UNGET_CHAR read and push back single bytes of TOK's input;
   SET_READLINE installs the decoder for a detected encoding. */
/* See whether the file starts with a BOM. If it does,
304
invoke the set_readline function with the new encoding.
305
Return 1 on success, 0 on failure. */
308
check_bom(int get_char(struct tok_state *),
309
void unget_char(int, struct tok_state *),
310
int set_readline(struct tok_state *, const char *),
311
struct tok_state *tok)
313
/* Look at the first byte of the input. */
int ch = get_char(tok);
314
/* Until a BOM is recognized the input is treated as raw bytes. */
tok->decoding_state = STATE_RAW;
317
/* 0xEF is the first byte of the UTF-8 BOM (EF BB BF); on a mismatch the
   bytes already consumed are pushed back in reverse order. */
} else if (ch == 0xEF) {
321
unget_char(0xEF, tok);
322
/* any token beginning with '\xEF' is a bad token */
328
unget_char(0xBB, tok);
329
unget_char(0xEF, tok);
330
/* any token beginning with '\xEF' is a bad token */
334
/* Disable support for UTF-16 BOMs until a decision
335
is made whether this needs to be supported. */
336
/* FE FF selects "utf-16-be". */
} else if (ch == 0xFE) {
337
ch = get_char(tok); if (ch != 0xFF) goto NON_BOM;
338
if (!set_readline(tok, "utf-16-be")) return 0;
339
tok->decoding_state = STATE_NORMAL;
340
/* FF FE selects "utf-16-le". */
} else if (ch == 0xFF) {
341
ch = get_char(tok); if (ch != 0xFE) goto NON_BOM;
342
if (!set_readline(tok, "utf-16-le")) return 0;
343
tok->decoding_state = STATE_NORMAL;
349
/* Replace any previously recorded encoding name. */
if (tok->encoding != NULL)
350
PyMem_FREE(tok->encoding);
351
tok->encoding = new_string("utf-8", 5); /* resulting string is in utf-8 */
352
/* No need to set_readline: input is already utf-8 */
356
/* Read a line of text from TOK into S, using the stream in TOK.
357
Return NULL on failure, else S.
359
On entry, tok->decoding_buffer will be one of:
360
1) NULL: need to call tok->decoding_readline to get a new line
361
2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
362
stored the result in tok->decoding_buffer
363
3) PyByteArrayObject *: previous call to fp_readl did not have enough room
364
(in the s buffer) to copy entire contents of the line read
365
by tok->decoding_readline. tok->decoding_buffer has the overflow.
366
In this case, fp_readl is called in a loop (with an expanded buffer)
367
until the buffer ends with a '\n' (or until the end of the file is
368
reached): see tok_nextc and its calls to decoding_fgets.
372
fp_readl(char *s, int size, struct tok_state *tok)
378
/* Ask for one less byte so we can terminate it */
382
/* Cases 2 and 3 above: reuse the object already held in decoding_buffer. */
if (tok->decoding_buffer) {
383
bufobj = tok->decoding_buffer;
388
/* Case 1 above: ask the installed readline callable for a new line. */
bufobj = PyObject_CallObject(tok->decoding_readline, NULL);
392
/* A str object yields its bytes via _PyUnicode_AsStringAndSize ... */
if (PyUnicode_CheckExact(bufobj))
394
buf = _PyUnicode_AsStringAndSize(bufobj, &buflen);
401
/* ... while the bytearray accessors below fetch data and length separately. */
buf = PyByteArray_AsString(bufobj);
405
buflen = PyByteArray_GET_SIZE(bufobj);
408
Py_XDECREF(tok->decoding_buffer);
410
/* Too many chars, the rest goes into tok->decoding_buffer */
411
tok->decoding_buffer = PyByteArray_FromStringAndSize(buf+size,
413
if (tok->decoding_buffer == NULL)
418
tok->decoding_buffer = NULL;
420
/* Hand the bytes to the caller's buffer. */
memcpy(s, buf, buflen);
422
if (buflen == 0) /* EOF */
429
return error_ret(tok);
432
/* Set the readline function for TOK to a StreamReader's
433
readline function. The StreamReader is named ENC.
435
This function is called from check_bom and check_coding_spec.
437
ENC is usually identical to the future value of tok->encoding,
438
except for the (currently unsupported) case of UTF-16.
440
Return 1 on success, 0 on failure. */
443
fp_setreadl(struct tok_state *tok, const char* enc)
445
PyObject *readline = NULL, *stream = NULL, *io = NULL;
447
/* The stream is created through the Python-level io module. */
io = PyImport_ImportModuleNoBlock("io");
452
/* Variant 1: open by name -- io.open(tok->filename, "r", -1, enc). */
stream = PyObject_CallMethod(io, "open", "ssis",
453
tok->filename, "r", -1, enc);
455
/* Variant 2: open on the file descriptor of tok->fp.  The trailing
   arguments are None, None, False; NOTE(review): presumably io.open's
   errors, newline and closefd parameters -- confirm. */
stream = PyObject_CallMethod(io, "open", "isisOOO",
456
fileno(tok->fp), "r", -1, enc, Py_None, Py_None, Py_False);
460
/* Drop any previously installed readline before storing the new one. */
Py_XDECREF(tok->decoding_readline);
461
readline = PyObject_GetAttrString(stream, "readline");
462
tok->decoding_readline = readline;
464
/* The file has been reopened; parsing will restart from
465
* the beginning of the file, we have to reset the line number.
466
* But this function has been called from inside tok_nextc() which
467
* will increment lineno before it returns. So we set it -1 so that
468
* the next call to tok_nextc() will start with tok->lineno == 0.
475
return readline != NULL;
478
/* Fetch the next byte from TOK. */
480
static int fp_getc(struct tok_state *tok) {
481
return getc(tok->fp);
484
/* Unfetch the last byte back into TOK. */
486
static void fp_ungetc(int c, struct tok_state *tok) {
490
/* Check whether the characters at s start a valid
491
UTF-8 sequence. Return the number of characters forming
492
the sequence if yes, 0 if not. */
493
static int valid_utf8(const unsigned char* s)
498
/* single-byte code */
511
/* Total length: the lead byte plus EXPECTED trailing bytes. */
length = expected + 1;
512
/* Walk the trailing bytes from last to first; a byte outside
   0x80..0xBF is not a valid continuation byte. */
for (; expected; expected--)
513
if (s[expected] < 0x80 || s[expected] >= 0xC0)
518
/* Read a line of input from TOK. Determine encoding
522
decoding_fgets(char *s, int size, struct tok_state *tok)
527
if (tok->decoding_state == STATE_NORMAL) {
528
/* We already have a codec associated with
530
line = fp_readl(s, size, tok);
532
} else if (tok->decoding_state == STATE_RAW) {
533
/* We want a 'raw' read. */
534
line = Py_UniversalNewlineFgets(s, size,
538
/* We have not yet determined the encoding.
539
If an encoding is found, use the file-pointer
540
reader functions from now on. */
541
if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
542
return error_ret(tok);
543
assert(tok->decoding_state != STATE_INIT);
546
/* A coding spec is only looked for while tok->lineno < 2 and none has
   been processed yet. */
if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
547
if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
548
return error_ret(tok);
552
/* The default encoding is UTF-8, so make sure we don't have any
553
non-UTF-8 sequences in it. */
554
if (line && !tok->encoding) {
557
/* Step through the line one sequence at a time: valid_utf8 yields the
   length of the sequence at C, or 0 when it is invalid. */
for (c = (unsigned char *)line; *c; c += length)
558
if (!(length = valid_utf8(c))) {
565
/* Need to add 1 to the line number, since this line
566
has not been counted, yet. */
568
"Non-UTF-8 code starting with '\\x%.2x' "
569
"in file %.200s on line %i, "
570
"but no encoding declared; "
571
"see http://python.org/dev/peps/pep-0263/ for details",
572
badchar, tok->filename, tok->lineno + 1);
573
/* The formatted message becomes the SyntaxError text. */
PyErr_SetString(PyExc_SyntaxError, buf);
574
return error_ret(tok);
581
/* Return non-zero when TOK's input is exhausted.
   While no codec is installed (state other than STATE_NORMAL) this is
   plain feof() on tok->fp.  Otherwise a line is read ahead through
   tok->decoding_readline, kept in tok->decoding_buffer, and end of input
   is reported when that line is empty. */
decoding_feof(struct tok_state *tok)
583
if (tok->decoding_state != STATE_NORMAL) {
584
return feof(tok->fp);
586
PyObject* buf = tok->decoding_buffer;
588
buf = PyObject_CallObject(tok->decoding_readline, NULL);
593
/* Keep the line that was read ahead in tok->decoding_buffer. */
tok->decoding_buffer = buf;
596
return PyObject_Length(buf) == 0;
600
/* Fetch a byte from TOK, using the string buffer; tok->str is advanced past it. */
603
buf_getc(struct tok_state *tok) {
604
return Py_CHARMASK(*tok->str++);
607
/* Unfetch a byte from TOK, using the string buffer. */
610
buf_ungetc(int c, struct tok_state *tok) {
612
/* The assert checks that the byte under tok->str already equals C, so
   nothing has to be stored into the (possibly read-only) buffer. */
assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
615
/* Set the readline function for TOK to ENC. For the string-based
616
tokenizer, this means to just record the encoding. */
619
buf_setreadl(struct tok_state *tok, const char* enc) {
624
/* Return a UTF-8 encoding Python string object from the
625
C byte string STR, which is encoded with ENC. */
628
translate_into_utf8(const char* str, const char* enc) {
630
/* Step 1: decode the NUL-terminated STR from ENC into a unicode object. */
PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
633
/* Step 2: re-encode that unicode object as UTF-8 bytes. */
utf8 = PyUnicode_AsUTF8String(buf);
638
/* Decode a byte string STR for use as the buffer of TOK.
639
Look for encoding declarations inside STR, and record them
643
decode_str(const char *str, struct tok_state *tok)
645
PyObject* utf8 = NULL;
647
const char *newl[2] = {NULL, NULL};
651
if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
652
return error_ret(tok);
653
str = tok->str; /* string after BOM if any */
655
/* An encoding was already recorded by the BOM check: recode the whole
   string to UTF-8 before scanning it. */
if (tok->enc != NULL) {
656
utf8 = translate_into_utf8(str, tok->enc);
658
return error_ret(tok);
659
str = PyBytes_AsString(utf8);
661
/* Scan for the newlines ending the first two lines; the scan stops at
   the end of the string or once lineno reaches 2. */
for (s = str;; s++) {
662
if (*s == '\0') break;
663
else if (*s == '\n') {
667
if (lineno == 2) break;
671
/* need to check line 1 and 2 separately since check_coding_spec
672
assumes a single line as input */
674
if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
675
return error_ret(tok);
676
if (tok->enc == NULL && newl[1]) {
677
if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
679
return error_ret(tok);
682
/* A coding spec recorded an encoding: recode the string to UTF-8 now. */
if (tok->enc != NULL) {
683
assert(utf8 == NULL);
684
utf8 = translate_into_utf8(str, tok->enc);
686
PyErr_Format(PyExc_SyntaxError,
687
"unknown encoding: %s", tok->enc);
688
return error_ret(tok);
690
str = PyBytes_AS_STRING(utf8);
692
/* NOTE(review): the bytes object is parked in tok->decoding_buffer,
   presumably to keep the memory STR points into alive -- confirm. */
assert(tok->decoding_buffer == NULL);
693
tok->decoding_buffer = utf8; /* CAUTION */
699
/* Set up tokenizer for string */
/* STR is passed through decode_str (BOM and coding-spec handling) before
   it becomes the tokenizer's buffer. */
702
PyTokenizer_FromString(const char *str)
704
struct tok_state *tok = tok_new();
707
str = (char *)decode_str(str, tok);
709
PyTokenizer_Free(tok);
713
/* XXX: constify members. */
714
/* All cursors start at the beginning of the decoded string. */
tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
719
/* Set up tokenizer for a string whose encoding is recorded as "utf-8".
   Unlike PyTokenizer_FromString, STR is used directly without going
   through decode_str, and read_coding_spec is pre-set. */
PyTokenizer_FromUTF8(const char *str)
721
struct tok_state *tok = tok_new();
724
tok->decoding_state = STATE_RAW;
725
/* Pretend a coding spec was already processed. */
tok->read_coding_spec = 1;
728
/* 6 bytes: the five characters of "utf-8" plus the terminating NUL. */
tok->encoding = (char *)PyMem_MALLOC(6);
729
if (!tok->encoding) {
730
PyTokenizer_Free(tok);
733
strcpy(tok->encoding, "utf-8");
735
/* XXX: constify members. */
736
/* All cursors start at the beginning of the caller's string. */
tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
741
/* Set up tokenizer for file */
/* FP is the stream to read, ENC an encoding name to copy into the state,
   and PS2 becomes tok->nextprompt.  NOTE(review): the lines that store FP
   and PS1 are not visible in this excerpt. */
744
PyTokenizer_FromFile(FILE *fp, char* enc, char *ps1, char *ps2)
746
struct tok_state *tok = tok_new();
749
/* Start with a BUFSIZ-byte line buffer. */
if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
750
PyTokenizer_Free(tok);
753
/* The buffer is empty: cur and inp both sit at its start. */
tok->cur = tok->inp = tok->buf;
754
tok->end = tok->buf + BUFSIZ;
757
tok->nextprompt = ps2;
759
/* Must copy encoding declaration since it
760
gets copied into the parse tree. */
761
tok->encoding = PyMem_MALLOC(strlen(enc)+1);
762
if (!tok->encoding) {
763
PyTokenizer_Free(tok);
766
strcpy(tok->encoding, enc);
767
/* With an encoding supplied the state goes straight to STATE_NORMAL. */
tok->decoding_state = STATE_NORMAL;
773
/* Free a tok_state structure */
/* Releases the encoding string, the references to the readline callable
   and the decoding buffer, and (for file input) the line buffer. */
776
PyTokenizer_Free(struct tok_state *tok)
778
if (tok->encoding != NULL)
779
PyMem_FREE(tok->encoding);
781
Py_XDECREF(tok->decoding_readline);
782
Py_XDECREF(tok->decoding_buffer);
784
/* tok->buf is only released when tok->fp is set.  NOTE(review): for
   string input buf aliases the text being tokenized, so it is presumably
   not owned by the tok_state -- confirm. */
if (tok->fp != NULL && tok->buf != NULL)
785
PyMem_FREE(tok->buf);
789
/* Get next char, updating state; error code goes into tok->done */
/* Three input sources are visible below: an in-memory string
   (tok->fp == NULL), interactive input (tok->prompt != NULL, read with
   PyOS_Readline) and input read through decoding_fgets. */
792
tok_nextc(register struct tok_state *tok)
795
/* Bytes are still buffered between cur and inp: hand out the next one. */
if (tok->cur != tok->inp) {
796
return Py_CHARMASK(*tok->cur++); /* Fast path */
798
if (tok->done != E_OK)
800
/* String input: the text is already in memory, so the next line is
   delimited with strchr. */
if (tok->fp == NULL) {
801
char *end = strchr(tok->inp, '\n');
805
end = strchr(tok->inp, '\0');
806
if (end == tok->inp) {
811
if (tok->start == NULL)
813
tok->line_start = tok->cur;
816
return Py_CHARMASK(*tok->cur++);
818
/* Interactive input: a prompt is set, so the line comes from PyOS_Readline. */
if (tok->prompt != NULL) {
819
char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
821
if (tok->encoding && newtok && *newtok) {
822
/* Recode to UTF-8 */
825
PyObject *u = translate_into_utf8(newtok, tok->encoding);
828
tok->done = E_DECODE;
831
buflen = PyBytes_GET_SIZE(u);
832
buf = PyBytes_AS_STRING(u);
835
tok->done = E_DECODE;
838
newtok = PyMem_MALLOC(buflen+1);
843
/* After the first line, switch to the continuation prompt. */
if (tok->nextprompt != NULL)
844
tok->prompt = tok->nextprompt;
847
else if (*newtok == '\0') {
851
else if (tok->start != NULL) {
852
/* Positions are saved as offsets because the buffer may move when it
   is reallocated below. */
size_t start = tok->start - tok->buf;
853
size_t oldlen = tok->cur - tok->buf;
854
size_t newlen = oldlen + strlen(newtok);
855
char *buf = tok->buf;
856
buf = (char *)PyMem_REALLOC(buf, newlen+1);
859
PyMem_FREE(tok->buf);
866
tok->cur = tok->buf + oldlen;
867
tok->line_start = tok->cur;
868
/* Append the new line after the text that is being kept. */
strcpy(tok->buf + oldlen, newtok);
870
tok->inp = tok->buf + newlen;
871
tok->end = tok->inp + 1;
872
tok->start = tok->buf + start;
876
if (tok->buf != NULL)
877
PyMem_FREE(tok->buf);
879
tok->line_start = tok->buf;
881
tok->line_start = tok->buf;
882
tok->inp = strchr(tok->buf, '\0');
883
tok->end = tok->inp + 1;
890
if (tok->start == NULL) {
891
if (tok->buf == NULL) {
893
PyMem_MALLOC(BUFSIZ);
894
if (tok->buf == NULL) {
898
tok->end = tok->buf + BUFSIZ;
900
/* Read a line through the decoding layer into the whole buffer. */
if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
907
tok->inp = strchr(tok->buf, '\0');
908
/* The line is complete only if it ends in a newline. */
done = tok->inp[-1] == '\n';
912
cur = tok->cur - tok->buf;
913
if (decoding_feof(tok)) {
921
/* Read until '\n' or EOF */
923
/* Positions are saved as offsets; the buffer is grown by BUFSIZ beyond
   the valid data and may move. */
Py_ssize_t curstart = tok->start == NULL ? -1 :
924
tok->start - tok->buf;
925
Py_ssize_t curvalid = tok->inp - tok->buf;
926
Py_ssize_t newsize = curvalid + BUFSIZ;
927
char *newbuf = tok->buf;
928
newbuf = (char *)PyMem_REALLOC(newbuf,
930
if (newbuf == NULL) {
936
tok->inp = tok->buf + curvalid;
937
tok->end = tok->buf + newsize;
938
tok->start = curstart < 0 ? NULL :
940
if (decoding_fgets(tok->inp,
941
(int)(tok->end - tok->inp),
943
/* Break out early on decoding
944
errors, as tok->buf will be NULL
946
if (tok->decoding_erred)
948
/* Last line does not end in \n,
950
strcpy(tok->inp, "\n");
952
tok->inp = strchr(tok->inp, '\0');
953
done = tok->inp[-1] == '\n';
955
if (tok->buf != NULL) {
956
tok->cur = tok->buf + cur;
957
tok->line_start = tok->cur;
958
/* replace "\r\n" with "\n" */
959
/* For Mac leave the \r, giving a syntax error */
961
if (pt >= tok->buf && *pt == '\r') {
968
if (tok->done != E_OK) {
969
/* In interactive mode, write a newline to stderr when input ends or fails. */
if (tok->prompt != NULL)
970
PySys_WriteStderr("\n");
979
/* Back-up one character */
982
tok_backup(register struct tok_state *tok, register int c)
985
/* Step the read cursor back one byte; moving before the start of the
   buffer is treated as a fatal internal error. */
if (--tok->cur < tok->buf)
986
Py_FatalError("tok_backup: begin of buffer");
993
/* Return the token corresponding to a single character */
/* Each case below maps one punctuation character C to its token code.
   NOTE(review): the result for characters not listed is not visible in
   this excerpt. */
996
PyToken_OneChar(int c)
999
case '(': return LPAR;
1000
case ')': return RPAR;
1001
case '[': return LSQB;
1002
case ']': return RSQB;
1003
case ':': return COLON;
1004
case ',': return COMMA;
1005
case ';': return SEMI;
1006
case '+': return PLUS;
1007
case '-': return MINUS;
1008
case '*': return STAR;
1009
case '/': return SLASH;
1010
case '|': return VBAR;
1011
case '&': return AMPER;
1012
case '<': return LESS;
1013
case '>': return GREATER;
1014
case '=': return EQUAL;
1015
case '.': return DOT;
1016
case '%': return PERCENT;
1017
case '{': return LBRACE;
1018
case '}': return RBRACE;
1019
case '^': return CIRCUMFLEX;
1020
case '~': return TILDE;
1021
case '@': return AT;
1028
/* Return the token code for the two-character operator C1 C2
   (e.g. EQEQUAL, LEFTSHIFT, RARROW).
   NOTE(review): only the inner cases are visible in this excerpt; they
   presumably test C2 inside an outer selection on C1 -- confirm. */
PyToken_TwoChars(int c1, int c2)
1033
case '=': return EQEQUAL;
1038
case '=': return NOTEQUAL;
1043
case '=': return LESSEQUAL;
1044
case '<': return LEFTSHIFT;
1049
case '=': return GREATEREQUAL;
1050
case '>': return RIGHTSHIFT;
1055
case '=': return PLUSEQUAL;
1060
case '=': return MINEQUAL;
1061
case '>': return RARROW;
1066
case '*': return DOUBLESTAR;
1067
case '=': return STAREQUAL;
1072
case '/': return DOUBLESLASH;
1073
case '=': return SLASHEQUAL;
1078
case '=': return VBAREQUAL;
1083
case '=': return PERCENTEQUAL;
1088
case '=': return AMPEREQUAL;
1093
case '=': return CIRCUMFLEXEQUAL;
1101
/* Return the token code for the three-character operator C1 C2 C3.
   The visible results are the four augmented-assignment tokens below.
   NOTE(review): the tests selecting each result are not visible in this
   excerpt. */
PyToken_ThreeChars(int c1, int c2, int c3)
1109
return LEFTSHIFTEQUAL;
1119
return RIGHTSHIFTEQUAL;
1129
return DOUBLESTAREQUAL;
1139
return DOUBLESLASHEQUAL;
1159
/* React to inconsistent use of tabs and spaces in indentation.
   With tok->alterror set, the condition is recorded as E_TABSPACE and the
   read cursor is moved to the end of the buffered input.  With
   tok->altwarning set, a warning naming tok->filename is written to
   stderr and the flag is cleared so that it is printed only once. */
indenterror(struct tok_state *tok)
1161
if (tok->alterror) {
1162
tok->done = E_TABSPACE;
1163
tok->cur = tok->inp;
1166
if (tok->altwarning) {
1167
PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1168
"in indentation\n", tok->filename);
1169
tok->altwarning = 0;
1175
#define verify_identifier(s,e) 1
1177
/* Verify that the identifier follows PEP 3131. */
1179
verify_identifier(char *start, char *end)
1183
/* The bytes in [start, end) are decoded as UTF-8 ... */
s = PyUnicode_DecodeUTF8(start, end-start, NULL);
1188
/* ... and the resulting string is checked with PyUnicode_IsIdentifier. */
result = PyUnicode_IsIdentifier(s);
1194
/* Get next token, after space stripping etc. */
/* On return *P_START and *P_END delimit the token text inside TOK's
   buffer; both are reset to NULL on entry. */
1197
tok_get(register struct tok_state *tok, char **p_start, char **p_end)
1200
int blankline, nonascii;
1202
*p_start = *p_end = NULL;
1207
/* Get indentation level */
1209
register int col = 0;
1210
register int altcol = 0;
1216
/* A tab advances col to the next multiple of tabsize; altcol is tracked
   in parallel using alttabsize. */
else if (c == '\t') {
1217
col = (col/tok->tabsize + 1) * tok->tabsize;
1218
altcol = (altcol/tok->alttabsize + 1)
1221
else if (c == '\014') /* Control-L (formfeed) */
1222
col = altcol = 0; /* For Emacs users */
1227
if (c == '#' || c == '\n') {
1228
/* Lines with only whitespace and/or comments
1229
shouldn't affect the indentation and are
1230
not passed to the parser as NEWLINE tokens,
1231
except *totally* empty lines in interactive
1232
mode, which signal the end of a command group. */
1233
if (col == 0 && c == '\n' && tok->prompt != NULL)
1234
blankline = 0; /* Let it through */
1236
blankline = 1; /* Ignore completely */
1237
/* We can't jump back right here since we still
1238
may need to skip to the end of a comment */
1240
/* Indentation only matters outside brackets (level == 0) and on lines
   that are not blank. */
if (!blankline && tok->level == 0) {
1241
if (col == tok->indstack[tok->indent]) {
1243
/* Same column but a different alternate column: inconsistent tabs. */
if (altcol != tok->altindstack[tok->indent]) {
1244
if (indenterror(tok))
1248
else if (col > tok->indstack[tok->indent]) {
1249
/* Indent -- always one */
1250
if (tok->indent+1 >= MAXINDENT) {
1251
tok->done = E_TOODEEP;
1252
tok->cur = tok->inp;
1255
if (altcol <= tok->altindstack[tok->indent]) {
1256
if (indenterror(tok))
1260
/* Push the new indentation level on both stacks. */
tok->indstack[++tok->indent] = col;
1261
tok->altindstack[tok->indent] = altcol;
1263
else /* col < tok->indstack[tok->indent] */ {
1264
/* Dedent -- any number, must be consistent */
1265
while (tok->indent > 0 &&
1266
col < tok->indstack[tok->indent]) {
1270
/* The column must match an enclosing indentation level exactly. */
if (col != tok->indstack[tok->indent]) {
1271
tok->done = E_DEDENT;
1272
tok->cur = tok->inp;
1275
if (altcol != tok->altindstack[tok->indent]) {
1276
if (indenterror(tok))
1283
tok->start = tok->cur;
1285
/* Return pending indents/dedents */
1286
if (tok->pendin != 0) {
1287
if (tok->pendin < 0) {
1302
} while (c == ' ' || c == '\t' || c == '\014');
1304
/* Set start of current token */
1305
tok->start = tok->cur - 1;
1309
while (c != EOF && c != '\n')
1312
/* Check for EOF and errors now */
1314
return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1317
/* Identifier (most frequent token!) */
1319
if (is_potential_identifier_start(c)) {
1320
/* Process b"", r"" and br"" */
1321
if (c == 'b' || c == 'B') {
1323
if (c == '"' || c == '\'')
1326
if (c == 'r' || c == 'R') {
1328
if (c == '"' || c == '\'')
1331
while (is_potential_identifier_char(c)) {
1338
!verify_identifier(tok->start, tok->cur)) {
1339
tok->done = E_IDENTIFIER;
1342
*p_start = tok->start;
1350
if (blankline || tok->level > 0)
1352
*p_start = tok->start;
1353
*p_end = tok->cur - 1; /* Leave '\n' out of the string */
1358
/* Period or number starting with period? */
1363
} else if (c == '.') {
1366
*p_start = tok->start;
1372
tok_backup(tok, '.');
1376
*p_start = tok->start;
1384
/* Hex, octal or binary -- maybe. */
1388
#ifndef WITHOUT_COMPLEX
1389
if (c == 'j' || c == 'J')
1392
if (c == 'x' || c == 'X') {
1397
tok->done = E_TOKEN;
1403
} while (isxdigit(c));
1405
else if (c == 'o' || c == 'O') {
1408
/* The prefix must be followed by at least one octal digit. */
if (c < '0' || c >= '8') {
1409
tok->done = E_TOKEN;
1415
} while ('0' <= c && c < '8');
1417
else if (c == 'b' || c == 'B') {
1420
/* The prefix must be followed by at least one binary digit. */
if (c != '0' && c != '1') {
1421
tok->done = E_TOKEN;
1427
} while (c == '0' || c == '1');
1431
/* maybe old-style octal; c is first char of it */
1432
/* in any case, allow '0' as a literal */
1435
while (isdigit(c)) {
1441
else if (c == 'e' || c == 'E')
1443
#ifndef WITHOUT_COMPLEX
1444
else if (c == 'j' || c == 'J')
1448
tok->done = E_TOKEN;
1458
} while (isdigit(c));
1460
/* Accept floating point numbers. */
1466
} while (isdigit(c));
1468
if (c == 'e' || c == 'E') {
1472
if (c == '+' || c == '-')
1475
tok->done = E_TOKEN;
1481
} while (isdigit(c));
1483
#ifndef WITHOUT_COMPLEX
1484
if (c == 'j' || c == 'J')
1485
/* Imaginary part */
1492
*p_start = tok->start;
1499
if (c == '\'' || c == '"') {
1501
int quote_size = 1; /* 1 or 3 */
1502
int end_quote_size = 0;
1504
/* Find the quote size and start of string */
1511
end_quote_size = 1; /* empty string found */
1516
/* Get rest of string */
1517
/* Loop until the count of closing quotes seen equals quote_size. */
while (end_quote_size != quote_size) {
1520
if (quote_size == 3)
1524
tok->cur = tok->inp;
1527
/* A newline is tested for only in single-quoted strings. */
if (quote_size == 1 && c == '\n') {
1529
tok->cur = tok->inp;
1533
end_quote_size += 1;
1537
c = tok_nextc(tok); /* skip escaped char */
1541
*p_start = tok->start;
1546
/* Line continuation */
1550
tok->done = E_LINECONT;
1551
tok->cur = tok->inp;
1555
goto again; /* Read next line */
1558
/* Check for two-character token */
1560
int c2 = tok_nextc(tok);
1561
int token = PyToken_TwoChars(c, c2);
1563
/* Look one character further for a three-character token. */
int c3 = tok_nextc(tok);
1564
int token3 = PyToken_ThreeChars(c, c2, c3);
1568
tok_backup(tok, c3);
1570
*p_start = tok->start;
1574
tok_backup(tok, c2);
1577
/* Keep track of parentheses nesting level */
1591
/* Punctuation character */
1592
*p_start = tok->start;
1594
return PyToken_OneChar(c);
1598
/* Public entry point: fetch the next token through tok_get.
   If a decoding error has been flagged on TOK, the result is replaced by
   ERRORTOKEN and tok->done is set to E_DECODE. */
PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1600
int result = tok_get(tok, p_start, p_end);
1601
if (tok->decoding_erred) {
1602
result = ERRORTOKEN;
1603
tok->done = E_DECODE;
1608
/* Get -*- encoding -*- from a Python file.
1610
PyTokenizer_FindEncoding returns NULL when it can't find the encoding in
1611
the first or second line of the file (in which case the encoding
1612
should be assumed to be PyUnicode_GetDefaultEncoding()).
1614
The char * returned is malloc'ed via PyMem_MALLOC() and thus must be freed by the caller.
1618
PyTokenizer_FindEncoding(int fd)
1620
struct tok_state *tok;
1622
char *p_start =NULL , *p_end =NULL , *encoding = NULL;
1628
fp = fdopen(fd, "r");
1632
tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL);
1637
while (tok->lineno < 2 && tok->done == E_OK) {
1638
PyTokenizer_Get(tok, &p_start, &p_end);
1641
if (tok->encoding) {
1642
encoding = (char *)PyMem_MALLOC(strlen(tok->encoding) + 1);
1644
strcpy(encoding, tok->encoding);
1646
PyTokenizer_Free(tok);
1653
tok_dump(int type, char *start, char *end)
1655
printf("%s", _PyParser_TokenNames[type]);
1656
if (type == NAME || type == NUMBER || type == STRING || type == OP)
1657
printf("(%.*s)", (int)(end - start), start);