~ubuntu-branches/ubuntu/maverick/python3.1/maverick

« back to all changes in this revision

Viewing changes to Parser/tokenizer.c

  • Committer: Bazaar Package Importer
  • Author(s): Matthias Klose
  • Date: 2009-03-23 00:01:27 UTC
  • Revision ID: james.westby@ubuntu.com-20090323000127-5fstfxju4ufrhthq
Tags: upstream-3.1~a1+20090322
Import upstream version 3.1~a1+20090322

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
 
 
2
/* Tokenizer implementation */
 
3
 
 
4
#include "Python.h"
 
5
#include "pgenheaders.h"
 
6
 
 
7
#include <ctype.h>
 
8
#include <assert.h>
 
9
 
 
10
#include "tokenizer.h"
 
11
#include "errcode.h"
 
12
 
 
13
#ifndef PGEN
 
14
#include "unicodeobject.h"
 
15
#include "bytesobject.h"
 
16
#include "fileobject.h"
 
17
#include "codecs.h"
 
18
#include "abstract.h"
 
19
#endif /* PGEN */
 
20
 
 
21
/* Non-zero if C may begin an identifier: an ASCII letter, '_', or any
   value >= 128.  The argument is parenthesized so that expression
   arguments (e.g. a conditional) expand correctly; note that it is still
   evaluated more than once, so it must be free of side effects. */
#define is_potential_identifier_start(c) (\
                          ((c) >= 'a' && (c) <= 'z')\
                       || ((c) >= 'A' && (c) <= 'Z')\
                       || (c) == '_'\
                       || ((c) >= 128))
 
26
 
 
27
/* Non-zero if C may appear inside an identifier: an ASCII letter or
   digit, '_', or any value >= 128.  The argument is parenthesized so
   that expression arguments expand correctly; it is still evaluated
   more than once, so it must be free of side effects. */
#define is_potential_identifier_char(c) (\
                          ((c) >= 'a' && (c) <= 'z')\
                       || ((c) >= 'A' && (c) <= 'Z')\
                       || ((c) >= '0' && (c) <= '9')\
                       || (c) == '_'\
                       || ((c) >= 128))
 
33
 
 
34
extern char *PyOS_Readline(FILE *, FILE *, char *);
 
35
/* Return malloc'ed string including trailing \n;
 
36
   empty malloc'ed string for EOF;
 
37
   NULL if interrupted */
 
38
 
 
39
/* Don't ever change this -- it would break the portability of Python code */
 
40
#define TABSIZE 8
 
41
 
 
42
/* Forward */
 
43
static struct tok_state *tok_new(void);
 
44
static int tok_nextc(struct tok_state *tok);
 
45
static void tok_backup(struct tok_state *tok, int c);
 
46
 
 
47
 
 
48
/* Token names, indexed by token number.
   This table must match the #defines in token.h! */

char *_PyParser_TokenNames[] = {
        "ENDMARKER", "NAME", "NUMBER", "STRING",
        "NEWLINE", "INDENT", "DEDENT",
        "LPAR", "RPAR", "LSQB", "RSQB",
        "COLON", "COMMA", "SEMI",
        "PLUS", "MINUS", "STAR", "SLASH",
        "VBAR", "AMPER", "LESS", "GREATER",
        "EQUAL", "DOT", "PERCENT",
        "LBRACE", "RBRACE",
        "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",
        "TILDE", "CIRCUMFLEX",
        "LEFTSHIFT", "RIGHTSHIFT", "DOUBLESTAR",
        "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL",
        "PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL",
        "LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL",
        "DOUBLESLASH", "DOUBLESLASHEQUAL",
        "AT", "RARROW", "ELLIPSIS",
        "OP",
        "<ERRORTOKEN>",
        "<N_TOKENS>"
};
 
108
 
 
109
 
 
110
/* Create and initialize a new tok_state structure */
 
111
 
 
112
static struct tok_state *
 
113
tok_new(void)
 
114
{
 
115
        struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
 
116
                                                sizeof(struct tok_state));
 
117
        if (tok == NULL)
 
118
                return NULL;
 
119
        tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
 
120
        tok->done = E_OK;
 
121
        tok->fp = NULL;
 
122
        tok->tabsize = TABSIZE;
 
123
        tok->indent = 0;
 
124
        tok->indstack[0] = 0;
 
125
        tok->atbol = 1;
 
126
        tok->pendin = 0;
 
127
        tok->prompt = tok->nextprompt = NULL;
 
128
        tok->lineno = 0;
 
129
        tok->level = 0;
 
130
        tok->filename = NULL;
 
131
        tok->altwarning = 1;
 
132
        tok->alterror = 1;
 
133
        tok->alttabsize = 1;
 
134
        tok->altindstack[0] = 0;
 
135
        tok->decoding_state = STATE_INIT;
 
136
        tok->decoding_erred = 0;
 
137
        tok->read_coding_spec = 0;
 
138
        tok->enc = NULL;
 
139
        tok->encoding = NULL;
 
140
        tok->cont_line = 0;
 
141
#ifndef PGEN
 
142
        tok->decoding_readline = NULL;
 
143
        tok->decoding_buffer = NULL;
 
144
#endif
 
145
        return tok;
 
146
}
 
147
 
 
148
#ifdef PGEN
 
149
 
 
150
static char *
 
151
decoding_fgets(char *s, int size, struct tok_state *tok)
 
152
{
 
153
        return fgets(s, size, tok->fp);
 
154
}
 
155
 
 
156
static int
 
157
decoding_feof(struct tok_state *tok)
 
158
{
 
159
        return feof(tok->fp);
 
160
}
 
161
 
 
162
/* PGEN build: the input string is used as-is; TOK is not touched. */
static const char *
decode_str(const char *str, struct tok_state *tok)
{
        const char *unchanged = str;

        return unchanged;
}
 
167
 
 
168
#else /* PGEN */
 
169
 
 
170
static char *
 
171
error_ret(struct tok_state *tok) /* XXX */
 
172
{
 
173
        tok->decoding_erred = 1;
 
174
        if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
 
175
                PyMem_FREE(tok->buf);
 
176
        tok->buf = NULL;
 
177
        return NULL;            /* as if it were EOF */
 
178
}
 
179
 
 
180
static char *
 
181
new_string(const char *s, Py_ssize_t len)
 
182
{
 
183
        char* result = (char *)PyMem_MALLOC(len + 1);
 
184
        if (result != NULL) {
 
185
                memcpy(result, s, len);
 
186
                result[len] = '\0';
 
187
        }
 
188
        return result;
 
189
}
 
190
 
 
191
/* Map the common spellings of utf-8 and latin-1 onto a canonical name.

   The first 12 characters of S are lower-cased with '_' replaced by
   '-' and compared against the known spellings.  Returns the literal
   "utf-8" or "iso-8859-1" on a match; otherwise returns S itself, so
   callers can detect "no change" by pointer comparison. */
static char *
get_normal_name(char *s)        /* for utf-8 and latin-1 */
{
        char buf[13];
        int i;
        for (i = 0; i < 12; i++) {
                /* Go through unsigned char: passing a negative plain
                   char to tolower() is undefined behaviour. */
                unsigned char c = (unsigned char)s[i];
                if (c == '\0')
                        break;
                buf[i] = (c == '_') ? '-' : (char)tolower(c);
        }
        buf[i] = '\0';
        if (strcmp(buf, "utf-8") == 0 ||
            strncmp(buf, "utf-8-", 6) == 0)
                return "utf-8";
        if (strcmp(buf, "latin-1") == 0 ||
            strcmp(buf, "iso-8859-1") == 0 ||
            strcmp(buf, "iso-latin-1") == 0 ||
            strncmp(buf, "latin-1-", 8) == 0 ||
            strncmp(buf, "iso-8859-1-", 11) == 0 ||
            strncmp(buf, "iso-latin-1-", 12) == 0)
                return "iso-8859-1";
        return s;
}
 
213
 
 
214
/* Return the coding spec in S, or NULL if none is found.  */
 
215
 
 
216
static char *
 
217
get_coding_spec(const char *s, Py_ssize_t size)
 
218
{
 
219
        Py_ssize_t i;
 
220
        /* Coding spec must be in a comment, and that comment must be
 
221
         * the only statement on the source code line. */
 
222
        for (i = 0; i < size - 6; i++) {
 
223
                if (s[i] == '#')
 
224
                        break;
 
225
                if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
 
226
                        return NULL;
 
227
        }
 
228
        for (; i < size - 6; i++) { /* XXX inefficient search */
 
229
                const char* t = s + i;
 
230
                if (strncmp(t, "coding", 6) == 0) {
 
231
                        const char* begin = NULL;
 
232
                        t += 6;
 
233
                        if (t[0] != ':' && t[0] != '=')
 
234
                                continue;
 
235
                        do {
 
236
                                t++;
 
237
                        } while (t[0] == '\x20' || t[0] == '\t');
 
238
 
 
239
                        begin = t;
 
240
                        while (isalnum(Py_CHARMASK(t[0])) ||
 
241
                               t[0] == '-' || t[0] == '_' || t[0] == '.')
 
242
                                t++;
 
243
 
 
244
                        if (begin < t) {
 
245
                                char* r = new_string(begin, t - begin);
 
246
                                char* q = get_normal_name(r);
 
247
                                if (r != q) {
 
248
                                        PyMem_FREE(r);
 
249
                                        r = new_string(q, strlen(q));
 
250
                                }
 
251
                                return r;
 
252
                        }
 
253
                }
 
254
        }
 
255
        return NULL;
 
256
}
 
257
 
 
258
/* Check whether the line contains a coding spec. If it does,
 
259
   invoke the set_readline function for the new encoding.
 
260
   This function receives the tok_state and the new encoding.
 
261
   Return 1 on success, 0 on failure.  */
 
262
 
 
263
static int
 
264
check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
 
265
                  int set_readline(struct tok_state *, const char *))
 
266
{
 
267
        char * cs;
 
268
        int r = 1;
 
269
 
 
270
        if (tok->cont_line)
 
271
                /* It's a continuation line, so it can't be a coding spec. */
 
272
                return 1;
 
273
        cs = get_coding_spec(line, size);
 
274
        if (cs != NULL) {
 
275
                tok->read_coding_spec = 1;
 
276
                if (tok->encoding == NULL) {
 
277
                        assert(tok->decoding_state == STATE_RAW);
 
278
                        if (strcmp(cs, "utf-8") == 0) {
 
279
                                tok->encoding = cs;
 
280
                        } else {
 
281
                                r = set_readline(tok, cs);
 
282
                                if (r) {
 
283
                                        tok->encoding = cs;
 
284
                                        tok->decoding_state = STATE_NORMAL;
 
285
                                }
 
286
                                else
 
287
                                        PyMem_FREE(cs);
 
288
                        }
 
289
                } else {        /* then, compare cs with BOM */
 
290
                        r = (strcmp(tok->encoding, cs) == 0);
 
291
                        PyMem_FREE(cs);
 
292
                }
 
293
        }
 
294
        if (!r) {
 
295
                cs = tok->encoding;
 
296
                if (!cs)
 
297
                        cs = "with BOM";
 
298
                PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
 
299
        }
 
300
        return r;
 
301
}
 
302
 
 
303
/* See whether the file starts with a BOM. If it does,
 
304
   invoke the set_readline function with the new encoding.
 
305
   Return 1 on success, 0 on failure.  */
 
306
 
 
307
static int
 
308
check_bom(int get_char(struct tok_state *),
 
309
          void unget_char(int, struct tok_state *),
 
310
          int set_readline(struct tok_state *, const char *),
 
311
          struct tok_state *tok)
 
312
{
 
313
        int ch = get_char(tok);
 
314
        tok->decoding_state = STATE_RAW;
 
315
        if (ch == EOF) {
 
316
                return 1;
 
317
        } else if (ch == 0xEF) {
 
318
                ch = get_char(tok); 
 
319
                if (ch != 0xBB) {
 
320
                        unget_char(ch, tok);
 
321
                        unget_char(0xEF, tok);
 
322
                        /* any token beginning with '\xEF' is a bad token */
 
323
                        return 1;
 
324
                }
 
325
                ch = get_char(tok); 
 
326
                if (ch != 0xBF) {
 
327
                        unget_char(ch, tok);
 
328
                        unget_char(0xBB, tok);
 
329
                        unget_char(0xEF, tok);
 
330
                        /* any token beginning with '\xEF' is a bad token */
 
331
                        return 1;
 
332
                }
 
333
#if 0
 
334
        /* Disable support for UTF-16 BOMs until a decision
 
335
           is made whether this needs to be supported.  */
 
336
        } else if (ch == 0xFE) {
 
337
                ch = get_char(tok); if (ch != 0xFF) goto NON_BOM;
 
338
                if (!set_readline(tok, "utf-16-be")) return 0;
 
339
                tok->decoding_state = STATE_NORMAL;
 
340
        } else if (ch == 0xFF) {
 
341
                ch = get_char(tok); if (ch != 0xFE) goto NON_BOM;
 
342
                if (!set_readline(tok, "utf-16-le")) return 0;
 
343
                tok->decoding_state = STATE_NORMAL;
 
344
#endif
 
345
        } else {
 
346
                unget_char(ch, tok);
 
347
                return 1;
 
348
        }
 
349
        if (tok->encoding != NULL)
 
350
                PyMem_FREE(tok->encoding);
 
351
        tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
 
352
        /* No need to set_readline: input is already utf-8 */
 
353
        return 1;
 
354
}
 
355
 
 
356
/* Read a line of text from TOK into S, using the stream in TOK.
 
357
   Return NULL on failure, else S.
 
358
 
 
359
   On entry, tok->decoding_buffer will be one of:
 
360
     1) NULL: need to call tok->decoding_readline to get a new line
 
361
     2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
 
362
           stored the result in tok->decoding_buffer
 
363
     3) PyByteArrayObject *: previous call to fp_readl did not have enough room
 
364
           (in the s buffer) to copy entire contents of the line read
 
365
           by tok->decoding_readline.  tok->decoding_buffer has the overflow.
 
366
           In this case, fp_readl is called in a loop (with an expanded buffer)
 
367
           until the buffer ends with a '\n' (or until the end of the file is
 
368
           reached): see tok_nextc and its calls to decoding_fgets.
 
369
*/
 
370
 
 
371
static char *
 
372
fp_readl(char *s, int size, struct tok_state *tok)
 
373
{
 
374
        PyObject* bufobj;
 
375
        const char *buf;
 
376
        Py_ssize_t buflen;
 
377
 
 
378
        /* Ask for one less byte so we can terminate it */
 
379
        assert(size > 0);
 
380
        size--;
 
381
 
 
382
        if (tok->decoding_buffer) {
 
383
                bufobj = tok->decoding_buffer;
 
384
                Py_INCREF(bufobj);
 
385
        }
 
386
        else
 
387
        {
 
388
                bufobj = PyObject_CallObject(tok->decoding_readline, NULL);
 
389
                if (bufobj == NULL)
 
390
                        goto error;
 
391
        }
 
392
        if (PyUnicode_CheckExact(bufobj))
 
393
        {
 
394
                buf = _PyUnicode_AsStringAndSize(bufobj, &buflen);
 
395
                if (buf == NULL) {
 
396
                        goto error;
 
397
                }
 
398
        }
 
399
        else
 
400
        {
 
401
                buf = PyByteArray_AsString(bufobj);
 
402
                if (buf == NULL) {
 
403
                        goto error;
 
404
                }
 
405
                buflen = PyByteArray_GET_SIZE(bufobj);
 
406
        }
 
407
 
 
408
        Py_XDECREF(tok->decoding_buffer);
 
409
        if (buflen > size) {
 
410
                /* Too many chars, the rest goes into tok->decoding_buffer */
 
411
                tok->decoding_buffer = PyByteArray_FromStringAndSize(buf+size,
 
412
                                                                 buflen-size);
 
413
                if (tok->decoding_buffer == NULL)
 
414
                        goto error;
 
415
                buflen = size;
 
416
        }
 
417
        else
 
418
                tok->decoding_buffer = NULL;
 
419
 
 
420
        memcpy(s, buf, buflen);
 
421
        s[buflen] = '\0';
 
422
        if (buflen == 0) /* EOF */
 
423
                s = NULL;
 
424
        Py_DECREF(bufobj);
 
425
        return s;
 
426
 
 
427
error:
 
428
        Py_XDECREF(bufobj);
 
429
        return error_ret(tok);
 
430
}
 
431
 
 
432
/* Set the readline function for TOK to a StreamReader's
 
433
   readline function. The StreamReader is named ENC.
 
434
 
 
435
   This function is called from check_bom and check_coding_spec.
 
436
 
 
437
   ENC is usually identical to the future value of tok->encoding,
 
438
   except for the (currently unsupported) case of UTF-16.
 
439
 
 
440
   Return 1 on success, 0 on failure. */
 
441
 
 
442
static int
 
443
fp_setreadl(struct tok_state *tok, const char* enc)
 
444
{
 
445
        PyObject *readline = NULL, *stream = NULL, *io = NULL;
 
446
 
 
447
        io = PyImport_ImportModuleNoBlock("io");
 
448
        if (io == NULL)
 
449
                goto cleanup;
 
450
 
 
451
        if (tok->filename)
 
452
                stream = PyObject_CallMethod(io, "open", "ssis",
 
453
                                             tok->filename, "r", -1, enc);
 
454
        else
 
455
                stream = PyObject_CallMethod(io, "open", "isisOOO",
 
456
                                fileno(tok->fp), "r", -1, enc, Py_None, Py_None, Py_False);
 
457
        if (stream == NULL)
 
458
                goto cleanup;
 
459
 
 
460
        Py_XDECREF(tok->decoding_readline);
 
461
        readline = PyObject_GetAttrString(stream, "readline");
 
462
        tok->decoding_readline = readline;
 
463
 
 
464
        /* The file has been reopened; parsing will restart from
 
465
         * the beginning of the file, we have to reset the line number.
 
466
         * But this function has been called from inside tok_nextc() which
 
467
         * will increment lineno before it returns. So we set it -1 so that
 
468
         * the next call to tok_nextc() will start with tok->lineno == 0.
 
469
         */
 
470
        tok->lineno = -1;
 
471
 
 
472
  cleanup:
 
473
        Py_XDECREF(stream);
 
474
        Py_XDECREF(io);
 
475
        return readline != NULL;
 
476
}
 
477
 
 
478
/* Fetch the next byte from TOK. */
 
479
 
 
480
static int fp_getc(struct tok_state *tok) {
 
481
        return getc(tok->fp);
 
482
}
 
483
 
 
484
/* Unfetch the last byte back into TOK.  */
 
485
 
 
486
static void fp_ungetc(int c, struct tok_state *tok) {
 
487
        ungetc(c, tok->fp);
 
488
}
 
489
 
 
490
/* Check whether the characters at s start a valid
   UTF-8 sequence. Return the number of characters forming
   the sequence if yes, 0 if not.

   S must be NUL-terminated.  Only the lead byte's range and the
   10xxxxxx form of each continuation byte are checked. */
static int valid_utf8(const unsigned char* s)
{
        int expected;
        int i;
        if (*s < 0x80)
                /* single-byte code */
                return 1;
        if (*s < 0xc0)
                /* following byte */
                return 0;
        if (*s < 0xE0)
                expected = 1;
        else if (*s < 0xF0)
                expected = 2;
        else if (*s < 0xF8)
                expected = 3;
        else
                return 0;
        /* Check continuation bytes front to back: a truncated sequence
           then stops at the terminating NUL (which is < 0x80) instead of
           reading bytes beyond the end of the string. */
        for (i = 1; i <= expected; i++)
                if (s[i] < 0x80 || s[i] >= 0xC0)
                        return 0;
        return expected + 1;
}
 
517
 
 
518
/* Read a line of input from TOK. Determine encoding
 
519
   if necessary.  */
 
520
 
 
521
static char *
 
522
decoding_fgets(char *s, int size, struct tok_state *tok)
 
523
{
 
524
        char *line = NULL;
 
525
        int badchar = 0;
 
526
        for (;;) {
 
527
                if (tok->decoding_state == STATE_NORMAL) {
 
528
                        /* We already have a codec associated with
 
529
                           this input. */
 
530
                        line = fp_readl(s, size, tok);
 
531
                        break;
 
532
                } else if (tok->decoding_state == STATE_RAW) {
 
533
                        /* We want a 'raw' read. */
 
534
                        line = Py_UniversalNewlineFgets(s, size,
 
535
                                                        tok->fp, NULL);
 
536
                        break;
 
537
                } else {
 
538
                        /* We have not yet determined the encoding.
 
539
                           If an encoding is found, use the file-pointer
 
540
                           reader functions from now on. */
 
541
                        if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
 
542
                                return error_ret(tok);
 
543
                        assert(tok->decoding_state != STATE_INIT);
 
544
                }
 
545
        }
 
546
        if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
 
547
                if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
 
548
                        return error_ret(tok);
 
549
                }
 
550
        }
 
551
#ifndef PGEN
 
552
        /* The default encoding is UTF-8, so make sure we don't have any
 
553
           non-UTF-8 sequences in it. */
 
554
        if (line && !tok->encoding) {
 
555
                unsigned char *c;
 
556
                int length;
 
557
                for (c = (unsigned char *)line; *c; c += length)
 
558
                        if (!(length = valid_utf8(c))) {
 
559
                                badchar = *c;
 
560
                                break;
 
561
                        }
 
562
        }
 
563
        if (badchar) {
 
564
                char buf[500];
 
565
                /* Need to add 1 to the line number, since this line
 
566
                   has not been counted, yet.  */
 
567
                sprintf(buf,
 
568
                        "Non-UTF-8 code starting with '\\x%.2x' "
 
569
                        "in file %.200s on line %i, "
 
570
                        "but no encoding declared; "
 
571
                        "see http://python.org/dev/peps/pep-0263/ for details",
 
572
                        badchar, tok->filename, tok->lineno + 1);
 
573
                PyErr_SetString(PyExc_SyntaxError, buf);
 
574
                return error_ret(tok);
 
575
        }
 
576
#endif
 
577
        return line;
 
578
}
 
579
 
 
580
static int
 
581
decoding_feof(struct tok_state *tok)
 
582
{
 
583
        if (tok->decoding_state != STATE_NORMAL) {
 
584
                return feof(tok->fp);
 
585
        } else {
 
586
                PyObject* buf = tok->decoding_buffer;
 
587
                if (buf == NULL) {
 
588
                        buf = PyObject_CallObject(tok->decoding_readline, NULL);
 
589
                        if (buf == NULL) {
 
590
                                error_ret(tok);
 
591
                                return 1;
 
592
                        } else {
 
593
                                tok->decoding_buffer = buf;
 
594
                        }
 
595
                }
 
596
                return PyObject_Length(buf) == 0;
 
597
        }
 
598
}
 
599
 
 
600
/* Fetch a byte from TOK, using the string buffer. */
 
601
 
 
602
static int
 
603
buf_getc(struct tok_state *tok) {
 
604
        return Py_CHARMASK(*tok->str++);
 
605
}
 
606
 
 
607
/* Unfetch a byte from TOK, using the string buffer. */
 
608
 
 
609
static void
 
610
buf_ungetc(int c, struct tok_state *tok) {
 
611
        tok->str--;
 
612
        assert(Py_CHARMASK(*tok->str) == c);    /* tok->cur may point to read-only segment */
 
613
}
 
614
 
 
615
/* Set the readline function for TOK to ENC. For the string-based
 
616
   tokenizer, this means to just record the encoding. */
 
617
 
 
618
static int
 
619
buf_setreadl(struct tok_state *tok, const char* enc) {
 
620
        tok->enc = enc;
 
621
        return 1;
 
622
}
 
623
 
 
624
/* Return a UTF-8 encoding Python string object from the
 
625
   C byte string STR, which is encoded with ENC. */
 
626
 
 
627
static PyObject *
 
628
translate_into_utf8(const char* str, const char* enc) {
 
629
        PyObject *utf8;
 
630
        PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
 
631
        if (buf == NULL)
 
632
                return NULL;
 
633
        utf8 = PyUnicode_AsUTF8String(buf);
 
634
        Py_DECREF(buf);
 
635
        return utf8;
 
636
}
 
637
 
 
638
/* Decode a byte string STR for use as the buffer of TOK.
 
639
   Look for encoding declarations inside STR, and record them
 
640
   inside TOK.  */
 
641
 
 
642
static const char *
 
643
decode_str(const char *str, struct tok_state *tok)
 
644
{
 
645
        PyObject* utf8 = NULL;
 
646
        const char *s;
 
647
        const char *newl[2] = {NULL, NULL};
 
648
        int lineno = 0;
 
649
        tok->enc = NULL;
 
650
        tok->str = str;
 
651
        if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
 
652
                return error_ret(tok);
 
653
        str = tok->str;         /* string after BOM if any */
 
654
        assert(str);
 
655
        if (tok->enc != NULL) {
 
656
                utf8 = translate_into_utf8(str, tok->enc);
 
657
                if (utf8 == NULL)
 
658
                        return error_ret(tok);
 
659
                str = PyBytes_AsString(utf8);
 
660
        }
 
661
        for (s = str;; s++) {
 
662
                if (*s == '\0') break;
 
663
                else if (*s == '\n') {
 
664
                        assert(lineno < 2);
 
665
                        newl[lineno] = s;
 
666
                        lineno++;
 
667
                        if (lineno == 2) break;
 
668
                }
 
669
        }
 
670
        tok->enc = NULL;
 
671
        /* need to check line 1 and 2 separately since check_coding_spec
 
672
           assumes a single line as input */
 
673
        if (newl[0]) {
 
674
                if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
 
675
                        return error_ret(tok);
 
676
                if (tok->enc == NULL && newl[1]) {
 
677
                        if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
 
678
                                               tok, buf_setreadl))
 
679
                                return error_ret(tok);
 
680
                }
 
681
        }
 
682
        if (tok->enc != NULL) {
 
683
                assert(utf8 == NULL);
 
684
                utf8 = translate_into_utf8(str, tok->enc);
 
685
                if (utf8 == NULL) {
 
686
                        PyErr_Format(PyExc_SyntaxError,
 
687
                                "unknown encoding: %s", tok->enc);
 
688
                        return error_ret(tok);
 
689
                }
 
690
                str = PyBytes_AS_STRING(utf8);
 
691
        }
 
692
        assert(tok->decoding_buffer == NULL);
 
693
        tok->decoding_buffer = utf8; /* CAUTION */
 
694
        return str;
 
695
}
 
696
 
 
697
#endif /* PGEN */
 
698
 
 
699
/* Set up tokenizer for string */
 
700
 
 
701
struct tok_state *
 
702
PyTokenizer_FromString(const char *str)
 
703
{
 
704
        struct tok_state *tok = tok_new();
 
705
        if (tok == NULL)
 
706
                return NULL;
 
707
        str = (char *)decode_str(str, tok);
 
708
        if (str == NULL) {
 
709
                PyTokenizer_Free(tok);
 
710
                return NULL;
 
711
        }
 
712
 
 
713
        /* XXX: constify members. */
 
714
        tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
 
715
        return tok;
 
716
}
 
717
 
 
718
struct tok_state *
 
719
PyTokenizer_FromUTF8(const char *str)
 
720
{
 
721
        struct tok_state *tok = tok_new();
 
722
        if (tok == NULL)
 
723
                return NULL;
 
724
        tok->decoding_state = STATE_RAW;
 
725
        tok->read_coding_spec = 1;
 
726
        tok->enc = NULL;
 
727
        tok->str = str;
 
728
        tok->encoding = (char *)PyMem_MALLOC(6);
 
729
        if (!tok->encoding) {
 
730
                PyTokenizer_Free(tok);
 
731
                return NULL;
 
732
        }
 
733
        strcpy(tok->encoding, "utf-8");
 
734
 
 
735
        /* XXX: constify members. */
 
736
        tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
 
737
        return tok;
 
738
}
 
739
 
 
740
 
 
741
/* Set up tokenizer for file */
 
742
 
 
743
struct tok_state *
 
744
PyTokenizer_FromFile(FILE *fp, char* enc, char *ps1, char *ps2)
 
745
{
 
746
        struct tok_state *tok = tok_new();
 
747
        if (tok == NULL)
 
748
                return NULL;
 
749
        if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
 
750
                PyTokenizer_Free(tok);
 
751
                return NULL;
 
752
        }
 
753
        tok->cur = tok->inp = tok->buf;
 
754
        tok->end = tok->buf + BUFSIZ;
 
755
        tok->fp = fp;
 
756
        tok->prompt = ps1;
 
757
        tok->nextprompt = ps2;
 
758
        if (enc != NULL) {
 
759
                /* Must copy encoding declaration since it
 
760
                   gets copied into the parse tree. */
 
761
                tok->encoding = PyMem_MALLOC(strlen(enc)+1);
 
762
                if (!tok->encoding) {
 
763
                        PyTokenizer_Free(tok);
 
764
                        return NULL;
 
765
                }
 
766
                strcpy(tok->encoding, enc);
 
767
                tok->decoding_state = STATE_NORMAL;
 
768
        }
 
769
        return tok;
 
770
}
 
771
 
 
772
 
 
773
/* Free a tok_state structure */
 
774
 
 
775
void
 
776
PyTokenizer_Free(struct tok_state *tok)
 
777
{
 
778
        if (tok->encoding != NULL)
 
779
                PyMem_FREE(tok->encoding);
 
780
#ifndef PGEN
 
781
        Py_XDECREF(tok->decoding_readline);
 
782
        Py_XDECREF(tok->decoding_buffer);
 
783
#endif
 
784
        if (tok->fp != NULL && tok->buf != NULL)
 
785
                PyMem_FREE(tok->buf);
 
786
        PyMem_FREE(tok);
 
787
}
 
788
 
 
789
/* Get next char, updating state; error code goes into tok->done */
 
790
 
 
791
static int
 
792
tok_nextc(register struct tok_state *tok)
 
793
{
 
794
        for (;;) {
 
795
                if (tok->cur != tok->inp) {
 
796
                        return Py_CHARMASK(*tok->cur++); /* Fast path */
 
797
                }
 
798
                if (tok->done != E_OK)
 
799
                        return EOF;
 
800
                if (tok->fp == NULL) {
 
801
                        char *end = strchr(tok->inp, '\n');
 
802
                        if (end != NULL)
 
803
                                end++;
 
804
                        else {
 
805
                                end = strchr(tok->inp, '\0');
 
806
                                if (end == tok->inp) {
 
807
                                        tok->done = E_EOF;
 
808
                                        return EOF;
 
809
                                }
 
810
                        }
 
811
                        if (tok->start == NULL)
 
812
                                tok->buf = tok->cur;
 
813
                        tok->line_start = tok->cur;
 
814
                        tok->lineno++;
 
815
                        tok->inp = end;
 
816
                        return Py_CHARMASK(*tok->cur++);
 
817
                }
 
818
                if (tok->prompt != NULL) {
 
819
                        char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
 
820
#ifndef PGEN
 
821
                        if (tok->encoding && newtok && *newtok) {
 
822
                                /* Recode to UTF-8 */
 
823
                                Py_ssize_t buflen;
 
824
                                const char* buf;
 
825
                                PyObject *u = translate_into_utf8(newtok, tok->encoding);
 
826
                                PyMem_FREE(newtok);
 
827
                                if (!u) {
 
828
                                        tok->done = E_DECODE;
 
829
                                        return EOF;
 
830
                                }
 
831
                                buflen = PyBytes_GET_SIZE(u);
 
832
                                buf = PyBytes_AS_STRING(u);
 
833
                                if (!buf) {
 
834
                                        Py_DECREF(u);
 
835
                                        tok->done = E_DECODE;
 
836
                                        return EOF;
 
837
                                }
 
838
                                newtok = PyMem_MALLOC(buflen+1);
 
839
                                strcpy(newtok, buf);
 
840
                                Py_DECREF(u);
 
841
                        }
 
842
#endif
 
843
                        if (tok->nextprompt != NULL)
 
844
                                tok->prompt = tok->nextprompt;
 
845
                        if (newtok == NULL)
 
846
                                tok->done = E_INTR;
 
847
                        else if (*newtok == '\0') {
 
848
                                PyMem_FREE(newtok);
 
849
                                tok->done = E_EOF;
 
850
                        }
 
851
                        else if (tok->start != NULL) {
 
852
                                size_t start = tok->start - tok->buf;
 
853
                                size_t oldlen = tok->cur - tok->buf;
 
854
                                size_t newlen = oldlen + strlen(newtok);
 
855
                                char *buf = tok->buf;
 
856
                                buf = (char *)PyMem_REALLOC(buf, newlen+1);
 
857
                                tok->lineno++;
 
858
                                if (buf == NULL) {
 
859
                                        PyMem_FREE(tok->buf);
 
860
                                        tok->buf = NULL;
 
861
                                        PyMem_FREE(newtok);
 
862
                                        tok->done = E_NOMEM;
 
863
                                        return EOF;
 
864
                                }
 
865
                                tok->buf = buf;
 
866
                                tok->cur = tok->buf + oldlen;
 
867
                                tok->line_start = tok->cur;
 
868
                                strcpy(tok->buf + oldlen, newtok);
 
869
                                PyMem_FREE(newtok);
 
870
                                tok->inp = tok->buf + newlen;
 
871
                                tok->end = tok->inp + 1;
 
872
                                tok->start = tok->buf + start;
 
873
                        }
 
874
                        else {
 
875
                                tok->lineno++;
 
876
                                if (tok->buf != NULL)
 
877
                                        PyMem_FREE(tok->buf);
 
878
                                tok->buf = newtok;
 
879
                                tok->line_start = tok->buf;
 
880
                                tok->cur = tok->buf;
 
881
                                tok->line_start = tok->buf;
 
882
                                tok->inp = strchr(tok->buf, '\0');
 
883
                                tok->end = tok->inp + 1;
 
884
                        }
 
885
                }
 
886
                else {
 
887
                        int done = 0;
 
888
                        Py_ssize_t cur = 0;
 
889
                        char *pt;
 
890
                        if (tok->start == NULL) {
 
891
                                if (tok->buf == NULL) {
 
892
                                        tok->buf = (char *)
 
893
                                                PyMem_MALLOC(BUFSIZ);
 
894
                                        if (tok->buf == NULL) {
 
895
                                                tok->done = E_NOMEM;
 
896
                                                return EOF;
 
897
                                        }
 
898
                                        tok->end = tok->buf + BUFSIZ;
 
899
                                }
 
900
                                if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
 
901
                                          tok) == NULL) {
 
902
                                        tok->done = E_EOF;
 
903
                                        done = 1;
 
904
                                }
 
905
                                else {
 
906
                                        tok->done = E_OK;
 
907
                                        tok->inp = strchr(tok->buf, '\0');
 
908
                                        done = tok->inp[-1] == '\n';
 
909
                                }
 
910
                        }
 
911
                        else {
 
912
                                cur = tok->cur - tok->buf;
 
913
                                if (decoding_feof(tok)) {
 
914
                                        tok->done = E_EOF;
 
915
                                        done = 1;
 
916
                                }
 
917
                                else
 
918
                                        tok->done = E_OK;
 
919
                        }
 
920
                        tok->lineno++;
 
921
                        /* Read until '\n' or EOF */
 
922
                        while (!done) {
 
923
                                Py_ssize_t curstart = tok->start == NULL ? -1 :
 
924
                                                  tok->start - tok->buf;
 
925
                                Py_ssize_t curvalid = tok->inp - tok->buf;
 
926
                                Py_ssize_t newsize = curvalid + BUFSIZ;
 
927
                                char *newbuf = tok->buf;
 
928
                                newbuf = (char *)PyMem_REALLOC(newbuf,
 
929
                                                               newsize);
 
930
                                if (newbuf == NULL) {
 
931
                                        tok->done = E_NOMEM;
 
932
                                        tok->cur = tok->inp;
 
933
                                        return EOF;
 
934
                                }
 
935
                                tok->buf = newbuf;
 
936
                                tok->inp = tok->buf + curvalid;
 
937
                                tok->end = tok->buf + newsize;
 
938
                                tok->start = curstart < 0 ? NULL :
 
939
                                             tok->buf + curstart;
 
940
                                if (decoding_fgets(tok->inp,
 
941
                                               (int)(tok->end - tok->inp),
 
942
                                               tok) == NULL) {
 
943
                                        /* Break out early on decoding
 
944
                                           errors, as tok->buf will be NULL
 
945
                                         */
 
946
                                        if (tok->decoding_erred)
 
947
                                                return EOF;
 
948
                                        /* Last line does not end in \n,
 
949
                                           fake one */
 
950
                                        strcpy(tok->inp, "\n");
 
951
                                }
 
952
                                tok->inp = strchr(tok->inp, '\0');
 
953
                                done = tok->inp[-1] == '\n';
 
954
                        }
 
955
                        if (tok->buf != NULL) {
 
956
                                tok->cur = tok->buf + cur;
 
957
                                tok->line_start = tok->cur;
 
958
                                /* replace "\r\n" with "\n" */
 
959
                                /* For Mac leave the \r, giving a syntax error */
 
960
                                pt = tok->inp - 2;
 
961
                                if (pt >= tok->buf && *pt == '\r') {
 
962
                                        *pt++ = '\n';
 
963
                                        *pt = '\0';
 
964
                                        tok->inp = pt;
 
965
                                }
 
966
                        }
 
967
                }
 
968
                if (tok->done != E_OK) {
 
969
                        if (tok->prompt != NULL)
 
970
                                PySys_WriteStderr("\n");
 
971
                        tok->cur = tok->inp;
 
972
                        return EOF;
 
973
                }
 
974
        }
 
975
        /*NOTREACHED*/
 
976
}
 
977
 
 
978
 
 
979
/* Back-up one character */
 
980
 
 
981
static void
 
982
tok_backup(register struct tok_state *tok, register int c)
 
983
{
 
984
        if (c != EOF) {
 
985
                if (--tok->cur < tok->buf)
 
986
                        Py_FatalError("tok_backup: begin of buffer");
 
987
                if (*tok->cur != c)
 
988
                        *tok->cur = c;
 
989
        }
 
990
}
 
991
 
 
992
 
 
993
/* Return the token corresponding to a single character */
 
994
 
 
995
int
 
996
PyToken_OneChar(int c)
 
997
{
 
998
        switch (c) {
 
999
        case '(':       return LPAR;
 
1000
        case ')':       return RPAR;
 
1001
        case '[':       return LSQB;
 
1002
        case ']':       return RSQB;
 
1003
        case ':':       return COLON;
 
1004
        case ',':       return COMMA;
 
1005
        case ';':       return SEMI;
 
1006
        case '+':       return PLUS;
 
1007
        case '-':       return MINUS;
 
1008
        case '*':       return STAR;
 
1009
        case '/':       return SLASH;
 
1010
        case '|':       return VBAR;
 
1011
        case '&':       return AMPER;
 
1012
        case '<':       return LESS;
 
1013
        case '>':       return GREATER;
 
1014
        case '=':       return EQUAL;
 
1015
        case '.':       return DOT;
 
1016
        case '%':       return PERCENT;
 
1017
        case '{':       return LBRACE;
 
1018
        case '}':       return RBRACE;
 
1019
        case '^':       return CIRCUMFLEX;
 
1020
        case '~':       return TILDE;
 
1021
        case '@':       return AT;
 
1022
        default:        return OP;
 
1023
        }
 
1024
}
 
1025
 
 
1026
 
 
1027
int
 
1028
PyToken_TwoChars(int c1, int c2)
 
1029
{
 
1030
        switch (c1) {
 
1031
        case '=':
 
1032
                switch (c2) {
 
1033
                case '=':       return EQEQUAL;
 
1034
                }
 
1035
                break;
 
1036
        case '!':
 
1037
                switch (c2) {
 
1038
                case '=':       return NOTEQUAL;
 
1039
                }
 
1040
                break;
 
1041
        case '<':
 
1042
                switch (c2) {
 
1043
                case '=':       return LESSEQUAL;
 
1044
                case '<':       return LEFTSHIFT;
 
1045
                }
 
1046
                break;
 
1047
        case '>':
 
1048
                switch (c2) {
 
1049
                case '=':       return GREATEREQUAL;
 
1050
                case '>':       return RIGHTSHIFT;
 
1051
                }
 
1052
                break;
 
1053
        case '+':
 
1054
                switch (c2) {
 
1055
                case '=':       return PLUSEQUAL;
 
1056
                }
 
1057
                break;
 
1058
        case '-':
 
1059
                switch (c2) {
 
1060
                case '=':       return MINEQUAL;
 
1061
                case '>':       return RARROW;
 
1062
                }
 
1063
                break;
 
1064
        case '*':
 
1065
                switch (c2) {
 
1066
                case '*':       return DOUBLESTAR;
 
1067
                case '=':       return STAREQUAL;
 
1068
                }
 
1069
                break;
 
1070
        case '/':
 
1071
                switch (c2) {
 
1072
                case '/':       return DOUBLESLASH;
 
1073
                case '=':       return SLASHEQUAL;
 
1074
                }
 
1075
                break;
 
1076
        case '|':
 
1077
                switch (c2) {
 
1078
                case '=':       return VBAREQUAL;
 
1079
                }
 
1080
                break;
 
1081
        case '%':
 
1082
                switch (c2) {
 
1083
                case '=':       return PERCENTEQUAL;
 
1084
                }
 
1085
                break;
 
1086
        case '&':
 
1087
                switch (c2) {
 
1088
                case '=':       return AMPEREQUAL;
 
1089
                }
 
1090
                break;
 
1091
        case '^':
 
1092
                switch (c2) {
 
1093
                case '=':       return CIRCUMFLEXEQUAL;
 
1094
                }
 
1095
                break;
 
1096
        }
 
1097
        return OP;
 
1098
}
 
1099
 
 
1100
int
 
1101
PyToken_ThreeChars(int c1, int c2, int c3)
 
1102
{
 
1103
        switch (c1) {
 
1104
        case '<':
 
1105
                switch (c2) {
 
1106
                case '<':
 
1107
                        switch (c3) {
 
1108
                        case '=':
 
1109
                                return LEFTSHIFTEQUAL;
 
1110
                        }
 
1111
                        break;
 
1112
                }
 
1113
                break;
 
1114
        case '>':
 
1115
                switch (c2) {
 
1116
                case '>':
 
1117
                        switch (c3) {
 
1118
                        case '=':
 
1119
                                return RIGHTSHIFTEQUAL;
 
1120
                        }
 
1121
                        break;
 
1122
                }
 
1123
                break;
 
1124
        case '*':
 
1125
                switch (c2) {
 
1126
                case '*':
 
1127
                        switch (c3) {
 
1128
                        case '=':
 
1129
                                return DOUBLESTAREQUAL;
 
1130
                        }
 
1131
                        break;
 
1132
                }
 
1133
                break;
 
1134
        case '/':
 
1135
                switch (c2) {
 
1136
                case '/':
 
1137
                        switch (c3) {
 
1138
                        case '=':
 
1139
                                return DOUBLESLASHEQUAL;
 
1140
                        }
 
1141
                        break;
 
1142
                }
 
1143
                break;
 
1144
        case '.':
 
1145
                switch (c2) {
 
1146
                case '.':
 
1147
                        switch (c3) {
 
1148
                        case '.':
 
1149
                                return ELLIPSIS;
 
1150
                        }
 
1151
                        break;
 
1152
                }
 
1153
                break;
 
1154
        }
 
1155
        return OP;
 
1156
}
 
1157
 
 
1158
static int
 
1159
indenterror(struct tok_state *tok)
 
1160
{
 
1161
        if (tok->alterror) {
 
1162
                tok->done = E_TABSPACE;
 
1163
                tok->cur = tok->inp;
 
1164
                return 1;
 
1165
        }
 
1166
        if (tok->altwarning) {
 
1167
                PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
 
1168
                                  "in indentation\n", tok->filename);
 
1169
                tok->altwarning = 0;
 
1170
        }
 
1171
        return 0;
 
1172
}
 
1173
 
 
1174
#ifdef PGEN
 
1175
#define verify_identifier(s,e) 1
 
1176
#else
 
1177
/* Verify that the identifier follows PEP 3131. */
 
1178
static int
 
1179
verify_identifier(char *start, char *end)
 
1180
{
 
1181
        PyObject *s;
 
1182
        int result;
 
1183
        s = PyUnicode_DecodeUTF8(start, end-start, NULL);
 
1184
        if (s == NULL) {
 
1185
                PyErr_Clear();
 
1186
                return 0;
 
1187
        }
 
1188
        result = PyUnicode_IsIdentifier(s);
 
1189
        Py_DECREF(s);
 
1190
        return result;
 
1191
}
 
1192
#endif
 
1193
 
 
1194
/* Get next token, after space stripping etc. */
 
1195
 
 
1196
static int
 
1197
tok_get(register struct tok_state *tok, char **p_start, char **p_end)
 
1198
{
 
1199
        register int c;
 
1200
        int blankline, nonascii;
 
1201
 
 
1202
        *p_start = *p_end = NULL;
 
1203
  nextline:
 
1204
        tok->start = NULL;
 
1205
        blankline = 0;
 
1206
 
 
1207
        /* Get indentation level */
 
1208
        if (tok->atbol) {
 
1209
                register int col = 0;
 
1210
                register int altcol = 0;
 
1211
                tok->atbol = 0;
 
1212
                for (;;) {
 
1213
                        c = tok_nextc(tok);
 
1214
                        if (c == ' ')
 
1215
                                col++, altcol++;
 
1216
                        else if (c == '\t') {
 
1217
                                col = (col/tok->tabsize + 1) * tok->tabsize;
 
1218
                                altcol = (altcol/tok->alttabsize + 1)
 
1219
                                        * tok->alttabsize;
 
1220
                        }
 
1221
                        else if (c == '\014') /* Control-L (formfeed) */
 
1222
                                col = altcol = 0; /* For Emacs users */
 
1223
                        else
 
1224
                                break;
 
1225
                }
 
1226
                tok_backup(tok, c);
 
1227
                if (c == '#' || c == '\n') {
 
1228
                        /* Lines with only whitespace and/or comments
 
1229
                           shouldn't affect the indentation and are
 
1230
                           not passed to the parser as NEWLINE tokens,
 
1231
                           except *totally* empty lines in interactive
 
1232
                           mode, which signal the end of a command group. */
 
1233
                        if (col == 0 && c == '\n' && tok->prompt != NULL)
 
1234
                                blankline = 0; /* Let it through */
 
1235
                        else
 
1236
                                blankline = 1; /* Ignore completely */
 
1237
                        /* We can't jump back right here since we still
 
1238
                           may need to skip to the end of a comment */
 
1239
                }
 
1240
                if (!blankline && tok->level == 0) {
 
1241
                        if (col == tok->indstack[tok->indent]) {
 
1242
                                /* No change */
 
1243
                                if (altcol != tok->altindstack[tok->indent]) {
 
1244
                                        if (indenterror(tok))
 
1245
                                                return ERRORTOKEN;
 
1246
                                }
 
1247
                        }
 
1248
                        else if (col > tok->indstack[tok->indent]) {
 
1249
                                /* Indent -- always one */
 
1250
                                if (tok->indent+1 >= MAXINDENT) {
 
1251
                                        tok->done = E_TOODEEP;
 
1252
                                        tok->cur = tok->inp;
 
1253
                                        return ERRORTOKEN;
 
1254
                                }
 
1255
                                if (altcol <= tok->altindstack[tok->indent]) {
 
1256
                                        if (indenterror(tok))
 
1257
                                                return ERRORTOKEN;
 
1258
                                }
 
1259
                                tok->pendin++;
 
1260
                                tok->indstack[++tok->indent] = col;
 
1261
                                tok->altindstack[tok->indent] = altcol;
 
1262
                        }
 
1263
                        else /* col < tok->indstack[tok->indent] */ {
 
1264
                                /* Dedent -- any number, must be consistent */
 
1265
                                while (tok->indent > 0 &&
 
1266
                                        col < tok->indstack[tok->indent]) {
 
1267
                                        tok->pendin--;
 
1268
                                        tok->indent--;
 
1269
                                }
 
1270
                                if (col != tok->indstack[tok->indent]) {
 
1271
                                        tok->done = E_DEDENT;
 
1272
                                        tok->cur = tok->inp;
 
1273
                                        return ERRORTOKEN;
 
1274
                                }
 
1275
                                if (altcol != tok->altindstack[tok->indent]) {
 
1276
                                        if (indenterror(tok))
 
1277
                                                return ERRORTOKEN;
 
1278
                                }
 
1279
                        }
 
1280
                }
 
1281
        }
 
1282
 
 
1283
        tok->start = tok->cur;
 
1284
 
 
1285
        /* Return pending indents/dedents */
 
1286
        if (tok->pendin != 0) {
 
1287
                if (tok->pendin < 0) {
 
1288
                        tok->pendin++;
 
1289
                        return DEDENT;
 
1290
                }
 
1291
                else {
 
1292
                        tok->pendin--;
 
1293
                        return INDENT;
 
1294
                }
 
1295
        }
 
1296
 
 
1297
 again:
 
1298
        tok->start = NULL;
 
1299
        /* Skip spaces */
 
1300
        do {
 
1301
                c = tok_nextc(tok);
 
1302
        } while (c == ' ' || c == '\t' || c == '\014');
 
1303
 
 
1304
        /* Set start of current token */
 
1305
        tok->start = tok->cur - 1;
 
1306
 
 
1307
        /* Skip comment */
 
1308
        if (c == '#')
 
1309
                while (c != EOF && c != '\n')
 
1310
                        c = tok_nextc(tok);
 
1311
 
 
1312
        /* Check for EOF and errors now */
 
1313
        if (c == EOF) {
 
1314
                return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
 
1315
        }
 
1316
 
 
1317
        /* Identifier (most frequent token!) */
 
1318
        nonascii = 0;
 
1319
        if (is_potential_identifier_start(c)) {
 
1320
                /* Process b"", r"" and br"" */
 
1321
                if (c == 'b' || c == 'B') {
 
1322
                        c = tok_nextc(tok);
 
1323
                        if (c == '"' || c == '\'')
 
1324
                                goto letter_quote;
 
1325
                }
 
1326
                if (c == 'r' || c == 'R') {
 
1327
                        c = tok_nextc(tok);
 
1328
                        if (c == '"' || c == '\'')
 
1329
                                goto letter_quote;
 
1330
            }
 
1331
                while (is_potential_identifier_char(c)) {
 
1332
                        if (c >= 128)
 
1333
                                nonascii = 1;
 
1334
                        c = tok_nextc(tok);
 
1335
                }
 
1336
                tok_backup(tok, c);
 
1337
                if (nonascii &&
 
1338
                    !verify_identifier(tok->start, tok->cur)) {
 
1339
                        tok->done = E_IDENTIFIER;
 
1340
                        return ERRORTOKEN;
 
1341
                }
 
1342
                *p_start = tok->start;
 
1343
                *p_end = tok->cur;
 
1344
                return NAME;
 
1345
        }
 
1346
 
 
1347
        /* Newline */
 
1348
        if (c == '\n') {
 
1349
                tok->atbol = 1;
 
1350
                if (blankline || tok->level > 0)
 
1351
                        goto nextline;
 
1352
                *p_start = tok->start;
 
1353
                *p_end = tok->cur - 1; /* Leave '\n' out of the string */
 
1354
                tok->cont_line = 0;
 
1355
                return NEWLINE;
 
1356
        }
 
1357
 
 
1358
        /* Period or number starting with period? */
 
1359
        if (c == '.') {
 
1360
                c = tok_nextc(tok);
 
1361
                if (isdigit(c)) {
 
1362
                        goto fraction;
 
1363
                } else if (c == '.') {
 
1364
                        c = tok_nextc(tok);
 
1365
                        if (c == '.') {
 
1366
                                *p_start = tok->start;
 
1367
                                *p_end = tok->cur;
 
1368
                                return ELLIPSIS;
 
1369
                        } else {
 
1370
                                tok_backup(tok, c);
 
1371
                        }
 
1372
                        tok_backup(tok, '.');
 
1373
                } else {
 
1374
                        tok_backup(tok, c);
 
1375
                }
 
1376
                *p_start = tok->start;
 
1377
                *p_end = tok->cur;
 
1378
                return DOT;
 
1379
        }
 
1380
 
 
1381
        /* Number */
 
1382
        if (isdigit(c)) {
 
1383
                if (c == '0') {
 
1384
                        /* Hex, octal or binary -- maybe. */
 
1385
                        c = tok_nextc(tok);
 
1386
                        if (c == '.')
 
1387
                                goto fraction;
 
1388
#ifndef WITHOUT_COMPLEX
 
1389
                        if (c == 'j' || c == 'J')
 
1390
                                goto imaginary;
 
1391
#endif
 
1392
                        if (c == 'x' || c == 'X') {
 
1393
 
 
1394
                                /* Hex */
 
1395
                                c = tok_nextc(tok);
 
1396
                                if (!isxdigit(c)) {
 
1397
                                        tok->done = E_TOKEN;
 
1398
                                        tok_backup(tok, c);
 
1399
                                        return ERRORTOKEN;
 
1400
                                }
 
1401
                                do {
 
1402
                                        c = tok_nextc(tok);
 
1403
                                } while (isxdigit(c));
 
1404
                        }
 
1405
                        else if (c == 'o' || c == 'O') {
 
1406
                                /* Octal */
 
1407
                                c = tok_nextc(tok);
 
1408
                                if (c < '0' || c >= '8') {
 
1409
                                        tok->done = E_TOKEN;
 
1410
                                        tok_backup(tok, c);
 
1411
                                        return ERRORTOKEN;
 
1412
                                }
 
1413
                                do {
 
1414
                                        c = tok_nextc(tok);
 
1415
                                } while ('0' <= c && c < '8');
 
1416
                        }
 
1417
                        else if (c == 'b' || c == 'B') {
 
1418
                                /* Binary */
 
1419
                                c = tok_nextc(tok);
 
1420
                                if (c != '0' && c != '1') {
 
1421
                                        tok->done = E_TOKEN;
 
1422
                                        tok_backup(tok, c);
 
1423
                                        return ERRORTOKEN;
 
1424
                                }
 
1425
                                do {
 
1426
                                        c = tok_nextc(tok);
 
1427
                                } while (c == '0' || c == '1');
 
1428
                        }
 
1429
                        else {
 
1430
                                int nonzero = 0;
 
1431
                                /* maybe old-style octal; c is first char of it */
 
1432
                                /* in any case, allow '0' as a literal */
 
1433
                                while (c == '0')
 
1434
                                        c = tok_nextc(tok);
 
1435
                                while (isdigit(c)) {
 
1436
                                        nonzero = 1;
 
1437
                                        c = tok_nextc(tok);
 
1438
                                }
 
1439
                                if (c == '.')
 
1440
                                        goto fraction;
 
1441
                                else if (c == 'e' || c == 'E')
 
1442
                                        goto exponent;
 
1443
#ifndef WITHOUT_COMPLEX
 
1444
                                else if (c == 'j' || c == 'J')
 
1445
                                        goto imaginary;
 
1446
#endif
 
1447
                                else if (nonzero) {
 
1448
                                        tok->done = E_TOKEN;
 
1449
                                        tok_backup(tok, c);
 
1450
                                        return ERRORTOKEN;
 
1451
                                }
 
1452
                        }
 
1453
                }
 
1454
                else {
 
1455
                        /* Decimal */
 
1456
                        do {
 
1457
                                c = tok_nextc(tok);
 
1458
                        } while (isdigit(c));
 
1459
                        {
 
1460
                                /* Accept floating point numbers. */
 
1461
                                if (c == '.') {
 
1462
                fraction:
 
1463
                                        /* Fraction */
 
1464
                                        do {
 
1465
                                                c = tok_nextc(tok);
 
1466
                                        } while (isdigit(c));
 
1467
                                }
 
1468
                                if (c == 'e' || c == 'E') {
 
1469
                exponent:
 
1470
                                        /* Exponent part */
 
1471
                                        c = tok_nextc(tok);
 
1472
                                        if (c == '+' || c == '-')
 
1473
                                                c = tok_nextc(tok);
 
1474
                                        if (!isdigit(c)) {
 
1475
                                                tok->done = E_TOKEN;
 
1476
                                                tok_backup(tok, c);
 
1477
                                                return ERRORTOKEN;
 
1478
                                        }
 
1479
                                        do {
 
1480
                                                c = tok_nextc(tok);
 
1481
                                        } while (isdigit(c));
 
1482
                                }
 
1483
#ifndef WITHOUT_COMPLEX
 
1484
                                if (c == 'j' || c == 'J')
 
1485
                                        /* Imaginary part */
 
1486
                imaginary:
 
1487
                                        c = tok_nextc(tok);
 
1488
#endif
 
1489
                        }
 
1490
                }
 
1491
                tok_backup(tok, c);
 
1492
                *p_start = tok->start;
 
1493
                *p_end = tok->cur;
 
1494
                return NUMBER;
 
1495
        }
 
1496
 
 
1497
  letter_quote:
 
1498
        /* String */
 
1499
        if (c == '\'' || c == '"') {
 
1500
                int quote = c;
 
1501
                int quote_size = 1;             /* 1 or 3 */
 
1502
                int end_quote_size = 0;
 
1503
 
 
1504
                /* Find the quote size and start of string */
 
1505
                c = tok_nextc(tok);
 
1506
                if (c == quote) {
 
1507
                        c = tok_nextc(tok);
 
1508
                        if (c == quote)
 
1509
                                quote_size = 3;
 
1510
                        else
 
1511
                                end_quote_size = 1;     /* empty string found */
 
1512
                }
 
1513
                if (c != quote)
 
1514
                    tok_backup(tok, c);
 
1515
 
 
1516
                /* Get rest of string */
 
1517
                while (end_quote_size != quote_size) {
 
1518
                        c = tok_nextc(tok);
 
1519
                        if (c == EOF) {
 
1520
                                if (quote_size == 3)
 
1521
                                        tok->done = E_EOFS;
 
1522
                                else
 
1523
                                        tok->done = E_EOLS;
 
1524
                                tok->cur = tok->inp;
 
1525
                                return ERRORTOKEN;
 
1526
                        }
 
1527
                        if (quote_size == 1 && c == '\n') {
 
1528
                            tok->done = E_EOLS;
 
1529
                            tok->cur = tok->inp;
 
1530
                            return ERRORTOKEN;
 
1531
                        }
 
1532
                        if (c == quote)
 
1533
                            end_quote_size += 1;
 
1534
                        else {
 
1535
                            end_quote_size = 0;
 
1536
                            if (c == '\\')
 
1537
                                c = tok_nextc(tok);  /* skip escaped char */
 
1538
                        }
 
1539
                }
 
1540
 
 
1541
                *p_start = tok->start;
 
1542
                *p_end = tok->cur;
 
1543
                return STRING;
 
1544
        }
 
1545
 
 
1546
        /* Line continuation */
 
1547
        if (c == '\\') {
 
1548
                c = tok_nextc(tok);
 
1549
                if (c != '\n') {
 
1550
                        tok->done = E_LINECONT;
 
1551
                        tok->cur = tok->inp;
 
1552
                        return ERRORTOKEN;
 
1553
                }
 
1554
                tok->cont_line = 1;
 
1555
                goto again; /* Read next line */
 
1556
        }
 
1557
 
 
1558
        /* Check for two-character token */
 
1559
        {
 
1560
                int c2 = tok_nextc(tok);
 
1561
                int token = PyToken_TwoChars(c, c2);
 
1562
                if (token != OP) {
 
1563
                        int c3 = tok_nextc(tok);
 
1564
                        int token3 = PyToken_ThreeChars(c, c2, c3);
 
1565
                        if (token3 != OP) {
 
1566
                                token = token3;
 
1567
                        } else {
 
1568
                                tok_backup(tok, c3);
 
1569
                        }
 
1570
                        *p_start = tok->start;
 
1571
                        *p_end = tok->cur;
 
1572
                        return token;
 
1573
                }
 
1574
                tok_backup(tok, c2);
 
1575
        }
 
1576
 
 
1577
        /* Keep track of parentheses nesting level */
 
1578
        switch (c) {
 
1579
        case '(':
 
1580
        case '[':
 
1581
        case '{':
 
1582
                tok->level++;
 
1583
                break;
 
1584
        case ')':
 
1585
        case ']':
 
1586
        case '}':
 
1587
                tok->level--;
 
1588
                break;
 
1589
        }
 
1590
 
 
1591
        /* Punctuation character */
 
1592
        *p_start = tok->start;
 
1593
        *p_end = tok->cur;
 
1594
        return PyToken_OneChar(c);
 
1595
}
 
1596
 
 
1597
int
 
1598
PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
 
1599
{
 
1600
        int result = tok_get(tok, p_start, p_end);
 
1601
        if (tok->decoding_erred) {
 
1602
                result = ERRORTOKEN;
 
1603
                tok->done = E_DECODE;
 
1604
        }
 
1605
        return result;
 
1606
}
 
1607
 
 
1608
/* Get -*- encoding -*- from a Python file.
 
1609
 
 
1610
   PyTokenizer_FindEncoding returns NULL when it can't find the encoding in
 
1611
   the first or second line of the file (in which case the encoding
 
1612
   should be assumed to be PyUnicode_GetDefaultEncoding()).
 
1613
 
 
1614
   The char * returned is malloc'ed via PyMem_MALLOC() and thus must be freed
 
1615
   by the caller.
 
1616
*/
 
1617
char *
 
1618
PyTokenizer_FindEncoding(int fd)
 
1619
{
 
1620
        struct tok_state *tok;
 
1621
        FILE *fp;
 
1622
        char *p_start =NULL , *p_end =NULL , *encoding = NULL;
 
1623
 
 
1624
        fd = dup(fd);
 
1625
        if (fd < 0) {
 
1626
                return NULL;
 
1627
        }
 
1628
        fp = fdopen(fd, "r");
 
1629
        if (fp == NULL) {
 
1630
                return NULL;
 
1631
        }
 
1632
        tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL);
 
1633
        if (tok == NULL) {
 
1634
                fclose(fp);
 
1635
                return NULL;
 
1636
        }
 
1637
        while (tok->lineno < 2 && tok->done == E_OK) {
 
1638
                PyTokenizer_Get(tok, &p_start, &p_end);
 
1639
        }
 
1640
        fclose(fp);
 
1641
        if (tok->encoding) {
 
1642
            encoding = (char *)PyMem_MALLOC(strlen(tok->encoding) + 1);
 
1643
            if (encoding)
 
1644
                strcpy(encoding, tok->encoding);
 
1645
        }
 
1646
        PyTokenizer_Free(tok);
 
1647
        return encoding;
 
1648
}
 
1649
 
 
1650
#ifdef Py_DEBUG
 
1651
 
 
1652
void
 
1653
tok_dump(int type, char *start, char *end)
 
1654
{
 
1655
        printf("%s", _PyParser_TokenNames[type]);
 
1656
        if (type == NAME || type == NUMBER || type == STRING || type == OP)
 
1657
                printf("(%.*s)", (int)(end - start), start);
 
1658
}
 
1659
 
 
1660
#endif