19
19
#include "mb/pg_wchar.h"
22
/* No reason to constrain the amount of data slurped */
23
#define YY_READ_BUF_SIZE 16777216
25
22
/* Avoid exit() on fatal scanner errors (a bit ugly -- see yy_fatal_error) */
27
24
/*
 * From this point on, any three-argument fprintf(file, fmt, msg) call is
 * rewritten into ereport(ERROR, ...), reporting msg verbatim through
 * errmsg_internal("%s", msg).  The file and fmt arguments are discarded, so
 * nothing is written to a stream.  The macro is not variadic: a call with
 * any other number of arguments will not match it.
 */
#define fprintf(file, fmt, msg) ereport(ERROR, (errmsg_internal("%s", msg)))
27
* When we parse a token that requires multiple lexer rules to process,
28
* remember the token's starting position this way.
30
/*
 * SAVE_TOKEN_START() stores the value returned by plpgsql_scanner_lineno()
 * in start_lineno and the current yytext pointer in start_charpos.  It
 * expands to a single parenthesized comma expression, so it can be used
 * wherever an expression statement is allowed.
 */
#define SAVE_TOKEN_START() \
31
( start_lineno = plpgsql_scanner_lineno(), start_charpos = yytext )
29
33
/* Handles to the buffer that the lexer uses internally */
30
34
static YY_BUFFER_STATE scanbufhandle;
31
35
static char *scanbuf;
33
37
static const char *scanstr; /* original input string */
35
static int scanner_functype;
36
static bool scanner_typereported;
37
39
static int pushback_token;
38
40
static bool have_pushback_token;
39
41
static const char *cur_line_start;
40
42
static int cur_line_num;
43
static int xcdepth = 0; /* depth of nesting in slash-star comments */
41
44
static char *dolqstart; /* current $foo$ quote start string */
42
static int dolqlen; /* signal to plpgsql_get_string_value */
46
extern PGDLLIMPORT bool standard_conforming_strings;
44
48
bool plpgsql_SpaceScanned = false;
55
59
%option case-insensitive
62
* Exclusive states are a subset of the core lexer's:
63
* <xc> extended C-style comments
64
* <xq> standard quoted strings
65
* <xe> extended quoted strings (support backslash escape sequences)
66
* <xdolq> $foo$ quoted strings
75
* Definitions --- these generally must match the core lexer, but in some
76
* cases we can simplify, since we only care about identifying the token
77
* boundaries and not about deriving the represented value. Also, we
78
* aren't trying to lex multicharacter operators so their interactions
79
* with comments go away.
87
comment ("--"{non_newline}*)
89
whitespace ({space}+|{comment})
90
special_whitespace ({space}+|{comment}{newline})
91
horiz_whitespace ({horiz_space}|{comment})
92
whitespace_with_newline ({horiz_whitespace}*{newline}{special_whitespace}*)
95
quotestop {quote}{whitespace}*
96
quotecontinue {quote}{whitespace_with_newline}{quote}
97
quotefail {quote}{whitespace}*"-"
104
xqdouble {quote}{quote}
107
dolq_start [A-Za-z\200-\377_]
108
dolq_cont [A-Za-z\200-\377_0-9]
109
dolqdelim \$({dolq_start}{dolq_cont}*)?\$
110
dolqfailed \${dolq_start}{dolq_cont}*
63
118
ident_start [A-Za-z\200-\377_]
64
119
ident_cont [A-Za-z\200-\377_0-9\$]
121
/* This is a simpler treatment of quoted identifiers than the core uses */
66
122
quoted_ident (\"[^\"]*\")+
68
124
identifier ({ident_start}{ident_cont}*|{quoted_ident})
74
/* $foo$ style quotes ("dollar quoting")
75
* copied straight from the backend SQL parser
77
dolq_start [A-Za-z\200-\377_]
78
dolq_cont [A-Za-z\200-\377_0-9]
79
dolqdelim \$({dolq_start}{dolq_cont}*)?\$
84
130
* Local variables in scanner to remember where
226
261
{digit}+ { return T_NUMBER; }
229
plpgsql_error_lineno = plpgsql_scanner_lineno();
231
(errcode(ERRCODE_DATATYPE_MISMATCH),
232
errmsg("unterminated quoted identifier")));
236
* Ignore whitespace but remember this happened
239
{space}+ { plpgsql_SpaceScanned = true; }
247
\/\* { start_lineno = plpgsql_scanner_lineno();
250
<IN_COMMENT>\*\/ { BEGIN(INITIAL); plpgsql_SpaceScanned = true; }
253
<IN_COMMENT><<EOF>> {
254
plpgsql_error_lineno = start_lineno;
256
(errcode(ERRCODE_DATATYPE_MISMATCH),
257
errmsg("unterminated /* comment")));
261
* Collect anything inside of ''s and return one STRING token
263
* Hacking yytext/yyleng here lets us avoid using yymore(), which is
264
* a win for performance. It's safe because we know the underlying
265
* input buffer is not changing.
269
start_lineno = plpgsql_scanner_lineno();
270
start_charpos = yytext;
274
/* for now, treat the same as a regular literal */
275
start_lineno = plpgsql_scanner_lineno();
276
start_charpos = yytext;
280
<IN_STRING>\\ { /* can only happen with \ at EOF */ }
283
/* tell plpgsql_get_string_value it's not a dollar quote */
285
/* adjust yytext/yyleng to describe whole string token */
286
yyleng += (yytext - start_charpos);
287
yytext = start_charpos;
291
<IN_STRING>[^'\\]+ { }
293
plpgsql_error_lineno = start_lineno;
295
(errcode(ERRCODE_DATATYPE_MISMATCH),
296
errmsg("unterminated quoted string")));
300
start_lineno = plpgsql_scanner_lineno();
301
start_charpos = yytext;
302
dolqstart = pstrdup(yytext);
303
BEGIN(IN_DOLLARQUOTE);
305
<IN_DOLLARQUOTE>{dolqdelim} {
306
if (strcmp(yytext, dolqstart) == 0)
309
/* tell plpgsql_get_string_value it is a dollar quote */
263
\". { yyerror("unterminated quoted identifier"); }
266
* Ignore whitespace (including comments) but remember this happened
269
{whitespace} { plpgsql_SpaceScanned = true; }
272
* Comment and literal handling is mostly copied from the core lexer
276
/* Set location in case of syntax error in comment */
280
plpgsql_SpaceScanned = true;
306
<xc><<EOF>> { yyerror("unterminated /* comment"); }
310
if (standard_conforming_strings)
311
323
/* adjust yytext/yyleng to describe whole string token */
312
324
yyleng += (yytext - start_charpos);
313
325
yytext = start_charpos;
320
* When we fail to match $...$ to dolqstart, transfer
321
* the $... part to the output, but put back the final
322
* $ for rescanning. Consider $delim$...$junk$delim$
327
<IN_DOLLARQUOTE>{dolqinside} { }
328
<IN_DOLLARQUOTE>. { /* needed for $ inside the quoted text */ }
329
<IN_DOLLARQUOTE><<EOF>> {
330
plpgsql_error_lineno = start_lineno;
332
(errcode(ERRCODE_DATATYPE_MISMATCH),
333
errmsg("unterminated dollar-quoted string")));
336
<xq,xe>{quotecontinue} {
340
/* This is only needed for \ just before EOF */
342
<xq,xe><<EOF>> { yyerror("unterminated quoted string"); }
346
dolqstart = pstrdup(yytext);
350
/* throw back all but the initial "$" */
352
/* and treat it as {other} */
356
if (strcmp(yytext, dolqstart) == 0)
360
/* adjust yytext/yyleng to describe whole string */
361
yyleng += (yytext - start_charpos);
362
yytext = start_charpos;
368
* When we fail to match $...$ to dolqstart, transfer
369
* the $... part to the output, but put back the final
370
* $ for rescanning. Consider $delim$...$junk$delim$
375
<xdolq>{dolqinside} {
377
<xdolq>{dolqfailed} {
380
/* This is only needed for $ inside the quoted text */
382
<xdolq><<EOF>> { yyerror("unterminated dollar-quoted string"); }
337
385
* Any unmatched character is returned as is
340
. { return yytext[0]; }
493
540
yy_delete_buffer(scanbufhandle);
498
* Called after a T_STRING token is read to get the string literal's value
499
* as a palloc'd string. (We make this a separate call because in many
500
* scenarios there's no need to get the decoded value.)
502
* Note: we expect the literal to be the most recently lexed token. This
503
* would not work well if we supported multiple-token pushback or if
504
* plpgsql_yylex() wanted to read ahead beyond a T_STRING token.
507
plpgsql_get_string_value(void)
515
/* Token is a $foo$...$foo$ string */
516
len = yyleng - 2 * dolqlen;
518
result = (char *) palloc(len + 1);
519
memcpy(result, yytext + dolqlen, len);
522
else if (*yytext == 'E' || *yytext == 'e')
524
/* Token is an E'...' string */
525
result = (char *) palloc(yyleng + 1); /* more than enough room */
527
for (cp = yytext + 2; *cp; cp++)
532
result[len++] = *cp++;
533
/* else it must be string end quote */
535
else if (*cp == '\\')
537
if (cp[1] != '\0') /* just a paranoid check */
538
result[len++] = *(++cp);
547
/* Token is a '...' string */
548
result = (char *) palloc(yyleng + 1); /* more than enough room */
550
for (cp = yytext + 1; *cp; cp++)
555
result[len++] = *cp++;
556
/* else it must be string end quote */
558
else if (*cp == '\\')
560
if (cp[1] != '\0') /* just a paranoid check */
561
result[len++] = *(++cp);