1
/* xgettext Perl backend.
2
Copyright (C) 2002-2003 Free Software Foundation, Inc.
4
This file was written by Guido Flohr <guido@imperia.net>, 2002-2003.
6
This program is free software; you can redistribute it and/or modify
7
it under the terms of the GNU General Public License as published by
8
the Free Software Foundation; either version 2, or (at your option)
11
This program is distributed in the hope that it will be useful,
12
but WITHOUT ANY WARRANTY; without even the implied warranty of
13
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
GNU General Public License for more details.
16
You should have received a copy of the GNU General Public License
17
along with this program; if not, write to the Free Software Foundation,
18
Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
34
#include "error-progname.h"
37
#include "po-charset.h"
38
#include "ucs4-utf8.h"
43
#define _(s) gettext(s)
45
/* The Perl syntax is defined in perlsyn.pod. Try the command
46
"man perlsyn" or "perldoc perlsyn".
47
Also, the syntax after the 'sub' keyword is specified in perlsub.pod.
48
Try the command "man perlsub" or "perldoc perlsub". */
53
/* ====================== Keyword set customization. ====================== */
55
/* If true extract all strings. */
56
static bool extract_all = false;
58
static hash_table keywords;
59
static bool default_keywords = true;
70
x_perl_keyword (const char *name)
73
default_keywords = false;
81
if (keywords.table == NULL)
82
init_hash (&keywords, 100);
84
split_keywordspec (name, &end, &argnum1, &argnum2);
86
/* The characters between name and end should form a valid C identifier.
87
A colon means an invalid parse in split_keywordspec(). */
88
colon = strchr (name, ':');
89
if (colon == NULL || colon >= end)
93
insert_entry (&keywords, name, end - name,
94
(void *) (long) (argnum1 + (argnum2 << 10)));
99
/* Finish initializing the keywords hash table.
100
Called after argument processing, before each file is processed. */
104
if (default_keywords)
106
x_perl_keyword ("gettext");
107
x_perl_keyword ("%gettext");
108
x_perl_keyword ("$gettext");
109
x_perl_keyword ("dgettext:2");
110
x_perl_keyword ("dcgettext:2");
111
x_perl_keyword ("ngettext:1,2");
112
x_perl_keyword ("dngettext:2,3");
113
x_perl_keyword ("dcngettext:2,3");
114
x_perl_keyword ("gettext_noop");
116
x_perl_keyword ("__");
117
x_perl_keyword ("$__");
118
x_perl_keyword ("%__");
119
x_perl_keyword ("__x");
120
x_perl_keyword ("__n:1,2");
121
x_perl_keyword ("__nx:1,2");
122
x_perl_keyword ("__xn:1,2");
123
x_perl_keyword ("N__");
125
default_keywords = false;
130
init_flag_table_perl ()
132
xgettext_record_flag ("gettext:1:pass-perl-format");
133
xgettext_record_flag ("gettext:1:pass-perl-brace-format");
134
xgettext_record_flag ("%gettext:1:pass-perl-format");
135
xgettext_record_flag ("%gettext:1:pass-perl-brace-format");
136
xgettext_record_flag ("$gettext:1:pass-perl-format");
137
xgettext_record_flag ("$gettext:1:pass-perl-brace-format");
138
xgettext_record_flag ("dgettext:2:pass-perl-format");
139
xgettext_record_flag ("dgettext:2:pass-perl-brace-format");
140
xgettext_record_flag ("dcgettext:2:pass-perl-format");
141
xgettext_record_flag ("dcgettext:2:pass-perl-brace-format");
142
xgettext_record_flag ("ngettext:1:pass-perl-format");
143
xgettext_record_flag ("ngettext:2:pass-perl-format");
144
xgettext_record_flag ("ngettext:1:pass-perl-brace-format");
145
xgettext_record_flag ("ngettext:2:pass-perl-brace-format");
146
xgettext_record_flag ("dngettext:2:pass-perl-format");
147
xgettext_record_flag ("dngettext:3:pass-perl-format");
148
xgettext_record_flag ("dngettext:2:pass-perl-brace-format");
149
xgettext_record_flag ("dngettext:3:pass-perl-brace-format");
150
xgettext_record_flag ("dcngettext:2:pass-perl-format");
151
xgettext_record_flag ("dcngettext:3:pass-perl-format");
152
xgettext_record_flag ("dcngettext:2:pass-perl-brace-format");
153
xgettext_record_flag ("dcngettext:3:pass-perl-brace-format");
154
xgettext_record_flag ("gettext_noop:1:pass-perl-format");
155
xgettext_record_flag ("gettext_noop:1:pass-perl-brace-format");
156
xgettext_record_flag ("printf:1:perl-format"); /* argument 1 or 2 ?? */
157
xgettext_record_flag ("sprintf:1:perl-format");
159
xgettext_record_flag ("__:1:pass-perl-format");
160
xgettext_record_flag ("__:1:pass-perl-brace-format");
161
xgettext_record_flag ("%__:1:pass-perl-format");
162
xgettext_record_flag ("%__:1:pass-perl-brace-format");
163
xgettext_record_flag ("$__:1:pass-perl-format");
164
xgettext_record_flag ("$__:1:pass-perl-brace-format");
165
xgettext_record_flag ("__x:1:perl-brace-format");
166
xgettext_record_flag ("__n:1:pass-perl-format");
167
xgettext_record_flag ("__n:2:pass-perl-format");
168
xgettext_record_flag ("__n:1:pass-perl-brace-format");
169
xgettext_record_flag ("__n:2:pass-perl-brace-format");
170
xgettext_record_flag ("__nx:1:perl-brace-format");
171
xgettext_record_flag ("__nx:2:perl-brace-format");
172
xgettext_record_flag ("__xn:1:perl-brace-format");
173
xgettext_record_flag ("__xn:2:perl-brace-format");
174
xgettext_record_flag ("N__:1:pass-perl-format");
175
xgettext_record_flag ("N__:1:pass-perl-brace-format");
180
/* ======================== Reading of characters. ======================== */
182
/* Real filename, used in error messages about the input file. */
183
static const char *real_file_name;
185
/* Logical filename and line number, used to label the extracted messages. */
186
static char *logical_file_name;
187
static int line_number;
189
/* The input file stream. */
192
/* The current line buffer. */
193
static char *linebuf;
195
/* The size of the current line. */
198
/* The position in the current line. */
201
/* The size of the input buffer. */
202
static size_t linebuf_size;
204
/* Number of lines eaten for here documents. */
205
static int here_eaten;
207
/* Paranoia: EOF marker for __END__ or __DATA__. */
208
static bool end_of_file;
211
/* 1. line_number handling. */
213
/* Returns the next character from the input stream or EOF. */
217
line_number += here_eaten;
223
if (linepos >= linesize)
225
linesize = getline (&linebuf, &linebuf_size, fp);
230
error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
239
/* Undosify. This is important for catching the end of <<EOF and
240
<<'EOF'. We could rely on stdio doing this for us but you
241
it is not uncommon to to come across Perl scripts with CRLF
242
newline conventions on systems that do not follow this
244
if (linesize >= 2 && linebuf[linesize - 1] == '\n'
245
&& linebuf[linesize - 2] == '\r')
247
linebuf[linesize - 2] = '\n';
248
linebuf[linesize - 1] = '\0';
253
return linebuf[linepos++];
256
/* Supports only one pushback character. */
258
phase1_ungetc (int c)
263
/* Attempt to ungetc across line boundary. Shouldn't happen.
264
No two phase1_ungetc calls are permitted in a row. */
271
/* Read a here document and return its contents.
272
The delimiter is an UTF-8 encoded string; the resulting string is UTF-8
276
get_here_document (const char *delimiter)
278
/* Accumulator for the entire here document, including a NUL byte
281
static size_t bufmax = 0;
283
/* Current line being appended. */
284
static char *my_linebuf = NULL;
285
static size_t my_linebuf_size = 0;
287
/* Allocate the initial buffer. Later on, bufmax > 0. */
290
buffer = xrealloc (NULL, 1);
297
int read_bytes = getline (&my_linebuf, &my_linebuf_size, fp);
305
error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
310
error_with_progname = false;
311
error (EXIT_SUCCESS, 0, _("\
312
%s:%d: can't find string terminator \"%s\" anywhere before EOF"),
313
real_file_name, line_number, delimiter);
314
error_with_progname = true;
322
/* Convert to UTF-8. */
324
from_current_source_encoding (my_linebuf, logical_file_name,
325
line_number + here_eaten);
326
if (my_line_utf8 != my_linebuf)
328
if (strlen (my_line_utf8) >= my_linebuf_size)
330
my_linebuf_size = strlen (my_line_utf8) + 1;
331
my_linebuf = xrealloc (my_linebuf, my_linebuf_size);
333
strcpy (my_linebuf, my_line_utf8);
337
/* Undosify. This is important for catching the end of <<EOF and
338
<<'EOF'. We could rely on stdio doing this for us but you
339
it is not uncommon to to come across Perl scripts with CRLF
340
newline conventions on systems that do not follow this
342
if (read_bytes >= 2 && my_linebuf[read_bytes - 1] == '\n'
343
&& my_linebuf[read_bytes - 2] == '\r')
345
my_linebuf[read_bytes - 2] = '\n';
346
my_linebuf[read_bytes - 1] = '\0';
350
/* Temporarily remove the trailing newline from my_linebuf. */
352
if (read_bytes >= 1 && my_linebuf[read_bytes - 1] == '\n')
355
my_linebuf[read_bytes - 1] = '\0';
358
/* See whether this line terminates the here document. */
359
if (strcmp (my_linebuf, delimiter) == 0)
362
/* Add back the trailing newline to my_linebuf. */
364
my_linebuf[read_bytes - 1] = '\n';
366
/* Ensure room for read_bytes + 1 bytes. */
367
if (bufpos + read_bytes >= bufmax)
370
bufmax = 2 * bufmax + 10;
371
while (bufpos + read_bytes >= bufmax);
372
buffer = xrealloc (buffer, bufmax);
374
/* Append this line to the accumulator. */
375
strcpy (buffer + bufpos, my_linebuf);
376
bufpos += read_bytes;
379
/* Done accumulating the here document. */
380
return xstrdup (buffer);
383
/* Skips pod sections. */
387
line_number += here_eaten;
393
linesize = getline (&linebuf, &linebuf_size, fp);
398
error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
405
if (strncmp ("=cut", linebuf, 4) == 0)
407
/* Force reading of a new line on next call to phase1_getc(). */
415
/* These are for tracking whether comments count as immediately before
417
static int last_comment_line;
418
static int last_non_comment_line;
421
/* 2. Replace each comment that is not inside a string literal or regular
422
expression with a newline character. We need to remember the comment
423
for later, because it may be attached to a keyword string. */
429
static size_t bufmax;
439
lineno = line_number;
440
/* Skip leading whitespace. */
446
if (c != ' ' && c != '\t' && c != '\r' && c != '\f')
452
/* Accumulate the comment. */
456
if (c == '\n' || c == EOF)
458
if (buflen >= bufmax)
460
bufmax = 2 * bufmax + 10;
461
buffer = xrealloc (buffer, bufmax);
463
buffer[buflen++] = c;
465
if (buflen >= bufmax)
467
bufmax = 2 * bufmax + 10;
468
buffer = xrealloc (buffer, bufmax);
470
buffer[buflen] = '\0';
471
/* Convert it to UTF-8. */
473
from_current_source_encoding (buffer, logical_file_name, lineno);
474
/* Save it until we encounter the corresponding string. */
475
xgettext_current_source_encoding = po_charset_utf8;
476
xgettext_comment_add (utf8_string);
477
xgettext_current_source_encoding = xgettext_global_source_encoding;
478
last_comment_line = lineno;
483
/* Supports only one pushback character. */
485
phase2_ungetc (int c)
491
/* Whitespace recognition. */
493
#define case_whitespace \
494
case ' ': case '\t': case '\r': case '\n': case '\f'
497
is_whitespace (int c)
499
return (c == ' ' || c == '\t' || c == '\r' || c == '\n' || c == '\f');
503
/* ========================== Reading of tokens. ========================== */
509
token_type_lparen, /* ( */
510
token_type_rparen, /* ) */
511
token_type_comma, /* , */
512
token_type_fat_comma, /* => */
513
token_type_dereference, /* , */
514
token_type_semicolon, /* ; */
515
token_type_lbrace, /* { */
516
token_type_rbrace, /* } */
517
token_type_lbracket, /* [ */
518
token_type_rbracket, /* ] */
519
token_type_string, /* quote-like */
520
token_type_named_op, /* if, unless, while, ... */
521
token_type_variable, /* $... */
522
token_type_symbol, /* symbol, number */
523
token_type_regex_op, /* s, tr, y, m. */
524
token_type_dot, /* . */
525
token_type_other, /* regexp, misc. operator */
526
/* The following are not really token types, but variants used by
528
token_type_keyword_symbol /* keyword symbol */
530
typedef enum token_type_ty token_type_ty;
532
/* Subtypes for strings, important for interpolation. */
535
string_type_verbatim, /* "<<'EOF'", "m'...'", "s'...''...'",
536
"tr/.../.../", "y/.../.../". */
537
string_type_q, /* "'..'", "q/.../". */
538
string_type_qq, /* '"..."', "`...`", "qq/.../", "qx/.../",
540
string_type_qr /* Not supported. */
543
/* Subtypes for symbols, important for dollar interpretation. */
546
symbol_type_none, /* Nothing special. */
547
symbol_type_sub, /* 'sub'. */
548
symbol_type_function /* Function name after 'sub'. */
551
typedef struct token_ty token_ty;
555
int sub_type; /* for token_type_string, token_type_symbol */
556
char *string; /* for: in encoding:
557
token_type_named_op ASCII
558
token_type_string UTF-8
559
token_type_symbol ASCII
560
token_type_variable global_source_encoding
567
token2string (const token_ty *token)
572
return "token_type_eof";
573
case token_type_lparen:
574
return "token_type_lparen";
575
case token_type_rparen:
576
return "token_type_rparen";
577
case token_type_comma:
578
return "token_type_comma";
579
case token_type_fat_comma:
580
return "token_type_fat_comma";
581
case token_type_dereference:
582
return "token_type_dereference";
583
case token_type_semicolon:
584
return "token_type_semicolon";
585
case token_type_lbrace:
586
return "token_type_lbrace";
587
case token_type_rbrace:
588
return "token_type_rbrace";
589
case token_type_lbracket:
590
return "token_type_lbracket";
591
case token_type_rbracket:
592
return "token_type_rbracket";
593
case token_type_string:
594
return "token_type_string";
595
case token_type_named_op:
596
return "token_type_named_op";
597
case token_type_variable:
598
return "token_type_variable";
599
case token_type_symbol:
600
return "token_type_symbol";
601
case token_type_regex_op:
602
return "token_type_regex_op";
604
return "token_type_dot";
605
case token_type_other:
606
return "token_type_other";
613
/* Free the memory pointed to by a 'struct token_ty'. */
615
free_token (token_ty *tp)
619
case token_type_named_op:
620
case token_type_string:
621
case token_type_symbol:
622
case token_type_variable:
631
/* Pass 1 of extracting quotes: Find the end of the string, regardless
632
of the semantics of the construct. Return the complete string,
633
including the starting and the trailing delimiter, with backslashes
634
removed where appropriate. */
636
extract_quotelike_pass1 (int delim)
638
/* This function is called recursively. No way to allocate stuff
639
statically. Also alloca() is inappropriate due to limited stack
640
size on some platforms. So we use malloc(). */
642
char *buffer = (char *) xmalloc (bufmax);
647
buffer[bufpos++] = delim;
649
/* Find the closing delimiter. */
664
default: /* "..." or '...' or |...| etc. */
666
counter_delim = delim;
672
int c = phase1_getc ();
674
/* This round can produce 1 or 2 bytes. Ensure room for 2 bytes. */
675
if (bufpos + 2 > bufmax)
677
bufmax = 2 * bufmax + 10;
678
buffer = xrealloc (buffer, bufmax);
681
if (c == counter_delim || c == EOF)
683
buffer[bufpos++] = counter_delim; /* will be stripped off later */
684
buffer[bufpos++] = '\0';
686
fprintf (stderr, "PASS1: %s\n", buffer);
691
if (nested && c == delim)
693
char *inner = extract_quotelike_pass1 (delim);
694
size_t len = strlen (inner);
696
/* Ensure room for len + 1 bytes. */
697
if (bufpos + len >= bufmax)
700
bufmax = 2 * bufmax + 10;
701
while (bufpos + len >= bufmax);
702
buffer = xrealloc (buffer, bufmax);
704
strcpy (buffer + bufpos, inner);
713
buffer[bufpos++] = '\\';
714
buffer[bufpos++] = '\\';
716
else if (c == delim || c == counter_delim)
718
/* This is pass2 in Perl. */
719
buffer[bufpos++] = c;
723
buffer[bufpos++] = '\\';
729
buffer[bufpos++] = c;
734
/* Like extract_quotelike_pass1, but return the complete string in UTF-8
737
extract_quotelike_pass1_utf8 (int delim)
739
char *string = extract_quotelike_pass1 (delim);
741
from_current_source_encoding (string, logical_file_name, line_number);
742
if (utf8_string != string)
748
/* ========= Reading of tokens and commands. Extracting strings. ========= */
751
/* There is an ambiguity about '/': It can start a division operator ('/' or
752
'/=') or it can start a regular expression. The distinction is important
753
because inside regular expressions, '#' loses its special meaning.
754
The distinction is possible depending on the parsing state: After a
755
variable or simple expression, it's a division operator; at the beginning
756
of an expression, it's a regexp. */
757
static bool prefer_division_over_regexp;
759
/* Context lookup table. */
760
static flag_context_list_table_ty *flag_context_list_table;
763
/* Forward declaration of local functions. */
764
static void interpolate_keywords (message_list_ty *mlp, const char *string,
766
static token_ty *x_perl_lex (message_list_ty *mlp);
767
static void x_perl_unlex (token_ty *tp);
768
static bool extract_balanced (message_list_ty *mlp, int state,
770
flag_context_ty outer_context,
771
flag_context_list_iterator_ty context_iter,
772
int arg_sg, int arg_pl);
775
/* Extract an unsigned hexadecimal number from STRING, considering at
776
most LEN bytes and place the result in *RESULT. Returns a pointer
777
to the first character past the hexadecimal number. */
779
extract_hex (const char *string, size_t len, unsigned int *result)
785
for (i = 0; i < len; i++)
790
if (c >= 'A' && c <= 'F')
791
number = c - 'A' + 10;
792
else if (c >= 'a' && c <= 'f')
793
number = c - 'a' + 10;
794
else if (c >= '0' && c <= '9')
806
/* Extract an unsigned octal number from STRING, considering at
807
most LEN bytes and place the result in *RESULT. Returns a pointer
808
to the first character past the octal number. */
810
extract_oct (const char *string, size_t len, unsigned int *result)
816
for (i = 0; i < len; i++)
821
if (c >= '0' && c <= '7')
833
/* Extract the various quotelike constructs except for <<EOF. See the
834
section "Gory details of parsing quoted constructs" in perlop.pod.
835
Return the resulting token in *tp; tp->type == token_type_string. */
837
extract_quotelike (token_ty *tp, int delim)
839
char *string = extract_quotelike_pass1_utf8 (delim);
840
size_t len = strlen (string);
842
tp->type = token_type_string;
843
/* Take the string without the delimiters at the start and at the end. */
846
string[len - 1] = '\0';
847
tp->string = xstrdup (string + 1);
851
/* Extract the quotelike constructs with double delimiters, like
852
s/[SEARCH]/[REPLACE]/. This function does not eat up trailing
853
modifiers (left to the caller).
854
Return the resulting token in *tp; tp->type == token_type_regex_op. */
856
extract_triple_quotelike (message_list_ty *mlp, token_ty *tp, int delim,
861
tp->type = token_type_regex_op;
863
string = extract_quotelike_pass1_utf8 (delim);
865
interpolate_keywords (mlp, string, line_number);
868
if (delim == '(' || delim == '<' || delim == '{' || delim == '[')
870
/* The delimiter for the second string can be different, e.g.
871
s{SEARCH}{REPLACE} or s{SEARCH}/REPLACE/. See "man perlrequick". */
872
delim = phase1_getc ();
873
while (is_whitespace (delim))
875
/* The hash-sign is not a valid delimiter after whitespace, ergo
876
use phase2_getc() and not phase1_getc() now. */
877
delim = phase2_getc ();
880
string = extract_quotelike_pass1_utf8 (delim);
882
interpolate_keywords (mlp, string, line_number);
886
/* Perform pass 3 of quotelike extraction (interpolation).
887
*tp is a token of type token_type_string.
888
This function replaces tp->string. */
889
/* FIXME: Currently may writes null-bytes into the string. */
891
extract_quotelike_pass3 (token_ty *tp, int error_level)
894
static int bufmax = 0;
902
switch (tp->sub_type)
904
case string_type_verbatim:
905
fprintf (stderr, "Interpolating string_type_verbatim:\n");
908
fprintf (stderr, "Interpolating string_type_q:\n");
911
fprintf (stderr, "Interpolating string_type_qq:\n");
914
fprintf (stderr, "Interpolating string_type_qr:\n");
917
fprintf (stderr, "%s\n", tp->string);
918
if (tp->sub_type == string_type_verbatim)
919
fprintf (stderr, "---> %s\n", tp->string);
922
if (tp->sub_type == string_type_verbatim)
925
/* Loop over tp->string, accumulating the expansion in buffer. */
934
/* Ensure room for 7 bytes, 6 (multi-)bytes plus a leading backslash
935
if \Q modifier is present. */
936
if (bufpos + 7 > bufmax)
938
bufmax = 2 * bufmax + 10;
939
buffer = xrealloc (buffer, bufmax);
942
if (tp->sub_type == string_type_q)
950
buffer[bufpos++] = '\\';
955
buffer[bufpos++] = *crs++;
961
/* We only get here for double-quoted strings or regular expressions.
962
Unescape escape sequences. */
969
buffer[bufpos++] = '\t';
973
buffer[bufpos++] = '\n';
977
buffer[bufpos++] = '\r';
981
buffer[bufpos++] = '\f';
985
buffer[bufpos++] = '\b';
989
buffer[bufpos++] = '\a';
993
buffer[bufpos++] = 0x1b;
995
case '0': case '1': case '2': case '3':
996
case '4': case '5': case '6': case '7':
998
unsigned int oct_number;
1001
crs = extract_oct (crs + 1, 3, &oct_number);
1003
/* FIXME: If one of the variables UPPERCASE or LOWERCASE is
1004
true, the character should be converted to its uppercase
1005
resp. lowercase equivalent. I don't know if the necessary
1006
facilities are already included in gettext. For US-Ascii
1007
the conversion can be already be done, however. */
1008
if (uppercase && oct_number >= 'a' && oct_number <= 'z')
1010
oct_number = oct_number - 'a' + 'A';
1012
else if (lowercase && oct_number >= 'A' && oct_number <= 'Z')
1014
oct_number = oct_number - 'A' + 'a';
1018
/* Yes, octal escape sequences in the range 0x100..0x1ff are
1020
length = u8_uctomb ((unsigned char *) (buffer + bufpos),
1028
unsigned int hex_number = 0;
1034
const char *end = strchr (crs, '}');
1037
error_with_progname = false;
1038
error (error_level, 0, _("\
1039
%s:%d: missing right brace on \\x{HEXNUMBER}"), real_file_name, line_number);
1040
error_with_progname = true;
1047
(void) extract_hex (crs, end - crs, &hex_number);
1053
crs = extract_hex (crs, 2, &hex_number);
1056
/* FIXME: If one of the variables UPPERCASE or LOWERCASE is
1057
true, the character should be converted to its uppercase
1058
resp. lowercase equivalent. I don't know if the necessary
1059
facilities are already included in gettext. For US-Ascii
1060
the conversion can be already be done, however. */
1061
if (uppercase && hex_number >= 'a' && hex_number <= 'z')
1063
hex_number = hex_number - 'a' + 'A';
1065
else if (lowercase && hex_number >= 'A' && hex_number <= 'Z')
1067
hex_number = hex_number - 'A' + 'a';
1070
length = u8_uctomb ((unsigned char *) (buffer + bufpos),
1078
/* Perl's notion of control characters. */
1082
int the_char = (unsigned char) *crs;
1083
if (the_char >= 'a' || the_char <= 'z')
1084
the_char = the_char - 'a' + 'A';
1085
buffer[bufpos++] = the_char ^ 0x40;
1092
const char *end = strchr (crs + 1, '}');
1096
unsigned int unicode;
1098
name = (char *) xmalloc (end - (crs + 1) + 1);
1099
memcpy (name, crs + 1, end - (crs + 1));
1100
name[end - (crs + 1)] = '\0';
1102
unicode = unicode_name_character (name);
1103
if (unicode != UNINAME_INVALID)
1105
/* FIXME: Convert to upper/lowercase if the
1106
corresponding flag is set to true. */
1108
u8_uctomb ((unsigned char *) (buffer + bufpos),
1123
/* No escape sequence, go on. */
1151
if (*crs >= 'A' && *crs <= 'Z')
1153
buffer[bufpos++] = *crs - 'A' + 'a';
1155
else if ((unsigned char) *crs >= 0x80)
1157
error_with_progname = false;
1158
error (error_level, 0, _("\
1159
%s:%d: invalid interpolation (\"\\l\") of 8bit character \"%c\""),
1160
real_file_name, line_number, *crs);
1161
error_with_progname = true;
1165
buffer[bufpos++] = *crs;
1171
if (*crs >= 'a' && *crs <= 'z')
1173
buffer[bufpos++] = *crs - 'a' + 'A';
1175
else if ((unsigned char) *crs >= 0x80)
1177
error_with_progname = false;
1178
error (error_level, 0, _("\
1179
%s:%d: invalid interpolation (\"\\u\") of 8bit character \"%c\""),
1180
real_file_name, line_number, *crs);
1181
error_with_progname = true;
1185
buffer[bufpos++] = *crs;
1190
buffer[bufpos++] = *crs;
1199
backslashed = false;
1202
&& !((*crs >= 'A' && *crs <= 'Z') || (*crs >= 'A' && *crs <= 'z')
1203
|| (*crs >= '0' && *crs <= '9') || *crs == '_'))
1205
buffer[bufpos++] = '\\';
1209
if (!backslashed && !extract_all && (*crs == '$' || *crs == '@'))
1211
error_with_progname = false;
1212
error (error_level, 0, _("\
1213
%s:%d: invalid variable interpolation at \"%c\""),
1214
real_file_name, line_number, *crs);
1215
error_with_progname = true;
1220
if (*crs >= 'A' && *crs <= 'Z')
1221
buffer[bufpos++] = *crs - 'A' + 'a';
1222
else if ((unsigned char) *crs >= 0x80)
1224
error_with_progname = false;
1225
error (error_level, 0, _("\
1226
%s:%d: invalid interpolation (\"\\L\") of 8bit character \"%c\""),
1227
real_file_name, line_number, *crs);
1228
error_with_progname = true;
1229
buffer[bufpos++] = *crs;
1232
buffer[bufpos++] = *crs;
1237
if (*crs >= 'a' && *crs <= 'z')
1238
buffer[bufpos++] = *crs - 'a' + 'A';
1239
else if ((unsigned char) *crs >= 0x80)
1241
error_with_progname = false;
1242
error (error_level, 0, _("\
1243
%s:%d: invalid interpolation (\"\\U\") of 8bit character \"%c\""),
1244
real_file_name, line_number, *crs);
1245
error_with_progname = true;
1246
buffer[bufpos++] = *crs;
1249
buffer[bufpos++] = *crs;
1254
buffer[bufpos++] = *crs++;
1258
/* Ensure room for 1 more byte. */
1259
if (bufpos >= bufmax)
1261
bufmax = 2 * bufmax + 10;
1262
buffer = xrealloc (buffer, bufmax);
1265
buffer[bufpos++] = '\0';
1268
fprintf (stderr, "---> %s\n", buffer);
1271
/* Replace tp->string. */
1273
tp->string = xstrdup (buffer);
1276
/* Parse a variable. This is done in several steps:
1277
1) Consume all leading occurencies of '$', '@', '%', and '*'.
1278
2) Determine the name of the variable from the following input.
1279
3) Parse possible following hash keys or array indexes.
1282
extract_variable (message_list_ty *mlp, token_ty *tp, int first)
1284
static char *buffer;
1285
static int bufmax = 0;
1288
size_t varbody_length = 0;
1289
bool maybe_hash_deref = false;
1290
bool maybe_hash_value = false;
1292
tp->type = token_type_variable;
1295
fprintf (stderr, "%s:%d: extracting variable type '%c'\n",
1296
real_file_name, line_number, first);
1300
* 1) Consume dollars and so on (not euros ...). Unconditionally
1301
* accepting the hash sign (#) will maybe lead to inaccurate
1304
while (c == '$' || c == '*' || c == '#' || c == '@' || c == '%')
1306
if (bufpos >= bufmax)
1308
bufmax = 2 * bufmax + 10;
1309
buffer = xrealloc (buffer, bufmax);
1311
buffer[bufpos++] = c;
1317
tp->type = token_type_eof;
1321
/* Hash references are treated in a special way, when looking for
1323
if (buffer[0] == '$')
1326
maybe_hash_value = true;
1327
else if (bufpos == 2 && buffer[1] == '$')
1330
|| (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')
1331
|| (c >= '0' && c <= '9')
1332
|| c == '_' || c == ':' || c == '\'' || c >= 0x80))
1334
/* Special variable $$ for pid. */
1335
if (bufpos >= bufmax)
1337
bufmax = 2 * bufmax + 10;
1338
buffer = xrealloc (buffer, bufmax);
1340
buffer[bufpos++] = '\0';
1341
tp->string = xstrdup (buffer);
1343
fprintf (stderr, "%s:%d: is PID ($$)\n",
1344
real_file_name, line_number);
1351
maybe_hash_deref = true;
1357
* 2) Get the name of the variable. The first character is practically
1358
* arbitrary. Punctuation and numbers automagically put a variable
1359
* in the global namespace but that subtle difference is not interesting
1362
if (bufpos >= bufmax)
1364
bufmax = 2 * bufmax + 10;
1365
buffer = xrealloc (buffer, bufmax);
1369
/* Yuck, we cannot accept ${gettext} as a keyword... Except for
1370
* debugging purposes it is also harmless, that we suppress the
1371
* real name of the variable.
1374
fprintf (stderr, "%s:%d: braced {variable_name}\n",
1375
real_file_name, line_number);
1378
if (extract_balanced (mlp, 0, token_type_rbrace,
1379
null_context, null_context_list_iterator, -1, -1))
1381
buffer[bufpos++] = c;
1385
while ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')
1386
|| (c >= '0' && c <= '9')
1387
|| c == '_' || c == ':' || c == '\'' || c >= 0x80)
1390
if (bufpos >= bufmax)
1392
bufmax = 2 * bufmax + 10;
1393
buffer = xrealloc (buffer, bufmax);
1395
buffer[bufpos++] = c;
1401
/* Probably some strange Perl variable like $`. */
1402
if (varbody_length == 0)
1405
if (c == EOF || is_whitespace (c))
1406
phase1_ungetc (c); /* Loser. */
1409
if (bufpos >= bufmax)
1411
bufmax = 2 * bufmax + 10;
1412
buffer = xrealloc (buffer, bufmax);
1414
buffer[bufpos++] = c;
1418
if (bufpos >= bufmax)
1420
bufmax = 2 * bufmax + 10;
1421
buffer = xrealloc (buffer, bufmax);
1423
buffer[bufpos++] = '\0';
1425
tp->string = xstrdup (buffer);
1428
fprintf (stderr, "%s:%d: complete variable name: %s\n",
1429
real_file_name, line_number, tp->string);
1432
prefer_division_over_regexp = true;
1435
* 3) If the following looks strange to you, this is valid Perl syntax:
1437
* $var = $$hashref # We can place a
1438
* # comment here and then ...
1439
* {key_into_hashref};
1441
* POD sections are not allowed but we leave complaints about
1442
* that to the compiler/interpreter.
1444
/* We only extract strings from the first hash key (if present). */
1446
if (maybe_hash_deref || maybe_hash_value)
1448
bool is_dereference = false;
1453
while (is_whitespace (c));
1457
int c2 = phase1_getc ();
1461
is_dereference = true;
1465
while (is_whitespace (c));
1467
else if (c2 != '\n')
1469
/* Discarding the newline is harmless here. The only
1470
special character recognized after a minus is greater-than
1471
for dereference. However, the sequence "-\n>" that we
1472
treat incorrectly here, is a syntax error. */
1477
if (maybe_hash_value && is_dereference)
1480
fprintf (stderr, "%s:%d: first keys preceded by \"->\"\n",
1481
real_file_name, line_number);
1484
else if (maybe_hash_value)
1486
/* Fake it into a hash. */
1487
tp->string[0] = '%';
1490
/* Do NOT change that into else if (see above). */
1491
if ((maybe_hash_value || maybe_hash_deref) && c == '{')
1493
void *keyword_value;
1496
fprintf (stderr, "%s:%d: first keys preceded by '{'\n",
1497
real_file_name, line_number);
1500
if (find_entry (&keywords, tp->string, strlen (tp->string),
1501
&keyword_value) == 0)
1503
/* Extract a possible string from the key. Before proceeding
1504
we check whether the open curly is followed by a symbol and
1505
then by a right curly. */
1506
flag_context_list_iterator_ty context_iter =
1507
flag_context_list_iterator (
1508
flag_context_list_table_lookup (
1509
flag_context_list_table,
1510
tp->string, strlen (tp->string)));
1511
token_ty *t1 = x_perl_lex (mlp);
1514
fprintf (stderr, "%s:%d: extracting string key\n",
1515
real_file_name, line_number);
1518
if (t1->type == token_type_symbol
1519
|| t1->type == token_type_named_op)
1521
token_ty *t2 = x_perl_lex (mlp);
1522
if (t2->type == token_type_rbrace)
1524
flag_context_ty context;
1528
inherited_context (null_context,
1529
flag_context_list_iterator_advance (
1532
pos.line_number = line_number;
1533
pos.file_name = logical_file_name;
1535
xgettext_current_source_encoding = po_charset_utf8;
1536
remember_a_message (mlp, xstrdup (t1->string), context, &pos);
1537
xgettext_current_source_encoding = xgettext_global_source_encoding;
1549
if (extract_balanced (mlp, 1, token_type_rbrace,
1550
null_context, context_iter, 1, -1))
1565
/* Now consume "->", "[...]", and "{...}". */
1568
int c = phase2_getc ();
1575
fprintf (stderr, "%s:%d: extracting balanced '{' after varname\n",
1576
real_file_name, line_number);
1578
extract_balanced (mlp, 0, token_type_rbrace,
1579
null_context, null_context_list_iterator, -1, -1);
1584
fprintf (stderr, "%s:%d: extracting balanced '[' after varname\n",
1585
real_file_name, line_number);
1587
extract_balanced (mlp, 0, token_type_rbracket,
1588
null_context, null_context_list_iterator, -1, -1);
1592
c2 = phase1_getc ();
1596
fprintf (stderr, "%s:%d: another \"->\" after varname\n",
1597
real_file_name, line_number);
1601
else if (c2 != '\n')
1603
/* Discarding the newline is harmless here. The only
1604
special character recognized after a minus is greater-than
1605
for dereference. However, the sequence "-\n>" that we
1606
treat incorrectly here, is a syntax error. */
1613
fprintf (stderr, "%s:%d: variable finished\n",
1614
real_file_name, line_number);
1622
/* Actually a simplified version of extract_variable(). It searches for
1623
variables inside a double-quoted string that may interpolate to
1624
some keyword hash (reference). The string is UTF-8 encoded. */
1626
interpolate_keywords (message_list_ty *mlp, const char *string, int lineno)
1628
static char *buffer;
1629
static int bufmax = 0;
1631
flag_context_ty context;
1633
bool maybe_hash_deref = false;
1655
* one_dollar: dollar sign seen in state INITIAL
1656
* two_dollars: another dollar-sign has been seen in state ONE_DOLLAR
1657
* identifier: a valid identifier character has been seen in state
1658
* ONE_DOLLAR or TWO_DOLLARS
1659
* minus: a minus-sign has been seen in state IDENTIFIER
1660
* wait_lbrace: a greater-than has been seen in state MINUS
1661
* wait_quote: a left brace has been seen in state IDENTIFIER or in
1663
* dquote: a double-quote has been seen in state WAIT_QUOTE
1664
* squote: a single-quote has been seen in state WAIT_QUOTE
1665
* barekey: an bareword character has been seen in state WAIT_QUOTE
1666
* wait_rbrace: closing quote has been seen in state DQUOTE or SQUOTE
1668
* In the states initial...identifier the context is null_context; in the
1669
* states minus...wait_rbrace the context is the one suitable for the first
1670
* argument of the last seen identifier.
1673
context = null_context;
1675
token.type = token_type_string;
1676
token.sub_type = string_type_qq;
1677
token.line_number = line_number;
1678
pos.file_name = logical_file_name;
1679
pos.line_number = lineno;
1681
while ((c = (unsigned char) *string++) != '\0')
1683
void *keyword_value;
1685
if (state == initial)
1691
if (bufpos + 1 >= bufmax)
1693
bufmax = 2 * bufmax + 10;
1694
buffer = xrealloc (buffer, bufmax);
1703
c = (unsigned char) *string++;
1708
buffer[bufpos++] = '$';
1709
maybe_hash_deref = false;
1721
* This is enough to make us believe later that we dereference
1724
maybe_hash_deref = true;
1725
state = two_dollars;
1728
if (c == '_' || c == ':' || c == '\'' || c >= 0x80
1729
|| (c >= 'A' && c <= 'Z')
1730
|| (c >= 'a' && c <= 'z')
1731
|| (c >= '0' && c <= '9'))
1733
buffer[bufpos++] = c;
1742
if (c == '_' || c == ':' || c == '\'' || c >= 0x80
1743
|| (c >= 'A' && c <= 'Z')
1744
|| (c >= 'a' && c <= 'z')
1745
|| (c >= '0' && c <= '9'))
1747
buffer[bufpos++] = c;
1757
if (find_entry (&keywords, buffer, bufpos, &keyword_value) == 0)
1759
flag_context_list_iterator_ty context_iter =
1760
flag_context_list_iterator (
1761
flag_context_list_table_lookup (
1762
flag_context_list_table,
1765
inherited_context (null_context,
1766
flag_context_list_iterator_advance (
1774
if (!maybe_hash_deref)
1776
if (find_entry (&keywords, buffer, bufpos, &keyword_value) == 0)
1778
flag_context_list_iterator_ty context_iter =
1779
flag_context_list_iterator (
1780
flag_context_list_table_lookup (
1781
flag_context_list_table,
1784
inherited_context (null_context,
1785
flag_context_list_iterator_advance (
1793
if (c == '_' || c == ':' || c == '\'' || c >= 0x80
1794
|| (c >= 'A' && c <= 'Z')
1795
|| (c >= 'a' && c <= 'z')
1796
|| (c >= '0' && c <= '9'))
1798
buffer[bufpos++] = c;
1809
state = wait_lbrace;
1812
context = null_context;
1824
context = null_context;
1835
pos.line_number = lineno;
1840
pos.line_number = lineno;
1845
if (c == '_' || (c >= '0' && c <= '9') || c >= 0x80
1846
|| (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))
1848
pos.line_number = lineno;
1850
buffer[bufpos++] = c;
1855
context = null_context;
1865
/* The resulting string has to be interpolated twice. */
1866
buffer[bufpos] = '\0';
1867
token.string = xstrdup (buffer);
1868
extract_quotelike_pass3 (&token, EXIT_FAILURE);
1869
/* The string can only shrink with interpolation (because
1871
if (!(strlen (token.string) <= bufpos))
1873
strcpy (buffer, token.string);
1874
free (token.string);
1875
state = wait_rbrace;
1878
if (string[0] == '\"')
1880
buffer[bufpos++] = string++[0];
1884
buffer[bufpos++] = '\\';
1885
buffer[bufpos++] = string++[0];
1889
context = null_context;
1894
buffer[bufpos++] = c;
1902
state = wait_rbrace;
1905
if (string[0] == '\'')
1907
buffer[bufpos++] = string++[0];
1911
buffer[bufpos++] = '\\';
1912
buffer[bufpos++] = string++[0];
1916
context = null_context;
1921
buffer[bufpos++] = c;
1926
if (c == '_' || (c >= '0' && c <= '9') || c >= 0x80
1927
|| (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))
1929
buffer[bufpos++] = c;
1932
else if (is_whitespace (c))
1934
state = wait_rbrace;
1939
context = null_context;
1943
/* Must be right brace. */
1951
buffer[bufpos] = '\0';
1952
token.string = xstrdup (buffer);
1953
extract_quotelike_pass3 (&token, EXIT_FAILURE);
1954
xgettext_current_source_encoding = po_charset_utf8;
1955
remember_a_message (mlp, token.string, context, &pos);
1956
xgettext_current_source_encoding = xgettext_global_source_encoding;
1959
context = null_context;
1968
/* The last token seen in the token stream. This is important for the
1969
interpretation of '?' and '/'. */
1970
static token_type_ty last_token;
1972
/* Combine characters into tokens. Discard whitespace. */
1975
x_perl_prelex (message_list_ty *mlp, token_ty *tp)
1977
static char *buffer;
1985
tp->line_number = line_number;
1990
tp->type = token_type_eof;
1994
if (last_non_comment_line > last_comment_line)
1995
xgettext_comment_reset ();
1999
/* Ignore whitespace. */
2008
extract_variable (mlp, tp, c);
2009
prefer_division_over_regexp = true;
2015
last_non_comment_line = tp->line_number;
2021
int c2 = phase1_getc ();
2025
tp->type = token_type_other;
2026
prefer_division_over_regexp = false;
2029
else if (c2 >= '0' && c2 <= '9')
2031
prefer_division_over_regexp = false;
2035
tp->type = token_type_dot;
2036
prefer_division_over_regexp = true;
2041
case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
2042
case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
2043
case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
2044
case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
2047
case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
2048
case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
2049
case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
2050
case 's': case 't': case 'u': case 'v': case 'w': case 'x':
2052
case '0': case '1': case '2': case '3': case '4':
2053
case '5': case '6': case '7': case '8': case '9':
2054
/* Symbol, or part of a number. */
2055
prefer_division_over_regexp = true;
2059
if (bufpos >= bufmax)
2061
bufmax = 2 * bufmax + 10;
2062
buffer = xrealloc (buffer, bufmax);
2064
buffer[bufpos++] = c;
2068
case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
2069
case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
2070
case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
2071
case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
2074
case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
2075
case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
2076
case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
2077
case 's': case 't': case 'u': case 'v': case 'w': case 'x':
2079
case '0': case '1': case '2': case '3': case '4':
2080
case '5': case '6': case '7': case '8': case '9':
2089
if (bufpos >= bufmax)
2091
bufmax = 2 * bufmax + 10;
2092
buffer = xrealloc (buffer, bufmax);
2094
buffer[bufpos] = '\0';
2096
if (strcmp (buffer, "__END__") == 0
2097
|| strcmp (buffer, "__DATA__") == 0)
2100
tp->type = token_type_eof;
2103
else if (strcmp (buffer, "and") == 0
2104
|| strcmp (buffer, "cmp") == 0
2105
|| strcmp (buffer, "eq") == 0
2106
|| strcmp (buffer, "if") == 0
2107
|| strcmp (buffer, "ge") == 0
2108
|| strcmp (buffer, "gt") == 0
2109
|| strcmp (buffer, "le") == 0
2110
|| strcmp (buffer, "lt") == 0
2111
|| strcmp (buffer, "ne") == 0
2112
|| strcmp (buffer, "not") == 0
2113
|| strcmp (buffer, "or") == 0
2114
|| strcmp (buffer, "unless") == 0
2115
|| strcmp (buffer, "while") == 0
2116
|| strcmp (buffer, "xor") == 0)
2118
tp->type = token_type_named_op;
2119
tp->string = xstrdup (buffer);
2120
prefer_division_over_regexp = false;
2123
else if (strcmp (buffer, "s") == 0
2124
|| strcmp (buffer, "y") == 0
2125
|| strcmp (buffer, "tr") == 0)
2127
int delim = phase1_getc ();
2129
while (is_whitespace (delim))
2130
delim = phase2_getc ();
2134
tp->type = token_type_eof;
2137
if ((delim >= '0' && delim <= '9')
2138
|| (delim >= 'A' && delim <= 'Z')
2139
|| (delim >= 'a' && delim <= 'z'))
2141
/* False positive. */
2142
phase2_ungetc (delim);
2143
tp->type = token_type_symbol;
2144
tp->sub_type = symbol_type_none;
2145
tp->string = xstrdup (buffer);
2146
prefer_division_over_regexp = true;
2149
extract_triple_quotelike (mlp, tp, delim,
2150
buffer[0] == 's' && delim != '\'');
2152
/* Eat the following modifiers. */
2155
while (c >= 'a' && c <= 'z');
2159
else if (strcmp (buffer, "m") == 0)
2161
int delim = phase1_getc ();
2163
while (is_whitespace (delim))
2164
delim = phase2_getc ();
2168
tp->type = token_type_eof;
2171
if ((delim >= '0' && delim <= '9')
2172
|| (delim >= 'A' && delim <= 'Z')
2173
|| (delim >= 'a' && delim <= 'z'))
2175
/* False positive. */
2176
phase2_ungetc (delim);
2177
tp->type = token_type_symbol;
2178
tp->sub_type = symbol_type_none;
2179
tp->string = xstrdup (buffer);
2180
prefer_division_over_regexp = true;
2183
extract_quotelike (tp, delim);
2185
interpolate_keywords (mlp, tp->string, line_number);
2187
tp->type = token_type_regex_op;
2188
prefer_division_over_regexp = true;
2190
/* Eat the following modifiers. */
2193
while (c >= 'a' && c <= 'z');
2197
else if (strcmp (buffer, "qq") == 0
2198
|| strcmp (buffer, "q") == 0
2199
|| strcmp (buffer, "qx") == 0
2200
|| strcmp (buffer, "qw") == 0
2201
|| strcmp (buffer, "qr") == 0)
2203
/* The qw (...) construct is not really a string but we
2204
can treat in the same manner and then pretend it is
2205
a symbol. Rationale: Saying "qw (foo bar)" is the
2206
same as "my @list = ('foo', 'bar'); @list;". */
2208
int delim = phase1_getc ();
2210
while (is_whitespace (delim))
2211
delim = phase2_getc ();
2215
tp->type = token_type_eof;
2218
prefer_division_over_regexp = true;
2220
if ((delim >= '0' && delim <= '9')
2221
|| (delim >= 'A' && delim <= 'Z')
2222
|| (delim >= 'a' && delim <= 'z'))
2224
/* False positive. */
2225
phase2_ungetc (delim);
2226
tp->type = token_type_symbol;
2227
tp->sub_type = symbol_type_none;
2228
tp->string = xstrdup (buffer);
2229
prefer_division_over_regexp = true;
2233
extract_quotelike (tp, delim);
2239
tp->type = token_type_string;
2240
tp->sub_type = string_type_qq;
2241
interpolate_keywords (mlp, tp->string, line_number);
2244
tp->type = token_type_regex_op;
2247
tp->type = token_type_symbol;
2248
tp->sub_type = symbol_type_none;
2251
tp->type = token_type_string;
2252
tp->sub_type = string_type_q;
2259
else if (strcmp (buffer, "grep") == 0
2260
|| strcmp (buffer, "split") == 0)
2262
prefer_division_over_regexp = false;
2264
tp->type = token_type_symbol;
2265
tp->sub_type = (strcmp (buffer, "sub") == 0
2267
: symbol_type_none);
2268
tp->string = xstrdup (buffer);
2272
prefer_division_over_regexp = true;
2273
extract_quotelike (tp, c);
2274
tp->sub_type = string_type_qq;
2275
interpolate_keywords (mlp, tp->string, line_number);
2279
prefer_division_over_regexp = true;
2280
extract_quotelike (tp, c);
2281
tp->sub_type = string_type_qq;
2282
interpolate_keywords (mlp, tp->string, line_number);
2286
prefer_division_over_regexp = true;
2287
extract_quotelike (tp, c);
2288
tp->sub_type = string_type_q;
2294
/* Ignore empty list. */
2298
tp->type = token_type_lparen;
2299
prefer_division_over_regexp = false;
2303
tp->type = token_type_rparen;
2304
prefer_division_over_regexp = true;
2308
tp->type = token_type_lbrace;
2309
prefer_division_over_regexp = false;
2313
tp->type = token_type_rbrace;
2314
prefer_division_over_regexp = false;
2318
tp->type = token_type_lbracket;
2319
prefer_division_over_regexp = false;
2323
tp->type = token_type_rbracket;
2324
prefer_division_over_regexp = false;
2328
tp->type = token_type_semicolon;
2329
prefer_division_over_regexp = false;
2333
tp->type = token_type_comma;
2334
prefer_division_over_regexp = false;
2338
/* Check for fat comma. */
2342
tp->type = token_type_fat_comma;
2345
else if (linepos == 2
2346
&& (last_token == token_type_semicolon
2347
|| last_token == token_type_rbrace)
2348
&& ((c >= 'A' && c <='Z')
2349
|| (c >= 'a' && c <= 'z')))
2352
fprintf (stderr, "%s:%d: start pod section\n",
2353
real_file_name, line_number);
2357
fprintf (stderr, "%s:%d: end pod section\n",
2358
real_file_name, line_number);
2363
tp->type = token_type_other;
2364
prefer_division_over_regexp = false;
2368
/* Check for <<EOF and friends. */
2369
prefer_division_over_regexp = false;
2377
extract_quotelike (tp, c);
2378
string = get_here_document (tp->string);
2380
tp->string = string;
2381
tp->type = token_type_string;
2382
tp->sub_type = string_type_verbatim;
2383
tp->line_number = line_number + 1;
2389
extract_quotelike (tp, c);
2390
string = get_here_document (tp->string);
2392
tp->string = string;
2393
tp->type = token_type_string;
2394
tp->sub_type = string_type_qq;
2395
tp->line_number = line_number + 1;
2396
interpolate_keywords (mlp, tp->string, line_number + 1);
2399
else if ((c >= 'A' && c <= 'Z')
2400
|| (c >= 'a' && c <= 'z')
2404
while ((c >= 'A' && c <= 'Z')
2405
|| (c >= 'a' && c <= 'z')
2406
|| (c >= '0' && c <= '9')
2407
|| c == '_' || c >= 0x80)
2409
if (bufpos >= bufmax)
2411
bufmax = 2 * bufmax + 10;
2412
buffer = xrealloc (buffer, bufmax);
2414
buffer[bufpos++] = c;
2419
tp->type = token_type_eof;
2426
if (bufpos >= bufmax)
2428
bufmax = 2 * bufmax + 10;
2429
buffer = xrealloc (buffer, bufmax);
2431
buffer[bufpos++] = '\0';
2432
string = get_here_document (buffer);
2433
tp->string = string;
2434
tp->type = token_type_string;
2435
tp->sub_type = string_type_qq;
2436
tp->line_number = line_number + 1;
2437
interpolate_keywords (mlp, tp->string, line_number + 1);
2443
tp->type = token_type_other;
2450
tp->type = token_type_other;
2452
return; /* End of case '>'. */
2455
/* Check for dereferencing operator. */
2459
tp->type = token_type_dereference;
2463
tp->type = token_type_other;
2464
prefer_division_over_regexp = false;
2469
if (!prefer_division_over_regexp)
2471
extract_quotelike (tp, c);
2472
interpolate_keywords (mlp, tp->string, line_number);
2474
tp->type = token_type_other;
2475
prefer_division_over_regexp = true;
2476
/* Eat the following modifiers. */
2479
while (c >= 'a' && c <= 'z');
2486
/* We could carefully recognize each of the 2 and 3 character
2487
operators, but it is not necessary, as we only need to recognize
2488
gettext invocations. Don't bother. */
2489
tp->type = token_type_other;
2490
prefer_division_over_regexp = false;
2497
/* A token stack used as a lookahead buffer. */
2499
typedef struct token_stack_ty token_stack_ty;
2500
struct token_stack_ty
2507
static struct token_stack_ty token_stack;
2510
/* Dumps all resources allocated by stack STACK. */
2512
token_stack_dump (token_stack_ty *stack)
2516
fprintf (stderr, "BEGIN STACK DUMP\n");
2517
for (i = 0; i < stack->nitems; i++)
2519
token_ty *token = stack->items[i];
2520
fprintf (stderr, " [%s]\n", token2string (token));
2521
switch (token->type)
2523
case token_type_named_op:
2524
case token_type_string:
2525
case token_type_symbol:
2526
case token_type_variable:
2527
fprintf (stderr, " string: %s\n", token->string);
2531
fprintf (stderr, "END STACK DUMP\n");
2536
/* Pushes the token TOKEN onto the stack STACK. */
2538
token_stack_push (token_stack_ty *stack, token_ty *token)
2540
if (stack->nitems >= stack->nitems_max)
2544
stack->nitems_max = 2 * stack->nitems_max + 4;
2545
nbytes = stack->nitems_max * sizeof (token_ty *);
2546
stack->items = xrealloc (stack->items, nbytes);
2548
stack->items[stack->nitems++] = token;
2551
/* Pops the most recently pushed token from the stack STACK and returns it.
2552
Returns NULL if the stack is empty. */
2553
static inline token_ty *
2554
token_stack_pop (token_stack_ty *stack)
2556
if (stack->nitems > 0)
2557
return stack->items[--(stack->nitems)];
2562
/* Return the top of the stack without removing it from the stack, or
2563
NULL if the stack is empty. */
2564
static inline token_ty *
2565
token_stack_peek (const token_stack_ty *stack)
2567
if (stack->nitems > 0)
2568
return stack->items[stack->nitems - 1];
2573
/* Frees all resources allocated by stack STACK. */
2575
token_stack_free (token_stack_ty *stack)
2579
for (i = 0; i < stack->nitems; i++)
2580
free_token (stack->items[i]);
2581
free (stack->items);
2586
x_perl_lex (message_list_ty *mlp)
2589
int dummy = token_stack_dump (&token_stack);
2591
token_ty *tp = token_stack_pop (&token_stack);
2595
tp = (token_ty *) xmalloc (sizeof (token_ty));
2596
x_perl_prelex (mlp, tp);
2598
fprintf (stderr, "%s:%d: x_perl_prelex returned %s\n",
2599
real_file_name, line_number, token2string (tp));
2605
fprintf (stderr, "%s:%d: %s recycled from stack\n",
2606
real_file_name, line_number, token2string (tp));
2610
/* A symbol followed by a fat comma is really a single-quoted string.
2611
Function definitions or forward declarations also need a special
2612
handling because the dollars and at signs inside the parentheses
2613
must not be interpreted as the beginning of a variable ')'. */
2614
if (tp->type == token_type_symbol || tp->type == token_type_named_op)
2616
token_ty *next = token_stack_peek (&token_stack);
2621
fprintf (stderr, "%s:%d: pre-fetching next token\n",
2622
real_file_name, line_number);
2624
next = x_perl_lex (mlp);
2625
x_perl_unlex (next);
2627
fprintf (stderr, "%s:%d: unshifted next token\n",
2628
real_file_name, line_number);
2633
fprintf (stderr, "%s:%d: next token is %s\n",
2634
real_file_name, line_number, token2string (next));
2637
if (next->type == token_type_fat_comma)
2639
tp->type = token_type_string;
2640
tp->sub_type = string_type_q;
2643
"%s:%d: token %s mutated to token_type_string\n",
2644
real_file_name, line_number, token2string (tp));
2647
else if (tp->type == token_type_symbol && tp->sub_type == symbol_type_sub
2648
&& next->type == token_type_symbol)
2650
/* Start of a function declaration or definition. Mark this
2651
symbol as a function name, so that we can later eat up
2652
possible prototype information. */
2654
fprintf (stderr, "%s:%d: subroutine declaration/definition '%s'\n",
2655
real_file_name, line_number, next->string);
2657
next->sub_type = symbol_type_function;
2659
else if (tp->type == token_type_symbol
2660
&& (tp->sub_type == symbol_type_sub
2661
|| tp->sub_type == symbol_type_function)
2662
&& next->type == token_type_lparen)
2664
/* For simplicity we simply consume everything up to the
2665
closing parenthesis. Actually only a limited set of
2666
characters is allowed inside parentheses but we leave
2667
complaints to the interpreter and are prepared for
2668
future extensions to the Perl syntax. */
2672
fprintf (stderr, "%s:%d: consuming prototype information\n",
2673
real_file_name, line_number);
2680
fprintf (stderr, " consuming character '%c'\n", c);
2683
while (c != EOF && c != ')');
2692
x_perl_unlex (token_ty *tp)
2694
token_stack_push (&token_stack, tp);
2698
/* ========================= Extracting strings. ========================== */
2700
/* Assuming TP is a string token, this function accumulates all subsequent
2701
. string2 . string3 ... to the string. (String concatenation.) */
2704
collect_message (message_list_ty *mlp, token_ty *tp, int error_level)
2709
extract_quotelike_pass3 (tp, error_level);
2710
string = xstrdup (tp->string);
2711
len = strlen (tp->string) + 1;
2719
while (is_whitespace (c));
2729
while (is_whitespace (c));
2733
if (c == '"' || c == '\'' || c == '`'
2734
|| (!prefer_division_over_regexp && (c == '/' || c == '?'))
2737
token_ty *qstring = x_perl_lex (mlp);
2738
if (qstring->type != token_type_string)
2740
/* assert (qstring->type == token_type_symbol) */
2741
x_perl_unlex (qstring);
2745
extract_quotelike_pass3 (qstring, error_level);
2746
len += strlen (qstring->string);
2747
string = xrealloc (string, len);
2748
strcat (string, qstring->string);
2749
free_token (qstring);
2754
/* The file is broken into tokens. Scan the token stream, looking for
2755
a keyword, followed by a left paren, followed by a string. When we
2756
see this sequence, we have something to remember. We assume we are
2757
looking at a valid C or C++ program, and leave the complaints about
2758
the grammar to the compiler.
2760
Normal handling: Look for
2761
keyword ( ... msgid ... )
2762
Plural handling: Look for
2763
keyword ( ... msgid ... msgid_plural ... )
2765
We use recursion because the arguments before msgid or between msgid
2766
and msgid_plural can contain subexpressions of the same form. */
2768
/* Extract messages until the next balanced closing parenthesis.
2769
Extracted messages are added to MLP.
2771
When specific arguments shall be extracted, ARG_SG and ARG_PL are
2772
set to the corresponding argument number or -1 if not applicable.
2774
Returns true for EOF, false otherwise.
2779
1 - keyword has been seen
2780
2 - extractable string has been seen
2781
3 - a dot operator after an extractable string has been seen
2783
States 2 and 3 are "fragile", the parser will remain in state 2
2784
as long as only opening parentheses are seen, a transition to
2785
state 3 is done on appearance of a dot operator, all other tokens
2786
will cause the parser to fall back to state 1 or 0, eventually
2787
with an error message about invalid intermixing of constant and
2788
non-constant strings.
2790
Likewise, state 3 is fragile. The parser will remain in state 3
2791
as long as only closing parentheses are seen, a transition to state
2792
2 is done on appearance of another (literal!) string, all other
2793
tokens will cause a warning. */
2796
extract_balanced (message_list_ty *mlp, int state, token_type_ty delim,
2797
flag_context_ty outer_context,
2798
flag_context_list_iterator_ty context_iter,
2799
int arg_sg, int arg_pl)
2801
/* Remember the message containing the msgid, for msgid_plural. */
2802
message_ty *plural_mp = NULL;
2804
/* The current argument for a possibly extracted keyword. Counting
2808
/* Number of left parentheses seen. */
2811
/* Whether to implicitly assume the next tokens are arguments even without
2813
bool next_is_argument = false;
2815
/* Context iterator that will be used if the next token is a '('. */
2816
flag_context_list_iterator_ty next_context_iter =
2817
passthrough_context_list_iterator;
2818
/* Current context. */
2819
flag_context_ty inner_context =
2820
inherited_context (outer_context,
2821
flag_context_list_iterator_advance (&context_iter));
2824
static int nesting_level = 0;
2829
last_token = token_type_semicolon; /* Safe assumption. */
2830
prefer_division_over_regexp = false;
2834
int my_last_token = last_token;
2835
/* The current token. */
2838
tp = x_perl_lex (mlp);
2840
last_token = tp->type;
2842
if (delim == tp->type)
2845
fprintf (stderr, "%s:%d: extract_balanced finished (%d)\n",
2846
logical_file_name, tp->line_number, --nesting_level);
2852
if (next_is_argument && tp->type != token_type_lparen)
2854
/* An argument list starts, even though there is no '('. */
2855
context_iter = next_context_iter;
2856
outer_context = inner_context;
2858
inherited_context (outer_context,
2859
flag_context_list_iterator_advance (
2865
case token_type_symbol:
2867
fprintf (stderr, "%s:%d: type symbol (%d) \"%s\"\n",
2868
logical_file_name, tp->line_number, nesting_level,
2873
void *keyword_value;
2875
if (find_entry (&keywords, tp->string, strlen (tp->string),
2876
&keyword_value) == 0)
2878
last_token = token_type_keyword_symbol;
2880
arg_sg = (int) (long) keyword_value & ((1 << 10) - 1);
2881
arg_pl = (int) (long) keyword_value >> 10;
2887
next_is_argument = true;
2889
flag_context_list_iterator (
2890
flag_context_list_table_lookup (
2891
flag_context_list_table,
2892
tp->string, strlen (tp->string)));
2895
case token_type_variable:
2897
fprintf (stderr, "%s:%d: type variable (%d) \"%s\"\n",
2898
logical_file_name, tp->line_number, nesting_level, tp->string);
2900
prefer_division_over_regexp = true;
2901
next_is_argument = false;
2902
next_context_iter = null_context_list_iterator;
2905
case token_type_lparen:
2907
fprintf (stderr, "%s:%d: type left parentheses (%d)\n",
2908
logical_file_name, tp->line_number, nesting_level);
2912
if (extract_balanced (mlp, state, token_type_rparen,
2913
inner_context, next_context_iter,
2914
arg_sg - arg_count + 1, arg_pl - arg_count + 1))
2919
if (my_last_token == token_type_keyword_symbol)
2920
arg_sg = arg_pl = -1;
2921
next_is_argument = false;
2922
next_context_iter = null_context_list_iterator;
2925
case token_type_rparen:
2927
fprintf (stderr, "%s:%d: type right parentheses(%d)\n",
2928
logical_file_name, tp->line_number, nesting_level);
2931
next_is_argument = false;
2932
next_context_iter = null_context_list_iterator;
2935
case token_type_comma:
2936
case token_type_fat_comma:
2938
fprintf (stderr, "%s:%d: type comma (%d)\n",
2939
logical_file_name, tp->line_number, nesting_level);
2942
if (arg_count > arg_sg && arg_count > arg_pl)
2944
/* We have missed the argument. */
2945
arg_sg = arg_pl = -1;
2949
fprintf (stderr, "%s:%d: arg_count: %d, arg_sg: %d, arg_pl: %d\n",
2950
real_file_name, tp->line_number,
2951
arg_count, arg_sg, arg_pl);
2954
inherited_context (outer_context,
2955
flag_context_list_iterator_advance (
2957
next_is_argument = false;
2958
next_context_iter = passthrough_context_list_iterator;
2961
case token_type_string:
2963
fprintf (stderr, "%s:%d: type string (%d): \"%s\"\n",
2964
logical_file_name, tp->line_number, nesting_level,
2973
pos.file_name = logical_file_name;
2974
pos.line_number = tp->line_number;
2975
string = collect_message (mlp, tp, EXIT_SUCCESS);
2976
xgettext_current_source_encoding = po_charset_utf8;
2977
remember_a_message (mlp, string, inner_context, &pos);
2978
xgettext_current_source_encoding = xgettext_global_source_encoding;
2985
pos.file_name = logical_file_name;
2986
pos.line_number = tp->line_number;
2988
if (arg_count == arg_sg)
2990
string = collect_message (mlp, tp, EXIT_FAILURE);
2991
xgettext_current_source_encoding = po_charset_utf8;
2992
plural_mp = remember_a_message (mlp, string, inner_context, &pos);
2993
xgettext_current_source_encoding = xgettext_global_source_encoding;
2996
else if (arg_count == arg_pl)
2998
if (plural_mp == NULL)
2999
error (EXIT_FAILURE, 0, _("\
3000
%s:%d: fatal: plural message seen before singular message\n"),
3001
real_file_name, tp->line_number);
3003
string = collect_message (mlp, tp, EXIT_FAILURE);
3004
xgettext_current_source_encoding = po_charset_utf8;
3005
remember_a_message_plural (plural_mp, string, inner_context, &pos);
3006
xgettext_current_source_encoding = xgettext_global_source_encoding;
3011
if (arg_sg == -1 && arg_pl == -1)
3017
next_is_argument = false;
3018
next_context_iter = null_context_list_iterator;
3021
case token_type_eof:
3023
fprintf (stderr, "%s:%d: type EOF (%d)\n",
3024
logical_file_name, tp->line_number, nesting_level);
3029
case token_type_lbrace:
3031
fprintf (stderr, "%s:%d: type lbrace (%d)\n",
3032
logical_file_name, tp->line_number, nesting_level);
3034
if (extract_balanced (mlp, 0, token_type_rbrace,
3035
null_context, null_context_list_iterator,
3041
next_is_argument = false;
3042
next_context_iter = null_context_list_iterator;
3045
case token_type_rbrace:
3047
fprintf (stderr, "%s:%d: type rbrace (%d)\n",
3048
logical_file_name, tp->line_number, nesting_level);
3050
next_is_argument = false;
3051
next_context_iter = null_context_list_iterator;
3055
case token_type_lbracket:
3057
fprintf (stderr, "%s:%d: type lbracket (%d)\n",
3058
logical_file_name, tp->line_number, nesting_level);
3060
if (extract_balanced (mlp, 0, token_type_rbracket,
3061
null_context, null_context_list_iterator,
3067
next_is_argument = false;
3068
next_context_iter = null_context_list_iterator;
3071
case token_type_rbracket:
3073
fprintf (stderr, "%s:%d: type rbracket (%d)\n",
3074
logical_file_name, tp->line_number, nesting_level);
3076
next_is_argument = false;
3077
next_context_iter = null_context_list_iterator;
3081
case token_type_semicolon:
3083
fprintf (stderr, "%s:%d: type semicolon (%d)\n",
3084
logical_file_name, tp->line_number, nesting_level);
3088
/* The ultimate sign. */
3089
arg_sg = arg_pl = -1;
3091
/* FIXME: Instead of resetting outer_context here, it may be better
3092
to recurse in the next_is_argument handling above, waiting for
3093
the next semicolon or other statement terminator. */
3094
outer_context = null_context;
3095
context_iter = null_context_list_iterator;
3096
next_is_argument = false;
3097
next_context_iter = passthrough_context_list_iterator;
3099
inherited_context (outer_context,
3100
flag_context_list_iterator_advance (
3104
case token_type_dereference:
3106
fprintf (stderr, "%s:%d: type dereference (%d)\n",
3107
logical_file_name, tp->line_number, nesting_level);
3109
next_is_argument = false;
3110
next_context_iter = null_context_list_iterator;
3113
case token_type_dot:
3115
fprintf (stderr, "%s:%d: type dot (%d)\n",
3116
logical_file_name, tp->line_number, nesting_level);
3118
next_is_argument = false;
3119
next_context_iter = null_context_list_iterator;
3123
case token_type_named_op:
3125
fprintf (stderr, "%s:%d: type named operator (%d): %s\n",
3126
logical_file_name, tp->line_number, nesting_level,
3129
next_is_argument = false;
3130
next_context_iter = null_context_list_iterator;
3134
case token_type_regex_op:
3136
fprintf (stderr, "%s:%d: type regex operator (%d)\n",
3137
logical_file_name, tp->line_number, nesting_level);
3139
next_is_argument = false;
3140
next_context_iter = null_context_list_iterator;
3143
case token_type_other:
3145
fprintf (stderr, "%s:%d: type other (%d)\n",
3146
logical_file_name, tp->line_number, nesting_level);
3148
next_is_argument = false;
3149
next_context_iter = null_context_list_iterator;
3154
fprintf (stderr, "%s:%d: unknown token type %d\n",
3155
real_file_name, tp->line_number, tp->type);
3164
extract_perl (FILE *f, const char *real_filename, const char *logical_filename,
3165
flag_context_list_table_ty *flag_table,
3166
msgdomain_list_ty *mdlp)
3168
message_list_ty *mlp = mdlp->item[0]->messages;
3171
real_file_name = real_filename;
3172
logical_file_name = xstrdup (logical_filename);
3175
last_comment_line = -1;
3176
last_non_comment_line = -1;
3178
flag_context_list_table = flag_table;
3182
token_stack.items = NULL;
3183
token_stack.nitems = 0;
3184
token_stack.nitems_max = 0;
3188
end_of_file = false;
3190
/* Eat tokens until eof is seen. When extract_balanced returns
3191
due to an unbalanced closing brace, just restart it. */
3192
while (!extract_balanced (mlp, 0, token_type_rbrace,
3193
null_context, null_context_list_iterator,
3198
real_file_name = NULL;
3199
free (logical_file_name);
3200
logical_file_name = NULL;
3202
last_token = token_type_semicolon;
3203
token_stack_free (&token_stack);