1
/* xgettext C/C++/ObjectiveC backend.
2
Copyright (C) 1995-1998, 2000-2003 Free Software Foundation, Inc.
4
This file was written by Peter Miller <millerp@canb.auug.org.au>
6
This program is free software; you can redistribute it and/or modify
7
it under the terms of the GNU General Public License as published by
8
the Free Software Foundation; either version 2, or (at your option)
11
This program is distributed in the hope that it will be useful,
12
but WITHOUT ANY WARRANTY; without even the implied warranty of
13
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
GNU General Public License for more details.
16
You should have received a copy of the GNU General Public License
17
along with this program; if not, write to the Free Software Foundation,
18
Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
34
#include "error-progname.h"
40
/* Shorthand used around user-visible messages: _(s) expands to gettext(s). */
#define _(s) gettext(s)
42
/* Number of elements in the array A.  A must be a real array object (or
   an array-valued expression), not a pointer: for a pointer the result is
   meaningless.  The argument is parenthesized before indexing so that the
   expansion does not depend on the precedence of operators inside A.  */
#define SIZEOF(a) (sizeof (a) / sizeof ((a)[0]))
45
/* The ANSI C standard defines several phases of translation:
47
1. Terminate line by \n, regardless of the external representation
48
of a text line. Stdio does this for us.
50
2. Convert trigraphs to their single character equivalents.
52
3. Concatenate each line ending in backslash (\) with the following
55
4. Replace each comment with a space character.
57
5. Parse each resulting logical line as preprocessing tokens and white space.
60
6. Recognize and carry out directives (it also expands macros on
61
non-directive lines, which we do not do here).
63
7. Replaces escape sequences within character strings with their
64
single character equivalents (we do this in step 5, because we
65
don't have to worry about the #include argument).
67
8. Concatenates adjacent string literals to form single string
68
literals (because we don't expand macros, there are a few things
71
9. Converts the remaining preprocessing tokens to C tokens and
72
discards any white space from the translation unit.
74
This lexer implements the above, and presents the scanner (in
75
xgettext.c) with a stream of C tokens. The comments are
76
accumulated in a buffer, and given to xgettext when asked for. */
79
/* ========================= Lexer customization. ========================= */
81
static bool trigraphs = false;
90
/* ====================== Keyword set customization. ====================== */
92
/* If true extract all strings. */
93
static bool extract_all = false;
95
static hash_table c_keywords;
96
static hash_table objc_keywords;
97
static bool default_keywords = true;
108
add_keyword (const char *name, hash_table *keywords)
111
default_keywords = false;
119
if (keywords->table == NULL)
120
init_hash (keywords, 100);
122
split_keywordspec (name, &end, &argnum1, &argnum2);
124
/* The characters between name and end should form a valid C identifier.
125
A colon means an invalid parse in split_keywordspec(). */
126
colon = strchr (name, ':');
127
if (colon == NULL || colon >= end)
131
insert_entry (keywords, name, end - name,
132
(void *) (long) (argnum1 + (argnum2 << 10)));
138
x_c_keyword (const char *name)
140
add_keyword (name, &c_keywords);
144
x_objc_keyword (const char *name)
146
add_keyword (name, &objc_keywords);
149
/* Finish initializing the keywords hash tables.
150
Called after argument processing, before each file is processed. */
154
if (default_keywords)
156
x_c_keyword ("gettext");
157
x_c_keyword ("dgettext:2");
158
x_c_keyword ("dcgettext:2");
159
x_c_keyword ("ngettext:1,2");
160
x_c_keyword ("dngettext:2,3");
161
x_c_keyword ("dcngettext:2,3");
162
x_c_keyword ("gettext_noop");
164
x_objc_keyword ("gettext");
165
x_objc_keyword ("dgettext:2");
166
x_objc_keyword ("dcgettext:2");
167
x_objc_keyword ("ngettext:1,2");
168
x_objc_keyword ("dngettext:2,3");
169
x_objc_keyword ("dcngettext:2,3");
170
x_objc_keyword ("gettext_noop");
171
x_objc_keyword ("NSLocalizedString"); /* similar to gettext */
172
x_objc_keyword ("_"); /* similar to gettext */
173
x_objc_keyword ("NSLocalizedStaticString"); /* similar to gettext_noop */
174
x_objc_keyword ("__"); /* similar to gettext_noop */
176
default_keywords = false;
183
xgettext_record_flag ("gettext:1:pass-c-format");
184
xgettext_record_flag ("dgettext:2:pass-c-format");
185
xgettext_record_flag ("dcgettext:2:pass-c-format");
186
xgettext_record_flag ("ngettext:1:pass-c-format");
187
xgettext_record_flag ("ngettext:2:pass-c-format");
188
xgettext_record_flag ("dngettext:2:pass-c-format");
189
xgettext_record_flag ("dngettext:3:pass-c-format");
190
xgettext_record_flag ("dcngettext:2:pass-c-format");
191
xgettext_record_flag ("dcngettext:3:pass-c-format");
192
xgettext_record_flag ("gettext_noop:1:pass-c-format");
194
xgettext_record_flag ("fprintf:2:c-format");
195
xgettext_record_flag ("vfprintf:2:c-format");
196
xgettext_record_flag ("printf:1:c-format");
197
xgettext_record_flag ("vprintf:1:c-format");
198
xgettext_record_flag ("sprintf:2:c-format");
199
xgettext_record_flag ("vsprintf:2:c-format");
200
xgettext_record_flag ("snprintf:3:c-format");
201
xgettext_record_flag ("vsnprintf:3:c-format");
202
#if 0 /* These functions are not standard. */
204
xgettext_record_flag ("asprintf:2:c-format");
205
xgettext_record_flag ("vasprintf:2:c-format");
206
xgettext_record_flag ("dprintf:2:c-format");
207
xgettext_record_flag ("vdprintf:2:c-format");
208
xgettext_record_flag ("obstack_printf:2:c-format");
209
xgettext_record_flag ("obstack_vprintf:2:c-format");
211
xgettext_record_flag ("error:3:c-format");
212
xgettext_record_flag ("error_at_line:5:c-format");
214
xgettext_record_flag ("argp_error:2:c-format");
215
xgettext_record_flag ("argp_failure:2:c-format");
220
init_flag_table_objc ()
222
/* Since the settings done in init_flag_table_c() also have an effect for
223
the ObjectiveC parser, we don't have to repeat them here. */
224
xgettext_record_flag ("gettext:1:pass-objc-format");
225
xgettext_record_flag ("dgettext:2:pass-objc-format");
226
xgettext_record_flag ("dcgettext:2:pass-objc-format");
227
xgettext_record_flag ("ngettext:1:pass-objc-format");
228
xgettext_record_flag ("ngettext:2:pass-objc-format");
229
xgettext_record_flag ("dngettext:2:pass-objc-format");
230
xgettext_record_flag ("dngettext:3:pass-objc-format");
231
xgettext_record_flag ("dcngettext:2:pass-objc-format");
232
xgettext_record_flag ("dcngettext:3:pass-objc-format");
233
xgettext_record_flag ("gettext_noop:1:pass-objc-format");
234
xgettext_record_flag ("NSLocalizedString:1:pass-c-format");
235
xgettext_record_flag ("NSLocalizedString:1:pass-objc-format");
236
xgettext_record_flag ("_:1:pass-c-format");
237
xgettext_record_flag ("_:1:pass-objc-format");
238
xgettext_record_flag ("stringWithFormat::1:objc-format");
239
xgettext_record_flag ("initWithFormat::1:objc-format");
240
xgettext_record_flag ("stringByAppendingFormat::1:objc-format");
241
xgettext_record_flag ("localizedStringWithFormat::1:objc-format");
242
xgettext_record_flag ("appendFormat::1:objc-format");
246
init_flag_table_gcc_internal ()
248
xgettext_record_flag ("gettext:1:pass-gcc-internal-format");
249
xgettext_record_flag ("dgettext:2:pass-gcc-internal-format");
250
xgettext_record_flag ("dcgettext:2:pass-gcc-internal-format");
251
xgettext_record_flag ("ngettext:1:pass-gcc-internal-format");
252
xgettext_record_flag ("ngettext:2:pass-gcc-internal-format");
253
xgettext_record_flag ("dngettext:2:pass-gcc-internal-format");
254
xgettext_record_flag ("dngettext:3:pass-gcc-internal-format");
255
xgettext_record_flag ("dcngettext:2:pass-gcc-internal-format");
256
xgettext_record_flag ("dcngettext:3:pass-gcc-internal-format");
257
xgettext_record_flag ("gettext_noop:1:pass-gcc-internal-format");
258
#if 0 /* This should better be done inside GCC. */
259
/* grepping for ATTRIBUTE_PRINTF in gcc-3.3/gcc/?*.h */
261
xgettext_record_flag ("status_warning:2:gcc-internal-format");
263
xgettext_record_flag ("pedwarn_c99:1:pass-gcc-internal-format");
265
//xgettext_record_flag ("error:1:c-format"); // 3 different versions
266
xgettext_record_flag ("notice:1:c-format");
267
//xgettext_record_flag ("fatal:1:c-format"); // 2 different versions
268
xgettext_record_flag ("fatal_perror:1:c-format");
270
xgettext_record_flag ("cpp_error:3:c-format");
271
xgettext_record_flag ("cpp_error_with_line:5:c-format");
273
xgettext_record_flag ("diagnostic_set_info:2:pass-gcc-internal-format");
274
xgettext_record_flag ("output_printf:2:gcc-internal-format");
275
xgettext_record_flag ("output_verbatim:2:pass-gcc-internal-format");
276
xgettext_record_flag ("verbatim:1:gcc-internal-format");
277
xgettext_record_flag ("inform:1:pass-gcc-internal-format");
279
//xgettext_record_flag ("fatal:1:c-format"); // 2 different versions
280
//xgettext_record_flag ("error:1:c-format"); // 3 different versions
282
xgettext_record_flag ("attr_printf:2:pass-c-format");
284
xgettext_record_flag ("error_at_line:2:pass-c-format");
285
xgettext_record_flag ("xvasprintf:2:pass-c-format");
286
xgettext_record_flag ("xasprintf:1:pass-c-format");
287
xgettext_record_flag ("oprintf:2:pass-c-format");
289
xgettext_record_flag ("message_with_line:2:pass-c-format");
291
xgettext_record_flag ("output_operand_lossage:1:c-format");
293
xgettext_record_flag ("ra_debug_msg:2:pass-c-format");
295
xgettext_record_flag ("fnotice:2:c-format");
296
xgettext_record_flag ("fatal_io_error:2:gcc-internal-format");
297
xgettext_record_flag ("error_for_asm:2:pass-gcc-internal-format");
298
xgettext_record_flag ("warning_for_asm:2:pass-gcc-internal-format");
299
xgettext_record_flag ("error_with_file_and_line:3:pass-gcc-internal-format");
300
xgettext_record_flag ("error_with_decl:2:pass-gcc-internal-format");
301
xgettext_record_flag ("pedwarn:1:gcc-internal-format");
302
xgettext_record_flag ("pedwarn_with_file_and_line:3:gcc-internal-format");
303
xgettext_record_flag ("pedwarn_with_decl:2:gcc-internal-format");
304
xgettext_record_flag ("sorry:1:gcc-internal-format");
305
xgettext_record_flag ("error:1:pass-gcc-internal-format");
306
xgettext_record_flag ("fatal_error:1:pass-gcc-internal-format");
307
xgettext_record_flag ("internal_error:1:pass-gcc-internal-format");
308
xgettext_record_flag ("warning:1:pass-gcc-internal-format");
309
xgettext_record_flag ("warning_with_file_and_line:3:pass-gcc-internal-format");
310
xgettext_record_flag ("warning_with_decl:2:pass-gcc-internal-format");
312
xgettext_record_flag ("ffecom_get_invented_identifier:1:pass-c-format");
314
xgettext_record_flag ("ffests_printf:2:pass-c-format");
315
/* java/java-tree.h */
316
xgettext_record_flag ("parse_error_context:2:pass-c-format");
321
/* ======================== Reading of characters. ======================== */
323
/* Real filename, used in error messages about the input file. */
324
static const char *real_file_name;
326
/* Logical filename and line number, used to label the extracted messages. */
327
static char *logical_file_name;
328
static int line_number;
330
/* The input file stream. */
334
/* 0. Terminate line by \n, regardless whether the external representation of
335
a line terminator is LF (Unix), CR (Mac) or CR/LF (DOS/Windows).
336
It is debatable whether supporting CR/LF line terminators in C sources
337
on Unix is ISO C or POSIX compliant, but since GCC 3.3 now supports it
338
unconditionally, it must be OK.
339
The so-called "text mode" in stdio on DOS/Windows translates CR/LF to \n
340
automatically, but here we also need this conversion on Unix. As a side
341
effect, on DOS/Windows we also parse CR/CR/LF into a single \n, but this
354
error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
363
if (c1 != EOF && c1 != '\n')
366
/* Seen line terminator CR or CR/LF. */
374
/* Supports only one pushback character, and not '\n'. */
376
phase0_ungetc (int c)
383
/* 1. line_number handling. Combine backslash-newline to nothing. */
385
static unsigned char phase1_pushback[2];
386
static int phase1_pushback_length;
394
if (phase1_pushback_length)
396
c = phase1_pushback[--phase1_pushback_length];
411
c = phase0_getc (fp);
427
/* Supports 2 characters of pushback. */
429
phase1_ungetc (int c)
441
if (phase1_pushback_length == SIZEOF (phase1_pushback))
443
phase1_pushback[phase1_pushback_length++] = c;
449
/* 2. Convert trigraphs to their single character equivalents. Most
450
sane human beings vomit copiously at the mention of trigraphs, which
451
is why they are an option. */
453
static unsigned char phase2_pushback[1];
454
static int phase2_pushback_length;
462
if (phase2_pushback_length)
463
return phase2_pushback[--phase2_pushback_length];
465
return phase1_getc ();
504
/* Supports only one pushback character. */
506
phase2_ungetc (int c)
510
if (phase2_pushback_length == SIZEOF (phase2_pushback))
512
phase2_pushback[phase2_pushback_length++] = c;
517
/* 3. Concatenate each line ending in backslash (\) with the following
518
line. Basically, all you need to do is elide "\\\n" sequences from
521
static unsigned char phase3_pushback[2];
522
static int phase3_pushback_length;
528
if (phase3_pushback_length)
529
return phase3_pushback[--phase3_pushback_length];
532
int c = phase2_getc ();
545
/* Supports 2 characters of pushback. */
547
phase3_ungetc (int c)
551
if (phase3_pushback_length == SIZEOF (phase3_pushback))
553
phase3_pushback[phase3_pushback_length++] = c;
558
/* Accumulating comments. */
561
static size_t bufmax;
562
static size_t buflen;
573
if (buflen >= bufmax)
575
bufmax = 2 * bufmax + 10;
576
buffer = xrealloc (buffer, bufmax);
578
buffer[buflen++] = c;
582
comment_line_end (size_t chars_to_remove)
584
buflen -= chars_to_remove;
586
&& (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
588
if (chars_to_remove == 0 && buflen >= bufmax)
590
bufmax = 2 * bufmax + 10;
591
buffer = xrealloc (buffer, bufmax);
593
buffer[buflen] = '\0';
594
savable_comment_add (buffer);
598
/* These are for tracking whether comments count as immediately before
600
static int last_comment_line;
601
static int last_non_comment_line;
602
static int newline_count;
605
/* 4. Replace each comment that is not inside a character constant or
606
string literal with a space character. We need to remember the
607
comment for later, because it may be attached to a keyword string.
608
We also optionally understand C++ comments. */
629
last_was_star = false;
635
/* We skip all leading white space, but not EOLs. */
636
if (!(buflen == 0 && (c == ' ' || c == '\t')))
641
comment_line_end (1);
643
last_was_star = false;
647
last_was_star = true;
653
comment_line_end (2);
659
last_was_star = false;
664
last_comment_line = newline_count;
668
/* C++ or ISO C 99 comment. */
673
if (c == '\n' || c == EOF)
675
/* We skip all leading white space, but not EOLs. */
676
if (!(buflen == 0 && (c == ' ' || c == '\t')))
679
comment_line_end (0);
680
last_comment_line = newline_count;
686
/* Supports only one pushback character. */
688
phase4_ungetc (int c)
694
/* ========================== Reading of tokens. ========================== */
697
/* True if ObjectiveC extensions are recognized. */
698
static bool objc_extensions;
702
token_type_character_constant, /* 'x' */
705
token_type_hash, /* # */
706
token_type_lparen, /* ( */
707
token_type_rparen, /* ) */
708
token_type_comma, /* , */
709
token_type_colon, /* : */
710
token_type_name, /* abc */
711
token_type_number, /* 2.7 */
712
token_type_string_literal, /* "abc" */
713
token_type_symbol, /* < > = etc. */
714
token_type_objc_special, /* @ */
715
token_type_white_space
717
typedef enum token_type_ty token_type_ty;
719
typedef struct token_ty token_ty;
723
char *string; /* for token_type_name, token_type_string_literal */
724
refcounted_string_list_ty *comment; /* for token_type_string_literal,
725
token_type_objc_special */
731
/* 7. Replace escape sequences within character strings with their
732
single character equivalents. This is called from phase 5, because
733
we don't have to worry about the #include argument. There are
734
pathological cases which could bite us (like the DOS directory
735
separator), but just pretend it can't happen. */
737
/* Marker codes, each defined as 1000 plus the character it stands for.
   NOTE(review): presumably the offset keeps them distinct from ordinary
   character values and from EOF -- confirm against the phase 7 reader.  */
/* Marker for a double quote; compared against when scanning a string
   literal.  */
#define P7_QUOTES (1000 + '"')
738
/* Marker for a single quote; compared against when scanning a character
   constant.  */
#define P7_QUOTE (1000 + '\'')
739
/* Marker for a newline; seeing it triggers the "unterminated character
   constant" / "unterminated string literal" warnings.  */
#define P7_NEWLINE (1000 + '\n')
746
/* Use phase 3, because phase 4 elides comments. */
749
/* Return a magic newline indicator, so that we can distinguish
750
between the user requesting a newline in the string (e.g. using
751
"\n" or "\012") from the user failing to terminate the string or
752
character constant. The ANSI C standard says: 3.1.3.4 Character
753
Constants contain ``any character except single quote, backslash or
754
newline; or an escape sequence'' and 3.1.4 String Literals contain
755
``any character except double quote, backslash or newline; or an
758
Most compilers give a fatal error in this case, however gcc is
759
stupidly silent, even though this is a very common typo. OK, so
760
gcc --pedantic will tell me, but that gripes about too much other
761
stuff. Could I have a ``gcc -Wnewline-in-string'' option, or
762
better yet a ``gcc -fno-newline-in-string'' option, please? Gcc is
763
also inconsistent between string literals and character constants:
764
you may not embed newlines in character constants; try it, you get
765
a useful diagnostic. --PMiller */
779
/* Unknown escape sequences really should be an error, but just
780
ignore them, and let the real compiler complain. */
795
/* The \e escape is peculiar to gcc, and assumes an ASCII
796
character set (or superset). We don't provide support for it
819
case '0': case '1': case '2': case '3': case '4':
820
case '5': case '6': case '7': case '8': case '9':
821
case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
822
case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
834
case '0': case '1': case '2': case '3': case '4':
835
case '5': case '6': case '7': case '8': case '9':
836
n = n * 16 + c - '0';
839
case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
840
n = n * 16 + 10 + c - 'A';
843
case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
844
n = n * 16 + 10 + c - 'a';
851
case '0': case '1': case '2': case '3':
852
case '4': case '5': case '6': case '7':
854
for (j = 0; j < 3; ++j)
863
case '0': case '1': case '2': case '3':
864
case '4': case '5': case '6': case '7':
876
phase7_ungetc (int c)
882
/* Free the memory pointed to by a 'struct token_ty'. */
884
free_token (token_ty *tp)
886
if (tp->type == token_type_name || tp->type == token_type_string_literal)
888
if (tp->type == token_type_string_literal
889
|| tp->type == token_type_objc_special)
890
drop_reference (tp->comment);
894
/* 5. Parse each resulting logical line as preprocessing tokens and
895
white space. Preprocessing tokens and C tokens don't always match. */
897
static token_ty phase5_pushback[1];
898
static int phase5_pushback_length;
902
phase5_get (token_ty *tp)
909
if (phase5_pushback_length)
911
*tp = phase5_pushback[--phase5_pushback_length];
916
tp->line_number = line_number;
921
tp->type = token_type_eof;
925
tp->type = token_type_eoln;
947
tp->type = token_type_white_space;
950
case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
951
case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N':
952
case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
953
case 'V': case 'W': case 'X': case 'Y': case 'Z':
955
case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
956
case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
957
case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
958
case 'v': case 'w': case 'x': case 'y': case 'z':
962
if (bufpos >= bufmax)
964
bufmax = 2 * bufmax + 10;
965
buffer = xrealloc (buffer, bufmax);
967
buffer[bufpos++] = c;
971
case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
972
case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
973
case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
974
case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
977
case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
978
case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
979
case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
980
case 's': case 't': case 'u': case 'v': case 'w': case 'x':
982
case '0': case '1': case '2': case '3': case '4':
983
case '5': case '6': case '7': case '8': case '9':
992
if (bufpos >= bufmax)
994
bufmax = 2 * bufmax + 10;
995
buffer = xrealloc (buffer, bufmax);
998
tp->string = xstrdup (buffer);
999
tp->type = token_type_name;
1008
tp->type = token_type_symbol;
1011
case '0': case '1': case '2': case '3': case '4':
1012
case '5': case '6': case '7': case '8': case '9':
1018
case '0': case '1': case '2': case '3': case '4':
1019
case '5': case '6': case '7': case '8': case '9':
1020
/* The preprocessing number token is more "generous" than the C
1021
number tokens. This is mostly due to token pasting (another
1022
thing we can ignore here). */
1026
if (bufpos >= bufmax)
1028
bufmax = 2 * bufmax + 10;
1029
buffer = xrealloc (buffer, bufmax);
1031
buffer[bufpos++] = c;
1037
if (bufpos >= bufmax)
1039
bufmax = 2 * bufmax + 10;
1040
buffer = xrealloc (buffer, bufmax);
1042
buffer[bufpos++] = c;
1044
if (c != '+' || c != '-')
1051
case 'A': case 'B': case 'C': case 'D': case 'F':
1052
case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1053
case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1054
case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1056
case 'a': case 'b': case 'c': case 'd': case 'f':
1057
case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1058
case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1059
case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1061
case '0': case '1': case '2': case '3': case '4':
1062
case '5': case '6': case '7': case '8': case '9':
1072
if (bufpos >= bufmax)
1074
bufmax = 2 * bufmax + 10;
1075
buffer = xrealloc (buffer, bufmax);
1078
tp->type = token_type_number;
1079
tp->number = atol (buffer);
1083
/* We could worry about the 'L' before wide character constants,
1084
but ignoring it has no effect unless one of the keywords is
1085
"L". Just pretend it won't happen. Also, we don't need to
1086
remember the character constant. */
1090
if (c == P7_NEWLINE)
1092
error_with_progname = false;
1093
error (0, 0, _("%s:%d: warning: unterminated character constant"),
1094
logical_file_name, line_number - 1);
1095
error_with_progname = true;
1096
phase7_ungetc ('\n');
1099
if (c == EOF || c == P7_QUOTE)
1102
tp->type = token_type_character_constant;
1106
/* We could worry about the 'L' before wide string constants,
1107
but since gettext's argument is not a wide character string,
1108
let the compiler complain about the argument not matching the
1109
prototype. Just pretend it won't happen. */
1114
if (c == P7_NEWLINE)
1116
error_with_progname = false;
1117
error (0, 0, _("%s:%d: warning: unterminated string literal"),
1118
logical_file_name, line_number - 1);
1119
error_with_progname = true;
1120
phase7_ungetc ('\n');
1123
if (c == EOF || c == P7_QUOTES)
1127
if (bufpos >= bufmax)
1129
bufmax = 2 * bufmax + 10;
1130
buffer = xrealloc (buffer, bufmax);
1132
buffer[bufpos++] = c;
1134
if (bufpos >= bufmax)
1136
bufmax = 2 * bufmax + 10;
1137
buffer = xrealloc (buffer, bufmax);
1140
tp->type = token_type_string_literal;
1141
tp->string = xstrdup (buffer);
1142
tp->comment = add_reference (savable_comment);
1146
tp->type = token_type_lparen;
1150
tp->type = token_type_rparen;
1154
tp->type = token_type_comma;
1158
tp->type = token_type_hash;
1162
tp->type = token_type_colon;
1166
if (objc_extensions)
1168
tp->type = token_type_objc_special;
1169
tp->comment = add_reference (savable_comment);
1175
/* We could carefully recognize each of the 2 and 3 character
1176
operators, but it is not necessary, as we only need to recognize
1177
gettext invocations. Don't bother. */
1178
tp->type = token_type_symbol;
1184
/* Supports only one pushback token. */
1186
phase5_unget (token_ty *tp)
1188
if (tp->type != token_type_eof)
1190
if (phase5_pushback_length == SIZEOF (phase5_pushback))
1192
phase5_pushback[phase5_pushback_length++] = *tp;
1197
/* X. Recognize a leading # symbol. Leave leading hash as a hash, but
1198
turn hash in the middle of a line into a plain symbol token. This
1199
makes the phase 6 easier. */
1202
phaseX_get (token_ty *tp)
1204
static bool middle; /* false at the beginning of a line, true otherwise. */
1208
if (tp->type == token_type_eoln || tp->type == token_type_eof)
1214
/* Turn hash in the middle of a line into a plain symbol token. */
1215
if (tp->type == token_type_hash)
1216
tp->type = token_type_symbol;
1220
/* When we see leading whitespace followed by a hash sign,
1221
discard the leading white space token. The hash is all
1222
phase 6 is interested in. */
1223
if (tp->type == token_type_white_space)
1228
if (next.type == token_type_hash)
1231
phase5_unget (&next);
1239
/* 6. Recognize and carry out directives (it also expands macros on
1240
non-directive lines, which we do not do here). The only directives
1241
we care about are the #line and #define directives. We throw all the
1244
static token_ty phase6_pushback[2];
1245
static int phase6_pushback_length;
1249
phase6_get (token_ty *tp)
1251
static token_ty *buf;
1256
if (phase6_pushback_length)
1258
*tp = phase6_pushback[--phase6_pushback_length];
1263
/* Get the next token. If it is not a '#' at the beginning of a
1264
line (ignoring whitespace), return immediately. */
1266
if (tp->type != token_type_hash)
1269
/* Accumulate the rest of the directive in a buffer, until the
1270
"define" keyword is seen or until end of line. */
1275
if (tp->type == token_type_eoln || tp->type == token_type_eof)
1278
/* Before the "define" keyword and inside other directives
1279
white space is irrelevant. So just throw it away. */
1280
if (tp->type != token_type_white_space)
1282
/* If it is a #define directive, return immediately,
1283
thus treating the body of the #define directive like
1286
&& tp->type == token_type_name
1287
&& strcmp (tp->string, "define") == 0)
1291
if (bufpos >= bufmax)
1293
bufmax = 2 * bufmax + 10;
1294
buf = xrealloc (buf, bufmax * sizeof (buf[0]));
1296
buf[bufpos++] = *tp;
1300
/* If it is a #line directive, with no macros to expand, act on
1301
it. Ignore all other directives. */
1302
if (bufpos >= 3 && buf[0].type == token_type_name
1303
&& strcmp (buf[0].string, "line") == 0
1304
&& buf[1].type == token_type_number
1305
&& buf[2].type == token_type_string_literal)
1307
logical_file_name = xstrdup (buf[2].string);
1308
line_number = buf[1].number;
1310
if (bufpos >= 2 && buf[0].type == token_type_number
1311
&& buf[1].type == token_type_string_literal)
1313
logical_file_name = xstrdup (buf[1].string);
1314
line_number = buf[0].number;
1317
/* Release the storage held by the directive. */
1318
for (j = 0; j < bufpos; ++j)
1319
free_token (&buf[j]);
1321
/* We must reset the selected comments. */
1322
savable_comment_reset ();
1327
/* Supports 2 tokens of pushback. */
1329
phase6_unget (token_ty *tp)
1331
if (tp->type != token_type_eof)
1333
if (phase6_pushback_length == SIZEOF (phase6_pushback))
1335
phase6_pushback[phase6_pushback_length++] = *tp;
1340
/* 8a. Convert ISO C 99 section 7.8.1 format string directives to string
1341
literal placeholders. */
1343
/* Test for an ISO C 99 section 7.8.1 format string directive. */
1345
is_inttypes_macro (const char *name)
1348
P R I { d | i | o | u | x | X }
1349
{ { | LEAST | FAST } { 8 | 16 | 32 | 64 } | MAX | PTR } */
1350
if (name[0] == 'P' && name[1] == 'R' && name[2] == 'I')
1353
if (name[0] == 'd' || name[0] == 'i' || name[0] == 'o' || name[0] == 'u'
1354
|| name[0] == 'x' || name[0] == 'X')
1357
if (name[0] == 'M' && name[1] == 'A' && name[2] == 'X'
1360
if (name[0] == 'P' && name[1] == 'T' && name[2] == 'R'
1363
if (name[0] == 'L' && name[1] == 'E' && name[2] == 'A'
1364
&& name[3] == 'S' && name[4] == 'T')
1366
else if (name[0] == 'F' && name[1] == 'A' && name[2] == 'S'
1369
if (name[0] == '8' && name[1] == '\0')
1371
if (name[0] == '1' && name[1] == '6' && name[2] == '\0')
1373
if (name[0] == '3' && name[1] == '2' && name[2] == '\0')
1375
if (name[0] == '6' && name[1] == '4' && name[2] == '\0')
1383
phase8a_get (token_ty *tp)
1386
if (tp->type == token_type_name && is_inttypes_macro (tp->string))
1388
/* Turn PRIdXXX into "<PRIdXXX>". */
1389
size_t len = strlen (tp->string);
1390
char *new_string = (char *) xmalloc (len + 3);
1391
new_string[0] = '<';
1392
memcpy (new_string + 1, tp->string, len);
1393
new_string[len + 1] = '>';
1394
new_string[len + 2] = '\0';
1396
tp->string = new_string;
1397
tp->comment = add_reference (savable_comment);
1398
tp->type = token_type_string_literal;
1402
/* Supports 2 tokens of pushback. */
1404
phase8a_unget (token_ty *tp)
1410
/* 8b. Drop whitespace. */
1412
phase8b_get (token_ty *tp)
1418
if (tp->type == token_type_white_space)
1420
if (tp->type == token_type_eoln)
1422
/* We have to track the last occurrence of a string. One
1423
mode of xgettext allows grouping an extracted message
1424
with a comment for documentation. The rule which states
1425
which comment is assumed to be grouped with the message
1426
says it should immediately precede it. Our
1427
interpretation: between the last line of the comment and
1428
the line in which the keyword is found must be no line
1429
with non-white space tokens. */
1431
if (last_non_comment_line > last_comment_line)
1432
savable_comment_reset ();
1439
/* Supports 2 tokens of pushback. */
1441
phase8b_unget (token_ty *tp)
1447
/* 8c. In ObjectiveC mode, drop '@' before a literal string. We need to
1448
do this before performing concatenation of adjacent string literals. */
1450
phase8c_get (token_ty *tp)
1455
if (tp->type != token_type_objc_special)
1458
if (tmp.type != token_type_string_literal)
1460
phase8b_unget (&tmp);
1463
/* Drop the '@' token and return immediately the following string. */
1464
drop_reference (tmp.comment);
1465
tmp.comment = tp->comment;
1469
/* Supports only one pushback token. */
1471
phase8c_unget (token_ty *tp)
1477
/* 8. Concatenate adjacent string literals to form single string
1478
literals (because we don't expand macros, there are a few things we
1482
phase8_get (token_ty *tp)
1485
if (tp->type != token_type_string_literal)
1493
if (tmp.type != token_type_string_literal)
1495
phase8c_unget (&tmp);
1498
len = strlen (tp->string);
1499
tp->string = xrealloc (tp->string, len + strlen (tmp.string) + 1);
1500
strcpy (tp->string + len, tmp.string);
1506
/* ===================== Reading of high-level tokens. ==================== */
1509
enum xgettext_token_type_ty
1511
xgettext_token_type_eof,
1512
xgettext_token_type_keyword,
1513
xgettext_token_type_symbol,
1514
xgettext_token_type_lparen,
1515
xgettext_token_type_rparen,
1516
xgettext_token_type_comma,
1517
xgettext_token_type_colon,
1518
xgettext_token_type_string_literal,
1519
xgettext_token_type_other
1521
typedef enum xgettext_token_type_ty xgettext_token_type_ty;
1523
typedef struct xgettext_token_ty xgettext_token_ty;
1524
struct xgettext_token_ty
1526
xgettext_token_type_ty type;
1528
/* These fields are used only for xgettext_token_type_keyword. */
1532
/* This field is used only for xgettext_token_type_string_literal,
1533
xgettext_token_type_keyword, xgettext_token_type_symbol. */
1536
/* This field is used only for xgettext_token_type_string_literal. */
1537
refcounted_string_list_ty *comment;
1539
/* These fields are only for
1540
xgettext_token_type_keyword,
1541
xgettext_token_type_string_literal. */
1546
/* 9. Convert the remaining preprocessing tokens to C tokens and
1547
discard any white space from the translation unit. */
1550
x_c_lex (xgettext_token_ty *tp)
1555
void *keyword_value;
1557
phase8_get (&token);
1560
case token_type_eof:
1561
tp->type = xgettext_token_type_eof;
1564
case token_type_name:
1565
last_non_comment_line = newline_count;
1567
if (find_entry (objc_extensions ? &objc_keywords : &c_keywords,
1568
token.string, strlen (token.string), &keyword_value)
1571
tp->type = xgettext_token_type_keyword;
1572
tp->argnum1 = (int) (long) keyword_value & ((1 << 10) - 1);
1573
tp->argnum2 = (int) (long) keyword_value >> 10;
1574
tp->pos.file_name = logical_file_name;
1575
tp->pos.line_number = token.line_number;
1578
tp->type = xgettext_token_type_symbol;
1579
tp->string = token.string;
1582
case token_type_lparen:
1583
last_non_comment_line = newline_count;
1585
tp->type = xgettext_token_type_lparen;
1588
case token_type_rparen:
1589
last_non_comment_line = newline_count;
1591
tp->type = xgettext_token_type_rparen;
1594
case token_type_comma:
1595
last_non_comment_line = newline_count;
1597
tp->type = xgettext_token_type_comma;
1600
case token_type_colon:
1601
last_non_comment_line = newline_count;
1603
tp->type = xgettext_token_type_colon;
1606
case token_type_string_literal:
1607
last_non_comment_line = newline_count;
1609
tp->type = xgettext_token_type_string_literal;
1610
tp->string = token.string;
1611
tp->comment = token.comment;
1612
tp->pos.file_name = logical_file_name;
1613
tp->pos.line_number = token.line_number;
1616
case token_type_objc_special:
1617
drop_reference (token.comment);
1621
last_non_comment_line = newline_count;
1623
tp->type = xgettext_token_type_other;
1630
/* ========================= Extracting strings. ========================== */
1633
/* Context lookup table. */
1634
static flag_context_list_table_ty *flag_context_list_table;
1637
/* The file is broken into tokens. Scan the token stream, looking for
1638
a keyword, followed by a left paren, followed by a string. When we
1639
see this sequence, we have something to remember. We assume we are
1640
looking at a valid C or C++ program, and leave the complaints about
1641
the grammar to the compiler.
1643
Normal handling: Look for
1644
keyword ( ... msgid ... )
1645
Plural handling: Look for
1646
keyword ( ... msgid ... msgid_plural ... )
1648
We use recursion because the arguments before msgid or between msgid
1649
and msgid_plural can contain subexpressions of the same form. */
1652
/* Extract messages until the next balanced closing parenthesis.
1653
Extracted messages are added to MLP.
1654
When a specific argument shall be extracted, COMMAS_TO_SKIP >= 0 and,
1655
if also a plural argument shall be extracted, PLURAL_COMMAS > 0,
1656
otherwise PLURAL_COMMAS = 0.
1657
When no specific argument shall be extracted, COMMAS_TO_SKIP < 0.
1658
Return true upon eof, false upon closing parenthesis. */
1660
/* NOTE(review): this excerpt shows only part of the function.  The
   declaration of the state variable, the token loop, the switch header and
   several branch bodies are on lines that are not visible; the comments
   added below describe the visible statements only. */
extract_parenthesized (message_list_ty *mlp,
1661
flag_context_ty outer_context,
1662
flag_context_list_iterator_ty context_iter,
1663
int commas_to_skip, int plural_commas)
1665
/* Remember the message containing the msgid, for msgid_plural. */
1666
message_ty *plural_mp = NULL;
1668
/* 0 when no keyword has been seen. 1 right after a keyword is seen. */
1670
/* Parameters of the keyword just seen. Defined only in state 1. */
1671
int next_commas_to_skip = -1;
1672
int next_plural_commas = 0;
1673
/* Context iterator that will be used if the next token is a '('. */
1674
flag_context_list_iterator_ty next_context_iter =
1675
passthrough_context_list_iterator;
1676
/* Context iterator that will be used if the next token is a ':'.
1677
(Objective C selector syntax.) */
1678
flag_context_list_iterator_ty selectorcall_context_iter =
1679
passthrough_context_list_iterator;
1680
/* Current context. */
1681
flag_context_ty inner_context =
1682
inherited_context (outer_context,
1683
flag_context_list_iterator_advance (&context_iter));
1685
/* Start state is 0. */
1690
xgettext_token_ty token;
1695
case xgettext_token_type_keyword:
1696
/* The msgid is argument number argnum1, so argnum1 - 1 commas precede
   it.  A plural argument exists only when argnum2 > argnum1; it then
   follows argnum2 - argnum1 commas after the msgid. */
next_commas_to_skip = token.argnum1 - 1;
1697
next_plural_commas = (token.argnum2 > token.argnum1
1698
? token.argnum2 - token.argnum1 : 0);
1700
/* The rest of the handling is shared with plain symbols. */
goto keyword_or_symbol;
1702
case xgettext_token_type_symbol:
1706
/* Look up the flag context list registered for this name (the target of
   the assignment is on a line not visible here; presumably
   next_context_iter). */
flag_context_list_iterator (
1707
flag_context_list_table_lookup (
1708
flag_context_list_table,
1709
token.string, strlen (token.string)));
1710
if (objc_extensions)
1712
/* Objective C: also look up the name followed by ':' (selector
   syntax).  The buffer is resized to hold the name, the ':' and the
   terminating NUL. */
size_t token_string_len = strlen (token.string);
1713
token.string = xrealloc (token.string, token_string_len + 2);
1714
token.string[token_string_len] = ':';
1715
token.string[token_string_len + 1] = '\0';
1716
selectorcall_context_iter =
1717
flag_context_list_iterator (
1718
flag_context_list_table_lookup (
1719
flag_context_list_table,
1720
token.string, token_string_len + 1));
1722
/* The name was needed only for the lookups above. */
free (token.string);
1725
case xgettext_token_type_lparen:
1726
/* Recurse into the parenthesized group.  The parameters of the keyword
   just seen are passed on only when STATE is nonzero; otherwise the
   nested call extracts no specific argument (-1, 0). */
if (extract_parenthesized (mlp, inner_context, next_context_iter,
1727
state ? next_commas_to_skip : -1,
1728
state ? next_plural_commas : 0))
1730
/* After the nested group, nothing is pending for a following '(' or
   ':'. */
next_context_iter = null_context_list_iterator;
1731
selectorcall_context_iter = null_context_list_iterator;
1735
case xgettext_token_type_rparen:
1738
case xgettext_token_type_comma:
1739
/* Commas are counted only while a specific argument is being looked for
   (COMMAS_TO_SKIP >= 0). */
if (commas_to_skip >= 0)
1741
if (commas_to_skip > 0)
1744
/* When a msgid has been remembered and a plural argument is expected,
   continue counting towards the plural argument. */
if (plural_mp != NULL && plural_commas > 0)
1746
commas_to_skip = plural_commas - 1;
1750
/* A negative value means that no specific argument is looked for any
   more. */
commas_to_skip = -1;
1753
inherited_context (outer_context,
1754
flag_context_list_iterator_advance (
1756
next_context_iter = passthrough_context_list_iterator;
1757
selectorcall_context_iter = passthrough_context_list_iterator;
1761
case xgettext_token_type_colon:
1762
if (objc_extensions)
1764
/* Objective C selector syntax: continue with the iterator that was
   looked up for the name followed by ':'. */
context_iter = selectorcall_context_iter;
1766
inherited_context (inner_context,
1767
flag_context_list_iterator_advance (
1769
next_context_iter = passthrough_context_list_iterator;
1770
selectorcall_context_iter = passthrough_context_list_iterator;
1774
next_context_iter = null_context_list_iterator;
1775
selectorcall_context_iter = null_context_list_iterator;
1780
case xgettext_token_type_string_literal:
1783
/* NOTE(review): the condition that selects between the extraction just
   below and the argument-position logic further down is not visible in
   this excerpt. */
savable_comment_to_xgettext_comment (token.comment);
1784
remember_a_message (mlp, token.string, inner_context, &token.pos);
1785
savable_comment_reset ();
1789
/* Zero commas left to skip: the string sits at the argument position
   that is being looked for. */
if (commas_to_skip == 0)
1791
if (plural_mp == NULL)
1793
/* Seen an msgid. */
1796
savable_comment_to_xgettext_comment (token.comment);
1797
mp = remember_a_message (mlp, token.string,
1798
inner_context, &token.pos);
1799
savable_comment_reset ();
1800
if (plural_commas > 0)
1805
/* Seen an msgid_plural. */
1806
remember_a_message_plural (plural_mp, token.string,
1807
inner_context, &token.pos);
1812
/* NOTE(review): presumably the branch for a string that is not
   extracted; the enclosing else is not visible in this excerpt. */
free (token.string);
1814
/* The token's comment reference is dropped in any case. */
drop_reference (token.comment);
1815
next_context_iter = null_context_list_iterator;
1816
selectorcall_context_iter = null_context_list_iterator;
1820
case xgettext_token_type_other:
1821
next_context_iter = null_context_list_iterator;
1822
selectorcall_context_iter = null_context_list_iterator;
1826
case xgettext_token_type_eof:
1837
/* Common scanner driver: sets up the file-scope scanner state, then calls
   extract_parenthesized repeatedly until it reports eof.  Extracted
   messages go to the message list of the first item of MDLP.
   NOTE(review): the statements that use F are not visible in this
   excerpt. */
extract_whole_file (FILE *f,
1838
const char *real_filename, const char *logical_filename,
1839
flag_context_list_table_ty *flag_table,
1840
msgdomain_list_ty *mdlp)
1842
message_list_ty *mlp = mdlp->item[0]->messages;
1845
real_file_name = real_filename;
1846
/* A private copy is made; x_c_lex stores this pointer in the positions of
   the tokens it returns (pos.file_name). */
logical_file_name = xstrdup (logical_filename);
1850
/* Reset the line bookkeeping for the new file. */
last_comment_line = -1;
1851
last_non_comment_line = -1;
1853
/* Make the caller's flag table available to extract_parenthesized. */
flag_context_list_table = flag_table;
1857
/* Eat tokens until eof is seen. When extract_parenthesized returns
1858
due to an unbalanced closing parenthesis, just restart it. */
1859
while (!extract_parenthesized (mlp, null_context, null_context_list_iterator,
1863
/* Close scanner. */
1865
real_file_name = NULL;
1866
/* NOTE(review): the xstrdup'ed copy is not freed in the visible lines,
   presumably because recorded positions still point to it; confirm. */
logical_file_name = NULL;
1873
const char *real_filename, const char *logical_filename,
1874
flag_context_list_table_ty *flag_table,
1875
msgdomain_list_ty *mdlp)
1877
objc_extensions = false;
1878
extract_whole_file (f, real_filename, logical_filename, flag_table, mdlp);
1882
/* Entry point for Objective C sources: enables the Objective C extensions
   (objc_extensions), then hands all arguments unchanged to
   extract_whole_file. */
extract_objc (FILE *f,
1883
const char *real_filename, const char *logical_filename,
1884
flag_context_list_table_ty *flag_table,
1885
msgdomain_list_ty *mdlp)
1887
/* With this flag set, x_c_lex uses objc_keywords and
   extract_parenthesized handles the ':' selector syntax. */
objc_extensions = true;
1888
extract_whole_file (f, real_filename, logical_filename, flag_table, mdlp);