"$Header: d:/cvsroot/tads/tads3/tctok.cpp,v 1.5 1999/07/11 00:46:58 MJRoberts Exp $";

/* 
 *   Copyright (c) 1999, 2002 Michael J. Roberts.  All Rights Reserved.
 *   
 *   Please see the accompanying license file, LICENSE.TXT, for information
 *   on using and copying this software.
 */
/*
 *   tctok.cpp - TADS3 compiler tokenizer
 *   
 *   The tokenizer features an integrated C-style preprocessor.  The
 *   preprocessor is integrated into the tokenizer for efficiency; since
 *   the preprocessor uses the same lexical structure as the TADS
 *   language, we need only tokenize the input stream once, and the result
 *   can be used both for preprocessing and for parsing.
 *   
 *   04/12/99 MJRoberts - Creation
 */
48
/* ------------------------------------------------------------------------ */
50
* Initialize the tokenizer
52
CTcTokenizer::CTcTokenizer(CResLoader *res_loader,
53
const char *default_charset)
63
tc_toktyp_t kw_tok_id;
65
static const kwdef kwlist[] =
67
{ "self", TOKT_SELF },
68
{ "targetprop", TOKT_TARGETPROP },
69
{ "targetobj", TOKT_TARGETOBJ },
70
{ "definingobj", TOKT_DEFININGOBJ },
71
{ "inherited", TOKT_INHERITED },
72
{ "delegated", TOKT_DELEGATED },
73
{ "argcount", TOKT_ARGCOUNT },
75
{ "else", TOKT_ELSE },
77
{ "while", TOKT_WHILE },
79
{ "switch", TOKT_SWITCH },
80
{ "case", TOKT_CASE },
81
{ "default", TOKT_DEFAULT },
82
{ "goto", TOKT_GOTO },
83
{ "break", TOKT_BREAK },
84
{ "continue", TOKT_CONTINUE },
85
// { "and", TOKT_AND },
87
// { "not", TOKT_NOT },
88
{ "function", TOKT_FUNCTION },
89
{ "return", TOKT_RETURN },
90
{ "local", TOKT_LOCAL },
91
{ "object", TOKT_OBJECT },
93
{ "true", TOKT_TRUE },
94
{ "pass", TOKT_PASS },
95
{ "external", TOKT_EXTERNAL },
96
{ "extern", TOKT_EXTERN },
97
{ "formatstring", TOKT_FORMATSTRING },
98
{ "class", TOKT_CLASS },
99
{ "replace", TOKT_REPLACE },
100
{ "modify", TOKT_MODIFY },
102
// { "delete", TOKT_DELETE },
103
{ "throw", TOKT_THROW },
105
{ "catch", TOKT_CATCH },
106
{ "finally", TOKT_FINALLY },
107
{ "intrinsic", TOKT_INTRINSIC },
108
{ "dictionary", TOKT_DICTIONARY },
109
{ "grammar", TOKT_GRAMMAR },
110
{ "enum", TOKT_ENUM },
111
{ "template", TOKT_TEMPLATE },
112
{ "static", TOKT_STATIC },
113
{ "foreach", TOKT_FOREACH },
114
{ "export", TOKT_EXPORT },
115
{ "propertyset", TOKT_PROPERTYSET },
116
{ "transient", TOKT_TRANSIENT },
117
{ "replaced", TOKT_REPLACED },
118
{ "property", TOKT_PROPERTY },
120
// { "void", TOKT_VOID },
121
// { "int", TOKT_INT },
122
// { "string", TOKT_STRING },
123
// { "list", TOKT_LIST },
124
// { "boolean", TOKT_BOOLEAN },
125
// { "any", TOKT_ANY },
127
/* end-of-table marker */
132
/* remember my resource loader */
133
res_loader_ = res_loader;
135
/* there's no stream yet */
138
/* no external source yet */
141
/* start numbering the file descriptors at zero */
142
next_filedesc_id_ = 0;
144
/* there are no file descriptors yet */
148
desc_list_cnt_ = desc_list_alo_ = 0;
150
/* empty out the input line buffer */
153
/* start out with a minimal line buffer size */
154
linebuf_.ensure_space(4096);
155
expbuf_.ensure_space(4096);
157
/* set up at the beginning of the input line buffer */
158
start_new_line(&linebuf_, 0);
160
/* remember the default character set */
161
default_charset_ = lib_copy_str(default_charset);
163
/* we don't have a default character mapper yet */
166
/* create an input mapper for the default character set, if specified */
167
if (default_charset != 0)
168
default_mapper_ = CCharmapToUni::load(res_loader, default_charset);
171
* if the default character set wasn't specified, or we failed to
172
* load a mapper for the specified character set, use a plain ASCII
175
if (default_mapper_ == 0)
176
default_mapper_ = new CCharmapToUniASCII();
178
/* presume we're not in preprocessor-only mode */
179
pp_only_mode_ = FALSE;
181
/* presume we're not in list-includes mode */
182
list_includes_mode_ = FALSE;
184
/* presume we're not in test report mode */
185
test_report_mode_ = FALSE;
187
/* allow preprocessing directives */
190
/* there are no previously-included files yet */
193
/* presume we'll convert newlines in strings to whitespace */
194
string_newline_spacing_ = TRUE;
196
/* start out with ALL_ONCE mode off */
199
/* by default, ignore redundant includes without warning */
200
warn_on_ignore_incl_ = FALSE;
202
/* there are no include path entries yet */
203
incpath_head_ = incpath_tail_ = 0;
205
/* not in a quoted string yet */
208
/* not in an embedded expression yet */
209
comment_in_embedding_ = FALSE;
210
macro_in_embedding_ = FALSE;
211
main_in_embedding_ = FALSE;
213
/* not in a #if block yet */
217
/* not processing a preprocessor constant expression */
220
/* we don't have a current or appended line yet */
224
appended_linenum_ = 0;
226
/* allocate the first token-list block */
227
init_src_block_list();
229
/* create the #define and #undef symbol tables */
230
defines_ = new CVmHashTable(512, new CVmHashFuncCS(), TRUE);
231
undefs_ = new CVmHashTable(64, new CVmHashFuncCS(), TRUE);
233
/* create the special __LINE__ and __FILE__ macros */
234
defines_->add(new CTcHashEntryPpLINE(this));
235
defines_->add(new CTcHashEntryPpFILE(this));
237
/* get the current time and date */
239
tblk = localtime(&timer);
240
tstr = asctime(tblk);
243
* add the __DATE__ macro - the format is "Mmm dd yyyy", where "Mmm"
244
* is the three-letter month name generated by asctime(), "dd" is
245
* the day of the month, with a leading space for numbers less than
246
* ten, and "yyyy" is the year.
248
sprintf(timebuf, "'%.3s %2d %4d'",
249
tstr + 4, tblk->tm_mday, tblk->tm_year + 1900);
250
add_define("__DATE__", timebuf);
252
/* add the __TIME__ macro - 24-hour "hh:mm:ss" format */
253
sprintf(timebuf, "'%.8s'", tstr + 11);
254
add_define("__TIME__", timebuf);
257
* Allocate a pool of macro resources. The number we start with is
258
* arbitrary, since we'll add more as needed, but we want to try to
259
* allocate enough up front that we avoid time-consuming memory
260
* allocations later. On the other hand, we don't want to
261
* pre-allocate a huge number of objects that we'll never use.
263
for (macro_res_avail_ = 0, macro_res_head_ = 0, i = 0 ; i < 7 ; ++i)
267
/* allocate a new object */
268
rsc = new CTcMacroRsc();
270
/* add it onto the master list */
271
rsc->next_ = macro_res_head_;
272
macro_res_head_ = rsc;
274
/* add it onto the available list */
275
rsc->next_avail_ = macro_res_avail_;
276
macro_res_avail_ = rsc;
279
/* create the keyword hash table */
280
kw_ = new CVmHashTable(64, new CVmHashFuncCS(), TRUE);
282
/* populate the keyword table */
283
for (kwp = kwlist ; kwp->kw_text != 0 ; ++kwp)
284
kw_->add(new CTcHashEntryKw(kwp->kw_text, kwp->kw_tok_id));
286
/* no ungot token yet */
287
nxttok_valid_ = FALSE;
289
/* no string capture file */
295
* Initialize the source save block list
297
void CTcTokenizer::init_src_block_list()
299
/* allocate the first source block */
300
src_cur_ = src_head_ = new CTcTokSrcBlock();
302
/* set up to write into the first block */
303
src_ptr_ = src_head_->get_buf();
304
src_rem_ = TCTOK_SRC_BLOCK_SIZE;
308
/* ------------------------------------------------------------------------ */
310
* Delete the tokenizer
312
CTcTokenizer::~CTcTokenizer()
314
/* delete all streams */
317
/* delete all file descriptors */
318
while (desc_head_ != 0)
322
/* remember the next descriptor */
323
nxt = desc_head_->get_next();
325
/* delete this one */
328
/* move on to the next one */
332
/* delete the file descriptor index array */
336
/* delete our default character set string copy */
337
lib_free_str(default_charset_);
339
/* release our reference on our default character mapper */
340
default_mapper_->release_ref();
342
/* forget about all of our previous include files */
343
while (prev_includes_ != 0)
345
tctok_incfile_t *nxt;
347
/* remember the next file */
348
nxt = prev_includes_->nxt;
350
/* delete this one */
351
t3free(prev_includes_);
353
/* move on to the next one */
354
prev_includes_ = nxt;
357
/* delete the include path list */
358
while (incpath_head_ != 0)
360
tctok_incpath_t *nxt;
362
/* remember the next entry in the path */
363
nxt = incpath_head_->nxt;
365
/* delete this entry */
366
t3free(incpath_head_);
368
/* move on to the next one */
372
/* delete the macro resources */
373
while (macro_res_head_ != 0)
377
/* remember the next one */
378
nxt = macro_res_head_->next_;
380
/* delete this one */
381
delete macro_res_head_;
383
/* move on to the next one */
384
macro_res_head_ = nxt;
387
/* delete the token list */
390
/* delete the #define and #undef symbol tables */
394
/* delete the keyword hash table */
397
/* if we created a mapping for the string capture file, release it */
398
if (string_fp_map_ != 0)
399
string_fp_map_->release_ref();
402
/* ------------------------------------------------------------------------ */
404
* Clear the line buffer
406
void CTcTokenizer::clear_linebuf()
408
/* clear the buffer */
409
linebuf_.clear_text();
411
/* reset our read point to the start of the line buffer */
412
p_.set(linebuf_.get_buf());
415
/* ------------------------------------------------------------------------ */
417
* Get a textual representation of an operator token
419
const char *CTcTokenizer::get_op_text(tc_toktyp_t op)
426
static const tokname_t toknames[] =
428
{ TOKT_EOF, "<end of file>" },
429
{ TOKT_SYM, "<symbol>" },
430
{ TOKT_INT, "<integer>" },
431
{ TOKT_SSTR, "<single-quoted string>" },
432
{ TOKT_DSTR, "<double-quoted string>" },
433
{ TOKT_DSTR_START, "<double-quoted string>" },
434
{ TOKT_DSTR_MID, "<double-quoted string>" },
435
{ TOKT_DSTR_END, "<double-quoted string>" },
440
{ TOKT_LBRACE, "{" },
441
{ TOKT_RBRACE, "}", },
442
{ TOKT_LBRACK, "[", },
443
{ TOKT_RBRACK, "]", },
445
{ TOKT_EQEQ, "==", },
457
{ TOKT_ARROW, "->" },
461
{ TOKT_ANDAND, "&&" },
469
{ TOKT_PLUSEQ, "+=" },
470
{ TOKT_MINEQ, "-=" },
471
{ TOKT_TIMESEQ, "*=" },
472
{ TOKT_DIVEQ, "/=" },
473
{ TOKT_MODEQ, "%=" },
474
{ TOKT_ANDEQ, "&=" },
476
{ TOKT_XOREQ, "^=" },
477
{ TOKT_SHLEQ, "<<=" },
478
{ TOKT_SHREQ, ">>=" },
479
{ TOKT_NOT, "! (not)" },
482
{ TOKT_POUNDPOUND, "##" },
483
{ TOKT_POUNDAT, "#@" },
484
{ TOKT_ELLIPSIS, "..." },
485
{ TOKT_QUESTION, "?" },
486
{ TOKT_COLONCOLON, "::" },
487
{ TOKT_FLOAT, "<float>" },
489
{ TOKT_SELF, "self" },
490
{ TOKT_TARGETPROP, "targetprop" },
491
{ TOKT_TARGETOBJ, "targetobj" },
492
{ TOKT_DEFININGOBJ, "definingobj" },
493
{ TOKT_INHERITED, "inherited" },
494
{ TOKT_DELEGATED, "delegated" },
496
{ TOKT_ELSE, "else" },
498
{ TOKT_WHILE, "while" },
500
{ TOKT_SWITCH, "switch" },
501
{ TOKT_CASE, "case" },
502
{ TOKT_DEFAULT, "default" },
503
{ TOKT_GOTO, "goto" },
504
{ TOKT_BREAK, "break" },
505
{ TOKT_CONTINUE, "continue" },
506
{ TOKT_FUNCTION, "function" },
507
{ TOKT_RETURN, "return" },
508
{ TOKT_LOCAL, "local" },
509
{ TOKT_OBJECT, "object" },
511
{ TOKT_TRUE, "true" },
512
{ TOKT_PASS, "pass" },
513
{ TOKT_EXTERNAL, "external" },
514
{ TOKT_EXTERN, "extern" },
515
{ TOKT_FORMATSTRING, "formatstring" },
516
{ TOKT_CLASS, "class" },
517
{ TOKT_REPLACE, "replace" },
518
{ TOKT_MODIFY, "modify" },
520
// { TOKT_DELETE, "delete" },
521
{ TOKT_THROW, "throw" },
523
{ TOKT_CATCH, "catch" },
524
{ TOKT_FINALLY, "finally" },
525
{ TOKT_INTRINSIC, "intrinsic" },
526
{ TOKT_DICTIONARY, "dictionary" },
527
{ TOKT_GRAMMAR, "grammar" },
528
{ TOKT_ENUM, "enum" },
529
{ TOKT_TEMPLATE, "template" },
530
{ TOKT_STATIC, "static" },
531
{ TOKT_FOREACH, "foreach" },
532
{ TOKT_EXPORT, "export" },
533
{ TOKT_PROPERTYSET, "propertyset" },
534
{ TOKT_TRANSIENT, "transient" },
535
{ TOKT_REPLACED, "replaced" },
536
{ TOKT_PROPERTY, "property" },
538
// { TOKT_VOID, "void" },
539
// { TOKT_INTKW, "int" },
540
// { TOKT_STRING, "string" },
541
// { TOKT_LIST, "list" },
542
// { TOKT_BOOLEAN, "boolean" },
543
// { TOKT_ANY, "any"},
549
/* search for the token */
550
for (p = toknames ; p->nm != 0 ; ++p)
552
/* if this is our token, return the associated name string */
557
/* we didn't find it */
561
/* ------------------------------------------------------------------------ */
563
* Reset the tokenizer. Delete the current source object and all of the
564
* saved source text. This can be used after compilation of a unit
565
* (such as a debugger expression) is completed and the intermediate
566
* parser state is no longer needed.
568
void CTcTokenizer::reset()
570
/* delete the source object */
573
/* delete saved token text */
576
/* delete the list */
579
/* re-initialize the source block list */
580
init_src_block_list();
584
/* ------------------------------------------------------------------------ */
586
* Delete the source file, if any, including any parent include files.
588
void CTcTokenizer::delete_source()
590
/* delete the current stream and all enclosing parents */
595
/* remember the next stream in the list */
596
nxt = str_->get_parent();
598
/* delete this stream */
601
/* move up to the next one */
605
/* there are no more streams */
610
/* ------------------------------------------------------------------------ */
612
* Set up to read a source file. Returns zero on success, or a non-zero
613
* error code on failure.
615
int CTcTokenizer::set_source(const char *src_filename, const char *orig_name)
617
CTcTokFileDesc *desc;
620
int default_charset_error;
622
/* empty out the input line buffer */
625
/* set up at the beginning of the input line buffer */
626
start_new_line(&linebuf_, 0);
628
/* create a reader for the source file */
629
src = CTcSrcFile::open_source(src_filename, res_loader_,
630
default_charset_, &charset_error,
631
&default_charset_error);
634
/* if we had a problem loading the default character set, log it */
635
if (default_charset_error)
636
log_error(TCERR_CANT_LOAD_DEFAULT_CHARSET, default_charset_);
639
return TCERR_CANT_OPEN_SRC;
642
/* find or create a file descriptor for this filename */
643
desc = get_file_desc(src_filename, strlen(src_filename), FALSE,
644
orig_name, strlen(orig_name));
647
* Create a stream to read the source file. The new stream has no
648
* parent, because this is the top-level source file, and was not
649
* included from any other file.
651
str_ = new CTcTokStream(desc, src, 0, charset_error, if_sp_);
658
* Set up to read source code from a memory buffer
660
void CTcTokenizer::set_source_buf(const char *buf)
664
/* empty out the input line buffer */
667
/* reset the scanning state to the start of a brand new stream */
670
unsplicebuf_.clear_text();
672
comment_in_embedding_ = FALSE;
673
macro_in_embedding_ = FALSE;
674
main_in_embedding_ = FALSE;
677
nxttok_valid_ = FALSE;
679
/* set up at the beginning of the input line buffer */
680
start_new_line(&linebuf_, 0);
682
/* create a reader for the memory buffer */
683
src = new CTcSrcMemory(buf, default_mapper_);
686
* Create a stream to read the source file. The new stream has no
687
* parent, because this is the top-level source file, and was not
688
* included from any other file.
690
str_ = new CTcTokStream(0, src, 0, 0, if_sp_);
693
/* ------------------------------------------------------------------------ */
695
* Stuff text into the source stream.
697
void CTcTokenizer::stuff_text(const char *txt, size_t len, int expand)
702
/* if desired, expand macros */
705
/* expand macros in the text, storing the result in 'expbuf' */
706
expand_macros(&expbuf, txt, len);
708
/* use the expanded version as the stuffed text now */
709
txt = expbuf.get_text();
710
len = expbuf.get_text_len();
713
/* get the current p_ offset */
714
p_ofs = p_.getptr() - curbuf_->get_text();
716
/* insert the text into the buffer */
717
curbuf_->insert(p_ofs, txt, len);
719
/* reset p_ in case the curbuf_ buffer was reallocated for expansion */
720
start_new_line(curbuf_, p_ofs);
723
/* ------------------------------------------------------------------------ */
725
* Find or create a file descriptor for a given filename
727
CTcTokFileDesc *CTcTokenizer::get_file_desc(const char *fname,
730
const char *orig_fname,
731
size_t orig_fname_len)
733
CTcTokFileDesc *orig_desc;
734
CTcTokFileDesc *desc;
736
/* presume we won't find an original descriptor in the list */
740
* Search the list of existing descriptors to find one that matches.
741
* Do this regardless of whether we're allowed to re-use an existing
742
* one or not - even if we're creating a new one unconditionaly, we
743
* need to know if there's an earlier copy that already exists so we
744
* can associate the new one with the original.
746
for (desc = desc_head_ ; desc != 0 ; desc = desc->get_next())
748
/* check for a name match */
749
if (strlen(desc->get_fname()) == fname_len
750
&& memcmp(desc->get_fname(), fname, fname_len) == 0)
753
* if we're allowed to return an existing descriptor, return
754
* this one, since it's for the same filename
760
* we have to create a new descriptor even though we have an
761
* existing one - remember the original so we can point the
762
* new one back to the original
767
* no need to look any further - we've found the first
768
* instance of this filename in our list
774
/* we didn't find a match - create a new descriptor */
775
desc = new CTcTokFileDesc(fname, fname_len, next_filedesc_id_++,
776
orig_desc, orig_fname, orig_fname_len);
778
/* link it in at the end of the master list */
783
desc_tail_->set_next(desc);
786
/* expand our array index if necessary */
787
if (desc_list_cnt_ >= desc_list_alo_)
791
/* allocate or expand the array */
792
desc_list_alo_ += 10;
793
siz = desc_list_alo_ * sizeof(desc_list_[0]);
795
desc_list_ = (CTcTokFileDesc **)t3malloc(siz);
797
desc_list_ = (CTcTokFileDesc **)t3realloc(desc_list_, siz);
800
/* add the new array entry */
801
desc_list_[desc_list_cnt_++] = desc;
808
/* ------------------------------------------------------------------------ */
810
* Add an include path entry. Each new entry goes at the end of the
811
* list, after all previous entries.
813
void CTcTokenizer::add_inc_path(const char *path)
815
tctok_incpath_t *entry;
817
/* create a new path list entry */
818
entry = (tctok_incpath_t *)t3malloc(sizeof(tctok_incpath_t)
821
/* store the path in the entry */
822
strcpy(entry->path, path);
824
/* link this entry at the end of our list */
825
if (incpath_tail_ != 0)
826
incpath_tail_->nxt = entry;
828
incpath_head_ = entry;
829
incpath_tail_ = entry;
834
/* ------------------------------------------------------------------------ */
836
* Set the string capture file.
838
void CTcTokenizer::set_string_capture(osfildef *fp)
840
/* remember the capture file */
844
* if we don't already have a character mapping to translate from
845
* our internal unicode characters back into the source file
846
* character set, create one now
848
if (string_fp_map_ == 0)
850
/* try creating a mapping for the default character set */
851
if (default_charset_ != 0)
853
CCharmapToLocal::load(res_loader_, default_charset_);
855
/* if we couldn't create the mapping, use a default ASCII mapping */
856
if (string_fp_map_ == 0)
857
string_fp_map_ = CCharmapToLocal::load(res_loader_, "us-ascii");
862
/* ------------------------------------------------------------------------ */
864
* Get the next token in the input stream, reading additional lines from
865
* the source file as needed.
867
tc_toktyp_t CTcTokenizer::next()
869
/* the current token is about to become the previous token */
872
/* if there's an un-got token, return it */
875
/* get the previously-saved token */
878
/* we've now consumed nxttok_ */
879
nxttok_valid_ = FALSE;
881
/* return the new token's type */
882
return curtok_.gettyp();
885
/* if there's an external source, get its next token */
888
const CTcToken *ext_tok;
890
/* get the next token from the external source */
891
ext_tok = ext_src_->get_next_token();
893
/* check to see if we got a token */
897
* restore the current token in effect before this source was
900
curtok_ = *ext_src_->get_enclosing_curtok();
903
* this source has no more tokens - restore the enclosing
904
* source, and keep going so we try getting a token from it
906
ext_src_ = ext_src_->get_enclosing_source();
908
/* return the token type */
909
return curtok_.gettyp();
913
/* we got a token - copy it to our internal token buffer */
916
/* return its type */
917
return curtok_.gettyp();
921
/* keep going until we get a valid token */
927
* read the next token from the current line, applying
928
* appropriate string translations and storing strings and
929
* symbols in the source block list
931
typ = next_on_line_xlat_keep();
933
/* if it's the "null" token, skip it and read another token */
934
if (typ == TOKT_NULLTOK)
937
/* if we found a valid token, we're done - return the token */
942
* if we're at the end of a preprocess line, don't read another
943
* line - just return end of file
945
if (p_.getch() == TOK_END_PP_LINE)
949
* we've reached the end of the line - read another line,
950
* applying preprocessing directives and expanding macros as
955
/* no more lines are available - return end of file */
961
/* ------------------------------------------------------------------------ */
963
* clear external token sources, returning to the true input stream
965
void CTcTokenizer::clear_external_sources()
968
* restore the current token as it was before the outermost external
969
* source was first established
973
CTcTokenSource *outer;
975
/* find the outermost source */
976
for (outer = ext_src_ ; outer->get_enclosing_source() != 0 ;
977
outer = ext_src_->get_enclosing_source()) ;
979
/* restore its original next token */
980
curtok_ = *ext_src_->get_enclosing_curtok();
983
/* there's no external source now */
987
/* ------------------------------------------------------------------------ */
989
* Make a safely storable copy of the current token.
991
const CTcToken *CTcTokenizer::copycur()
993
/* if the current token is a symbol, it already has a safe copy */
994
if (curtok_.gettyp() == TOKT_SYM)
997
/* save the current token's text in permanent tokenizer memory */
998
curtok_.set_text(store_source(curtok_.get_text(), curtok_.get_text_len()),
999
curtok_.get_text_len());
1001
/* return the current token, now that we've made it safe */
1006
* Make a safely storable copy of a given token.
1008
void CTcTokenizer::copytok(CTcToken *dst, const CTcToken *src)
1010
/* start with an exact copy of the token */
1013
/* if the token is a symbol, it already has a safe copy */
1014
if (src->gettyp() == TOKT_SYM)
1017
/* save the token's text in permanent tokenizer memory */
1018
dst->set_text(store_source(dst->get_text(), dst->get_text_len()),
1019
dst->get_text_len());
1023
/* ------------------------------------------------------------------------ */
1025
* Check to see if the current token matches the given text
1027
int CTcTokenizer::cur_tok_matches(const char *txt, size_t len)
1029
/* if the length matches, and the text matches exactly, it matches */
1030
return (getcur()->get_text_len() == len
1031
&& memcmp(getcur()->get_text(), txt, len) == 0);
1034
/* ------------------------------------------------------------------------ */
1036
* Un-get the current token
1038
void CTcTokenizer::unget()
1041
* remember the current token as the next one to fetch, and flag
1042
* that this is valid
1045
nxttok_valid_ = TRUE;
1047
/* go back to the previous token */
1051
/* ------------------------------------------------------------------------ */
1053
* Assume that we should have just found a '>>' terminating an embedded
1054
* expression in a double-quoted string. If possible, back out the
1055
* previous token and re-scan it as though it had started with '>>'.
1057
* This is to be called by a higher-level parser when it determines
1058
* that, syntactically, we should have found the '>>' leaving an
1059
* embedded expression.
1061
void CTcTokenizer::assume_missing_dstr_cont()
1063
/* act as though we had just seen '>>' */
1064
xlat_string_to_src(&main_in_embedding_, TRUE);
1068
/* ------------------------------------------------------------------------ */
1070
* Skip whitespace and macro expansion markers
1072
void CTcTokenizer::skip_ws_and_markers(utf8_ptr *p)
1074
/* keep going until we find something interesting */
1079
/* get the current character */
1083
* if it's a macro expansion end marker, skip it as though it
1084
* were whitespace; otherwise, if it's whitespace, skip it;
1085
* otherwise, we're done skipping leading whitespace
1087
if (cur == TOK_MACRO_EXP_END)
1089
/* skip the embedded pointer value that follows */
1090
p->set(p->getptr() + 1 + sizeof(CTcHashEntryPp *));
1092
else if (is_space(cur))
1094
/* skip the space */
1099
/* it's not whitespace or equivalent - we're done */
1105
/* ------------------------------------------------------------------------ */
1107
* Get the next token from the input stream, operating on the current
1110
tc_toktyp_t CTcTokenizer::next_on_line(utf8_ptr *p, CTcToken *tok,
1111
int *in_embedding, int expanding)
1118
/* skip whitespace */
1119
skip_ws_and_markers(p);
1121
/* remember where the token starts */
1124
/* if there's nothing left in the current line, return EOF */
1125
if (p->getch() == '\0')
1127
/* indicate end of file */
1132
/* get the initial character, and skip it */
1136
/* presume the token will not be marked as fully macro-expanded */
1137
tok->set_fully_expanded(FALSE);
1139
/* presume it's not a number with a minus sign */
1142
/* see what we have */
1145
case TOK_MACRO_FORMAL_FLAG:
1147
* this is a two-byte formal parameter sequence in a macro
1148
* expansion - skip the second byte of the two-byte sequence,
1149
* and return the special token type for this sequence
1151
typ = TOKT_MACRO_FORMAL;
1154
* skip the second byte - note that we want to skip exactly one
1155
* byte, regardless of what the byte looks like as a utf-8
1156
* partial character, since it's not a utf-8 character at all
1158
p->set(p->getptr() + 1);
1161
case TOK_MACRO_FOREACH_FLAG:
1163
* this is the special macro '#foreach' flag - return it as a
1164
* special pseudo-token
1166
typ = TOKT_MACRO_FOREACH;
1169
case TOK_MACRO_IFEMPTY_FLAG:
1170
/* #ifempty macro flag */
1171
typ = TOKT_MACRO_IFEMPTY;
1174
case TOK_MACRO_IFNEMPTY_FLAG:
1175
/* #ifnempty macro flag */
1176
typ = TOKT_MACRO_IFNEMPTY;
1179
case TOK_MACRO_ARGCOUNT_FLAG:
1180
/* it's the special macro '#argcount' flag */
1181
typ = TOKT_MACRO_ARGCOUNT;
1184
case TOK_FULLY_EXPANDED_FLAG:
1185
/* set the token flag indicating that it has been fully expanded */
1186
tok->set_fully_expanded(TRUE);
1188
/* the token symbol starts at the byte after the flag byte */
1189
start = p->getptr();
1191
/* read the first character of the symbol */
1195
/* tokenize the symbol that follows */
1196
goto tokenize_symbol;
1198
case TOK_END_PP_LINE:
1200
* Preprocess line-ending marker - when we reach the end of a
1201
* preprocessor line, we can't read another source line, because
1202
* a preprocessor directive consists of only a single logical
1203
* source line. Once we see this, return end-of-file until the
1204
* caller explicitly reads a new source line.
1206
* Keep the read pointer stuck on this flag byte, so that we
1207
* return end-of-file on a subsequent attempt to get the next
1228
* Start out with the leading digit in the accumulator. Note
1229
* that the character set internally is always UTF-8.
1231
acc = value_of_digit(cur);
1234
* If it's a leading zero, treat as octal or hex. '0x' means
1235
* hex; otherwise, '0' means octal.
1239
/* check for hex - if it's not hex, it's octal */
1240
if (p->getch() == 'x' || p->getch() == 'X')
1246
* scan the hex number - keep going until we find
1247
* something that's not a hex digit
1251
/* get this character */
1254
/* if it's not a hex digit, stop scanning */
1255
if (!is_xdigit(cur))
1259
* Shift the accumulator and add this digit's value.
1260
* Note that we can save a test - if the character is
1261
* >= lower-case 'a', we know it's not an upper-case
1262
* letter because the lower-case letters all have
1263
* values above the upper-case letters in UTF-8
1264
* encoding (which we always use as the internal
1265
* character set). Since we already know it's a
1266
* valid hex digit (we wouldn't be here if it
1267
* weren't), we can just check to see if it's at
1268
* least lower-case 'a', and we automatically know
1269
* then whether it's in the 'a'-'f' range or the
1273
acc += value_of_xdigit(cur);
1281
/* scan octal digits */
1282
for ( ; is_odigit(p->getch()) ; p->inc())
1283
acc = 8*acc + value_of_odigit(p->getch());
1286
* If we stopped on a digit outside of the octal range,
1287
* consume any remaining digits, and flag it as an
1288
* error. Leaving subsequent decimal digits as a
1289
* separate token tends to be confusing, since in most
1290
* cases the inclusion of decimal digits means that the
1291
* user didn't really intend this to be an octal number
1292
* after all. For instance, the leading zero might be
1293
* there for formatting reasons, and the user simply
1294
* forgot to take into account that it triggers octal
1297
if (is_digit(p->getch()))
1299
/* skip subsequent digits */
1300
for (p->inc() ; is_digit(p->getch()) ; p->inc()) ;
1302
/* flag the error */
1304
log_error(TCERR_DECIMAL_IN_OCTAL,
1305
p->getptr() - start.getptr(),
1312
/* scan decimal digits */
1313
for ( ; is_digit(p->getch()) ; p->inc())
1314
acc = 10*acc + value_of_digit(p->getch());
1317
/* negate the value if we had a minus sign */
1322
* if we stopped at a decimal point or an exponent, it's a
1323
* floating point number
1325
if (p->getch() == '.' || p->getch() == 'e' || p->getch() == 'E')
1328
/* it's an integer value */
1331
/* set the integer value */
1332
tok->set_int_val(acc);
1340
/* start over and parse the float */
1341
for (*p = start, found_decpt = FALSE ; ; p->inc())
1343
/* get this character and move on */
1346
/* see what we have */
1349
/* we have another digit; just keep going */
1351
else if (!found_decpt && cur == '.')
1353
/* it's the decimal point - note it and keep going */
1356
else if (cur == 'e' || cur == 'E')
1360
/* it might not be an exponent - look ahead to find out */
1364
/* if we have a sign, skip it */
1365
if ((cur = p2.getch()) == '-' || cur == '+')
1368
/* we need at least one digit to make an exponent */
1369
if (!is_digit(p2.getch()))
1373
while (is_digit(p2.getch()))
1376
/* advance to the end of the exponent */
1379
/* the end of the exponent is the end of the number */
1384
/* everything else ends the number */
1397
return tokenize_string(p, tok, in_embedding);
1412
/* check for '...' and floating-point numbers */
1413
if (p->getch() == '.' && p->getch_at(1) == '.')
1417
typ = TOKT_ELLIPSIS;
1419
else if (is_digit(p->getch()))
1442
/* check for '==' */
1443
if (p->getch() == '=')
1453
/* check for '::' */
1454
if (p->getch() == ':')
1457
typ = TOKT_COLONCOLON;
1464
typ = TOKT_QUESTION;
1468
/* check for '++' and '+=' */
1469
if (p->getch() == '+')
1474
else if (p->getch() == '=')
1484
/* check for '--', '->' and '-=' */
1485
if (p->getch() == '-')
1490
else if (p->getch() == '=')
1495
else if (p->getch() == '>')
1505
/* check for '*=' */
1506
if (p->getch() == '=')
1516
/* check for '/=' */
1517
if (p->getch() == '=')
1527
/* check for '%=' */
1528
if (p->getch() == '=')
1538
/* check for '>>=', '>>' and '>=' */
1539
if (p->getch() == '=')
1544
else if (p->getch() == '>')
1546
/* check for the end of an embedded expression */
1547
if (in_embedding != 0 && *in_embedding)
1550
return tokenize_string(p, tok, in_embedding);
1553
/* check for '>>=' */
1555
if (p->getch() == '=')
1568
/* check for '<<=', '<<', '<>', and '<=' */
1569
if (p->getch() == '=')
1574
else if (p->getch() == '<')
1576
/* check for '<<=' */
1578
if (p->getch() == '=')
1587
else if (p->getch() == '>')
1589
/* '<>' is obsolete */
1591
log_error(TCERR_LTGT_OBSOLETE);
1593
/* ... but for now proceed as though it's != */
1607
/* check for '&&' and '&=' */
1608
if (p->getch() == '&')
1613
else if (p->getch() == '=')
1623
/* check for '||' and '|=' */
1624
if (p->getch() == '|')
1629
else if (p->getch() == '=')
1639
/* check for '^=' */
1640
if (p->getch() == '=')
1650
/* check for '!=' */
1651
if (p->getch() == '=')
1669
/* check for '##' and '#@' */
1670
if (p->getch() == '#')
1673
typ = TOKT_POUNDPOUND;
1675
else if (p->getch() == '@')
1685
/* check to see if it's a symbol */
1686
if (is_syminit(cur))
1688
size_t len, full_len;
1691
* scan the identifier (note that we've already skipped the
1692
* first character, so we start out at length = 1)
1695
for (len = full_len = 1 ; is_sym(p->getch()) ; p->inc())
1697
/* count the full length */
1701
* count this character if we're not over the maximum
1704
if (len < TOK_SYM_MAX_LEN)
1708
/* if we truncated the symbol, issue a warning */
1709
if (full_len != len && !expanding)
1710
log_warning(TCERR_SYMBOL_TRUNCATED,
1711
(int)full_len, start.getptr(),
1712
(int)len, start.getptr());
1730
tok->set_text(start.getptr(), p->getptr() - start.getptr());
1732
/* return the type */
1737
* get the next token, limiting to the length of the source buffer
1739
tc_toktyp_t CTcTokenizer::next_on_line(const CTcTokString *srcbuf,
1740
utf8_ptr *p, CTcToken *tok,
1741
int *in_embedding, int expanding)
1743
/* get the next token */
1744
next_on_line(p, tok, in_embedding, expanding);
1746
/* if the token is past the end of the line, return EOF */
1747
if (tok->get_text() >= srcbuf->get_text_end())
1749
/* set the token to indicate end of line */
1750
tok->settyp(TOKT_EOF);
1752
/* set the token to point to the end of the buffer */
1753
tok->set_text(srcbuf->get_text_end(), 0);
1756
/* return the token type */
1757
return tok->gettyp();
1761
* Get the next token on the line, translating escapes in strings. This
1762
* updates the line buffer in-place to incorporate the translated string
1765
tc_toktyp_t CTcTokenizer::next_on_line_xlat(utf8_ptr *p, CTcToken *tok,
1768
/* skip whitespace */
1769
skip_ws_and_markers(p);
1771
/* if this is a string, translate escapes */
1776
/* translate the string */
1777
return xlat_string(p, tok, in_embedding);
1780
/* if we're in an embedding, check for '>>' */
1781
if (in_embedding != 0 && *in_embedding && p->getch_at(1) == '>')
1782
return tokenize_string(p, tok, in_embedding);
1784
/* use the default case */
1789
/* for anything else, use the default tokenizer */
1790
return next_on_line(p, tok, in_embedding, FALSE);
1797
int CTcTokenizer::look_up_keyword(const CTcToken *tok, tc_toktyp_t *kwtok)
1801
/* look it up in the keyword table */
1802
kw = (CTcHashEntryKw *)kw_->find(tok->get_text(), tok->get_text_len());
1805
/* we found the keyword - set 'kw' to the keyword token id */
1806
*kwtok = kw->get_tok_id();
1808
/* tell the caller we found it */
1813
/* tell the caller it's not a keyword */
1819
* Get the next token on the line, translating escape sequences in
1820
* strings, and storing strings and symbols in the source block list.
1821
* This routine also translates keywords for token types.
1823
tc_toktyp_t CTcTokenizer::next_on_line_xlat_keep()
1827
/* keep going until we find a valid symbol */
1830
/* skip whitespace and macro expansion flags */
1831
skip_ws_and_markers(&p_);
1833
/* see what we have */
1838
/* it's a string - translate and save it */
1839
return xlat_string_to_src(&main_in_embedding_, FALSE);
1842
/* if we're in an embedding, this is the end of it */
1843
if (main_in_embedding_ && p_.getch_at(1) == '>')
1844
return xlat_string_to_src(&main_in_embedding_, FALSE);
1846
/* use the normal parsing */
1851
/* for anything else, use the default tokenizer */
1852
typ = next_on_line(&p_, &curtok_, &main_in_embedding_, FALSE);
1854
/* check the token type */
1863
/* look it up in the keyword table */
1864
kw = (CTcHashEntryKw *)kw_->find(curtok_.get_text(),
1865
curtok_.get_text_len());
1868
/* replace the token with the keyword token type */
1869
typ = kw->get_tok_id();
1870
curtok_.settyp(typ);
1874
/* ordinary symbol - save the text */
1875
p = store_source(curtok_.get_text(),
1876
curtok_.get_text_len());
1879
* change the token's text to point to the
1880
* source block, so that this token's text
1881
* pointer will remain permanently valid (the
1882
* original copy, in the source line buffer,
1883
* will be overwritten as soon as we read
1884
* another source line; we don't want the caller
1885
* to have to worry about this, so we return the
1888
curtok_.set_text(p, curtok_.get_text_len());
1894
/* floating-point number */
1899
* save the text so that it remains permanently
1900
* valid - we keep track of floats by the original
1901
* text, and let the code generator produce the
1902
* appropriate object file representation
1904
p = store_source(curtok_.get_text(),
1905
curtok_.get_text_len());
1906
curtok_.set_text(p, curtok_.get_text_len());
1912
* check for unmappable characters - these will show up as
1913
* Unicode U+FFFD, the "replacement character"; log it as
1914
* 'unmappable' if applicable, otherwise as an invalid
1917
if (utf8_ptr::s_getch(curtok_.get_text()) == 0xfffd)
1918
log_error_curtok(TCERR_UNMAPPABLE_CHAR);
1920
log_error_curtok(TCERR_INVALID_CHAR);
1922
/* skip this character */
1933
/* return the type */
1940
* Translate the string at the current token position in the input
1941
* stream to the source block list.
1943
tc_toktyp_t CTcTokenizer::xlat_string_to_src(int *in_embedding,
1944
int force_embed_end)
1949
* Reserve space for the entire rest of the line. This is
1950
* conservative, in that we will definitely need less space than
1951
* this. This might cause us to waste a little space here and
1952
* there, since we will over-allocate when we have a short string
1953
* early in a long line, but this will save us the time of scanning
1954
* the string twice just to see how long it is.
1956
reserve_source(curbuf_->get_text_len() -
1957
(p_.getptr() - curbuf_->get_text()));
1959
/* translate into the source block */
1960
typ = xlat_string_to(src_ptr_, &p_, &curtok_,
1961
in_embedding, force_embed_end);
1963
/* commit the space in the source block */
1964
commit_source(curtok_.get_text_len() + 1);
1966
/* return the string token */
1971
* Translate a string, setting up the token structure for the string,
1972
* and writing the translated version of the string directly over the
1973
* original source buffer of the string.
1975
* Since a translated string can only shrink (because a translated
1976
* escape sequence is always shorter than the original source version),
1977
* we don't need a separate buffer, but can simply translate into the
1978
* source buffer, overwriting the original string as we go.
1980
tc_toktyp_t CTcTokenizer::xlat_string(utf8_ptr *p, CTcToken *tok,
1986
* write the translated string over the original string's text,
1987
* starting at the character after the quote
1989
dst = p->getptr() + 1;
1991
/* translate the string into our destination buffer */
1992
return xlat_string_to(dst, p, tok, in_embedding, FALSE);
1996
* Translate a string, setting up the token structure for the string.
1997
* We will update the line buffer in-place to incorporate the translated
2000
tc_toktyp_t CTcTokenizer::xlat_string_to(char *dstp, utf8_ptr *p,
2001
CTcToken *tok, int *in_embedding,
2002
int force_embed_end)
2006
utf8_ptr start, end;
2009
/* set up our output utf8 pointer */
2012
/* note the open quote character */
2015
/* set the appropriate string token type */
2016
tok->settyp(qu == '"'
2018
: (qu == '>' ? TOKT_DSTR_END : TOKT_SSTR));
2020
/* skip the open quote */
2023
/* skip the second '>' if it's a '>>' */
2024
if (force_embed_end)
2027
* they want us to assume the embedding ends here, regardless of
2028
* what we're looking at - act the same as though we had
2029
* actually seen '>>', but don't skip any input (in fact, back
2030
* up one, since we already skipped one character for what we
2031
* had thought was the open quote
2035
/* clear the caller's in-embedding status */
2036
*in_embedding = FALSE;
2038
/* close with a double quote */
2041
/* it's a double-quoted string continuation */
2042
tok->settyp(TOKT_DSTR_END);
2046
/* skip the second '>' */
2049
/* clear the caller's in-embedding status */
2050
*in_embedding = FALSE;
2052
/* close with a double quote */
2056
/* remember where the string's contents start */
2059
/* scan the string and translate quotes */
2064
/* get this character */
2067
/* if this is the matching quote, we're done */
2072
* if we find an end-of-line within the string, it's an error -
2073
* we should always splice strings together onto a single line
2074
* before starting to tokenize the line
2081
/* note where the string ends */
2084
/* set the token's text pointer */
2085
tok->set_text(dstp, end.getptr() - dstp);
2087
/* null-terminate the result string */
2091
* get the length of the unterminated string so far, but for
2092
* error logging, limit the length to twenty characters --
2093
* we just want to give the user enough information to find
2094
* the string in error, without making the error message
2098
len = p.len(end.getptr() - dstp);
2100
len = p.bytelen(20);
2103
* Check for a special heuristic case. If the string was of
2104
* zero length, and we have something sitting in our
2105
* unsplice buffer, here's what probably happened: the input
2106
* was missing a ">>" sequence at the end of an embedded
2107
* expression, and the parser told us to put it back in. We
2108
* had earlier decided we needed to splice up to a quote to
2109
* end what looked to us like an unterminated string. If
2110
* this is the case, we and the parser are working at cross
2111
* purposes; the parser is smarter than we are, so we should
2112
* synchronize with it.
2114
if (tok->get_text_len() == 0
2116
&& unsplicebuf_.get_text_len() != 0)
2121
* we must have spliced a line to finish a string -
2122
* insert the quote into the splice buffer, and ignore
2127
* make sure there's room for one more character (plus a
2130
unsplicebuf_.ensure_space(unsplicebuf_.get_text_len() + 2);
2132
/* get the buffer pointer */
2133
buf = unsplicebuf_.get_buf();
2135
/* make room for the '"' */
2136
memmove(buf + 1, buf, unsplicebuf_.get_text_len());
2137
unsplicebuf_.set_text_len(unsplicebuf_.get_text_len() + 1);
2143
* return the 'null token' to tell the caller to try
2144
* again - do not log an error at this point
2146
return TOKT_NULLTOK;
2150
log_error(TCERR_UNTERM_STRING,
2151
(char)qu, (int)len, dstp, (char)qu);
2153
/* return the string type */
2154
return tok->gettyp();
2157
/* if this is an escape, translate it */
2162
/* get the character after the escape */
2166
/* see what we have */
2175
/* miniscules - 0x000E */
2180
/* blank line - 0x000B */
2185
/* quoted space - 0x0015 */
2190
/* newline - explicitly use Unicode 10 character */
2195
/* tab - explicitly use Unicode 9 character */
2201
* Hex unicode character number. Read up to 4 hex
2202
* digits that follow the 'u', and use that as a Unicode
2205
for (i = 0, acc = 0, p->inc() ; i < 4 ; ++i, p->inc())
2207
/* get the next character */
2211
* if it's another hex digit, add it into the
2212
* accumulator; otherwise, we're done
2215
acc = 16*acc + value_of_xdigit(cur);
2220
/* use the accumulated value as the character number */
2221
dst.setch((wchar_t)acc);
2224
* continue with the current character, since we've
2225
* already skipped ahead to the next one
2238
* Octal ASCII character number. Accumulate up to three
2239
* octal numbers, and use the result as a character ID.
2241
for (i = 0, acc = 0 ; i < 3 ; ++i, p->inc())
2243
/* get the next character */
2247
* if it's another digit, and it would leave our
2248
* result in the 0-255 range, count it; if not,
2255
/* compute the new value */
2256
new_acc = 8*acc + value_of_odigit(cur);
2258
/* if this would be too high, don't count it */
2268
/* use the accumulated value as the character number */
2269
dst.setch((wchar_t)acc);
2272
* continue with the current character, since we've
2273
* already skipped ahead to the next one
2279
* Hex ASCII character number. Read up to two hex
2280
* digits as a character number.
2282
for (i = 0, acc = 0, p->inc() ; i < 2 ; ++i, p->inc())
2284
/* get the next character */
2288
* if it's another hex digit, add it into the
2289
* accumulator; otherwise, we're done
2292
acc = 16*acc + value_of_xdigit(cur);
2297
/* use the accumulated value as the character number */
2298
dst.setch((wchar_t)acc);
2301
* continue with the current character, since we've
2302
* already skipped ahead to the next one
2307
/* copy anything else as-is */
2311
else if (in_embedding != 0 && !*in_embedding
2312
&& cur == '<' && p->getch_at(1) == '<')
2315
* it's the start of an embedded expression - change the
2316
* type to so indicate
2318
tok->settyp(tok->gettyp() == TOKT_DSTR
2319
? TOKT_DSTR_START : TOKT_DSTR_MID);
2321
/* tell the caller we're in an embedding */
2322
*in_embedding = TRUE;
2328
/* copy this character to the output position */
2331
/* get the next character */
2335
/* note where the string ends */
2338
/* set the token's text pointer */
2339
tok->set_text(dstp, end.getptr() - dstp);
2341
/* null-terminate the result string */
2344
/* skip an extra character if this is the start of an embedding */
2345
if (p->getch() == '<')
2348
/* skip the closing quote */
2351
/* return the string type */
2352
return tok->gettyp();
2357
* Skip a string, setting up the token structure for the string. This
2358
* routine only parses to the end of the line; if the line ends with the
2359
* string unterminated, we'll flag an error
2361
tc_toktyp_t CTcTokenizer::tokenize_string(utf8_ptr *p, CTcToken *tok,
2365
const char *contents_start;
2366
const char *contents_end;
2369
int allow_embedding;
2371
/* remember where the text starts */
2372
start = p->getptr();
2374
/* note the quote type */
2377
/* skip the quote in the input */
2380
/* determine the token type based on the quote type */
2384
/* single-quoted string */
2386
allow_embedding = FALSE;
2391
* this must be the next part of a string with embeddings; for now,
2392
* assume it's the end of the string, although it may just turn out
2395
typ = TOKT_DSTR_END;
2396
allow_embedding = (in_embedding != 0);
2398
/* skip the extra '>' character */
2401
/* clear the embedding flag */
2402
if (in_embedding != 0)
2403
*in_embedding = FALSE;
2405
/* look for a closing double quote */
2410
/* regular double-quoted string */
2412
allow_embedding = (in_embedding != 0);
2416
/* anything else is invalid */
2418
allow_embedding = FALSE;
2422
/* this is where the string's contents start */
2423
contents_start = p->getptr();
2425
/* scan the string */
2430
/* get the current character */
2433
/* see what we have */
2436
/* escape sequence - skip an extra character */
2439
else if (cur == '<' && allow_embedding && p->getch_at(1) == '<')
2442
* it's the start of an embedded expression - return the
2443
* appropriate embedded string part type
2445
if (typ == TOKT_DSTR)
2446
typ = TOKT_DSTR_START;
2448
typ = TOKT_DSTR_MID;
2450
/* remember that we're in an embedding in the token stream */
2451
*in_embedding = TRUE;
2453
/* this is where the contents end */
2454
contents_end = p->getptr();
2456
/* skip the two embedding characters */
2460
/* we're done - set the text in the token */
2461
tok->set_text(start, p->getptr() - start);
2468
/* this is where the contents end */
2469
contents_end = p->getptr();
2471
/* skip the closing quote */
2474
/* we're done - set the text in the token */
2475
tok->set_text(start, p->getptr() - start);
2480
else if (cur == '\0')
2482
/* this is where the contents end */
2483
contents_end = p->getptr();
2486
* We have an unterminated string. If we're evaluating a
2487
* preprocessor constant expression, log an error; otherwise
2488
* let it go for now, since we'll catch the error during the
2489
* normal tokenizing pass for parsing.
2491
if (G_tok->in_pp_expr_)
2492
log_error(TCERR_PP_UNTERM_STRING);
2494
/* set the partial text */
2495
tok->set_text(start, p->getptr() - start);
2497
/* end of line - return with the string unfinished */
2501
/* skip this charater of input */
2506
* if we're not in preprocessor mode, and we're saving string text,
2507
* write the string to the string text output file
2509
if (!G_tok->in_pp_expr_ && G_tok->string_fp_ != 0
2510
&& contents_start != contents_end)
2512
/* write the line, translating back to the source character set */
2513
G_tok->string_fp_map_
2514
->write_file(G_tok->string_fp_, contents_start,
2515
(size_t)(contents_end - contents_start));
2518
osfwb(G_tok->string_fp_, "\n", 1);
2521
/* set the type in the token */
2524
/* return the token type */
2525
return tok->gettyp();
2529
/* ------------------------------------------------------------------------ */
2531
* Read a source line and handle preprocessor directives. This routine
2532
* will transparently handle #include, #define, and other directives;
2533
* when this routine returns, the input buffer will have a line of text
2534
* that contains no # directive.
2536
* Returns zero on success, non-zero upon reaching the end of the input.
2538
int CTcTokenizer::read_line_pp()
2540
int started_in_string;
2544
* Read the next line from the input. If that fails, return an end
2545
* of file indication.
2547
ofs = read_line(FALSE);
2552
* before we process comments, note whether or not the line started
2553
* out within a character string
2555
started_in_string = (in_quote_ != '\0');
2557
/* set up our source pointer to the start of the new line */
2558
start_new_line(&linebuf_, ofs);
2560
/* skip leading whitespace */
2561
while (is_space(p_.getch()))
2565
* If this line begins with a '#', process the directive. Ignore
2566
* any initial '#' if the line started off in a string.
2568
if (!started_in_string && p_.getch() == '#' && allow_pp_)
2573
int process_in_false_if;
2574
void (CTcTokenizer::*func)();
2576
static pp_kw_def kwlist[] =
2578
{ "charset", FALSE, &CTcTokenizer::pp_charset },
2579
{ "pragma", FALSE, &CTcTokenizer::pp_pragma },
2580
{ "include", FALSE, &CTcTokenizer::pp_include },
2581
{ "define", FALSE, &CTcTokenizer::pp_define },
2582
{ "if", TRUE, &CTcTokenizer::pp_if },
2583
{ "ifdef", TRUE, &CTcTokenizer::pp_ifdef },
2584
{ "ifndef", TRUE, &CTcTokenizer::pp_ifndef },
2585
{ "else", TRUE, &CTcTokenizer::pp_else },
2586
{ "elif", TRUE, &CTcTokenizer::pp_elif },
2587
{ "endif", TRUE, &CTcTokenizer::pp_endif },
2588
{ "error", FALSE, &CTcTokenizer::pp_error },
2589
{ "undef", FALSE, &CTcTokenizer::pp_undef },
2590
{ "line", FALSE, &CTcTokenizer::pp_line },
2601
* If the line ended inside a comment, read the next line until
2602
* we're no longer in a comment. The ANSI C preprocessor rules
2603
* say that a newline in a comment should not be treated as a
2604
* lexical newline, so pretend that the next line is part of the
2605
* preprocessor line in such a case.
2607
while (str_->is_in_comment())
2611
/* remember the current offset in the line buffer */
2612
p_ofs = p_.getptr() - linebuf_.get_buf();
2614
/* append another line - stop at the end of the stream */
2615
if (read_line(TRUE) == -1)
2618
/* restore the line pointer, in case the buffer moved */
2619
start_new_line(&linebuf_, p_ofs);
2622
/* read the directive */
2626
* if we've reached the end of the line, it's a null directive;
2627
* simply return an empty line
2629
if (curtok_.gettyp() == TOKT_EOF)
2635
/* get the text and length of the keyword */
2636
kwtxt = curtok_.get_text();
2637
kwlen = curtok_.get_text_len();
2639
/* if it's not a symbol, it's not a valid directive */
2640
if (curtok_.gettyp() != TOKT_SYM)
2642
/* log the error and return an empty line */
2643
log_error(TCERR_INV_PP_DIR, (int)kwlen, kwtxt);
2648
/* determine which keyword we have, and process it */
2649
for (kwp = kwlist ; kwp->kw != 0 ; ++kwp)
2651
/* is this our keyword? */
2652
if (strlen(kwp->kw) == kwlen
2653
&& memcmp(kwtxt, kwp->kw, kwlen) == 0)
2656
* This is our directive.
2658
* If we're in the false branch of a #if block, only
2659
* process the directive if it's a kind of directive
2660
* that we should process in false #if branches. The
2661
* only directives that we process in #if branches are
2662
* those that would affect the #if branching, such as a
2663
* #endif or a nested #if.
2665
if (!in_false_if() || kwp->process_in_false_if)
2667
/* invoke the handler to process the directive */
2668
(this->*(kwp->func))();
2673
* we're in a #if branch not taken - simply clear
2679
/* we don't need to look any further */
2685
* if we didn't find the keyword, log an error and otherwise
2686
* ignore the entire line
2689
log_error(TCERR_INV_PP_DIR, (int)kwlen, kwtxt);
2692
* Preprocessor lines must always be entirely self-contained.
2693
* Therefore, it's not valid for a string to start on a
2694
* preprocessor line and continue onto subsequent lines. If
2695
* we're marked as being inside a string, there must have been
2696
* an error on the preprocessor line. Simply clear the
2697
* in-string flag; we don't need to issue an error at this
2698
* point, since the preprocessor line handler should have
2699
* already caught the problem and reported an error.
2706
* There's no preprocessor directive.
2708
* If we're in a false #if branch, return an empty line. We
2709
* return an empty line rather than skipping to the next line so
2710
* that the caller sees the same number of lines as are in the
2716
* it's a #if not taken - we don't want to compile the line
2717
* at all, so just clear it out
2720
expbuf_.clear_text();
2725
* If we ended the line in a string, splice additional lines
2726
* onto the end of this line until we find the end of the
2727
* string, then unsplice the part after the end of the
2730
if (in_quote_ != '\0')
2732
/* splice additional lines to finish the quote */
2737
* Expand macros in the line, splicing additional source
2738
* lines if necessary to fill out any incomplete actual
2741
start_new_line(&linebuf_, 0);
2742
expand_macros_curline(TRUE, FALSE, FALSE);
2745
/* store the line in the appropriate place */
2749
* we're only preprocessing - store the macro-expanded line
2750
* back in the line buffer so that the caller can read out
2751
* the final preprocessed text
2753
linebuf_.copy(expbuf_.get_text(), expbuf_.get_text_len());
2758
* We're compiling - simply read subsequent tokens out of
2759
* the expansion buffer.
2761
start_new_line(&expbuf_, 0);
2765
/* return success */
2769
/* ------------------------------------------------------------------------ */
2771
* Read the next line from the input file. Returns a pointer to the
2772
* start of the newly-read data on success, or null if we reach the end
2775
* If 'append' is true, we'll add the line on to the end of the existing
2776
* buffer; otherwise, we'll overwrite what's in the buffer.
2778
* The only preprocessing performed in this routine is line-splicing.
2779
* Any line that ends with a backslash character will be spliced with
2780
* the following line, with the backslash and newline removed.
2782
* The new line will be stored in our internal buffer, and will be
2783
* null-terminated with the trailing newline removed.
2785
* If we reach the end of the current file, and there's an enclosing
2786
* file, we'll resume reading from the enclosing file. Hence, when this
2787
* routine returns non-zero, it indicates that we've reached the end of
2788
* the entire source, not just of the current file.
2790
int CTcTokenizer::read_line(int append)
2795
/* if there's no input stream, indicate end-of-file */
2799
/* if we're not appending, clear out the line buffer */
2802
/* start with an empty line */
2805
/* note the current input position */
2806
last_desc_ = str_->get_desc();
2807
last_linenum_ = str_->get_next_linenum();
2810
/* note where the new data starts */
2811
len = linebuf_.get_text_len();
2815
* if there's anything in the unsplice buffer, use it as the new
2818
if (unsplicebuf_.get_text_len() != 0)
2821
* Copy the unsplice buffer as the current line. Note that we
2822
* don't have to worry about any of the complicated cases, such
2823
* as whether or not it ends with a newline or a backslash,
2824
* because the unspliced line was already processed as an input
2825
* line when we read it in the first place.
2827
linebuf_.append(unsplicebuf_.get_text(), unsplicebuf_.get_text_len());
2829
/* clear the unsplice buffer, since it's been consumed now */
2830
unsplicebuf_.clear_text();
2833
* make the current line the appended line - if we're
2834
* unsplicing, it means that we appended, so the current line is
2835
* now the line from which the last appended text came
2837
last_desc_ = appended_desc_;
2838
last_linenum_ = appended_linenum_;
2840
/* return the offset of the new text */
2844
/* if we're appending, note where the appendage is coming from */
2847
/* remember the last source line appended */
2848
appended_desc_ = str_->get_desc();
2849
appended_linenum_ = str_->get_next_linenum();
2852
/* keep going until we finish reading the input line */
2857
/* read a line of text from the input file */
2858
curlen = str_->get_src()->
2859
read_line(linebuf_.get_buf() + len,
2860
linebuf_.get_buf_size() - len);
2862
/* check for end of file */
2865
CTcTokStream *old_str;
2868
* We've reached the end of the current input stream. If
2869
* we've already read anything into the current line, it
2870
* means that the file ended in mid-line, without a final
2871
* newline character; ignore this and proceed with the line
2872
* as it now stands in this case.
2874
if (len > start_len)
2878
* We've finished with this stream. If there's a parent
2879
* stream, return to it; otherwise, we're at the end of the
2884
* if we didn't close all of the #if/#ifdef levels opened
2885
* within this file, flag one or more errors
2887
while (if_sp_ > str_->get_init_if_level())
2891
/* get the filename from the #if stack */
2892
fname = if_stack_[if_sp_ - 1].desc->get_fname();
2894
/* if we're in test reporting mode, use the root name only */
2895
if (test_report_mode_)
2896
fname = os_get_root_name((char *)fname);
2899
log_error(TCERR_IF_WITHOUT_ENDIF,
2900
if_stack_[if_sp_ - 1].linenum,
2901
(int)strlen(fname), fname);
2903
/* discard the #if level */
2907
/* remember the old stream */
2910
/* return to the parent stream, if there is one */
2911
str_ = str_->get_parent();
2913
/* delete the old stream now that we're done with it */
2916
/* note the new file the line will be coming from */
2917
if (!append && str_ != 0)
2919
last_desc_ = str_->get_desc();
2920
last_linenum_ = str_->get_next_linenum();
2923
/* if there's no stream, return end of file */
2928
* restore the #pragma newline_spacing mode that was in effect
2929
* when we interrupted the parent stream
2931
string_newline_spacing_ = str_->get_newline_spacing();
2933
/* if there's a parser, notify it of the new pragma C mode */
2934
#if 0 // #pragma C is not currently used
2936
G_prs->set_pragma_c(str_->is_pragma_c());
2939
/* go back to read the next line from the parent */
2943
/* set the new length of the buffer contents */
2945
linebuf_.set_text_len(len);
2948
* Check the result to see if it ends in a newline. If not, it
2949
* means either that we don't have room in the buffer for the
2950
* full source line, or we've reached the last line in the file,
2951
* and it doesn't end with a newline.
2953
* Note that the file reader will always supply us with '\n'
2954
* newlines, regardless of the local operating system
2957
* Also, check to see if the line ends with '\\'. If so, remove
2958
* the '\\' character and read the next line, since this
2959
* indicates that the logical line continues onto the next
2960
* newline-deliminted line.
2962
if (len != 0 && linebuf_.get_text()[len - 1] != '\n')
2965
* There's no newline, hence the file reader wasn't able to
2966
* fit the entire line into our buffer, or else we've read
2967
* the last line in the file and there's no newline at the
2968
* end. If we haven't reached the end of the file, expand
2969
* our line buffer to make room to read more from this same
2972
if (!str_->get_src()->at_eof())
2975
else if (len > 1 && linebuf_.get_text()[len - 2] == '\\')
2978
* There's a backslash at the end of the line, so they want
2979
* to continue this logical line. Remove the backslash, and
2980
* read the next line onto the end of the current line.
2982
* Note that we must remove two characters from the end of
2983
* the line (and tested for buf_[len-2] above) because we
2984
* have both a backslash and a newline at the end of the
2988
linebuf_.set_text_len(len);
2990
/* count reading the physical line */
2995
/* remove the newline from the buffer */
2999
linebuf_.set_text_len(len);
3002
/* count reading the line */
3011
* remove comments from the newly-read material - this replaces each
3012
* comment by a single whitespace character
3014
process_comments(start_len);
3017
* we've successfully read a line -- return the offset of the start of
3018
* the newly-read text
3024
* Un-splice a line at the given point. This breaks the current source
3025
* line in two, keeping the part before the given point as the current
3026
* line, but making the part from the given point to the end of the line
3027
* a new source line. We'll put the new source line into a special
3028
* holding buffer, and then fetch this part as a new line the next time
3029
* we read a line in read_line().
3031
void CTcTokenizer::unsplice_line(const char *new_line_start)
3035
/* make sure the starting point is within the current line */
3036
if (!(new_line_start >= linebuf_.get_text()
3037
&& new_line_start <= linebuf_.get_text() + linebuf_.get_text_len()))
3039
/* note the error - this is an internal problem */
3040
throw_internal_error(TCERR_UNSPLICE_NOT_CUR);
3044
/* calculate the length of the part we're keeping */
3045
keep_len = new_line_start - linebuf_.get_text();
3048
* prepend the remainder of the current line into the unsplice buffer
3049
* (we prepend it because the unsplice line is text that comes after
3050
* the current line - so anything in the current line comes before
3051
* anything already in the unsplice buffer)
3053
unsplicebuf_.prepend(new_line_start, linebuf_.get_text_len() - keep_len);
3055
/* cut off the current line at the given point */
3056
linebuf_.set_text_len(keep_len);
3060
/* ------------------------------------------------------------------------ */
3062
* Store text in the source array
3064
const char *CTcTokenizer::store_source(const char *txt, size_t len)
3066
/* reserve space for the text */
3067
reserve_source(len);
3070
const char *p = store_source_partial(txt, len);
3072
/* add a null terminator */
3073
static const char nt[1] = { '\0' };
3074
store_source_partial(nt, 1);
3076
/* return the pointer to the stored space */
3081
* Store partial source; use this AFTER reserving the necessary space. If
3082
* you want null-termination, be sure to reserve the extra byte for that
3083
* and include it in the string. This can be used to build a string piece
3084
* by piece; we simply add the text without null-terminating it.
3086
const char *CTcTokenizer::store_source_partial(const char *txt, size_t len)
3088
/* remember where the string starts */
3089
const char *p = src_ptr_;
3091
/* store the text */
3092
memcpy(src_ptr_, txt, len);
3094
/* advance the source block write position and length */
3098
/* return the storage pointer */
3103
* Reserve space for text in the source array. This always reserves the
3104
* requested amount of space, plus an extra byte for null termination.
3106
void CTcTokenizer::reserve_source(size_t len)
3109
* if we don't have enough space for this line in the current source
3110
* block, start a new block
3112
if (len + 1 > src_rem_)
3114
CTcTokSrcBlock *blk;
3117
* if the line is too long for a source block, throw a fatal
3120
if (len + 1 > TCTOK_SRC_BLOCK_SIZE)
3121
throw_fatal_error(TCERR_SRCLINE_TOO_LONG,
3122
(long)TCTOK_SRC_BLOCK_SIZE);
3124
/* allocate a new block */
3125
blk = new CTcTokSrcBlock();
3127
/* link it into our list */
3128
src_cur_->set_next(blk);
3130
/* it's now the current block */
3133
/* start writing at the start of this block */
3134
src_rem_ = TCTOK_SRC_BLOCK_SIZE;
3135
src_ptr_ = blk->get_buf();
3140
* Commit space previously reserved and now used in the source block
3143
void CTcTokenizer::commit_source(size_t len)
3145
/* advance the write position past the committed text */
3151
/* ------------------------------------------------------------------------ */
3153
* Expand macros in the current line from the current source pointer,
3154
* filling in expbuf_ with the expanded result.
3156
int CTcTokenizer::expand_macros_curline(int read_more, int allow_defined,
3157
int append_to_expbuf)
3161
/* expand macros in the current line */
3162
err = expand_macros(&linebuf_, &p_, &expbuf_, read_more, allow_defined,
3165
/* if that failed, return an error */
3170
* if we're in preprocessor mode, clean up the text for human
3171
* consumption by removing our various expansion flags
3174
remove_expansion_flags(&expbuf_);
3176
/* return the result */
3180
/* ------------------------------------------------------------------------ */
3182
* Remove the special internal macro expansion flags from an expanded macro
3185
void CTcTokenizer::remove_expansion_flags(CTcTokString *buf)
3192
* Scan the expansion buffer and remove all of the no-more-expansion
3193
* flag bytes - we're done expanding the macro now, so we don't need
3194
* this information any longer. When we're writing out the
3195
* preprocessed source for human viewing, we don't want to leave these
3196
* internal markers in the expanded source.
3198
for (src = dst = buf->get_buf(), p.set(src) ; p.getch() != '\0' ; )
3200
/* if this isn't a macro flag, copy it */
3201
if (p.getch() == TOK_MACRO_EXP_END)
3203
/* skip the flag byte and the following embedded pointer */
3204
src += 1 + sizeof(CTcHashEntryPp *);
3207
else if (p.getch() == TOK_FULLY_EXPANDED_FLAG)
3209
/* skip the flag byte */
3215
/* skip this character */
3218
/* copy the bytes of this character as-is */
3219
while (src < p.getptr())
3224
/* set the new buffer length */
3225
buf->set_text_len(dst - buf->get_buf());
3228
/* ------------------------------------------------------------------------ */
3230
* Expand macros in the current line, reading additional source lines if
3233
* 'src' is a pointer to the start of the text to expand; it must point
3234
* into the 'srcbuf' buffer. If 'src' is null, we'll simply start at
3235
* the beginning of the source buffer.
3237
int CTcTokenizer::expand_macros(CTcTokString *srcbuf, utf8_ptr *src,
3238
CTcTokString *expbuf, int read_more,
3239
int allow_defined, int append)
3243
CTcTokString *subexp;
3246
CTcTokStringRef local_srcbuf;
3250
/* presume success */
3253
/* get a macro expansion resource object */
3254
res = alloc_macro_rsc();
3258
/* get our subexpression buffer from the resource object */
3259
subexp = &res->line_exp_;
3261
/* if there's no source buffer or source pointer, provide one */
3265
* there's no source buffer - provide our own non-allocated
3266
* buffer tied to the caller's buffer
3268
local_srcbuf.set_buffer(src->getptr(), strlen(src->getptr()));
3269
srcbuf = &local_srcbuf;
3274
* there's no source pointer - start at the beginning of the
3277
local_src.set((char *)srcbuf->get_text());
3281
/* clear the expansion buffer, unless we're appending to the buffer */
3283
expbuf->clear_text();
3286
* Make sure we have room for a copy of the source line. This is an
3287
* optimization for the simple case where we'll just copy the source
3288
* line unchanged, so that we don't have to repeatedly expand the
3289
* buffer; we will, however, expand the buffer dynamically later, if
3290
* this pre-allocation should prove to be insufficient.
3292
expbuf->ensure_space(expbuf->get_text_len() + srcbuf->get_text_len());
3294
/* note the starting offset, if we have an underlying string buffer */
3295
startofs = src->getptr() - srcbuf->get_text();
3297
/* read the first token */
3298
typ = next_on_line(srcbuf, src, &tok, ¯o_in_embedding_, TRUE);
3300
/* scan through the tokens on the line, looking for macros to expand */
3301
while (typ != TOKT_EOF)
3304
* if it's a symbol, and it hasn't already been marked as fully
3305
* expanded, look it up in the #define table
3307
if (typ == TOKT_SYM && !tok.get_fully_expanded())
3309
CTcHashEntryPp *entry;
3312
* Look up the symbol in the #define symbol table. If we
3313
* find it, expand the macro. Otherwise, if the "defined"
3314
* operator is active, check for that.
3316
* Do not expand the macro if we find that it has already
3317
* been expanded on a prior scan through the current text.
3319
entry = find_define(tok.get_text(), tok.get_text_len());
3321
&& !scan_for_prior_expansion(*src, srcbuf->get_text_end(),
3324
&& tok.get_text_len() == 7
3325
&& memcmp(tok.get_text(), "defined", 7) == 0))
3331
/* get the offset of the macro token in the source buffer */
3332
macro_ofs = tok.get_text() - srcbuf->get_text();
3334
/* expand it into our sub-expansion buffer */
3337
/* expand the macro */
3338
err = expand_macro(res, subexp, srcbuf, src,
3340
read_more, allow_defined, &expanded);
3344
/* parse and expand the defined() operator */
3345
err = expand_defined(subexp, srcbuf, src);
3347
/* "defined" always expands if there's not an error */
3351
/* if an error occurred, return failure */
3356
* if we expanded something, append everything we
3357
* skipped preceding the macro, then rescan; otherwise,
3358
* just keep going without a rescan
3362
/* copy the preceding text to the output */
3363
expbuf->append(srcbuf->get_text() + startofs,
3364
macro_ofs - startofs);
3369
* we didn't expand - get the next token after the
3372
typ = next_on_line(srcbuf, src, &tok,
3373
¯o_in_embedding_, TRUE);
3375
/* continue processing from this token */
3380
* We must now insert the expansion into the source
3381
* buffer at the current point, and re-scan the
3382
* expansion, *along with* the rest of the original
3383
* source line (this is how ANSI C specifies the
3386
* If we can read more, we must be reading out of the
3387
* main input line buffer, so insert the expansion text
3388
* directly into the original source stream, and
3389
* continue reading out of the source stream; this will
3390
* simplify the case where we must read more data from
3391
* the file in the course of the expansion. If we can't
3392
* read more, simply copy the remainder of the current
3393
* input line onto the expanded macro and use it as the
3397
/* get the current offset in the source line */
3398
startofs = src->getptr() - srcbuf->get_text();
3400
/* figure out how much is left on the current line */
3401
rem_len = srcbuf->get_text_len() - startofs;
3403
/* check to see if we can read more */
3407
* we're reading from the original line input buffer
3408
* -- insert the expansion into the source buffer at
3409
* the current point, replacing the original macro
3413
/* make sure we have room for adding the expansion text */
3414
srcbuf->ensure_space(macro_ofs + rem_len
3415
+ subexp->get_text_len());
3417
/* make sure src is still pointing to the right place */
3418
src->set(srcbuf->get_buf() + macro_ofs);
3420
/* move the remainder of the current line to make room */
3421
memmove(srcbuf->get_buf() + macro_ofs
3422
+ subexp->get_text_len(),
3423
srcbuf->get_buf() + startofs,
3426
/* insert the expansion text */
3427
memcpy(srcbuf->get_buf() + macro_ofs, subexp->get_buf(),
3428
subexp->get_text_len());
3430
/* set the new source length */
3431
srcbuf->set_text_len(macro_ofs + rem_len
3432
+ subexp->get_text_len());
3434
/* the new starting offset is the current position */
3435
startofs = macro_ofs;
3437
/* get the next token */
3438
typ = next_on_line(srcbuf, src, &tok,
3439
¯o_in_embedding_, TRUE);
3441
/* continue processing from this token */
3447
* we're reading from a read-only buffer -- add the
3448
* remainder of the source to the expansion buffer,
3449
* and recursively parse the remainder
3451
subexp->append(srcbuf->get_text() + startofs, rem_len);
3454
* evaluate the remainder recursively and append it
3455
* to the expansion already in progress
3457
err = expand_macros(subexp, 0, expbuf, FALSE,
3458
allow_defined, TRUE);
3466
/* get the next token */
3467
typ = next_on_line(srcbuf, src, &tok, ¯o_in_embedding_, TRUE);
3470
/* add the remainder of the source to the output */
3471
expbuf->append(srcbuf->get_text() + startofs,
3472
tok.get_text() - startofs - srcbuf->get_text());
3475
/* release our macro resource object */
3476
release_macro_rsc(res);
3478
/* return the result */
3483
* Allocate a macro resource object. If we're out of resource objects
3484
* in the pool, we'll add another object to the pool.
3486
CTcMacroRsc *CTcTokenizer::alloc_macro_rsc()
3491
* if there's anything in the available list, take the first item
3492
* off the list and return it
3494
if (macro_res_avail_ != 0)
3496
/* remember the item to return */
3497
rsc = macro_res_avail_;
3499
/* remove it from the list */
3500
macro_res_avail_ = macro_res_avail_->next_avail_;
3506
/* there's nothing on the available list - allocate a new item */
3507
rsc = new CTcMacroRsc();
3509
/* if that failed, return failure */
3512
log_error(TCERR_OUT_OF_MEM_MAC_EXP);
3516
/* add it onto the master list */
3517
rsc->next_ = macro_res_head_;
3518
macro_res_head_ = rsc;
3525
* Release a macro resource, returning it to the pool
3527
void CTcTokenizer::release_macro_rsc(CTcMacroRsc *rsc)
3529
/* put it back at the head of the available list */
3530
rsc->next_avail_ = macro_res_avail_;
3531
macro_res_avail_ = rsc;
3535
* Scan a buffer for a prior-expansion flag for a given macro. We'll
3536
* look through the buffer for a TOK_MACRO_EXP_END byte that mentions
3537
* the given symbol table entry; we'll return true if found, false if
3538
* not. True means that the symbol has already been expanded on a prior
3539
* scan of the text, so it should not be re-expanded now.
3541
int CTcTokenizer::scan_for_prior_expansion(utf8_ptr src, const char *src_end,
3542
const CTcHashEntryPp *entry)
3544
/* scan the buffer for the expansion flag byte */
3545
while (src.getptr() < src_end)
3547
/* if this is the flag, check what follows */
3548
if (src.getch() == TOK_MACRO_EXP_END)
3550
CTcHashEntryPp *flag_entry;
3552
/* read the entry from the buffer */
3553
memcpy(&flag_entry, src.getptr() + 1, sizeof(flag_entry));
3555
/* if it matches, indicate that we found it */
3556
if (entry == flag_entry)
3559
/* it's not a match - keep scanning after this flag sequence */
3560
src.set(src.getptr() + 1 + sizeof(flag_entry));
3564
/* it's not the flag - skip this character */
3569
/* we didn't find it */
3574
* Go through a macro expansion and translate from end-of-expansion
3575
* markers to individual token full-expansion markers. This is used
3576
* after we leave a recursion level to convert expanded text into text
3577
* suitable for use in further expansion at an enclosing recursion
3580
void CTcTokenizer::mark_full_exp_tokens(CTcTokString *dstbuf,
3581
const CTcTokString *srcbuf,
3589
/* clear the output buffer if we're not appending to existing text */
3591
dstbuf->clear_text();
3593
/* remember the starting point */
3594
start = srcbuf->get_text();
3596
/* not in an embedded expression within the expansion text yet */
3597
in_embedding = FALSE;
3599
/* scan the source buffer */
3600
p.set((char *)start);
3603
CTcHashEntryPp *cur_entry;
3607
/* get the next token; stop at the end of the line */
3608
typ = next_on_line(srcbuf, &p, &tok, &in_embedding, TRUE);
3609
if (typ == TOKT_EOF)
3613
* if this macro token is being expanded, and it's not already
3614
* marked for no more expansion, mark it
3617
&& !tok.get_fully_expanded()
3618
&& (cur_entry = find_define(tok.get_text(),
3619
tok.get_text_len())) != 0
3620
&& scan_for_prior_expansion(p, srcbuf->get_text_end(), cur_entry))
3623
* This token has been fully expanded in the substitution
3624
* buffer but hasn't yet been marked as such - we must
3625
* insert the fully-expanded marker. First, add up to the
3626
* current point to the output buffer.
3628
if (tok.get_text() > start)
3629
dstbuf->append(start, tok.get_text() - start);
3631
/* add the fully-expanded marker */
3632
ch = TOK_FULLY_EXPANDED_FLAG;
3633
dstbuf->append(&ch, 1);
3635
/* the new starting point is the start of the symbol token */
3636
start = tok.get_text();
3640
/* copy any remaining text to the output */
3641
if (tok.get_text() > start)
3642
dstbuf->append(start, tok.get_text() - start);
3645
* Remove any macro expansion end markers from the output buffer.
3646
* We don't want to leave these around, because they don't apply to
3647
* the enclosing buffer into which we'll substitute this result.
3648
* Note that we've already ensured that these markers will be
3649
* respected for the substitution text by inserting "fully expanded"
3650
* markers in front of each token to which any of the markers we're
3651
* removing should apply.
3653
remove_end_markers(dstbuf);
3658
* Remove end markers from a buffer
3660
void CTcTokenizer::remove_end_markers(CTcTokString *buf)
3666
/* scan the buffer */
3667
for (src = dst = buf->get_buf(), p.set(src) ;
3668
p.getptr() < buf->get_text_end() ; )
3670
/* check for our flag */
3671
if (p.getch() == TOK_MACRO_EXP_END)
3673
/* skip the flag byte and the following embedded pointer */
3674
src += 1 + sizeof(CTcHashEntryPp *);
3679
/* skip this character */
3682
/* copy the bytes of this character as-is */
3683
while (src < p.getptr())
3688
/* set the new buffer size */
3689
buf->set_text_len(dst - buf->get_buf());
3694
* Expand the macro at the current token in the current line.
3696
* 'src' is a pointer to the current position in 'srcbuf'. We'll update
3697
* 'src' to point to the next token after macro or its actual parameters
3698
* list, if it has one.
3700
int CTcTokenizer::expand_macro(CTcMacroRsc *rsc, CTcTokString *expbuf,
3701
const CTcTokString *srcbuf, utf8_ptr *src,
3702
size_t macro_srcbuf_ofs,
3703
CTcHashEntryPp *entry, int read_more,
3704
int allow_defined, int *expanded)
3706
CTcTokString *subexp;
3707
size_t argofs[TOK_MAX_MACRO_ARGS];
3708
size_t arglen[TOK_MAX_MACRO_ARGS];
3713
char flagbuf[1 + sizeof(entry)];
3715
/* presume we won't do any expansion */
3718
/* get our resources */
3719
subexp = &rsc->macro_exp_;
3721
/* remember our parsing starting offset */
3722
startofs = src->getptr() - srcbuf->get_text();
3724
/* clear the expansion output buffer */
3725
expbuf->clear_text();
3727
/* if the macro has arguments, scan the actuals */
3728
if (entry->has_args())
3732
/* read the macro arguments */
3733
if (parse_macro_actuals(srcbuf, src, entry, argofs, arglen,
3734
read_more, &found_actuals))
3741
* If we found no actuals, then this wasn't really an invocation
3742
* of the macro after all - a function-like macro invoked with
3743
* no arguments is simply not replaced. Store the original text
3744
* in the output buffer and return success.
3748
/* copy the original text */
3749
expbuf->copy(srcbuf->get_text() + macro_srcbuf_ofs,
3750
startofs - macro_srcbuf_ofs);
3753
* restore the source read pointer to where it was when we
3756
src->set((char *)srcbuf->get_text() + startofs);
3758
/* return success */
3765
* if there are arguments, replace the macro and substitute actuals
3766
* for the formals; otherwise, just copy the replacement text
3769
if (entry->get_argc() != 0)
3771
/* substitute the actuals */
3772
if (substitute_macro_actuals(rsc, subexp, entry, srcbuf,
3773
argofs, arglen, allow_defined))
3779
/* set up to parse from the expansion buffer */
3780
start = subexp->get_text();
3781
end = start + subexp->get_text_len();
3786
* use our local source buffer that simply references the
3787
* original expansion text, rather than making a copy of the
3790
start = entry->get_expansion();
3791
end = start + entry->get_expan_len();
3794
/* copy the expansion into the output buffer */
3795
expbuf->copy(start, end - start);
3798
* After the end of the expansion sequence, insert the
3799
* fully-expanded flag plus a pointer to the symbol table entry that
3800
* we just expanded. This will allow us to detect during the
3801
* re-scan of the expansion text that this symbol has already been
3802
* expanded, in which case we must suppress further expansion of the
3803
* symbol. This allows us to follow the ANSI C rules for recursive
3806
flagbuf[0] = TOK_MACRO_EXP_END;
3807
memcpy(&flagbuf[1], &entry, sizeof(entry));
3808
expbuf->append(flagbuf, sizeof(flagbuf));
3810
/* indicate that we expanded the macro */
3817
/* return the result */
3822
* Parse a macro's actual parameter list, filling in the given hash
3823
* table with the arguments. Returns zero on success, non-zero on
3824
* error. 'entry' is the macro's defining symbol table entry.
3826
int CTcTokenizer::parse_macro_actuals(const CTcTokString *srcbuf,
3828
const CTcHashEntryPp *entry,
3829
size_t argofs[TOK_MAX_MACRO_ARGS],
3830
size_t arglen[TOK_MAX_MACRO_ARGS],
3831
int read_more, int *found_actuals)
3839
/* presume we're not going to do any line splicing */
3842
/* no arguments parsed yet */
3845
/* get the next token after the macro symbol */
3846
typ = next_on_line(srcbuf, src, &tok, ¯o_in_embedding_, TRUE);
3848
/* splice another line if necessary */
3849
if (typ == TOKT_EOF && read_more)
3852
typ = actual_splice_next_line(srcbuf, src, &tok);
3854
/* note the splice */
3858
/* if we didn't find an open paren, there's no actual list after all */
3859
if (typ != TOKT_LPAR)
3861
/* tell the caller we didn't find any actuals */
3862
*found_actuals = FALSE;
3864
/* if we spliced a line, unsplice it at the current token */
3866
unsplice_line(tok.get_text());
3868
/* return success */
3872
/* remember the offset of the start of the first argument */
3873
argofs[argc] = tok.get_text() + tok.get_text_len() - srcbuf->get_text();
3875
/* skip the open paren */
3876
typ = next_on_line(srcbuf, src, &tok, ¯o_in_embedding_, TRUE);
3878
/* read the arguments */
3879
while (typ != TOKT_RPAR)
3882
int paren_depth, bracket_depth, brace_depth;
3885
/* if we have too many arguments, it's an error */
3886
if ((argc >= entry->get_argc() && !entry->has_varargs())
3887
|| argc >= TOK_MAX_MACRO_ARGS)
3890
log_error(TCERR_PP_MANY_MACRO_ARGS,
3891
(int)entry->getlen(), entry->getstr());
3893
/* scan ahead to to close paren or end of line */
3894
while (typ != TOKT_RPAR && typ != TOKT_EOF)
3895
typ = next_on_line(srcbuf, src, &tok, ¯o_in_embedding_,
3898
/* done scanning arguments */
3903
* skip tokens until we find a comma outside of nested parens,
3904
* square brackets, or curly braces
3906
paren_depth = bracket_depth = brace_depth = 0;
3907
while (paren_depth != 0
3908
|| bracket_depth != 0
3910
|| (typ != TOKT_COMMA && typ != TOKT_RPAR))
3913
* if it's an open or close paren, brace, or bracket, adjust
3914
* the depth accordingly
3946
/* get the next token */
3947
typ = next_on_line(srcbuf, src, &tok, ¯o_in_embedding_,
3951
* if we're at the end of the line, and we're allowed to
3952
* read more, splice the next line onto the current line
3954
if (typ == TOKT_EOF && read_more)
3957
typ = actual_splice_next_line(srcbuf, src, &tok);
3959
/* note that we've done some line splicing */
3963
/* if we've reached the end of the file, stop */
3964
if (typ == TOKT_EOF)
3968
/* if we've reached the end of the file, stop */
3969
if (typ == TOKT_EOF)
3972
/* remove any trailing whitespace from the actual's text */
3974
p.set((char *)tok.get_text());
3975
while (p.getptr() > srcbuf->get_text() + argofs[argc])
3979
/* move to the prior character */
3982
/* if it's not a space, stop looking */
3987
* advance past this character so that we keep it in the
3993
* if this last character was a backslash, and we removed
3994
* at least one space following it, keep the one space
3995
* that immediately follows the backslash, since that
3996
* space is part of the backslash's two-character escape
3999
if (ch == '\\' && sp_cnt != 0)
4006
/* that's one more trailing space we've removed - count it */
4010
/* note the argument length */
4011
arglen[argc] = (p.getptr() - srcbuf->get_text()) - argofs[argc];
4013
/* count the argument */
4016
/* check for another argument */
4017
if (typ == TOKT_COMMA)
4019
/* remember the offset of the start of this argument */
4020
argofs[argc] = tok.get_text() + tok.get_text_len()
4021
- srcbuf->get_text();
4023
/* skip the comma and go back for another argument */
4024
typ = next_on_line(srcbuf, src, &tok, ¯o_in_embedding_,
4027
else if (typ == TOKT_RPAR)
4030
* No need to look any further. Note that we don't want to
4031
* get another token, since we're done parsing the input
4032
* now, and we want to leave the token stream positioned for
4033
* the caller just after the extent of the macro, which, in
4034
* the case of this function-like macro, ends with the
4041
/* if we didn't find the right paren, flag the error */
4042
if (typ != TOKT_RPAR)
4045
? TCERR_PP_MACRO_ARG_RPAR : TCERR_PP_MACRO_ARG_RPAR_1LINE,
4046
(int)entry->getlen(), entry->getstr());
4050
/* remove leading and trailing whitespace from each argument */
4051
for (i = 0 ; i < argc ; ++i)
4059
/* figure the limits of the argument text */
4060
start = srcbuf->get_text() + argofs[i];
4061
end = start + arglen[i];
4063
/* remove leading whitespace */
4064
for (p.set((char *)start) ; p.getptr() < end && is_space(p.getch()) ;
4067
/* set the new offset and length */
4068
del_len = p.getptr() - start;
4069
argofs[i] += del_len;
4070
arglen[i] -= del_len;
4073
/* remove trailing whitespace */
4076
while (p.getptr() > start)
4080
/* go to the prior character */
4083
/* if it's not whitespace, keep it */
4087
/* put the character back */
4091
* if this is a backslash, and a space follows, keep the
4092
* immediately following space, since it's part of the
4093
* backslash sequence
4095
if (ch == '\\' && sp_cnt != 0)
4098
/* we're done scanning */
4102
/* count another removed trailing space */
4106
/* adjust the length */
4107
arglen[i] -= (end - p.getptr());
4111
* if we did any line splicing, cut off the rest of the line and
4112
* push it back into the logical input stream as a new line - this
4113
* will allow better error message positioning if errors occur in
4114
* the remainder of the line, since this means we'll only
4115
* artificially join onto one line the part of the new line that
4116
* contained the macro parameters
4119
unsplice_line(tok.get_text() + tok.get_text_len());
4121
/* make sure we found enough arguments */
4122
if (argc < entry->get_min_argc())
4124
/* fill in the remaining arguments with empty strings */
4125
for ( ; argc < entry->get_argc() ; ++argc)
4131
/* note the error, but proceed with empty arguments */
4132
log_warning(TCERR_PP_FEW_MACRO_ARGS,
4133
(int)entry->getlen(), entry->getstr());
4137
* if we have varargs, always supply an empty marker for the last
4140
if (entry->has_varargs() && argc < TOK_MAX_MACRO_ARGS)
4146
/* success - we found an actual parameter list */
4147
*found_actuals = TRUE;
4152
* Splice a line for macro actual parameters. Sets the source pointer
4153
* to the start of the new line. Reads the first token on the spliced
4154
* line and returns it.
4156
* We will splice new lines until we find a non-empty line or reach the
4157
* end of the input. If this returns EOF, it indicates that we've
4158
* reached the end of the entire input.
4160
tc_toktyp_t CTcTokenizer::
4161
actual_splice_next_line(const CTcTokString *srcbuf,
4162
utf8_ptr *src, CTcToken *tok)
4164
/* add a space onto the end of the current line */
4165
linebuf_.append(" ", 1);
4167
/* keep going until we find a non-empty line */
4173
/* splice the next line onto the current line */
4174
new_line_ofs = read_line(TRUE);
4177
* make sure we read additional lines as needed to complete any
4178
* strings left open at the end of the line
4180
if (in_quote_ != '\0')
4183
/* if there was no more, return end of file */
4184
if (new_line_ofs == -1)
4187
/* set the source to the start of the additional line */
4188
src->set((char *)linebuf_.get_text() + new_line_ofs);
4190
/* get the next token */
4191
typ = next_on_line(srcbuf, src, tok, ¯o_in_embedding_, TRUE);
4193
/* if we didn't get EOF, it means we found a non-empty line */
4194
if (typ != TOKT_EOF)
4200
* Substitute the actual parameters in a macro's expansion
4202
int CTcTokenizer::substitute_macro_actuals(CTcMacroRsc *rsc,
4203
CTcTokString *subexp,
4204
CTcHashEntryPp *entry,
4205
const CTcTokString *srcbuf,
4206
const size_t *argofs,
4207
const size_t *arglen,
4216
const CVmHashTable *actuals;
4217
CTcTokString *actual_exp_buf;
4218
const size_t expand_max = 10;
4219
static struct expand_info_t
4221
/* type of expansion (#foreach, #ifempty, #ifnempty) */
4225
* flag: this is an iterator type (if this is true, the varargs
4226
* formal should be expanded to the current argument given by our
4227
* 'arg' member; if this is false, the varargs formal should be
4228
* expanded as the full varargs list)
4232
/* the marker character that delimits the foreach arguments */
4235
/* location of start of expansion region for foreach */
4238
/* current argument index */
4241
/* the current expansion part (0 = first part, etc) */
4244
expand_stack[expand_max], *expand_sp;
4246
/* get the actuals table */
4247
actuals = entry->get_params_table();
4249
/* get the actual expansion buffer from the resource object */
4250
actual_exp_buf = &rsc->actual_exp_buf_;
4253
* Scan the replacement text for formals, and replace each formal
4254
* with the actual. Set up a pointer at the start of the expansion
4257
start = entry->get_expansion();
4258
expsrc.set((char *)start);
4260
/* we don't yet have a previous token */
4261
prvtok.settyp(TOKT_EOF);
4262
prvprvtok.settyp(TOKT_EOF);
4264
/* clear the expansion buffer */
4265
subexp->clear_text();
4267
/* we have no #foreach/#ifempty/#ifnempty stack yet */
4268
expand_sp = expand_stack;
4270
/* scan the tokens in the expansion text */
4271
for (typ = next_on_line(&expsrc, &tok, ¯o_in_embedding_, TRUE) ;
4275
* check to see if we've reached the end of a
4276
* #foreach/#ifempty/#ifnempty
4278
if (expand_sp != expand_stack)
4280
/* check to see if we're at the delimiter */
4281
if (utf8_ptr::s_getch(tok.get_text()) == (expand_sp-1)->delim)
4283
/* copy the prior expansion so far */
4284
if (tok.get_text() > start)
4285
subexp->append(start, tok.get_text() - start);
4287
/* go back to the start of the token */
4288
expsrc.set((char *)tok.get_text());
4290
/* see what kind of token we're expanding */
4291
switch((expand_sp-1)->typ)
4293
case TOKT_MACRO_FOREACH:
4294
/* it's a #foreach - process the appropriate part */
4295
switch ((expand_sp-1)->part)
4299
* We've been doing the first part, which is the
4300
* main expansion per actual. This delimiter thus
4301
* introduces the 'between' portion, which we copy
4302
* between each iteration, but not after the last
4303
* iteration. So, if we've just done the last
4304
* actual, skip this part entirely; otherwise,
4305
* keep going, using this part.
4307
if (argofs[(expand_sp-1)->arg + 1] == 0)
4309
/* skip this one remaining part */
4310
skip_delimited_group(&expsrc, 1);
4312
/* we're finished with the iteration */
4318
* we have more arguments, so we want to
4319
* expand this part - skip the deliter and
4324
/* we're now in the next part of the iterator */
4325
(expand_sp-1)->part++;
4331
* We've reached the end of the entire #foreach
4332
* string, so we're done with this iteration.
4333
* Skip the delimiter.
4339
* if we have more arguments, start over with the
4340
* next iteration; otherwise, pop the #foreach
4343
if (argofs[(expand_sp-1)->arg + 1] == 0)
4345
/* no more arguments - pop the #foreach level */
4350
/* we have more arguments - move to the next */
4351
(expand_sp-1)->arg++;
4353
/* go back to the start of the expansion */
4354
expsrc = (expand_sp-1)->start;
4356
/* we have no previous token for pasting ops */
4357
prvtok.settyp(TOKT_EOF);
4358
prvprvtok.settyp(TOKT_EOF);
4360
/* we're back in the first part of the iterator */
4361
(expand_sp-1)->part = 0;
4367
case TOKT_MACRO_IFEMPTY:
4368
case TOKT_MACRO_IFNEMPTY:
4370
* #ifempty or #ifnempty - we've reached the end of
4371
* the conditional text, so simply pop a level and
4372
* keep going after the delimiter
4375
/* skip the delimiter */
4388
/* the next chunk starts here */
4389
start = expsrc.getptr();
4391
/* get the next token */
4392
typ = next_on_line(&expsrc, &tok, ¯o_in_embedding_, TRUE);
4394
/* we have the next token, so back and process it */
4399
/* if it's a #foreach marker, start a #foreach iteration */
4400
if (typ == TOKT_MACRO_FOREACH && entry->has_varargs())
4402
/* copy the prior expansion so far */
4403
if (tok.get_text() > start)
4404
subexp->append(start, tok.get_text() - start);
4406
/* push a #foreach level, if possible */
4407
if (expand_sp - expand_stack >= expand_max)
4410
* we can't create another level - log an error and ignore
4413
log_error(TCERR_PP_FOREACH_TOO_DEEP);
4415
else if (argofs[entry->get_argc() - 1] == 0)
4418
* we have no actuals for the variable part of the
4419
* formals, so we must iterate zero times through the
4420
* #foreach part - in other words, simply skip ahead to
4421
* the end of the #foreach
4423
skip_delimited_group(&expsrc, 2);
4427
/* remember and skip the marker character */
4428
expand_sp->delim = expsrc.getch();
4431
/* set the expansion type */
4432
expand_sp->typ = typ;
4435
* remember the position where the #foreach started, since
4436
* we need to come back here for each use of the variable
4438
expand_sp->start = expsrc;
4440
/* we're an iterator type */
4441
expand_sp->is_iterator = TRUE;
4444
* Start at the first argument in the variable part of the
4445
* argument list. The last formal corresponds to the
4446
* first variable argument.
4448
expand_sp->arg = entry->get_argc() - 1;
4450
/* we're in the main expansion part of the expression */
4451
expand_sp->part = 0;
4453
/* push the new level */
4457
/* the next chunk starts here */
4458
start = expsrc.getptr();
4460
/* get the next token */
4461
typ = next_on_line(&expsrc, &tok, ¯o_in_embedding_, TRUE);
4463
/* we have the next token, so back and process it */
4467
/* if it's a varargs #ifempty or #ifnempty flag, expand it */
4468
if ((typ == TOKT_MACRO_IFEMPTY || typ == TOKT_MACRO_IFNEMPTY)
4469
&& entry->has_varargs())
4474
/* copy the prior expansion so far */
4475
if (tok.get_text() > start)
4476
subexp->append(start, tok.get_text() - start);
4478
/* determine if the varargs list is empty or not */
4479
is_empty = (argofs[entry->get_argc() - 1] == 0);
4482
* decide whether or not expand it, according to the empty
4483
* state and the flag type
4485
expand = ((is_empty && typ == TOKT_MACRO_IFEMPTY)
4486
|| (!is_empty && typ == TOKT_MACRO_IFNEMPTY));
4489
* if we're going to expand it, push a level; otherwise, just
4490
* skip the entire expansion
4494
/* make sure we have room for another level */
4495
if (expand_sp - expand_stack >= expand_max)
4497
/* no room - log an error and ignore the new level */
4498
log_error(TCERR_PP_FOREACH_TOO_DEEP);
4502
/* remember and skip the delimiter */
4503
expand_sp->delim = expsrc.getch();
4507
* we're not an iterator type, so inherit the
4508
* enclosing level's meaning of the varargs formal
4510
if (expand_sp - expand_stack == 0)
4512
/* outermost level - use the whole varargs list */
4513
expand_sp->is_iterator = FALSE;
4517
/* use the enclosing level's meaning */
4518
expand_sp->is_iterator = (expand_sp-1)->is_iterator;
4519
expand_sp->arg = (expand_sp-1)->arg;
4522
/* set the expansion type */
4523
expand_sp->typ = typ;
4525
/* push the new level */
4531
/* not expanding - just skip the entire expansion */
4532
skip_delimited_group(&expsrc, 1);
4535
/* the next chunk starts here */
4536
start = expsrc.getptr();
4538
/* get the next token */
4539
typ = next_on_line(&expsrc, &tok, ¯o_in_embedding_, TRUE);
4541
/* we have the next token, so back and process it */
4545
/* if it's a varargs #argcount indicator, expand it */
4546
if (typ == TOKT_MACRO_ARGCOUNT && entry->has_varargs())
4551
/* copy the prior expansion so far */
4552
if (tok.get_text() > start)
4553
subexp->append(start, tok.get_text() - start);
4556
* count the number of arguments after and including the
4557
* variable argument placeholder
4559
for (i = entry->get_argc() - 1 ; argofs[i] != 0 ; ++i) ;
4561
/* make a string out of the variable argument count */
4562
sprintf(buf, "%d", i - (entry->get_argc() - 1));
4564
/* add the argument count to the output buffer */
4565
subexp->append(buf, strlen(buf));
4567
/* the next chunk starts after the #argcount */
4568
start = expsrc.getptr();
4570
/* get the next token */
4571
typ = next_on_line(&expsrc, &tok, ¯o_in_embedding_, TRUE);
4573
/* we have the next token, so back and process it */
4577
/* if it's a symbol, check for an actual */
4578
if (typ == TOKT_MACRO_FORMAL)
4584
int pasting_at_left, pasting_at_right;
4587
tc_toktyp_t stringize_type;
4588
CTcToken paste_at_right_tok;
4590
/* assume we'll copy up to the start of this token */
4594
* get the index of the actual in the argument vector --
4595
* this is given by the second byte of the special macro
4596
* parameter flag token
4598
argnum = (int)(uchar)tok.get_text()[1] - 1;
4601
* If we have varargs, and this is the varargs argument, and
4602
* the current #foreach stack level indicates that we're
4603
* iterating through the varargs list, treat this as a
4604
* reference to the current argument in the iteration.
4606
if (expand_sp != expand_stack
4607
&& argnum == entry->get_argc() - 1
4608
&& (expand_sp-1)->is_iterator)
4611
* we're on a #foreach iterator, and this is the varargs
4612
* formal - use the current #foreach iteration element
4615
argnum = (expand_sp-1)->arg;
4619
* Get the length of this argument. If we have varargs, and
4620
* this is the last formal, which is the placeholder for the
4621
* variable argument list, and we're not in a #foreach
4622
* iterator, the value is the value of the entire string of
4623
* variable arguments, including the commas.
4625
if (expand_sp == expand_stack
4626
&& entry->has_varargs()
4627
&& argnum == entry->get_argc() - 1)
4632
* It's the full varargs list - use the length from the
4633
* first varargs argument to the last. Find the last
4637
i < TOK_MAX_MACRO_ARGS && argofs[i] != 0 ; ++i) ;
4640
* The full list length is the distance from the offset of
4641
* the first to the end of the last. If there are no
4642
* varargs arguments at all, the length is zero.
4647
argnum_len = argofs[i-1] + arglen[i-1] - argofs[argnum];
4652
* it's not the full varargs list, so just use the length
4653
* of this single actual
4655
argnum_len = arglen[argnum];
4658
/* assume we won't do any token pasting or stringizing */
4659
pasting = pasting_at_left = pasting_at_right = FALSE;
4663
* if the previous token was a token-pasting operator,
4664
* remove it and any preceding whitespace from the source
4665
* material, since we want to append the actual parameter
4666
* text directly after the preceding token
4669
if (prvtok.gettyp() == TOKT_POUNDPOUND)
4674
* note that we have token pasting - we're pasting
4675
* something to the left of this token (since we had a
4676
* "##" before this token
4679
pasting_at_left = TRUE;
4681
/* go back to the ## token */
4682
p = prvtok.get_text();
4684
/* remove any preceding whitespace */
4685
for (prv_ch = 0 ; p > start ; )
4689
/* get the previous character */
4690
prvp = utf8_ptr::s_dec((char *)p);
4691
prv_ch = utf8_ptr::s_getch((char *)prvp);
4693
/* if it's not a space, we're done */
4694
if (!is_space(prv_ch))
4697
/* move back over this character */
4702
* Weird special case: if the previous character was a
4703
* comma, and the formal we're pasting is a variable
4704
* argument formal (i.e., the last formal in a varargs
4705
* macro), and the varargs list is empty, then remove the
4706
* comma. This is a handy shorthand notation that allows
4707
* the varargs list to be added to a comma-delimited list,
4708
* such as a function call's actuals or the contents of a
4712
&& entry->has_varargs()
4713
&& argnum == entry->get_argc() - 1
4714
&& argofs[argnum] == 0)
4717
* it's the special case - move back one more
4718
* character to delete the comma
4720
p = utf8_ptr::s_dec((char *)p);
4723
else if (prvtok.gettyp() == TOKT_POUND
4724
|| prvtok.gettyp() == TOKT_POUNDAT)
4726
/* go back to the # token */
4727
p = prvtok.get_text();
4729
/* note that we have stringizing */
4731
stringize_type = prvtok.gettyp();
4732
stringize_qu = (prvtok.gettyp() == TOKT_POUND
4735
/* go back one more token */
4737
prvprvtok.settyp(TOKT_EOF);
4740
* go back and check for pasting again, since we could
4741
* be pasting to a stringized token
4743
goto check_paste_left;
4746
/* copy the prior expansion so far */
4748
subexp->append(start, p - start);
4750
/* remember the symbol as the previous token */
4754
/* get the next token after the formal */
4755
typ = next_on_line(&expsrc, &tok, ¯o_in_embedding_, TRUE);
4758
* If it's followed by a token-pasting operator, we need to
4759
* paste the next token directly onto the end of the text we
4760
* just added to the buffer, skipping any intervening
4761
* whitespace; otherwise, we want to start adding again at
4762
* the next character after the original token.
4764
if (typ == TOKT_POUNDPOUND)
4766
utf8_ptr old_expsrc;
4769
/* note that we have pasting to the right of this token */
4771
pasting_at_right = TRUE;
4773
/* remember where we started */
4774
old_expsrc = expsrc;
4776
/* remember the current token for a moment */
4779
/* skip to the next token after the ## */
4780
typ = next_on_line(&expsrc, &tok, ¯o_in_embedding_, TRUE);
4782
/* remember the token we're pasting to the right */
4783
paste_at_right_tok = tok;
4785
/* check for pasting to a stringizer */
4786
if (stringize && typ == stringize_type)
4789
* leave the ## in the stream for now - we'll fix it
4790
* up when we stringize the next token, rather than
4793
expsrc = old_expsrc;
4799
* remember that we have a token-pasting operator,
4800
* so that we can tell that we're pasting when we
4801
* look at the next token
4807
/* start next text from here */
4808
start = tok.get_text();
4812
/* Start at the end of the symbol token */
4813
start = prvtok.get_text() + prvtok.get_text_len();
4817
* If we're not doing any pasting, recursively expand macros
4818
* in the actual expansion text. If we're pasting, do not
4819
* expand any macros in the expansion, since we want to do
4820
* the pasting before we do any expanding.
4822
if (pasting && stringize)
4827
/* presume we'll include the open and close quotes */
4832
* If we're pasting to the left, and the buffer so far
4833
* ends in the same quote we're adding to this token,
4834
* combine the strings by removing the preceding quote
4835
* and not adding the open quote on the new string
4837
if (subexp->get_text_len() > 0
4838
&& *(subexp->get_text_end() - 1) == stringize_qu)
4840
/* remove the close quote from the expansion so far */
4841
subexp->set_text_len(subexp->get_text_len() - 1);
4843
/* don't add the open quote to the new string */
4848
* If we're pasting to the right, and we have a string
4849
* of the same type following, or we will be pasting a
4850
* stringizing pair, paste the two strings together to
4851
* form one string by removing the close quote from this
4852
* string and the open quote from the next string
4854
if (pasting_at_right && *tok.get_text() == stringize_qu)
4858
* We're both stringizing this argument and pasting
4859
* another token - first stringize the actual.
4861
stringize_macro_actual(subexp,
4863
+ argofs[argnum], argnum_len,
4864
stringize_qu, add_open, add_close);
4867
* if we decided to remove the closing quote, we want to
4868
* remove the open quote from the following string as
4869
* well - copy in the following string without its open
4875
* append the following token without its first
4876
* character (its open quote)
4878
subexp->append(tok.get_text() + 1,
4879
tok.get_text_len() - 1);
4881
/* move on to the next token */
4884
typ = next_on_line(&expsrc, &tok, ¯o_in_embedding_,
4887
/* start from the new token */
4888
start = tok.get_text();
4898
/* get the actual argument information */
4899
argp = srcbuf->get_text() + argofs[argnum];
4903
* if we're pasting to the left of this token, and the
4904
* token starts with a fully-expanded flag, remove the
4905
* flag - we're making up a new token out of this and
4906
* what comes before, so the token that we fully
4907
* expanded is disappearing, so the fully-expanded
4908
* status no longer applies
4910
if (pasting_at_left && *argp == TOK_FULLY_EXPANDED_FLAG)
4917
/* presume we won't find any quoted strings */
4921
* check for string concatenation to the left - if we're
4922
* concatenating two strings of the same type, remove
4923
* the adjacent quotes to make it a single string
4926
&& subexp->get_text_len() > 0
4927
&& (*argp == '\'' || *argp == '"')
4928
&& *(subexp->get_text_end() - 1) == *argp)
4930
/* remove the close quote from the expansion so far */
4931
subexp->set_text_len(subexp->get_text_len() - 1);
4933
/* remember the quote character */
4936
/* don't add the open quote to the new string */
4941
/* presume we won't have to do anything special */
4945
* If we're pasting at the right, also remove any
4946
* fully-expanded flag just before the last token in the
4949
if (pasting_at_right)
4955
/* scan for the final token in the expansion string */
4956
p.set((char *)argp);
4957
old_tok.settyp(TOKT_INVALID);
4958
while (p.getptr() < argp + len)
4961
* get another token - stop at EOF or if we go
4962
* past the bounds of the expansion text
4964
if (next_on_line(&p, &tok, ¯o_in_embedding_,
4967
|| tok.get_text() >= argp + len)
4970
/* remember the previous token */
4975
* if the final token is a symbol, and it has the
4976
* fully-expanded flag, we must omit the flag from
4979
if (old_tok.gettyp() == TOKT_SYM
4980
&& old_tok.get_fully_expanded())
4983
* append up to but not including the flag byte
4984
* preceding the final token
4986
subexp->append(argp, tok.get_text() - 1 - argp);
4989
* append from the last token to the end of the
4990
* expansion, skipping the flag byte
4992
subexp->append(tok.get_text(),
4993
len - (tok.get_text() - argp));
4995
/* we've done the appending */
4998
else if (quote_char != 0
4999
&& paste_at_right_tok.get_text_len() != 0
5000
&& *paste_at_right_tok.get_text() == quote_char)
5003
* we're pasting two strings together - append
5004
* up to but not including the close quote
5006
subexp->append(argp, len - 1);
5009
* append the next token, but do not include the
5012
subexp->append(paste_at_right_tok.get_text() + 1,
5013
paste_at_right_tok.get_text_len() - 1);
5016
* restart after the right token, since we've
5017
* now fully processed that token
5019
start = paste_at_right_tok.get_text()
5020
+ paste_at_right_tok.get_text_len();
5028
* append the actual without expansion, if we haven't
5029
* already handled it specially
5032
subexp->append(argp, len);
5036
/* stringize the actual */
5037
stringize_macro_actual(subexp,
5039
+ argofs[argnum], argnum_len,
5040
stringize_qu, TRUE, TRUE);
5044
CTcTokStringRef actual_src_buf;
5046
/* recursively expand macros in the actual text */
5048
set_buffer(srcbuf->get_text() + argofs[argnum],
5050
if (expand_macros(&actual_src_buf, 0, actual_exp_buf,
5051
FALSE, allow_defined, FALSE))
5055
* Append the expanded actual, marking any
5056
* fully-expanded tokens as such and removing
5057
* end-of-expansion markers.
5059
* We can't leave end-of-expansion markers in the
5060
* expanded actual text, because end-of-expansion
5061
* markers apply only to the current recursion level,
5062
* and we've now exited the actual's recursion level.
5063
* However, we must not expand further anything in the
5064
* actual's expansion that has already been fully
5065
* expanded. To achieve both of these goals, we switch
5066
* here from marking the run of text (with the end
5067
* marker) to marking individual tokens.
5069
mark_full_exp_tokens(subexp, actual_exp_buf, TRUE);
5072
/* we've already read the next token, so proceed */
5076
/* remember the current token as the previous token */
5080
/* get the next token of the expansion */
5081
typ = next_on_line(&expsrc, &tok, ¯o_in_embedding_, TRUE);
5084
/* copy the remaining replacement text */
5085
subexp->append(start, tok.get_text() - start);
5092
* Skip the source of a delimited macro expansion area (#foreach,
5093
* #ifempty, #ifnempty).
5095
void CTcTokenizer::skip_delimited_group(utf8_ptr *p, int parts_to_skip)
5099
/* get the delimiter character */
5103
* if the delimiter put us at the end of the line, there's nothing to
5106
if (delim == 0 || delim == TOK_END_PP_LINE)
5109
/* skip the delimiter */
5112
/* keep going until we've skipped the desired number of parts */
5113
while (parts_to_skip != 0)
5117
/* read the next character */
5120
/* if it's the end of the line, give up */
5121
if (ch == 0 || ch == TOK_END_PP_LINE)
5124
* we ran out of input before reaching the delimiter, so this
5125
* is implicitly the end of it
5130
/* check what we have */
5133
/* that's one less part to skip */
5139
else if (ch == TOK_MACRO_FOREACH_FLAG)
5141
/* it's a nested #foreach - skip all of its parts */
5142
skip_delimited_group(p, 2);
5144
else if (ch == TOK_MACRO_IFEMPTY_FLAG
5145
|| ch == TOK_MACRO_IFNEMPTY_FLAG)
5147
/* nested #ifempty or #ifnempty - skip its expansion */
5148
skip_delimited_group(p, 1);
5152
/* it's nothing special to us - skip it */
5159
* Stringize a macro actual parameter value into a macro expansion
5162
void CTcTokenizer::stringize_macro_actual(CTcTokString *expbuf,
5163
const char *actual_val,
5164
size_t actual_len, char quote_char,
5166
int add_close_quote)
5171
wchar_t inner_quote_char;
5174
/* add the open quote if desired */
5176
expbuf->append("e_char, 1);
5178
/* remember the start of the current segment */
5182
* add the characters of the actual parameter value, quoting any
5183
* quotes or backslashes
5185
for (src.set((char *)actual_val),
5186
in_inner_quote = FALSE, inner_quote_char = '\0', prvch = '\0' ;
5187
src.getptr() < actual_val + actual_len ; )
5191
/* get this character */
5194
/* compress runs of whitespace to single spaces */
5195
if (is_space(cur) && prvch != '\\')
5197
/* append up to this character */
5198
if (src.getptr() > start)
5199
expbuf->append(start, src.getptr() - start);
5201
/* find the next non-space character */
5202
for ( ; src.getptr() < actual_val + actual_len ; src.inc())
5204
if (!is_space(src.getch()))
5209
* if we're not at the start or end of the string, add a
5210
* single space to replace the entire run of whitespace --
5211
* don't do this at the start or end of the string, since
5212
* we must remove leading and trailing whitespace
5214
if (prvch != '\0' && src.getptr() < actual_val + actual_len)
5215
expbuf->append(" ", 1);
5217
/* note that the previous character is a space */
5220
/* this is the new starting point */
5221
start = src.getptr();
5223
/* proceed - we're already at the next character */
5228
* Check to see if we need to quote this character. Quote any
5229
* quote mark matching the enclosing quotes; also quote any
5230
* backslash that occurs within nested quotes within the source
5231
* material, but not backslashes that occur originally outside
5234
if (cur == quote_char
5235
|| (cur == '\\' && in_inner_quote))
5237
/* append the segment up to (but not including) this character */
5238
if (src.getptr() > start)
5239
expbuf->append(start, src.getptr() - start);
5241
/* add an extra backslash */
5242
expbuf->append("\\", 1);
5244
/* remember the start of the next segment */
5245
start = src.getptr();
5249
* if this is a quote character, and it's not itself escaped,
5250
* reverse our in-quote flag
5255
* If we're in an inner quote, and it's a match for the open
5256
* inner quote, we're no longer in a quote. Otherwise, if
5257
* we're not in quotes and this is some kind of quote, enter
5260
if (in_inner_quote && cur == inner_quote_char)
5262
/* we're leaving the inner quoted string */
5263
in_inner_quote = FALSE;
5265
else if (!in_inner_quote && (cur == '"' || cur == '\''))
5267
/* we're entering a new inner quoted string */
5268
in_inner_quote = TRUE;
5269
inner_quote_char = cur;
5273
/* remember this as the previous character */
5276
/* move on to the next character */
5280
/* if there's anything in the final segment, append it */
5281
if (src.getptr() > start)
5282
expbuf->append(start, src.getptr() - start);
5284
/* add the close quote if desired */
5285
if (add_close_quote)
5286
expbuf->append("e_char, 1);
5290
* Expand a "defined" preprocessor operator
5292
int CTcTokenizer::expand_defined(CTcTokString *subexp,
5293
const CTcTokString *srcbuf, utf8_ptr *src)
5300
/* get the next token */
5301
typ = next_on_line(srcbuf, src, &tok, ¯o_in_embedding_, FALSE);
5303
/* note whether we have an open paren; if we do, skip it */
5304
paren = (typ == TOKT_LPAR);
5306
typ = next_on_line(srcbuf, src, &tok, ¯o_in_embedding_, FALSE);
5308
/* get the symbol */
5309
if (typ != TOKT_SYM)
5311
log_error(TCERR_PP_DEFINED_NO_SYM,
5312
(int)tok.get_text_len(), tok.get_text());
5316
/* look to see if the symbol is defined */
5317
found = (find_define(tok.get_text(), tok.get_text_len()) != 0);
5319
/* expand the macro to "1" if found, "0" if not */
5320
subexp->copy(found ? "1" : "0", 1);
5322
/* check for and skip the matching close paren */
5325
/* require the closing paren */
5326
if (next_on_line(srcbuf, src, &tok, ¯o_in_embedding_, FALSE)
5329
/* generate an error if we don't find it */
5330
log_error(TCERR_PP_DEFINED_RPAR);
5340
/* ------------------------------------------------------------------------ */
5342
* Process comments. Replaces each character of a comment with a space.
5344
void CTcTokenizer::process_comments(size_t start_ofs)
5348
int trailing_sp_after_bs;
5350
/* we haven't found a backslash followed by trailing space yet */
5351
trailing_sp_after_bs = FALSE;
5354
* Scan the line. When inside a comment, replace each character of
5355
* the comment with a space. When outside comments, simply copy
5356
* characters intact.
5358
* Note that we need a separate src and dst pointer, because the
5359
* character length of the original and replaced characters may
5360
* change. Fortunately, the length will never do anything but
5361
* shrink or stay the same, since the only change we make is to
5362
* insert spaces, which are always one byte apiece in UTF-8; we can
5363
* therefore update the buffer in place.
5365
for (src.set(linebuf_.get_buf() + start_ofs),
5366
dst.set(linebuf_.get_buf() + start_ofs) ;
5367
src.getch() != '\0' ; src.inc())
5371
/* get the current character */
5374
/* check to see if we're in a comment */
5375
if (str_->is_in_comment())
5378
* check to see if the comment is ending, or if we have an
5379
* apparent nested comment (which isn't allowed)
5381
if (cur == '*' && src.getch_at(1) == '/')
5384
* skip an extra character of the source - we'll skip
5385
* one in the main loop, so we only need to skip one
5390
/* we're no longer in a comment */
5391
str_->set_in_comment(FALSE);
5393
else if (cur == '/' && src.getch_at(1) == '*')
5395
/* looks like a nested comment - warn about it */
5396
if (!G_prs->get_syntax_only())
5397
log_warning(TCERR_NESTED_COMMENT);
5400
/* continue without copying anything from inside the comment */
5403
else if (in_quote_ != '\0')
5405
/* see what we have */
5409
* It's a backslash sequence -- copy the backslash to
5410
* the output, and skip it. Note that we don't have to
5411
* worry about the line ending with a backslash, since
5412
* the line reader will already have considered that to
5418
/* get the next character, so we copy it directly */
5421
else if (cur == in_quote_)
5424
* this is the closing quote character - simply note
5425
* that we're no longer in a quoted string
5429
else if (in_quote_ == '"' && !comment_in_embedding_
5430
&& cur == '<' && src.getch_at(1) == '<')
5433
* it's an embedded expression starting point - skip the
5434
* first of the '<' characters (the enclosing loop will
5435
* skip the second one)
5439
/* the string is done */
5442
/* we're in an embedding now */
5443
comment_in_embedding_ = TRUE;
5445
/* copy the extra '<' to the output */
5452
* Monitor the stream for a backslash followed by trailing
5453
* spaces. If this is a backslash, note that we might have a
5454
* backslash with trailing spaces; if it's a space, we might
5455
* still have this, so leave the flag alone; if it's anything
5456
* else, clear the flag, since we've found something other
5457
* than backslashes and spaces.
5460
trailing_sp_after_bs = TRUE;
5461
else if (!is_space(cur))
5462
trailing_sp_after_bs = FALSE;
5464
/* check to see if we're starting a comment */
5467
switch(src.getch_at(1))
5470
/* note that we're starting a comment */
5471
str_->set_in_comment(TRUE);
5474
* replace the starting slash with a space - this
5475
* will effectively replace the entire comment with
5476
* a single space, since we won't copy anything else
5477
* from inside the comment
5484
* comment to end of line - we can terminate the
5485
* line at the opening slash and return immediately,
5486
* because the entire rest of the line is to be
5493
/* not a comment - copy it as-is */
5497
else if (cur == '"' || cur == '\'')
5499
/* it's the start of a new string */
5502
else if (cur < 0x09)
5505
* it's a special flag character - we need to guarantee
5506
* that this character never occurs in input (it
5507
* shouldn't anyway, since it's a control character), so
5508
* translate it to a space
5512
else if (comment_in_embedding_
5513
&& cur == '>' && src.getch_at(1) == '>')
5516
* it's the end of an embedded expression - we're back
5517
* in a double-quoted string (only double-quoted strings
5518
* can have embedded expressions)
5521
comment_in_embedding_ = FALSE;
5523
/* skip the extra '>' and copy it to the output */
5529
/* set the current character in the output */
5533
/* set the updated line buffer length */
5534
linebuf_.set_text_len(dst.getptr() - linebuf_.get_buf());
5537
* if we found a backslash with nothing following but whitespace, flag
5538
* a warning, since they might have meant the backslash as a line
5539
* continuation signal, but we're not interpreting it that way because
5540
* of the trailing whitespace
5542
if (trailing_sp_after_bs)
5543
log_warning(TCERR_TRAILING_SP_AFTER_BS);
5547
* Splice strings. Splice additional lines onto the current line until
5548
* we find the end of the string.
5550
void CTcTokenizer::splice_string()
5557
/* presume we'll find proper termination */
5561
* remember the current in-quote and in-embedding status, as of the
5562
* end of the current line - when we splice, the line reader will
5563
* update these to the status at the end of the newly-read material,
5564
* but we want to scan from the beginning of the newly-read material
5566
in_quote = in_quote_;
5567
in_embedding = comment_in_embedding_;
5569
/* keep going until we find the end of the string */
5577
* append a space at the end of the line, to replace the newline
5578
* that we've eliminated
5580
if (string_newline_spacing_)
5581
linebuf_.append(" ", 1);
5583
/* splice another line */
5584
new_line_ofs = read_line(TRUE);
5586
/* if we reached end of file, there's no more splicing we can do */
5587
if (new_line_ofs == -1)
5590
/* get a pointer to the new text */
5591
new_line_p = (char *)linebuf_.get_text() + new_line_ofs;
5593
/* skip leading spaces in the new line */
5594
for (p.set(new_line_p) ; is_space(p.getch()) ; p.inc()) ;
5596
/* if we skipped any spaces, remove them from the text */
5597
if (p.getptr() > new_line_p)
5602
/* calculate the length of the rest of the line */
5603
rem = linebuf_.get_text_len() - (p.getptr() - linebuf_.get_buf());
5605
/* calculate the new length of the line */
5606
new_len = (new_line_p - linebuf_.get_buf()) + rem;
5608
/* move the rest of the line down over the spaces */
5609
memmove(new_line_p, p.getptr(), rem);
5611
/* set the new length */
5612
linebuf_.set_text_len(new_len);
5616
* If the new line contains only "}" or ";", presume that the
5617
* string is unterminated and terminate it here. (This
5618
* heuristic could flag well-formed strings as erroneous, but
5619
* users can always work around this by moving these characters
5620
* onto lines that contain at least one other non-whitespace
5624
if (p.getch() == '}' || p.getch() == ';')
5626
/* skip trailing whitespace */
5627
for (p.inc() ; is_space(p.getch()) ; p.inc()) ;
5630
* if there's nothing else on the line, presume it's an
5631
* unterminated string
5633
if (p.getch() == '\0')
5636
log_error(TCERR_POSSIBLE_UNTERM_STR,
5639
/* remember that it's unterminated */
5640
unterm = (char)in_quote;
5643
* since we're adding a presumed close quote that never
5644
* appears in the text, we need to figure the new
5645
* in-string status for the line; clear the in-quote
5646
* flag, and re-scan comments from the current point on
5650
process_comments(new_line_p - linebuf_.get_buf());
5652
/* we're done - unsplice from the start of the new line */
5658
/* scan for the end of the string */
5659
for (p.set(new_line_p) ;; p.inc())
5661
/* get this character */
5664
/* see what we have */
5667
/* it's a backslash sequence - skip the extra character */
5670
else if (cur == in_quote)
5672
/* it's our quote character - skip it, and we're done */
5676
else if (in_quote == '"' && !in_embedding
5677
&& cur == '<' && p.getch_at(1) == '<')
5680
* it's an embedded expression starter - skip the '<<'
5681
* sequence and stop scanning
5687
else if (cur == '\0')
5689
/* end of line - go back and splice another line */
5696
/* unsplice the line at the current point */
5697
unsplice_line(p.getptr());
5699
/* if we found an unterminated string, supply implicit termination */
5701
linebuf_.append(&unterm, 1);
5705
/* ------------------------------------------------------------------------ */
5707
* Process a #pragma directive
5709
void CTcTokenizer::pp_pragma()
5714
void (CTcTokenizer::*func)();
5716
static pp_kw_def kwlist[] =
5718
// { "c", &CTcTokenizer::pragma_c }, -- obsolete
5719
{ "once", &CTcTokenizer::pragma_once },
5720
{ "all_once", &CTcTokenizer::pragma_all_once },
5721
{ "message", &CTcTokenizer::pragma_message },
5722
{ "newline_spacing", &CTcTokenizer::pragma_newline_spacing },
5723
{ "sourceTextGroup", &CTcTokenizer::pragma_source_text_group },
5729
/* get the pragma keyword */
5730
if (next_on_line() != TOKT_SYM)
5732
log_warning(TCERR_UNKNOWN_PRAGMA,
5733
(int)curtok_.get_text_len(), curtok_.get_text());
5737
/* get the keyword length */
5738
kwlen = curtok_.get_text_len();
5740
/* scan the pragma list */
5741
for (kwp = kwlist ; kwp->kw != 0 ; ++kwp)
5743
/* is this our keyword? */
5744
if (strlen(kwp->kw) == kwlen
5745
&& memicmp(curtok_.get_text(), kwp->kw, kwlen) == 0)
5747
/* this is our keyword - invoke the handler */
5748
(this->*(kwp->func))();
5755
/* we didn't find it - generate a warning */
5756
log_warning(TCERR_UNKNOWN_PRAGMA, kwlen, curtok_.get_text());
5759
#if 0 // #pragma C is not currently used
5761
* Process a #pragma C directive
5763
void CTcTokenizer::pragma_c()
5768
/* get the next token */
5769
tok = next_on_line();
5772
* "+" or empty (end of line or whitespace) indicates C mode; "-"
5773
* indicates standard mode
5775
if (tok == TOKT_PLUS || tok == TOKT_EOF)
5776
new_pragma_c = TRUE;
5777
else if (tok == TOKT_MINUS)
5778
new_pragma_c = FALSE;
5781
log_warning(TCERR_BAD_PRAGMA_SYNTAX);
5782
new_pragma_c = str_->is_pragma_c();
5786
* retain the pragma in the result if we're in preprocess-only mode,
5787
* otherwise remove it
5792
/* set the mode in the stream */
5793
str_->set_pragma_c(new_pragma_c);
5795
/* if there's a parser, notify it of the change */
5797
G_prs->set_pragma_c(new_pragma_c);
5802
* Process a #pragma once directive
5804
void CTcTokenizer::pragma_once()
5806
/* add this file to the ONCE list */
5807
add_include_once(str_->get_desc()->get_fname());
5809
/* don't retain this pragma in the result */
5814
* Process a #pragma all_once directive
5816
void CTcTokenizer::pragma_all_once()
5820
/* get the next token */
5821
tok = next_on_line();
5824
* "+" or empty (end of line or whitespace) indicates ALL_ONCE mode;
5825
* '-' indicates standard mode
5827
if (tok == TOKT_PLUS || tok == TOKT_EOF)
5829
else if (tok == TOKT_MINUS)
5832
log_warning(TCERR_BAD_PRAGMA_SYNTAX);
5834
/* don't retain this pragma in the result */
5839
* Process a #pragma message directive
5841
void CTcTokenizer::pragma_message()
5846
* copy the source line through the "message" token to the macro
5847
* expansion buffer - we don't want to expand that part, but we want
5848
* it to appear in the expansion, so just copy the original
5850
startofs = (curtok_.get_text() + curtok_.get_text_len()
5851
- linebuf_.get_text());
5852
expbuf_.copy(linebuf_.get_text(), startofs);
5854
/* expand macros; don't allow reading additional lines */
5855
if (expand_macros_curline(FALSE, FALSE, TRUE))
5862
* If we're in normal compilation mode, display the message. If we're
5863
* in preprocess-only mode, simply retain the message in the
5864
* preprocessed result, so that it shows up when the result is
5867
* Ignore messages in list-includes mode.
5869
if (!pp_only_mode_ && !list_includes_mode_)
5871
/* set up at the first post-processed token */
5872
start_new_line(&expbuf_, startofs);
5874
/* if there's an open paren, skip it */
5875
if (next_on_line_xlat(0) == TOKT_LPAR)
5876
next_on_line_xlat(0);
5878
log_warning(TCERR_BAD_PRAGMA_SYNTAX);
5880
/* keep going until we reach the closing paren */
5881
while (curtok_.gettyp() != TOKT_RPAR
5882
&& curtok_.gettyp() != TOKT_EOF)
5884
/* display this token */
5885
switch(curtok_.gettyp())
5890
/* display the text of the token */
5891
msg_str(curtok_.get_text(), curtok_.get_text_len());
5895
/* display the integer */
5896
msg_long(curtok_.get_int_val());
5900
/* ignore anything else */
5904
/* get the next token */
5905
next_on_line_xlat(0);
5911
/* remove the message from the result text */
5916
/* preprocessing - copy expanded text to line buffer */
5917
linebuf_.copy(expbuf_.get_text(), expbuf_.get_text_len());
5922
* Process a #pragma newline_spacing(on/off) directive
5924
void CTcTokenizer::pragma_newline_spacing()
5928
/* if we're in preprocess-only mode, just pass the pragma through */
5932
/* get the '(' token and the on/off token */
5933
if (next_on_line() != TOKT_LPAR || next_on_line() != TOKT_SYM)
5935
log_warning(TCERR_BAD_PRAGMA_SYNTAX);
5939
/* note the new mode flag */
5940
if (curtok_.get_text_len() == 2
5941
&& memcmp(curtok_.get_text(), "on", 2) == 0)
5946
else if (curtok_.get_text_len() == 3
5947
&& memcmp(curtok_.get_text(), "off", 3) == 0)
5954
log_warning(TCERR_BAD_PRAGMA_SYNTAX);
5958
/* make sure we have the ')' token */
5959
if (next_on_line() != TOKT_RPAR)
5961
log_warning(TCERR_BAD_PRAGMA_SYNTAX);
5965
/* set the new mode */
5966
string_newline_spacing_ = f;
5969
/* done - discard this line buffer */
5975
* Process a #pragma sourceTextGroup(on/off) directive
5977
void CTcTokenizer::pragma_source_text_group()
5982
/* if we're in preprocess-only mode, just pass the pragma through */
5986
/* get the '(' token and the on/off token, if present */
5987
if ((tok = next_on_line()) == TOKT_EOF)
5989
/* no on/off - by default it's on */
5992
else if (tok == TOKT_LPAR && next_on_line() == TOKT_SYM)
5994
/* get the on/off mode */
5995
if (curtok_.get_text_len() == 2
5996
&& memcmp(curtok_.get_text(), "on", 2) == 0)
6001
else if (curtok_.get_text_len() == 3
6002
&& memcmp(curtok_.get_text(), "off", 3) == 0)
6009
log_warning(TCERR_BAD_PRAGMA_SYNTAX);
6013
/* make sure we have the ')' token */
6014
if (next_on_line() != TOKT_RPAR)
6016
log_warning(TCERR_BAD_PRAGMA_SYNTAX);
6022
/* anything else is invalid syntax */
6023
log_warning(TCERR_BAD_PRAGMA_SYNTAX);
6027
/* set the new mode in the parser */
6028
G_prs->set_source_text_group_mode(f);
6031
/* done - discard this line buffer */
6036
/* ------------------------------------------------------------------------ */
6038
* Process a #charset directive
6040
void CTcTokenizer::pp_charset()
6043
* Encountering a #charset directive within the tokenizer is always
6044
* an error. If the file opener managed to use a #charset, we'll
6045
* never see it, because the file opener will have skipped it before
6046
* giving us the file.
6048
* If we flagged a #charset error when opening the file, indicate
6049
* that the problem is that the character set given was unloadable;
6050
* otherwise, the problem is that #charset is in the wrong place.
6052
log_error(str_ != 0 && str_->get_charset_error()
6053
? TCERR_CANT_LOAD_CHARSET : TCERR_UNEXPECTED_CHARSET);
6055
/* don't retain this pragma in the result */
6059
/* ------------------------------------------------------------------------ */
6061
* Process a #include directive
6063
void CTcTokenizer::pp_include()
6069
CTcSrcFile *new_src;
6071
int default_charset_error;
6072
char full_name[OSFNMAX];
6073
char lcl_name[OSFNMAX];
6075
CTcTokFileDesc *desc;
6079
/* presume we'll expand macros */
6083
* Check to see if expansion is needed. Macro expansion is needed
6084
* only if the source line is not of one of the following forms:
6086
*. #include "filename"
6087
*. #include <filename>
6089
for (start = p_ ; is_space(p_.getch()) ; p_.inc()) ;
6093
/* look for a matching '>' */
6098
/* look for a matching '"' */
6103
/* find the matching character */
6104
for (p_.inc() ; p_.getch() != '\0' && p_.getch() != match ;
6107
/* if we found it, check for other characters on the line */
6108
if (p_.getch() == match)
6110
/* skip the matching character */
6113
/* skip whitespace */
6114
while (is_space(p_.getch()))
6118
* make sure there's nothing else on the line - if not, it's
6119
* one of the approved formats, so there's no need to do
6122
if (p_.getch() == 0)
6128
/* go back to read from the original starting point */
6131
/* expand macros if necessary */
6134
/* do the expansion */
6135
if (expand_macros_curline(FALSE, FALSE, FALSE))
6137
/* clear the buffer and abort */
6143
* remove any expansion flags, so that we don't have to worry about
6144
* parsing or skipping them
6146
remove_expansion_flags(&expbuf_);
6148
/* read from the expansion buffer */
6149
start_new_line(&expbuf_, 0);
6152
/* skip leading whitespace */
6153
for ( ; is_space(p_.getch()) ; p_.inc()) ;
6155
/* we have to be looking at at '"' or '<' character */
6156
if (p_.getch() == '"')
6158
/* look for a matching quote, and look for a local file */
6162
else if (p_.getch() == '<')
6164
/* look for a matching angle bracket, and look for a system file */
6170
/* invalid syntax - log an error and ignore the line */
6171
log_error(TCERR_BAD_INC_SYNTAX);
6176
/* skip the open quote, and remember where the filename starts */
6180
/* find the matching quote */
6181
for ( ; p_.getch() != '\0' && p_.getch() != match ; p_.inc()) ;
6183
/* if we didn't find the match, log an error and ignore the line */
6184
if (p_.getch() == '\0')
6186
log_error(TCERR_BAD_INC_SYNTAX);
6193
* We found the close quote. Before we parse the filename, make
6194
* one last check: if there's anything further on the line apart
6195
* from whitespace, it's extraneous, so issue a warning.
6198
/* remember where the close quote is */
6199
utf8_ptr closep = p_;
6201
/* skip it, and then skip any trailing whitespace */
6202
for (p_.inc() ; is_space(p_.getch()) ; p_.inc()) ;
6204
/* if we're not at the end of the line, issue a warning */
6205
if (p_.getch() != '\0')
6206
log_warning(TCERR_EXTRA_INC_SYNTAX);
6209
* Null-terminate the filename. (We know there's nothing else
6210
* interesting in the buffer after the filename at this point, so
6211
* we don't care about overwriting the quote or anything that might
6217
/* check to see if the filename is absolute */
6218
is_absolute = os_is_file_absolute(fname.getptr());
6220
/* we have yet to find the file */
6224
* in case the name is in portable URL notation, convert from URL
6225
* notation to local notation; we'll consider this form of the name
6226
* first, and only if we can't find it in this form will we try
6227
* treating the name as using local filename conventions
6229
os_cvt_url_dir(lcl_name, sizeof(lcl_name), fname.getptr(), FALSE);
6232
* Search for the included file.
6234
* First, if it's a local file (in quotes rather than angle
6235
* brackets), start the search in the directory containing the
6236
* current file, then look in the directory containing the parent
6237
* file, and so on. If we fail to find it, proceed as for a
6240
if (is_local && last_desc_ != 0)
6242
CTcTokStream *cur_str;
6243
char pathbuf[OSFNMAX];
6245
/* start with the current file, and search parents */
6246
for (cur_str = str_ ; cur_str != 0 ; cur_str = cur_str->get_parent())
6248
/* get the path to the current file */
6249
os_get_path_name(pathbuf, sizeof(pathbuf),
6250
last_desc_->get_fname());
6253
* try the URL-converted name first - this takes precedence
6254
* over a local interpretation of the name
6256
os_build_full_path(full_name, sizeof(full_name),
6258
if (!osfacc(full_name))
6264
/* if it's a relative local name, try again with local naming */
6268
* build the full filename, treating the name as using
6269
* local system conventions
6271
os_build_full_path(full_name, sizeof(full_name),
6272
pathbuf, fname.getptr());
6274
/* if we found it, so note and stop searching */
6275
if (!osfacc(full_name))
6285
* If we still haven't found the file (or if it's a non-local file,
6286
* in angle brackets), search the include path.
6290
tctok_incpath_t *inc_path;
6292
/* scan the include path */
6293
for (inc_path = incpath_head_ ; inc_path != 0 ;
6294
inc_path = inc_path->nxt)
6296
/* try the URL-converted local name first */
6297
os_build_full_path(full_name, sizeof(full_name),
6298
inc_path->path, lcl_name);
6299
if (!osfacc(full_name))
6305
/* try with the local name, if it's a relative local name */
6308
/* build the full name for the file in this directory */
6309
os_build_full_path(full_name, sizeof(full_name),
6310
inc_path->path, fname.getptr());
6312
/* if we found it, stop searching */
6313
if (!osfacc(full_name))
6323
* If the filename specified an absolute path, and we didn't find a
6324
* file with any of the local interpretations, look at the absolute
6325
* path. Note that our portable URL-style notation doesn't allow
6326
* absolute notation, so we use only the exact name as specified in
6327
* the #include directive as the absolute form.
6329
if (is_absolute && !found)
6331
/* use the original filename as the full name */
6332
strcpy(full_name, fname.getptr());
6334
/* try finding the file */
6335
found = !osfacc(full_name);
6339
* we have our copy of the filename now; we don't want to retain
6340
* this directive in the preprocessed source, so clear out the line
6346
* if we didn't find the file anywhere, show an error and ignore the
6347
* #include directive
6351
log_error(TCERR_INC_NOT_FOUND,
6352
(int)strlen(fname.getptr()), fname.getptr());
6357
* Check the list of included files that are marked for inclusion
6358
* only once. If we've already included this file, ignore this
6359
* redundant inclusion. Check based on the full filename that we
6360
* resolved from the search path.
6362
if (find_include_once(full_name))
6364
/* log an error if appropriate */
6365
if (warn_on_ignore_incl_)
6366
log_warning(TCERR_REDUNDANT_INCLUDE,
6367
(int)strlen(full_name), full_name);
6369
/* ignore this #include directive */
6373
/* open a file source to read the file */
6374
new_src = CTcSrcFile::open_source(full_name, res_loader_,
6375
default_charset_, &charset_error,
6376
&default_charset_error);
6378
/* if we couldn't open the file, log an error and ignore the line */
6382
* if the error was due to the default character set, log that
6383
* problem; otherwise, log the general file-open problem
6385
if (default_charset_error)
6386
log_error(TCERR_CANT_LOAD_DEFAULT_CHARSET, default_charset_);
6388
log_error(TCERR_INC_NOT_FOUND,
6389
(int)strlen(full_name), full_name);
6391
/* we can go no further */
6395
/* get the descriptor for the source file */
6396
desc = get_file_desc(full_name, strlen(full_name), FALSE,
6398
fname.getptr() != 0 ? strlen(fname.getptr()) : 0);
6401
* remember the current #pragma newline_spacing mode, so we can restore
6402
* it when we reinstate the current stream
6404
str_->set_newline_spacing(string_newline_spacing_);
6407
* Create and install the new file reader stream object. By
6408
* installing it as the current reader, we'll activate it so that
6409
* the next line read will come from the new stream. Note that the
6410
* current stream becomes the parent of the new stream, so that we
6411
* revert to the current stream when the new stream is exhausted;
6412
* this will allow us to pick up reading from the current stream at
6413
* the next line after the #include directive when we've finished
6414
* including the new file.
6416
str_ = new CTcTokStream(desc, new_src, str_, charset_error, if_sp_);
6419
* If we're in ALL_ONCE mode, it means that every single file we
6420
* include should be included only once.
6423
add_include_once(full_name);
6426
* if we're in list-includes mode, write the name of the include file
6427
* to the standard output
6429
if (list_includes_mode_)
6430
G_hostifc->print_msg("#include %s\n", full_name);
6433
/* ------------------------------------------------------------------------ */
6435
* Add a file to the include-once list. Once a file is in this list, we
6436
* won't include it again.
6438
void CTcTokenizer::add_include_once(const char *fname)
6440
tctok_incfile_t *prvinc;
6442
/* if the file is already in the list, don't add it again */
6443
if (find_include_once(fname))
6446
/* create a new entry for the filename */
6447
prvinc = (tctok_incfile_t *)t3malloc(sizeof(tctok_incfile_t)
6450
/* save the filename */
6451
strcpy(prvinc->fname, fname);
6453
/* link the new entry into our list */
6454
prvinc->nxt = prev_includes_;
6455
prev_includes_ = prvinc;
6459
* Find a file in the list of files to be included only once. Returns
6460
* true if the file is in the list, false if not.
6462
int CTcTokenizer::find_include_once(const char *fname)
6464
tctok_incfile_t *prvinc;
6466
/* search the list */
6467
for (prvinc = prev_includes_ ; prvinc != 0 ; prvinc = prvinc->nxt)
6469
/* if this one matches, we found it, so return true */
6470
if (strcmp(fname, prvinc->fname) == 0)
6474
/* we didn't find the file */
6478
/* ------------------------------------------------------------------------ */
6480
* Process a #define directive
6482
void CTcTokenizer::pp_define()
6484
const char *macro_name;
6486
const char *argv[TOK_MAX_MACRO_ARGS];
6487
size_t argvlen[TOK_MAX_MACRO_ARGS];
6492
CTcHashEntryPp *entry;
6495
/* get the macro name */
6496
if (next_on_line() != TOKT_SYM)
6498
log_error(TCERR_BAD_DEFINE_SYM,
6499
(int)curtok_.get_text_len(), curtok_.get_text());
6504
/* make a copy of the macro name */
6505
macro_name = curtok_.get_text();
6506
macro_len = curtok_.get_text_len();
6508
/* no arguments yet */
6511
/* presume we won't find a varargs marker */
6512
has_varargs = FALSE;
6515
* If there's a '(' immediately after the macro name, without any
6516
* intervening whitespace, it has arguments; otherwise, it has no
6517
* arguments. Note which case we have.
6519
if (p_.getch() == '(')
6524
/* note that we have an argument list */
6527
/* assume we're not done yet */
6530
/* skip the paren and get the next token */
6532
tok = next_on_line();
6534
/* check for an empty argument list */
6535
if (tok == TOKT_RPAR)
6537
/* note that we're done with the arguments */
6541
/* scan the argument list */
6544
/* if we have too many arguments, it's an error */
6545
if (argc >= TOK_MAX_MACRO_ARGS)
6547
log_error(TCERR_TOO_MANY_MAC_PARMS,
6548
macro_name, macro_len, TOK_MAX_MACRO_ARGS);
6553
/* if we're at the end of the macro, it's an error */
6554
if (tok == TOKT_EOF)
6556
/* log the error and ignore the line */
6557
log_error(TCERR_MACRO_NO_RPAR);
6562
/* check for a valid initial symbol character */
6563
if (tok != TOKT_SYM)
6565
log_error_curtok(TCERR_BAD_MACRO_ARG_NAME);
6570
/* remember the argument name */
6571
argvlen[argc] = curtok_.get_text_len();
6572
argv[argc++] = curtok_.get_text();
6574
/* get the next token */
6575
tok = next_on_line();
6577
/* make sure we have a comma or paren following */
6578
if (tok == TOKT_COMMA)
6580
/* we have more arguments - skip the comma */
6581
tok = next_on_line();
6583
else if (tok == TOKT_ELLIPSIS)
6585
/* skip the ellipsis */
6586
tok = next_on_line();
6588
/* note the varargs marker */
6591
/* this must be the last argument */
6592
if (tok != TOKT_RPAR)
6595
log_error_curtok(TCERR_MACRO_ELLIPSIS_REQ_RPAR);
6597
/* discard the line and give up */
6602
/* that's the last argument - we can stop now */
6605
else if (tok == TOKT_RPAR)
6607
/* no more arguments - note that we can stop now */
6612
/* invalid argument - log an error and discard the line */
6613
log_error_curtok(TCERR_MACRO_EXP_COMMA);
6622
* there are no arguments - the macro's expansion starts
6623
* immediately after the end of the name and any subsequent
6629
/* skip whitespace leading up to the expansion */
6630
while (is_space(p_.getch()))
6633
/* the rest of the line is the expansion */
6634
expan = p_.getptr();
6636
/* don't allow defining "defined" */
6637
if (macro_len == 7 && memcmp(macro_name, "defined", 7) == 0)
6640
log_error(TCERR_REDEF_OP_DEFINED);
6642
/* don't retain the directive in the preprocessed result */
6645
/* ignore the definition */
6649
/* get the length of the expansion text */
6650
expan_len = strlen(expan);
6653
* remove any trailing whitespace from the expansion text; however,
6654
* leave a trailing space if it's preceded by a backslash
6656
while (expan_len > 0
6657
&& is_space(expan[expan_len-1])
6658
&& !(expan_len > 1 && expan[expan_len-2] == '\\'))
6662
* If there are arguments, scan the expansion for formal parameter
6663
* names. For each one we find, replace it with the special
6664
* TOK_MACRO_FORMAL_FLAG character followed by a one-byte value
6665
* giving the argument index. This special sequence is less costly
6666
* to find when we're expanding the macros - by doing the search
6667
* here, we only need to do it once, rather than each time we expand
6677
int in_embedding = FALSE;
6680
* Generate our modified expansion text in the macro expansion
6681
* buffer. Initially, make sure we have room for a copy of the
6682
* text; we'll resize the buffer later if we find we need even
6685
expbuf_.ensure_space(expan_len);
6687
/* scan for argument names, and replace them */
6688
for (start = expan, dstofs = 0, src.set((char *)expan) ;; )
6690
/* get the next token */
6691
typ = next_on_line(&src, &tok, &in_embedding, FALSE);
6693
/* if we've reached the end of the expansion, we're done */
6694
if (typ == TOKT_EOF)
6698
* If this is a formal parameter name, we'll replace it with
6699
* a special two-byte sequence; otherwise, we'll keep it
6702
if (typ == TOKT_SYM)
6706
/* find it in the table */
6707
for (i = 0 ; i < argc ; ++i)
6709
/* does it match this argument name? */
6710
if (argvlen[i] == tok.get_text_len()
6711
&& memcmp(argv[i], tok.get_text(),
6712
tok.get_text_len()) == 0)
6719
/* get the length of the formal name */
6720
arg_len = argvlen[i];
6723
* the normal replacement length for a formal
6724
* parameter is two bytes - one byte for the flag,
6725
* and one for the formal parameter index
6729
/* by default, the flag byte is the formal flag */
6730
flag_byte = TOK_MACRO_FORMAL_FLAG;
6733
* Check for special varargs control suffixes. If
6734
* we matched the last argument name, and this is
6735
* a varargs macro, we might have a suffix.
6739
&& src.getch() == '#')
6741
/* check for the various suffixes */
6742
if (memcmp(src.getptr() + 1, "foreach", 7) == 0
6743
&& !is_sym(src.getch_at(8)))
6746
* include the suffix length in the token
6752
* the flag byte is the #foreach flag,
6753
* which is a one-byte sequence
6755
flag_byte = TOK_MACRO_FOREACH_FLAG;
6758
else if (memcmp(src.getptr() + 1,
6760
&& !is_sym(src.getch_at(9)))
6763
* include the suffix length in the token
6769
* the flag byte is the #argcount flag,
6770
* which is a one-byte sequence
6772
flag_byte = TOK_MACRO_ARGCOUNT_FLAG;
6775
else if (memcmp(src.getptr() + 1,
6777
&& !is_sym(src.getch_at(8)))
6779
/* include the length */
6782
/* set the one-byte flag */
6783
flag_byte = TOK_MACRO_IFEMPTY_FLAG;
6786
else if (memcmp(src.getptr() + 1,
6788
&& !is_sym(src.getch_at(9)))
6790
/* include the length */
6793
/* set the one-byte flag */
6794
flag_byte = TOK_MACRO_IFNEMPTY_FLAG;
6800
* calculate the new length - we're removing the
6801
* argument name and adding the replacement string
6804
new_len = expan_len + repl_len - arg_len;
6807
* we need two bytes for the replacement - if
6808
* this is more than we're replacing, make sure
6809
* we have room for the extra
6811
if (new_len > expan_len)
6812
expbuf_.ensure_space(new_len);
6815
* copy everything up to but not including the
6818
if (tok.get_text() > start)
6820
/* store the text */
6821
memcpy(expbuf_.get_buf() + dstofs,
6822
start, tok.get_text() - start);
6824
/* move past the stored text in the output */
6825
dstofs += tok.get_text() - start;
6828
/* the next segment starts after this token */
6829
start = tok.get_text() + arg_len;
6831
/* store the flag byte */
6832
expbuf_.get_buf()[dstofs++] = flag_byte;
6835
* If appropriate, store the argument index - this
6836
* always fits in one byte because our hard limit
6837
* on formal parameters is less than 128 per
6838
* macro. Note that we add one to the index so
6839
* that we never store a zero byte, to avoid any
6840
* potential confusion with a null terminator
6844
expbuf_.get_buf()[dstofs++] = (char)(i + 1);
6846
/* remember the new length */
6847
expan_len = new_len;
6849
/* no need to search further for it */
6856
/* copy the last segment */
6857
if (tok.get_text() > start)
6859
/* store the text */
6860
memcpy(expbuf_.get_buf() + dstofs, start,
6861
tok.get_text() - start);
6864
/* set the new length */
6865
expbuf_.set_text_len(expan_len);
6867
/* use the modified expansion text instead of the original */
6868
expan = expbuf_.get_text();
6872
* check the symbol table to see if this symbol is already defined -
6873
* if so, show a warning, but honor the new definition
6875
entry = find_define(macro_name, macro_len);
6879
* Check for a trivial redefinition - if the number of arguments
6880
* is the same, and the type (object-like or function-like) is
6881
* the same, and the expansion string is identical, there's no
6882
* need to warn, because the redefinition has no effect and can
6883
* thus be safely ignored. Note that we must ignore any
6884
* differences in the whitespace in the expansions for this
6887
if ((entry->has_args() != 0) == (has_args != 0)
6888
&& entry->get_argc() == argc
6889
&& lib_strequal_collapse_spaces(expan, expan_len,
6890
entry->get_expansion(),
6891
entry->get_expan_len()))
6893
/* it's a non-trivial redefinition - ignore it */
6897
/* log a warning about the redefinition */
6898
log_warning(TCERR_MACRO_REDEF, (int)macro_len, macro_name);
6900
/* remove and delete the old entry */
6901
defines_->remove(entry);
6903
/* if the item isn't already in the #undef table, add it */
6904
if (find_undef(macro_name, macro_len) == 0)
6907
* move the entry to the #undef table so that we can keep track
6908
* of the fact that this macro's definition has changed in the
6909
* course of the compilation
6911
undefs_->add(entry);
6916
* the name is already in the #undef table, so we don't need
6917
* another copy - just forget about the old entry entirely
6923
/* create an entry for the new macro */
6924
entry = new CTcHashEntryPpDefine(macro_name, macro_len, TRUE,
6925
has_args, argc, has_varargs,
6926
argv, argvlen, expan, expan_len);
6928
/* add it to the hash table */
6929
defines_->add(entry);
6932
/* don't retain the directive in the preprocessed source */
6936
/* ------------------------------------------------------------------------ */
6938
* Process a #ifdef directive
6940
void CTcTokenizer::pp_ifdef()
6942
/* process the ifdef/ifndef with a positive sense */
6943
pp_ifdef_or_ifndef(TRUE);
6947
* Process a #ifndef directive
6949
void CTcTokenizer::pp_ifndef()
6951
/* process the ifdef/ifndef with a negative sense */
6952
pp_ifdef_or_ifndef(FALSE);
6956
* Process a #ifdef or #ifndef. If 'sense' is true, we'll take the
6957
* branch if the symbol is defined (hence #ifdef), otherwise we'll take
6958
* it if the symbol isn't defined (hence #ifndef).
6960
void CTcTokenizer::pp_ifdef_or_ifndef(int sense)
6962
char macro_name[TOK_SYM_MAX_BUFFER];
6966
/* make sure we have a valid symbol */
6967
if (pp_get_lone_ident(macro_name, sizeof(macro_name)))
6969
/* clear the line buffer */
6973
* push a true if to avoid cascading errors for matching #endif
6976
push_if(TOKIF_IF_YES);
6982
/* check to see if it's defined */
6983
found = (find_define(macro_name, strlen(macro_name)) != 0);
6986
* if we found it and they wanted it found, or we didn't find it and
6987
* they didn't want it found, take a true branch; otherwise, take a
6990
if ((sense != 0) == (found != 0))
6991
state = TOKIF_IF_YES;
6993
state = TOKIF_IF_NO;
6995
/* push the new #if state */
6998
/* don't retain the directive in the preprocessed source */
7002
/* ------------------------------------------------------------------------ */
7004
* Process a #if directive
7006
void CTcTokenizer::pp_if()
7010
/* expand macros; don't allow reading additional lines */
7011
if (expand_macros_curline(FALSE, TRUE, FALSE))
7015
* we don't need the original source line any more, and we don't
7016
* want to copy it to the preprocessed output, so clear it
7020
/* parse out of the expansion buffer */
7021
start_new_line(&expbuf_, 0);
7023
/* parse the preprocessor expression */
7024
if (pp_parse_expr(&val, TRUE, TRUE, TRUE))
7027
* we can't get a value; treat the expression as true and
7028
* continue parsing, so that we don't throw off the #if nesting
7034
/* push the new state according to the value of the expression */
7035
push_if(val.get_val_bool() ? TOKIF_IF_YES : TOKIF_IF_NO);
7041
/* clear the line buffer */
7045
* push a true if - even though we can't evaluate the condition, we
7046
* can at least avoid a cascade of errors for the matching #endif
7049
push_if(TOKIF_IF_YES);
7052
/* ------------------------------------------------------------------------ */
7054
* Process a #elif directive
7056
void CTcTokenizer::pp_elif()
7060
/* expand macros; don't allow reading additional lines */
7061
if (expand_macros_curline(FALSE, TRUE, FALSE))
7067
/* parse out of the expansion buffer */
7068
start_new_line(&expbuf_, 0);
7070
/* parse the preprocessor expression */
7071
if (pp_parse_expr(&val, TRUE, TRUE, TRUE))
7078
* make sure that the #elif occurs in the same file as the
7081
if (if_sp_ <= str_->get_init_if_level())
7084
log_error(TCERR_PP_ELIF_NOT_IN_SAME_FILE);
7086
/* clear the text and abort */
7091
/* check the current #if state */
7092
switch(get_if_state())
7096
* we just took the #if branch, so don't take this or any
7097
* subsequent #elif or #else branch, regardless of the value of
7098
* the condition - set the state to DONE to indicate that we're
7099
* skipping everything through the endif
7101
change_if_state(TOKIF_IF_DONE);
7106
* We haven't yet taken a #if or #elif branch, so we can take
7107
* this branch if its condition is true. If this branch's
7108
* condition is false, stay with NO so that we will consider
7109
* future #elif and #else branches.
7111
if (val.get_val_bool())
7112
change_if_state(TOKIF_IF_YES);
7117
* we've already taken a #if or #elif branch, so we must ignore
7118
* this and subsequent #elif and #else branches until we get to
7119
* our #endif - just stay in state DONE
7124
case TOKIF_ELSE_YES:
7127
* we're not in a #if branch at all, or we're inside a #else; a
7128
* #elif is not legal here
7130
log_error(TCERR_PP_ELIF_WITHOUT_IF);
7134
/* don't retain the directive in the preprocessed source */
7138
/* ------------------------------------------------------------------------ */
7140
* Process a #else directive
7142
void CTcTokenizer::pp_else()
7144
/* make sure there's nothing but whitespace on the line */
7145
if (next_on_line() != TOKT_EOF)
7146
log_error(TCERR_PP_EXTRA);
7149
* make sure that the #else occurs in the same file as the
7152
if (if_sp_ <= str_->get_init_if_level())
7155
log_error(TCERR_PP_ELSE_NOT_IN_SAME_FILE);
7157
/* clear the text and abort */
7162
/* check our current #if state */
7163
switch(get_if_state())
7168
* we've already taken a true #if branch, so we don't want to
7169
* process the #else part - switch to a false #else branch
7171
change_if_state(TOKIF_ELSE_NO);
7176
* we haven't yet found a true #if branch, so take the #else
7177
* branch -- switch to a true #else branch
7179
change_if_state(TOKIF_ELSE_YES);
7183
case TOKIF_ELSE_YES:
7186
* we're not in a #if at all, or we're in a #else - log an error
7189
log_error(TCERR_PP_ELSE_WITHOUT_IF);
7193
/* don't retain the directive in the preprocessed source */
7197
/* ------------------------------------------------------------------------ */
7199
* Process a #endif directive
7201
void CTcTokenizer::pp_endif()
7203
/* make sure the rest of the line is blank */
7204
if (next_on_line() != TOKT_EOF)
7205
log_error(TCERR_PP_EXTRA);
7207
/* ignore the rest of the line */
7210
/* if we're not in a #if in the same file it's an error */
7213
log_error(TCERR_PP_ENDIF_WITHOUT_IF);
7216
else if (if_sp_ <= str_->get_init_if_level())
7218
log_error(TCERR_PP_ENDIF_NOT_IN_SAME_FILE);
7222
/* pop a #if level */
7225
/* don't retain the directive in the preprocessed source */
7229
/* ------------------------------------------------------------------------ */
7231
* Process a #error directive
7233
void CTcTokenizer::pp_error()
7238
* copy the source line through the "error" token to the macro
7239
* expansion buffer - we don't want to expand that part, but we want
7240
* it to appear in the expansion, so just copy the original
7242
startofs = (curtok_.get_text() + curtok_.get_text_len()
7243
- linebuf_.get_text());
7244
expbuf_.copy(linebuf_.get_text(), startofs);
7246
/* expand macros; don't allow reading additional lines */
7247
if (expand_macros_curline(FALSE, FALSE, TRUE))
7253
/* clean up any expansion flags embedded in the buffer */
7254
remove_expansion_flags(&expbuf_);
7257
* If we're in preprocess-only mode, simply retain the text in the
7258
* processed result, so that the error is processed on a subsequent
7259
* compilation of the result; otherwise, display the error.
7261
* Ignore #error directives in list-includes mode as well.
7263
if (!pp_only_mode_ && !list_includes_mode_)
7265
/* display the error */
7266
log_error(TCERR_ERROR_DIRECTIVE,
7267
(int)expbuf_.get_text_len() - startofs,
7268
expbuf_.get_text() + startofs);
7270
/* clear the directive from the result */
7275
/* preprocessing - copy expanded text to line buffer */
7276
linebuf_.copy(expbuf_.get_text(), expbuf_.get_text_len());
7280
/* ------------------------------------------------------------------------ */
7282
* Process a #undef directive
7284
void CTcTokenizer::pp_undef()
7286
char macro_name[TOK_SYM_MAX_BUFFER];
7288
/* get the macro name */
7289
if (pp_get_lone_ident(macro_name, sizeof(macro_name)))
7296
undefine(macro_name);
7298
/* don't retain the directive in the preprocessed source */
7303
* Programmatically delete a preprocesor symbol
7305
void CTcTokenizer::undefine(const char *sym, size_t len)
7307
CTcHashEntryPp *entry;
7310
* find the macro - if it wasn't defined, silently ignore it, since
7311
* it's legal to #undef a symbol that wasn't previously defined
7313
entry = find_define(sym, len);
7314
if (entry != 0 && entry->is_undefable())
7317
defines_->remove(entry);
7319
/* if it's not already in the #undef table, move it there */
7320
if (find_undef(sym, len) == 0)
7322
/* move it to the #undef table */
7323
undefs_->add(entry);
7328
* the name is already in the #undef table, so we don't need to
7329
* add it again - we can forget about this entry entirely
7336
/* ------------------------------------------------------------------------ */
7338
* Process a #line directive
7340
void CTcTokenizer::pp_line()
7342
CTcConstVal val_line;
7343
CTcConstVal val_fname;
7344
CTcTokFileDesc *desc;
7346
/* expand macros; don't allow reading additional lines */
7347
if (expand_macros_curline(FALSE, TRUE, FALSE))
7354
* we don't need the original source line any more, and we don't
7355
* want to copy it to the preprocessed output, so clear it
7359
/* set up to parse from the expansion */
7360
start_new_line(&expbuf_, 0);
7362
/* evaluate the line number expression */
7363
if (pp_parse_expr(&val_line, TRUE, FALSE, TRUE))
7366
/* if it's not an integer constant, it's an error */
7367
if (val_line.get_type() != TC_CVT_INT)
7369
log_error(TCERR_LINE_REQ_INT);
7373
/* evaluate the filename expression */
7374
if (pp_parse_expr(&val_fname, FALSE, TRUE, TRUE))
7377
/* the filename must be a string expression */
7378
if (val_fname.get_type() != TC_CVT_SSTR)
7380
log_error(TCERR_LINE_FILE_REQ_STR);
7384
/* find or create a descriptor for the filename */
7385
desc = get_file_desc(val_fname.get_val_str(),
7386
val_fname.get_val_str_len(), FALSE, 0, 0);
7388
/* set the new line number and descriptor in the current stream */
7391
str_->set_next_linenum(val_line.get_val_int());
7392
str_->set_desc(desc);
7396
* retain the pragma in the result if we're in preprocess-only mode,
7397
* otherwise remove it
7403
/* ------------------------------------------------------------------------ */
7405
* Look up a symbol in the #define symbol table
7407
CTcHashEntryPp *CTcTokenizer::find_define(const char *sym, size_t len) const
7409
/* look it up in the #define symbol table and return the result */
7410
return (CTcHashEntryPp *)defines_->find(sym, len);
7414
* Look up a symbol in the #undef table
7416
CTcHashEntryPp *CTcTokenizer::find_undef(const char *sym, size_t len) const
7418
/* look it up in the #define symbol table and return the result */
7419
return (CTcHashEntryPp *)undefs_->find(sym, len);
7423
* Add a preprocessor macro definition
7425
void CTcTokenizer::add_define(const char *sym, size_t len,
7426
const char *expansion, size_t expan_len)
7428
CTcHashEntryPp *entry;
7430
/* create an entry for the macro, with no argument list */
7431
entry = new CTcHashEntryPpDefine(sym, len, TRUE, FALSE, 0, FALSE, 0, 0,
7432
expansion, expan_len);
7434
/* add the new entry to the table */
7435
defines_->add(entry);
7439
* Add a preprocessor macro definition
7441
void CTcTokenizer::add_define(CTcHashEntryPp *entry)
7443
/* add the entry to our symbol table */
7444
defines_->add(entry);
7448
* parse an expression
7450
int CTcTokenizer::pp_parse_expr(CTcConstVal *val, int read_first,
7451
int last_on_line, int add_line_ending)
7453
CTcPrsNode *expr_tree;
7456
/* add the line ending marker if required */
7457
if (add_line_ending)
7460
* append the special end-of-preprocess-line to the macro
7463
ch = TOK_END_PP_LINE;
7464
expbuf_.append(&ch, 1);
7468
* note that we're pasing a preprocessor expression; this affects
7469
* error logging in certain cases
7474
* parse the expression in preprocessor mode, so that double-quoted
7475
* strings can be concatenated and compared
7477
G_prs->set_pp_expr_mode(TRUE);
7479
/* get the first token on the line if desired */
7483
/* parse the expression */
7484
expr_tree = G_prs->parse_expr();
7486
/* make sure we're at the end of the line if desired */
7487
if (last_on_line && next() != TOKT_EOF)
7488
log_error(TCERR_PP_EXPR_EXTRA);
7490
/* if we added the special pp-line-ending marker, remove it */
7491
if (add_line_ending)
7494
* the marker is always the last character - remove it simply by
7495
* shortening the buffer by a character
7497
expbuf_.set_text_len(expbuf_.get_text_len() - 1);
7500
/* return to normal expression mode */
7501
G_prs->set_pp_expr_mode(FALSE);
7503
/* return to normal tokenizing mode */
7504
in_pp_expr_ = FALSE;
7506
/* if we didn't get a valid expression, return failure */
7510
/* make sure we got a constant */
7511
if (!expr_tree->is_const())
7513
log_error(TCERR_PP_EXPR_NOT_CONST);
7517
/* fill in the caller's value */
7518
*val = *expr_tree->get_const_val();
7524
/* ------------------------------------------------------------------------ */
7526
* #define enumeration callback context
7528
struct def_enum_cb_t
7530
/* original callback function */
7531
void (*cb)(void *, CTcHashEntryPp *);
7533
/* original callback context */
7538
* #define enumeration callback. This is a simple impedence matcher on the
7539
* way to the real callbac; we cast the generic hash entry type to the
7540
* CTcHashEntryPp subclass for the benefit of the real callback.
7542
static void enum_defines_cb(void *ctx0, CVmHashEntry *entry)
7546
/* get our real context */
7547
ctx = (def_enum_cb_t *)ctx0;
7549
/* invoke the real callback, casting the entry reference appropriately */
7550
(*ctx->cb)(ctx->ctx, (CTcHashEntryPp *)entry);
7554
* Enumerate the entries in the #define table through a callback
7556
void CTcTokenizer::enum_defines(void (*cb)(void *, CTcHashEntryPp *),
7559
def_enum_cb_t myctx;
7561
/* set up our impedence-matcher context with the real callback info */
7565
/* enumerate through our impedence-matcher callback */
7566
defines_->enum_entries(&enum_defines_cb, &myctx);
7569
/* ------------------------------------------------------------------------ */
7571
* Get a lone identifier for a preprocessor directive. The identifier
7572
* must be the only thing left on the line; we'll generate an error if
7573
* extra characters follow on the line.
7575
* If there's no identifier on the line, or there's more information
7576
* after the identifier, logs an error and returns non-zero; returns
7579
int CTcTokenizer::pp_get_lone_ident(char *buf, size_t bufl)
7581
/* get the next token, and make sure it's a symbol */
7582
if (next_on_line() != TOKT_SYM)
7584
log_error_curtok(TCERR_BAD_DEFINE_SYM);
7588
/* return an error if it doesn't fit */
7589
if (curtok_.get_text_len() > bufl)
7593
memcpy(buf, curtok_.get_text(), curtok_.get_text_len());
7594
buf[curtok_.get_text_len()] = '\0';
7596
/* make sure there's nothing else on the line but whitespace */
7597
if (next_on_line() != TOKT_EOF)
7599
log_error(TCERR_PP_EXTRA);
7607
/* ------------------------------------------------------------------------ */
7609
* Push a new #if level
7611
void CTcTokenizer::push_if(tok_if_t state)
7613
/* if we're out of space in the stack, throw a fatal error */
7614
if (if_sp_ == TOK_MAX_IF_NESTING)
7615
throw_fatal_error(TCERR_IF_NESTING_OVERFLOW);
7618
* if we're in a nested #if in a false #if, increase the nested
7624
/* push the state, remembering where the #if was defined */
7625
if_stack_[if_sp_].desc = last_desc_;
7626
if_stack_[if_sp_].linenum = last_linenum_;
7627
if_stack_[if_sp_++].state = state;
7633
void CTcTokenizer::pop_if()
7635
/* if we're in a nested #if in a false #if, pop the nesting level */
7636
if (if_false_level_ != 0)
7639
/* pop the main if level */
7645
/* ------------------------------------------------------------------------ */
7649
void CTcTokenizer::log_error(int errnum, ...)
7653
/* display the message */
7654
va_start(marker, errnum);
7655
G_tcmain->v_log_error(G_tok->get_last_desc(), G_tok->get_last_linenum(),
7656
TC_SEV_ERROR, errnum, marker);
7661
* Log an error with the current token's text as the parameter data,
7662
* suitable for use with a "%.*s" display format entry
7664
void CTcTokenizer::log_error_curtok(int errnum)
7667
* display the message, passing "%.*s" parameter data for the
7668
* current token text: an integer giving the length of the token
7669
* text, and a pointer to the token text
7671
log_error_or_warning_curtok(TC_SEV_ERROR, errnum);
7675
* Log an error or warning for the current token
7677
void CTcTokenizer::log_error_or_warning_curtok(tc_severity_t sev, int errnum)
7679
/* log the error with our current token */
7680
log_error_or_warning_with_tok(sev, errnum, getcur());
7684
* Log an error or warning with the given token
7686
void CTcTokenizer::log_error_or_warning_with_tok(
7687
tc_severity_t sev, int errnum, const CTcToken *tok)
7689
const char *tok_txt;
7699
/* see what we have */
7700
switch(tok->gettyp())
7703
/* show the string in quotes, but limit the length */
7713
case TOKT_DSTR_START:
7729
/* set the prefix */
7730
strcpy(buf, prefix);
7733
* show the string, but limit the length, and convert control
7734
* characters to escaped representation
7736
src.set((char *)tok->get_text());
7737
rem = tok->get_text_len();
7738
for (dst.set(buf + strlen(buf)), outchars = 0 ;
7739
rem != 0 && outchars < 20 ; src.inc(&rem), ++outchars)
7741
/* if this is a control character, escape it */
7742
if (src.getch() < 32)
7774
dst.setch('0' + (src.getch() >> 12) & 0xf);
7775
dst.setch('0' + (src.getch() >> 8) & 0xf);
7776
dst.setch('0' + (src.getch() >> 4) & 0xf);
7777
dst.setch('0' + (src.getch()) & 0xf);
7783
/* put this character as-is */
7784
dst.setch(src.getch());
7788
/* if there's more string left, add "..." */
7796
/* add the suffix */
7797
strcpy(dst.getptr(), suffix);
7799
/* use this buffer as the token string to display */
7801
tok_len = strlen(tok_txt);
7805
/* show a special "<End Of File>" marker */
7806
tok_txt = "<End Of File>";
7807
tok_len = strlen(tok_txt);
7811
/* just show the current token text */
7812
tok_txt = tok->get_text();
7813
tok_len = tok->get_text_len();
7818
G_tcmain->log_error(get_last_desc(), get_last_linenum(),
7819
sev, errnum, tok_len, tok_txt);
7825
void CTcTokenizer::log_warning(int errnum, ...)
7829
/* display the message */
7830
va_start(marker, errnum);
7831
G_tcmain->v_log_error(G_tok->get_last_desc(), G_tok->get_last_linenum(),
7832
TC_SEV_WARNING, errnum, marker);
7837
* Log a warning with the current token's text as the parameter data,
7838
* suitable for use with a "%.*s" display format entry
7840
void CTcTokenizer::log_warning_curtok(int errnum)
7843
* display the warning message, passing "%.*s" parameter data for
7844
* the current token text: an integer giving the length of the token
7845
* text, and a pointer to the token text
7847
log_error_or_warning_curtok(TC_SEV_WARNING, errnum);
7851
* Log and throw an internal error
7853
void CTcTokenizer::throw_internal_error(int errnum, ...)
7857
/* display the message */
7858
va_start(marker, errnum);
7859
G_tcmain->v_log_error(G_tok->get_last_desc(), G_tok->get_last_linenum(),
7860
TC_SEV_INTERNAL, errnum, marker);
7863
/* throw the generic internal error, since we've logged this */
7864
err_throw(TCERR_INTERNAL_ERROR);
7868
* Log and throw a fatal error
7870
void CTcTokenizer::throw_fatal_error(int errnum, ...)
7874
/* display the message */
7875
va_start(marker, errnum);
7876
G_tcmain->v_log_error(G_tok->get_last_desc(), G_tok->get_last_linenum(),
7877
TC_SEV_FATAL, errnum, marker);
7880
/* throw the generic fatal error, since we've logged this */
7881
err_throw(TCERR_FATAL_ERROR);
7885
* display a string value
7887
void CTcTokenizer::msg_str(const char *str, size_t len) const
7889
/* display the string through the host interface */
7890
G_hostifc->print_msg("%.*s", (int)len, str);
7894
* display a numeric value
7896
void CTcTokenizer::msg_long(long val) const
7898
/* display the number through the host interface */
7899
G_hostifc->print_msg("%ld", val);
7902
/* ------------------------------------------------------------------------ */
7904
* Tokenizer Input Stream implementation
7908
* create a token input stream
7910
CTcTokStream::CTcTokStream(CTcTokFileDesc *desc, CTcSrcObject *src,
7911
CTcTokStream *parent, int charset_error,
7914
/* remember the underlying source file */
7917
/* remember the file descriptor */
7920
/* remember the containing stream */
7923
/* the next line to read is line number 1 */
7926
/* remember if there was a #charset error */
7927
charset_error_ = charset_error;
7929
/* we're not in a comment yet */
7930
in_comment_ = FALSE;
7932
/* remember the starting #if level */
7933
init_if_level_ = init_if_level;
7935
#if 0 // #pragma C is not currently used
7937
* start out in parent's pragma C mode, or in non-C mode if we have
7941
pragma_c_ = parent->is_pragma_c();
7948
* delete a token input stream
7950
CTcTokStream::~CTcTokStream()
7952
/* we own the underlying file, so delete it */
7957
/* ------------------------------------------------------------------------ */
7963
* Get the length of a string with each instance of the given quote
7964
* character escaped with a backslash. We'll also count the escapes we
7965
* need for each backslash.
7967
static size_t get_quoted_len(const char *str, wchar_t qu)
7973
* scan the string for instances of the quote mark; each one adds an
7974
* extra byte to the length needed, since each one requires a
7975
* backslash character to escape the quote mark
7977
for (p.set((char *)str), len = strlen(str) ; p.getch() != '\0' ; p.inc())
7982
* check to see if this character is quotable - it is quotable if
7983
* it's a backslash or it's the quote character we're escaping
7986
if (ch == qu || ch == '\\')
7989
* we need to escape this character, so add a byte for the
7990
* backslash we'll need to insert
7996
/* return the length we calculated */
8001
* Build a quoted string. Fills in dst with the source string with each
8002
* of the given quote marks and each backslash escaped with a backslash.
8003
* Use get_quoted_len() to determine how much space to allocate for the
8004
* destination buffer.
8006
static void build_quoted_str(char *dstbuf, const char *src, wchar_t qu)
8011
/* scan the source string for escapable characters */
8012
for (p.set((char *)src), dst.set(dstbuf), dst.setch(qu) ;
8013
p.getch() != '\0' ; p.inc())
8017
/* get this source character */
8020
/* add a quote if we have a backslash or the quote character */
8021
if (ch == '\\' || ch == qu)
8023
/* add a backslash to escape the character */
8027
/* add the character */
8031
/* add the close quote and trailing null */
8037
* create a file descriptor
8039
CTcTokFileDesc::CTcTokFileDesc(const char *fname, size_t fname_len,
8040
int index, CTcTokFileDesc *orig_desc,
8041
const char *orig_fname, size_t orig_fname_len)
8043
const char *rootname;
8045
/* no source pages are allocated yet */
8049
/* remember the first instance of this filename in the list */
8052
/* there's nothing else in our chain yet */
8055
/* remember my index in the master list */
8058
/* if there's a filename, save a copy of the name */
8059
fname_ = lib_copy_str(fname, fname_len);
8061
/* if there's an original filename save it as well */
8062
orig_fname_ = lib_copy_str(orig_fname, orig_fname_len);
8065
* get the root filename, since we need to build a quoted version of
8066
* that as well as of the basic filename
8068
rootname = os_get_root_name(fname_);
8071
* Allocate space for the quoted versions of the filename - make room
8072
* for the filename plus the quotes (one on each end) and a null
8075
dquoted_fname_ = (char *)t3malloc(get_quoted_len(fname_, '"') + 3);
8076
squoted_fname_ = (char *)t3malloc(get_quoted_len(fname_, '\'') + 3);
8077
dquoted_rootname_ = (char *)t3malloc(get_quoted_len(rootname, '"') + 3);
8078
squoted_rootname_ = (char *)t3malloc(get_quoted_len(rootname, '\'') + 3);
8080
/* build the quoted version of the name */
8081
build_quoted_str(dquoted_fname_, fname_, '"');
8082
build_quoted_str(squoted_fname_, fname_, '\'');
8083
build_quoted_str(dquoted_rootname_, rootname, '"');
8084
build_quoted_str(squoted_rootname_, rootname, '\'');
8088
* delete the descriptor
8090
CTcTokFileDesc::~CTcTokFileDesc()
8092
/* delete the filename and original filename strings */
8093
lib_free_str(fname_);
8094
lib_free_str(orig_fname_);
8096
/* delete the quotable filename strings */
8097
t3free(dquoted_fname_);
8098
t3free(squoted_fname_);
8099
t3free(dquoted_rootname_);
8100
t3free(squoted_rootname_);
8102
/* delete each source page we've allocated */
8103
if (src_pages_ != 0)
8107
/* go through the index array and delete each allocated page */
8108
for (i = 0 ; i < src_pages_alo_ ; ++i)
8110
/* if this page was allocated, delete it */
8111
if (src_pages_[i] != 0)
8112
t3free(src_pages_[i]);
8115
/* delete the source page index array */
8121
* Source page structure. Each page tracks a block of source lines.
8123
const size_t TCTOK_SRC_PAGE_CNT = 1024;
8124
struct CTcTokSrcPage
8127
* Array of line entries on this page. Each entry is zero if it
8128
* hasn't been assigned yet, and contains the absolute image file
8129
* address of the generated code for the source line if it has been
8132
ulong ofs[TCTOK_SRC_PAGE_CNT];
8139
void CTcTokFileDesc::add_source_line(ulong linenum, ulong line_addr)
8144
/* get the index of the page containing this source line */
8145
page_idx = linenum / TCTOK_SRC_PAGE_CNT;
8147
/* get the index of the entry within the page */
8148
idx = linenum % TCTOK_SRC_PAGE_CNT;
8151
* determine if our page index table is large enough, and expand it
8154
if (page_idx >= src_pages_alo_)
8159
/* allocate or expand the source pages array */
8160
new_alo = page_idx + 16;
8161
siz = new_alo * sizeof(src_pages_[0]);
8162
if (src_pages_ == 0)
8163
src_pages_ = (CTcTokSrcPage **)t3malloc(siz);
8165
src_pages_ = (CTcTokSrcPage **)t3realloc(src_pages_, siz);
8167
/* clear the new part */
8168
memset(src_pages_ + src_pages_alo_, 0,
8169
(new_alo - src_pages_alo_) * sizeof(src_pages_[0]));
8171
/* remember the new allocation size */
8172
src_pages_alo_ = new_alo;
8175
/* if this page isn't allocated, do so now */
8176
if (src_pages_[page_idx] == 0)
8178
/* allocate the new page */
8179
src_pages_[page_idx] = (CTcTokSrcPage *)
8180
t3malloc(sizeof(CTcTokSrcPage));
8183
memset(src_pages_[page_idx], 0, sizeof(CTcTokSrcPage));
8187
* if this source line entry has been previously set, don't change
8188
* it; otherwise, store the new setting
8190
if (src_pages_[page_idx]->ofs[idx] == 0)
8191
src_pages_[page_idx]->ofs[idx] = line_addr;
8195
* Enumerate source lines
8197
void CTcTokFileDesc::enum_source_lines(void (*cbfunc)(void *, ulong, ulong),
8203
/* loop over all of the pages */
8204
for (page_idx = 0, pg = src_pages_ ; page_idx < src_pages_alo_ ;
8211
/* if this page is not populated, skip it */
8215
/* calculate the starting line number for this page */
8216
linenum = page_idx * TCTOK_SRC_PAGE_CNT;
8218
/* loop over the entries on this page */
8219
for (i = 0, p = (*pg)->ofs ; i < TCTOK_SRC_PAGE_CNT ;
8220
++i, ++p, ++linenum)
8222
/* if this entry has been set, call the callback */
8224
(*cbfunc)(cbctx, linenum, *p);
8229
/* ------------------------------------------------------------------------ */
8231
* #define symbol table hash entry
8237
CTcHashEntryPpDefine::CTcHashEntryPpDefine(const textchar_t *str, size_t len,
8238
int copy, int has_args, int argc,
8241
const size_t *argvlen,
8242
const char *expansion,
8244
: CTcHashEntryPp(str, len, copy)
8246
/* copy the argument list if necessary */
8247
has_args_ = has_args;
8248
has_varargs_ = has_varargs;
8254
/* allocate the argument list */
8255
argv_ = (char **)t3malloc(argc * sizeof(*argv_));
8257
/* allocate the parameters hash table */
8258
params_table_ = new CVmHashTable(16, new CVmHashFuncCS(), TRUE);
8260
/* allocate the entry list */
8261
arg_entry_ = (CTcHashEntryPpArg **)
8262
t3malloc(argc * sizeof(arg_entry_[0]));
8264
/* copy the arguments */
8265
for (i = 0 ; i < argc ; ++i)
8267
CTcHashEntryPpArg *entry;
8269
/* copy the argument name */
8270
argv_[i] = lib_copy_str(argv[i], argvlen[i]);
8273
* Create the hash entries for this parameters. We'll use
8274
* this entry to look up tokens in the expansion text for
8275
* matches to the formal names when expanding the macro.
8277
* Note that we'll refer directly to our local copy of the
8278
* argument name, so we don't need to make another copy in
8281
entry = new CTcHashEntryPpArg(argv_[i], argvlen[i], FALSE, i);
8282
params_table_->add(entry);
8284
/* add it to our by-index list */
8285
arg_entry_[i] = entry;
8296
/* save the expansion */
8297
expan_ = lib_copy_str(expansion, expan_len);
8298
expan_len_ = expan_len;
8304
CTcHashEntryPpDefine::~CTcHashEntryPpDefine()
8308
/* delete the argument list */
8311
/* delete each argument string */
8312
for (i = 0 ; i < argc_ ; ++i)
8313
lib_free_str(argv_[i]);
8315
/* delete the argument vector */
8318
/* delete the argument entry list */
8321
/* delete the hash table */
8322
delete params_table_;
8325
/* delete the expansion */
8326
lib_free_str(expan_);
8330
* __LINE__ static buffer
8332
char CTcHashEntryPpLINE::buf_[20];
8335
/* ------------------------------------------------------------------------ */
8337
* Load macro definitions from a file.
8339
int CTcTokenizer::load_macros_from_file(CVmStream *fp,
8340
CTcTokLoadMacErr *err_handler)
8345
char *argv[TOK_MAX_MACRO_ARGS];
8346
size_t argvlen[TOK_MAX_MACRO_ARGS];
8352
/* we haven't allocated any argument buffers yet */
8355
/* allocate an initial expansion buffer */
8357
expan = (char *)t3malloc(expmaxlen);
8359
/* presume success */
8362
/* read the number of macros */
8363
cnt = fp->read_uint4();
8365
/* read each macro */
8366
for (i = 0 ; i < cnt ; ++i)
8368
char namebuf[TOK_SYM_MAX_LEN];
8373
CTcHashEntryPp *entry;
8377
/* read the name's length */
8378
namelen = fp->read_uint2();
8379
if (namelen > sizeof(namebuf))
8381
/* log an error through the handler */
8382
err_handler->log_error(1);
8384
/* give up - we can't read any more of the file */
8390
fp->read_bytes(namebuf, namelen);
8392
/* read and decode the flags */
8393
flags = fp->read_uint2();
8394
has_args = ((flags & 1) != 0);
8395
has_varargs = ((flags & 2) != 0);
8397
/* read the number of arguments, and read each argument */
8398
argc = fp->read_uint2();
8399
for (curarg = 0 ; curarg < argc ; ++curarg)
8401
/* read the length, and make sure it's valid */
8402
argvlen[curarg] = fp->read_uint2();
8403
if (argvlen[curarg] > TOK_SYM_MAX_LEN)
8406
err_handler->log_error(2);
8408
/* give up - we can't read any more of the file */
8414
* if we haven't allocated a buffer for this argument slot yet,
8415
* allocate it now; allocate the buffer at the maximum symbol
8416
* size, so we can reuse the same buffer for an argument of
8417
* other macros we read later
8419
while (curarg >= maxarg)
8420
argv[maxarg++] = (char *)t3malloc(TOK_SYM_MAX_LEN);
8422
/* read the argument text */
8423
fp->read_bytes(argv[curarg], argvlen[curarg]);
8426
/* read the expansion size */
8427
explen = (size_t)fp->read_uint4();
8429
/* expand the expansion buffer if necessary */
8430
if (explen > expmaxlen)
8433
* overshoot a bit, so that we won't have to reallocate again
8434
* if we find a slightly larger expansion for a future macro
8436
expmaxlen = explen + 512;
8438
/* allocate the new buffer */
8439
expan = (char *)t3realloc(expan, expmaxlen);
8442
/* read the expansion */
8443
fp->read_bytes(expan, explen);
8446
* Before we create the entry, check to see if there's an existing
8447
* entry with the same name.
8449
entry = find_define(namebuf, namelen);
8453
* We have another entry. If the entry is exactly the same,
8454
* then we can simply skip the current entry, because we simply
8455
* want to keep one copy of each macro that's defined
8456
* identically in mutiple compilation macros. If the entry is
8457
* different from the new one, delete both - a macro which
8458
* appears in two or more compilation units with different
8459
* meanings is NOT a global macro, and thus we can't include it
8460
* in the debugging records.
8462
if (entry->is_pseudo()
8463
|| entry->has_args() != has_args
8464
|| entry->has_varargs() != has_varargs
8465
|| entry->get_argc() != (int)argc
8466
|| entry->get_expan_len() != explen
8467
|| memcmp(entry->get_expansion(), expan, explen) != 0)
8470
* The existing entry is different from the new entry, so
8471
* the macro has different meanings in different
8472
* compilation units, hence we cannot keep *either*
8473
* definition in the debug records. Delete the existing
8474
* macro, and do not create the new macro. If the existing
8475
* macro is a pseudo-macro, keep the old one (since it's
8476
* provided by the compiler itself), but still discard the
8479
if (!entry->is_pseudo())
8480
undefine(namebuf, namelen);
8485
* The new entry is identical to the old one, so keep it.
8486
* We only need one copy of the entry, though, so simply
8487
* keep the old one - there's no need to create a new entry
8488
* for the object file data.
8495
* There's no existing macro with the same name, so create a
8496
* new entry based on the object file data.
8498
entry = new CTcHashEntryPpDefine(namebuf, namelen, TRUE,
8499
has_args, argc, has_varargs,
8500
(const char **)argv, argvlen,
8503
/* add it to the preprocessor's macro symbol table */
8509
/* free the argument buffers we allocated */
8510
for (curarg = 0 ; curarg < maxarg ; ++curarg)
8511
t3free(argv[curarg]);
8513
/* free the expansion buffer */
8520
/* ------------------------------------------------------------------------ */
8522
* Callback context for writing enumerated #define symbols to a file
8524
struct write_macro_ctx_t
8526
/* object file we're writing to */
8529
/* number of symbols written so far */
8534
* Enumeration callback for writing the #define symbols to a file
8536
static void write_macros_cb(void *ctx0, CTcHashEntryPp *entry)
8538
write_macro_ctx_t *ctx = (write_macro_ctx_t *)ctx0;
8541
CVmFile *fp = ctx->fp;
8544
* if this is a pseudo-macro (such as __LINE__ or __FILE__), ignore it
8545
* - these macros do not have permanent global definitions, so they're
8546
* not usable in the debugger
8548
if (entry->is_pseudo())
8552
* If the macro was ever redefined or undefined, ignore it - the
8553
* debugger can only use truly global macros, which are macros that
8554
* have stable meanings throughout the compilation units where they
8555
* appear (and which do not have different meanings in different
8556
* compilation units, but that's not our concern at the moment). The
8557
* preprocessor keeps an "undef" table of everything undefined
8558
* (explicitly, or implicitly via redefinition), so look up this macro
8559
* in the undef table, and ignore the macro if it we find it.
8561
if (G_tok->find_undef(entry->getstr(), entry->getlen()) != 0)
8564
/* count this macro */
8567
/* write the macro's name */
8568
fp->write_int2(entry->getlen());
8569
fp->write_bytes(entry->getstr(), entry->getlen());
8571
/* write the flag bits */
8573
if (entry->has_args()) flags |= 1;
8574
if (entry->has_varargs()) flags |= 2;
8575
fp->write_int2(flags);
8577
/* write the number of arguments, and write each argument */
8578
fp->write_int2(entry->get_argc());
8579
for (i = 0 ; i < entry->get_argc() ; ++i)
8581
CTcHashEntryPpArg *arg;
8583
/* get the argument */
8584
arg = entry->get_arg_entry(i);
8586
/* write the parameter name */
8587
fp->write_int2(arg->getlen());
8588
fp->write_bytes(arg->getstr(), arg->getlen());
8591
/* write the expansion */
8592
fp->write_int4(entry->get_expan_len());
8593
fp->write_bytes(entry->get_expansion(), entry->get_expan_len());
8597
* Write all #define symbols to a file, for debugging purposes. Writes
8598
* only symbols that have never been undefined or redefined, since the
8599
* debugger can only make use of global symbols (i.e., symbols with
8600
* consistent meanings through all compilation units in which they
8603
void CTcTokenizer::write_macros_to_file_for_debug(CVmFile *fp)
8607
write_macro_ctx_t ctx;
8609
/* write a placeholder for the symbol count */
8610
pos = fp->get_pos();
8613
/* write the symbols */
8616
enum_defines(&write_macros_cb, &ctx);
8618
/* go back and fix up the symbol count */
8619
endpos = fp->get_pos();
8621
fp->write_int4(ctx.cnt);
8623
/* seek back to where we left off */
8624
fp->set_pos(endpos);