1
/* This file is part of Malaga, a system for Natural Language Analysis.
2
* Copyright (C) 1995-1999 Bjoern Beutel
5
* Universitaet Erlangen-Nuernberg
6
* Abteilung fuer Computerlinguistik
9
* e-mail: malaga@linguistik.uni-erlangen.de
11
* This program is free software; you can redistribute it and/or modify
12
* it under the terms of the GNU General Public License as published by
13
* the Free Software Foundation; either version 2 of the License, or
14
* (at your option) any later version.
16
* This program is distributed in the hope that it will be useful,
17
* but WITHOUT ANY WARRANTY; without even the implied warranty of
18
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19
* GNU General Public License for more details.
21
* You should have received a copy of the GNU General Public License
22
* along with this program; if not, write to the Free Software
23
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
25
/* description ==============================================================*/
27
/* This module supports scanning (lexical analysis) of malaga source files. */
29
/* includes =================================================================*/
43
/* constants ================================================================*/
45
/* INCLUDE_LEVEL_MAX is the size of the <sources> array; "begin_include"
 * reports "too many nested includes" once <include_level> reaches it. */
#define INCLUDE_LEVEL_MAX 10 /* maximum number of nested includes */
47
LOCAL struct { string_t name; int_t code; } keywords[NUMBER_OF_KEYWORDS] =
48
/* Table of all keywords and their token codes.
49
* This list must be maintained in alphabetical order, because "keyword"
* looks names up in it with a binary search. "token_as_text" scans the
* same table linearly to map a token code back to its keyword. */
51
{ "accept", TOK_ACCEPT },
52
{ "allo_rule", TOK_ALLO_RULE },
54
{ "assert", TOK_ASSERT },
55
{ "choose", TOK_CHOOSE },
56
{ "combi_rule", TOK_COMBI_RULE },
57
{ "define", TOK_DEFINE },
59
{ "elseif", TOK_ELSEIF },
61
{ "end_rule", TOK_END_RULE },
62
{ "error", TOK_ERROR },
64
{ "foreach", TOK_FOREACH },
65
{ "greater", TOK_GREATER },
66
{ "greater_equal", TOK_GREATER_EQUAL },
69
{ "include", TOK_INCLUDE },
70
{ "initial", TOK_INITIAL },
71
{ "input_filter", TOK_INPUT_FILTER },
73
{ "less_equal", TOK_LESS_EQUAL },
74
{ "matches", TOK_MATCHES },
77
{ "output_filter", TOK_OUTPUT_FILTER },
78
{ "parallel", TOK_PARALLEL },
79
{ "pruning_rule", TOK_PRUNING_RULE },
80
{ "repeat", TOK_REPEAT },
81
{ "require", TOK_REQUIRE },
82
{ "result", TOK_RESULT },
83
{ "return", TOK_RETURN },
84
{ "robust_rule", TOK_ROBUST_RULE },
85
{ "rules", TOK_RULES },
86
{ "subrule", TOK_SUBRULE },
88
{ "while", TOK_WHILE }
91
/* types ====================================================================*/
93
/* One entry of the <sources> array: the scanning state of one include
 * level. "begin_include" fills in an entry; "read_next_char" keeps
 * <column> and <line_number> up to date while reading from <stream>. */
typedef struct /* a source stream for lexical analysis */
95
FILE *stream; /* the input stream for this include level */
96
/* "begin_include" stores the pointer it is given here; it does not copy
 * the string. */
string_t file_name; /* the name of the input file */
97
/* A tab moves <column> to the next multiple of 8 (see "read_next_char");
 * "current_column" reports <column> - 1 so that columns start with 0. */
int_t column; /* column that has been read */
98
/* Set to 1 by "begin_include" and incremented for each '\n' read. */
int_t line_number; /* number of the line that has been read */
101
/* variables ================================================================*/
103
LOCAL source_t sources[INCLUDE_LEVEL_MAX];
104
/* For each include level, we define a source stream description. */
106
LOCAL int_t include_level = 0; /* current include level */
108
LOCAL source_t *source = NULL; /* points to <sources>[<include_level>-1] */
110
LOCAL string_t scanner_input = NULL;
111
/* If no file is included, the scanner reads its input from <scanner_input>.
* It is set by "set_scanner_input", advanced char by char while reading,
* and reset to NULL by "stop_scanner". */
113
/* Declared as int_t, not as a char type, because it also has to hold EOF. */
LOCAL int_t next_char; /* the next char to be read */
115
/* Buffer in which "read_name", "read_number" and the string case of
 * "read_next_token" collect the chars of the current token. */
LOCAL text_t token_text; /* the text of the next token. */
117
/* functions ================================================================*/
119
LOCAL void read_next_char (void)
120
/* Read the next char from input into <next_char>.
121
* At the end of a file stream, <next_char> becomes EOF (the value that
* "getc" delivers); the function itself returns nothing.
122
* If no input stream is selected, read input from <scanner_input>.
123
* If reading from stream, update column and line number information. */
127
next_char = getc (source->stream);
129
/* "getc" yields EOF both at end of file and on a read error; only the
 * latter is reported, together with the system's error text. */
if (next_char == EOF && ferror (source->stream))
130
error ("can't read from \"%s\": %s",
131
source->file_name, strerror (errno));
133
/* A tab advances <column> to the next multiple of 8. */
if (next_char == '\t')
134
source->column = (source->column + 8) & ~7;
135
else if (next_char == '\n')
138
source->line_number++;
143
/* No file stream: take the next char from the input string, as long as
 * its terminating EOS has not been reached. */
else if (scanner_input != NULL && *scanner_input != EOS)
144
next_char = *scanner_input++;
147
scanner_input = NULL;
152
/*---------------------------------------------------------------------------*/
154
LOCAL void read_next_char_again (void)
155
/* Like "read_next_char", but don't update column information.
* "end_include" calls this to fetch again the char that "begin_include"
* pushed back onto the including stream with "ungetc". */
159
next_char = getc (source->stream);
161
/* "getc" yields EOF both at end of file and on a read error; only the
 * latter is reported. The message carries the system's error text, exactly
 * as the one in "read_next_char" does, so the cause is not lost. */
if (next_char == EOF && ferror (source->stream))
162
error ("can't read from \"%s\": %s", source->file_name, strerror (errno));
164
/* No file stream: take the next char from the input string, as long as
 * its terminating EOS has not been reached. */
else if (scanner_input != NULL && *scanner_input != EOS)
165
next_char = *scanner_input++;
168
scanner_input = NULL;
173
/*---------------------------------------------------------------------------*/
175
GLOBAL string_t current_file_name (void)
176
/* Return the name of the file that is currently being read from, or NULL.
* The name is the pointer stored in <source>, i.e. the one that was
* passed to "begin_include"; it is not a copy. */
181
return source->file_name;
184
/*---------------------------------------------------------------------------*/
186
GLOBAL int_t current_line_number (void)
187
/* Return the line number where the last char has been read, or -1.
* Line numbers start with 1 ("begin_include" initialises them so). */
192
return source->line_number;
195
/*---------------------------------------------------------------------------*/
197
GLOBAL int_t current_column (void)
198
/* Return the column where the last char has been read, or -1.
* The value reported is one less than the stored <column>, so the first
* column of a line is column 0. */
202
/* A stored <column> of 0 is treated as a special case. */
else if (source->column == 0)
205
return source->column - 1; /* Let columns start with 0. */
208
/*---------------------------------------------------------------------------*/
210
GLOBAL void set_scanner_input (string_t input)
211
/* Let the scanner use <input> as scanner input.
212
* <input> must remain valid until the scanner has done its work, because
* only the pointer is stored; the string is not copied. */
214
scanner_input = input;
219
/*---------------------------------------------------------------------------*/
221
GLOBAL void begin_include (string_t file_name)
222
/* Open a new level of inclusion and read tokens from file <file_name>.
* At most INCLUDE_LEVEL_MAX levels can be nested; exceeding this is
* reported as an error.
* <file_name> is stored as a pointer, not copied, so it has to stay valid
* while this include level is open. */
226
if (include_level >= INCLUDE_LEVEL_MAX)
227
error ("too many nested includes");
229
stream = open_stream (file_name, "r");
231
/* Next char of old source should be read later. */
233
/* Push it back onto the including stream; "end_include" fetches it again
 * with "read_next_char_again". */
ungetc (next_char, source->stream);
234
else if (scanner_input != NULL)
237
/* Take the next free entry of <sources> for the new file. */
source = sources + include_level;
239
source->file_name = file_name;
240
source->line_number = 1;
242
source->stream = stream;
247
/*---------------------------------------------------------------------------*/
249
GLOBAL void end_include (void)
250
/* Stop reading from current source stream and read from former stream.
* There must be an open include level when this is called. */
252
DB_ASSERT (include_level > 0);
254
close_stream (&source->stream, source->file_name);
257
/* If an enclosing include level exists, make it the current source. */
if (include_level > 0)
258
source = sources + include_level - 1;
262
/* If there is any input left to continue with, fetch the char that was
 * current before the include, without counting its column again. */
if (source != NULL || scanner_input != NULL)
264
read_next_char_again ();
269
/*---------------------------------------------------------------------------*/
271
GLOBAL void stop_scanner (void)
272
/* Stop the scanner in case of an emergency:
* drop the input string and close the streams of all open include
* levels. */
277
scanner_input = NULL;
278
for (i = 0; i < include_level; i++)
279
/* Unlike in "end_include", no file name is passed to "close_stream"
 * here. */
close_stream (&sources[i].stream, NULL);
283
/*---------------------------------------------------------------------------*/
285
LOCAL void read_name (void)
286
/* Read rule name, variable, or keyword into <token_name>.
* A name consists of letters (as defined by IS_ALPHA), digits and the
* chars '_', '&' and '|'. An empty name is reported as an error. */
288
clear_text (&token_text);
290
/* Collect chars for as long as they may be part of a name. */
while (next_char != EOF &&
291
(next_char == '_' || next_char == '&' || next_char == '|'
292
|| IS_ALPHA (next_char) || isdigit (next_char)))
294
add_char_to_text (token_text, next_char);
298
token_name = text_string (token_text);
299
/* Not a single char could be collected, so the current char can't be
 * part of a name. */
if (*token_name == EOS)
300
error ("illegal character in name");
303
/*---------------------------------------------------------------------------*/
305
LOCAL int_t keyword (string_t name)
306
/* Look up <name> in the keyword table and return its token value.
307
* If <name> is no keyword, return TOK_IDENT.
* Names are compared with "strcmp_no_case" (presumably a case-insensitive
* comparison -- confirm against its definition). */
310
int_t upper = NUMBER_OF_KEYWORDS - 1;
312
/* We do a binary search on the keywords.
313
* A keyword must be in the range of keywords[lower..upper].
* This only works because the table is kept in alphabetical order. */
314
while (lower <= upper)
316
int_t middle = (lower + upper) / 2;
317
int_t result = strcmp_no_case (name, keywords[middle].name);
324
return keywords[middle].code;
329
/*---------------------------------------------------------------------------*/
331
LOCAL void read_number (void)
332
/* Read a floating point number. Save its value in <token_number>.
* Accepted form: digits, optionally followed by "." and digits,
* optionally followed by "E" or "e", an optional sign and digits.
* The text of the number is left in <token_name>. */
334
clear_text (&token_text);
336
/* Integer part. */
while (isdigit (next_char))
338
add_char_to_text (token_text, next_char);
342
/* Optional fraction; at least one digit must follow the ".". */
if (next_char == '.')
344
add_char_to_text (token_text, next_char);
347
if (! isdigit (next_char))
348
error ("missing digits after \".\"");
350
while (isdigit (next_char))
352
add_char_to_text (token_text, next_char);
357
if (next_char == 'E' || next_char == 'e') /* Read an exponent. */
359
add_char_to_text (token_text, next_char);
362
/* Optional sign of the exponent. */
if (next_char == '-' || next_char == '+')
364
add_char_to_text (token_text, next_char);
368
/* At least one exponent digit is required. */
if (! isdigit (next_char))
369
error ("missing exponent");
371
while (isdigit (next_char))
373
add_char_to_text (token_text, next_char);
377
/* Convert the collected text to its numeric value. */
token_name = text_string (token_text);
378
if (sscanf (token_name, "%lf", &token_number) != 1)
379
error ("illegal double value");
382
/*---------------------------------------------------------------------------*/
384
GLOBAL void read_next_token (void)
385
/* Read the next token from current source into <next_token>.
386
* At the end of the input, <next_token> is presumably set to EOF (the
* function itself returns nothing; "token_as_text" handles EOF as a
* token code).
* Depending on the kind of token, its value is left in <token_string>,
* <token_number> or <token_name>. */
388
/* Read chars until a token has been recognised. */
397
case ' ': /* Read over whitespace. */
403
case '#': /* Read over a comment. */
407
/* A comment extends to the end of the line or of the input. */
} while (next_char != '\n' && next_char != EOF);
410
case '\"': /* Read a string. */
411
clear_text (&token_text);
412
read_next_char (); /* overread beginning '"' */
413
while (next_char != '\"')
415
if (next_char == '\\')
417
/* See if we get '\"'. */
419
/* If the char after the backslash is not '"', the backslash is kept
 * in the token text. */
if (next_char != '\"')
420
add_to_text (token_text, "\\");
423
/* A string must be closed on the line where it was opened. */
if (next_char == EOF || next_char == '\n')
424
error ("unterminated string at end of line");
426
add_char_to_text (token_text, next_char);
429
read_next_char (); /* overread ending '"' */
430
/* Release the string of the previous string token before the new one
 * is stored. */
free_mem (&token_string);
431
token_string = new_string (text_string (token_text), NULL);
432
next_token = TOK_STRING;
435
case ':': /* Read a ":", ":=", ":=+", ":=-", ":=*", ":=/". */
437
if (next_char == '=')
440
/* After ":=", an operator char selects a combined assignment. */
if (next_char == '+')
442
next_token = TOK_ASSIGN_PLUS;
445
else if (next_char == '-')
447
next_token = TOK_ASSIGN_MINUS;
450
else if (next_char == '*')
452
next_token = TOK_ASSIGN_ASTERISK;
455
else if (next_char == '/')
457
next_token = TOK_ASSIGN_SLASH;
461
next_token = TOK_ASSIGN;
467
case '/': /* Read a "/", a "/=" or a "/~". */
469
if (next_char == '=')
471
next_token = TOK_NOT_EQUAL;
474
else if (next_char == '~')
476
next_token = TOK_NOT_CONGRUENT;
485
if (! isdigit (next_char))
490
/* Apply the minus sign to the value of the number. */
token_number = -token_number;
491
next_token = TOK_NUMBER;
495
case '0': case '1': case '2': case '3': case '4':
496
case '5': case '6': case '7': case '8': case '9':
499
next_token = TOK_NUMBER;
505
next_token = TOK_VARIABLE;
511
next_token = TOK_CONSTANT;
515
/* A char that can start a name. */
if (IS_ALPHA (next_char)
516
|| next_char == '_' || next_char == '&' || next_char == '|')
519
/* "keyword" yields TOK_IDENT if the name is no keyword. */
next_token = keyword (token_name);
524
/* Any other char is used as its own token code. */
next_token = next_char;
532
/*---------------------------------------------------------------------------*/
534
GLOBAL string_t token_as_text (int_t token)
535
/* Return <token> as a string readable for humans.
536
* The string is newly allocated and stays valid until the caller frees it.
* NOTE(review): "read_next_token" releases a "new_string" result with
* "free_mem" -- confirm that this is the call to use here as well.
* Keywords and operator tokens are rendered enclosed in double quotes;
* token classes such as "identifier" or "number" are rendered as plain
* words. */
540
/* Look if <token> is a keyword. */
541
for (i = 0; i < NUMBER_OF_KEYWORDS; i++)
543
if (keywords[i].code == token)
544
return concat_strings ("\"", keywords[i].name, "\"", NULL);
549
case EOF: return new_string ("end of input", NULL);
550
case TOK_STRING: return new_string ("string", NULL);
551
case TOK_IDENT: return new_string ("identifier", NULL);
552
case TOK_VARIABLE: return new_string ("variable", NULL);
553
case TOK_CONSTANT: return new_string ("constant", NULL);
554
case TOK_NUMBER: return new_string ("number", NULL);
555
case TOK_ASSIGN: return new_string ("\":=\"", NULL);
556
case TOK_ASSIGN_PLUS: return new_string ("\":=+\"", NULL);
557
case TOK_ASSIGN_MINUS: return new_string ("\":=-\"", NULL);
558
case TOK_ASSIGN_ASTERISK: return new_string ("\":=*\"", NULL);
559
case TOK_ASSIGN_SLASH: return new_string ("\":=/\"", NULL);
560
case TOK_NOT_EQUAL: return new_string ("\"/=\"", NULL);
561
case TOK_NOT_CONGRUENT: return new_string ("\"/~\"", NULL);
564
/* Fallback: build a one-char string from the token code itself and have
 * "new_string_readable" produce its readable form. */
char token_buffer[2];
566
token_buffer[0] = token;
567
token_buffer[1] = EOS;
568
return new_string_readable (token_buffer, NULL);
573
/*---------------------------------------------------------------------------*/
575
GLOBAL void test_token (int_t token)
576
/* Test if <token> is the next token. If it's not, report an error that
* names both the expected and the actual token. The next token is not
* consumed. */
578
if (next_token != token)
579
/* NOTE(review): the two strings obtained from "token_as_text" are
 * allocated and are not freed here -- confirm whether "error" returns
 * and, if it does, whether this leak matters. */
error ("%s expected, not %s",
580
token_as_text (token), token_as_text (next_token));
583
/*---------------------------------------------------------------------------*/
585
GLOBAL void parse_token (int_t token)
586
/* Test if <token> is the next token and read next token. */
592
/* end of file ==============================================================*/