~martin-decky/helenos/rcu : contents of uspace/app/sbi/src/lex.c at revision 720.2.56

~martin-decky/helenos/rcu : (revision 720.2.56)
/*
 * Copyright (c) 2011 Jiri Svoboda
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * - Redistributions of source code must retain the above copyright
 *   notice, this list of conditions and the following disclaimer.
 * - Redistributions in binary form must reproduce the above copyright
 *   notice, this list of conditions and the following disclaimer in the
 *   documentation and/or other materials provided with the distribution.
 * - The name of the author may not be used to endorse or promote products
 *   derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/** @file Lexer (lexical analyzer).
 *
 * Consumes a text file and produces a sequence of lexical elements (lems).
 */

#include <stdio.h>
#include <stdlib.h>
#include "bigint.h"
#include "cspan.h"
#include "mytypes.h"
#include "input.h"
#include "os/os.h"
#include "strtab.h"

#include "lex.h"

#define TAB_WIDTH 8

typedef enum {
	cs_chr,
	cs_str
} chr_str_t;

static void lex_touch(lex_t *lex);
static bool_t lex_read_try(lex_t *lex);

static void lex_skip_comment(lex_t *lex);
static void lex_skip_ws(lex_t *lex);
static bool_t is_wstart(char c);
static bool_t is_wcont(char c);
static bool_t is_digit(char c);
static void lex_word(lex_t *lex);
static void lex_char(lex_t *lex);
static void lex_number(lex_t *lex);
static void lex_string(lex_t *lex);
static void lex_char_string_core(lex_t *lex, chr_str_t cs);
static int digit_value(char c);

/* Note: This imposes an implementation limit on identifier length. */
#define IBUF_SIZE 128
static char ident_buf[IBUF_SIZE + 1];

/* XXX This imposes an implementation limit on string literal length. */
#define SLBUF_SIZE 128
static char strlit_buf[SLBUF_SIZE + 1];

/** Lclass-string pair */
struct lc_name {
	lclass_t lclass;
	const char *name;
};

/** Keyword names. Used both for printing and recognition. */
static struct lc_name keywords[] = {
	{ lc_and,	"and" },
	{ lc_as,	"as" },
	{ lc_bool,	"bool" },
	{ lc_break,	"break" },
	{ lc_builtin,	"builtin" },
	{ lc_char,	"char" },
	{ lc_class,	"class" },
	{ lc_deleg,	"deleg" },
	{ lc_do,	"do" },
	{ lc_elif,	"elif" },
	{ lc_else,	"else" },
	{ lc_end,	"end" },
	{ lc_enum,	"enum" },
	{ lc_except,	"except" },
	{ lc_false,	"false" },
	{ lc_finally,	"finally" },
	{ lc_for,	"for" },
	{ lc_fun,	"fun" },
	{ lc_get,	"get" },
	{ lc_if,	"if" },
	{ lc_in,	"in" },
	{ lc_int,	"int" },
	{ lc_interface,	"interface" },
	{ lc_is,	"is" },
	{ lc_new,	"new" },
	{ lc_not,	"not" },
	{ lc_nil,	"nil" },
	{ lc_or,	"or" },
	{ lc_override,	"override" },
	{ lc_packed,	"packed" },
	{ lc_private,	"private" },
	{ lc_prop,	"prop" },
	{ lc_protected,	"protected" },
	{ lc_public,	"public" },
	{ lc_raise,	"raise" },
	{ lc_resource,	"resource" },
	{ lc_return,	"return" },
	{ lc_self,	"self" },
	{ lc_set,	"set" },
	{ lc_static,	"static" },
	{ lc_string,	"string" },
	{ lc_struct,	"struct" },
	{ lc_switch,	"switch" },
	{ lc_then,	"then" },
	{ lc_this,	"this" },
	{ lc_true,	"true" },
	{ lc_var,	"var" },
	{ lc_with,	"with" },
	{ lc_when,	"when" },
	{ lc_while,	"while" },
	{ lc_yield,	"yield" },

	{ 0,		NULL }
};

/** Other simple lclasses. Only used for printing. */
static struct lc_name simple_lc[] = {
	{ lc_invalid,	"INVALID" },
	{ lc_eof,	"EOF" },

	/* Operators */
	{ lc_period,	"." },
	{ lc_slash,	"/" },
	{ lc_lparen,	"(" },
	{ lc_rparen,	")" },
	{ lc_lsbr,	"[" },
	{ lc_rsbr,	"]" },
	{ lc_equal,	"==" },
	{ lc_notequal,	"!=" },
	{ lc_lt,	"<" },
	{ lc_gt,	">" },
	{ lc_lt_equal,	"<=" },
	{ lc_gt_equal,	">=" },
	{ lc_assign,	"=" },
	{ lc_plus,	"+" },
	{ lc_minus,	"-" },
	{ lc_mult,	"*" },
	{ lc_increase,	"+=" },

	/* Punctuators */
	{ lc_comma,	"," },
	{ lc_colon,	":" },
	{ lc_scolon,	";" },

	{ 0,		NULL },
};

/** Print lclass value.
 *
 * Prints lclass (lexical element class) value in human-readable form
 * (for debugging).
 *
 * @param lclass	Lclass value for display.
 */
void lclass_print(lclass_t lclass)
{
	struct lc_name *dp;

	dp = keywords;
	while (dp->name != NULL) {
		if (dp->lclass == lclass) {
			printf("%s", dp->name);
			return;
		}
		++dp;
	}

	dp = simple_lc;
	while (dp->name != NULL) {
		if (dp->lclass == lclass) {
			printf("%s", dp->name);
			return;
		}
		++dp;
	}

	switch (lclass) {
	case lc_ident:
		printf("ident");
		break;
	case lc_lit_int:
		printf("int_literal");
		break;
	case lc_lit_string:
		printf("string_literal");
		break;
	default:
		printf("<unknown?>");
		break;
	}
}

/** Print lexical element.
 *
 * Prints lexical element in human-readable form (for debugging).
 *
 * @param lem		Lexical element for display.
 */
void lem_print(lem_t *lem)
{
	lclass_print(lem->lclass);

	switch (lem->lclass) {
	case lc_ident:
		printf("('%s')", strtab_get_str(lem->u.ident.sid));
		break;
	case lc_lit_int:
		printf("(");
		bigint_print(&lem->u.lit_int.value);
		printf(")");
		break;
	case lc_lit_string:
		printf("(\"%s\")", lem->u.lit_string.value);
	default:
		break;
	}
}

/** Print lem coordinates.
 *
 * Print the coordinates (line number, column number) of a lexical element.
 *
 * @param lem		Lexical element for coordinate printing.
 */
void lem_print_coords(lem_t *lem)
{
	cspan_print(lem->cspan);
}

/** Initialize lexer instance.
 *
 * @param lex		Lexer object to initialize.
 * @param input		Input to associate with lexer.
 */
void lex_init(lex_t *lex, struct input *input)
{
	int rc;

	lex->input = input;

	rc = input_get_line(lex->input, &lex->inbuf);
	if (rc != EOK) {
		printf("Error reading input.\n");
		exit(1);
	}

	lex->ibp = lex->inbuf;
	lex->col_adj = 0;
	lex->prev_valid = b_false;
	lex->current_valid = b_true;
}

/** Advance to next lexical element.
 *
 * The new element is read in lazily then it is actually accessed.
 *
 * @param lex		Lexer object.
 */
void lex_next(lex_t *lex)
{
	/* Make sure the current lem has already been read in. */
	lex_touch(lex);

	/* Force a new lem to be read on next access. */
	lex->current_valid = b_false;
}

/** Get current lem.
 *
 * The returned pointer is invalidated by next call to lex_next()
 *
 * @param lex		Lexer object.
 * @return		Pointer to current lem. Owned by @a lex and only valid
 *			until next call to lex_xxx().
 */
lem_t *lex_get_current(lex_t *lex)
{
	lex_touch(lex);
	return &lex->current;
}

/** Get previous lem if valid.
 *
 * The returned pointer is invalidated by next call to lex_next()
 *
 * @param lex		Lexer object.
 * @return		Pointer to previous lem. Owned by @a lex and only valid
 *			until next call to lex_xxx().
 */
lem_t *lex_peek_prev(lex_t *lex)
{
	if (lex->current_valid == b_false) {
		/*
		 * This means the head is advanced but next lem was not read.
		 * Thus the previous lem is still in @a current.
		 */
		return &lex->current;
	}

	if (lex->prev_valid != b_true) {
		/* Looks like we are still at the first lem. */
		return NULL;
	}

	/*
	 * Current lem has been read in. Thus the previous lem was moved to
	 * @a previous.
	 */
	return &lex->prev;
}

/** Read in the current lexical element (unless already read in).
 *
 * @param lex		Lexer object.
 */
static void lex_touch(lex_t *lex)
{
	bool_t got_lem;

	if (lex->current_valid == b_true)
		return;

	/* Copy previous lem */
	lex->prev = lex->current;
	lex->prev_valid = b_true;

	do {
		got_lem = lex_read_try(lex);
	} while (got_lem == b_false);

	lex->current_valid = b_true;
}

/** Try reading next lexical element.
 *
 * Attemps to read the next lexical element. In some cases (such as a comment)
 * this function will need to give it another try and returns @c b_false
 * in such case.
 *
 * @param lex		Lexer object.
 * @return		@c b_true on success or @c b_false if it needs
 *			restarting. On success the lem is stored to
 *			the current lem in @a lex.
 */
static bool_t lex_read_try(lex_t *lex)
{
	char *bp, *lsp;
	int line0, col0;

	lex_skip_ws(lex);

	/*
	 * Record lem coordinates. Line number we already have. For column
	 * number we start with position in the input buffer. This works
	 * for all characters except tab. Thus we keep track of tabs
	 * separately using col_adj.
	 */
	line0 = input_get_line_no(lex->input);
	col0 = 1 + lex->col_adj + (lex->ibp - lex->inbuf);

	lex->current.cspan = cspan_new(lex->input, line0, col0, line0, col0);

	lsp = lex->ibp;
	bp = lex->ibp;

	if (bp[0] == '\0') {
		/* End of input */
		lex->current.lclass = lc_eof;
		goto finish;
	}

	if (is_wstart(bp[0])) {
		lex_word(lex);
		goto finish;
	}

	if (bp[0] == '\'') {
		lex_char(lex);
		goto finish;
	}

	if (is_digit(bp[0])) {
		lex_number(lex);
		goto finish;
	}

	if (bp[0] == '"') {
		lex_string(lex);
		goto finish;
	}

	if (bp[0] == '-' && bp[1] == '-') {
		lex_skip_comment(lex);

		/* Compute ending column number */
		lex->current.cspan->col1 = col0 + (lex->ibp - lsp) - 1;

		/* Try again */
		return b_false;
	}

	switch (bp[0]) {
	case ',': lex->current.lclass = lc_comma; ++bp; break;
	case ':': lex->current.lclass = lc_colon; ++bp; break;
	case ';': lex->current.lclass = lc_scolon; ++bp; break;

	case '.': lex->current.lclass = lc_period; ++bp; break;
	case '/': lex->current.lclass = lc_slash; ++bp; break;
	case '(': lex->current.lclass = lc_lparen; ++bp; break;
	case ')': lex->current.lclass = lc_rparen; ++bp; break;
	case '[': lex->current.lclass = lc_lsbr; ++bp; break;
	case ']': lex->current.lclass = lc_rsbr; ++bp; break;

	case '=':
		if (bp[1] == '=') {
			lex->current.lclass = lc_equal; bp += 2; break;
		}
		lex->current.lclass = lc_assign; ++bp; break;

	case '!':
		if (bp[1] == '=') {
			lex->current.lclass = lc_notequal; bp += 2; break;
		}
		goto invalid;

	case '+':
		if (bp[1] == '=') {
			lex->current.lclass = lc_increase; bp += 2; break;
		}
		lex->current.lclass = lc_plus; ++bp; break;

	case '-':
		lex->current.lclass = lc_minus; ++bp; break;

	case '*':
		lex->current.lclass = lc_mult; ++bp; break;

	case '<':
		if (bp[1] == '=') {
			lex->current.lclass = lc_lt_equal; bp += 2; break;
		}
		lex->current.lclass = lc_lt; ++bp; break;

	case '>':
		if (bp[1] == '=') {
			lex->current.lclass = lc_gt_equal; bp += 2; break;
		}
		lex->current.lclass = lc_gt; ++bp; break;

	default:
		goto invalid;
	}

	lex->ibp = bp;

finish:
	/* Compute ending column number */
	lex->current.cspan->col1 = col0 + (lex->ibp - lsp) - 1;
	return b_true;

invalid:
	lex->current.lclass = lc_invalid;
	++bp;
	lex->ibp = bp;

	return b_true;
}

/** Lex a word (identifier or keyword).
 *
 * Read in a word. This may later turn out to be a keyword or a regular
 * identifier. It is stored in the current lem in @a lex.
 *
 * @param lex		Lexer object.
 */
static void lex_word(lex_t *lex)
{
	struct lc_name *dp;
	char *bp;
	int idx;

	bp = lex->ibp;
	ident_buf[0] = bp[0];
	idx = 1;

	while (is_wcont(bp[idx])) {
		if (idx >= IBUF_SIZE) {
			printf("Error: Identifier too long.\n");
			exit(1);
		}

		ident_buf[idx] = bp[idx];
		++idx;
	}

	lex->ibp = bp + idx;

	ident_buf[idx] = '\0';

	dp = keywords;
	while (dp->name != NULL) {
		if (os_str_cmp(ident_buf, dp->name) == 0) {
			/* Match */
			lex->current.lclass = dp->lclass;
			return;
		}
		++dp;
	}

	/* No matching keyword -- it must be an identifier. */
	lex->current.lclass = lc_ident;
	lex->current.u.ident.sid = strtab_get_sid(ident_buf);
}

/** Lex a character literal.
 *
 * Reads in a character literal and stores it in the current lem in @a lex.
 *
 * @param lex		Lexer object.
 */
static void lex_char(lex_t *lex)
{
	size_t len;
	int char_val;

	lex_char_string_core(lex, cs_chr);

	len = os_str_length(strlit_buf);
	if (len != 1) {
		printf("Character literal should contain one character, "
		    "but contains %u characters instead.\n", (unsigned) len);
		exit(1);
	}

	os_str_get_char(strlit_buf, 0, &char_val);
	lex->current.lclass = lc_lit_char;
	bigint_init(&lex->current.u.lit_char.value, char_val);
}

/** Lex a numeric literal.
 *
 * Reads in a numeric literal and stores it in the current lem in @a lex.
 *
 * @param lex		Lexer object.
 */
static void lex_number(lex_t *lex)
{
	char *bp;
	bigint_t value;
	bigint_t dgval;
	bigint_t base;
	bigint_t tprod;

	bp = lex->ibp;

	bigint_init(&value, 0);
	bigint_init(&base, 10);

	while (is_digit(*bp)) {
		bigint_mul(&value, &base, &tprod);
		bigint_init(&dgval, digit_value(*bp));

		bigint_destroy(&value);
		bigint_add(&tprod, &dgval, &value);
		bigint_destroy(&tprod);
		bigint_destroy(&dgval);

		++bp;
	}

	bigint_destroy(&base);

	lex->ibp = bp;

	lex->current.lclass = lc_lit_int;
	bigint_shallow_copy(&value, &lex->current.u.lit_int.value);
}

/** Lex a string literal.
 *
 * Reads in a string literal and stores it in the current lem in @a lex.
 *
 * @param lex		Lexer object.
 */
static void lex_string(lex_t *lex)
{
	lex_char_string_core(lex, cs_str);

	lex->current.lclass = lc_lit_string;
	lex->current.u.lit_string.value = os_str_dup(strlit_buf);
}

static void lex_char_string_core(lex_t *lex, chr_str_t cs)
{
	char *bp;
	int sidx, didx;
	char term;
	const char *descr, *cap_descr;
	char spchar;

	/* Make compiler happy */
	term = '\0';
	descr = NULL;
	cap_descr = NULL;

	switch (cs) {
	case cs_chr:
		term = '\'';
		descr = "character";
		cap_descr = "Character";
		break;
	case cs_str:
		term = '"';
		descr = "string";
		cap_descr = "String";
		break;
	}

	bp = lex->ibp + 1;
	sidx = didx = 0;

	while (bp[sidx] != term) {
		if (didx >= SLBUF_SIZE) {
			printf("Error: %s literal too long.\n", cap_descr);
			exit(1);
		}

		if (bp[sidx] == '\0') {
			printf("Error: Unterminated %s literal.\n", descr);
			exit(1);
		}

		if (bp[sidx] == '\\') {
			switch (bp[sidx + 1]) {
			case '\\':
				spchar = '\\';
				break;
			case '\'':
				spchar = '\'';
				break;
			case '"':
				spchar = '"';
				break;
			case 'n':
				spchar = '\n';
				break;
			case 't':
				spchar = '\t';
				break;
			default:
				printf("Error: Unknown character escape sequence.\n");
				exit(1);
			}

			strlit_buf[didx] = spchar;
			++didx;
			sidx += 2;
		} else {
			strlit_buf[didx] = bp[sidx];
			++sidx; ++didx;
		}
	}

	lex->ibp = bp + sidx + 1;

	strlit_buf[didx] = '\0';
}

/** Lex a single-line comment.
 *
 * This does not produce any lem. The comment is just skipped.
 *
 * @param lex		Lexer object.
 */
static void lex_skip_comment(lex_t *lex)
{
	char *bp;

	bp = lex->ibp + 2;

	while (*bp != '\n' && *bp != '\0') {
		++bp;
	}

	lex->ibp = bp;
}

/** Skip whitespace characters.
 *
 * This does not produce any lem. The whitespace is just skipped.
 *
 * @param lex		Lexer object.
 */
static void lex_skip_ws(lex_t *lex)
{
	char *bp;
	int rc;

	bp = lex->ibp;

	while (b_true) {
		while (*bp == ' ' || *bp == '\t') {
			if (*bp == '\t') {
				/* XXX This is too simplifed. */
				lex->col_adj += (TAB_WIDTH - 1);
			}
			++bp;
		}

		if (*bp != '\n')
			break;

		/* Read next line */
		rc = input_get_line(lex->input, &lex->inbuf);
		if (rc != EOK) {
			printf("Error reading input.\n");
			exit(1);
		}

		bp = lex->inbuf;
		lex->col_adj = 0;
	}

	lex->ibp = bp;
}

/** Determine if character can start a word.
 *
 * @param c 	Character.
 * @return	@c b_true if @a c can start a word, @c b_false otherwise.
 */
static bool_t is_wstart(char c)
{
	return ((c >= 'a') && (c <= 'z')) || ((c >= 'A') && (c <= 'Z')) ||
	    (c == '_');
}

/** Determine if character can continue a word.
 *
 * @param c 	Character.
 * @return	@c b_true if @a c can start continue word, @c b_false
 *		otherwise.
 */
static bool_t is_wcont(char c)
{
	return is_digit(c) || is_wstart(c);
}

/** Determine if character is a numeric digit.
 *
 * @param c 	Character.
 * @return	@c b_true if @a c is a numeric digit, @c b_false otherwise.
 */
static bool_t is_digit(char c)
{
	return ((c >= '0') && (c <= '9'));
}

/** Determine numeric value of digit character.
 *
 * @param c 	Character, must be a valid decimal digit.
 * @return	Value of the digit (0-9).
 */
static int digit_value(char c)
{
	return (c - '0');
}