/********************************************
scan.c
copyright 1991, Michael D. Brennan

This is a source file for mawk, an implementation of
the AWK programming language.

Mawk is distributed without warranty under the terms of
the GNU General Public License, version 2, 1991.
********************************************/

/* $Log: scan.c,v $
 * Revision 1.8  1996/07/28 21:47:05  mike
 * gnuish patch
 *
 * Revision 1.7  1995/06/18 19:42:24  mike
 * Remove some redundant declarations and add some prototypes
 *
 * Revision 1.6  1995/06/10 16:57:52  mike
 * silently exit(0) if no program
 * always add a '\n' on eof in scan_fillbuff()
 *
 * Revision 1.5  1995/06/06 00:18:33  mike
 * change mawk_exit(1) to mawk_exit(2)
 *
 * Revision 1.4  1994/09/23 00:20:04  mike
 * minor bug fix: handle \ in eat_nl()
 *
 * Revision 1.3  1993/07/17 00:45:21  mike
 * indent
 *
 * Revision 1.2  1993/07/04 12:52:09  mike
 * start on autoconfig changes
 *
 * Revision 1.1.1.1  1993/07/03 18:58:20  mike
 * move source to cvs
 *
 * Revision 5.6  1993/02/13 21:57:33  mike
 * merge patch3
 *
 * Revision 5.5  1993/01/01 21:30:48  mike
 * split new_STRING() into new_STRING and new_STRING0
 *
 * Revision 5.4.1.1  1993/01/15 03:33:50  mike
 * patch3: safer double to int conversion
 *
 * Revision 5.4  1992/11/29 18:57:50  mike
 * field expressions convert to long so 16 bit and 32 bit
 * systems behave the same
 *
 * Revision 5.3  1992/07/08 15:43:41  brennan
 * patch2: length returns.  I am a wimp
 *
 * Revision 5.2  1992/02/21 14:16:53  brennan
 * fix: getline <=
 *
 * Revision 5.1  91/12/05  07:56:27  brennan
 * 1.1 pre-release
 *
 */

#include "mawk.h"
#include "scan.h"
#include "memory.h"
#include "field.h"
#include "init.h"
#include "fin.h"
#include "repl.h"
#include "code.h"

/* open() and O_RDONLY, used by scan_open(), are declared here */
#ifndef NO_FCNTL_H
#include <fcntl.h>
#endif

#include "files.h"

/* static functions */
static void PROTO(scan_fillbuff, (void)) ;
static void PROTO(scan_open, (void)) ;
static int PROTO(slow_next, (void)) ;
static void PROTO(eat_comment, (void)) ;
static void PROTO(eat_semi_colon, (void)) ;
static double PROTO(collect_decimal, (int, int *)) ;
static int PROTO(collect_string, (void)) ;
static int PROTO(collect_RE, (void)) ;

/*-----------------------------
  program file management
 *----------------------------*/

char *pfile_name ;              /* name of the program file being scanned */
STRING *program_string ;        /* holds a command line program */
PFILE *pfile_list ;             /* remaining program files, opened in turn */
static unsigned char *buffer ;  /* read buffer for a program file */
static unsigned char *buffp ;
 /* unsigned so it works with 8 bit chars */
static int program_fd ;         /* -1 means the program came from the command line */
static int eof_flag ;           /* set once the current input is exhausted */

/*
 * Prepare the scanner.
 *
 * cmdline_program: program text given on the command line, or a null
 * pointer when the program is to be read from the file named by
 * pfile_name.
 *
 * Skips leading white space, comments and newlines, and exits with
 * status 0 if there is no program at all.
 */
void
scan_init(cmdline_program)
    char *cmdline_program ;
{
    if (cmdline_program)
    {
        program_fd = -1 ;       /* command line program */
        program_string = new_STRING0(strlen(cmdline_program) + 1) ;
        strcpy(program_string->str, cmdline_program) ;
        /* simulate file termination */
        /* NOTE(review): presumably len == strlen + 1 here, so this replaces
           the NUL written by strcpy() with the extra '\n' -- confirm
           against new_STRING0() */
        program_string->str[program_string->len - 1] = '\n' ;
        buffp = (unsigned char *) program_string->str ;
        eof_flag = 1 ;
    }
    else                        /* program from file[s] */
    {
        scan_open() ;
        buffp = buffer = (unsigned char *) zmalloc(BUFFSZ + 1) ;
        scan_fillbuff() ;
    }

#ifdef OS2                      /* OS/2 "extproc" is similar to #! */
    if (strnicmp((char *) buffp, "extproc ", 8) == 0)
        eat_comment() ;
#endif
    eat_nl() ;                  /* scan to first token */
    if (next() == 0)
    {
        /* no program */
        mawk_exit(0) ;
    }

    un_next() ;
}

/*
 * Open pfile_name for reading and store the descriptor in program_fd.
 * The name "-" selects descriptor 0.  On failure an error message is
 * printed and mawk exits with status 2.
 */
static void
scan_open()                     /* open pfile_name */
{
    if (pfile_name[0] == '-' && pfile_name[1] == 0)
    {
        program_fd = 0 ;
    }
    else if ((program_fd = open(pfile_name, O_RDONLY, 0)) == -1)
    {
        errmsg(errno, "cannot open %s", pfile_name) ;
        mawk_exit(2) ;
    }
}

/*
 * Release the scanner's input resources once the program has been read,
 * then re-purpose entries of scan_code[] for later use.
 */
void
scan_cleanup()
{
    /* a file program owns the read buffer, a command line program owns
       program_string */
    if (program_fd >= 0)
        zfree(buffer, BUFFSZ + 1) ;
    else
        free_STRING(program_string) ;

    /* descriptor 0 was not opened here, so it is not closed here */
    if (program_fd > 0)
        close(program_fd) ;

    /* redefine SPACE as [ \t\n] */

    scan_code['\n'] = posix_space_flag && rs_shadow.type != SEP_MLR
        ? SC_UNEXPECTED : SC_SPACE ;
    scan_code['\f'] = SC_UNEXPECTED ;   /*value doesn't matter */
    scan_code['\013'] = SC_UNEXPECTED ; /* \v not space */
    scan_code['\r'] = SC_UNEXPECTED ;
}

/*--------------------------------
  global variables shared by yyparse() and yylex()
  and used for error messages too
 *-------------------------------*/

int current_token = -1 ;
unsigned token_lineno ;
unsigned compile_error_count ;
int NR_flag ;                   /* are we tracking NR */
int paren_cnt ;
int brace_cnt ;
int print_flag ;                /* changes meaning of '>' */
int getline_flag ;              /* changes meaning of '<' */

/*----------------------------------------
 file reading functions
 next() and un_next(c) are macros in scan.h

 *---------------------*/

static unsigned lineno = 1 ;

/*
 * Refill buffer from program_fd.  A short read is taken to mean end of
 * file: eof_flag is set and the data is terminated with "\n\0" so the
 * last line always ends in a newline.
 */
static void
scan_fillbuff()
{
    unsigned r ;

    r = fillbuff(program_fd, (char *) buffer, BUFFSZ) ;
    if (r < BUFFSZ)
    {
        eof_flag = 1 ;
        /* make sure eof is terminated */
        buffer[r] = '\n' ;
        buffer[r + 1] = 0 ;     /* r + 1 <= BUFFSZ, buffer holds BUFFSZ + 1 */
    }
}

/* read one character -- slowly */
/*
 * Called when buffp points at a zero byte.  Refills the buffer, or moves
 * on to the next file in pfile_list (resetting the line counters), until
 * a non-zero character is available or input is really exhausted.
 * Returns the character, or 0 at the real end of input.
 */
static int
slow_next()
{
    while (*buffp == 0)
    {
        if (!eof_flag)
        {
            buffp = buffer ;
            scan_fillbuff() ;
        }
        else if (pfile_list /* open another program file */ )
        {
            PFILE *q ;

            if (program_fd > 0)
                close(program_fd) ;
            eof_flag = 0 ;
            pfile_name = pfile_list->fname ;
            q = pfile_list ;
            pfile_list = pfile_list->link ;
            ZFREE(q) ;
            scan_open() ;
            token_lineno = lineno = 1 ;
        }
        else
            break /* real eof */ ;
    }

    return *buffp++ ;           /* note can un_next() , eof which is zero */
}

/*
 * Skip the rest of a comment.  Stops at the newline, or at a character
 * whose scan code is zero, and pushes that character back.
 */
static void
eat_comment()
{
    register int c ;

    while ((c = next()) != '\n' && scan_code[c])
        ;

    un_next() ;
}

/* this is how we handle extra semi-colons that are
   now allowed to separate pattern-action blocks

   A proof that they are useless clutter to the language:
   we throw them away
*/

static void
eat_semi_colon()
/* eat one semi-colon on the current line */
{
    register int c ;

    while (scan_code[c = next()] == SC_SPACE)
        ;

    if (c != ';')
        un_next() ;
}

/*
 * Skip white space, comments and newlines (counting lines), leaving the
 * scanner positioned at the next significant character.
 */
void
eat_nl()                        /* eat all space including newlines */
{
    while (1)
        switch (scan_code[next()])
        {
            case SC_COMMENT:
                eat_comment() ;
                break ;

            case SC_NL:
                lineno++ ;
                /* fall thru  */

            case SC_SPACE:
                break ;

            case SC_ESCAPE:
                /* bug fix - surprised anyone did this,
                   a csh user with backslash dyslexia.(Not a joke)
                */
                {
                    unsigned c ;

                    while (scan_code[c = next()] == SC_SPACE)
                        ;

                    if (c == '\n')
                        token_lineno = ++lineno ;       /* line continuation */
                    else if (c == 0)
                    {
                        un_next() ;
                        return ;
                    }
                    else        /* error */
                    {
                        un_next() ;
                        /* can't un_next() twice so deal with it */
                        yylval.ival = '\\' ;
                        unexpected_char() ;
                        if( ++compile_error_count == MAX_COMPILE_ERRORS )
                            mawk_exit(2) ;
                        return ;
                    }
                }
                break ;

            default:
                un_next() ;
                return ;
        }
}

/*
 * The lexical analyzer called by the parser.
 *
 * Returns the next token code, which is also stored in current_token
 * (except for the fake semi-colon described under SC_RBRACE).  A token's
 * value, when it has one, is left in yylval; for several operators the
 * token text is also left in string_buff.
 */
int
yylex()
{
    register int c ;

    token_lineno = lineno ;

reswitch:

    switch (scan_code[c = next()])
    {
        case 0:
            ct_ret(EOF) ;

        case SC_SPACE:
            goto reswitch ;

        case SC_COMMENT:
            eat_comment() ;
            goto reswitch ;

        case SC_NL:
            lineno++ ;
            eat_nl() ;
            ct_ret(NL) ;

        case SC_ESCAPE:
            /* a backslash is only legal as a line continuation */
            while (scan_code[c = next()] == SC_SPACE)
                ;

            if (c == '\n')
            {
                token_lineno = ++lineno ;
                goto reswitch ;
            }
            if (c == 0)
                ct_ret(EOF) ;
            un_next() ;
            yylval.ival = '\\' ;
            ct_ret(UNEXPECTED) ;

        case SC_SEMI_COLON:
            eat_nl() ;
            ct_ret(SEMI_COLON) ;

        case SC_LBRACE:
            eat_nl() ;
            brace_cnt++ ;
            ct_ret(LBRACE) ;

        case SC_PLUS:
            switch (next())
            {
                case '+':
                    yylval.ival = '+' ;
                    string_buff[0] =
                        string_buff[1] = '+' ;
                    string_buff[2] = 0 ;
                    ct_ret(INC_or_DEC) ;

                case '=':
                    ct_ret(ADD_ASG) ;

                default:
                    un_next() ;
                    ct_ret(PLUS) ;
            }

        case SC_MINUS:
            switch (next())
            {
                case '-':
                    yylval.ival = '-' ;
                    string_buff[0] =
                        string_buff[1] = '-' ;
                    string_buff[2] = 0 ;
                    ct_ret(INC_or_DEC) ;

                case '=':
                    ct_ret(SUB_ASG) ;

                default:
                    un_next() ;
                    ct_ret(MINUS) ;
            }

        case SC_COMMA:
            eat_nl() ;
            ct_ret(COMMA) ;

        case SC_MUL:
            test1_ret('=', MUL_ASG, MUL) ;

        case SC_DIV:
            {
                /* '/' is a division operator only when the previous token
                   is one of these; otherwise it starts a regular
                   expression */
                static int can_precede_div[] =
                {DOUBLE, STRING_, RPAREN, ID, D_ID, RE, RBOX, FIELD,
                 GETLINE, INC_or_DEC, -1} ;

                int *p = can_precede_div ;

                do
                {
                    if (*p == current_token)
                    {
                        if (*p != INC_or_DEC)
                        {
                            test1_ret('=', DIV_ASG, DIV) ;
                        }

                        /* NOTE(review): when this lookahead is not '=' the
                           character is not pushed back before the
                           collect_RE() below -- confirm that is intended */
                        if (next() == '=')
                        {
                            un_next() ;
                            ct_ret(collect_RE()) ;
                        }
                    }
                }
                while (*++p != -1) ;

                ct_ret(collect_RE()) ;
            }

        case SC_MOD:
            test1_ret('=', MOD_ASG, MOD) ;

        case SC_POW:
            test1_ret('=', POW_ASG, POW) ;

        case SC_LPAREN:
            paren_cnt++ ;
            ct_ret(LPAREN) ;

        case SC_RPAREN:
            if (--paren_cnt < 0)
            {
                compile_error("extra ')'") ;
                paren_cnt = 0 ;
                goto reswitch ;
            }

            ct_ret(RPAREN) ;

        case SC_LBOX:
            ct_ret(LBOX) ;

        case SC_RBOX:
            ct_ret(RBOX) ;

        case SC_MATCH:
            /* leave "~" in string_buff, terminated after the operator */
            string_buff[0] = '~' ;
            string_buff[1] = 0 ;
            yylval.ival = 1 ;
            ct_ret(MATCH) ;

        case SC_EQUAL:
            test1_ret('=', EQ, ASSIGN) ;

        case SC_NOT:            /* !  */
            if ((c = next()) == '~')
            {
                string_buff[0] = '!' ;
                string_buff[1] = '~' ;
                string_buff[2] = 0 ;
                yylval.ival = 0 ;
                ct_ret(MATCH) ;
            }
            else if (c == '=')
                ct_ret(NEQ) ;

            un_next() ;
            ct_ret(NOT) ;

        case SC_LT:             /* '<' */
            if (next() == '=')
                ct_ret(LTE) ;
            else
                un_next() ;

            if (getline_flag)
            {
                getline_flag = 0 ;
                ct_ret(IO_IN) ;
            }
            else
                ct_ret(LT) ;

        case SC_GT:             /* '>' */
            if (print_flag && paren_cnt == 0)
            {
                print_flag = 0 ;
                /* there are 3 types of IO_OUT
                   -- build the error string in string_buff */
                string_buff[0] = '>' ;
                if (next() == '>')
                {
                    yylval.ival = F_APPEND ;
                    string_buff[1] = '>' ;
                    string_buff[2] = 0 ;
                }
                else
                {
                    un_next() ;
                    yylval.ival = F_TRUNC ;
                    string_buff[1] = 0 ;
                }
                return current_token = IO_OUT ;
            }

            test1_ret('=', GTE, GT) ;

        case SC_OR:
            if (next() == '|')
            {
                eat_nl() ;
                ct_ret(OR) ;
            }
            else
            {
                un_next() ;

                if (print_flag && paren_cnt == 0)
                {
                    print_flag = 0 ;
                    yylval.ival = PIPE_OUT ;
                    string_buff[0] = '|' ;
                    string_buff[1] = 0 ;
                    ct_ret(IO_OUT) ;
                }
                else
                    ct_ret(PIPE) ;
            }

        case SC_AND:
            if (next() == '&')
            {
                eat_nl() ;
                ct_ret(AND) ;
            }
            else
            {
                un_next() ;
                yylval.ival = '&' ;
                ct_ret(UNEXPECTED) ;
            }

        case SC_QMARK:
            ct_ret(QMARK) ;

        case SC_COLON:
            ct_ret(COLON) ;

        case SC_RBRACE:
            if (--brace_cnt < 0)
            {
                compile_error("extra '}'") ;
                eat_semi_colon() ;
                brace_cnt = 0 ;
                goto reswitch ;
            }

            if ((c = current_token) == NL || c == SEMI_COLON
                || c == SC_FAKE_SEMI_COLON || c == RBRACE)
            {
                /* if the brace_cnt is zero , we've completed
                   a pattern action block. If the user insists
                   on adding a semi-colon on the same line
                   we will eat it.  Note what we do below:
                   physical law -- conservation of semi-colons */

                if (brace_cnt == 0)
                    eat_semi_colon() ;
                eat_nl() ;
                ct_ret(RBRACE) ;
            }

            /* supply missing semi-colon to statement that
               precedes a '}' */
            /* the '}' is pushed back and its count restored so it is
               scanned again on the next call */
            brace_cnt++ ;
            un_next() ;
            current_token = SC_FAKE_SEMI_COLON ;
            return SEMI_COLON ;

        case SC_DIGIT:
        case SC_DOT:
            {
                double d ;
                int flag ;
                /* 0.0 and 1.0 are shared, other constants are allocated */
                static double double_zero = 0.0 ;
                static double double_one = 1.0 ;

                if ((d = collect_decimal(c, &flag)) == 0.0)
                {
                    if (flag)
                        ct_ret(flag) ;
                    else
                        yylval.ptr = (PTR) & double_zero ;
                }
                else if (d == 1.0)
                {
                    yylval.ptr = (PTR) & double_one ;
                }
                else
                {
                    yylval.ptr = (PTR) ZMALLOC(double) ;
                    *(double *) yylval.ptr = d ;
                }
                ct_ret(DOUBLE) ;
            }

        case SC_DOLLAR:         /* '$' */
            {
                double d ;
                int flag ;

                while (scan_code[c = next()] == SC_SPACE)
                    ;

                if (scan_code[c] != SC_DIGIT &&
                    scan_code[c] != SC_DOT)
                {
                    un_next() ;
                    ct_ret(DOLLAR) ;
                }

                /* compute field address at compile time */
                if ((d = collect_decimal(c, &flag)) == 0.0)
                {
                    if (flag)
                        ct_ret(flag) ;  /* an error */
                    else
                        yylval.cp = &field[0] ;
                }
                else
                {
                    if (d > MAX_FIELD)
                    {
                        compile_error(
                            "$%g exceeds maximum field(%d)", d, MAX_FIELD) ;
                        d = MAX_FIELD ;
                    }
                    yylval.cp = field_ptr((int) d) ;
                }

                ct_ret(FIELD) ;
            }

        case SC_DQUOTE:
            return current_token = collect_string() ;

        case SC_IDCHAR:         /* collect an identifier */
            {
                unsigned char *p =
                    (unsigned char *) string_buff + 1 ;
                SYMTAB *stp ;

                string_buff[0] = c ;

                while (
                    (c = scan_code[*p++ = next()]) == SC_IDCHAR ||
                    c == SC_DIGIT)
                    ;

                un_next() ;
                *--p = 0 ;

                switch ((stp = find(string_buff))->type)
                {
                    case ST_NONE:
                        /* check for function call before defined */
                        if (next() == '(')
                        {
                            stp->type = ST_FUNCT ;
                            stp->stval.fbp = (FBLOCK *)
                                zmalloc(sizeof(FBLOCK)) ;
                            stp->stval.fbp->name = stp->name ;
                            stp->stval.fbp->code = (INST *) 0 ;
                            yylval.fbp = stp->stval.fbp ;
                            current_token = FUNCT_ID ;
                        }
                        else
                        {
                            yylval.stp = stp ;
                            current_token =
                                current_token == DOLLAR ? D_ID : ID ;
                        }
                        un_next() ;
                        break ;

                    case ST_NR:
                        NR_flag = 1 ;
                        stp->type = ST_VAR ;
                        /* fall thru */

                    case ST_VAR:
                    case ST_ARRAY:
                    case ST_LOCAL_NONE:
                    case ST_LOCAL_VAR:
                    case ST_LOCAL_ARRAY:

                        yylval.stp = stp ;
                        current_token =
                            current_token == DOLLAR ? D_ID : ID ;
                        break ;

                    case ST_ENV:
                        /* first reference: turn the symbol into an array
                           and load the environment into it */
                        stp->type = ST_ARRAY ;
                        stp->stval.array = new_ARRAY() ;
                        load_environ(stp->stval.array) ;
                        yylval.stp = stp ;
                        current_token =
                            current_token == DOLLAR ? D_ID : ID ;
                        break ;

                    case ST_FUNCT:
                        yylval.fbp = stp->stval.fbp ;
                        current_token = FUNCT_ID ;
                        break ;

                    case ST_KEYWORD:
                        current_token = stp->stval.kw ;
                        break ;

                    case ST_BUILTIN:
                        yylval.bip = stp->stval.bip ;
                        current_token = BUILTIN ;
                        break ;

                    case ST_LENGTH:

                        yylval.bip = stp->stval.bip ;

                        /* check for length alone, this is an ugly
                           hack */
                        while (scan_code[c = next()] == SC_SPACE)
                            ;
                        un_next() ;

                        current_token = c == '(' ? BUILTIN : LENGTH ;
                        break ;

                    case ST_FIELD:
                        yylval.cp = stp->stval.cp ;
                        current_token = FIELD ;
                        break ;

                    default:
                        bozo("find returned bad st type") ;
                }
                return current_token ;
            }

        case SC_UNEXPECTED:
            yylval.ival = c & 0xff ;
            ct_ret(UNEXPECTED) ;
    }
    return 0 ;                  /* never get here make lint happy */
}

/* collect a decimal constant in string_buff.
   Return the value and error conditions by reference */
/*
 * c:    first character of the constant (a digit or '.'), already read.
 * flag: set to 0 on success, or to the token to return on error
 *       (UNEXPECTED for a lone '.', BAD_DECIMAL for a malformed number).
 *
 * Returns the value of the constant, or 0.0 when *flag is non-zero.
 */
static double
collect_decimal(c, flag)
    int c ;
    int *flag ;
{
    register unsigned char *p = (unsigned char *) string_buff + 1 ;
    unsigned char *endp ;
    double d ;

    *flag = 0 ;
    string_buff[0] = c ;

    if (c == '.')
    {
        /* a leading '.' must be followed by a digit */
        if (scan_code[*p++ = next()] != SC_DIGIT)
        {
            *flag = UNEXPECTED ;
            yylval.ival = '.' ;
            return 0.0 ;
        }
    }
    else
    {
        while (scan_code[*p++ = next()] == SC_DIGIT)
            ;
        if (p[-1] != '.')
        {
            un_next() ;
            p-- ;
        }
    }
    /* get rest of digits after decimal point */
    while (scan_code[*p++ = next()] == SC_DIGIT)
        ;

    /* check for exponent */
    if (p[-1] != 'e' && p[-1] != 'E')
    {
        un_next() ;
        *--p = 0 ;
    }
    else                        /* get the exponent */
    {
        if (scan_code[*p = next()] != SC_DIGIT &&
            *p != '-' && *p != '+')
        {
            *++p = 0 ;
            *flag = BAD_DECIMAL ;
            return 0.0 ;
        }
        else                    /* get the rest of the exponent */
        {
            p++ ;
            while (scan_code[*p++ = next()] == SC_DIGIT)
                ;
            un_next() ;
            *--p = 0 ;
        }
    }

    errno = 0 ;                 /* check for overflow/underflow */
    d = strtod(string_buff, (char **) &endp) ;

#ifndef STRTOD_UNDERFLOW_ON_ZERO_BUG
    if (errno)
        compile_error("%s : decimal %sflow", string_buff,
                      d == 0.0 ? "under" : "over") ;
#else /* ! sun4 bug */
    if (errno && d != 0.0)
        compile_error("%s : decimal overflow", string_buff) ;
#endif

    /* strtod() must have consumed everything that was collected */
    if (endp < p)
    {
        *flag = BAD_DECIMAL ;
        return 0.0 ;
    }
    return d ;
}

/*----------  process escape characters ---------------*/

/* value of a hex letter, indexed by (letter - 'A'); covers 'A'..'f',
   entries that are not hex letters are zero */
static char hex_val['f' - 'A' + 1] =
{
    10, 11, 12, 13, 14, 15,                     /* 'A' .. 'F' */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,      /* 'G' .. '`' (26 entries) */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    10, 11, 12, 13, 14, 15                      /* 'a' .. 'f' */
} ;

#define isoctal(x)  ((x)>='0'&&(x)<='7')

#define hex_value(x)    hex_val[(x)-'A']

#define ishex(x) (scan_code[x] == SC_DIGIT ||\
                  ('A' <= (x) && (x) <= 'f' && hex_value(x)))

static int PROTO(octal, (char **)) ;
static int PROTO(hex, (char **)) ;

/* process one , two or three octal digits
   moving a pointer forward by reference */
/* The caller guarantees that **start_p is an octal digit.  The result is
   reduced to the range 0..255. */
static int
octal(start_p)
    char **start_p ;
{
    register char *p = *start_p ;
    register unsigned x ;

    x = *p++ - '0' ;
    if (isoctal(*p))
    {
        x = (x << 3) + *p++ - '0' ;
        if (isoctal(*p))
            x = (x << 3) + *p++ - '0' ;
    }
    *start_p = p ;
    return x & 0xff ;
}

/* process one or two hex digits
   moving a pointer forward by reference */
/* The caller guarantees that **start_p is a hex digit. */
static int
hex(start_p)
    char **start_p ;
{
    register unsigned char *p = (unsigned char *) *start_p ;
    register unsigned x ;
    unsigned t ;

    if (scan_code[*p] == SC_DIGIT)
        x = *p++ - '0' ;
    else
        x = hex_value(*p++) ;

    if (scan_code[*p] == SC_DIGIT)
        x = (x << 4) + *p++ - '0' ;
    else if ('A' <= *p && *p <= 'f' && (t = hex_value(*p)))
    {
        x = (x << 4) + t ;
        p++ ;
    }

    *start_p = (char *) p ;
    return x ;
}

#define  ET_END     9

/* single character escapes; the last slot is a sentinel that rm_escape()
   overwrites on every lookup */
static struct
{
    char in, out ;
}
escape_test[ET_END + 1] =
{
    {'n', '\n'},
    {'t', '\t'},
    {'f', '\f'},
    {'b', '\b'},
    {'r', '\r'},
    {'a', '\07'},
    {'v', '\013'},
    {'\\', '\\'},
    {'\"', '\"'},
    {0, 0}
} ;

/* process the escape characters in a string, in place . */
/*
 * s: NUL terminated string, rewritten in place (the result is never
 *    longer than the input).
 *
 * Handles the single character escapes in escape_test[], octal escapes
 * of one to three digits and \x followed by one or two hex digits.
 * Any other backslash sequence is copied through unchanged.
 *
 * Returns s.
 */
char *
rm_escape(s)
    char *s ;
{
    register char *p, *q ;
    char *t ;
    int i ;

    q = p = s ;

    while (*p)
    {
        if (*p == '\\')
        {
            escape_test[ET_END].in = *++p ;     /* sentinel */
            i = 0 ;
            while (escape_test[i].in != *p)
                i++ ;

            if (i != ET_END)    /* in table */
            {
                p++ ;
                *q++ = escape_test[i].out ;
            }
            else if (isoctal(*p))
            {
                t = p ;
                *q++ = octal(&t) ;
                p = t ;
            }
            else if (*p == 'x' && ishex(*(unsigned char *) (p + 1)))
            {
                t = p + 1 ;
                *q++ = hex(&t) ;
                p = t ;
            }
            else if (*p == 0)   /* can only happen with command line assign */
                *q++ = '\\' ;
            else                /* not an escape sequence */
            {
                *q++ = '\\' ;
                *q++ = *p++ ;
            }
        }
        else
            *q++ = *p++ ;
    }

    *q = 0 ;
    return s ;
}

/*
 * Collect a string constant into string_buff; the opening '"' has
 * already been read.  The result is stored in yylval.ptr as a new STRING
 * with escape sequences processed.  An unterminated string is a fatal
 * compile error.
 *
 * Returns STRING_.
 */
static int
collect_string()
{
    register unsigned char *p = (unsigned char *) string_buff ;
    int c ;
    int e_flag = 0 ;            /* on if have an escape char */

    while (1)
        switch (scan_code[*p++ = next()])
        {
            case SC_DQUOTE:     /* done */
                *--p = 0 ;
                goto out ;

            case SC_NL:
                p[-1] = 0 ;
                /* fall thru */

            case 0:             /* unterminated string */
                compile_error(
                    "runaway string constant \"%.10s ...",
                    string_buff) ;
                mawk_exit(2) ;

            case SC_ESCAPE:
                if ((c = next()) == '\n')
                {
                    /* backslash-newline: drop both characters */
                    p-- ;
                    lineno++ ;
                }
                else if (c == 0)
                    un_next() ;
                else
                {
                    /* keep the pair, rm_escape() translates it later */
                    *p++ = c ;
                    e_flag = 1 ;
                }

                break ;

            default:
                break ;
        }

out:
    yylval.ptr = (PTR) new_STRING(
        e_flag ? rm_escape(string_buff)
               : string_buff) ;
    return STRING_ ;
}

/*
 * Collect a regular expression into string_buff; the opening '/' has
 * already been read.  The text is compiled with re_compile() and the
 * result stored in yylval.ptr.  An unterminated expression is a fatal
 * compile error.
 *
 * Returns RE.
 */
static int
collect_RE()
{
    register unsigned char *p = (unsigned char *) string_buff ;
    int c ;
    STRING *sval ;

    while (1)
        switch (scan_code[*p++ = next()])
        {
            case SC_DIV:        /* done */
                *--p = 0 ;
                goto out ;

            case SC_NL:
                p[-1] = 0 ;
                /* fall thru */

            case 0:             /* unterminated re */
                compile_error(
                    "runaway regular expression /%.10s ...",
                    string_buff) ;
                mawk_exit(2) ;

            case SC_ESCAPE:
                switch (c = next())
                {
                    case '/':
                        /* "\/" becomes a plain '/' */
                        p[-1] = '/' ;
                        break ;

                    case '\n':
                        /* backslash-newline: drop both characters */
                        p-- ;
                        break ;

                    case 0:
                        un_next() ;
                        break ;

                    default:
                        /* keep the backslash and the character */
                        *p++ = c ;
                        break ;
                }
                break ;
        }

out:
    /* now we've got the RE, so compile it */
    sval = new_STRING(string_buff) ;
    yylval.ptr = re_compile(sval) ;
    free_STRING(sval) ;
    return RE ;
}