/* * re.c - compile regular expressions. */ /* * Copyright (C) 1991-2019, 2021-2024 * the Free Software Foundation, Inc. * * This file is part of GAWK, the GNU implementation of the * AWK Programming Language. * * GAWK is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 3 of the License, or * (at your option) any later version. * * GAWK is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA */ #include "awk.h" #include "localeinfo.h" static reg_syntax_t syn; static void check_bracket_exp(char *s, size_t len); const char *regexflags2str(int flags); static struct localeinfo localeinfo; /* make_regexp --- generate compiled regular expressions */ Regexp * make_regexp(const char *s, size_t len, bool ignorecase, bool dfa, bool canfatal) { static char metas[] = ".*+(){}[]|?^$\\"; Regexp *rp; const char *rerr; const char *src = s; static char *buf = NULL; static size_t buflen; const char *end = s + len; char *dest; int c; static bool first = true; static bool no_dfa = false; int i; static struct dfa* dfaregs[2] = { NULL, NULL }; static bool nul_warned = false; assert(s[len] == '\0'); if (do_lint && ! nul_warned && memchr(s, '\0', len) != NULL) { nul_warned = true; lintwarn(_("behavior of matching a regexp containing NUL characters is not defined by POSIX")); } mbstate_t mbs; memset(&mbs, 0, sizeof(mbstate_t)); /* Initialize. */ if (first) { /* for debugging and testing */ no_dfa = (getenv("GAWK_NO_DFA") != NULL); /* don't set first to false here, we do it below */ } /* always check */ check_bracket_exp((char *) s, len); /* Handle escaped characters first. */ /* * Build a copy of the string (in buf) with the * escaped characters translated, and generate the regex * from that. */ if (buf == NULL) { emalloc(buf, char *, len + 1, "make_regexp"); buflen = len; } else if (len > buflen) { erealloc(buf, char *, len + 1, "make_regexp"); buflen = len; } dest = buf; while (src < end) { /* * Keep multibyte characters together. This avoids * problems if a subsequent byte of a multibyte * character happens to be a backslash. */ if (gawk_mb_cur_max > 1) { size_t mblen = mbrlen(src, end - src, &mbs); /* * Incomplete (-2), invalid (-1), and * null (0) characters are excluded here. * They are read as a sequence of bytes. */ if (mblen > 1 && mblen < (size_t) -2) { size_t i; for (i = 0; i < mblen; i++) *dest++ = *src++; continue; } } /* * From here *src is a single byte character. */ if (*src != '\\') { *dest++ = *src++; continue; } /* Escape sequence */ c = *++src; switch (c) { case '\0': /* \\ before \0, either dynamic data or real end of string */ if (src >= s + len) *dest++ = '\\'; // at end of string, will fatal below else fatal(_("invalid NUL byte in dynamic regexp")); break; case 'a': case 'b': case 'f': case 'n': case 'r': case 't': case 'v': case 'x': case 'u': case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': { const char *result; size_t nbytes; enum escape_results ret; ret = parse_escape(& src, & result, & nbytes); switch (ret) { case ESCAPE_OK: /* * Unix awk treats octal (and hex?) chars * literally in re's, so escape regexp * metacharacters. */ if (nbytes == 1 && do_traditional && ! do_posix && (isdigit(c) || c == 'x' || c == 'u') && strchr(metas, *result) != NULL) *dest++ = '\\'; if (nbytes == 1 && do_lint && ! nul_warned && *result == '\0') { nul_warned = true; lintwarn(_("behavior of matching a regexp containing NUL characters is not defined by POSIX")); } /* nbytes is now > 0 */ while (nbytes--) *dest++ = *result++; break; case ESCAPE_CONV_ERR: /* * Invalid code points produce '?' (0x3F). * These are quoted so that they're taken * literally. Unlike \u3F, a metachar. */ *dest++ = '\\'; *dest++ = '?'; break; default: /* * The outer switch handles terminal * backslashes and line continuations. * parse_escape should never see them * and therefore it should never return * ESCAPE_TERM_BACKSLASH nor * ESCAPE_LINE_CONTINUATION. * * This also catches unknown values. */ cant_happen("received bad result %d from parse_escape(), nbytes = %zu", (int) ret, nbytes); } break; } case '8': case '9': /* a\9b not valid */ *dest++ = c; src++; { static bool warned[2]; if (! warned[c - '8']) { warning(_("regexp escape sequence `\\%c' treated as plain `%c'"), c, c); warned[c - '8'] = true; } } break; case 'y': /* normally \b */ /* gnu regex op */ if (! do_traditional) { *dest++ = '\\'; *dest++ = 'b'; src++; break; } /* else, fall through */ default: { static const char *ok_to_escape = NULL; /* * The posix and traditional flags do not change * once the awk program is running. Therefore, * neither does ok_to_escape. */ if (ok_to_escape == NULL) { if (do_posix || do_traditional) ok_to_escape = "{}()|*+?.^$\\[]/-"; else ok_to_escape = "<>`'BywWsS{}()|*+?.^$\\[]/-"; } if (strchr(ok_to_escape, c) == NULL) { static bool warned[256]; if (! warned[c & 0xFF]) { warning(_("regexp escape sequence `\\%c' is not a known regexp operator"), c); warned[c & 0xFF] = true; } } *dest++ = '\\'; *dest++ = (char) c; src++; break; } } /* switch */ } /* while */ *dest = '\0'; len = dest - buf; ezalloc(rp, Regexp *, sizeof(*rp), "make_regexp"); rp->pat.allocated = 0; /* regex will allocate the buffer */ emalloc(rp->pat.fastmap, char *, 256, "make_regexp"); /* * Lo these many years ago, had I known what a P.I.T.A. IGNORECASE * was going to turn out to be, I wouldn't have bothered with it. * * In the case where we have a multibyte character set, we have no * choice but to use RE_ICASE, since the casetable is for single-byte * character sets only. * * On the other hand, if we do have a single-byte character set, * using the casetable should give a performance improvement, since * it's computed only once, not each time a regex is compiled. We * also think it's probably better for portability. See the * discussion by the definition of casetable[] in eval.c. */ ignorecase = !! ignorecase; /* force to 1 or 0 */ if (ignorecase) { if (gawk_mb_cur_max > 1) { syn |= RE_ICASE; rp->pat.translate = NULL; } else { syn &= ~RE_ICASE; rp->pat.translate = (RE_TRANSLATE_TYPE) casetable; } } else { rp->pat.translate = NULL; syn &= ~RE_ICASE; } /* initialize dfas to hold syntax */ if (first) { first = false; dfaregs[0] = dfaalloc(); dfaregs[1] = dfaalloc(); dfasyntax(dfaregs[0], & localeinfo, syn, DFA_ANCHOR); dfasyntax(dfaregs[1], & localeinfo, syn | RE_ICASE, DFA_ANCHOR); } re_set_syntax(syn); if ((rerr = re_compile_pattern(buf, len, &(rp->pat))) != NULL) { refree(rp); if (! canfatal) { /* rerr already gettextized inside regex routines */ error("%s: /%s/", rerr, s); return NULL; } fatal("invalid regexp: %s: /%s/", rerr, s); } /* gack. this must be done *after* re_compile_pattern */ rp->pat.newline_anchor = false; /* don't get \n in middle of string */ if (dfa && ! no_dfa) { rp->dfareg = dfaalloc(); dfacopysyntax(rp->dfareg, dfaregs[ignorecase]); dfacomp(buf, len, rp->dfareg, true); } else rp->dfareg = NULL; /* Additional flags that help with RS as regexp. */ for (i = 0; i < len; i++) { if (strchr(metas, buf[i]) != NULL) { rp->has_meta = true; break; } } for (i = len - 1; i >= 0; i--) { if (strchr("*+|?{}", buf[i]) != NULL) { rp->maybe_long = true; break; } } return rp; } /* research --- do a regexp search. use dfa if possible */ int research(Regexp *rp, char *str, int start, size_t len, int flags) { const char *ret = str; bool try_backref = false; int need_start; int no_bol; int res; need_start = ((flags & RE_NEED_START) != 0); no_bol = ((flags & RE_NO_BOL) != 0); if (no_bol) rp->pat.not_bol = 1; /* * Always do dfa search if can; if it fails, then even if * need_start is true, we won't bother with the regex search. * * The dfa matcher doesn't have a no_bol flag, so don't bother * trying it in that case. * * 7/2008: Skip the dfa matcher if need_start. The dfa matcher * has bugs in certain multibyte cases and it's too difficult * to try to special case things. * 7/2017: Apparently there are some cases where DFA gets * stuck, even in the C locale, so we use dfa only if not need_start. * * Should that issue ever get resolved, note this comment: * * 7/2016: The dfa matcher can't handle a case where searching * starts in the middle of a string, so don't bother trying it * in that case. * if (rp->dfa && ! no_bol && start == 0) ... */ if (rp->dfareg != NULL && ! no_bol && ! need_start) { struct dfa *superset = dfasuperset(rp->dfareg); if (superset) ret = dfaexec(superset, str+start, str+start+len, true, NULL, NULL); if (ret && (! need_start || (! superset && dfaisfast(rp->dfareg)))) ret = dfaexec(rp->dfareg, str+start, str+start+len, true, NULL, &try_backref); } if (ret) { if ( rp->dfareg == NULL || start != 0 || no_bol || need_start || try_backref) { /* * Passing NULL as last arg speeds up search for cases * where we don't need the start/end info. */ res = re_search(&(rp->pat), str, start+len, start, len, need_start ? &(rp->regs) : NULL); } else res = 1; } else res = -1; rp->pat.not_bol = 0; return res; } /* refree --- free up the dynamic memory used by a compiled regexp */ void refree(Regexp *rp) { if (rp == NULL) return; rp->pat.translate = NULL; regfree(& rp->pat); if (rp->regs.start) free(rp->regs.start); if (rp->regs.end) free(rp->regs.end); if (rp->dfareg != NULL) { dfafree(rp->dfareg); free(rp->dfareg); } efree(rp); } /* dfaerror --- print an error message for the dfa routines */ void dfaerror(const char *s) { fatal("%s", s); exit(EXIT_FATAL); } /* re_cache_get --- populate regexp cache if empty */ static inline Regexp * re_cache_get(NODE *t) { if (t->re_reg[IGNORECASE] == NULL) t->re_reg[IGNORECASE] = make_regexp(t->re_exp->stptr, t->re_exp->stlen, IGNORECASE, t->re_cnt, true); return t->re_reg[IGNORECASE]; } /* re_update --- recompile a dynamic regexp */ Regexp * re_update(NODE *t) { NODE *t1; if (t->type == Node_val && (t->flags & REGEX) != 0) return re_cache_get(t->typed_re); if ((t->re_flags & CONSTANT) != 0) { /* it's a constant, so just return it as is */ assert(t->type == Node_regex); return re_cache_get(t); } t1 = t->re_exp; if (t->re_text != NULL) { /* if contents haven't changed, just return it */ if (cmp_nodes(t->re_text, t1, true) == 0) return re_cache_get(t); /* things changed, fall through to recompile */ unref(t->re_text); } /* get fresh copy of the text of the regexp */ t->re_text = dupnode(t1); /* text changed */ /* free old */ if (t->re_reg[0] != NULL) { refree(t->re_reg[0]); t->re_reg[0] = NULL; } if (t->re_reg[1] != NULL) { refree(t->re_reg[1]); t->re_reg[1] = NULL; } if (t->re_cnt > 0 && ++t->re_cnt > 10) /* * The regex appears to update frequently, so disable DFA * matching (which trades off expensive upfront compilation * overhead for faster subsequent matching). */ t->re_cnt = 0; if (t->re_text == NULL) { /* reset regexp text if needed */ t1 = t->re_exp; unref(t->re_text); t->re_text = dupnode(t1); } return re_cache_get(t); } /* resetup --- choose what kind of regexps we match */ void resetup() { // init localeinfo for dfa init_localeinfo(& localeinfo); /* * Syntax bits: _that_ is yet another mind trip. Recreational drugs * are helpful for recovering from the experience. * * Aharon Robbins * Sun, 21 Oct 2007 23:55:33 +0200 */ if (do_posix) syn = RE_SYNTAX_POSIX_AWK; /* strict POSIX re's */ else if (do_traditional) syn = RE_SYNTAX_AWK; /* traditional Unix awk re's */ else syn = RE_SYNTAX_GNU_AWK; /* POSIX re's + GNU ops */ /* * Interval expressions are now on by default, as POSIX is * wide-spread enough that people want it. * * 2/2022: BWK awk has supported interval expressions since * March 2019, with an important fix added in Januay 2020. * So we add that support even for --traditional. It's easier to * do it here than to try to get the GLIBC / GNULIB folks to change * the definition of RE_SYNTAX_AWK, which likely would cause * binary compatibility issues. */ if (do_traditional) syn |= RE_INTERVALS | RE_INVALID_INTERVAL_ORD | RE_NO_BK_BRACES; (void) re_set_syntax(syn); } /* using_utf8 --- are we using utf8 */ bool using_utf8(void) { return localeinfo.using_utf8; } /* reisstring --- return true if the RE match is a simple string match */ int reisstring(const char *text, size_t len, Regexp *re, const char *buf) { int res; const char *matched; /* simple checking for meta characters in re */ if (re->has_meta) return false; /* give up early, can't be string match */ /* make accessable to gdb */ matched = &buf[RESTART(re, buf)]; res = (memcmp(text, matched, len) == 0); return res; } /* reflags2str --- make a regex flags value readable */ const char * reflags2str(int flagval) { static const struct flagtab values[] = { { RE_BACKSLASH_ESCAPE_IN_LISTS, "RE_BACKSLASH_ESCAPE_IN_LISTS" }, { RE_BK_PLUS_QM, "RE_BK_PLUS_QM" }, { RE_CHAR_CLASSES, "RE_CHAR_CLASSES" }, { RE_CONTEXT_INDEP_ANCHORS, "RE_CONTEXT_INDEP_ANCHORS" }, { RE_CONTEXT_INDEP_OPS, "RE_CONTEXT_INDEP_OPS" }, { RE_CONTEXT_INVALID_OPS, "RE_CONTEXT_INVALID_OPS" }, { RE_DOT_NEWLINE, "RE_DOT_NEWLINE" }, { RE_DOT_NOT_NULL, "RE_DOT_NOT_NULL" }, { RE_HAT_LISTS_NOT_NEWLINE, "RE_HAT_LISTS_NOT_NEWLINE" }, { RE_INTERVALS, "RE_INTERVALS" }, { RE_LIMITED_OPS, "RE_LIMITED_OPS" }, { RE_NEWLINE_ALT, "RE_NEWLINE_ALT" }, { RE_NO_BK_BRACES, "RE_NO_BK_BRACES" }, { RE_NO_BK_PARENS, "RE_NO_BK_PARENS" }, { RE_NO_BK_REFS, "RE_NO_BK_REFS" }, { RE_NO_BK_VBAR, "RE_NO_BK_VBAR" }, { RE_NO_EMPTY_RANGES, "RE_NO_EMPTY_RANGES" }, { RE_UNMATCHED_RIGHT_PAREN_ORD, "RE_UNMATCHED_RIGHT_PAREN_ORD" }, { RE_NO_POSIX_BACKTRACKING, "RE_NO_POSIX_BACKTRACKING" }, { RE_NO_GNU_OPS, "RE_NO_GNU_OPS" }, { RE_INVALID_INTERVAL_ORD, "RE_INVALID_INTERVAL_ORD" }, { RE_ICASE, "RE_ICASE" }, { RE_CARET_ANCHORS_HERE, "RE_CARET_ANCHORS_HERE" }, { RE_CONTEXT_INVALID_DUP, "RE_CONTEXT_INVALID_DUP" }, { RE_NO_SUB, "RE_NO_SUB" }, { 0, NULL }, }; if (flagval == RE_SYNTAX_EMACS) /* == 0 */ return "RE_SYNTAX_EMACS"; return genflags2str(flagval, values); } /* * dfawarn() is called by the dfa routines whenever a regex is compiled * must supply a dfawarn. */ void dfawarn(const char *dfa_warning) { /* * This routine does nothing, since gawk does its own * (better) check for bad [[:foo:]] syntax. */ } /* check_bracket_exp --- look for /[:space:]/ that should be /[[:space:]]/ */ static void check_bracket_exp(char *s, size_t length) { static struct reclass { const char *name; size_t len; bool warned; } classes[] = { /* * Ordered by what we hope is frequency, * since it's linear searched. */ { "[:alpha:]", 9, false }, { "[:digit:]", 9, false }, { "[:alnum:]", 9, false }, { "[:upper:]", 9, false }, { "[:lower:]", 9, false }, { "[:space:]", 9, false }, { "[:xdigit:]", 10, false }, { "[:punct:]", 9, false }, { "[:print:]", 9, false }, { "[:graph:]", 9, false }, { "[:cntrl:]", 9, false }, { "[:blank:]", 9, false }, { NULL, 0 } }; int i; bool found = false; char save; char *sp, *sp2, *end; int len; int count = 0; if (length == 0) return; end = s + length; save = s[length]; s[length] = '\0'; sp = s; again: sp = sp2 = (char *) memchr(sp, '[', (end - sp)); if (sp == NULL) goto done; for (count++, sp++; *sp != '\0'; sp++) { if (*sp == '[') count++; /* * ] as first char after open [ is skipped * \] is skipped * [^]] is skipped */ if (*sp == ']' && sp > sp2) { if (sp[-1] != '[' && sp[-1] != '\\') count--; else if ((sp - sp2) >= 2 && sp[-1] == '^' && sp[-2] == '[') ; else count--; } if (count == 0) { sp++; /* skip past ']' */ break; } } if (count > 0) { /* bad regex, give up */ goto done; } /* sp2 has start */ for (i = 0; classes[i].name != NULL; i++) { if (classes[i].warned) continue; len = classes[i].len; if ( len == (sp - sp2) && memcmp(sp2, classes[i].name, len) == 0) { found = true; break; } } if (found && ! classes[i].warned) { warning(_("regexp component `%.*s' should probably be `[%.*s]'"), len, sp2, len, sp2); classes[i].warned = true; } if (sp < end) { found = false; goto again; } done: s[length] = save; }