~vcs-imports/gawk/master

9 by Arnold D. Robbins
Moving to 2.13.2.
1
/*
2
 * re.c - compile regular expressions.
3
 */
4
408.26.83 by Arnold D. Robbins
Remove trailing whitespace everywhere. Fix Unicode into ASCII.
5
/*
408.19.184 by Arnold D. Robbins
Changes toward release and test tarball.
6
 * Copyright (C) 1991-2016 the Free Software Foundation, Inc.
408.26.83 by Arnold D. Robbins
Remove trailing whitespace everywhere. Fix Unicode into ASCII.
7
 *
9 by Arnold D. Robbins
Moving to 2.13.2.
8
 * This file is part of GAWK, the GNU implementation of the
21 by Arnold D. Robbins
Move to gawk-3.0.0.
9
 * AWK Programming Language.
408.26.83 by Arnold D. Robbins
Remove trailing whitespace everywhere. Fix Unicode into ASCII.
10
 *
9 by Arnold D. Robbins
Moving to 2.13.2.
11
 * GAWK is free software; you can redistribute it and/or modify
12
 * it under the terms of the GNU General Public License as published by
34 by Arnold D. Robbins
Move to gawk-3.1.6.
13
 * the Free Software Foundation; either version 3 of the License, or
12 by Arnold D. Robbins
Move to 2.13.3 (from 2.13.tar.gz - sigh).
14
 * (at your option) any later version.
408.26.83 by Arnold D. Robbins
Remove trailing whitespace everywhere. Fix Unicode into ASCII.
15
 *
9 by Arnold D. Robbins
Moving to 2.13.2.
16
 * GAWK is distributed in the hope that it will be useful,
17
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19
 * GNU General Public License for more details.
408.26.83 by Arnold D. Robbins
Remove trailing whitespace everywhere. Fix Unicode into ASCII.
20
 *
9 by Arnold D. Robbins
Moving to 2.13.2.
21
 * You should have received a copy of the GNU General Public License
21 by Arnold D. Robbins
Move to gawk-3.0.0.
22
 * along with this program; if not, write to the Free Software
33 by Arnold D. Robbins
Move to gawk 3.1.5.
23
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
9 by Arnold D. Robbins
Moving to 2.13.2.
24
 */
25
26
#include "awk.h"
27
408.26.66 by Arnold D. Robbins
Merge multithreaded dfa into gawk.
28
#include "localeinfo.h"
29
21 by Arnold D. Robbins
Move to gawk-3.0.0.
30
static reg_syntax_t syn;
37 by Arnold D. Robbins
Bring in development gawk changes.
31
static void check_bracket_exp(char *s, size_t len);
277.1.208 by Arnold D. Robbins
Fix bug with --traditional + --re-interval.
32
const char *regexflags2str(int flags);
21 by Arnold D. Robbins
Move to gawk-3.0.0.
33
408.26.66 by Arnold D. Robbins
Merge multithreaded dfa into gawk.
34
static struct localeinfo localeinfo;
35
21 by Arnold D. Robbins
Move to gawk-3.0.0.
36
/* make_regexp --- generate compiled regular expressions */
14 by Arnold D. Robbins
Move to 2.15.
37
9 by Arnold D. Robbins
Moving to 2.13.2.
38
Regexp *
319.1.9 by Arnold D. Robbins
Move to use of bool type, true, false, everywhere.
39
make_regexp(const char *s, size_t len, bool ignorecase, bool dfa, bool canfatal)
9 by Arnold D. Robbins
Moving to 2.13.2.
40
{
319.1.1 by Arnold D. Robbins
Cleanups in io.c and improve RS as regexp.
41
	static char metas[] = ".*+(){}[]|?^$\\";
9 by Arnold D. Robbins
Moving to 2.13.2.
42
	Regexp *rp;
18 by Arnold D. Robbins
Move to gawk-2.15.4.
43
	const char *rerr;
30 by Arnold D. Robbins
Move to gawk-3.1.2.
44
	const char *src = s;
40 by Arnold D. Robbins
Bring latest byte code gawk into git. Hurray!
45
	static char *buf = NULL;
46
	static size_t buflen;
30 by Arnold D. Robbins
Move to gawk-3.1.2.
47
	const char *end = s + len;
40 by Arnold D. Robbins
Bring latest byte code gawk into git. Hurray!
48
	char *dest;
49
	int c, c2;
319.1.9 by Arnold D. Robbins
Move to use of bool type, true, false, everywhere.
50
	static bool first = true;
51
	static bool no_dfa = false;
106 by Arnold D. Robbins
Clean up re.c; see ChangeLog.
52
	reg_syntax_t dfa_syn;
319.1.1 by Arnold D. Robbins
Cleanups in io.c and improve RS as regexp.
53
	int i;
33 by Arnold D. Robbins
Move to gawk 3.1.5.
54
106 by Arnold D. Robbins
Clean up re.c; see ChangeLog.
55
	/*
56
	 * The number of bytes in the current multibyte character.
57
	 * It is 0, when the current character is a singlebyte character.
58
	 */
29 by Arnold D. Robbins
Move to gawk-3.1.1.
59
	size_t is_multibyte = 0;
60
	mbstate_t mbs;
30 by Arnold D. Robbins
Move to gawk-3.1.2.
61
408.5.336 by Arnold D. Robbins
Remove MBS_SUPPORT ifdefs.
62
	memset(&mbs, 0, sizeof(mbstate_t)); /* Initialize.  */
14 by Arnold D. Robbins
Move to 2.15.
63
32 by Arnold D. Robbins
Move to gawk-3.1.4.
64
	if (first) {
319.1.9 by Arnold D. Robbins
Move to use of bool type, true, false, everywhere.
65
		first = false;
106 by Arnold D. Robbins
Clean up re.c; see ChangeLog.
66
		/* for debugging and testing */
67
		no_dfa = (getenv("GAWK_NO_DFA") != NULL);
32 by Arnold D. Robbins
Move to gawk-3.1.4.
68
	}
69
106 by Arnold D. Robbins
Clean up re.c; see ChangeLog.
70
	/* always check */
71
	check_bracket_exp((char *) s, len);
37 by Arnold D. Robbins
Bring in development gawk changes.
72
14 by Arnold D. Robbins
Move to 2.15.
73
	/* Handle escaped characters first. */
74
21 by Arnold D. Robbins
Move to gawk-3.0.0.
75
	/*
40 by Arnold D. Robbins
Bring latest byte code gawk into git. Hurray!
76
	 * Build a copy of the string (in buf) with the
21 by Arnold D. Robbins
Move to gawk-3.0.0.
77
	 * escaped characters translated, and generate the regex
408.26.83 by Arnold D. Robbins
Remove trailing whitespace everywhere. Fix Unicode into ASCII.
78
	 * from that.
21 by Arnold D. Robbins
Move to gawk-3.0.0.
79
	 */
40 by Arnold D. Robbins
Bring latest byte code gawk into git. Hurray!
80
	if (buf == NULL) {
408.17.1 by Andrew J. Schorr
Stop allocating an extra wasted byte at the end of various strings.
81
		emalloc(buf, char *, len + 1, "make_regexp");
40 by Arnold D. Robbins
Bring latest byte code gawk into git. Hurray!
82
		buflen = len;
83
	} else if (len > buflen) {
408.17.1 by Andrew J. Schorr
Stop allocating an extra wasted byte at the end of various strings.
84
		erealloc(buf, char *, len + 1, "make_regexp");
40 by Arnold D. Robbins
Bring latest byte code gawk into git. Hurray!
85
		buflen = len;
86
	}
87
	dest = buf;
14 by Arnold D. Robbins
Move to 2.15.
88
89
	while (src < end) {
33 by Arnold D. Robbins
Move to gawk 3.1.5.
90
		if (gawk_mb_cur_max > 1 && ! is_multibyte) {
29 by Arnold D. Robbins
Move to gawk-3.1.1.
91
			/* The previous byte is a singlebyte character, or last byte
92
			   of a multibyte character.  We check the next character.  */
93
			is_multibyte = mbrlen(src, end - src, &mbs);
319.1.1 by Arnold D. Robbins
Cleanups in io.c and improve RS as regexp.
94
			if (   is_multibyte == 1
95
			    || is_multibyte == (size_t) -1
96
			    || is_multibyte == (size_t) -2
97
			    || is_multibyte == 0) {
98
				/* We treat it as a single-byte character.  */
29 by Arnold D. Robbins
Move to gawk-3.1.1.
99
				is_multibyte = 0;
100
			}
101
		}
102
103
		/* We skip multibyte character, since it must not be a special
104
		   character.  */
33 by Arnold D. Robbins
Move to gawk 3.1.5.
105
		if ((gawk_mb_cur_max == 1 || ! is_multibyte) &&
30 by Arnold D. Robbins
Move to gawk-3.1.2.
106
		    (*src == '\\')) {
14 by Arnold D. Robbins
Move to 2.15.
107
			c = *++src;
108
			switch (c) {
109
			case 'a':
110
			case 'b':
111
			case 'f':
112
			case 'n':
113
			case 'r':
114
			case 't':
115
			case 'v':
116
			case 'x':
117
			case '0':
118
			case '1':
119
			case '2':
120
			case '3':
121
			case '4':
122
			case '5':
123
			case '6':
124
			case '7':
21 by Arnold D. Robbins
Move to gawk-3.0.0.
125
				c2 = parse_escape(&src);
126
				if (c2 < 0)
14 by Arnold D. Robbins
Move to 2.15.
127
					cant_happen();
21 by Arnold D. Robbins
Move to gawk-3.0.0.
128
				/*
129
				 * Unix awk treats octal (and hex?) chars
130
				 * literally in re's, so escape regexp
131
				 * metacharacters.
132
				 */
106 by Arnold D. Robbins
Clean up re.c; see ChangeLog.
133
				if (do_traditional
134
				    && ! do_posix
135
				    && (isdigit(c) || c == 'x')
21 by Arnold D. Robbins
Move to gawk-3.0.0.
136
				    && strchr("()|*+?.^$\\[]", c2) != NULL)
137
					*dest++ = '\\';
138
				*dest++ = (char) c2;
14 by Arnold D. Robbins
Move to 2.15.
139
				break;
22 by Arnold D. Robbins
Move to gawk-3.0.1.
140
			case '8':
141
			case '9':	/* a\9b not valid */
142
				*dest++ = c;
143
				src++;
144
				break;
21 by Arnold D. Robbins
Move to gawk-3.0.0.
145
			case 'y':	/* normally \b */
146
				/* gnu regex op */
147
				if (! do_traditional) {
148
					*dest++ = '\\';
149
					*dest++ = 'b';
150
					src++;
151
					break;
152
				}
153
				/* else, fall through */
14 by Arnold D. Robbins
Move to 2.15.
154
			default:
155
				*dest++ = '\\';
21 by Arnold D. Robbins
Move to gawk-3.0.0.
156
				*dest++ = (char) c;
14 by Arnold D. Robbins
Move to 2.15.
157
				src++;
158
				break;
159
			} /* switch */
32 by Arnold D. Robbins
Move to gawk-3.1.4.
160
		} else {
161
			c = *src;
14 by Arnold D. Robbins
Move to 2.15.
162
			*dest++ = *src++;	/* not '\\' */
32 by Arnold D. Robbins
Move to gawk-3.1.4.
163
		}
30 by Arnold D. Robbins
Move to gawk-3.1.2.
164
		if (gawk_mb_cur_max > 1 && is_multibyte)
29 by Arnold D. Robbins
Move to gawk-3.1.1.
165
			is_multibyte--;
30 by Arnold D. Robbins
Move to gawk-3.1.2.
166
	} /* while */
14 by Arnold D. Robbins
Move to 2.15.
167
65 by Arnold D. Robbins
Expand ranges before compiling the regexp.
168
	*dest = '\0';
169
	len = dest - buf;
170
9 by Arnold D. Robbins
Moving to 2.13.2.
171
	emalloc(rp, Regexp *, sizeof(*rp), "make_regexp");
172
	memset((char *) rp, 0, sizeof(*rp));
21 by Arnold D. Robbins
Move to gawk-3.0.0.
173
	rp->pat.allocated = 0;	/* regex will allocate the buffer */
9 by Arnold D. Robbins
Moving to 2.13.2.
174
	emalloc(rp->pat.fastmap, char *, 256, "make_regexp");
175
32 by Arnold D. Robbins
Move to gawk-3.1.4.
176
	/*
177
	 * Lo these many years ago, had I known what a P.I.T.A. IGNORECASE
178
	 * was going to turn out to be, I wouldn't have bothered with it.
179
	 *
180
	 * In the case where we have a multibyte character set, we have no
181
	 * choice but to use RE_ICASE, since the casetable is for single-byte
182
	 * character sets only.
183
	 *
184
	 * On the other hand, if we do have a single-byte character set,
185
	 * using the casetable should give  a performance improvement, since
186
	 * it's computed only once, not each time a regex is compiled.  We
187
	 * also think it's probably better for portability.  See the
188
	 * discussion by the definition of casetable[] in eval.c.
189
	 */
33 by Arnold D. Robbins
Move to gawk 3.1.5.
190
106 by Arnold D. Robbins
Clean up re.c; see ChangeLog.
191
	ignorecase = !! ignorecase;	/* force to 1 or 0 */
32 by Arnold D. Robbins
Move to gawk-3.1.4.
192
	if (ignorecase) {
193
		if (gawk_mb_cur_max > 1) {
194
			syn |= RE_ICASE;
195
			rp->pat.translate = NULL;
196
		} else {
197
			syn &= ~RE_ICASE;
34 by Arnold D. Robbins
Move to gawk-3.1.6.
198
			rp->pat.translate = (RE_TRANSLATE_TYPE) casetable;
32 by Arnold D. Robbins
Move to gawk-3.1.4.
199
		}
200
	} else {
201
		rp->pat.translate = NULL;
202
		syn &= ~RE_ICASE;
203
	}
204
408.19.250 by Arnold D. Robbins
Update dfa, including API changes.
205
	dfa_syn = syn;
206
	/* FIXME: dfa doesn't pay attention RE_ICASE */
207
	if (ignorecase)
208
		dfa_syn |= RE_ICASE;
209
32 by Arnold D. Robbins
Move to gawk-3.1.4.
210
	re_set_syntax(syn);
211
40 by Arnold D. Robbins
Bring latest byte code gawk into git. Hurray!
212
	if ((rerr = re_compile_pattern(buf, len, &(rp->pat))) != NULL) {
213
		refree(rp);
214
		if (! canfatal) {
106 by Arnold D. Robbins
Clean up re.c; see ChangeLog.
215
			/* rerr already gettextized inside regex routines */
216
			error("%s: /%s/", rerr, buf);
40 by Arnold D. Robbins
Bring latest byte code gawk into git. Hurray!
217
 			return NULL;
218
		}
219
		fatal("%s: /%s/", rerr, buf);
220
	}
20 by Arnold D. Robbins
Move to gawk-2.15.6.
221
222
	/* gack. this must be done *after* re_compile_pattern */
319.1.9 by Arnold D. Robbins
Move to use of bool type, true, false, everywhere.
223
	rp->pat.newline_anchor = false; /* don't get \n in middle of string */
32 by Arnold D. Robbins
Move to gawk-3.1.4.
224
	if (dfa && ! no_dfa) {
40 by Arnold D. Robbins
Bring latest byte code gawk into git. Hurray!
225
		rp->dfareg = dfaalloc();
408.26.70 by Arnold D. Robbins
Merge grep dfa.
226
		dfasyntax(rp->dfareg, & localeinfo, dfa_syn,
408.26.74 by Arnold D. Robbins
Use dfa's new ability to handle anchors.
227
			  (ignorecase ? DFA_CASE_FOLD : 0) | DFA_ANCHOR);
319.1.9 by Arnold D. Robbins
Move to use of bool type, true, false, everywhere.
228
		dfacomp(buf, len, rp->dfareg, true);
32 by Arnold D. Robbins
Move to gawk-3.1.4.
229
	} else
408.26.63 by Arnold D. Robbins
Remove avoid_dfa. Simplify dfa usage and rearrange callers in re.c.
230
		rp->dfareg = NULL;
319.1.1 by Arnold D. Robbins
Cleanups in io.c and improve RS as regexp.
231
232
	/* Additional flags that help with RS as regexp. */
233
	for (i = 0; i < len; i++) {
234
		if (strchr(metas, buf[i]) != NULL) {
319.1.9 by Arnold D. Robbins
Move to use of bool type, true, false, everywhere.
235
			rp->has_meta = true;
319.1.1 by Arnold D. Robbins
Cleanups in io.c and improve RS as regexp.
236
			break;
237
		}
238
	}
239
240
	for (i = len - 1; i >= 0; i--) {
241
		if (strchr("*+|?", buf[i]) != NULL) {
319.1.9 by Arnold D. Robbins
Move to use of bool type, true, false, everywhere.
242
			rp->maybe_long = true;
319.1.1 by Arnold D. Robbins
Cleanups in io.c and improve RS as regexp.
243
			break;
244
		}
245
	}
408.26.83 by Arnold D. Robbins
Remove trailing whitespace everywhere. Fix Unicode into ASCII.
246
9 by Arnold D. Robbins
Moving to 2.13.2.
247
	return rp;
248
}
249
32 by Arnold D. Robbins
Move to gawk-3.1.4.
250
/* research --- do a regexp search. use dfa if possible */
21 by Arnold D. Robbins
Move to gawk-3.0.0.
251
9 by Arnold D. Robbins
Moving to 2.13.2.
252
int
40 by Arnold D. Robbins
Bring latest byte code gawk into git. Hurray!
253
research(Regexp *rp, char *str, int start,
254
	 size_t len, int flags)
9 by Arnold D. Robbins
Moving to 2.13.2.
255
{
30 by Arnold D. Robbins
Move to gawk-3.1.2.
256
	const char *ret = str;
408.19.165 by Arnold D. Robbins
Sync dfa with grep.
257
	bool try_backref = false;
33 by Arnold D. Robbins
Move to gawk 3.1.5.
258
	int need_start;
259
	int no_bol;
260
	int res;
261
262
	need_start = ((flags & RE_NEED_START) != 0);
263
	no_bol = ((flags & RE_NO_BOL) != 0);
264
265
	if (no_bol)
266
		rp->pat.not_bol = 1;
32 by Arnold D. Robbins
Move to gawk-3.1.4.
267
268
	/*
408.19.229 by Arnold D. Robbins
Use dfa even in multibyte locales.
269
	 * Always do dfa search if can; if it fails, we won't bother
270
	 * with the regex search.
33 by Arnold D. Robbins
Move to gawk 3.1.5.
271
	 *
272
	 * The dfa matcher doesn't have a no_bol flag, so don't bother
273
	 * trying it in that case.
34 by Arnold D. Robbins
Move to gawk-3.1.6.
274
	 *
408.19.229 by Arnold D. Robbins
Use dfa even in multibyte locales.
275
	 * 7/2016: The dfa matcher can't handle a case where searching
276
	 * starts in the middle of a string, so don't bother trying it
277
	 * in that case.
32 by Arnold D. Robbins
Move to gawk-3.1.4.
278
	 */
408.26.63 by Arnold D. Robbins
Remove avoid_dfa. Simplify dfa usage and rearrange callers in re.c.
279
	if (rp->dfareg != NULL && ! no_bol && start == 0) {
408.5.325 by Arnold D. Robbins
Use dfa superset to speed up matching.
280
		struct dfa *superset = dfasuperset(rp->dfareg);
281
		if (superset)
282
			ret = dfaexec(superset, str+start, str+start+len,
283
							true, NULL, NULL);
408.26.63 by Arnold D. Robbins
Remove avoid_dfa. Simplify dfa usage and rearrange callers in re.c.
284
408.26.74 by Arnold D. Robbins
Use dfa's new ability to handle anchors.
285
		if (ret && (! need_start
408.26.63 by Arnold D. Robbins
Remove avoid_dfa. Simplify dfa usage and rearrange callers in re.c.
286
				|| (! superset && dfaisfast(rp->dfareg))))
408.5.325 by Arnold D. Robbins
Use dfa superset to speed up matching.
287
			ret = dfaexec(rp->dfareg, str+start, str+start+len,
408.26.63 by Arnold D. Robbins
Remove avoid_dfa. Simplify dfa usage and rearrange callers in re.c.
288
						true, NULL, &try_backref);
32 by Arnold D. Robbins
Move to gawk-3.1.4.
289
	}
30 by Arnold D. Robbins
Move to gawk-3.1.2.
290
9 by Arnold D. Robbins
Moving to 2.13.2.
291
	if (ret) {
408.26.63 by Arnold D. Robbins
Remove avoid_dfa. Simplify dfa usage and rearrange callers in re.c.
292
		if (   rp->dfareg == NULL
293
			|| start != 0
294
			|| no_bol
295
			|| need_start
296
			|| try_backref) {
32 by Arnold D. Robbins
Move to gawk-3.1.4.
297
			/*
298
			 * Passing NULL as last arg speeds up search for cases
299
			 * where we don't need the start/end info.
300
			 */
33 by Arnold D. Robbins
Move to gawk 3.1.5.
301
			res = re_search(&(rp->pat), str, start+len,
30 by Arnold D. Robbins
Move to gawk-3.1.2.
302
				start, len, need_start ? &(rp->regs) : NULL);
32 by Arnold D. Robbins
Move to gawk-3.1.4.
303
		} else
33 by Arnold D. Robbins
Move to gawk 3.1.5.
304
			res = 1;
21 by Arnold D. Robbins
Move to gawk-3.0.0.
305
	} else
33 by Arnold D. Robbins
Move to gawk 3.1.5.
306
		res = -1;
307
308
	rp->pat.not_bol = 0;
309
	return res;
9 by Arnold D. Robbins
Moving to 2.13.2.
310
}
311
21 by Arnold D. Robbins
Move to gawk-3.0.0.
312
/* refree --- free up the dynamic memory used by a compiled regexp */
313
9 by Arnold D. Robbins
Moving to 2.13.2.
314
void
28 by Arnold D. Robbins
Move to gawk-3.1.0.
315
refree(Regexp *rp)
9 by Arnold D. Robbins
Moving to 2.13.2.
316
{
40 by Arnold D. Robbins
Bring latest byte code gawk into git. Hurray!
317
	if (rp == NULL)
408.26.83 by Arnold D. Robbins
Remove trailing whitespace everywhere. Fix Unicode into ASCII.
318
		return;
30 by Arnold D. Robbins
Move to gawk-3.1.2.
319
	rp->pat.translate = NULL;
320
	regfree(& rp->pat);
20 by Arnold D. Robbins
Move to gawk-2.15.6.
321
	if (rp->regs.start)
322
		free(rp->regs.start);
323
	if (rp->regs.end)
324
		free(rp->regs.end);
408.26.63 by Arnold D. Robbins
Remove avoid_dfa. Simplify dfa usage and rearrange callers in re.c.
325
	if (rp->dfareg != NULL) {
36 by Arnold D. Robbins
Move to 3.1.8.
326
		dfafree(rp->dfareg);
327
		free(rp->dfareg);
328
	}
40 by Arnold D. Robbins
Bring latest byte code gawk into git. Hurray!
329
	efree(rp);
9 by Arnold D. Robbins
Moving to 2.13.2.
330
}
40 by Arnold D. Robbins
Bring latest byte code gawk into git. Hurray!
331
32 by Arnold D. Robbins
Move to gawk-3.1.4.
332
/* dfaerror --- print an error message for the dfa routines */
333
334
void
335
dfaerror(const char *s)
336
{
337
	fatal("%s", s);
277.1.114 by Arnold D. Robbins
Fix compile warnings on DJGPP.
338
	exit(EXIT_FATAL);	/* for DJGPP */
32 by Arnold D. Robbins
Move to gawk-3.1.4.
339
}
9 by Arnold D. Robbins
Moving to 2.13.2.
340
21 by Arnold D. Robbins
Move to gawk-3.0.0.
341
/* re_update --- recompile a dynamic regexp */
342
9 by Arnold D. Robbins
Moving to 2.13.2.
343
Regexp *
28 by Arnold D. Robbins
Move to gawk-3.1.0.
344
re_update(NODE *t)
9 by Arnold D. Robbins
Moving to 2.13.2.
345
{
346
	NODE *t1;
347
348
	if ((t->re_flags & CASE) == IGNORECASE) {
106 by Arnold D. Robbins
Clean up re.c; see ChangeLog.
349
		/* regex was compiled with settings matching IGNORECASE */
35 by Arnold D. Robbins
Move to gawk-3.1.7.
350
		if ((t->re_flags & CONSTANT) != 0) {
106 by Arnold D. Robbins
Clean up re.c; see ChangeLog.
351
			/* it's a constant, so just return it as is */
408.26.46 by Arnold D. Robbins
Remove typed regexps until they can be done correctly.
352
			assert(t->type == Node_regex);
9 by Arnold D. Robbins
Moving to 2.13.2.
353
			return t->re_reg;
29 by Arnold D. Robbins
Move to gawk-3.1.1.
354
		}
40 by Arnold D. Robbins
Bring latest byte code gawk into git. Hurray!
355
		t1 = t->re_exp;
21 by Arnold D. Robbins
Move to gawk-3.0.0.
356
		if (t->re_text != NULL) {
106 by Arnold D. Robbins
Clean up re.c; see ChangeLog.
357
			/* if contents haven't changed, just return it */
408.26.64 by Arnold D. Robbins
New POSIX rules for string comparison.
358
			if (cmp_nodes(t->re_text, t1, true) == 0)
9 by Arnold D. Robbins
Moving to 2.13.2.
359
				return t->re_reg;
106 by Arnold D. Robbins
Clean up re.c; see ChangeLog.
360
			/* things changed, fall through to recompile */
9 by Arnold D. Robbins
Moving to 2.13.2.
361
			unref(t->re_text);
362
		}
106 by Arnold D. Robbins
Clean up re.c; see ChangeLog.
363
		/* get fresh copy of the text of the regexp */
9 by Arnold D. Robbins
Moving to 2.13.2.
364
		t->re_text = dupnode(t1);
365
	}
106 by Arnold D. Robbins
Clean up re.c; see ChangeLog.
366
	/* was compiled with different IGNORECASE or text changed */
367
368
	/* free old */
21 by Arnold D. Robbins
Move to gawk-3.0.0.
369
	if (t->re_reg != NULL)
9 by Arnold D. Robbins
Moving to 2.13.2.
370
		refree(t->re_reg);
32 by Arnold D. Robbins
Move to gawk-3.1.4.
371
	if (t->re_cnt > 0)
372
		t->re_cnt++;
373
	if (t->re_cnt > 10)
374
		t->re_cnt = 0;
26 by Arnold D. Robbins
Move to gawk-3.0.5.
375
	if (t->re_text == NULL || (t->re_flags & CASE) != IGNORECASE) {
106 by Arnold D. Robbins
Clean up re.c; see ChangeLog.
376
		/* reset regexp text if needed */
40 by Arnold D. Robbins
Bring latest byte code gawk into git. Hurray!
377
		t1 = t->re_exp;
28 by Arnold D. Robbins
Move to gawk-3.1.0.
378
		unref(t->re_text);
9 by Arnold D. Robbins
Moving to 2.13.2.
379
		t->re_text = dupnode(t1);
380
	}
106 by Arnold D. Robbins
Clean up re.c; see ChangeLog.
381
	/* compile it */
18 by Arnold D. Robbins
Move to gawk-2.15.4.
382
	t->re_reg = make_regexp(t->re_text->stptr, t->re_text->stlen,
319.1.9 by Arnold D. Robbins
Move to use of bool type, true, false, everywhere.
383
				IGNORECASE, t->re_cnt, true);
40 by Arnold D. Robbins
Bring latest byte code gawk into git. Hurray!
384
106 by Arnold D. Robbins
Clean up re.c; see ChangeLog.
385
	/* clear case flag */
9 by Arnold D. Robbins
Moving to 2.13.2.
386
	t->re_flags &= ~CASE;
106 by Arnold D. Robbins
Clean up re.c; see ChangeLog.
387
	/* set current value of case flag */
9 by Arnold D. Robbins
Moving to 2.13.2.
388
	t->re_flags |= IGNORECASE;
389
	return t->re_reg;
390
}
13 by Arnold D. Robbins
Move to 2.14.
391
21 by Arnold D. Robbins
Move to gawk-3.0.0.
392
/* resetup --- choose what kind of regexps we match */
393
13 by Arnold D. Robbins
Move to 2.14.
394
void
395
resetup()
396
{
408.26.66 by Arnold D. Robbins
Merge multithreaded dfa into gawk.
397
	// init localeinfo for dfa
398
	init_localeinfo(& localeinfo);
399
408.4.46 by Arnold D. Robbins
Add a comment in re.c:resetup.
400
	/*
401
	 * Syntax bits: _that_ is yet another mind trip.  Recreational drugs
402
	 * are helpful for recovering from the experience.
403
	 *
404
	 *	Aharon Robbins <arnold@skeeve.com>
405
	 *	Sun, 21 Oct 2007 23:55:33 +0200
406
	 */
21 by Arnold D. Robbins
Move to gawk-3.0.0.
407
	if (do_posix)
408
		syn = RE_SYNTAX_POSIX_AWK;	/* strict POSIX re's */
267 by Arnold D. Robbins
Make ranges be character based all the time.
409
	else if (do_traditional)
21 by Arnold D. Robbins
Move to gawk-3.0.0.
410
		syn = RE_SYNTAX_AWK;		/* traditional Unix awk re's */
267 by Arnold D. Robbins
Make ranges be character based all the time.
411
	else
21 by Arnold D. Robbins
Move to gawk-3.0.0.
412
		syn = RE_SYNTAX_GNU_AWK;	/* POSIX re's + GNU ops */
413
414
	/*
37 by Arnold D. Robbins
Bring in development gawk changes.
415
	 * Interval expressions are now on by default, as POSIX is
416
	 * wide-spread enough that people want it. The do_intervals
417
	 * variable remains for use with --traditional.
21 by Arnold D. Robbins
Move to gawk-3.0.0.
418
	 */
419
	if (do_intervals)
277.1.208 by Arnold D. Robbins
Fix bug with --traditional + --re-interval.
420
		syn |= RE_INTERVALS | RE_INVALID_INTERVAL_ORD | RE_NO_BK_BRACES;
18 by Arnold D. Robbins
Move to gawk-2.15.4.
421
422
	(void) re_set_syntax(syn);
408.26.66 by Arnold D. Robbins
Merge multithreaded dfa into gawk.
423
}
424
425
/* using_utf8 --- are we using utf8 */
426
427
bool
428
using_utf8(void)
429
{
430
	return localeinfo.using_utf8;
32 by Arnold D. Robbins
Move to gawk-3.1.4.
431
}
432
319.1.9 by Arnold D. Robbins
Move to use of bool type, true, false, everywhere.
433
/* reisstring --- return true if the RE match is a simple string match */
28 by Arnold D. Robbins
Move to gawk-3.1.0.
434
435
int
30 by Arnold D. Robbins
Move to gawk-3.1.2.
436
reisstring(const char *text, size_t len, Regexp *re, const char *buf)
28 by Arnold D. Robbins
Move to gawk-3.1.0.
437
{
438
	int res;
30 by Arnold D. Robbins
Move to gawk-3.1.2.
439
	const char *matched;
28 by Arnold D. Robbins
Move to gawk-3.1.0.
440
319.1.1 by Arnold D. Robbins
Cleanups in io.c and improve RS as regexp.
441
	/* simple checking for meta characters in re */
442
	if (re->has_meta)
319.1.9 by Arnold D. Robbins
Move to use of bool type, true, false, everywhere.
443
		return false;	/* give up early, can't be string match */
28 by Arnold D. Robbins
Move to gawk-3.1.0.
444
445
	/* make accessable to gdb */
446
	matched = &buf[RESTART(re, buf)];
447
33 by Arnold D. Robbins
Move to gawk 3.1.5.
448
	res = (memcmp(text, matched, len) == 0);
28 by Arnold D. Robbins
Move to gawk-3.1.0.
449
450
	return res;
451
}
30 by Arnold D. Robbins
Move to gawk-3.1.2.
452
31 by Arnold D. Robbins
Move to gawk-3.1.3.
453
/* reflags2str --- make a regex flags value readable */
454
455
const char *
456
reflags2str(int flagval)
457
{
458
	static const struct flagtab values[] = {
459
		{ RE_BACKSLASH_ESCAPE_IN_LISTS, "RE_BACKSLASH_ESCAPE_IN_LISTS" },
460
		{ RE_BK_PLUS_QM, "RE_BK_PLUS_QM" },
461
		{ RE_CHAR_CLASSES, "RE_CHAR_CLASSES" },
462
		{ RE_CONTEXT_INDEP_ANCHORS, "RE_CONTEXT_INDEP_ANCHORS" },
463
		{ RE_CONTEXT_INDEP_OPS, "RE_CONTEXT_INDEP_OPS" },
464
		{ RE_CONTEXT_INVALID_OPS, "RE_CONTEXT_INVALID_OPS" },
465
		{ RE_DOT_NEWLINE, "RE_DOT_NEWLINE" },
466
		{ RE_DOT_NOT_NULL, "RE_DOT_NOT_NULL" },
467
		{ RE_HAT_LISTS_NOT_NEWLINE, "RE_HAT_LISTS_NOT_NEWLINE" },
468
		{ RE_INTERVALS, "RE_INTERVALS" },
469
		{ RE_LIMITED_OPS, "RE_LIMITED_OPS" },
470
		{ RE_NEWLINE_ALT, "RE_NEWLINE_ALT" },
471
		{ RE_NO_BK_BRACES, "RE_NO_BK_BRACES" },
472
		{ RE_NO_BK_PARENS, "RE_NO_BK_PARENS" },
473
		{ RE_NO_BK_REFS, "RE_NO_BK_REFS" },
474
		{ RE_NO_BK_VBAR, "RE_NO_BK_VBAR" },
475
		{ RE_NO_EMPTY_RANGES, "RE_NO_EMPTY_RANGES" },
476
		{ RE_UNMATCHED_RIGHT_PAREN_ORD, "RE_UNMATCHED_RIGHT_PAREN_ORD" },
477
		{ RE_NO_POSIX_BACKTRACKING, "RE_NO_POSIX_BACKTRACKING" },
478
		{ RE_NO_GNU_OPS, "RE_NO_GNU_OPS" },
479
		{ RE_INVALID_INTERVAL_ORD, "RE_INVALID_INTERVAL_ORD" },
480
		{ RE_ICASE, "RE_ICASE" },
33 by Arnold D. Robbins
Move to gawk 3.1.5.
481
		{ RE_CARET_ANCHORS_HERE, "RE_CARET_ANCHORS_HERE" },
482
		{ RE_CONTEXT_INVALID_DUP, "RE_CONTEXT_INVALID_DUP" },
483
		{ RE_NO_SUB, "RE_NO_SUB" },
31 by Arnold D. Robbins
Move to gawk-3.1.3.
484
		{ 0,	NULL },
485
	};
486
35 by Arnold D. Robbins
Move to gawk-3.1.7.
487
	if (flagval == RE_SYNTAX_EMACS) /* == 0 */
488
		return "RE_SYNTAX_EMACS";
489
31 by Arnold D. Robbins
Move to gawk-3.1.3.
490
	return genflags2str(flagval, values);
491
}
37 by Arnold D. Robbins
Bring in development gawk changes.
492
106 by Arnold D. Robbins
Clean up re.c; see ChangeLog.
493
/*
 * dfawarn --- warning hook for the dfa routines
 *
 * The dfa routines may call dfawarn() when a regex is compiled, so a
 * program that uses them must supply one.
 */

void
dfawarn(const char *dfa_warning)
{
	/*
	 * Deliberately empty: gawk does its own (better) check
	 * for bad [[:foo:]] syntax.
	 */
	(void) dfa_warning;
}
506
37 by Arnold D. Robbins
Bring in development gawk changes.
507
/* check_bracket_exp --- look for /[:space:]/ that should be /[[:space:]]/ */

/*
 * Scan the LENGTH bytes at S for bracket expressions whose entire text
 * is a bare character class name such as [:space:], and warn about the
 * first one found for each class.
 *
 * S[LENGTH] is temporarily overwritten with a NUL and restored before
 * returning, so the byte just past the pattern must be writable.
 */

static void
check_bracket_exp(char *s, size_t length)
{
	static struct reclass {
		const char *name;
		size_t len;
		bool warned;	/* warn only once per class */
	} classes[] = {
		/*
		 * Ordered by what we hope is frequency,
		 * since it's linear searched.
		 */
		{ "[:alpha:]", 9, false },
		{ "[:digit:]", 9, false },
		{ "[:alnum:]", 9, false },
		{ "[:upper:]", 9, false },
		{ "[:lower:]", 9, false },
		{ "[:space:]", 9, false },
		{ "[:xdigit:]", 10, false },
		{ "[:punct:]", 9, false },
		{ "[:print:]", 9, false },
		{ "[:graph:]", 9, false },
		{ "[:cntrl:]", 9, false },
		{ "[:blank:]", 9, false },
		{ NULL, 0, false }
	};
	int i;
	bool found = false;
	char save;
	char *sp, *sp2, *end;
	int len = 0;
	int count = 0;	/* nesting depth of '[' seen so far */

	if (length == 0)
		return;

	end = s + length;
	save = s[length];
	s[length] = '\0';
	sp = s;

again:
	/* sp2 marks the opening '[', sp walks forward to its matching ']' */
	sp = sp2 = memchr(sp, '[', (end - sp));
	if (sp == NULL)
		goto done;

	for (count++, sp++; *sp != '\0'; sp++) {
		if (*sp == '[')
			count++;
		/*
		 * ] as first char after open [ is skipped
		 * \] is skipped
		 * [^]] is skipped
		 *
		 * Any other ] closes one level.
		 */
		if (*sp == ']' && sp > sp2) {
			if (sp[-1] == '[' || sp[-1] == '\\')
				;	/* literal ], not a closer */
			else if ((sp - sp2) >= 2
				 && sp[-1] == '^' && sp[-2] == '[')
				;	/* literal ] right after [^ */
			else
				count--;
		}

		if (count == 0) {
			sp++;	/* skip past ']' */
			break;
		}
	}

	if (count > 0) {	/* bad regex, give up */
		goto done;
	}

	/* sp2 has start */

	for (i = 0; classes[i].name != NULL; i++) {
		if (classes[i].warned)
			continue;
		len = classes[i].len;
		if (   len == (sp - sp2)
		    && memcmp(sp2, classes[i].name, len) == 0) {
			found = true;
			break;
		}
	}

	if (found && ! classes[i].warned) {
		warning(_("regexp component `%.*s' should probably be `[%.*s]'"),
				len, sp2, len, sp2);
		classes[i].warned = true;
	}

	/* keep looking for further bracket expressions in the pattern */
	if (sp < end) {
		found = false;
		goto again;
	}
done:
	s[length] = save;
}