9
by Arnold D. Robbins
Moving to 2.13.2. |
1 |
/*
 * re.c - compile regular expressions.
 */

/*
 * Copyright (C) 1991-2016 the Free Software Foundation, Inc.
 *
 * This file is part of GAWK, the GNU implementation of the
 * AWK Programming Language.
 *
 * GAWK is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * GAWK is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
 */
|
25 |
||
26 |
#include "awk.h"

#include "localeinfo.h"

/*
 * Regex syntax bits currently in effect.  Chosen in resetup() and
 * adjusted for RE_ICASE each time make_regexp() compiles a pattern.
 */
static reg_syntax_t syn;

static void check_bracket_exp(char *s, size_t len);

/*
 * NOTE(review): no function named regexflags2str is defined in the part
 * of this file that is visible here (the definition further down is
 * reflags2str) --- confirm whether this prototype is stale.
 */
const char *regexflags2str(int flags);

/*
 * Locale description handed to the dfa matcher; filled in by resetup()
 * and queried by using_utf8().
 */
static struct localeinfo localeinfo;
35 |
||
21
by Arnold D. Robbins
Move to gawk-3.0.0. |
36 |
/* make_regexp --- generate compiled regular expressions */
|
14
by Arnold D. Robbins
Move to 2.15. |
37 |
|
9
by Arnold D. Robbins
Moving to 2.13.2. |
38 |
Regexp * |
319.1.9
by Arnold D. Robbins
Move to use of bool type, true, false, everywhere. |
39 |
make_regexp(const char *s, size_t len, bool ignorecase, bool dfa, bool canfatal) |
9
by Arnold D. Robbins
Moving to 2.13.2. |
40 |
{
|
319.1.1
by Arnold D. Robbins
Cleanups in io.c and improve RS as regexp. |
41 |
static char metas[] = ".*+(){}[]|?^$\\"; |
9
by Arnold D. Robbins
Moving to 2.13.2. |
42 |
Regexp *rp; |
18
by Arnold D. Robbins
Move to gawk-2.15.4. |
43 |
const char *rerr; |
30
by Arnold D. Robbins
Move to gawk-3.1.2. |
44 |
const char *src = s; |
40
by Arnold D. Robbins
Bring latest byte code gawk into git. Hurray! |
45 |
static char *buf = NULL; |
46 |
static size_t buflen; |
|
30
by Arnold D. Robbins
Move to gawk-3.1.2. |
47 |
const char *end = s + len; |
40
by Arnold D. Robbins
Bring latest byte code gawk into git. Hurray! |
48 |
char *dest; |
49 |
int c, c2; |
|
319.1.9
by Arnold D. Robbins
Move to use of bool type, true, false, everywhere. |
50 |
static bool first = true; |
51 |
static bool no_dfa = false; |
|
106
by Arnold D. Robbins
Clean up re.c; see ChangeLog. |
52 |
reg_syntax_t dfa_syn; |
319.1.1
by Arnold D. Robbins
Cleanups in io.c and improve RS as regexp. |
53 |
int i; |
33
by Arnold D. Robbins
Move to gawk 3.1.5. |
54 |
|
106
by Arnold D. Robbins
Clean up re.c; see ChangeLog. |
55 |
/*
|
56 |
* The number of bytes in the current multibyte character.
|
|
57 |
* It is 0, when the current character is a singlebyte character.
|
|
58 |
*/
|
|
29
by Arnold D. Robbins
Move to gawk-3.1.1. |
59 |
size_t is_multibyte = 0; |
60 |
mbstate_t mbs; |
|
30
by Arnold D. Robbins
Move to gawk-3.1.2. |
61 |
|
408.5.336
by Arnold D. Robbins
Remove MBS_SUPPORT ifdefs. |
62 |
memset(&mbs, 0, sizeof(mbstate_t)); /* Initialize. */ |
14
by Arnold D. Robbins
Move to 2.15. |
63 |
|
32
by Arnold D. Robbins
Move to gawk-3.1.4. |
64 |
if (first) { |
319.1.9
by Arnold D. Robbins
Move to use of bool type, true, false, everywhere. |
65 |
first = false; |
106
by Arnold D. Robbins
Clean up re.c; see ChangeLog. |
66 |
/* for debugging and testing */
|
67 |
no_dfa = (getenv("GAWK_NO_DFA") != NULL); |
|
32
by Arnold D. Robbins
Move to gawk-3.1.4. |
68 |
}
|
69 |
||
106
by Arnold D. Robbins
Clean up re.c; see ChangeLog. |
70 |
/* always check */
|
71 |
check_bracket_exp((char *) s, len); |
|
37
by Arnold D. Robbins
Bring in development gawk changes. |
72 |
|
14
by Arnold D. Robbins
Move to 2.15. |
73 |
/* Handle escaped characters first. */
|
74 |
||
21
by Arnold D. Robbins
Move to gawk-3.0.0. |
75 |
/*
|
40
by Arnold D. Robbins
Bring latest byte code gawk into git. Hurray! |
76 |
* Build a copy of the string (in buf) with the
|
21
by Arnold D. Robbins
Move to gawk-3.0.0. |
77 |
* escaped characters translated, and generate the regex
|
408.26.83
by Arnold D. Robbins
Remove trailing whitespace everywhere. Fix Unicode into ASCII. |
78 |
* from that.
|
21
by Arnold D. Robbins
Move to gawk-3.0.0. |
79 |
*/
|
40
by Arnold D. Robbins
Bring latest byte code gawk into git. Hurray! |
80 |
if (buf == NULL) { |
408.17.1
by Andrew J. Schorr
Stop allocating an extra wasted byte at the end of various strings. |
81 |
emalloc(buf, char *, len + 1, "make_regexp"); |
40
by Arnold D. Robbins
Bring latest byte code gawk into git. Hurray! |
82 |
buflen = len; |
83 |
} else if (len > buflen) { |
|
408.17.1
by Andrew J. Schorr
Stop allocating an extra wasted byte at the end of various strings. |
84 |
erealloc(buf, char *, len + 1, "make_regexp"); |
40
by Arnold D. Robbins
Bring latest byte code gawk into git. Hurray! |
85 |
buflen = len; |
86 |
}
|
|
87 |
dest = buf; |
|
14
by Arnold D. Robbins
Move to 2.15. |
88 |
|
89 |
while (src < end) { |
|
33
by Arnold D. Robbins
Move to gawk 3.1.5. |
90 |
if (gawk_mb_cur_max > 1 && ! is_multibyte) { |
29
by Arnold D. Robbins
Move to gawk-3.1.1. |
91 |
/* The previous byte is a singlebyte character, or last byte
|
92 |
of a multibyte character. We check the next character. */
|
|
93 |
is_multibyte = mbrlen(src, end - src, &mbs); |
|
319.1.1
by Arnold D. Robbins
Cleanups in io.c and improve RS as regexp. |
94 |
if ( is_multibyte == 1 |
95 |
|| is_multibyte == (size_t) -1 |
|
96 |
|| is_multibyte == (size_t) -2 |
|
97 |
|| is_multibyte == 0) { |
|
98 |
/* We treat it as a single-byte character. */
|
|
29
by Arnold D. Robbins
Move to gawk-3.1.1. |
99 |
is_multibyte = 0; |
100 |
}
|
|
101 |
}
|
|
102 |
||
103 |
/* We skip multibyte character, since it must not be a special
|
|
104 |
character. */
|
|
33
by Arnold D. Robbins
Move to gawk 3.1.5. |
105 |
if ((gawk_mb_cur_max == 1 || ! is_multibyte) && |
30
by Arnold D. Robbins
Move to gawk-3.1.2. |
106 |
(*src == '\\')) { |
14
by Arnold D. Robbins
Move to 2.15. |
107 |
c = *++src; |
108 |
switch (c) { |
|
109 |
case 'a': |
|
110 |
case 'b': |
|
111 |
case 'f': |
|
112 |
case 'n': |
|
113 |
case 'r': |
|
114 |
case 't': |
|
115 |
case 'v': |
|
116 |
case 'x': |
|
117 |
case '0': |
|
118 |
case '1': |
|
119 |
case '2': |
|
120 |
case '3': |
|
121 |
case '4': |
|
122 |
case '5': |
|
123 |
case '6': |
|
124 |
case '7': |
|
21
by Arnold D. Robbins
Move to gawk-3.0.0. |
125 |
c2 = parse_escape(&src); |
126 |
if (c2 < 0) |
|
14
by Arnold D. Robbins
Move to 2.15. |
127 |
cant_happen(); |
21
by Arnold D. Robbins
Move to gawk-3.0.0. |
128 |
/*
|
129 |
* Unix awk treats octal (and hex?) chars
|
|
130 |
* literally in re's, so escape regexp
|
|
131 |
* metacharacters.
|
|
132 |
*/
|
|
106
by Arnold D. Robbins
Clean up re.c; see ChangeLog. |
133 |
if (do_traditional |
134 |
&& ! do_posix |
|
135 |
&& (isdigit(c) || c == 'x') |
|
21
by Arnold D. Robbins
Move to gawk-3.0.0. |
136 |
&& strchr("()|*+?.^$\\[]", c2) != NULL) |
137 |
*dest++ = '\\'; |
|
138 |
*dest++ = (char) c2; |
|
14
by Arnold D. Robbins
Move to 2.15. |
139 |
break; |
22
by Arnold D. Robbins
Move to gawk-3.0.1. |
140 |
case '8': |
141 |
case '9': /* a\9b not valid */ |
|
142 |
*dest++ = c; |
|
143 |
src++; |
|
144 |
break; |
|
21
by Arnold D. Robbins
Move to gawk-3.0.0. |
145 |
case 'y': /* normally \b */ |
146 |
/* gnu regex op */
|
|
147 |
if (! do_traditional) { |
|
148 |
*dest++ = '\\'; |
|
149 |
*dest++ = 'b'; |
|
150 |
src++; |
|
151 |
break; |
|
152 |
}
|
|
153 |
/* else, fall through */
|
|
14
by Arnold D. Robbins
Move to 2.15. |
154 |
default: |
155 |
*dest++ = '\\'; |
|
21
by Arnold D. Robbins
Move to gawk-3.0.0. |
156 |
*dest++ = (char) c; |
14
by Arnold D. Robbins
Move to 2.15. |
157 |
src++; |
158 |
break; |
|
159 |
} /* switch */ |
|
32
by Arnold D. Robbins
Move to gawk-3.1.4. |
160 |
} else { |
161 |
c = *src; |
|
14
by Arnold D. Robbins
Move to 2.15. |
162 |
*dest++ = *src++; /* not '\\' */ |
32
by Arnold D. Robbins
Move to gawk-3.1.4. |
163 |
}
|
30
by Arnold D. Robbins
Move to gawk-3.1.2. |
164 |
if (gawk_mb_cur_max > 1 && is_multibyte) |
29
by Arnold D. Robbins
Move to gawk-3.1.1. |
165 |
is_multibyte--; |
30
by Arnold D. Robbins
Move to gawk-3.1.2. |
166 |
} /* while */ |
14
by Arnold D. Robbins
Move to 2.15. |
167 |
|
65
by Arnold D. Robbins
Expand ranges before compiling the regexp. |
168 |
*dest = '\0'; |
169 |
len = dest - buf; |
|
170 |
||
9
by Arnold D. Robbins
Moving to 2.13.2. |
171 |
emalloc(rp, Regexp *, sizeof(*rp), "make_regexp"); |
172 |
memset((char *) rp, 0, sizeof(*rp)); |
|
21
by Arnold D. Robbins
Move to gawk-3.0.0. |
173 |
rp->pat.allocated = 0; /* regex will allocate the buffer */ |
9
by Arnold D. Robbins
Moving to 2.13.2. |
174 |
emalloc(rp->pat.fastmap, char *, 256, "make_regexp"); |
175 |
||
32
by Arnold D. Robbins
Move to gawk-3.1.4. |
176 |
/*
|
177 |
* Lo these many years ago, had I known what a P.I.T.A. IGNORECASE
|
|
178 |
* was going to turn out to be, I wouldn't have bothered with it.
|
|
179 |
*
|
|
180 |
* In the case where we have a multibyte character set, we have no
|
|
181 |
* choice but to use RE_ICASE, since the casetable is for single-byte
|
|
182 |
* character sets only.
|
|
183 |
*
|
|
184 |
* On the other hand, if we do have a single-byte character set,
|
|
185 |
* using the casetable should give a performance improvement, since
|
|
186 |
* it's computed only once, not each time a regex is compiled. We
|
|
187 |
* also think it's probably better for portability. See the
|
|
188 |
* discussion by the definition of casetable[] in eval.c.
|
|
189 |
*/
|
|
33
by Arnold D. Robbins
Move to gawk 3.1.5. |
190 |
|
106
by Arnold D. Robbins
Clean up re.c; see ChangeLog. |
191 |
ignorecase = !! ignorecase; /* force to 1 or 0 */ |
32
by Arnold D. Robbins
Move to gawk-3.1.4. |
192 |
if (ignorecase) { |
193 |
if (gawk_mb_cur_max > 1) { |
|
194 |
syn |= RE_ICASE; |
|
195 |
rp->pat.translate = NULL; |
|
196 |
} else { |
|
197 |
syn &= ~RE_ICASE; |
|
34
by Arnold D. Robbins
Move to gawk-3.1.6. |
198 |
rp->pat.translate = (RE_TRANSLATE_TYPE) casetable; |
32
by Arnold D. Robbins
Move to gawk-3.1.4. |
199 |
}
|
200 |
} else { |
|
201 |
rp->pat.translate = NULL; |
|
202 |
syn &= ~RE_ICASE; |
|
203 |
}
|
|
204 |
||
408.19.250
by Arnold D. Robbins
Update dfa, including API changes. |
205 |
dfa_syn = syn; |
206 |
/* FIXME: dfa doesn't pay attention RE_ICASE */
|
|
207 |
if (ignorecase) |
|
208 |
dfa_syn |= RE_ICASE; |
|
209 |
||
32
by Arnold D. Robbins
Move to gawk-3.1.4. |
210 |
re_set_syntax(syn); |
211 |
||
40
by Arnold D. Robbins
Bring latest byte code gawk into git. Hurray! |
212 |
if ((rerr = re_compile_pattern(buf, len, &(rp->pat))) != NULL) { |
213 |
refree(rp); |
|
214 |
if (! canfatal) { |
|
106
by Arnold D. Robbins
Clean up re.c; see ChangeLog. |
215 |
/* rerr already gettextized inside regex routines */
|
216 |
error("%s: /%s/", rerr, buf); |
|
40
by Arnold D. Robbins
Bring latest byte code gawk into git. Hurray! |
217 |
return NULL; |
218 |
}
|
|
219 |
fatal("%s: /%s/", rerr, buf); |
|
220 |
}
|
|
20
by Arnold D. Robbins
Move to gawk-2.15.6. |
221 |
|
222 |
/* gack. this must be done *after* re_compile_pattern */
|
|
319.1.9
by Arnold D. Robbins
Move to use of bool type, true, false, everywhere. |
223 |
rp->pat.newline_anchor = false; /* don't get \n in middle of string */ |
32
by Arnold D. Robbins
Move to gawk-3.1.4. |
224 |
if (dfa && ! no_dfa) { |
40
by Arnold D. Robbins
Bring latest byte code gawk into git. Hurray! |
225 |
rp->dfareg = dfaalloc(); |
408.26.70
by Arnold D. Robbins
Merge grep dfa. |
226 |
dfasyntax(rp->dfareg, & localeinfo, dfa_syn, |
408.26.74
by Arnold D. Robbins
Use dfa's new ability to handle anchors. |
227 |
(ignorecase ? DFA_CASE_FOLD : 0) | DFA_ANCHOR); |
319.1.9
by Arnold D. Robbins
Move to use of bool type, true, false, everywhere. |
228 |
dfacomp(buf, len, rp->dfareg, true); |
32
by Arnold D. Robbins
Move to gawk-3.1.4. |
229 |
} else |
408.26.63
by Arnold D. Robbins
Remove avoid_dfa. Simplify dfa usage and rearrange callers in re.c. |
230 |
rp->dfareg = NULL; |
319.1.1
by Arnold D. Robbins
Cleanups in io.c and improve RS as regexp. |
231 |
|
232 |
/* Additional flags that help with RS as regexp. */
|
|
233 |
for (i = 0; i < len; i++) { |
|
234 |
if (strchr(metas, buf[i]) != NULL) { |
|
319.1.9
by Arnold D. Robbins
Move to use of bool type, true, false, everywhere. |
235 |
rp->has_meta = true; |
319.1.1
by Arnold D. Robbins
Cleanups in io.c and improve RS as regexp. |
236 |
break; |
237 |
}
|
|
238 |
}
|
|
239 |
||
240 |
for (i = len - 1; i >= 0; i--) { |
|
241 |
if (strchr("*+|?", buf[i]) != NULL) { |
|
319.1.9
by Arnold D. Robbins
Move to use of bool type, true, false, everywhere. |
242 |
rp->maybe_long = true; |
319.1.1
by Arnold D. Robbins
Cleanups in io.c and improve RS as regexp. |
243 |
break; |
244 |
}
|
|
245 |
}
|
|
408.26.83
by Arnold D. Robbins
Remove trailing whitespace everywhere. Fix Unicode into ASCII. |
246 |
|
9
by Arnold D. Robbins
Moving to 2.13.2. |
247 |
return rp; |
248 |
}
|
|
249 |
||
32
by Arnold D. Robbins
Move to gawk-3.1.4. |
250 |
/* research --- do a regexp search. use dfa if possible */
|
21
by Arnold D. Robbins
Move to gawk-3.0.0. |
251 |
|
9
by Arnold D. Robbins
Moving to 2.13.2. |
252 |
int
|
40
by Arnold D. Robbins
Bring latest byte code gawk into git. Hurray! |
253 |
research(Regexp *rp, char *str, int start, |
254 |
size_t len, int flags) |
|
9
by Arnold D. Robbins
Moving to 2.13.2. |
255 |
{
|
30
by Arnold D. Robbins
Move to gawk-3.1.2. |
256 |
const char *ret = str; |
408.19.165
by Arnold D. Robbins
Sync dfa with grep. |
257 |
bool try_backref = false; |
33
by Arnold D. Robbins
Move to gawk 3.1.5. |
258 |
int need_start; |
259 |
int no_bol; |
|
260 |
int res; |
|
261 |
||
262 |
need_start = ((flags & RE_NEED_START) != 0); |
|
263 |
no_bol = ((flags & RE_NO_BOL) != 0); |
|
264 |
||
265 |
if (no_bol) |
|
266 |
rp->pat.not_bol = 1; |
|
32
by Arnold D. Robbins
Move to gawk-3.1.4. |
267 |
|
268 |
/*
|
|
408.19.229
by Arnold D. Robbins
Use dfa even in multibyte locales. |
269 |
* Always do dfa search if can; if it fails, we won't bother
|
270 |
* with the regex search.
|
|
33
by Arnold D. Robbins
Move to gawk 3.1.5. |
271 |
*
|
272 |
* The dfa matcher doesn't have a no_bol flag, so don't bother
|
|
273 |
* trying it in that case.
|
|
34
by Arnold D. Robbins
Move to gawk-3.1.6. |
274 |
*
|
408.19.229
by Arnold D. Robbins
Use dfa even in multibyte locales. |
275 |
* 7/2016: The dfa matcher can't handle a case where searching
|
276 |
* starts in the middle of a string, so don't bother trying it
|
|
277 |
* in that case.
|
|
32
by Arnold D. Robbins
Move to gawk-3.1.4. |
278 |
*/
|
408.26.63
by Arnold D. Robbins
Remove avoid_dfa. Simplify dfa usage and rearrange callers in re.c. |
279 |
if (rp->dfareg != NULL && ! no_bol && start == 0) { |
408.5.325
by Arnold D. Robbins
Use dfa superset to speed up matching. |
280 |
struct dfa *superset = dfasuperset(rp->dfareg); |
281 |
if (superset) |
|
282 |
ret = dfaexec(superset, str+start, str+start+len, |
|
283 |
true, NULL, NULL); |
|
408.26.63
by Arnold D. Robbins
Remove avoid_dfa. Simplify dfa usage and rearrange callers in re.c. |
284 |
|
408.26.74
by Arnold D. Robbins
Use dfa's new ability to handle anchors. |
285 |
if (ret && (! need_start |
408.26.63
by Arnold D. Robbins
Remove avoid_dfa. Simplify dfa usage and rearrange callers in re.c. |
286 |
|| (! superset && dfaisfast(rp->dfareg)))) |
408.5.325
by Arnold D. Robbins
Use dfa superset to speed up matching. |
287 |
ret = dfaexec(rp->dfareg, str+start, str+start+len, |
408.26.63
by Arnold D. Robbins
Remove avoid_dfa. Simplify dfa usage and rearrange callers in re.c. |
288 |
true, NULL, &try_backref); |
32
by Arnold D. Robbins
Move to gawk-3.1.4. |
289 |
}
|
30
by Arnold D. Robbins
Move to gawk-3.1.2. |
290 |
|
9
by Arnold D. Robbins
Moving to 2.13.2. |
291 |
if (ret) { |
408.26.63
by Arnold D. Robbins
Remove avoid_dfa. Simplify dfa usage and rearrange callers in re.c. |
292 |
if ( rp->dfareg == NULL |
293 |
|| start != 0 |
|
294 |
|| no_bol |
|
295 |
|| need_start |
|
296 |
|| try_backref) { |
|
32
by Arnold D. Robbins
Move to gawk-3.1.4. |
297 |
/*
|
298 |
* Passing NULL as last arg speeds up search for cases
|
|
299 |
* where we don't need the start/end info.
|
|
300 |
*/
|
|
33
by Arnold D. Robbins
Move to gawk 3.1.5. |
301 |
res = re_search(&(rp->pat), str, start+len, |
30
by Arnold D. Robbins
Move to gawk-3.1.2. |
302 |
start, len, need_start ? &(rp->regs) : NULL); |
32
by Arnold D. Robbins
Move to gawk-3.1.4. |
303 |
} else |
33
by Arnold D. Robbins
Move to gawk 3.1.5. |
304 |
res = 1; |
21
by Arnold D. Robbins
Move to gawk-3.0.0. |
305 |
} else |
33
by Arnold D. Robbins
Move to gawk 3.1.5. |
306 |
res = -1; |
307 |
||
308 |
rp->pat.not_bol = 0; |
|
309 |
return res; |
|
9
by Arnold D. Robbins
Moving to 2.13.2. |
310 |
}
|
311 |
||
21
by Arnold D. Robbins
Move to gawk-3.0.0. |
312 |
/* refree --- free up the dynamic memory used by a compiled regexp */
|
313 |
||
9
by Arnold D. Robbins
Moving to 2.13.2. |
314 |
void
|
28
by Arnold D. Robbins
Move to gawk-3.1.0. |
315 |
refree(Regexp *rp) |
9
by Arnold D. Robbins
Moving to 2.13.2. |
316 |
{
|
40
by Arnold D. Robbins
Bring latest byte code gawk into git. Hurray! |
317 |
if (rp == NULL) |
408.26.83
by Arnold D. Robbins
Remove trailing whitespace everywhere. Fix Unicode into ASCII. |
318 |
return; |
30
by Arnold D. Robbins
Move to gawk-3.1.2. |
319 |
rp->pat.translate = NULL; |
320 |
regfree(& rp->pat); |
|
20
by Arnold D. Robbins
Move to gawk-2.15.6. |
321 |
if (rp->regs.start) |
322 |
free(rp->regs.start); |
|
323 |
if (rp->regs.end) |
|
324 |
free(rp->regs.end); |
|
408.26.63
by Arnold D. Robbins
Remove avoid_dfa. Simplify dfa usage and rearrange callers in re.c. |
325 |
if (rp->dfareg != NULL) { |
36
by Arnold D. Robbins
Move to 3.1.8. |
326 |
dfafree(rp->dfareg); |
327 |
free(rp->dfareg); |
|
328 |
}
|
|
40
by Arnold D. Robbins
Bring latest byte code gawk into git. Hurray! |
329 |
efree(rp); |
9
by Arnold D. Robbins
Moving to 2.13.2. |
330 |
}
|
40
by Arnold D. Robbins
Bring latest byte code gawk into git. Hurray! |
331 |
|
32
by Arnold D. Robbins
Move to gawk-3.1.4. |
332 |
/* dfaerror --- print an error message for the dfa routines */
|
333 |
||
334 |
void
|
|
335 |
dfaerror(const char *s) |
|
336 |
{
|
|
337 |
fatal("%s", s); |
|
277.1.114
by Arnold D. Robbins
Fix compile warnings on DJGPP. |
338 |
exit(EXIT_FATAL); /* for DJGPP */ |
32
by Arnold D. Robbins
Move to gawk-3.1.4. |
339 |
}
|
9
by Arnold D. Robbins
Moving to 2.13.2. |
340 |
|
21
by Arnold D. Robbins
Move to gawk-3.0.0. |
341 |
/* re_update --- recompile a dynamic regexp */
|
342 |
||
9
by Arnold D. Robbins
Moving to 2.13.2. |
343 |
Regexp * |
28
by Arnold D. Robbins
Move to gawk-3.1.0. |
344 |
re_update(NODE *t) |
9
by Arnold D. Robbins
Moving to 2.13.2. |
345 |
{
|
346 |
NODE *t1; |
|
347 |
||
348 |
if ((t->re_flags & CASE) == IGNORECASE) { |
|
106
by Arnold D. Robbins
Clean up re.c; see ChangeLog. |
349 |
/* regex was compiled with settings matching IGNORECASE */
|
35
by Arnold D. Robbins
Move to gawk-3.1.7. |
350 |
if ((t->re_flags & CONSTANT) != 0) { |
106
by Arnold D. Robbins
Clean up re.c; see ChangeLog. |
351 |
/* it's a constant, so just return it as is */
|
408.26.46
by Arnold D. Robbins
Remove typed regexps until they can be done correctly. |
352 |
assert(t->type == Node_regex); |
9
by Arnold D. Robbins
Moving to 2.13.2. |
353 |
return t->re_reg; |
29
by Arnold D. Robbins
Move to gawk-3.1.1. |
354 |
}
|
40
by Arnold D. Robbins
Bring latest byte code gawk into git. Hurray! |
355 |
t1 = t->re_exp; |
21
by Arnold D. Robbins
Move to gawk-3.0.0. |
356 |
if (t->re_text != NULL) { |
106
by Arnold D. Robbins
Clean up re.c; see ChangeLog. |
357 |
/* if contents haven't changed, just return it */
|
408.26.64
by Arnold D. Robbins
New POSIX rules for string comparison. |
358 |
if (cmp_nodes(t->re_text, t1, true) == 0) |
9
by Arnold D. Robbins
Moving to 2.13.2. |
359 |
return t->re_reg; |
106
by Arnold D. Robbins
Clean up re.c; see ChangeLog. |
360 |
/* things changed, fall through to recompile */
|
9
by Arnold D. Robbins
Moving to 2.13.2. |
361 |
unref(t->re_text); |
362 |
}
|
|
106
by Arnold D. Robbins
Clean up re.c; see ChangeLog. |
363 |
/* get fresh copy of the text of the regexp */
|
9
by Arnold D. Robbins
Moving to 2.13.2. |
364 |
t->re_text = dupnode(t1); |
365 |
}
|
|
106
by Arnold D. Robbins
Clean up re.c; see ChangeLog. |
366 |
/* was compiled with different IGNORECASE or text changed */
|
367 |
||
368 |
/* free old */
|
|
21
by Arnold D. Robbins
Move to gawk-3.0.0. |
369 |
if (t->re_reg != NULL) |
9
by Arnold D. Robbins
Moving to 2.13.2. |
370 |
refree(t->re_reg); |
32
by Arnold D. Robbins
Move to gawk-3.1.4. |
371 |
if (t->re_cnt > 0) |
372 |
t->re_cnt++; |
|
373 |
if (t->re_cnt > 10) |
|
374 |
t->re_cnt = 0; |
|
26
by Arnold D. Robbins
Move to gawk-3.0.5. |
375 |
if (t->re_text == NULL || (t->re_flags & CASE) != IGNORECASE) { |
106
by Arnold D. Robbins
Clean up re.c; see ChangeLog. |
376 |
/* reset regexp text if needed */
|
40
by Arnold D. Robbins
Bring latest byte code gawk into git. Hurray! |
377 |
t1 = t->re_exp; |
28
by Arnold D. Robbins
Move to gawk-3.1.0. |
378 |
unref(t->re_text); |
9
by Arnold D. Robbins
Moving to 2.13.2. |
379 |
t->re_text = dupnode(t1); |
380 |
}
|
|
106
by Arnold D. Robbins
Clean up re.c; see ChangeLog. |
381 |
/* compile it */
|
18
by Arnold D. Robbins
Move to gawk-2.15.4. |
382 |
t->re_reg = make_regexp(t->re_text->stptr, t->re_text->stlen, |
319.1.9
by Arnold D. Robbins
Move to use of bool type, true, false, everywhere. |
383 |
IGNORECASE, t->re_cnt, true); |
40
by Arnold D. Robbins
Bring latest byte code gawk into git. Hurray! |
384 |
|
106
by Arnold D. Robbins
Clean up re.c; see ChangeLog. |
385 |
/* clear case flag */
|
9
by Arnold D. Robbins
Moving to 2.13.2. |
386 |
t->re_flags &= ~CASE; |
106
by Arnold D. Robbins
Clean up re.c; see ChangeLog. |
387 |
/* set current value of case flag */
|
9
by Arnold D. Robbins
Moving to 2.13.2. |
388 |
t->re_flags |= IGNORECASE; |
389 |
return t->re_reg; |
|
390 |
}
|
|
13
by Arnold D. Robbins
Move to 2.14. |
391 |
|
21
by Arnold D. Robbins
Move to gawk-3.0.0. |
392 |
/* resetup --- choose what kind of regexps we match */
|
393 |
||
13
by Arnold D. Robbins
Move to 2.14. |
394 |
void
|
395 |
resetup() |
|
396 |
{
|
|
408.26.66
by Arnold D. Robbins
Merge multithreaded dfa into gawk. |
397 |
// init localeinfo for dfa
|
398 |
init_localeinfo(& localeinfo); |
|
399 |
||
408.4.46
by Arnold D. Robbins
Add a comment in re.c:resetup. |
400 |
/*
|
401 |
* Syntax bits: _that_ is yet another mind trip. Recreational drugs
|
|
402 |
* are helpful for recovering from the experience.
|
|
403 |
*
|
|
404 |
* Aharon Robbins <arnold@skeeve.com>
|
|
405 |
* Sun, 21 Oct 2007 23:55:33 +0200
|
|
406 |
*/
|
|
21
by Arnold D. Robbins
Move to gawk-3.0.0. |
407 |
if (do_posix) |
408 |
syn = RE_SYNTAX_POSIX_AWK; /* strict POSIX re's */ |
|
267
by Arnold D. Robbins
Make ranges be character based all the time. |
409 |
else if (do_traditional) |
21
by Arnold D. Robbins
Move to gawk-3.0.0. |
410 |
syn = RE_SYNTAX_AWK; /* traditional Unix awk re's */ |
267
by Arnold D. Robbins
Make ranges be character based all the time. |
411 |
else
|
21
by Arnold D. Robbins
Move to gawk-3.0.0. |
412 |
syn = RE_SYNTAX_GNU_AWK; /* POSIX re's + GNU ops */ |
413 |
||
414 |
/*
|
|
37
by Arnold D. Robbins
Bring in development gawk changes. |
415 |
* Interval expressions are now on by default, as POSIX is
|
416 |
* wide-spread enough that people want it. The do_intervals
|
|
417 |
* variable remains for use with --traditional.
|
|
21
by Arnold D. Robbins
Move to gawk-3.0.0. |
418 |
*/
|
419 |
if (do_intervals) |
|
277.1.208
by Arnold D. Robbins
Fix bug with --traditional + --re-interval. |
420 |
syn |= RE_INTERVALS | RE_INVALID_INTERVAL_ORD | RE_NO_BK_BRACES; |
18
by Arnold D. Robbins
Move to gawk-2.15.4. |
421 |
|
422 |
(void) re_set_syntax(syn); |
|
408.26.66
by Arnold D. Robbins
Merge multithreaded dfa into gawk. |
423 |
}
|
424 |
||
425 |
/* using_utf8 --- are we using utf8 */
|
|
426 |
||
427 |
bool
|
|
428 |
using_utf8(void) |
|
429 |
{
|
|
430 |
return localeinfo.using_utf8; |
|
32
by Arnold D. Robbins
Move to gawk-3.1.4. |
431 |
}
|
432 |
||
319.1.9
by Arnold D. Robbins
Move to use of bool type, true, false, everywhere. |
433 |
/* reisstring --- return true if the RE match is a simple string match */
|
28
by Arnold D. Robbins
Move to gawk-3.1.0. |
434 |
|
435 |
int
|
|
30
by Arnold D. Robbins
Move to gawk-3.1.2. |
436 |
reisstring(const char *text, size_t len, Regexp *re, const char *buf) |
28
by Arnold D. Robbins
Move to gawk-3.1.0. |
437 |
{
|
438 |
int res; |
|
30
by Arnold D. Robbins
Move to gawk-3.1.2. |
439 |
const char *matched; |
28
by Arnold D. Robbins
Move to gawk-3.1.0. |
440 |
|
319.1.1
by Arnold D. Robbins
Cleanups in io.c and improve RS as regexp. |
441 |
/* simple checking for meta characters in re */
|
442 |
if (re->has_meta) |
|
319.1.9
by Arnold D. Robbins
Move to use of bool type, true, false, everywhere. |
443 |
return false; /* give up early, can't be string match */ |
28
by Arnold D. Robbins
Move to gawk-3.1.0. |
444 |
|
445 |
/* make accessable to gdb */
|
|
446 |
matched = &buf[RESTART(re, buf)]; |
|
447 |
||
33
by Arnold D. Robbins
Move to gawk 3.1.5. |
448 |
res = (memcmp(text, matched, len) == 0); |
28
by Arnold D. Robbins
Move to gawk-3.1.0. |
449 |
|
450 |
return res; |
|
451 |
}
|
|
30
by Arnold D. Robbins
Move to gawk-3.1.2. |
452 |
|
31
by Arnold D. Robbins
Move to gawk-3.1.3. |
453 |
/* reflags2str --- make a regex flags value readable */
|
454 |
||
455 |
const char * |
|
456 |
reflags2str(int flagval) |
|
457 |
{
|
|
458 |
static const struct flagtab values[] = { |
|
459 |
{ RE_BACKSLASH_ESCAPE_IN_LISTS, "RE_BACKSLASH_ESCAPE_IN_LISTS" }, |
|
460 |
{ RE_BK_PLUS_QM, "RE_BK_PLUS_QM" }, |
|
461 |
{ RE_CHAR_CLASSES, "RE_CHAR_CLASSES" }, |
|
462 |
{ RE_CONTEXT_INDEP_ANCHORS, "RE_CONTEXT_INDEP_ANCHORS" }, |
|
463 |
{ RE_CONTEXT_INDEP_OPS, "RE_CONTEXT_INDEP_OPS" }, |
|
464 |
{ RE_CONTEXT_INVALID_OPS, "RE_CONTEXT_INVALID_OPS" }, |
|
465 |
{ RE_DOT_NEWLINE, "RE_DOT_NEWLINE" }, |
|
466 |
{ RE_DOT_NOT_NULL, "RE_DOT_NOT_NULL" }, |
|
467 |
{ RE_HAT_LISTS_NOT_NEWLINE, "RE_HAT_LISTS_NOT_NEWLINE" }, |
|
468 |
{ RE_INTERVALS, "RE_INTERVALS" }, |
|
469 |
{ RE_LIMITED_OPS, "RE_LIMITED_OPS" }, |
|
470 |
{ RE_NEWLINE_ALT, "RE_NEWLINE_ALT" }, |
|
471 |
{ RE_NO_BK_BRACES, "RE_NO_BK_BRACES" }, |
|
472 |
{ RE_NO_BK_PARENS, "RE_NO_BK_PARENS" }, |
|
473 |
{ RE_NO_BK_REFS, "RE_NO_BK_REFS" }, |
|
474 |
{ RE_NO_BK_VBAR, "RE_NO_BK_VBAR" }, |
|
475 |
{ RE_NO_EMPTY_RANGES, "RE_NO_EMPTY_RANGES" }, |
|
476 |
{ RE_UNMATCHED_RIGHT_PAREN_ORD, "RE_UNMATCHED_RIGHT_PAREN_ORD" }, |
|
477 |
{ RE_NO_POSIX_BACKTRACKING, "RE_NO_POSIX_BACKTRACKING" }, |
|
478 |
{ RE_NO_GNU_OPS, "RE_NO_GNU_OPS" }, |
|
479 |
{ RE_INVALID_INTERVAL_ORD, "RE_INVALID_INTERVAL_ORD" }, |
|
480 |
{ RE_ICASE, "RE_ICASE" }, |
|
33
by Arnold D. Robbins
Move to gawk 3.1.5. |
481 |
{ RE_CARET_ANCHORS_HERE, "RE_CARET_ANCHORS_HERE" }, |
482 |
{ RE_CONTEXT_INVALID_DUP, "RE_CONTEXT_INVALID_DUP" }, |
|
483 |
{ RE_NO_SUB, "RE_NO_SUB" }, |
|
31
by Arnold D. Robbins
Move to gawk-3.1.3. |
484 |
{ 0, NULL }, |
485 |
};
|
|
486 |
||
35
by Arnold D. Robbins
Move to gawk-3.1.7. |
487 |
if (flagval == RE_SYNTAX_EMACS) /* == 0 */ |
488 |
return "RE_SYNTAX_EMACS"; |
|
489 |
||
31
by Arnold D. Robbins
Move to gawk-3.1.3. |
490 |
return genflags2str(flagval, values); |
491 |
}
|
|
37
by Arnold D. Robbins
Bring in development gawk changes. |
492 |
|
106
by Arnold D. Robbins
Clean up re.c; see ChangeLog. |
493 |
/*
 * dfawarn --- warning hook for the dfa routines
 *
 * The dfa routines call dfawarn() whenever a regex is compiled, and
 * every user of the dfa code must supply one.
 */

void
dfawarn(const char *dfa_warning)
{
	/*
	 * Deliberately empty: gawk does its own (better) check
	 * for bad [[:foo:]] syntax.
	 */
	(void) dfa_warning;
}
|
|
506 |
||
37
by Arnold D. Robbins
Bring in development gawk changes. |
507 |
/*
 * check_bracket_exp --- look for /[:space:]/ that should be /[[:space:]]/
 *
 * Scans the LENGTH bytes at S for bracket expressions.  When a complete
 * bracket expression consists of nothing but a character class name,
 * e.g. [:alpha:], a warning is issued, once per class name for the life
 * of the program.
 *
 * S[LENGTH] is temporarily overwritten with a NUL while scanning and is
 * restored before returning, so that byte must be writable.
 */

static void
check_bracket_exp(char *s, size_t length)
{
	static struct reclass {
		const char *name;
		size_t len;
		bool warned;
	} classes[] = {
		/*
		 * Ordered by what we hope is frequency,
		 * since it's linear searched.
		 */
		{ "[:alpha:]", 9, false },
		{ "[:digit:]", 9, false },
		{ "[:alnum:]", 9, false },
		{ "[:upper:]", 9, false },
		{ "[:lower:]", 9, false },
		{ "[:space:]", 9, false },
		{ "[:xdigit:]", 10, false },
		{ "[:punct:]", 9, false },
		{ "[:print:]", 9, false },
		{ "[:graph:]", 9, false },
		{ "[:cntrl:]", 9, false },
		{ "[:blank:]", 9, false },
		{ NULL, 0, false }
	};
	int i;
	bool found = false;
	char save;
	char *sp, *sp2, *end;
	int len = 0;
	int count = 0;		/* bracket nesting depth */

	if (length == 0)
		return;

	end = s + length;
	save = s[length];
	s[length] = '\0';
	sp = s;

again:
	/* sp2 remembers where this bracket expression starts */
	sp = sp2 = memchr(sp, '[', (end - sp));
	if (sp == NULL)
		goto done;

	for (count++, sp++; *sp != '\0'; sp++) {
		if (*sp == '[')
			count++;
		/*
		 * ] as first char after open [ is skipped
		 * \] is skipped
		 * [^]] is skipped
		 *
		 * Any other ] closes one level.
		 */
		if (*sp == ']' && sp > sp2) {
			if (sp[-1] == '[' || sp[-1] == '\\')
				;	/* literal ], not a close */
			else if ((sp - sp2) >= 2
				 && sp[-1] == '^' && sp[-2] == '[')
				;	/* literal ] in a negated list */
			else
				count--;
		}

		if (count == 0) {
			sp++;	/* skip past ']' */
			break;
		}
	}

	if (count > 0) {	/* bad regex, give up */
		goto done;
	}

	/* sp2 has start; sp is just past the matching ] */

	for (i = 0; classes[i].name != NULL; i++) {
		if (classes[i].warned)
			continue;
		len = classes[i].len;
		if (   len == (sp - sp2)
		    && memcmp(sp2, classes[i].name, len) == 0) {
			found = true;
			break;
		}
	}

	if (found && ! classes[i].warned) {
		warning(_("regexp component `%.*s' should probably be `[%.*s]'"),
				len, sp2, len, sp2);
		classes[i].warned = true;
	}

	/* keep looking for further bracket expressions */
	if (sp < end) {
		found = false;
		goto again;
	}
done:
	s[length] = save;
}
|