1
/* Extended regular expression matching and search library.
2
Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 Free
3
Software Foundation, Inc.
4
This file is part of the GNU C Library.
5
Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
7
This program is free software; you can redistribute it and/or modify
8
it under the terms of the GNU General Public License as published by
9
the Free Software Foundation; either version 3, or (at your option)
12
This program is distributed in the hope that it will be useful,
13
but WITHOUT ANY WARRANTY; without even the implied warranty of
14
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
GNU General Public License for more details.
17
You should have received a copy of the GNU General Public License along
18
with this program; if not, write to the Free Software Foundation,
19
Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
21
static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
22
size_t length, reg_syntax_t syntax);
23
static void re_compile_fastmap_iter (regex_t *bufp,
24
const re_dfastate_t *init_state,
26
static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
28
static void free_charset (re_charset_t *cset);
29
#endif /* RE_ENABLE_I18N */
30
static void free_workarea_compile (regex_t *preg);
31
static reg_errcode_t create_initial_state (re_dfa_t *dfa);
33
static void optimize_utf8 (re_dfa_t *dfa);
35
static reg_errcode_t analyze (regex_t *preg);
36
static reg_errcode_t preorder (bin_tree_t *root,
37
reg_errcode_t (fn (void *, bin_tree_t *)),
39
static reg_errcode_t postorder (bin_tree_t *root,
40
reg_errcode_t (fn (void *, bin_tree_t *)),
42
static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
43
static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
44
static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
46
static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
47
static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
48
static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
49
static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
50
static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
51
unsigned int constraint);
52
static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
53
static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
55
static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
56
static Idx fetch_number (re_string_t *input, re_token_t *token,
58
static int peek_token (re_token_t *token, re_string_t *input,
59
reg_syntax_t syntax) internal_function;
60
static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
61
reg_syntax_t syntax, reg_errcode_t *err);
62
static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
63
re_token_t *token, reg_syntax_t syntax,
64
Idx nest, reg_errcode_t *err);
65
static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
66
re_token_t *token, reg_syntax_t syntax,
67
Idx nest, reg_errcode_t *err);
68
static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
69
re_token_t *token, reg_syntax_t syntax,
70
Idx nest, reg_errcode_t *err);
71
static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
72
re_token_t *token, reg_syntax_t syntax,
73
Idx nest, reg_errcode_t *err);
74
static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
75
re_dfa_t *dfa, re_token_t *token,
76
reg_syntax_t syntax, reg_errcode_t *err);
77
static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
78
re_token_t *token, reg_syntax_t syntax,
80
static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
82
re_token_t *token, int token_len,
86
static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
90
static reg_errcode_t build_equiv_class (bitset_t sbcset,
92
Idx *equiv_class_alloc,
93
const unsigned char *name);
94
static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
97
Idx *char_class_alloc,
98
const unsigned char *class_name,
100
#else /* not RE_ENABLE_I18N */
101
static reg_errcode_t build_equiv_class (bitset_t sbcset,
102
const unsigned char *name);
103
static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
105
const unsigned char *class_name,
106
reg_syntax_t syntax);
107
#endif /* not RE_ENABLE_I18N */
108
static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
109
RE_TRANSLATE_TYPE trans,
110
const unsigned char *class_name,
111
const unsigned char *extra,
112
bool non_match, reg_errcode_t *err);
113
static bin_tree_t *create_tree (re_dfa_t *dfa,
114
bin_tree_t *left, bin_tree_t *right,
115
re_token_type_t type);
116
static bin_tree_t *create_token_tree (re_dfa_t *dfa,
117
bin_tree_t *left, bin_tree_t *right,
118
const re_token_t *token);
119
static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
120
static void free_token (re_token_t *node);
121
static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
122
static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
124
/* This table gives an error message for each of the error codes listed
125
in regex.h. Obviously the order here has to be same as there.
126
POSIX doesn't require that we do anything for REG_NOERROR,
127
but why not be nice? */
129
static const char __re_error_msgid[] =
131
#define REG_NOERROR_IDX 0
132
gettext_noop ("Success") /* REG_NOERROR */
134
#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
135
gettext_noop ("No match") /* REG_NOMATCH */
137
#define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
138
gettext_noop ("Invalid regular expression") /* REG_BADPAT */
140
#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
141
gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
143
#define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
144
gettext_noop ("Invalid character class name") /* REG_ECTYPE */
146
#define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
147
gettext_noop ("Trailing backslash") /* REG_EESCAPE */
149
#define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
150
gettext_noop ("Invalid back reference") /* REG_ESUBREG */
152
#define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
153
gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
155
#define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
156
gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
158
#define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
159
gettext_noop ("Unmatched \\{") /* REG_EBRACE */
161
#define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
162
gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
164
#define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
165
gettext_noop ("Invalid range end") /* REG_ERANGE */
167
#define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
168
gettext_noop ("Memory exhausted") /* REG_ESPACE */
170
#define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
171
gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
173
#define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
174
gettext_noop ("Premature end of regular expression") /* REG_EEND */
176
#define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
177
gettext_noop ("Regular expression too big") /* REG_ESIZE */
179
#define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
180
gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
183
static const size_t __re_error_msgid_idx[] =
204
/* Entry points for GNU code. */
206
/* re_compile_pattern is the GNU regular expression compiler: it
207
compiles PATTERN (of length LENGTH) and puts the result in BUFP.
208
Returns 0 if the pattern was valid, otherwise an error string.
210
Assumes the `allocated' (and perhaps `buffer') and `translate' fields
211
are set in BUFP on entry. */
215
re_compile_pattern (pattern, length, bufp)
218
struct re_pattern_buffer *bufp;
219
#else /* size_t might promote */
221
re_compile_pattern (const char *pattern, size_t length,
222
struct re_pattern_buffer *bufp)
227
/* And GNU code determines whether or not to get register information
228
by passing null for the REGS argument to re_match, etc., not by
229
setting no_sub, unless RE_NO_SUB is set. */
230
bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
232
/* Match anchors at newline. */
233
bufp->newline_anchor = 1;
235
ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
239
return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
242
weak_alias (__re_compile_pattern, re_compile_pattern)
245
/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
246
also be assigned to arbitrarily: each pattern buffer stores its own
247
syntax, so it can be changed between regex compilations. */
248
/* This has no initializer because initialized variables in Emacs
249
become read-only after dumping. */
250
reg_syntax_t re_syntax_options;
253
/* Specify the precise syntax of regexps for compilation. This provides
254
for compatibility for various utilities which historically have
255
different, incompatible syntaxes.
257
The argument SYNTAX is a bit mask comprised of the various bits
258
defined in regex.h. We return the old syntax. */
261
re_set_syntax (syntax)
264
reg_syntax_t ret = re_syntax_options;
266
re_syntax_options = syntax;
270
weak_alias (__re_set_syntax, re_set_syntax)
274
re_compile_fastmap (bufp)
275
struct re_pattern_buffer *bufp;
277
re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
278
char *fastmap = bufp->fastmap;
280
memset (fastmap, '\0', sizeof (char) * SBC_MAX);
281
re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
282
if (dfa->init_state != dfa->init_state_word)
283
re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
284
if (dfa->init_state != dfa->init_state_nl)
285
re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
286
if (dfa->init_state != dfa->init_state_begbuf)
287
re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
288
bufp->fastmap_accurate = 1;
292
weak_alias (__re_compile_fastmap, re_compile_fastmap)
296
__attribute ((always_inline))
297
re_set_fastmap (char *fastmap, bool icase, int ch)
301
fastmap[tolower (ch)] = 1;
304
/* Helper function for re_compile_fastmap.
305
Compile fastmap for the initial_state INIT_STATE. */
308
re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
311
re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
313
bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
314
for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
316
Idx node = init_state->nodes.elems[node_cnt];
317
re_token_type_t type = dfa->nodes[node].type;
319
if (type == CHARACTER)
321
re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
322
#ifdef RE_ENABLE_I18N
323
if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
325
unsigned char buf[MB_LEN_MAX];
331
*p++ = dfa->nodes[node].opr.c;
332
while (++node < dfa->nodes_len
333
&& dfa->nodes[node].type == CHARACTER
334
&& dfa->nodes[node].mb_partial)
335
*p++ = dfa->nodes[node].opr.c;
336
memset (&state, '\0', sizeof (state));
337
if (__mbrtowc (&wc, (const char *) buf, p - buf,
339
&& (__wcrtomb ((char *) buf, towlower (wc), &state)
341
re_set_fastmap (fastmap, false, buf[0]);
345
else if (type == SIMPLE_BRACKET)
348
for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
351
bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
352
for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
353
if (w & ((bitset_word_t) 1 << j))
354
re_set_fastmap (fastmap, icase, ch);
357
#ifdef RE_ENABLE_I18N
358
else if (type == COMPLEX_BRACKET)
360
re_charset_t *cset = dfa->nodes[node].opr.mbcset;
364
/* See if we have to try all bytes which start multiple collation
366
e.g. In da_DK, we want to catch 'a' since "aa" is a valid
367
collation element, and don't catch 'b' since 'b' is
368
the only collation element which starts from 'b' (and
369
it is caught by SIMPLE_BRACKET). */
370
if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
371
&& (cset->ncoll_syms || cset->nranges))
373
const int32_t *table = (const int32_t *)
374
_NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
375
for (i = 0; i < SBC_MAX; ++i)
377
re_set_fastmap (fastmap, icase, i);
381
/* See if we have to start the match at all multibyte characters,
382
i.e. where we would not find an invalid sequence. This only
383
applies to multibyte character sets; for single byte character
384
sets, the SIMPLE_BRACKET again suffices. */
385
if (dfa->mb_cur_max > 1
386
&& (cset->nchar_classes || cset->non_match || cset->nranges
388
|| cset->nequiv_classes
396
memset (&mbs, 0, sizeof (mbs));
397
if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
398
re_set_fastmap (fastmap, false, (int) c);
405
/* ... Else catch all bytes which can start the mbchars. */
406
for (i = 0; i < cset->nmbchars; ++i)
410
memset (&state, '\0', sizeof (state));
411
if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
412
re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
413
if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
415
if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
417
re_set_fastmap (fastmap, false, *(unsigned char *) buf);
422
#endif /* RE_ENABLE_I18N */
423
else if (type == OP_PERIOD
424
#ifdef RE_ENABLE_I18N
425
|| type == OP_UTF8_PERIOD
426
#endif /* RE_ENABLE_I18N */
427
|| type == END_OF_RE)
429
memset (fastmap, '\1', sizeof (char) * SBC_MAX);
430
if (type == END_OF_RE)
431
bufp->can_be_null = 1;
437
/* Entry point for POSIX code. */
438
/* regcomp takes a regular expression as a string and compiles it.
440
PREG is a regex_t *. We do not expect any fields to be initialized,
441
since POSIX says we shouldn't. Thus, we set
443
`buffer' to the compiled pattern;
444
`used' to the length of the compiled pattern;
445
`syntax' to RE_SYNTAX_POSIX_EXTENDED if the
446
REG_EXTENDED bit in CFLAGS is set; otherwise, to
447
RE_SYNTAX_POSIX_BASIC;
448
`newline_anchor' to REG_NEWLINE being set in CFLAGS;
449
`fastmap' to an allocated space for the fastmap;
450
`fastmap_accurate' to zero;
451
`re_nsub' to the number of subexpressions in PATTERN.
453
PATTERN is the address of the pattern string.
455
CFLAGS is a series of bits which affect compilation.
457
If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
458
use POSIX basic syntax.
460
If REG_NEWLINE is set, then . and [^...] don't match newline.
461
Also, regexec will try a match beginning after every newline.
463
If REG_ICASE is set, then we considers upper- and lowercase
464
versions of letters to be equivalent when matching.
466
If REG_NOSUB is set, then when PREG is passed to regexec, that
467
routine will report only success or failure, and nothing about the
470
It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
471
the return codes and their meanings.) */
474
regcomp (preg, pattern, cflags)
475
regex_t *_Restrict_ preg;
476
const char *_Restrict_ pattern;
480
reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
481
: RE_SYNTAX_POSIX_BASIC);
487
/* Try to allocate space for the fastmap. */
488
preg->fastmap = re_malloc (char, SBC_MAX);
489
if (BE (preg->fastmap == NULL, 0))
492
syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
494
/* If REG_NEWLINE is set, newlines are treated differently. */
495
if (cflags & REG_NEWLINE)
496
{ /* REG_NEWLINE implies neither . nor [^...] match newline. */
497
syntax &= ~RE_DOT_NEWLINE;
498
syntax |= RE_HAT_LISTS_NOT_NEWLINE;
499
/* It also changes the matching behavior. */
500
preg->newline_anchor = 1;
503
preg->newline_anchor = 0;
504
preg->no_sub = !!(cflags & REG_NOSUB);
505
preg->translate = NULL;
507
ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
509
/* POSIX doesn't distinguish between an unmatched open-group and an
510
unmatched close-group: both are REG_EPAREN. */
511
if (ret == REG_ERPAREN)
514
/* We have already checked preg->fastmap != NULL. */
515
if (BE (ret == REG_NOERROR, 1))
516
/* Compute the fastmap now, since regexec cannot modify the pattern
517
buffer. This function never fails in this implementation. */
518
(void) re_compile_fastmap (preg);
521
/* Some error occurred while compiling the expression. */
522
re_free (preg->fastmap);
523
preg->fastmap = NULL;
529
weak_alias (__regcomp, regcomp)
532
/* Returns a message corresponding to an error code, ERRCODE, returned
533
from either regcomp or regexec. We don't use PREG here. */
537
regerror (errcode, preg, errbuf, errbuf_size)
539
const regex_t *_Restrict_ preg;
540
char *_Restrict_ errbuf;
542
#else /* size_t might promote */
544
regerror (int errcode, const regex_t *_Restrict_ preg,
545
char *_Restrict_ errbuf, size_t errbuf_size)
552
|| errcode >= (int) (sizeof (__re_error_msgid_idx)
553
/ sizeof (__re_error_msgid_idx[0])), 0))
554
/* Only error codes returned by the rest of the code should be passed
555
to this routine. If we are given anything else, or if other regex
556
code generates an invalid error code, then the program has a bug.
557
Dump core so we can fix it. */
560
msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
562
msg_size = strlen (msg) + 1; /* Includes the null. */
564
if (BE (errbuf_size != 0, 1))
566
size_t cpy_size = msg_size;
567
if (BE (msg_size > errbuf_size, 0))
569
cpy_size = errbuf_size - 1;
570
errbuf[cpy_size] = '\0';
572
memcpy (errbuf, msg, cpy_size);
578
weak_alias (__regerror, regerror)
582
#ifdef RE_ENABLE_I18N
583
/* This static array is used for the map to single-byte characters when
584
UTF-8 is used. Otherwise we would allocate memory just to initialize
585
it the same all the time. UTF-8 is the preferred encoding so this is
586
a worthwhile optimization. */
587
static const bitset_t utf8_sb_map =
589
/* Set the first 128 bits. */
590
# if 4 * BITSET_WORD_BITS < ASCII_CHARS
591
# error "bitset_word_t is narrower than 32 bits"
592
# elif 3 * BITSET_WORD_BITS < ASCII_CHARS
593
BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
594
# elif 2 * BITSET_WORD_BITS < ASCII_CHARS
595
BITSET_WORD_MAX, BITSET_WORD_MAX,
596
# elif 1 * BITSET_WORD_BITS < ASCII_CHARS
600
>> (SBC_MAX % BITSET_WORD_BITS == 0
602
: BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
608
free_dfa_content (re_dfa_t *dfa)
613
for (i = 0; i < dfa->nodes_len; ++i)
614
free_token (dfa->nodes + i);
615
re_free (dfa->nexts);
616
for (i = 0; i < dfa->nodes_len; ++i)
618
if (dfa->eclosures != NULL)
619
re_node_set_free (dfa->eclosures + i);
620
if (dfa->inveclosures != NULL)
621
re_node_set_free (dfa->inveclosures + i);
622
if (dfa->edests != NULL)
623
re_node_set_free (dfa->edests + i);
625
re_free (dfa->edests);
626
re_free (dfa->eclosures);
627
re_free (dfa->inveclosures);
628
re_free (dfa->nodes);
630
if (dfa->state_table)
631
for (i = 0; i <= dfa->state_hash_mask; ++i)
633
struct re_state_table_entry *entry = dfa->state_table + i;
634
for (j = 0; j < entry->num; ++j)
636
re_dfastate_t *state = entry->array[j];
639
re_free (entry->array);
641
re_free (dfa->state_table);
642
#ifdef RE_ENABLE_I18N
643
if (dfa->sb_char != utf8_sb_map)
644
re_free (dfa->sb_char);
646
re_free (dfa->subexp_map);
648
re_free (dfa->re_str);
655
/* Free dynamically allocated space used by PREG. */
661
re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
662
if (BE (dfa != NULL, 1))
663
free_dfa_content (dfa);
667
re_free (preg->fastmap);
668
preg->fastmap = NULL;
670
re_free (preg->translate);
671
preg->translate = NULL;
674
weak_alias (__regfree, regfree)
677
/* Entry points compatible with 4.2 BSD regex library. We don't define
678
them unless specifically requested. */
680
#if defined _REGEX_RE_COMP || defined _LIBC
682
/* BSD has one and only one pattern buffer. */
683
static struct re_pattern_buffer re_comp_buf;
687
/* Make these definitions weak in libc, so POSIX programs can redefine
688
these names if they don't use our functions, and still use
689
regcomp/regexec above without link errors. */
700
if (!re_comp_buf.buffer)
701
return gettext ("No previous regular expression");
705
if (re_comp_buf.buffer)
707
fastmap = re_comp_buf.fastmap;
708
re_comp_buf.fastmap = NULL;
709
__regfree (&re_comp_buf);
710
memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
711
re_comp_buf.fastmap = fastmap;
714
if (re_comp_buf.fastmap == NULL)
716
re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
717
if (re_comp_buf.fastmap == NULL)
718
return (char *) gettext (__re_error_msgid
719
+ __re_error_msgid_idx[(int) REG_ESPACE]);
722
/* Since `re_exec' always passes NULL for the `regs' argument, we
723
don't need to initialize the pattern buffer fields which affect it. */
725
/* Match anchors at newlines. */
726
re_comp_buf.newline_anchor = 1;
728
ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
733
/* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
734
return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
738
libc_freeres_fn (free_mem)
740
__regfree (&re_comp_buf);
744
#endif /* _REGEX_RE_COMP */
746
/* Internal entry point.
747
Compile the regular expression PATTERN, whose length is LENGTH.
748
SYNTAX indicate regular expression's syntax. */
751
re_compile_internal (regex_t *preg, const char * pattern, size_t length,
754
reg_errcode_t err = REG_NOERROR;
758
/* Initialize the pattern buffer. */
759
preg->fastmap_accurate = 0;
760
preg->syntax = syntax;
761
preg->not_bol = preg->not_eol = 0;
764
preg->can_be_null = 0;
765
preg->regs_allocated = REGS_UNALLOCATED;
767
/* Initialize the dfa. */
768
dfa = (re_dfa_t *) preg->buffer;
769
if (BE (preg->allocated < sizeof (re_dfa_t), 0))
771
/* If zero allocated, but buffer is non-null, try to realloc
772
enough space. This loses if buffer's address is bogus, but
773
that is the user's responsibility. If ->buffer is NULL this
774
is a simple allocation. */
775
dfa = re_realloc (preg->buffer, re_dfa_t, 1);
778
preg->allocated = sizeof (re_dfa_t);
779
preg->buffer = (unsigned char *) dfa;
781
preg->used = sizeof (re_dfa_t);
783
err = init_dfa (dfa, length);
784
if (BE (err != REG_NOERROR, 0))
786
free_dfa_content (dfa);
792
/* Note: length+1 will not overflow since it is checked in init_dfa. */
793
dfa->re_str = re_malloc (char, length + 1);
794
strncpy (dfa->re_str, pattern, length + 1);
797
__libc_lock_init (dfa->lock);
799
err = re_string_construct (®exp, pattern, length, preg->translate,
800
(syntax & RE_ICASE) != 0, dfa);
801
if (BE (err != REG_NOERROR, 0))
803
re_compile_internal_free_return:
804
free_workarea_compile (preg);
805
re_string_destruct (®exp);
806
free_dfa_content (dfa);
812
/* Parse the regular expression, and build a structure tree. */
814
dfa->str_tree = parse (®exp, preg, syntax, &err);
815
if (BE (dfa->str_tree == NULL, 0))
816
goto re_compile_internal_free_return;
818
/* Analyze the tree and create the nfa. */
819
err = analyze (preg);
820
if (BE (err != REG_NOERROR, 0))
821
goto re_compile_internal_free_return;
823
#ifdef RE_ENABLE_I18N
824
/* If possible, do searching in single byte encoding to speed things up. */
825
if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
829
/* Then create the initial state of the dfa. */
830
err = create_initial_state (dfa);
832
/* Release work areas. */
833
free_workarea_compile (preg);
834
re_string_destruct (®exp);
836
if (BE (err != REG_NOERROR, 0))
838
free_dfa_content (dfa);
846
/* Initialize DFA. We use the length of the regular expression PAT_LEN
847
as the initial length of some arrays. */
850
init_dfa (re_dfa_t *dfa, size_t pat_len)
852
__re_size_t table_size;
856
#ifdef RE_ENABLE_I18N
857
size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
859
size_t max_i18n_object_size = 0;
861
size_t max_object_size =
862
MAX (sizeof (struct re_state_table_entry),
863
MAX (sizeof (re_token_t),
864
MAX (sizeof (re_node_set),
865
MAX (sizeof (regmatch_t),
866
max_i18n_object_size))));
868
memset (dfa, '\0', sizeof (re_dfa_t));
870
/* Force allocation of str_tree_storage the first time. */
871
dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
873
/* Avoid overflows. The extra "/ 2" is for the table_size doubling
874
calculation below, and for similar doubling calculations
875
elsewhere. And it's <= rather than <, because some of the
876
doubling calculations add 1 afterwards. */
877
if (BE (SIZE_MAX / max_object_size / 2 <= pat_len, 0))
880
dfa->nodes_alloc = pat_len + 1;
881
dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
883
/* table_size = 2 ^ ceil(log pat_len) */
884
for (table_size = 1; ; table_size <<= 1)
885
if (table_size > pat_len)
888
dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
889
dfa->state_hash_mask = table_size - 1;
891
dfa->mb_cur_max = MB_CUR_MAX;
893
if (dfa->mb_cur_max == 6
894
&& strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
896
dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
899
codeset_name = nl_langinfo (CODESET);
900
if (strcasecmp (codeset_name, "UTF-8") == 0
901
|| strcasecmp (codeset_name, "UTF8") == 0)
904
/* We check exhaustively in the loop below if this charset is a
905
superset of ASCII. */
906
dfa->map_notascii = 0;
909
#ifdef RE_ENABLE_I18N
910
if (dfa->mb_cur_max > 1)
913
dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
918
dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
919
if (BE (dfa->sb_char == NULL, 0))
922
/* Set the bits corresponding to single byte chars. */
923
for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
924
for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
926
wint_t wch = __btowc (ch);
928
dfa->sb_char[i] |= (bitset_word_t) 1 << j;
930
if (isascii (ch) && wch != ch)
931
dfa->map_notascii = 1;
938
if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
943
/* Initialize WORD_CHAR table, which indicate which character is
944
"word". In this case "word" means that it is the word construction
945
character used by some operators like "\<", "\>", etc. */
949
init_word_char (re_dfa_t *dfa)
952
dfa->word_ops_used = 1;
953
for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
954
for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
955
if (isalnum (ch) || ch == '_')
956
dfa->word_char[i] |= (bitset_word_t) 1 << j;
959
/* Free the work area which are only used while compiling. */
962
free_workarea_compile (regex_t *preg)
964
re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
965
bin_tree_storage_t *storage, *next;
966
for (storage = dfa->str_tree_storage; storage; storage = next)
968
next = storage->next;
971
dfa->str_tree_storage = NULL;
972
dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
973
dfa->str_tree = NULL;
974
re_free (dfa->org_indices);
975
dfa->org_indices = NULL;
978
/* Create initial states for all contexts. */
981
create_initial_state (re_dfa_t *dfa)
985
re_node_set init_nodes;
987
/* Initial states have the epsilon closure of the node which is
988
the first node of the regular expression. */
989
first = dfa->str_tree->first->node_idx;
990
dfa->init_node = first;
991
err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
992
if (BE (err != REG_NOERROR, 0))
995
/* The back-references which are in initial states can epsilon transit,
996
since in this case all of the subexpressions can be null.
997
Then we add epsilon closures of the nodes which are the next nodes of
998
the back-references. */
999
if (dfa->nbackref > 0)
1000
for (i = 0; i < init_nodes.nelem; ++i)
1002
Idx node_idx = init_nodes.elems[i];
1003
re_token_type_t type = dfa->nodes[node_idx].type;
1006
if (type != OP_BACK_REF)
1008
for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1010
re_token_t *clexp_node;
1011
clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1012
if (clexp_node->type == OP_CLOSE_SUBEXP
1013
&& clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1016
if (clexp_idx == init_nodes.nelem)
1019
if (type == OP_BACK_REF)
1021
Idx dest_idx = dfa->edests[node_idx].elems[0];
1022
if (!re_node_set_contains (&init_nodes, dest_idx))
1024
reg_errcode_t merge_err
1025
= re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1026
if (merge_err != REG_NOERROR)
1033
/* It must be the first time to invoke acquire_state. */
1034
dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1035
/* We don't check ERR here, since the initial state must not be NULL. */
1036
if (BE (dfa->init_state == NULL, 0))
1038
if (dfa->init_state->has_constraint)
1040
dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1042
dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1044
dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1048
if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1049
|| dfa->init_state_begbuf == NULL, 0))
1053
dfa->init_state_word = dfa->init_state_nl
1054
= dfa->init_state_begbuf = dfa->init_state;
1056
re_node_set_free (&init_nodes);
1060
#ifdef RE_ENABLE_I18N
1061
/* If it is possible to do searching in single byte encoding instead of UTF-8
1062
to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1063
DFA nodes where needed. */
1066
optimize_utf8 (re_dfa_t *dfa)
1070
bool mb_chars = false;
1071
bool has_period = false;
1073
for (node = 0; node < dfa->nodes_len; ++node)
1074
switch (dfa->nodes[node].type)
1077
if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1081
switch (dfa->nodes[node].opr.ctx_type)
1089
/* Word anchors etc. cannot be handled. It's okay to test
1090
opr.ctx_type since constraints (for all DFA nodes) are
1091
created by ORing one or more opr.ctx_type values. */
1101
case OP_DUP_ASTERISK:
1102
case OP_OPEN_SUBEXP:
1103
case OP_CLOSE_SUBEXP:
1105
case COMPLEX_BRACKET:
1107
case SIMPLE_BRACKET:
1108
/* Just double check. */
1110
int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1112
: BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1113
for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1115
if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1125
if (mb_chars || has_period)
1126
for (node = 0; node < dfa->nodes_len; ++node)
1128
if (dfa->nodes[node].type == CHARACTER
1129
&& dfa->nodes[node].opr.c >= ASCII_CHARS)
1130
dfa->nodes[node].mb_partial = 0;
1131
else if (dfa->nodes[node].type == OP_PERIOD)
1132
dfa->nodes[node].type = OP_UTF8_PERIOD;
1135
/* The search can be in single byte locale. */
1136
dfa->mb_cur_max = 1;
1138
dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1142
/* Analyze the structure tree, and calculate "first", "next", "edest",
1143
"eclosure", and "inveclosure". */
1145
static reg_errcode_t
1146
analyze (regex_t *preg)
1148
re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1151
/* Allocate arrays. */
1152
dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1153
dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1154
dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1155
dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1156
if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1157
|| dfa->eclosures == NULL, 0))
1160
dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1161
if (dfa->subexp_map != NULL)
1164
for (i = 0; i < preg->re_nsub; i++)
1165
dfa->subexp_map[i] = i;
1166
preorder (dfa->str_tree, optimize_subexps, dfa);
1167
for (i = 0; i < preg->re_nsub; i++)
1168
if (dfa->subexp_map[i] != i)
1170
if (i == preg->re_nsub)
1172
free (dfa->subexp_map);
1173
dfa->subexp_map = NULL;
1177
ret = postorder (dfa->str_tree, lower_subexps, preg);
1178
if (BE (ret != REG_NOERROR, 0))
1180
ret = postorder (dfa->str_tree, calc_first, dfa);
1181
if (BE (ret != REG_NOERROR, 0))
1183
preorder (dfa->str_tree, calc_next, dfa);
1184
ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1185
if (BE (ret != REG_NOERROR, 0))
1187
ret = calc_eclosure (dfa);
1188
if (BE (ret != REG_NOERROR, 0))
1191
/* We only need this during the prune_impossible_nodes pass in regexec.c;
1192
skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1193
if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1196
dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1197
if (BE (dfa->inveclosures == NULL, 0))
1199
ret = calc_inveclosure (dfa);
1205
/* Our parse trees are very unbalanced, so we cannot use a stack to
1206
implement parse tree visits. Instead, we use parent pointers and
1207
some hairy code in these two functions. */
1208
static reg_errcode_t
1209
postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1212
bin_tree_t *node, *prev;
1214
for (node = root; ; )
1216
/* Descend down the tree, preferably to the left (or to the right
1217
if that's the only child). */
1218
while (node->left || node->right)
1226
reg_errcode_t err = fn (extra, node);
1227
if (BE (err != REG_NOERROR, 0))
1229
if (node->parent == NULL)
1232
node = node->parent;
1234
/* Go up while we have a node that is reached from the right. */
1235
while (node->right == prev || node->right == NULL);
1240
static reg_errcode_t
1241
preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1246
for (node = root; ; )
1248
reg_errcode_t err = fn (extra, node);
1249
if (BE (err != REG_NOERROR, 0))
1252
/* Go to the left node, or up and to the right. */
1257
bin_tree_t *prev = NULL;
1258
while (node->right == prev || node->right == NULL)
1261
node = node->parent;
1270
/* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1271
re_search_internal to map the inner one's opr.idx to this one's. Adjust
1272
backreferences as well. Requires a preorder visit. */
1273
static reg_errcode_t
1274
optimize_subexps (void *extra, bin_tree_t *node)
1276
re_dfa_t *dfa = (re_dfa_t *) extra;
1278
if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1280
int idx = node->token.opr.idx;
1281
node->token.opr.idx = dfa->subexp_map[idx];
1282
dfa->used_bkref_map |= 1 << node->token.opr.idx;
1285
else if (node->token.type == SUBEXP
1286
&& node->left && node->left->token.type == SUBEXP)
1288
Idx other_idx = node->left->token.opr.idx;
1290
node->left = node->left->left;
1292
node->left->parent = node;
1294
dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1295
if (other_idx < BITSET_WORD_BITS)
1296
dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1302
/* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1303
of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1304
static reg_errcode_t
1305
lower_subexps (void *extra, bin_tree_t *node)
1307
regex_t *preg = (regex_t *) extra;
1308
reg_errcode_t err = REG_NOERROR;
1310
if (node->left && node->left->token.type == SUBEXP)
1312
node->left = lower_subexp (&err, preg, node->left);
1314
node->left->parent = node;
1316
if (node->right && node->right->token.type == SUBEXP)
1318
node->right = lower_subexp (&err, preg, node->right);
1320
node->right->parent = node;
1327
lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1329
re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1330
bin_tree_t *body = node->left;
1331
bin_tree_t *op, *cls, *tree1, *tree;
1334
/* We do not optimize empty subexpressions, because otherwise we may
1335
have bad CONCAT nodes with NULL children. This is obviously not
1336
very common, so we do not lose much. An example that triggers
1337
this case is the sed "script" /\(\)/x. */
1338
&& node->left != NULL
1339
&& (node->token.opr.idx >= BITSET_WORD_BITS
1340
|| !(dfa->used_bkref_map
1341
& ((bitset_word_t) 1 << node->token.opr.idx))))
1344
/* Convert the SUBEXP node to the concatenation of an
1345
OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1346
op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1347
cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1348
tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1349
tree = create_tree (dfa, op, tree1, CONCAT);
1350
if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1356
op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1357
op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1361
/* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1362
nodes. Requires a postorder visit. */
1363
static reg_errcode_t
1364
calc_first (void *extra, bin_tree_t *node)
1366
re_dfa_t *dfa = (re_dfa_t *) extra;
1367
if (node->token.type == CONCAT)
1369
node->first = node->left->first;
1370
node->node_idx = node->left->node_idx;
1375
node->node_idx = re_dfa_add_node (dfa, node->token);
1376
if (BE (node->node_idx == REG_MISSING, 0))
1378
if (node->token.type == ANCHOR)
1379
dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1384
/* Pass 2: compute NEXT on the tree. Preorder visit. */
1385
static reg_errcode_t
1386
calc_next (void *extra, bin_tree_t *node)
1388
switch (node->token.type)
1390
case OP_DUP_ASTERISK:
1391
node->left->next = node;
1394
node->left->next = node->right->first;
1395
node->right->next = node->next;
1399
node->left->next = node->next;
1401
node->right->next = node->next;
1407
/* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1408
static reg_errcode_t
1409
link_nfa_nodes (void *extra, bin_tree_t *node)
1411
re_dfa_t *dfa = (re_dfa_t *) extra;
1412
Idx idx = node->node_idx;
1413
reg_errcode_t err = REG_NOERROR;
1415
switch (node->token.type)
1421
assert (node->next == NULL);
1424
case OP_DUP_ASTERISK:
1428
dfa->has_plural_match = 1;
1429
if (node->left != NULL)
1430
left = node->left->first->node_idx;
1432
left = node->next->node_idx;
1433
if (node->right != NULL)
1434
right = node->right->first->node_idx;
1436
right = node->next->node_idx;
1437
assert (REG_VALID_INDEX (left));
1438
assert (REG_VALID_INDEX (right));
1439
err = re_node_set_init_2 (dfa->edests + idx, left, right);
1444
case OP_OPEN_SUBEXP:
1445
case OP_CLOSE_SUBEXP:
1446
err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1450
dfa->nexts[idx] = node->next->node_idx;
1451
if (node->token.type == OP_BACK_REF)
1452
err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1456
assert (!IS_EPSILON_NODE (node->token.type));
1457
dfa->nexts[idx] = node->next->node_idx;
1464
/* Duplicate the epsilon closure of the node ROOT_NODE.
1465
Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1466
to their own constraint. */
1468
static reg_errcode_t
1470
duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1471
Idx root_node, unsigned int init_constraint)
1473
Idx org_node, clone_node;
1475
unsigned int constraint = init_constraint;
1476
for (org_node = top_org_node, clone_node = top_clone_node;;)
1478
Idx org_dest, clone_dest;
1479
if (dfa->nodes[org_node].type == OP_BACK_REF)
1481
/* If the back reference epsilon-transit, its destination must
1482
also have the constraint. Then duplicate the epsilon closure
1483
of the destination of the back reference, and store it in
1484
edests of the back reference. */
1485
org_dest = dfa->nexts[org_node];
1486
re_node_set_empty (dfa->edests + clone_node);
1487
clone_dest = duplicate_node (dfa, org_dest, constraint);
1488
if (BE (clone_dest == REG_MISSING, 0))
1490
dfa->nexts[clone_node] = dfa->nexts[org_node];
1491
ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1495
else if (dfa->edests[org_node].nelem == 0)
1497
/* In case of the node can't epsilon-transit, don't duplicate the
1498
destination and store the original destination as the
1499
destination of the node. */
1500
dfa->nexts[clone_node] = dfa->nexts[org_node];
1503
else if (dfa->edests[org_node].nelem == 1)
1505
/* In case of the node can epsilon-transit, and it has only one
1507
org_dest = dfa->edests[org_node].elems[0];
1508
re_node_set_empty (dfa->edests + clone_node);
1509
/* If the node is root_node itself, it means the epsilon closure
1510
has a loop. Then tie it to the destination of the root_node. */
1511
if (org_node == root_node && clone_node != org_node)
1513
ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1518
/* In case the node has another constraint, append it. */
1519
constraint |= dfa->nodes[org_node].constraint;
1520
clone_dest = duplicate_node (dfa, org_dest, constraint);
1521
if (BE (clone_dest == REG_MISSING, 0))
1523
ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1527
else /* dfa->edests[org_node].nelem == 2 */
1529
/* In case of the node can epsilon-transit, and it has two
1530
destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1531
org_dest = dfa->edests[org_node].elems[0];
1532
re_node_set_empty (dfa->edests + clone_node);
1533
/* Search for a duplicated node which satisfies the constraint. */
1534
clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1535
if (clone_dest == REG_MISSING)
1537
/* There is no such duplicated node, create a new one. */
1539
clone_dest = duplicate_node (dfa, org_dest, constraint);
1540
if (BE (clone_dest == REG_MISSING, 0))
1542
ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1545
err = duplicate_node_closure (dfa, org_dest, clone_dest,
1546
root_node, constraint);
1547
if (BE (err != REG_NOERROR, 0))
1552
/* There is a duplicated node which satisfies the constraint,
1553
use it to avoid infinite loop. */
1554
ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1559
org_dest = dfa->edests[org_node].elems[1];
1560
clone_dest = duplicate_node (dfa, org_dest, constraint);
1561
if (BE (clone_dest == REG_MISSING, 0))
1563
ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1567
org_node = org_dest;
1568
clone_node = clone_dest;
1573
/* Search for a node which is duplicated from the node ORG_NODE, and
1574
satisfies the constraint CONSTRAINT. */
1577
search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1578
unsigned int constraint)
1581
for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1583
if (org_node == dfa->org_indices[idx]
1584
&& constraint == dfa->nodes[idx].constraint)
1585
return idx; /* Found. */
1587
return REG_MISSING; /* Not found. */
1590
/* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1591
Return the index of the new node, or REG_MISSING if insufficient storage is
1595
duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1597
Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1598
if (BE (dup_idx != REG_MISSING, 1))
1600
dfa->nodes[dup_idx].constraint = constraint;
1601
dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1602
dfa->nodes[dup_idx].duplicated = 1;
1604
/* Store the index of the original node. */
1605
dfa->org_indices[dup_idx] = org_idx;
1610
static reg_errcode_t
1611
calc_inveclosure (re_dfa_t *dfa)
1615
for (idx = 0; idx < dfa->nodes_len; ++idx)
1616
re_node_set_init_empty (dfa->inveclosures + idx);
1618
for (src = 0; src < dfa->nodes_len; ++src)
1620
Idx *elems = dfa->eclosures[src].elems;
1621
for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1623
ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1632
/* Calculate "eclosure" for all the node in DFA. */
1634
static reg_errcode_t
1635
calc_eclosure (re_dfa_t *dfa)
1640
assert (dfa->nodes_len > 0);
1643
/* For each nodes, calculate epsilon closure. */
1644
for (node_idx = 0; ; ++node_idx)
1647
re_node_set eclosure_elem;
1648
if (node_idx == dfa->nodes_len)
1657
assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
1660
/* If we have already calculated, skip it. */
1661
if (dfa->eclosures[node_idx].nelem != 0)
1663
/* Calculate epsilon closure of `node_idx'. */
1664
err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1665
if (BE (err != REG_NOERROR, 0))
1668
if (dfa->eclosures[node_idx].nelem == 0)
1671
re_node_set_free (&eclosure_elem);
1677
/* Calculate epsilon closure of NODE. */
1679
static reg_errcode_t
1680
calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1684
re_node_set eclosure;
1686
bool incomplete = false;
1687
err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1688
if (BE (err != REG_NOERROR, 0))
1691
/* This indicates that we are calculating this node now.
1692
We reference this value to avoid infinite loop. */
1693
dfa->eclosures[node].nelem = REG_MISSING;
1695
/* If the current node has constraints, duplicate all nodes
1696
since they must inherit the constraints. */
1697
if (dfa->nodes[node].constraint
1698
&& dfa->edests[node].nelem
1699
&& !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1701
err = duplicate_node_closure (dfa, node, node, node,
1702
dfa->nodes[node].constraint);
1703
if (BE (err != REG_NOERROR, 0))
1707
/* Expand each epsilon destination nodes. */
1708
if (IS_EPSILON_NODE(dfa->nodes[node].type))
1709
for (i = 0; i < dfa->edests[node].nelem; ++i)
1711
re_node_set eclosure_elem;
1712
Idx edest = dfa->edests[node].elems[i];
1713
/* If calculating the epsilon closure of `edest' is in progress,
1714
return intermediate result. */
1715
if (dfa->eclosures[edest].nelem == REG_MISSING)
1720
/* If we haven't calculated the epsilon closure of `edest' yet,
1721
calculate now. Otherwise use calculated epsilon closure. */
1722
if (dfa->eclosures[edest].nelem == 0)
1724
err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1725
if (BE (err != REG_NOERROR, 0))
1729
eclosure_elem = dfa->eclosures[edest];
1730
/* Merge the epsilon closure of `edest'. */
1731
err = re_node_set_merge (&eclosure, &eclosure_elem);
1732
if (BE (err != REG_NOERROR, 0))
1734
/* If the epsilon closure of `edest' is incomplete,
1735
the epsilon closure of this node is also incomplete. */
1736
if (dfa->eclosures[edest].nelem == 0)
1739
re_node_set_free (&eclosure_elem);
1743
/* An epsilon closure includes itself. */
1744
ok = re_node_set_insert (&eclosure, node);
1747
if (incomplete && !root)
1748
dfa->eclosures[node].nelem = 0;
1750
dfa->eclosures[node] = eclosure;
1751
*new_set = eclosure;
1755
/* Functions for token which are used in the parser. */
1757
/* Fetch a token from INPUT.
1758
We must not use this function inside bracket expressions. */
1762
fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1764
re_string_skip_bytes (input, peek_token (result, input, syntax));
1767
/* Peek a token from INPUT, and return the length of the token.
1768
We must not use this function inside bracket expressions. */
1772
peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1776
if (re_string_eoi (input))
1778
token->type = END_OF_RE;
1782
c = re_string_peek_byte (input, 0);
1785
token->word_char = 0;
1786
#ifdef RE_ENABLE_I18N
1787
token->mb_partial = 0;
1788
if (input->mb_cur_max > 1 &&
1789
!re_string_first_byte (input, re_string_cur_idx (input)))
1791
token->type = CHARACTER;
1792
token->mb_partial = 1;
1799
if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1801
token->type = BACK_SLASH;
1805
c2 = re_string_peek_byte_case (input, 1);
1807
token->type = CHARACTER;
1808
#ifdef RE_ENABLE_I18N
1809
if (input->mb_cur_max > 1)
1811
wint_t wc = re_string_wchar_at (input,
1812
re_string_cur_idx (input) + 1);
1813
token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1817
token->word_char = IS_WORD_CHAR (c2) != 0;
1822
if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1823
token->type = OP_ALT;
1825
case '1': case '2': case '3': case '4': case '5':
1826
case '6': case '7': case '8': case '9':
1827
if (!(syntax & RE_NO_BK_REFS))
1829
token->type = OP_BACK_REF;
1830
token->opr.idx = c2 - '1';
1834
if (!(syntax & RE_NO_GNU_OPS))
1836
token->type = ANCHOR;
1837
token->opr.ctx_type = WORD_FIRST;
1841
if (!(syntax & RE_NO_GNU_OPS))
1843
token->type = ANCHOR;
1844
token->opr.ctx_type = WORD_LAST;
1848
if (!(syntax & RE_NO_GNU_OPS))
1850
token->type = ANCHOR;
1851
token->opr.ctx_type = WORD_DELIM;
1855
if (!(syntax & RE_NO_GNU_OPS))
1857
token->type = ANCHOR;
1858
token->opr.ctx_type = NOT_WORD_DELIM;
1862
if (!(syntax & RE_NO_GNU_OPS))
1863
token->type = OP_WORD;
1866
if (!(syntax & RE_NO_GNU_OPS))
1867
token->type = OP_NOTWORD;
1870
if (!(syntax & RE_NO_GNU_OPS))
1871
token->type = OP_SPACE;
1874
if (!(syntax & RE_NO_GNU_OPS))
1875
token->type = OP_NOTSPACE;
1878
if (!(syntax & RE_NO_GNU_OPS))
1880
token->type = ANCHOR;
1881
token->opr.ctx_type = BUF_FIRST;
1885
if (!(syntax & RE_NO_GNU_OPS))
1887
token->type = ANCHOR;
1888
token->opr.ctx_type = BUF_LAST;
1892
if (!(syntax & RE_NO_BK_PARENS))
1893
token->type = OP_OPEN_SUBEXP;
1896
if (!(syntax & RE_NO_BK_PARENS))
1897
token->type = OP_CLOSE_SUBEXP;
1900
if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1901
token->type = OP_DUP_PLUS;
1904
if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1905
token->type = OP_DUP_QUESTION;
1908
if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1909
token->type = OP_OPEN_DUP_NUM;
1912
if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1913
token->type = OP_CLOSE_DUP_NUM;
1921
token->type = CHARACTER;
1922
#ifdef RE_ENABLE_I18N
1923
if (input->mb_cur_max > 1)
1925
wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1926
token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1930
token->word_char = IS_WORD_CHAR (token->opr.c);
1935
if (syntax & RE_NEWLINE_ALT)
1936
token->type = OP_ALT;
1939
if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1940
token->type = OP_ALT;
1943
token->type = OP_DUP_ASTERISK;
1946
if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1947
token->type = OP_DUP_PLUS;
1950
if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1951
token->type = OP_DUP_QUESTION;
1954
if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1955
token->type = OP_OPEN_DUP_NUM;
1958
if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1959
token->type = OP_CLOSE_DUP_NUM;
1962
if (syntax & RE_NO_BK_PARENS)
1963
token->type = OP_OPEN_SUBEXP;
1966
if (syntax & RE_NO_BK_PARENS)
1967
token->type = OP_CLOSE_SUBEXP;
1970
token->type = OP_OPEN_BRACKET;
1973
token->type = OP_PERIOD;
1976
if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1977
re_string_cur_idx (input) != 0)
1979
char prev = re_string_peek_byte (input, -1);
1980
if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1983
token->type = ANCHOR;
1984
token->opr.ctx_type = LINE_FIRST;
1987
if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1988
re_string_cur_idx (input) + 1 != re_string_length (input))
1991
re_string_skip_bytes (input, 1);
1992
peek_token (&next, input, syntax);
1993
re_string_skip_bytes (input, -1);
1994
if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1997
token->type = ANCHOR;
1998
token->opr.ctx_type = LINE_LAST;
2006
/* Peek a token from INPUT, and return the length of the token.
2007
We must not use this function out of bracket expressions. */
2011
peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2014
if (re_string_eoi (input))
2016
token->type = END_OF_RE;
2019
c = re_string_peek_byte (input, 0);
2022
#ifdef RE_ENABLE_I18N
2023
if (input->mb_cur_max > 1 &&
2024
!re_string_first_byte (input, re_string_cur_idx (input)))
2026
token->type = CHARACTER;
2029
#endif /* RE_ENABLE_I18N */
2031
if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2032
&& re_string_cur_idx (input) + 1 < re_string_length (input))
2034
/* In this case, '\' escape a character. */
2036
re_string_skip_bytes (input, 1);
2037
c2 = re_string_peek_byte (input, 0);
2039
token->type = CHARACTER;
2042
if (c == '[') /* '[' is a special char in a bracket exps. */
2046
if (re_string_cur_idx (input) + 1 < re_string_length (input))
2047
c2 = re_string_peek_byte (input, 1);
2055
token->type = OP_OPEN_COLL_ELEM;
2058
token->type = OP_OPEN_EQUIV_CLASS;
2061
if (syntax & RE_CHAR_CLASSES)
2063
token->type = OP_OPEN_CHAR_CLASS;
2066
/* else fall through. */
2068
token->type = CHARACTER;
2078
token->type = OP_CHARSET_RANGE;
2081
token->type = OP_CLOSE_BRACKET;
2084
token->type = OP_NON_MATCH_LIST;
2087
token->type = CHARACTER;
2092
/* Functions for parser. */
2094
/* Entry point of the parser.
2095
Parse the regular expression REGEXP and return the structure tree.
2096
If an error is occured, ERR is set by error code, and return NULL.
2097
This function build the following tree, from regular expression <reg_exp>:
2103
CAT means concatenation.
2104
EOR means end of regular expression. */
2107
parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2110
re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2111
bin_tree_t *tree, *eor, *root;
2112
re_token_t current_token;
2113
dfa->syntax = syntax;
2114
fetch_token (¤t_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2115
tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err);
2116
if (BE (*err != REG_NOERROR && tree == NULL, 0))
2118
eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2120
root = create_tree (dfa, tree, eor, CONCAT);
2123
if (BE (eor == NULL || root == NULL, 0))
2131
/* This function build the following tree, from regular expression
2132
<branch1>|<branch2>:
2138
ALT means alternative, which represents the operator `|'. */
2141
parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2142
reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2144
re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2145
bin_tree_t *tree, *branch = NULL;
2146
tree = parse_branch (regexp, preg, token, syntax, nest, err);
2147
if (BE (*err != REG_NOERROR && tree == NULL, 0))
2150
while (token->type == OP_ALT)
2152
fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2153
if (token->type != OP_ALT && token->type != END_OF_RE
2154
&& (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2156
branch = parse_branch (regexp, preg, token, syntax, nest, err);
2157
if (BE (*err != REG_NOERROR && branch == NULL, 0))
2162
tree = create_tree (dfa, tree, branch, OP_ALT);
2163
if (BE (tree == NULL, 0))
2172
/* This function build the following tree, from regular expression
2179
CAT means concatenation. */
2182
parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2183
reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2185
bin_tree_t *tree, *expr;
2186
re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2187
tree = parse_expression (regexp, preg, token, syntax, nest, err);
2188
if (BE (*err != REG_NOERROR && tree == NULL, 0))
2191
while (token->type != OP_ALT && token->type != END_OF_RE
2192
&& (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2194
expr = parse_expression (regexp, preg, token, syntax, nest, err);
2195
if (BE (*err != REG_NOERROR && expr == NULL, 0))
2199
if (tree != NULL && expr != NULL)
2201
tree = create_tree (dfa, tree, expr, CONCAT);
2208
else if (tree == NULL)
2210
/* Otherwise expr == NULL, we don't need to create new tree. */
2215
/* This function build the following tree, from regular expression a*:
2222
parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2223
reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2225
re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2227
switch (token->type)
2230
tree = create_token_tree (dfa, NULL, NULL, token);
2231
if (BE (tree == NULL, 0))
2236
#ifdef RE_ENABLE_I18N
2237
if (dfa->mb_cur_max > 1)
2239
while (!re_string_eoi (regexp)
2240
&& !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2242
bin_tree_t *mbc_remain;
2243
fetch_token (token, regexp, syntax);
2244
mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2245
tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2246
if (BE (mbc_remain == NULL || tree == NULL, 0))
2255
case OP_OPEN_SUBEXP:
2256
tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2257
if (BE (*err != REG_NOERROR && tree == NULL, 0))
2260
case OP_OPEN_BRACKET:
2261
tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2262
if (BE (*err != REG_NOERROR && tree == NULL, 0))
2266
if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2271
dfa->used_bkref_map |= 1 << token->opr.idx;
2272
tree = create_token_tree (dfa, NULL, NULL, token);
2273
if (BE (tree == NULL, 0))
2279
dfa->has_mb_node = 1;
2281
case OP_OPEN_DUP_NUM:
2282
if (syntax & RE_CONTEXT_INVALID_DUP)
2288
case OP_DUP_ASTERISK:
2290
case OP_DUP_QUESTION:
2291
if (syntax & RE_CONTEXT_INVALID_OPS)
2296
else if (syntax & RE_CONTEXT_INDEP_OPS)
2298
fetch_token (token, regexp, syntax);
2299
return parse_expression (regexp, preg, token, syntax, nest, err);
2301
/* else fall through */
2302
case OP_CLOSE_SUBEXP:
2303
if ((token->type == OP_CLOSE_SUBEXP) &&
2304
!(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2309
/* else fall through */
2310
case OP_CLOSE_DUP_NUM:
2311
/* We treat it as a normal character. */
2313
/* Then we can these characters as normal characters. */
2314
token->type = CHARACTER;
2315
/* mb_partial and word_char bits should be initialized already
2317
tree = create_token_tree (dfa, NULL, NULL, token);
2318
if (BE (tree == NULL, 0))
2325
if ((token->opr.ctx_type
2326
& (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2327
&& dfa->word_ops_used == 0)
2328
init_word_char (dfa);
2329
if (token->opr.ctx_type == WORD_DELIM
2330
|| token->opr.ctx_type == NOT_WORD_DELIM)
2332
bin_tree_t *tree_first, *tree_last;
2333
if (token->opr.ctx_type == WORD_DELIM)
2335
token->opr.ctx_type = WORD_FIRST;
2336
tree_first = create_token_tree (dfa, NULL, NULL, token);
2337
token->opr.ctx_type = WORD_LAST;
2341
token->opr.ctx_type = INSIDE_WORD;
2342
tree_first = create_token_tree (dfa, NULL, NULL, token);
2343
token->opr.ctx_type = INSIDE_NOTWORD;
2345
tree_last = create_token_tree (dfa, NULL, NULL, token);
2346
tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2347
if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2355
tree = create_token_tree (dfa, NULL, NULL, token);
2356
if (BE (tree == NULL, 0))
2362
/* We must return here, since ANCHORs can't be followed
2363
by repetition operators.
2364
eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2365
it must not be "<ANCHOR(^)><REPEAT(*)>". */
2366
fetch_token (token, regexp, syntax);
2369
tree = create_token_tree (dfa, NULL, NULL, token);
2370
if (BE (tree == NULL, 0))
2375
if (dfa->mb_cur_max > 1)
2376
dfa->has_mb_node = 1;
2380
tree = build_charclass_op (dfa, regexp->trans,
2381
(const unsigned char *) "alnum",
2382
(const unsigned char *) "_",
2383
token->type == OP_NOTWORD, err);
2384
if (BE (*err != REG_NOERROR && tree == NULL, 0))
2389
tree = build_charclass_op (dfa, regexp->trans,
2390
(const unsigned char *) "space",
2391
(const unsigned char *) "",
2392
token->type == OP_NOTSPACE, err);
2393
if (BE (*err != REG_NOERROR && tree == NULL, 0))
2403
/* Must not happen? */
2409
fetch_token (token, regexp, syntax);
2411
while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2412
|| token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2414
tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2415
if (BE (*err != REG_NOERROR && tree == NULL, 0))
2417
/* In BRE consecutive duplications are not allowed. */
2418
if ((syntax & RE_CONTEXT_INVALID_DUP)
2419
&& (token->type == OP_DUP_ASTERISK
2420
|| token->type == OP_OPEN_DUP_NUM))
2430
/* This function build the following tree, from regular expression
2438
parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2439
reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2441
re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2444
cur_nsub = preg->re_nsub++;
2446
fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2448
/* The subexpression may be a null string. */
2449
if (token->type == OP_CLOSE_SUBEXP)
2453
tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2454
if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2456
if (BE (*err != REG_NOERROR, 0))
2460
if (cur_nsub <= '9' - '1')
2461
dfa->completed_bkref_map |= 1 << cur_nsub;
2463
tree = create_tree (dfa, tree, NULL, SUBEXP);
2464
if (BE (tree == NULL, 0))
2469
tree->token.opr.idx = cur_nsub;
2473
/* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2476
parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2477
re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2479
bin_tree_t *tree = NULL, *old_tree = NULL;
2480
Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2481
re_token_t start_token = *token;
2483
if (token->type == OP_OPEN_DUP_NUM)
2486
start = fetch_number (regexp, token, syntax);
2487
if (start == REG_MISSING)
2489
if (token->type == CHARACTER && token->opr.c == ',')
2490
start = 0; /* We treat "{,m}" as "{0,m}". */
2493
*err = REG_BADBR; /* <re>{} is invalid. */
2497
if (BE (start != REG_ERROR, 1))
2499
/* We treat "{n}" as "{n,n}". */
2500
end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2501
: ((token->type == CHARACTER && token->opr.c == ',')
2502
? fetch_number (regexp, token, syntax) : REG_ERROR));
2504
if (BE (start == REG_ERROR || end == REG_ERROR, 0))
2506
/* Invalid sequence. */
2507
if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2509
if (token->type == END_OF_RE)
2517
/* If the syntax bit is set, rollback. */
2518
re_string_set_index (regexp, start_idx);
2519
*token = start_token;
2520
token->type = CHARACTER;
2521
/* mb_partial and word_char bits should be already initialized by
2526
if (BE ((end != REG_MISSING && start > end)
2527
|| token->type != OP_CLOSE_DUP_NUM, 0))
2529
/* First number greater than second. */
2536
start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2537
end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2540
fetch_token (token, regexp, syntax);
2542
if (BE (elem == NULL, 0))
2544
if (BE (start == 0 && end == 0, 0))
2546
postorder (elem, free_tree, NULL);
2550
/* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2551
if (BE (start > 0, 0))
2554
for (i = 2; i <= start; ++i)
2556
elem = duplicate_tree (elem, dfa);
2557
tree = create_tree (dfa, tree, elem, CONCAT);
2558
if (BE (elem == NULL || tree == NULL, 0))
2559
goto parse_dup_op_espace;
2565
/* Duplicate ELEM before it is marked optional. */
2566
elem = duplicate_tree (elem, dfa);
2572
if (elem->token.type == SUBEXP)
2573
postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2575
tree = create_tree (dfa, elem, NULL,
2576
(end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2577
if (BE (tree == NULL, 0))
2578
goto parse_dup_op_espace;
2580
/* From gnulib's "intprops.h":
2581
True if the arithmetic type T is signed. */
2582
#define TYPE_SIGNED(t) (! ((t) 0 < (t) -1))
2584
/* This loop is actually executed only when end != REG_MISSING,
2585
to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2586
already created the start+1-th copy. */
2587
if (TYPE_SIGNED (Idx) || end != REG_MISSING)
2588
for (i = start + 2; i <= end; ++i)
2590
elem = duplicate_tree (elem, dfa);
2591
tree = create_tree (dfa, tree, elem, CONCAT);
2592
if (BE (elem == NULL || tree == NULL, 0))
2593
goto parse_dup_op_espace;
2595
tree = create_tree (dfa, tree, NULL, OP_ALT);
2596
if (BE (tree == NULL, 0))
2597
goto parse_dup_op_espace;
2601
tree = create_tree (dfa, old_tree, tree, CONCAT);
2605
parse_dup_op_espace:
2610
/* Size of the names for collating symbol/equivalence_class/character_class.
2611
I'm not sure, but maybe enough. */
2612
#define BRACKET_NAME_BUF_SIZE 32
2615
/* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2616
Build the range expression which starts from START_ELEM, and ends
2617
at END_ELEM. The result are written to MBCSET and SBCSET.
2618
RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2619
mbcset->range_ends, is a pointer argument sinse we may
2622
static reg_errcode_t
2624
# ifdef RE_ENABLE_I18N
2625
build_range_exp (const reg_syntax_t syntax,
2627
re_charset_t *mbcset,
2629
const bracket_elem_t *start_elem,
2630
const bracket_elem_t *end_elem)
2631
# else /* not RE_ENABLE_I18N */
2632
build_range_exp (const reg_syntax_t syntax,
2634
const bracket_elem_t *start_elem,
2635
const bracket_elem_t *end_elem)
2636
# endif /* not RE_ENABLE_I18N */
2638
unsigned int start_ch, end_ch;
2639
/* Equivalence Classes and Character Classes can't be a range start/end. */
2640
if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2641
|| end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2645
/* We can handle no multi character collating elements without libc
2647
if (BE ((start_elem->type == COLL_SYM
2648
&& strlen ((char *) start_elem->opr.name) > 1)
2649
|| (end_elem->type == COLL_SYM
2650
&& strlen ((char *) end_elem->opr.name) > 1), 0))
2651
return REG_ECOLLATE;
2653
# ifdef RE_ENABLE_I18N
2658
wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2660
start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2661
: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2663
end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2664
: ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2666
start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2667
? __btowc (start_ch) : start_elem->opr.wch);
2668
end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2669
? __btowc (end_ch) : end_elem->opr.wch);
2670
if (start_wc == WEOF || end_wc == WEOF)
2671
return REG_ECOLLATE;
2672
cmp_buf[0] = start_wc;
2673
cmp_buf[4] = end_wc;
2675
if (BE ((syntax & RE_NO_EMPTY_RANGES)
2676
&& wcscoll (cmp_buf, cmp_buf + 4) > 0, 0))
2679
/* Got valid collation sequence values, add them as a new entry.
2680
However, for !_LIBC we have no collation elements: if the
2681
character set is single byte, the single byte character set
2682
that we build below suffices. parse_bracket_exp passes
2683
no MBCSET if dfa->mb_cur_max == 1. */
2686
/* Check the space of the arrays. */
2687
if (BE (*range_alloc == mbcset->nranges, 0))
2689
/* There is not enough space, need realloc. */
2690
wchar_t *new_array_start, *new_array_end;
2693
/* +1 in case of mbcset->nranges is 0. */
2694
new_nranges = 2 * mbcset->nranges + 1;
2695
/* Use realloc since mbcset->range_starts and mbcset->range_ends
2696
are NULL if *range_alloc == 0. */
2697
new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2699
new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2702
if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2705
mbcset->range_starts = new_array_start;
2706
mbcset->range_ends = new_array_end;
2707
*range_alloc = new_nranges;
2710
mbcset->range_starts[mbcset->nranges] = start_wc;
2711
mbcset->range_ends[mbcset->nranges++] = end_wc;
2714
/* Build the table for single byte characters. */
2715
for (wc = 0; wc < SBC_MAX; ++wc)
2718
if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2719
&& wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2720
bitset_set (sbcset, wc);
2723
# else /* not RE_ENABLE_I18N */
2726
start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2727
: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2729
end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2730
: ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2732
if (start_ch > end_ch)
2734
/* Build the table for single byte characters. */
2735
for (ch = 0; ch < SBC_MAX; ++ch)
2736
if (start_ch <= ch && ch <= end_ch)
2737
bitset_set (sbcset, ch);
2739
# endif /* not RE_ENABLE_I18N */
2742
#endif /* not _LIBC */
2745
/* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2746
Build the collating element which is represented by NAME.
2747
The result are written to MBCSET and SBCSET.
2748
COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2749
pointer argument since we may update it. */
2751
static reg_errcode_t
2753
build_collating_symbol (bitset_t sbcset,
2754
# ifdef RE_ENABLE_I18N
2755
re_charset_t *mbcset, Idx *coll_sym_alloc,
2757
const unsigned char *name)
2759
size_t name_len = strlen ((const char *) name);
2760
if (BE (name_len != 1, 0))
2761
return REG_ECOLLATE;
2764
bitset_set (sbcset, name[0]);
2768
#endif /* not _LIBC */
2770
/* This function parse bracket expression like "[abc]", "[a-c]",
2774
parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2775
reg_syntax_t syntax, reg_errcode_t *err)
2778
const unsigned char *collseqmb;
2779
const char *collseqwc;
2782
const int32_t *symb_table;
2783
const unsigned char *extra;
2785
/* Local function for parse_bracket_exp used in _LIBC environement.
2786
Seek the collating symbol entry correspondings to NAME.
2787
Return the index of the symbol in the SYMB_TABLE. */
2790
__attribute ((always_inline))
2791
seek_collating_symbol_entry (name, name_len)
2792
const unsigned char *name;
2795
int32_t hash = elem_hash ((const char *) name, name_len);
2796
int32_t elem = hash % table_size;
2797
if (symb_table[2 * elem] != 0)
2799
int32_t second = hash % (table_size - 2) + 1;
2803
/* First compare the hashing value. */
2804
if (symb_table[2 * elem] == hash
2805
/* Compare the length of the name. */
2806
&& name_len == extra[symb_table[2 * elem + 1]]
2807
/* Compare the name. */
2808
&& memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2811
/* Yep, this is the entry. */
2818
while (symb_table[2 * elem] != 0);
2823
/* Local function for parse_bracket_exp used in _LIBC environment.
2824
Look up the collation sequence value of BR_ELEM.
2825
Return the value if succeeded, UINT_MAX otherwise. */
2827
auto inline unsigned int
2828
__attribute ((always_inline))
2829
lookup_collation_sequence_value (br_elem)
2830
bracket_elem_t *br_elem;
2832
if (br_elem->type == SB_CHAR)
2835
if (MB_CUR_MAX == 1)
2838
return collseqmb[br_elem->opr.ch];
2841
wint_t wc = __btowc (br_elem->opr.ch);
2842
return __collseq_table_lookup (collseqwc, wc);
2845
else if (br_elem->type == MB_CHAR)
2848
return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2850
else if (br_elem->type == COLL_SYM)
2852
size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2856
elem = seek_collating_symbol_entry (br_elem->opr.name,
2858
if (symb_table[2 * elem] != 0)
2860
/* We found the entry. */
2861
idx = symb_table[2 * elem + 1];
2862
/* Skip the name of collating element name. */
2863
idx += 1 + extra[idx];
2864
/* Skip the byte sequence of the collating element. */
2865
idx += 1 + extra[idx];
2866
/* Adjust for the alignment. */
2867
idx = (idx + 3) & ~3;
2868
/* Skip the multibyte collation sequence value. */
2869
idx += sizeof (unsigned int);
2870
/* Skip the wide char sequence of the collating element. */
2871
idx += sizeof (unsigned int) *
2872
(1 + *(unsigned int *) (extra + idx));
2873
/* Return the collation sequence value. */
2874
return *(unsigned int *) (extra + idx);
2876
else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2878
/* No valid character. Match it as a single byte
2880
return collseqmb[br_elem->opr.name[0]];
2883
else if (sym_name_len == 1)
2884
return collseqmb[br_elem->opr.name[0]];
2889
/* Local function for parse_bracket_exp used in _LIBC environement.
2890
Build the range expression which starts from START_ELEM, and ends
2891
at END_ELEM. The result are written to MBCSET and SBCSET.
2892
RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2893
mbcset->range_ends, is a pointer argument sinse we may
2896
auto inline reg_errcode_t
2897
__attribute ((always_inline))
2898
build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2899
re_charset_t *mbcset;
2902
bracket_elem_t *start_elem, *end_elem;
2905
uint32_t start_collseq;
2906
uint32_t end_collseq;
2908
/* Equivalence Classes and Character Classes can't be a range
2910
if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2911
|| end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2915
start_collseq = lookup_collation_sequence_value (start_elem);
2916
end_collseq = lookup_collation_sequence_value (end_elem);
2917
/* Check start/end collation sequence values. */
2918
if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2919
return REG_ECOLLATE;
2920
if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2923
/* Got valid collation sequence values, add them as a new entry.
2924
However, if we have no collation elements, and the character set
2925
is single byte, the single byte character set that we
2926
build below suffices. */
2927
if (nrules > 0 || dfa->mb_cur_max > 1)
2929
/* Check the space of the arrays. */
2930
if (BE (*range_alloc == mbcset->nranges, 0))
2932
/* There is not enough space, need realloc. */
2933
uint32_t *new_array_start;
2934
uint32_t *new_array_end;
2937
/* +1 in case of mbcset->nranges is 0. */
2938
new_nranges = 2 * mbcset->nranges + 1;
2939
new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2941
new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2944
if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2947
mbcset->range_starts = new_array_start;
2948
mbcset->range_ends = new_array_end;
2949
*range_alloc = new_nranges;
2952
mbcset->range_starts[mbcset->nranges] = start_collseq;
2953
mbcset->range_ends[mbcset->nranges++] = end_collseq;
2956
/* Build the table for single byte characters. */
2957
for (ch = 0; ch < SBC_MAX; ch++)
2959
uint32_t ch_collseq;
2961
if (MB_CUR_MAX == 1)
2964
ch_collseq = collseqmb[ch];
2966
ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2967
if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2968
bitset_set (sbcset, ch);
2973
/* Local function for parse_bracket_exp used in _LIBC environement.
2974
Build the collating element which is represented by NAME.
2975
The result are written to MBCSET and SBCSET.
2976
COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2977
pointer argument sinse we may update it. */
2979
auto inline reg_errcode_t
2980
__attribute ((always_inline))
2981
build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2982
re_charset_t *mbcset;
2983
Idx *coll_sym_alloc;
2985
const unsigned char *name;
2988
size_t name_len = strlen ((const char *) name);
2991
elem = seek_collating_symbol_entry (name, name_len);
2992
if (symb_table[2 * elem] != 0)
2994
/* We found the entry. */
2995
idx = symb_table[2 * elem + 1];
2996
/* Skip the name of collating element name. */
2997
idx += 1 + extra[idx];
2999
else if (symb_table[2 * elem] == 0 && name_len == 1)
3001
/* No valid character, treat it as a normal
3003
bitset_set (sbcset, name[0]);
3007
return REG_ECOLLATE;
3009
/* Got valid collation sequence, add it as a new entry. */
3010
/* Check the space of the arrays. */
3011
if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3013
/* Not enough, realloc it. */
3014
/* +1 in case of mbcset->ncoll_syms is 0. */
3015
Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3016
/* Use realloc since mbcset->coll_syms is NULL
3018
int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3019
new_coll_sym_alloc);
3020
if (BE (new_coll_syms == NULL, 0))
3022
mbcset->coll_syms = new_coll_syms;
3023
*coll_sym_alloc = new_coll_sym_alloc;
3025
mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3030
if (BE (name_len != 1, 0))
3031
return REG_ECOLLATE;
3034
bitset_set (sbcset, name[0]);
3041
re_token_t br_token;
3042
re_bitset_ptr_t sbcset;
3043
#ifdef RE_ENABLE_I18N
3044
re_charset_t *mbcset;
3045
Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3046
Idx equiv_class_alloc = 0, char_class_alloc = 0;
3047
#endif /* not RE_ENABLE_I18N */
3048
bool non_match = false;
3049
bin_tree_t *work_tree;
3051
bool first_round = true;
3053
collseqmb = (const unsigned char *)
3054
_NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3055
nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3061
collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3062
table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3063
symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3064
_NL_COLLATE_SYMB_TABLEMB);
3065
extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3066
_NL_COLLATE_SYMB_EXTRAMB);
3069
sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3070
#ifdef RE_ENABLE_I18N
3071
mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3072
#endif /* RE_ENABLE_I18N */
3073
#ifdef RE_ENABLE_I18N
3074
if (BE (sbcset == NULL || mbcset == NULL, 0))
3076
if (BE (sbcset == NULL, 0))
3077
#endif /* RE_ENABLE_I18N */
3083
token_len = peek_token_bracket (token, regexp, syntax);
3084
if (BE (token->type == END_OF_RE, 0))
3087
goto parse_bracket_exp_free_return;
3089
if (token->type == OP_NON_MATCH_LIST)
3091
#ifdef RE_ENABLE_I18N
3092
mbcset->non_match = 1;
3093
#endif /* not RE_ENABLE_I18N */
3095
if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3096
bitset_set (sbcset, '\n');
3097
re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3098
token_len = peek_token_bracket (token, regexp, syntax);
3099
if (BE (token->type == END_OF_RE, 0))
3102
goto parse_bracket_exp_free_return;
3106
/* We treat the first ']' as a normal character. */
3107
if (token->type == OP_CLOSE_BRACKET)
3108
token->type = CHARACTER;
3112
bracket_elem_t start_elem, end_elem;
3113
unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3114
unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3117
bool is_range_exp = false;
3120
start_elem.opr.name = start_name_buf;
3121
ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3122
syntax, first_round);
3123
if (BE (ret != REG_NOERROR, 0))
3126
goto parse_bracket_exp_free_return;
3128
first_round = false;
3130
/* Get information about the next token. We need it in any case. */
3131
token_len = peek_token_bracket (token, regexp, syntax);
3133
/* Do not check for ranges if we know they are not allowed. */
3134
if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3136
if (BE (token->type == END_OF_RE, 0))
3139
goto parse_bracket_exp_free_return;
3141
if (token->type == OP_CHARSET_RANGE)
3143
re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3144
token_len2 = peek_token_bracket (&token2, regexp, syntax);
3145
if (BE (token2.type == END_OF_RE, 0))
3148
goto parse_bracket_exp_free_return;
3150
if (token2.type == OP_CLOSE_BRACKET)
3152
/* We treat the last '-' as a normal character. */
3153
re_string_skip_bytes (regexp, -token_len);
3154
token->type = CHARACTER;
3157
is_range_exp = true;
3161
if (is_range_exp == true)
3163
end_elem.opr.name = end_name_buf;
3164
ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3166
if (BE (ret != REG_NOERROR, 0))
3169
goto parse_bracket_exp_free_return;
3172
token_len = peek_token_bracket (token, regexp, syntax);
3175
*err = build_range_exp (sbcset, mbcset, &range_alloc,
3176
&start_elem, &end_elem);
3178
# ifdef RE_ENABLE_I18N
3179
*err = build_range_exp (syntax, sbcset,
3180
dfa->mb_cur_max > 1 ? mbcset : NULL,
3181
&range_alloc, &start_elem, &end_elem);
3183
*err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
3185
#endif /* RE_ENABLE_I18N */
3186
if (BE (*err != REG_NOERROR, 0))
3187
goto parse_bracket_exp_free_return;
3191
switch (start_elem.type)
3194
bitset_set (sbcset, start_elem.opr.ch);
3196
#ifdef RE_ENABLE_I18N
3198
/* Check whether the array has enough space. */
3199
if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3201
wchar_t *new_mbchars;
3202
/* Not enough, realloc it. */
3203
/* +1 in case of mbcset->nmbchars is 0. */
3204
mbchar_alloc = 2 * mbcset->nmbchars + 1;
3205
/* Use realloc since array is NULL if *alloc == 0. */
3206
new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3208
if (BE (new_mbchars == NULL, 0))
3209
goto parse_bracket_exp_espace;
3210
mbcset->mbchars = new_mbchars;
3212
mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3214
#endif /* RE_ENABLE_I18N */
3216
*err = build_equiv_class (sbcset,
3217
#ifdef RE_ENABLE_I18N
3218
mbcset, &equiv_class_alloc,
3219
#endif /* RE_ENABLE_I18N */
3220
start_elem.opr.name);
3221
if (BE (*err != REG_NOERROR, 0))
3222
goto parse_bracket_exp_free_return;
3225
*err = build_collating_symbol (sbcset,
3226
#ifdef RE_ENABLE_I18N
3227
mbcset, &coll_sym_alloc,
3228
#endif /* RE_ENABLE_I18N */
3229
start_elem.opr.name);
3230
if (BE (*err != REG_NOERROR, 0))
3231
goto parse_bracket_exp_free_return;
3234
*err = build_charclass (regexp->trans, sbcset,
3235
#ifdef RE_ENABLE_I18N
3236
mbcset, &char_class_alloc,
3237
#endif /* RE_ENABLE_I18N */
3238
start_elem.opr.name, syntax);
3239
if (BE (*err != REG_NOERROR, 0))
3240
goto parse_bracket_exp_free_return;
3247
if (BE (token->type == END_OF_RE, 0))
3250
goto parse_bracket_exp_free_return;
3252
if (token->type == OP_CLOSE_BRACKET)
3256
re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3258
/* If it is non-matching list. */
3260
bitset_not (sbcset);
3262
#ifdef RE_ENABLE_I18N
3263
/* Ensure only single byte characters are set. */
3264
if (dfa->mb_cur_max > 1)
3265
bitset_mask (sbcset, dfa->sb_char);
3267
if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3268
|| mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3269
|| mbcset->non_match)))
3271
bin_tree_t *mbc_tree;
3273
/* Build a tree for complex bracket. */
3274
dfa->has_mb_node = 1;
3275
br_token.type = COMPLEX_BRACKET;
3276
br_token.opr.mbcset = mbcset;
3277
mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3278
if (BE (mbc_tree == NULL, 0))
3279
goto parse_bracket_exp_espace;
3280
for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3281
if (sbcset[sbc_idx])
3283
/* If there are no bits set in sbcset, there is no point
3284
of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3285
if (sbc_idx < BITSET_WORDS)
3287
/* Build a tree for simple bracket. */
3288
br_token.type = SIMPLE_BRACKET;
3289
br_token.opr.sbcset = sbcset;
3290
work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3291
if (BE (work_tree == NULL, 0))
3292
goto parse_bracket_exp_espace;
3294
/* Then join them by ALT node. */
3295
work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3296
if (BE (work_tree == NULL, 0))
3297
goto parse_bracket_exp_espace;
3302
work_tree = mbc_tree;
3306
#endif /* not RE_ENABLE_I18N */
3308
#ifdef RE_ENABLE_I18N
3309
free_charset (mbcset);
3311
/* Build a tree for simple bracket. */
3312
br_token.type = SIMPLE_BRACKET;
3313
br_token.opr.sbcset = sbcset;
3314
work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3315
if (BE (work_tree == NULL, 0))
3316
goto parse_bracket_exp_espace;
3320
parse_bracket_exp_espace:
3322
parse_bracket_exp_free_return:
3324
#ifdef RE_ENABLE_I18N
3325
free_charset (mbcset);
3326
#endif /* RE_ENABLE_I18N */
3330
/* Parse an element in the bracket expression. */
3332
static reg_errcode_t
3333
parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3334
re_token_t *token, int token_len, re_dfa_t *dfa,
3335
reg_syntax_t syntax, bool accept_hyphen)
3337
#ifdef RE_ENABLE_I18N
3339
cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3340
if (cur_char_size > 1)
3342
elem->type = MB_CHAR;
3343
elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3344
re_string_skip_bytes (regexp, cur_char_size);
3347
#endif /* RE_ENABLE_I18N */
3348
re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3349
if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3350
|| token->type == OP_OPEN_EQUIV_CLASS)
3351
return parse_bracket_symbol (elem, regexp, token);
3352
if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3354
/* A '-' must only appear as anything but a range indicator before
3355
the closing bracket. Everything else is an error. */
3357
(void) peek_token_bracket (&token2, regexp, syntax);
3358
if (token2.type != OP_CLOSE_BRACKET)
3359
/* The actual error value is not standardized since this whole
3360
case is undefined. But ERANGE makes good sense. */
3363
elem->type = SB_CHAR;
3364
elem->opr.ch = token->opr.c;
3368
/* Parse a bracket symbol in the bracket expression. Bracket symbols are
3369
such as [:<character_class>:], [.<collating_element>.], and
3370
[=<equivalent_class>=]. */
3372
static reg_errcode_t
3373
parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3376
unsigned char ch, delim = token->opr.c;
3378
if (re_string_eoi(regexp))
3382
if (i >= BRACKET_NAME_BUF_SIZE)
3384
if (token->type == OP_OPEN_CHAR_CLASS)
3385
ch = re_string_fetch_byte_case (regexp);
3387
ch = re_string_fetch_byte (regexp);
3388
if (re_string_eoi(regexp))
3390
if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3392
elem->opr.name[i] = ch;
3394
re_string_skip_bytes (regexp, 1);
3395
elem->opr.name[i] = '\0';
3396
switch (token->type)
3398
case OP_OPEN_COLL_ELEM:
3399
elem->type = COLL_SYM;
3401
case OP_OPEN_EQUIV_CLASS:
3402
elem->type = EQUIV_CLASS;
3404
case OP_OPEN_CHAR_CLASS:
3405
elem->type = CHAR_CLASS;
3413
/* Helper function for parse_bracket_exp.
3414
Build the equivalence class which is represented by NAME.
3415
The result are written to MBCSET and SBCSET.
3416
EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3417
is a pointer argument sinse we may update it. */
3419
static reg_errcode_t
3420
#ifdef RE_ENABLE_I18N
3421
build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3422
Idx *equiv_class_alloc, const unsigned char *name)
3423
#else /* not RE_ENABLE_I18N */
3424
build_equiv_class (bitset_t sbcset, const unsigned char *name)
3425
#endif /* not RE_ENABLE_I18N */
3428
uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3431
const int32_t *table, *indirect;
3432
const unsigned char *weights, *extra, *cp;
3433
unsigned char char_buf[2];
3437
/* This #include defines a local function! */
3438
# include <locale/weight.h>
3439
/* Calculate the index for equivalence class. */
3441
table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3442
weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3443
_NL_COLLATE_WEIGHTMB);
3444
extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3445
_NL_COLLATE_EXTRAMB);
3446
indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3447
_NL_COLLATE_INDIRECTMB);
3448
idx1 = findidx (&cp);
3449
if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3450
/* This isn't a valid character. */
3451
return REG_ECOLLATE;
3453
/* Build single byte matcing table for this equivalence class. */
3454
char_buf[1] = (unsigned char) '\0';
3455
len = weights[idx1 & 0xffffff];
3456
for (ch = 0; ch < SBC_MAX; ++ch)
3460
idx2 = findidx (&cp);
3465
/* This isn't a valid character. */
3467
/* Compare only if the length matches and the collation rule
3468
index is the same. */
3469
if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3473
while (cnt <= len &&
3474
weights[(idx1 & 0xffffff) + 1 + cnt]
3475
== weights[(idx2 & 0xffffff) + 1 + cnt])
3479
bitset_set (sbcset, ch);
3482
/* Check whether the array has enough space. */
3483
if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3485
/* Not enough, realloc it. */
3486
/* +1 in case of mbcset->nequiv_classes is 0. */
3487
Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3488
/* Use realloc since the array is NULL if *alloc == 0. */
3489
int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3491
new_equiv_class_alloc);
3492
if (BE (new_equiv_classes == NULL, 0))
3494
mbcset->equiv_classes = new_equiv_classes;
3495
*equiv_class_alloc = new_equiv_class_alloc;
3497
mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3502
if (BE (strlen ((const char *) name) != 1, 0))
3503
return REG_ECOLLATE;
3504
bitset_set (sbcset, *name);
3509
/* Helper function for parse_bracket_exp.
3510
Build the character class which is represented by NAME.
3511
The result are written to MBCSET and SBCSET.
3512
CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3513
is a pointer argument sinse we may update it. */
3515
static reg_errcode_t
3516
#ifdef RE_ENABLE_I18N
3517
build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3518
re_charset_t *mbcset, Idx *char_class_alloc,
3519
const unsigned char *class_name, reg_syntax_t syntax)
3520
#else /* not RE_ENABLE_I18N */
3521
build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3522
const unsigned char *class_name, reg_syntax_t syntax)
3523
#endif /* not RE_ENABLE_I18N */
3526
const char *name = (const char *) class_name;
3528
/* In case of REG_ICASE "upper" and "lower" match the both of
3529
upper and lower cases. */
3530
if ((syntax & RE_ICASE)
3531
&& (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3534
#ifdef RE_ENABLE_I18N
3535
/* Check the space of the arrays. */
3536
if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3538
/* Not enough, realloc it. */
3539
/* +1 in case of mbcset->nchar_classes is 0. */
3540
Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3541
/* Use realloc since array is NULL if *alloc == 0. */
3542
wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3543
new_char_class_alloc);
3544
if (BE (new_char_classes == NULL, 0))
3546
mbcset->char_classes = new_char_classes;
3547
*char_class_alloc = new_char_class_alloc;
3549
mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3550
#endif /* RE_ENABLE_I18N */
3552
#define BUILD_CHARCLASS_LOOP(ctype_func) \
3554
if (BE (trans != NULL, 0)) \
3556
for (i = 0; i < SBC_MAX; ++i) \
3557
if (ctype_func (i)) \
3558
bitset_set (sbcset, trans[i]); \
3562
for (i = 0; i < SBC_MAX; ++i) \
3563
if (ctype_func (i)) \
3564
bitset_set (sbcset, i); \
3568
if (strcmp (name, "alnum") == 0)
3569
BUILD_CHARCLASS_LOOP (isalnum);
3570
else if (strcmp (name, "cntrl") == 0)
3571
BUILD_CHARCLASS_LOOP (iscntrl);
3572
else if (strcmp (name, "lower") == 0)
3573
BUILD_CHARCLASS_LOOP (islower);
3574
else if (strcmp (name, "space") == 0)
3575
BUILD_CHARCLASS_LOOP (isspace);
3576
else if (strcmp (name, "alpha") == 0)
3577
BUILD_CHARCLASS_LOOP (isalpha);
3578
else if (strcmp (name, "digit") == 0)
3579
BUILD_CHARCLASS_LOOP (isdigit);
3580
else if (strcmp (name, "print") == 0)
3581
BUILD_CHARCLASS_LOOP (isprint);
3582
else if (strcmp (name, "upper") == 0)
3583
BUILD_CHARCLASS_LOOP (isupper);
3584
else if (strcmp (name, "blank") == 0)
3585
BUILD_CHARCLASS_LOOP (isblank);
3586
else if (strcmp (name, "graph") == 0)
3587
BUILD_CHARCLASS_LOOP (isgraph);
3588
else if (strcmp (name, "punct") == 0)
3589
BUILD_CHARCLASS_LOOP (ispunct);
3590
else if (strcmp (name, "xdigit") == 0)
3591
BUILD_CHARCLASS_LOOP (isxdigit);
3599
build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3600
const unsigned char *class_name,
3601
const unsigned char *extra, bool non_match,
3604
re_bitset_ptr_t sbcset;
3605
#ifdef RE_ENABLE_I18N
3606
re_charset_t *mbcset;
3608
#endif /* not RE_ENABLE_I18N */
3610
re_token_t br_token;
3613
sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3614
#ifdef RE_ENABLE_I18N
3615
mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3616
#endif /* RE_ENABLE_I18N */
3618
#ifdef RE_ENABLE_I18N
3619
if (BE (sbcset == NULL || mbcset == NULL, 0))
3620
#else /* not RE_ENABLE_I18N */
3621
if (BE (sbcset == NULL, 0))
3622
#endif /* not RE_ENABLE_I18N */
3630
#ifdef RE_ENABLE_I18N
3631
mbcset->non_match = 1;
3632
#endif /* not RE_ENABLE_I18N */
3635
/* We don't care the syntax in this case. */
3636
ret = build_charclass (trans, sbcset,
3637
#ifdef RE_ENABLE_I18N
3639
#endif /* RE_ENABLE_I18N */
3642
if (BE (ret != REG_NOERROR, 0))
3645
#ifdef RE_ENABLE_I18N
3646
free_charset (mbcset);
3647
#endif /* RE_ENABLE_I18N */
3651
/* \w match '_' also. */
3652
for (; *extra; extra++)
3653
bitset_set (sbcset, *extra);
3655
/* If it is non-matching list. */
3657
bitset_not (sbcset);
3659
#ifdef RE_ENABLE_I18N
3660
/* Ensure only single byte characters are set. */
3661
if (dfa->mb_cur_max > 1)
3662
bitset_mask (sbcset, dfa->sb_char);
3665
/* Build a tree for simple bracket. */
3666
br_token.type = SIMPLE_BRACKET;
3667
br_token.opr.sbcset = sbcset;
3668
tree = create_token_tree (dfa, NULL, NULL, &br_token);
3669
if (BE (tree == NULL, 0))
3670
goto build_word_op_espace;
3672
#ifdef RE_ENABLE_I18N
3673
if (dfa->mb_cur_max > 1)
3675
bin_tree_t *mbc_tree;
3676
/* Build a tree for complex bracket. */
3677
br_token.type = COMPLEX_BRACKET;
3678
br_token.opr.mbcset = mbcset;
3679
dfa->has_mb_node = 1;
3680
mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3681
if (BE (mbc_tree == NULL, 0))
3682
goto build_word_op_espace;
3683
/* Then join them by ALT node. */
3684
tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3685
if (BE (mbc_tree != NULL, 1))
3690
free_charset (mbcset);
3693
#else /* not RE_ENABLE_I18N */
3695
#endif /* not RE_ENABLE_I18N */
3697
build_word_op_espace:
3699
#ifdef RE_ENABLE_I18N
3700
free_charset (mbcset);
3701
#endif /* RE_ENABLE_I18N */
3706
/* This is intended for the expressions like "a{1,3}".
3707
Fetch a number from `input', and return the number.
3708
Return REG_MISSING if the number field is empty like "{,1}".
3709
Return REG_ERROR if an error occurred. */
3712
fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3714
Idx num = REG_MISSING;
3718
fetch_token (token, input, syntax);
3720
if (BE (token->type == END_OF_RE, 0))
3722
if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3724
num = ((token->type != CHARACTER || c < '0' || '9' < c
3725
|| num == REG_ERROR)
3727
: ((num == REG_MISSING) ? c - '0' : num * 10 + c - '0'));
3728
num = (num > RE_DUP_MAX) ? REG_ERROR : num;
3733
#ifdef RE_ENABLE_I18N
3735
free_charset (re_charset_t *cset)
3737
re_free (cset->mbchars);
3739
re_free (cset->coll_syms);
3740
re_free (cset->equiv_classes);
3741
re_free (cset->range_starts);
3742
re_free (cset->range_ends);
3744
re_free (cset->char_classes);
3747
#endif /* RE_ENABLE_I18N */
3749
/* Functions for binary tree operation. */
3751
/* Create a tree node. */
3754
create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3755
re_token_type_t type)
3759
return create_token_tree (dfa, left, right, &t);
3763
create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3764
const re_token_t *token)
3767
if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3769
bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3771
if (storage == NULL)
3773
storage->next = dfa->str_tree_storage;
3774
dfa->str_tree_storage = storage;
3775
dfa->str_tree_storage_idx = 0;
3777
tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3779
tree->parent = NULL;
3781
tree->right = right;
3782
tree->token = *token;
3783
tree->token.duplicated = 0;
3784
tree->token.opt_subexp = 0;
3787
tree->node_idx = REG_MISSING;
3790
left->parent = tree;
3792
right->parent = tree;
3796
/* Mark the tree SRC as an optional subexpression.
3797
To be called from preorder or postorder. */
3799
static reg_errcode_t
3800
mark_opt_subexp (void *extra, bin_tree_t *node)
3802
Idx idx = (Idx) (long) extra;
3803
if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3804
node->token.opt_subexp = 1;
3809
/* Free the allocated memory inside NODE. */
3812
free_token (re_token_t *node)
3814
#ifdef RE_ENABLE_I18N
3815
if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3816
free_charset (node->opr.mbcset);
3818
#endif /* RE_ENABLE_I18N */
3819
if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3820
re_free (node->opr.sbcset);
3823
/* Worker function for tree walking. Free the allocated memory inside NODE
3824
and its children. */
3826
static reg_errcode_t
3827
free_tree (void *extra, bin_tree_t *node)
3829
free_token (&node->token);
3834
/* Duplicate the node SRC, and return new node. This is a preorder
3835
visit similar to the one implemented by the generic visitor, but
3836
we need more infrastructure to maintain two parallel trees --- so,
3837
it's easier to duplicate. */
3840
duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3842
const bin_tree_t *node;
3843
bin_tree_t *dup_root;
3844
bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3846
for (node = root; ; )
3848
/* Create a new tree and link it back to the current parent. */
3849
*p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3852
(*p_new)->parent = dup_node;
3853
(*p_new)->token.duplicated = 1;
3856
/* Go to the left node, or up and to the right. */
3860
p_new = &dup_node->left;
3864
const bin_tree_t *prev = NULL;
3865
while (node->right == prev || node->right == NULL)
3868
node = node->parent;
3869
dup_node = dup_node->parent;
3874
p_new = &dup_node->right;