1
/* Extended regular expression matching and search library.
2
Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009
3
Free Software Foundation, Inc.
4
This file is part of the GNU C Library.
5
Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
7
This program is free software; you can redistribute it and/or modify
8
it under the terms of the GNU General Public License as published by
9
the Free Software Foundation; either version 2, or (at your option)
12
This program is distributed in the hope that it will be useful,
13
but WITHOUT ANY WARRANTY; without even the implied warranty of
14
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
GNU General Public License for more details.
17
You should have received a copy of the GNU General Public License along
18
with this program; if not, write to the Free Software Foundation,
19
Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
21
static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
22
size_t length, reg_syntax_t syntax);
23
static void re_compile_fastmap_iter (regex_t *bufp,
24
const re_dfastate_t *init_state,
26
static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
28
static void free_charset (re_charset_t *cset);
29
#endif /* RE_ENABLE_I18N */
30
static void free_workarea_compile (regex_t *preg);
31
static reg_errcode_t create_initial_state (re_dfa_t *dfa);
33
static void optimize_utf8 (re_dfa_t *dfa);
35
static reg_errcode_t analyze (regex_t *preg);
36
static reg_errcode_t preorder (bin_tree_t *root,
37
reg_errcode_t (fn (void *, bin_tree_t *)),
39
static reg_errcode_t postorder (bin_tree_t *root,
40
reg_errcode_t (fn (void *, bin_tree_t *)),
42
static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
43
static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
44
static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
46
static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
47
static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
48
static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
49
static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
50
static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
51
unsigned int constraint);
52
static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
53
static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
55
static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
56
static Idx fetch_number (re_string_t *input, re_token_t *token,
58
static int peek_token (re_token_t *token, re_string_t *input,
59
reg_syntax_t syntax) internal_function;
60
static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
61
reg_syntax_t syntax, reg_errcode_t *err);
62
static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
63
re_token_t *token, reg_syntax_t syntax,
64
Idx nest, reg_errcode_t *err);
65
static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
66
re_token_t *token, reg_syntax_t syntax,
67
Idx nest, reg_errcode_t *err);
68
static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
69
re_token_t *token, reg_syntax_t syntax,
70
Idx nest, reg_errcode_t *err);
71
static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
72
re_token_t *token, reg_syntax_t syntax,
73
Idx nest, reg_errcode_t *err);
74
static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
75
re_dfa_t *dfa, re_token_t *token,
76
reg_syntax_t syntax, reg_errcode_t *err);
77
static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
78
re_token_t *token, reg_syntax_t syntax,
80
static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
82
re_token_t *token, int token_len,
86
static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
90
static reg_errcode_t build_equiv_class (bitset_t sbcset,
92
Idx *equiv_class_alloc,
93
const unsigned char *name);
94
static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
97
Idx *char_class_alloc,
98
const unsigned char *class_name,
100
#else /* not RE_ENABLE_I18N */
101
static reg_errcode_t build_equiv_class (bitset_t sbcset,
102
const unsigned char *name);
103
static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
105
const unsigned char *class_name,
106
reg_syntax_t syntax);
107
#endif /* not RE_ENABLE_I18N */
108
static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
109
RE_TRANSLATE_TYPE trans,
110
const unsigned char *class_name,
111
const unsigned char *extra,
112
bool non_match, reg_errcode_t *err);
113
static bin_tree_t *create_tree (re_dfa_t *dfa,
114
bin_tree_t *left, bin_tree_t *right,
115
re_token_type_t type);
116
static bin_tree_t *create_token_tree (re_dfa_t *dfa,
117
bin_tree_t *left, bin_tree_t *right,
118
const re_token_t *token);
119
static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
120
static void free_token (re_token_t *node);
121
static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
122
static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
124
/* This table gives an error message for each of the error codes listed
125
in regex.h. Obviously the order here has to be same as there.
126
POSIX doesn't require that we do anything for REG_NOERROR,
127
but why not be nice? */
129
static const char __re_error_msgid[] =
131
#define REG_NOERROR_IDX 0
132
gettext_noop ("Success") /* REG_NOERROR */
134
#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
135
gettext_noop ("No match") /* REG_NOMATCH */
137
#define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
138
gettext_noop ("Invalid regular expression") /* REG_BADPAT */
140
#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
141
gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
143
#define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
144
gettext_noop ("Invalid character class name") /* REG_ECTYPE */
146
#define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
147
gettext_noop ("Trailing backslash") /* REG_EESCAPE */
149
#define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
150
gettext_noop ("Invalid back reference") /* REG_ESUBREG */
152
#define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
153
gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
155
#define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
156
gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
158
#define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
159
gettext_noop ("Unmatched \\{") /* REG_EBRACE */
161
#define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
162
gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
164
#define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
165
gettext_noop ("Invalid range end") /* REG_ERANGE */
167
#define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
168
gettext_noop ("Memory exhausted") /* REG_ESPACE */
170
#define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
171
gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
173
#define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
174
gettext_noop ("Premature end of regular expression") /* REG_EEND */
176
#define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
177
gettext_noop ("Regular expression too big") /* REG_ESIZE */
179
#define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
180
gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
183
static const size_t __re_error_msgid_idx[] =
204
/* Entry points for GNU code. */
206
/* re_compile_pattern is the GNU regular expression compiler: it
207
compiles PATTERN (of length LENGTH) and puts the result in BUFP.
208
Returns 0 if the pattern was valid, otherwise an error string.
210
Assumes the `allocated' (and perhaps `buffer') and `translate' fields
211
are set in BUFP on entry. */
215
re_compile_pattern (pattern, length, bufp)
218
struct re_pattern_buffer *bufp;
219
#else /* size_t might promote */
221
re_compile_pattern (const char *pattern, size_t length,
222
struct re_pattern_buffer *bufp)
227
/* And GNU code determines whether or not to get register information
228
by passing null for the REGS argument to re_match, etc., not by
229
setting no_sub, unless RE_NO_SUB is set. */
230
bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
232
/* Match anchors at newline. */
233
bufp->newline_anchor = 1;
235
ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
239
return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
242
weak_alias (__re_compile_pattern, re_compile_pattern)
245
/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
246
also be assigned to arbitrarily: each pattern buffer stores its own
247
syntax, so it can be changed between regex compilations. */
248
/* This has no initializer because initialized variables in Emacs
249
become read-only after dumping. */
250
reg_syntax_t re_syntax_options;
253
/* Specify the precise syntax of regexps for compilation. This provides
254
for compatibility for various utilities which historically have
255
different, incompatible syntaxes.
257
The argument SYNTAX is a bit mask comprised of the various bits
258
defined in regex.h. We return the old syntax. */
261
re_set_syntax (syntax)
264
reg_syntax_t ret = re_syntax_options;
266
re_syntax_options = syntax;
270
weak_alias (__re_set_syntax, re_set_syntax)
274
re_compile_fastmap (bufp)
275
struct re_pattern_buffer *bufp;
277
re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
278
char *fastmap = bufp->fastmap;
280
memset (fastmap, '\0', sizeof (char) * SBC_MAX);
281
re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
282
if (dfa->init_state != dfa->init_state_word)
283
re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
284
if (dfa->init_state != dfa->init_state_nl)
285
re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
286
if (dfa->init_state != dfa->init_state_begbuf)
287
re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
288
bufp->fastmap_accurate = 1;
292
weak_alias (__re_compile_fastmap, re_compile_fastmap)
296
__attribute ((always_inline))
297
re_set_fastmap (char *fastmap, bool icase, int ch)
301
fastmap[tolower (ch)] = 1;
304
/* Helper function for re_compile_fastmap.
305
Compile fastmap for the initial_state INIT_STATE. */
308
re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
311
re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
313
bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
314
for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
316
Idx node = init_state->nodes.elems[node_cnt];
317
re_token_type_t type = dfa->nodes[node].type;
319
if (type == CHARACTER)
321
re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
322
#ifdef RE_ENABLE_I18N
323
if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
325
unsigned char buf[MB_LEN_MAX];
331
*p++ = dfa->nodes[node].opr.c;
332
while (++node < dfa->nodes_len
333
&& dfa->nodes[node].type == CHARACTER
334
&& dfa->nodes[node].mb_partial)
335
*p++ = dfa->nodes[node].opr.c;
336
memset (&state, '\0', sizeof (state));
337
if (__mbrtowc (&wc, (const char *) buf, p - buf,
339
&& (__wcrtomb ((char *) buf, towlower (wc), &state)
341
re_set_fastmap (fastmap, false, buf[0]);
345
else if (type == SIMPLE_BRACKET)
348
for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
351
bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
352
for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
353
if (w & ((bitset_word_t) 1 << j))
354
re_set_fastmap (fastmap, icase, ch);
357
#ifdef RE_ENABLE_I18N
358
else if (type == COMPLEX_BRACKET)
360
re_charset_t *cset = dfa->nodes[node].opr.mbcset;
364
/* See if we have to try all bytes which start multiple collation
366
e.g. In da_DK, we want to catch 'a' since "aa" is a valid
367
collation element, and don't catch 'b' since 'b' is
368
the only collation element which starts from 'b' (and
369
it is caught by SIMPLE_BRACKET). */
370
if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
371
&& (cset->ncoll_syms || cset->nranges))
373
const int32_t *table = (const int32_t *)
374
_NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
375
for (i = 0; i < SBC_MAX; ++i)
377
re_set_fastmap (fastmap, icase, i);
381
/* See if we have to start the match at all multibyte characters,
382
i.e. where we would not find an invalid sequence. This only
383
applies to multibyte character sets; for single byte character
384
sets, the SIMPLE_BRACKET again suffices. */
385
if (dfa->mb_cur_max > 1
386
&& (cset->nchar_classes || cset->non_match
388
|| cset->nequiv_classes
396
memset (&mbs, 0, sizeof (mbs));
397
if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
398
re_set_fastmap (fastmap, false, (int) c);
405
/* ... Else catch all bytes which can start the mbchars. */
406
for (i = 0; i < cset->nmbchars; ++i)
410
memset (&state, '\0', sizeof (state));
411
if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
412
re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
413
if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
415
if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
417
re_set_fastmap (fastmap, false, *(unsigned char *) buf);
422
#endif /* RE_ENABLE_I18N */
423
else if (type == OP_PERIOD
424
#ifdef RE_ENABLE_I18N
425
|| type == OP_UTF8_PERIOD
426
#endif /* RE_ENABLE_I18N */
427
|| type == END_OF_RE)
429
memset (fastmap, '\1', sizeof (char) * SBC_MAX);
430
if (type == END_OF_RE)
431
bufp->can_be_null = 1;
437
/* Entry point for POSIX code. */
438
/* regcomp takes a regular expression as a string and compiles it.
440
PREG is a regex_t *. We do not expect any fields to be initialized,
441
since POSIX says we shouldn't. Thus, we set
443
`buffer' to the compiled pattern;
444
`used' to the length of the compiled pattern;
445
`syntax' to RE_SYNTAX_POSIX_EXTENDED if the
446
REG_EXTENDED bit in CFLAGS is set; otherwise, to
447
RE_SYNTAX_POSIX_BASIC;
448
`newline_anchor' to REG_NEWLINE being set in CFLAGS;
449
`fastmap' to an allocated space for the fastmap;
450
`fastmap_accurate' to zero;
451
`re_nsub' to the number of subexpressions in PATTERN.
453
PATTERN is the address of the pattern string.
455
CFLAGS is a series of bits which affect compilation.
457
If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
458
use POSIX basic syntax.
460
If REG_NEWLINE is set, then . and [^...] don't match newline.
461
Also, regexec will try a match beginning after every newline.
463
If REG_ICASE is set, then we considers upper- and lowercase
464
versions of letters to be equivalent when matching.
466
If REG_NOSUB is set, then when PREG is passed to regexec, that
467
routine will report only success or failure, and nothing about the
470
It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
471
the return codes and their meanings.) */
474
regcomp (preg, pattern, cflags)
475
regex_t *_Restrict_ preg;
476
const char *_Restrict_ pattern;
480
reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
481
: RE_SYNTAX_POSIX_BASIC);
487
/* Try to allocate space for the fastmap. */
488
preg->fastmap = re_malloc (char, SBC_MAX);
489
if (BE (preg->fastmap == NULL, 0))
492
syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
494
/* If REG_NEWLINE is set, newlines are treated differently. */
495
if (cflags & REG_NEWLINE)
496
{ /* REG_NEWLINE implies neither . nor [^...] match newline. */
497
syntax &= ~RE_DOT_NEWLINE;
498
syntax |= RE_HAT_LISTS_NOT_NEWLINE;
499
/* It also changes the matching behavior. */
500
preg->newline_anchor = 1;
503
preg->newline_anchor = 0;
504
preg->no_sub = !!(cflags & REG_NOSUB);
505
preg->translate = NULL;
507
ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
509
/* POSIX doesn't distinguish between an unmatched open-group and an
510
unmatched close-group: both are REG_EPAREN. */
511
if (ret == REG_ERPAREN)
514
/* We have already checked preg->fastmap != NULL. */
515
if (BE (ret == REG_NOERROR, 1))
516
/* Compute the fastmap now, since regexec cannot modify the pattern
517
buffer. This function never fails in this implementation. */
518
(void) re_compile_fastmap (preg);
521
/* Some error occurred while compiling the expression. */
522
re_free (preg->fastmap);
523
preg->fastmap = NULL;
529
weak_alias (__regcomp, regcomp)
532
/* Returns a message corresponding to an error code, ERRCODE, returned
533
from either regcomp or regexec. We don't use PREG here. */
537
regerror (errcode, preg, errbuf, errbuf_size)
539
const regex_t *_Restrict_ preg;
540
char *_Restrict_ errbuf;
542
#else /* size_t might promote */
544
regerror (int errcode, const regex_t *_Restrict_ preg,
545
char *_Restrict_ errbuf, size_t errbuf_size)
552
|| errcode >= (int) (sizeof (__re_error_msgid_idx)
553
/ sizeof (__re_error_msgid_idx[0])), 0))
554
/* Only error codes returned by the rest of the code should be passed
555
to this routine. If we are given anything else, or if other regex
556
code generates an invalid error code, then the program has a bug.
557
Dump core so we can fix it. */
560
msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
562
msg_size = strlen (msg) + 1; /* Includes the null. */
564
if (BE (errbuf_size != 0, 1))
566
size_t cpy_size = msg_size;
567
if (BE (msg_size > errbuf_size, 0))
569
cpy_size = errbuf_size - 1;
570
errbuf[cpy_size] = '\0';
572
memcpy (errbuf, msg, cpy_size);
578
weak_alias (__regerror, regerror)
582
#ifdef RE_ENABLE_I18N
583
/* This static array is used for the map to single-byte characters when
584
UTF-8 is used. Otherwise we would allocate memory just to initialize
585
it the same all the time. UTF-8 is the preferred encoding so this is
586
a worthwhile optimization. */
587
static const bitset_t utf8_sb_map =
589
/* Set the first 128 bits. */
590
# if 4 * BITSET_WORD_BITS < ASCII_CHARS
591
# error "bitset_word_t is narrower than 32 bits"
592
# elif 3 * BITSET_WORD_BITS < ASCII_CHARS
593
BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
594
# elif 2 * BITSET_WORD_BITS < ASCII_CHARS
595
BITSET_WORD_MAX, BITSET_WORD_MAX,
596
# elif 1 * BITSET_WORD_BITS < ASCII_CHARS
600
>> (SBC_MAX % BITSET_WORD_BITS == 0
602
: BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
608
free_dfa_content (re_dfa_t *dfa)
613
for (i = 0; i < dfa->nodes_len; ++i)
614
free_token (dfa->nodes + i);
615
re_free (dfa->nexts);
616
for (i = 0; i < dfa->nodes_len; ++i)
618
if (dfa->eclosures != NULL)
619
re_node_set_free (dfa->eclosures + i);
620
if (dfa->inveclosures != NULL)
621
re_node_set_free (dfa->inveclosures + i);
622
if (dfa->edests != NULL)
623
re_node_set_free (dfa->edests + i);
625
re_free (dfa->edests);
626
re_free (dfa->eclosures);
627
re_free (dfa->inveclosures);
628
re_free (dfa->nodes);
630
if (dfa->state_table)
631
for (i = 0; i <= dfa->state_hash_mask; ++i)
633
struct re_state_table_entry *entry = dfa->state_table + i;
634
for (j = 0; j < entry->num; ++j)
636
re_dfastate_t *state = entry->array[j];
639
re_free (entry->array);
641
re_free (dfa->state_table);
642
#ifdef RE_ENABLE_I18N
643
if (dfa->sb_char != utf8_sb_map)
644
re_free (dfa->sb_char);
646
re_free (dfa->subexp_map);
648
re_free (dfa->re_str);
655
/* Free dynamically allocated space used by PREG. */
661
re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
662
if (BE (dfa != NULL, 1))
663
free_dfa_content (dfa);
667
re_free (preg->fastmap);
668
preg->fastmap = NULL;
670
re_free (preg->translate);
671
preg->translate = NULL;
674
weak_alias (__regfree, regfree)
677
/* Entry points compatible with 4.2 BSD regex library. We don't define
678
them unless specifically requested. */
680
#if defined _REGEX_RE_COMP || defined _LIBC
682
/* BSD has one and only one pattern buffer. */
683
static struct re_pattern_buffer re_comp_buf;
687
/* Make these definitions weak in libc, so POSIX programs can redefine
688
these names if they don't use our functions, and still use
689
regcomp/regexec above without link errors. */
700
if (!re_comp_buf.buffer)
701
return gettext ("No previous regular expression");
705
if (re_comp_buf.buffer)
707
fastmap = re_comp_buf.fastmap;
708
re_comp_buf.fastmap = NULL;
709
__regfree (&re_comp_buf);
710
memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
711
re_comp_buf.fastmap = fastmap;
714
if (re_comp_buf.fastmap == NULL)
716
re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
717
if (re_comp_buf.fastmap == NULL)
718
return (char *) gettext (__re_error_msgid
719
+ __re_error_msgid_idx[(int) REG_ESPACE]);
722
/* Since `re_exec' always passes NULL for the `regs' argument, we
723
don't need to initialize the pattern buffer fields which affect it. */
725
/* Match anchors at newlines. */
726
re_comp_buf.newline_anchor = 1;
728
ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
733
/* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
734
return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
738
libc_freeres_fn (free_mem)
740
__regfree (&re_comp_buf);
744
#endif /* _REGEX_RE_COMP */
746
/* Internal entry point.
747
Compile the regular expression PATTERN, whose length is LENGTH.
748
SYNTAX indicate regular expression's syntax. */
751
re_compile_internal (regex_t *preg, const char * pattern, size_t length,
754
reg_errcode_t err = REG_NOERROR;
758
/* Initialize the pattern buffer. */
759
preg->fastmap_accurate = 0;
760
preg->syntax = syntax;
761
preg->not_bol = preg->not_eol = 0;
764
preg->can_be_null = 0;
765
preg->regs_allocated = REGS_UNALLOCATED;
767
/* Initialize the dfa. */
768
dfa = (re_dfa_t *) preg->buffer;
769
if (BE (preg->allocated < sizeof (re_dfa_t), 0))
771
/* If zero allocated, but buffer is non-null, try to realloc
772
enough space. This loses if buffer's address is bogus, but
773
that is the user's responsibility. If ->buffer is NULL this
774
is a simple allocation. */
775
dfa = re_realloc (preg->buffer, re_dfa_t, 1);
778
preg->allocated = sizeof (re_dfa_t);
779
preg->buffer = (unsigned char *) dfa;
781
preg->used = sizeof (re_dfa_t);
783
err = init_dfa (dfa, length);
784
if (BE (err != REG_NOERROR, 0))
786
free_dfa_content (dfa);
792
/* Note: length+1 will not overflow since it is checked in init_dfa. */
793
dfa->re_str = re_malloc (char, length + 1);
794
strncpy (dfa->re_str, pattern, length + 1);
797
__libc_lock_init (dfa->lock);
799
err = re_string_construct (®exp, pattern, length, preg->translate,
800
(syntax & RE_ICASE) != 0, dfa);
801
if (BE (err != REG_NOERROR, 0))
803
re_compile_internal_free_return:
804
free_workarea_compile (preg);
805
re_string_destruct (®exp);
806
free_dfa_content (dfa);
812
/* Parse the regular expression, and build a structure tree. */
814
dfa->str_tree = parse (®exp, preg, syntax, &err);
815
if (BE (dfa->str_tree == NULL, 0))
816
goto re_compile_internal_free_return;
818
/* Analyze the tree and create the nfa. */
819
err = analyze (preg);
820
if (BE (err != REG_NOERROR, 0))
821
goto re_compile_internal_free_return;
823
#ifdef RE_ENABLE_I18N
824
/* If possible, do searching in single byte encoding to speed things up. */
825
if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
829
/* Then create the initial state of the dfa. */
830
err = create_initial_state (dfa);
832
/* Release work areas. */
833
free_workarea_compile (preg);
834
re_string_destruct (®exp);
836
if (BE (err != REG_NOERROR, 0))
838
free_dfa_content (dfa);
846
/* Initialize DFA. We use the length of the regular expression PAT_LEN
847
as the initial length of some arrays. */
850
init_dfa (re_dfa_t *dfa, size_t pat_len)
852
__re_size_t table_size;
853
#ifdef RE_ENABLE_I18N
854
size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
856
size_t max_i18n_object_size = 0;
858
size_t max_object_size =
859
MAX (sizeof (struct re_state_table_entry),
860
MAX (sizeof (re_token_t),
861
MAX (sizeof (re_node_set),
862
MAX (sizeof (regmatch_t),
863
max_i18n_object_size))));
865
memset (dfa, '\0', sizeof (re_dfa_t));
867
/* Force allocation of str_tree_storage the first time. */
868
dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
870
/* Avoid overflows. The extra "/ 2" is for the table_size doubling
871
calculation below, and for similar doubling calculations
872
elsewhere. And it's <= rather than <, because some of the
873
doubling calculations add 1 afterwards. */
874
if (BE (SIZE_MAX / max_object_size / 2 <= pat_len, 0))
877
dfa->nodes_alloc = pat_len + 1;
878
dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
880
/* table_size = 2 ^ ceil(log pat_len) */
881
for (table_size = 1; ; table_size <<= 1)
882
if (table_size > pat_len)
885
dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
886
dfa->state_hash_mask = table_size - 1;
888
dfa->mb_cur_max = MB_CUR_MAX;
890
if (dfa->mb_cur_max == 6
891
&& strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
893
dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
896
if (strcmp (locale_charset (), "UTF-8") == 0)
899
/* We check exhaustively in the loop below if this charset is a
900
superset of ASCII. */
901
dfa->map_notascii = 0;
904
#ifdef RE_ENABLE_I18N
905
if (dfa->mb_cur_max > 1)
908
dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
913
dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
914
if (BE (dfa->sb_char == NULL, 0))
917
/* Set the bits corresponding to single byte chars. */
918
for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
919
for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
921
wint_t wch = __btowc (ch);
923
dfa->sb_char[i] |= (bitset_word_t) 1 << j;
925
if (isascii (ch) && wch != ch)
926
dfa->map_notascii = 1;
933
if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
938
/* Initialize WORD_CHAR table, which indicate which character is
939
"word". In this case "word" means that it is the word construction
940
character used by some operators like "\<", "\>", etc. */
944
init_word_char (re_dfa_t *dfa)
947
dfa->word_ops_used = 1;
948
for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
949
for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
950
if (isalnum (ch) || ch == '_')
951
dfa->word_char[i] |= (bitset_word_t) 1 << j;
954
/* Free the work area which are only used while compiling. */
957
free_workarea_compile (regex_t *preg)
959
re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
960
bin_tree_storage_t *storage, *next;
961
for (storage = dfa->str_tree_storage; storage; storage = next)
963
next = storage->next;
966
dfa->str_tree_storage = NULL;
967
dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
968
dfa->str_tree = NULL;
969
re_free (dfa->org_indices);
970
dfa->org_indices = NULL;
973
/* Create initial states for all contexts. */
976
create_initial_state (re_dfa_t *dfa)
980
re_node_set init_nodes;
982
/* Initial states have the epsilon closure of the node which is
983
the first node of the regular expression. */
984
first = dfa->str_tree->first->node_idx;
985
dfa->init_node = first;
986
err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
987
if (BE (err != REG_NOERROR, 0))
990
/* The back-references which are in initial states can epsilon transit,
991
since in this case all of the subexpressions can be null.
992
Then we add epsilon closures of the nodes which are the next nodes of
993
the back-references. */
994
if (dfa->nbackref > 0)
995
for (i = 0; i < init_nodes.nelem; ++i)
997
Idx node_idx = init_nodes.elems[i];
998
re_token_type_t type = dfa->nodes[node_idx].type;
1001
if (type != OP_BACK_REF)
1003
for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1005
re_token_t *clexp_node;
1006
clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1007
if (clexp_node->type == OP_CLOSE_SUBEXP
1008
&& clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1011
if (clexp_idx == init_nodes.nelem)
1014
if (type == OP_BACK_REF)
1016
Idx dest_idx = dfa->edests[node_idx].elems[0];
1017
if (!re_node_set_contains (&init_nodes, dest_idx))
1019
re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1025
/* It must be the first time to invoke acquire_state. */
1026
dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1027
/* We don't check ERR here, since the initial state must not be NULL. */
1028
if (BE (dfa->init_state == NULL, 0))
1030
if (dfa->init_state->has_constraint)
1032
dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1034
dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1036
dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1040
if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1041
|| dfa->init_state_begbuf == NULL, 0))
1045
dfa->init_state_word = dfa->init_state_nl
1046
= dfa->init_state_begbuf = dfa->init_state;
1048
re_node_set_free (&init_nodes);
1052
#ifdef RE_ENABLE_I18N
1053
/* If it is possible to do searching in single byte encoding instead of UTF-8
1054
to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1055
DFA nodes where needed. */
1058
optimize_utf8 (re_dfa_t *dfa)
1062
bool mb_chars = false;
1063
bool has_period = false;
1065
for (node = 0; node < dfa->nodes_len; ++node)
1066
switch (dfa->nodes[node].type)
1069
if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1073
switch (dfa->nodes[node].opr.ctx_type)
1081
/* Word anchors etc. cannot be handled. It's okay to test
1082
opr.ctx_type since constraints (for all DFA nodes) are
1083
created by ORing one or more opr.ctx_type values. */
1093
case OP_DUP_ASTERISK:
1094
case OP_OPEN_SUBEXP:
1095
case OP_CLOSE_SUBEXP:
1097
case COMPLEX_BRACKET:
1099
case SIMPLE_BRACKET:
1100
/* Just double check. */
1102
int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1104
: BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1105
for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1107
if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1117
if (mb_chars || has_period)
1118
for (node = 0; node < dfa->nodes_len; ++node)
1120
if (dfa->nodes[node].type == CHARACTER
1121
&& dfa->nodes[node].opr.c >= ASCII_CHARS)
1122
dfa->nodes[node].mb_partial = 0;
1123
else if (dfa->nodes[node].type == OP_PERIOD)
1124
dfa->nodes[node].type = OP_UTF8_PERIOD;
1127
/* The search can be in single byte locale. */
1128
dfa->mb_cur_max = 1;
1130
dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1134
/* Analyze the structure tree, and calculate "first", "next", "edest",
1135
"eclosure", and "inveclosure". */
1137
static reg_errcode_t
1138
analyze (regex_t *preg)
1140
re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1143
/* Allocate arrays. */
1144
dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1145
dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1146
dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1147
dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1148
if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1149
|| dfa->eclosures == NULL, 0))
1152
dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1153
if (dfa->subexp_map != NULL)
1156
for (i = 0; i < preg->re_nsub; i++)
1157
dfa->subexp_map[i] = i;
1158
preorder (dfa->str_tree, optimize_subexps, dfa);
1159
for (i = 0; i < preg->re_nsub; i++)
1160
if (dfa->subexp_map[i] != i)
1162
if (i == preg->re_nsub)
1164
free (dfa->subexp_map);
1165
dfa->subexp_map = NULL;
1169
ret = postorder (dfa->str_tree, lower_subexps, preg);
1170
if (BE (ret != REG_NOERROR, 0))
1172
ret = postorder (dfa->str_tree, calc_first, dfa);
1173
if (BE (ret != REG_NOERROR, 0))
1175
preorder (dfa->str_tree, calc_next, dfa);
1176
ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1177
if (BE (ret != REG_NOERROR, 0))
1179
ret = calc_eclosure (dfa);
1180
if (BE (ret != REG_NOERROR, 0))
1183
/* We only need this during the prune_impossible_nodes pass in regexec.c;
1184
skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1185
if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1188
dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1189
if (BE (dfa->inveclosures == NULL, 0))
1191
ret = calc_inveclosure (dfa);
1197
/* Our parse trees are very unbalanced, so we cannot use a stack to
1198
implement parse tree visits. Instead, we use parent pointers and
1199
some hairy code in these two functions. */
1200
static reg_errcode_t
1201
postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1204
bin_tree_t *node, *prev;
1206
for (node = root; ; )
1208
/* Descend down the tree, preferably to the left (or to the right
1209
if that's the only child). */
1210
while (node->left || node->right)
1218
reg_errcode_t err = fn (extra, node);
1219
if (BE (err != REG_NOERROR, 0))
1221
if (node->parent == NULL)
1224
node = node->parent;
1226
/* Go up while we have a node that is reached from the right. */
1227
while (node->right == prev || node->right == NULL);
1232
static reg_errcode_t
1233
preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1238
for (node = root; ; )
1240
reg_errcode_t err = fn (extra, node);
1241
if (BE (err != REG_NOERROR, 0))
1244
/* Go to the left node, or up and to the right. */
1249
bin_tree_t *prev = NULL;
1250
while (node->right == prev || node->right == NULL)
1253
node = node->parent;
1262
/* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1263
re_search_internal to map the inner one's opr.idx to this one's. Adjust
1264
backreferences as well. Requires a preorder visit. */
1265
static reg_errcode_t
1266
optimize_subexps (void *extra, bin_tree_t *node)
1268
re_dfa_t *dfa = (re_dfa_t *) extra;
1270
if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1272
int idx = node->token.opr.idx;
1273
node->token.opr.idx = dfa->subexp_map[idx];
1274
dfa->used_bkref_map |= 1 << node->token.opr.idx;
1277
else if (node->token.type == SUBEXP
1278
&& node->left && node->left->token.type == SUBEXP)
1280
Idx other_idx = node->left->token.opr.idx;
1282
node->left = node->left->left;
1284
node->left->parent = node;
1286
dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1287
if (other_idx < BITSET_WORD_BITS)
1288
dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1294
/* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1295
of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1296
static reg_errcode_t
1297
lower_subexps (void *extra, bin_tree_t *node)
1299
regex_t *preg = (regex_t *) extra;
1300
reg_errcode_t err = REG_NOERROR;
1302
if (node->left && node->left->token.type == SUBEXP)
1304
node->left = lower_subexp (&err, preg, node->left);
1306
node->left->parent = node;
1308
if (node->right && node->right->token.type == SUBEXP)
1310
node->right = lower_subexp (&err, preg, node->right);
1312
node->right->parent = node;
1319
lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1321
re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1322
bin_tree_t *body = node->left;
1323
bin_tree_t *op, *cls, *tree1, *tree;
1326
/* We do not optimize empty subexpressions, because otherwise we may
1327
have bad CONCAT nodes with NULL children. This is obviously not
1328
very common, so we do not lose much. An example that triggers
1329
this case is the sed "script" /\(\)/x. */
1330
&& node->left != NULL
1331
&& (node->token.opr.idx >= BITSET_WORD_BITS
1332
|| !(dfa->used_bkref_map
1333
& ((bitset_word_t) 1 << node->token.opr.idx))))
1336
/* Convert the SUBEXP node to the concatenation of an
1337
OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1338
op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1339
cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1340
tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1341
tree = create_tree (dfa, op, tree1, CONCAT);
1342
if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1348
op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1349
op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1353
/* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1354
nodes. Requires a postorder visit. */
1355
static reg_errcode_t
1356
calc_first (void *extra, bin_tree_t *node)
1358
re_dfa_t *dfa = (re_dfa_t *) extra;
1359
if (node->token.type == CONCAT)
1361
node->first = node->left->first;
1362
node->node_idx = node->left->node_idx;
1367
node->node_idx = re_dfa_add_node (dfa, node->token);
1368
if (BE (node->node_idx == REG_MISSING, 0))
1370
if (node->token.type == ANCHOR)
1371
dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1376
/* Pass 2: compute NEXT on the tree. Preorder visit. */
1377
static reg_errcode_t
1378
calc_next (void *extra, bin_tree_t *node)
1380
switch (node->token.type)
1382
case OP_DUP_ASTERISK:
1383
node->left->next = node;
1386
node->left->next = node->right->first;
1387
node->right->next = node->next;
1391
node->left->next = node->next;
1393
node->right->next = node->next;
1399
/* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1400
static reg_errcode_t
1401
link_nfa_nodes (void *extra, bin_tree_t *node)
1403
re_dfa_t *dfa = (re_dfa_t *) extra;
1404
Idx idx = node->node_idx;
1405
reg_errcode_t err = REG_NOERROR;
1407
switch (node->token.type)
1413
assert (node->next == NULL);
1416
case OP_DUP_ASTERISK:
1420
dfa->has_plural_match = 1;
1421
if (node->left != NULL)
1422
left = node->left->first->node_idx;
1424
left = node->next->node_idx;
1425
if (node->right != NULL)
1426
right = node->right->first->node_idx;
1428
right = node->next->node_idx;
1429
assert (REG_VALID_INDEX (left));
1430
assert (REG_VALID_INDEX (right));
1431
err = re_node_set_init_2 (dfa->edests + idx, left, right);
1436
case OP_OPEN_SUBEXP:
1437
case OP_CLOSE_SUBEXP:
1438
err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1442
dfa->nexts[idx] = node->next->node_idx;
1443
if (node->token.type == OP_BACK_REF)
1444
re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1448
assert (!IS_EPSILON_NODE (node->token.type));
1449
dfa->nexts[idx] = node->next->node_idx;
1456
/* Duplicate the epsilon closure of the node ROOT_NODE.
1457
Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1458
to their own constraint. */
1460
static reg_errcode_t
1462
duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1463
Idx root_node, unsigned int init_constraint)
1465
Idx org_node, clone_node;
1467
unsigned int constraint = init_constraint;
1468
for (org_node = top_org_node, clone_node = top_clone_node;;)
1470
Idx org_dest, clone_dest;
1471
if (dfa->nodes[org_node].type == OP_BACK_REF)
1473
/* If the back reference epsilon-transit, its destination must
1474
also have the constraint. Then duplicate the epsilon closure
1475
of the destination of the back reference, and store it in
1476
edests of the back reference. */
1477
org_dest = dfa->nexts[org_node];
1478
re_node_set_empty (dfa->edests + clone_node);
1479
clone_dest = duplicate_node (dfa, org_dest, constraint);
1480
if (BE (clone_dest == REG_MISSING, 0))
1482
dfa->nexts[clone_node] = dfa->nexts[org_node];
1483
ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1487
else if (dfa->edests[org_node].nelem == 0)
1489
/* In case of the node can't epsilon-transit, don't duplicate the
1490
destination and store the original destination as the
1491
destination of the node. */
1492
dfa->nexts[clone_node] = dfa->nexts[org_node];
1495
else if (dfa->edests[org_node].nelem == 1)
1497
/* In case of the node can epsilon-transit, and it has only one
1499
org_dest = dfa->edests[org_node].elems[0];
1500
re_node_set_empty (dfa->edests + clone_node);
1501
clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1502
/* If the node is root_node itself, it means the epsilon closure
1503
has a loop. Then tie it to the destination of the root_node. */
1504
if (org_node == root_node && clone_node != org_node)
1506
ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1511
/* In case the node has another constraint, append it. */
1512
constraint |= dfa->nodes[org_node].constraint;
1513
clone_dest = duplicate_node (dfa, org_dest, constraint);
1514
if (BE (clone_dest == REG_MISSING, 0))
1516
ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1520
else /* dfa->edests[org_node].nelem == 2 */
1522
/* In case of the node can epsilon-transit, and it has two
1523
destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1524
org_dest = dfa->edests[org_node].elems[0];
1525
re_node_set_empty (dfa->edests + clone_node);
1526
/* Search for a duplicated node which satisfies the constraint. */
1527
clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1528
if (clone_dest == REG_MISSING)
1530
/* There is no such duplicated node, create a new one. */
1532
clone_dest = duplicate_node (dfa, org_dest, constraint);
1533
if (BE (clone_dest == REG_MISSING, 0))
1535
ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1538
err = duplicate_node_closure (dfa, org_dest, clone_dest,
1539
root_node, constraint);
1540
if (BE (err != REG_NOERROR, 0))
1545
/* There is a duplicated node which satisfy the constraint,
1546
use it to avoid infinite loop. */
1547
ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1552
org_dest = dfa->edests[org_node].elems[1];
1553
clone_dest = duplicate_node (dfa, org_dest, constraint);
1554
if (BE (clone_dest == REG_MISSING, 0))
1556
ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1560
org_node = org_dest;
1561
clone_node = clone_dest;
1566
/* Search for a node which is duplicated from the node ORG_NODE, and
1567
satisfies the constraint CONSTRAINT. */
1570
search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1571
unsigned int constraint)
1574
for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1576
if (org_node == dfa->org_indices[idx]
1577
&& constraint == dfa->nodes[idx].constraint)
1578
return idx; /* Found. */
1580
return REG_MISSING; /* Not found. */
1583
/* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1584
Return the index of the new node, or REG_MISSING if insufficient storage is
1588
duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1590
Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1591
if (BE (dup_idx != REG_MISSING, 1))
1593
dfa->nodes[dup_idx].constraint = constraint;
1594
dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1595
dfa->nodes[dup_idx].duplicated = 1;
1597
/* Store the index of the original node. */
1598
dfa->org_indices[dup_idx] = org_idx;
1603
static reg_errcode_t
1604
calc_inveclosure (re_dfa_t *dfa)
1608
for (idx = 0; idx < dfa->nodes_len; ++idx)
1609
re_node_set_init_empty (dfa->inveclosures + idx);
1611
for (src = 0; src < dfa->nodes_len; ++src)
1613
Idx *elems = dfa->eclosures[src].elems;
1614
for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1616
ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1625
/* Calculate "eclosure" for all the node in DFA. */
1627
static reg_errcode_t
1628
calc_eclosure (re_dfa_t *dfa)
1633
assert (dfa->nodes_len > 0);
1636
/* For each nodes, calculate epsilon closure. */
1637
for (node_idx = 0; ; ++node_idx)
1640
re_node_set eclosure_elem;
1641
if (node_idx == dfa->nodes_len)
1650
assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
1653
/* If we have already calculated, skip it. */
1654
if (dfa->eclosures[node_idx].nelem != 0)
1656
/* Calculate epsilon closure of `node_idx'. */
1657
err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1658
if (BE (err != REG_NOERROR, 0))
1661
if (dfa->eclosures[node_idx].nelem == 0)
1664
re_node_set_free (&eclosure_elem);
1670
/* Calculate epsilon closure of NODE. */
1672
static reg_errcode_t
1673
calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1679
re_node_set eclosure;
1681
err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1682
if (BE (err != REG_NOERROR, 0))
1685
/* This indicates that we are calculating this node now.
1686
We reference this value to avoid infinite loop. */
1687
dfa->eclosures[node].nelem = REG_MISSING;
1689
/* If the current node has constraints, duplicate all nodes
1690
since they must inherit the constraints. */
1691
if (dfa->nodes[node].constraint
1692
&& dfa->edests[node].nelem
1693
&& !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1695
err = duplicate_node_closure (dfa, node, node, node,
1696
dfa->nodes[node].constraint);
1697
if (BE (err != REG_NOERROR, 0))
1701
/* Expand each epsilon destination nodes. */
1702
if (IS_EPSILON_NODE(dfa->nodes[node].type))
1703
for (i = 0; i < dfa->edests[node].nelem; ++i)
1705
re_node_set eclosure_elem;
1706
Idx edest = dfa->edests[node].elems[i];
1707
/* If calculating the epsilon closure of `edest' is in progress,
1708
return intermediate result. */
1709
if (dfa->eclosures[edest].nelem == REG_MISSING)
1714
/* If we haven't calculated the epsilon closure of `edest' yet,
1715
calculate now. Otherwise use calculated epsilon closure. */
1716
if (dfa->eclosures[edest].nelem == 0)
1718
err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1719
if (BE (err != REG_NOERROR, 0))
1723
eclosure_elem = dfa->eclosures[edest];
1724
/* Merge the epsilon closure of `edest'. */
1725
re_node_set_merge (&eclosure, &eclosure_elem);
1726
/* If the epsilon closure of `edest' is incomplete,
1727
the epsilon closure of this node is also incomplete. */
1728
if (dfa->eclosures[edest].nelem == 0)
1731
re_node_set_free (&eclosure_elem);
1735
/* Epsilon closures include itself. */
1736
ok = re_node_set_insert (&eclosure, node);
1739
if (incomplete && !root)
1740
dfa->eclosures[node].nelem = 0;
1742
dfa->eclosures[node] = eclosure;
1743
*new_set = eclosure;
1747
/* Functions for token which are used in the parser. */
1749
/* Fetch a token from INPUT.
1750
We must not use this function inside bracket expressions. */
1754
fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1756
re_string_skip_bytes (input, peek_token (result, input, syntax));
1759
/* Peek a token from INPUT, and return the length of the token.
1760
We must not use this function inside bracket expressions. */
1764
peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1768
if (re_string_eoi (input))
1770
token->type = END_OF_RE;
1774
c = re_string_peek_byte (input, 0);
1777
token->word_char = 0;
1778
#ifdef RE_ENABLE_I18N
1779
token->mb_partial = 0;
1780
if (input->mb_cur_max > 1 &&
1781
!re_string_first_byte (input, re_string_cur_idx (input)))
1783
token->type = CHARACTER;
1784
token->mb_partial = 1;
1791
if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1793
token->type = BACK_SLASH;
1797
c2 = re_string_peek_byte_case (input, 1);
1799
token->type = CHARACTER;
1800
#ifdef RE_ENABLE_I18N
1801
if (input->mb_cur_max > 1)
1803
wint_t wc = re_string_wchar_at (input,
1804
re_string_cur_idx (input) + 1);
1805
token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1809
token->word_char = IS_WORD_CHAR (c2) != 0;
1814
if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1815
token->type = OP_ALT;
1817
case '1': case '2': case '3': case '4': case '5':
1818
case '6': case '7': case '8': case '9':
1819
if (!(syntax & RE_NO_BK_REFS))
1821
token->type = OP_BACK_REF;
1822
token->opr.idx = c2 - '1';
1826
if (!(syntax & RE_NO_GNU_OPS))
1828
token->type = ANCHOR;
1829
token->opr.ctx_type = WORD_FIRST;
1833
if (!(syntax & RE_NO_GNU_OPS))
1835
token->type = ANCHOR;
1836
token->opr.ctx_type = WORD_LAST;
1840
if (!(syntax & RE_NO_GNU_OPS))
1842
token->type = ANCHOR;
1843
token->opr.ctx_type = WORD_DELIM;
1847
if (!(syntax & RE_NO_GNU_OPS))
1849
token->type = ANCHOR;
1850
token->opr.ctx_type = NOT_WORD_DELIM;
1854
if (!(syntax & RE_NO_GNU_OPS))
1855
token->type = OP_WORD;
1858
if (!(syntax & RE_NO_GNU_OPS))
1859
token->type = OP_NOTWORD;
1862
if (!(syntax & RE_NO_GNU_OPS))
1863
token->type = OP_SPACE;
1866
if (!(syntax & RE_NO_GNU_OPS))
1867
token->type = OP_NOTSPACE;
1870
if (!(syntax & RE_NO_GNU_OPS))
1872
token->type = ANCHOR;
1873
token->opr.ctx_type = BUF_FIRST;
1877
if (!(syntax & RE_NO_GNU_OPS))
1879
token->type = ANCHOR;
1880
token->opr.ctx_type = BUF_LAST;
1884
if (!(syntax & RE_NO_BK_PARENS))
1885
token->type = OP_OPEN_SUBEXP;
1888
if (!(syntax & RE_NO_BK_PARENS))
1889
token->type = OP_CLOSE_SUBEXP;
1892
if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1893
token->type = OP_DUP_PLUS;
1896
if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1897
token->type = OP_DUP_QUESTION;
1900
if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1901
token->type = OP_OPEN_DUP_NUM;
1904
if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1905
token->type = OP_CLOSE_DUP_NUM;
1913
token->type = CHARACTER;
1914
#ifdef RE_ENABLE_I18N
1915
if (input->mb_cur_max > 1)
1917
wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1918
token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1922
token->word_char = IS_WORD_CHAR (token->opr.c);
1927
if (syntax & RE_NEWLINE_ALT)
1928
token->type = OP_ALT;
1931
if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1932
token->type = OP_ALT;
1935
token->type = OP_DUP_ASTERISK;
1938
if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1939
token->type = OP_DUP_PLUS;
1942
if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1943
token->type = OP_DUP_QUESTION;
1946
if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1947
token->type = OP_OPEN_DUP_NUM;
1950
if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1951
token->type = OP_CLOSE_DUP_NUM;
1954
if (syntax & RE_NO_BK_PARENS)
1955
token->type = OP_OPEN_SUBEXP;
1958
if (syntax & RE_NO_BK_PARENS)
1959
token->type = OP_CLOSE_SUBEXP;
1962
token->type = OP_OPEN_BRACKET;
1965
token->type = OP_PERIOD;
1968
if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1969
re_string_cur_idx (input) != 0)
1971
char prev = re_string_peek_byte (input, -1);
1972
if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1975
token->type = ANCHOR;
1976
token->opr.ctx_type = LINE_FIRST;
1979
if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1980
re_string_cur_idx (input) + 1 != re_string_length (input))
1983
re_string_skip_bytes (input, 1);
1984
peek_token (&next, input, syntax);
1985
re_string_skip_bytes (input, -1);
1986
if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1989
token->type = ANCHOR;
1990
token->opr.ctx_type = LINE_LAST;
1998
/* Peek a token from INPUT, and return the length of the token.
1999
We must not use this function out of bracket expressions. */
2003
peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2006
if (re_string_eoi (input))
2008
token->type = END_OF_RE;
2011
c = re_string_peek_byte (input, 0);
2014
#ifdef RE_ENABLE_I18N
2015
if (input->mb_cur_max > 1 &&
2016
!re_string_first_byte (input, re_string_cur_idx (input)))
2018
token->type = CHARACTER;
2021
#endif /* RE_ENABLE_I18N */
2023
if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2024
&& re_string_cur_idx (input) + 1 < re_string_length (input))
2026
/* In this case, '\' escape a character. */
2028
re_string_skip_bytes (input, 1);
2029
c2 = re_string_peek_byte (input, 0);
2031
token->type = CHARACTER;
2034
if (c == '[') /* '[' is a special char in a bracket exps. */
2038
if (re_string_cur_idx (input) + 1 < re_string_length (input))
2039
c2 = re_string_peek_byte (input, 1);
2047
token->type = OP_OPEN_COLL_ELEM;
2050
token->type = OP_OPEN_EQUIV_CLASS;
2053
if (syntax & RE_CHAR_CLASSES)
2055
token->type = OP_OPEN_CHAR_CLASS;
2058
/* else fall through. */
2060
token->type = CHARACTER;
2070
token->type = OP_CHARSET_RANGE;
2073
token->type = OP_CLOSE_BRACKET;
2076
token->type = OP_NON_MATCH_LIST;
2079
token->type = CHARACTER;
2084
/* Functions for parser. */
2086
/* Entry point of the parser.
2087
Parse the regular expression REGEXP and return the structure tree.
2088
If an error is occured, ERR is set by error code, and return NULL.
2089
This function build the following tree, from regular expression <reg_exp>:
2095
CAT means concatenation.
2096
EOR means end of regular expression. */
2099
parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2102
re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2103
bin_tree_t *tree, *eor, *root;
2104
re_token_t current_token;
2105
dfa->syntax = syntax;
2106
fetch_token (¤t_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2107
tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err);
2108
if (BE (*err != REG_NOERROR && tree == NULL, 0))
2110
eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2112
root = create_tree (dfa, tree, eor, CONCAT);
2115
if (BE (eor == NULL || root == NULL, 0))
2123
/* This function build the following tree, from regular expression
2124
<branch1>|<branch2>:
2130
ALT means alternative, which represents the operator `|'. */
2133
parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2134
reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2136
re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2137
bin_tree_t *tree, *branch = NULL;
2138
tree = parse_branch (regexp, preg, token, syntax, nest, err);
2139
if (BE (*err != REG_NOERROR && tree == NULL, 0))
2142
while (token->type == OP_ALT)
2144
fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2145
if (token->type != OP_ALT && token->type != END_OF_RE
2146
&& (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2148
branch = parse_branch (regexp, preg, token, syntax, nest, err);
2149
if (BE (*err != REG_NOERROR && branch == NULL, 0))
2154
tree = create_tree (dfa, tree, branch, OP_ALT);
2155
if (BE (tree == NULL, 0))
2164
/* This function build the following tree, from regular expression
2171
CAT means concatenation. */
2174
parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2175
reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2177
bin_tree_t *tree, *expr;
2178
re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2179
tree = parse_expression (regexp, preg, token, syntax, nest, err);
2180
if (BE (*err != REG_NOERROR && tree == NULL, 0))
2183
while (token->type != OP_ALT && token->type != END_OF_RE
2184
&& (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2186
expr = parse_expression (regexp, preg, token, syntax, nest, err);
2187
if (BE (*err != REG_NOERROR && expr == NULL, 0))
2191
if (tree != NULL && expr != NULL)
2193
tree = create_tree (dfa, tree, expr, CONCAT);
2200
else if (tree == NULL)
2202
/* Otherwise expr == NULL, we don't need to create new tree. */
2207
/* This function build the following tree, from regular expression a*:
2214
parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2215
reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2217
re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2219
switch (token->type)
2222
tree = create_token_tree (dfa, NULL, NULL, token);
2223
if (BE (tree == NULL, 0))
2228
#ifdef RE_ENABLE_I18N
2229
if (dfa->mb_cur_max > 1)
2231
while (!re_string_eoi (regexp)
2232
&& !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2234
bin_tree_t *mbc_remain;
2235
fetch_token (token, regexp, syntax);
2236
mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2237
tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2238
if (BE (mbc_remain == NULL || tree == NULL, 0))
2247
case OP_OPEN_SUBEXP:
2248
tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2249
if (BE (*err != REG_NOERROR && tree == NULL, 0))
2252
case OP_OPEN_BRACKET:
2253
tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2254
if (BE (*err != REG_NOERROR && tree == NULL, 0))
2258
if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2263
dfa->used_bkref_map |= 1 << token->opr.idx;
2264
tree = create_token_tree (dfa, NULL, NULL, token);
2265
if (BE (tree == NULL, 0))
2271
dfa->has_mb_node = 1;
2273
case OP_OPEN_DUP_NUM:
2274
if (syntax & RE_CONTEXT_INVALID_DUP)
2280
case OP_DUP_ASTERISK:
2282
case OP_DUP_QUESTION:
2283
if (syntax & RE_CONTEXT_INVALID_OPS)
2288
else if (syntax & RE_CONTEXT_INDEP_OPS)
2290
fetch_token (token, regexp, syntax);
2291
return parse_expression (regexp, preg, token, syntax, nest, err);
2293
/* else fall through */
2294
case OP_CLOSE_SUBEXP:
2295
if ((token->type == OP_CLOSE_SUBEXP) &&
2296
!(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2301
/* else fall through */
2302
case OP_CLOSE_DUP_NUM:
2303
/* We treat it as a normal character. */
2305
/* Then we can these characters as normal characters. */
2306
token->type = CHARACTER;
2307
/* mb_partial and word_char bits should be initialized already
2309
tree = create_token_tree (dfa, NULL, NULL, token);
2310
if (BE (tree == NULL, 0))
2317
if ((token->opr.ctx_type
2318
& (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2319
&& dfa->word_ops_used == 0)
2320
init_word_char (dfa);
2321
if (token->opr.ctx_type == WORD_DELIM
2322
|| token->opr.ctx_type == NOT_WORD_DELIM)
2324
bin_tree_t *tree_first, *tree_last;
2325
if (token->opr.ctx_type == WORD_DELIM)
2327
token->opr.ctx_type = WORD_FIRST;
2328
tree_first = create_token_tree (dfa, NULL, NULL, token);
2329
token->opr.ctx_type = WORD_LAST;
2333
token->opr.ctx_type = INSIDE_WORD;
2334
tree_first = create_token_tree (dfa, NULL, NULL, token);
2335
token->opr.ctx_type = INSIDE_NOTWORD;
2337
tree_last = create_token_tree (dfa, NULL, NULL, token);
2338
tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2339
if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2347
tree = create_token_tree (dfa, NULL, NULL, token);
2348
if (BE (tree == NULL, 0))
2354
/* We must return here, since ANCHORs can't be followed
2355
by repetition operators.
2356
eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2357
it must not be "<ANCHOR(^)><REPEAT(*)>". */
2358
fetch_token (token, regexp, syntax);
2361
tree = create_token_tree (dfa, NULL, NULL, token);
2362
if (BE (tree == NULL, 0))
2367
if (dfa->mb_cur_max > 1)
2368
dfa->has_mb_node = 1;
2372
tree = build_charclass_op (dfa, regexp->trans,
2373
(const unsigned char *) "alnum",
2374
(const unsigned char *) "_",
2375
token->type == OP_NOTWORD, err);
2376
if (BE (*err != REG_NOERROR && tree == NULL, 0))
2381
tree = build_charclass_op (dfa, regexp->trans,
2382
(const unsigned char *) "space",
2383
(const unsigned char *) "",
2384
token->type == OP_NOTSPACE, err);
2385
if (BE (*err != REG_NOERROR && tree == NULL, 0))
2395
/* Must not happen? */
2401
fetch_token (token, regexp, syntax);
2403
while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2404
|| token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2406
tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2407
if (BE (*err != REG_NOERROR && tree == NULL, 0))
2409
/* In BRE consecutive duplications are not allowed. */
2410
if ((syntax & RE_CONTEXT_INVALID_DUP)
2411
&& (token->type == OP_DUP_ASTERISK
2412
|| token->type == OP_OPEN_DUP_NUM))
2422
/* This function build the following tree, from regular expression
2430
parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2431
reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2433
re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2436
cur_nsub = preg->re_nsub++;
2438
fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2440
/* The subexpression may be a null string. */
2441
if (token->type == OP_CLOSE_SUBEXP)
2445
tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2446
if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2448
if (BE (*err != REG_NOERROR, 0))
2452
if (cur_nsub <= '9' - '1')
2453
dfa->completed_bkref_map |= 1 << cur_nsub;
2455
tree = create_tree (dfa, tree, NULL, SUBEXP);
2456
if (BE (tree == NULL, 0))
2461
tree->token.opr.idx = cur_nsub;
2465
/* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2468
parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2469
re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2471
bin_tree_t *tree = NULL, *old_tree = NULL;
2472
Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2473
re_token_t start_token = *token;
2475
if (token->type == OP_OPEN_DUP_NUM)
2478
start = fetch_number (regexp, token, syntax);
2479
if (start == REG_MISSING)
2481
if (token->type == CHARACTER && token->opr.c == ',')
2482
start = 0; /* We treat "{,m}" as "{0,m}". */
2485
*err = REG_BADBR; /* <re>{} is invalid. */
2489
if (BE (start != REG_ERROR, 1))
2491
/* We treat "{n}" as "{n,n}". */
2492
end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2493
: ((token->type == CHARACTER && token->opr.c == ',')
2494
? fetch_number (regexp, token, syntax) : REG_ERROR));
2496
if (BE (start == REG_ERROR || end == REG_ERROR, 0))
2498
/* Invalid sequence. */
2499
if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2501
if (token->type == END_OF_RE)
2509
/* If the syntax bit is set, rollback. */
2510
re_string_set_index (regexp, start_idx);
2511
*token = start_token;
2512
token->type = CHARACTER;
2513
/* mb_partial and word_char bits should be already initialized by
2518
if (BE (end != REG_MISSING && start > end, 0))
2520
/* First number greater than second. */
2527
start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2528
end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2531
fetch_token (token, regexp, syntax);
2533
if (BE (elem == NULL, 0))
2535
if (BE (start == 0 && end == 0, 0))
2537
postorder (elem, free_tree, NULL);
2541
/* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2542
if (BE (start > 0, 0))
2545
for (i = 2; i <= start; ++i)
2547
elem = duplicate_tree (elem, dfa);
2548
tree = create_tree (dfa, tree, elem, CONCAT);
2549
if (BE (elem == NULL || tree == NULL, 0))
2550
goto parse_dup_op_espace;
2556
/* Duplicate ELEM before it is marked optional. */
2557
elem = duplicate_tree (elem, dfa);
2563
if (elem->token.type == SUBEXP)
2564
postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2566
tree = create_tree (dfa, elem, NULL,
2567
(end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2568
if (BE (tree == NULL, 0))
2569
goto parse_dup_op_espace;
2571
/* This loop is actually executed only when end != REG_MISSING,
2572
to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2573
already created the start+1-th copy. */
2574
if ((Idx) -1 < 0 || end != REG_MISSING)
2575
for (i = start + 2; i <= end; ++i)
2577
elem = duplicate_tree (elem, dfa);
2578
tree = create_tree (dfa, tree, elem, CONCAT);
2579
if (BE (elem == NULL || tree == NULL, 0))
2580
goto parse_dup_op_espace;
2582
tree = create_tree (dfa, tree, NULL, OP_ALT);
2583
if (BE (tree == NULL, 0))
2584
goto parse_dup_op_espace;
2588
tree = create_tree (dfa, old_tree, tree, CONCAT);
2592
parse_dup_op_espace:
2597
/* Size of the names for collating symbol/equivalence_class/character_class.
2598
I'm not sure, but maybe enough. */
2599
#define BRACKET_NAME_BUF_SIZE 32
2602
/* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2603
Build the range expression which starts from START_ELEM, and ends
2604
at END_ELEM. The result are written to MBCSET and SBCSET.
2605
RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2606
mbcset->range_ends, is a pointer argument sinse we may
2609
static reg_errcode_t
2611
# ifdef RE_ENABLE_I18N
2612
build_range_exp (bitset_t sbcset, re_charset_t *mbcset, Idx *range_alloc,
2613
bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2614
# else /* not RE_ENABLE_I18N */
2615
build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2616
bracket_elem_t *end_elem)
2617
# endif /* not RE_ENABLE_I18N */
2619
unsigned int start_ch, end_ch;
2620
/* Equivalence Classes and Character Classes can't be a range start/end. */
2621
if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2622
|| end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2626
/* We can handle no multi character collating elements without libc
2628
if (BE ((start_elem->type == COLL_SYM
2629
&& strlen ((char *) start_elem->opr.name) > 1)
2630
|| (end_elem->type == COLL_SYM
2631
&& strlen ((char *) end_elem->opr.name) > 1), 0))
2632
return REG_ECOLLATE;
2634
# ifdef RE_ENABLE_I18N
2639
wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2641
start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2642
: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2644
end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2645
: ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2647
start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2648
? __btowc (start_ch) : start_elem->opr.wch);
2649
end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2650
? __btowc (end_ch) : end_elem->opr.wch);
2651
if (start_wc == WEOF || end_wc == WEOF)
2652
return REG_ECOLLATE;
2653
cmp_buf[0] = start_wc;
2654
cmp_buf[4] = end_wc;
2655
if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2658
/* Got valid collation sequence values, add them as a new entry.
2659
However, for !_LIBC we have no collation elements: if the
2660
character set is single byte, the single byte character set
2661
that we build below suffices. parse_bracket_exp passes
2662
no MBCSET if dfa->mb_cur_max == 1. */
2665
/* Check the space of the arrays. */
2666
if (BE (*range_alloc == mbcset->nranges, 0))
2668
/* There is not enough space, need realloc. */
2669
wchar_t *new_array_start, *new_array_end;
2672
/* +1 in case of mbcset->nranges is 0. */
2673
new_nranges = 2 * mbcset->nranges + 1;
2674
/* Use realloc since mbcset->range_starts and mbcset->range_ends
2675
are NULL if *range_alloc == 0. */
2676
new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2678
new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2681
if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2684
mbcset->range_starts = new_array_start;
2685
mbcset->range_ends = new_array_end;
2686
*range_alloc = new_nranges;
2689
mbcset->range_starts[mbcset->nranges] = start_wc;
2690
mbcset->range_ends[mbcset->nranges++] = end_wc;
2693
/* Build the table for single byte characters. */
2694
for (wc = 0; wc < SBC_MAX; ++wc)
2697
if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2698
&& wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2699
bitset_set (sbcset, wc);
2702
# else /* not RE_ENABLE_I18N */
2705
start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2706
: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2708
end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2709
: ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2711
if (start_ch > end_ch)
2713
/* Build the table for single byte characters. */
2714
for (ch = 0; ch < SBC_MAX; ++ch)
2715
if (start_ch <= ch && ch <= end_ch)
2716
bitset_set (sbcset, ch);
2718
# endif /* not RE_ENABLE_I18N */
2721
#endif /* not _LIBC */
2724
/* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2725
Build the collating element which is represented by NAME.
2726
The result are written to MBCSET and SBCSET.
2727
COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2728
pointer argument since we may update it. */
2730
static reg_errcode_t
2732
build_collating_symbol (bitset_t sbcset,
2733
# ifdef RE_ENABLE_I18N
2734
re_charset_t *mbcset, Idx *coll_sym_alloc,
2736
const unsigned char *name)
2738
size_t name_len = strlen ((const char *) name);
2739
if (BE (name_len != 1, 0))
2740
return REG_ECOLLATE;
2743
bitset_set (sbcset, name[0]);
2747
#endif /* not _LIBC */
2749
/* This function parse bracket expression like "[abc]", "[a-c]",
2753
parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2754
reg_syntax_t syntax, reg_errcode_t *err)
2757
const unsigned char *collseqmb;
2758
const char *collseqwc;
2761
const int32_t *symb_table;
2762
const unsigned char *extra;
2764
/* Local function for parse_bracket_exp used in _LIBC environement.
2765
Seek the collating symbol entry correspondings to NAME.
2766
Return the index of the symbol in the SYMB_TABLE. */
2769
__attribute ((always_inline))
2770
seek_collating_symbol_entry (name, name_len)
2771
const unsigned char *name;
2774
int32_t hash = elem_hash ((const char *) name, name_len);
2775
int32_t elem = hash % table_size;
2776
if (symb_table[2 * elem] != 0)
2778
int32_t second = hash % (table_size - 2) + 1;
2782
/* First compare the hashing value. */
2783
if (symb_table[2 * elem] == hash
2784
/* Compare the length of the name. */
2785
&& name_len == extra[symb_table[2 * elem + 1]]
2786
/* Compare the name. */
2787
&& memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2790
/* Yep, this is the entry. */
2797
while (symb_table[2 * elem] != 0);
2802
/* Local function for parse_bracket_exp used in _LIBC environement.
2803
Look up the collation sequence value of BR_ELEM.
2804
Return the value if succeeded, UINT_MAX otherwise. */
2806
auto inline unsigned int
2807
__attribute ((always_inline))
2808
lookup_collation_sequence_value (br_elem)
2809
bracket_elem_t *br_elem;
2811
if (br_elem->type == SB_CHAR)
2814
if (MB_CUR_MAX == 1)
2817
return collseqmb[br_elem->opr.ch];
2820
wint_t wc = __btowc (br_elem->opr.ch);
2821
return __collseq_table_lookup (collseqwc, wc);
2824
else if (br_elem->type == MB_CHAR)
2826
return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2828
else if (br_elem->type == COLL_SYM)
2830
size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2834
elem = seek_collating_symbol_entry (br_elem->opr.name,
2836
if (symb_table[2 * elem] != 0)
2838
/* We found the entry. */
2839
idx = symb_table[2 * elem + 1];
2840
/* Skip the name of collating element name. */
2841
idx += 1 + extra[idx];
2842
/* Skip the byte sequence of the collating element. */
2843
idx += 1 + extra[idx];
2844
/* Adjust for the alignment. */
2845
idx = (idx + 3) & ~3;
2846
/* Skip the multibyte collation sequence value. */
2847
idx += sizeof (unsigned int);
2848
/* Skip the wide char sequence of the collating element. */
2849
idx += sizeof (unsigned int) *
2850
(1 + *(unsigned int *) (extra + idx));
2851
/* Return the collation sequence value. */
2852
return *(unsigned int *) (extra + idx);
2854
else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2856
/* No valid character. Match it as a single byte
2858
return collseqmb[br_elem->opr.name[0]];
2861
else if (sym_name_len == 1)
2862
return collseqmb[br_elem->opr.name[0]];
2867
/* Local function for parse_bracket_exp used in _LIBC environement.
2868
Build the range expression which starts from START_ELEM, and ends
2869
at END_ELEM. The result are written to MBCSET and SBCSET.
2870
RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2871
mbcset->range_ends, is a pointer argument sinse we may
2874
auto inline reg_errcode_t
2875
__attribute ((always_inline))
2876
build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2877
re_charset_t *mbcset;
2880
bracket_elem_t *start_elem, *end_elem;
2883
uint32_t start_collseq;
2884
uint32_t end_collseq;
2886
/* Equivalence Classes and Character Classes can't be a range
2888
if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2889
|| end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2893
start_collseq = lookup_collation_sequence_value (start_elem);
2894
end_collseq = lookup_collation_sequence_value (end_elem);
2895
/* Check start/end collation sequence values. */
2896
if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2897
return REG_ECOLLATE;
2898
if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2901
/* Got valid collation sequence values, add them as a new entry.
2902
However, if we have no collation elements, and the character set
2903
is single byte, the single byte character set that we
2904
build below suffices. */
2905
if (nrules > 0 || dfa->mb_cur_max > 1)
2907
/* Check the space of the arrays. */
2908
if (BE (*range_alloc == mbcset->nranges, 0))
2910
/* There is not enough space, need realloc. */
2911
uint32_t *new_array_start;
2912
uint32_t *new_array_end;
2915
/* +1 in case of mbcset->nranges is 0. */
2916
new_nranges = 2 * mbcset->nranges + 1;
2917
new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2919
new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2922
if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2925
mbcset->range_starts = new_array_start;
2926
mbcset->range_ends = new_array_end;
2927
*range_alloc = new_nranges;
2930
mbcset->range_starts[mbcset->nranges] = start_collseq;
2931
mbcset->range_ends[mbcset->nranges++] = end_collseq;
2934
/* Build the table for single byte characters. */
2935
for (ch = 0; ch < SBC_MAX; ch++)
2937
uint32_t ch_collseq;
2939
if (MB_CUR_MAX == 1)
2942
ch_collseq = collseqmb[ch];
2944
ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2945
if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2946
bitset_set (sbcset, ch);
2951
/* Local function for parse_bracket_exp used in _LIBC environement.
2952
Build the collating element which is represented by NAME.
2953
The result are written to MBCSET and SBCSET.
2954
COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2955
pointer argument sinse we may update it. */
2957
auto inline reg_errcode_t
2958
__attribute ((always_inline))
2959
build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2960
re_charset_t *mbcset;
2961
Idx *coll_sym_alloc;
2963
const unsigned char *name;
2966
size_t name_len = strlen ((const char *) name);
2969
elem = seek_collating_symbol_entry (name, name_len);
2970
if (symb_table[2 * elem] != 0)
2972
/* We found the entry. */
2973
idx = symb_table[2 * elem + 1];
2974
/* Skip the name of collating element name. */
2975
idx += 1 + extra[idx];
2977
else if (symb_table[2 * elem] == 0 && name_len == 1)
2979
/* No valid character, treat it as a normal
2981
bitset_set (sbcset, name[0]);
2985
return REG_ECOLLATE;
2987
/* Got valid collation sequence, add it as a new entry. */
2988
/* Check the space of the arrays. */
2989
if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2991
/* Not enough, realloc it. */
2992
/* +1 in case of mbcset->ncoll_syms is 0. */
2993
Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2994
/* Use realloc since mbcset->coll_syms is NULL
2996
int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2997
new_coll_sym_alloc);
2998
if (BE (new_coll_syms == NULL, 0))
3000
mbcset->coll_syms = new_coll_syms;
3001
*coll_sym_alloc = new_coll_sym_alloc;
3003
mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3008
if (BE (name_len != 1, 0))
3009
return REG_ECOLLATE;
3012
bitset_set (sbcset, name[0]);
3019
re_token_t br_token;
3020
re_bitset_ptr_t sbcset;
3021
#ifdef RE_ENABLE_I18N
3022
re_charset_t *mbcset;
3023
Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3024
Idx equiv_class_alloc = 0, char_class_alloc = 0;
3025
#endif /* not RE_ENABLE_I18N */
3026
bool non_match = false;
3027
bin_tree_t *work_tree;
3029
bool first_round = true;
3031
collseqmb = (const unsigned char *)
3032
_NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3033
nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3039
collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3040
table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3041
symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3042
_NL_COLLATE_SYMB_TABLEMB);
3043
extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3044
_NL_COLLATE_SYMB_EXTRAMB);
3047
sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3048
#ifdef RE_ENABLE_I18N
3049
mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3050
#endif /* RE_ENABLE_I18N */
3051
#ifdef RE_ENABLE_I18N
3052
if (BE (sbcset == NULL || mbcset == NULL, 0))
3054
if (BE (sbcset == NULL, 0))
3055
#endif /* RE_ENABLE_I18N */
3061
token_len = peek_token_bracket (token, regexp, syntax);
3062
if (BE (token->type == END_OF_RE, 0))
3065
goto parse_bracket_exp_free_return;
3067
if (token->type == OP_NON_MATCH_LIST)
3069
#ifdef RE_ENABLE_I18N
3070
mbcset->non_match = 1;
3071
#endif /* not RE_ENABLE_I18N */
3073
if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3074
bitset_set (sbcset, '\n');
3075
re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3076
token_len = peek_token_bracket (token, regexp, syntax);
3077
if (BE (token->type == END_OF_RE, 0))
3080
goto parse_bracket_exp_free_return;
3084
/* We treat the first ']' as a normal character. */
3085
if (token->type == OP_CLOSE_BRACKET)
3086
token->type = CHARACTER;
3090
bracket_elem_t start_elem, end_elem;
3091
unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3092
unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3095
bool is_range_exp = false;
3098
start_elem.opr.name = start_name_buf;
3099
ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3100
syntax, first_round);
3101
if (BE (ret != REG_NOERROR, 0))
3104
goto parse_bracket_exp_free_return;
3106
first_round = false;
3108
/* Get information about the next token. We need it in any case. */
3109
token_len = peek_token_bracket (token, regexp, syntax);
3111
/* Do not check for ranges if we know they are not allowed. */
3112
if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3114
if (BE (token->type == END_OF_RE, 0))
3117
goto parse_bracket_exp_free_return;
3119
if (token->type == OP_CHARSET_RANGE)
3121
re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3122
token_len2 = peek_token_bracket (&token2, regexp, syntax);
3123
if (BE (token2.type == END_OF_RE, 0))
3126
goto parse_bracket_exp_free_return;
3128
if (token2.type == OP_CLOSE_BRACKET)
3130
/* We treat the last '-' as a normal character. */
3131
re_string_skip_bytes (regexp, -token_len);
3132
token->type = CHARACTER;
3135
is_range_exp = true;
3139
if (is_range_exp == true)
3141
end_elem.opr.name = end_name_buf;
3142
ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3144
if (BE (ret != REG_NOERROR, 0))
3147
goto parse_bracket_exp_free_return;
3150
token_len = peek_token_bracket (token, regexp, syntax);
3153
*err = build_range_exp (sbcset, mbcset, &range_alloc,
3154
&start_elem, &end_elem);
3156
# ifdef RE_ENABLE_I18N
3157
*err = build_range_exp (sbcset,
3158
dfa->mb_cur_max > 1 ? mbcset : NULL,
3159
&range_alloc, &start_elem, &end_elem);
3161
*err = build_range_exp (sbcset, &start_elem, &end_elem);
3163
#endif /* RE_ENABLE_I18N */
3164
if (BE (*err != REG_NOERROR, 0))
3165
goto parse_bracket_exp_free_return;
3169
switch (start_elem.type)
3172
bitset_set (sbcset, start_elem.opr.ch);
3174
#ifdef RE_ENABLE_I18N
3176
/* Check whether the array has enough space. */
3177
if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3179
wchar_t *new_mbchars;
3180
/* Not enough, realloc it. */
3181
/* +1 in case of mbcset->nmbchars is 0. */
3182
mbchar_alloc = 2 * mbcset->nmbchars + 1;
3183
/* Use realloc since array is NULL if *alloc == 0. */
3184
new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3186
if (BE (new_mbchars == NULL, 0))
3187
goto parse_bracket_exp_espace;
3188
mbcset->mbchars = new_mbchars;
3190
mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3192
#endif /* RE_ENABLE_I18N */
3194
*err = build_equiv_class (sbcset,
3195
#ifdef RE_ENABLE_I18N
3196
mbcset, &equiv_class_alloc,
3197
#endif /* RE_ENABLE_I18N */
3198
start_elem.opr.name);
3199
if (BE (*err != REG_NOERROR, 0))
3200
goto parse_bracket_exp_free_return;
3203
*err = build_collating_symbol (sbcset,
3204
#ifdef RE_ENABLE_I18N
3205
mbcset, &coll_sym_alloc,
3206
#endif /* RE_ENABLE_I18N */
3207
start_elem.opr.name);
3208
if (BE (*err != REG_NOERROR, 0))
3209
goto parse_bracket_exp_free_return;
3212
*err = build_charclass (regexp->trans, sbcset,
3213
#ifdef RE_ENABLE_I18N
3214
mbcset, &char_class_alloc,
3215
#endif /* RE_ENABLE_I18N */
3216
start_elem.opr.name, syntax);
3217
if (BE (*err != REG_NOERROR, 0))
3218
goto parse_bracket_exp_free_return;
3225
if (BE (token->type == END_OF_RE, 0))
3228
goto parse_bracket_exp_free_return;
3230
if (token->type == OP_CLOSE_BRACKET)
3234
re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3236
/* If it is non-matching list. */
3238
bitset_not (sbcset);
3240
#ifdef RE_ENABLE_I18N
3241
/* Ensure only single byte characters are set. */
3242
if (dfa->mb_cur_max > 1)
3243
bitset_mask (sbcset, dfa->sb_char);
3245
if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3246
|| mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3247
|| mbcset->non_match)))
3249
bin_tree_t *mbc_tree;
3251
/* Build a tree for complex bracket. */
3252
dfa->has_mb_node = 1;
3253
br_token.type = COMPLEX_BRACKET;
3254
br_token.opr.mbcset = mbcset;
3255
mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3256
if (BE (mbc_tree == NULL, 0))
3257
goto parse_bracket_exp_espace;
3258
for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3259
if (sbcset[sbc_idx])
3261
/* If there are no bits set in sbcset, there is no point
3262
of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3263
if (sbc_idx < BITSET_WORDS)
3265
/* Build a tree for simple bracket. */
3266
br_token.type = SIMPLE_BRACKET;
3267
br_token.opr.sbcset = sbcset;
3268
work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3269
if (BE (work_tree == NULL, 0))
3270
goto parse_bracket_exp_espace;
3272
/* Then join them by ALT node. */
3273
work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3274
if (BE (work_tree == NULL, 0))
3275
goto parse_bracket_exp_espace;
3280
work_tree = mbc_tree;
3284
#endif /* not RE_ENABLE_I18N */
3286
#ifdef RE_ENABLE_I18N
3287
free_charset (mbcset);
3289
/* Build a tree for simple bracket. */
3290
br_token.type = SIMPLE_BRACKET;
3291
br_token.opr.sbcset = sbcset;
3292
work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3293
if (BE (work_tree == NULL, 0))
3294
goto parse_bracket_exp_espace;
3298
parse_bracket_exp_espace:
3300
parse_bracket_exp_free_return:
3302
#ifdef RE_ENABLE_I18N
3303
free_charset (mbcset);
3304
#endif /* RE_ENABLE_I18N */
3308
/* Parse an element in the bracket expression. */
3310
static reg_errcode_t
3311
parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3312
re_token_t *token, int token_len, re_dfa_t *dfa,
3313
reg_syntax_t syntax, bool accept_hyphen)
3315
#ifdef RE_ENABLE_I18N
3317
cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3318
if (cur_char_size > 1)
3320
elem->type = MB_CHAR;
3321
elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3322
re_string_skip_bytes (regexp, cur_char_size);
3325
#endif /* RE_ENABLE_I18N */
3326
re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3327
if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3328
|| token->type == OP_OPEN_EQUIV_CLASS)
3329
return parse_bracket_symbol (elem, regexp, token);
3330
if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3332
/* A '-' must only appear as anything but a range indicator before
3333
the closing bracket. Everything else is an error. */
3335
(void) peek_token_bracket (&token2, regexp, syntax);
3336
if (token2.type != OP_CLOSE_BRACKET)
3337
/* The actual error value is not standardized since this whole
3338
case is undefined. But ERANGE makes good sense. */
3341
elem->type = SB_CHAR;
3342
elem->opr.ch = token->opr.c;
3346
/* Parse a bracket symbol in the bracket expression. Bracket symbols are
3347
such as [:<character_class>:], [.<collating_element>.], and
3348
[=<equivalent_class>=]. */
3350
static reg_errcode_t
3351
parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3354
unsigned char ch, delim = token->opr.c;
3356
if (re_string_eoi(regexp))
3360
if (i >= BRACKET_NAME_BUF_SIZE)
3362
if (token->type == OP_OPEN_CHAR_CLASS)
3363
ch = re_string_fetch_byte_case (regexp);
3365
ch = re_string_fetch_byte (regexp);
3366
if (re_string_eoi(regexp))
3368
if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3370
elem->opr.name[i] = ch;
3372
re_string_skip_bytes (regexp, 1);
3373
elem->opr.name[i] = '\0';
3374
switch (token->type)
3376
case OP_OPEN_COLL_ELEM:
3377
elem->type = COLL_SYM;
3379
case OP_OPEN_EQUIV_CLASS:
3380
elem->type = EQUIV_CLASS;
3382
case OP_OPEN_CHAR_CLASS:
3383
elem->type = CHAR_CLASS;
3391
/* Helper function for parse_bracket_exp.
3392
Build the equivalence class which is represented by NAME.
3393
The result are written to MBCSET and SBCSET.
3394
EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3395
is a pointer argument sinse we may update it. */
3397
static reg_errcode_t
3398
#ifdef RE_ENABLE_I18N
3399
build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3400
Idx *equiv_class_alloc, const unsigned char *name)
3401
#else /* not RE_ENABLE_I18N */
3402
build_equiv_class (bitset_t sbcset, const unsigned char *name)
3403
#endif /* not RE_ENABLE_I18N */
3406
uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3409
const int32_t *table, *indirect;
3410
const unsigned char *weights, *extra, *cp;
3411
unsigned char char_buf[2];
3415
/* This #include defines a local function! */
3416
# include <locale/weight.h>
3417
/* Calculate the index for equivalence class. */
3419
table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3420
weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3421
_NL_COLLATE_WEIGHTMB);
3422
extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3423
_NL_COLLATE_EXTRAMB);
3424
indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3425
_NL_COLLATE_INDIRECTMB);
3426
idx1 = findidx (&cp);
3427
if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3428
/* This isn't a valid character. */
3429
return REG_ECOLLATE;
3431
/* Build single byte matcing table for this equivalence class. */
3432
char_buf[1] = (unsigned char) '\0';
3433
len = weights[idx1];
3434
for (ch = 0; ch < SBC_MAX; ++ch)
3438
idx2 = findidx (&cp);
3443
/* This isn't a valid character. */
3445
if (len == weights[idx2])
3448
while (cnt <= len &&
3449
weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3453
bitset_set (sbcset, ch);
3456
/* Check whether the array has enough space. */
3457
if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3459
/* Not enough, realloc it. */
3460
/* +1 in case of mbcset->nequiv_classes is 0. */
3461
Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3462
/* Use realloc since the array is NULL if *alloc == 0. */
3463
int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3465
new_equiv_class_alloc);
3466
if (BE (new_equiv_classes == NULL, 0))
3468
mbcset->equiv_classes = new_equiv_classes;
3469
*equiv_class_alloc = new_equiv_class_alloc;
3471
mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3476
if (BE (strlen ((const char *) name) != 1, 0))
3477
return REG_ECOLLATE;
3478
bitset_set (sbcset, *name);
3483
/* Helper function for parse_bracket_exp.
3484
Build the character class which is represented by NAME.
3485
The result are written to MBCSET and SBCSET.
3486
CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3487
is a pointer argument sinse we may update it. */
3489
static reg_errcode_t
3490
#ifdef RE_ENABLE_I18N
3491
build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3492
re_charset_t *mbcset, Idx *char_class_alloc,
3493
const unsigned char *class_name, reg_syntax_t syntax)
3494
#else /* not RE_ENABLE_I18N */
3495
build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3496
const unsigned char *class_name, reg_syntax_t syntax)
3497
#endif /* not RE_ENABLE_I18N */
3500
const char *name = (const char *) class_name;
3502
/* In case of REG_ICASE "upper" and "lower" match the both of
3503
upper and lower cases. */
3504
if ((syntax & RE_ICASE)
3505
&& (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3508
#ifdef RE_ENABLE_I18N
3509
/* Check the space of the arrays. */
3510
if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3512
/* Not enough, realloc it. */
3513
/* +1 in case of mbcset->nchar_classes is 0. */
3514
Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3515
/* Use realloc since array is NULL if *alloc == 0. */
3516
wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3517
new_char_class_alloc);
3518
if (BE (new_char_classes == NULL, 0))
3520
mbcset->char_classes = new_char_classes;
3521
*char_class_alloc = new_char_class_alloc;
3523
mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3524
#endif /* RE_ENABLE_I18N */
3526
#define BUILD_CHARCLASS_LOOP(ctype_func) \
3528
if (BE (trans != NULL, 0)) \
3530
for (i = 0; i < SBC_MAX; ++i) \
3531
if (ctype_func (i)) \
3532
bitset_set (sbcset, trans[i]); \
3536
for (i = 0; i < SBC_MAX; ++i) \
3537
if (ctype_func (i)) \
3538
bitset_set (sbcset, i); \
3542
if (strcmp (name, "alnum") == 0)
3543
BUILD_CHARCLASS_LOOP (isalnum);
3544
else if (strcmp (name, "cntrl") == 0)
3545
BUILD_CHARCLASS_LOOP (iscntrl);
3546
else if (strcmp (name, "lower") == 0)
3547
BUILD_CHARCLASS_LOOP (islower);
3548
else if (strcmp (name, "space") == 0)
3549
BUILD_CHARCLASS_LOOP (isspace);
3550
else if (strcmp (name, "alpha") == 0)
3551
BUILD_CHARCLASS_LOOP (isalpha);
3552
else if (strcmp (name, "digit") == 0)
3553
BUILD_CHARCLASS_LOOP (isdigit);
3554
else if (strcmp (name, "print") == 0)
3555
BUILD_CHARCLASS_LOOP (isprint);
3556
else if (strcmp (name, "upper") == 0)
3557
BUILD_CHARCLASS_LOOP (isupper);
3558
else if (strcmp (name, "blank") == 0)
3559
BUILD_CHARCLASS_LOOP (isblank);
3560
else if (strcmp (name, "graph") == 0)
3561
BUILD_CHARCLASS_LOOP (isgraph);
3562
else if (strcmp (name, "punct") == 0)
3563
BUILD_CHARCLASS_LOOP (ispunct);
3564
else if (strcmp (name, "xdigit") == 0)
3565
BUILD_CHARCLASS_LOOP (isxdigit);
3573
build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3574
const unsigned char *class_name,
3575
const unsigned char *extra, bool non_match,
3578
re_bitset_ptr_t sbcset;
3579
#ifdef RE_ENABLE_I18N
3580
re_charset_t *mbcset;
3582
#endif /* not RE_ENABLE_I18N */
3584
re_token_t br_token;
3587
sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3588
#ifdef RE_ENABLE_I18N
3589
mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3590
#endif /* RE_ENABLE_I18N */
3592
#ifdef RE_ENABLE_I18N
3593
if (BE (sbcset == NULL || mbcset == NULL, 0))
3594
#else /* not RE_ENABLE_I18N */
3595
if (BE (sbcset == NULL, 0))
3596
#endif /* not RE_ENABLE_I18N */
3604
#ifdef RE_ENABLE_I18N
3605
mbcset->non_match = 1;
3606
#endif /* not RE_ENABLE_I18N */
3609
/* We don't care the syntax in this case. */
3610
ret = build_charclass (trans, sbcset,
3611
#ifdef RE_ENABLE_I18N
3613
#endif /* RE_ENABLE_I18N */
3616
if (BE (ret != REG_NOERROR, 0))
3619
#ifdef RE_ENABLE_I18N
3620
free_charset (mbcset);
3621
#endif /* RE_ENABLE_I18N */
3625
/* \w match '_' also. */
3626
for (; *extra; extra++)
3627
bitset_set (sbcset, *extra);
3629
/* If it is non-matching list. */
3631
bitset_not (sbcset);
3633
#ifdef RE_ENABLE_I18N
3634
/* Ensure only single byte characters are set. */
3635
if (dfa->mb_cur_max > 1)
3636
bitset_mask (sbcset, dfa->sb_char);
3639
/* Build a tree for simple bracket. */
3640
br_token.type = SIMPLE_BRACKET;
3641
br_token.opr.sbcset = sbcset;
3642
tree = create_token_tree (dfa, NULL, NULL, &br_token);
3643
if (BE (tree == NULL, 0))
3644
goto build_word_op_espace;
3646
#ifdef RE_ENABLE_I18N
3647
if (dfa->mb_cur_max > 1)
3649
bin_tree_t *mbc_tree;
3650
/* Build a tree for complex bracket. */
3651
br_token.type = COMPLEX_BRACKET;
3652
br_token.opr.mbcset = mbcset;
3653
dfa->has_mb_node = 1;
3654
mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3655
if (BE (mbc_tree == NULL, 0))
3656
goto build_word_op_espace;
3657
/* Then join them by ALT node. */
3658
tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3659
if (BE (mbc_tree != NULL, 1))
3664
free_charset (mbcset);
3667
#else /* not RE_ENABLE_I18N */
3669
#endif /* not RE_ENABLE_I18N */
3671
build_word_op_espace:
3673
#ifdef RE_ENABLE_I18N
3674
free_charset (mbcset);
3675
#endif /* RE_ENABLE_I18N */
3680
/* This is intended for the expressions like "a{1,3}".
3681
Fetch a number from `input', and return the number.
3682
Return REG_MISSING if the number field is empty like "{,1}".
3683
Return REG_ERROR if an error occurred. */
3686
fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3688
Idx num = REG_MISSING;
3692
fetch_token (token, input, syntax);
3694
if (BE (token->type == END_OF_RE, 0))
3696
if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3698
num = ((token->type != CHARACTER || c < '0' || '9' < c
3699
|| num == REG_ERROR)
3701
: ((num == REG_MISSING) ? c - '0' : num * 10 + c - '0'));
3702
num = (num > RE_DUP_MAX) ? REG_ERROR : num;
3707
#ifdef RE_ENABLE_I18N
3709
free_charset (re_charset_t *cset)
3711
re_free (cset->mbchars);
3713
re_free (cset->coll_syms);
3714
re_free (cset->equiv_classes);
3715
re_free (cset->range_starts);
3716
re_free (cset->range_ends);
3718
re_free (cset->char_classes);
3721
#endif /* RE_ENABLE_I18N */
3723
/* Functions for binary tree operation. */
3725
/* Create a tree node. */
3728
create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3729
re_token_type_t type)
3733
return create_token_tree (dfa, left, right, &t);
3737
create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3738
const re_token_t *token)
3741
if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3743
bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3745
if (storage == NULL)
3747
storage->next = dfa->str_tree_storage;
3748
dfa->str_tree_storage = storage;
3749
dfa->str_tree_storage_idx = 0;
3751
tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3753
tree->parent = NULL;
3755
tree->right = right;
3756
tree->token = *token;
3757
tree->token.duplicated = 0;
3758
tree->token.opt_subexp = 0;
3761
tree->node_idx = REG_MISSING;
3764
left->parent = tree;
3766
right->parent = tree;
3770
/* Mark the tree SRC as an optional subexpression.
3771
To be called from preorder or postorder. */
3773
static reg_errcode_t
3774
mark_opt_subexp (void *extra, bin_tree_t *node)
3776
Idx idx = (Idx) (long) extra;
3777
if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3778
node->token.opt_subexp = 1;
3783
/* Free the allocated memory inside NODE. */
3786
free_token (re_token_t *node)
3788
#ifdef RE_ENABLE_I18N
3789
if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3790
free_charset (node->opr.mbcset);
3792
#endif /* RE_ENABLE_I18N */
3793
if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3794
re_free (node->opr.sbcset);
3797
/* Worker function for tree walking. Free the allocated memory inside NODE
3798
and its children. */
3800
static reg_errcode_t
3801
free_tree (void *extra, bin_tree_t *node)
3803
free_token (&node->token);
3808
/* Duplicate the node SRC, and return new node. This is a preorder
3809
visit similar to the one implemented by the generic visitor, but
3810
we need more infrastructure to maintain two parallel trees --- so,
3811
it's easier to duplicate. */
3814
duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3816
const bin_tree_t *node;
3817
bin_tree_t *dup_root;
3818
bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3820
for (node = root; ; )
3822
/* Create a new tree and link it back to the current parent. */
3823
*p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3826
(*p_new)->parent = dup_node;
3827
(*p_new)->token.duplicated = 1;
3830
/* Go to the left node, or up and to the right. */
3834
p_new = &dup_node->left;
3838
const bin_tree_t *prev = NULL;
3839
while (node->right == prev || node->right == NULL)
3842
node = node->parent;
3843
dup_node = dup_node->parent;
3848
p_new = &dup_node->right;