1
/* Extended regular expression matching and search library.
2
Copyright (C) 2002, 2003 Free Software Foundation, Inc.
3
This file is part of the GNU C Library.
4
Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6
The GNU C Library is free software; you can redistribute it and/or
7
modify it under the terms of the GNU Lesser General Public
8
License as published by the Free Software Foundation; either
9
version 2.1 of the License, or (at your option) any later version.
11
The GNU C Library is distributed in the hope that it will be useful,
12
but WITHOUT ANY WARRANTY; without even the implied warranty of
13
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
Lesser General Public License for more details.
16
You should have received a copy of the GNU Lesser General Public
17
License along with the GNU C Library; if not, write to the Free
18
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
21
static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
22
int length, reg_syntax_t syntax);
23
static void re_compile_fastmap_iter (regex_t *bufp,
24
const re_dfastate_t *init_state,
26
static reg_errcode_t init_dfa (re_dfa_t *dfa, int pat_len);
27
static reg_errcode_t init_word_char (re_dfa_t *dfa);
29
static void free_charset (re_charset_t *cset);
30
#endif /* RE_ENABLE_I18N */
31
static void free_workarea_compile (regex_t *preg);
32
static reg_errcode_t create_initial_state (re_dfa_t *dfa);
33
static reg_errcode_t analyze (re_dfa_t *dfa);
34
static reg_errcode_t analyze_tree (re_dfa_t *dfa, bin_tree_t *node);
35
static void calc_first (re_dfa_t *dfa, bin_tree_t *node);
36
static void calc_next (re_dfa_t *dfa, bin_tree_t *node);
37
static void calc_epsdest (re_dfa_t *dfa, bin_tree_t *node);
38
static reg_errcode_t duplicate_node_closure (re_dfa_t *dfa, int top_org_node,
39
int top_clone_node, int root_node,
40
unsigned int constraint);
41
static reg_errcode_t duplicate_node (int *new_idx, re_dfa_t *dfa, int org_idx,
42
unsigned int constraint);
43
static int search_duplicated_node (re_dfa_t *dfa, int org_node,
44
unsigned int constraint);
45
static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
46
static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
48
static void calc_inveclosure (re_dfa_t *dfa);
49
static int fetch_number (re_string_t *input, re_token_t *token,
51
static re_token_t fetch_token (re_string_t *input, reg_syntax_t syntax);
52
static int peek_token (re_token_t *token, re_string_t *input,
54
static int peek_token_bracket (re_token_t *token, re_string_t *input,
56
static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
57
reg_syntax_t syntax, reg_errcode_t *err);
58
static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
59
re_token_t *token, reg_syntax_t syntax,
60
int nest, reg_errcode_t *err);
61
static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
62
re_token_t *token, reg_syntax_t syntax,
63
int nest, reg_errcode_t *err);
64
static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
65
re_token_t *token, reg_syntax_t syntax,
66
int nest, reg_errcode_t *err);
67
static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
68
re_token_t *token, reg_syntax_t syntax,
69
int nest, reg_errcode_t *err);
70
static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
71
re_dfa_t *dfa, re_token_t *token,
72
reg_syntax_t syntax, reg_errcode_t *err);
73
static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
74
re_token_t *token, reg_syntax_t syntax,
76
static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
78
re_token_t *token, int token_len,
81
static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
85
# ifdef RE_ENABLE_I18N
86
static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
87
re_charset_t *mbcset, int *range_alloc,
88
bracket_elem_t *start_elem,
89
bracket_elem_t *end_elem);
90
static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
93
const unsigned char *name);
94
# else /* not RE_ENABLE_I18N */
95
static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
96
bracket_elem_t *start_elem,
97
bracket_elem_t *end_elem);
98
static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
99
const unsigned char *name);
100
# endif /* not RE_ENABLE_I18N */
101
#endif /* not _LIBC */
102
#ifdef RE_ENABLE_I18N
103
static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
104
re_charset_t *mbcset,
105
int *equiv_class_alloc,
106
const unsigned char *name);
107
static reg_errcode_t build_charclass (re_bitset_ptr_t sbcset,
108
re_charset_t *mbcset,
109
int *char_class_alloc,
110
const unsigned char *class_name,
111
reg_syntax_t syntax);
112
#else /* not RE_ENABLE_I18N */
113
static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
114
const unsigned char *name);
115
static reg_errcode_t build_charclass (re_bitset_ptr_t sbcset,
116
const unsigned char *class_name,
117
reg_syntax_t syntax);
118
#endif /* not RE_ENABLE_I18N */
119
static bin_tree_t *build_word_op (re_dfa_t *dfa, int not, reg_errcode_t *err);
120
static void free_bin_tree (bin_tree_t *tree);
121
static bin_tree_t *create_tree (bin_tree_t *left, bin_tree_t *right,
122
re_token_type_t type, int index);
123
static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
125
/* This table gives an error message for each of the error codes listed
126
in regex.h. Obviously the order here has to be same as there.
127
POSIX doesn't require that we do anything for REG_NOERROR,
128
but why not be nice? */
130
const char __re_error_msgid[] attribute_hidden =
132
#define REG_NOERROR_IDX 0
133
gettext_noop ("Success") /* REG_NOERROR */
135
#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
136
gettext_noop ("No match") /* REG_NOMATCH */
138
#define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
139
gettext_noop ("Invalid regular expression") /* REG_BADPAT */
141
#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
142
gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
144
#define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
145
gettext_noop ("Invalid character class name") /* REG_ECTYPE */
147
#define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
148
gettext_noop ("Trailing backslash") /* REG_EESCAPE */
150
#define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
151
gettext_noop ("Invalid back reference") /* REG_ESUBREG */
153
#define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
154
gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
156
#define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
157
gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
159
#define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
160
gettext_noop ("Unmatched \\{") /* REG_EBRACE */
162
#define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
163
gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
165
#define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
166
gettext_noop ("Invalid range end") /* REG_ERANGE */
168
#define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
169
gettext_noop ("Memory exhausted") /* REG_ESPACE */
171
#define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
172
gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
174
#define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
175
gettext_noop ("Premature end of regular expression") /* REG_EEND */
177
#define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
178
gettext_noop ("Regular expression too big") /* REG_ESIZE */
180
#define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
181
gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
184
const size_t __re_error_msgid_idx[] attribute_hidden =
205
/* Entry points for GNU code. */
207
/* re_compile_pattern is the GNU regular expression compiler: it
208
compiles PATTERN (of length LENGTH) and puts the result in BUFP.
209
Returns 0 if the pattern was valid, otherwise an error string.
211
Assumes the `allocated' (and perhaps `buffer') and `translate' fields
212
are set in BUFP on entry. */
215
re_compile_pattern (pattern, length, bufp)
218
struct re_pattern_buffer *bufp;
222
/* And GNU code determines whether or not to get register information
223
by passing null for the REGS argument to re_match, etc., not by
227
/* Match anchors at newline. */
228
bufp->newline_anchor = 1;
230
ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
234
return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
237
weak_alias (__re_compile_pattern, re_compile_pattern)
240
/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
241
also be assigned to arbitrarily: each pattern buffer stores its own
242
syntax, so it can be changed between regex compilations. */
243
/* This has no initializer because initialized variables in Emacs
244
become read-only after dumping. */
245
reg_syntax_t re_syntax_options;
248
/* Specify the precise syntax of regexps for compilation. This provides
249
for compatibility for various utilities which historically have
250
different, incompatible syntaxes.
252
The argument SYNTAX is a bit mask comprised of the various bits
253
defined in regex.h. We return the old syntax. */
256
re_set_syntax (syntax)
259
reg_syntax_t ret = re_syntax_options;
261
re_syntax_options = syntax;
265
weak_alias (__re_set_syntax, re_set_syntax)
269
re_compile_fastmap (bufp)
270
struct re_pattern_buffer *bufp;
272
re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
273
char *fastmap = bufp->fastmap;
275
memset (fastmap, '\0', sizeof (char) * SBC_MAX);
276
re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
277
if (dfa->init_state != dfa->init_state_word)
278
re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
279
if (dfa->init_state != dfa->init_state_nl)
280
re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
281
if (dfa->init_state != dfa->init_state_begbuf)
282
re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
283
bufp->fastmap_accurate = 1;
287
weak_alias (__re_compile_fastmap, re_compile_fastmap)
291
re_set_fastmap (char *fastmap, int icase, int ch)
295
fastmap[tolower (ch)] = 1;
298
/* Helper function for re_compile_fastmap.
299
Compile fastmap for the initial_state INIT_STATE. */
302
re_compile_fastmap_iter (bufp, init_state, fastmap)
304
const re_dfastate_t *init_state;
307
re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
309
int icase = (MB_CUR_MAX == 1 && (bufp->syntax & RE_ICASE));
310
for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
312
int node = init_state->nodes.elems[node_cnt];
313
re_token_type_t type = dfa->nodes[node].type;
315
if (type == CHARACTER)
316
re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
317
else if (type == SIMPLE_BRACKET)
320
for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
321
for (j = 0; j < UINT_BITS; ++j, ++ch)
322
if (dfa->nodes[node].opr.sbcset[i] & (1 << j))
323
re_set_fastmap (fastmap, icase, ch);
325
#ifdef RE_ENABLE_I18N
326
else if (type == COMPLEX_BRACKET)
329
re_charset_t *cset = dfa->nodes[node].opr.mbcset;
330
if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
331
|| cset->nranges || cset->nchar_classes)
334
if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
336
/* In this case we want to catch the bytes which are
337
the first byte of any collation elements.
338
e.g. In da_DK, we want to catch 'a' since "aa"
339
is a valid collation element, and don't catch
340
'b' since 'b' is the only collation element
341
which starts from 'b'. */
343
const int32_t *table = (const int32_t *)
344
_NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
345
for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
346
for (j = 0; j < UINT_BITS; ++j, ++ch)
348
re_set_fastmap (fastmap, icase, ch);
352
for (i = 0; i < SBC_MAX; ++i)
353
if (__btowc (i) == WEOF)
354
re_set_fastmap (fastmap, icase, i);
355
# endif /* not _LIBC */
357
for (i = 0; i < cset->nmbchars; ++i)
361
memset (&state, '\0', sizeof (state));
362
__wcrtomb (buf, cset->mbchars[i], &state);
363
re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
366
#endif /* RE_ENABLE_I18N */
367
else if (type == END_OF_RE || type == OP_PERIOD)
369
memset (fastmap, '\1', sizeof (char) * SBC_MAX);
370
if (type == END_OF_RE)
371
bufp->can_be_null = 1;
377
/* Entry point for POSIX code. */
378
/* regcomp takes a regular expression as a string and compiles it.
380
PREG is a regex_t *. We do not expect any fields to be initialized,
381
since POSIX says we shouldn't. Thus, we set
383
`buffer' to the compiled pattern;
384
`used' to the length of the compiled pattern;
385
`syntax' to RE_SYNTAX_POSIX_EXTENDED if the
386
REG_EXTENDED bit in CFLAGS is set; otherwise, to
387
RE_SYNTAX_POSIX_BASIC;
388
`newline_anchor' to REG_NEWLINE being set in CFLAGS;
389
`fastmap' to an allocated space for the fastmap;
390
`fastmap_accurate' to zero;
391
`re_nsub' to the number of subexpressions in PATTERN.
393
PATTERN is the address of the pattern string.
395
CFLAGS is a series of bits which affect compilation.
397
If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
398
use POSIX basic syntax.
400
If REG_NEWLINE is set, then . and [^...] don't match newline.
401
Also, regexec will try a match beginning after every newline.
403
If REG_ICASE is set, then we considers upper- and lowercase
404
versions of letters to be equivalent when matching.
406
If REG_NOSUB is set, then when PREG is passed to regexec, that
407
routine will report only success or failure, and nothing about the
410
It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
411
the return codes and their meanings.) */
414
regcomp (preg, pattern, cflags)
415
regex_t *__restrict preg;
416
const char *__restrict pattern;
420
reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
421
: RE_SYNTAX_POSIX_BASIC);
427
/* Try to allocate space for the fastmap. */
428
preg->fastmap = re_malloc (char, SBC_MAX);
429
if (BE (preg->fastmap == NULL, 0))
432
syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
434
/* If REG_NEWLINE is set, newlines are treated differently. */
435
if (cflags & REG_NEWLINE)
436
{ /* REG_NEWLINE implies neither . nor [^...] match newline. */
437
syntax &= ~RE_DOT_NEWLINE;
438
syntax |= RE_HAT_LISTS_NOT_NEWLINE;
439
/* It also changes the matching behavior. */
440
preg->newline_anchor = 1;
443
preg->newline_anchor = 0;
444
preg->no_sub = !!(cflags & REG_NOSUB);
445
preg->translate = NULL;
447
ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
449
/* POSIX doesn't distinguish between an unmatched open-group and an
450
unmatched close-group: both are REG_EPAREN. */
451
if (ret == REG_ERPAREN)
454
/* We have already checked preg->fastmap != NULL. */
455
if (BE (ret == REG_NOERROR, 1))
456
/* Compute the fastmap now, since regexec cannot modify the pattern
457
buffer. This function nevers fails in this implementation. */
458
(void) re_compile_fastmap (preg);
461
/* Some error occurred while compiling the expression. */
462
re_free (preg->fastmap);
463
preg->fastmap = NULL;
469
weak_alias (__regcomp, regcomp)
472
/* Returns a message corresponding to an error code, ERRCODE, returned
473
from either regcomp or regexec. We don't use PREG here. */
476
regerror (errcode, preg, errbuf, errbuf_size)
486
|| errcode >= (int) (sizeof (__re_error_msgid_idx)
487
/ sizeof (__re_error_msgid_idx[0])), 0))
488
/* Only error codes returned by the rest of the code should be passed
489
to this routine. If we are given anything else, or if other regex
490
code generates an invalid error code, then the program has a bug.
491
Dump core so we can fix it. */
494
msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
496
msg_size = strlen (msg) + 1; /* Includes the null. */
498
if (BE (errbuf_size != 0, 1))
500
if (BE (msg_size > errbuf_size, 0))
502
#if defined HAVE_MEMPCPY || defined _LIBC
503
*((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
505
memcpy (errbuf, msg, errbuf_size - 1);
506
errbuf[errbuf_size - 1] = 0;
510
memcpy (errbuf, msg, msg_size);
516
weak_alias (__regerror, regerror)
521
free_dfa_content (re_dfa_t *dfa)
525
re_free (dfa->subexps);
527
for (i = 0; i < dfa->nodes_len; ++i)
529
re_token_t *node = dfa->nodes + i;
530
#ifdef RE_ENABLE_I18N
531
if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
532
free_charset (node->opr.mbcset);
534
#endif /* RE_ENABLE_I18N */
535
if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
536
re_free (node->opr.sbcset);
538
re_free (dfa->nexts);
539
for (i = 0; i < dfa->nodes_len; ++i)
541
if (dfa->eclosures != NULL)
542
re_node_set_free (dfa->eclosures + i);
543
if (dfa->inveclosures != NULL)
544
re_node_set_free (dfa->inveclosures + i);
545
if (dfa->edests != NULL)
546
re_node_set_free (dfa->edests + i);
548
re_free (dfa->edests);
549
re_free (dfa->eclosures);
550
re_free (dfa->inveclosures);
551
re_free (dfa->nodes);
553
for (i = 0; i <= dfa->state_hash_mask; ++i)
555
struct re_state_table_entry *entry = dfa->state_table + i;
556
for (j = 0; j < entry->num; ++j)
558
re_dfastate_t *state = entry->array[j];
561
re_free (entry->array);
563
re_free (dfa->state_table);
565
if (dfa->word_char != NULL)
566
re_free (dfa->word_char);
568
re_free (dfa->re_str);
575
/* Free dynamically allocated space used by PREG. */
581
re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
582
if (BE (dfa != NULL, 1))
583
free_dfa_content (dfa);
585
re_free (preg->fastmap);
588
weak_alias (__regfree, regfree)
591
/* Entry points compatible with 4.2 BSD regex library. We don't define
592
them unless specifically requested. */
594
#if defined _REGEX_RE_COMP || defined _LIBC
596
/* BSD has one and only one pattern buffer. */
597
static struct re_pattern_buffer re_comp_buf;
601
/* Make these definitions weak in libc, so POSIX programs can redefine
602
these names if they don't use our functions, and still use
603
regcomp/regexec above without link errors. */
614
if (!re_comp_buf.buffer)
615
return gettext ("No previous regular expression");
619
if (re_comp_buf.buffer)
621
fastmap = re_comp_buf.fastmap;
622
re_comp_buf.fastmap = NULL;
623
__regfree (&re_comp_buf);
624
memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
625
re_comp_buf.fastmap = fastmap;
628
if (re_comp_buf.fastmap == NULL)
630
re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
631
if (re_comp_buf.fastmap == NULL)
632
return (char *) gettext (__re_error_msgid
633
+ __re_error_msgid_idx[(int) REG_ESPACE]);
636
/* Since `re_exec' always passes NULL for the `regs' argument, we
637
don't need to initialize the pattern buffer fields which affect it. */
639
/* Match anchors at newlines. */
640
re_comp_buf.newline_anchor = 1;
642
ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
647
/* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
648
return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
652
libc_freeres_fn (free_mem)
654
__regfree (&re_comp_buf);
658
#endif /* _REGEX_RE_COMP */
660
/* Internal entry point.
661
Compile the regular expression PATTERN, whose length is LENGTH.
662
SYNTAX indicate regular expression's syntax. */
665
re_compile_internal (preg, pattern, length, syntax)
667
const char * pattern;
671
reg_errcode_t err = REG_NOERROR;
675
/* Initialize the pattern buffer. */
676
preg->fastmap_accurate = 0;
677
preg->syntax = syntax;
678
preg->not_bol = preg->not_eol = 0;
681
preg->can_be_null = 0;
682
preg->regs_allocated = REGS_UNALLOCATED;
684
/* Initialize the dfa. */
685
dfa = (re_dfa_t *) preg->buffer;
686
if (preg->allocated < sizeof (re_dfa_t))
688
/* If zero allocated, but buffer is non-null, try to realloc
689
enough space. This loses if buffer's address is bogus, but
690
that is the user's responsibility. If ->buffer is NULL this
691
is a simple allocation. */
692
dfa = re_realloc (preg->buffer, re_dfa_t, 1);
695
preg->allocated = sizeof (re_dfa_t);
697
preg->buffer = (unsigned char *) dfa;
698
preg->used = sizeof (re_dfa_t);
700
err = init_dfa (dfa, length);
701
if (BE (err != REG_NOERROR, 0))
709
dfa->re_str = re_malloc (char, length + 1);
710
strncpy (dfa->re_str, pattern, length + 1);
713
err = re_string_construct (®exp, pattern, length, preg->translate,
715
if (BE (err != REG_NOERROR, 0))
723
/* Parse the regular expression, and build a structure tree. */
725
dfa->str_tree = parse (®exp, preg, syntax, &err);
726
if (BE (dfa->str_tree == NULL, 0))
727
goto re_compile_internal_free_return;
729
/* Analyze the tree and collect information which is necessary to
732
if (BE (err != REG_NOERROR, 0))
733
goto re_compile_internal_free_return;
735
/* Then create the initial state of the dfa. */
736
err = create_initial_state (dfa);
738
/* Release work areas. */
739
free_workarea_compile (preg);
740
re_string_destruct (®exp);
742
if (BE (err != REG_NOERROR, 0))
744
re_compile_internal_free_return:
745
free_dfa_content (dfa);
753
/* Initialize DFA. We use the length of the regular expression PAT_LEN
754
as the initial length of some arrays. */
757
init_dfa (dfa, pat_len)
763
memset (dfa, '\0', sizeof (re_dfa_t));
765
dfa->nodes_alloc = pat_len + 1;
766
dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
768
dfa->states_alloc = pat_len + 1;
770
/* table_size = 2 ^ ceil(log pat_len) */
771
for (table_size = 1; table_size > 0; table_size <<= 1)
772
if (table_size > pat_len)
775
dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
776
dfa->state_hash_mask = table_size - 1;
778
dfa->subexps_alloc = 1;
779
dfa->subexps = re_malloc (re_subexp_t, dfa->subexps_alloc);
780
dfa->word_char = NULL;
782
if (BE (dfa->nodes == NULL || dfa->state_table == NULL
783
|| dfa->subexps == NULL, 0))
785
/* We don't bother to free anything which was allocated. Very
786
soon the process will go down anyway. */
788
dfa->state_table = NULL;
795
/* Initialize WORD_CHAR table, which indicate which character is
796
"word". In this case "word" means that it is the word construction
797
character used by some operators like "\<", "\>", etc. */
804
dfa->word_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
805
if (BE (dfa->word_char == NULL, 0))
807
for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
808
for (j = 0; j < UINT_BITS; ++j, ++ch)
809
if (isalnum (ch) || ch == '_')
810
dfa->word_char[i] |= 1 << j;
814
/* Free the work area which are only used while compiling. */
817
free_workarea_compile (preg)
820
re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
821
free_bin_tree (dfa->str_tree);
822
dfa->str_tree = NULL;
823
re_free (dfa->org_indices);
824
dfa->org_indices = NULL;
827
/* Create initial states for all contexts. */
830
create_initial_state (dfa)
835
re_node_set init_nodes;
837
/* Initial states have the epsilon closure of the node which is
838
the first node of the regular expression. */
839
first = dfa->str_tree->first;
840
dfa->init_node = first;
841
err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
842
if (BE (err != REG_NOERROR, 0))
845
/* The back-references which are in initial states can epsilon transit,
846
since in this case all of the subexpressions can be null.
847
Then we add epsilon closures of the nodes which are the next nodes of
848
the back-references. */
849
if (dfa->nbackref > 0)
850
for (i = 0; i < init_nodes.nelem; ++i)
852
int node_idx = init_nodes.elems[i];
853
re_token_type_t type = dfa->nodes[node_idx].type;
856
if (type != OP_BACK_REF)
858
for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
860
re_token_t *clexp_node;
861
clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
862
if (clexp_node->type == OP_CLOSE_SUBEXP
863
&& clexp_node->opr.idx + 1 == dfa->nodes[node_idx].opr.idx)
866
if (clexp_idx == init_nodes.nelem)
869
if (type == OP_BACK_REF)
871
int dest_idx = dfa->edests[node_idx].elems[0];
872
if (!re_node_set_contains (&init_nodes, dest_idx))
874
re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
880
/* It must be the first time to invoke acquire_state. */
881
dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
882
/* We don't check ERR here, since the initial state must not be NULL. */
883
if (BE (dfa->init_state == NULL, 0))
885
if (dfa->init_state->has_constraint)
887
dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
889
dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
891
dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
895
if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
896
|| dfa->init_state_begbuf == NULL, 0))
900
dfa->init_state_word = dfa->init_state_nl
901
= dfa->init_state_begbuf = dfa->init_state;
903
re_node_set_free (&init_nodes);
907
/* Analyze the structure tree, and calculate "first", "next", "edest",
908
"eclosure", and "inveclosure". */
917
/* Allocate arrays. */
918
dfa->nexts = re_malloc (int, dfa->nodes_alloc);
919
dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
920
dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
921
dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
922
dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_alloc);
923
if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
924
|| dfa->eclosures == NULL || dfa->inveclosures == NULL, 0))
926
/* Initialize them. */
927
for (i = 0; i < dfa->nodes_len; ++i)
930
re_node_set_init_empty (dfa->edests + i);
931
re_node_set_init_empty (dfa->eclosures + i);
932
re_node_set_init_empty (dfa->inveclosures + i);
935
ret = analyze_tree (dfa, dfa->str_tree);
936
if (BE (ret == REG_NOERROR, 1))
938
ret = calc_eclosure (dfa);
939
if (ret == REG_NOERROR)
940
calc_inveclosure (dfa);
945
/* Helper functions for analyze.
946
This function calculate "first", "next", and "edest" for the subtree
947
whose root is NODE. */
950
analyze_tree (dfa, node)
955
if (node->first == -1)
956
calc_first (dfa, node);
957
if (node->next == -1)
958
calc_next (dfa, node);
959
if (node->eclosure.nelem == 0)
960
calc_epsdest (dfa, node);
961
/* Calculate "first" etc. for the left child. */
962
if (node->left != NULL)
964
ret = analyze_tree (dfa, node->left);
965
if (BE (ret != REG_NOERROR, 0))
968
/* Calculate "first" etc. for the right child. */
969
if (node->right != NULL)
971
ret = analyze_tree (dfa, node->right);
972
if (BE (ret != REG_NOERROR, 0))
978
/* Calculate "first" for the node NODE. */
980
calc_first (dfa, node)
985
idx = node->node_idx;
986
type = (node->type == 0) ? dfa->nodes[idx].type : node->type;
991
case OP_OPEN_BRACKET:
992
case OP_CLOSE_BRACKET:
993
case OP_OPEN_DUP_NUM:
994
case OP_CLOSE_DUP_NUM:
995
case OP_NON_MATCH_LIST:
996
case OP_OPEN_COLL_ELEM:
997
case OP_CLOSE_COLL_ELEM:
998
case OP_OPEN_EQUIV_CLASS:
999
case OP_CLOSE_EQUIV_CLASS:
1000
case OP_OPEN_CHAR_CLASS:
1001
case OP_CLOSE_CHAR_CLASS:
1002
/* These must not be appeared here. */
1008
case OP_DUP_ASTERISK:
1009
case OP_DUP_QUESTION:
1010
#ifdef RE_ENABLE_I18N
1011
case COMPLEX_BRACKET:
1012
#endif /* RE_ENABLE_I18N */
1013
case SIMPLE_BRACKET:
1016
case OP_OPEN_SUBEXP:
1017
case OP_CLOSE_SUBEXP:
1022
assert (node->left != NULL);
1024
if (node->left->first == -1)
1025
calc_first (dfa, node->left);
1026
node->first = node->left->first;
1031
/* else fall through */
1034
assert (node->left != NULL);
1036
if (node->left->first == -1)
1037
calc_first (dfa, node->left);
1038
node->first = node->left->first;
1043
/* Calculate "next" for the node NODE. */
1046
calc_next (dfa, node)
1051
bin_tree_t *parent = node->parent;
1055
idx = node->node_idx;
1056
if (node->type == 0)
1057
dfa->nexts[idx] = node->next;
1061
idx = parent->node_idx;
1062
type = (parent->type == 0) ? dfa->nodes[idx].type : parent->type;
1066
case OP_DUP_ASTERISK:
1071
if (parent->left == node)
1073
if (parent->right->first == -1)
1074
calc_first (dfa, parent->right);
1075
node->next = parent->right->first;
1078
/* else fall through */
1080
if (parent->next == -1)
1081
calc_next (dfa, parent);
1082
node->next = parent->next;
1085
idx = node->node_idx;
1086
if (node->type == 0)
1087
dfa->nexts[idx] = node->next;
1090
/* Calculate "edest" for the node NODE. */
1093
calc_epsdest (dfa, node)
1098
idx = node->node_idx;
1099
if (node->type == 0)
1101
if (dfa->nodes[idx].type == OP_DUP_ASTERISK
1102
|| dfa->nodes[idx].type == OP_DUP_PLUS
1103
|| dfa->nodes[idx].type == OP_DUP_QUESTION)
1105
if (node->left->first == -1)
1106
calc_first (dfa, node->left);
1107
if (node->next == -1)
1108
calc_next (dfa, node);
1109
re_node_set_init_2 (dfa->edests + idx, node->left->first,
1112
else if (dfa->nodes[idx].type == OP_ALT)
1115
if (node->left != NULL)
1117
if (node->left->first == -1)
1118
calc_first (dfa, node->left);
1119
left = node->left->first;
1123
if (node->next == -1)
1124
calc_next (dfa, node);
1127
if (node->right != NULL)
1129
if (node->right->first == -1)
1130
calc_first (dfa, node->right);
1131
right = node->right->first;
1135
if (node->next == -1)
1136
calc_next (dfa, node);
1139
re_node_set_init_2 (dfa->edests + idx, left, right);
1141
else if (dfa->nodes[idx].type == ANCHOR
1142
|| dfa->nodes[idx].type == OP_OPEN_SUBEXP
1143
|| dfa->nodes[idx].type == OP_CLOSE_SUBEXP
1144
|| dfa->nodes[idx].type == OP_BACK_REF)
1145
re_node_set_init_1 (dfa->edests + idx, node->next);
1149
/* Duplicate the epsilon closure of the node ROOT_NODE.
1150
Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1151
to their own constraint. */
1153
static reg_errcode_t
1154
duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node,
1157
int top_org_node, top_clone_node, root_node;
1158
unsigned int init_constraint;
1161
int org_node, clone_node, ret;
1162
unsigned int constraint = init_constraint;
1163
for (org_node = top_org_node, clone_node = top_clone_node;;)
1165
int org_dest, clone_dest;
1166
if (dfa->nodes[org_node].type == OP_BACK_REF)
1168
/* If the back reference epsilon-transit, its destination must
1169
also have the constraint. Then duplicate the epsilon closure
1170
of the destination of the back reference, and store it in
1171
edests of the back reference. */
1172
org_dest = dfa->nexts[org_node];
1173
re_node_set_empty (dfa->edests + clone_node);
1174
err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1175
if (BE (err != REG_NOERROR, 0))
1177
dfa->nexts[clone_node] = dfa->nexts[org_node];
1178
ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1179
if (BE (ret < 0, 0))
1182
else if (dfa->edests[org_node].nelem == 0)
1184
/* In case of the node can't epsilon-transit, don't duplicate the
1185
destination and store the original destination as the
1186
destination of the node. */
1187
dfa->nexts[clone_node] = dfa->nexts[org_node];
1190
else if (dfa->edests[org_node].nelem == 1)
1192
/* In case of the node can epsilon-transit, and it has only one
1194
org_dest = dfa->edests[org_node].elems[0];
1195
re_node_set_empty (dfa->edests + clone_node);
1196
if (dfa->nodes[org_node].type == ANCHOR)
1198
/* In case of the node has another constraint, append it. */
1199
if (org_node == root_node && clone_node != org_node)
1201
/* ...but if the node is root_node itself, it means the
1202
epsilon closure have a loop, then tie it to the
1203
destination of the root_node. */
1204
ret = re_node_set_insert (dfa->edests + clone_node,
1206
if (BE (ret < 0, 0))
1210
constraint |= dfa->nodes[org_node].opr.ctx_type;
1212
err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1213
if (BE (err != REG_NOERROR, 0))
1215
ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1216
if (BE (ret < 0, 0))
1219
else /* dfa->edests[org_node].nelem == 2 */
1221
/* In case of the node can epsilon-transit, and it has two
1222
destinations. E.g. '|', '*', '+', '?'. */
1223
org_dest = dfa->edests[org_node].elems[0];
1224
re_node_set_empty (dfa->edests + clone_node);
1225
/* Search for a duplicated node which satisfies the constraint. */
1226
clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1227
if (clone_dest == -1)
1229
/* There are no such a duplicated node, create a new one. */
1230
err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1231
if (BE (err != REG_NOERROR, 0))
1233
ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1234
if (BE (ret < 0, 0))
1236
err = duplicate_node_closure (dfa, org_dest, clone_dest,
1237
root_node, constraint);
1238
if (BE (err != REG_NOERROR, 0))
1243
/* There are a duplicated node which satisfy the constraint,
1244
use it to avoid infinite loop. */
1245
ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1246
if (BE (ret < 0, 0))
1250
org_dest = dfa->edests[org_node].elems[1];
1251
err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1252
if (BE (err != REG_NOERROR, 0))
1254
ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1255
if (BE (ret < 0, 0))
1258
org_node = org_dest;
1259
clone_node = clone_dest;
1264
/* Search for a node which is duplicated from the node ORG_NODE, and
1265
satisfies the constraint CONSTRAINT. */
1268
search_duplicated_node (dfa, org_node, constraint)
1271
unsigned int constraint;
1274
for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1276
if (org_node == dfa->org_indices[idx]
1277
&& constraint == dfa->nodes[idx].constraint)
1278
return idx; /* Found. */
1280
return -1; /* Not found. */
1283
/* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1284
The new index will be stored in NEW_IDX and return REG_NOERROR if succeeded,
1285
otherwise return the error code. */
1287
static reg_errcode_t
1288
duplicate_node (new_idx, dfa, org_idx, constraint)
1290
int *new_idx, org_idx;
1291
unsigned int constraint;
1296
dup = dfa->nodes[org_idx];
1297
dup_idx = re_dfa_add_node (dfa, dup, 1);
1298
if (BE (dup_idx == -1, 0))
1300
dfa->nodes[dup_idx].constraint = constraint;
1301
if (dfa->nodes[org_idx].type == ANCHOR)
1302
dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1303
dfa->nodes[dup_idx].duplicated = 1;
1304
re_node_set_init_empty (dfa->edests + dup_idx);
1305
re_node_set_init_empty (dfa->eclosures + dup_idx);
1306
re_node_set_init_empty (dfa->inveclosures + dup_idx);
1308
/* Store the index of the original node. */
1309
dfa->org_indices[dup_idx] = org_idx;
1315
calc_inveclosure (dfa)
1319
for (src = 0; src < dfa->nodes_len; ++src)
1321
for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1323
dest = dfa->eclosures[src].elems[idx];
1324
re_node_set_insert (dfa->inveclosures + dest, src);
1329
/* Calculate "eclosure" for all the node in DFA. */
1331
static reg_errcode_t
1335
int node_idx, incomplete;
1337
assert (dfa->nodes_len > 0);
1340
/* For each nodes, calculate epsilon closure. */
1341
for (node_idx = 0; ; ++node_idx)
1344
re_node_set eclosure_elem;
1345
if (node_idx == dfa->nodes_len)
1354
assert (dfa->eclosures[node_idx].nelem != -1);
1356
/* If we have already calculated, skip it. */
1357
if (dfa->eclosures[node_idx].nelem != 0)
1359
/* Calculate epsilon closure of `node_idx'. */
1360
err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1361
if (BE (err != REG_NOERROR, 0))
1364
if (dfa->eclosures[node_idx].nelem == 0)
1367
re_node_set_free (&eclosure_elem);
1373
/* Calculate epsilon closure of NODE. */
1375
static reg_errcode_t
1376
calc_eclosure_iter (new_set, dfa, node, root)
1377
re_node_set *new_set;
1382
unsigned int constraint;
1384
re_node_set eclosure;
1386
err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1387
if (BE (err != REG_NOERROR, 0))
1390
/* This indicates that we are calculating this node now.
1391
We reference this value to avoid infinite loop. */
1392
dfa->eclosures[node].nelem = -1;
1394
constraint = ((dfa->nodes[node].type == ANCHOR)
1395
? dfa->nodes[node].opr.ctx_type : 0);
1396
/* If the current node has constraints, duplicate all nodes.
1397
Since they must inherit the constraints. */
1398
if (constraint && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1400
int org_node, cur_node;
1401
org_node = cur_node = node;
1402
err = duplicate_node_closure (dfa, node, node, node, constraint);
1403
if (BE (err != REG_NOERROR, 0))
1407
/* Expand each epsilon destination nodes. */
1408
if (IS_EPSILON_NODE(dfa->nodes[node].type))
1409
for (i = 0; i < dfa->edests[node].nelem; ++i)
1411
re_node_set eclosure_elem;
1412
int edest = dfa->edests[node].elems[i];
1413
/* If calculating the epsilon closure of `edest' is in progress,
1414
return intermediate result. */
1415
if (dfa->eclosures[edest].nelem == -1)
1420
/* If we haven't calculated the epsilon closure of `edest' yet,
1421
calculate now. Otherwise use calculated epsilon closure. */
1422
if (dfa->eclosures[edest].nelem == 0)
1424
err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1425
if (BE (err != REG_NOERROR, 0))
1429
eclosure_elem = dfa->eclosures[edest];
1430
/* Merge the epsilon closure of `edest'. */
1431
re_node_set_merge (&eclosure, &eclosure_elem);
1432
/* If the epsilon closure of `edest' is incomplete,
1433
the epsilon closure of this node is also incomplete. */
1434
if (dfa->eclosures[edest].nelem == 0)
1437
re_node_set_free (&eclosure_elem);
1441
/* Epsilon closures include itself. */
1442
re_node_set_insert (&eclosure, node);
1443
if (incomplete && !root)
1444
dfa->eclosures[node].nelem = 0;
1446
dfa->eclosures[node] = eclosure;
1447
*new_set = eclosure;
1451
/* Functions for token which are used in the parser. */
1453
/* Fetch a token from INPUT.
1454
We must not use this function inside bracket expressions. */
1457
fetch_token (input, syntax)
1459
reg_syntax_t syntax;
1463
consumed_byte = peek_token (&token, input, syntax);
1464
re_string_skip_bytes (input, consumed_byte);
1468
/* Peek a token from INPUT, and return the length of the token.
1469
We must not use this function inside bracket expressions. */
1472
peek_token (token, input, syntax)
1475
reg_syntax_t syntax;
1479
if (re_string_eoi (input))
1481
token->type = END_OF_RE;
1485
c = re_string_peek_byte (input, 0);
1488
#ifdef RE_ENABLE_I18N
1489
token->mb_partial = 0;
1490
if (MB_CUR_MAX > 1 &&
1491
!re_string_first_byte (input, re_string_cur_idx (input)))
1493
token->type = CHARACTER;
1494
token->mb_partial = 1;
1501
if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1503
token->type = BACK_SLASH;
1507
c2 = re_string_peek_byte_case (input, 1);
1509
token->type = CHARACTER;
1513
if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1514
token->type = OP_ALT;
1516
case '1': case '2': case '3': case '4': case '5':
1517
case '6': case '7': case '8': case '9':
1518
if (!(syntax & RE_NO_BK_REFS))
1520
token->type = OP_BACK_REF;
1521
token->opr.idx = c2 - '0';
1525
if (!(syntax & RE_NO_GNU_OPS))
1527
token->type = ANCHOR;
1528
token->opr.idx = WORD_FIRST;
1532
if (!(syntax & RE_NO_GNU_OPS))
1534
token->type = ANCHOR;
1535
token->opr.idx = WORD_LAST;
1539
if (!(syntax & RE_NO_GNU_OPS))
1541
token->type = ANCHOR;
1542
token->opr.idx = WORD_DELIM;
1546
if (!(syntax & RE_NO_GNU_OPS))
1548
token->type = ANCHOR;
1549
token->opr.idx = INSIDE_WORD;
1553
if (!(syntax & RE_NO_GNU_OPS))
1554
token->type = OP_WORD;
1557
if (!(syntax & RE_NO_GNU_OPS))
1558
token->type = OP_NOTWORD;
1561
if (!(syntax & RE_NO_GNU_OPS))
1563
token->type = ANCHOR;
1564
token->opr.idx = BUF_FIRST;
1568
if (!(syntax & RE_NO_GNU_OPS))
1570
token->type = ANCHOR;
1571
token->opr.idx = BUF_LAST;
1575
if (!(syntax & RE_NO_BK_PARENS))
1576
token->type = OP_OPEN_SUBEXP;
1579
if (!(syntax & RE_NO_BK_PARENS))
1580
token->type = OP_CLOSE_SUBEXP;
1583
if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1584
token->type = OP_DUP_PLUS;
1587
if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1588
token->type = OP_DUP_QUESTION;
1591
if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1592
token->type = OP_OPEN_DUP_NUM;
1595
if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1596
token->type = OP_CLOSE_DUP_NUM;
1604
token->type = CHARACTER;
1608
if (syntax & RE_NEWLINE_ALT)
1609
token->type = OP_ALT;
1612
if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1613
token->type = OP_ALT;
1616
token->type = OP_DUP_ASTERISK;
1619
if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1620
token->type = OP_DUP_PLUS;
1623
if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1624
token->type = OP_DUP_QUESTION;
1627
if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1628
token->type = OP_OPEN_DUP_NUM;
1631
if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1632
token->type = OP_CLOSE_DUP_NUM;
1635
if (syntax & RE_NO_BK_PARENS)
1636
token->type = OP_OPEN_SUBEXP;
1639
if (syntax & RE_NO_BK_PARENS)
1640
token->type = OP_CLOSE_SUBEXP;
1643
token->type = OP_OPEN_BRACKET;
1646
token->type = OP_PERIOD;
1649
if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1650
re_string_cur_idx (input) != 0)
1652
char prev = re_string_peek_byte (input, -1);
1653
if (prev != '|' && prev != '(' &&
1654
(!(syntax & RE_NEWLINE_ALT) || prev != '\n'))
1657
token->type = ANCHOR;
1658
token->opr.idx = LINE_FIRST;
1661
if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1662
re_string_cur_idx (input) + 1 != re_string_length (input))
1665
re_string_skip_bytes (input, 1);
1666
peek_token (&next, input, syntax);
1667
re_string_skip_bytes (input, -1);
1668
if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1671
token->type = ANCHOR;
1672
token->opr.idx = LINE_LAST;
1680
/* Peek a token from INPUT, and return the length of the token.
1681
We must not use this function out of bracket expressions. */
1684
peek_token_bracket (token, input, syntax)
1687
reg_syntax_t syntax;
1690
if (re_string_eoi (input))
1692
token->type = END_OF_RE;
1695
c = re_string_peek_byte (input, 0);
1698
#ifdef RE_ENABLE_I18N
1699
if (MB_CUR_MAX > 1 &&
1700
!re_string_first_byte (input, re_string_cur_idx (input)))
1702
token->type = CHARACTER;
1705
#endif /* RE_ENABLE_I18N */
1707
if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS))
1709
/* In this case, '\' escape a character. */
1711
re_string_skip_bytes (input, 1);
1712
c2 = re_string_peek_byte (input, 0);
1714
token->type = CHARACTER;
1717
if (c == '[') /* '[' is a special char in a bracket exps. */
1721
c2 = re_string_peek_byte (input, 1);
1727
token->type = OP_OPEN_COLL_ELEM;
1730
token->type = OP_OPEN_EQUIV_CLASS;
1733
if (syntax & RE_CHAR_CLASSES)
1735
token->type = OP_OPEN_CHAR_CLASS;
1738
/* else fall through. */
1740
token->type = CHARACTER;
1750
token->type = OP_CHARSET_RANGE;
1753
token->type = OP_CLOSE_BRACKET;
1756
token->type = OP_NON_MATCH_LIST;
1759
token->type = CHARACTER;
1764
/* Functions for parser. */
1766
/* Entry point of the parser.
1767
Parse the regular expression REGEXP and return the structure tree.
1768
If an error is occured, ERR is set by error code, and return NULL.
1769
This function build the following tree, from regular expression <reg_exp>:
1775
CAT means concatenation.
1776
EOR means end of regular expression. */
1779
parse (regexp, preg, syntax, err)
1780
re_string_t *regexp;
1782
reg_syntax_t syntax;
1785
re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1786
bin_tree_t *tree, *eor, *root;
1787
re_token_t current_token;
1789
current_token = fetch_token (regexp, syntax);
1790
tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err);
1791
if (BE (*err != REG_NOERROR && tree == NULL, 0))
1793
new_idx = re_dfa_add_node (dfa, current_token, 0);
1794
eor = create_tree (NULL, NULL, 0, new_idx);
1796
root = create_tree (tree, eor, CONCAT, 0);
1799
if (BE (new_idx == -1 || eor == NULL || root == NULL, 0))
1807
/* This function build the following tree, from regular expression
1808
<branch1>|<branch2>:
1814
ALT means alternative, which represents the operator `|'. */
1817
parse_reg_exp (regexp, preg, token, syntax, nest, err)
1818
re_string_t *regexp;
1821
reg_syntax_t syntax;
1825
re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1826
bin_tree_t *tree, *branch = NULL;
1828
tree = parse_branch (regexp, preg, token, syntax, nest, err);
1829
if (BE (*err != REG_NOERROR && tree == NULL, 0))
1832
while (token->type == OP_ALT)
1834
re_token_t alt_token = *token;
1835
new_idx = re_dfa_add_node (dfa, alt_token, 0);
1836
*token = fetch_token (regexp, syntax);
1837
if (token->type != OP_ALT && token->type != END_OF_RE
1838
&& (nest == 0 || token->type != OP_CLOSE_SUBEXP))
1840
branch = parse_branch (regexp, preg, token, syntax, nest, err);
1841
if (BE (*err != REG_NOERROR && branch == NULL, 0))
1843
free_bin_tree (tree);
1849
tree = create_tree (tree, branch, 0, new_idx);
1850
if (BE (new_idx == -1 || tree == NULL, 0))
1855
dfa->has_plural_match = 1;
1860
/* This function build the following tree, from regular expression
1867
CAT means concatenation. */
1870
parse_branch (regexp, preg, token, syntax, nest, err)
1871
re_string_t *regexp;
1874
reg_syntax_t syntax;
1878
bin_tree_t *tree, *exp;
1879
tree = parse_expression (regexp, preg, token, syntax, nest, err);
1880
if (BE (*err != REG_NOERROR && tree == NULL, 0))
1883
while (token->type != OP_ALT && token->type != END_OF_RE
1884
&& (nest == 0 || token->type != OP_CLOSE_SUBEXP))
1886
exp = parse_expression (regexp, preg, token, syntax, nest, err);
1887
if (BE (*err != REG_NOERROR && exp == NULL, 0))
1889
free_bin_tree (tree);
1892
if (tree != NULL && exp != NULL)
1894
tree = create_tree (tree, exp, CONCAT, 0);
1901
else if (tree == NULL)
1903
/* Otherwise exp == NULL, we don't need to create new tree. */
1908
/* This function build the following tree, from regular expression a*:
1915
parse_expression (regexp, preg, token, syntax, nest, err)
1916
re_string_t *regexp;
1919
reg_syntax_t syntax;
1923
re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1926
switch (token->type)
1929
new_idx = re_dfa_add_node (dfa, *token, 0);
1930
tree = create_tree (NULL, NULL, 0, new_idx);
1931
if (BE (new_idx == -1 || tree == NULL, 0))
1936
#ifdef RE_ENABLE_I18N
1939
while (!re_string_eoi (regexp)
1940
&& !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
1942
bin_tree_t *mbc_remain;
1943
*token = fetch_token (regexp, syntax);
1944
new_idx = re_dfa_add_node (dfa, *token, 0);
1945
mbc_remain = create_tree (NULL, NULL, 0, new_idx);
1946
tree = create_tree (tree, mbc_remain, CONCAT, 0);
1947
if (BE (new_idx == -1 || mbc_remain == NULL || tree == NULL, 0))
1956
case OP_OPEN_SUBEXP:
1957
tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
1958
if (BE (*err != REG_NOERROR && tree == NULL, 0))
1961
case OP_OPEN_BRACKET:
1962
tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
1963
if (BE (*err != REG_NOERROR && tree == NULL, 0))
1967
if (BE (preg->re_nsub < token->opr.idx
1968
|| dfa->subexps[token->opr.idx - 1].end == -1, 0))
1973
dfa->used_bkref_map |= 1 << (token->opr.idx - 1);
1974
new_idx = re_dfa_add_node (dfa, *token, 0);
1975
tree = create_tree (NULL, NULL, 0, new_idx);
1976
if (BE (new_idx == -1 || tree == NULL, 0))
1982
dfa->has_mb_node = 1;
1984
case OP_DUP_ASTERISK:
1986
case OP_DUP_QUESTION:
1987
case OP_OPEN_DUP_NUM:
1988
if (syntax & RE_CONTEXT_INVALID_OPS)
1993
else if (syntax & RE_CONTEXT_INDEP_OPS)
1995
*token = fetch_token (regexp, syntax);
1996
return parse_expression (regexp, preg, token, syntax, nest, err);
1998
/* else fall through */
1999
case OP_CLOSE_SUBEXP:
2000
if ((token->type == OP_CLOSE_SUBEXP) &&
2001
!(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2006
/* else fall through */
2007
case OP_CLOSE_DUP_NUM:
2008
/* We treat it as a normal character. */
2010
/* Then we can these characters as normal characters. */
2011
token->type = CHARACTER;
2012
new_idx = re_dfa_add_node (dfa, *token, 0);
2013
tree = create_tree (NULL, NULL, 0, new_idx);
2014
if (BE (new_idx == -1 || tree == NULL, 0))
2021
if (dfa->word_char == NULL)
2023
*err = init_word_char (dfa);
2024
if (BE (*err != REG_NOERROR, 0))
2027
if (token->opr.ctx_type == WORD_DELIM)
2029
bin_tree_t *tree_first, *tree_last;
2030
int idx_first, idx_last;
2031
token->opr.ctx_type = WORD_FIRST;
2032
idx_first = re_dfa_add_node (dfa, *token, 0);
2033
tree_first = create_tree (NULL, NULL, 0, idx_first);
2034
token->opr.ctx_type = WORD_LAST;
2035
idx_last = re_dfa_add_node (dfa, *token, 0);
2036
tree_last = create_tree (NULL, NULL, 0, idx_last);
2037
token->type = OP_ALT;
2038
new_idx = re_dfa_add_node (dfa, *token, 0);
2039
tree = create_tree (tree_first, tree_last, 0, new_idx);
2040
if (BE (idx_first == -1 || idx_last == -1 || new_idx == -1
2041
|| tree_first == NULL || tree_last == NULL
2042
|| tree == NULL, 0))
2050
new_idx = re_dfa_add_node (dfa, *token, 0);
2051
tree = create_tree (NULL, NULL, 0, new_idx);
2052
if (BE (new_idx == -1 || tree == NULL, 0))
2058
/* We must return here, since ANCHORs can't be followed
2059
by repetition operators.
2060
eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2061
it must not be "<ANCHOR(^)><REPEAT(*)>". */
2062
*token = fetch_token (regexp, syntax);
2065
new_idx = re_dfa_add_node (dfa, *token, 0);
2066
tree = create_tree (NULL, NULL, 0, new_idx);
2067
if (BE (new_idx == -1 || tree == NULL, 0))
2073
dfa->has_mb_node = 1;
2076
tree = build_word_op (dfa, 0, err);
2077
if (BE (*err != REG_NOERROR && tree == NULL, 0))
2081
tree = build_word_op (dfa, 1, err);
2082
if (BE (*err != REG_NOERROR && tree == NULL, 0))
2092
/* Must not happen? */
2098
*token = fetch_token (regexp, syntax);
2100
while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2101
|| token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2103
tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2104
if (BE (*err != REG_NOERROR && tree == NULL, 0))
2106
dfa->has_plural_match = 1;
2112
/* This function build the following tree, from regular expression
2120
parse_sub_exp (regexp, preg, token, syntax, nest, err)
2121
re_string_t *regexp;
2124
reg_syntax_t syntax;
2128
re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2129
bin_tree_t *tree, *left_par, *right_par;
2132
cur_nsub = preg->re_nsub++;
2133
if (dfa->subexps_alloc < preg->re_nsub)
2135
re_subexp_t *new_array;
2136
dfa->subexps_alloc *= 2;
2137
new_array = re_realloc (dfa->subexps, re_subexp_t, dfa->subexps_alloc);
2138
if (BE (new_array == NULL, 0))
2140
dfa->subexps_alloc /= 2;
2144
dfa->subexps = new_array;
2146
dfa->subexps[cur_nsub].start = dfa->nodes_len;
2147
dfa->subexps[cur_nsub].end = -1;
2149
new_idx = re_dfa_add_node (dfa, *token, 0);
2150
left_par = create_tree (NULL, NULL, 0, new_idx);
2151
if (BE (new_idx == -1 || left_par == NULL, 0))
2156
dfa->nodes[new_idx].opr.idx = cur_nsub;
2157
*token = fetch_token (regexp, syntax);
2159
/* The subexpression may be a null string. */
2160
if (token->type == OP_CLOSE_SUBEXP)
2164
tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2165
if (BE (*err != REG_NOERROR && tree == NULL, 0))
2168
if (BE (token->type != OP_CLOSE_SUBEXP, 0))
2170
free_bin_tree (tree);
2174
new_idx = re_dfa_add_node (dfa, *token, 0);
2175
dfa->subexps[cur_nsub].end = dfa->nodes_len;
2176
right_par = create_tree (NULL, NULL, 0, new_idx);
2177
tree = ((tree == NULL) ? right_par
2178
: create_tree (tree, right_par, CONCAT, 0));
2179
tree = create_tree (left_par, tree, CONCAT, 0);
2180
if (BE (new_idx == -1 || right_par == NULL || tree == NULL, 0))
2185
dfa->nodes[new_idx].opr.idx = cur_nsub;
2190
/* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2193
parse_dup_op (dup_elem, regexp, dfa, token, syntax, err)
2194
bin_tree_t *dup_elem;
2195
re_string_t *regexp;
2198
reg_syntax_t syntax;
2201
re_token_t dup_token;
2202
bin_tree_t *tree = dup_elem, *work_tree;
2203
int new_idx, start_idx = re_string_cur_idx (regexp);
2204
re_token_t start_token = *token;
2205
if (token->type == OP_OPEN_DUP_NUM)
2209
int start = fetch_number (regexp, token, syntax);
2213
if (token->type == CHARACTER && token->opr.c == ',')
2214
start = 0; /* We treat "{,m}" as "{0,m}". */
2217
*err = REG_BADBR; /* <re>{} is invalid. */
2221
if (BE (start != -2, 1))
2223
/* We treat "{n}" as "{n,n}". */
2224
end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2225
: ((token->type == CHARACTER && token->opr.c == ',')
2226
? fetch_number (regexp, token, syntax) : -2));
2228
if (BE (start == -2 || end == -2, 0))
2230
/* Invalid sequence. */
2231
if (token->type == OP_CLOSE_DUP_NUM)
2232
goto parse_dup_op_invalid_interval;
2234
goto parse_dup_op_ebrace;
2236
if (BE (start == 0 && end == 0, 0))
2238
/* We treat "<re>{0}" and "<re>{0,0}" as null string. */
2239
*token = fetch_token (regexp, syntax);
2240
free_bin_tree (dup_elem);
2244
/* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2246
for (i = 0; i < start; ++i)
2249
work_tree = duplicate_tree (elem, dfa);
2250
tree = create_tree (tree, work_tree, CONCAT, 0);
2251
if (BE (work_tree == NULL || tree == NULL, 0))
2252
goto parse_dup_op_espace;
2257
/* We treat "<re>{0,}" as "<re>*". */
2258
dup_token.type = OP_DUP_ASTERISK;
2261
elem = duplicate_tree (elem, dfa);
2262
new_idx = re_dfa_add_node (dfa, dup_token, 0);
2263
work_tree = create_tree (elem, NULL, 0, new_idx);
2264
tree = create_tree (tree, work_tree, CONCAT, 0);
2265
if (BE (elem == NULL || new_idx == -1 || work_tree == NULL
2266
|| tree == NULL, 0))
2267
goto parse_dup_op_espace;
2271
new_idx = re_dfa_add_node (dfa, dup_token, 0);
2272
tree = create_tree (elem, NULL, 0, new_idx);
2273
if (BE (new_idx == -1 || tree == NULL, 0))
2274
goto parse_dup_op_espace;
2277
else if (end - start > 0)
2279
/* Then extract "<re>{0,m}" to "<re>?<re>?...<re>?". */
2280
dup_token.type = OP_DUP_QUESTION;
2283
elem = duplicate_tree (elem, dfa);
2284
new_idx = re_dfa_add_node (dfa, dup_token, 0);
2285
elem = create_tree (elem, NULL, 0, new_idx);
2286
tree = create_tree (tree, elem, CONCAT, 0);
2287
if (BE (elem == NULL || new_idx == -1 || tree == NULL, 0))
2288
goto parse_dup_op_espace;
2292
new_idx = re_dfa_add_node (dfa, dup_token, 0);
2293
tree = elem = create_tree (elem, NULL, 0, new_idx);
2294
if (BE (new_idx == -1 || tree == NULL, 0))
2295
goto parse_dup_op_espace;
2297
for (i = 1; i < end - start; ++i)
2299
work_tree = duplicate_tree (elem, dfa);
2300
tree = create_tree (tree, work_tree, CONCAT, 0);
2301
if (BE (work_tree == NULL || tree == NULL, 0))
2311
new_idx = re_dfa_add_node (dfa, *token, 0);
2312
tree = create_tree (tree, NULL, 0, new_idx);
2313
if (BE (new_idx == -1 || tree == NULL, 0))
2319
*token = fetch_token (regexp, syntax);
2322
parse_dup_op_espace:
2323
free_bin_tree (tree);
2327
parse_dup_op_ebrace:
2328
if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2333
goto parse_dup_op_rollback;
2334
parse_dup_op_invalid_interval:
2335
if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2340
parse_dup_op_rollback:
2341
re_string_set_index (regexp, start_idx);
2342
*token = start_token;
2343
token->type = CHARACTER;
2347
/* Size of the names for collating symbol/equivalence_class/character_class.
2348
I'm not sure, but maybe enough. */
2349
#define BRACKET_NAME_BUF_SIZE 32
2352
/* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2353
Build the range expression which starts from START_ELEM, and ends
2354
at END_ELEM. The result are written to MBCSET and SBCSET.
2355
RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2356
mbcset->range_ends, is a pointer argument sinse we may
2359
static reg_errcode_t
2360
# ifdef RE_ENABLE_I18N
2361
build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2362
re_charset_t *mbcset;
2364
# else /* not RE_ENABLE_I18N */
2365
build_range_exp (sbcset, start_elem, end_elem)
2366
# endif /* not RE_ENABLE_I18N */
2367
re_bitset_ptr_t sbcset;
2368
bracket_elem_t *start_elem, *end_elem;
2370
unsigned int start_ch, end_ch;
2371
/* Equivalence Classes and Character Classes can't be a range start/end. */
2372
if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2373
|| end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2377
/* We can handle no multi character collating elements without libc
2379
if (BE ((start_elem->type == COLL_SYM
2380
&& strlen ((char *) start_elem->opr.name) > 1)
2381
|| (end_elem->type == COLL_SYM
2382
&& strlen ((char *) end_elem->opr.name) > 1), 0))
2383
return REG_ECOLLATE;
2385
# ifdef RE_ENABLE_I18N
2387
wchar_t wc, start_wc, end_wc;
2388
wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2390
start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2391
: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2393
end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2394
: ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2396
start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2397
? __btowc (start_ch) : start_elem->opr.wch);
2398
end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2399
? __btowc (end_ch) : end_elem->opr.wch);
2400
cmp_buf[0] = start_wc;
2401
cmp_buf[4] = end_wc;
2402
if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2405
/* Check the space of the arrays. */
2406
if (*range_alloc == mbcset->nranges)
2408
/* There are not enough space, need realloc. */
2409
wchar_t *new_array_start, *new_array_end;
2412
/* +1 in case of mbcset->nranges is 0. */
2413
new_nranges = 2 * mbcset->nranges + 1;
2414
/* Use realloc since mbcset->range_starts and mbcset->range_ends
2415
are NULL if *range_alloc == 0. */
2416
new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2418
new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2421
if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2424
mbcset->range_starts = new_array_start;
2425
mbcset->range_ends = new_array_end;
2426
*range_alloc = new_nranges;
2429
mbcset->range_starts[mbcset->nranges] = start_wc;
2430
mbcset->range_ends[mbcset->nranges++] = end_wc;
2432
/* Build the table for single byte characters. */
2433
for (wc = 0; wc <= SBC_MAX; ++wc)
2436
if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2437
&& wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2438
bitset_set (sbcset, wc);
2441
# else /* not RE_ENABLE_I18N */
2444
start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2445
: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2447
end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2448
: ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2450
if (start_ch > end_ch)
2452
/* Build the table for single byte characters. */
2453
for (ch = 0; ch <= SBC_MAX; ++ch)
2454
if (start_ch <= ch && ch <= end_ch)
2455
bitset_set (sbcset, ch);
2457
# endif /* not RE_ENABLE_I18N */
2460
#endif /* not _LIBC */
2463
/* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2464
Build the collating element which is represented by NAME.
2465
The result are written to MBCSET and SBCSET.
2466
COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2467
pointer argument since we may update it. */
2469
static reg_errcode_t
2470
# ifdef RE_ENABLE_I18N
2471
build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2472
re_charset_t *mbcset;
2473
int *coll_sym_alloc;
2474
# else /* not RE_ENABLE_I18N */
2475
build_collating_symbol (sbcset, name)
2476
# endif /* not RE_ENABLE_I18N */
2477
re_bitset_ptr_t sbcset;
2478
const unsigned char *name;
2480
size_t name_len = strlen ((const char *) name);
2481
if (BE (name_len != 1, 0))
2482
return REG_ECOLLATE;
2485
bitset_set (sbcset, name[0]);
2489
#endif /* not _LIBC */
2491
/* This function parse bracket expression like "[abc]", "[a-c]",
2495
parse_bracket_exp (regexp, dfa, token, syntax, err)
2496
re_string_t *regexp;
2499
reg_syntax_t syntax;
2503
const unsigned char *collseqmb;
2504
const char *collseqwc;
2507
const int32_t *symb_table;
2508
const unsigned char *extra;
2510
/* Local function for parse_bracket_exp used in _LIBC environement.
2511
Seek the collating symbol entry correspondings to NAME.
2512
Return the index of the symbol in the SYMB_TABLE. */
2514
static inline int32_t
2515
seek_collating_symbol_entry (name, name_len)
2516
const unsigned char *name;
2519
int32_t hash = elem_hash ((const char *) name, name_len);
2520
int32_t elem = hash % table_size;
2521
int32_t second = hash % (table_size - 2);
2522
while (symb_table[2 * elem] != 0)
2524
/* First compare the hashing value. */
2525
if (symb_table[2 * elem] == hash
2526
/* Compare the length of the name. */
2527
&& name_len == extra[symb_table[2 * elem + 1]]
2528
/* Compare the name. */
2529
&& memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2532
/* Yep, this is the entry. */
2542
/* Local function for parse_bracket_exp used in _LIBC environement.
2543
Look up the collation sequence value of BR_ELEM.
2544
Return the value if succeeded, UINT_MAX otherwise. */
2546
static inline unsigned int
2547
lookup_collation_sequence_value (br_elem)
2548
bracket_elem_t *br_elem;
2550
if (br_elem->type == SB_CHAR)
2553
if (MB_CUR_MAX == 1)
2556
return collseqmb[br_elem->opr.ch];
2559
wint_t wc = __btowc (br_elem->opr.ch);
2560
return collseq_table_lookup (collseqwc, wc);
2563
else if (br_elem->type == MB_CHAR)
2565
return collseq_table_lookup (collseqwc, br_elem->opr.wch);
2567
else if (br_elem->type == COLL_SYM)
2569
size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2573
elem = seek_collating_symbol_entry (br_elem->opr.name,
2575
if (symb_table[2 * elem] != 0)
2577
/* We found the entry. */
2578
idx = symb_table[2 * elem + 1];
2579
/* Skip the name of collating element name. */
2580
idx += 1 + extra[idx];
2581
/* Skip the byte sequence of the collating element. */
2582
idx += 1 + extra[idx];
2583
/* Adjust for the alignment. */
2584
idx = (idx + 3) & ~3;
2585
/* Skip the multibyte collation sequence value. */
2586
idx += sizeof (unsigned int);
2587
/* Skip the wide char sequence of the collating element. */
2588
idx += sizeof (unsigned int) *
2589
(1 + *(unsigned int *) (extra + idx));
2590
/* Return the collation sequence value. */
2591
return *(unsigned int *) (extra + idx);
2593
else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2595
/* No valid character. Match it as a single byte
2597
return collseqmb[br_elem->opr.name[0]];
2600
else if (sym_name_len == 1)
2601
return collseqmb[br_elem->opr.name[0]];
2606
/* Local function for parse_bracket_exp used in _LIBC environement.
2607
Build the range expression which starts from START_ELEM, and ends
2608
at END_ELEM. The result are written to MBCSET and SBCSET.
2609
RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2610
mbcset->range_ends, is a pointer argument sinse we may
2613
static inline reg_errcode_t
2614
# ifdef RE_ENABLE_I18N
2615
build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2616
re_charset_t *mbcset;
2618
# else /* not RE_ENABLE_I18N */
2619
build_range_exp (sbcset, start_elem, end_elem)
2620
# endif /* not RE_ENABLE_I18N */
2621
re_bitset_ptr_t sbcset;
2622
bracket_elem_t *start_elem, *end_elem;
2625
uint32_t start_collseq;
2626
uint32_t end_collseq;
2628
# ifdef RE_ENABLE_I18N
2629
/* Check the space of the arrays. */
2630
if (*range_alloc == mbcset->nranges)
2632
/* There are not enough space, need realloc. */
2633
uint32_t *new_array_start;
2634
uint32_t *new_array_end;
2637
/* +1 in case of mbcset->nranges is 0. */
2638
new_nranges = 2 * mbcset->nranges + 1;
2639
/* Use realloc since mbcset->range_starts and mbcset->range_ends
2640
are NULL if *range_alloc == 0. */
2641
new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2643
new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2646
if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2649
mbcset->range_starts = new_array_start;
2650
mbcset->range_ends = new_array_end;
2651
*range_alloc = new_nranges;
2653
# endif /* RE_ENABLE_I18N */
2655
/* Equivalence Classes and Character Classes can't be a range
2657
if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2658
|| end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2662
start_collseq = lookup_collation_sequence_value (start_elem);
2663
end_collseq = lookup_collation_sequence_value (end_elem);
2664
/* Check start/end collation sequence values. */
2665
if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2666
return REG_ECOLLATE;
2667
if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2670
# ifdef RE_ENABLE_I18N
2671
/* Got valid collation sequence values, add them as a new entry. */
2672
mbcset->range_starts[mbcset->nranges] = start_collseq;
2673
mbcset->range_ends[mbcset->nranges++] = end_collseq;
2674
# endif /* RE_ENABLE_I18N */
2676
/* Build the table for single byte characters. */
2677
for (ch = 0; ch <= SBC_MAX; ch++)
2679
uint32_t ch_collseq;
2681
if (MB_CUR_MAX == 1)
2684
ch_collseq = collseqmb[ch];
2686
ch_collseq = collseq_table_lookup (collseqwc, __btowc (ch));
2687
if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2688
bitset_set (sbcset, ch);
2693
/* Local function for parse_bracket_exp used in _LIBC environement.
2694
Build the collating element which is represented by NAME.
2695
The result are written to MBCSET and SBCSET.
2696
COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2697
pointer argument sinse we may update it. */
2699
static inline reg_errcode_t
2700
# ifdef RE_ENABLE_I18N
2701
build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2702
re_charset_t *mbcset;
2703
int *coll_sym_alloc;
2704
# else /* not RE_ENABLE_I18N */
2705
build_collating_symbol (sbcset, name)
2706
# endif /* not RE_ENABLE_I18N */
2707
re_bitset_ptr_t sbcset;
2708
const unsigned char *name;
2711
size_t name_len = strlen ((const char *) name);
2714
elem = seek_collating_symbol_entry (name, name_len);
2715
if (symb_table[2 * elem] != 0)
2717
/* We found the entry. */
2718
idx = symb_table[2 * elem + 1];
2719
/* Skip the name of collating element name. */
2720
idx += 1 + extra[idx];
2722
else if (symb_table[2 * elem] == 0 && name_len == 1)
2724
/* No valid character, treat it as a normal
2726
bitset_set (sbcset, name[0]);
2730
return REG_ECOLLATE;
2732
# ifdef RE_ENABLE_I18N
2733
/* Got valid collation sequence, add it as a new entry. */
2734
/* Check the space of the arrays. */
2735
if (*coll_sym_alloc == mbcset->ncoll_syms)
2737
/* Not enough, realloc it. */
2738
/* +1 in case of mbcset->ncoll_syms is 0. */
2739
*coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2740
/* Use realloc since mbcset->coll_syms is NULL
2742
mbcset->coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2744
if (BE (mbcset->coll_syms == NULL, 0))
2747
mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2748
# endif /* RE_ENABLE_I18N */
2753
if (BE (name_len != 1, 0))
2754
return REG_ECOLLATE;
2757
bitset_set (sbcset, name[0]);
2764
re_token_t br_token;
2765
re_bitset_ptr_t sbcset;
2766
#ifdef RE_ENABLE_I18N
2767
re_charset_t *mbcset;
2768
int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
2769
int equiv_class_alloc = 0, char_class_alloc = 0;
2770
#else /* not RE_ENABLE_I18N */
2772
#endif /* not RE_ENABLE_I18N */
2773
bin_tree_t *work_tree;
2774
int token_len, new_idx;
2776
collseqmb = (const unsigned char *)
2777
_NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
2778
nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2784
collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
2785
table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
2786
symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
2787
_NL_COLLATE_SYMB_TABLEMB);
2788
extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
2789
_NL_COLLATE_SYMB_EXTRAMB);
2792
sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
2793
#ifdef RE_ENABLE_I18N
2794
mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
2795
#endif /* RE_ENABLE_I18N */
2796
#ifdef RE_ENABLE_I18N
2797
if (BE (sbcset == NULL || mbcset == NULL, 0))
2799
if (BE (sbcset == NULL, 0))
2800
#endif /* RE_ENABLE_I18N */
2806
token_len = peek_token_bracket (token, regexp, syntax);
2807
if (BE (token->type == END_OF_RE, 0))
2810
goto parse_bracket_exp_free_return;
2812
if (token->type == OP_NON_MATCH_LIST)
2814
#ifdef RE_ENABLE_I18N
2816
mbcset->non_match = 1;
2817
#else /* not RE_ENABLE_I18N */
2819
#endif /* not RE_ENABLE_I18N */
2820
if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
2821
bitset_set (sbcset, '\0');
2822
re_string_skip_bytes (regexp, token_len); /* Skip a token. */
2823
token_len = peek_token_bracket (token, regexp, syntax);
2824
if (BE (token->type == END_OF_RE, 0))
2827
goto parse_bracket_exp_free_return;
2829
#ifdef RE_ENABLE_I18N
2831
for (i = 0; i < SBC_MAX; ++i)
2832
if (__btowc (i) == WEOF)
2833
bitset_set (sbcset, i);
2834
#endif /* RE_ENABLE_I18N */
2837
/* We treat the first ']' as a normal character. */
2838
if (token->type == OP_CLOSE_BRACKET)
2839
token->type = CHARACTER;
2843
bracket_elem_t start_elem, end_elem;
2844
unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
2845
unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
2847
int token_len2 = 0, is_range_exp = 0;
2850
start_elem.opr.name = start_name_buf;
2851
ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
2853
if (BE (ret != REG_NOERROR, 0))
2856
goto parse_bracket_exp_free_return;
2859
token_len = peek_token_bracket (token, regexp, syntax);
2860
if (BE (token->type == END_OF_RE, 0))
2863
goto parse_bracket_exp_free_return;
2865
if (token->type == OP_CHARSET_RANGE)
2867
re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
2868
token_len2 = peek_token_bracket (&token2, regexp, syntax);
2869
if (BE (token->type == END_OF_RE, 0))
2872
goto parse_bracket_exp_free_return;
2874
if (token2.type == OP_CLOSE_BRACKET)
2876
/* We treat the last '-' as a normal character. */
2877
re_string_skip_bytes (regexp, -token_len);
2878
token->type = CHARACTER;
2884
if (is_range_exp == 1)
2886
end_elem.opr.name = end_name_buf;
2887
ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
2889
if (BE (ret != REG_NOERROR, 0))
2892
goto parse_bracket_exp_free_return;
2895
token_len = peek_token_bracket (token, regexp, syntax);
2896
if (BE (token->type == END_OF_RE, 0))
2899
goto parse_bracket_exp_free_return;
2901
*err = build_range_exp (sbcset,
2902
#ifdef RE_ENABLE_I18N
2903
mbcset, &range_alloc,
2904
#endif /* RE_ENABLE_I18N */
2905
&start_elem, &end_elem);
2906
if (BE (*err != REG_NOERROR, 0))
2907
goto parse_bracket_exp_free_return;
2911
switch (start_elem.type)
2914
bitset_set (sbcset, start_elem.opr.ch);
2916
#ifdef RE_ENABLE_I18N
2918
/* Check whether the array has enough space. */
2919
if (mbchar_alloc == mbcset->nmbchars)
2921
/* Not enough, realloc it. */
2922
/* +1 in case of mbcset->nmbchars is 0. */
2923
mbchar_alloc = 2 * mbcset->nmbchars + 1;
2924
/* Use realloc since array is NULL if *alloc == 0. */
2925
mbcset->mbchars = re_realloc (mbcset->mbchars, wchar_t,
2927
if (BE (mbcset->mbchars == NULL, 0))
2928
goto parse_bracket_exp_espace;
2930
mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
2932
#endif /* RE_ENABLE_I18N */
2934
*err = build_equiv_class (sbcset,
2935
#ifdef RE_ENABLE_I18N
2936
mbcset, &equiv_class_alloc,
2937
#endif /* RE_ENABLE_I18N */
2938
start_elem.opr.name);
2939
if (BE (*err != REG_NOERROR, 0))
2940
goto parse_bracket_exp_free_return;
2943
*err = build_collating_symbol (sbcset,
2944
#ifdef RE_ENABLE_I18N
2945
mbcset, &coll_sym_alloc,
2946
#endif /* RE_ENABLE_I18N */
2947
start_elem.opr.name);
2948
if (BE (*err != REG_NOERROR, 0))
2949
goto parse_bracket_exp_free_return;
2952
*err = build_charclass (sbcset,
2953
#ifdef RE_ENABLE_I18N
2954
mbcset, &char_class_alloc,
2955
#endif /* RE_ENABLE_I18N */
2956
start_elem.opr.name, syntax);
2957
if (BE (*err != REG_NOERROR, 0))
2958
goto parse_bracket_exp_free_return;
2965
if (token->type == OP_CLOSE_BRACKET)
2969
re_string_skip_bytes (regexp, token_len); /* Skip a token. */
2971
/* If it is non-matching list. */
2972
#ifdef RE_ENABLE_I18N
2973
if (mbcset->non_match)
2974
#else /* not RE_ENABLE_I18N */
2976
#endif /* not RE_ENABLE_I18N */
2977
bitset_not (sbcset);
2979
/* Build a tree for simple bracket. */
2980
br_token.type = SIMPLE_BRACKET;
2981
br_token.opr.sbcset = sbcset;
2982
new_idx = re_dfa_add_node (dfa, br_token, 0);
2983
work_tree = create_tree (NULL, NULL, 0, new_idx);
2984
if (BE (new_idx == -1 || work_tree == NULL, 0))
2985
goto parse_bracket_exp_espace;
2987
#ifdef RE_ENABLE_I18N
2988
if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
2989
|| mbcset->nranges || (MB_CUR_MAX > 1 && (mbcset->nchar_classes
2990
|| mbcset->non_match)))
2992
re_token_t alt_token;
2993
bin_tree_t *mbc_tree;
2994
/* Build a tree for complex bracket. */
2995
br_token.type = COMPLEX_BRACKET;
2996
br_token.opr.mbcset = mbcset;
2997
dfa->has_mb_node = 1;
2998
new_idx = re_dfa_add_node (dfa, br_token, 0);
2999
mbc_tree = create_tree (NULL, NULL, 0, new_idx);
3000
if (BE (new_idx == -1 || mbc_tree == NULL, 0))
3001
goto parse_bracket_exp_espace;
3002
/* Then join them by ALT node. */
3003
dfa->has_plural_match = 1;
3004
alt_token.type = OP_ALT;
3005
new_idx = re_dfa_add_node (dfa, alt_token, 0);
3006
work_tree = create_tree (work_tree, mbc_tree, 0, new_idx);
3007
if (BE (new_idx != -1 && mbc_tree != NULL, 1))
3012
free_charset (mbcset);
3015
#else /* not RE_ENABLE_I18N */
3017
#endif /* not RE_ENABLE_I18N */
3019
parse_bracket_exp_espace:
3021
parse_bracket_exp_free_return:
3023
#ifdef RE_ENABLE_I18N
3024
free_charset (mbcset);
3025
#endif /* RE_ENABLE_I18N */
3029
/* Parse an element in the bracket expression. */
3031
static reg_errcode_t
3032
parse_bracket_element (elem, regexp, token, token_len, dfa, syntax)
3033
bracket_elem_t *elem;
3034
re_string_t *regexp;
3038
reg_syntax_t syntax;
3040
#ifdef RE_ENABLE_I18N
3042
cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3043
if (cur_char_size > 1)
3045
elem->type = MB_CHAR;
3046
elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3047
re_string_skip_bytes (regexp, cur_char_size);
3050
#endif /* RE_ENABLE_I18N */
3051
re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3052
if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3053
|| token->type == OP_OPEN_EQUIV_CLASS)
3054
return parse_bracket_symbol (elem, regexp, token);
3055
elem->type = SB_CHAR;
3056
elem->opr.ch = token->opr.c;
3060
/* Parse a bracket symbol in the bracket expression. Bracket symbols are
3061
such as [:<character_class>:], [.<collating_element>.], and
3062
[=<equivalent_class>=]. */
3064
static reg_errcode_t
3065
parse_bracket_symbol (elem, regexp, token)
3066
bracket_elem_t *elem;
3067
re_string_t *regexp;
3070
unsigned char ch, delim = token->opr.c;
3074
if (re_string_eoi(regexp) || i >= BRACKET_NAME_BUF_SIZE)
3076
if (token->type == OP_OPEN_CHAR_CLASS)
3077
ch = re_string_fetch_byte_case (regexp);
3079
ch = re_string_fetch_byte (regexp);
3080
if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3082
elem->opr.name[i] = ch;
3084
re_string_skip_bytes (regexp, 1);
3085
elem->opr.name[i] = '\0';
3086
switch (token->type)
3088
case OP_OPEN_COLL_ELEM:
3089
elem->type = COLL_SYM;
3091
case OP_OPEN_EQUIV_CLASS:
3092
elem->type = EQUIV_CLASS;
3094
case OP_OPEN_CHAR_CLASS:
3095
elem->type = CHAR_CLASS;
3103
/* Helper function for parse_bracket_exp.
3104
Build the equivalence class which is represented by NAME.
3105
The result are written to MBCSET and SBCSET.
3106
EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3107
is a pointer argument sinse we may update it. */
3109
static reg_errcode_t
3110
#ifdef RE_ENABLE_I18N
3111
build_equiv_class (sbcset, mbcset, equiv_class_alloc, name)
3112
re_charset_t *mbcset;
3113
int *equiv_class_alloc;
3114
#else /* not RE_ENABLE_I18N */
3115
build_equiv_class (sbcset, name)
3116
#endif /* not RE_ENABLE_I18N */
3117
re_bitset_ptr_t sbcset;
3118
const unsigned char *name;
3120
#if defined _LIBC && defined RE_ENABLE_I18N
3121
uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3124
const int32_t *table, *indirect;
3125
const unsigned char *weights, *extra, *cp;
3126
unsigned char char_buf[2];
3130
/* This #include defines a local function! */
3131
# include <locale/weight.h>
3132
/* Calculate the index for equivalence class. */
3134
table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3135
weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3136
_NL_COLLATE_WEIGHTMB);
3137
extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3138
_NL_COLLATE_EXTRAMB);
3139
indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3140
_NL_COLLATE_INDIRECTMB);
3141
idx1 = findidx (&cp);
3142
if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3143
/* This isn't a valid character. */
3144
return REG_ECOLLATE;
3146
/* Build single byte matcing table for this equivalence class. */
3147
char_buf[1] = (unsigned char) '\0';
3148
len = weights[idx1];
3149
for (ch = 0; ch < SBC_MAX; ++ch)
3153
idx2 = findidx (&cp);
3158
/* This isn't a valid character. */
3160
if (len == weights[idx2])
3163
while (cnt <= len &&
3164
weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3168
bitset_set (sbcset, ch);
3171
/* Check whether the array has enough space. */
3172
if (*equiv_class_alloc == mbcset->nequiv_classes)
3174
/* Not enough, realloc it. */
3175
/* +1 in case of mbcset->nequiv_classes is 0. */
3176
*equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3177
/* Use realloc since the array is NULL if *alloc == 0. */
3178
mbcset->equiv_classes = re_realloc (mbcset->equiv_classes, int32_t,
3179
*equiv_class_alloc);
3180
if (BE (mbcset->equiv_classes == NULL, 0))
3183
mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3186
#endif /* _LIBC && RE_ENABLE_I18N */
3188
if (BE (strlen ((const char *) name) != 1, 0))
3189
return REG_ECOLLATE;
3190
bitset_set (sbcset, *name);
3195
/* Helper function for parse_bracket_exp.
3196
Build the character class which is represented by NAME.
3197
The result are written to MBCSET and SBCSET.
3198
CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3199
is a pointer argument sinse we may update it. */
3201
static reg_errcode_t
3202
#ifdef RE_ENABLE_I18N
3203
build_charclass (sbcset, mbcset, char_class_alloc, class_name, syntax)
3204
re_charset_t *mbcset;
3205
int *char_class_alloc;
3206
#else /* not RE_ENABLE_I18N */
3207
build_charclass (sbcset, class_name, syntax)
3208
#endif /* not RE_ENABLE_I18N */
3209
re_bitset_ptr_t sbcset;
3210
const unsigned char *class_name;
3211
reg_syntax_t syntax;
3214
const char *name = (const char *) class_name;
3216
/* In case of REG_ICASE "upper" and "lower" match the both of
3217
upper and lower cases. */
3218
if ((syntax & RE_ICASE)
3219
&& (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3222
#ifdef RE_ENABLE_I18N
3223
/* Check the space of the arrays. */
3224
if (*char_class_alloc == mbcset->nchar_classes)
3226
/* Not enough, realloc it. */
3227
/* +1 in case of mbcset->nchar_classes is 0. */
3228
*char_class_alloc = 2 * mbcset->nchar_classes + 1;
3229
/* Use realloc since array is NULL if *alloc == 0. */
3230
mbcset->char_classes = re_realloc (mbcset->char_classes, wctype_t,
3232
if (BE (mbcset->char_classes == NULL, 0))
3235
mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3236
#endif /* RE_ENABLE_I18N */
3238
#define BUILD_CHARCLASS_LOOP(ctype_func)\
3239
for (i = 0; i < SBC_MAX; ++i) \
3241
if (ctype_func (i)) \
3242
bitset_set (sbcset, i); \
3245
if (strcmp (name, "alnum") == 0)
3246
BUILD_CHARCLASS_LOOP (isalnum)
3247
else if (strcmp (name, "cntrl") == 0)
3248
BUILD_CHARCLASS_LOOP (iscntrl)
3249
else if (strcmp (name, "lower") == 0)
3250
BUILD_CHARCLASS_LOOP (islower)
3251
else if (strcmp (name, "space") == 0)
3252
BUILD_CHARCLASS_LOOP (isspace)
3253
else if (strcmp (name, "alpha") == 0)
3254
BUILD_CHARCLASS_LOOP (isalpha)
3255
else if (strcmp (name, "digit") == 0)
3256
BUILD_CHARCLASS_LOOP (isdigit)
3257
else if (strcmp (name, "print") == 0)
3258
BUILD_CHARCLASS_LOOP (isprint)
3259
else if (strcmp (name, "upper") == 0)
3260
BUILD_CHARCLASS_LOOP (isupper)
3261
else if (strcmp (name, "blank") == 0)
3262
BUILD_CHARCLASS_LOOP (isblank)
3263
else if (strcmp (name, "graph") == 0)
3264
BUILD_CHARCLASS_LOOP (isgraph)
3265
else if (strcmp (name, "punct") == 0)
3266
BUILD_CHARCLASS_LOOP (ispunct)
3267
else if (strcmp (name, "xdigit") == 0)
3268
BUILD_CHARCLASS_LOOP (isxdigit)
3276
build_word_op (dfa, not, err)
3281
re_bitset_ptr_t sbcset;
3282
#ifdef RE_ENABLE_I18N
3283
re_charset_t *mbcset;
3285
#else /* not RE_ENABLE_I18N */
3287
#endif /* not RE_ENABLE_I18N */
3289
re_token_t br_token;
3293
sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3294
#ifdef RE_ENABLE_I18N
3295
mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3296
#endif /* RE_ENABLE_I18N */
3298
#ifdef RE_ENABLE_I18N
3299
if (BE (sbcset == NULL || mbcset == NULL, 0))
3300
#else /* not RE_ENABLE_I18N */
3301
if (BE (sbcset == NULL, 0))
3302
#endif /* not RE_ENABLE_I18N */
3310
#ifdef RE_ENABLE_I18N
3313
if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3314
bitset_set(cset->sbcset, '\0');
3316
mbcset->non_match = 1;
3318
for (i = 0; i < SBC_MAX; ++i)
3319
if (__btowc (i) == WEOF)
3320
bitset_set (sbcset, i);
3321
#else /* not RE_ENABLE_I18N */
3323
#endif /* not RE_ENABLE_I18N */
3326
/* We don't care the syntax in this case. */
3327
ret = build_charclass (sbcset,
3328
#ifdef RE_ENABLE_I18N
3330
#endif /* RE_ENABLE_I18N */
3331
(const unsigned char *) "alpha", 0);
3333
if (BE (ret != REG_NOERROR, 0))
3336
#ifdef RE_ENABLE_I18N
3337
free_charset (mbcset);
3338
#endif /* RE_ENABLE_I18N */
3342
/* \w match '_' also. */
3343
bitset_set (sbcset, '_');
3345
/* If it is non-matching list. */
3346
#ifdef RE_ENABLE_I18N
3347
if (mbcset->non_match)
3348
#else /* not RE_ENABLE_I18N */
3350
#endif /* not RE_ENABLE_I18N */
3351
bitset_not (sbcset);
3353
/* Build a tree for simple bracket. */
3354
br_token.type = SIMPLE_BRACKET;
3355
br_token.opr.sbcset = sbcset;
3356
new_idx = re_dfa_add_node (dfa, br_token, 0);
3357
tree = create_tree (NULL, NULL, 0, new_idx);
3358
if (BE (new_idx == -1 || tree == NULL, 0))
3359
goto build_word_op_espace;
3361
#ifdef RE_ENABLE_I18N
3364
re_token_t alt_token;
3365
bin_tree_t *mbc_tree;
3366
/* Build a tree for complex bracket. */
3367
br_token.type = COMPLEX_BRACKET;
3368
br_token.opr.mbcset = mbcset;
3369
dfa->has_mb_node = 1;
3370
new_idx = re_dfa_add_node (dfa, br_token, 0);
3371
mbc_tree = create_tree (NULL, NULL, 0, new_idx);
3372
if (BE (new_idx == -1 || mbc_tree == NULL, 0))
3373
goto build_word_op_espace;
3374
/* Then join them by ALT node. */
3375
alt_token.type = OP_ALT;
3376
new_idx = re_dfa_add_node (dfa, alt_token, 0);
3377
tree = create_tree (tree, mbc_tree, 0, new_idx);
3378
if (BE (new_idx != -1 && mbc_tree != NULL, 1))
3383
free_charset (mbcset);
3386
#else /* not RE_ENABLE_I18N */
3388
#endif /* not RE_ENABLE_I18N */
3390
build_word_op_espace:
3392
#ifdef RE_ENABLE_I18N
3393
free_charset (mbcset);
3394
#endif /* RE_ENABLE_I18N */
3399
/* This is intended for the expressions like "a{1,3}".
3400
Fetch a number from `input', and return the number.
3401
Return -1, if the number field is empty like "{,1}".
3402
Return -2, If an error is occured. */
3405
fetch_number (input, token, syntax)
3408
reg_syntax_t syntax;
3414
*token = fetch_token (input, syntax);
3416
if (BE (token->type == END_OF_RE, 0))
3418
if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3420
num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3421
? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3422
num = (num > RE_DUP_MAX) ? -2 : num;
3427
#ifdef RE_ENABLE_I18N
3429
free_charset (re_charset_t *cset)
3431
re_free (cset->mbchars);
3433
re_free (cset->coll_syms);
3434
re_free (cset->equiv_classes);
3435
re_free (cset->range_starts);
3436
re_free (cset->range_ends);
3438
re_free (cset->char_classes);
3441
#endif /* RE_ENABLE_I18N */
3443
/* Functions for binary tree operation. */
3445
/* Create a node of tree.
3446
Note: This function automatically free left and right if malloc fails. */
3449
create_tree (left, right, type, index)
3452
re_token_type_t type;
3456
tree = re_malloc (bin_tree_t, 1);
3457
if (BE (tree == NULL, 0))
3459
free_bin_tree (left);
3460
free_bin_tree (right);
3463
tree->parent = NULL;
3465
tree->right = right;
3467
tree->node_idx = index;
3470
re_node_set_init_empty (&tree->eclosure);
3473
left->parent = tree;
3475
right->parent = tree;
3479
/* Free the sub tree pointed by TREE. */
3482
free_bin_tree (tree)
3487
/*re_node_set_free (&tree->eclosure);*/
3488
free_bin_tree (tree->left);
3489
free_bin_tree (tree->right);
3493
/* Duplicate the node SRC, and return new node. */
3496
duplicate_tree (src, dfa)
3497
const bin_tree_t *src;
3500
bin_tree_t *left = NULL, *right = NULL, *new_tree;
3502
/* Since node indies must be according to Post-order of the tree,
3503
we must duplicate the left at first. */
3504
if (src->left != NULL)
3506
left = duplicate_tree (src->left, dfa);
3511
/* Secondaly, duplicate the right. */
3512
if (src->right != NULL)
3514
right = duplicate_tree (src->right, dfa);
3517
free_bin_tree (left);
3522
/* At last, duplicate itself. */
3523
if (src->type == NON_TYPE)
3525
new_node_idx = re_dfa_add_node (dfa, dfa->nodes[src->node_idx], 0);
3526
dfa->nodes[new_node_idx].duplicated = 1;
3527
if (BE (new_node_idx == -1, 0))
3529
free_bin_tree (left);
3530
free_bin_tree (right);
3535
new_node_idx = src->type;
3537
new_tree = create_tree (left, right, src->type, new_node_idx);
3538
if (BE (new_tree == NULL, 0))
3540
free_bin_tree (left);
3541
free_bin_tree (right);