1
/* Extended regular expression matching and search library.
2
Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 Free
3
Software Foundation, Inc.
4
This file is part of the GNU C Library.
5
Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
7
This program is free software; you can redistribute it and/or modify
8
it under the terms of the GNU General Public License as published by
9
the Free Software Foundation; either version 3, or (at your option)
12
This program is distributed in the hope that it will be useful,
13
but WITHOUT ANY WARRANTY; without even the implied warranty of
14
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
GNU General Public License for more details.
17
You should have received a copy of the GNU General Public License along
18
with this program; if not, write to the Free Software Foundation,
19
Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
21
static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
22
Idx n) internal_function;
23
static void match_ctx_clean (re_match_context_t *mctx) internal_function;
24
static void match_ctx_free (re_match_context_t *cache) internal_function;
25
static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, Idx node,
26
Idx str_idx, Idx from, Idx to)
28
static Idx search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx)
30
static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, Idx node,
31
Idx str_idx) internal_function;
32
static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
33
Idx node, Idx str_idx)
35
static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
36
re_dfastate_t **limited_sts, Idx last_node,
39
static reg_errcode_t re_search_internal (const regex_t *preg,
40
const char *string, Idx length,
41
Idx start, Idx last_start, Idx stop,
42
size_t nmatch, regmatch_t pmatch[],
43
int eflags) internal_function;
44
static regoff_t re_search_2_stub (struct re_pattern_buffer *bufp,
45
const char *string1, Idx length1,
46
const char *string2, Idx length2,
47
Idx start, regoff_t range,
48
struct re_registers *regs,
49
Idx stop, bool ret_len) internal_function;
50
static regoff_t re_search_stub (struct re_pattern_buffer *bufp,
51
const char *string, Idx length, Idx start,
52
regoff_t range, Idx stop,
53
struct re_registers *regs,
54
bool ret_len) internal_function;
55
static unsigned int re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
56
Idx nregs, int regs_allocated)
58
static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
60
static Idx check_matching (re_match_context_t *mctx, bool fl_longest_match,
61
Idx *p_match_first) internal_function;
62
static Idx check_halt_state_context (const re_match_context_t *mctx,
63
const re_dfastate_t *state, Idx idx)
65
static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
66
regmatch_t *prev_idx_match, Idx cur_node,
67
Idx cur_idx, Idx nmatch) internal_function;
68
static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
69
Idx str_idx, Idx dest_node, Idx nregs,
71
re_node_set *eps_via_nodes)
73
static reg_errcode_t set_regs (const regex_t *preg,
74
const re_match_context_t *mctx,
75
size_t nmatch, regmatch_t *pmatch,
76
bool fl_backtrack) internal_function;
77
static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs)
81
static int sift_states_iter_mb (const re_match_context_t *mctx,
82
re_sift_context_t *sctx,
83
Idx node_idx, Idx str_idx, Idx max_str_idx)
85
#endif /* RE_ENABLE_I18N */
86
static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
87
re_sift_context_t *sctx)
89
static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
90
re_sift_context_t *sctx, Idx str_idx,
91
re_node_set *cur_dest)
93
static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
94
re_sift_context_t *sctx,
96
re_node_set *dest_nodes)
98
static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
99
re_node_set *dest_nodes,
100
const re_node_set *candidates)
102
static bool check_dst_limits (const re_match_context_t *mctx,
103
const re_node_set *limits,
104
Idx dst_node, Idx dst_idx, Idx src_node,
105
Idx src_idx) internal_function;
106
static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
107
int boundaries, Idx subexp_idx,
108
Idx from_node, Idx bkref_idx)
110
static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
111
Idx limit, Idx subexp_idx,
112
Idx node, Idx str_idx,
113
Idx bkref_idx) internal_function;
114
static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
115
re_node_set *dest_nodes,
116
const re_node_set *candidates,
118
struct re_backref_cache_entry *bkref_ents,
119
Idx str_idx) internal_function;
120
static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
121
re_sift_context_t *sctx,
122
Idx str_idx, const re_node_set *candidates)
124
static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
126
re_dfastate_t **src, Idx num)
128
static re_dfastate_t *find_recover_state (reg_errcode_t *err,
129
re_match_context_t *mctx) internal_function;
130
static re_dfastate_t *transit_state (reg_errcode_t *err,
131
re_match_context_t *mctx,
132
re_dfastate_t *state) internal_function;
133
static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
134
re_match_context_t *mctx,
135
re_dfastate_t *next_state)
137
static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
138
re_node_set *cur_nodes,
139
Idx str_idx) internal_function;
141
static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
142
re_match_context_t *mctx,
143
re_dfastate_t *pstate)
146
#ifdef RE_ENABLE_I18N
147
static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
148
re_dfastate_t *pstate)
150
#endif /* RE_ENABLE_I18N */
151
static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
152
const re_node_set *nodes)
154
static reg_errcode_t get_subexp (re_match_context_t *mctx,
155
Idx bkref_node, Idx bkref_str_idx)
157
static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
158
const re_sub_match_top_t *sub_top,
159
re_sub_match_last_t *sub_last,
160
Idx bkref_node, Idx bkref_str)
162
static Idx find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
163
Idx subexp_idx, int type) internal_function;
164
static reg_errcode_t check_arrival (re_match_context_t *mctx,
165
state_array_t *path, Idx top_node,
166
Idx top_str, Idx last_node, Idx last_str,
167
int type) internal_function;
168
static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
170
re_node_set *cur_nodes,
171
re_node_set *next_nodes)
173
static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
174
re_node_set *cur_nodes,
175
Idx ex_subexp, int type)
177
static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
178
re_node_set *dst_nodes,
179
Idx target, Idx ex_subexp,
180
int type) internal_function;
181
static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
182
re_node_set *cur_nodes, Idx cur_str,
183
Idx subexp_num, int type)
185
static bool build_trtable (const re_dfa_t *dfa,
186
re_dfastate_t *state) internal_function;
187
#ifdef RE_ENABLE_I18N
188
static int check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx,
189
const re_string_t *input, Idx idx)
192
static unsigned int find_collation_sequence_value (const unsigned char *mbs,
196
#endif /* RE_ENABLE_I18N */
197
static Idx group_nodes_into_DFAstates (const re_dfa_t *dfa,
198
const re_dfastate_t *state,
199
re_node_set *states_node,
200
bitset_t *states_ch) internal_function;
201
static bool check_node_accept (const re_match_context_t *mctx,
202
const re_token_t *node, Idx idx)
204
static reg_errcode_t extend_buffers (re_match_context_t *mctx)
207
/* Entry point for POSIX code. */
209
/* regexec searches for a given pattern, specified by PREG, in the
212
If NMATCH is zero or REG_NOSUB was set in the cflags argument to
213
`regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
214
least NMATCH elements, and we set them to the offsets of the
215
corresponding matched substrings.
217
EFLAGS specifies `execution flags' which affect matching: if
218
REG_NOTBOL is set, then ^ does not match at the beginning of the
219
string; if REG_NOTEOL is set, then $ does not match at the end.
221
We return 0 if we find a match and REG_NOMATCH if not. */
224
regexec (preg, string, nmatch, pmatch, eflags)
225
const regex_t *_Restrict_ preg;
226
const char *_Restrict_ string;
228
regmatch_t pmatch[_Restrict_arr_];
234
re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
237
if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
240
if (eflags & REG_STARTEND)
242
start = pmatch[0].rm_so;
243
length = pmatch[0].rm_eo;
248
length = strlen (string);
251
__libc_lock_lock (dfa->lock);
253
err = re_search_internal (preg, string, length, start, length,
254
length, 0, NULL, eflags);
256
err = re_search_internal (preg, string, length, start, length,
257
length, nmatch, pmatch, eflags);
258
__libc_lock_unlock (dfa->lock);
259
return err != REG_NOERROR;
263
# include <shlib-compat.h>
264
versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
266
# if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
267
__typeof__ (__regexec) __compat_regexec;
270
attribute_compat_text_section
271
__compat_regexec (const regex_t *_Restrict_ preg,
272
const char *_Restrict_ string, size_t nmatch,
273
regmatch_t pmatch[], int eflags)
275
return regexec (preg, string, nmatch, pmatch,
276
eflags & (REG_NOTBOL | REG_NOTEOL));
278
compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
282
/* Entry points for GNU code. */
284
/* re_match, re_search, re_match_2, re_search_2
286
The former two functions operate on STRING with length LENGTH,
287
while the later two operate on concatenation of STRING1 and STRING2
288
with lengths LENGTH1 and LENGTH2, respectively.
290
re_match() matches the compiled pattern in BUFP against the string,
291
starting at index START.
293
re_search() first tries matching at index START, then it tries to match
294
starting from index START + 1, and so on. The last start position tried
295
is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
298
The parameter STOP of re_{match,search}_2 specifies that no match exceeding
299
the first STOP characters of the concatenation of the strings should be
302
If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
303
and all groups is stored in REGS. (For the "_2" variants, the offsets are
304
computed relative to the concatenation, not relative to the individual
307
On success, re_match* functions return the length of the match, re_search*
308
return the position of the start of the match. Return value -1 means no
309
match was found and -2 indicates an internal error. */
312
re_match (bufp, string, length, start, regs)
313
struct re_pattern_buffer *bufp;
316
struct re_registers *regs;
318
return re_search_stub (bufp, string, length, start, 0, length, regs, true);
321
weak_alias (__re_match, re_match)
325
re_search (bufp, string, length, start, range, regs)
326
struct re_pattern_buffer *bufp;
330
struct re_registers *regs;
332
return re_search_stub (bufp, string, length, start, range, length, regs,
336
weak_alias (__re_search, re_search)
340
re_match_2 (bufp, string1, length1, string2, length2, start, regs, stop)
341
struct re_pattern_buffer *bufp;
342
const char *string1, *string2;
343
Idx length1, length2, start, stop;
344
struct re_registers *regs;
346
return re_search_2_stub (bufp, string1, length1, string2, length2,
347
start, 0, regs, stop, true);
350
weak_alias (__re_match_2, re_match_2)
354
re_search_2 (bufp, string1, length1, string2, length2, start, range, regs, stop)
355
struct re_pattern_buffer *bufp;
356
const char *string1, *string2;
357
Idx length1, length2, start, stop;
359
struct re_registers *regs;
361
return re_search_2_stub (bufp, string1, length1, string2, length2,
362
start, range, regs, stop, false);
365
weak_alias (__re_search_2, re_search_2)
370
re_search_2_stub (struct re_pattern_buffer *bufp,
371
const char *string1, Idx length1,
372
const char *string2, Idx length2,
373
Idx start, regoff_t range, struct re_registers *regs,
374
Idx stop, bool ret_len)
378
Idx len = length1 + length2;
381
if (BE (length1 < 0 || length2 < 0 || stop < 0 || len < length1, 0))
384
/* Concatenate the strings. */
388
s = re_malloc (char, len);
390
if (BE (s == NULL, 0))
393
memcpy (__mempcpy (s, string1, length1), string2, length2);
395
memcpy (s, string1, length1);
396
memcpy (s + length1, string2, length2);
405
rval = re_search_stub (bufp, str, len, start, range, stop, regs,
411
/* The parameters have the same meaning as those of re_search.
412
Additional parameters:
413
If RET_LEN is true the length of the match is returned (re_match style);
414
otherwise the position of the match is returned. */
418
re_search_stub (struct re_pattern_buffer *bufp,
419
const char *string, Idx length,
420
Idx start, regoff_t range, Idx stop, struct re_registers *regs,
423
reg_errcode_t result;
429
re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
431
Idx last_start = start + range;
433
/* Check for out-of-range. */
434
if (BE (start < 0 || start > length, 0))
436
if (BE (length < last_start || (0 <= range && last_start < start), 0))
438
else if (BE (last_start < 0 || (range < 0 && start <= last_start), 0))
441
__libc_lock_lock (dfa->lock);
443
eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
444
eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
446
/* Compile fastmap if we haven't yet. */
447
if (start < last_start && bufp->fastmap != NULL && !bufp->fastmap_accurate)
448
re_compile_fastmap (bufp);
450
if (BE (bufp->no_sub, 0))
453
/* We need at least 1 register. */
456
else if (BE (bufp->regs_allocated == REGS_FIXED
457
&& regs->num_regs <= bufp->re_nsub, 0))
459
nregs = regs->num_regs;
460
if (BE (nregs < 1, 0))
462
/* Nothing can be copied to regs. */
468
nregs = bufp->re_nsub + 1;
469
pmatch = re_malloc (regmatch_t, nregs);
470
if (BE (pmatch == NULL, 0))
476
result = re_search_internal (bufp, string, length, start, last_start, stop,
477
nregs, pmatch, eflags);
481
/* I hope we needn't fill ther regs with -1's when no match was found. */
482
if (result != REG_NOERROR)
484
else if (regs != NULL)
486
/* If caller wants register contents data back, copy them. */
487
bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
488
bufp->regs_allocated);
489
if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
493
if (BE (rval == 0, 1))
497
assert (pmatch[0].rm_so == start);
498
rval = pmatch[0].rm_eo - start;
501
rval = pmatch[0].rm_so;
505
__libc_lock_unlock (dfa->lock);
511
re_copy_regs (struct re_registers *regs, regmatch_t *pmatch, Idx nregs,
514
int rval = REGS_REALLOCATE;
516
Idx need_regs = nregs + 1;
517
/* We need one extra element beyond `num_regs' for the `-1' marker GNU code
520
/* Have the register data arrays been allocated? */
521
if (regs_allocated == REGS_UNALLOCATED)
522
{ /* No. So allocate them with malloc. */
523
regs->start = re_malloc (regoff_t, need_regs);
524
if (BE (regs->start == NULL, 0))
525
return REGS_UNALLOCATED;
526
regs->end = re_malloc (regoff_t, need_regs);
527
if (BE (regs->end == NULL, 0))
529
re_free (regs->start);
530
return REGS_UNALLOCATED;
532
regs->num_regs = need_regs;
534
else if (regs_allocated == REGS_REALLOCATE)
535
{ /* Yes. If we need more elements than were already
536
allocated, reallocate them. If we need fewer, just
538
if (BE (need_regs > regs->num_regs, 0))
540
regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
542
if (BE (new_start == NULL, 0))
543
return REGS_UNALLOCATED;
544
new_end = re_realloc (regs->end, regoff_t, need_regs);
545
if (BE (new_end == NULL, 0))
548
return REGS_UNALLOCATED;
550
regs->start = new_start;
552
regs->num_regs = need_regs;
557
assert (regs_allocated == REGS_FIXED);
558
/* This function may not be called with REGS_FIXED and nregs too big. */
559
assert (regs->num_regs >= nregs);
564
for (i = 0; i < nregs; ++i)
566
regs->start[i] = pmatch[i].rm_so;
567
regs->end[i] = pmatch[i].rm_eo;
569
for ( ; i < regs->num_regs; ++i)
570
regs->start[i] = regs->end[i] = -1;
575
/* Set REGS to hold NUM_REGS registers, storing them in STARTS and
576
ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
577
this memory for recording register information. STARTS and ENDS
578
must be allocated using the malloc library routine, and must each
579
be at least NUM_REGS * sizeof (regoff_t) bytes long.
581
If NUM_REGS == 0, then subsequent matches should allocate their own
584
Unless this function is called, the first search or match using
585
PATTERN_BUFFER will allocate its own register data, without
586
freeing the old data. */
589
re_set_registers (bufp, regs, num_regs, starts, ends)
590
struct re_pattern_buffer *bufp;
591
struct re_registers *regs;
592
__re_size_t num_regs;
593
regoff_t *starts, *ends;
597
bufp->regs_allocated = REGS_REALLOCATE;
598
regs->num_regs = num_regs;
599
regs->start = starts;
604
bufp->regs_allocated = REGS_UNALLOCATED;
606
regs->start = regs->end = NULL;
610
weak_alias (__re_set_registers, re_set_registers)
613
/* Entry points compatible with 4.2 BSD regex library. We don't define
614
them unless specifically requested. */
616
#if defined _REGEX_RE_COMP || defined _LIBC
624
return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
626
#endif /* _REGEX_RE_COMP */
628
/* Internal entry point. */
630
/* Searches for a compiled pattern PREG in the string STRING, whose
631
length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
632
meaning as with regexec. LAST_START is START + RANGE, where
633
START and RANGE have the same meaning as with re_search.
634
Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
635
otherwise return the error code.
636
Note: We assume front end functions already check ranges.
637
(0 <= LAST_START && LAST_START <= LENGTH) */
640
internal_function __attribute_warn_unused_result__
641
re_search_internal (const regex_t *preg,
642
const char *string, Idx length,
643
Idx start, Idx last_start, Idx stop,
644
size_t nmatch, regmatch_t pmatch[],
648
const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
649
Idx left_lim, right_lim;
651
bool fl_longest_match;
654
Idx match_last = REG_MISSING;
658
#if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
659
re_match_context_t mctx = { .dfa = dfa };
661
re_match_context_t mctx;
663
char *fastmap = ((preg->fastmap != NULL && preg->fastmap_accurate
664
&& start != last_start && !preg->can_be_null)
665
? preg->fastmap : NULL);
666
RE_TRANSLATE_TYPE t = preg->translate;
668
#if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
669
memset (&mctx, '\0', sizeof (re_match_context_t));
673
extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
674
nmatch -= extra_nmatch;
676
/* Check if the DFA haven't been compiled. */
677
if (BE (preg->used == 0 || dfa->init_state == NULL
678
|| dfa->init_state_word == NULL || dfa->init_state_nl == NULL
679
|| dfa->init_state_begbuf == NULL, 0))
683
/* We assume front-end functions already check them. */
684
assert (0 <= last_start && last_start <= length);
687
/* If initial states with non-begbuf contexts have no elements,
688
the regex must be anchored. If preg->newline_anchor is set,
689
we'll never use init_state_nl, so do not check it. */
690
if (dfa->init_state->nodes.nelem == 0
691
&& dfa->init_state_word->nodes.nelem == 0
692
&& (dfa->init_state_nl->nodes.nelem == 0
693
|| !preg->newline_anchor))
695
if (start != 0 && last_start != 0)
697
start = last_start = 0;
700
/* We must check the longest matching, if nmatch > 0. */
701
fl_longest_match = (nmatch != 0 || dfa->nbackref);
703
err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
704
preg->translate, (preg->syntax & RE_ICASE) != 0,
706
if (BE (err != REG_NOERROR, 0))
708
mctx.input.stop = stop;
709
mctx.input.raw_stop = stop;
710
mctx.input.newline_anchor = preg->newline_anchor;
712
err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
713
if (BE (err != REG_NOERROR, 0))
716
/* We will log all the DFA states through which the dfa pass,
717
if nmatch > 1, or this dfa has "multibyte node", which is a
718
back-reference or a node which can accept multibyte character or
719
multi character collating element. */
720
if (nmatch > 1 || dfa->has_mb_node)
722
/* Avoid overflow. */
723
if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= mctx.input.bufs_len, 0))
729
mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
730
if (BE (mctx.state_log == NULL, 0))
737
mctx.state_log = NULL;
740
mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
741
: CONTEXT_NEWLINE | CONTEXT_BEGBUF;
743
/* Check incrementally whether of not the input string match. */
744
incr = (last_start < start) ? -1 : 1;
745
left_lim = (last_start < start) ? last_start : start;
746
right_lim = (last_start < start) ? start : last_start;
747
sb = dfa->mb_cur_max == 1;
750
? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
751
| (start <= last_start ? 2 : 0)
752
| (t != NULL ? 1 : 0))
755
for (;; match_first += incr)
758
if (match_first < left_lim || right_lim < match_first)
761
/* Advance as rapidly as possible through the string, until we
762
find a plausible place to start matching. This may be done
763
with varying efficiency, so there are various possibilities:
764
only the most common of them are specialized, in order to
765
save on code size. We use a switch statement for speed. */
773
/* Fastmap with single-byte translation, match forward. */
774
while (BE (match_first < right_lim, 1)
775
&& !fastmap[t[(unsigned char) string[match_first]]])
777
goto forward_match_found_start_or_reached_end;
780
/* Fastmap without translation, match forward. */
781
while (BE (match_first < right_lim, 1)
782
&& !fastmap[(unsigned char) string[match_first]])
785
forward_match_found_start_or_reached_end:
786
if (BE (match_first == right_lim, 0))
788
ch = match_first >= length
789
? 0 : (unsigned char) string[match_first];
790
if (!fastmap[t ? t[ch] : ch])
797
/* Fastmap without multi-byte translation, match backwards. */
798
while (match_first >= left_lim)
800
ch = match_first >= length
801
? 0 : (unsigned char) string[match_first];
802
if (fastmap[t ? t[ch] : ch])
806
if (match_first < left_lim)
811
/* In this case, we can't determine easily the current byte,
812
since it might be a component byte of a multibyte
813
character. Then we use the constructed buffer instead. */
816
/* If MATCH_FIRST is out of the valid range, reconstruct the
818
__re_size_t offset = match_first - mctx.input.raw_mbs_idx;
819
if (BE (offset >= (__re_size_t) mctx.input.valid_raw_len, 0))
821
err = re_string_reconstruct (&mctx.input, match_first,
823
if (BE (err != REG_NOERROR, 0))
826
offset = match_first - mctx.input.raw_mbs_idx;
828
/* If MATCH_FIRST is out of the buffer, leave it as '\0'.
829
Note that MATCH_FIRST must not be smaller than 0. */
830
ch = (match_first >= length
831
? 0 : re_string_byte_at (&mctx.input, offset));
835
if (match_first < left_lim || match_first > right_lim)
844
/* Reconstruct the buffers so that the matcher can assume that
845
the matching starts from the beginning of the buffer. */
846
err = re_string_reconstruct (&mctx.input, match_first, eflags);
847
if (BE (err != REG_NOERROR, 0))
850
#ifdef RE_ENABLE_I18N
851
/* Don't consider this char as a possible match start if it part,
852
yet isn't the head, of a multibyte character. */
853
if (!sb && !re_string_first_byte (&mctx.input, 0))
857
/* It seems to be appropriate one, then use the matcher. */
858
/* We assume that the matching starts from 0. */
859
mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
860
match_last = check_matching (&mctx, fl_longest_match,
861
start <= last_start ? &match_first : NULL);
862
if (match_last != REG_MISSING)
864
if (BE (match_last == REG_ERROR, 0))
871
mctx.match_last = match_last;
872
if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
874
re_dfastate_t *pstate = mctx.state_log[match_last];
875
mctx.last_node = check_halt_state_context (&mctx, pstate,
878
if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
881
err = prune_impossible_nodes (&mctx);
882
if (err == REG_NOERROR)
884
if (BE (err != REG_NOMATCH, 0))
886
match_last = REG_MISSING;
889
break; /* We found a match. */
893
match_ctx_clean (&mctx);
897
assert (match_last != REG_MISSING);
898
assert (err == REG_NOERROR);
901
/* Set pmatch[] if we need. */
906
/* Initialize registers. */
907
for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
908
pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
910
/* Set the points where matching start/end. */
912
pmatch[0].rm_eo = mctx.match_last;
913
/* FIXME: This function should fail if mctx.match_last exceeds
914
the maximum possible regoff_t value. We need a new error
915
code REG_OVERFLOW. */
917
if (!preg->no_sub && nmatch > 1)
919
err = set_regs (preg, &mctx, nmatch, pmatch,
920
dfa->has_plural_match && dfa->nbackref > 0);
921
if (BE (err != REG_NOERROR, 0))
925
/* At last, add the offset to the each registers, since we slided
926
the buffers so that we could assume that the matching starts
928
for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
929
if (pmatch[reg_idx].rm_so != -1)
931
#ifdef RE_ENABLE_I18N
932
if (BE (mctx.input.offsets_needed != 0, 0))
934
pmatch[reg_idx].rm_so =
935
(pmatch[reg_idx].rm_so == mctx.input.valid_len
936
? mctx.input.valid_raw_len
937
: mctx.input.offsets[pmatch[reg_idx].rm_so]);
938
pmatch[reg_idx].rm_eo =
939
(pmatch[reg_idx].rm_eo == mctx.input.valid_len
940
? mctx.input.valid_raw_len
941
: mctx.input.offsets[pmatch[reg_idx].rm_eo]);
944
assert (mctx.input.offsets_needed == 0);
946
pmatch[reg_idx].rm_so += match_first;
947
pmatch[reg_idx].rm_eo += match_first;
949
for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
951
pmatch[nmatch + reg_idx].rm_so = -1;
952
pmatch[nmatch + reg_idx].rm_eo = -1;
956
for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
957
if (dfa->subexp_map[reg_idx] != reg_idx)
959
pmatch[reg_idx + 1].rm_so
960
= pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
961
pmatch[reg_idx + 1].rm_eo
962
= pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
967
re_free (mctx.state_log);
969
match_ctx_free (&mctx);
970
re_string_destruct (&mctx.input);
975
internal_function __attribute_warn_unused_result__
976
prune_impossible_nodes (re_match_context_t *mctx)
978
const re_dfa_t *const dfa = mctx->dfa;
979
Idx halt_node, match_last;
981
re_dfastate_t **sifted_states;
982
re_dfastate_t **lim_states = NULL;
983
re_sift_context_t sctx;
985
assert (mctx->state_log != NULL);
987
match_last = mctx->match_last;
988
halt_node = mctx->last_node;
990
/* Avoid overflow. */
991
if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= match_last, 0))
994
sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
995
if (BE (sifted_states == NULL, 0))
1002
lim_states = re_malloc (re_dfastate_t *, match_last + 1);
1003
if (BE (lim_states == NULL, 0))
1010
memset (lim_states, '\0',
1011
sizeof (re_dfastate_t *) * (match_last + 1));
1012
sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
1014
ret = sift_states_backward (mctx, &sctx);
1015
re_node_set_free (&sctx.limits);
1016
if (BE (ret != REG_NOERROR, 0))
1018
if (sifted_states[0] != NULL || lim_states[0] != NULL)
1023
if (! REG_VALID_INDEX (match_last))
1028
} while (mctx->state_log[match_last] == NULL
1029
|| !mctx->state_log[match_last]->halt);
1030
halt_node = check_halt_state_context (mctx,
1031
mctx->state_log[match_last],
1034
ret = merge_state_array (dfa, sifted_states, lim_states,
1036
re_free (lim_states);
1038
if (BE (ret != REG_NOERROR, 0))
1043
sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
1044
ret = sift_states_backward (mctx, &sctx);
1045
re_node_set_free (&sctx.limits);
1046
if (BE (ret != REG_NOERROR, 0))
1048
if (sifted_states[0] == NULL)
1054
re_free (mctx->state_log);
1055
mctx->state_log = sifted_states;
1056
sifted_states = NULL;
1057
mctx->last_node = halt_node;
1058
mctx->match_last = match_last;
1061
re_free (sifted_states);
1062
re_free (lim_states);
1066
/* Acquire an initial state and return it.
1067
We must select appropriate initial state depending on the context,
1068
since initial states may have constraints like "\<", "^", etc.. */
1070
static inline re_dfastate_t *
1071
__attribute ((always_inline)) internal_function
1072
acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
1075
const re_dfa_t *const dfa = mctx->dfa;
1076
if (dfa->init_state->has_constraint)
1078
unsigned int context;
1079
context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1080
if (IS_WORD_CONTEXT (context))
1081
return dfa->init_state_word;
1082
else if (IS_ORDINARY_CONTEXT (context))
1083
return dfa->init_state;
1084
else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1085
return dfa->init_state_begbuf;
1086
else if (IS_NEWLINE_CONTEXT (context))
1087
return dfa->init_state_nl;
1088
else if (IS_BEGBUF_CONTEXT (context))
1090
/* It is relatively rare case, then calculate on demand. */
1091
return re_acquire_state_context (err, dfa,
1092
dfa->init_state->entrance_nodes,
1096
/* Must not happen? */
1097
return dfa->init_state;
1100
return dfa->init_state;
1103
/* Check whether the regular expression match input string INPUT or not,
1104
and return the index where the matching end. Return REG_MISSING if
1105
there is no match, and return REG_ERROR in case of an error.
1106
FL_LONGEST_MATCH means we want the POSIX longest matching.
1107
If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1108
next place where we may want to try matching.
1109
Note that the matcher assume that the maching starts from the current
1110
index of the buffer. */
1113
internal_function __attribute_warn_unused_result__
1114
check_matching (re_match_context_t *mctx, bool fl_longest_match,
1117
const re_dfa_t *const dfa = mctx->dfa;
1120
Idx match_last = REG_MISSING;
1121
Idx cur_str_idx = re_string_cur_idx (&mctx->input);
1122
re_dfastate_t *cur_state;
1123
bool at_init_state = p_match_first != NULL;
1124
Idx next_start_idx = cur_str_idx;
1127
cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1128
/* An initial state must not be NULL (invalid). */
1129
if (BE (cur_state == NULL, 0))
1131
assert (err == REG_ESPACE);
1135
if (mctx->state_log != NULL)
1137
mctx->state_log[cur_str_idx] = cur_state;
1139
/* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1140
later. E.g. Processing back references. */
1141
if (BE (dfa->nbackref, 0))
1143
at_init_state = false;
1144
err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1145
if (BE (err != REG_NOERROR, 0))
1148
if (cur_state->has_backref)
1150
err = transit_state_bkref (mctx, &cur_state->nodes);
1151
if (BE (err != REG_NOERROR, 0))
1157
/* If the RE accepts NULL string. */
1158
if (BE (cur_state->halt, 0))
1160
if (!cur_state->has_constraint
1161
|| check_halt_state_context (mctx, cur_state, cur_str_idx))
1163
if (!fl_longest_match)
1167
match_last = cur_str_idx;
1173
while (!re_string_eoi (&mctx->input))
1175
re_dfastate_t *old_state = cur_state;
1176
Idx next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1178
if (BE (next_char_idx >= mctx->input.bufs_len, 0)
1179
|| (BE (next_char_idx >= mctx->input.valid_len, 0)
1180
&& mctx->input.valid_len < mctx->input.len))
1182
err = extend_buffers (mctx);
1183
if (BE (err != REG_NOERROR, 0))
1185
assert (err == REG_ESPACE);
1190
cur_state = transit_state (&err, mctx, cur_state);
1191
if (mctx->state_log != NULL)
1192
cur_state = merge_state_with_log (&err, mctx, cur_state);
1194
if (cur_state == NULL)
1196
/* Reached the invalid state or an error. Try to recover a valid
1197
state using the state log, if available and if we have not
1198
already found a valid (even if not the longest) match. */
1199
if (BE (err != REG_NOERROR, 0))
1202
if (mctx->state_log == NULL
1203
|| (match && !fl_longest_match)
1204
|| (cur_state = find_recover_state (&err, mctx)) == NULL)
1208
if (BE (at_init_state, 0))
1210
if (old_state == cur_state)
1211
next_start_idx = next_char_idx;
1213
at_init_state = false;
1216
if (cur_state->halt)
1218
/* Reached a halt state.
1219
Check the halt state can satisfy the current context. */
1220
if (!cur_state->has_constraint
1221
|| check_halt_state_context (mctx, cur_state,
1222
re_string_cur_idx (&mctx->input)))
1224
/* We found an appropriate halt state. */
1225
match_last = re_string_cur_idx (&mctx->input);
1228
/* We found a match, do not modify match_first below. */
1229
p_match_first = NULL;
1230
if (!fl_longest_match)
1237
*p_match_first += next_start_idx;
1242
/* Check NODE match the current context. */
1246
check_halt_node_context (const re_dfa_t *dfa, Idx node, unsigned int context)
1248
re_token_type_t type = dfa->nodes[node].type;
1249
unsigned int constraint = dfa->nodes[node].constraint;
1250
if (type != END_OF_RE)
1254
if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1259
/* Check the halt state STATE match the current context.
1260
Return 0 if not match, if the node, STATE has, is a halt node and
1261
match the context, return the node. */
1265
check_halt_state_context (const re_match_context_t *mctx,
1266
const re_dfastate_t *state, Idx idx)
1269
unsigned int context;
1271
assert (state->halt);
1273
context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1274
for (i = 0; i < state->nodes.nelem; ++i)
1275
if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1276
return state->nodes.elems[i];
1280
/* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1281
corresponding to the DFA).
1282
Return the destination node, and update EPS_VIA_NODES;
1283
return REG_MISSING in case of errors. */
1287
proceed_next_node (const re_match_context_t *mctx, Idx nregs, regmatch_t *regs,
1288
Idx *pidx, Idx node, re_node_set *eps_via_nodes,
1289
struct re_fail_stack_t *fs)
1291
const re_dfa_t *const dfa = mctx->dfa;
1294
if (IS_EPSILON_NODE (dfa->nodes[node].type))
1296
re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1297
re_node_set *edests = &dfa->edests[node];
1299
ok = re_node_set_insert (eps_via_nodes, node);
1302
/* Pick up a valid destination, or return REG_MISSING if none
1304
for (dest_node = REG_MISSING, i = 0; i < edests->nelem; ++i)
1306
Idx candidate = edests->elems[i];
1307
if (!re_node_set_contains (cur_nodes, candidate))
1309
if (dest_node == REG_MISSING)
1310
dest_node = candidate;
1314
/* In order to avoid infinite loop like "(a*)*", return the second
1315
epsilon-transition if the first was already considered. */
1316
if (re_node_set_contains (eps_via_nodes, dest_node))
1319
/* Otherwise, push the second epsilon-transition on the fail stack. */
1321
&& push_fail_stack (fs, *pidx, candidate, nregs, regs,
1325
/* We know we are going to exit. */
1334
re_token_type_t type = dfa->nodes[node].type;
1336
#ifdef RE_ENABLE_I18N
1337
if (dfa->nodes[node].accept_mb)
1338
naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1340
#endif /* RE_ENABLE_I18N */
1341
if (type == OP_BACK_REF)
1343
Idx subexp_idx = dfa->nodes[node].opr.idx + 1;
1344
naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1347
if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1351
char *buf = (char *) re_string_get_buffer (&mctx->input);
1352
if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1361
ok = re_node_set_insert (eps_via_nodes, node);
1364
dest_node = dfa->edests[node].elems[0];
1365
if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1372
|| check_node_accept (mctx, dfa->nodes + node, *pidx))
1374
Idx dest_node = dfa->nexts[node];
1375
*pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1376
if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1377
|| !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1380
re_node_set_empty (eps_via_nodes);
1387
static reg_errcode_t
1388
internal_function __attribute_warn_unused_result__
1389
push_fail_stack (struct re_fail_stack_t *fs, Idx str_idx, Idx dest_node,
1390
Idx nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1393
Idx num = fs->num++;
1394
if (fs->num == fs->alloc)
1396
struct re_fail_stack_ent_t *new_array;
1397
new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
1399
if (new_array == NULL)
1402
fs->stack = new_array;
1404
fs->stack[num].idx = str_idx;
1405
fs->stack[num].node = dest_node;
1406
fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1407
if (fs->stack[num].regs == NULL)
1409
memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1410
err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1416
pop_fail_stack (struct re_fail_stack_t *fs, Idx *pidx, Idx nregs,
1417
regmatch_t *regs, re_node_set *eps_via_nodes)
1419
Idx num = --fs->num;
1420
assert (REG_VALID_INDEX (num));
1421
*pidx = fs->stack[num].idx;
1422
memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1423
re_node_set_free (eps_via_nodes);
1424
re_free (fs->stack[num].regs);
1425
*eps_via_nodes = fs->stack[num].eps_via_nodes;
1426
return fs->stack[num].node;
1429
/* Set the positions where the subexpressions are starts/ends to registers
1431
Note: We assume that pmatch[0] is already set, and
1432
pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */
1434
static reg_errcode_t
1435
internal_function __attribute_warn_unused_result__
1436
set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
1437
regmatch_t *pmatch, bool fl_backtrack)
1439
const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
1441
re_node_set eps_via_nodes;
1442
struct re_fail_stack_t *fs;
1443
struct re_fail_stack_t fs_body = { 0, 2, NULL };
1444
regmatch_t *prev_idx_match;
1445
bool prev_idx_match_malloced = false;
1448
assert (nmatch > 1);
1449
assert (mctx->state_log != NULL);
1454
fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1455
if (fs->stack == NULL)
1461
cur_node = dfa->init_node;
1462
re_node_set_init_empty (&eps_via_nodes);
1464
if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
1465
prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
1468
prev_idx_match = re_malloc (regmatch_t, nmatch);
1469
if (prev_idx_match == NULL)
1471
free_fail_stack_return (fs);
1474
prev_idx_match_malloced = true;
1476
memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1478
for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1480
update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1482
if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1487
for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1488
if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1490
if (reg_idx == nmatch)
1492
re_node_set_free (&eps_via_nodes);
1493
if (prev_idx_match_malloced)
1494
re_free (prev_idx_match);
1495
return free_fail_stack_return (fs);
1497
cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1502
re_node_set_free (&eps_via_nodes);
1503
if (prev_idx_match_malloced)
1504
re_free (prev_idx_match);
1509
/* Proceed to next node. */
1510
cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1511
&eps_via_nodes, fs);
1513
if (BE (! REG_VALID_INDEX (cur_node), 0))
1515
if (BE (cur_node == REG_ERROR, 0))
1517
re_node_set_free (&eps_via_nodes);
1518
if (prev_idx_match_malloced)
1519
re_free (prev_idx_match);
1520
free_fail_stack_return (fs);
1524
cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1528
re_node_set_free (&eps_via_nodes);
1529
if (prev_idx_match_malloced)
1530
re_free (prev_idx_match);
1535
re_node_set_free (&eps_via_nodes);
1536
if (prev_idx_match_malloced)
1537
re_free (prev_idx_match);
1538
return free_fail_stack_return (fs);
1541
static reg_errcode_t
1543
free_fail_stack_return (struct re_fail_stack_t *fs)
1548
for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1550
re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1551
re_free (fs->stack[fs_idx].regs);
1553
re_free (fs->stack);
1560
update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
1561
regmatch_t *prev_idx_match, Idx cur_node, Idx cur_idx, Idx nmatch)
1563
int type = dfa->nodes[cur_node].type;
1564
if (type == OP_OPEN_SUBEXP)
1566
Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1568
/* We are at the first node of this sub expression. */
1569
if (reg_num < nmatch)
1571
pmatch[reg_num].rm_so = cur_idx;
1572
pmatch[reg_num].rm_eo = -1;
1575
else if (type == OP_CLOSE_SUBEXP)
1577
Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1578
if (reg_num < nmatch)
1580
/* We are at the last node of this sub expression. */
1581
if (pmatch[reg_num].rm_so < cur_idx)
1583
pmatch[reg_num].rm_eo = cur_idx;
1584
/* This is a non-empty match or we are not inside an optional
1585
subexpression. Accept this right away. */
1586
memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1590
if (dfa->nodes[cur_node].opt_subexp
1591
&& prev_idx_match[reg_num].rm_so != -1)
1592
/* We transited through an empty match for an optional
1593
subexpression, like (a?)*, and this is not the subexp's
1594
first match. Copy back the old content of the registers
1595
so that matches of an inner subexpression are undone as
1596
well, like in ((a?))*. */
1597
memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1599
/* We completed a subexpression, but it may be part of
1600
an optional one, so do not update PREV_IDX_MATCH. */
1601
pmatch[reg_num].rm_eo = cur_idx;
1607
/* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1608
and sift the nodes in each states according to the following rules.
1609
Updated state_log will be wrote to STATE_LOG.
1611
Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1612
1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1613
If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1614
the LAST_NODE, we throw away the node `a'.
1615
2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1616
string `s' and transit to `b':
1617
i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1619
ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1620
thrown away, we throw away the node `a'.
1621
3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1622
i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1624
ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1625
we throw away the node `a'. */
1627
#define STATE_NODE_CONTAINS(state,node) \
1628
((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1630
static reg_errcode_t
1632
sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
1636
Idx str_idx = sctx->last_str_idx;
1637
re_node_set cur_dest;
1640
assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1643
/* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1644
transit to the last_node and the last_node itself. */
1645
err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1646
if (BE (err != REG_NOERROR, 0))
1648
err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1649
if (BE (err != REG_NOERROR, 0))
1652
/* Then check each states in the state_log. */
1655
/* Update counters. */
1656
null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1657
if (null_cnt > mctx->max_mb_elem_len)
1659
memset (sctx->sifted_states, '\0',
1660
sizeof (re_dfastate_t *) * str_idx);
1661
re_node_set_free (&cur_dest);
1664
re_node_set_empty (&cur_dest);
1667
if (mctx->state_log[str_idx])
1669
err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1670
if (BE (err != REG_NOERROR, 0))
1674
/* Add all the nodes which satisfy the following conditions:
1675
- It can epsilon transit to a node in CUR_DEST.
1677
And update state_log. */
1678
err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1679
if (BE (err != REG_NOERROR, 0))
1684
re_node_set_free (&cur_dest);
1688
static reg_errcode_t
1689
internal_function __attribute_warn_unused_result__
1690
build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
1691
Idx str_idx, re_node_set *cur_dest)
1693
const re_dfa_t *const dfa = mctx->dfa;
1694
const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1697
/* Then build the next sifted state.
1698
We build the next sifted state on `cur_dest', and update
1699
`sifted_states[str_idx]' with `cur_dest'.
1701
`cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1702
`cur_src' points the node_set of the old `state_log[str_idx]'
1703
(with the epsilon nodes pre-filtered out). */
1704
for (i = 0; i < cur_src->nelem; i++)
1706
Idx prev_node = cur_src->elems[i];
1711
re_token_type_t type = dfa->nodes[prev_node].type;
1712
assert (!IS_EPSILON_NODE (type));
1714
#ifdef RE_ENABLE_I18N
1715
/* If the node may accept `multi byte'. */
1716
if (dfa->nodes[prev_node].accept_mb)
1717
naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1718
str_idx, sctx->last_str_idx);
1719
#endif /* RE_ENABLE_I18N */
1721
/* We don't check backreferences here.
1722
See update_cur_sifted_state(). */
1724
&& check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1725
&& STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1726
dfa->nexts[prev_node]))
1732
if (sctx->limits.nelem)
1734
Idx to_idx = str_idx + naccepted;
1735
if (check_dst_limits (mctx, &sctx->limits,
1736
dfa->nexts[prev_node], to_idx,
1737
prev_node, str_idx))
1740
ok = re_node_set_insert (cur_dest, prev_node);
1748
/* Helper functions. */
1750
static reg_errcode_t
1752
clean_state_log_if_needed (re_match_context_t *mctx, Idx next_state_log_idx)
1754
Idx top = mctx->state_log_top;
1756
if (next_state_log_idx >= mctx->input.bufs_len
1757
|| (next_state_log_idx >= mctx->input.valid_len
1758
&& mctx->input.valid_len < mctx->input.len))
1761
err = extend_buffers (mctx);
1762
if (BE (err != REG_NOERROR, 0))
1766
if (top < next_state_log_idx)
1768
memset (mctx->state_log + top + 1, '\0',
1769
sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1770
mctx->state_log_top = next_state_log_idx;
1775
static reg_errcode_t
1777
merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,
1778
re_dfastate_t **src, Idx num)
1782
for (st_idx = 0; st_idx < num; ++st_idx)
1784
if (dst[st_idx] == NULL)
1785
dst[st_idx] = src[st_idx];
1786
else if (src[st_idx] != NULL)
1788
re_node_set merged_set;
1789
err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1790
&src[st_idx]->nodes);
1791
if (BE (err != REG_NOERROR, 0))
1793
dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1794
re_node_set_free (&merged_set);
1795
if (BE (err != REG_NOERROR, 0))
1802
static reg_errcode_t
1804
update_cur_sifted_state (const re_match_context_t *mctx,
1805
re_sift_context_t *sctx, Idx str_idx,
1806
re_node_set *dest_nodes)
1808
const re_dfa_t *const dfa = mctx->dfa;
1809
reg_errcode_t err = REG_NOERROR;
1810
const re_node_set *candidates;
1811
candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1812
: &mctx->state_log[str_idx]->nodes);
1814
if (dest_nodes->nelem == 0)
1815
sctx->sifted_states[str_idx] = NULL;
1820
/* At first, add the nodes which can epsilon transit to a node in
1822
err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1823
if (BE (err != REG_NOERROR, 0))
1826
/* Then, check the limitations in the current sift_context. */
1827
if (sctx->limits.nelem)
1829
err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1830
mctx->bkref_ents, str_idx);
1831
if (BE (err != REG_NOERROR, 0))
1836
sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1837
if (BE (err != REG_NOERROR, 0))
1841
if (candidates && mctx->state_log[str_idx]->has_backref)
1843
err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1844
if (BE (err != REG_NOERROR, 0))
1850
static reg_errcode_t
1851
internal_function __attribute_warn_unused_result__
1852
add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,
1853
const re_node_set *candidates)
1855
reg_errcode_t err = REG_NOERROR;
1858
re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1859
if (BE (err != REG_NOERROR, 0))
1862
if (!state->inveclosure.alloc)
1864
err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1865
if (BE (err != REG_NOERROR, 0))
1867
for (i = 0; i < dest_nodes->nelem; i++)
1869
err = re_node_set_merge (&state->inveclosure,
1870
dfa->inveclosures + dest_nodes->elems[i]);
1871
if (BE (err != REG_NOERROR, 0))
1875
return re_node_set_add_intersect (dest_nodes, candidates,
1876
&state->inveclosure);
1879
static reg_errcode_t
1881
sub_epsilon_src_nodes (const re_dfa_t *dfa, Idx node, re_node_set *dest_nodes,
1882
const re_node_set *candidates)
1886
re_node_set *inv_eclosure = dfa->inveclosures + node;
1887
re_node_set except_nodes;
1888
re_node_set_init_empty (&except_nodes);
1889
for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1891
Idx cur_node = inv_eclosure->elems[ecl_idx];
1892
if (cur_node == node)
1894
if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1896
Idx edst1 = dfa->edests[cur_node].elems[0];
1897
Idx edst2 = ((dfa->edests[cur_node].nelem > 1)
1898
? dfa->edests[cur_node].elems[1] : REG_MISSING);
1899
if ((!re_node_set_contains (inv_eclosure, edst1)
1900
&& re_node_set_contains (dest_nodes, edst1))
1901
|| (REG_VALID_NONZERO_INDEX (edst2)
1902
&& !re_node_set_contains (inv_eclosure, edst2)
1903
&& re_node_set_contains (dest_nodes, edst2)))
1905
err = re_node_set_add_intersect (&except_nodes, candidates,
1906
dfa->inveclosures + cur_node);
1907
if (BE (err != REG_NOERROR, 0))
1909
re_node_set_free (&except_nodes);
1915
for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1917
Idx cur_node = inv_eclosure->elems[ecl_idx];
1918
if (!re_node_set_contains (&except_nodes, cur_node))
1920
Idx idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1921
re_node_set_remove_at (dest_nodes, idx);
1924
re_node_set_free (&except_nodes);
1930
check_dst_limits (const re_match_context_t *mctx, const re_node_set *limits,
1931
Idx dst_node, Idx dst_idx, Idx src_node, Idx src_idx)
1933
const re_dfa_t *const dfa = mctx->dfa;
1934
Idx lim_idx, src_pos, dst_pos;
1936
Idx dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1937
Idx src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1938
for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1941
struct re_backref_cache_entry *ent;
1942
ent = mctx->bkref_ents + limits->elems[lim_idx];
1943
subexp_idx = dfa->nodes[ent->node].opr.idx;
1945
dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1946
subexp_idx, dst_node, dst_idx,
1948
src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1949
subexp_idx, src_node, src_idx,
1953
<src> <dst> ( <subexp> )
1954
( <subexp> ) <src> <dst>
1955
( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
1956
if (src_pos == dst_pos)
1957
continue; /* This is unrelated limitation. */
1966
check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1967
Idx subexp_idx, Idx from_node, Idx bkref_idx)
1969
const re_dfa_t *const dfa = mctx->dfa;
1970
const re_node_set *eclosures = dfa->eclosures + from_node;
1973
/* Else, we are on the boundary: examine the nodes on the epsilon
1975
for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1977
Idx node = eclosures->elems[node_idx];
1978
switch (dfa->nodes[node].type)
1981
if (bkref_idx != REG_MISSING)
1983
struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1989
if (ent->node != node)
1992
if (subexp_idx < BITSET_WORD_BITS
1993
&& !(ent->eps_reachable_subexps_map
1994
& ((bitset_word_t) 1 << subexp_idx)))
1997
/* Recurse trying to reach the OP_OPEN_SUBEXP and
1998
OP_CLOSE_SUBEXP cases below. But, if the
1999
destination node is the same node as the source
2000
node, don't recurse because it would cause an
2001
infinite loop: a regex that exhibits this behavior
2003
dst = dfa->edests[node].elems[0];
2004
if (dst == from_node)
2008
else /* if (boundaries & 2) */
2013
check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2015
if (cpos == -1 /* && (boundaries & 1) */)
2017
if (cpos == 0 && (boundaries & 2))
2020
if (subexp_idx < BITSET_WORD_BITS)
2021
ent->eps_reachable_subexps_map
2022
&= ~((bitset_word_t) 1 << subexp_idx);
2024
while (ent++->more);
2028
case OP_OPEN_SUBEXP:
2029
if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
2033
case OP_CLOSE_SUBEXP:
2034
if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
2043
return (boundaries & 2) ? 1 : 0;
2048
check_dst_limits_calc_pos (const re_match_context_t *mctx, Idx limit,
2049
Idx subexp_idx, Idx from_node, Idx str_idx,
2052
struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
2055
/* If we are outside the range of the subexpression, return -1 or 1. */
2056
if (str_idx < lim->subexp_from)
2059
if (lim->subexp_to < str_idx)
2062
/* If we are within the subexpression, return 0. */
2063
boundaries = (str_idx == lim->subexp_from);
2064
boundaries |= (str_idx == lim->subexp_to) << 1;
2065
if (boundaries == 0)
2068
/* Else, examine epsilon closure. */
2069
return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2070
from_node, bkref_idx);
2073
/* Check the limitations of sub expressions LIMITS, and remove the nodes
2074
which are against limitations from DEST_NODES. */
2076
static reg_errcode_t
2078
check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,
2079
const re_node_set *candidates, re_node_set *limits,
2080
struct re_backref_cache_entry *bkref_ents, Idx str_idx)
2083
Idx node_idx, lim_idx;
2085
for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2088
struct re_backref_cache_entry *ent;
2089
ent = bkref_ents + limits->elems[lim_idx];
2091
if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2092
continue; /* This is unrelated limitation. */
2094
subexp_idx = dfa->nodes[ent->node].opr.idx;
2095
if (ent->subexp_to == str_idx)
2097
Idx ops_node = REG_MISSING;
2098
Idx cls_node = REG_MISSING;
2099
for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2101
Idx node = dest_nodes->elems[node_idx];
2102
re_token_type_t type = dfa->nodes[node].type;
2103
if (type == OP_OPEN_SUBEXP
2104
&& subexp_idx == dfa->nodes[node].opr.idx)
2106
else if (type == OP_CLOSE_SUBEXP
2107
&& subexp_idx == dfa->nodes[node].opr.idx)
2111
/* Check the limitation of the open subexpression. */
2112
/* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
2113
if (REG_VALID_INDEX (ops_node))
2115
err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2117
if (BE (err != REG_NOERROR, 0))
2121
/* Check the limitation of the close subexpression. */
2122
if (REG_VALID_INDEX (cls_node))
2123
for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2125
Idx node = dest_nodes->elems[node_idx];
2126
if (!re_node_set_contains (dfa->inveclosures + node,
2128
&& !re_node_set_contains (dfa->eclosures + node,
2131
/* It is against this limitation.
2132
Remove it form the current sifted state. */
2133
err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2135
if (BE (err != REG_NOERROR, 0))
2141
else /* (ent->subexp_to != str_idx) */
2143
for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2145
Idx node = dest_nodes->elems[node_idx];
2146
re_token_type_t type = dfa->nodes[node].type;
2147
if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2149
if (subexp_idx != dfa->nodes[node].opr.idx)
2151
/* It is against this limitation.
2152
Remove it form the current sifted state. */
2153
err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2155
if (BE (err != REG_NOERROR, 0))
2164
static reg_errcode_t
2165
internal_function __attribute_warn_unused_result__
2166
sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,
2167
Idx str_idx, const re_node_set *candidates)
2169
const re_dfa_t *const dfa = mctx->dfa;
2172
re_sift_context_t local_sctx;
2173
Idx first_idx = search_cur_bkref_entry (mctx, str_idx);
2175
if (first_idx == REG_MISSING)
2178
local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */
2180
for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2183
re_token_type_t type;
2184
struct re_backref_cache_entry *entry;
2185
node = candidates->elems[node_idx];
2186
type = dfa->nodes[node].type;
2187
/* Avoid infinite loop for the REs like "()\1+". */
2188
if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2190
if (type != OP_BACK_REF)
2193
entry = mctx->bkref_ents + first_idx;
2194
enabled_idx = first_idx;
2201
re_dfastate_t *cur_state;
2203
if (entry->node != node)
2205
subexp_len = entry->subexp_to - entry->subexp_from;
2206
to_idx = str_idx + subexp_len;
2207
dst_node = (subexp_len ? dfa->nexts[node]
2208
: dfa->edests[node].elems[0]);
2210
if (to_idx > sctx->last_str_idx
2211
|| sctx->sifted_states[to_idx] == NULL
2212
|| !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2213
|| check_dst_limits (mctx, &sctx->limits, node,
2214
str_idx, dst_node, to_idx))
2217
if (local_sctx.sifted_states == NULL)
2220
err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2221
if (BE (err != REG_NOERROR, 0))
2224
local_sctx.last_node = node;
2225
local_sctx.last_str_idx = str_idx;
2226
ok = re_node_set_insert (&local_sctx.limits, enabled_idx);
2232
cur_state = local_sctx.sifted_states[str_idx];
2233
err = sift_states_backward (mctx, &local_sctx);
2234
if (BE (err != REG_NOERROR, 0))
2236
if (sctx->limited_states != NULL)
2238
err = merge_state_array (dfa, sctx->limited_states,
2239
local_sctx.sifted_states,
2241
if (BE (err != REG_NOERROR, 0))
2244
local_sctx.sifted_states[str_idx] = cur_state;
2245
re_node_set_remove (&local_sctx.limits, enabled_idx);
2247
/* mctx->bkref_ents may have changed, reload the pointer. */
2248
entry = mctx->bkref_ents + enabled_idx;
2250
while (enabled_idx++, entry++->more);
2254
if (local_sctx.sifted_states != NULL)
2256
re_node_set_free (&local_sctx.limits);
2263
#ifdef RE_ENABLE_I18N
2266
sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2267
Idx node_idx, Idx str_idx, Idx max_str_idx)
2269
const re_dfa_t *const dfa = mctx->dfa;
2271
/* Check the node can accept `multi byte'. */
2272
naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2273
if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2274
!STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2275
dfa->nexts[node_idx]))
2276
/* The node can't accept the `multi byte', or the
2277
destination was already thrown away, then the node
2278
could't accept the current input `multi byte'. */
2280
/* Otherwise, it is sure that the node could accept
2281
`naccepted' bytes input. */
2284
#endif /* RE_ENABLE_I18N */
2287
/* Functions for state transition. */
2289
/* Return the next state to which the current state STATE will transit by
2290
accepting the current input byte, and update STATE_LOG if necessary.
2291
If STATE can accept a multibyte char/collating element/back reference
2292
update the destination of STATE_LOG. */
2294
static re_dfastate_t *
2295
internal_function __attribute_warn_unused_result__
2296
transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2297
re_dfastate_t *state)
2299
re_dfastate_t **trtable;
2302
#ifdef RE_ENABLE_I18N
2303
/* If the current state can accept multibyte. */
2304
if (BE (state->accept_mb, 0))
2306
*err = transit_state_mb (mctx, state);
2307
if (BE (*err != REG_NOERROR, 0))
2310
#endif /* RE_ENABLE_I18N */
2312
/* Then decide the next state with the single byte. */
2315
/* don't use transition table */
2316
return transit_state_sb (err, mctx, state);
2319
/* Use transition table */
2320
ch = re_string_fetch_byte (&mctx->input);
2323
trtable = state->trtable;
2324
if (BE (trtable != NULL, 1))
2327
trtable = state->word_trtable;
2328
if (BE (trtable != NULL, 1))
2330
unsigned int context;
2332
= re_string_context_at (&mctx->input,
2333
re_string_cur_idx (&mctx->input) - 1,
2335
if (IS_WORD_CONTEXT (context))
2336
return trtable[ch + SBC_MAX];
2341
if (!build_trtable (mctx->dfa, state))
2347
/* Retry, we now have a transition table. */
2351
/* Update the state_log if we need */
2352
static re_dfastate_t *
2354
merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2355
re_dfastate_t *next_state)
2357
const re_dfa_t *const dfa = mctx->dfa;
2358
Idx cur_idx = re_string_cur_idx (&mctx->input);
2360
if (cur_idx > mctx->state_log_top)
2362
mctx->state_log[cur_idx] = next_state;
2363
mctx->state_log_top = cur_idx;
2365
else if (mctx->state_log[cur_idx] == 0)
2367
mctx->state_log[cur_idx] = next_state;
2371
re_dfastate_t *pstate;
2372
unsigned int context;
2373
re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2374
/* If (state_log[cur_idx] != 0), it implies that cur_idx is
2375
the destination of a multibyte char/collating element/
2376
back reference. Then the next state is the union set of
2377
these destinations and the results of the transition table. */
2378
pstate = mctx->state_log[cur_idx];
2379
log_nodes = pstate->entrance_nodes;
2380
if (next_state != NULL)
2382
table_nodes = next_state->entrance_nodes;
2383
*err = re_node_set_init_union (&next_nodes, table_nodes,
2385
if (BE (*err != REG_NOERROR, 0))
2389
next_nodes = *log_nodes;
2390
/* Note: We already add the nodes of the initial state,
2391
then we don't need to add them here. */
2393
context = re_string_context_at (&mctx->input,
2394
re_string_cur_idx (&mctx->input) - 1,
2396
next_state = mctx->state_log[cur_idx]
2397
= re_acquire_state_context (err, dfa, &next_nodes, context);
2398
/* We don't need to check errors here, since the return value of
2399
this function is next_state and ERR is already set. */
2401
if (table_nodes != NULL)
2402
re_node_set_free (&next_nodes);
2405
if (BE (dfa->nbackref, 0) && next_state != NULL)
2407
/* Check OP_OPEN_SUBEXP in the current state in case that we use them
2408
later. We must check them here, since the back references in the
2409
next state might use them. */
2410
*err = check_subexp_matching_top (mctx, &next_state->nodes,
2412
if (BE (*err != REG_NOERROR, 0))
2415
/* If the next state has back references. */
2416
if (next_state->has_backref)
2418
*err = transit_state_bkref (mctx, &next_state->nodes);
2419
if (BE (*err != REG_NOERROR, 0))
2421
next_state = mctx->state_log[cur_idx];
2428
/* Skip bytes in the input that correspond to part of a
2429
multi-byte match, then look in the log for a state
2430
from which to restart matching. */
2431
static re_dfastate_t *
2433
find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2435
re_dfastate_t *cur_state;
2438
Idx max = mctx->state_log_top;
2439
Idx cur_str_idx = re_string_cur_idx (&mctx->input);
2443
if (++cur_str_idx > max)
2445
re_string_skip_bytes (&mctx->input, 1);
2447
while (mctx->state_log[cur_str_idx] == NULL);
2449
cur_state = merge_state_with_log (err, mctx, NULL);
2451
while (*err == REG_NOERROR && cur_state == NULL);
2455
/* Helper functions for transit_state. */
2457
/* From the node set CUR_NODES, pick up the nodes whose types are
2458
OP_OPEN_SUBEXP and which have corresponding back references in the regular
2459
expression. And register them to use them later for evaluating the
2460
correspoding back references. */
2462
static reg_errcode_t
2464
check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2467
const re_dfa_t *const dfa = mctx->dfa;
2471
/* TODO: This isn't efficient.
2472
Because there might be more than one nodes whose types are
2473
OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2476
for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2478
Idx node = cur_nodes->elems[node_idx];
2479
if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2480
&& dfa->nodes[node].opr.idx < BITSET_WORD_BITS
2481
&& (dfa->used_bkref_map
2482
& ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
2484
err = match_ctx_add_subtop (mctx, node, str_idx);
2485
if (BE (err != REG_NOERROR, 0))
2493
/* Return the next state to which the current state STATE will transit by
2494
accepting the current input byte. */
2496
static re_dfastate_t *
2497
transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
2498
re_dfastate_t *state)
2500
const re_dfa_t *const dfa = mctx->dfa;
2501
re_node_set next_nodes;
2502
re_dfastate_t *next_state;
2503
Idx node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2504
unsigned int context;
2506
*err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2507
if (BE (*err != REG_NOERROR, 0))
2509
for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2511
Idx cur_node = state->nodes.elems[node_cnt];
2512
if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2514
*err = re_node_set_merge (&next_nodes,
2515
dfa->eclosures + dfa->nexts[cur_node]);
2516
if (BE (*err != REG_NOERROR, 0))
2518
re_node_set_free (&next_nodes);
2523
context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2524
next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2525
/* We don't need to check errors here, since the return value of
2526
this function is next_state and ERR is already set. */
2528
re_node_set_free (&next_nodes);
2529
re_string_skip_bytes (&mctx->input, 1);
2534
#ifdef RE_ENABLE_I18N
2535
static reg_errcode_t
2537
transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2539
const re_dfa_t *const dfa = mctx->dfa;
2543
for (i = 0; i < pstate->nodes.nelem; ++i)
2545
re_node_set dest_nodes, *new_nodes;
2546
Idx cur_node_idx = pstate->nodes.elems[i];
2549
unsigned int context;
2550
re_dfastate_t *dest_state;
2552
if (!dfa->nodes[cur_node_idx].accept_mb)
2555
if (dfa->nodes[cur_node_idx].constraint)
2557
context = re_string_context_at (&mctx->input,
2558
re_string_cur_idx (&mctx->input),
2560
if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2565
/* How many bytes the node can accept? */
2566
naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2567
re_string_cur_idx (&mctx->input));
2571
/* The node can accepts `naccepted' bytes. */
2572
dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2573
mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2574
: mctx->max_mb_elem_len);
2575
err = clean_state_log_if_needed (mctx, dest_idx);
2576
if (BE (err != REG_NOERROR, 0))
2579
assert (dfa->nexts[cur_node_idx] != REG_MISSING);
2581
new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2583
dest_state = mctx->state_log[dest_idx];
2584
if (dest_state == NULL)
2585
dest_nodes = *new_nodes;
2588
err = re_node_set_init_union (&dest_nodes,
2589
dest_state->entrance_nodes, new_nodes);
2590
if (BE (err != REG_NOERROR, 0))
2593
context = re_string_context_at (&mctx->input, dest_idx - 1,
2595
mctx->state_log[dest_idx]
2596
= re_acquire_state_context (&err, dfa, &dest_nodes, context);
2597
if (dest_state != NULL)
2598
re_node_set_free (&dest_nodes);
2599
if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2604
#endif /* RE_ENABLE_I18N */
2606
static reg_errcode_t
2608
transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2610
const re_dfa_t *const dfa = mctx->dfa;
2613
Idx cur_str_idx = re_string_cur_idx (&mctx->input);
2615
for (i = 0; i < nodes->nelem; ++i)
2617
Idx dest_str_idx, prev_nelem, bkc_idx;
2618
Idx node_idx = nodes->elems[i];
2619
unsigned int context;
2620
const re_token_t *node = dfa->nodes + node_idx;
2621
re_node_set *new_dest_nodes;
2623
/* Check whether `node' is a backreference or not. */
2624
if (node->type != OP_BACK_REF)
2627
if (node->constraint)
2629
context = re_string_context_at (&mctx->input, cur_str_idx,
2631
if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2635
/* `node' is a backreference.
2636
Check the substring which the substring matched. */
2637
bkc_idx = mctx->nbkref_ents;
2638
err = get_subexp (mctx, node_idx, cur_str_idx);
2639
if (BE (err != REG_NOERROR, 0))
2642
/* And add the epsilon closures (which is `new_dest_nodes') of
2643
the backreference to appropriate state_log. */
2645
assert (dfa->nexts[node_idx] != REG_MISSING);
2647
for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2650
re_dfastate_t *dest_state;
2651
struct re_backref_cache_entry *bkref_ent;
2652
bkref_ent = mctx->bkref_ents + bkc_idx;
2653
if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2655
subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2656
new_dest_nodes = (subexp_len == 0
2657
? dfa->eclosures + dfa->edests[node_idx].elems[0]
2658
: dfa->eclosures + dfa->nexts[node_idx]);
2659
dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2660
- bkref_ent->subexp_from);
2661
context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2663
dest_state = mctx->state_log[dest_str_idx];
2664
prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2665
: mctx->state_log[cur_str_idx]->nodes.nelem);
2666
/* Add `new_dest_node' to state_log. */
2667
if (dest_state == NULL)
2669
mctx->state_log[dest_str_idx]
2670
= re_acquire_state_context (&err, dfa, new_dest_nodes,
2672
if (BE (mctx->state_log[dest_str_idx] == NULL
2673
&& err != REG_NOERROR, 0))
2678
re_node_set dest_nodes;
2679
err = re_node_set_init_union (&dest_nodes,
2680
dest_state->entrance_nodes,
2682
if (BE (err != REG_NOERROR, 0))
2684
re_node_set_free (&dest_nodes);
2687
mctx->state_log[dest_str_idx]
2688
= re_acquire_state_context (&err, dfa, &dest_nodes, context);
2689
re_node_set_free (&dest_nodes);
2690
if (BE (mctx->state_log[dest_str_idx] == NULL
2691
&& err != REG_NOERROR, 0))
2694
/* We need to check recursively if the backreference can epsilon
2697
&& mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2699
err = check_subexp_matching_top (mctx, new_dest_nodes,
2701
if (BE (err != REG_NOERROR, 0))
2703
err = transit_state_bkref (mctx, new_dest_nodes);
2704
if (BE (err != REG_NOERROR, 0))
2714
/* Enumerate all the candidates which the backreference BKREF_NODE can match
2715
at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2716
Note that we might collect inappropriate candidates here.
2717
However, the cost of checking them strictly here is too high, then we
2718
delay these checking for prune_impossible_nodes(). */
2720
static reg_errcode_t
2721
internal_function __attribute_warn_unused_result__
2722
get_subexp (re_match_context_t *mctx, Idx bkref_node, Idx bkref_str_idx)
2724
const re_dfa_t *const dfa = mctx->dfa;
2725
Idx subexp_num, sub_top_idx;
2726
const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2727
/* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
2728
Idx cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2729
if (cache_idx != REG_MISSING)
2731
const struct re_backref_cache_entry *entry
2732
= mctx->bkref_ents + cache_idx;
2734
if (entry->node == bkref_node)
2735
return REG_NOERROR; /* We already checked it. */
2736
while (entry++->more);
2739
subexp_num = dfa->nodes[bkref_node].opr.idx;
2741
/* For each sub expression */
2742
for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2745
re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2746
re_sub_match_last_t *sub_last;
2747
Idx sub_last_idx, sl_str, bkref_str_off;
2749
if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2750
continue; /* It isn't related. */
2752
sl_str = sub_top->str_idx;
2753
bkref_str_off = bkref_str_idx;
2754
/* At first, check the last node of sub expressions we already
2756
for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2758
regoff_t sl_str_diff;
2759
sub_last = sub_top->lasts[sub_last_idx];
2760
sl_str_diff = sub_last->str_idx - sl_str;
2761
/* The matched string by the sub expression match with the substring
2762
at the back reference? */
2763
if (sl_str_diff > 0)
2765
if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2767
/* Not enough chars for a successful match. */
2768
if (bkref_str_off + sl_str_diff > mctx->input.len)
2771
err = clean_state_log_if_needed (mctx,
2774
if (BE (err != REG_NOERROR, 0))
2776
buf = (const char *) re_string_get_buffer (&mctx->input);
2778
if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2779
/* We don't need to search this sub expression any more. */
2782
bkref_str_off += sl_str_diff;
2783
sl_str += sl_str_diff;
2784
err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2787
/* Reload buf, since the preceding call might have reallocated
2789
buf = (const char *) re_string_get_buffer (&mctx->input);
2791
if (err == REG_NOMATCH)
2793
if (BE (err != REG_NOERROR, 0))
2797
if (sub_last_idx < sub_top->nlasts)
2799
if (sub_last_idx > 0)
2801
/* Then, search for the other last nodes of the sub expression. */
2802
for (; sl_str <= bkref_str_idx; ++sl_str)
2805
regoff_t sl_str_off;
2806
const re_node_set *nodes;
2807
sl_str_off = sl_str - sub_top->str_idx;
2808
/* The matched string by the sub expression match with the substring
2809
at the back reference? */
2812
if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2814
/* If we are at the end of the input, we cannot match. */
2815
if (bkref_str_off >= mctx->input.len)
2818
err = extend_buffers (mctx);
2819
if (BE (err != REG_NOERROR, 0))
2822
buf = (const char *) re_string_get_buffer (&mctx->input);
2824
if (buf [bkref_str_off++] != buf[sl_str - 1])
2825
break; /* We don't need to search this sub expression
2828
if (mctx->state_log[sl_str] == NULL)
2830
/* Does this state have a ')' of the sub expression? */
2831
nodes = &mctx->state_log[sl_str]->nodes;
2832
cls_node = find_subexp_node (dfa, nodes, subexp_num,
2834
if (cls_node == REG_MISSING)
2836
if (sub_top->path == NULL)
2838
sub_top->path = calloc (sizeof (state_array_t),
2839
sl_str - sub_top->str_idx + 1);
2840
if (sub_top->path == NULL)
2843
/* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2844
in the current context? */
2845
err = check_arrival (mctx, sub_top->path, sub_top->node,
2846
sub_top->str_idx, cls_node, sl_str,
2848
if (err == REG_NOMATCH)
2850
if (BE (err != REG_NOERROR, 0))
2852
sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2853
if (BE (sub_last == NULL, 0))
2855
err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2857
if (err == REG_NOMATCH)
2864
/* Helper functions for get_subexp(). */
2866
/* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2867
If it can arrive, register the sub expression expressed with SUB_TOP
2870
static reg_errcode_t
2872
get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2873
re_sub_match_last_t *sub_last, Idx bkref_node, Idx bkref_str)
2877
/* Can the subexpression arrive the back reference? */
2878
err = check_arrival (mctx, &sub_last->path, sub_last->node,
2879
sub_last->str_idx, bkref_node, bkref_str,
2881
if (err != REG_NOERROR)
2883
err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2885
if (BE (err != REG_NOERROR, 0))
2887
to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2888
return clean_state_log_if_needed (mctx, to_idx);
2891
/* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2892
Search '(' if FL_OPEN, or search ')' otherwise.
2893
TODO: This function isn't efficient...
2894
Because there might be more than one nodes whose types are
2895
OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2901
find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2902
Idx subexp_idx, int type)
2905
for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2907
Idx cls_node = nodes->elems[cls_idx];
2908
const re_token_t *node = dfa->nodes + cls_node;
2909
if (node->type == type
2910
&& node->opr.idx == subexp_idx)
2916
/* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2917
LAST_NODE at LAST_STR. We record the path onto PATH since it will be
2919
Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */
2921
static reg_errcode_t
2922
internal_function __attribute_warn_unused_result__
2923
check_arrival (re_match_context_t *mctx, state_array_t *path, Idx top_node,
2924
Idx top_str, Idx last_node, Idx last_str, int type)
2926
const re_dfa_t *const dfa = mctx->dfa;
2927
reg_errcode_t err = REG_NOERROR;
2928
Idx subexp_num, backup_cur_idx, str_idx, null_cnt;
2929
re_dfastate_t *cur_state = NULL;
2930
re_node_set *cur_nodes, next_nodes;
2931
re_dfastate_t **backup_state_log;
2932
unsigned int context;
2934
subexp_num = dfa->nodes[top_node].opr.idx;
2935
/* Extend the buffer if we need. */
2936
if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2938
re_dfastate_t **new_array;
2939
Idx old_alloc = path->alloc;
2940
Idx new_alloc = old_alloc + last_str + mctx->max_mb_elem_len + 1;
2941
if (BE (new_alloc < old_alloc, 0)
2942
|| BE (SIZE_MAX / sizeof (re_dfastate_t *) < new_alloc, 0))
2944
new_array = re_realloc (path->array, re_dfastate_t *, new_alloc);
2945
if (BE (new_array == NULL, 0))
2947
path->array = new_array;
2948
path->alloc = new_alloc;
2949
memset (new_array + old_alloc, '\0',
2950
sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2953
str_idx = path->next_idx ? path->next_idx : top_str;
2955
/* Temporary modify MCTX. */
2956
backup_state_log = mctx->state_log;
2957
backup_cur_idx = mctx->input.cur_idx;
2958
mctx->state_log = path->array;
2959
mctx->input.cur_idx = str_idx;
2961
/* Setup initial node set. */
2962
context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2963
if (str_idx == top_str)
2965
err = re_node_set_init_1 (&next_nodes, top_node);
2966
if (BE (err != REG_NOERROR, 0))
2968
err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2969
if (BE (err != REG_NOERROR, 0))
2971
re_node_set_free (&next_nodes);
2977
cur_state = mctx->state_log[str_idx];
2978
if (cur_state && cur_state->has_backref)
2980
err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2981
if (BE (err != REG_NOERROR, 0))
2985
re_node_set_init_empty (&next_nodes);
2987
if (str_idx == top_str || (cur_state && cur_state->has_backref))
2989
if (next_nodes.nelem)
2991
err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2993
if (BE (err != REG_NOERROR, 0))
2995
re_node_set_free (&next_nodes);
2999
cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
3000
if (BE (cur_state == NULL && err != REG_NOERROR, 0))
3002
re_node_set_free (&next_nodes);
3005
mctx->state_log[str_idx] = cur_state;
3008
for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
3010
re_node_set_empty (&next_nodes);
3011
if (mctx->state_log[str_idx + 1])
3013
err = re_node_set_merge (&next_nodes,
3014
&mctx->state_log[str_idx + 1]->nodes);
3015
if (BE (err != REG_NOERROR, 0))
3017
re_node_set_free (&next_nodes);
3023
err = check_arrival_add_next_nodes (mctx, str_idx,
3024
&cur_state->non_eps_nodes,
3026
if (BE (err != REG_NOERROR, 0))
3028
re_node_set_free (&next_nodes);
3033
if (next_nodes.nelem)
3035
err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
3036
if (BE (err != REG_NOERROR, 0))
3038
re_node_set_free (&next_nodes);
3041
err = expand_bkref_cache (mctx, &next_nodes, str_idx,
3043
if (BE (err != REG_NOERROR, 0))
3045
re_node_set_free (&next_nodes);
3049
context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
3050
cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
3051
if (BE (cur_state == NULL && err != REG_NOERROR, 0))
3053
re_node_set_free (&next_nodes);
3056
mctx->state_log[str_idx] = cur_state;
3057
null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
3059
re_node_set_free (&next_nodes);
3060
cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
3061
: &mctx->state_log[last_str]->nodes);
3062
path->next_idx = str_idx;
3065
mctx->state_log = backup_state_log;
3066
mctx->input.cur_idx = backup_cur_idx;
3068
/* Then check the current node set has the node LAST_NODE. */
3069
if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
3075
/* Helper functions for check_arrival. */
3077
/* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3079
TODO: This function is similar to the functions transit_state*(),
3080
however this function has many additional works.
3081
Can't we unify them? */
3083
static reg_errcode_t
3084
internal_function __attribute_warn_unused_result__
3085
check_arrival_add_next_nodes (re_match_context_t *mctx, Idx str_idx,
3086
re_node_set *cur_nodes, re_node_set *next_nodes)
3088
const re_dfa_t *const dfa = mctx->dfa;
3091
#ifdef RE_ENABLE_I18N
3092
reg_errcode_t err = REG_NOERROR;
3094
re_node_set union_set;
3095
re_node_set_init_empty (&union_set);
3096
for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3099
Idx cur_node = cur_nodes->elems[cur_idx];
3101
re_token_type_t type = dfa->nodes[cur_node].type;
3102
assert (!IS_EPSILON_NODE (type));
3104
#ifdef RE_ENABLE_I18N
3105
/* If the node may accept `multi byte'. */
3106
if (dfa->nodes[cur_node].accept_mb)
3108
naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3112
re_dfastate_t *dest_state;
3113
Idx next_node = dfa->nexts[cur_node];
3114
Idx next_idx = str_idx + naccepted;
3115
dest_state = mctx->state_log[next_idx];
3116
re_node_set_empty (&union_set);
3119
err = re_node_set_merge (&union_set, &dest_state->nodes);
3120
if (BE (err != REG_NOERROR, 0))
3122
re_node_set_free (&union_set);
3126
ok = re_node_set_insert (&union_set, next_node);
3129
re_node_set_free (&union_set);
3132
mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3134
if (BE (mctx->state_log[next_idx] == NULL
3135
&& err != REG_NOERROR, 0))
3137
re_node_set_free (&union_set);
3142
#endif /* RE_ENABLE_I18N */
3144
|| check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3146
ok = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3149
re_node_set_free (&union_set);
3154
re_node_set_free (&union_set);
3158
/* For all the nodes in CUR_NODES, add the epsilon closures of them to
3159
CUR_NODES, however exclude the nodes which are:
3160
- inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3161
- out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3164
static reg_errcode_t
3166
check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
3167
Idx ex_subexp, int type)
3170
Idx idx, outside_node;
3171
re_node_set new_nodes;
3173
assert (cur_nodes->nelem);
3175
err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3176
if (BE (err != REG_NOERROR, 0))
3178
/* Create a new node set NEW_NODES with the nodes which are epsilon
3179
closures of the node in CUR_NODES. */
3181
for (idx = 0; idx < cur_nodes->nelem; ++idx)
3183
Idx cur_node = cur_nodes->elems[idx];
3184
const re_node_set *eclosure = dfa->eclosures + cur_node;
3185
outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3186
if (outside_node == REG_MISSING)
3188
/* There are no problematic nodes, just merge them. */
3189
err = re_node_set_merge (&new_nodes, eclosure);
3190
if (BE (err != REG_NOERROR, 0))
3192
re_node_set_free (&new_nodes);
3198
/* There are problematic nodes, re-calculate incrementally. */
3199
err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3201
if (BE (err != REG_NOERROR, 0))
3203
re_node_set_free (&new_nodes);
3208
re_node_set_free (cur_nodes);
3209
*cur_nodes = new_nodes;
3213
/* Helper function for check_arrival_expand_ecl.
3214
Check incrementally the epsilon closure of TARGET, and if it isn't
3215
problematic append it to DST_NODES. */
3217
static reg_errcode_t
3218
internal_function __attribute_warn_unused_result__
3219
check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,
3220
Idx target, Idx ex_subexp, int type)
3223
for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3227
if (dfa->nodes[cur_node].type == type
3228
&& dfa->nodes[cur_node].opr.idx == ex_subexp)
3230
if (type == OP_CLOSE_SUBEXP)
3232
ok = re_node_set_insert (dst_nodes, cur_node);
3238
ok = re_node_set_insert (dst_nodes, cur_node);
3241
if (dfa->edests[cur_node].nelem == 0)
3243
if (dfa->edests[cur_node].nelem == 2)
3246
err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3247
dfa->edests[cur_node].elems[1],
3249
if (BE (err != REG_NOERROR, 0))
3252
cur_node = dfa->edests[cur_node].elems[0];
3258
/* For all the back references in the current state, calculate the
3259
destination of the back references by the appropriate entry
3260
in MCTX->BKREF_ENTS. */
3262
static reg_errcode_t
3263
internal_function __attribute_warn_unused_result__
3264
expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3265
Idx cur_str, Idx subexp_num, int type)
3267
const re_dfa_t *const dfa = mctx->dfa;
3269
Idx cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3270
struct re_backref_cache_entry *ent;
3272
if (cache_idx_start == REG_MISSING)
3276
ent = mctx->bkref_ents + cache_idx_start;
3279
Idx to_idx, next_node;
3281
/* Is this entry ENT is appropriate? */
3282
if (!re_node_set_contains (cur_nodes, ent->node))
3285
to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3286
/* Calculate the destination of the back reference, and append it
3287
to MCTX->STATE_LOG. */
3288
if (to_idx == cur_str)
3290
/* The backreference did epsilon transit, we must re-check all the
3291
node in the current state. */
3292
re_node_set new_dests;
3293
reg_errcode_t err2, err3;
3294
next_node = dfa->edests[ent->node].elems[0];
3295
if (re_node_set_contains (cur_nodes, next_node))
3297
err = re_node_set_init_1 (&new_dests, next_node);
3298
err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3299
err3 = re_node_set_merge (cur_nodes, &new_dests);
3300
re_node_set_free (&new_dests);
3301
if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3302
|| err3 != REG_NOERROR, 0))
3304
err = (err != REG_NOERROR ? err
3305
: (err2 != REG_NOERROR ? err2 : err3));
3308
/* TODO: It is still inefficient... */
3313
re_node_set union_set;
3314
next_node = dfa->nexts[ent->node];
3315
if (mctx->state_log[to_idx])
3318
if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3321
err = re_node_set_init_copy (&union_set,
3322
&mctx->state_log[to_idx]->nodes);
3323
ok = re_node_set_insert (&union_set, next_node);
3324
if (BE (err != REG_NOERROR || ! ok, 0))
3326
re_node_set_free (&union_set);
3327
err = err != REG_NOERROR ? err : REG_ESPACE;
3333
err = re_node_set_init_1 (&union_set, next_node);
3334
if (BE (err != REG_NOERROR, 0))
3337
mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3338
re_node_set_free (&union_set);
3339
if (BE (mctx->state_log[to_idx] == NULL
3340
&& err != REG_NOERROR, 0))
3344
while (ent++->more);
3348
/* Build transition table for the state.
3349
Return true if successful. */
3353
build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
3358
bool need_word_trtable = false;
3359
bitset_word_t elem, mask;
3360
bool dests_node_malloced = false;
3361
bool dest_states_malloced = false;
3362
Idx ndests; /* Number of the destination states from `state'. */
3363
re_dfastate_t **trtable;
3364
re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3365
re_node_set follows, *dests_node;
3367
bitset_t acceptable;
3371
re_node_set dests_node[SBC_MAX];
3372
bitset_t dests_ch[SBC_MAX];
3375
/* We build DFA states which corresponds to the destination nodes
3376
from `state'. `dests_node[i]' represents the nodes which i-th
3377
destination state contains, and `dests_ch[i]' represents the
3378
characters which i-th destination state accepts. */
3379
if (__libc_use_alloca (sizeof (struct dests_alloc)))
3380
dests_alloc = (struct dests_alloc *) alloca (sizeof (struct dests_alloc));
3383
dests_alloc = re_malloc (struct dests_alloc, 1);
3384
if (BE (dests_alloc == NULL, 0))
3386
dests_node_malloced = true;
3388
dests_node = dests_alloc->dests_node;
3389
dests_ch = dests_alloc->dests_ch;
3391
/* Initialize transiton table. */
3392
state->word_trtable = state->trtable = NULL;
3394
/* At first, group all nodes belonging to `state' into several
3396
ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3397
if (BE (! REG_VALID_NONZERO_INDEX (ndests), 0))
3399
if (dests_node_malloced)
3403
state->trtable = (re_dfastate_t **)
3404
calloc (sizeof (re_dfastate_t *), SBC_MAX);
3410
err = re_node_set_alloc (&follows, ndests + 1);
3411
if (BE (err != REG_NOERROR, 0))
3414
/* Avoid arithmetic overflow in size calculation. */
3415
if (BE ((((SIZE_MAX - (sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX)
3416
/ (3 * sizeof (re_dfastate_t *)))
3421
if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX
3422
+ ndests * 3 * sizeof (re_dfastate_t *)))
3423
dest_states = (re_dfastate_t **)
3424
alloca (ndests * 3 * sizeof (re_dfastate_t *));
3427
dest_states = (re_dfastate_t **)
3428
malloc (ndests * 3 * sizeof (re_dfastate_t *));
3429
if (BE (dest_states == NULL, 0))
3432
if (dest_states_malloced)
3434
re_node_set_free (&follows);
3435
for (i = 0; i < ndests; ++i)
3436
re_node_set_free (dests_node + i);
3437
if (dests_node_malloced)
3441
dest_states_malloced = true;
3443
dest_states_word = dest_states + ndests;
3444
dest_states_nl = dest_states_word + ndests;
3445
bitset_empty (acceptable);
3447
/* Then build the states for all destinations. */
3448
for (i = 0; i < ndests; ++i)
3451
re_node_set_empty (&follows);
3452
/* Merge the follows of this destination states. */
3453
for (j = 0; j < dests_node[i].nelem; ++j)
3455
next_node = dfa->nexts[dests_node[i].elems[j]];
3456
if (next_node != REG_MISSING)
3458
err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3459
if (BE (err != REG_NOERROR, 0))
3463
dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3464
if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3466
/* If the new state has context constraint,
3467
build appropriate states for these contexts. */
3468
if (dest_states[i]->has_constraint)
3470
dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3472
if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3475
if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3476
need_word_trtable = true;
3478
dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3480
if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3485
dest_states_word[i] = dest_states[i];
3486
dest_states_nl[i] = dest_states[i];
3488
bitset_merge (acceptable, dests_ch[i]);
3491
if (!BE (need_word_trtable, 0))
3493
/* We don't care about whether the following character is a word
3494
character, or we are in a single-byte character set so we can
3495
discern by looking at the character code: allocate a
3496
256-entry transition table. */
3497
trtable = state->trtable =
3498
(re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3499
if (BE (trtable == NULL, 0))
3502
/* For all characters ch...: */
3503
for (i = 0; i < BITSET_WORDS; ++i)
3504
for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3506
mask <<= 1, elem >>= 1, ++ch)
3507
if (BE (elem & 1, 0))
3509
/* There must be exactly one destination which accepts
3510
character ch. See group_nodes_into_DFAstates. */
3511
for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3514
/* j-th destination accepts the word character ch. */
3515
if (dfa->word_char[i] & mask)
3516
trtable[ch] = dest_states_word[j];
3518
trtable[ch] = dest_states[j];
3523
/* We care about whether the following character is a word
3524
character, and we are in a multi-byte character set: discern
3525
by looking at the character code: build two 256-entry
3526
transition tables, one starting at trtable[0] and one
3527
starting at trtable[SBC_MAX]. */
3528
trtable = state->word_trtable =
3529
(re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3530
if (BE (trtable == NULL, 0))
3533
/* For all characters ch...: */
3534
for (i = 0; i < BITSET_WORDS; ++i)
3535
for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3537
mask <<= 1, elem >>= 1, ++ch)
3538
if (BE (elem & 1, 0))
3540
/* There must be exactly one destination which accepts
3541
character ch. See group_nodes_into_DFAstates. */
3542
for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3545
/* j-th destination accepts the word character ch. */
3546
trtable[ch] = dest_states[j];
3547
trtable[ch + SBC_MAX] = dest_states_word[j];
3552
if (bitset_contain (acceptable, NEWLINE_CHAR))
3554
/* The current state accepts newline character. */
3555
for (j = 0; j < ndests; ++j)
3556
if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3558
/* k-th destination accepts newline character. */
3559
trtable[NEWLINE_CHAR] = dest_states_nl[j];
3560
if (need_word_trtable)
3561
trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3562
/* There must be only one destination which accepts
3563
newline. See group_nodes_into_DFAstates. */
3568
if (dest_states_malloced)
3571
re_node_set_free (&follows);
3572
for (i = 0; i < ndests; ++i)
3573
re_node_set_free (dests_node + i);
3575
if (dests_node_malloced)
3581
/* Group all nodes belonging to STATE into several destinations.
3582
Then for all destinations, set the nodes belonging to the destination
3583
to DESTS_NODE[i] and set the characters accepted by the destination
3584
to DEST_CH[i]. This function return the number of destinations. */
3588
group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
3589
re_node_set *dests_node, bitset_t *dests_ch)
3594
Idx ndests; /* Number of the destinations from `state'. */
3595
bitset_t accepts; /* Characters a node can accept. */
3596
const re_node_set *cur_nodes = &state->nodes;
3597
bitset_empty (accepts);
3600
/* For all the nodes belonging to `state', */
3601
for (i = 0; i < cur_nodes->nelem; ++i)
3603
re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3604
re_token_type_t type = node->type;
3605
unsigned int constraint = node->constraint;
3607
/* Enumerate all single byte character this node can accept. */
3608
if (type == CHARACTER)
3609
bitset_set (accepts, node->opr.c);
3610
else if (type == SIMPLE_BRACKET)
3612
bitset_merge (accepts, node->opr.sbcset);
3614
else if (type == OP_PERIOD)
3616
#ifdef RE_ENABLE_I18N
3617
if (dfa->mb_cur_max > 1)
3618
bitset_merge (accepts, dfa->sb_char);
3621
bitset_set_all (accepts);
3622
if (!(dfa->syntax & RE_DOT_NEWLINE))
3623
bitset_clear (accepts, '\n');
3624
if (dfa->syntax & RE_DOT_NOT_NULL)
3625
bitset_clear (accepts, '\0');
3627
#ifdef RE_ENABLE_I18N
3628
else if (type == OP_UTF8_PERIOD)
3630
if (ASCII_CHARS % BITSET_WORD_BITS == 0)
3631
memset (accepts, -1, ASCII_CHARS / CHAR_BIT);
3633
bitset_merge (accepts, utf8_sb_map);
3634
if (!(dfa->syntax & RE_DOT_NEWLINE))
3635
bitset_clear (accepts, '\n');
3636
if (dfa->syntax & RE_DOT_NOT_NULL)
3637
bitset_clear (accepts, '\0');
3643
/* Check the `accepts' and sift the characters which are not
3644
match it the context. */
3647
if (constraint & NEXT_NEWLINE_CONSTRAINT)
3649
bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3650
bitset_empty (accepts);
3651
if (accepts_newline)
3652
bitset_set (accepts, NEWLINE_CHAR);
3656
if (constraint & NEXT_ENDBUF_CONSTRAINT)
3658
bitset_empty (accepts);
3662
if (constraint & NEXT_WORD_CONSTRAINT)
3664
bitset_word_t any_set = 0;
3665
if (type == CHARACTER && !node->word_char)
3667
bitset_empty (accepts);
3670
#ifdef RE_ENABLE_I18N
3671
if (dfa->mb_cur_max > 1)
3672
for (j = 0; j < BITSET_WORDS; ++j)
3673
any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3676
for (j = 0; j < BITSET_WORDS; ++j)
3677
any_set |= (accepts[j] &= dfa->word_char[j]);
3681
if (constraint & NEXT_NOTWORD_CONSTRAINT)
3683
bitset_word_t any_set = 0;
3684
if (type == CHARACTER && node->word_char)
3686
bitset_empty (accepts);
3689
#ifdef RE_ENABLE_I18N
3690
if (dfa->mb_cur_max > 1)
3691
for (j = 0; j < BITSET_WORDS; ++j)
3692
any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3695
for (j = 0; j < BITSET_WORDS; ++j)
3696
any_set |= (accepts[j] &= ~dfa->word_char[j]);
3702
/* Then divide `accepts' into DFA states, or create a new
3703
state. Above, we make sure that accepts is not empty. */
3704
for (j = 0; j < ndests; ++j)
3706
bitset_t intersec; /* Intersection sets, see below. */
3708
/* Flags, see below. */
3709
bitset_word_t has_intersec, not_subset, not_consumed;
3711
/* Optimization, skip if this state doesn't accept the character. */
3712
if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3715
/* Enumerate the intersection set of this state and `accepts'. */
3717
for (k = 0; k < BITSET_WORDS; ++k)
3718
has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3719
/* And skip if the intersection set is empty. */
3723
/* Then check if this state is a subset of `accepts'. */
3724
not_subset = not_consumed = 0;
3725
for (k = 0; k < BITSET_WORDS; ++k)
3727
not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3728
not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3731
/* If this state isn't a subset of `accepts', create a
3732
new group state, which has the `remains'. */
3735
bitset_copy (dests_ch[ndests], remains);
3736
bitset_copy (dests_ch[j], intersec);
3737
err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3738
if (BE (err != REG_NOERROR, 0))
3743
/* Put the position in the current group. */
3744
ok = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3748
/* If all characters are consumed, go to next node. */
3752
/* Some characters remain, create a new group. */
3755
bitset_copy (dests_ch[ndests], accepts);
3756
err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3757
if (BE (err != REG_NOERROR, 0))
3760
bitset_empty (accepts);
3765
for (j = 0; j < ndests; ++j)
3766
re_node_set_free (dests_node + j);
3770
#ifdef RE_ENABLE_I18N
3771
/* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3772
Return the number of the bytes the node accepts.
3773
STR_IDX is the current index of the input string.
3775
This function handles the nodes which can accept one character, or
3776
one collating element like '.', '[a-z]', opposite to the other nodes
3777
can only accept one byte. */
3781
check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx,
3782
const re_string_t *input, Idx str_idx)
3784
const re_token_t *node = dfa->nodes + node_idx;
3785
int char_len, elem_len;
3788
if (BE (node->type == OP_UTF8_PERIOD, 0))
3790
unsigned char c = re_string_byte_at (input, str_idx), d;
3791
if (BE (c < 0xc2, 1))
3794
if (str_idx + 2 > input->len)
3797
d = re_string_byte_at (input, str_idx + 1);
3799
return (d < 0x80 || d > 0xbf) ? 0 : 2;
3803
if (c == 0xe0 && d < 0xa0)
3809
if (c == 0xf0 && d < 0x90)
3815
if (c == 0xf8 && d < 0x88)
3821
if (c == 0xfc && d < 0x84)
3827
if (str_idx + char_len > input->len)
3830
for (i = 1; i < char_len; ++i)
3832
d = re_string_byte_at (input, str_idx + i);
3833
if (d < 0x80 || d > 0xbf)
3839
char_len = re_string_char_size_at (input, str_idx);
3840
if (node->type == OP_PERIOD)
3844
/* FIXME: I don't think this if is needed, as both '\n'
3845
and '\0' are char_len == 1. */
3846
/* '.' accepts any one character except the following two cases. */
3847
if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
3848
re_string_byte_at (input, str_idx) == '\n') ||
3849
((dfa->syntax & RE_DOT_NOT_NULL) &&
3850
re_string_byte_at (input, str_idx) == '\0'))
3855
elem_len = re_string_elem_size_at (input, str_idx);
3856
if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3859
if (node->type == COMPLEX_BRACKET)
3861
const re_charset_t *cset = node->opr.mbcset;
3863
const unsigned char *pin
3864
= ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3869
wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3870
? re_string_wchar_at (input, str_idx) : 0);
3872
/* match with multibyte character? */
3873
for (i = 0; i < cset->nmbchars; ++i)
3874
if (wc == cset->mbchars[i])
3876
match_len = char_len;
3877
goto check_node_accept_bytes_match;
3879
/* match with character_class? */
3880
for (i = 0; i < cset->nchar_classes; ++i)
3882
wctype_t wt = cset->char_classes[i];
3883
if (__iswctype (wc, wt))
3885
match_len = char_len;
3886
goto check_node_accept_bytes_match;
3891
nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3894
unsigned int in_collseq = 0;
3895
const int32_t *table, *indirect;
3896
const unsigned char *weights, *extra;
3897
const char *collseqwc;
3899
/* This #include defines a local function! */
3900
# include <locale/weight.h>
3902
/* match with collating_symbol? */
3903
if (cset->ncoll_syms)
3904
extra = (const unsigned char *)
3905
_NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3906
for (i = 0; i < cset->ncoll_syms; ++i)
3908
const unsigned char *coll_sym = extra + cset->coll_syms[i];
3909
/* Compare the length of input collating element and
3910
the length of current collating element. */
3911
if (*coll_sym != elem_len)
3913
/* Compare each bytes. */
3914
for (j = 0; j < *coll_sym; j++)
3915
if (pin[j] != coll_sym[1 + j])
3919
/* Match if every bytes is equal. */
3921
goto check_node_accept_bytes_match;
3927
if (elem_len <= char_len)
3929
collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3930
in_collseq = __collseq_table_lookup (collseqwc, wc);
3933
in_collseq = find_collation_sequence_value (pin, elem_len);
3935
/* match with range expression? */
3936
for (i = 0; i < cset->nranges; ++i)
3937
if (cset->range_starts[i] <= in_collseq
3938
&& in_collseq <= cset->range_ends[i])
3940
match_len = elem_len;
3941
goto check_node_accept_bytes_match;
3944
/* match with equivalence_class? */
3945
if (cset->nequiv_classes)
3947
const unsigned char *cp = pin;
3948
table = (const int32_t *)
3949
_NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3950
weights = (const unsigned char *)
3951
_NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3952
extra = (const unsigned char *)
3953
_NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3954
indirect = (const int32_t *)
3955
_NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3956
int32_t idx = findidx (&cp);
3958
for (i = 0; i < cset->nequiv_classes; ++i)
3960
int32_t equiv_class_idx = cset->equiv_classes[i];
3961
size_t weight_len = weights[idx & 0xffffff];
3962
if (weight_len == weights[equiv_class_idx & 0xffffff]
3963
&& (idx >> 24) == (equiv_class_idx >> 24))
3968
equiv_class_idx &= 0xffffff;
3970
while (cnt <= weight_len
3971
&& (weights[equiv_class_idx + 1 + cnt]
3972
== weights[idx + 1 + cnt]))
3974
if (cnt > weight_len)
3976
match_len = elem_len;
3977
goto check_node_accept_bytes_match;
3986
/* match with range expression? */
3987
#if __GNUC__ >= 2 && ! (__STDC_VERSION__ < 199901L && __STRICT_ANSI__)
3988
wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
3990
wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3993
for (i = 0; i < cset->nranges; ++i)
3995
cmp_buf[0] = cset->range_starts[i];
3996
cmp_buf[4] = cset->range_ends[i];
3997
if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
3998
&& wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
4000
match_len = char_len;
4001
goto check_node_accept_bytes_match;
4005
check_node_accept_bytes_match:
4006
if (!cset->non_match)
4013
return (elem_len > char_len) ? elem_len : char_len;
4022
find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
4024
uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
4029
/* No valid character. Match it as a single byte character. */
4030
const unsigned char *collseq = (const unsigned char *)
4031
_NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
4032
return collseq[mbs[0]];
4039
const unsigned char *extra = (const unsigned char *)
4040
_NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
4041
int32_t extrasize = (const unsigned char *)
4042
_NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
4044
for (idx = 0; idx < extrasize;)
4048
int32_t elem_mbs_len;
4049
/* Skip the name of collating element name. */
4050
idx = idx + extra[idx] + 1;
4051
elem_mbs_len = extra[idx++];
4052
if (mbs_len == elem_mbs_len)
4054
for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
4055
if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
4057
if (mbs_cnt == elem_mbs_len)
4058
/* Found the entry. */
4061
/* Skip the byte sequence of the collating element. */
4062
idx += elem_mbs_len;
4063
/* Adjust for the alignment. */
4064
idx = (idx + 3) & ~3;
4065
/* Skip the collation sequence value. */
4066
idx += sizeof (uint32_t);
4067
/* Skip the wide char sequence of the collating element. */
4068
idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
4069
/* If we found the entry, return the sequence value. */
4071
return *(uint32_t *) (extra + idx);
4072
/* Skip the collation sequence value. */
4073
idx += sizeof (uint32_t);
4079
#endif /* RE_ENABLE_I18N */
4081
/* Check whether the node accepts the byte which is IDX-th
4082
byte of the INPUT. */
4086
check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
4090
ch = re_string_byte_at (&mctx->input, idx);
4094
if (node->opr.c != ch)
4098
case SIMPLE_BRACKET:
4099
if (!bitset_contain (node->opr.sbcset, ch))
4103
#ifdef RE_ENABLE_I18N
4104
case OP_UTF8_PERIOD:
4105
if (ch >= ASCII_CHARS)
4110
if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
4111
|| (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
4119
if (node->constraint)
4121
/* The node has constraints. Check whether the current context
4122
satisfies the constraints. */
4123
unsigned int context = re_string_context_at (&mctx->input, idx,
4125
if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
4132
/* Extend the buffers, if the buffers have run out. */
4134
static reg_errcode_t
4135
internal_function __attribute_warn_unused_result__
4136
extend_buffers (re_match_context_t *mctx)
4139
re_string_t *pstr = &mctx->input;
4141
/* Avoid overflow. */
4142
if (BE (SIZE_MAX / 2 / sizeof (re_dfastate_t *) <= pstr->bufs_len, 0))
4145
/* Double the lengthes of the buffers. */
4146
ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
4147
if (BE (ret != REG_NOERROR, 0))
4150
if (mctx->state_log != NULL)
4152
/* And double the length of state_log. */
4153
/* XXX We have no indication of the size of this buffer. If this
4154
allocation fail we have no indication that the state_log array
4155
does not have the right size. */
4156
re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4157
pstr->bufs_len + 1);
4158
if (BE (new_array == NULL, 0))
4160
mctx->state_log = new_array;
4163
/* Then reconstruct the buffers. */
4166
#ifdef RE_ENABLE_I18N
4167
if (pstr->mb_cur_max > 1)
4169
ret = build_wcs_upper_buffer (pstr);
4170
if (BE (ret != REG_NOERROR, 0))
4174
#endif /* RE_ENABLE_I18N */
4175
build_upper_buffer (pstr);
4179
#ifdef RE_ENABLE_I18N
4180
if (pstr->mb_cur_max > 1)
4181
build_wcs_buffer (pstr);
4183
#endif /* RE_ENABLE_I18N */
4185
if (pstr->trans != NULL)
4186
re_string_translate_buffer (pstr);
4193
/* Functions for matching context. */
4195
/* Initialize MCTX. */
4197
static reg_errcode_t
4198
internal_function __attribute_warn_unused_result__
4199
match_ctx_init (re_match_context_t *mctx, int eflags, Idx n)
4201
mctx->eflags = eflags;
4202
mctx->match_last = REG_MISSING;
4205
/* Avoid overflow. */
4206
size_t max_object_size =
4207
MAX (sizeof (struct re_backref_cache_entry),
4208
sizeof (re_sub_match_top_t *));
4209
if (BE (SIZE_MAX / max_object_size < n, 0))
4212
mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4213
mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4214
if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4217
/* Already zero-ed by the caller.
4219
mctx->bkref_ents = NULL;
4220
mctx->nbkref_ents = 0;
4221
mctx->nsub_tops = 0; */
4222
mctx->abkref_ents = n;
4223
mctx->max_mb_elem_len = 1;
4224
mctx->asub_tops = n;
4228
/* Clean the entries which depend on the current input in MCTX.
4229
This function must be invoked when the matcher changes the start index
4230
of the input, or changes the input string. */
4234
match_ctx_clean (re_match_context_t *mctx)
4237
for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4240
re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4241
for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4243
re_sub_match_last_t *last = top->lasts[sl_idx];
4244
re_free (last->path.array);
4247
re_free (top->lasts);
4250
re_free (top->path->array);
4251
re_free (top->path);
4256
mctx->nsub_tops = 0;
4257
mctx->nbkref_ents = 0;
4260
/* Free all the memory associated with MCTX. */
4264
match_ctx_free (re_match_context_t *mctx)
4266
/* First, free all the memory associated with MCTX->SUB_TOPS. */
4267
match_ctx_clean (mctx);
4268
re_free (mctx->sub_tops);
4269
re_free (mctx->bkref_ents);
4272
/* Add a new backreference entry to MCTX.
4273
Note that we assume that caller never call this function with duplicate
4274
entry, and call with STR_IDX which isn't smaller than any existing entry.
4277
static reg_errcode_t
4278
internal_function __attribute_warn_unused_result__
4279
match_ctx_add_entry (re_match_context_t *mctx, Idx node, Idx str_idx, Idx from,
4282
if (mctx->nbkref_ents >= mctx->abkref_ents)
4284
struct re_backref_cache_entry* new_entry;
4285
new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4286
mctx->abkref_ents * 2);
4287
if (BE (new_entry == NULL, 0))
4289
re_free (mctx->bkref_ents);
4292
mctx->bkref_ents = new_entry;
4293
memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4294
sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4295
mctx->abkref_ents *= 2;
4297
if (mctx->nbkref_ents > 0
4298
&& mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4299
mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4301
mctx->bkref_ents[mctx->nbkref_ents].node = node;
4302
mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4303
mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4304
mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4306
/* This is a cache that saves negative results of check_dst_limits_calc_pos.
4307
If bit N is clear, means that this entry won't epsilon-transition to
4308
an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression. If
4309
it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4312
A backreference does not epsilon-transition unless it is empty, so set
4313
to all zeros if FROM != TO. */
4314
mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4315
= (from == to ? -1 : 0);
4317
mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4318
if (mctx->max_mb_elem_len < to - from)
4319
mctx->max_mb_elem_len = to - from;
4323
/* Return the first entry with the same str_idx, or REG_MISSING if none is
4324
found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
4328
search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx)
4330
Idx left, right, mid, last;
4331
last = right = mctx->nbkref_ents;
4332
for (left = 0; left < right;)
4334
mid = (left + right) / 2;
4335
if (mctx->bkref_ents[mid].str_idx < str_idx)
4340
if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4346
/* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4349
static reg_errcode_t
4350
internal_function __attribute_warn_unused_result__
4351
match_ctx_add_subtop (re_match_context_t *mctx, Idx node, Idx str_idx)
4354
assert (mctx->sub_tops != NULL);
4355
assert (mctx->asub_tops > 0);
4357
if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4359
Idx new_asub_tops = mctx->asub_tops * 2;
4360
re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4361
re_sub_match_top_t *,
4363
if (BE (new_array == NULL, 0))
4365
mctx->sub_tops = new_array;
4366
mctx->asub_tops = new_asub_tops;
4368
mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4369
if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4371
mctx->sub_tops[mctx->nsub_tops]->node = node;
4372
mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4376
/* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4377
at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */
4379
static re_sub_match_last_t *
4381
match_ctx_add_sublast (re_sub_match_top_t *subtop, Idx node, Idx str_idx)
4383
re_sub_match_last_t *new_entry;
4384
if (BE (subtop->nlasts == subtop->alasts, 0))
4386
Idx new_alasts = 2 * subtop->alasts + 1;
4387
re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4388
re_sub_match_last_t *,
4390
if (BE (new_array == NULL, 0))
4392
subtop->lasts = new_array;
4393
subtop->alasts = new_alasts;
4395
new_entry = calloc (1, sizeof (re_sub_match_last_t));
4396
if (BE (new_entry != NULL, 1))
4398
subtop->lasts[subtop->nlasts] = new_entry;
4399
new_entry->node = node;
4400
new_entry->str_idx = str_idx;
4408
sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
4409
re_dfastate_t **limited_sts, Idx last_node, Idx last_str_idx)
4411
sctx->sifted_states = sifted_sts;
4412
sctx->limited_states = limited_sts;
4413
sctx->last_node = last_node;
4414
sctx->last_str_idx = last_str_idx;
4415
re_node_set_init_empty (&sctx->limits);