/* Extended regular expression matching and search library.
   Copyright (C) 2002-2023 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
20
static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
22
static void match_ctx_clean (re_match_context_t *mctx);
23
static void match_ctx_free (re_match_context_t *cache);
24
static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, Idx node,
25
Idx str_idx, Idx from, Idx to);
26
static Idx search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx);
27
static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, Idx node,
29
static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
30
Idx node, Idx str_idx);
31
static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
32
re_dfastate_t **limited_sts, Idx last_node,
34
static reg_errcode_t re_search_internal (const regex_t *preg,
35
const char *string, Idx length,
36
Idx start, Idx last_start, Idx stop,
37
size_t nmatch, regmatch_t pmatch[],
39
static regoff_t re_search_2_stub (struct re_pattern_buffer *bufp,
40
const char *string1, Idx length1,
41
const char *string2, Idx length2,
42
Idx start, regoff_t range,
43
struct re_registers *regs,
44
Idx stop, bool ret_len);
45
static regoff_t re_search_stub (struct re_pattern_buffer *bufp,
46
const char *string, Idx length, Idx start,
47
regoff_t range, Idx stop,
48
struct re_registers *regs,
50
static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
51
Idx nregs, int regs_allocated);
52
static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx);
53
static Idx check_matching (re_match_context_t *mctx, bool fl_longest_match,
55
static Idx check_halt_state_context (const re_match_context_t *mctx,
56
const re_dfastate_t *state, Idx idx);
57
static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
58
regmatch_t *prev_idx_match, Idx cur_node,
59
Idx cur_idx, Idx nmatch);
60
static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
61
Idx str_idx, Idx dest_node, Idx nregs,
62
regmatch_t *regs, regmatch_t *prevregs,
63
re_node_set *eps_via_nodes);
64
static reg_errcode_t set_regs (const regex_t *preg,
65
const re_match_context_t *mctx,
66
size_t nmatch, regmatch_t *pmatch,
68
static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs);
70
static int sift_states_iter_mb (const re_match_context_t *mctx,
71
re_sift_context_t *sctx,
72
Idx node_idx, Idx str_idx, Idx max_str_idx);
73
static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
74
re_sift_context_t *sctx);
75
static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
76
re_sift_context_t *sctx, Idx str_idx,
77
re_node_set *cur_dest);
78
static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
79
re_sift_context_t *sctx,
81
re_node_set *dest_nodes);
82
static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
83
re_node_set *dest_nodes,
84
const re_node_set *candidates);
85
static bool check_dst_limits (const re_match_context_t *mctx,
86
const re_node_set *limits,
87
Idx dst_node, Idx dst_idx, Idx src_node,
89
static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
90
int boundaries, Idx subexp_idx,
91
Idx from_node, Idx bkref_idx);
92
static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
93
Idx limit, Idx subexp_idx,
94
Idx node, Idx str_idx,
96
static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
97
re_node_set *dest_nodes,
98
const re_node_set *candidates,
100
struct re_backref_cache_entry *bkref_ents,
102
static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
103
re_sift_context_t *sctx,
104
Idx str_idx, const re_node_set *candidates);
105
static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
107
re_dfastate_t **src, Idx num);
108
static re_dfastate_t *find_recover_state (reg_errcode_t *err,
109
re_match_context_t *mctx);
110
static re_dfastate_t *transit_state (reg_errcode_t *err,
111
re_match_context_t *mctx,
112
re_dfastate_t *state);
113
static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
114
re_match_context_t *mctx,
115
re_dfastate_t *next_state);
116
static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
117
re_node_set *cur_nodes,
120
static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
121
re_match_context_t *mctx,
122
re_dfastate_t *pstate);
124
static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
125
re_dfastate_t *pstate);
126
static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
127
const re_node_set *nodes);
128
static reg_errcode_t get_subexp (re_match_context_t *mctx,
129
Idx bkref_node, Idx bkref_str_idx);
130
static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
131
const re_sub_match_top_t *sub_top,
132
re_sub_match_last_t *sub_last,
133
Idx bkref_node, Idx bkref_str);
134
static Idx find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
135
Idx subexp_idx, int type);
136
static reg_errcode_t check_arrival (re_match_context_t *mctx,
137
state_array_t *path, Idx top_node,
138
Idx top_str, Idx last_node, Idx last_str,
140
static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
142
re_node_set *cur_nodes,
143
re_node_set *next_nodes);
144
static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
145
re_node_set *cur_nodes,
146
Idx ex_subexp, int type);
147
static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
148
re_node_set *dst_nodes,
149
Idx target, Idx ex_subexp,
151
static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
152
re_node_set *cur_nodes, Idx cur_str,
153
Idx subexp_num, int type);
154
static bool build_trtable (const re_dfa_t *dfa, re_dfastate_t *state);
155
static int check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx,
156
const re_string_t *input, Idx idx);
158
static unsigned int find_collation_sequence_value (const unsigned char *mbs,
161
static Idx group_nodes_into_DFAstates (const re_dfa_t *dfa,
162
const re_dfastate_t *state,
163
re_node_set *states_node,
164
bitset_t *states_ch);
165
static bool check_node_accept (const re_match_context_t *mctx,
166
const re_token_t *node, Idx idx);
167
static reg_errcode_t extend_buffers (re_match_context_t *mctx, int min_len);
/* Entry point for POSIX code.  */

/* regexec searches for a given pattern, specified by PREG, in the
   string STRING.

   If NMATCH is zero or REG_NOSUB was set in the cflags argument to
   'regcomp', we ignore PMATCH.  Otherwise, we assume PMATCH has at
   least NMATCH elements, and we set them to the offsets of the
   corresponding matched substrings.

   EFLAGS specifies "execution flags" which affect matching: if
   REG_NOTBOL is set, then ^ does not match at the beginning of the
   string; if REG_NOTEOL is set, then $ does not match at the end.

   Return 0 if a match is found, REG_NOMATCH if not, REG_BADPAT if
   EFLAGS is invalid.  */
187
regexec (const regex_t *__restrict preg, const char *__restrict string,
188
size_t nmatch, regmatch_t pmatch[_REGEX_NELTS (nmatch)], int eflags)
192
re_dfa_t *dfa = preg->buffer;
194
if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
197
if (eflags & REG_STARTEND)
199
start = pmatch[0].rm_so;
200
length = pmatch[0].rm_eo;
205
length = strlen (string);
208
lock_lock (dfa->lock);
210
err = re_search_internal (preg, string, length, start, length,
211
length, 0, NULL, eflags);
213
err = re_search_internal (preg, string, length, start, length,
214
length, nmatch, pmatch, eflags);
215
lock_unlock (dfa->lock);
216
return err != REG_NOERROR;
220
libc_hidden_def (__regexec)
222
# include <shlib-compat.h>
223
versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
225
# if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
226
__typeof__ (__regexec) __compat_regexec;
229
attribute_compat_text_section
230
__compat_regexec (const regex_t *__restrict preg,
231
const char *__restrict string, size_t nmatch,
232
regmatch_t pmatch[_REGEX_NELTS (nmatch)], int eflags)
234
return regexec (preg, string, nmatch, pmatch,
235
eflags & (REG_NOTBOL | REG_NOTEOL));
237
compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
/* Entry points for GNU code.  */

/* re_match, re_search, re_match_2, re_search_2

   The former two functions operate on STRING with length LENGTH,
   while the latter two operate on the concatenation of STRING1 and STRING2
   with lengths LENGTH1 and LENGTH2, respectively.

   re_match() matches the compiled pattern in BUFP against the string,
   starting at index START.

   re_search() first tries matching at index START, then it tries to match
   starting from index START + 1, and so on.  The last start position tried
   is START + RANGE.  (Thus RANGE = 0 forces re_search to operate the same
   way as re_match().)

   The parameter STOP of re_{match,search}_2 specifies that no match exceeding
   the first STOP characters of the concatenation of the strings should be
   considered.

   If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
   and all groups are stored in REGS.  (For the "_2" variants, the offsets are
   computed relative to the concatenation, not relative to the individual
   strings.)

   On success, re_match* functions return the length of the match, re_search*
   return the position of the start of the match.  They return -1 on
   match failure, -2 on error.  */
271
re_match (struct re_pattern_buffer *bufp, const char *string, Idx length,
272
Idx start, struct re_registers *regs)
274
return re_search_stub (bufp, string, length, start, 0, length, regs, true);
277
weak_alias (__re_match, re_match)
281
re_search (struct re_pattern_buffer *bufp, const char *string, Idx length,
282
Idx start, regoff_t range, struct re_registers *regs)
284
return re_search_stub (bufp, string, length, start, range, length, regs,
288
weak_alias (__re_search, re_search)
292
re_match_2 (struct re_pattern_buffer *bufp, const char *string1, Idx length1,
293
const char *string2, Idx length2, Idx start,
294
struct re_registers *regs, Idx stop)
296
return re_search_2_stub (bufp, string1, length1, string2, length2,
297
start, 0, regs, stop, true);
300
weak_alias (__re_match_2, re_match_2)
304
re_search_2 (struct re_pattern_buffer *bufp, const char *string1, Idx length1,
305
const char *string2, Idx length2, Idx start, regoff_t range,
306
struct re_registers *regs, Idx stop)
308
return re_search_2_stub (bufp, string1, length1, string2, length2,
309
start, range, regs, stop, false);
312
weak_alias (__re_search_2, re_search_2)
316
re_search_2_stub (struct re_pattern_buffer *bufp, const char *string1,
317
Idx length1, const char *string2, Idx length2, Idx start,
318
regoff_t range, struct re_registers *regs,
319
Idx stop, bool ret_len)
326
if (__glibc_unlikely ((length1 < 0 || length2 < 0 || stop < 0
327
|| INT_ADD_WRAPV (length1, length2, &len))))
330
/* Concatenate the strings. */
334
s = re_malloc (char, len);
336
if (__glibc_unlikely (s == NULL))
339
memcpy (__mempcpy (s, string1, length1), string2, length2);
341
memcpy (s, string1, length1);
342
memcpy (s + length1, string2, length2);
351
rval = re_search_stub (bufp, str, len, start, range, stop, regs,
/* The parameters have the same meaning as those of re_search.
   Additional parameters:
   If RET_LEN is true the length of the match is returned (re_match style);
   otherwise the position of the match is returned.  */
363
re_search_stub (struct re_pattern_buffer *bufp, const char *string, Idx length,
364
Idx start, regoff_t range, Idx stop, struct re_registers *regs,
367
reg_errcode_t result;
372
re_dfa_t *dfa = bufp->buffer;
373
Idx last_start = start + range;
375
/* Check for out-of-range. */
376
if (__glibc_unlikely (start < 0 || start > length))
378
if (__glibc_unlikely (length < last_start
379
|| (0 <= range && last_start < start)))
381
else if (__glibc_unlikely (last_start < 0
382
|| (range < 0 && start <= last_start)))
385
lock_lock (dfa->lock);
387
eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
388
eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
390
/* Compile fastmap if we haven't yet. */
391
if (start < last_start && bufp->fastmap != NULL && !bufp->fastmap_accurate)
392
re_compile_fastmap (bufp);
394
if (__glibc_unlikely (bufp->no_sub))
397
/* We need at least 1 register. */
400
else if (__glibc_unlikely (bufp->regs_allocated == REGS_FIXED
401
&& regs->num_regs <= bufp->re_nsub))
403
nregs = regs->num_regs;
404
if (__glibc_unlikely (nregs < 1))
406
/* Nothing can be copied to regs. */
412
nregs = bufp->re_nsub + 1;
413
pmatch = re_malloc (regmatch_t, nregs);
414
if (__glibc_unlikely (pmatch == NULL))
420
result = re_search_internal (bufp, string, length, start, last_start, stop,
421
nregs, pmatch, eflags);
425
/* I hope we needn't fill their regs with -1's when no match was found. */
426
if (result != REG_NOERROR)
427
rval = result == REG_NOMATCH ? -1 : -2;
428
else if (regs != NULL)
430
/* If caller wants register contents data back, copy them. */
431
bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
432
bufp->regs_allocated);
433
if (__glibc_unlikely (bufp->regs_allocated == REGS_UNALLOCATED))
437
if (__glibc_likely (rval == 0))
441
DEBUG_ASSERT (pmatch[0].rm_so == start);
442
rval = pmatch[0].rm_eo - start;
445
rval = pmatch[0].rm_so;
449
lock_unlock (dfa->lock);
454
re_copy_regs (struct re_registers *regs, regmatch_t *pmatch, Idx nregs,
457
int rval = REGS_REALLOCATE;
459
Idx need_regs = nregs + 1;
460
/* We need one extra element beyond 'num_regs' for the '-1' marker GNU code
463
/* Have the register data arrays been allocated? */
464
if (regs_allocated == REGS_UNALLOCATED)
465
{ /* No. So allocate them with malloc. */
466
regs->start = re_malloc (regoff_t, need_regs);
467
if (__glibc_unlikely (regs->start == NULL))
468
return REGS_UNALLOCATED;
469
regs->end = re_malloc (regoff_t, need_regs);
470
if (__glibc_unlikely (regs->end == NULL))
472
re_free (regs->start);
473
return REGS_UNALLOCATED;
475
regs->num_regs = need_regs;
477
else if (regs_allocated == REGS_REALLOCATE)
478
{ /* Yes. If we need more elements than were already
479
allocated, reallocate them. If we need fewer, just
481
if (__glibc_unlikely (need_regs > regs->num_regs))
483
regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
485
if (__glibc_unlikely (new_start == NULL))
486
return REGS_UNALLOCATED;
487
new_end = re_realloc (regs->end, regoff_t, need_regs);
488
if (__glibc_unlikely (new_end == NULL))
491
return REGS_UNALLOCATED;
493
regs->start = new_start;
495
regs->num_regs = need_regs;
500
DEBUG_ASSERT (regs_allocated == REGS_FIXED);
501
/* This function may not be called with REGS_FIXED and nregs too big. */
502
DEBUG_ASSERT (nregs <= regs->num_regs);
507
for (i = 0; i < nregs; ++i)
509
regs->start[i] = pmatch[i].rm_so;
510
regs->end[i] = pmatch[i].rm_eo;
512
for ( ; i < regs->num_regs; ++i)
513
regs->start[i] = regs->end[i] = -1;
/* Set REGS to hold NUM_REGS registers, storing them in STARTS and
   ENDS.  Subsequent matches using PATTERN_BUFFER and REGS will use
   this memory for recording register information.  STARTS and ENDS
   must be allocated using the malloc library routine, and must each
   be at least NUM_REGS * sizeof (regoff_t) bytes long.

   If NUM_REGS == 0, then subsequent matches should allocate their own
   register data.

   Unless this function is called, the first search or match using
   PATTERN_BUFFER will allocate its own register data, without
   freeing the old data.  */
532
re_set_registers (struct re_pattern_buffer *bufp, struct re_registers *regs,
533
__re_size_t num_regs, regoff_t *starts, regoff_t *ends)
537
bufp->regs_allocated = REGS_REALLOCATE;
538
regs->num_regs = num_regs;
539
regs->start = starts;
544
bufp->regs_allocated = REGS_UNALLOCATED;
546
regs->start = regs->end = NULL;
550
weak_alias (__re_set_registers, re_set_registers)
/* Entry points compatible with 4.2 BSD regex library.  We don't define
   them unless specifically requested.  */
556
#if defined _REGEX_RE_COMP || defined _LIBC
561
re_exec (const char *s)
563
return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
565
#endif /* _REGEX_RE_COMP */
/* Internal entry point.  */

/* Searches for a compiled pattern PREG in the string STRING, whose
   length is LENGTH.  NMATCH, PMATCH, and EFLAGS have the same
   meaning as with regexec.  LAST_START is START + RANGE, where
   START and RANGE have the same meaning as with re_search.
   Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
   otherwise return the error code.
   Note: We assume front end functions already check ranges.
   (0 <= LAST_START && LAST_START <= LENGTH)  */
579
__attribute_warn_unused_result__
580
re_search_internal (const regex_t *preg, const char *string, Idx length,
581
Idx start, Idx last_start, Idx stop, size_t nmatch,
582
regmatch_t pmatch[], int eflags)
585
const re_dfa_t *dfa = preg->buffer;
586
Idx left_lim, right_lim;
588
bool fl_longest_match;
595
re_match_context_t mctx = { .dfa = dfa };
596
char *fastmap = ((preg->fastmap != NULL && preg->fastmap_accurate
597
&& start != last_start && !preg->can_be_null)
598
? preg->fastmap : NULL);
599
RE_TRANSLATE_TYPE t = preg->translate;
601
extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
602
nmatch -= extra_nmatch;
604
/* Check if the DFA haven't been compiled. */
605
if (__glibc_unlikely (preg->used == 0 || dfa->init_state == NULL
606
|| dfa->init_state_word == NULL
607
|| dfa->init_state_nl == NULL
608
|| dfa->init_state_begbuf == NULL))
611
/* We assume front-end functions already check them. */
612
DEBUG_ASSERT (0 <= last_start && last_start <= length);
614
/* If initial states with non-begbuf contexts have no elements,
615
the regex must be anchored. If preg->newline_anchor is set,
616
we'll never use init_state_nl, so do not check it. */
617
if (dfa->init_state->nodes.nelem == 0
618
&& dfa->init_state_word->nodes.nelem == 0
619
&& (dfa->init_state_nl->nodes.nelem == 0
620
|| !preg->newline_anchor))
622
if (start != 0 && last_start != 0)
624
start = last_start = 0;
627
/* We must check the longest matching, if nmatch > 0. */
628
fl_longest_match = (nmatch != 0 || dfa->nbackref);
630
err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
631
preg->translate, (preg->syntax & RE_ICASE) != 0,
633
if (__glibc_unlikely (err != REG_NOERROR))
635
mctx.input.stop = stop;
636
mctx.input.raw_stop = stop;
637
mctx.input.newline_anchor = preg->newline_anchor;
639
err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
640
if (__glibc_unlikely (err != REG_NOERROR))
643
/* We will log all the DFA states through which the dfa pass,
644
if nmatch > 1, or this dfa has "multibyte node", which is a
645
back-reference or a node which can accept multibyte character or
646
multi character collating element. */
647
if (nmatch > 1 || dfa->has_mb_node)
649
/* Avoid overflow. */
650
if (__glibc_unlikely ((MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *))
651
<= mctx.input.bufs_len)))
657
mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
658
if (__glibc_unlikely (mctx.state_log == NULL))
666
mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
667
: CONTEXT_NEWLINE | CONTEXT_BEGBUF;
669
/* Check incrementally whether the input string matches. */
670
incr = (last_start < start) ? -1 : 1;
671
left_lim = (last_start < start) ? last_start : start;
672
right_lim = (last_start < start) ? start : last_start;
673
sb = dfa->mb_cur_max == 1;
676
? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
677
| (start <= last_start ? 2 : 0)
678
| (t != NULL ? 1 : 0))
681
for (;; match_first += incr)
684
if (match_first < left_lim || right_lim < match_first)
687
/* Advance as rapidly as possible through the string, until we
688
find a plausible place to start matching. This may be done
689
with varying efficiency, so there are various possibilities:
690
only the most common of them are specialized, in order to
691
save on code size. We use a switch statement for speed. */
699
/* Fastmap with single-byte translation, match forward. */
700
while (__glibc_likely (match_first < right_lim)
701
&& !fastmap[t[(unsigned char) string[match_first]]])
703
goto forward_match_found_start_or_reached_end;
706
/* Fastmap without translation, match forward. */
707
while (__glibc_likely (match_first < right_lim)
708
&& !fastmap[(unsigned char) string[match_first]])
711
forward_match_found_start_or_reached_end:
712
if (__glibc_unlikely (match_first == right_lim))
714
ch = match_first >= length
715
? 0 : (unsigned char) string[match_first];
716
if (!fastmap[t ? t[ch] : ch])
723
/* Fastmap without multi-byte translation, match backwards. */
724
while (match_first >= left_lim)
726
ch = match_first >= length
727
? 0 : (unsigned char) string[match_first];
728
if (fastmap[t ? t[ch] : ch])
732
if (match_first < left_lim)
737
/* In this case, we can't determine easily the current byte,
738
since it might be a component byte of a multibyte
739
character. Then we use the constructed buffer instead. */
742
/* If MATCH_FIRST is out of the valid range, reconstruct the
744
__re_size_t offset = match_first - mctx.input.raw_mbs_idx;
745
if (__glibc_unlikely (offset
746
>= (__re_size_t) mctx.input.valid_raw_len))
748
err = re_string_reconstruct (&mctx.input, match_first,
750
if (__glibc_unlikely (err != REG_NOERROR))
753
offset = match_first - mctx.input.raw_mbs_idx;
755
/* Use buffer byte if OFFSET is in buffer, otherwise '\0'. */
756
ch = (offset < mctx.input.valid_len
757
? re_string_byte_at (&mctx.input, offset) : 0);
761
if (match_first < left_lim || match_first > right_lim)
770
/* Reconstruct the buffers so that the matcher can assume that
771
the matching starts from the beginning of the buffer. */
772
err = re_string_reconstruct (&mctx.input, match_first, eflags);
773
if (__glibc_unlikely (err != REG_NOERROR))
776
/* Don't consider this char as a possible match start if it part,
777
yet isn't the head, of a multibyte character. */
778
if (!sb && !re_string_first_byte (&mctx.input, 0))
781
/* It seems to be appropriate one, then use the matcher. */
782
/* We assume that the matching starts from 0. */
783
mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
784
match_last = check_matching (&mctx, fl_longest_match,
785
start <= last_start ? &match_first : NULL);
786
if (match_last != -1)
788
if (__glibc_unlikely (match_last == -2))
795
mctx.match_last = match_last;
796
if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
798
re_dfastate_t *pstate = mctx.state_log[match_last];
799
mctx.last_node = check_halt_state_context (&mctx, pstate,
802
if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
805
err = prune_impossible_nodes (&mctx);
806
if (err == REG_NOERROR)
808
if (__glibc_unlikely (err != REG_NOMATCH))
813
break; /* We found a match. */
817
match_ctx_clean (&mctx);
820
DEBUG_ASSERT (match_last != -1);
821
DEBUG_ASSERT (err == REG_NOERROR);
823
/* Set pmatch[] if we need. */
828
/* Initialize registers. */
829
for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
830
pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
832
/* Set the points where matching start/end. */
834
pmatch[0].rm_eo = mctx.match_last;
835
/* FIXME: This function should fail if mctx.match_last exceeds
836
the maximum possible regoff_t value. We need a new error
837
code REG_OVERFLOW. */
839
if (!preg->no_sub && nmatch > 1)
841
err = set_regs (preg, &mctx, nmatch, pmatch,
842
dfa->has_plural_match && dfa->nbackref > 0);
843
if (__glibc_unlikely (err != REG_NOERROR))
847
/* At last, add the offset to each register, since we slid
848
the buffers so that we could assume that the matching starts
850
for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
851
if (pmatch[reg_idx].rm_so != -1)
853
if (__glibc_unlikely (mctx.input.offsets_needed != 0))
855
pmatch[reg_idx].rm_so =
856
(pmatch[reg_idx].rm_so == mctx.input.valid_len
857
? mctx.input.valid_raw_len
858
: mctx.input.offsets[pmatch[reg_idx].rm_so]);
859
pmatch[reg_idx].rm_eo =
860
(pmatch[reg_idx].rm_eo == mctx.input.valid_len
861
? mctx.input.valid_raw_len
862
: mctx.input.offsets[pmatch[reg_idx].rm_eo]);
864
pmatch[reg_idx].rm_so += match_first;
865
pmatch[reg_idx].rm_eo += match_first;
867
for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
869
pmatch[nmatch + reg_idx].rm_so = -1;
870
pmatch[nmatch + reg_idx].rm_eo = -1;
874
for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
875
if (dfa->subexp_map[reg_idx] != reg_idx)
877
pmatch[reg_idx + 1].rm_so
878
= pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
879
pmatch[reg_idx + 1].rm_eo
880
= pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
885
re_free (mctx.state_log);
887
match_ctx_free (&mctx);
888
re_string_destruct (&mctx.input);
893
__attribute_warn_unused_result__
894
prune_impossible_nodes (re_match_context_t *mctx)
896
const re_dfa_t *const dfa = mctx->dfa;
897
Idx halt_node, match_last;
899
re_dfastate_t **sifted_states;
900
re_dfastate_t **lim_states = NULL;
901
re_sift_context_t sctx;
902
DEBUG_ASSERT (mctx->state_log != NULL);
903
match_last = mctx->match_last;
904
halt_node = mctx->last_node;
906
/* Avoid overflow. */
907
if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *))
911
sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
912
if (__glibc_unlikely (sifted_states == NULL))
919
lim_states = re_malloc (re_dfastate_t *, match_last + 1);
920
if (__glibc_unlikely (lim_states == NULL))
927
memset (lim_states, '\0',
928
sizeof (re_dfastate_t *) * (match_last + 1));
929
sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
931
ret = sift_states_backward (mctx, &sctx);
932
re_node_set_free (&sctx.limits);
933
if (__glibc_unlikely (ret != REG_NOERROR))
935
if (sifted_states[0] != NULL || lim_states[0] != NULL)
945
} while (mctx->state_log[match_last] == NULL
946
|| !mctx->state_log[match_last]->halt);
947
halt_node = check_halt_state_context (mctx,
948
mctx->state_log[match_last],
951
ret = merge_state_array (dfa, sifted_states, lim_states,
953
re_free (lim_states);
955
if (__glibc_unlikely (ret != REG_NOERROR))
960
sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
961
ret = sift_states_backward (mctx, &sctx);
962
re_node_set_free (&sctx.limits);
963
if (__glibc_unlikely (ret != REG_NOERROR))
965
if (sifted_states[0] == NULL)
971
re_free (mctx->state_log);
972
mctx->state_log = sifted_states;
973
sifted_states = NULL;
974
mctx->last_node = halt_node;
975
mctx->match_last = match_last;
978
re_free (sifted_states);
979
re_free (lim_states);
/* Acquire an initial state and return it.
   We must select the appropriate initial state depending on the context,
   since initial states may have constraints like "\<", "^", etc.  */
987
static __always_inline re_dfastate_t *
988
acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
991
const re_dfa_t *const dfa = mctx->dfa;
992
if (dfa->init_state->has_constraint)
994
unsigned int context;
995
context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
996
if (IS_WORD_CONTEXT (context))
997
return dfa->init_state_word;
998
else if (IS_ORDINARY_CONTEXT (context))
999
return dfa->init_state;
1000
else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1001
return dfa->init_state_begbuf;
1002
else if (IS_NEWLINE_CONTEXT (context))
1003
return dfa->init_state_nl;
1004
else if (IS_BEGBUF_CONTEXT (context))
1006
/* It is relatively rare case, then calculate on demand. */
1007
return re_acquire_state_context (err, dfa,
1008
dfa->init_state->entrance_nodes,
1012
/* Must not happen? */
1013
return dfa->init_state;
1016
return dfa->init_state;
/* Check whether the regular expression matches the input string INPUT or not,
   and return the index where the matching ends.  Return -1 if
   there is no match, and return -2 in case of an error.
   FL_LONGEST_MATCH means we want the POSIX longest matching.
   If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
   next place where we may want to try matching.
   Note that the matcher assumes that the matching starts from the current
   index of the buffer.  */
1029
__attribute_warn_unused_result__
1030
check_matching (re_match_context_t *mctx, bool fl_longest_match,
1033
const re_dfa_t *const dfa = mctx->dfa;
1036
Idx match_last = -1;
1037
Idx cur_str_idx = re_string_cur_idx (&mctx->input);
1038
re_dfastate_t *cur_state;
1039
bool at_init_state = p_match_first != NULL;
1040
Idx next_start_idx = cur_str_idx;
1043
cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1044
/* An initial state must not be NULL (invalid). */
1045
if (__glibc_unlikely (cur_state == NULL))
1047
DEBUG_ASSERT (err == REG_ESPACE);
1051
if (mctx->state_log != NULL)
1053
mctx->state_log[cur_str_idx] = cur_state;
1055
/* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1056
later. E.g. Processing back references. */
1057
if (__glibc_unlikely (dfa->nbackref))
1059
at_init_state = false;
1060
err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1061
if (__glibc_unlikely (err != REG_NOERROR))
1064
if (cur_state->has_backref)
1066
err = transit_state_bkref (mctx, &cur_state->nodes);
1067
if (__glibc_unlikely (err != REG_NOERROR))
1073
/* If the RE accepts NULL string. */
1074
if (__glibc_unlikely (cur_state->halt))
1076
if (!cur_state->has_constraint
1077
|| check_halt_state_context (mctx, cur_state, cur_str_idx))
1079
if (!fl_longest_match)
1083
match_last = cur_str_idx;
1089
while (!re_string_eoi (&mctx->input))
1091
re_dfastate_t *old_state = cur_state;
1092
Idx next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1094
if ((__glibc_unlikely (next_char_idx >= mctx->input.bufs_len)
1095
&& mctx->input.bufs_len < mctx->input.len)
1096
|| (__glibc_unlikely (next_char_idx >= mctx->input.valid_len)
1097
&& mctx->input.valid_len < mctx->input.len))
1099
err = extend_buffers (mctx, next_char_idx + 1);
1100
if (__glibc_unlikely (err != REG_NOERROR))
1102
DEBUG_ASSERT (err == REG_ESPACE);
1107
cur_state = transit_state (&err, mctx, cur_state);
1108
if (mctx->state_log != NULL)
1109
cur_state = merge_state_with_log (&err, mctx, cur_state);
1111
if (cur_state == NULL)
1113
/* Reached the invalid state or an error. Try to recover a valid
1114
state using the state log, if available and if we have not
1115
already found a valid (even if not the longest) match. */
1116
if (__glibc_unlikely (err != REG_NOERROR))
1119
if (mctx->state_log == NULL
1120
|| (match && !fl_longest_match)
1121
|| (cur_state = find_recover_state (&err, mctx)) == NULL)
1125
if (__glibc_unlikely (at_init_state))
1127
if (old_state == cur_state)
1128
next_start_idx = next_char_idx;
1130
at_init_state = false;
1133
if (cur_state->halt)
1135
/* Reached a halt state.
1136
Check the halt state can satisfy the current context. */
1137
if (!cur_state->has_constraint
1138
|| check_halt_state_context (mctx, cur_state,
1139
re_string_cur_idx (&mctx->input)))
1141
/* We found an appropriate halt state. */
1142
match_last = re_string_cur_idx (&mctx->input);
1145
/* We found a match, do not modify match_first below. */
1146
p_match_first = NULL;
1147
if (!fl_longest_match)
1154
*p_match_first += next_start_idx;
/* Check whether NODE matches the current context.  */
check_halt_node_context (const re_dfa_t *dfa, Idx node, unsigned int context)
1164
re_token_type_t type = dfa->nodes[node].type;
1165
unsigned int constraint = dfa->nodes[node].constraint;
1166
if (type != END_OF_RE)
1170
if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1175
/* Check the halt state STATE match the current context.
1176
Return 0 if not match, if the node, STATE has, is a halt node and
1177
match the context, return the node. */
1180
check_halt_state_context (const re_match_context_t *mctx,
1181
const re_dfastate_t *state, Idx idx)
1184
unsigned int context;
1185
DEBUG_ASSERT (state->halt);
1186
context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1187
for (i = 0; i < state->nodes.nelem; ++i)
1188
if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1189
return state->nodes.elems[i];
1193
/* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1194
corresponding to the DFA).
1195
Return the destination node, and update EPS_VIA_NODES;
1196
return -1 on match failure, -2 on error. */
1199
proceed_next_node (const re_match_context_t *mctx, Idx nregs, regmatch_t *regs,
1200
regmatch_t *prevregs,
1201
Idx *pidx, Idx node, re_node_set *eps_via_nodes,
1202
struct re_fail_stack_t *fs)
1204
const re_dfa_t *const dfa = mctx->dfa;
1205
if (IS_EPSILON_NODE (dfa->nodes[node].type))
1207
re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1208
re_node_set *edests = &dfa->edests[node];
1210
if (! re_node_set_contains (eps_via_nodes, node))
1212
bool ok = re_node_set_insert (eps_via_nodes, node);
1213
if (__glibc_unlikely (! ok))
1217
/* Pick a valid destination, or return -1 if none is found. */
1219
for (Idx i = 0; i < edests->nelem; i++)
1221
Idx candidate = edests->elems[i];
1222
if (!re_node_set_contains (cur_nodes, candidate))
1224
if (dest_node == -1)
1225
dest_node = candidate;
1229
/* In order to avoid infinite loop like "(a*)*", return the second
1230
epsilon-transition if the first was already considered. */
1231
if (re_node_set_contains (eps_via_nodes, dest_node))
1234
/* Otherwise, push the second epsilon-transition on the fail stack. */
1236
&& push_fail_stack (fs, *pidx, candidate, nregs, regs,
1237
prevregs, eps_via_nodes))
1240
/* We know we are going to exit. */
1249
re_token_type_t type = dfa->nodes[node].type;
1251
if (dfa->nodes[node].accept_mb)
1252
naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1253
else if (type == OP_BACK_REF)
1255
Idx subexp_idx = dfa->nodes[node].opr.idx + 1;
1256
if (subexp_idx < nregs)
1257
naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1260
if (subexp_idx >= nregs
1261
|| regs[subexp_idx].rm_so == -1
1262
|| regs[subexp_idx].rm_eo == -1)
1266
char *buf = (char *) re_string_get_buffer (&mctx->input);
1267
if (mctx->input.valid_len - *pidx < naccepted
1268
|| (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1278
bool ok = re_node_set_insert (eps_via_nodes, node);
1279
if (__glibc_unlikely (! ok))
1281
dest_node = dfa->edests[node].elems[0];
1282
if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1289
|| check_node_accept (mctx, dfa->nodes + node, *pidx))
1291
Idx dest_node = dfa->nexts[node];
1292
*pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1293
if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1294
|| !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1297
re_node_set_empty (eps_via_nodes);
1304
static reg_errcode_t
1305
__attribute_warn_unused_result__
1306
push_fail_stack (struct re_fail_stack_t *fs, Idx str_idx, Idx dest_node,
1307
Idx nregs, regmatch_t *regs, regmatch_t *prevregs,
1308
re_node_set *eps_via_nodes)
1312
if (num == fs->alloc)
1314
struct re_fail_stack_ent_t *new_array;
1315
new_array = re_realloc (fs->stack, struct re_fail_stack_ent_t,
1317
if (new_array == NULL)
1320
fs->stack = new_array;
1322
fs->stack[num].idx = str_idx;
1323
fs->stack[num].node = dest_node;
1324
fs->stack[num].regs = re_malloc (regmatch_t, 2 * nregs);
1325
if (fs->stack[num].regs == NULL)
1328
memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1329
memcpy (fs->stack[num].regs + nregs, prevregs, sizeof (regmatch_t) * nregs);
1330
err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1335
pop_fail_stack (struct re_fail_stack_t *fs, Idx *pidx, Idx nregs,
1336
regmatch_t *regs, regmatch_t *prevregs,
1337
re_node_set *eps_via_nodes)
1339
if (fs == NULL || fs->num == 0)
1341
Idx num = --fs->num;
1342
*pidx = fs->stack[num].idx;
1343
memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1344
memcpy (prevregs, fs->stack[num].regs + nregs, sizeof (regmatch_t) * nregs);
1345
re_node_set_free (eps_via_nodes);
1346
re_free (fs->stack[num].regs);
1347
*eps_via_nodes = fs->stack[num].eps_via_nodes;
1348
DEBUG_ASSERT (0 <= fs->stack[num].node);
1349
return fs->stack[num].node;
1353
/* Type-safe dynamic array of regmatch_t (used by set_regs to snapshot
   register state), instantiated from the generic dynarray skeleton.  */
#define DYNARRAY_STRUCT  regmatch_list
#define DYNARRAY_ELEMENT regmatch_t
#define DYNARRAY_PREFIX  regmatch_list_
#include <malloc/dynarray-skeleton.c>
1358
/* Set the positions where the subexpressions are starts/ends to registers
1360
Note: We assume that pmatch[0] is already set, and
1361
pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */
1363
static reg_errcode_t
1364
__attribute_warn_unused_result__
1365
set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
1366
regmatch_t *pmatch, bool fl_backtrack)
1368
const re_dfa_t *dfa = preg->buffer;
1370
re_node_set eps_via_nodes;
1371
struct re_fail_stack_t *fs;
1372
struct re_fail_stack_t fs_body = { 0, 2, NULL };
1373
struct regmatch_list prev_match;
1374
regmatch_list_init (&prev_match);
1376
DEBUG_ASSERT (nmatch > 1);
1377
DEBUG_ASSERT (mctx->state_log != NULL);
1381
fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1382
if (fs->stack == NULL)
1388
cur_node = dfa->init_node;
1389
re_node_set_init_empty (&eps_via_nodes);
1391
if (!regmatch_list_resize (&prev_match, nmatch))
1393
regmatch_list_free (&prev_match);
1394
free_fail_stack_return (fs);
1397
regmatch_t *prev_idx_match = regmatch_list_begin (&prev_match);
1398
memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1400
for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1402
update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1404
if ((idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1405
|| (fs && re_node_set_contains (&eps_via_nodes, cur_node)))
1411
for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1412
if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1414
cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1415
prev_idx_match, &eps_via_nodes);
1421
re_node_set_free (&eps_via_nodes);
1422
regmatch_list_free (&prev_match);
1423
return free_fail_stack_return (fs);
1427
/* Proceed to next node. */
1428
cur_node = proceed_next_node (mctx, nmatch, pmatch, prev_idx_match,
1430
&eps_via_nodes, fs);
1432
if (__glibc_unlikely (cur_node < 0))
1434
if (__glibc_unlikely (cur_node == -2))
1436
re_node_set_free (&eps_via_nodes);
1437
regmatch_list_free (&prev_match);
1438
free_fail_stack_return (fs);
1441
cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1442
prev_idx_match, &eps_via_nodes);
1445
re_node_set_free (&eps_via_nodes);
1446
regmatch_list_free (&prev_match);
1447
free_fail_stack_return (fs);
1452
re_node_set_free (&eps_via_nodes);
1453
regmatch_list_free (&prev_match);
1454
return free_fail_stack_return (fs);
1457
static reg_errcode_t
1458
free_fail_stack_return (struct re_fail_stack_t *fs)
1463
for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1465
re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1466
re_free (fs->stack[fs_idx].regs);
1468
re_free (fs->stack);
1474
update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
1475
regmatch_t *prev_idx_match, Idx cur_node, Idx cur_idx, Idx nmatch)
1477
int type = dfa->nodes[cur_node].type;
1478
if (type == OP_OPEN_SUBEXP)
1480
Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1482
/* We are at the first node of this sub expression. */
1483
if (reg_num < nmatch)
1485
pmatch[reg_num].rm_so = cur_idx;
1486
pmatch[reg_num].rm_eo = -1;
1489
else if (type == OP_CLOSE_SUBEXP)
1491
/* We are at the last node of this sub expression. */
1492
Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1493
if (reg_num < nmatch)
1495
if (pmatch[reg_num].rm_so < cur_idx)
1497
pmatch[reg_num].rm_eo = cur_idx;
1498
/* This is a non-empty match or we are not inside an optional
1499
subexpression. Accept this right away. */
1500
memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1504
if (dfa->nodes[cur_node].opt_subexp
1505
&& prev_idx_match[reg_num].rm_so != -1)
1506
/* We transited through an empty match for an optional
1507
subexpression, like (a?)*, and this is not the subexp's
1508
first match. Copy back the old content of the registers
1509
so that matches of an inner subexpression are undone as
1510
well, like in ((a?))*. */
1511
memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1513
/* We completed a subexpression, but it may be part of
1514
an optional one, so do not update PREV_IDX_MATCH. */
1515
pmatch[reg_num].rm_eo = cur_idx;
1521
/* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1522
and sift the nodes in each states according to the following rules.
1523
Updated state_log will be wrote to STATE_LOG.
1525
Rules: We throw away the Node 'a' in the STATE_LOG[STR_IDX] if...
1526
1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1527
If 'a' isn't the LAST_NODE and 'a' can't epsilon transit to
1528
the LAST_NODE, we throw away the node 'a'.
1529
2. When 0 <= STR_IDX < MATCH_LAST and 'a' accepts
1530
string 's' and transit to 'b':
1531
i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1533
ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1534
thrown away, we throw away the node 'a'.
1535
3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1536
i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1538
ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1539
we throw away the node 'a'. */
1541
#define STATE_NODE_CONTAINS(state,node) \
1542
((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1544
static reg_errcode_t
1545
sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
1549
Idx str_idx = sctx->last_str_idx;
1550
re_node_set cur_dest;
1552
DEBUG_ASSERT (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1554
/* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1555
transit to the last_node and the last_node itself. */
1556
err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1557
if (__glibc_unlikely (err != REG_NOERROR))
1559
err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1560
if (__glibc_unlikely (err != REG_NOERROR))
1563
/* Then check each states in the state_log. */
1566
/* Update counters. */
1567
null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1568
if (null_cnt > mctx->max_mb_elem_len)
1570
memset (sctx->sifted_states, '\0',
1571
sizeof (re_dfastate_t *) * str_idx);
1572
re_node_set_free (&cur_dest);
1575
re_node_set_empty (&cur_dest);
1578
if (mctx->state_log[str_idx])
1580
err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1581
if (__glibc_unlikely (err != REG_NOERROR))
1585
/* Add all the nodes which satisfy the following conditions:
1586
- It can epsilon transit to a node in CUR_DEST.
1588
And update state_log. */
1589
err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1590
if (__glibc_unlikely (err != REG_NOERROR))
1595
re_node_set_free (&cur_dest);
1599
static reg_errcode_t
1600
__attribute_warn_unused_result__
1601
build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
1602
Idx str_idx, re_node_set *cur_dest)
1604
const re_dfa_t *const dfa = mctx->dfa;
1605
const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1608
/* Then build the next sifted state.
1609
We build the next sifted state on 'cur_dest', and update
1610
'sifted_states[str_idx]' with 'cur_dest'.
1612
'cur_dest' is the sifted state from 'state_log[str_idx + 1]'.
1613
'cur_src' points the node_set of the old 'state_log[str_idx]'
1614
(with the epsilon nodes pre-filtered out). */
1615
for (i = 0; i < cur_src->nelem; i++)
1617
Idx prev_node = cur_src->elems[i];
1620
DEBUG_ASSERT (!IS_EPSILON_NODE (dfa->nodes[prev_node].type));
1622
/* If the node may accept "multi byte". */
1623
if (dfa->nodes[prev_node].accept_mb)
1624
naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1625
str_idx, sctx->last_str_idx);
1627
/* We don't check backreferences here.
1628
See update_cur_sifted_state(). */
1630
&& check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1631
&& STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1632
dfa->nexts[prev_node]))
1638
if (sctx->limits.nelem)
1640
Idx to_idx = str_idx + naccepted;
1641
if (check_dst_limits (mctx, &sctx->limits,
1642
dfa->nexts[prev_node], to_idx,
1643
prev_node, str_idx))
1646
ok = re_node_set_insert (cur_dest, prev_node);
1647
if (__glibc_unlikely (! ok))
1654
/* Helper functions. */
1656
static reg_errcode_t
1657
clean_state_log_if_needed (re_match_context_t *mctx, Idx next_state_log_idx)
1659
Idx top = mctx->state_log_top;
1661
if ((next_state_log_idx >= mctx->input.bufs_len
1662
&& mctx->input.bufs_len < mctx->input.len)
1663
|| (next_state_log_idx >= mctx->input.valid_len
1664
&& mctx->input.valid_len < mctx->input.len))
1667
err = extend_buffers (mctx, next_state_log_idx + 1);
1668
if (__glibc_unlikely (err != REG_NOERROR))
1672
if (top < next_state_log_idx)
1674
DEBUG_ASSERT (mctx->state_log != NULL);
1675
memset (mctx->state_log + top + 1, '\0',
1676
sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1677
mctx->state_log_top = next_state_log_idx;
1682
static reg_errcode_t
1683
merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,
1684
re_dfastate_t **src, Idx num)
1688
for (st_idx = 0; st_idx < num; ++st_idx)
1690
if (dst[st_idx] == NULL)
1691
dst[st_idx] = src[st_idx];
1692
else if (src[st_idx] != NULL)
1694
re_node_set merged_set;
1695
err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1696
&src[st_idx]->nodes);
1697
if (__glibc_unlikely (err != REG_NOERROR))
1699
dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1700
re_node_set_free (&merged_set);
1701
if (__glibc_unlikely (err != REG_NOERROR))
1708
static reg_errcode_t
1709
update_cur_sifted_state (const re_match_context_t *mctx,
1710
re_sift_context_t *sctx, Idx str_idx,
1711
re_node_set *dest_nodes)
1713
const re_dfa_t *const dfa = mctx->dfa;
1714
reg_errcode_t err = REG_NOERROR;
1715
const re_node_set *candidates;
1716
candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1717
: &mctx->state_log[str_idx]->nodes);
1719
if (dest_nodes->nelem == 0)
1720
sctx->sifted_states[str_idx] = NULL;
1725
/* At first, add the nodes which can epsilon transit to a node in
1727
err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1728
if (__glibc_unlikely (err != REG_NOERROR))
1731
/* Then, check the limitations in the current sift_context. */
1732
if (sctx->limits.nelem)
1734
err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1735
mctx->bkref_ents, str_idx);
1736
if (__glibc_unlikely (err != REG_NOERROR))
1741
sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1742
if (__glibc_unlikely (err != REG_NOERROR))
1746
if (candidates && mctx->state_log[str_idx]->has_backref)
1748
err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1749
if (__glibc_unlikely (err != REG_NOERROR))
1755
static reg_errcode_t
1756
__attribute_warn_unused_result__
1757
add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,
1758
const re_node_set *candidates)
1760
reg_errcode_t err = REG_NOERROR;
1763
re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1764
if (__glibc_unlikely (err != REG_NOERROR))
1767
if (!state->inveclosure.alloc)
1769
err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1770
if (__glibc_unlikely (err != REG_NOERROR))
1772
for (i = 0; i < dest_nodes->nelem; i++)
1774
err = re_node_set_merge (&state->inveclosure,
1775
dfa->inveclosures + dest_nodes->elems[i]);
1776
if (__glibc_unlikely (err != REG_NOERROR))
1780
return re_node_set_add_intersect (dest_nodes, candidates,
1781
&state->inveclosure);
1784
static reg_errcode_t
1785
sub_epsilon_src_nodes (const re_dfa_t *dfa, Idx node, re_node_set *dest_nodes,
1786
const re_node_set *candidates)
1790
re_node_set *inv_eclosure = dfa->inveclosures + node;
1791
re_node_set except_nodes;
1792
re_node_set_init_empty (&except_nodes);
1793
for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1795
Idx cur_node = inv_eclosure->elems[ecl_idx];
1796
if (cur_node == node)
1798
if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1800
Idx edst1 = dfa->edests[cur_node].elems[0];
1801
Idx edst2 = ((dfa->edests[cur_node].nelem > 1)
1802
? dfa->edests[cur_node].elems[1] : -1);
1803
if ((!re_node_set_contains (inv_eclosure, edst1)
1804
&& re_node_set_contains (dest_nodes, edst1))
1806
&& !re_node_set_contains (inv_eclosure, edst2)
1807
&& re_node_set_contains (dest_nodes, edst2)))
1809
err = re_node_set_add_intersect (&except_nodes, candidates,
1810
dfa->inveclosures + cur_node);
1811
if (__glibc_unlikely (err != REG_NOERROR))
1813
re_node_set_free (&except_nodes);
1819
for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1821
Idx cur_node = inv_eclosure->elems[ecl_idx];
1822
if (!re_node_set_contains (&except_nodes, cur_node))
1824
Idx idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1825
re_node_set_remove_at (dest_nodes, idx);
1828
re_node_set_free (&except_nodes);
1833
check_dst_limits (const re_match_context_t *mctx, const re_node_set *limits,
1834
Idx dst_node, Idx dst_idx, Idx src_node, Idx src_idx)
1836
const re_dfa_t *const dfa = mctx->dfa;
1837
Idx lim_idx, src_pos, dst_pos;
1839
Idx dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1840
Idx src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1841
for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1844
struct re_backref_cache_entry *ent;
1845
ent = mctx->bkref_ents + limits->elems[lim_idx];
1846
subexp_idx = dfa->nodes[ent->node].opr.idx;
1848
dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1849
subexp_idx, dst_node, dst_idx,
1851
src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1852
subexp_idx, src_node, src_idx,
1856
<src> <dst> ( <subexp> )
1857
( <subexp> ) <src> <dst>
1858
( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
1859
if (src_pos == dst_pos)
1860
continue; /* This is unrelated limitation. */
1868
check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1869
Idx subexp_idx, Idx from_node, Idx bkref_idx)
1871
const re_dfa_t *const dfa = mctx->dfa;
1872
const re_node_set *eclosures = dfa->eclosures + from_node;
1875
/* Else, we are on the boundary: examine the nodes on the epsilon
1877
for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1879
Idx node = eclosures->elems[node_idx];
1880
switch (dfa->nodes[node].type)
1883
if (bkref_idx != -1)
1885
struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1891
if (ent->node != node)
1894
if (subexp_idx < BITSET_WORD_BITS
1895
&& !(ent->eps_reachable_subexps_map
1896
& ((bitset_word_t) 1 << subexp_idx)))
1899
/* Recurse trying to reach the OP_OPEN_SUBEXP and
1900
OP_CLOSE_SUBEXP cases below. But, if the
1901
destination node is the same node as the source
1902
node, don't recurse because it would cause an
1903
infinite loop: a regex that exhibits this behavior
1905
dst = dfa->edests[node].elems[0];
1906
if (dst == from_node)
1910
else /* if (boundaries & 2) */
1915
check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1917
if (cpos == -1 /* && (boundaries & 1) */)
1919
if (cpos == 0 && (boundaries & 2))
1922
if (subexp_idx < BITSET_WORD_BITS)
1923
ent->eps_reachable_subexps_map
1924
&= ~((bitset_word_t) 1 << subexp_idx);
1926
while (ent++->more);
1930
case OP_OPEN_SUBEXP:
1931
if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
1935
case OP_CLOSE_SUBEXP:
1936
if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
1945
return (boundaries & 2) ? 1 : 0;
1949
check_dst_limits_calc_pos (const re_match_context_t *mctx, Idx limit,
1950
Idx subexp_idx, Idx from_node, Idx str_idx,
1953
struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
1956
/* If we are outside the range of the subexpression, return -1 or 1. */
1957
if (str_idx < lim->subexp_from)
1960
if (lim->subexp_to < str_idx)
1963
/* If we are within the subexpression, return 0. */
1964
boundaries = (str_idx == lim->subexp_from);
1965
boundaries |= (str_idx == lim->subexp_to) << 1;
1966
if (boundaries == 0)
1969
/* Else, examine epsilon closure. */
1970
return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1971
from_node, bkref_idx);
1974
/* Check the limitations of sub expressions LIMITS, and remove the nodes
1975
which are against limitations from DEST_NODES. */
1977
static reg_errcode_t
1978
check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,
1979
const re_node_set *candidates, re_node_set *limits,
1980
struct re_backref_cache_entry *bkref_ents, Idx str_idx)
1983
Idx node_idx, lim_idx;
1985
for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1988
struct re_backref_cache_entry *ent;
1989
ent = bkref_ents + limits->elems[lim_idx];
1991
if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
1992
continue; /* This is unrelated limitation. */
1994
subexp_idx = dfa->nodes[ent->node].opr.idx;
1995
if (ent->subexp_to == str_idx)
1999
for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2001
Idx node = dest_nodes->elems[node_idx];
2002
re_token_type_t type = dfa->nodes[node].type;
2003
if (type == OP_OPEN_SUBEXP
2004
&& subexp_idx == dfa->nodes[node].opr.idx)
2006
else if (type == OP_CLOSE_SUBEXP
2007
&& subexp_idx == dfa->nodes[node].opr.idx)
2011
/* Check the limitation of the open subexpression. */
2012
/* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
2015
err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2017
if (__glibc_unlikely (err != REG_NOERROR))
2021
/* Check the limitation of the close subexpression. */
2023
for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2025
Idx node = dest_nodes->elems[node_idx];
2026
if (!re_node_set_contains (dfa->inveclosures + node,
2028
&& !re_node_set_contains (dfa->eclosures + node,
2031
/* It is against this limitation.
2032
Remove it form the current sifted state. */
2033
err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2035
if (__glibc_unlikely (err != REG_NOERROR))
2041
else /* (ent->subexp_to != str_idx) */
2043
for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2045
Idx node = dest_nodes->elems[node_idx];
2046
re_token_type_t type = dfa->nodes[node].type;
2047
if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2049
if (subexp_idx != dfa->nodes[node].opr.idx)
2051
/* It is against this limitation.
2052
Remove it form the current sifted state. */
2053
err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2055
if (__glibc_unlikely (err != REG_NOERROR))
2064
static reg_errcode_t
2065
__attribute_warn_unused_result__
2066
sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,
2067
Idx str_idx, const re_node_set *candidates)
2069
const re_dfa_t *const dfa = mctx->dfa;
2072
re_sift_context_t local_sctx;
2073
Idx first_idx = search_cur_bkref_entry (mctx, str_idx);
2075
if (first_idx == -1)
2078
local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */
2080
for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2083
re_token_type_t type;
2084
struct re_backref_cache_entry *entry;
2085
node = candidates->elems[node_idx];
2086
type = dfa->nodes[node].type;
2087
/* Avoid infinite loop for the REs like "()\1+". */
2088
if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2090
if (type != OP_BACK_REF)
2093
entry = mctx->bkref_ents + first_idx;
2094
enabled_idx = first_idx;
2101
re_dfastate_t *cur_state;
2103
if (entry->node != node)
2105
subexp_len = entry->subexp_to - entry->subexp_from;
2106
to_idx = str_idx + subexp_len;
2107
dst_node = (subexp_len ? dfa->nexts[node]
2108
: dfa->edests[node].elems[0]);
2110
if (to_idx > sctx->last_str_idx
2111
|| sctx->sifted_states[to_idx] == NULL
2112
|| !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2113
|| check_dst_limits (mctx, &sctx->limits, node,
2114
str_idx, dst_node, to_idx))
2117
if (local_sctx.sifted_states == NULL)
2120
err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2121
if (__glibc_unlikely (err != REG_NOERROR))
2124
local_sctx.last_node = node;
2125
local_sctx.last_str_idx = str_idx;
2126
ok = re_node_set_insert (&local_sctx.limits, enabled_idx);
2127
if (__glibc_unlikely (! ok))
2132
cur_state = local_sctx.sifted_states[str_idx];
2133
err = sift_states_backward (mctx, &local_sctx);
2134
if (__glibc_unlikely (err != REG_NOERROR))
2136
if (sctx->limited_states != NULL)
2138
err = merge_state_array (dfa, sctx->limited_states,
2139
local_sctx.sifted_states,
2141
if (__glibc_unlikely (err != REG_NOERROR))
2144
local_sctx.sifted_states[str_idx] = cur_state;
2145
re_node_set_remove (&local_sctx.limits, enabled_idx);
2147
/* mctx->bkref_ents may have changed, reload the pointer. */
2148
entry = mctx->bkref_ents + enabled_idx;
2150
while (enabled_idx++, entry++->more);
2154
if (local_sctx.sifted_states != NULL)
2156
re_node_set_free (&local_sctx.limits);
2164
sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2165
Idx node_idx, Idx str_idx, Idx max_str_idx)
2167
const re_dfa_t *const dfa = mctx->dfa;
2169
/* Check the node can accept "multi byte". */
2170
naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2171
if (naccepted > 0 && str_idx + naccepted <= max_str_idx
2172
&& !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2173
dfa->nexts[node_idx]))
2174
/* The node can't accept the "multi byte", or the
2175
destination was already thrown away, then the node
2176
couldn't accept the current input "multi byte". */
2178
/* Otherwise, it is sure that the node could accept
2179
'naccepted' bytes input. */
2183
/* Functions for state transition. */
2185
/* Return the next state to which the current state STATE will transit by
2186
accepting the current input byte, and update STATE_LOG if necessary.
2187
Return NULL on failure.
2188
If STATE can accept a multibyte char/collating element/back reference
2189
update the destination of STATE_LOG. */
2191
static re_dfastate_t *
2192
__attribute_warn_unused_result__
2193
transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2194
re_dfastate_t *state)
2196
re_dfastate_t **trtable;
2199
/* If the current state can accept multibyte. */
2200
if (__glibc_unlikely (state->accept_mb))
2202
*err = transit_state_mb (mctx, state);
2203
if (__glibc_unlikely (*err != REG_NOERROR))
2207
/* Then decide the next state with the single byte. */
2210
/* don't use transition table */
2211
return transit_state_sb (err, mctx, state);
2214
/* Use transition table */
2215
ch = re_string_fetch_byte (&mctx->input);
2218
trtable = state->trtable;
2219
if (__glibc_likely (trtable != NULL))
2222
trtable = state->word_trtable;
2223
if (__glibc_likely (trtable != NULL))
2225
unsigned int context;
2227
= re_string_context_at (&mctx->input,
2228
re_string_cur_idx (&mctx->input) - 1,
2230
if (IS_WORD_CONTEXT (context))
2231
return trtable[ch + SBC_MAX];
2236
if (!build_trtable (mctx->dfa, state))
2242
/* Retry, we now have a transition table. */
2246
/* Update the state_log if we need */
2247
static re_dfastate_t *
2248
merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2249
re_dfastate_t *next_state)
2251
const re_dfa_t *const dfa = mctx->dfa;
2252
Idx cur_idx = re_string_cur_idx (&mctx->input);
2254
if (cur_idx > mctx->state_log_top)
2256
mctx->state_log[cur_idx] = next_state;
2257
mctx->state_log_top = cur_idx;
2259
else if (mctx->state_log[cur_idx] == 0)
2261
mctx->state_log[cur_idx] = next_state;
2265
re_dfastate_t *pstate;
2266
unsigned int context;
2267
re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2268
/* If (state_log[cur_idx] != 0), it implies that cur_idx is
2269
the destination of a multibyte char/collating element/
2270
back reference. Then the next state is the union set of
2271
these destinations and the results of the transition table. */
2272
pstate = mctx->state_log[cur_idx];
2273
log_nodes = pstate->entrance_nodes;
2274
if (next_state != NULL)
2276
table_nodes = next_state->entrance_nodes;
2277
*err = re_node_set_init_union (&next_nodes, table_nodes,
2279
if (__glibc_unlikely (*err != REG_NOERROR))
2283
next_nodes = *log_nodes;
2284
/* Note: We already add the nodes of the initial state,
2285
then we don't need to add them here. */
2287
context = re_string_context_at (&mctx->input,
2288
re_string_cur_idx (&mctx->input) - 1,
2290
next_state = mctx->state_log[cur_idx]
2291
= re_acquire_state_context (err, dfa, &next_nodes, context);
2292
/* We don't need to check errors here, since the return value of
2293
this function is next_state and ERR is already set. */
2295
if (table_nodes != NULL)
2296
re_node_set_free (&next_nodes);
2299
if (__glibc_unlikely (dfa->nbackref) && next_state != NULL)
2301
/* Check OP_OPEN_SUBEXP in the current state in case that we use them
2302
later. We must check them here, since the back references in the
2303
next state might use them. */
2304
*err = check_subexp_matching_top (mctx, &next_state->nodes,
2306
if (__glibc_unlikely (*err != REG_NOERROR))
2309
/* If the next state has back references. */
2310
if (next_state->has_backref)
2312
*err = transit_state_bkref (mctx, &next_state->nodes);
2313
if (__glibc_unlikely (*err != REG_NOERROR))
2315
next_state = mctx->state_log[cur_idx];
2322
/* Skip bytes in the input that correspond to part of a
2323
multi-byte match, then look in the log for a state
2324
from which to restart matching. */
2325
static re_dfastate_t *
2326
find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2328
re_dfastate_t *cur_state;
2331
Idx max = mctx->state_log_top;
2332
Idx cur_str_idx = re_string_cur_idx (&mctx->input);
2336
if (++cur_str_idx > max)
2338
re_string_skip_bytes (&mctx->input, 1);
2340
while (mctx->state_log[cur_str_idx] == NULL);
2342
cur_state = merge_state_with_log (err, mctx, NULL);
2344
while (*err == REG_NOERROR && cur_state == NULL);
2348
/* Helper functions for transit_state. */
2350
/* From the node set CUR_NODES, pick up the nodes whose types are
2351
OP_OPEN_SUBEXP and which have corresponding back references in the regular
2352
expression. And register them to use them later for evaluating the
2353
corresponding back references. */
2355
static reg_errcode_t
2356
check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2359
const re_dfa_t *const dfa = mctx->dfa;
2363
/* TODO: This isn't efficient.
2364
Because there might be more than one nodes whose types are
2365
OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2368
for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2370
Idx node = cur_nodes->elems[node_idx];
2371
if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2372
&& dfa->nodes[node].opr.idx < BITSET_WORD_BITS
2373
&& (dfa->used_bkref_map
2374
& ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
2376
err = match_ctx_add_subtop (mctx, node, str_idx);
2377
if (__glibc_unlikely (err != REG_NOERROR))
2385
/* Return the next state to which the current state STATE will transit by
2386
accepting the current input byte. Return NULL on failure. */
2388
static re_dfastate_t *
2389
transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
2390
re_dfastate_t *state)
2392
const re_dfa_t *const dfa = mctx->dfa;
2393
re_node_set next_nodes;
2394
re_dfastate_t *next_state;
2395
Idx node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2396
unsigned int context;
2398
*err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2399
if (__glibc_unlikely (*err != REG_NOERROR))
2401
for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2403
Idx cur_node = state->nodes.elems[node_cnt];
2404
if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2406
*err = re_node_set_merge (&next_nodes,
2407
dfa->eclosures + dfa->nexts[cur_node]);
2408
if (__glibc_unlikely (*err != REG_NOERROR))
2410
re_node_set_free (&next_nodes);
2415
context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2416
next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2417
/* We don't need to check errors here, since the return value of
2418
this function is next_state and ERR is already set. */
2420
re_node_set_free (&next_nodes);
2421
re_string_skip_bytes (&mctx->input, 1);
2426
static reg_errcode_t
2427
transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2429
const re_dfa_t *const dfa = mctx->dfa;
2433
for (i = 0; i < pstate->nodes.nelem; ++i)
2435
re_node_set dest_nodes, *new_nodes;
2436
Idx cur_node_idx = pstate->nodes.elems[i];
2439
unsigned int context;
2440
re_dfastate_t *dest_state;
2442
if (!dfa->nodes[cur_node_idx].accept_mb)
2445
if (dfa->nodes[cur_node_idx].constraint)
2447
context = re_string_context_at (&mctx->input,
2448
re_string_cur_idx (&mctx->input),
2450
if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2455
/* How many bytes the node can accept? */
2456
naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2457
re_string_cur_idx (&mctx->input));
2461
/* The node can accepts 'naccepted' bytes. */
2462
dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2463
mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2464
: mctx->max_mb_elem_len);
2465
err = clean_state_log_if_needed (mctx, dest_idx);
2466
if (__glibc_unlikely (err != REG_NOERROR))
2468
DEBUG_ASSERT (dfa->nexts[cur_node_idx] != -1);
2469
new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2471
dest_state = mctx->state_log[dest_idx];
2472
if (dest_state == NULL)
2473
dest_nodes = *new_nodes;
2476
err = re_node_set_init_union (&dest_nodes,
2477
dest_state->entrance_nodes, new_nodes);
2478
if (__glibc_unlikely (err != REG_NOERROR))
2481
context = re_string_context_at (&mctx->input, dest_idx - 1,
2483
mctx->state_log[dest_idx]
2484
= re_acquire_state_context (&err, dfa, &dest_nodes, context);
2485
if (dest_state != NULL)
2486
re_node_set_free (&dest_nodes);
2487
if (__glibc_unlikely (mctx->state_log[dest_idx] == NULL
2488
&& err != REG_NOERROR))
2494
static reg_errcode_t
2495
transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2497
const re_dfa_t *const dfa = mctx->dfa;
2500
Idx cur_str_idx = re_string_cur_idx (&mctx->input);
2502
for (i = 0; i < nodes->nelem; ++i)
2504
Idx dest_str_idx, prev_nelem, bkc_idx;
2505
Idx node_idx = nodes->elems[i];
2506
unsigned int context;
2507
const re_token_t *node = dfa->nodes + node_idx;
2508
re_node_set *new_dest_nodes;
2510
/* Check whether 'node' is a backreference or not. */
2511
if (node->type != OP_BACK_REF)
2514
if (node->constraint)
2516
context = re_string_context_at (&mctx->input, cur_str_idx,
2518
if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2522
/* 'node' is a backreference.
2523
Check the substring which the substring matched. */
2524
bkc_idx = mctx->nbkref_ents;
2525
err = get_subexp (mctx, node_idx, cur_str_idx);
2526
if (__glibc_unlikely (err != REG_NOERROR))
2529
/* And add the epsilon closures (which is 'new_dest_nodes') of
2530
the backreference to appropriate state_log. */
2531
DEBUG_ASSERT (dfa->nexts[node_idx] != -1);
2532
for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2535
re_dfastate_t *dest_state;
2536
struct re_backref_cache_entry *bkref_ent;
2537
bkref_ent = mctx->bkref_ents + bkc_idx;
2538
if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2540
subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2541
new_dest_nodes = (subexp_len == 0
2542
? dfa->eclosures + dfa->edests[node_idx].elems[0]
2543
: dfa->eclosures + dfa->nexts[node_idx]);
2544
dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2545
- bkref_ent->subexp_from);
2546
context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2548
dest_state = mctx->state_log[dest_str_idx];
2549
prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2550
: mctx->state_log[cur_str_idx]->nodes.nelem);
2551
/* Add 'new_dest_node' to state_log. */
2552
if (dest_state == NULL)
2554
mctx->state_log[dest_str_idx]
2555
= re_acquire_state_context (&err, dfa, new_dest_nodes,
2557
if (__glibc_unlikely (mctx->state_log[dest_str_idx] == NULL
2558
&& err != REG_NOERROR))
2563
re_node_set dest_nodes;
2564
err = re_node_set_init_union (&dest_nodes,
2565
dest_state->entrance_nodes,
2567
if (__glibc_unlikely (err != REG_NOERROR))
2569
re_node_set_free (&dest_nodes);
2572
mctx->state_log[dest_str_idx]
2573
= re_acquire_state_context (&err, dfa, &dest_nodes, context);
2574
re_node_set_free (&dest_nodes);
2575
if (__glibc_unlikely (mctx->state_log[dest_str_idx] == NULL
2576
&& err != REG_NOERROR))
2579
/* We need to check recursively if the backreference can epsilon
2582
&& mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2584
err = check_subexp_matching_top (mctx, new_dest_nodes,
2586
if (__glibc_unlikely (err != REG_NOERROR))
2588
err = transit_state_bkref (mctx, new_dest_nodes);
2589
if (__glibc_unlikely (err != REG_NOERROR))
2599
/* Enumerate all the candidates which the backreference BKREF_NODE can match
2600
at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2601
Note that we might collect inappropriate candidates here.
2602
However, the cost of checking them strictly here is too high, then we
2603
delay these checking for prune_impossible_nodes(). */
2605
static reg_errcode_t
2606
__attribute_warn_unused_result__
2607
get_subexp (re_match_context_t *mctx, Idx bkref_node, Idx bkref_str_idx)
2609
const re_dfa_t *const dfa = mctx->dfa;
2610
Idx subexp_num, sub_top_idx;
2611
const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2612
/* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
2613
Idx cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2614
if (cache_idx != -1)
2616
const struct re_backref_cache_entry *entry
2617
= mctx->bkref_ents + cache_idx;
2619
if (entry->node == bkref_node)
2620
return REG_NOERROR; /* We already checked it. */
2621
while (entry++->more);
2624
subexp_num = dfa->nodes[bkref_node].opr.idx;
2626
/* For each sub expression */
2627
for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2630
re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2631
re_sub_match_last_t *sub_last;
2632
Idx sub_last_idx, sl_str, bkref_str_off;
2634
if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2635
continue; /* It isn't related. */
2637
sl_str = sub_top->str_idx;
2638
bkref_str_off = bkref_str_idx;
2639
/* At first, check the last node of sub expressions we already
2641
for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2643
regoff_t sl_str_diff;
2644
sub_last = sub_top->lasts[sub_last_idx];
2645
sl_str_diff = sub_last->str_idx - sl_str;
2646
/* The matched string by the sub expression match with the substring
2647
at the back reference? */
2648
if (sl_str_diff > 0)
2650
if (__glibc_unlikely (bkref_str_off + sl_str_diff
2651
> mctx->input.valid_len))
2653
/* Not enough chars for a successful match. */
2654
if (bkref_str_off + sl_str_diff > mctx->input.len)
2657
err = clean_state_log_if_needed (mctx,
2660
if (__glibc_unlikely (err != REG_NOERROR))
2662
buf = (const char *) re_string_get_buffer (&mctx->input);
2664
if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2665
/* We don't need to search this sub expression any more. */
2668
bkref_str_off += sl_str_diff;
2669
sl_str += sl_str_diff;
2670
err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2673
/* Reload buf, since the preceding call might have reallocated
2675
buf = (const char *) re_string_get_buffer (&mctx->input);
2677
if (err == REG_NOMATCH)
2679
if (__glibc_unlikely (err != REG_NOERROR))
2683
if (sub_last_idx < sub_top->nlasts)
2685
if (sub_last_idx > 0)
2687
/* Then, search for the other last nodes of the sub expression. */
2688
for (; sl_str <= bkref_str_idx; ++sl_str)
2691
regoff_t sl_str_off;
2692
const re_node_set *nodes;
2693
sl_str_off = sl_str - sub_top->str_idx;
2694
/* The matched string by the sub expression match with the substring
2695
at the back reference? */
2698
if (__glibc_unlikely (bkref_str_off >= mctx->input.valid_len))
2700
/* If we are at the end of the input, we cannot match. */
2701
if (bkref_str_off >= mctx->input.len)
2704
err = extend_buffers (mctx, bkref_str_off + 1);
2705
if (__glibc_unlikely (err != REG_NOERROR))
2708
buf = (const char *) re_string_get_buffer (&mctx->input);
2710
if (buf [bkref_str_off++] != buf[sl_str - 1])
2711
break; /* We don't need to search this sub expression
2714
if (mctx->state_log[sl_str] == NULL)
2716
/* Does this state have a ')' of the sub expression? */
2717
nodes = &mctx->state_log[sl_str]->nodes;
2718
cls_node = find_subexp_node (dfa, nodes, subexp_num,
2722
if (sub_top->path == NULL)
2724
sub_top->path = calloc (sizeof (state_array_t),
2725
sl_str - sub_top->str_idx + 1);
2726
if (sub_top->path == NULL)
2729
/* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2730
in the current context? */
2731
err = check_arrival (mctx, sub_top->path, sub_top->node,
2732
sub_top->str_idx, cls_node, sl_str,
2734
if (err == REG_NOMATCH)
2736
if (__glibc_unlikely (err != REG_NOERROR))
2738
sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2739
if (__glibc_unlikely (sub_last == NULL))
2741
err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2743
buf = (const char *) re_string_get_buffer (&mctx->input);
2744
if (err == REG_NOMATCH)
2746
if (__glibc_unlikely (err != REG_NOERROR))
2753
/* Helper functions for get_subexp(). */
2755
/* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2756
If it can arrive, register the sub expression expressed with SUB_TOP
2759
static reg_errcode_t
2760
get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2761
re_sub_match_last_t *sub_last, Idx bkref_node, Idx bkref_str)
2765
/* Can the subexpression arrive the back reference? */
2766
err = check_arrival (mctx, &sub_last->path, sub_last->node,
2767
sub_last->str_idx, bkref_node, bkref_str,
2769
if (err != REG_NOERROR)
2771
err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2773
if (__glibc_unlikely (err != REG_NOERROR))
2775
to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2776
return clean_state_log_if_needed (mctx, to_idx);
2779
/* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2780
Search '(' if FL_OPEN, or search ')' otherwise.
2781
TODO: This function isn't efficient...
2782
Because there might be more than one nodes whose types are
2783
OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2788
find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2789
Idx subexp_idx, int type)
2792
for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2794
Idx cls_node = nodes->elems[cls_idx];
2795
const re_token_t *node = dfa->nodes + cls_node;
2796
if (node->type == type
2797
&& node->opr.idx == subexp_idx)
2803
/* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2804
LAST_NODE at LAST_STR. We record the path onto PATH since it will be
2806
Return REG_NOERROR if it can arrive, REG_NOMATCH if it cannot,
2807
REG_ESPACE if memory is exhausted. */
2809
static reg_errcode_t
2810
__attribute_warn_unused_result__
2811
check_arrival (re_match_context_t *mctx, state_array_t *path, Idx top_node,
2812
Idx top_str, Idx last_node, Idx last_str, int type)
2814
const re_dfa_t *const dfa = mctx->dfa;
2815
reg_errcode_t err = REG_NOERROR;
2816
Idx subexp_num, backup_cur_idx, str_idx, null_cnt;
2817
re_dfastate_t *cur_state = NULL;
2818
re_node_set *cur_nodes, next_nodes;
2819
re_dfastate_t **backup_state_log;
2820
unsigned int context;
2822
subexp_num = dfa->nodes[top_node].opr.idx;
2823
/* Extend the buffer if we need. */
2824
if (__glibc_unlikely (path->alloc < last_str + mctx->max_mb_elem_len + 1))
2826
re_dfastate_t **new_array;
2827
Idx old_alloc = path->alloc;
2828
Idx incr_alloc = last_str + mctx->max_mb_elem_len + 1;
2830
if (__glibc_unlikely (IDX_MAX - old_alloc < incr_alloc))
2832
new_alloc = old_alloc + incr_alloc;
2833
if (__glibc_unlikely (SIZE_MAX / sizeof (re_dfastate_t *) < new_alloc))
2835
new_array = re_realloc (path->array, re_dfastate_t *, new_alloc);
2836
if (__glibc_unlikely (new_array == NULL))
2838
path->array = new_array;
2839
path->alloc = new_alloc;
2840
memset (new_array + old_alloc, '\0',
2841
sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2844
str_idx = path->next_idx ? path->next_idx : top_str;
2846
/* Temporary modify MCTX. */
2847
backup_state_log = mctx->state_log;
2848
backup_cur_idx = mctx->input.cur_idx;
2849
mctx->state_log = path->array;
2850
mctx->input.cur_idx = str_idx;
2852
/* Setup initial node set. */
2853
context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2854
if (str_idx == top_str)
2856
err = re_node_set_init_1 (&next_nodes, top_node);
2857
if (__glibc_unlikely (err != REG_NOERROR))
2859
err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2860
if (__glibc_unlikely (err != REG_NOERROR))
2862
re_node_set_free (&next_nodes);
2868
cur_state = mctx->state_log[str_idx];
2869
if (cur_state && cur_state->has_backref)
2871
err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2872
if (__glibc_unlikely (err != REG_NOERROR))
2876
re_node_set_init_empty (&next_nodes);
2878
if (str_idx == top_str || (cur_state && cur_state->has_backref))
2880
if (next_nodes.nelem)
2882
err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2884
if (__glibc_unlikely (err != REG_NOERROR))
2886
re_node_set_free (&next_nodes);
2890
cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2891
if (__glibc_unlikely (cur_state == NULL && err != REG_NOERROR))
2893
re_node_set_free (&next_nodes);
2896
mctx->state_log[str_idx] = cur_state;
2899
for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2901
re_node_set_empty (&next_nodes);
2902
if (mctx->state_log[str_idx + 1])
2904
err = re_node_set_merge (&next_nodes,
2905
&mctx->state_log[str_idx + 1]->nodes);
2906
if (__glibc_unlikely (err != REG_NOERROR))
2908
re_node_set_free (&next_nodes);
2914
err = check_arrival_add_next_nodes (mctx, str_idx,
2915
&cur_state->non_eps_nodes,
2917
if (__glibc_unlikely (err != REG_NOERROR))
2919
re_node_set_free (&next_nodes);
2924
if (next_nodes.nelem)
2926
err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2927
if (__glibc_unlikely (err != REG_NOERROR))
2929
re_node_set_free (&next_nodes);
2932
err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2934
if (__glibc_unlikely (err != REG_NOERROR))
2936
re_node_set_free (&next_nodes);
2940
context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2941
cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2942
if (__glibc_unlikely (cur_state == NULL && err != REG_NOERROR))
2944
re_node_set_free (&next_nodes);
2947
mctx->state_log[str_idx] = cur_state;
2948
null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
2950
re_node_set_free (&next_nodes);
2951
cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
2952
: &mctx->state_log[last_str]->nodes);
2953
path->next_idx = str_idx;
2956
mctx->state_log = backup_state_log;
2957
mctx->input.cur_idx = backup_cur_idx;
2959
/* Then check the current node set has the node LAST_NODE. */
2960
if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
2966
/* Helper functions for check_arrival. */
2968
/* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
2970
TODO: This function is similar to the functions transit_state*(),
2971
however this function has many additional works.
2972
Can't we unify them? */
2974
static reg_errcode_t
2975
__attribute_warn_unused_result__
2976
check_arrival_add_next_nodes (re_match_context_t *mctx, Idx str_idx,
2977
re_node_set *cur_nodes, re_node_set *next_nodes)
2979
const re_dfa_t *const dfa = mctx->dfa;
2982
reg_errcode_t err = REG_NOERROR;
2983
re_node_set union_set;
2984
re_node_set_init_empty (&union_set);
2985
for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
2988
Idx cur_node = cur_nodes->elems[cur_idx];
2989
DEBUG_ASSERT (!IS_EPSILON_NODE (dfa->nodes[cur_node].type));
2991
/* If the node may accept "multi byte". */
2992
if (dfa->nodes[cur_node].accept_mb)
2994
naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
2998
re_dfastate_t *dest_state;
2999
Idx next_node = dfa->nexts[cur_node];
3000
Idx next_idx = str_idx + naccepted;
3001
dest_state = mctx->state_log[next_idx];
3002
re_node_set_empty (&union_set);
3005
err = re_node_set_merge (&union_set, &dest_state->nodes);
3006
if (__glibc_unlikely (err != REG_NOERROR))
3008
re_node_set_free (&union_set);
3012
ok = re_node_set_insert (&union_set, next_node);
3013
if (__glibc_unlikely (! ok))
3015
re_node_set_free (&union_set);
3018
mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3020
if (__glibc_unlikely (mctx->state_log[next_idx] == NULL
3021
&& err != REG_NOERROR))
3023
re_node_set_free (&union_set);
3030
|| check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3032
ok = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3033
if (__glibc_unlikely (! ok))
3035
re_node_set_free (&union_set);
3040
re_node_set_free (&union_set);
3044
/* For all the nodes in CUR_NODES, add the epsilon closures of them to
3045
CUR_NODES, however exclude the nodes which are:
3046
- inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3047
- out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3050
static reg_errcode_t
3051
check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
3052
Idx ex_subexp, int type)
3055
Idx idx, outside_node;
3056
re_node_set new_nodes;
3057
DEBUG_ASSERT (cur_nodes->nelem);
3058
err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3059
if (__glibc_unlikely (err != REG_NOERROR))
3061
/* Create a new node set NEW_NODES with the nodes which are epsilon
3062
closures of the node in CUR_NODES. */
3064
for (idx = 0; idx < cur_nodes->nelem; ++idx)
3066
Idx cur_node = cur_nodes->elems[idx];
3067
const re_node_set *eclosure = dfa->eclosures + cur_node;
3068
outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3069
if (outside_node == -1)
3071
/* There are no problematic nodes, just merge them. */
3072
err = re_node_set_merge (&new_nodes, eclosure);
3073
if (__glibc_unlikely (err != REG_NOERROR))
3075
re_node_set_free (&new_nodes);
3081
/* There are problematic nodes, re-calculate incrementally. */
3082
err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3084
if (__glibc_unlikely (err != REG_NOERROR))
3086
re_node_set_free (&new_nodes);
3091
re_node_set_free (cur_nodes);
3092
*cur_nodes = new_nodes;
3096
/* Helper function for check_arrival_expand_ecl.
3097
Check incrementally the epsilon closure of TARGET, and if it isn't
3098
problematic append it to DST_NODES. */
3100
static reg_errcode_t
3101
__attribute_warn_unused_result__
3102
check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,
3103
Idx target, Idx ex_subexp, int type)
3106
for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3110
if (dfa->nodes[cur_node].type == type
3111
&& dfa->nodes[cur_node].opr.idx == ex_subexp)
3113
if (type == OP_CLOSE_SUBEXP)
3115
ok = re_node_set_insert (dst_nodes, cur_node);
3116
if (__glibc_unlikely (! ok))
3121
ok = re_node_set_insert (dst_nodes, cur_node);
3122
if (__glibc_unlikely (! ok))
3124
if (dfa->edests[cur_node].nelem == 0)
3126
if (dfa->edests[cur_node].nelem == 2)
3129
err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3130
dfa->edests[cur_node].elems[1],
3132
if (__glibc_unlikely (err != REG_NOERROR))
3135
cur_node = dfa->edests[cur_node].elems[0];
3141
/* For all the back references in the current state, calculate the
3142
destination of the back references by the appropriate entry
3143
in MCTX->BKREF_ENTS. */
3145
static reg_errcode_t
3146
__attribute_warn_unused_result__
3147
expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3148
Idx cur_str, Idx subexp_num, int type)
3150
const re_dfa_t *const dfa = mctx->dfa;
3152
Idx cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3153
struct re_backref_cache_entry *ent;
3155
if (cache_idx_start == -1)
3159
ent = mctx->bkref_ents + cache_idx_start;
3162
Idx to_idx, next_node;
3164
/* Is this entry ENT is appropriate? */
3165
if (!re_node_set_contains (cur_nodes, ent->node))
3168
to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3169
/* Calculate the destination of the back reference, and append it
3170
to MCTX->STATE_LOG. */
3171
if (to_idx == cur_str)
3173
/* The backreference did epsilon transit, we must re-check all the
3174
node in the current state. */
3175
re_node_set new_dests;
3176
reg_errcode_t err2, err3;
3177
next_node = dfa->edests[ent->node].elems[0];
3178
if (re_node_set_contains (cur_nodes, next_node))
3180
err = re_node_set_init_1 (&new_dests, next_node);
3181
err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3182
err3 = re_node_set_merge (cur_nodes, &new_dests);
3183
re_node_set_free (&new_dests);
3184
if (__glibc_unlikely (err != REG_NOERROR || err2 != REG_NOERROR
3185
|| err3 != REG_NOERROR))
3187
err = (err != REG_NOERROR ? err
3188
: (err2 != REG_NOERROR ? err2 : err3));
3191
/* TODO: It is still inefficient... */
3196
re_node_set union_set;
3197
next_node = dfa->nexts[ent->node];
3198
if (mctx->state_log[to_idx])
3201
if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3204
err = re_node_set_init_copy (&union_set,
3205
&mctx->state_log[to_idx]->nodes);
3206
ok = re_node_set_insert (&union_set, next_node);
3207
if (__glibc_unlikely (err != REG_NOERROR || ! ok))
3209
re_node_set_free (&union_set);
3210
err = err != REG_NOERROR ? err : REG_ESPACE;
3216
err = re_node_set_init_1 (&union_set, next_node);
3217
if (__glibc_unlikely (err != REG_NOERROR))
3220
mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3221
re_node_set_free (&union_set);
3222
if (__glibc_unlikely (mctx->state_log[to_idx] == NULL
3223
&& err != REG_NOERROR))
3227
while (ent++->more);
3231
/* Build transition table for the state.
3232
Return true if successful. */
3234
static bool __attribute_noinline__
3235
build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
3240
bool need_word_trtable = false;
3241
bitset_word_t elem, mask;
3242
Idx ndests; /* Number of the destination states from 'state'. */
3243
re_dfastate_t **trtable;
3244
re_dfastate_t *dest_states[SBC_MAX];
3245
re_dfastate_t *dest_states_word[SBC_MAX];
3246
re_dfastate_t *dest_states_nl[SBC_MAX];
3247
re_node_set follows;
3248
bitset_t acceptable;
3250
/* We build DFA states which corresponds to the destination nodes
3251
from 'state'. 'dests_node[i]' represents the nodes which i-th
3252
destination state contains, and 'dests_ch[i]' represents the
3253
characters which i-th destination state accepts. */
3254
re_node_set dests_node[SBC_MAX];
3255
bitset_t dests_ch[SBC_MAX];
3257
/* Initialize transition table. */
3258
state->word_trtable = state->trtable = NULL;
3260
/* At first, group all nodes belonging to 'state' into several
3262
ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3263
if (__glibc_unlikely (ndests <= 0))
3265
/* Return false in case of an error, true otherwise. */
3268
state->trtable = (re_dfastate_t **)
3269
calloc (sizeof (re_dfastate_t *), SBC_MAX);
3270
if (__glibc_unlikely (state->trtable == NULL))
3277
err = re_node_set_alloc (&follows, ndests + 1);
3278
if (__glibc_unlikely (err != REG_NOERROR))
3281
re_node_set_free (&follows);
3282
for (i = 0; i < ndests; ++i)
3283
re_node_set_free (dests_node + i);
3287
bitset_empty (acceptable);
3289
/* Then build the states for all destinations. */
3290
for (i = 0; i < ndests; ++i)
3293
re_node_set_empty (&follows);
3294
/* Merge the follows of this destination states. */
3295
for (j = 0; j < dests_node[i].nelem; ++j)
3297
next_node = dfa->nexts[dests_node[i].elems[j]];
3298
if (next_node != -1)
3300
err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3301
if (__glibc_unlikely (err != REG_NOERROR))
3305
dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3306
if (__glibc_unlikely (dest_states[i] == NULL && err != REG_NOERROR))
3308
/* If the new state has context constraint,
3309
build appropriate states for these contexts. */
3310
if (dest_states[i]->has_constraint)
3312
dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3314
if (__glibc_unlikely (dest_states_word[i] == NULL
3315
&& err != REG_NOERROR))
3318
if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3319
need_word_trtable = true;
3321
dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3323
if (__glibc_unlikely (dest_states_nl[i] == NULL && err != REG_NOERROR))
3328
dest_states_word[i] = dest_states[i];
3329
dest_states_nl[i] = dest_states[i];
3331
bitset_merge (acceptable, dests_ch[i]);
3334
if (!__glibc_unlikely (need_word_trtable))
3336
/* We don't care about whether the following character is a word
3337
character, or we are in a single-byte character set so we can
3338
discern by looking at the character code: allocate a
3339
256-entry transition table. */
3340
trtable = state->trtable =
3341
(re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3342
if (__glibc_unlikely (trtable == NULL))
3345
/* For all characters ch...: */
3346
for (i = 0; i < BITSET_WORDS; ++i)
3347
for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3349
mask <<= 1, elem >>= 1, ++ch)
3350
if (__glibc_unlikely (elem & 1))
3352
/* There must be exactly one destination which accepts
3353
character ch. See group_nodes_into_DFAstates. */
3354
for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3357
/* j-th destination accepts the word character ch. */
3358
if (dfa->word_char[i] & mask)
3359
trtable[ch] = dest_states_word[j];
3361
trtable[ch] = dest_states[j];
3366
/* We care about whether the following character is a word
3367
character, and we are in a multi-byte character set: discern
3368
by looking at the character code: build two 256-entry
3369
transition tables, one starting at trtable[0] and one
3370
starting at trtable[SBC_MAX]. */
3371
trtable = state->word_trtable =
3372
(re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3373
if (__glibc_unlikely (trtable == NULL))
3376
/* For all characters ch...: */
3377
for (i = 0; i < BITSET_WORDS; ++i)
3378
for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3380
mask <<= 1, elem >>= 1, ++ch)
3381
if (__glibc_unlikely (elem & 1))
3383
/* There must be exactly one destination which accepts
3384
character ch. See group_nodes_into_DFAstates. */
3385
for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3388
/* j-th destination accepts the word character ch. */
3389
trtable[ch] = dest_states[j];
3390
trtable[ch + SBC_MAX] = dest_states_word[j];
3395
if (bitset_contain (acceptable, NEWLINE_CHAR))
3397
/* The current state accepts newline character. */
3398
for (j = 0; j < ndests; ++j)
3399
if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3401
/* k-th destination accepts newline character. */
3402
trtable[NEWLINE_CHAR] = dest_states_nl[j];
3403
if (need_word_trtable)
3404
trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3405
/* There must be only one destination which accepts
3406
newline. See group_nodes_into_DFAstates. */
3411
re_node_set_free (&follows);
3412
for (i = 0; i < ndests; ++i)
3413
re_node_set_free (dests_node + i);
3417
/* Group all nodes belonging to STATE into several destinations.
3418
Then for all destinations, set the nodes belonging to the destination
3419
to DESTS_NODE[i] and set the characters accepted by the destination
3420
to DEST_CH[i]. Return the number of destinations if successful,
3421
-1 on internal error. */
3424
group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
3425
re_node_set *dests_node, bitset_t *dests_ch)
3430
Idx ndests; /* Number of the destinations from 'state'. */
3431
bitset_t accepts; /* Characters a node can accept. */
3432
const re_node_set *cur_nodes = &state->nodes;
3433
bitset_empty (accepts);
3436
/* For all the nodes belonging to 'state', */
3437
for (i = 0; i < cur_nodes->nelem; ++i)
3439
re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3440
re_token_type_t type = node->type;
3441
unsigned int constraint = node->constraint;
3443
/* Enumerate all single byte character this node can accept. */
3444
if (type == CHARACTER)
3445
bitset_set (accepts, node->opr.c);
3446
else if (type == SIMPLE_BRACKET)
3448
bitset_merge (accepts, node->opr.sbcset);
3450
else if (type == OP_PERIOD)
3452
if (dfa->mb_cur_max > 1)
3453
bitset_merge (accepts, dfa->sb_char);
3455
bitset_set_all (accepts);
3456
if (!(dfa->syntax & RE_DOT_NEWLINE))
3457
bitset_clear (accepts, '\n');
3458
if (dfa->syntax & RE_DOT_NOT_NULL)
3459
bitset_clear (accepts, '\0');
3461
else if (type == OP_UTF8_PERIOD)
3463
if (ASCII_CHARS % BITSET_WORD_BITS == 0)
3464
memset (accepts, -1, ASCII_CHARS / CHAR_BIT);
3466
bitset_merge (accepts, utf8_sb_map);
3467
if (!(dfa->syntax & RE_DOT_NEWLINE))
3468
bitset_clear (accepts, '\n');
3469
if (dfa->syntax & RE_DOT_NOT_NULL)
3470
bitset_clear (accepts, '\0');
3475
/* Check the 'accepts' and sift the characters which are not
3476
match it the context. */
3479
if (constraint & NEXT_NEWLINE_CONSTRAINT)
3481
bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3482
bitset_empty (accepts);
3483
if (accepts_newline)
3484
bitset_set (accepts, NEWLINE_CHAR);
3488
if (constraint & NEXT_ENDBUF_CONSTRAINT)
3490
bitset_empty (accepts);
3494
if (constraint & NEXT_WORD_CONSTRAINT)
3496
bitset_word_t any_set = 0;
3497
if (type == CHARACTER && !node->word_char)
3499
bitset_empty (accepts);
3502
if (dfa->mb_cur_max > 1)
3503
for (j = 0; j < BITSET_WORDS; ++j)
3504
any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3506
for (j = 0; j < BITSET_WORDS; ++j)
3507
any_set |= (accepts[j] &= dfa->word_char[j]);
3511
if (constraint & NEXT_NOTWORD_CONSTRAINT)
3513
bitset_word_t any_set = 0;
3514
if (type == CHARACTER && node->word_char)
3516
bitset_empty (accepts);
3519
if (dfa->mb_cur_max > 1)
3520
for (j = 0; j < BITSET_WORDS; ++j)
3521
any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3523
for (j = 0; j < BITSET_WORDS; ++j)
3524
any_set |= (accepts[j] &= ~dfa->word_char[j]);
3530
/* Then divide 'accepts' into DFA states, or create a new
3531
state. Above, we make sure that accepts is not empty. */
3532
for (j = 0; j < ndests; ++j)
3534
bitset_t intersec; /* Intersection sets, see below. */
3536
/* Flags, see below. */
3537
bitset_word_t has_intersec, not_subset, not_consumed;
3539
/* Optimization, skip if this state doesn't accept the character. */
3540
if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3543
/* Enumerate the intersection set of this state and 'accepts'. */
3545
for (k = 0; k < BITSET_WORDS; ++k)
3546
has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3547
/* And skip if the intersection set is empty. */
3551
/* Then check if this state is a subset of 'accepts'. */
3552
not_subset = not_consumed = 0;
3553
for (k = 0; k < BITSET_WORDS; ++k)
3555
not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3556
not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3559
/* If this state isn't a subset of 'accepts', create a
3560
new group state, which has the 'remains'. */
3563
bitset_copy (dests_ch[ndests], remains);
3564
bitset_copy (dests_ch[j], intersec);
3565
err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3566
if (__glibc_unlikely (err != REG_NOERROR))
3571
/* Put the position in the current group. */
3572
ok = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3573
if (__glibc_unlikely (! ok))
3576
/* If all characters are consumed, go to next node. */
3580
/* Some characters remain, create a new group. */
3583
bitset_copy (dests_ch[ndests], accepts);
3584
err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3585
if (__glibc_unlikely (err != REG_NOERROR))
3588
bitset_empty (accepts);
3591
assume (ndests <= SBC_MAX);
3594
for (j = 0; j < ndests; ++j)
3595
re_node_set_free (dests_node + j);
3599
/* Check how many bytes the node 'dfa->nodes[node_idx]' accepts.
3600
Return the number of the bytes the node accepts.
3601
STR_IDX is the current index of the input string.
3603
This function handles the nodes which can accept one character, or
3604
one collating element like '.', '[a-z]', opposite to the other nodes
3605
can only accept one byte. */
3608
# include <locale/weight.h>
3612
check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx,
3613
const re_string_t *input, Idx str_idx)
3615
const re_token_t *node = dfa->nodes + node_idx;
3616
int char_len, elem_len;
3619
if (__glibc_unlikely (node->type == OP_UTF8_PERIOD))
3621
unsigned char c = re_string_byte_at (input, str_idx), d;
3622
if (__glibc_likely (c < 0xc2))
3625
if (str_idx + 2 > input->len)
3628
d = re_string_byte_at (input, str_idx + 1);
3630
return (d < 0x80 || d > 0xbf) ? 0 : 2;
3634
if (c == 0xe0 && d < 0xa0)
3640
if (c == 0xf0 && d < 0x90)
3646
if (c == 0xf8 && d < 0x88)
3652
if (c == 0xfc && d < 0x84)
3658
if (str_idx + char_len > input->len)
3661
for (i = 1; i < char_len; ++i)
3663
d = re_string_byte_at (input, str_idx + i);
3664
if (d < 0x80 || d > 0xbf)
3670
char_len = re_string_char_size_at (input, str_idx);
3671
if (node->type == OP_PERIOD)
3675
/* FIXME: I don't think this if is needed, as both '\n'
3676
and '\0' are char_len == 1. */
3677
/* '.' accepts any one character except the following two cases. */
3678
if ((!(dfa->syntax & RE_DOT_NEWLINE)
3679
&& re_string_byte_at (input, str_idx) == '\n')
3680
|| ((dfa->syntax & RE_DOT_NOT_NULL)
3681
&& re_string_byte_at (input, str_idx) == '\0'))
3686
elem_len = re_string_elem_size_at (input, str_idx);
3687
if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3690
if (node->type == COMPLEX_BRACKET)
3692
const re_charset_t *cset = node->opr.mbcset;
3694
const unsigned char *pin
3695
= ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3700
wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3701
? re_string_wchar_at (input, str_idx) : 0);
3703
/* match with multibyte character? */
3704
for (i = 0; i < cset->nmbchars; ++i)
3705
if (wc == cset->mbchars[i])
3707
match_len = char_len;
3708
goto check_node_accept_bytes_match;
3710
/* match with character_class? */
3711
for (i = 0; i < cset->nchar_classes; ++i)
3713
wctype_t wt = cset->char_classes[i];
3714
if (__iswctype (wc, wt))
3716
match_len = char_len;
3717
goto check_node_accept_bytes_match;
3722
nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3725
unsigned int in_collseq = 0;
3726
const int32_t *table, *indirect;
3727
const unsigned char *weights, *extra;
3728
const char *collseqwc;
3730
/* match with collating_symbol? */
3731
if (cset->ncoll_syms)
3732
extra = (const unsigned char *)
3733
_NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3734
for (i = 0; i < cset->ncoll_syms; ++i)
3736
const unsigned char *coll_sym = extra + cset->coll_syms[i];
3737
/* Compare the length of input collating element and
3738
the length of current collating element. */
3739
if (*coll_sym != elem_len)
3741
/* Compare each bytes. */
3742
for (j = 0; j < *coll_sym; j++)
3743
if (pin[j] != coll_sym[1 + j])
3747
/* Match if every bytes is equal. */
3749
goto check_node_accept_bytes_match;
3755
if (elem_len <= char_len)
3757
collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3758
in_collseq = __collseq_table_lookup (collseqwc, wc);
3761
in_collseq = find_collation_sequence_value (pin, elem_len);
3763
/* match with range expression? */
3764
/* FIXME: Implement rational ranges here, too. */
3765
for (i = 0; i < cset->nranges; ++i)
3766
if (cset->range_starts[i] <= in_collseq
3767
&& in_collseq <= cset->range_ends[i])
3769
match_len = elem_len;
3770
goto check_node_accept_bytes_match;
3773
/* match with equivalence_class? */
3774
if (cset->nequiv_classes)
3776
const unsigned char *cp = pin;
3777
table = (const int32_t *)
3778
_NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3779
weights = (const unsigned char *)
3780
_NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3781
extra = (const unsigned char *)
3782
_NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3783
indirect = (const int32_t *)
3784
_NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3785
int32_t idx = findidx (table, indirect, extra, &cp, elem_len);
3786
int32_t rule = idx >> 24;
3790
size_t weight_len = weights[idx];
3791
for (i = 0; i < cset->nequiv_classes; ++i)
3793
int32_t equiv_class_idx = cset->equiv_classes[i];
3794
int32_t equiv_class_rule = equiv_class_idx >> 24;
3795
equiv_class_idx &= 0xffffff;
3796
if (weights[equiv_class_idx] == weight_len
3797
&& equiv_class_rule == rule
3798
&& memcmp (weights + idx + 1,
3799
weights + equiv_class_idx + 1,
3802
match_len = elem_len;
3803
goto check_node_accept_bytes_match;
3812
/* match with range expression? */
3813
for (i = 0; i < cset->nranges; ++i)
3815
if (cset->range_starts[i] <= wc && wc <= cset->range_ends[i])
3817
match_len = char_len;
3818
goto check_node_accept_bytes_match;
3822
check_node_accept_bytes_match:
3823
if (!cset->non_match)
3830
return (elem_len > char_len) ? elem_len : char_len;
3838
find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
3840
uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3845
/* No valid character. Match it as a single byte character. */
3846
const unsigned char *collseq = (const unsigned char *)
3847
_NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3848
return collseq[mbs[0]];
3855
const unsigned char *extra = (const unsigned char *)
3856
_NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3857
int32_t extrasize = (const unsigned char *)
3858
_NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
3860
for (idx = 0; idx < extrasize;)
3864
int32_t elem_mbs_len;
3865
/* Skip the name of collating element name. */
3866
idx = idx + extra[idx] + 1;
3867
elem_mbs_len = extra[idx++];
3868
if (mbs_len == elem_mbs_len)
3870
for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
3871
if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
3873
if (mbs_cnt == elem_mbs_len)
3874
/* Found the entry. */
3877
/* Skip the byte sequence of the collating element. */
3878
idx += elem_mbs_len;
3879
/* Adjust for the alignment. */
3880
idx = (idx + 3) & ~3;
3881
/* Skip the collation sequence value. */
3882
idx += sizeof (uint32_t);
3883
/* Skip the wide char sequence of the collating element. */
3884
idx = idx + sizeof (uint32_t) * (*(int32_t *) (extra + idx) + 1);
3885
/* If we found the entry, return the sequence value. */
3887
return *(uint32_t *) (extra + idx);
3888
/* Skip the collation sequence value. */
3889
idx += sizeof (uint32_t);
3896
/* Check whether the node accepts the byte which is IDX-th
3897
byte of the INPUT. */
3900
check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
3904
ch = re_string_byte_at (&mctx->input, idx);
3908
if (node->opr.c != ch)
3912
case SIMPLE_BRACKET:
3913
if (!bitset_contain (node->opr.sbcset, ch))
3917
case OP_UTF8_PERIOD:
3918
if (ch >= ASCII_CHARS)
3922
if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
3923
|| (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
3931
if (node->constraint)
3933
/* The node has constraints. Check whether the current context
3934
satisfies the constraints. */
3935
unsigned int context = re_string_context_at (&mctx->input, idx,
3937
if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
3944
/* Extend the buffers, if the buffers have run out. */
3946
static reg_errcode_t
3947
__attribute_warn_unused_result__
3948
extend_buffers (re_match_context_t *mctx, int min_len)
3951
re_string_t *pstr = &mctx->input;
3953
/* Avoid overflow. */
3954
if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *)) / 2
3958
/* Double the lengths of the buffers, but allocate at least MIN_LEN. */
3959
ret = re_string_realloc_buffers (pstr,
3961
MIN (pstr->len, pstr->bufs_len * 2)));
3962
if (__glibc_unlikely (ret != REG_NOERROR))
3965
if (mctx->state_log != NULL)
3967
/* And double the length of state_log. */
3968
/* XXX We have no indication of the size of this buffer. If this
3969
allocation fail we have no indication that the state_log array
3970
does not have the right size. */
3971
re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
3972
pstr->bufs_len + 1);
3973
if (__glibc_unlikely (new_array == NULL))
3975
mctx->state_log = new_array;
3978
/* Then reconstruct the buffers. */
3981
if (pstr->mb_cur_max > 1)
3983
ret = build_wcs_upper_buffer (pstr);
3984
if (__glibc_unlikely (ret != REG_NOERROR))
3988
build_upper_buffer (pstr);
3992
if (pstr->mb_cur_max > 1)
3993
build_wcs_buffer (pstr);
3996
if (pstr->trans != NULL)
3997
re_string_translate_buffer (pstr);
4004
/* Functions for matching context. */
4006
/* Initialize MCTX. */
4008
static reg_errcode_t
4009
__attribute_warn_unused_result__
4010
match_ctx_init (re_match_context_t *mctx, int eflags, Idx n)
4012
mctx->eflags = eflags;
4013
mctx->match_last = -1;
4016
/* Avoid overflow. */
4017
size_t max_object_size =
4018
MAX (sizeof (struct re_backref_cache_entry),
4019
sizeof (re_sub_match_top_t *));
4020
if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size) < n))
4023
mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4024
mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4025
if (__glibc_unlikely (mctx->bkref_ents == NULL || mctx->sub_tops == NULL))
4028
/* Already zero-ed by the caller.
4030
mctx->bkref_ents = NULL;
4031
mctx->nbkref_ents = 0;
4032
mctx->nsub_tops = 0; */
4033
mctx->abkref_ents = n;
4034
mctx->max_mb_elem_len = 1;
4035
mctx->asub_tops = n;
4039
/* Clean the entries which depend on the current input in MCTX.
4040
This function must be invoked when the matcher changes the start index
4041
of the input, or changes the input string. */
4044
match_ctx_clean (re_match_context_t *mctx)
4047
for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4050
re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4051
for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4053
re_sub_match_last_t *last = top->lasts[sl_idx];
4054
re_free (last->path.array);
4057
re_free (top->lasts);
4060
re_free (top->path->array);
4061
re_free (top->path);
4066
mctx->nsub_tops = 0;
4067
mctx->nbkref_ents = 0;
4070
/* Free all the memory associated with MCTX. */
4073
match_ctx_free (re_match_context_t *mctx)
4075
/* First, free all the memory associated with MCTX->SUB_TOPS. */
4076
match_ctx_clean (mctx);
4077
re_free (mctx->sub_tops);
4078
re_free (mctx->bkref_ents);
4081
/* Add a new backreference entry to MCTX.
4082
Note that we assume that caller never call this function with duplicate
4083
entry, and call with STR_IDX which isn't smaller than any existing entry.
4086
static reg_errcode_t
4087
__attribute_warn_unused_result__
4088
match_ctx_add_entry (re_match_context_t *mctx, Idx node, Idx str_idx, Idx from,
4091
if (mctx->nbkref_ents >= mctx->abkref_ents)
4093
struct re_backref_cache_entry* new_entry;
4094
new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4095
mctx->abkref_ents * 2);
4096
if (__glibc_unlikely (new_entry == NULL))
4098
re_free (mctx->bkref_ents);
4101
mctx->bkref_ents = new_entry;
4102
memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4103
sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4104
mctx->abkref_ents *= 2;
4106
if (mctx->nbkref_ents > 0
4107
&& mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4108
mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4110
mctx->bkref_ents[mctx->nbkref_ents].node = node;
4111
mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4112
mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4113
mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4115
/* This is a cache that saves negative results of check_dst_limits_calc_pos.
4116
If bit N is clear, means that this entry won't epsilon-transition to
4117
an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression. If
4118
it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4121
A backreference does not epsilon-transition unless it is empty, so set
4122
to all zeros if FROM != TO. */
4123
mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4124
= (from == to ? -1 : 0);
4126
mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4127
if (mctx->max_mb_elem_len < to - from)
4128
mctx->max_mb_elem_len = to - from;
4132
/* Return the first entry with the same str_idx, or -1 if none is
4133
found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
4136
search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx)
4138
Idx left, right, mid, last;
4139
last = right = mctx->nbkref_ents;
4140
for (left = 0; left < right;)
4142
mid = (left + right) / 2;
4143
if (mctx->bkref_ents[mid].str_idx < str_idx)
4148
if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4154
/* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4157
static reg_errcode_t
4158
__attribute_warn_unused_result__
4159
match_ctx_add_subtop (re_match_context_t *mctx, Idx node, Idx str_idx)
4161
DEBUG_ASSERT (mctx->sub_tops != NULL);
4162
DEBUG_ASSERT (mctx->asub_tops > 0);
4163
if (__glibc_unlikely (mctx->nsub_tops == mctx->asub_tops))
4165
Idx new_asub_tops = mctx->asub_tops * 2;
4166
re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4167
re_sub_match_top_t *,
4169
if (__glibc_unlikely (new_array == NULL))
4171
mctx->sub_tops = new_array;
4172
mctx->asub_tops = new_asub_tops;
4174
mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4175
if (__glibc_unlikely (mctx->sub_tops[mctx->nsub_tops] == NULL))
4177
mctx->sub_tops[mctx->nsub_tops]->node = node;
4178
mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4182
/* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4183
at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP.
4184
Return the new entry if successful, NULL if memory is exhausted. */
4186
static re_sub_match_last_t *
4187
match_ctx_add_sublast (re_sub_match_top_t *subtop, Idx node, Idx str_idx)
4189
re_sub_match_last_t *new_entry;
4190
if (__glibc_unlikely (subtop->nlasts == subtop->alasts))
4192
Idx new_alasts = 2 * subtop->alasts + 1;
4193
re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4194
re_sub_match_last_t *,
4196
if (__glibc_unlikely (new_array == NULL))
4198
subtop->lasts = new_array;
4199
subtop->alasts = new_alasts;
4201
new_entry = calloc (1, sizeof (re_sub_match_last_t));
4202
if (__glibc_likely (new_entry != NULL))
4204
subtop->lasts[subtop->nlasts] = new_entry;
4205
new_entry->node = node;
4206
new_entry->str_idx = str_idx;
4213
sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
4214
re_dfastate_t **limited_sts, Idx last_node, Idx last_str_idx)
4216
sctx->sifted_states = sifted_sts;
4217
sctx->limited_states = limited_sts;
4218
sctx->last_node = last_node;
4219
sctx->last_str_idx = last_str_idx;
4220
re_node_set_init_empty (&sctx->limits);