1
/* -*- buffer-read-only: t -*- vi: set ro: */
2
/* DO NOT EDIT! GENERATED AUTOMATICALLY! */
3
/* Extended regular expression matching and search library.
4
Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 Free
5
Software Foundation, Inc.
6
This file is part of the GNU C Library.
7
Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
9
This program is free software; you can redistribute it and/or modify
10
it under the terms of the GNU General Public License as published by
11
the Free Software Foundation; either version 3, or (at your option)
14
This program is distributed in the hope that it will be useful,
15
but WITHOUT ANY WARRANTY; without even the implied warranty of
16
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17
GNU General Public License for more details.
19
You should have received a copy of the GNU General Public License along
20
with this program; if not, write to the Free Software Foundation,
21
Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
25
static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
26
Idx n) internal_function;
27
static void match_ctx_clean (re_match_context_t *mctx) internal_function;
28
static void match_ctx_free (re_match_context_t *cache) internal_function;
29
static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, Idx node,
30
Idx str_idx, Idx from, Idx to)
32
static Idx search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx)
34
static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, Idx node,
35
Idx str_idx) internal_function;
36
static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
37
Idx node, Idx str_idx)
39
static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
40
re_dfastate_t **limited_sts, Idx last_node,
43
static reg_errcode_t re_search_internal (const regex_t *preg,
44
const char *string, Idx length,
45
Idx start, Idx last_start, Idx stop,
46
size_t nmatch, regmatch_t pmatch[],
47
int eflags) internal_function;
48
static regoff_t re_search_2_stub (struct re_pattern_buffer *bufp,
49
const char *string1, Idx length1,
50
const char *string2, Idx length2,
51
Idx start, regoff_t range,
52
struct re_registers *regs,
53
Idx stop, bool ret_len) internal_function;
54
static regoff_t re_search_stub (struct re_pattern_buffer *bufp,
55
const char *string, Idx length, Idx start,
56
regoff_t range, Idx stop,
57
struct re_registers *regs,
58
bool ret_len) internal_function;
59
static unsigned int re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
60
Idx nregs, int regs_allocated)
62
static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
64
static Idx check_matching (re_match_context_t *mctx, bool fl_longest_match,
65
Idx *p_match_first) internal_function;
66
static Idx check_halt_state_context (const re_match_context_t *mctx,
67
const re_dfastate_t *state, Idx idx)
69
static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
70
regmatch_t *prev_idx_match, Idx cur_node,
71
Idx cur_idx, Idx nmatch) internal_function;
72
static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
73
Idx str_idx, Idx dest_node, Idx nregs,
75
re_node_set *eps_via_nodes)
77
static reg_errcode_t set_regs (const regex_t *preg,
78
const re_match_context_t *mctx,
79
size_t nmatch, regmatch_t *pmatch,
80
bool fl_backtrack) internal_function;
81
static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs)
85
static int sift_states_iter_mb (const re_match_context_t *mctx,
86
re_sift_context_t *sctx,
87
Idx node_idx, Idx str_idx, Idx max_str_idx)
89
#endif /* RE_ENABLE_I18N */
90
static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
91
re_sift_context_t *sctx)
93
static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
94
re_sift_context_t *sctx, Idx str_idx,
95
re_node_set *cur_dest)
97
static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
98
re_sift_context_t *sctx,
100
re_node_set *dest_nodes)
102
static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
103
re_node_set *dest_nodes,
104
const re_node_set *candidates)
106
static bool check_dst_limits (const re_match_context_t *mctx,
107
const re_node_set *limits,
108
Idx dst_node, Idx dst_idx, Idx src_node,
109
Idx src_idx) internal_function;
110
static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
111
int boundaries, Idx subexp_idx,
112
Idx from_node, Idx bkref_idx)
114
static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
115
Idx limit, Idx subexp_idx,
116
Idx node, Idx str_idx,
117
Idx bkref_idx) internal_function;
118
static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
119
re_node_set *dest_nodes,
120
const re_node_set *candidates,
122
struct re_backref_cache_entry *bkref_ents,
123
Idx str_idx) internal_function;
124
static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
125
re_sift_context_t *sctx,
126
Idx str_idx, const re_node_set *candidates)
128
static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
130
re_dfastate_t **src, Idx num)
132
static re_dfastate_t *find_recover_state (reg_errcode_t *err,
133
re_match_context_t *mctx) internal_function;
134
static re_dfastate_t *transit_state (reg_errcode_t *err,
135
re_match_context_t *mctx,
136
re_dfastate_t *state) internal_function;
137
static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
138
re_match_context_t *mctx,
139
re_dfastate_t *next_state)
141
static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
142
re_node_set *cur_nodes,
143
Idx str_idx) internal_function;
145
static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
146
re_match_context_t *mctx,
147
re_dfastate_t *pstate)
150
#ifdef RE_ENABLE_I18N
151
static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
152
re_dfastate_t *pstate)
154
#endif /* RE_ENABLE_I18N */
155
static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
156
const re_node_set *nodes)
158
static reg_errcode_t get_subexp (re_match_context_t *mctx,
159
Idx bkref_node, Idx bkref_str_idx)
161
static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
162
const re_sub_match_top_t *sub_top,
163
re_sub_match_last_t *sub_last,
164
Idx bkref_node, Idx bkref_str)
166
static Idx find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
167
Idx subexp_idx, int type) internal_function;
168
static reg_errcode_t check_arrival (re_match_context_t *mctx,
169
state_array_t *path, Idx top_node,
170
Idx top_str, Idx last_node, Idx last_str,
171
int type) internal_function;
172
static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
174
re_node_set *cur_nodes,
175
re_node_set *next_nodes)
177
static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
178
re_node_set *cur_nodes,
179
Idx ex_subexp, int type)
181
static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
182
re_node_set *dst_nodes,
183
Idx target, Idx ex_subexp,
184
int type) internal_function;
185
static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
186
re_node_set *cur_nodes, Idx cur_str,
187
Idx subexp_num, int type)
189
static bool build_trtable (const re_dfa_t *dfa,
190
re_dfastate_t *state) internal_function;
191
#ifdef RE_ENABLE_I18N
192
static int check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx,
193
const re_string_t *input, Idx idx)
196
static unsigned int find_collation_sequence_value (const unsigned char *mbs,
200
#endif /* RE_ENABLE_I18N */
201
static Idx group_nodes_into_DFAstates (const re_dfa_t *dfa,
202
const re_dfastate_t *state,
203
re_node_set *states_node,
204
bitset_t *states_ch) internal_function;
205
static bool check_node_accept (const re_match_context_t *mctx,
206
const re_token_t *node, Idx idx)
208
static reg_errcode_t extend_buffers (re_match_context_t *mctx)
211
/* Entry point for POSIX code. */
213
/* regexec searches for a given pattern, specified by PREG, in the
216
If NMATCH is zero or REG_NOSUB was set in the cflags argument to
217
`regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
218
least NMATCH elements, and we set them to the offsets of the
219
corresponding matched substrings.
221
EFLAGS specifies `execution flags' which affect matching: if
222
REG_NOTBOL is set, then ^ does not match at the beginning of the
223
string; if REG_NOTEOL is set, then $ does not match at the end.
225
We return 0 if we find a match and REG_NOMATCH if not. */
228
regexec (preg, string, nmatch, pmatch, eflags)
229
const regex_t *_Restrict_ preg;
230
const char *_Restrict_ string;
232
regmatch_t pmatch[_Restrict_arr_];
238
re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
241
if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
244
if (eflags & REG_STARTEND)
246
start = pmatch[0].rm_so;
247
length = pmatch[0].rm_eo;
252
length = strlen (string);
255
__libc_lock_lock (dfa->lock);
257
err = re_search_internal (preg, string, length, start, length,
258
length, 0, NULL, eflags);
260
err = re_search_internal (preg, string, length, start, length,
261
length, nmatch, pmatch, eflags);
262
__libc_lock_unlock (dfa->lock);
263
return err != REG_NOERROR;
267
# include <shlib-compat.h>
268
versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
270
# if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
271
__typeof__ (__regexec) __compat_regexec;
274
attribute_compat_text_section
275
__compat_regexec (const regex_t *_Restrict_ preg,
276
const char *_Restrict_ string, size_t nmatch,
277
regmatch_t pmatch[], int eflags)
279
return regexec (preg, string, nmatch, pmatch,
280
eflags & (REG_NOTBOL | REG_NOTEOL));
282
compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
286
/* Entry points for GNU code. */
288
/* re_match, re_search, re_match_2, re_search_2
290
The former two functions operate on STRING with length LENGTH,
291
while the later two operate on concatenation of STRING1 and STRING2
292
with lengths LENGTH1 and LENGTH2, respectively.
294
re_match() matches the compiled pattern in BUFP against the string,
295
starting at index START.
297
re_search() first tries matching at index START, then it tries to match
298
starting from index START + 1, and so on. The last start position tried
299
is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
302
The parameter STOP of re_{match,search}_2 specifies that no match exceeding
303
the first STOP characters of the concatenation of the strings should be
306
If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
307
and all groups is stored in REGS. (For the "_2" variants, the offsets are
308
computed relative to the concatenation, not relative to the individual
311
On success, re_match* functions return the length of the match, re_search*
312
return the position of the start of the match. Return value -1 means no
313
match was found and -2 indicates an internal error. */
316
re_match (bufp, string, length, start, regs)
317
struct re_pattern_buffer *bufp;
320
struct re_registers *regs;
322
return re_search_stub (bufp, string, length, start, 0, length, regs, true);
325
weak_alias (__re_match, re_match)
329
re_search (bufp, string, length, start, range, regs)
330
struct re_pattern_buffer *bufp;
334
struct re_registers *regs;
336
return re_search_stub (bufp, string, length, start, range, length, regs,
340
weak_alias (__re_search, re_search)
344
re_match_2 (bufp, string1, length1, string2, length2, start, regs, stop)
345
struct re_pattern_buffer *bufp;
346
const char *string1, *string2;
347
Idx length1, length2, start, stop;
348
struct re_registers *regs;
350
return re_search_2_stub (bufp, string1, length1, string2, length2,
351
start, 0, regs, stop, true);
354
weak_alias (__re_match_2, re_match_2)
358
re_search_2 (bufp, string1, length1, string2, length2, start, range, regs, stop)
359
struct re_pattern_buffer *bufp;
360
const char *string1, *string2;
361
Idx length1, length2, start, stop;
363
struct re_registers *regs;
365
return re_search_2_stub (bufp, string1, length1, string2, length2,
366
start, range, regs, stop, false);
369
weak_alias (__re_search_2, re_search_2)
374
re_search_2_stub (struct re_pattern_buffer *bufp,
375
const char *string1, Idx length1,
376
const char *string2, Idx length2,
377
Idx start, regoff_t range, struct re_registers *regs,
378
Idx stop, bool ret_len)
382
Idx len = length1 + length2;
385
verify (! TYPE_SIGNED (Idx));
386
if (BE (len < length1, 0))
388
/* if (BE (length1 < 0 || length2 < 0 || stop < 0, 0))
391
/* Concatenate the strings. */
395
s = re_malloc (char, len);
397
if (BE (s == NULL, 0))
400
memcpy (__mempcpy (s, string1, length1), string2, length2);
402
memcpy (s, string1, length1);
403
memcpy (s + length1, string2, length2);
412
rval = re_search_stub (bufp, str, len, start, range, stop, regs,
418
/* The parameters have the same meaning as those of re_search.
419
Additional parameters:
420
If RET_LEN is true the length of the match is returned (re_match style);
421
otherwise the position of the match is returned. */
425
re_search_stub (struct re_pattern_buffer *bufp,
426
const char *string, Idx length,
427
Idx start, regoff_t range, Idx stop, struct re_registers *regs,
430
reg_errcode_t result;
436
re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
438
Idx last_start = start + range;
440
/* Check for out-of-range. */
441
verify (! TYPE_SIGNED (Idx));
442
/* if (BE (start < 0, 0))
444
if (BE (start > length, 0))
446
if (BE (length < last_start || (0 <= range && last_start < start), 0))
448
else if (BE (/* last_start < 0 || */ (range < 0 && start <= last_start), 0))
451
__libc_lock_lock (dfa->lock);
453
eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
454
eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
456
/* Compile fastmap if we haven't yet. */
457
if (start < last_start && bufp->fastmap != NULL && !bufp->fastmap_accurate)
458
re_compile_fastmap (bufp);
460
if (BE (bufp->no_sub, 0))
463
/* We need at least 1 register. */
466
else if (BE (bufp->regs_allocated == REGS_FIXED
467
&& regs->num_regs <= bufp->re_nsub, 0))
469
nregs = regs->num_regs;
470
if (BE (nregs < 1, 0))
472
/* Nothing can be copied to regs. */
478
nregs = bufp->re_nsub + 1;
479
pmatch = re_malloc (regmatch_t, nregs);
480
if (BE (pmatch == NULL, 0))
486
result = re_search_internal (bufp, string, length, start, last_start, stop,
487
nregs, pmatch, eflags);
491
/* I hope we needn't fill ther regs with -1's when no match was found. */
492
if (result != REG_NOERROR)
494
else if (regs != NULL)
496
/* If caller wants register contents data back, copy them. */
497
bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
498
bufp->regs_allocated);
499
if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
503
if (BE (rval == 0, 1))
507
assert (pmatch[0].rm_so == start);
508
rval = pmatch[0].rm_eo - start;
511
rval = pmatch[0].rm_so;
515
__libc_lock_unlock (dfa->lock);
521
re_copy_regs (struct re_registers *regs, regmatch_t *pmatch, Idx nregs,
524
int rval = REGS_REALLOCATE;
526
Idx need_regs = nregs + 1;
527
/* We need one extra element beyond `num_regs' for the `-1' marker GNU code
530
/* Have the register data arrays been allocated? */
531
if (regs_allocated == REGS_UNALLOCATED)
532
{ /* No. So allocate them with malloc. */
533
regs->start = re_malloc (regoff_t, need_regs);
534
if (BE (regs->start == NULL, 0))
535
return REGS_UNALLOCATED;
536
regs->end = re_malloc (regoff_t, need_regs);
537
if (BE (regs->end == NULL, 0))
539
re_free (regs->start);
540
return REGS_UNALLOCATED;
542
regs->num_regs = need_regs;
544
else if (regs_allocated == REGS_REALLOCATE)
545
{ /* Yes. If we need more elements than were already
546
allocated, reallocate them. If we need fewer, just
548
if (BE (need_regs > regs->num_regs, 0))
550
regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
552
if (BE (new_start == NULL, 0))
553
return REGS_UNALLOCATED;
554
new_end = re_realloc (regs->end, regoff_t, need_regs);
555
if (BE (new_end == NULL, 0))
558
return REGS_UNALLOCATED;
560
regs->start = new_start;
562
regs->num_regs = need_regs;
567
assert (regs_allocated == REGS_FIXED);
568
/* This function may not be called with REGS_FIXED and nregs too big. */
569
assert (regs->num_regs >= nregs);
574
for (i = 0; i < nregs; ++i)
576
regs->start[i] = pmatch[i].rm_so;
577
regs->end[i] = pmatch[i].rm_eo;
579
for ( ; i < regs->num_regs; ++i)
580
regs->start[i] = regs->end[i] = -1;
585
/* Set REGS to hold NUM_REGS registers, storing them in STARTS and
586
ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
587
this memory for recording register information. STARTS and ENDS
588
must be allocated using the malloc library routine, and must each
589
be at least NUM_REGS * sizeof (regoff_t) bytes long.
591
If NUM_REGS == 0, then subsequent matches should allocate their own
594
Unless this function is called, the first search or match using
595
PATTERN_BUFFER will allocate its own register data, without
596
freeing the old data. */
599
re_set_registers (bufp, regs, num_regs, starts, ends)
600
struct re_pattern_buffer *bufp;
601
struct re_registers *regs;
602
__re_size_t num_regs;
603
regoff_t *starts, *ends;
607
bufp->regs_allocated = REGS_REALLOCATE;
608
regs->num_regs = num_regs;
609
regs->start = starts;
614
bufp->regs_allocated = REGS_UNALLOCATED;
616
regs->start = regs->end = NULL;
620
weak_alias (__re_set_registers, re_set_registers)
623
/* Entry points compatible with 4.2 BSD regex library. We don't define
624
them unless specifically requested. */
626
#if defined _REGEX_RE_COMP || defined _LIBC
634
return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
636
#endif /* _REGEX_RE_COMP */
638
/* Internal entry point. */
640
/* Searches for a compiled pattern PREG in the string STRING, whose
641
length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
642
meaning as with regexec. LAST_START is START + RANGE, where
643
START and RANGE have the same meaning as with re_search.
644
Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
645
otherwise return the error code.
646
Note: We assume front end functions already check ranges.
647
(0 <= LAST_START && LAST_START <= LENGTH) */
650
internal_function __attribute_warn_unused_result__
651
re_search_internal (const regex_t *preg,
652
const char *string, Idx length,
653
Idx start, Idx last_start, Idx stop,
654
size_t nmatch, regmatch_t pmatch[],
658
const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
659
Idx left_lim, right_lim;
661
bool fl_longest_match;
664
Idx match_last = REG_MISSING;
668
#if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
669
re_match_context_t mctx = { .dfa = dfa };
671
re_match_context_t mctx;
673
char *fastmap = ((preg->fastmap != NULL && preg->fastmap_accurate
674
&& start != last_start && !preg->can_be_null)
675
? preg->fastmap : NULL);
676
RE_TRANSLATE_TYPE t = preg->translate;
678
#if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
679
memset (&mctx, '\0', sizeof (re_match_context_t));
683
extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
684
nmatch -= extra_nmatch;
686
/* Check if the DFA haven't been compiled. */
687
if (BE (preg->used == 0 || dfa->init_state == NULL
688
|| dfa->init_state_word == NULL || dfa->init_state_nl == NULL
689
|| dfa->init_state_begbuf == NULL, 0))
693
/* We assume front-end functions already check them. */
694
assert (0 <= last_start && last_start <= length);
697
/* If initial states with non-begbuf contexts have no elements,
698
the regex must be anchored. If preg->newline_anchor is set,
699
we'll never use init_state_nl, so do not check it. */
700
if (dfa->init_state->nodes.nelem == 0
701
&& dfa->init_state_word->nodes.nelem == 0
702
&& (dfa->init_state_nl->nodes.nelem == 0
703
|| !preg->newline_anchor))
705
if (start != 0 && last_start != 0)
707
start = last_start = 0;
710
/* We must check the longest matching, if nmatch > 0. */
711
fl_longest_match = (nmatch != 0 || dfa->nbackref);
713
err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
714
preg->translate, (preg->syntax & RE_ICASE) != 0,
716
if (BE (err != REG_NOERROR, 0))
718
mctx.input.stop = stop;
719
mctx.input.raw_stop = stop;
720
mctx.input.newline_anchor = preg->newline_anchor;
722
err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
723
if (BE (err != REG_NOERROR, 0))
726
/* We will log all the DFA states through which the dfa pass,
727
if nmatch > 1, or this dfa has "multibyte node", which is a
728
back-reference or a node which can accept multibyte character or
729
multi character collating element. */
730
if (nmatch > 1 || dfa->has_mb_node)
732
/* Avoid overflow. */
733
if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= mctx.input.bufs_len, 0))
739
mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
740
if (BE (mctx.state_log == NULL, 0))
747
mctx.state_log = NULL;
750
mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
751
: CONTEXT_NEWLINE | CONTEXT_BEGBUF;
753
/* Check incrementally whether of not the input string match. */
754
incr = (last_start < start) ? -1 : 1;
755
left_lim = (last_start < start) ? last_start : start;
756
right_lim = (last_start < start) ? start : last_start;
757
sb = dfa->mb_cur_max == 1;
760
? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
761
| (start <= last_start ? 2 : 0)
762
| (t != NULL ? 1 : 0))
765
for (;; match_first += incr)
768
if (match_first < left_lim || right_lim < match_first)
771
/* Advance as rapidly as possible through the string, until we
772
find a plausible place to start matching. This may be done
773
with varying efficiency, so there are various possibilities:
774
only the most common of them are specialized, in order to
775
save on code size. We use a switch statement for speed. */
783
/* Fastmap with single-byte translation, match forward. */
784
while (BE (match_first < right_lim, 1)
785
&& !fastmap[t[(unsigned char) string[match_first]]])
787
goto forward_match_found_start_or_reached_end;
790
/* Fastmap without translation, match forward. */
791
while (BE (match_first < right_lim, 1)
792
&& !fastmap[(unsigned char) string[match_first]])
795
forward_match_found_start_or_reached_end:
796
if (BE (match_first == right_lim, 0))
798
ch = match_first >= length
799
? 0 : (unsigned char) string[match_first];
800
if (!fastmap[t ? t[ch] : ch])
807
/* Fastmap without multi-byte translation, match backwards. */
808
while (match_first >= left_lim)
810
ch = match_first >= length
811
? 0 : (unsigned char) string[match_first];
812
if (fastmap[t ? t[ch] : ch])
816
if (match_first < left_lim)
821
/* In this case, we can't determine easily the current byte,
822
since it might be a component byte of a multibyte
823
character. Then we use the constructed buffer instead. */
826
/* If MATCH_FIRST is out of the valid range, reconstruct the
828
__re_size_t offset = match_first - mctx.input.raw_mbs_idx;
829
if (BE (offset >= (__re_size_t) mctx.input.valid_raw_len, 0))
831
err = re_string_reconstruct (&mctx.input, match_first,
833
if (BE (err != REG_NOERROR, 0))
836
offset = match_first - mctx.input.raw_mbs_idx;
838
/* If MATCH_FIRST is out of the buffer, leave it as '\0'.
839
Note that MATCH_FIRST must not be smaller than 0. */
840
ch = (match_first >= length
841
? 0 : re_string_byte_at (&mctx.input, offset));
845
if (match_first < left_lim || match_first > right_lim)
854
/* Reconstruct the buffers so that the matcher can assume that
855
the matching starts from the beginning of the buffer. */
856
err = re_string_reconstruct (&mctx.input, match_first, eflags);
857
if (BE (err != REG_NOERROR, 0))
860
#ifdef RE_ENABLE_I18N
861
/* Don't consider this char as a possible match start if it part,
862
yet isn't the head, of a multibyte character. */
863
if (!sb && !re_string_first_byte (&mctx.input, 0))
867
/* It seems to be appropriate one, then use the matcher. */
868
/* We assume that the matching starts from 0. */
869
mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
870
match_last = check_matching (&mctx, fl_longest_match,
871
start <= last_start ? &match_first : NULL);
872
if (match_last != REG_MISSING)
874
if (BE (match_last == REG_ERROR, 0))
881
mctx.match_last = match_last;
882
if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
884
re_dfastate_t *pstate = mctx.state_log[match_last];
885
mctx.last_node = check_halt_state_context (&mctx, pstate,
888
if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
891
err = prune_impossible_nodes (&mctx);
892
if (err == REG_NOERROR)
894
if (BE (err != REG_NOMATCH, 0))
896
match_last = REG_MISSING;
899
break; /* We found a match. */
903
match_ctx_clean (&mctx);
907
assert (match_last != REG_MISSING);
908
assert (err == REG_NOERROR);
911
/* Set pmatch[] if we need. */
916
/* Initialize registers. */
917
for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
918
pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
920
/* Set the points where matching start/end. */
922
pmatch[0].rm_eo = mctx.match_last;
923
/* FIXME: This function should fail if mctx.match_last exceeds
924
the maximum possible regoff_t value. We need a new error
925
code REG_OVERFLOW. */
927
if (!preg->no_sub && nmatch > 1)
929
err = set_regs (preg, &mctx, nmatch, pmatch,
930
dfa->has_plural_match && dfa->nbackref > 0);
931
if (BE (err != REG_NOERROR, 0))
935
/* At last, add the offset to the each registers, since we slided
936
the buffers so that we could assume that the matching starts
938
for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
939
if (pmatch[reg_idx].rm_so != -1)
941
#ifdef RE_ENABLE_I18N
942
if (BE (mctx.input.offsets_needed != 0, 0))
944
pmatch[reg_idx].rm_so =
945
(pmatch[reg_idx].rm_so == mctx.input.valid_len
946
? mctx.input.valid_raw_len
947
: mctx.input.offsets[pmatch[reg_idx].rm_so]);
948
pmatch[reg_idx].rm_eo =
949
(pmatch[reg_idx].rm_eo == mctx.input.valid_len
950
? mctx.input.valid_raw_len
951
: mctx.input.offsets[pmatch[reg_idx].rm_eo]);
954
assert (mctx.input.offsets_needed == 0);
956
pmatch[reg_idx].rm_so += match_first;
957
pmatch[reg_idx].rm_eo += match_first;
959
for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
961
pmatch[nmatch + reg_idx].rm_so = -1;
962
pmatch[nmatch + reg_idx].rm_eo = -1;
966
for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
967
if (dfa->subexp_map[reg_idx] != reg_idx)
969
pmatch[reg_idx + 1].rm_so
970
= pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
971
pmatch[reg_idx + 1].rm_eo
972
= pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
977
re_free (mctx.state_log);
979
match_ctx_free (&mctx);
980
re_string_destruct (&mctx.input);
985
internal_function __attribute_warn_unused_result__
986
prune_impossible_nodes (re_match_context_t *mctx)
988
const re_dfa_t *const dfa = mctx->dfa;
989
Idx halt_node, match_last;
991
re_dfastate_t **sifted_states;
992
re_dfastate_t **lim_states = NULL;
993
re_sift_context_t sctx;
995
assert (mctx->state_log != NULL);
997
match_last = mctx->match_last;
998
halt_node = mctx->last_node;
1000
/* Avoid overflow. */
1001
if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= match_last, 0))
1004
sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
1005
if (BE (sifted_states == NULL, 0))
1012
lim_states = re_malloc (re_dfastate_t *, match_last + 1);
1013
if (BE (lim_states == NULL, 0))
1020
memset (lim_states, '\0',
1021
sizeof (re_dfastate_t *) * (match_last + 1));
1022
sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
1024
ret = sift_states_backward (mctx, &sctx);
1025
re_node_set_free (&sctx.limits);
1026
if (BE (ret != REG_NOERROR, 0))
1028
if (sifted_states[0] != NULL || lim_states[0] != NULL)
1033
if (! REG_VALID_INDEX (match_last))
1038
} while (mctx->state_log[match_last] == NULL
1039
|| !mctx->state_log[match_last]->halt);
1040
halt_node = check_halt_state_context (mctx,
1041
mctx->state_log[match_last],
1044
ret = merge_state_array (dfa, sifted_states, lim_states,
1046
re_free (lim_states);
1048
if (BE (ret != REG_NOERROR, 0))
1053
sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
1054
ret = sift_states_backward (mctx, &sctx);
1055
re_node_set_free (&sctx.limits);
1056
if (BE (ret != REG_NOERROR, 0))
1058
if (sifted_states[0] == NULL)
1064
re_free (mctx->state_log);
1065
mctx->state_log = sifted_states;
1066
sifted_states = NULL;
1067
mctx->last_node = halt_node;
1068
mctx->match_last = match_last;
1071
re_free (sifted_states);
1072
re_free (lim_states);
1076
/* Acquire an initial state and return it.
1077
We must select appropriate initial state depending on the context,
1078
since initial states may have constraints like "\<", "^", etc.. */
1080
static inline re_dfastate_t *
1081
__attribute ((always_inline)) internal_function
1082
acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
1085
const re_dfa_t *const dfa = mctx->dfa;
1086
if (dfa->init_state->has_constraint)
1088
unsigned int context;
1089
context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1090
if (IS_WORD_CONTEXT (context))
1091
return dfa->init_state_word;
1092
else if (IS_ORDINARY_CONTEXT (context))
1093
return dfa->init_state;
1094
else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1095
return dfa->init_state_begbuf;
1096
else if (IS_NEWLINE_CONTEXT (context))
1097
return dfa->init_state_nl;
1098
else if (IS_BEGBUF_CONTEXT (context))
1100
/* It is relatively rare case, then calculate on demand. */
1101
return re_acquire_state_context (err, dfa,
1102
dfa->init_state->entrance_nodes,
1106
/* Must not happen? */
1107
return dfa->init_state;
1110
return dfa->init_state;
1113
/* Check whether the regular expression match input string INPUT or not,
1114
and return the index where the matching end. Return REG_MISSING if
1115
there is no match, and return REG_ERROR in case of an error.
1116
FL_LONGEST_MATCH means we want the POSIX longest matching.
1117
If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1118
next place where we may want to try matching.
1119
Note that the matcher assume that the maching starts from the current
1120
index of the buffer. */
1123
internal_function __attribute_warn_unused_result__
1124
check_matching (re_match_context_t *mctx, bool fl_longest_match,
1127
const re_dfa_t *const dfa = mctx->dfa;
1130
Idx match_last = REG_MISSING;
1131
Idx cur_str_idx = re_string_cur_idx (&mctx->input);
1132
re_dfastate_t *cur_state;
1133
bool at_init_state = p_match_first != NULL;
1134
Idx next_start_idx = cur_str_idx;
1137
cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1138
/* An initial state must not be NULL (invalid). */
1139
if (BE (cur_state == NULL, 0))
1141
assert (err == REG_ESPACE);
1145
if (mctx->state_log != NULL)
1147
mctx->state_log[cur_str_idx] = cur_state;
1149
/* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1150
later. E.g. Processing back references. */
1151
if (BE (dfa->nbackref, 0))
1153
at_init_state = false;
1154
err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1155
if (BE (err != REG_NOERROR, 0))
1158
if (cur_state->has_backref)
1160
err = transit_state_bkref (mctx, &cur_state->nodes);
1161
if (BE (err != REG_NOERROR, 0))
1167
/* If the RE accepts NULL string. */
1168
if (BE (cur_state->halt, 0))
1170
if (!cur_state->has_constraint
1171
|| check_halt_state_context (mctx, cur_state, cur_str_idx))
1173
if (!fl_longest_match)
1177
match_last = cur_str_idx;
1183
while (!re_string_eoi (&mctx->input))
1185
re_dfastate_t *old_state = cur_state;
1186
Idx next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1188
if (BE (next_char_idx >= mctx->input.bufs_len, 0)
1189
|| (BE (next_char_idx >= mctx->input.valid_len, 0)
1190
&& mctx->input.valid_len < mctx->input.len))
1192
err = extend_buffers (mctx);
1193
if (BE (err != REG_NOERROR, 0))
1195
assert (err == REG_ESPACE);
1200
cur_state = transit_state (&err, mctx, cur_state);
1201
if (mctx->state_log != NULL)
1202
cur_state = merge_state_with_log (&err, mctx, cur_state);
1204
if (cur_state == NULL)
1206
/* Reached the invalid state or an error. Try to recover a valid
1207
state using the state log, if available and if we have not
1208
already found a valid (even if not the longest) match. */
1209
if (BE (err != REG_NOERROR, 0))
1212
if (mctx->state_log == NULL
1213
|| (match && !fl_longest_match)
1214
|| (cur_state = find_recover_state (&err, mctx)) == NULL)
1218
if (BE (at_init_state, 0))
1220
if (old_state == cur_state)
1221
next_start_idx = next_char_idx;
1223
at_init_state = false;
1226
if (cur_state->halt)
1228
/* Reached a halt state.
1229
Check the halt state can satisfy the current context. */
1230
if (!cur_state->has_constraint
1231
|| check_halt_state_context (mctx, cur_state,
1232
re_string_cur_idx (&mctx->input)))
1234
/* We found an appropriate halt state. */
1235
match_last = re_string_cur_idx (&mctx->input);
1238
/* We found a match, do not modify match_first below. */
1239
p_match_first = NULL;
1240
if (!fl_longest_match)
1247
*p_match_first += next_start_idx;
1252
/* Check NODE match the current context. */
1256
check_halt_node_context (const re_dfa_t *dfa, Idx node, unsigned int context)
1258
re_token_type_t type = dfa->nodes[node].type;
1259
unsigned int constraint = dfa->nodes[node].constraint;
1260
if (type != END_OF_RE)
1264
if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1269
/* Check the halt state STATE match the current context.
1270
Return 0 if not match, if the node, STATE has, is a halt node and
1271
match the context, return the node. */
1275
check_halt_state_context (const re_match_context_t *mctx,
1276
const re_dfastate_t *state, Idx idx)
1279
unsigned int context;
1281
assert (state->halt);
1283
context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1284
for (i = 0; i < state->nodes.nelem; ++i)
1285
if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1286
return state->nodes.elems[i];
1290
/* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1291
corresponding to the DFA).
1292
Return the destination node, and update EPS_VIA_NODES;
1293
return REG_MISSING in case of errors. */
1297
proceed_next_node (const re_match_context_t *mctx, Idx nregs, regmatch_t *regs,
1298
Idx *pidx, Idx node, re_node_set *eps_via_nodes,
1299
struct re_fail_stack_t *fs)
1301
const re_dfa_t *const dfa = mctx->dfa;
1304
if (IS_EPSILON_NODE (dfa->nodes[node].type))
1306
re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1307
re_node_set *edests = &dfa->edests[node];
1309
ok = re_node_set_insert (eps_via_nodes, node);
1312
/* Pick up a valid destination, or return REG_MISSING if none
1314
for (dest_node = REG_MISSING, i = 0; i < edests->nelem; ++i)
1316
Idx candidate = edests->elems[i];
1317
if (!re_node_set_contains (cur_nodes, candidate))
1319
if (dest_node == REG_MISSING)
1320
dest_node = candidate;
1324
/* In order to avoid infinite loop like "(a*)*", return the second
1325
epsilon-transition if the first was already considered. */
1326
if (re_node_set_contains (eps_via_nodes, dest_node))
1329
/* Otherwise, push the second epsilon-transition on the fail stack. */
1331
&& push_fail_stack (fs, *pidx, candidate, nregs, regs,
1335
/* We know we are going to exit. */
1344
re_token_type_t type = dfa->nodes[node].type;
1346
#ifdef RE_ENABLE_I18N
1347
if (dfa->nodes[node].accept_mb)
1348
naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1350
#endif /* RE_ENABLE_I18N */
1351
if (type == OP_BACK_REF)
1353
Idx subexp_idx = dfa->nodes[node].opr.idx + 1;
1354
naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1357
if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1361
char *buf = (char *) re_string_get_buffer (&mctx->input);
1362
if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1371
ok = re_node_set_insert (eps_via_nodes, node);
1374
dest_node = dfa->edests[node].elems[0];
1375
if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1382
|| check_node_accept (mctx, dfa->nodes + node, *pidx))
1384
Idx dest_node = dfa->nexts[node];
1385
*pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1386
if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1387
|| !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1390
re_node_set_empty (eps_via_nodes);
1397
static reg_errcode_t
1398
internal_function __attribute_warn_unused_result__
1399
push_fail_stack (struct re_fail_stack_t *fs, Idx str_idx, Idx dest_node,
1400
Idx nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1403
Idx num = fs->num++;
1404
if (fs->num == fs->alloc)
1406
struct re_fail_stack_ent_t *new_array;
1407
new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
1409
if (new_array == NULL)
1412
fs->stack = new_array;
1414
fs->stack[num].idx = str_idx;
1415
fs->stack[num].node = dest_node;
1416
fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1417
if (fs->stack[num].regs == NULL)
1419
memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1420
err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1426
pop_fail_stack (struct re_fail_stack_t *fs, Idx *pidx, Idx nregs,
1427
regmatch_t *regs, re_node_set *eps_via_nodes)
1429
Idx num = --fs->num;
1430
assert (REG_VALID_INDEX (num));
1431
*pidx = fs->stack[num].idx;
1432
memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1433
re_node_set_free (eps_via_nodes);
1434
re_free (fs->stack[num].regs);
1435
*eps_via_nodes = fs->stack[num].eps_via_nodes;
1436
return fs->stack[num].node;
1439
/* Set the positions where the subexpressions are starts/ends to registers
1441
Note: We assume that pmatch[0] is already set, and
1442
pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */
1444
static reg_errcode_t
1445
internal_function __attribute_warn_unused_result__
1446
set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
1447
regmatch_t *pmatch, bool fl_backtrack)
1449
const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
1451
re_node_set eps_via_nodes;
1452
struct re_fail_stack_t *fs;
1453
struct re_fail_stack_t fs_body = { 0, 2, NULL };
1454
regmatch_t *prev_idx_match;
1455
bool prev_idx_match_malloced = false;
1458
assert (nmatch > 1);
1459
assert (mctx->state_log != NULL);
1464
fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1465
if (fs->stack == NULL)
1471
cur_node = dfa->init_node;
1472
re_node_set_init_empty (&eps_via_nodes);
1474
if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
1475
prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
1478
prev_idx_match = re_malloc (regmatch_t, nmatch);
1479
if (prev_idx_match == NULL)
1481
free_fail_stack_return (fs);
1484
prev_idx_match_malloced = true;
1486
memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1488
for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1490
update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1492
if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1497
for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1498
if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1500
if (reg_idx == nmatch)
1502
re_node_set_free (&eps_via_nodes);
1503
if (prev_idx_match_malloced)
1504
re_free (prev_idx_match);
1505
return free_fail_stack_return (fs);
1507
cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1512
re_node_set_free (&eps_via_nodes);
1513
if (prev_idx_match_malloced)
1514
re_free (prev_idx_match);
1519
/* Proceed to next node. */
1520
cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1521
&eps_via_nodes, fs);
1523
if (BE (! REG_VALID_INDEX (cur_node), 0))
1525
if (BE (cur_node == REG_ERROR, 0))
1527
re_node_set_free (&eps_via_nodes);
1528
if (prev_idx_match_malloced)
1529
re_free (prev_idx_match);
1530
free_fail_stack_return (fs);
1534
cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1538
re_node_set_free (&eps_via_nodes);
1539
if (prev_idx_match_malloced)
1540
re_free (prev_idx_match);
1545
re_node_set_free (&eps_via_nodes);
1546
if (prev_idx_match_malloced)
1547
re_free (prev_idx_match);
1548
return free_fail_stack_return (fs);
1551
static reg_errcode_t
1553
free_fail_stack_return (struct re_fail_stack_t *fs)
1558
for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1560
re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1561
re_free (fs->stack[fs_idx].regs);
1563
re_free (fs->stack);
1570
update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
1571
regmatch_t *prev_idx_match, Idx cur_node, Idx cur_idx, Idx nmatch)
1573
int type = dfa->nodes[cur_node].type;
1574
if (type == OP_OPEN_SUBEXP)
1576
Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1578
/* We are at the first node of this sub expression. */
1579
if (reg_num < nmatch)
1581
pmatch[reg_num].rm_so = cur_idx;
1582
pmatch[reg_num].rm_eo = -1;
1585
else if (type == OP_CLOSE_SUBEXP)
1587
Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1588
if (reg_num < nmatch)
1590
/* We are at the last node of this sub expression. */
1591
if (pmatch[reg_num].rm_so < cur_idx)
1593
pmatch[reg_num].rm_eo = cur_idx;
1594
/* This is a non-empty match or we are not inside an optional
1595
subexpression. Accept this right away. */
1596
memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1600
if (dfa->nodes[cur_node].opt_subexp
1601
&& prev_idx_match[reg_num].rm_so != -1)
1602
/* We transited through an empty match for an optional
1603
subexpression, like (a?)*, and this is not the subexp's
1604
first match. Copy back the old content of the registers
1605
so that matches of an inner subexpression are undone as
1606
well, like in ((a?))*. */
1607
memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1609
/* We completed a subexpression, but it may be part of
1610
an optional one, so do not update PREV_IDX_MATCH. */
1611
pmatch[reg_num].rm_eo = cur_idx;
1617
/* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1618
and sift the nodes in each states according to the following rules.
1619
Updated state_log will be wrote to STATE_LOG.
1621
Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1622
1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1623
If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1624
the LAST_NODE, we throw away the node `a'.
1625
2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1626
string `s' and transit to `b':
1627
i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1629
ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1630
thrown away, we throw away the node `a'.
1631
3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1632
i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1634
ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1635
we throw away the node `a'. */
1637
#define STATE_NODE_CONTAINS(state,node) \
1638
((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1640
static reg_errcode_t
1642
sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
1646
Idx str_idx = sctx->last_str_idx;
1647
re_node_set cur_dest;
1650
assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1653
/* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1654
transit to the last_node and the last_node itself. */
1655
err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1656
if (BE (err != REG_NOERROR, 0))
1658
err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1659
if (BE (err != REG_NOERROR, 0))
1662
/* Then check each states in the state_log. */
1665
/* Update counters. */
1666
null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1667
if (null_cnt > mctx->max_mb_elem_len)
1669
memset (sctx->sifted_states, '\0',
1670
sizeof (re_dfastate_t *) * str_idx);
1671
re_node_set_free (&cur_dest);
1674
re_node_set_empty (&cur_dest);
1677
if (mctx->state_log[str_idx])
1679
err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1680
if (BE (err != REG_NOERROR, 0))
1684
/* Add all the nodes which satisfy the following conditions:
1685
- It can epsilon transit to a node in CUR_DEST.
1687
And update state_log. */
1688
err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1689
if (BE (err != REG_NOERROR, 0))
1694
re_node_set_free (&cur_dest);
1698
static reg_errcode_t
1699
internal_function __attribute_warn_unused_result__
1700
build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
1701
Idx str_idx, re_node_set *cur_dest)
1703
const re_dfa_t *const dfa = mctx->dfa;
1704
const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1707
/* Then build the next sifted state.
1708
We build the next sifted state on `cur_dest', and update
1709
`sifted_states[str_idx]' with `cur_dest'.
1711
`cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1712
`cur_src' points the node_set of the old `state_log[str_idx]'
1713
(with the epsilon nodes pre-filtered out). */
1714
for (i = 0; i < cur_src->nelem; i++)
1716
Idx prev_node = cur_src->elems[i];
1721
re_token_type_t type = dfa->nodes[prev_node].type;
1722
assert (!IS_EPSILON_NODE (type));
1724
#ifdef RE_ENABLE_I18N
1725
/* If the node may accept `multi byte'. */
1726
if (dfa->nodes[prev_node].accept_mb)
1727
naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1728
str_idx, sctx->last_str_idx);
1729
#endif /* RE_ENABLE_I18N */
1731
/* We don't check backreferences here.
1732
See update_cur_sifted_state(). */
1734
&& check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1735
&& STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1736
dfa->nexts[prev_node]))
1742
if (sctx->limits.nelem)
1744
Idx to_idx = str_idx + naccepted;
1745
if (check_dst_limits (mctx, &sctx->limits,
1746
dfa->nexts[prev_node], to_idx,
1747
prev_node, str_idx))
1750
ok = re_node_set_insert (cur_dest, prev_node);
1758
/* Helper functions. */
1760
static reg_errcode_t
1762
clean_state_log_if_needed (re_match_context_t *mctx, Idx next_state_log_idx)
1764
Idx top = mctx->state_log_top;
1766
if (next_state_log_idx >= mctx->input.bufs_len
1767
|| (next_state_log_idx >= mctx->input.valid_len
1768
&& mctx->input.valid_len < mctx->input.len))
1771
err = extend_buffers (mctx);
1772
if (BE (err != REG_NOERROR, 0))
1776
if (top < next_state_log_idx)
1778
memset (mctx->state_log + top + 1, '\0',
1779
sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1780
mctx->state_log_top = next_state_log_idx;
1785
static reg_errcode_t
1787
merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,
1788
re_dfastate_t **src, Idx num)
1792
for (st_idx = 0; st_idx < num; ++st_idx)
1794
if (dst[st_idx] == NULL)
1795
dst[st_idx] = src[st_idx];
1796
else if (src[st_idx] != NULL)
1798
re_node_set merged_set;
1799
err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1800
&src[st_idx]->nodes);
1801
if (BE (err != REG_NOERROR, 0))
1803
dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1804
re_node_set_free (&merged_set);
1805
if (BE (err != REG_NOERROR, 0))
1812
static reg_errcode_t
1814
update_cur_sifted_state (const re_match_context_t *mctx,
1815
re_sift_context_t *sctx, Idx str_idx,
1816
re_node_set *dest_nodes)
1818
const re_dfa_t *const dfa = mctx->dfa;
1819
reg_errcode_t err = REG_NOERROR;
1820
const re_node_set *candidates;
1821
candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1822
: &mctx->state_log[str_idx]->nodes);
1824
if (dest_nodes->nelem == 0)
1825
sctx->sifted_states[str_idx] = NULL;
1830
/* At first, add the nodes which can epsilon transit to a node in
1832
err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1833
if (BE (err != REG_NOERROR, 0))
1836
/* Then, check the limitations in the current sift_context. */
1837
if (sctx->limits.nelem)
1839
err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1840
mctx->bkref_ents, str_idx);
1841
if (BE (err != REG_NOERROR, 0))
1846
sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1847
if (BE (err != REG_NOERROR, 0))
1851
if (candidates && mctx->state_log[str_idx]->has_backref)
1853
err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1854
if (BE (err != REG_NOERROR, 0))
1860
static reg_errcode_t
1861
internal_function __attribute_warn_unused_result__
1862
add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,
1863
const re_node_set *candidates)
1865
reg_errcode_t err = REG_NOERROR;
1868
re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1869
if (BE (err != REG_NOERROR, 0))
1872
if (!state->inveclosure.alloc)
1874
err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1875
if (BE (err != REG_NOERROR, 0))
1877
for (i = 0; i < dest_nodes->nelem; i++)
1879
err = re_node_set_merge (&state->inveclosure,
1880
dfa->inveclosures + dest_nodes->elems[i]);
1881
if (BE (err != REG_NOERROR, 0))
1885
return re_node_set_add_intersect (dest_nodes, candidates,
1886
&state->inveclosure);
1889
static reg_errcode_t
1891
sub_epsilon_src_nodes (const re_dfa_t *dfa, Idx node, re_node_set *dest_nodes,
1892
const re_node_set *candidates)
1896
re_node_set *inv_eclosure = dfa->inveclosures + node;
1897
re_node_set except_nodes;
1898
re_node_set_init_empty (&except_nodes);
1899
for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1901
Idx cur_node = inv_eclosure->elems[ecl_idx];
1902
if (cur_node == node)
1904
if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1906
Idx edst1 = dfa->edests[cur_node].elems[0];
1907
Idx edst2 = ((dfa->edests[cur_node].nelem > 1)
1908
? dfa->edests[cur_node].elems[1] : REG_MISSING);
1909
if ((!re_node_set_contains (inv_eclosure, edst1)
1910
&& re_node_set_contains (dest_nodes, edst1))
1911
|| (REG_VALID_NONZERO_INDEX (edst2)
1912
&& !re_node_set_contains (inv_eclosure, edst2)
1913
&& re_node_set_contains (dest_nodes, edst2)))
1915
err = re_node_set_add_intersect (&except_nodes, candidates,
1916
dfa->inveclosures + cur_node);
1917
if (BE (err != REG_NOERROR, 0))
1919
re_node_set_free (&except_nodes);
1925
for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1927
Idx cur_node = inv_eclosure->elems[ecl_idx];
1928
if (!re_node_set_contains (&except_nodes, cur_node))
1930
Idx idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1931
re_node_set_remove_at (dest_nodes, idx);
1934
re_node_set_free (&except_nodes);
1940
check_dst_limits (const re_match_context_t *mctx, const re_node_set *limits,
1941
Idx dst_node, Idx dst_idx, Idx src_node, Idx src_idx)
1943
const re_dfa_t *const dfa = mctx->dfa;
1944
Idx lim_idx, src_pos, dst_pos;
1946
Idx dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1947
Idx src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1948
for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1951
struct re_backref_cache_entry *ent;
1952
ent = mctx->bkref_ents + limits->elems[lim_idx];
1953
subexp_idx = dfa->nodes[ent->node].opr.idx;
1955
dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1956
subexp_idx, dst_node, dst_idx,
1958
src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1959
subexp_idx, src_node, src_idx,
1963
<src> <dst> ( <subexp> )
1964
( <subexp> ) <src> <dst>
1965
( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
1966
if (src_pos == dst_pos)
1967
continue; /* This is unrelated limitation. */
1976
check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1977
Idx subexp_idx, Idx from_node, Idx bkref_idx)
1979
const re_dfa_t *const dfa = mctx->dfa;
1980
const re_node_set *eclosures = dfa->eclosures + from_node;
1983
/* Else, we are on the boundary: examine the nodes on the epsilon
1985
for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1987
Idx node = eclosures->elems[node_idx];
1988
switch (dfa->nodes[node].type)
1991
if (bkref_idx != REG_MISSING)
1993
struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1999
if (ent->node != node)
2002
if (subexp_idx < BITSET_WORD_BITS
2003
&& !(ent->eps_reachable_subexps_map
2004
& ((bitset_word_t) 1 << subexp_idx)))
2007
/* Recurse trying to reach the OP_OPEN_SUBEXP and
2008
OP_CLOSE_SUBEXP cases below. But, if the
2009
destination node is the same node as the source
2010
node, don't recurse because it would cause an
2011
infinite loop: a regex that exhibits this behavior
2013
dst = dfa->edests[node].elems[0];
2014
if (dst == from_node)
2018
else /* if (boundaries & 2) */
2023
check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2025
if (cpos == -1 /* && (boundaries & 1) */)
2027
if (cpos == 0 && (boundaries & 2))
2030
if (subexp_idx < BITSET_WORD_BITS)
2031
ent->eps_reachable_subexps_map
2032
&= ~((bitset_word_t) 1 << subexp_idx);
2034
while (ent++->more);
2038
case OP_OPEN_SUBEXP:
2039
if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
2043
case OP_CLOSE_SUBEXP:
2044
if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
2053
return (boundaries & 2) ? 1 : 0;
2058
check_dst_limits_calc_pos (const re_match_context_t *mctx, Idx limit,
2059
Idx subexp_idx, Idx from_node, Idx str_idx,
2062
struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
2065
/* If we are outside the range of the subexpression, return -1 or 1. */
2066
if (str_idx < lim->subexp_from)
2069
if (lim->subexp_to < str_idx)
2072
/* If we are within the subexpression, return 0. */
2073
boundaries = (str_idx == lim->subexp_from);
2074
boundaries |= (str_idx == lim->subexp_to) << 1;
2075
if (boundaries == 0)
2078
/* Else, examine epsilon closure. */
2079
return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2080
from_node, bkref_idx);
2083
/* Check the limitations of sub expressions LIMITS, and remove the nodes
2084
which are against limitations from DEST_NODES. */
2086
static reg_errcode_t
2088
check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,
2089
const re_node_set *candidates, re_node_set *limits,
2090
struct re_backref_cache_entry *bkref_ents, Idx str_idx)
2093
Idx node_idx, lim_idx;
2095
for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2098
struct re_backref_cache_entry *ent;
2099
ent = bkref_ents + limits->elems[lim_idx];
2101
if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2102
continue; /* This is unrelated limitation. */
2104
subexp_idx = dfa->nodes[ent->node].opr.idx;
2105
if (ent->subexp_to == str_idx)
2107
Idx ops_node = REG_MISSING;
2108
Idx cls_node = REG_MISSING;
2109
for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2111
Idx node = dest_nodes->elems[node_idx];
2112
re_token_type_t type = dfa->nodes[node].type;
2113
if (type == OP_OPEN_SUBEXP
2114
&& subexp_idx == dfa->nodes[node].opr.idx)
2116
else if (type == OP_CLOSE_SUBEXP
2117
&& subexp_idx == dfa->nodes[node].opr.idx)
2121
/* Check the limitation of the open subexpression. */
2122
/* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
2123
if (REG_VALID_INDEX (ops_node))
2125
err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2127
if (BE (err != REG_NOERROR, 0))
2131
/* Check the limitation of the close subexpression. */
2132
if (REG_VALID_INDEX (cls_node))
2133
for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2135
Idx node = dest_nodes->elems[node_idx];
2136
if (!re_node_set_contains (dfa->inveclosures + node,
2138
&& !re_node_set_contains (dfa->eclosures + node,
2141
/* It is against this limitation.
2142
Remove it form the current sifted state. */
2143
err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2145
if (BE (err != REG_NOERROR, 0))
2151
else /* (ent->subexp_to != str_idx) */
2153
for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2155
Idx node = dest_nodes->elems[node_idx];
2156
re_token_type_t type = dfa->nodes[node].type;
2157
if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2159
if (subexp_idx != dfa->nodes[node].opr.idx)
2161
/* It is against this limitation.
2162
Remove it form the current sifted state. */
2163
err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2165
if (BE (err != REG_NOERROR, 0))
2174
static reg_errcode_t
2175
internal_function __attribute_warn_unused_result__
2176
sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,
2177
Idx str_idx, const re_node_set *candidates)
2179
const re_dfa_t *const dfa = mctx->dfa;
2182
re_sift_context_t local_sctx;
2183
Idx first_idx = search_cur_bkref_entry (mctx, str_idx);
2185
if (first_idx == REG_MISSING)
2188
local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */
2190
for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2193
re_token_type_t type;
2194
struct re_backref_cache_entry *entry;
2195
node = candidates->elems[node_idx];
2196
type = dfa->nodes[node].type;
2197
/* Avoid infinite loop for the REs like "()\1+". */
2198
if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2200
if (type != OP_BACK_REF)
2203
entry = mctx->bkref_ents + first_idx;
2204
enabled_idx = first_idx;
2211
re_dfastate_t *cur_state;
2213
if (entry->node != node)
2215
subexp_len = entry->subexp_to - entry->subexp_from;
2216
to_idx = str_idx + subexp_len;
2217
dst_node = (subexp_len ? dfa->nexts[node]
2218
: dfa->edests[node].elems[0]);
2220
if (to_idx > sctx->last_str_idx
2221
|| sctx->sifted_states[to_idx] == NULL
2222
|| !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2223
|| check_dst_limits (mctx, &sctx->limits, node,
2224
str_idx, dst_node, to_idx))
2227
if (local_sctx.sifted_states == NULL)
2230
err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2231
if (BE (err != REG_NOERROR, 0))
2234
local_sctx.last_node = node;
2235
local_sctx.last_str_idx = str_idx;
2236
ok = re_node_set_insert (&local_sctx.limits, enabled_idx);
2242
cur_state = local_sctx.sifted_states[str_idx];
2243
err = sift_states_backward (mctx, &local_sctx);
2244
if (BE (err != REG_NOERROR, 0))
2246
if (sctx->limited_states != NULL)
2248
err = merge_state_array (dfa, sctx->limited_states,
2249
local_sctx.sifted_states,
2251
if (BE (err != REG_NOERROR, 0))
2254
local_sctx.sifted_states[str_idx] = cur_state;
2255
re_node_set_remove (&local_sctx.limits, enabled_idx);
2257
/* mctx->bkref_ents may have changed, reload the pointer. */
2258
entry = mctx->bkref_ents + enabled_idx;
2260
while (enabled_idx++, entry++->more);
2264
if (local_sctx.sifted_states != NULL)
2266
re_node_set_free (&local_sctx.limits);
2273
#ifdef RE_ENABLE_I18N
2276
sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2277
Idx node_idx, Idx str_idx, Idx max_str_idx)
2279
const re_dfa_t *const dfa = mctx->dfa;
2281
/* Check the node can accept `multi byte'. */
2282
naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2283
if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2284
!STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2285
dfa->nexts[node_idx]))
2286
/* The node can't accept the `multi byte', or the
2287
destination was already thrown away, then the node
2288
could't accept the current input `multi byte'. */
2290
/* Otherwise, it is sure that the node could accept
2291
`naccepted' bytes input. */
2294
#endif /* RE_ENABLE_I18N */
2297
/* Functions for state transition. */
2299
/* Return the next state to which the current state STATE will transit by
2300
accepting the current input byte, and update STATE_LOG if necessary.
2301
If STATE can accept a multibyte char/collating element/back reference
2302
update the destination of STATE_LOG. */
2304
static re_dfastate_t *
2305
internal_function __attribute_warn_unused_result__
2306
transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2307
re_dfastate_t *state)
2309
re_dfastate_t **trtable;
2312
#ifdef RE_ENABLE_I18N
2313
/* If the current state can accept multibyte. */
2314
if (BE (state->accept_mb, 0))
2316
*err = transit_state_mb (mctx, state);
2317
if (BE (*err != REG_NOERROR, 0))
2320
#endif /* RE_ENABLE_I18N */
2322
/* Then decide the next state with the single byte. */
2325
/* don't use transition table */
2326
return transit_state_sb (err, mctx, state);
2329
/* Use transition table */
2330
ch = re_string_fetch_byte (&mctx->input);
2333
trtable = state->trtable;
2334
if (BE (trtable != NULL, 1))
2337
trtable = state->word_trtable;
2338
if (BE (trtable != NULL, 1))
2340
unsigned int context;
2342
= re_string_context_at (&mctx->input,
2343
re_string_cur_idx (&mctx->input) - 1,
2345
if (IS_WORD_CONTEXT (context))
2346
return trtable[ch + SBC_MAX];
2351
if (!build_trtable (mctx->dfa, state))
2357
/* Retry, we now have a transition table. */
2361
/* Update the state_log if we need */
2362
static re_dfastate_t *
2364
merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2365
re_dfastate_t *next_state)
2367
const re_dfa_t *const dfa = mctx->dfa;
2368
Idx cur_idx = re_string_cur_idx (&mctx->input);
2370
if (cur_idx > mctx->state_log_top)
2372
mctx->state_log[cur_idx] = next_state;
2373
mctx->state_log_top = cur_idx;
2375
else if (mctx->state_log[cur_idx] == 0)
2377
mctx->state_log[cur_idx] = next_state;
2381
re_dfastate_t *pstate;
2382
unsigned int context;
2383
re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2384
/* If (state_log[cur_idx] != 0), it implies that cur_idx is
2385
the destination of a multibyte char/collating element/
2386
back reference. Then the next state is the union set of
2387
these destinations and the results of the transition table. */
2388
pstate = mctx->state_log[cur_idx];
2389
log_nodes = pstate->entrance_nodes;
2390
if (next_state != NULL)
2392
table_nodes = next_state->entrance_nodes;
2393
*err = re_node_set_init_union (&next_nodes, table_nodes,
2395
if (BE (*err != REG_NOERROR, 0))
2399
next_nodes = *log_nodes;
2400
/* Note: We already add the nodes of the initial state,
2401
then we don't need to add them here. */
2403
context = re_string_context_at (&mctx->input,
2404
re_string_cur_idx (&mctx->input) - 1,
2406
next_state = mctx->state_log[cur_idx]
2407
= re_acquire_state_context (err, dfa, &next_nodes, context);
2408
/* We don't need to check errors here, since the return value of
2409
this function is next_state and ERR is already set. */
2411
if (table_nodes != NULL)
2412
re_node_set_free (&next_nodes);
2415
if (BE (dfa->nbackref, 0) && next_state != NULL)
2417
/* Check OP_OPEN_SUBEXP in the current state in case that we use them
2418
later. We must check them here, since the back references in the
2419
next state might use them. */
2420
*err = check_subexp_matching_top (mctx, &next_state->nodes,
2422
if (BE (*err != REG_NOERROR, 0))
2425
/* If the next state has back references. */
2426
if (next_state->has_backref)
2428
*err = transit_state_bkref (mctx, &next_state->nodes);
2429
if (BE (*err != REG_NOERROR, 0))
2431
next_state = mctx->state_log[cur_idx];
2438
/* Skip bytes in the input that correspond to part of a
2439
multi-byte match, then look in the log for a state
2440
from which to restart matching. */
2441
static re_dfastate_t *
2443
find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2445
re_dfastate_t *cur_state;
2448
Idx max = mctx->state_log_top;
2449
Idx cur_str_idx = re_string_cur_idx (&mctx->input);
2453
if (++cur_str_idx > max)
2455
re_string_skip_bytes (&mctx->input, 1);
2457
while (mctx->state_log[cur_str_idx] == NULL);
2459
cur_state = merge_state_with_log (err, mctx, NULL);
2461
while (*err == REG_NOERROR && cur_state == NULL);
2465
/* Helper functions for transit_state. */
2467
/* From the node set CUR_NODES, pick up the nodes whose types are
2468
OP_OPEN_SUBEXP and which have corresponding back references in the regular
2469
expression. And register them to use them later for evaluating the
2470
correspoding back references. */
2472
static reg_errcode_t
2474
check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2477
const re_dfa_t *const dfa = mctx->dfa;
2481
/* TODO: This isn't efficient.
2482
Because there might be more than one nodes whose types are
2483
OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2486
for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2488
Idx node = cur_nodes->elems[node_idx];
2489
if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2490
&& dfa->nodes[node].opr.idx < BITSET_WORD_BITS
2491
&& (dfa->used_bkref_map
2492
& ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
2494
err = match_ctx_add_subtop (mctx, node, str_idx);
2495
if (BE (err != REG_NOERROR, 0))
2503
/* Return the next state to which the current state STATE will transit by
2504
accepting the current input byte. */
2506
static re_dfastate_t *
2507
transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
2508
re_dfastate_t *state)
2510
const re_dfa_t *const dfa = mctx->dfa;
2511
re_node_set next_nodes;
2512
re_dfastate_t *next_state;
2513
Idx node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2514
unsigned int context;
2516
*err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2517
if (BE (*err != REG_NOERROR, 0))
2519
for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2521
Idx cur_node = state->nodes.elems[node_cnt];
2522
if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2524
*err = re_node_set_merge (&next_nodes,
2525
dfa->eclosures + dfa->nexts[cur_node]);
2526
if (BE (*err != REG_NOERROR, 0))
2528
re_node_set_free (&next_nodes);
2533
context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2534
next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2535
/* We don't need to check errors here, since the return value of
2536
this function is next_state and ERR is already set. */
2538
re_node_set_free (&next_nodes);
2539
re_string_skip_bytes (&mctx->input, 1);
2544
#ifdef RE_ENABLE_I18N
2545
static reg_errcode_t
2547
transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2549
const re_dfa_t *const dfa = mctx->dfa;
2553
for (i = 0; i < pstate->nodes.nelem; ++i)
2555
re_node_set dest_nodes, *new_nodes;
2556
Idx cur_node_idx = pstate->nodes.elems[i];
2559
unsigned int context;
2560
re_dfastate_t *dest_state;
2562
if (!dfa->nodes[cur_node_idx].accept_mb)
2565
if (dfa->nodes[cur_node_idx].constraint)
2567
context = re_string_context_at (&mctx->input,
2568
re_string_cur_idx (&mctx->input),
2570
if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2575
/* How many bytes the node can accept? */
2576
naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2577
re_string_cur_idx (&mctx->input));
2581
/* The node can accepts `naccepted' bytes. */
2582
dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2583
mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2584
: mctx->max_mb_elem_len);
2585
err = clean_state_log_if_needed (mctx, dest_idx);
2586
if (BE (err != REG_NOERROR, 0))
2589
assert (dfa->nexts[cur_node_idx] != REG_MISSING);
2591
new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2593
dest_state = mctx->state_log[dest_idx];
2594
if (dest_state == NULL)
2595
dest_nodes = *new_nodes;
2598
err = re_node_set_init_union (&dest_nodes,
2599
dest_state->entrance_nodes, new_nodes);
2600
if (BE (err != REG_NOERROR, 0))
2603
context = re_string_context_at (&mctx->input, dest_idx - 1,
2605
mctx->state_log[dest_idx]
2606
= re_acquire_state_context (&err, dfa, &dest_nodes, context);
2607
if (dest_state != NULL)
2608
re_node_set_free (&dest_nodes);
2609
if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2614
#endif /* RE_ENABLE_I18N */
2616
static reg_errcode_t
2618
transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2620
const re_dfa_t *const dfa = mctx->dfa;
2623
Idx cur_str_idx = re_string_cur_idx (&mctx->input);
2625
for (i = 0; i < nodes->nelem; ++i)
2627
Idx dest_str_idx, prev_nelem, bkc_idx;
2628
Idx node_idx = nodes->elems[i];
2629
unsigned int context;
2630
const re_token_t *node = dfa->nodes + node_idx;
2631
re_node_set *new_dest_nodes;
2633
/* Check whether `node' is a backreference or not. */
2634
if (node->type != OP_BACK_REF)
2637
if (node->constraint)
2639
context = re_string_context_at (&mctx->input, cur_str_idx,
2641
if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2645
/* `node' is a backreference.
2646
Check the substring which the substring matched. */
2647
bkc_idx = mctx->nbkref_ents;
2648
err = get_subexp (mctx, node_idx, cur_str_idx);
2649
if (BE (err != REG_NOERROR, 0))
2652
/* And add the epsilon closures (which is `new_dest_nodes') of
2653
the backreference to appropriate state_log. */
2655
assert (dfa->nexts[node_idx] != REG_MISSING);
2657
for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2660
re_dfastate_t *dest_state;
2661
struct re_backref_cache_entry *bkref_ent;
2662
bkref_ent = mctx->bkref_ents + bkc_idx;
2663
if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2665
subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2666
new_dest_nodes = (subexp_len == 0
2667
? dfa->eclosures + dfa->edests[node_idx].elems[0]
2668
: dfa->eclosures + dfa->nexts[node_idx]);
2669
dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2670
- bkref_ent->subexp_from);
2671
context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2673
dest_state = mctx->state_log[dest_str_idx];
2674
prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2675
: mctx->state_log[cur_str_idx]->nodes.nelem);
2676
/* Add `new_dest_node' to state_log. */
2677
if (dest_state == NULL)
2679
mctx->state_log[dest_str_idx]
2680
= re_acquire_state_context (&err, dfa, new_dest_nodes,
2682
if (BE (mctx->state_log[dest_str_idx] == NULL
2683
&& err != REG_NOERROR, 0))
2688
re_node_set dest_nodes;
2689
err = re_node_set_init_union (&dest_nodes,
2690
dest_state->entrance_nodes,
2692
if (BE (err != REG_NOERROR, 0))
2694
re_node_set_free (&dest_nodes);
2697
mctx->state_log[dest_str_idx]
2698
= re_acquire_state_context (&err, dfa, &dest_nodes, context);
2699
re_node_set_free (&dest_nodes);
2700
if (BE (mctx->state_log[dest_str_idx] == NULL
2701
&& err != REG_NOERROR, 0))
2704
/* We need to check recursively if the backreference can epsilon
2707
&& mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2709
err = check_subexp_matching_top (mctx, new_dest_nodes,
2711
if (BE (err != REG_NOERROR, 0))
2713
err = transit_state_bkref (mctx, new_dest_nodes);
2714
if (BE (err != REG_NOERROR, 0))
2724
/* Enumerate all the candidates which the backreference BKREF_NODE can match
2725
at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2726
Note that we might collect inappropriate candidates here.
2727
However, the cost of checking them strictly here is too high, then we
2728
delay these checking for prune_impossible_nodes(). */
2730
static reg_errcode_t
2731
internal_function __attribute_warn_unused_result__
2732
get_subexp (re_match_context_t *mctx, Idx bkref_node, Idx bkref_str_idx)
2734
const re_dfa_t *const dfa = mctx->dfa;
2735
Idx subexp_num, sub_top_idx;
2736
const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2737
/* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
2738
Idx cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2739
if (cache_idx != REG_MISSING)
2741
const struct re_backref_cache_entry *entry
2742
= mctx->bkref_ents + cache_idx;
2744
if (entry->node == bkref_node)
2745
return REG_NOERROR; /* We already checked it. */
2746
while (entry++->more);
2749
subexp_num = dfa->nodes[bkref_node].opr.idx;
2751
/* For each sub expression */
2752
for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2755
re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2756
re_sub_match_last_t *sub_last;
2757
Idx sub_last_idx, sl_str, bkref_str_off;
2759
if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2760
continue; /* It isn't related. */
2762
sl_str = sub_top->str_idx;
2763
bkref_str_off = bkref_str_idx;
2764
/* At first, check the last node of sub expressions we already
2766
for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2768
regoff_t sl_str_diff;
2769
sub_last = sub_top->lasts[sub_last_idx];
2770
sl_str_diff = sub_last->str_idx - sl_str;
2771
/* The matched string by the sub expression match with the substring
2772
at the back reference? */
2773
if (sl_str_diff > 0)
2775
if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2777
/* Not enough chars for a successful match. */
2778
if (bkref_str_off + sl_str_diff > mctx->input.len)
2781
err = clean_state_log_if_needed (mctx,
2784
if (BE (err != REG_NOERROR, 0))
2786
buf = (const char *) re_string_get_buffer (&mctx->input);
2788
if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2789
/* We don't need to search this sub expression any more. */
2792
bkref_str_off += sl_str_diff;
2793
sl_str += sl_str_diff;
2794
err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2797
/* Reload buf, since the preceding call might have reallocated
2799
buf = (const char *) re_string_get_buffer (&mctx->input);
2801
if (err == REG_NOMATCH)
2803
if (BE (err != REG_NOERROR, 0))
2807
if (sub_last_idx < sub_top->nlasts)
2809
if (sub_last_idx > 0)
2811
/* Then, search for the other last nodes of the sub expression. */
2812
for (; sl_str <= bkref_str_idx; ++sl_str)
2815
regoff_t sl_str_off;
2816
const re_node_set *nodes;
2817
sl_str_off = sl_str - sub_top->str_idx;
2818
/* The matched string by the sub expression match with the substring
2819
at the back reference? */
2822
if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2824
/* If we are at the end of the input, we cannot match. */
2825
if (bkref_str_off >= mctx->input.len)
2828
err = extend_buffers (mctx);
2829
if (BE (err != REG_NOERROR, 0))
2832
buf = (const char *) re_string_get_buffer (&mctx->input);
2834
if (buf [bkref_str_off++] != buf[sl_str - 1])
2835
break; /* We don't need to search this sub expression
2838
if (mctx->state_log[sl_str] == NULL)
2840
/* Does this state have a ')' of the sub expression? */
2841
nodes = &mctx->state_log[sl_str]->nodes;
2842
cls_node = find_subexp_node (dfa, nodes, subexp_num,
2844
if (cls_node == REG_MISSING)
2846
if (sub_top->path == NULL)
2848
sub_top->path = calloc (sizeof (state_array_t),
2849
sl_str - sub_top->str_idx + 1);
2850
if (sub_top->path == NULL)
2853
/* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2854
in the current context? */
2855
err = check_arrival (mctx, sub_top->path, sub_top->node,
2856
sub_top->str_idx, cls_node, sl_str,
2858
if (err == REG_NOMATCH)
2860
if (BE (err != REG_NOERROR, 0))
2862
sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2863
if (BE (sub_last == NULL, 0))
2865
err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2867
if (err == REG_NOMATCH)
2874
/* Helper functions for get_subexp(). */
2876
/* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2877
If it can arrive, register the sub expression expressed with SUB_TOP
2880
static reg_errcode_t
2882
get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2883
re_sub_match_last_t *sub_last, Idx bkref_node, Idx bkref_str)
2887
/* Can the subexpression arrive the back reference? */
2888
err = check_arrival (mctx, &sub_last->path, sub_last->node,
2889
sub_last->str_idx, bkref_node, bkref_str,
2891
if (err != REG_NOERROR)
2893
err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2895
if (BE (err != REG_NOERROR, 0))
2897
to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2898
return clean_state_log_if_needed (mctx, to_idx);
2901
/* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2902
Search '(' if FL_OPEN, or search ')' otherwise.
2903
TODO: This function isn't efficient...
2904
Because there might be more than one nodes whose types are
2905
OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2911
find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2912
Idx subexp_idx, int type)
2915
for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2917
Idx cls_node = nodes->elems[cls_idx];
2918
const re_token_t *node = dfa->nodes + cls_node;
2919
if (node->type == type
2920
&& node->opr.idx == subexp_idx)
2926
/* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2927
LAST_NODE at LAST_STR. We record the path onto PATH since it will be
2929
Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */
2931
static reg_errcode_t
2932
internal_function __attribute_warn_unused_result__
2933
check_arrival (re_match_context_t *mctx, state_array_t *path, Idx top_node,
2934
Idx top_str, Idx last_node, Idx last_str, int type)
2936
const re_dfa_t *const dfa = mctx->dfa;
2937
reg_errcode_t err = REG_NOERROR;
2938
Idx subexp_num, backup_cur_idx, str_idx, null_cnt;
2939
re_dfastate_t *cur_state = NULL;
2940
re_node_set *cur_nodes, next_nodes;
2941
re_dfastate_t **backup_state_log;
2942
unsigned int context;
2944
subexp_num = dfa->nodes[top_node].opr.idx;
2945
/* Extend the buffer if we need. */
2946
if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2948
re_dfastate_t **new_array;
2949
Idx old_alloc = path->alloc;
2950
Idx new_alloc = old_alloc + last_str + mctx->max_mb_elem_len + 1;
2951
if (BE (new_alloc < old_alloc, 0)
2952
|| BE (SIZE_MAX / sizeof (re_dfastate_t *) < new_alloc, 0))
2954
new_array = re_realloc (path->array, re_dfastate_t *, new_alloc);
2955
if (BE (new_array == NULL, 0))
2957
path->array = new_array;
2958
path->alloc = new_alloc;
2959
memset (new_array + old_alloc, '\0',
2960
sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2963
str_idx = path->next_idx ? path->next_idx : top_str;
2965
/* Temporary modify MCTX. */
2966
backup_state_log = mctx->state_log;
2967
backup_cur_idx = mctx->input.cur_idx;
2968
mctx->state_log = path->array;
2969
mctx->input.cur_idx = str_idx;
2971
/* Setup initial node set. */
2972
context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2973
if (str_idx == top_str)
2975
err = re_node_set_init_1 (&next_nodes, top_node);
2976
if (BE (err != REG_NOERROR, 0))
2978
err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2979
if (BE (err != REG_NOERROR, 0))
2981
re_node_set_free (&next_nodes);
2987
cur_state = mctx->state_log[str_idx];
2988
if (cur_state && cur_state->has_backref)
2990
err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2991
if (BE (err != REG_NOERROR, 0))
2995
re_node_set_init_empty (&next_nodes);
2997
if (str_idx == top_str || (cur_state && cur_state->has_backref))
2999
if (next_nodes.nelem)
3001
err = expand_bkref_cache (mctx, &next_nodes, str_idx,
3003
if (BE (err != REG_NOERROR, 0))
3005
re_node_set_free (&next_nodes);
3009
cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
3010
if (BE (cur_state == NULL && err != REG_NOERROR, 0))
3012
re_node_set_free (&next_nodes);
3015
mctx->state_log[str_idx] = cur_state;
3018
for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
3020
re_node_set_empty (&next_nodes);
3021
if (mctx->state_log[str_idx + 1])
3023
err = re_node_set_merge (&next_nodes,
3024
&mctx->state_log[str_idx + 1]->nodes);
3025
if (BE (err != REG_NOERROR, 0))
3027
re_node_set_free (&next_nodes);
3033
err = check_arrival_add_next_nodes (mctx, str_idx,
3034
&cur_state->non_eps_nodes,
3036
if (BE (err != REG_NOERROR, 0))
3038
re_node_set_free (&next_nodes);
3043
if (next_nodes.nelem)
3045
err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
3046
if (BE (err != REG_NOERROR, 0))
3048
re_node_set_free (&next_nodes);
3051
err = expand_bkref_cache (mctx, &next_nodes, str_idx,
3053
if (BE (err != REG_NOERROR, 0))
3055
re_node_set_free (&next_nodes);
3059
context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
3060
cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
3061
if (BE (cur_state == NULL && err != REG_NOERROR, 0))
3063
re_node_set_free (&next_nodes);
3066
mctx->state_log[str_idx] = cur_state;
3067
null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
3069
re_node_set_free (&next_nodes);
3070
cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
3071
: &mctx->state_log[last_str]->nodes);
3072
path->next_idx = str_idx;
3075
mctx->state_log = backup_state_log;
3076
mctx->input.cur_idx = backup_cur_idx;
3078
/* Then check the current node set has the node LAST_NODE. */
3079
if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
3085
/* Helper functions for check_arrival. */
3087
/* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3089
TODO: This function is similar to the functions transit_state*(),
3090
however this function has many additional works.
3091
Can't we unify them? */
3093
static reg_errcode_t
3094
internal_function __attribute_warn_unused_result__
3095
check_arrival_add_next_nodes (re_match_context_t *mctx, Idx str_idx,
3096
re_node_set *cur_nodes, re_node_set *next_nodes)
3098
const re_dfa_t *const dfa = mctx->dfa;
3101
#ifdef RE_ENABLE_I18N
3102
reg_errcode_t err = REG_NOERROR;
3104
re_node_set union_set;
3105
re_node_set_init_empty (&union_set);
3106
for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3109
Idx cur_node = cur_nodes->elems[cur_idx];
3111
re_token_type_t type = dfa->nodes[cur_node].type;
3112
assert (!IS_EPSILON_NODE (type));
3114
#ifdef RE_ENABLE_I18N
3115
/* If the node may accept `multi byte'. */
3116
if (dfa->nodes[cur_node].accept_mb)
3118
naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3122
re_dfastate_t *dest_state;
3123
Idx next_node = dfa->nexts[cur_node];
3124
Idx next_idx = str_idx + naccepted;
3125
dest_state = mctx->state_log[next_idx];
3126
re_node_set_empty (&union_set);
3129
err = re_node_set_merge (&union_set, &dest_state->nodes);
3130
if (BE (err != REG_NOERROR, 0))
3132
re_node_set_free (&union_set);
3136
ok = re_node_set_insert (&union_set, next_node);
3139
re_node_set_free (&union_set);
3142
mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3144
if (BE (mctx->state_log[next_idx] == NULL
3145
&& err != REG_NOERROR, 0))
3147
re_node_set_free (&union_set);
3152
#endif /* RE_ENABLE_I18N */
3154
|| check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3156
ok = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3159
re_node_set_free (&union_set);
3164
re_node_set_free (&union_set);
3168
/* For all the nodes in CUR_NODES, add the epsilon closures of them to
3169
CUR_NODES, however exclude the nodes which are:
3170
- inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3171
- out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3174
static reg_errcode_t
3176
check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
3177
Idx ex_subexp, int type)
3180
Idx idx, outside_node;
3181
re_node_set new_nodes;
3183
assert (cur_nodes->nelem);
3185
err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3186
if (BE (err != REG_NOERROR, 0))
3188
/* Create a new node set NEW_NODES with the nodes which are epsilon
3189
closures of the node in CUR_NODES. */
3191
for (idx = 0; idx < cur_nodes->nelem; ++idx)
3193
Idx cur_node = cur_nodes->elems[idx];
3194
const re_node_set *eclosure = dfa->eclosures + cur_node;
3195
outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3196
if (outside_node == REG_MISSING)
3198
/* There are no problematic nodes, just merge them. */
3199
err = re_node_set_merge (&new_nodes, eclosure);
3200
if (BE (err != REG_NOERROR, 0))
3202
re_node_set_free (&new_nodes);
3208
/* There are problematic nodes, re-calculate incrementally. */
3209
err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3211
if (BE (err != REG_NOERROR, 0))
3213
re_node_set_free (&new_nodes);
3218
re_node_set_free (cur_nodes);
3219
*cur_nodes = new_nodes;
3223
/* Helper function for check_arrival_expand_ecl.
3224
Check incrementally the epsilon closure of TARGET, and if it isn't
3225
problematic append it to DST_NODES. */
3227
static reg_errcode_t
3228
internal_function __attribute_warn_unused_result__
3229
check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,
3230
Idx target, Idx ex_subexp, int type)
3233
for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3237
if (dfa->nodes[cur_node].type == type
3238
&& dfa->nodes[cur_node].opr.idx == ex_subexp)
3240
if (type == OP_CLOSE_SUBEXP)
3242
ok = re_node_set_insert (dst_nodes, cur_node);
3248
ok = re_node_set_insert (dst_nodes, cur_node);
3251
if (dfa->edests[cur_node].nelem == 0)
3253
if (dfa->edests[cur_node].nelem == 2)
3256
err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3257
dfa->edests[cur_node].elems[1],
3259
if (BE (err != REG_NOERROR, 0))
3262
cur_node = dfa->edests[cur_node].elems[0];
3268
/* For all the back references in the current state, calculate the
3269
destination of the back references by the appropriate entry
3270
in MCTX->BKREF_ENTS. */
3272
static reg_errcode_t
3273
internal_function __attribute_warn_unused_result__
3274
expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3275
Idx cur_str, Idx subexp_num, int type)
3277
const re_dfa_t *const dfa = mctx->dfa;
3279
Idx cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3280
struct re_backref_cache_entry *ent;
3282
if (cache_idx_start == REG_MISSING)
3286
ent = mctx->bkref_ents + cache_idx_start;
3289
Idx to_idx, next_node;
3291
/* Is this entry ENT is appropriate? */
3292
if (!re_node_set_contains (cur_nodes, ent->node))
3295
to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3296
/* Calculate the destination of the back reference, and append it
3297
to MCTX->STATE_LOG. */
3298
if (to_idx == cur_str)
3300
/* The backreference did epsilon transit, we must re-check all the
3301
node in the current state. */
3302
re_node_set new_dests;
3303
reg_errcode_t err2, err3;
3304
next_node = dfa->edests[ent->node].elems[0];
3305
if (re_node_set_contains (cur_nodes, next_node))
3307
err = re_node_set_init_1 (&new_dests, next_node);
3308
err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3309
err3 = re_node_set_merge (cur_nodes, &new_dests);
3310
re_node_set_free (&new_dests);
3311
if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3312
|| err3 != REG_NOERROR, 0))
3314
err = (err != REG_NOERROR ? err
3315
: (err2 != REG_NOERROR ? err2 : err3));
3318
/* TODO: It is still inefficient... */
3323
re_node_set union_set;
3324
next_node = dfa->nexts[ent->node];
3325
if (mctx->state_log[to_idx])
3328
if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3331
err = re_node_set_init_copy (&union_set,
3332
&mctx->state_log[to_idx]->nodes);
3333
ok = re_node_set_insert (&union_set, next_node);
3334
if (BE (err != REG_NOERROR || ! ok, 0))
3336
re_node_set_free (&union_set);
3337
err = err != REG_NOERROR ? err : REG_ESPACE;
3343
err = re_node_set_init_1 (&union_set, next_node);
3344
if (BE (err != REG_NOERROR, 0))
3347
mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3348
re_node_set_free (&union_set);
3349
if (BE (mctx->state_log[to_idx] == NULL
3350
&& err != REG_NOERROR, 0))
3354
while (ent++->more);
3358
/* Build transition table for the state.
3359
Return true if successful. */
3363
build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
3368
bool need_word_trtable = false;
3369
bitset_word_t elem, mask;
3370
bool dests_node_malloced = false;
3371
bool dest_states_malloced = false;
3372
Idx ndests; /* Number of the destination states from `state'. */
3373
re_dfastate_t **trtable;
3374
re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3375
re_node_set follows, *dests_node;
3377
bitset_t acceptable;
3381
re_node_set dests_node[SBC_MAX];
3382
bitset_t dests_ch[SBC_MAX];
3385
/* We build DFA states which corresponds to the destination nodes
3386
from `state'. `dests_node[i]' represents the nodes which i-th
3387
destination state contains, and `dests_ch[i]' represents the
3388
characters which i-th destination state accepts. */
3389
if (__libc_use_alloca (sizeof (struct dests_alloc)))
3390
dests_alloc = (struct dests_alloc *) alloca (sizeof (struct dests_alloc));
3393
dests_alloc = re_malloc (struct dests_alloc, 1);
3394
if (BE (dests_alloc == NULL, 0))
3396
dests_node_malloced = true;
3398
dests_node = dests_alloc->dests_node;
3399
dests_ch = dests_alloc->dests_ch;
3401
/* Initialize transiton table. */
3402
state->word_trtable = state->trtable = NULL;
3404
/* At first, group all nodes belonging to `state' into several
3406
ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3407
if (BE (! REG_VALID_NONZERO_INDEX (ndests), 0))
3409
if (dests_node_malloced)
3413
state->trtable = (re_dfastate_t **)
3414
calloc (sizeof (re_dfastate_t *), SBC_MAX);
3420
err = re_node_set_alloc (&follows, ndests + 1);
3421
if (BE (err != REG_NOERROR, 0))
3424
/* Avoid arithmetic overflow in size calculation. */
3425
if (BE ((((SIZE_MAX - (sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX)
3426
/ (3 * sizeof (re_dfastate_t *)))
3431
if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX
3432
+ ndests * 3 * sizeof (re_dfastate_t *)))
3433
dest_states = (re_dfastate_t **)
3434
alloca (ndests * 3 * sizeof (re_dfastate_t *));
3437
dest_states = (re_dfastate_t **)
3438
malloc (ndests * 3 * sizeof (re_dfastate_t *));
3439
if (BE (dest_states == NULL, 0))
3442
if (dest_states_malloced)
3444
re_node_set_free (&follows);
3445
for (i = 0; i < ndests; ++i)
3446
re_node_set_free (dests_node + i);
3447
if (dests_node_malloced)
3451
dest_states_malloced = true;
3453
dest_states_word = dest_states + ndests;
3454
dest_states_nl = dest_states_word + ndests;
3455
bitset_empty (acceptable);
3457
/* Then build the states for all destinations. */
3458
for (i = 0; i < ndests; ++i)
3461
re_node_set_empty (&follows);
3462
/* Merge the follows of this destination states. */
3463
for (j = 0; j < dests_node[i].nelem; ++j)
3465
next_node = dfa->nexts[dests_node[i].elems[j]];
3466
if (next_node != REG_MISSING)
3468
err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3469
if (BE (err != REG_NOERROR, 0))
3473
dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3474
if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3476
/* If the new state has context constraint,
3477
build appropriate states for these contexts. */
3478
if (dest_states[i]->has_constraint)
3480
dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3482
if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3485
if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3486
need_word_trtable = true;
3488
dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3490
if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3495
dest_states_word[i] = dest_states[i];
3496
dest_states_nl[i] = dest_states[i];
3498
bitset_merge (acceptable, dests_ch[i]);
3501
if (!BE (need_word_trtable, 0))
3503
/* We don't care about whether the following character is a word
3504
character, or we are in a single-byte character set so we can
3505
discern by looking at the character code: allocate a
3506
256-entry transition table. */
3507
trtable = state->trtable =
3508
(re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3509
if (BE (trtable == NULL, 0))
3512
/* For all characters ch...: */
3513
for (i = 0; i < BITSET_WORDS; ++i)
3514
for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3516
mask <<= 1, elem >>= 1, ++ch)
3517
if (BE (elem & 1, 0))
3519
/* There must be exactly one destination which accepts
3520
character ch. See group_nodes_into_DFAstates. */
3521
for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3524
/* j-th destination accepts the word character ch. */
3525
if (dfa->word_char[i] & mask)
3526
trtable[ch] = dest_states_word[j];
3528
trtable[ch] = dest_states[j];
3533
/* We care about whether the following character is a word
3534
character, and we are in a multi-byte character set: discern
3535
by looking at the character code: build two 256-entry
3536
transition tables, one starting at trtable[0] and one
3537
starting at trtable[SBC_MAX]. */
3538
trtable = state->word_trtable =
3539
(re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3540
if (BE (trtable == NULL, 0))
3543
/* For all characters ch...: */
3544
for (i = 0; i < BITSET_WORDS; ++i)
3545
for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3547
mask <<= 1, elem >>= 1, ++ch)
3548
if (BE (elem & 1, 0))
3550
/* There must be exactly one destination which accepts
3551
character ch. See group_nodes_into_DFAstates. */
3552
for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3555
/* j-th destination accepts the word character ch. */
3556
trtable[ch] = dest_states[j];
3557
trtable[ch + SBC_MAX] = dest_states_word[j];
3562
if (bitset_contain (acceptable, NEWLINE_CHAR))
3564
/* The current state accepts newline character. */
3565
for (j = 0; j < ndests; ++j)
3566
if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3568
/* k-th destination accepts newline character. */
3569
trtable[NEWLINE_CHAR] = dest_states_nl[j];
3570
if (need_word_trtable)
3571
trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3572
/* There must be only one destination which accepts
3573
newline. See group_nodes_into_DFAstates. */
3578
if (dest_states_malloced)
3581
re_node_set_free (&follows);
3582
for (i = 0; i < ndests; ++i)
3583
re_node_set_free (dests_node + i);
3585
if (dests_node_malloced)
3591
/* Group all nodes belonging to STATE into several destinations.
3592
Then for all destinations, set the nodes belonging to the destination
3593
to DESTS_NODE[i] and set the characters accepted by the destination
3594
to DEST_CH[i]. This function return the number of destinations. */
3598
group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
3599
re_node_set *dests_node, bitset_t *dests_ch)
3604
Idx ndests; /* Number of the destinations from `state'. */
3605
bitset_t accepts; /* Characters a node can accept. */
3606
const re_node_set *cur_nodes = &state->nodes;
3607
bitset_empty (accepts);
3610
/* For all the nodes belonging to `state', */
3611
for (i = 0; i < cur_nodes->nelem; ++i)
3613
re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3614
re_token_type_t type = node->type;
3615
unsigned int constraint = node->constraint;
3617
/* Enumerate all single byte character this node can accept. */
3618
if (type == CHARACTER)
3619
bitset_set (accepts, node->opr.c);
3620
else if (type == SIMPLE_BRACKET)
3622
bitset_merge (accepts, node->opr.sbcset);
3624
else if (type == OP_PERIOD)
3626
#ifdef RE_ENABLE_I18N
3627
if (dfa->mb_cur_max > 1)
3628
bitset_merge (accepts, dfa->sb_char);
3631
bitset_set_all (accepts);
3632
if (!(dfa->syntax & RE_DOT_NEWLINE))
3633
bitset_clear (accepts, '\n');
3634
if (dfa->syntax & RE_DOT_NOT_NULL)
3635
bitset_clear (accepts, '\0');
3637
#ifdef RE_ENABLE_I18N
3638
else if (type == OP_UTF8_PERIOD)
3640
if (ASCII_CHARS % BITSET_WORD_BITS == 0)
3641
memset (accepts, -1, ASCII_CHARS / CHAR_BIT);
3643
bitset_merge (accepts, utf8_sb_map);
3644
if (!(dfa->syntax & RE_DOT_NEWLINE))
3645
bitset_clear (accepts, '\n');
3646
if (dfa->syntax & RE_DOT_NOT_NULL)
3647
bitset_clear (accepts, '\0');
3653
/* Check the `accepts' and sift the characters which are not
3654
match it the context. */
3657
if (constraint & NEXT_NEWLINE_CONSTRAINT)
3659
bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3660
bitset_empty (accepts);
3661
if (accepts_newline)
3662
bitset_set (accepts, NEWLINE_CHAR);
3666
if (constraint & NEXT_ENDBUF_CONSTRAINT)
3668
bitset_empty (accepts);
3672
if (constraint & NEXT_WORD_CONSTRAINT)
3674
bitset_word_t any_set = 0;
3675
if (type == CHARACTER && !node->word_char)
3677
bitset_empty (accepts);
3680
#ifdef RE_ENABLE_I18N
3681
if (dfa->mb_cur_max > 1)
3682
for (j = 0; j < BITSET_WORDS; ++j)
3683
any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3686
for (j = 0; j < BITSET_WORDS; ++j)
3687
any_set |= (accepts[j] &= dfa->word_char[j]);
3691
if (constraint & NEXT_NOTWORD_CONSTRAINT)
3693
bitset_word_t any_set = 0;
3694
if (type == CHARACTER && node->word_char)
3696
bitset_empty (accepts);
3699
#ifdef RE_ENABLE_I18N
3700
if (dfa->mb_cur_max > 1)
3701
for (j = 0; j < BITSET_WORDS; ++j)
3702
any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3705
for (j = 0; j < BITSET_WORDS; ++j)
3706
any_set |= (accepts[j] &= ~dfa->word_char[j]);
3712
/* Then divide `accepts' into DFA states, or create a new
3713
state. Above, we make sure that accepts is not empty. */
3714
for (j = 0; j < ndests; ++j)
3716
bitset_t intersec; /* Intersection sets, see below. */
3718
/* Flags, see below. */
3719
bitset_word_t has_intersec, not_subset, not_consumed;
3721
/* Optimization, skip if this state doesn't accept the character. */
3722
if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3725
/* Enumerate the intersection set of this state and `accepts'. */
3727
for (k = 0; k < BITSET_WORDS; ++k)
3728
has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3729
/* And skip if the intersection set is empty. */
3733
/* Then check if this state is a subset of `accepts'. */
3734
not_subset = not_consumed = 0;
3735
for (k = 0; k < BITSET_WORDS; ++k)
3737
not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3738
not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3741
/* If this state isn't a subset of `accepts', create a
3742
new group state, which has the `remains'. */
3745
bitset_copy (dests_ch[ndests], remains);
3746
bitset_copy (dests_ch[j], intersec);
3747
err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3748
if (BE (err != REG_NOERROR, 0))
3753
/* Put the position in the current group. */
3754
ok = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3758
/* If all characters are consumed, go to next node. */
3762
/* Some characters remain, create a new group. */
3765
bitset_copy (dests_ch[ndests], accepts);
3766
err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3767
if (BE (err != REG_NOERROR, 0))
3770
bitset_empty (accepts);
3775
for (j = 0; j < ndests; ++j)
3776
re_node_set_free (dests_node + j);
3780
#ifdef RE_ENABLE_I18N
3781
/* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3782
Return the number of the bytes the node accepts.
3783
STR_IDX is the current index of the input string.
3785
This function handles the nodes which can accept one character, or
3786
one collating element like '.', '[a-z]', opposite to the other nodes
3787
can only accept one byte. */
3791
check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx,
3792
const re_string_t *input, Idx str_idx)
3794
const re_token_t *node = dfa->nodes + node_idx;
3795
int char_len, elem_len;
3798
if (BE (node->type == OP_UTF8_PERIOD, 0))
3800
unsigned char c = re_string_byte_at (input, str_idx), d;
3801
if (BE (c < 0xc2, 1))
3804
if (str_idx + 2 > input->len)
3807
d = re_string_byte_at (input, str_idx + 1);
3809
return (d < 0x80 || d > 0xbf) ? 0 : 2;
3813
if (c == 0xe0 && d < 0xa0)
3819
if (c == 0xf0 && d < 0x90)
3825
if (c == 0xf8 && d < 0x88)
3831
if (c == 0xfc && d < 0x84)
3837
if (str_idx + char_len > input->len)
3840
for (i = 1; i < char_len; ++i)
3842
d = re_string_byte_at (input, str_idx + i);
3843
if (d < 0x80 || d > 0xbf)
3849
char_len = re_string_char_size_at (input, str_idx);
3850
if (node->type == OP_PERIOD)
3854
/* FIXME: I don't think this if is needed, as both '\n'
3855
and '\0' are char_len == 1. */
3856
/* '.' accepts any one character except the following two cases. */
3857
if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
3858
re_string_byte_at (input, str_idx) == '\n') ||
3859
((dfa->syntax & RE_DOT_NOT_NULL) &&
3860
re_string_byte_at (input, str_idx) == '\0'))
3865
elem_len = re_string_elem_size_at (input, str_idx);
3866
if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3869
if (node->type == COMPLEX_BRACKET)
3871
const re_charset_t *cset = node->opr.mbcset;
3873
const unsigned char *pin
3874
= ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3879
wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3880
? re_string_wchar_at (input, str_idx) : 0);
3882
/* match with multibyte character? */
3883
for (i = 0; i < cset->nmbchars; ++i)
3884
if (wc == cset->mbchars[i])
3886
match_len = char_len;
3887
goto check_node_accept_bytes_match;
3889
/* match with character_class? */
3890
for (i = 0; i < cset->nchar_classes; ++i)
3892
wctype_t wt = cset->char_classes[i];
3893
if (__iswctype (wc, wt))
3895
match_len = char_len;
3896
goto check_node_accept_bytes_match;
3901
nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3904
unsigned int in_collseq = 0;
3905
const int32_t *table, *indirect;
3906
const unsigned char *weights, *extra;
3907
const char *collseqwc;
3909
/* This #include defines a local function! */
3910
# include <locale/weight.h>
3912
/* match with collating_symbol? */
3913
if (cset->ncoll_syms)
3914
extra = (const unsigned char *)
3915
_NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3916
for (i = 0; i < cset->ncoll_syms; ++i)
3918
const unsigned char *coll_sym = extra + cset->coll_syms[i];
3919
/* Compare the length of input collating element and
3920
the length of current collating element. */
3921
if (*coll_sym != elem_len)
3923
/* Compare each bytes. */
3924
for (j = 0; j < *coll_sym; j++)
3925
if (pin[j] != coll_sym[1 + j])
3929
/* Match if every bytes is equal. */
3931
goto check_node_accept_bytes_match;
3937
if (elem_len <= char_len)
3939
collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3940
in_collseq = __collseq_table_lookup (collseqwc, wc);
3943
in_collseq = find_collation_sequence_value (pin, elem_len);
3945
/* match with range expression? */
3946
for (i = 0; i < cset->nranges; ++i)
3947
if (cset->range_starts[i] <= in_collseq
3948
&& in_collseq <= cset->range_ends[i])
3950
match_len = elem_len;
3951
goto check_node_accept_bytes_match;
3954
/* match with equivalence_class? */
3955
if (cset->nequiv_classes)
3957
const unsigned char *cp = pin;
3958
table = (const int32_t *)
3959
_NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3960
weights = (const unsigned char *)
3961
_NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3962
extra = (const unsigned char *)
3963
_NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3964
indirect = (const int32_t *)
3965
_NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3966
int32_t idx = findidx (&cp);
3968
for (i = 0; i < cset->nequiv_classes; ++i)
3970
int32_t equiv_class_idx = cset->equiv_classes[i];
3971
size_t weight_len = weights[idx & 0xffffff];
3972
if (weight_len == weights[equiv_class_idx & 0xffffff]
3973
&& (idx >> 24) == (equiv_class_idx >> 24))
3978
equiv_class_idx &= 0xffffff;
3980
while (cnt <= weight_len
3981
&& (weights[equiv_class_idx + 1 + cnt]
3982
== weights[idx + 1 + cnt]))
3984
if (cnt > weight_len)
3986
match_len = elem_len;
3987
goto check_node_accept_bytes_match;
3996
/* match with range expression? */
3997
#if __GNUC__ >= 2 && ! (__STDC_VERSION__ < 199901L && __STRICT_ANSI__)
3998
wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
4000
wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
4003
for (i = 0; i < cset->nranges; ++i)
4005
cmp_buf[0] = cset->range_starts[i];
4006
cmp_buf[4] = cset->range_ends[i];
4007
if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
4008
&& wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
4010
match_len = char_len;
4011
goto check_node_accept_bytes_match;
4015
check_node_accept_bytes_match:
4016
if (!cset->non_match)
4023
return (elem_len > char_len) ? elem_len : char_len;
4032
find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
4034
uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
4039
/* No valid character. Match it as a single byte character. */
4040
const unsigned char *collseq = (const unsigned char *)
4041
_NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
4042
return collseq[mbs[0]];
4049
const unsigned char *extra = (const unsigned char *)
4050
_NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
4051
int32_t extrasize = (const unsigned char *)
4052
_NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
4054
for (idx = 0; idx < extrasize;)
4058
int32_t elem_mbs_len;
4059
/* Skip the name of collating element name. */
4060
idx = idx + extra[idx] + 1;
4061
elem_mbs_len = extra[idx++];
4062
if (mbs_len == elem_mbs_len)
4064
for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
4065
if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
4067
if (mbs_cnt == elem_mbs_len)
4068
/* Found the entry. */
4071
/* Skip the byte sequence of the collating element. */
4072
idx += elem_mbs_len;
4073
/* Adjust for the alignment. */
4074
idx = (idx + 3) & ~3;
4075
/* Skip the collation sequence value. */
4076
idx += sizeof (uint32_t);
4077
/* Skip the wide char sequence of the collating element. */
4078
idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
4079
/* If we found the entry, return the sequence value. */
4081
return *(uint32_t *) (extra + idx);
4082
/* Skip the collation sequence value. */
4083
idx += sizeof (uint32_t);
4089
#endif /* RE_ENABLE_I18N */
4091
/* Check whether the node accepts the byte which is IDX-th
4092
byte of the INPUT. */
4096
check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
4100
ch = re_string_byte_at (&mctx->input, idx);
4104
if (node->opr.c != ch)
4108
case SIMPLE_BRACKET:
4109
if (!bitset_contain (node->opr.sbcset, ch))
4113
#ifdef RE_ENABLE_I18N
4114
case OP_UTF8_PERIOD:
4115
if (ch >= ASCII_CHARS)
4120
if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
4121
|| (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
4129
if (node->constraint)
4131
/* The node has constraints. Check whether the current context
4132
satisfies the constraints. */
4133
unsigned int context = re_string_context_at (&mctx->input, idx,
4135
if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
4142
/* Extend the buffers, if the buffers have run out. */
4144
static reg_errcode_t
4145
internal_function __attribute_warn_unused_result__
4146
extend_buffers (re_match_context_t *mctx)
4149
re_string_t *pstr = &mctx->input;
4151
/* Avoid overflow. */
4152
if (BE (SIZE_MAX / 2 / sizeof (re_dfastate_t *) <= pstr->bufs_len, 0))
4155
/* Double the lengthes of the buffers. */
4156
ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
4157
if (BE (ret != REG_NOERROR, 0))
4160
if (mctx->state_log != NULL)
4162
/* And double the length of state_log. */
4163
/* XXX We have no indication of the size of this buffer. If this
4164
allocation fail we have no indication that the state_log array
4165
does not have the right size. */
4166
re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4167
pstr->bufs_len + 1);
4168
if (BE (new_array == NULL, 0))
4170
mctx->state_log = new_array;
4173
/* Then reconstruct the buffers. */
4176
#ifdef RE_ENABLE_I18N
4177
if (pstr->mb_cur_max > 1)
4179
ret = build_wcs_upper_buffer (pstr);
4180
if (BE (ret != REG_NOERROR, 0))
4184
#endif /* RE_ENABLE_I18N */
4185
build_upper_buffer (pstr);
4189
#ifdef RE_ENABLE_I18N
4190
if (pstr->mb_cur_max > 1)
4191
build_wcs_buffer (pstr);
4193
#endif /* RE_ENABLE_I18N */
4195
if (pstr->trans != NULL)
4196
re_string_translate_buffer (pstr);
4203
/* Functions for matching context. */
4205
/* Initialize MCTX. */
4207
static reg_errcode_t
4208
internal_function __attribute_warn_unused_result__
4209
match_ctx_init (re_match_context_t *mctx, int eflags, Idx n)
4211
mctx->eflags = eflags;
4212
mctx->match_last = REG_MISSING;
4215
/* Avoid overflow. */
4216
size_t max_object_size =
4217
MAX (sizeof (struct re_backref_cache_entry),
4218
sizeof (re_sub_match_top_t *));
4219
if (BE (SIZE_MAX / max_object_size < n, 0))
4222
mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4223
mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4224
if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4227
/* Already zero-ed by the caller.
4229
mctx->bkref_ents = NULL;
4230
mctx->nbkref_ents = 0;
4231
mctx->nsub_tops = 0; */
4232
mctx->abkref_ents = n;
4233
mctx->max_mb_elem_len = 1;
4234
mctx->asub_tops = n;
4238
/* Clean the entries which depend on the current input in MCTX.
4239
This function must be invoked when the matcher changes the start index
4240
of the input, or changes the input string. */
4244
match_ctx_clean (re_match_context_t *mctx)
4247
for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4250
re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4251
for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4253
re_sub_match_last_t *last = top->lasts[sl_idx];
4254
re_free (last->path.array);
4257
re_free (top->lasts);
4260
re_free (top->path->array);
4261
re_free (top->path);
4266
mctx->nsub_tops = 0;
4267
mctx->nbkref_ents = 0;
4270
/* Free all the memory associated with MCTX. */
4274
match_ctx_free (re_match_context_t *mctx)
4276
/* First, free all the memory associated with MCTX->SUB_TOPS. */
4277
match_ctx_clean (mctx);
4278
re_free (mctx->sub_tops);
4279
re_free (mctx->bkref_ents);
4282
/* Add a new backreference entry to MCTX.
4283
Note that we assume that caller never call this function with duplicate
4284
entry, and call with STR_IDX which isn't smaller than any existing entry.
4287
static reg_errcode_t
4288
internal_function __attribute_warn_unused_result__
4289
match_ctx_add_entry (re_match_context_t *mctx, Idx node, Idx str_idx, Idx from,
4292
if (mctx->nbkref_ents >= mctx->abkref_ents)
4294
struct re_backref_cache_entry* new_entry;
4295
new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4296
mctx->abkref_ents * 2);
4297
if (BE (new_entry == NULL, 0))
4299
re_free (mctx->bkref_ents);
4302
mctx->bkref_ents = new_entry;
4303
memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4304
sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4305
mctx->abkref_ents *= 2;
4307
if (mctx->nbkref_ents > 0
4308
&& mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4309
mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4311
mctx->bkref_ents[mctx->nbkref_ents].node = node;
4312
mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4313
mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4314
mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4316
/* This is a cache that saves negative results of check_dst_limits_calc_pos.
4317
If bit N is clear, means that this entry won't epsilon-transition to
4318
an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression. If
4319
it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4322
A backreference does not epsilon-transition unless it is empty, so set
4323
to all zeros if FROM != TO. */
4324
mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4325
= (from == to ? -1 : 0);
4327
mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4328
if (mctx->max_mb_elem_len < to - from)
4329
mctx->max_mb_elem_len = to - from;
4333
/* Return the first entry with the same str_idx, or REG_MISSING if none is
4334
found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
4338
search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx)
4340
Idx left, right, mid, last;
4341
last = right = mctx->nbkref_ents;
4342
for (left = 0; left < right;)
4344
mid = (left + right) / 2;
4345
if (mctx->bkref_ents[mid].str_idx < str_idx)
4350
if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4356
/* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4359
static reg_errcode_t
4360
internal_function __attribute_warn_unused_result__
4361
match_ctx_add_subtop (re_match_context_t *mctx, Idx node, Idx str_idx)
4364
assert (mctx->sub_tops != NULL);
4365
assert (mctx->asub_tops > 0);
4367
if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4369
Idx new_asub_tops = mctx->asub_tops * 2;
4370
re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4371
re_sub_match_top_t *,
4373
if (BE (new_array == NULL, 0))
4375
mctx->sub_tops = new_array;
4376
mctx->asub_tops = new_asub_tops;
4378
mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4379
if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4381
mctx->sub_tops[mctx->nsub_tops]->node = node;
4382
mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4386
/* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4387
at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */
4389
static re_sub_match_last_t *
4391
match_ctx_add_sublast (re_sub_match_top_t *subtop, Idx node, Idx str_idx)
4393
re_sub_match_last_t *new_entry;
4394
if (BE (subtop->nlasts == subtop->alasts, 0))
4396
Idx new_alasts = 2 * subtop->alasts + 1;
4397
re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4398
re_sub_match_last_t *,
4400
if (BE (new_array == NULL, 0))
4402
subtop->lasts = new_array;
4403
subtop->alasts = new_alasts;
4405
new_entry = calloc (1, sizeof (re_sub_match_last_t));
4406
if (BE (new_entry != NULL, 1))
4408
subtop->lasts[subtop->nlasts] = new_entry;
4409
new_entry->node = node;
4410
new_entry->str_idx = str_idx;
4418
sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
4419
re_dfastate_t **limited_sts, Idx last_node, Idx last_str_idx)
4421
sctx->sifted_states = sifted_sts;
4422
sctx->limited_states = limited_sts;
4423
sctx->last_node = last_node;
4424
sctx->last_str_idx = last_str_idx;
4425
re_node_set_init_empty (&sctx->limits);