1
/* Extended regular expression matching and search library.
2
Copyright (C) 2002-2013 Free Software Foundation, Inc.
3
This file is part of the GNU C Library.
4
Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6
The GNU C Library is free software; you can redistribute it and/or
7
modify it under the terms of the GNU Lesser General Public
8
License as published by the Free Software Foundation; either
9
version 2.1 of the License, or (at your option) any later version.
11
The GNU C Library is distributed in the hope that it will be useful,
12
but WITHOUT ANY WARRANTY; without even the implied warranty of
13
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
Lesser General Public License for more details.
16
You should have received a copy of the GNU Lesser General Public
17
License along with the GNU C Library; if not, see
18
<http://www.gnu.org/licenses/>. */
20
static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
21
int n) internal_function;
22
static void match_ctx_clean (re_match_context_t *mctx) internal_function;
23
static void match_ctx_free (re_match_context_t *cache) internal_function;
24
static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
25
int str_idx, int from, int to)
27
static int search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
29
static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
30
int str_idx) internal_function;
31
static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
32
int node, int str_idx)
34
static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
35
re_dfastate_t **limited_sts, int last_node,
38
static reg_errcode_t re_search_internal (const regex_t *preg,
39
const char *string, int length,
40
int start, int range, int stop,
41
size_t nmatch, regmatch_t pmatch[],
42
int eflags) internal_function;
43
static int re_search_2_stub (struct re_pattern_buffer *bufp,
44
const char *string1, int length1,
45
const char *string2, int length2,
46
int start, int range, struct re_registers *regs,
47
int stop, int ret_len) internal_function;
48
static int re_search_stub (struct re_pattern_buffer *bufp,
49
const char *string, int length, int start,
50
int range, int stop, struct re_registers *regs,
51
int ret_len) internal_function;
52
static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
53
int nregs, int regs_allocated) internal_function;
54
static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
56
static int check_matching (re_match_context_t *mctx, int fl_longest_match,
57
int *p_match_first) internal_function;
58
static int check_halt_state_context (const re_match_context_t *mctx,
59
const re_dfastate_t *state, int idx)
61
static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
62
regmatch_t *prev_idx_match, int cur_node,
63
int cur_idx, int nmatch) internal_function;
64
static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
65
int str_idx, int dest_node, int nregs,
67
re_node_set *eps_via_nodes)
69
static reg_errcode_t set_regs (const regex_t *preg,
70
const re_match_context_t *mctx,
71
size_t nmatch, regmatch_t *pmatch,
72
int fl_backtrack) internal_function;
73
static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs)
77
static int sift_states_iter_mb (const re_match_context_t *mctx,
78
re_sift_context_t *sctx,
79
int node_idx, int str_idx, int max_str_idx)
81
#endif /* RE_ENABLE_I18N */
82
static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
83
re_sift_context_t *sctx)
85
static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
86
re_sift_context_t *sctx, int str_idx,
87
re_node_set *cur_dest)
89
static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
90
re_sift_context_t *sctx,
92
re_node_set *dest_nodes)
94
static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
95
re_node_set *dest_nodes,
96
const re_node_set *candidates)
98
static int check_dst_limits (const re_match_context_t *mctx,
100
int dst_node, int dst_idx, int src_node,
101
int src_idx) internal_function;
102
static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
103
int boundaries, int subexp_idx,
104
int from_node, int bkref_idx)
106
static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
107
int limit, int subexp_idx,
108
int node, int str_idx,
109
int bkref_idx) internal_function;
110
static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
111
re_node_set *dest_nodes,
112
const re_node_set *candidates,
114
struct re_backref_cache_entry *bkref_ents,
115
int str_idx) internal_function;
116
static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
117
re_sift_context_t *sctx,
118
int str_idx, const re_node_set *candidates)
120
static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
122
re_dfastate_t **src, int num)
124
static re_dfastate_t *find_recover_state (reg_errcode_t *err,
125
re_match_context_t *mctx) internal_function;
126
static re_dfastate_t *transit_state (reg_errcode_t *err,
127
re_match_context_t *mctx,
128
re_dfastate_t *state) internal_function;
129
static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
130
re_match_context_t *mctx,
131
re_dfastate_t *next_state)
133
static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
134
re_node_set *cur_nodes,
135
int str_idx) internal_function;
137
static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
138
re_match_context_t *mctx,
139
re_dfastate_t *pstate)
142
#ifdef RE_ENABLE_I18N
143
static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
144
re_dfastate_t *pstate)
146
#endif /* RE_ENABLE_I18N */
147
static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
148
const re_node_set *nodes)
150
static reg_errcode_t get_subexp (re_match_context_t *mctx,
151
int bkref_node, int bkref_str_idx)
153
static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
154
const re_sub_match_top_t *sub_top,
155
re_sub_match_last_t *sub_last,
156
int bkref_node, int bkref_str)
158
static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
159
int subexp_idx, int type) internal_function;
160
static reg_errcode_t check_arrival (re_match_context_t *mctx,
161
state_array_t *path, int top_node,
162
int top_str, int last_node, int last_str,
163
int type) internal_function;
164
static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
166
re_node_set *cur_nodes,
167
re_node_set *next_nodes)
169
static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
170
re_node_set *cur_nodes,
171
int ex_subexp, int type)
173
static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
174
re_node_set *dst_nodes,
175
int target, int ex_subexp,
176
int type) internal_function;
177
static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
178
re_node_set *cur_nodes, int cur_str,
179
int subexp_num, int type)
181
static int build_trtable (const re_dfa_t *dfa,
182
re_dfastate_t *state) internal_function;
183
#ifdef RE_ENABLE_I18N
184
static int check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
185
const re_string_t *input, int idx)
188
static unsigned int find_collation_sequence_value (const unsigned char *mbs,
192
#endif /* RE_ENABLE_I18N */
193
static int group_nodes_into_DFAstates (const re_dfa_t *dfa,
194
const re_dfastate_t *state,
195
re_node_set *states_node,
196
bitset_t *states_ch) internal_function;
197
static int check_node_accept (const re_match_context_t *mctx,
198
const re_token_t *node, int idx)
200
static reg_errcode_t extend_buffers (re_match_context_t *mctx, int min_len)
204
#undef MIN /* safety */
206
MIN(size_t a, size_t b)
208
return (a < b ? a : b);
212
/* Entry point for POSIX code. */
214
/* regexec searches for a given pattern, specified by PREG, in the
217
If NMATCH is zero or REG_NOSUB was set in the cflags argument to
218
`regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
219
least NMATCH elements, and we set them to the offsets of the
220
corresponding matched substrings.
222
EFLAGS specifies `execution flags' which affect matching: if
223
REG_NOTBOL is set, then ^ does not match at the beginning of the
224
string; if REG_NOTEOL is set, then $ does not match at the end.
226
We return 0 if we find a match and REG_NOMATCH if not. */
229
regexec (preg, string, nmatch, pmatch, eflags)
230
const regex_t *__restrict preg;
231
const char *__restrict string;
239
if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
242
if (eflags & REG_STARTEND)
244
start = pmatch[0].rm_so;
245
length = pmatch[0].rm_eo;
250
length = strlen (string);
253
__libc_lock_lock (dfa->lock);
255
err = re_search_internal (preg, string, length, start, length - start,
256
length, 0, NULL, eflags);
258
err = re_search_internal (preg, string, length, start, length - start,
259
length, nmatch, pmatch, eflags);
260
__libc_lock_unlock (dfa->lock);
261
return err != REG_NOERROR;
265
# include <shlib-compat.h>
266
versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
268
# if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
269
__typeof__ (__regexec) __compat_regexec;
272
attribute_compat_text_section
273
__compat_regexec (const regex_t *__restrict preg,
274
const char *__restrict string, size_t nmatch,
275
regmatch_t pmatch[], int eflags)
277
return regexec (preg, string, nmatch, pmatch,
278
eflags & (REG_NOTBOL | REG_NOTEOL));
280
compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
284
/* Entry points for GNU code. */
286
/* re_match, re_search, re_match_2, re_search_2
288
The former two functions operate on STRING with length LENGTH,
289
while the later two operate on concatenation of STRING1 and STRING2
290
with lengths LENGTH1 and LENGTH2, respectively.
292
re_match() matches the compiled pattern in BUFP against the string,
293
starting at index START.
295
re_search() first tries matching at index START, then it tries to match
296
starting from index START + 1, and so on. The last start position tried
297
is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
300
The parameter STOP of re_{match,search}_2 specifies that no match exceeding
301
the first STOP characters of the concatenation of the strings should be
304
If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
305
and all groups is stroed in REGS. (For the "_2" variants, the offsets are
306
computed relative to the concatenation, not relative to the individual
309
On success, re_match* functions return the length of the match, re_search*
310
return the position of the start of the match. Return value -1 means no
311
match was found and -2 indicates an internal error. */
314
re_match (bufp, string, length, start, regs)
315
struct re_pattern_buffer *bufp;
318
struct re_registers *regs;
320
return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
323
weak_alias (__re_match, re_match)
327
re_search (bufp, string, length, start, range, regs)
328
struct re_pattern_buffer *bufp;
330
int length, start, range;
331
struct re_registers *regs;
333
return re_search_stub (bufp, string, length, start, range, length, regs, 0);
336
weak_alias (__re_search, re_search)
340
re_match_2 (bufp, string1, length1, string2, length2, start, regs, stop)
341
struct re_pattern_buffer *bufp;
342
const char *string1, *string2;
343
int length1, length2, start, stop;
344
struct re_registers *regs;
346
return re_search_2_stub (bufp, string1, length1, string2, length2,
347
start, 0, regs, stop, 1);
350
weak_alias (__re_match_2, re_match_2)
354
re_search_2 (bufp, string1, length1, string2, length2, start, range, regs, stop)
355
struct re_pattern_buffer *bufp;
356
const char *string1, *string2;
357
int length1, length2, start, range, stop;
358
struct re_registers *regs;
360
return re_search_2_stub (bufp, string1, length1, string2, length2,
361
start, range, regs, stop, 0);
364
weak_alias (__re_search_2, re_search_2)
368
re_search_2_stub (bufp, string1, length1, string2, length2, start, range, regs,
370
struct re_pattern_buffer *bufp;
371
const char *string1, *string2;
372
int length1, length2, start, range, stop, ret_len;
373
struct re_registers *regs;
377
int len = length1 + length2;
380
if (BE (length1 < 0 || length2 < 0 || stop < 0 || len < length1, 0))
383
/* Concatenate the strings. */
387
s = re_malloc (char, len);
389
if (BE (s == NULL, 0))
392
memcpy (__mempcpy (s, string1, length1), string2, length2);
394
memcpy (s, string1, length1);
395
memcpy (s + length1, string2, length2);
404
rval = re_search_stub (bufp, str, len, start, range, stop, regs, ret_len);
409
/* The parameters have the same meaning as those of re_search.
410
Additional parameters:
411
If RET_LEN is nonzero the length of the match is returned (re_match style);
412
otherwise the position of the match is returned. */
415
re_search_stub (bufp, string, length, start, range, stop, regs, ret_len)
416
struct re_pattern_buffer *bufp;
418
int length, start, range, stop, ret_len;
419
struct re_registers *regs;
421
reg_errcode_t result;
426
/* Check for out-of-range. */
427
if (BE (start < 0 || start > length, 0))
429
if (BE (start + range > length, 0))
430
range = length - start;
431
else if (BE (start + range < 0, 0))
434
__libc_lock_lock (dfa->lock);
436
eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
437
eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
439
/* Compile fastmap if we haven't yet. */
440
if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
441
re_compile_fastmap (bufp);
443
if (BE (bufp->no_sub, 0))
446
/* We need at least 1 register. */
449
else if (BE (bufp->regs_allocated == REGS_FIXED &&
450
regs->num_regs < bufp->re_nsub + 1, 0))
452
nregs = regs->num_regs;
453
if (BE (nregs < 1, 0))
455
/* Nothing can be copied to regs. */
461
nregs = bufp->re_nsub + 1;
462
pmatch = re_malloc (regmatch_t, nregs);
463
if (BE (pmatch == NULL, 0))
469
result = re_search_internal (bufp, string, length, start, range, stop,
470
nregs, pmatch, eflags);
474
/* I hope we needn't fill ther regs with -1's when no match was found. */
475
if (result != REG_NOERROR)
477
else if (regs != NULL)
479
/* If caller wants register contents data back, copy them. */
480
bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
481
bufp->regs_allocated);
482
if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
486
if (BE (rval == 0, 1))
490
assert (pmatch[0].rm_so == start);
491
rval = pmatch[0].rm_eo - start;
494
rval = pmatch[0].rm_so;
498
__libc_lock_unlock (dfa->lock);
503
re_copy_regs (regs, pmatch, nregs, regs_allocated)
504
struct re_registers *regs;
506
int nregs, regs_allocated;
508
int rval = REGS_REALLOCATE;
510
int need_regs = nregs + 1;
511
/* We need one extra element beyond `num_regs' for the `-1' marker GNU code
514
/* Have the register data arrays been allocated? */
515
if (regs_allocated == REGS_UNALLOCATED)
516
{ /* No. So allocate them with malloc. */
517
regs->start = re_malloc (regoff_t, need_regs);
518
if (BE (regs->start == NULL, 0))
519
return REGS_UNALLOCATED;
520
regs->end = re_malloc (regoff_t, need_regs);
521
if (BE (regs->end == NULL, 0))
523
re_free (regs->start);
524
return REGS_UNALLOCATED;
526
regs->num_regs = need_regs;
528
else if (regs_allocated == REGS_REALLOCATE)
529
{ /* Yes. If we need more elements than were already
530
allocated, reallocate them. If we need fewer, just
532
if (BE (need_regs > regs->num_regs, 0))
534
regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
536
if (BE (new_start == NULL, 0))
537
return REGS_UNALLOCATED;
538
new_end = re_realloc (regs->end, regoff_t, need_regs);
539
if (BE (new_end == NULL, 0))
542
return REGS_UNALLOCATED;
544
regs->start = new_start;
546
regs->num_regs = need_regs;
551
assert (regs_allocated == REGS_FIXED);
552
/* This function may not be called with REGS_FIXED and nregs too big. */
553
assert (regs->num_regs >= nregs);
558
for (i = 0; i < nregs; ++i)
560
regs->start[i] = pmatch[i].rm_so;
561
regs->end[i] = pmatch[i].rm_eo;
563
for ( ; i < regs->num_regs; ++i)
564
regs->start[i] = regs->end[i] = -1;
569
/* Set REGS to hold NUM_REGS registers, storing them in STARTS and
570
ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
571
this memory for recording register information. STARTS and ENDS
572
must be allocated using the malloc library routine, and must each
573
be at least NUM_REGS * sizeof (regoff_t) bytes long.
575
If NUM_REGS == 0, then subsequent matches should allocate their own
578
Unless this function is called, the first search or match using
579
PATTERN_BUFFER will allocate its own register data, without
580
freeing the old data. */
583
re_set_registers (bufp, regs, num_regs, starts, ends)
584
struct re_pattern_buffer *bufp;
585
struct re_registers *regs;
587
regoff_t *starts, *ends;
591
bufp->regs_allocated = REGS_REALLOCATE;
592
regs->num_regs = num_regs;
593
regs->start = starts;
598
bufp->regs_allocated = REGS_UNALLOCATED;
600
regs->start = regs->end = (regoff_t *) 0;
604
weak_alias (__re_set_registers, re_set_registers)
607
/* Entry points compatible with 4.2 BSD regex library. We don't define
608
them unless specifically requested. */
610
#if defined _REGEX_RE_COMP || defined _LIBC
618
return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
620
#endif /* _REGEX_RE_COMP */
622
/* Internal entry point. */
624
/* Searches for a compiled pattern PREG in the string STRING, whose
625
length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
626
mingings with regexec. START, and RANGE have the same meanings
628
Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
629
otherwise return the error code.
630
Note: We assume front end functions already check ranges.
631
(START + RANGE >= 0 && START + RANGE <= LENGTH) */
634
__attribute_warn_unused_result__
635
re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
639
int length, start, range, stop, eflags;
644
const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
645
int left_lim, right_lim, incr;
646
int fl_longest_match, match_first, match_kind, match_last = -1;
649
#if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
650
re_match_context_t mctx = { .dfa = dfa };
652
re_match_context_t mctx;
654
char *fastmap = (preg->fastmap != NULL && preg->fastmap_accurate
655
&& range && !preg->can_be_null) ? preg->fastmap : NULL;
656
RE_TRANSLATE_TYPE t = preg->translate;
658
#if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
659
memset (&mctx, '\0', sizeof (re_match_context_t));
663
extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
664
nmatch -= extra_nmatch;
666
/* Check if the DFA haven't been compiled. */
667
if (BE (preg->used == 0 || dfa == NULL || dfa->init_state == NULL
668
|| dfa->init_state_word == NULL || dfa->init_state_nl == NULL
669
|| dfa->init_state_begbuf == NULL, 0))
673
/* We assume front-end functions already check them. */
674
assert (start + range >= 0 && start + range <= length);
677
/* If initial states with non-begbuf contexts have no elements,
678
the regex must be anchored. If preg->newline_anchor is set,
679
we'll never use init_state_nl, so do not check it. */
680
if (dfa->init_state->nodes.nelem == 0
681
&& dfa->init_state_word->nodes.nelem == 0
682
&& (dfa->init_state_nl->nodes.nelem == 0
683
|| !preg->newline_anchor))
685
if (start != 0 && start + range != 0)
690
/* We must check the longest matching, if nmatch > 0. */
691
fl_longest_match = (nmatch != 0 || dfa->nbackref);
693
err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
694
preg->translate, preg->syntax & RE_ICASE, dfa);
695
if (BE (err != REG_NOERROR, 0))
697
mctx.input.stop = stop;
698
mctx.input.raw_stop = stop;
699
mctx.input.newline_anchor = preg->newline_anchor;
701
err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
702
if (BE (err != REG_NOERROR, 0))
705
/* We will log all the DFA states through which the dfa pass,
706
if nmatch > 1, or this dfa has "multibyte node", which is a
707
back-reference or a node which can accept multibyte character or
708
multi character collating element. */
709
if (nmatch > 1 || dfa->has_mb_node)
711
/* Avoid overflow. */
712
if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= mctx.input.bufs_len, 0))
718
mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
719
if (BE (mctx.state_log == NULL, 0))
726
mctx.state_log = NULL;
729
mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
730
: CONTEXT_NEWLINE | CONTEXT_BEGBUF;
732
/* Check incrementally whether of not the input string match. */
733
incr = (range < 0) ? -1 : 1;
734
left_lim = (range < 0) ? start + range : start;
735
right_lim = (range < 0) ? start : start + range;
736
sb = dfa->mb_cur_max == 1;
739
? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
740
| (range >= 0 ? 2 : 0)
741
| (t != NULL ? 1 : 0))
744
for (;; match_first += incr)
747
if (match_first < left_lim || right_lim < match_first)
750
/* Advance as rapidly as possible through the string, until we
751
find a plausible place to start matching. This may be done
752
with varying efficiency, so there are various possibilities:
753
only the most common of them are specialized, in order to
754
save on code size. We use a switch statement for speed. */
762
/* Fastmap with single-byte translation, match forward. */
763
while (BE (match_first < right_lim, 1)
764
&& !fastmap[t[(unsigned char) string[match_first]]])
766
goto forward_match_found_start_or_reached_end;
769
/* Fastmap without translation, match forward. */
770
while (BE (match_first < right_lim, 1)
771
&& !fastmap[(unsigned char) string[match_first]])
774
forward_match_found_start_or_reached_end:
775
if (BE (match_first == right_lim, 0))
777
ch = match_first >= length
778
? 0 : (unsigned char) string[match_first];
779
if (!fastmap[t ? t[ch] : ch])
786
/* Fastmap without multi-byte translation, match backwards. */
787
while (match_first >= left_lim)
789
ch = match_first >= length
790
? 0 : (unsigned char) string[match_first];
791
if (fastmap[t ? t[ch] : ch])
795
if (match_first < left_lim)
800
/* In this case, we can't determine easily the current byte,
801
since it might be a component byte of a multibyte
802
character. Then we use the constructed buffer instead. */
805
/* If MATCH_FIRST is out of the valid range, reconstruct the
807
unsigned int offset = match_first - mctx.input.raw_mbs_idx;
808
if (BE (offset >= (unsigned int) mctx.input.valid_raw_len, 0))
810
err = re_string_reconstruct (&mctx.input, match_first,
812
if (BE (err != REG_NOERROR, 0))
815
offset = match_first - mctx.input.raw_mbs_idx;
817
/* If MATCH_FIRST is out of the buffer, leave it as '\0'.
818
Note that MATCH_FIRST must not be smaller than 0. */
819
ch = (match_first >= length
820
? 0 : re_string_byte_at (&mctx.input, offset));
824
if (match_first < left_lim || match_first > right_lim)
833
/* Reconstruct the buffers so that the matcher can assume that
834
the matching starts from the beginning of the buffer. */
835
err = re_string_reconstruct (&mctx.input, match_first, eflags);
836
if (BE (err != REG_NOERROR, 0))
839
#ifdef RE_ENABLE_I18N
840
/* Don't consider this char as a possible match start if it part,
841
yet isn't the head, of a multibyte character. */
842
if (!sb && !re_string_first_byte (&mctx.input, 0))
846
/* It seems to be appropriate one, then use the matcher. */
847
/* We assume that the matching starts from 0. */
848
mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
849
match_last = check_matching (&mctx, fl_longest_match,
850
range >= 0 ? &match_first : NULL);
851
if (match_last != -1)
853
if (BE (match_last == -2, 0))
860
mctx.match_last = match_last;
861
if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
863
re_dfastate_t *pstate = mctx.state_log[match_last];
864
mctx.last_node = check_halt_state_context (&mctx, pstate,
867
if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
870
err = prune_impossible_nodes (&mctx);
871
if (err == REG_NOERROR)
873
if (BE (err != REG_NOMATCH, 0))
878
break; /* We found a match. */
882
match_ctx_clean (&mctx);
886
assert (match_last != -1);
887
assert (err == REG_NOERROR);
890
/* Set pmatch[] if we need. */
895
/* Initialize registers. */
896
for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
897
pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
899
/* Set the points where matching start/end. */
901
pmatch[0].rm_eo = mctx.match_last;
903
if (!preg->no_sub && nmatch > 1)
905
err = set_regs (preg, &mctx, nmatch, pmatch,
906
dfa->has_plural_match && dfa->nbackref > 0);
907
if (BE (err != REG_NOERROR, 0))
911
/* At last, add the offset to the each registers, since we slided
912
the buffers so that we could assume that the matching starts
914
for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
915
if (pmatch[reg_idx].rm_so != -1)
917
#ifdef RE_ENABLE_I18N
918
if (BE (mctx.input.offsets_needed != 0, 0))
920
pmatch[reg_idx].rm_so =
921
(pmatch[reg_idx].rm_so == mctx.input.valid_len
922
? mctx.input.valid_raw_len
923
: mctx.input.offsets[pmatch[reg_idx].rm_so]);
924
pmatch[reg_idx].rm_eo =
925
(pmatch[reg_idx].rm_eo == mctx.input.valid_len
926
? mctx.input.valid_raw_len
927
: mctx.input.offsets[pmatch[reg_idx].rm_eo]);
930
assert (mctx.input.offsets_needed == 0);
932
pmatch[reg_idx].rm_so += match_first;
933
pmatch[reg_idx].rm_eo += match_first;
935
for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
937
pmatch[nmatch + reg_idx].rm_so = -1;
938
pmatch[nmatch + reg_idx].rm_eo = -1;
942
for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
943
if (dfa->subexp_map[reg_idx] != reg_idx)
945
pmatch[reg_idx + 1].rm_so
946
= pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
947
pmatch[reg_idx + 1].rm_eo
948
= pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
953
re_free (mctx.state_log);
955
match_ctx_free (&mctx);
956
re_string_destruct (&mctx.input);
961
__attribute_warn_unused_result__
962
prune_impossible_nodes (mctx)
963
re_match_context_t *mctx;
965
const re_dfa_t *const dfa = mctx->dfa;
966
int halt_node, match_last;
968
re_dfastate_t **sifted_states;
969
re_dfastate_t **lim_states = NULL;
970
re_sift_context_t sctx;
972
assert (mctx->state_log != NULL);
974
match_last = mctx->match_last;
975
halt_node = mctx->last_node;
977
/* Avoid overflow. */
978
if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= match_last, 0))
981
sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
982
if (BE (sifted_states == NULL, 0))
989
lim_states = re_malloc (re_dfastate_t *, match_last + 1);
990
if (BE (lim_states == NULL, 0))
997
memset (lim_states, '\0',
998
sizeof (re_dfastate_t *) * (match_last + 1));
999
sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
1001
ret = sift_states_backward (mctx, &sctx);
1002
re_node_set_free (&sctx.limits);
1003
if (BE (ret != REG_NOERROR, 0))
1005
if (sifted_states[0] != NULL || lim_states[0] != NULL)
1015
} while (mctx->state_log[match_last] == NULL
1016
|| !mctx->state_log[match_last]->halt);
1017
halt_node = check_halt_state_context (mctx,
1018
mctx->state_log[match_last],
1021
ret = merge_state_array (dfa, sifted_states, lim_states,
1023
re_free (lim_states);
1025
if (BE (ret != REG_NOERROR, 0))
1030
sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
1031
ret = sift_states_backward (mctx, &sctx);
1032
re_node_set_free (&sctx.limits);
1033
if (BE (ret != REG_NOERROR, 0))
1035
if (sifted_states[0] == NULL)
1041
re_free (mctx->state_log);
1042
mctx->state_log = sifted_states;
1043
sifted_states = NULL;
1044
mctx->last_node = halt_node;
1045
mctx->match_last = match_last;
1048
re_free (sifted_states);
1049
re_free (lim_states);
1053
/* Acquire an initial state and return it.
1054
We must select appropriate initial state depending on the context,
1055
since initial states may have constraints like "\<", "^", etc.. */
1057
static inline re_dfastate_t *
1058
__attribute ((always_inline)) internal_function
1059
acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
1062
const re_dfa_t *const dfa = mctx->dfa;
1063
if (dfa->init_state->has_constraint)
1065
unsigned int context;
1066
context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1067
if (IS_WORD_CONTEXT (context))
1068
return dfa->init_state_word;
1069
else if (IS_ORDINARY_CONTEXT (context))
1070
return dfa->init_state;
1071
else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1072
return dfa->init_state_begbuf;
1073
else if (IS_NEWLINE_CONTEXT (context))
1074
return dfa->init_state_nl;
1075
else if (IS_BEGBUF_CONTEXT (context))
1077
/* It is relatively rare case, then calculate on demand. */
1078
return re_acquire_state_context (err, dfa,
1079
dfa->init_state->entrance_nodes,
1083
/* Must not happen? */
1084
return dfa->init_state;
1087
return dfa->init_state;
1090
/* Check whether the regular expression match input string INPUT or not,
1091
and return the index where the matching end, return -1 if not match,
1092
or return -2 in case of an error.
1093
FL_LONGEST_MATCH means we want the POSIX longest matching.
1094
If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1095
next place where we may want to try matching.
1096
Note that the matcher assume that the maching starts from the current
1097
index of the buffer. */
1100
internal_function __attribute_warn_unused_result__
1101
check_matching (re_match_context_t *mctx, int fl_longest_match,
1104
const re_dfa_t *const dfa = mctx->dfa;
1107
int match_last = -1;
1108
int cur_str_idx = re_string_cur_idx (&mctx->input);
1109
re_dfastate_t *cur_state;
1110
int at_init_state = p_match_first != NULL;
1111
int next_start_idx = cur_str_idx;
1114
cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1115
/* An initial state must not be NULL (invalid). */
1116
if (BE (cur_state == NULL, 0))
1118
assert (err == REG_ESPACE);
1122
if (mctx->state_log != NULL)
1124
mctx->state_log[cur_str_idx] = cur_state;
1126
/* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1127
later. E.g. Processing back references. */
1128
if (BE (dfa->nbackref, 0))
1131
err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1132
if (BE (err != REG_NOERROR, 0))
1135
if (cur_state->has_backref)
1137
err = transit_state_bkref (mctx, &cur_state->nodes);
1138
if (BE (err != REG_NOERROR, 0))
1144
/* If the RE accepts NULL string. */
1145
if (BE (cur_state->halt, 0))
1147
if (!cur_state->has_constraint
1148
|| check_halt_state_context (mctx, cur_state, cur_str_idx))
1150
if (!fl_longest_match)
1154
match_last = cur_str_idx;
1160
while (!re_string_eoi (&mctx->input))
1162
re_dfastate_t *old_state = cur_state;
1163
int next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1165
if ((BE (next_char_idx >= mctx->input.bufs_len, 0)
1166
&& mctx->input.bufs_len < mctx->input.len)
1167
|| (BE (next_char_idx >= mctx->input.valid_len, 0)
1168
&& mctx->input.valid_len < mctx->input.len))
1170
err = extend_buffers (mctx, next_char_idx + 1);
1171
if (BE (err != REG_NOERROR, 0))
1173
assert (err == REG_ESPACE);
1178
cur_state = transit_state (&err, mctx, cur_state);
1179
if (mctx->state_log != NULL)
1180
cur_state = merge_state_with_log (&err, mctx, cur_state);
1182
if (cur_state == NULL)
1184
/* Reached the invalid state or an error. Try to recover a valid
1185
state using the state log, if available and if we have not
1186
already found a valid (even if not the longest) match. */
1187
if (BE (err != REG_NOERROR, 0))
1190
if (mctx->state_log == NULL
1191
|| (match && !fl_longest_match)
1192
|| (cur_state = find_recover_state (&err, mctx)) == NULL)
1196
if (BE (at_init_state, 0))
1198
if (old_state == cur_state)
1199
next_start_idx = next_char_idx;
1204
if (cur_state->halt)
1206
/* Reached a halt state.
1207
Check the halt state can satisfy the current context. */
1208
if (!cur_state->has_constraint
1209
|| check_halt_state_context (mctx, cur_state,
1210
re_string_cur_idx (&mctx->input)))
1212
/* We found an appropriate halt state. */
1213
match_last = re_string_cur_idx (&mctx->input);
1216
/* We found a match, do not modify match_first below. */
1217
p_match_first = NULL;
1218
if (!fl_longest_match)
1225
*p_match_first += next_start_idx;
1230
/* Check NODE match the current context. */
1234
check_halt_node_context (const re_dfa_t *dfa, int node, unsigned int context)
1236
re_token_type_t type = dfa->nodes[node].type;
1237
unsigned int constraint = dfa->nodes[node].constraint;
1238
if (type != END_OF_RE)
1242
if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1247
/* Check the halt state STATE match the current context.
1248
Return 0 if not match, if the node, STATE has, is a halt node and
1249
match the context, return the node. */
1253
check_halt_state_context (const re_match_context_t *mctx,
1254
const re_dfastate_t *state, int idx)
1257
unsigned int context;
1259
assert (state->halt);
1261
context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1262
for (i = 0; i < state->nodes.nelem; ++i)
1263
if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1264
return state->nodes.elems[i];
1268
/* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1269
corresponding to the DFA).
1270
Return the destination node, and update EPS_VIA_NODES, return -1 in case
1275
proceed_next_node (const re_match_context_t *mctx, int nregs, regmatch_t *regs,
1276
int *pidx, int node, re_node_set *eps_via_nodes,
1277
struct re_fail_stack_t *fs)
1279
const re_dfa_t *const dfa = mctx->dfa;
1281
if (IS_EPSILON_NODE (dfa->nodes[node].type))
1283
re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1284
re_node_set *edests = &dfa->edests[node];
1286
err = re_node_set_insert (eps_via_nodes, node);
1287
if (BE (err < 0, 0))
1289
/* Pick up a valid destination, or return -1 if none is found. */
1290
for (dest_node = -1, i = 0; i < edests->nelem; ++i)
1292
int candidate = edests->elems[i];
1293
if (!re_node_set_contains (cur_nodes, candidate))
1295
if (dest_node == -1)
1296
dest_node = candidate;
1300
/* In order to avoid infinite loop like "(a*)*", return the second
1301
epsilon-transition if the first was already considered. */
1302
if (re_node_set_contains (eps_via_nodes, dest_node))
1305
/* Otherwise, push the second epsilon-transition on the fail stack. */
1307
&& push_fail_stack (fs, *pidx, candidate, nregs, regs,
1311
/* We know we are going to exit. */
1320
re_token_type_t type = dfa->nodes[node].type;
1322
#ifdef RE_ENABLE_I18N
1323
if (dfa->nodes[node].accept_mb)
1324
naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1326
#endif /* RE_ENABLE_I18N */
1327
if (type == OP_BACK_REF)
1329
int subexp_idx = dfa->nodes[node].opr.idx + 1;
1330
naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1333
if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1337
char *buf = (char *) re_string_get_buffer (&mctx->input);
1338
if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1347
err = re_node_set_insert (eps_via_nodes, node);
1348
if (BE (err < 0, 0))
1350
dest_node = dfa->edests[node].elems[0];
1351
if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1358
|| check_node_accept (mctx, dfa->nodes + node, *pidx))
1360
int dest_node = dfa->nexts[node];
1361
*pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1362
if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1363
|| !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1366
re_node_set_empty (eps_via_nodes);
1373
static reg_errcode_t
1374
internal_function __attribute_warn_unused_result__
1375
push_fail_stack (struct re_fail_stack_t *fs, int str_idx, int dest_node,
1376
int nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1379
int num = fs->num++;
1380
if (fs->num == fs->alloc)
1382
struct re_fail_stack_ent_t *new_array;
1383
new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
1385
if (new_array == NULL)
1388
fs->stack = new_array;
1390
fs->stack[num].idx = str_idx;
1391
fs->stack[num].node = dest_node;
1392
fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1393
if (fs->stack[num].regs == NULL)
1395
memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1396
err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1402
pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs,
1403
regmatch_t *regs, re_node_set *eps_via_nodes)
1405
int num = --fs->num;
1407
*pidx = fs->stack[num].idx;
1408
memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1409
re_node_set_free (eps_via_nodes);
1410
re_free (fs->stack[num].regs);
1411
*eps_via_nodes = fs->stack[num].eps_via_nodes;
1412
return fs->stack[num].node;
1415
/* Set the positions where the subexpressions are starts/ends to registers
1417
Note: We assume that pmatch[0] is already set, and
1418
pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */
1420
static reg_errcode_t
1421
internal_function __attribute_warn_unused_result__
1422
set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
1423
regmatch_t *pmatch, int fl_backtrack)
1425
const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
1427
re_node_set eps_via_nodes;
1428
struct re_fail_stack_t *fs;
1429
struct re_fail_stack_t fs_body = { 0, 2, NULL };
1430
regmatch_t *prev_idx_match;
1431
int prev_idx_match_malloced = 0;
1434
assert (nmatch > 1);
1435
assert (mctx->state_log != NULL);
1440
fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1441
if (fs->stack == NULL)
1447
cur_node = dfa->init_node;
1448
re_node_set_init_empty (&eps_via_nodes);
1451
if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
1452
prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
1456
prev_idx_match = re_malloc (regmatch_t, nmatch);
1457
if (prev_idx_match == NULL)
1459
free_fail_stack_return (fs);
1462
prev_idx_match_malloced = 1;
1464
memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1466
for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1468
update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1470
if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1475
for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1476
if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1478
if (reg_idx == nmatch)
1480
re_node_set_free (&eps_via_nodes);
1481
if (prev_idx_match_malloced)
1482
re_free (prev_idx_match);
1483
return free_fail_stack_return (fs);
1485
cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1490
re_node_set_free (&eps_via_nodes);
1491
if (prev_idx_match_malloced)
1492
re_free (prev_idx_match);
1497
/* Proceed to next node. */
1498
cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1499
&eps_via_nodes, fs);
1501
if (BE (cur_node < 0, 0))
1503
if (BE (cur_node == -2, 0))
1505
re_node_set_free (&eps_via_nodes);
1506
if (prev_idx_match_malloced)
1507
re_free (prev_idx_match);
1508
free_fail_stack_return (fs);
1512
cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1516
re_node_set_free (&eps_via_nodes);
1517
if (prev_idx_match_malloced)
1518
re_free (prev_idx_match);
1523
re_node_set_free (&eps_via_nodes);
1524
if (prev_idx_match_malloced)
1525
re_free (prev_idx_match);
1526
return free_fail_stack_return (fs);
1529
static reg_errcode_t
1531
free_fail_stack_return (struct re_fail_stack_t *fs)
1536
for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1538
re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1539
re_free (fs->stack[fs_idx].regs);
1541
re_free (fs->stack);
1548
update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
1549
regmatch_t *prev_idx_match, int cur_node, int cur_idx, int nmatch)
1551
int type = dfa->nodes[cur_node].type;
1552
if (type == OP_OPEN_SUBEXP)
1554
int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1556
/* We are at the first node of this sub expression. */
1557
if (reg_num < nmatch)
1559
pmatch[reg_num].rm_so = cur_idx;
1560
pmatch[reg_num].rm_eo = -1;
1563
else if (type == OP_CLOSE_SUBEXP)
1565
int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1566
if (reg_num < nmatch)
1568
/* We are at the last node of this sub expression. */
1569
if (pmatch[reg_num].rm_so < cur_idx)
1571
pmatch[reg_num].rm_eo = cur_idx;
1572
/* This is a non-empty match or we are not inside an optional
1573
subexpression. Accept this right away. */
1574
memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1578
if (dfa->nodes[cur_node].opt_subexp
1579
&& prev_idx_match[reg_num].rm_so != -1)
1580
/* We transited through an empty match for an optional
1581
subexpression, like (a?)*, and this is not the subexp's
1582
first match. Copy back the old content of the registers
1583
so that matches of an inner subexpression are undone as
1584
well, like in ((a?))*. */
1585
memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1587
/* We completed a subexpression, but it may be part of
1588
an optional one, so do not update PREV_IDX_MATCH. */
1589
pmatch[reg_num].rm_eo = cur_idx;
1595
/* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1596
and sift the nodes in each states according to the following rules.
1597
Updated state_log will be wrote to STATE_LOG.
1599
Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1600
1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1601
If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1602
the LAST_NODE, we throw away the node `a'.
1603
2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1604
string `s' and transit to `b':
1605
i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1607
ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1608
thrown away, we throw away the node `a'.
1609
3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1610
i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1612
ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1613
we throw away the node `a'. */
1615
#define STATE_NODE_CONTAINS(state,node) \
1616
((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1618
static reg_errcode_t
1620
sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
1624
int str_idx = sctx->last_str_idx;
1625
re_node_set cur_dest;
1628
assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1631
/* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1632
transit to the last_node and the last_node itself. */
1633
err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1634
if (BE (err != REG_NOERROR, 0))
1636
err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1637
if (BE (err != REG_NOERROR, 0))
1640
/* Then check each states in the state_log. */
1643
/* Update counters. */
1644
null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1645
if (null_cnt > mctx->max_mb_elem_len)
1647
memset (sctx->sifted_states, '\0',
1648
sizeof (re_dfastate_t *) * str_idx);
1649
re_node_set_free (&cur_dest);
1652
re_node_set_empty (&cur_dest);
1655
if (mctx->state_log[str_idx])
1657
err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1658
if (BE (err != REG_NOERROR, 0))
1662
/* Add all the nodes which satisfy the following conditions:
1663
- It can epsilon transit to a node in CUR_DEST.
1665
And update state_log. */
1666
err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1667
if (BE (err != REG_NOERROR, 0))
1672
re_node_set_free (&cur_dest);
1676
static reg_errcode_t
1677
internal_function __attribute_warn_unused_result__
1678
build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
1679
int str_idx, re_node_set *cur_dest)
1681
const re_dfa_t *const dfa = mctx->dfa;
1682
const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1685
/* Then build the next sifted state.
1686
We build the next sifted state on `cur_dest', and update
1687
`sifted_states[str_idx]' with `cur_dest'.
1689
`cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1690
`cur_src' points the node_set of the old `state_log[str_idx]'
1691
(with the epsilon nodes pre-filtered out). */
1692
for (i = 0; i < cur_src->nelem; i++)
1694
int prev_node = cur_src->elems[i];
1699
re_token_type_t type = dfa->nodes[prev_node].type;
1700
assert (!IS_EPSILON_NODE (type));
1702
#ifdef RE_ENABLE_I18N
1703
/* If the node may accept `multi byte'. */
1704
if (dfa->nodes[prev_node].accept_mb)
1705
naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1706
str_idx, sctx->last_str_idx);
1707
#endif /* RE_ENABLE_I18N */
1709
/* We don't check backreferences here.
1710
See update_cur_sifted_state(). */
1712
&& check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1713
&& STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1714
dfa->nexts[prev_node]))
1720
if (sctx->limits.nelem)
1722
int to_idx = str_idx + naccepted;
1723
if (check_dst_limits (mctx, &sctx->limits,
1724
dfa->nexts[prev_node], to_idx,
1725
prev_node, str_idx))
1728
ret = re_node_set_insert (cur_dest, prev_node);
1729
if (BE (ret == -1, 0))
1736
/* Helper functions. */
1738
static reg_errcode_t
1740
clean_state_log_if_needed (re_match_context_t *mctx, int next_state_log_idx)
1742
int top = mctx->state_log_top;
1744
if ((next_state_log_idx >= mctx->input.bufs_len
1745
&& mctx->input.bufs_len < mctx->input.len)
1746
|| (next_state_log_idx >= mctx->input.valid_len
1747
&& mctx->input.valid_len < mctx->input.len))
1750
err = extend_buffers (mctx, next_state_log_idx + 1);
1751
if (BE (err != REG_NOERROR, 0))
1755
if (top < next_state_log_idx)
1757
memset (mctx->state_log + top + 1, '\0',
1758
sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1759
mctx->state_log_top = next_state_log_idx;
1764
static reg_errcode_t
1766
merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,
1767
re_dfastate_t **src, int num)
1771
for (st_idx = 0; st_idx < num; ++st_idx)
1773
if (dst[st_idx] == NULL)
1774
dst[st_idx] = src[st_idx];
1775
else if (src[st_idx] != NULL)
1777
re_node_set merged_set;
1778
err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1779
&src[st_idx]->nodes);
1780
if (BE (err != REG_NOERROR, 0))
1782
dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1783
re_node_set_free (&merged_set);
1784
if (BE (err != REG_NOERROR, 0))
1791
static reg_errcode_t
1793
update_cur_sifted_state (const re_match_context_t *mctx,
1794
re_sift_context_t *sctx, int str_idx,
1795
re_node_set *dest_nodes)
1797
const re_dfa_t *const dfa = mctx->dfa;
1798
reg_errcode_t err = REG_NOERROR;
1799
const re_node_set *candidates;
1800
candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1801
: &mctx->state_log[str_idx]->nodes);
1803
if (dest_nodes->nelem == 0)
1804
sctx->sifted_states[str_idx] = NULL;
1809
/* At first, add the nodes which can epsilon transit to a node in
1811
err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1812
if (BE (err != REG_NOERROR, 0))
1815
/* Then, check the limitations in the current sift_context. */
1816
if (sctx->limits.nelem)
1818
err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1819
mctx->bkref_ents, str_idx);
1820
if (BE (err != REG_NOERROR, 0))
1825
sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1826
if (BE (err != REG_NOERROR, 0))
1830
if (candidates && mctx->state_log[str_idx]->has_backref)
1832
err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1833
if (BE (err != REG_NOERROR, 0))
1839
static reg_errcode_t
1840
internal_function __attribute_warn_unused_result__
1841
add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,
1842
const re_node_set *candidates)
1844
reg_errcode_t err = REG_NOERROR;
1847
re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1848
if (BE (err != REG_NOERROR, 0))
1851
if (!state->inveclosure.alloc)
1853
err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1854
if (BE (err != REG_NOERROR, 0))
1856
for (i = 0; i < dest_nodes->nelem; i++)
1858
err = re_node_set_merge (&state->inveclosure,
1859
dfa->inveclosures + dest_nodes->elems[i]);
1860
if (BE (err != REG_NOERROR, 0))
1864
return re_node_set_add_intersect (dest_nodes, candidates,
1865
&state->inveclosure);
1868
static reg_errcode_t
1870
sub_epsilon_src_nodes (const re_dfa_t *dfa, int node, re_node_set *dest_nodes,
1871
const re_node_set *candidates)
1875
re_node_set *inv_eclosure = dfa->inveclosures + node;
1876
re_node_set except_nodes;
1877
re_node_set_init_empty (&except_nodes);
1878
for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1880
int cur_node = inv_eclosure->elems[ecl_idx];
1881
if (cur_node == node)
1883
if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1885
int edst1 = dfa->edests[cur_node].elems[0];
1886
int edst2 = ((dfa->edests[cur_node].nelem > 1)
1887
? dfa->edests[cur_node].elems[1] : -1);
1888
if ((!re_node_set_contains (inv_eclosure, edst1)
1889
&& re_node_set_contains (dest_nodes, edst1))
1891
&& !re_node_set_contains (inv_eclosure, edst2)
1892
&& re_node_set_contains (dest_nodes, edst2)))
1894
err = re_node_set_add_intersect (&except_nodes, candidates,
1895
dfa->inveclosures + cur_node);
1896
if (BE (err != REG_NOERROR, 0))
1898
re_node_set_free (&except_nodes);
1904
for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1906
int cur_node = inv_eclosure->elems[ecl_idx];
1907
if (!re_node_set_contains (&except_nodes, cur_node))
1909
int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1910
re_node_set_remove_at (dest_nodes, idx);
1913
re_node_set_free (&except_nodes);
1919
check_dst_limits (const re_match_context_t *mctx, re_node_set *limits,
1920
int dst_node, int dst_idx, int src_node, int src_idx)
1922
const re_dfa_t *const dfa = mctx->dfa;
1923
int lim_idx, src_pos, dst_pos;
1925
int dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1926
int src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1927
for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1930
struct re_backref_cache_entry *ent;
1931
ent = mctx->bkref_ents + limits->elems[lim_idx];
1932
subexp_idx = dfa->nodes[ent->node].opr.idx;
1934
dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1935
subexp_idx, dst_node, dst_idx,
1937
src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1938
subexp_idx, src_node, src_idx,
1942
<src> <dst> ( <subexp> )
1943
( <subexp> ) <src> <dst>
1944
( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
1945
if (src_pos == dst_pos)
1946
continue; /* This is unrelated limitation. */
1955
check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1956
int subexp_idx, int from_node, int bkref_idx)
1958
const re_dfa_t *const dfa = mctx->dfa;
1959
const re_node_set *eclosures = dfa->eclosures + from_node;
1962
/* Else, we are on the boundary: examine the nodes on the epsilon
1964
for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1966
int node = eclosures->elems[node_idx];
1967
switch (dfa->nodes[node].type)
1970
if (bkref_idx != -1)
1972
struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1977
if (ent->node != node)
1980
if (subexp_idx < BITSET_WORD_BITS
1981
&& !(ent->eps_reachable_subexps_map
1982
& ((bitset_word_t) 1 << subexp_idx)))
1985
/* Recurse trying to reach the OP_OPEN_SUBEXP and
1986
OP_CLOSE_SUBEXP cases below. But, if the
1987
destination node is the same node as the source
1988
node, don't recurse because it would cause an
1989
infinite loop: a regex that exhibits this behavior
1991
dst = dfa->edests[node].elems[0];
1992
if (dst == from_node)
1996
else /* if (boundaries & 2) */
2001
check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2003
if (cpos == -1 /* && (boundaries & 1) */)
2005
if (cpos == 0 && (boundaries & 2))
2008
if (subexp_idx < BITSET_WORD_BITS)
2009
ent->eps_reachable_subexps_map
2010
&= ~((bitset_word_t) 1 << subexp_idx);
2012
while (ent++->more);
2016
case OP_OPEN_SUBEXP:
2017
if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
2021
case OP_CLOSE_SUBEXP:
2022
if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
2031
return (boundaries & 2) ? 1 : 0;
2036
check_dst_limits_calc_pos (const re_match_context_t *mctx, int limit,
2037
int subexp_idx, int from_node, int str_idx,
2040
struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
2043
/* If we are outside the range of the subexpression, return -1 or 1. */
2044
if (str_idx < lim->subexp_from)
2047
if (lim->subexp_to < str_idx)
2050
/* If we are within the subexpression, return 0. */
2051
boundaries = (str_idx == lim->subexp_from);
2052
boundaries |= (str_idx == lim->subexp_to) << 1;
2053
if (boundaries == 0)
2056
/* Else, examine epsilon closure. */
2057
return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2058
from_node, bkref_idx);
2061
/* Check the limitations of sub expressions LIMITS, and remove the nodes
2062
which are against limitations from DEST_NODES. */
2064
static reg_errcode_t
2066
check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,
2067
const re_node_set *candidates, re_node_set *limits,
2068
struct re_backref_cache_entry *bkref_ents, int str_idx)
2071
int node_idx, lim_idx;
2073
for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2076
struct re_backref_cache_entry *ent;
2077
ent = bkref_ents + limits->elems[lim_idx];
2079
if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2080
continue; /* This is unrelated limitation. */
2082
subexp_idx = dfa->nodes[ent->node].opr.idx;
2083
if (ent->subexp_to == str_idx)
2087
for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2089
int node = dest_nodes->elems[node_idx];
2090
re_token_type_t type = dfa->nodes[node].type;
2091
if (type == OP_OPEN_SUBEXP
2092
&& subexp_idx == dfa->nodes[node].opr.idx)
2094
else if (type == OP_CLOSE_SUBEXP
2095
&& subexp_idx == dfa->nodes[node].opr.idx)
2099
/* Check the limitation of the open subexpression. */
2100
/* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
2103
err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2105
if (BE (err != REG_NOERROR, 0))
2109
/* Check the limitation of the close subexpression. */
2111
for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2113
int node = dest_nodes->elems[node_idx];
2114
if (!re_node_set_contains (dfa->inveclosures + node,
2116
&& !re_node_set_contains (dfa->eclosures + node,
2119
/* It is against this limitation.
2120
Remove it form the current sifted state. */
2121
err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2123
if (BE (err != REG_NOERROR, 0))
2129
else /* (ent->subexp_to != str_idx) */
2131
for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2133
int node = dest_nodes->elems[node_idx];
2134
re_token_type_t type = dfa->nodes[node].type;
2135
if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2137
if (subexp_idx != dfa->nodes[node].opr.idx)
2139
/* It is against this limitation.
2140
Remove it form the current sifted state. */
2141
err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2143
if (BE (err != REG_NOERROR, 0))
2152
static reg_errcode_t
2153
internal_function __attribute_warn_unused_result__
2154
sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,
2155
int str_idx, const re_node_set *candidates)
2157
const re_dfa_t *const dfa = mctx->dfa;
2160
re_sift_context_t local_sctx;
2161
int first_idx = search_cur_bkref_entry (mctx, str_idx);
2163
if (first_idx == -1)
2166
local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */
2168
for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2171
re_token_type_t type;
2172
struct re_backref_cache_entry *entry;
2173
node = candidates->elems[node_idx];
2174
type = dfa->nodes[node].type;
2175
/* Avoid infinite loop for the REs like "()\1+". */
2176
if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2178
if (type != OP_BACK_REF)
2181
entry = mctx->bkref_ents + first_idx;
2182
enabled_idx = first_idx;
2189
re_dfastate_t *cur_state;
2191
if (entry->node != node)
2193
subexp_len = entry->subexp_to - entry->subexp_from;
2194
to_idx = str_idx + subexp_len;
2195
dst_node = (subexp_len ? dfa->nexts[node]
2196
: dfa->edests[node].elems[0]);
2198
if (to_idx > sctx->last_str_idx
2199
|| sctx->sifted_states[to_idx] == NULL
2200
|| !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2201
|| check_dst_limits (mctx, &sctx->limits, node,
2202
str_idx, dst_node, to_idx))
2205
if (local_sctx.sifted_states == NULL)
2208
err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2209
if (BE (err != REG_NOERROR, 0))
2212
local_sctx.last_node = node;
2213
local_sctx.last_str_idx = str_idx;
2214
ret = re_node_set_insert (&local_sctx.limits, enabled_idx);
2215
if (BE (ret < 0, 0))
2220
cur_state = local_sctx.sifted_states[str_idx];
2221
err = sift_states_backward (mctx, &local_sctx);
2222
if (BE (err != REG_NOERROR, 0))
2224
if (sctx->limited_states != NULL)
2226
err = merge_state_array (dfa, sctx->limited_states,
2227
local_sctx.sifted_states,
2229
if (BE (err != REG_NOERROR, 0))
2232
local_sctx.sifted_states[str_idx] = cur_state;
2233
re_node_set_remove (&local_sctx.limits, enabled_idx);
2235
/* mctx->bkref_ents may have changed, reload the pointer. */
2236
entry = mctx->bkref_ents + enabled_idx;
2238
while (enabled_idx++, entry++->more);
2242
if (local_sctx.sifted_states != NULL)
2244
re_node_set_free (&local_sctx.limits);
2251
#ifdef RE_ENABLE_I18N
2254
sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2255
int node_idx, int str_idx, int max_str_idx)
2257
const re_dfa_t *const dfa = mctx->dfa;
2259
/* Check the node can accept `multi byte'. */
2260
naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2261
if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2262
!STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2263
dfa->nexts[node_idx]))
2264
/* The node can't accept the `multi byte', or the
2265
destination was already thrown away, then the node
2266
could't accept the current input `multi byte'. */
2268
/* Otherwise, it is sure that the node could accept
2269
`naccepted' bytes input. */
2272
#endif /* RE_ENABLE_I18N */
2275
/* Functions for state transition. */
2277
/* Return the next state to which the current state STATE will transit by
2278
accepting the current input byte, and update STATE_LOG if necessary.
2279
If STATE can accept a multibyte char/collating element/back reference
2280
update the destination of STATE_LOG. */
2282
static re_dfastate_t *
2283
internal_function __attribute_warn_unused_result__
2284
transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2285
re_dfastate_t *state)
2287
re_dfastate_t **trtable;
2290
#ifdef RE_ENABLE_I18N
2291
/* If the current state can accept multibyte. */
2292
if (BE (state->accept_mb, 0))
2294
*err = transit_state_mb (mctx, state);
2295
if (BE (*err != REG_NOERROR, 0))
2298
#endif /* RE_ENABLE_I18N */
2300
/* Then decide the next state with the single byte. */
2303
/* don't use transition table */
2304
return transit_state_sb (err, mctx, state);
2307
/* Use transition table */
2308
ch = re_string_fetch_byte (&mctx->input);
2311
trtable = state->trtable;
2312
if (BE (trtable != NULL, 1))
2315
trtable = state->word_trtable;
2316
if (BE (trtable != NULL, 1))
2318
unsigned int context;
2320
= re_string_context_at (&mctx->input,
2321
re_string_cur_idx (&mctx->input) - 1,
2323
if (IS_WORD_CONTEXT (context))
2324
return trtable[ch + SBC_MAX];
2329
if (!build_trtable (mctx->dfa, state))
2335
/* Retry, we now have a transition table. */
2339
/* Update the state_log if we need */
2342
merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2343
re_dfastate_t *next_state)
2345
const re_dfa_t *const dfa = mctx->dfa;
2346
int cur_idx = re_string_cur_idx (&mctx->input);
2348
if (cur_idx > mctx->state_log_top)
2350
mctx->state_log[cur_idx] = next_state;
2351
mctx->state_log_top = cur_idx;
2353
else if (mctx->state_log[cur_idx] == 0)
2355
mctx->state_log[cur_idx] = next_state;
2359
re_dfastate_t *pstate;
2360
unsigned int context;
2361
re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2362
/* If (state_log[cur_idx] != 0), it implies that cur_idx is
2363
the destination of a multibyte char/collating element/
2364
back reference. Then the next state is the union set of
2365
these destinations and the results of the transition table. */
2366
pstate = mctx->state_log[cur_idx];
2367
log_nodes = pstate->entrance_nodes;
2368
if (next_state != NULL)
2370
table_nodes = next_state->entrance_nodes;
2371
*err = re_node_set_init_union (&next_nodes, table_nodes,
2373
if (BE (*err != REG_NOERROR, 0))
2377
next_nodes = *log_nodes;
2378
/* Note: We already add the nodes of the initial state,
2379
then we don't need to add them here. */
2381
context = re_string_context_at (&mctx->input,
2382
re_string_cur_idx (&mctx->input) - 1,
2384
next_state = mctx->state_log[cur_idx]
2385
= re_acquire_state_context (err, dfa, &next_nodes, context);
2386
/* We don't need to check errors here, since the return value of
2387
this function is next_state and ERR is already set. */
2389
if (table_nodes != NULL)
2390
re_node_set_free (&next_nodes);
2393
if (BE (dfa->nbackref, 0) && next_state != NULL)
2395
/* Check OP_OPEN_SUBEXP in the current state in case that we use them
2396
later. We must check them here, since the back references in the
2397
next state might use them. */
2398
*err = check_subexp_matching_top (mctx, &next_state->nodes,
2400
if (BE (*err != REG_NOERROR, 0))
2403
/* If the next state has back references. */
2404
if (next_state->has_backref)
2406
*err = transit_state_bkref (mctx, &next_state->nodes);
2407
if (BE (*err != REG_NOERROR, 0))
2409
next_state = mctx->state_log[cur_idx];
2416
/* Skip bytes in the input that correspond to part of a
2417
multi-byte match, then look in the log for a state
2418
from which to restart matching. */
2421
find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2423
re_dfastate_t *cur_state;
2426
int max = mctx->state_log_top;
2427
int cur_str_idx = re_string_cur_idx (&mctx->input);
2431
if (++cur_str_idx > max)
2433
re_string_skip_bytes (&mctx->input, 1);
2435
while (mctx->state_log[cur_str_idx] == NULL);
2437
cur_state = merge_state_with_log (err, mctx, NULL);
2439
while (*err == REG_NOERROR && cur_state == NULL);
2443
/* Helper functions for transit_state. */
2445
/* From the node set CUR_NODES, pick up the nodes whose types are
2446
OP_OPEN_SUBEXP and which have corresponding back references in the regular
2447
expression. And register them to use them later for evaluating the
2448
correspoding back references. */
2450
static reg_errcode_t
2452
check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2455
const re_dfa_t *const dfa = mctx->dfa;
2459
/* TODO: This isn't efficient.
2460
Because there might be more than one nodes whose types are
2461
OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2464
for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2466
int node = cur_nodes->elems[node_idx];
2467
if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2468
&& dfa->nodes[node].opr.idx < BITSET_WORD_BITS
2469
&& (dfa->used_bkref_map
2470
& ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
2472
err = match_ctx_add_subtop (mctx, node, str_idx);
2473
if (BE (err != REG_NOERROR, 0))
2481
/* Return the next state to which the current state STATE will transit by
2482
accepting the current input byte. */
2484
static re_dfastate_t *
2485
transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
2486
re_dfastate_t *state)
2488
const re_dfa_t *const dfa = mctx->dfa;
2489
re_node_set next_nodes;
2490
re_dfastate_t *next_state;
2491
int node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2492
unsigned int context;
2494
*err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2495
if (BE (*err != REG_NOERROR, 0))
2497
for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2499
int cur_node = state->nodes.elems[node_cnt];
2500
if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2502
*err = re_node_set_merge (&next_nodes,
2503
dfa->eclosures + dfa->nexts[cur_node]);
2504
if (BE (*err != REG_NOERROR, 0))
2506
re_node_set_free (&next_nodes);
2511
context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2512
next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2513
/* We don't need to check errors here, since the return value of
2514
this function is next_state and ERR is already set. */
2516
re_node_set_free (&next_nodes);
2517
re_string_skip_bytes (&mctx->input, 1);
2522
#ifdef RE_ENABLE_I18N
2523
static reg_errcode_t
2525
transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2527
const re_dfa_t *const dfa = mctx->dfa;
2531
for (i = 0; i < pstate->nodes.nelem; ++i)
2533
re_node_set dest_nodes, *new_nodes;
2534
int cur_node_idx = pstate->nodes.elems[i];
2535
int naccepted, dest_idx;
2536
unsigned int context;
2537
re_dfastate_t *dest_state;
2539
if (!dfa->nodes[cur_node_idx].accept_mb)
2542
if (dfa->nodes[cur_node_idx].constraint)
2544
context = re_string_context_at (&mctx->input,
2545
re_string_cur_idx (&mctx->input),
2547
if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2552
/* How many bytes the node can accept? */
2553
naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2554
re_string_cur_idx (&mctx->input));
2558
/* The node can accepts `naccepted' bytes. */
2559
dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2560
mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2561
: mctx->max_mb_elem_len);
2562
err = clean_state_log_if_needed (mctx, dest_idx);
2563
if (BE (err != REG_NOERROR, 0))
2566
assert (dfa->nexts[cur_node_idx] != -1);
2568
new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2570
dest_state = mctx->state_log[dest_idx];
2571
if (dest_state == NULL)
2572
dest_nodes = *new_nodes;
2575
err = re_node_set_init_union (&dest_nodes,
2576
dest_state->entrance_nodes, new_nodes);
2577
if (BE (err != REG_NOERROR, 0))
2580
context = re_string_context_at (&mctx->input, dest_idx - 1,
2582
mctx->state_log[dest_idx]
2583
= re_acquire_state_context (&err, dfa, &dest_nodes, context);
2584
if (dest_state != NULL)
2585
re_node_set_free (&dest_nodes);
2586
if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2591
#endif /* RE_ENABLE_I18N */
2593
static reg_errcode_t
2595
transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2597
const re_dfa_t *const dfa = mctx->dfa;
2600
int cur_str_idx = re_string_cur_idx (&mctx->input);
2602
for (i = 0; i < nodes->nelem; ++i)
2604
int dest_str_idx, prev_nelem, bkc_idx;
2605
int node_idx = nodes->elems[i];
2606
unsigned int context;
2607
const re_token_t *node = dfa->nodes + node_idx;
2608
re_node_set *new_dest_nodes;
2610
/* Check whether `node' is a backreference or not. */
2611
if (node->type != OP_BACK_REF)
2614
if (node->constraint)
2616
context = re_string_context_at (&mctx->input, cur_str_idx,
2618
if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2622
/* `node' is a backreference.
2623
Check the substring which the substring matched. */
2624
bkc_idx = mctx->nbkref_ents;
2625
err = get_subexp (mctx, node_idx, cur_str_idx);
2626
if (BE (err != REG_NOERROR, 0))
2629
/* And add the epsilon closures (which is `new_dest_nodes') of
2630
the backreference to appropriate state_log. */
2632
assert (dfa->nexts[node_idx] != -1);
2634
for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2637
re_dfastate_t *dest_state;
2638
struct re_backref_cache_entry *bkref_ent;
2639
bkref_ent = mctx->bkref_ents + bkc_idx;
2640
if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2642
subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2643
new_dest_nodes = (subexp_len == 0
2644
? dfa->eclosures + dfa->edests[node_idx].elems[0]
2645
: dfa->eclosures + dfa->nexts[node_idx]);
2646
dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2647
- bkref_ent->subexp_from);
2648
context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2650
dest_state = mctx->state_log[dest_str_idx];
2651
prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2652
: mctx->state_log[cur_str_idx]->nodes.nelem);
2653
/* Add `new_dest_node' to state_log. */
2654
if (dest_state == NULL)
2656
mctx->state_log[dest_str_idx]
2657
= re_acquire_state_context (&err, dfa, new_dest_nodes,
2659
if (BE (mctx->state_log[dest_str_idx] == NULL
2660
&& err != REG_NOERROR, 0))
2665
re_node_set dest_nodes;
2666
err = re_node_set_init_union (&dest_nodes,
2667
dest_state->entrance_nodes,
2669
if (BE (err != REG_NOERROR, 0))
2671
re_node_set_free (&dest_nodes);
2674
mctx->state_log[dest_str_idx]
2675
= re_acquire_state_context (&err, dfa, &dest_nodes, context);
2676
re_node_set_free (&dest_nodes);
2677
if (BE (mctx->state_log[dest_str_idx] == NULL
2678
&& err != REG_NOERROR, 0))
2681
/* We need to check recursively if the backreference can epsilon
2684
&& mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2686
err = check_subexp_matching_top (mctx, new_dest_nodes,
2688
if (BE (err != REG_NOERROR, 0))
2690
err = transit_state_bkref (mctx, new_dest_nodes);
2691
if (BE (err != REG_NOERROR, 0))
2701
/* Enumerate all the candidates which the backreference BKREF_NODE can match
2702
at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2703
Note that we might collect inappropriate candidates here.
2704
However, the cost of checking them strictly here is too high, then we
2705
delay these checking for prune_impossible_nodes(). */
2707
static reg_errcode_t
2708
internal_function __attribute_warn_unused_result__
2709
get_subexp (re_match_context_t *mctx, int bkref_node, int bkref_str_idx)
2711
const re_dfa_t *const dfa = mctx->dfa;
2712
int subexp_num, sub_top_idx;
2713
const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2714
/* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
2715
int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2716
if (cache_idx != -1)
2718
const struct re_backref_cache_entry *entry
2719
= mctx->bkref_ents + cache_idx;
2721
if (entry->node == bkref_node)
2722
return REG_NOERROR; /* We already checked it. */
2723
while (entry++->more);
2726
subexp_num = dfa->nodes[bkref_node].opr.idx;
2728
/* For each sub expression */
2729
for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2732
re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2733
re_sub_match_last_t *sub_last;
2734
int sub_last_idx, sl_str, bkref_str_off;
2736
if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2737
continue; /* It isn't related. */
2739
sl_str = sub_top->str_idx;
2740
bkref_str_off = bkref_str_idx;
2741
/* At first, check the last node of sub expressions we already
2743
for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2746
sub_last = sub_top->lasts[sub_last_idx];
2747
sl_str_diff = sub_last->str_idx - sl_str;
2748
/* The matched string by the sub expression match with the substring
2749
at the back reference? */
2750
if (sl_str_diff > 0)
2752
if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2754
/* Not enough chars for a successful match. */
2755
if (bkref_str_off + sl_str_diff > mctx->input.len)
2758
err = clean_state_log_if_needed (mctx,
2761
if (BE (err != REG_NOERROR, 0))
2763
buf = (const char *) re_string_get_buffer (&mctx->input);
2765
if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2766
/* We don't need to search this sub expression any more. */
2769
bkref_str_off += sl_str_diff;
2770
sl_str += sl_str_diff;
2771
err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2774
/* Reload buf, since the preceding call might have reallocated
2776
buf = (const char *) re_string_get_buffer (&mctx->input);
2778
if (err == REG_NOMATCH)
2780
if (BE (err != REG_NOERROR, 0))
2784
if (sub_last_idx < sub_top->nlasts)
2786
if (sub_last_idx > 0)
2788
/* Then, search for the other last nodes of the sub expression. */
2789
for (; sl_str <= bkref_str_idx; ++sl_str)
2791
int cls_node, sl_str_off;
2792
const re_node_set *nodes;
2793
sl_str_off = sl_str - sub_top->str_idx;
2794
/* The matched string by the sub expression match with the substring
2795
at the back reference? */
2798
if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2800
/* If we are at the end of the input, we cannot match. */
2801
if (bkref_str_off >= mctx->input.len)
2804
err = extend_buffers (mctx, bkref_str_off + 1);
2805
if (BE (err != REG_NOERROR, 0))
2808
buf = (const char *) re_string_get_buffer (&mctx->input);
2810
if (buf [bkref_str_off++] != buf[sl_str - 1])
2811
break; /* We don't need to search this sub expression
2814
if (mctx->state_log[sl_str] == NULL)
2816
/* Does this state have a ')' of the sub expression? */
2817
nodes = &mctx->state_log[sl_str]->nodes;
2818
cls_node = find_subexp_node (dfa, nodes, subexp_num,
2822
if (sub_top->path == NULL)
2824
sub_top->path = calloc (sizeof (state_array_t),
2825
sl_str - sub_top->str_idx + 1);
2826
if (sub_top->path == NULL)
2829
/* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2830
in the current context? */
2831
err = check_arrival (mctx, sub_top->path, sub_top->node,
2832
sub_top->str_idx, cls_node, sl_str,
2834
if (err == REG_NOMATCH)
2836
if (BE (err != REG_NOERROR, 0))
2838
sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2839
if (BE (sub_last == NULL, 0))
2841
err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2843
if (err == REG_NOMATCH)
2850
/* Helper functions for get_subexp(). */
2852
/* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2853
If it can arrive, register the sub expression expressed with SUB_TOP
2856
static reg_errcode_t
2858
get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2859
re_sub_match_last_t *sub_last, int bkref_node, int bkref_str)
2863
/* Can the subexpression arrive the back reference? */
2864
err = check_arrival (mctx, &sub_last->path, sub_last->node,
2865
sub_last->str_idx, bkref_node, bkref_str,
2867
if (err != REG_NOERROR)
2869
err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2871
if (BE (err != REG_NOERROR, 0))
2873
to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2874
return clean_state_log_if_needed (mctx, to_idx);
2877
/* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2878
Search '(' if FL_OPEN, or search ')' otherwise.
2879
TODO: This function isn't efficient...
2880
Because there might be more than one nodes whose types are
2881
OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2887
find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2888
int subexp_idx, int type)
2891
for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2893
int cls_node = nodes->elems[cls_idx];
2894
const re_token_t *node = dfa->nodes + cls_node;
2895
if (node->type == type
2896
&& node->opr.idx == subexp_idx)
2902
/* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2903
LAST_NODE at LAST_STR. We record the path onto PATH since it will be
2905
Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */
2907
static reg_errcode_t
2908
internal_function __attribute_warn_unused_result__
2909
check_arrival (re_match_context_t *mctx, state_array_t *path, int top_node,
2910
int top_str, int last_node, int last_str, int type)
2912
const re_dfa_t *const dfa = mctx->dfa;
2913
reg_errcode_t err = REG_NOERROR;
2914
int subexp_num, backup_cur_idx, str_idx, null_cnt;
2915
re_dfastate_t *cur_state = NULL;
2916
re_node_set *cur_nodes, next_nodes;
2917
re_dfastate_t **backup_state_log;
2918
unsigned int context;
2920
subexp_num = dfa->nodes[top_node].opr.idx;
2921
/* Extend the buffer if we need. */
2922
if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2924
re_dfastate_t **new_array;
2925
int old_alloc = path->alloc;
2926
path->alloc += last_str + mctx->max_mb_elem_len + 1;
2927
new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
2928
if (BE (new_array == NULL, 0))
2930
path->alloc = old_alloc;
2933
path->array = new_array;
2934
memset (new_array + old_alloc, '\0',
2935
sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2938
str_idx = path->next_idx ? path->next_idx : top_str;
2940
/* Temporary modify MCTX. */
2941
backup_state_log = mctx->state_log;
2942
backup_cur_idx = mctx->input.cur_idx;
2943
mctx->state_log = path->array;
2944
mctx->input.cur_idx = str_idx;
2946
/* Setup initial node set. */
2947
context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2948
if (str_idx == top_str)
2950
err = re_node_set_init_1 (&next_nodes, top_node);
2951
if (BE (err != REG_NOERROR, 0))
2953
err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2954
if (BE (err != REG_NOERROR, 0))
2956
re_node_set_free (&next_nodes);
2962
cur_state = mctx->state_log[str_idx];
2963
if (cur_state && cur_state->has_backref)
2965
err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2966
if (BE (err != REG_NOERROR, 0))
2970
re_node_set_init_empty (&next_nodes);
2972
if (str_idx == top_str || (cur_state && cur_state->has_backref))
2974
if (next_nodes.nelem)
2976
err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2978
if (BE (err != REG_NOERROR, 0))
2980
re_node_set_free (&next_nodes);
2984
cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2985
if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2987
re_node_set_free (&next_nodes);
2990
mctx->state_log[str_idx] = cur_state;
2993
for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2995
re_node_set_empty (&next_nodes);
2996
if (mctx->state_log[str_idx + 1])
2998
err = re_node_set_merge (&next_nodes,
2999
&mctx->state_log[str_idx + 1]->nodes);
3000
if (BE (err != REG_NOERROR, 0))
3002
re_node_set_free (&next_nodes);
3008
err = check_arrival_add_next_nodes (mctx, str_idx,
3009
&cur_state->non_eps_nodes,
3011
if (BE (err != REG_NOERROR, 0))
3013
re_node_set_free (&next_nodes);
3018
if (next_nodes.nelem)
3020
err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
3021
if (BE (err != REG_NOERROR, 0))
3023
re_node_set_free (&next_nodes);
3026
err = expand_bkref_cache (mctx, &next_nodes, str_idx,
3028
if (BE (err != REG_NOERROR, 0))
3030
re_node_set_free (&next_nodes);
3034
context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
3035
cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
3036
if (BE (cur_state == NULL && err != REG_NOERROR, 0))
3038
re_node_set_free (&next_nodes);
3041
mctx->state_log[str_idx] = cur_state;
3042
null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
3044
re_node_set_free (&next_nodes);
3045
cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
3046
: &mctx->state_log[last_str]->nodes);
3047
path->next_idx = str_idx;
3050
mctx->state_log = backup_state_log;
3051
mctx->input.cur_idx = backup_cur_idx;
3053
/* Then check the current node set has the node LAST_NODE. */
3054
if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
3060
/* Helper functions for check_arrival. */
3062
/* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3064
TODO: This function is similar to the functions transit_state*(),
3065
however this function has many additional works.
3066
Can't we unify them? */
3068
static reg_errcode_t
3069
internal_function __attribute_warn_unused_result__
3070
check_arrival_add_next_nodes (re_match_context_t *mctx, int str_idx,
3071
re_node_set *cur_nodes, re_node_set *next_nodes)
3073
const re_dfa_t *const dfa = mctx->dfa;
3076
#ifdef RE_ENABLE_I18N
3077
reg_errcode_t err = REG_NOERROR;
3079
re_node_set union_set;
3080
re_node_set_init_empty (&union_set);
3081
for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3084
int cur_node = cur_nodes->elems[cur_idx];
3086
re_token_type_t type = dfa->nodes[cur_node].type;
3087
assert (!IS_EPSILON_NODE (type));
3089
#ifdef RE_ENABLE_I18N
3090
/* If the node may accept `multi byte'. */
3091
if (dfa->nodes[cur_node].accept_mb)
3093
naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3097
re_dfastate_t *dest_state;
3098
int next_node = dfa->nexts[cur_node];
3099
int next_idx = str_idx + naccepted;
3100
dest_state = mctx->state_log[next_idx];
3101
re_node_set_empty (&union_set);
3104
err = re_node_set_merge (&union_set, &dest_state->nodes);
3105
if (BE (err != REG_NOERROR, 0))
3107
re_node_set_free (&union_set);
3111
result = re_node_set_insert (&union_set, next_node);
3112
if (BE (result < 0, 0))
3114
re_node_set_free (&union_set);
3117
mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3119
if (BE (mctx->state_log[next_idx] == NULL
3120
&& err != REG_NOERROR, 0))
3122
re_node_set_free (&union_set);
3127
#endif /* RE_ENABLE_I18N */
3129
|| check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3131
result = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3132
if (BE (result < 0, 0))
3134
re_node_set_free (&union_set);
3139
re_node_set_free (&union_set);
3143
/* For all the nodes in CUR_NODES, add the epsilon closures of them to
3144
CUR_NODES, however exclude the nodes which are:
3145
- inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3146
- out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3149
static reg_errcode_t
3151
check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
3152
int ex_subexp, int type)
3155
int idx, outside_node;
3156
re_node_set new_nodes;
3158
assert (cur_nodes->nelem);
3160
err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3161
if (BE (err != REG_NOERROR, 0))
3163
/* Create a new node set NEW_NODES with the nodes which are epsilon
3164
closures of the node in CUR_NODES. */
3166
for (idx = 0; idx < cur_nodes->nelem; ++idx)
3168
int cur_node = cur_nodes->elems[idx];
3169
const re_node_set *eclosure = dfa->eclosures + cur_node;
3170
outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3171
if (outside_node == -1)
3173
/* There are no problematic nodes, just merge them. */
3174
err = re_node_set_merge (&new_nodes, eclosure);
3175
if (BE (err != REG_NOERROR, 0))
3177
re_node_set_free (&new_nodes);
3183
/* There are problematic nodes, re-calculate incrementally. */
3184
err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3186
if (BE (err != REG_NOERROR, 0))
3188
re_node_set_free (&new_nodes);
3193
re_node_set_free (cur_nodes);
3194
*cur_nodes = new_nodes;
3198
/* Helper function for check_arrival_expand_ecl.
3199
Check incrementally the epsilon closure of TARGET, and if it isn't
3200
problematic append it to DST_NODES. */
3202
static reg_errcode_t
3203
internal_function __attribute_warn_unused_result__
3204
check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,
3205
int target, int ex_subexp, int type)
3208
for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3212
if (dfa->nodes[cur_node].type == type
3213
&& dfa->nodes[cur_node].opr.idx == ex_subexp)
3215
if (type == OP_CLOSE_SUBEXP)
3217
err = re_node_set_insert (dst_nodes, cur_node);
3218
if (BE (err == -1, 0))
3223
err = re_node_set_insert (dst_nodes, cur_node);
3224
if (BE (err == -1, 0))
3226
if (dfa->edests[cur_node].nelem == 0)
3228
if (dfa->edests[cur_node].nelem == 2)
3230
err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3231
dfa->edests[cur_node].elems[1],
3233
if (BE (err != REG_NOERROR, 0))
3236
cur_node = dfa->edests[cur_node].elems[0];
3242
/* For all the back references in the current state, calculate the
3243
destination of the back references by the appropriate entry
3244
in MCTX->BKREF_ENTS. */
3246
static reg_errcode_t
3247
internal_function __attribute_warn_unused_result__
3248
expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3249
int cur_str, int subexp_num, int type)
3251
const re_dfa_t *const dfa = mctx->dfa;
3253
int cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3254
struct re_backref_cache_entry *ent;
3256
if (cache_idx_start == -1)
3260
ent = mctx->bkref_ents + cache_idx_start;
3263
int to_idx, next_node;
3265
/* Is this entry ENT is appropriate? */
3266
if (!re_node_set_contains (cur_nodes, ent->node))
3269
to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3270
/* Calculate the destination of the back reference, and append it
3271
to MCTX->STATE_LOG. */
3272
if (to_idx == cur_str)
3274
/* The backreference did epsilon transit, we must re-check all the
3275
node in the current state. */
3276
re_node_set new_dests;
3277
reg_errcode_t err2, err3;
3278
next_node = dfa->edests[ent->node].elems[0];
3279
if (re_node_set_contains (cur_nodes, next_node))
3281
err = re_node_set_init_1 (&new_dests, next_node);
3282
err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3283
err3 = re_node_set_merge (cur_nodes, &new_dests);
3284
re_node_set_free (&new_dests);
3285
if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3286
|| err3 != REG_NOERROR, 0))
3288
err = (err != REG_NOERROR ? err
3289
: (err2 != REG_NOERROR ? err2 : err3));
3292
/* TODO: It is still inefficient... */
3297
re_node_set union_set;
3298
next_node = dfa->nexts[ent->node];
3299
if (mctx->state_log[to_idx])
3302
if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3305
err = re_node_set_init_copy (&union_set,
3306
&mctx->state_log[to_idx]->nodes);
3307
ret = re_node_set_insert (&union_set, next_node);
3308
if (BE (err != REG_NOERROR || ret < 0, 0))
3310
re_node_set_free (&union_set);
3311
err = err != REG_NOERROR ? err : REG_ESPACE;
3317
err = re_node_set_init_1 (&union_set, next_node);
3318
if (BE (err != REG_NOERROR, 0))
3321
mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3322
re_node_set_free (&union_set);
3323
if (BE (mctx->state_log[to_idx] == NULL
3324
&& err != REG_NOERROR, 0))
3328
while (ent++->more);
3332
/* Build transition table for the state.
3333
Return 1 if succeeded, otherwise return NULL. */
3337
build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
3340
int i, j, ch, need_word_trtable = 0;
3341
bitset_word_t elem, mask;
3342
bool dests_node_malloced = false;
3343
bool dest_states_malloced = false;
3344
int ndests; /* Number of the destination states from `state'. */
3345
re_dfastate_t **trtable;
3346
re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3347
re_node_set follows, *dests_node;
3349
bitset_t acceptable;
3353
re_node_set dests_node[SBC_MAX];
3354
bitset_t dests_ch[SBC_MAX];
3357
/* We build DFA states which corresponds to the destination nodes
3358
from `state'. `dests_node[i]' represents the nodes which i-th
3359
destination state contains, and `dests_ch[i]' represents the
3360
characters which i-th destination state accepts. */
3362
if (__libc_use_alloca (sizeof (struct dests_alloc)))
3363
dests_alloc = (struct dests_alloc *) alloca (sizeof (struct dests_alloc));
3367
dests_alloc = re_malloc (struct dests_alloc, 1);
3368
if (BE (dests_alloc == NULL, 0))
3370
dests_node_malloced = true;
3372
dests_node = dests_alloc->dests_node;
3373
dests_ch = dests_alloc->dests_ch;
3375
/* Initialize transiton table. */
3376
state->word_trtable = state->trtable = NULL;
3378
/* At first, group all nodes belonging to `state' into several
3380
ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3381
if (BE (ndests <= 0, 0))
3383
if (dests_node_malloced)
3385
/* Return 0 in case of an error, 1 otherwise. */
3388
state->trtable = (re_dfastate_t **)
3389
calloc (sizeof (re_dfastate_t *), SBC_MAX);
3390
if (BE (state->trtable == NULL, 0))
3397
err = re_node_set_alloc (&follows, ndests + 1);
3398
if (BE (err != REG_NOERROR, 0))
3401
/* Avoid arithmetic overflow in size calculation. */
3402
if (BE ((((SIZE_MAX - (sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX)
3403
/ (3 * sizeof (re_dfastate_t *)))
3409
if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX
3410
+ ndests * 3 * sizeof (re_dfastate_t *)))
3411
dest_states = (re_dfastate_t **)
3412
alloca (ndests * 3 * sizeof (re_dfastate_t *));
3416
dest_states = (re_dfastate_t **)
3417
malloc (ndests * 3 * sizeof (re_dfastate_t *));
3418
if (BE (dest_states == NULL, 0))
3421
if (dest_states_malloced)
3423
re_node_set_free (&follows);
3424
for (i = 0; i < ndests; ++i)
3425
re_node_set_free (dests_node + i);
3426
if (dests_node_malloced)
3430
dest_states_malloced = true;
3432
dest_states_word = dest_states + ndests;
3433
dest_states_nl = dest_states_word + ndests;
3434
bitset_empty (acceptable);
3436
/* Then build the states for all destinations. */
3437
for (i = 0; i < ndests; ++i)
3440
re_node_set_empty (&follows);
3441
/* Merge the follows of this destination states. */
3442
for (j = 0; j < dests_node[i].nelem; ++j)
3444
next_node = dfa->nexts[dests_node[i].elems[j]];
3445
if (next_node != -1)
3447
err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3448
if (BE (err != REG_NOERROR, 0))
3452
dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3453
if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3455
/* If the new state has context constraint,
3456
build appropriate states for these contexts. */
3457
if (dest_states[i]->has_constraint)
3459
dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3461
if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3464
if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3465
need_word_trtable = 1;
3467
dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3469
if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3474
dest_states_word[i] = dest_states[i];
3475
dest_states_nl[i] = dest_states[i];
3477
bitset_merge (acceptable, dests_ch[i]);
3480
if (!BE (need_word_trtable, 0))
3482
/* We don't care about whether the following character is a word
3483
character, or we are in a single-byte character set so we can
3484
discern by looking at the character code: allocate a
3485
256-entry transition table. */
3486
trtable = state->trtable =
3487
(re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3488
if (BE (trtable == NULL, 0))
3491
/* For all characters ch...: */
3492
for (i = 0; i < BITSET_WORDS; ++i)
3493
for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3495
mask <<= 1, elem >>= 1, ++ch)
3496
if (BE (elem & 1, 0))
3498
/* There must be exactly one destination which accepts
3499
character ch. See group_nodes_into_DFAstates. */
3500
for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3503
/* j-th destination accepts the word character ch. */
3504
if (dfa->word_char[i] & mask)
3505
trtable[ch] = dest_states_word[j];
3507
trtable[ch] = dest_states[j];
3512
/* We care about whether the following character is a word
3513
character, and we are in a multi-byte character set: discern
3514
by looking at the character code: build two 256-entry
3515
transition tables, one starting at trtable[0] and one
3516
starting at trtable[SBC_MAX]. */
3517
trtable = state->word_trtable =
3518
(re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3519
if (BE (trtable == NULL, 0))
3522
/* For all characters ch...: */
3523
for (i = 0; i < BITSET_WORDS; ++i)
3524
for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3526
mask <<= 1, elem >>= 1, ++ch)
3527
if (BE (elem & 1, 0))
3529
/* There must be exactly one destination which accepts
3530
character ch. See group_nodes_into_DFAstates. */
3531
for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3534
/* j-th destination accepts the word character ch. */
3535
trtable[ch] = dest_states[j];
3536
trtable[ch + SBC_MAX] = dest_states_word[j];
3541
if (bitset_contain (acceptable, NEWLINE_CHAR))
3543
/* The current state accepts newline character. */
3544
for (j = 0; j < ndests; ++j)
3545
if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3547
/* k-th destination accepts newline character. */
3548
trtable[NEWLINE_CHAR] = dest_states_nl[j];
3549
if (need_word_trtable)
3550
trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3551
/* There must be only one destination which accepts
3552
newline. See group_nodes_into_DFAstates. */
3557
if (dest_states_malloced)
3560
re_node_set_free (&follows);
3561
for (i = 0; i < ndests; ++i)
3562
re_node_set_free (dests_node + i);
3564
if (dests_node_malloced)
3570
/* Group all nodes belonging to STATE into several destinations.
3571
Then for all destinations, set the nodes belonging to the destination
3572
to DESTS_NODE[i] and set the characters accepted by the destination
3573
to DEST_CH[i]. This function return the number of destinations. */
3577
group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
3578
re_node_set *dests_node, bitset_t *dests_ch)
3583
int ndests; /* Number of the destinations from `state'. */
3584
bitset_t accepts; /* Characters a node can accept. */
3585
const re_node_set *cur_nodes = &state->nodes;
3586
bitset_empty (accepts);
3589
/* For all the nodes belonging to `state', */
3590
for (i = 0; i < cur_nodes->nelem; ++i)
3592
re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3593
re_token_type_t type = node->type;
3594
unsigned int constraint = node->constraint;
3596
/* Enumerate all single byte character this node can accept. */
3597
if (type == CHARACTER)
3598
bitset_set (accepts, node->opr.c);
3599
else if (type == SIMPLE_BRACKET)
3601
bitset_merge (accepts, node->opr.sbcset);
3603
else if (type == OP_PERIOD)
3605
#ifdef RE_ENABLE_I18N
3606
if (dfa->mb_cur_max > 1)
3607
bitset_merge (accepts, dfa->sb_char);
3610
bitset_set_all (accepts);
3611
if (!(dfa->syntax & RE_DOT_NEWLINE))
3612
bitset_clear (accepts, '\n');
3613
if (dfa->syntax & RE_DOT_NOT_NULL)
3614
bitset_clear (accepts, '\0');
3616
#ifdef RE_ENABLE_I18N
3617
else if (type == OP_UTF8_PERIOD)
3619
memset (accepts, '\xff', sizeof (bitset_t) / 2);
3620
if (!(dfa->syntax & RE_DOT_NEWLINE))
3621
bitset_clear (accepts, '\n');
3622
if (dfa->syntax & RE_DOT_NOT_NULL)
3623
bitset_clear (accepts, '\0');
3629
/* Check the `accepts' and sift the characters which are not
3630
match it the context. */
3633
if (constraint & NEXT_NEWLINE_CONSTRAINT)
3635
bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3636
bitset_empty (accepts);
3637
if (accepts_newline)
3638
bitset_set (accepts, NEWLINE_CHAR);
3642
if (constraint & NEXT_ENDBUF_CONSTRAINT)
3644
bitset_empty (accepts);
3648
if (constraint & NEXT_WORD_CONSTRAINT)
3650
bitset_word_t any_set = 0;
3651
if (type == CHARACTER && !node->word_char)
3653
bitset_empty (accepts);
3656
#ifdef RE_ENABLE_I18N
3657
if (dfa->mb_cur_max > 1)
3658
for (j = 0; j < BITSET_WORDS; ++j)
3659
any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3662
for (j = 0; j < BITSET_WORDS; ++j)
3663
any_set |= (accepts[j] &= dfa->word_char[j]);
3667
if (constraint & NEXT_NOTWORD_CONSTRAINT)
3669
bitset_word_t any_set = 0;
3670
if (type == CHARACTER && node->word_char)
3672
bitset_empty (accepts);
3675
#ifdef RE_ENABLE_I18N
3676
if (dfa->mb_cur_max > 1)
3677
for (j = 0; j < BITSET_WORDS; ++j)
3678
any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3681
for (j = 0; j < BITSET_WORDS; ++j)
3682
any_set |= (accepts[j] &= ~dfa->word_char[j]);
3688
/* Then divide `accepts' into DFA states, or create a new
3689
state. Above, we make sure that accepts is not empty. */
3690
for (j = 0; j < ndests; ++j)
3692
bitset_t intersec; /* Intersection sets, see below. */
3694
/* Flags, see below. */
3695
bitset_word_t has_intersec, not_subset, not_consumed;
3697
/* Optimization, skip if this state doesn't accept the character. */
3698
if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3701
/* Enumerate the intersection set of this state and `accepts'. */
3703
for (k = 0; k < BITSET_WORDS; ++k)
3704
has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3705
/* And skip if the intersection set is empty. */
3709
/* Then check if this state is a subset of `accepts'. */
3710
not_subset = not_consumed = 0;
3711
for (k = 0; k < BITSET_WORDS; ++k)
3713
not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3714
not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3717
/* If this state isn't a subset of `accepts', create a
3718
new group state, which has the `remains'. */
3721
bitset_copy (dests_ch[ndests], remains);
3722
bitset_copy (dests_ch[j], intersec);
3723
err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3724
if (BE (err != REG_NOERROR, 0))
3729
/* Put the position in the current group. */
3730
result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3731
if (BE (result < 0, 0))
3734
/* If all characters are consumed, go to next node. */
3738
/* Some characters remain, create a new group. */
3741
bitset_copy (dests_ch[ndests], accepts);
3742
err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3743
if (BE (err != REG_NOERROR, 0))
3746
bitset_empty (accepts);
3751
for (j = 0; j < ndests; ++j)
3752
re_node_set_free (dests_node + j);
3756
#ifdef RE_ENABLE_I18N
3757
/* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3758
Return the number of the bytes the node accepts.
3759
STR_IDX is the current index of the input string.
3761
This function handles the nodes which can accept one character, or
3762
one collating element like '.', '[a-z]', opposite to the other nodes
3763
can only accept one byte. */
3767
check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
3768
const re_string_t *input, int str_idx)
3770
const re_token_t *node = dfa->nodes + node_idx;
3771
int char_len, elem_len;
3775
if (BE (node->type == OP_UTF8_PERIOD, 0))
3777
unsigned char c = re_string_byte_at (input, str_idx), d;
3778
if (BE (c < 0xc2, 1))
3781
if (str_idx + 2 > input->len)
3784
d = re_string_byte_at (input, str_idx + 1);
3786
return (d < 0x80 || d > 0xbf) ? 0 : 2;
3790
if (c == 0xe0 && d < 0xa0)
3796
if (c == 0xf0 && d < 0x90)
3802
if (c == 0xf8 && d < 0x88)
3808
if (c == 0xfc && d < 0x84)
3814
if (str_idx + char_len > input->len)
3817
for (i = 1; i < char_len; ++i)
3819
d = re_string_byte_at (input, str_idx + i);
3820
if (d < 0x80 || d > 0xbf)
3826
char_len = re_string_char_size_at (input, str_idx);
3827
if (node->type == OP_PERIOD)
3831
/* FIXME: I don't think this if is needed, as both '\n'
3832
and '\0' are char_len == 1. */
3833
/* '.' accepts any one character except the following two cases. */
3834
if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
3835
re_string_byte_at (input, str_idx) == '\n') ||
3836
((dfa->syntax & RE_DOT_NOT_NULL) &&
3837
re_string_byte_at (input, str_idx) == '\0'))
3842
elem_len = re_string_elem_size_at (input, str_idx);
3843
wc = __btowc(*(input->mbs+str_idx));
3844
if (((elem_len <= 1 && char_len <= 1) || char_len == 0) && (wc != WEOF && wc < SBC_MAX))
3847
if (node->type == COMPLEX_BRACKET)
3849
const re_charset_t *cset = node->opr.mbcset;
3851
const unsigned char *pin
3852
= ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3857
wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3858
? re_string_wchar_at (input, str_idx) : 0);
3860
/* match with multibyte character? */
3861
for (i = 0; i < cset->nmbchars; ++i)
3862
if (wc == cset->mbchars[i])
3864
match_len = char_len;
3865
goto check_node_accept_bytes_match;
3867
/* match with character_class? */
3868
for (i = 0; i < cset->nchar_classes; ++i)
3870
wctype_t wt = cset->char_classes[i];
3871
if (__iswctype (wc, wt))
3873
match_len = char_len;
3874
goto check_node_accept_bytes_match;
3879
nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3882
unsigned int in_collseq = 0;
3883
const int32_t *table, *indirect;
3884
const unsigned char *weights, *extra;
3885
const char *collseqwc;
3886
/* This #include defines a local function! */
3887
# include <locale/weight.h>
3889
/* match with collating_symbol? */
3890
if (cset->ncoll_syms)
3891
extra = (const unsigned char *)
3892
_NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3893
for (i = 0; i < cset->ncoll_syms; ++i)
3895
const unsigned char *coll_sym = extra + cset->coll_syms[i];
3896
/* Compare the length of input collating element and
3897
the length of current collating element. */
3898
if (*coll_sym != elem_len)
3900
/* Compare each bytes. */
3901
for (j = 0; j < *coll_sym; j++)
3902
if (pin[j] != coll_sym[1 + j])
3906
/* Match if every bytes is equal. */
3908
goto check_node_accept_bytes_match;
3914
if (elem_len <= char_len)
3916
collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3917
in_collseq = __collseq_table_lookup (collseqwc, wc);
3920
in_collseq = find_collation_sequence_value (pin, elem_len);
3922
/* match with range expression? */
3923
for (i = 0; i < cset->nranges; ++i)
3924
if (cset->range_starts[i] <= in_collseq
3925
&& in_collseq <= cset->range_ends[i])
3927
match_len = elem_len;
3928
goto check_node_accept_bytes_match;
3931
/* match with equivalence_class? */
3932
if (cset->nequiv_classes)
3934
const unsigned char *cp = pin;
3935
table = (const int32_t *)
3936
_NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3937
weights = (const unsigned char *)
3938
_NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3939
extra = (const unsigned char *)
3940
_NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3941
indirect = (const int32_t *)
3942
_NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3943
int32_t idx = findidx (&cp, elem_len);
3945
for (i = 0; i < cset->nequiv_classes; ++i)
3947
int32_t equiv_class_idx = cset->equiv_classes[i];
3948
size_t weight_len = weights[idx & 0xffffff];
3949
if (weight_len == weights[equiv_class_idx & 0xffffff]
3950
&& (idx >> 24) == (equiv_class_idx >> 24))
3955
equiv_class_idx &= 0xffffff;
3957
while (cnt <= weight_len
3958
&& (weights[equiv_class_idx + 1 + cnt]
3959
== weights[idx + 1 + cnt]))
3961
if (cnt > weight_len)
3963
match_len = elem_len;
3964
goto check_node_accept_bytes_match;
3973
/* match with range expression? */
3974
for (i = 0; i < cset->nranges; ++i)
3976
if (cset->range_starts[i] <= wc
3977
&& wc <= cset->range_ends[i])
3979
match_len = char_len;
3980
goto check_node_accept_bytes_match;
3984
check_node_accept_bytes_match:
3985
if (!cset->non_match)
3992
return (elem_len > char_len) ? elem_len : char_len;
4001
find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
4003
uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
4008
/* No valid character. Match it as a single byte character. */
4009
const unsigned char *collseq = (const unsigned char *)
4010
_NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
4011
return collseq[mbs[0]];
4018
const unsigned char *extra = (const unsigned char *)
4019
_NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
4020
int32_t extrasize = (const unsigned char *)
4021
_NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
4023
for (idx = 0; idx < extrasize;)
4025
int mbs_cnt, found = 0;
4026
int32_t elem_mbs_len;
4027
/* Skip the name of collating element name. */
4028
idx = idx + extra[idx] + 1;
4029
elem_mbs_len = extra[idx++];
4030
if (mbs_len == elem_mbs_len)
4032
for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
4033
if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
4035
if (mbs_cnt == elem_mbs_len)
4036
/* Found the entry. */
4039
/* Skip the byte sequence of the collating element. */
4040
idx += elem_mbs_len;
4041
/* Adjust for the alignment. */
4042
idx = (idx + 3) & ~3;
4043
/* Skip the collation sequence value. */
4044
idx += sizeof (uint32_t);
4045
/* Skip the wide char sequence of the collating element. */
4046
idx = idx + sizeof (uint32_t) * (*(int32_t *) (extra + idx) + 1);
4047
/* If we found the entry, return the sequence value. */
4049
return *(uint32_t *) (extra + idx);
4050
/* Skip the collation sequence value. */
4051
idx += sizeof (uint32_t);
4057
#endif /* RE_ENABLE_I18N */
4059
/* Check whether the node accepts the byte which is IDX-th
4060
byte of the INPUT. */
4064
check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
4068
ch = re_string_byte_at (&mctx->input, idx);
4072
if (node->opr.c != ch)
4076
case SIMPLE_BRACKET:
4077
if (!bitset_contain (node->opr.sbcset, ch))
4081
#ifdef RE_ENABLE_I18N
4082
case OP_UTF8_PERIOD:
4088
if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
4089
|| (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
4097
if (node->constraint)
4099
/* The node has constraints. Check whether the current context
4100
satisfies the constraints. */
4101
unsigned int context = re_string_context_at (&mctx->input, idx,
4103
if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
4110
/* Extend the buffers, if the buffers have run out. */
4112
static reg_errcode_t
4113
internal_function __attribute_warn_unused_result__
4114
extend_buffers (re_match_context_t *mctx, int min_len)
4117
re_string_t *pstr = &mctx->input;
4119
/* Avoid overflow. */
4120
if (BE (INT_MAX / 2 / sizeof (re_dfastate_t *) <= pstr->bufs_len, 0))
4123
/* Double the lengthes of the buffers, but allocate at least MIN_LEN. */
4124
ret = re_string_realloc_buffers (pstr,
4126
MIN (pstr->len, pstr->bufs_len * 2)));
4127
if (BE (ret != REG_NOERROR, 0))
4130
if (mctx->state_log != NULL)
4132
/* And double the length of state_log. */
4133
/* XXX We have no indication of the size of this buffer. If this
4134
allocation fail we have no indication that the state_log array
4135
does not have the right size. */
4136
re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4137
pstr->bufs_len + 1);
4138
if (BE (new_array == NULL, 0))
4140
mctx->state_log = new_array;
4143
/* Then reconstruct the buffers. */
4146
#ifdef RE_ENABLE_I18N
4147
if (pstr->mb_cur_max > 1)
4149
ret = build_wcs_upper_buffer (pstr);
4150
if (BE (ret != REG_NOERROR, 0))
4154
#endif /* RE_ENABLE_I18N */
4155
build_upper_buffer (pstr);
4159
#ifdef RE_ENABLE_I18N
4160
if (pstr->mb_cur_max > 1)
4161
build_wcs_buffer (pstr);
4163
#endif /* RE_ENABLE_I18N */
4165
if (pstr->trans != NULL)
4166
re_string_translate_buffer (pstr);
4173
/* Functions for matching context. */
4175
/* Initialize MCTX. */
4177
static reg_errcode_t
4178
internal_function __attribute_warn_unused_result__
4179
match_ctx_init (re_match_context_t *mctx, int eflags, int n)
4181
mctx->eflags = eflags;
4182
mctx->match_last = -1;
4185
mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4186
mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4187
if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4190
/* Already zero-ed by the caller.
4192
mctx->bkref_ents = NULL;
4193
mctx->nbkref_ents = 0;
4194
mctx->nsub_tops = 0; */
4195
mctx->abkref_ents = n;
4196
mctx->max_mb_elem_len = 1;
4197
mctx->asub_tops = n;
4201
/* Clean the entries which depend on the current input in MCTX.
4202
This function must be invoked when the matcher changes the start index
4203
of the input, or changes the input string. */
4207
match_ctx_clean (re_match_context_t *mctx)
4210
for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4213
re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4214
for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4216
re_sub_match_last_t *last = top->lasts[sl_idx];
4217
re_free (last->path.array);
4220
re_free (top->lasts);
4223
re_free (top->path->array);
4224
re_free (top->path);
4229
mctx->nsub_tops = 0;
4230
mctx->nbkref_ents = 0;
4233
/* Free all the memory associated with MCTX. */
4237
match_ctx_free (re_match_context_t *mctx)
4239
/* First, free all the memory associated with MCTX->SUB_TOPS. */
4240
match_ctx_clean (mctx);
4241
re_free (mctx->sub_tops);
4242
re_free (mctx->bkref_ents);
4245
/* Add a new backreference entry to MCTX.
4246
Note that we assume that caller never call this function with duplicate
4247
entry, and call with STR_IDX which isn't smaller than any existing entry.
4250
static reg_errcode_t
4251
internal_function __attribute_warn_unused_result__
4252
match_ctx_add_entry (re_match_context_t *mctx, int node, int str_idx, int from,
4255
if (mctx->nbkref_ents >= mctx->abkref_ents)
4257
struct re_backref_cache_entry* new_entry;
4258
new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4259
mctx->abkref_ents * 2);
4260
if (BE (new_entry == NULL, 0))
4262
re_free (mctx->bkref_ents);
4265
mctx->bkref_ents = new_entry;
4266
memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4267
sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4268
mctx->abkref_ents *= 2;
4270
if (mctx->nbkref_ents > 0
4271
&& mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4272
mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4274
mctx->bkref_ents[mctx->nbkref_ents].node = node;
4275
mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4276
mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4277
mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4279
/* This is a cache that saves negative results of check_dst_limits_calc_pos.
4280
If bit N is clear, means that this entry won't epsilon-transition to
4281
an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression. If
4282
it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4285
A backreference does not epsilon-transition unless it is empty, so set
4286
to all zeros if FROM != TO. */
4287
mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4288
= (from == to ? ~0 : 0);
4290
mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4291
if (mctx->max_mb_elem_len < to - from)
4292
mctx->max_mb_elem_len = to - from;
4296
/* Search for the first entry which has the same str_idx, or -1 if none is
4297
found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
4301
search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
4303
int left, right, mid, last;
4304
last = right = mctx->nbkref_ents;
4305
for (left = 0; left < right;)
4307
mid = (left + right) / 2;
4308
if (mctx->bkref_ents[mid].str_idx < str_idx)
4313
if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4319
/* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4322
static reg_errcode_t
4323
internal_function __attribute_warn_unused_result__
4324
match_ctx_add_subtop (re_match_context_t *mctx, int node, int str_idx)
4327
assert (mctx->sub_tops != NULL);
4328
assert (mctx->asub_tops > 0);
4330
if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4332
int new_asub_tops = mctx->asub_tops * 2;
4333
re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4334
re_sub_match_top_t *,
4336
if (BE (new_array == NULL, 0))
4338
mctx->sub_tops = new_array;
4339
mctx->asub_tops = new_asub_tops;
4341
mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4342
if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4344
mctx->sub_tops[mctx->nsub_tops]->node = node;
4345
mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4349
/* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4350
at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */
4352
static re_sub_match_last_t *
4354
match_ctx_add_sublast (re_sub_match_top_t *subtop, int node, int str_idx)
4356
re_sub_match_last_t *new_entry;
4357
if (BE (subtop->nlasts == subtop->alasts, 0))
4359
int new_alasts = 2 * subtop->alasts + 1;
4360
re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4361
re_sub_match_last_t *,
4363
if (BE (new_array == NULL, 0))
4365
subtop->lasts = new_array;
4366
subtop->alasts = new_alasts;
4368
new_entry = calloc (1, sizeof (re_sub_match_last_t));
4369
if (BE (new_entry != NULL, 1))
4371
subtop->lasts[subtop->nlasts] = new_entry;
4372
new_entry->node = node;
4373
new_entry->str_idx = str_idx;
4381
sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
4382
re_dfastate_t **limited_sts, int last_node, int last_str_idx)
4384
sctx->sifted_states = sifted_sts;
4385
sctx->limited_states = limited_sts;
4386
sctx->last_node = last_node;
4387
sctx->last_str_idx = last_str_idx;
4388
re_node_set_init_empty (&sctx->limits);