1
1
/* Extended regular expression matching and search library.
2
Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 Free
3
Software Foundation, Inc.
2
Copyright (C) 2002-2013 Free Software Foundation, Inc.
4
3
This file is part of the GNU C Library.
5
4
Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
7
This program is free software; you can redistribute it and/or modify
8
it under the terms of the GNU General Public License as published by
9
the Free Software Foundation; either version 3, or (at your option)
6
The GNU C Library is free software; you can redistribute it and/or
7
modify it under the terms of the GNU General Public
8
License as published by the Free Software Foundation; either
9
version 3 of the License, or (at your option) any later version.
12
This program is distributed in the hope that it will be useful,
11
The GNU C Library is distributed in the hope that it will be useful,
13
12
but WITHOUT ANY WARRANTY; without even the implied warranty of
14
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
GNU General Public License for more details.
13
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
General Public License for more details.
17
You should have received a copy of the GNU General Public License along
18
with this program; if not, write to the Free Software Foundation,
19
Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
16
You should have received a copy of the GNU General Public
17
License along with the GNU C Library; if not, see
18
<http://www.gnu.org/licenses/>. */
21
20
static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
22
21
Idx n) internal_function;
52
51
regoff_t range, Idx stop,
53
52
struct re_registers *regs,
54
53
bool ret_len) internal_function;
55
static unsigned int re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
56
Idx nregs, int regs_allocated)
54
static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
55
Idx nregs, int regs_allocated) internal_function;
58
56
static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
60
58
static Idx check_matching (re_match_context_t *mctx, bool fl_longest_match,
201
199
static bool check_node_accept (const re_match_context_t *mctx,
202
200
const re_token_t *node, Idx idx)
203
201
internal_function;
204
static reg_errcode_t extend_buffers (re_match_context_t *mctx)
202
static reg_errcode_t extend_buffers (re_match_context_t *mctx, int min_len)
205
203
internal_function;
207
205
/* Entry point for POSIX code. */
212
210
If NMATCH is zero or REG_NOSUB was set in the cflags argument to
213
`regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
211
'regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
214
212
least NMATCH elements, and we set them to the offsets of the
215
213
corresponding matched substrings.
217
EFLAGS specifies `execution flags' which affect matching: if
215
EFLAGS specifies "execution flags" which affect matching: if
218
216
REG_NOTBOL is set, then ^ does not match at the beginning of the
219
217
string; if REG_NOTEOL is set, then $ does not match at the end.
370
367
re_search_2_stub (struct re_pattern_buffer *bufp,
371
368
const char *string1, Idx length1,
372
369
const char *string2, Idx length2,
414
411
otherwise the position of the match is returned. */
418
414
re_search_stub (struct re_pattern_buffer *bufp,
419
415
const char *string, Idx length,
420
416
Idx start, regoff_t range, Idx stop, struct re_registers *regs,
481
/* I hope we needn't fill ther regs with -1's when no match was found. */
477
/* I hope we needn't fill their regs with -1's when no match was found. */
482
478
if (result != REG_NOERROR)
479
rval = result == REG_NOMATCH ? -1 : -2;
484
480
else if (regs != NULL)
486
482
/* If caller wants register contents data back, copy them. */
511
506
re_copy_regs (struct re_registers *regs, regmatch_t *pmatch, Idx nregs,
512
507
int regs_allocated)
514
509
int rval = REGS_REALLOCATE;
516
511
Idx need_regs = nregs + 1;
517
/* We need one extra element beyond `num_regs' for the `-1' marker GNU code
512
/* We need one extra element beyond 'num_regs' for the '-1' marker GNU code
520
515
/* Have the register data arrays been allocated? */
637
632
(0 <= LAST_START && LAST_START <= LENGTH) */
639
634
static reg_errcode_t
640
internal_function __attribute_warn_unused_result__
635
__attribute_warn_unused_result__
641
636
re_search_internal (const regex_t *preg,
642
637
const char *string, Idx length,
643
638
Idx start, Idx last_start, Idx stop,
720
715
if (nmatch > 1 || dfa->has_mb_node)
722
717
/* Avoid overflow. */
723
if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= mctx.input.bufs_len, 0))
718
if (BE ((MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *))
719
<= mctx.input.bufs_len), 0))
725
721
err = REG_ESPACE;
726
722
goto free_return;
740
736
mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
741
737
: CONTEXT_NEWLINE | CONTEXT_BEGBUF;
743
/* Check incrementally whether of not the input string match. */
739
/* Check incrementally whether the input string matches. */
744
740
incr = (last_start < start) ? -1 : 1;
745
741
left_lim = (last_start < start) ? last_start : start;
746
742
right_lim = (last_start < start) ? start : last_start;
922
918
goto free_return;
925
/* At last, add the offset to the each registers, since we slided
921
/* At last, add the offset to each register, since we slid
926
922
the buffers so that we could assume that the matching starts
928
924
for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
988
984
halt_node = mctx->last_node;
990
986
/* Avoid overflow. */
991
if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= match_last, 0))
987
if (BE (MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *)) <= match_last, 0))
992
988
return REG_ESPACE;
994
990
sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
1068
1064
since initial states may have constraints like "\<", "^", etc.. */
1070
1066
static inline re_dfastate_t *
1071
__attribute ((always_inline)) internal_function
1067
__attribute__ ((always_inline)) internal_function
1072
1068
acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
1106
1102
FL_LONGEST_MATCH means we want the POSIX longest matching.
1107
1103
If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1108
1104
next place where we may want to try matching.
1109
Note that the matcher assume that the maching starts from the current
1105
Note that the matcher assumes that the matching starts from the current
1110
1106
index of the buffer. */
1175
1171
re_dfastate_t *old_state = cur_state;
1176
1172
Idx next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1178
if (BE (next_char_idx >= mctx->input.bufs_len, 0)
1174
if ((BE (next_char_idx >= mctx->input.bufs_len, 0)
1175
&& mctx->input.bufs_len < mctx->input.len)
1179
1176
|| (BE (next_char_idx >= mctx->input.valid_len, 0)
1180
1177
&& mctx->input.valid_len < mctx->input.len))
1182
err = extend_buffers (mctx);
1179
err = extend_buffers (mctx, next_char_idx + 1);
1183
1180
if (BE (err != REG_NOERROR, 0))
1185
1182
assert (err == REG_ESPACE);
1436
1433
set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
1437
1434
regmatch_t *pmatch, bool fl_backtrack)
1439
const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
1436
const re_dfa_t *dfa = preg->buffer;
1440
1437
Idx idx, cur_node;
1441
1438
re_node_set eps_via_nodes;
1442
1439
struct re_fail_stack_t *fs;
1608
1605
and sift the nodes in each states according to the following rules.
1609
1606
Updated state_log will be wrote to STATE_LOG.
1611
Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1608
Rules: We throw away the Node 'a' in the STATE_LOG[STR_IDX] if...
1612
1609
1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1613
If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1614
the LAST_NODE, we throw away the node `a'.
1615
2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1616
string `s' and transit to `b':
1610
If 'a' isn't the LAST_NODE and 'a' can't epsilon transit to
1611
the LAST_NODE, we throw away the node 'a'.
1612
2. When 0 <= STR_IDX < MATCH_LAST and 'a' accepts
1613
string 's' and transit to 'b':
1617
1614
i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1619
1616
ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1620
thrown away, we throw away the node `a'.
1617
thrown away, we throw away the node 'a'.
1621
1618
3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1622
1619
i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1624
1621
ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1625
we throw away the node `a'. */
1622
we throw away the node 'a'. */
1627
1624
#define STATE_NODE_CONTAINS(state,node) \
1628
1625
((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1697
1694
/* Then build the next sifted state.
1698
We build the next sifted state on `cur_dest', and update
1699
`sifted_states[str_idx]' with `cur_dest'.
1695
We build the next sifted state on 'cur_dest', and update
1696
'sifted_states[str_idx]' with 'cur_dest'.
1701
`cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1702
`cur_src' points the node_set of the old `state_log[str_idx]'
1698
'cur_dest' is the sifted state from 'state_log[str_idx + 1]'.
1699
'cur_src' points the node_set of the old 'state_log[str_idx]'
1703
1700
(with the epsilon nodes pre-filtered out). */
1704
1701
for (i = 0; i < cur_src->nelem; i++)
1712
1709
assert (!IS_EPSILON_NODE (type));
1714
1711
#ifdef RE_ENABLE_I18N
1715
/* If the node may accept `multi byte'. */
1712
/* If the node may accept "multi byte". */
1716
1713
if (dfa->nodes[prev_node].accept_mb)
1717
1714
naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1718
1715
str_idx, sctx->last_str_idx);
1754
1751
Idx top = mctx->state_log_top;
1756
if (next_state_log_idx >= mctx->input.bufs_len
1753
if ((next_state_log_idx >= mctx->input.bufs_len
1754
&& mctx->input.bufs_len < mctx->input.len)
1757
1755
|| (next_state_log_idx >= mctx->input.valid_len
1758
1756
&& mctx->input.valid_len < mctx->input.len))
1760
1758
reg_errcode_t err;
1761
err = extend_buffers (mctx);
1759
err = extend_buffers (mctx, next_state_log_idx + 1);
1762
1760
if (BE (err != REG_NOERROR, 0))
2269
2267
const re_dfa_t *const dfa = mctx->dfa;
2271
/* Check the node can accept `multi byte'. */
2269
/* Check the node can accept "multi byte". */
2272
2270
naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2273
2271
if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2274
2272
!STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2275
2273
dfa->nexts[node_idx]))
2276
/* The node can't accept the `multi byte', or the
2274
/* The node can't accept the "multi byte", or the
2277
2275
destination was already thrown away, then the node
2278
could't accept the current input `multi byte'. */
2276
could't accept the current input "multi byte". */
2280
2278
/* Otherwise, it is sure that the node could accept
2281
`naccepted' bytes input. */
2279
'naccepted' bytes input. */
2282
2280
return naccepted;
2284
2282
#endif /* RE_ENABLE_I18N */
2457
2455
/* From the node set CUR_NODES, pick up the nodes whose types are
2458
2456
OP_OPEN_SUBEXP and which have corresponding back references in the regular
2459
2457
expression. And register them to use them later for evaluating the
2460
correspoding back references. */
2458
corresponding back references. */
2462
2460
static reg_errcode_t
2463
2461
internal_function
2568
2566
if (naccepted == 0)
2571
/* The node can accepts `naccepted' bytes. */
2569
/* The node can accepts 'naccepted' bytes. */
2572
2570
dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2573
2571
mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2574
2572
: mctx->max_mb_elem_len);
2620
2618
const re_token_t *node = dfa->nodes + node_idx;
2621
2619
re_node_set *new_dest_nodes;
2623
/* Check whether `node' is a backreference or not. */
2621
/* Check whether 'node' is a backreference or not. */
2624
2622
if (node->type != OP_BACK_REF)
2635
/* `node' is a backreference.
2633
/* 'node' is a backreference.
2636
2634
Check the substring which the substring matched. */
2637
2635
bkc_idx = mctx->nbkref_ents;
2638
2636
err = get_subexp (mctx, node_idx, cur_str_idx);
2639
2637
if (BE (err != REG_NOERROR, 0))
2640
2638
goto free_return;
2642
/* And add the epsilon closures (which is `new_dest_nodes') of
2640
/* And add the epsilon closures (which is 'new_dest_nodes') of
2643
2641
the backreference to appropriate state_log. */
2645
2643
assert (dfa->nexts[node_idx] != REG_MISSING);
2663
2661
dest_state = mctx->state_log[dest_str_idx];
2664
2662
prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2665
2663
: mctx->state_log[cur_str_idx]->nodes.nelem);
2666
/* Add `new_dest_node' to state_log. */
2664
/* Add 'new_dest_node' to state_log. */
2667
2665
if (dest_state == NULL)
2669
2667
mctx->state_log[dest_str_idx]
2815
2813
if (bkref_str_off >= mctx->input.len)
2818
err = extend_buffers (mctx);
2816
err = extend_buffers (mctx, bkref_str_off + 1);
2819
2817
if (BE (err != REG_NOERROR, 0))
2938
2936
re_dfastate_t **new_array;
2939
2937
Idx old_alloc = path->alloc;
2940
Idx new_alloc = old_alloc + last_str + mctx->max_mb_elem_len + 1;
2941
if (BE (new_alloc < old_alloc, 0)
2942
|| BE (SIZE_MAX / sizeof (re_dfastate_t *) < new_alloc, 0))
2938
Idx incr_alloc = last_str + mctx->max_mb_elem_len + 1;
2940
if (BE (IDX_MAX - old_alloc < incr_alloc, 0))
2942
new_alloc = old_alloc + incr_alloc;
2943
if (BE (SIZE_MAX / sizeof (re_dfastate_t *) < new_alloc, 0))
2943
2944
return REG_ESPACE;
2944
2945
new_array = re_realloc (path->array, re_dfastate_t *, new_alloc);
2945
2946
if (BE (new_array == NULL, 0))
3102
3103
assert (!IS_EPSILON_NODE (type));
3104
3105
#ifdef RE_ENABLE_I18N
3105
/* If the node may accept `multi byte'. */
3106
/* If the node may accept "multi byte". */
3106
3107
if (dfa->nodes[cur_node].accept_mb)
3108
3109
naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3359
3360
bitset_word_t elem, mask;
3360
3361
bool dests_node_malloced = false;
3361
3362
bool dest_states_malloced = false;
3362
Idx ndests; /* Number of the destination states from `state'. */
3363
Idx ndests; /* Number of the destination states from 'state'. */
3363
3364
re_dfastate_t **trtable;
3364
3365
re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3365
3366
re_node_set follows, *dests_node;
3373
3374
} *dests_alloc;
3375
3376
/* We build DFA states which corresponds to the destination nodes
3376
from `state'. `dests_node[i]' represents the nodes which i-th
3377
destination state contains, and `dests_ch[i]' represents the
3377
from 'state'. 'dests_node[i]' represents the nodes which i-th
3378
destination state contains, and 'dests_ch[i]' represents the
3378
3379
characters which i-th destination state accepts. */
3379
3380
if (__libc_use_alloca (sizeof (struct dests_alloc)))
3380
3381
dests_alloc = (struct dests_alloc *) alloca (sizeof (struct dests_alloc));
3388
3389
dests_node = dests_alloc->dests_node;
3389
3390
dests_ch = dests_alloc->dests_ch;
3391
/* Initialize transiton table. */
3392
/* Initialize transition table. */
3392
3393
state->word_trtable = state->trtable = NULL;
3394
/* At first, group all nodes belonging to `state' into several
3395
/* At first, group all nodes belonging to 'state' into several
3395
3396
destinations. */
3396
3397
ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3397
3398
if (BE (! REG_VALID_NONZERO_INDEX (ndests), 0))
3399
3400
if (dests_node_malloced)
3400
3401
free (dests_alloc);
3402
/* Return false in case of an error, true otherwise. */
3401
3403
if (ndests == 0)
3403
3405
state->trtable = (re_dfastate_t **)
3404
3406
calloc (sizeof (re_dfastate_t *), SBC_MAX);
3407
if (BE (state->trtable == NULL, 0))
3591
3595
reg_errcode_t err;
3594
Idx ndests; /* Number of the destinations from `state'. */
3598
Idx ndests; /* Number of the destinations from 'state'. */
3595
3599
bitset_t accepts; /* Characters a node can accept. */
3596
3600
const re_node_set *cur_nodes = &state->nodes;
3597
3601
bitset_empty (accepts);
3600
/* For all the nodes belonging to `state', */
3604
/* For all the nodes belonging to 'state', */
3601
3605
for (i = 0; i < cur_nodes->nelem; ++i)
3603
3607
re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3702
/* Then divide `accepts' into DFA states, or create a new
3706
/* Then divide 'accepts' into DFA states, or create a new
3703
3707
state. Above, we make sure that accepts is not empty. */
3704
3708
for (j = 0; j < ndests; ++j)
3712
3716
if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3715
/* Enumerate the intersection set of this state and `accepts'. */
3719
/* Enumerate the intersection set of this state and 'accepts'. */
3716
3720
has_intersec = 0;
3717
3721
for (k = 0; k < BITSET_WORDS; ++k)
3718
3722
has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3728
3732
not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3731
/* If this state isn't a subset of `accepts', create a
3732
new group state, which has the `remains'. */
3735
/* If this state isn't a subset of 'accepts', create a
3736
new group state, which has the 'remains'. */
3733
3737
if (not_subset)
3735
3739
bitset_copy (dests_ch[ndests], remains);
3895
3899
const int32_t *table, *indirect;
3896
3900
const unsigned char *weights, *extra;
3897
3901
const char *collseqwc;
3899
3902
/* This #include defines a local function! */
3900
3903
# include <locale/weight.h>
3933
3936
in_collseq = find_collation_sequence_value (pin, elem_len);
3935
3938
/* match with range expression? */
3939
/* FIXME: Implement rational ranges here, too. */
3936
3940
for (i = 0; i < cset->nranges; ++i)
3937
3941
if (cset->range_starts[i] <= in_collseq
3938
3942
&& in_collseq <= cset->range_ends[i])
3953
3957
_NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3954
3958
indirect = (const int32_t *)
3955
3959
_NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3956
int32_t idx = findidx (&cp);
3960
int32_t idx = findidx (&cp, elem_len);
3958
3962
for (i = 0; i < cset->nequiv_classes; ++i)
3984
3988
# endif /* _LIBC */
3986
3990
/* match with range expression? */
3987
#if __GNUC__ >= 2 && ! (__STDC_VERSION__ < 199901L && __STRICT_ANSI__)
3988
wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
3990
wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3993
3991
for (i = 0; i < cset->nranges; ++i)
3995
cmp_buf[0] = cset->range_starts[i];
3996
cmp_buf[4] = cset->range_ends[i];
3997
if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
3998
&& wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3993
if (cset->range_starts[i] <= wc && wc <= cset->range_ends[i])
4000
3995
match_len = char_len;
4001
3996
goto check_node_accept_bytes_match;
4065
4060
/* Skip the collation sequence value. */
4066
4061
idx += sizeof (uint32_t);
4067
4062
/* Skip the wide char sequence of the collating element. */
4068
idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
4063
idx = idx + sizeof (uint32_t) * (*(int32_t *) (extra + idx) + 1);
4069
4064
/* If we found the entry, return the sequence value. */
4071
4066
return *(uint32_t *) (extra + idx);
4134
4129
static reg_errcode_t
4135
4130
internal_function __attribute_warn_unused_result__
4136
extend_buffers (re_match_context_t *mctx)
4131
extend_buffers (re_match_context_t *mctx, int min_len)
4138
4133
reg_errcode_t ret;
4139
4134
re_string_t *pstr = &mctx->input;
4141
4136
/* Avoid overflow. */
4142
if (BE (SIZE_MAX / 2 / sizeof (re_dfastate_t *) <= pstr->bufs_len, 0))
4137
if (BE (MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *)) / 2
4138
<= pstr->bufs_len, 0))
4143
4139
return REG_ESPACE;
4145
/* Double the lengthes of the buffers. */
4146
ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
4141
/* Double the lengths of the buffers, but allocate at least MIN_LEN. */
4142
ret = re_string_realloc_buffers (pstr,
4144
MIN (pstr->len, pstr->bufs_len * 2)));
4147
4145
if (BE (ret != REG_NOERROR, 0))
4206
4204
size_t max_object_size =
4207
4205
MAX (sizeof (struct re_backref_cache_entry),
4208
4206
sizeof (re_sub_match_top_t *));
4209
if (BE (SIZE_MAX / max_object_size < n, 0))
4207
if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) < n, 0))
4210
4208
return REG_ESPACE;
4212
4210
mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);