1
1
/* Extended regular expression matching and search library.
2
Copyright (C) 2002-2012 Free Software Foundation, Inc.
2
Copyright (C) 2002-2014 Free Software Foundation, Inc.
3
3
This file is part of the GNU C Library.
4
4
Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6
This program is free software; you can redistribute it and/or modify
7
it under the terms of the GNU General Public License as published by
8
the Free Software Foundation; either version 3, or (at your option)
6
The GNU C Library is free software; you can redistribute it and/or
7
modify it under the terms of the GNU General Public
8
License as published by the Free Software Foundation; either
9
version 3 of the License, or (at your option) any later version.
11
This program is distributed in the hope that it will be useful,
11
The GNU C Library is distributed in the hope that it will be useful,
12
12
but WITHOUT ANY WARRANTY; without even the implied warranty of
13
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
GNU General Public License for more details.
13
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
General Public License for more details.
16
You should have received a copy of the GNU General Public License along
17
with this program; if not, see <http://www.gnu.org/licenses/>. */
16
You should have received a copy of the GNU General Public
17
License along with the GNU C Library; if not, see
18
<http://www.gnu.org/licenses/>. */
19
20
static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
20
21
size_t length, reg_syntax_t syntax);
94
95
re_charset_t *mbcset,
95
96
Idx *char_class_alloc,
96
const unsigned char *class_name,
97
const char *class_name,
97
98
reg_syntax_t syntax);
98
99
#else /* not RE_ENABLE_I18N */
99
100
static reg_errcode_t build_equiv_class (bitset_t sbcset,
100
101
const unsigned char *name);
101
102
static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
103
const unsigned char *class_name,
104
const char *class_name,
104
105
reg_syntax_t syntax);
105
106
#endif /* not RE_ENABLE_I18N */
106
107
static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
107
108
RE_TRANSLATE_TYPE trans,
108
const unsigned char *class_name,
109
const unsigned char *extra,
109
const char *class_name,
110
111
bool non_match, reg_errcode_t *err);
111
112
static bin_tree_t *create_tree (re_dfa_t *dfa,
112
113
bin_tree_t *left, bin_tree_t *right,
272
273
re_compile_fastmap (bufp)
273
274
struct re_pattern_buffer *bufp;
275
re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
276
re_dfa_t *dfa = bufp->buffer;
276
277
char *fastmap = bufp->fastmap;
278
279
memset (fastmap, '\0', sizeof (char) * SBC_MAX);
306
307
re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
309
re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
310
re_dfa_t *dfa = bufp->buffer;
311
312
bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
312
313
for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
585
586
static const bitset_t utf8_sb_map =
587
588
/* Set the first 128 bits. */
589
# if defined __GNUC__ && !defined __STRICT_ANSI__
589
590
[0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
591
592
# if 4 * BITSET_WORD_BITS < ASCII_CHARS
767
771
preg->regs_allocated = REGS_UNALLOCATED;
769
773
/* Initialize the dfa. */
770
dfa = (re_dfa_t *) preg->buffer;
771
775
if (BE (preg->allocated < sizeof (re_dfa_t), 0))
773
777
/* If zero allocated, but buffer is non-null, try to realloc
779
783
return REG_ESPACE;
780
784
preg->allocated = sizeof (re_dfa_t);
781
preg->buffer = (unsigned char *) dfa;
783
787
preg->used = sizeof (re_dfa_t);
785
789
err = init_dfa (dfa, length);
790
if (BE (err == REG_NOERROR && lock_init (dfa->lock) != 0, 0))
786
792
if (BE (err != REG_NOERROR, 0))
788
794
free_dfa_content (dfa);
796
802
strncpy (dfa->re_str, pattern, length + 1);
799
__libc_lock_init (dfa->lock);
801
805
err = re_string_construct (®exp, pattern, length, preg->translate,
802
806
(syntax & RE_ICASE) != 0, dfa);
803
807
if (BE (err != REG_NOERROR, 0))
901
907
codeset_name = nl_langinfo (CODESET);
902
if (strcasecmp (codeset_name, "UTF-8") == 0
903
|| strcasecmp (codeset_name, "UTF8") == 0)
908
if ((codeset_name[0] == 'U' || codeset_name[0] == 'u')
909
&& (codeset_name[1] == 'T' || codeset_name[1] == 't')
910
&& (codeset_name[2] == 'F' || codeset_name[2] == 'f')
911
&& strcmp (codeset_name + 3 + (codeset_name[3] == '-'), "8") == 0)
904
912
dfa->is_utf8 = 1;
906
914
/* We check exhaustively in the loop below if this charset is a
950
958
internal_function
951
959
init_word_char (re_dfa_t *dfa)
953
dfa->word_ops_used = 1;
964
dfa->word_ops_used = 1;
957
965
if (BE (dfa->map_notascii == 0, 1))
967
bitset_word_t bits0 = 0x00000000;
968
bitset_word_t bits1 = 0x03ff0000;
969
bitset_word_t bits2 = 0x87fffffe;
970
bitset_word_t bits3 = 0x07fffffe;
959
971
if (BITSET_WORD_BITS == 64)
961
dfa->word_char[0] = UINT64_C (0x03ff000000000000);
962
dfa->word_char[1] = UINT64_C (0x07fffffe87fffffe);
973
dfa->word_char[0] = bits1 << 31 << 1 | bits0;
974
dfa->word_char[1] = bits3 << 31 << 1 | bits2;
965
977
else if (BITSET_WORD_BITS == 32)
967
dfa->word_char[0] = UINT32_C (0x00000000);
968
dfa->word_char[1] = UINT32_C (0x03ff0000);
969
dfa->word_char[2] = UINT32_C (0x87fffffe);
970
dfa->word_char[3] = UINT32_C (0x07fffffe);
979
dfa->word_char[0] = bits0;
980
dfa->word_char[1] = bits1;
981
dfa->word_char[2] = bits2;
982
dfa->word_char[3] = bits3;
994
1006
free_workarea_compile (regex_t *preg)
996
re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1008
re_dfa_t *dfa = preg->buffer;
997
1009
bin_tree_storage_t *storage, *next;
998
1010
for (storage = dfa->str_tree_storage; storage; storage = next)
1358
1370
static bin_tree_t *
1359
1371
lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1361
re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1373
re_dfa_t *dfa = preg->buffer;
1362
1374
bin_tree_t *body = node->left;
1363
1375
bin_tree_t *op, *cls, *tree1, *tree;
2139
2151
parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2140
2152
reg_errcode_t *err)
2142
re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2154
re_dfa_t *dfa = preg->buffer;
2143
2155
bin_tree_t *tree, *eor, *root;
2144
2156
re_token_t current_token;
2145
2157
dfa->syntax = syntax;
2173
2185
parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2174
2186
reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2176
re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2188
re_dfa_t *dfa = preg->buffer;
2177
2189
bin_tree_t *tree, *branch = NULL;
2178
2190
tree = parse_branch (regexp, preg, token, syntax, nest, err);
2179
2191
if (BE (*err != REG_NOERROR && tree == NULL, 0))
2215
2227
reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2217
2229
bin_tree_t *tree, *expr;
2218
re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2230
re_dfa_t *dfa = preg->buffer;
2219
2231
tree = parse_expression (regexp, preg, token, syntax, nest, err);
2220
2232
if (BE (*err != REG_NOERROR && tree == NULL, 0))
2259
2271
parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2260
2272
reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2262
re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2274
re_dfa_t *dfa = preg->buffer;
2263
2275
bin_tree_t *tree;
2264
2276
switch (token->type)
2416
2428
case OP_NOTWORD:
2417
2429
tree = build_charclass_op (dfa, regexp->trans,
2418
(const unsigned char *) "alnum",
2419
(const unsigned char *) "_",
2420
2432
token->type == OP_NOTWORD, err);
2421
2433
if (BE (*err != REG_NOERROR && tree == NULL, 0))
2425
2437
case OP_NOTSPACE:
2426
2438
tree = build_charclass_op (dfa, regexp->trans,
2427
(const unsigned char *) "space",
2428
(const unsigned char *) "",
2429
2441
token->type == OP_NOTSPACE, err);
2430
2442
if (BE (*err != REG_NOERROR && tree == NULL, 0))
2475
2487
parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2476
2488
reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2478
re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2490
re_dfa_t *dfa = preg->buffer;
2479
2491
bin_tree_t *tree;
2480
2492
size_t cur_nsub;
2481
2493
cur_nsub = preg->re_nsub++;
2611
2629
old_tree = NULL;
2613
2631
if (elem->token.type == SUBEXP)
2614
postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2633
uintptr_t subidx = elem->token.opr.idx;
2634
postorder (elem, mark_opt_subexp, (void *) subidx);
2616
2637
tree = create_tree (dfa, elem, NULL,
2617
2638
(end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2697
2718
wint_t start_wc;
2699
wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2701
2721
start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2702
2722
: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2710
2730
? __btowc (end_ch) : end_elem->opr.wch);
2711
2731
if (start_wc == WEOF || end_wc == WEOF)
2712
2732
return REG_ECOLLATE;
2713
cmp_buf[0] = start_wc;
2714
cmp_buf[4] = end_wc;
2716
if (BE ((syntax & RE_NO_EMPTY_RANGES)
2717
&& wcscoll (cmp_buf, cmp_buf + 4) > 0, 0))
2733
else if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_wc > end_wc, 0))
2718
2734
return REG_ERANGE;
2720
2736
/* Got valid collation sequence values, add them as a new entry.
2755
2771
/* Build the table for single byte characters. */
2756
2772
for (wc = 0; wc < SBC_MAX; ++wc)
2759
if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2760
&& wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2774
if (start_wc <= wc && wc <= end_wc)
2761
2775
bitset_set (sbcset, wc);
2827
2841
/* Local function for parse_bracket_exp used in _LIBC environment.
2828
2842
Seek the collating symbol entry corresponding to NAME.
2829
Return the index of the symbol in the SYMB_TABLE. */
2843
Return the index of the symbol in the SYMB_TABLE,
2844
or -1 if not found. */
2831
2846
auto inline int32_t
2832
__attribute ((always_inline))
2833
seek_collating_symbol_entry (name, name_len)
2834
const unsigned char *name;
2847
__attribute__ ((always_inline))
2848
seek_collating_symbol_entry (const unsigned char *name, size_t name_len)
2837
int32_t hash = elem_hash ((const char *) name, name_len);
2838
int32_t elem = hash % table_size;
2839
if (symb_table[2 * elem] != 0)
2841
int32_t second = hash % (table_size - 2) + 1;
2845
/* First compare the hashing value. */
2846
if (symb_table[2 * elem] == hash
2847
/* Compare the length of the name. */
2848
&& name_len == extra[symb_table[2 * elem + 1]]
2849
/* Compare the name. */
2850
&& memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2853
/* Yep, this is the entry. */
2860
while (symb_table[2 * elem] != 0);
2852
for (elem = 0; elem < table_size; elem++)
2853
if (symb_table[2 * elem] != 0)
2855
int32_t idx = symb_table[2 * elem + 1];
2856
/* Skip the name of collating element name. */
2857
idx += 1 + extra[idx];
2858
if (/* Compare the length of the name. */
2859
name_len == extra[idx]
2860
/* Compare the name. */
2861
&& memcmp (name, &extra[idx + 1], name_len) == 0)
2862
/* Yep, this is the entry. */
2865
2868
/* Local function for parse_bracket_exp used in _LIBC environment.
2867
2870
Return the value if succeeded, UINT_MAX otherwise. */
2869
2872
auto inline unsigned int
2870
__attribute ((always_inline))
2871
lookup_collation_sequence_value (br_elem)
2872
bracket_elem_t *br_elem;
2873
__attribute__ ((always_inline))
2874
lookup_collation_sequence_value (bracket_elem_t *br_elem)
2874
2876
if (br_elem->type == SB_CHAR)
2897
2899
int32_t elem, idx;
2898
2900
elem = seek_collating_symbol_entry (br_elem->opr.name,
2900
if (symb_table[2 * elem] != 0)
2902
2904
/* We found the entry. */
2903
2905
idx = symb_table[2 * elem + 1];
2915
2917
/* Return the collation sequence value. */
2916
2918
return *(unsigned int *) (extra + idx);
2918
else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2920
else if (sym_name_len == 1)
2920
2922
/* No valid character. Match it as a single byte
2938
2940
auto inline reg_errcode_t
2939
__attribute ((always_inline))
2940
build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2941
re_charset_t *mbcset;
2944
bracket_elem_t *start_elem, *end_elem;
2941
__attribute__ ((always_inline))
2942
build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2943
bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2946
2945
unsigned int ch;
2947
2946
uint32_t start_collseq;
2955
2954
return REG_ERANGE;
2956
/* FIXME: Implement rational ranges here, too. */
2957
2957
start_collseq = lookup_collation_sequence_value (start_elem);
2958
2958
end_collseq = lookup_collation_sequence_value (end_elem);
2959
2959
/* Check start/end collation sequence values. */
3019
3019
pointer argument since we may update it. */
3021
3021
auto inline reg_errcode_t
3022
__attribute ((always_inline))
3023
build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
3024
re_charset_t *mbcset;
3025
Idx *coll_sym_alloc;
3027
const unsigned char *name;
3022
__attribute__ ((always_inline))
3023
build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
3024
Idx *coll_sym_alloc, const unsigned char *name)
3029
3026
int32_t elem, idx;
3030
3027
size_t name_len = strlen ((const char *) name);
3031
3028
if (nrules != 0)
3033
3030
elem = seek_collating_symbol_entry (name, name_len);
3034
if (symb_table[2 * elem] != 0)
3036
3033
/* We found the entry. */
3037
3034
idx = symb_table[2 * elem + 1];
3038
3035
/* Skip the name of collating element name. */
3039
3036
idx += 1 + extra[idx];
3041
else if (symb_table[2 * elem] == 0 && name_len == 1)
3038
else if (name_len == 1)
3043
3040
/* No valid character, treat it as a normal
3281
3278
#ifdef RE_ENABLE_I18N
3282
3279
mbcset, &char_class_alloc,
3283
3280
#endif /* RE_ENABLE_I18N */
3284
start_elem.opr.name, syntax);
3281
(const char *) start_elem.opr.name,
3285
3283
if (BE (*err != REG_NOERROR, 0))
3286
3284
goto parse_bracket_exp_free_return;
3561
3559
#ifdef RE_ENABLE_I18N
3562
3560
build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3563
3561
re_charset_t *mbcset, Idx *char_class_alloc,
3564
const unsigned char *class_name, reg_syntax_t syntax)
3562
const char *class_name, reg_syntax_t syntax)
3565
3563
#else /* not RE_ENABLE_I18N */
3566
3564
build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3567
const unsigned char *class_name, reg_syntax_t syntax)
3565
const char *class_name, reg_syntax_t syntax)
3568
3566
#endif /* not RE_ENABLE_I18N */
3571
const char *name = (const char *) class_name;
3569
const char *name = class_name;
3573
3571
/* In case of REG_ICASE "upper" and "lower" match the both of
3574
3572
upper and lower cases. */
3643
3641
static bin_tree_t *
3644
3642
build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3645
const unsigned char *class_name,
3646
const unsigned char *extra, bool non_match,
3643
const char *class_name,
3644
const char *extra, bool non_match,
3647
3645
reg_errcode_t *err)
3649
3647
re_bitset_ptr_t sbcset;
3751
3749
/* This is intended for the expressions like "a{1,3}".
3752
3750
Fetch a number from 'input', and return the number.
3753
3751
Return REG_MISSING if the number field is empty like "{,1}".
3752
Return RE_DUP_MAX + 1 if the number field is too large.
3754
3753
Return REG_ERROR if an error occurred. */
3769
3768
num = ((token->type != CHARACTER || c < '0' || '9' < c
3770
3769
|| num == REG_ERROR)
3772
: ((num == REG_MISSING) ? c - '0' : num * 10 + c - '0'));
3773
num = (num > RE_DUP_MAX) ? REG_ERROR : num;
3771
: num == REG_MISSING
3773
: MIN (RE_DUP_MAX + 1, num * 10 + c - '0'));
3844
3844
static reg_errcode_t
3845
3845
mark_opt_subexp (void *extra, bin_tree_t *node)
3847
Idx idx = (Idx) (long) extra;
3847
Idx idx = (uintptr_t) extra;
3848
3848
if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3849
3849
node->token.opt_subexp = 1;