~ubuntu-branches/ubuntu/vivid/parted/vivid

1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
1
/* Extended regular expression matching and search library.
1.3.2 by Colin Watson
Import upstream version 3.2
2
   Copyright (C) 2002-2014 Free Software Foundation, Inc.
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
3
   This file is part of the GNU C Library.
4
   Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
1.3.2 by Colin Watson
Import upstream version 3.2
6
   The GNU C Library is free software; you can redistribute it and/or
7
   modify it under the terms of the GNU General Public
8
   License as published by the Free Software Foundation; either
9
   version 3 of the License, or (at your option) any later version.
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
10
1.3.2 by Colin Watson
Import upstream version 3.2
11
   The GNU C Library is distributed in the hope that it will be useful,
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
12
   but WITHOUT ANY WARRANTY; without even the implied warranty of
1.3.2 by Colin Watson
Import upstream version 3.2
13
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
   General Public License for more details.
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
15
1.3.2 by Colin Watson
Import upstream version 3.2
16
   You should have received a copy of the GNU General Public
17
   License along with the GNU C Library; if not, see
18
   <http://www.gnu.org/licenses/>.  */
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
19
20
static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
21
					  size_t length, reg_syntax_t syntax);
22
static void re_compile_fastmap_iter (regex_t *bufp,
23
				     const re_dfastate_t *init_state,
24
				     char *fastmap);
25
static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
26
#ifdef RE_ENABLE_I18N
27
static void free_charset (re_charset_t *cset);
28
#endif /* RE_ENABLE_I18N */
29
static void free_workarea_compile (regex_t *preg);
30
static reg_errcode_t create_initial_state (re_dfa_t *dfa);
31
#ifdef RE_ENABLE_I18N
32
static void optimize_utf8 (re_dfa_t *dfa);
33
#endif
34
static reg_errcode_t analyze (regex_t *preg);
35
static reg_errcode_t preorder (bin_tree_t *root,
36
			       reg_errcode_t (fn (void *, bin_tree_t *)),
37
			       void *extra);
38
static reg_errcode_t postorder (bin_tree_t *root,
39
				reg_errcode_t (fn (void *, bin_tree_t *)),
40
				void *extra);
41
static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
42
static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
43
static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
44
				 bin_tree_t *node);
45
static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
46
static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
47
static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
48
static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
49
static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
50
				   unsigned int constraint);
51
static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
52
static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
53
					 Idx node, bool root);
54
static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
55
static Idx fetch_number (re_string_t *input, re_token_t *token,
56
			 reg_syntax_t syntax);
57
static int peek_token (re_token_t *token, re_string_t *input,
58
			reg_syntax_t syntax) internal_function;
59
static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
60
			  reg_syntax_t syntax, reg_errcode_t *err);
61
static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
62
				  re_token_t *token, reg_syntax_t syntax,
63
				  Idx nest, reg_errcode_t *err);
64
static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
65
				 re_token_t *token, reg_syntax_t syntax,
66
				 Idx nest, reg_errcode_t *err);
67
static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
68
				     re_token_t *token, reg_syntax_t syntax,
69
				     Idx nest, reg_errcode_t *err);
70
static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
71
				  re_token_t *token, reg_syntax_t syntax,
72
				  Idx nest, reg_errcode_t *err);
73
static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
74
				 re_dfa_t *dfa, re_token_t *token,
75
				 reg_syntax_t syntax, reg_errcode_t *err);
76
static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
77
				      re_token_t *token, reg_syntax_t syntax,
78
				      reg_errcode_t *err);
79
static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
80
					    re_string_t *regexp,
81
					    re_token_t *token, int token_len,
82
					    re_dfa_t *dfa,
83
					    reg_syntax_t syntax,
84
					    bool accept_hyphen);
85
static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
86
					  re_string_t *regexp,
87
					  re_token_t *token);
88
#ifdef RE_ENABLE_I18N
89
static reg_errcode_t build_equiv_class (bitset_t sbcset,
90
					re_charset_t *mbcset,
91
					Idx *equiv_class_alloc,
92
					const unsigned char *name);
93
static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
94
				      bitset_t sbcset,
95
				      re_charset_t *mbcset,
96
				      Idx *char_class_alloc,
1.3.2 by Colin Watson
Import upstream version 3.2
97
				      const char *class_name,
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
98
				      reg_syntax_t syntax);
99
#else  /* not RE_ENABLE_I18N */
100
static reg_errcode_t build_equiv_class (bitset_t sbcset,
101
					const unsigned char *name);
102
static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
103
				      bitset_t sbcset,
1.3.2 by Colin Watson
Import upstream version 3.2
104
				      const char *class_name,
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
105
				      reg_syntax_t syntax);
106
#endif /* not RE_ENABLE_I18N */
107
static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
108
				       RE_TRANSLATE_TYPE trans,
1.3.2 by Colin Watson
Import upstream version 3.2
109
				       const char *class_name,
110
				       const char *extra,
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
111
				       bool non_match, reg_errcode_t *err);
112
static bin_tree_t *create_tree (re_dfa_t *dfa,
113
				bin_tree_t *left, bin_tree_t *right,
114
				re_token_type_t type);
115
static bin_tree_t *create_token_tree (re_dfa_t *dfa,
116
				      bin_tree_t *left, bin_tree_t *right,
117
				      const re_token_t *token);
118
static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
119
static void free_token (re_token_t *node);
120
static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
121
static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
122

123
/* This table gives an error message for each of the error codes listed
124
   in regex.h.  Obviously the order here has to be same as there.
125
   POSIX doesn't require that we do anything for REG_NOERROR,
126
   but why not be nice?  */
127
128
static const char __re_error_msgid[] =
129
  {
130
#define REG_NOERROR_IDX	0
131
    gettext_noop ("Success")	/* REG_NOERROR */
132
    "\0"
133
#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
134
    gettext_noop ("No match")	/* REG_NOMATCH */
135
    "\0"
136
#define REG_BADPAT_IDX	(REG_NOMATCH_IDX + sizeof "No match")
137
    gettext_noop ("Invalid regular expression") /* REG_BADPAT */
138
    "\0"
139
#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
140
    gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
141
    "\0"
142
#define REG_ECTYPE_IDX	(REG_ECOLLATE_IDX + sizeof "Invalid collation character")
143
    gettext_noop ("Invalid character class name") /* REG_ECTYPE */
144
    "\0"
145
#define REG_EESCAPE_IDX	(REG_ECTYPE_IDX + sizeof "Invalid character class name")
146
    gettext_noop ("Trailing backslash") /* REG_EESCAPE */
147
    "\0"
148
#define REG_ESUBREG_IDX	(REG_EESCAPE_IDX + sizeof "Trailing backslash")
149
    gettext_noop ("Invalid back reference") /* REG_ESUBREG */
150
    "\0"
151
#define REG_EBRACK_IDX	(REG_ESUBREG_IDX + sizeof "Invalid back reference")
152
    gettext_noop ("Unmatched [ or [^")	/* REG_EBRACK */
153
    "\0"
154
#define REG_EPAREN_IDX	(REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
155
    gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
156
    "\0"
157
#define REG_EBRACE_IDX	(REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
158
    gettext_noop ("Unmatched \\{") /* REG_EBRACE */
159
    "\0"
160
#define REG_BADBR_IDX	(REG_EBRACE_IDX + sizeof "Unmatched \\{")
161
    gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
162
    "\0"
163
#define REG_ERANGE_IDX	(REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
164
    gettext_noop ("Invalid range end")	/* REG_ERANGE */
165
    "\0"
166
#define REG_ESPACE_IDX	(REG_ERANGE_IDX + sizeof "Invalid range end")
167
    gettext_noop ("Memory exhausted") /* REG_ESPACE */
168
    "\0"
169
#define REG_BADRPT_IDX	(REG_ESPACE_IDX + sizeof "Memory exhausted")
170
    gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
171
    "\0"
172
#define REG_EEND_IDX	(REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
173
    gettext_noop ("Premature end of regular expression") /* REG_EEND */
174
    "\0"
175
#define REG_ESIZE_IDX	(REG_EEND_IDX + sizeof "Premature end of regular expression")
176
    gettext_noop ("Regular expression too big") /* REG_ESIZE */
177
    "\0"
178
#define REG_ERPAREN_IDX	(REG_ESIZE_IDX + sizeof "Regular expression too big")
179
    gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
180
  };
181
182
static const size_t __re_error_msgid_idx[] =
183
  {
184
    REG_NOERROR_IDX,
185
    REG_NOMATCH_IDX,
186
    REG_BADPAT_IDX,
187
    REG_ECOLLATE_IDX,
188
    REG_ECTYPE_IDX,
189
    REG_EESCAPE_IDX,
190
    REG_ESUBREG_IDX,
191
    REG_EBRACK_IDX,
192
    REG_EPAREN_IDX,
193
    REG_EBRACE_IDX,
194
    REG_BADBR_IDX,
195
    REG_ERANGE_IDX,
196
    REG_ESPACE_IDX,
197
    REG_BADRPT_IDX,
198
    REG_EEND_IDX,
199
    REG_ESIZE_IDX,
200
    REG_ERPAREN_IDX
201
  };
202

203
/* Entry points for GNU code.  */
204
205
/* re_compile_pattern is the GNU regular expression compiler: it
206
   compiles PATTERN (of length LENGTH) and puts the result in BUFP.
207
   Returns 0 if the pattern was valid, otherwise an error string.
208
1.3.1 by Colin Watson
Import upstream version 3.1
209
   Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
210
   are set in BUFP on entry.  */
211
212
#ifdef _LIBC
213
const char *
214
re_compile_pattern (pattern, length, bufp)
215
    const char *pattern;
216
    size_t length;
217
    struct re_pattern_buffer *bufp;
218
#else /* size_t might promote */
219
const char *
220
re_compile_pattern (const char *pattern, size_t length,
221
		    struct re_pattern_buffer *bufp)
222
#endif
223
{
224
  reg_errcode_t ret;
225
226
  /* And GNU code determines whether or not to get register information
227
     by passing null for the REGS argument to re_match, etc., not by
228
     setting no_sub, unless RE_NO_SUB is set.  */
229
  bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
230
231
  /* Match anchors at newline.  */
232
  bufp->newline_anchor = 1;
233
234
  ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
235
236
  if (!ret)
237
    return NULL;
238
  return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
239
}
240
#ifdef _LIBC
241
weak_alias (__re_compile_pattern, re_compile_pattern)
242
#endif
243
1.3.1 by Colin Watson
Import upstream version 3.1
244
/* Set by 're_set_syntax' to the current regexp syntax to recognize.  Can
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
245
   also be assigned to arbitrarily: each pattern buffer stores its own
246
   syntax, so it can be changed between regex compilations.  */
247
/* This has no initializer because initialized variables in Emacs
248
   become read-only after dumping.  */
249
reg_syntax_t re_syntax_options;
250
251
252
/* Specify the precise syntax of regexps for compilation.  This provides
253
   for compatibility for various utilities which historically have
254
   different, incompatible syntaxes.
255
256
   The argument SYNTAX is a bit mask comprised of the various bits
257
   defined in regex.h.  We return the old syntax.  */
258
259
reg_syntax_t
260
re_set_syntax (syntax)
261
    reg_syntax_t syntax;
262
{
263
  reg_syntax_t ret = re_syntax_options;
264
265
  re_syntax_options = syntax;
266
  return ret;
267
}
268
#ifdef _LIBC
269
weak_alias (__re_set_syntax, re_set_syntax)
270
#endif
271
272
int
273
re_compile_fastmap (bufp)
274
    struct re_pattern_buffer *bufp;
275
{
1.3.2 by Colin Watson
Import upstream version 3.2
276
  re_dfa_t *dfa = bufp->buffer;
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
277
  char *fastmap = bufp->fastmap;
278
279
  memset (fastmap, '\0', sizeof (char) * SBC_MAX);
280
  re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
281
  if (dfa->init_state != dfa->init_state_word)
282
    re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
283
  if (dfa->init_state != dfa->init_state_nl)
284
    re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
285
  if (dfa->init_state != dfa->init_state_begbuf)
286
    re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
287
  bufp->fastmap_accurate = 1;
288
  return 0;
289
}
290
#ifdef _LIBC
291
weak_alias (__re_compile_fastmap, re_compile_fastmap)
292
#endif
293
294
static inline void
1.3.2 by Colin Watson
Import upstream version 3.2
295
__attribute__ ((always_inline))
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
296
re_set_fastmap (char *fastmap, bool icase, int ch)
297
{
298
  fastmap[ch] = 1;
299
  if (icase)
300
    fastmap[tolower (ch)] = 1;
301
}
302
303
/* Helper function for re_compile_fastmap.
304
   Compile fastmap for the initial_state INIT_STATE.  */
305
306
static void
307
re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
308
			 char *fastmap)
309
{
1.3.2 by Colin Watson
Import upstream version 3.2
310
  re_dfa_t *dfa = bufp->buffer;
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
311
  Idx node_cnt;
312
  bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
313
  for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
314
    {
315
      Idx node = init_state->nodes.elems[node_cnt];
316
      re_token_type_t type = dfa->nodes[node].type;
317
318
      if (type == CHARACTER)
319
	{
320
	  re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
321
#ifdef RE_ENABLE_I18N
322
	  if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
323
	    {
324
	      unsigned char buf[MB_LEN_MAX];
325
	      unsigned char *p;
326
	      wchar_t wc;
327
	      mbstate_t state;
328
329
	      p = buf;
330
	      *p++ = dfa->nodes[node].opr.c;
331
	      while (++node < dfa->nodes_len
332
		     &&	dfa->nodes[node].type == CHARACTER
333
		     && dfa->nodes[node].mb_partial)
334
		*p++ = dfa->nodes[node].opr.c;
335
	      memset (&state, '\0', sizeof (state));
336
	      if (__mbrtowc (&wc, (const char *) buf, p - buf,
337
			     &state) == p - buf
338
		  && (__wcrtomb ((char *) buf, towlower (wc), &state)
339
		      != (size_t) -1))
340
		re_set_fastmap (fastmap, false, buf[0]);
341
	    }
342
#endif
343
	}
344
      else if (type == SIMPLE_BRACKET)
345
	{
346
	  int i, ch;
347
	  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
348
	    {
349
	      int j;
350
	      bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
351
	      for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
352
		if (w & ((bitset_word_t) 1 << j))
353
		  re_set_fastmap (fastmap, icase, ch);
354
	    }
355
	}
356
#ifdef RE_ENABLE_I18N
357
      else if (type == COMPLEX_BRACKET)
358
	{
359
	  re_charset_t *cset = dfa->nodes[node].opr.mbcset;
360
	  Idx i;
361
362
# ifdef _LIBC
363
	  /* See if we have to try all bytes which start multiple collation
364
	     elements.
365
	     e.g. In da_DK, we want to catch 'a' since "aa" is a valid
366
		  collation element, and don't catch 'b' since 'b' is
367
		  the only collation element which starts from 'b' (and
368
		  it is caught by SIMPLE_BRACKET).  */
369
	      if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
370
		  && (cset->ncoll_syms || cset->nranges))
371
		{
372
		  const int32_t *table = (const int32_t *)
373
		    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
374
		  for (i = 0; i < SBC_MAX; ++i)
375
		    if (table[i] < 0)
376
		      re_set_fastmap (fastmap, icase, i);
377
		}
378
# endif /* _LIBC */
379
380
	  /* See if we have to start the match at all multibyte characters,
381
	     i.e. where we would not find an invalid sequence.  This only
382
	     applies to multibyte character sets; for single byte character
383
	     sets, the SIMPLE_BRACKET again suffices.  */
384
	  if (dfa->mb_cur_max > 1
1.2.2 by Otavio Salvador
Import upstream version 2.1
385
	      && (cset->nchar_classes || cset->non_match || cset->nranges
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
386
# ifdef _LIBC
387
		  || cset->nequiv_classes
388
# endif /* _LIBC */
389
		 ))
390
	    {
391
	      unsigned char c = 0;
392
	      do
393
		{
394
		  mbstate_t mbs;
395
		  memset (&mbs, 0, sizeof (mbs));
396
		  if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
397
		    re_set_fastmap (fastmap, false, (int) c);
398
		}
399
	      while (++c != 0);
400
	    }
401
402
	  else
403
	    {
404
	      /* ... Else catch all bytes which can start the mbchars.  */
405
	      for (i = 0; i < cset->nmbchars; ++i)
406
		{
407
		  char buf[256];
408
		  mbstate_t state;
409
		  memset (&state, '\0', sizeof (state));
410
		  if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
411
		    re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
412
		  if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
413
		    {
414
		      if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
415
			  != (size_t) -1)
416
			re_set_fastmap (fastmap, false, *(unsigned char *) buf);
417
		    }
418
		}
419
	    }
420
	}
421
#endif /* RE_ENABLE_I18N */
422
      else if (type == OP_PERIOD
423
#ifdef RE_ENABLE_I18N
424
	       || type == OP_UTF8_PERIOD
425
#endif /* RE_ENABLE_I18N */
426
	       || type == END_OF_RE)
427
	{
428
	  memset (fastmap, '\1', sizeof (char) * SBC_MAX);
429
	  if (type == END_OF_RE)
430
	    bufp->can_be_null = 1;
431
	  return;
432
	}
433
    }
434
}
435

436
/* Entry point for POSIX code.  */
437
/* regcomp takes a regular expression as a string and compiles it.
438
439
   PREG is a regex_t *.  We do not expect any fields to be initialized,
440
   since POSIX says we shouldn't.  Thus, we set
441
1.3.1 by Colin Watson
Import upstream version 3.1
442
     'buffer' to the compiled pattern;
443
     'used' to the length of the compiled pattern;
444
     'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
445
       REG_EXTENDED bit in CFLAGS is set; otherwise, to
446
       RE_SYNTAX_POSIX_BASIC;
1.3.1 by Colin Watson
Import upstream version 3.1
447
     'newline_anchor' to REG_NEWLINE being set in CFLAGS;
448
     'fastmap' to an allocated space for the fastmap;
449
     'fastmap_accurate' to zero;
450
     're_nsub' to the number of subexpressions in PATTERN.
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
451
452
   PATTERN is the address of the pattern string.
453
454
   CFLAGS is a series of bits which affect compilation.
455
456
     If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
457
     use POSIX basic syntax.
458
459
     If REG_NEWLINE is set, then . and [^...] don't match newline.
460
     Also, regexec will try a match beginning after every newline.
461
462
     If REG_ICASE is set, then we considers upper- and lowercase
463
     versions of letters to be equivalent when matching.
464
465
     If REG_NOSUB is set, then when PREG is passed to regexec, that
466
     routine will report only success or failure, and nothing about the
467
     registers.
468
469
   It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
470
   the return codes and their meanings.)  */
471
472
int
473
regcomp (preg, pattern, cflags)
474
    regex_t *_Restrict_ preg;
475
    const char *_Restrict_ pattern;
476
    int cflags;
477
{
478
  reg_errcode_t ret;
479
  reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
480
			 : RE_SYNTAX_POSIX_BASIC);
481
482
  preg->buffer = NULL;
483
  preg->allocated = 0;
484
  preg->used = 0;
485
486
  /* Try to allocate space for the fastmap.  */
487
  preg->fastmap = re_malloc (char, SBC_MAX);
488
  if (BE (preg->fastmap == NULL, 0))
489
    return REG_ESPACE;
490
491
  syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
492
493
  /* If REG_NEWLINE is set, newlines are treated differently.  */
494
  if (cflags & REG_NEWLINE)
495
    { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
496
      syntax &= ~RE_DOT_NEWLINE;
497
      syntax |= RE_HAT_LISTS_NOT_NEWLINE;
498
      /* It also changes the matching behavior.  */
499
      preg->newline_anchor = 1;
500
    }
501
  else
502
    preg->newline_anchor = 0;
503
  preg->no_sub = !!(cflags & REG_NOSUB);
504
  preg->translate = NULL;
505
506
  ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
507
508
  /* POSIX doesn't distinguish between an unmatched open-group and an
509
     unmatched close-group: both are REG_EPAREN.  */
510
  if (ret == REG_ERPAREN)
511
    ret = REG_EPAREN;
512
513
  /* We have already checked preg->fastmap != NULL.  */
514
  if (BE (ret == REG_NOERROR, 1))
515
    /* Compute the fastmap now, since regexec cannot modify the pattern
516
       buffer.  This function never fails in this implementation.  */
517
    (void) re_compile_fastmap (preg);
518
  else
519
    {
520
      /* Some error occurred while compiling the expression.  */
521
      re_free (preg->fastmap);
522
      preg->fastmap = NULL;
523
    }
524
525
  return (int) ret;
526
}
527
#ifdef _LIBC
528
weak_alias (__regcomp, regcomp)
529
#endif
530
531
/* Returns a message corresponding to an error code, ERRCODE, returned
532
   from either regcomp or regexec.   We don't use PREG here.  */
533
534
#ifdef _LIBC
535
size_t
536
regerror (errcode, preg, errbuf, errbuf_size)
537
    int errcode;
538
    const regex_t *_Restrict_ preg;
539
    char *_Restrict_ errbuf;
540
    size_t errbuf_size;
541
#else /* size_t might promote */
542
size_t
543
regerror (int errcode, const regex_t *_Restrict_ preg,
544
	  char *_Restrict_ errbuf, size_t errbuf_size)
545
#endif
546
{
547
  const char *msg;
548
  size_t msg_size;
549
550
  if (BE (errcode < 0
551
	  || errcode >= (int) (sizeof (__re_error_msgid_idx)
552
			       / sizeof (__re_error_msgid_idx[0])), 0))
553
    /* Only error codes returned by the rest of the code should be passed
554
       to this routine.  If we are given anything else, or if other regex
555
       code generates an invalid error code, then the program has a bug.
556
       Dump core so we can fix it.  */
557
    abort ();
558
559
  msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
560
561
  msg_size = strlen (msg) + 1; /* Includes the null.  */
562
563
  if (BE (errbuf_size != 0, 1))
564
    {
565
      size_t cpy_size = msg_size;
566
      if (BE (msg_size > errbuf_size, 0))
567
	{
568
	  cpy_size = errbuf_size - 1;
569
	  errbuf[cpy_size] = '\0';
570
	}
571
      memcpy (errbuf, msg, cpy_size);
572
    }
573
574
  return msg_size;
575
}
576
#ifdef _LIBC
577
weak_alias (__regerror, regerror)
578
#endif
579
580
581
#ifdef RE_ENABLE_I18N
582
/* This static array is used for the map to single-byte characters when
583
   UTF-8 is used.  Otherwise we would allocate memory just to initialize
584
   it the same all the time.  UTF-8 is the preferred encoding so this is
585
   a worthwhile optimization.  */
586
static const bitset_t utf8_sb_map =
587
{
588
  /* Set the first 128 bits.  */
1.3.2 by Colin Watson
Import upstream version 3.2
589
# if defined __GNUC__ && !defined __STRICT_ANSI__
1.3.1 by Colin Watson
Import upstream version 3.1
590
  [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
591
# else
592
#  if 4 * BITSET_WORD_BITS < ASCII_CHARS
593
#   error "bitset_word_t is narrower than 32 bits"
594
#  elif 3 * BITSET_WORD_BITS < ASCII_CHARS
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
595
  BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
1.3.1 by Colin Watson
Import upstream version 3.1
596
#  elif 2 * BITSET_WORD_BITS < ASCII_CHARS
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
597
  BITSET_WORD_MAX, BITSET_WORD_MAX,
1.3.1 by Colin Watson
Import upstream version 3.1
598
#  elif 1 * BITSET_WORD_BITS < ASCII_CHARS
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
599
  BITSET_WORD_MAX,
1.3.1 by Colin Watson
Import upstream version 3.1
600
#  endif
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
601
  (BITSET_WORD_MAX
602
   >> (SBC_MAX % BITSET_WORD_BITS == 0
603
       ? 0
604
       : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
1.3.1 by Colin Watson
Import upstream version 3.1
605
# endif
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
606
};
607
#endif
608
609
610
static void
611
free_dfa_content (re_dfa_t *dfa)
612
{
613
  Idx i, j;
614
615
  if (dfa->nodes)
616
    for (i = 0; i < dfa->nodes_len; ++i)
617
      free_token (dfa->nodes + i);
618
  re_free (dfa->nexts);
619
  for (i = 0; i < dfa->nodes_len; ++i)
620
    {
621
      if (dfa->eclosures != NULL)
622
	re_node_set_free (dfa->eclosures + i);
623
      if (dfa->inveclosures != NULL)
624
	re_node_set_free (dfa->inveclosures + i);
625
      if (dfa->edests != NULL)
626
	re_node_set_free (dfa->edests + i);
627
    }
628
  re_free (dfa->edests);
629
  re_free (dfa->eclosures);
630
  re_free (dfa->inveclosures);
631
  re_free (dfa->nodes);
632
633
  if (dfa->state_table)
634
    for (i = 0; i <= dfa->state_hash_mask; ++i)
635
      {
636
	struct re_state_table_entry *entry = dfa->state_table + i;
637
	for (j = 0; j < entry->num; ++j)
638
	  {
639
	    re_dfastate_t *state = entry->array[j];
640
	    free_state (state);
641
	  }
1.2.3 by Colin Watson
Import upstream version 2.2
642
	re_free (entry->array);
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
643
      }
644
  re_free (dfa->state_table);
645
#ifdef RE_ENABLE_I18N
646
  if (dfa->sb_char != utf8_sb_map)
647
    re_free (dfa->sb_char);
648
#endif
649
  re_free (dfa->subexp_map);
650
#ifdef DEBUG
651
  re_free (dfa->re_str);
652
#endif
653
654
  re_free (dfa);
655
}
656
657
658
/* Free dynamically allocated space used by PREG.  */
659
660
void
661
regfree (preg)
662
    regex_t *preg;
663
{
1.3.2 by Colin Watson
Import upstream version 3.2
664
  re_dfa_t *dfa = preg->buffer;
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
665
  if (BE (dfa != NULL, 1))
1.3.2 by Colin Watson
Import upstream version 3.2
666
    {
667
      lock_fini (dfa->lock);
668
      free_dfa_content (dfa);
669
    }
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
670
  preg->buffer = NULL;
671
  preg->allocated = 0;
672
673
  re_free (preg->fastmap);
674
  preg->fastmap = NULL;
675
676
  re_free (preg->translate);
677
  preg->translate = NULL;
678
}
679
#ifdef _LIBC
680
weak_alias (__regfree, regfree)
681
#endif
682

683
/* Entry points compatible with 4.2 BSD regex library.  We don't define
684
   them unless specifically requested.  */
685
686
#if defined _REGEX_RE_COMP || defined _LIBC
687
688
/* BSD has one and only one pattern buffer.  */
689
static struct re_pattern_buffer re_comp_buf;
690
691
char *
692
# ifdef _LIBC
693
/* Make these definitions weak in libc, so POSIX programs can redefine
694
   these names if they don't use our functions, and still use
695
   regcomp/regexec above without link errors.  */
696
weak_function
697
# endif
698
re_comp (s)
699
     const char *s;
700
{
701
  reg_errcode_t ret;
702
  char *fastmap;
703
704
  if (!s)
705
    {
706
      if (!re_comp_buf.buffer)
707
	return gettext ("No previous regular expression");
708
      return 0;
709
    }
710
711
  if (re_comp_buf.buffer)
712
    {
713
      fastmap = re_comp_buf.fastmap;
714
      re_comp_buf.fastmap = NULL;
715
      __regfree (&re_comp_buf);
716
      memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
717
      re_comp_buf.fastmap = fastmap;
718
    }
719
720
  if (re_comp_buf.fastmap == NULL)
721
    {
722
      re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
723
      if (re_comp_buf.fastmap == NULL)
724
	return (char *) gettext (__re_error_msgid
725
				 + __re_error_msgid_idx[(int) REG_ESPACE]);
726
    }
727
1.3.1 by Colin Watson
Import upstream version 3.1
728
  /* Since 're_exec' always passes NULL for the 'regs' argument, we
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
729
     don't need to initialize the pattern buffer fields which affect it.  */
730
731
  /* Match anchors at newlines.  */
732
  re_comp_buf.newline_anchor = 1;
733
734
  ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
735
736
  if (!ret)
737
    return NULL;
738
1.3.1 by Colin Watson
Import upstream version 3.1
739
  /* Yes, we're discarding 'const' here if !HAVE_LIBINTL.  */
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
740
  return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
741
}
742
743
#ifdef _LIBC
744
libc_freeres_fn (free_mem)
745
{
746
  __regfree (&re_comp_buf);
747
}
748
#endif
749
750
#endif /* _REGEX_RE_COMP */
751

752
/* Internal entry point.
753
   Compile the regular expression PATTERN, whose length is LENGTH.
754
   SYNTAX indicate regular expression's syntax.  */
755
756
static reg_errcode_t
757
re_compile_internal (regex_t *preg, const char * pattern, size_t length,
758
		     reg_syntax_t syntax)
759
{
760
  reg_errcode_t err = REG_NOERROR;
761
  re_dfa_t *dfa;
762
  re_string_t regexp;
763
764
  /* Initialize the pattern buffer.  */
765
  preg->fastmap_accurate = 0;
766
  preg->syntax = syntax;
767
  preg->not_bol = preg->not_eol = 0;
768
  preg->used = 0;
769
  preg->re_nsub = 0;
770
  preg->can_be_null = 0;
771
  preg->regs_allocated = REGS_UNALLOCATED;
772
773
  /* Initialize the dfa.  */
1.3.2 by Colin Watson
Import upstream version 3.2
774
  dfa = preg->buffer;
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
775
  if (BE (preg->allocated < sizeof (re_dfa_t), 0))
776
    {
777
      /* If zero allocated, but buffer is non-null, try to realloc
778
	 enough space.  This loses if buffer's address is bogus, but
779
	 that is the user's responsibility.  If ->buffer is NULL this
780
	 is a simple allocation.  */
781
      dfa = re_realloc (preg->buffer, re_dfa_t, 1);
782
      if (dfa == NULL)
783
	return REG_ESPACE;
784
      preg->allocated = sizeof (re_dfa_t);
1.3.2 by Colin Watson
Import upstream version 3.2
785
      preg->buffer = dfa;
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
786
    }
787
  preg->used = sizeof (re_dfa_t);
788
789
  err = init_dfa (dfa, length);
1.3.2 by Colin Watson
Import upstream version 3.2
790
  if (BE (err == REG_NOERROR && lock_init (dfa->lock) != 0, 0))
791
    err = REG_ESPACE;
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
792
  if (BE (err != REG_NOERROR, 0))
793
    {
794
      free_dfa_content (dfa);
795
      preg->buffer = NULL;
796
      preg->allocated = 0;
797
      return err;
798
    }
799
#ifdef DEBUG
800
  /* Note: length+1 will not overflow since it is checked in init_dfa.  */
801
  dfa->re_str = re_malloc (char, length + 1);
802
  strncpy (dfa->re_str, pattern, length + 1);
803
#endif
804
805
  err = re_string_construct (&regexp, pattern, length, preg->translate,
806
			     (syntax & RE_ICASE) != 0, dfa);
807
  if (BE (err != REG_NOERROR, 0))
808
    {
809
    re_compile_internal_free_return:
810
      free_workarea_compile (preg);
811
      re_string_destruct (&regexp);
1.3.2 by Colin Watson
Import upstream version 3.2
812
      lock_fini (dfa->lock);
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
813
      free_dfa_content (dfa);
814
      preg->buffer = NULL;
815
      preg->allocated = 0;
816
      return err;
817
    }
818
819
  /* Parse the regular expression, and build a structure tree.  */
820
  preg->re_nsub = 0;
821
  dfa->str_tree = parse (&regexp, preg, syntax, &err);
822
  if (BE (dfa->str_tree == NULL, 0))
823
    goto re_compile_internal_free_return;
824
825
  /* Analyze the tree and create the nfa.  */
826
  err = analyze (preg);
827
  if (BE (err != REG_NOERROR, 0))
828
    goto re_compile_internal_free_return;
829
830
#ifdef RE_ENABLE_I18N
831
  /* If possible, do searching in single byte encoding to speed things up.  */
832
  if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
833
    optimize_utf8 (dfa);
834
#endif
835
836
  /* Then create the initial state of the dfa.  */
837
  err = create_initial_state (dfa);
838
839
  /* Release work areas.  */
840
  free_workarea_compile (preg);
841
  re_string_destruct (&regexp);
842
843
  if (BE (err != REG_NOERROR, 0))
844
    {
1.3.2 by Colin Watson
Import upstream version 3.2
845
      lock_fini (dfa->lock);
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
846
      free_dfa_content (dfa);
847
      preg->buffer = NULL;
848
      preg->allocated = 0;
849
    }
850
851
  return err;
852
}
853
854
/* Initialize DFA.  We use the length of the regular expression PAT_LEN
855
   as the initial length of some arrays.  */
856
857
static reg_errcode_t
858
init_dfa (re_dfa_t *dfa, size_t pat_len)
859
{
860
  __re_size_t table_size;
1.2.3 by Colin Watson
Import upstream version 2.2
861
#ifndef _LIBC
1.3.2 by Colin Watson
Import upstream version 3.2
862
  const char *codeset_name;
1.2.3 by Colin Watson
Import upstream version 2.2
863
#endif
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
864
#ifdef RE_ENABLE_I18N
865
  size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
866
#else
867
  size_t max_i18n_object_size = 0;
868
#endif
869
  size_t max_object_size =
870
    MAX (sizeof (struct re_state_table_entry),
871
	 MAX (sizeof (re_token_t),
872
	      MAX (sizeof (re_node_set),
873
		   MAX (sizeof (regmatch_t),
874
			max_i18n_object_size))));
875
876
  memset (dfa, '\0', sizeof (re_dfa_t));
877
878
  /* Force allocation of str_tree_storage the first time.  */
879
  dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
880
881
  /* Avoid overflows.  The extra "/ 2" is for the table_size doubling
882
     calculation below, and for similar doubling calculations
883
     elsewhere.  And it's <= rather than <, because some of the
884
     doubling calculations add 1 afterwards.  */
1.3.1 by Colin Watson
Import upstream version 3.1
885
  if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2 <= pat_len, 0))
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
886
    return REG_ESPACE;
887
888
  dfa->nodes_alloc = pat_len + 1;
889
  dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
890
891
  /*  table_size = 2 ^ ceil(log pat_len) */
892
  for (table_size = 1; ; table_size <<= 1)
893
    if (table_size > pat_len)
894
      break;
895
896
  dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
897
  dfa->state_hash_mask = table_size - 1;
898
899
  dfa->mb_cur_max = MB_CUR_MAX;
900
#ifdef _LIBC
901
  if (dfa->mb_cur_max == 6
902
      && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
903
    dfa->is_utf8 = 1;
904
  dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
905
		       != 0);
906
#else
1.2.3 by Colin Watson
Import upstream version 2.2
907
  codeset_name = nl_langinfo (CODESET);
1.3.2 by Colin Watson
Import upstream version 3.2
908
  if ((codeset_name[0] == 'U' || codeset_name[0] == 'u')
909
      && (codeset_name[1] == 'T' || codeset_name[1] == 't')
910
      && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
911
      && strcmp (codeset_name + 3 + (codeset_name[3] == '-'), "8") == 0)
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
912
    dfa->is_utf8 = 1;
913
914
  /* We check exhaustively in the loop below if this charset is a
915
     superset of ASCII.  */
916
  dfa->map_notascii = 0;
917
#endif
918
919
#ifdef RE_ENABLE_I18N
920
  if (dfa->mb_cur_max > 1)
921
    {
922
      if (dfa->is_utf8)
923
	dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
924
      else
925
	{
926
	  int i, j, ch;
927
928
	  dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
929
	  if (BE (dfa->sb_char == NULL, 0))
930
	    return REG_ESPACE;
931
932
	  /* Set the bits corresponding to single byte chars.  */
933
	  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
934
	    for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
935
	      {
936
		wint_t wch = __btowc (ch);
937
		if (wch != WEOF)
938
		  dfa->sb_char[i] |= (bitset_word_t) 1 << j;
939
# ifndef _LIBC
940
		if (isascii (ch) && wch != ch)
941
		  dfa->map_notascii = 1;
942
# endif
943
	      }
944
	}
945
    }
946
#endif
947
948
  if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
949
    return REG_ESPACE;
950
  return REG_NOERROR;
951
}
952
953
/* Initialize WORD_CHAR table, which indicate which character is
954
   "word".  In this case "word" means that it is the word construction
955
   character used by some operators like "\<", "\>", etc.  */
956
957
static void
958
internal_function
959
init_word_char (re_dfa_t *dfa)
960
{
1.3.1 by Colin Watson
Import upstream version 3.1
961
  int i = 0;
962
  int j;
963
  int ch = 0;
1.3.2 by Colin Watson
Import upstream version 3.2
964
  dfa->word_ops_used = 1;
1.3.1 by Colin Watson
Import upstream version 3.1
965
  if (BE (dfa->map_notascii == 0, 1))
966
    {
1.3.2 by Colin Watson
Import upstream version 3.2
967
      bitset_word_t bits0 = 0x00000000;
968
      bitset_word_t bits1 = 0x03ff0000;
969
      bitset_word_t bits2 = 0x87fffffe;
970
      bitset_word_t bits3 = 0x07fffffe;
1.3.1 by Colin Watson
Import upstream version 3.1
971
      if (BITSET_WORD_BITS == 64)
972
	{
1.3.2 by Colin Watson
Import upstream version 3.2
973
	  dfa->word_char[0] = bits1 << 31 << 1 | bits0;
974
	  dfa->word_char[1] = bits3 << 31 << 1 | bits2;
1.3.1 by Colin Watson
Import upstream version 3.1
975
	  i = 2;
976
	}
977
      else if (BITSET_WORD_BITS == 32)
978
	{
1.3.2 by Colin Watson
Import upstream version 3.2
979
	  dfa->word_char[0] = bits0;
980
	  dfa->word_char[1] = bits1;
981
	  dfa->word_char[2] = bits2;
982
	  dfa->word_char[3] = bits3;
1.3.1 by Colin Watson
Import upstream version 3.1
983
	  i = 4;
984
	}
985
      else
986
        goto general_case;
987
      ch = 128;
988
989
      if (BE (dfa->is_utf8, 1))
990
	{
991
	  memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
992
	  return;
993
	}
994
    }
995
996
 general_case:
997
  for (; i < BITSET_WORDS; ++i)
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
998
    for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
999
      if (isalnum (ch) || ch == '_')
1000
	dfa->word_char[i] |= (bitset_word_t) 1 << j;
1001
}
1002
1003
/* Free the work area which are only used while compiling.  */
1004
1005
static void
1006
free_workarea_compile (regex_t *preg)
1007
{
1.3.2 by Colin Watson
Import upstream version 3.2
1008
  re_dfa_t *dfa = preg->buffer;
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
1009
  bin_tree_storage_t *storage, *next;
1010
  for (storage = dfa->str_tree_storage; storage; storage = next)
1011
    {
1012
      next = storage->next;
1013
      re_free (storage);
1014
    }
1015
  dfa->str_tree_storage = NULL;
1016
  dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
1017
  dfa->str_tree = NULL;
1018
  re_free (dfa->org_indices);
1019
  dfa->org_indices = NULL;
1020
}
1021
1022
/* Create initial states for all contexts.  */
1023
1024
static reg_errcode_t
1025
create_initial_state (re_dfa_t *dfa)
1026
{
1027
  Idx first, i;
1028
  reg_errcode_t err;
1029
  re_node_set init_nodes;
1030
1031
  /* Initial states have the epsilon closure of the node which is
1032
     the first node of the regular expression.  */
1033
  first = dfa->str_tree->first->node_idx;
1034
  dfa->init_node = first;
1035
  err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1036
  if (BE (err != REG_NOERROR, 0))
1037
    return err;
1038
1039
  /* The back-references which are in initial states can epsilon transit,
1040
     since in this case all of the subexpressions can be null.
1041
     Then we add epsilon closures of the nodes which are the next nodes of
1042
     the back-references.  */
1043
  if (dfa->nbackref > 0)
1044
    for (i = 0; i < init_nodes.nelem; ++i)
1045
      {
1046
	Idx node_idx = init_nodes.elems[i];
1047
	re_token_type_t type = dfa->nodes[node_idx].type;
1048
1049
	Idx clexp_idx;
1050
	if (type != OP_BACK_REF)
1051
	  continue;
1052
	for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1053
	  {
1054
	    re_token_t *clexp_node;
1055
	    clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1056
	    if (clexp_node->type == OP_CLOSE_SUBEXP
1057
		&& clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1058
	      break;
1059
	  }
1060
	if (clexp_idx == init_nodes.nelem)
1061
	  continue;
1062
1063
	if (type == OP_BACK_REF)
1064
	  {
1065
	    Idx dest_idx = dfa->edests[node_idx].elems[0];
1066
	    if (!re_node_set_contains (&init_nodes, dest_idx))
1067
	      {
1.2.3 by Colin Watson
Import upstream version 2.2
1068
		reg_errcode_t merge_err
1069
                  = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1070
		if (merge_err != REG_NOERROR)
1071
		  return merge_err;
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
1072
		i = 0;
1073
	      }
1074
	  }
1075
      }
1076
1077
  /* It must be the first time to invoke acquire_state.  */
1078
  dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1079
  /* We don't check ERR here, since the initial state must not be NULL.  */
1080
  if (BE (dfa->init_state == NULL, 0))
1081
    return err;
1082
  if (dfa->init_state->has_constraint)
1083
    {
1084
      dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1085
						       CONTEXT_WORD);
1086
      dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1087
						     CONTEXT_NEWLINE);
1088
      dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1089
							 &init_nodes,
1090
							 CONTEXT_NEWLINE
1091
							 | CONTEXT_BEGBUF);
1092
      if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1093
	      || dfa->init_state_begbuf == NULL, 0))
1094
	return err;
1095
    }
1096
  else
1097
    dfa->init_state_word = dfa->init_state_nl
1098
      = dfa->init_state_begbuf = dfa->init_state;
1099
1100
  re_node_set_free (&init_nodes);
1101
  return REG_NOERROR;
1102
}
1103

1104
#ifdef RE_ENABLE_I18N
1105
/* If it is possible to do searching in single byte encoding instead of UTF-8
1106
   to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1107
   DFA nodes where needed.  */
1108
1109
static void
1110
optimize_utf8 (re_dfa_t *dfa)
1111
{
1112
  Idx node;
1113
  int i;
1114
  bool mb_chars = false;
1115
  bool has_period = false;
1116
1117
  for (node = 0; node < dfa->nodes_len; ++node)
1118
    switch (dfa->nodes[node].type)
1119
      {
1120
      case CHARACTER:
1121
	if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1122
	  mb_chars = true;
1123
	break;
1124
      case ANCHOR:
1125
	switch (dfa->nodes[node].opr.ctx_type)
1126
	  {
1127
	  case LINE_FIRST:
1128
	  case LINE_LAST:
1129
	  case BUF_FIRST:
1130
	  case BUF_LAST:
1131
	    break;
1132
	  default:
1133
	    /* Word anchors etc. cannot be handled.  It's okay to test
1134
	       opr.ctx_type since constraints (for all DFA nodes) are
1135
	       created by ORing one or more opr.ctx_type values.  */
1136
	    return;
1137
	  }
1138
	break;
1139
      case OP_PERIOD:
1.2.3 by Colin Watson
Import upstream version 2.2
1140
	has_period = true;
1141
	break;
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
1142
      case OP_BACK_REF:
1143
      case OP_ALT:
1144
      case END_OF_RE:
1145
      case OP_DUP_ASTERISK:
1146
      case OP_OPEN_SUBEXP:
1147
      case OP_CLOSE_SUBEXP:
1148
	break;
1149
      case COMPLEX_BRACKET:
1150
	return;
1151
      case SIMPLE_BRACKET:
1152
	/* Just double check.  */
1153
	{
1154
	  int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1155
			? 0
1156
			: BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1157
	  for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1158
	    {
1159
	      if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1160
		return;
1161
	      rshift = 0;
1162
	    }
1163
	}
1164
	break;
1165
      default:
1166
	abort ();
1167
      }
1168
1169
  if (mb_chars || has_period)
1170
    for (node = 0; node < dfa->nodes_len; ++node)
1171
      {
1172
	if (dfa->nodes[node].type == CHARACTER
1173
	    && dfa->nodes[node].opr.c >= ASCII_CHARS)
1174
	  dfa->nodes[node].mb_partial = 0;
1175
	else if (dfa->nodes[node].type == OP_PERIOD)
1176
	  dfa->nodes[node].type = OP_UTF8_PERIOD;
1177
      }
1178
1179
  /* The search can be in single byte locale.  */
1180
  dfa->mb_cur_max = 1;
1181
  dfa->is_utf8 = 0;
1182
  dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1183
}
1184
#endif
1185

1186
/* Analyze the structure tree, and calculate "first", "next", "edest",
1187
   "eclosure", and "inveclosure".  */
1188
1189
static reg_errcode_t
1190
analyze (regex_t *preg)
1191
{
1.3.2 by Colin Watson
Import upstream version 3.2
1192
  re_dfa_t *dfa = preg->buffer;
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
1193
  reg_errcode_t ret;
1194
1195
  /* Allocate arrays.  */
1196
  dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1197
  dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1198
  dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1199
  dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1200
  if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1201
	  || dfa->eclosures == NULL, 0))
1202
    return REG_ESPACE;
1203
1204
  dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1205
  if (dfa->subexp_map != NULL)
1206
    {
1207
      Idx i;
1208
      for (i = 0; i < preg->re_nsub; i++)
1209
	dfa->subexp_map[i] = i;
1210
      preorder (dfa->str_tree, optimize_subexps, dfa);
1211
      for (i = 0; i < preg->re_nsub; i++)
1212
	if (dfa->subexp_map[i] != i)
1213
	  break;
1214
      if (i == preg->re_nsub)
1215
	{
1216
	  free (dfa->subexp_map);
1217
	  dfa->subexp_map = NULL;
1218
	}
1219
    }
1220
1221
  ret = postorder (dfa->str_tree, lower_subexps, preg);
1222
  if (BE (ret != REG_NOERROR, 0))
1223
    return ret;
1224
  ret = postorder (dfa->str_tree, calc_first, dfa);
1225
  if (BE (ret != REG_NOERROR, 0))
1226
    return ret;
1227
  preorder (dfa->str_tree, calc_next, dfa);
1228
  ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1229
  if (BE (ret != REG_NOERROR, 0))
1230
    return ret;
1231
  ret = calc_eclosure (dfa);
1232
  if (BE (ret != REG_NOERROR, 0))
1233
    return ret;
1234
1235
  /* We only need this during the prune_impossible_nodes pass in regexec.c;
1236
     skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1237
  if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1238
      || dfa->nbackref)
1239
    {
1240
      dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1241
      if (BE (dfa->inveclosures == NULL, 0))
1.2.3 by Colin Watson
Import upstream version 2.2
1242
	return REG_ESPACE;
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
1243
      ret = calc_inveclosure (dfa);
1244
    }
1245
1246
  return ret;
1247
}
1248
1249
/* Our parse trees are very unbalanced, so we cannot use a stack to
1250
   implement parse tree visits.  Instead, we use parent pointers and
1251
   some hairy code in these two functions.  */
1252
static reg_errcode_t
1253
postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1254
	   void *extra)
1255
{
1256
  bin_tree_t *node, *prev;
1257
1258
  for (node = root; ; )
1259
    {
1260
      /* Descend down the tree, preferably to the left (or to the right
1261
	 if that's the only child).  */
1262
      while (node->left || node->right)
1263
	if (node->left)
1.2.3 by Colin Watson
Import upstream version 2.2
1264
	  node = node->left;
1265
	else
1266
	  node = node->right;
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
1267
1268
      do
1269
	{
1270
	  reg_errcode_t err = fn (extra, node);
1271
	  if (BE (err != REG_NOERROR, 0))
1272
	    return err;
1.2.3 by Colin Watson
Import upstream version 2.2
1273
	  if (node->parent == NULL)
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
1274
	    return REG_NOERROR;
1275
	  prev = node;
1276
	  node = node->parent;
1277
	}
1278
      /* Go up while we have a node that is reached from the right.  */
1279
      while (node->right == prev || node->right == NULL);
1280
      node = node->right;
1281
    }
1282
}
1283
1284
static reg_errcode_t
1285
preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1286
	  void *extra)
1287
{
1288
  bin_tree_t *node;
1289
1290
  for (node = root; ; )
1291
    {
1292
      reg_errcode_t err = fn (extra, node);
1293
      if (BE (err != REG_NOERROR, 0))
1294
	return err;
1295
1296
      /* Go to the left node, or up and to the right.  */
1297
      if (node->left)
1298
	node = node->left;
1299
      else
1300
	{
1301
	  bin_tree_t *prev = NULL;
1302
	  while (node->right == prev || node->right == NULL)
1303
	    {
1304
	      prev = node;
1305
	      node = node->parent;
1306
	      if (!node)
1.2.3 by Colin Watson
Import upstream version 2.2
1307
		return REG_NOERROR;
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
1308
	    }
1309
	  node = node->right;
1310
	}
1311
    }
1312
}
1313
1314
/* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1315
   re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1316
   backreferences as well.  Requires a preorder visit.  */
1317
static reg_errcode_t
1318
optimize_subexps (void *extra, bin_tree_t *node)
1319
{
1320
  re_dfa_t *dfa = (re_dfa_t *) extra;
1321
1322
  if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1323
    {
1324
      int idx = node->token.opr.idx;
1325
      node->token.opr.idx = dfa->subexp_map[idx];
1326
      dfa->used_bkref_map |= 1 << node->token.opr.idx;
1327
    }
1328
1329
  else if (node->token.type == SUBEXP
1.2.3 by Colin Watson
Import upstream version 2.2
1330
	   && node->left && node->left->token.type == SUBEXP)
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
1331
    {
1332
      Idx other_idx = node->left->token.opr.idx;
1333
1334
      node->left = node->left->left;
1335
      if (node->left)
1.2.3 by Colin Watson
Import upstream version 2.2
1336
	node->left->parent = node;
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
1337
1338
      dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1339
      if (other_idx < BITSET_WORD_BITS)
1340
	dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1341
    }
1342
1343
  return REG_NOERROR;
1344
}
1345
1346
/* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1347
   of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1348
static reg_errcode_t
1349
lower_subexps (void *extra, bin_tree_t *node)
1350
{
1351
  regex_t *preg = (regex_t *) extra;
1352
  reg_errcode_t err = REG_NOERROR;
1353
1354
  if (node->left && node->left->token.type == SUBEXP)
1355
    {
1356
      node->left = lower_subexp (&err, preg, node->left);
1357
      if (node->left)
1358
	node->left->parent = node;
1359
    }
1360
  if (node->right && node->right->token.type == SUBEXP)
1361
    {
1362
      node->right = lower_subexp (&err, preg, node->right);
1363
      if (node->right)
1364
	node->right->parent = node;
1365
    }
1366
1367
  return err;
1368
}
1369
1370
static bin_tree_t *
1371
lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1372
{
1.3.2 by Colin Watson
Import upstream version 3.2
1373
  re_dfa_t *dfa = preg->buffer;
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
1374
  bin_tree_t *body = node->left;
1375
  bin_tree_t *op, *cls, *tree1, *tree;
1376
1377
  if (preg->no_sub
1378
      /* We do not optimize empty subexpressions, because otherwise we may
1379
	 have bad CONCAT nodes with NULL children.  This is obviously not
1380
	 very common, so we do not lose much.  An example that triggers
1381
	 this case is the sed "script" /\(\)/x.  */
1382
      && node->left != NULL
1383
      && (node->token.opr.idx >= BITSET_WORD_BITS
1384
	  || !(dfa->used_bkref_map
1385
	       & ((bitset_word_t) 1 << node->token.opr.idx))))
1386
    return node->left;
1387
1388
  /* Convert the SUBEXP node to the concatenation of an
1389
     OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1390
  op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1391
  cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1392
  tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1393
  tree = create_tree (dfa, op, tree1, CONCAT);
1394
  if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1395
    {
1396
      *err = REG_ESPACE;
1397
      return NULL;
1398
    }
1399
1400
  op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1401
  op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1402
  return tree;
1403
}
1404
1405
/* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1406
   nodes.  Requires a postorder visit.  */
1407
static reg_errcode_t
1408
calc_first (void *extra, bin_tree_t *node)
1409
{
1410
  re_dfa_t *dfa = (re_dfa_t *) extra;
1411
  if (node->token.type == CONCAT)
1412
    {
1413
      node->first = node->left->first;
1414
      node->node_idx = node->left->node_idx;
1415
    }
1416
  else
1417
    {
1418
      node->first = node;
1419
      node->node_idx = re_dfa_add_node (dfa, node->token);
1420
      if (BE (node->node_idx == REG_MISSING, 0))
1.2.3 by Colin Watson
Import upstream version 2.2
1421
	return REG_ESPACE;
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
1422
      if (node->token.type == ANCHOR)
1.2.3 by Colin Watson
Import upstream version 2.2
1423
	dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
1424
    }
1425
  return REG_NOERROR;
1426
}
1427
1428
/* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1429
static reg_errcode_t
1430
calc_next (void *extra, bin_tree_t *node)
1431
{
1432
  switch (node->token.type)
1433
    {
1434
    case OP_DUP_ASTERISK:
1435
      node->left->next = node;
1436
      break;
1437
    case CONCAT:
1438
      node->left->next = node->right->first;
1439
      node->right->next = node->next;
1440
      break;
1441
    default:
1442
      if (node->left)
1443
	node->left->next = node->next;
1444
      if (node->right)
1.2.3 by Colin Watson
Import upstream version 2.2
1445
	node->right->next = node->next;
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
1446
      break;
1447
    }
1448
  return REG_NOERROR;
1449
}
1450
1451
/* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1452
static reg_errcode_t
1453
link_nfa_nodes (void *extra, bin_tree_t *node)
1454
{
1455
  re_dfa_t *dfa = (re_dfa_t *) extra;
1456
  Idx idx = node->node_idx;
1457
  reg_errcode_t err = REG_NOERROR;
1458
1459
  switch (node->token.type)
1460
    {
1461
    case CONCAT:
1462
      break;
1463
1464
    case END_OF_RE:
1465
      assert (node->next == NULL);
1466
      break;
1467
1468
    case OP_DUP_ASTERISK:
1469
    case OP_ALT:
1470
      {
1471
	Idx left, right;
1472
	dfa->has_plural_match = 1;
1473
	if (node->left != NULL)
1474
	  left = node->left->first->node_idx;
1475
	else
1476
	  left = node->next->node_idx;
1477
	if (node->right != NULL)
1478
	  right = node->right->first->node_idx;
1479
	else
1480
	  right = node->next->node_idx;
1481
	assert (REG_VALID_INDEX (left));
1482
	assert (REG_VALID_INDEX (right));
1483
	err = re_node_set_init_2 (dfa->edests + idx, left, right);
1484
      }
1485
      break;
1486
1487
    case ANCHOR:
1488
    case OP_OPEN_SUBEXP:
1489
    case OP_CLOSE_SUBEXP:
1490
      err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1491
      break;
1492
1493
    case OP_BACK_REF:
1494
      dfa->nexts[idx] = node->next->node_idx;
1495
      if (node->token.type == OP_BACK_REF)
1.2.3 by Colin Watson
Import upstream version 2.2
1496
	err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
1497
      break;
1498
1499
    default:
1500
      assert (!IS_EPSILON_NODE (node->token.type));
1501
      dfa->nexts[idx] = node->next->node_idx;
1502
      break;
1503
    }
1504
1505
  return err;
1506
}
1507
1508
/* Duplicate the epsilon closure of the node ROOT_NODE.
1509
   Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1510
   to their own constraint.  */
1511
1512
static reg_errcode_t
1513
internal_function
1514
duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1515
			Idx root_node, unsigned int init_constraint)
1516
{
1517
  Idx org_node, clone_node;
1518
  bool ok;
1519
  unsigned int constraint = init_constraint;
1520
  for (org_node = top_org_node, clone_node = top_clone_node;;)
1521
    {
1522
      Idx org_dest, clone_dest;
1523
      if (dfa->nodes[org_node].type == OP_BACK_REF)
1524
	{
1525
	  /* If the back reference epsilon-transit, its destination must
1526
	     also have the constraint.  Then duplicate the epsilon closure
1527
	     of the destination of the back reference, and store it in
1528
	     edests of the back reference.  */
1529
	  org_dest = dfa->nexts[org_node];
1530
	  re_node_set_empty (dfa->edests + clone_node);
1531
	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1532
	  if (BE (clone_dest == REG_MISSING, 0))
1533
	    return REG_ESPACE;
1534
	  dfa->nexts[clone_node] = dfa->nexts[org_node];
1535
	  ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1536
	  if (BE (! ok, 0))
1537
	    return REG_ESPACE;
1538
	}
1539
      else if (dfa->edests[org_node].nelem == 0)
1540
	{
1541
	  /* In case of the node can't epsilon-transit, don't duplicate the
1542
	     destination and store the original destination as the
1543
	     destination of the node.  */
1544
	  dfa->nexts[clone_node] = dfa->nexts[org_node];
1545
	  break;
1546
	}
1547
      else if (dfa->edests[org_node].nelem == 1)
1548
	{
1549
	  /* In case of the node can epsilon-transit, and it has only one
1550
	     destination.  */
1551
	  org_dest = dfa->edests[org_node].elems[0];
1552
	  re_node_set_empty (dfa->edests + clone_node);
1553
	  /* If the node is root_node itself, it means the epsilon closure
1554
	     has a loop.  Then tie it to the destination of the root_node.  */
1555
	  if (org_node == root_node && clone_node != org_node)
1556
	    {
1557
	      ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1558
	      if (BE (! ok, 0))
1559
	        return REG_ESPACE;
1560
	      break;
1561
	    }
1562
	  /* In case the node has another constraint, append it.  */
1563
	  constraint |= dfa->nodes[org_node].constraint;
1564
	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1565
	  if (BE (clone_dest == REG_MISSING, 0))
1566
	    return REG_ESPACE;
1567
	  ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1568
	  if (BE (! ok, 0))
1569
	    return REG_ESPACE;
1570
	}
1571
      else /* dfa->edests[org_node].nelem == 2 */
1572
	{
1573
	  /* In case of the node can epsilon-transit, and it has two
1574
	     destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1575
	  org_dest = dfa->edests[org_node].elems[0];
1576
	  re_node_set_empty (dfa->edests + clone_node);
1577
	  /* Search for a duplicated node which satisfies the constraint.  */
1578
	  clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1579
	  if (clone_dest == REG_MISSING)
1580
	    {
1581
	      /* There is no such duplicated node, create a new one.  */
1582
	      reg_errcode_t err;
1583
	      clone_dest = duplicate_node (dfa, org_dest, constraint);
1584
	      if (BE (clone_dest == REG_MISSING, 0))
1585
		return REG_ESPACE;
1586
	      ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1587
	      if (BE (! ok, 0))
1588
		return REG_ESPACE;
1589
	      err = duplicate_node_closure (dfa, org_dest, clone_dest,
1590
					    root_node, constraint);
1591
	      if (BE (err != REG_NOERROR, 0))
1592
		return err;
1593
	    }
1594
	  else
1595
	    {
1.2.3 by Colin Watson
Import upstream version 2.2
1596
	      /* There is a duplicated node which satisfies the constraint,
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
1597
		 use it to avoid infinite loop.  */
1598
	      ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1599
	      if (BE (! ok, 0))
1600
		return REG_ESPACE;
1601
	    }
1602
1603
	  org_dest = dfa->edests[org_node].elems[1];
1604
	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1605
	  if (BE (clone_dest == REG_MISSING, 0))
1606
	    return REG_ESPACE;
1607
	  ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1608
	  if (BE (! ok, 0))
1609
	    return REG_ESPACE;
1610
	}
1611
      org_node = org_dest;
1612
      clone_node = clone_dest;
1613
    }
1614
  return REG_NOERROR;
1615
}
1616
1617
/* Search for a node which is duplicated from the node ORG_NODE, and
1618
   satisfies the constraint CONSTRAINT.  */
1619
1620
static Idx
1621
search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1622
			unsigned int constraint)
1623
{
1624
  Idx idx;
1625
  for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1626
    {
1627
      if (org_node == dfa->org_indices[idx]
1628
	  && constraint == dfa->nodes[idx].constraint)
1629
	return idx; /* Found.  */
1630
    }
1631
  return REG_MISSING; /* Not found.  */
1632
}
1633
1634
/* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1635
   Return the index of the new node, or REG_MISSING if insufficient storage is
1636
   available.  */
1637
1638
static Idx
1639
duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1640
{
1641
  Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1642
  if (BE (dup_idx != REG_MISSING, 1))
1643
    {
1644
      dfa->nodes[dup_idx].constraint = constraint;
1645
      dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1646
      dfa->nodes[dup_idx].duplicated = 1;
1647
1648
      /* Store the index of the original node.  */
1649
      dfa->org_indices[dup_idx] = org_idx;
1650
    }
1651
  return dup_idx;
1652
}
1653
1654
static reg_errcode_t
1655
calc_inveclosure (re_dfa_t *dfa)
1656
{
1657
  Idx src, idx;
1658
  bool ok;
1659
  for (idx = 0; idx < dfa->nodes_len; ++idx)
1660
    re_node_set_init_empty (dfa->inveclosures + idx);
1661
1662
  for (src = 0; src < dfa->nodes_len; ++src)
1663
    {
1664
      Idx *elems = dfa->eclosures[src].elems;
1665
      for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1666
	{
1667
	  ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1668
	  if (BE (! ok, 0))
1669
	    return REG_ESPACE;
1670
	}
1671
    }
1672
1673
  return REG_NOERROR;
1674
}
1675
1676
/* Calculate "eclosure" for all the node in DFA.  */
1677
1678
static reg_errcode_t
1679
calc_eclosure (re_dfa_t *dfa)
1680
{
1681
  Idx node_idx;
1682
  bool incomplete;
1683
#ifdef DEBUG
1684
  assert (dfa->nodes_len > 0);
1685
#endif
1686
  incomplete = false;
1687
  /* For each nodes, calculate epsilon closure.  */
1688
  for (node_idx = 0; ; ++node_idx)
1689
    {
1690
      reg_errcode_t err;
1691
      re_node_set eclosure_elem;
1692
      if (node_idx == dfa->nodes_len)
1693
	{
1694
	  if (!incomplete)
1695
	    break;
1696
	  incomplete = false;
1697
	  node_idx = 0;
1698
	}
1699
1700
#ifdef DEBUG
1701
      assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
1702
#endif
1703
1704
      /* If we have already calculated, skip it.  */
1705
      if (dfa->eclosures[node_idx].nelem != 0)
1706
	continue;
1.3.1 by Colin Watson
Import upstream version 3.1
1707
      /* Calculate epsilon closure of 'node_idx'.  */
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
1708
      err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1709
      if (BE (err != REG_NOERROR, 0))
1710
	return err;
1711
1712
      if (dfa->eclosures[node_idx].nelem == 0)
1713
	{
1714
	  incomplete = true;
1715
	  re_node_set_free (&eclosure_elem);
1716
	}
1717
    }
1718
  return REG_NOERROR;
1719
}
1720
1721
/* Calculate epsilon closure of NODE.  */
1722
1723
static reg_errcode_t
1724
calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1725
{
1726
  reg_errcode_t err;
1727
  Idx i;
1728
  re_node_set eclosure;
1.2.3 by Colin Watson
Import upstream version 2.2
1729
  bool ok;
1730
  bool incomplete = false;
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
1731
  err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1732
  if (BE (err != REG_NOERROR, 0))
1733
    return err;
1734
1735
  /* This indicates that we are calculating this node now.
1736
     We reference this value to avoid infinite loop.  */
1737
  dfa->eclosures[node].nelem = REG_MISSING;
1738
1739
  /* If the current node has constraints, duplicate all nodes
1740
     since they must inherit the constraints.  */
1741
  if (dfa->nodes[node].constraint
1742
      && dfa->edests[node].nelem
1743
      && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1744
    {
1745
      err = duplicate_node_closure (dfa, node, node, node,
1746
				    dfa->nodes[node].constraint);
1747
      if (BE (err != REG_NOERROR, 0))
1748
	return err;
1749
    }
1750
1751
  /* Expand each epsilon destination nodes.  */
1752
  if (IS_EPSILON_NODE(dfa->nodes[node].type))
1753
    for (i = 0; i < dfa->edests[node].nelem; ++i)
1754
      {
1755
	re_node_set eclosure_elem;
1756
	Idx edest = dfa->edests[node].elems[i];
1.3.1 by Colin Watson
Import upstream version 3.1
1757
	/* If calculating the epsilon closure of 'edest' is in progress,
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
1758
	   return intermediate result.  */
1759
	if (dfa->eclosures[edest].nelem == REG_MISSING)
1760
	  {
1761
	    incomplete = true;
1762
	    continue;
1763
	  }
1.3.1 by Colin Watson
Import upstream version 3.1
1764
	/* If we haven't calculated the epsilon closure of 'edest' yet,
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
1765
	   calculate now. Otherwise use calculated epsilon closure.  */
1766
	if (dfa->eclosures[edest].nelem == 0)
1767
	  {
1768
	    err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1769
	    if (BE (err != REG_NOERROR, 0))
1770
	      return err;
1771
	  }
1772
	else
1773
	  eclosure_elem = dfa->eclosures[edest];
1.3.1 by Colin Watson
Import upstream version 3.1
1774
	/* Merge the epsilon closure of 'edest'.  */
1.2.3 by Colin Watson
Import upstream version 2.2
1775
	err = re_node_set_merge (&eclosure, &eclosure_elem);
1776
	if (BE (err != REG_NOERROR, 0))
1777
	  return err;
1.3.1 by Colin Watson
Import upstream version 3.1
1778
	/* If the epsilon closure of 'edest' is incomplete,
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
1779
	   the epsilon closure of this node is also incomplete.  */
1780
	if (dfa->eclosures[edest].nelem == 0)
1781
	  {
1782
	    incomplete = true;
1783
	    re_node_set_free (&eclosure_elem);
1784
	  }
1785
      }
1786
1.2.3 by Colin Watson
Import upstream version 2.2
1787
  /* An epsilon closure includes itself.  */
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
1788
  ok = re_node_set_insert (&eclosure, node);
1789
  if (BE (! ok, 0))
1790
    return REG_ESPACE;
1791
  if (incomplete && !root)
1792
    dfa->eclosures[node].nelem = 0;
1793
  else
1794
    dfa->eclosures[node] = eclosure;
1795
  *new_set = eclosure;
1796
  return REG_NOERROR;
1797
}
1798

1799
/* Functions for token which are used in the parser.  */
1800
1801
/* Fetch a token from INPUT.
1802
   We must not use this function inside bracket expressions.  */
1803
1804
static void
1805
internal_function
1806
fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1807
{
1808
  re_string_skip_bytes (input, peek_token (result, input, syntax));
1809
}
1810
1811
/* Peek a token from INPUT, and return the length of the token.
1812
   We must not use this function inside bracket expressions.  */
1813
1814
static int
1815
internal_function
1816
peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1817
{
1818
  unsigned char c;
1819
1820
  if (re_string_eoi (input))
1821
    {
1822
      token->type = END_OF_RE;
1823
      return 0;
1824
    }
1825
1826
  c = re_string_peek_byte (input, 0);
1827
  token->opr.c = c;
1828
1829
  token->word_char = 0;
1830
#ifdef RE_ENABLE_I18N
1831
  token->mb_partial = 0;
1832
  if (input->mb_cur_max > 1 &&
1833
      !re_string_first_byte (input, re_string_cur_idx (input)))
1834
    {
1835
      token->type = CHARACTER;
1836
      token->mb_partial = 1;
1837
      return 1;
1838
    }
1839
#endif
1840
  if (c == '\\')
1841
    {
1842
      unsigned char c2;
1843
      if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1844
	{
1845
	  token->type = BACK_SLASH;
1846
	  return 1;
1847
	}
1848
1849
      c2 = re_string_peek_byte_case (input, 1);
1850
      token->opr.c = c2;
1851
      token->type = CHARACTER;
1852
#ifdef RE_ENABLE_I18N
1853
      if (input->mb_cur_max > 1)
1854
	{
1855
	  wint_t wc = re_string_wchar_at (input,
1856
					  re_string_cur_idx (input) + 1);
1857
	  token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1858
	}
1859
      else
1860
#endif
1861
	token->word_char = IS_WORD_CHAR (c2) != 0;
1862
1863
      switch (c2)
1864
	{
1865
	case '|':
1866
	  if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1867
	    token->type = OP_ALT;
1868
	  break;
1869
	case '1': case '2': case '3': case '4': case '5':
1870
	case '6': case '7': case '8': case '9':
1871
	  if (!(syntax & RE_NO_BK_REFS))
1872
	    {
1873
	      token->type = OP_BACK_REF;
1874
	      token->opr.idx = c2 - '1';
1875
	    }
1876
	  break;
1877
	case '<':
1878
	  if (!(syntax & RE_NO_GNU_OPS))
1879
	    {
1880
	      token->type = ANCHOR;
1881
	      token->opr.ctx_type = WORD_FIRST;
1882
	    }
1883
	  break;
1884
	case '>':
1885
	  if (!(syntax & RE_NO_GNU_OPS))
1886
	    {
1887
	      token->type = ANCHOR;
1888
	      token->opr.ctx_type = WORD_LAST;
1889
	    }
1890
	  break;
1891
	case 'b':
1892
	  if (!(syntax & RE_NO_GNU_OPS))
1893
	    {
1894
	      token->type = ANCHOR;
1895
	      token->opr.ctx_type = WORD_DELIM;
1896
	    }
1897
	  break;
1898
	case 'B':
1899
	  if (!(syntax & RE_NO_GNU_OPS))
1900
	    {
1901
	      token->type = ANCHOR;
1902
	      token->opr.ctx_type = NOT_WORD_DELIM;
1903
	    }
1904
	  break;
1905
	case 'w':
1906
	  if (!(syntax & RE_NO_GNU_OPS))
1907
	    token->type = OP_WORD;
1908
	  break;
1909
	case 'W':
1910
	  if (!(syntax & RE_NO_GNU_OPS))
1911
	    token->type = OP_NOTWORD;
1912
	  break;
1913
	case 's':
1914
	  if (!(syntax & RE_NO_GNU_OPS))
1915
	    token->type = OP_SPACE;
1916
	  break;
1917
	case 'S':
1918
	  if (!(syntax & RE_NO_GNU_OPS))
1919
	    token->type = OP_NOTSPACE;
1920
	  break;
1921
	case '`':
1922
	  if (!(syntax & RE_NO_GNU_OPS))
1923
	    {
1924
	      token->type = ANCHOR;
1925
	      token->opr.ctx_type = BUF_FIRST;
1926
	    }
1927
	  break;
1928
	case '\'':
1929
	  if (!(syntax & RE_NO_GNU_OPS))
1930
	    {
1931
	      token->type = ANCHOR;
1932
	      token->opr.ctx_type = BUF_LAST;
1933
	    }
1934
	  break;
1935
	case '(':
1936
	  if (!(syntax & RE_NO_BK_PARENS))
1937
	    token->type = OP_OPEN_SUBEXP;
1938
	  break;
1939
	case ')':
1940
	  if (!(syntax & RE_NO_BK_PARENS))
1941
	    token->type = OP_CLOSE_SUBEXP;
1942
	  break;
1943
	case '+':
1944
	  if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1945
	    token->type = OP_DUP_PLUS;
1946
	  break;
1947
	case '?':
1948
	  if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1949
	    token->type = OP_DUP_QUESTION;
1950
	  break;
1951
	case '{':
1952
	  if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1953
	    token->type = OP_OPEN_DUP_NUM;
1954
	  break;
1955
	case '}':
1956
	  if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1957
	    token->type = OP_CLOSE_DUP_NUM;
1958
	  break;
1959
	default:
1960
	  break;
1961
	}
1962
      return 2;
1963
    }
1964
1965
  token->type = CHARACTER;
1966
#ifdef RE_ENABLE_I18N
1967
  if (input->mb_cur_max > 1)
1968
    {
1969
      wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1970
      token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1971
    }
1972
  else
1973
#endif
1974
    token->word_char = IS_WORD_CHAR (token->opr.c);
1975
1976
  switch (c)
1977
    {
1978
    case '\n':
1979
      if (syntax & RE_NEWLINE_ALT)
1980
	token->type = OP_ALT;
1981
      break;
1982
    case '|':
1983
      if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1984
	token->type = OP_ALT;
1985
      break;
1986
    case '*':
1987
      token->type = OP_DUP_ASTERISK;
1988
      break;
1989
    case '+':
1990
      if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1991
	token->type = OP_DUP_PLUS;
1992
      break;
1993
    case '?':
1994
      if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1995
	token->type = OP_DUP_QUESTION;
1996
      break;
1997
    case '{':
1998
      if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1999
	token->type = OP_OPEN_DUP_NUM;
2000
      break;
2001
    case '}':
2002
      if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
2003
	token->type = OP_CLOSE_DUP_NUM;
2004
      break;
2005
    case '(':
2006
      if (syntax & RE_NO_BK_PARENS)
2007
	token->type = OP_OPEN_SUBEXP;
2008
      break;
2009
    case ')':
2010
      if (syntax & RE_NO_BK_PARENS)
2011
	token->type = OP_CLOSE_SUBEXP;
2012
      break;
2013
    case '[':
2014
      token->type = OP_OPEN_BRACKET;
2015
      break;
2016
    case '.':
2017
      token->type = OP_PERIOD;
2018
      break;
2019
    case '^':
2020
      if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
2021
	  re_string_cur_idx (input) != 0)
2022
	{
2023
	  char prev = re_string_peek_byte (input, -1);
2024
	  if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
2025
	    break;
2026
	}
2027
      token->type = ANCHOR;
2028
      token->opr.ctx_type = LINE_FIRST;
2029
      break;
2030
    case '$':
2031
      if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
2032
	  re_string_cur_idx (input) + 1 != re_string_length (input))
2033
	{
2034
	  re_token_t next;
2035
	  re_string_skip_bytes (input, 1);
2036
	  peek_token (&next, input, syntax);
2037
	  re_string_skip_bytes (input, -1);
2038
	  if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2039
	    break;
2040
	}
2041
      token->type = ANCHOR;
2042
      token->opr.ctx_type = LINE_LAST;
2043
      break;
2044
    default:
2045
      break;
2046
    }
2047
  return 1;
2048
}
2049
2050
/* Peek a token from INPUT, and return the length of the token.
2051
   We must not use this function out of bracket expressions.  */
2052
2053
static int
2054
internal_function
2055
peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2056
{
2057
  unsigned char c;
2058
  if (re_string_eoi (input))
2059
    {
2060
      token->type = END_OF_RE;
2061
      return 0;
2062
    }
2063
  c = re_string_peek_byte (input, 0);
2064
  token->opr.c = c;
2065
2066
#ifdef RE_ENABLE_I18N
2067
  if (input->mb_cur_max > 1 &&
2068
      !re_string_first_byte (input, re_string_cur_idx (input)))
2069
    {
2070
      token->type = CHARACTER;
2071
      return 1;
2072
    }
2073
#endif /* RE_ENABLE_I18N */
2074
2075
  if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2076
      && re_string_cur_idx (input) + 1 < re_string_length (input))
2077
    {
2078
      /* In this case, '\' escape a character.  */
2079
      unsigned char c2;
2080
      re_string_skip_bytes (input, 1);
2081
      c2 = re_string_peek_byte (input, 0);
2082
      token->opr.c = c2;
2083
      token->type = CHARACTER;
2084
      return 1;
2085
    }
2086
  if (c == '[') /* '[' is a special char in a bracket exps.  */
2087
    {
2088
      unsigned char c2;
2089
      int token_len;
2090
      if (re_string_cur_idx (input) + 1 < re_string_length (input))
2091
	c2 = re_string_peek_byte (input, 1);
2092
      else
2093
	c2 = 0;
2094
      token->opr.c = c2;
2095
      token_len = 2;
2096
      switch (c2)
2097
	{
2098
	case '.':
2099
	  token->type = OP_OPEN_COLL_ELEM;
2100
	  break;
2101
	case '=':
2102
	  token->type = OP_OPEN_EQUIV_CLASS;
2103
	  break;
2104
	case ':':
2105
	  if (syntax & RE_CHAR_CLASSES)
2106
	    {
2107
	      token->type = OP_OPEN_CHAR_CLASS;
2108
	      break;
2109
	    }
2110
	  /* else fall through.  */
2111
	default:
2112
	  token->type = CHARACTER;
2113
	  token->opr.c = c;
2114
	  token_len = 1;
2115
	  break;
2116
	}
2117
      return token_len;
2118
    }
2119
  switch (c)
2120
    {
2121
    case '-':
2122
      token->type = OP_CHARSET_RANGE;
2123
      break;
2124
    case ']':
2125
      token->type = OP_CLOSE_BRACKET;
2126
      break;
2127
    case '^':
2128
      token->type = OP_NON_MATCH_LIST;
2129
      break;
2130
    default:
2131
      token->type = CHARACTER;
2132
    }
2133
  return 1;
2134
}
2135

2136
/* Functions for parser.  */
2137
2138
/* Entry point of the parser.
2139
   Parse the regular expression REGEXP and return the structure tree.
1.3.1 by Colin Watson
Import upstream version 3.1
2140
   If an error occurs, ERR is set by error code, and return NULL.
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
2141
   This function build the following tree, from regular expression <reg_exp>:
2142
	   CAT
2143
	   / \
2144
	  /   \
2145
   <reg_exp>  EOR
2146
2147
   CAT means concatenation.
2148
   EOR means end of regular expression.  */
2149
2150
static bin_tree_t *
2151
parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2152
       reg_errcode_t *err)
2153
{
1.3.2 by Colin Watson
Import upstream version 3.2
2154
  re_dfa_t *dfa = preg->buffer;
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
2155
  bin_tree_t *tree, *eor, *root;
2156
  re_token_t current_token;
2157
  dfa->syntax = syntax;
2158
  fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2159
  tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2160
  if (BE (*err != REG_NOERROR && tree == NULL, 0))
2161
    return NULL;
2162
  eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2163
  if (tree != NULL)
2164
    root = create_tree (dfa, tree, eor, CONCAT);
2165
  else
2166
    root = eor;
2167
  if (BE (eor == NULL || root == NULL, 0))
2168
    {
2169
      *err = REG_ESPACE;
2170
      return NULL;
2171
    }
2172
  return root;
2173
}
2174
2175
/* This function build the following tree, from regular expression
2176
   <branch1>|<branch2>:
2177
	   ALT
2178
	   / \
2179
	  /   \
2180
   <branch1> <branch2>
2181
1.3.1 by Colin Watson
Import upstream version 3.1
2182
   ALT means alternative, which represents the operator '|'.  */
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
2183
2184
static bin_tree_t *
2185
parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2186
	       reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2187
{
1.3.2 by Colin Watson
Import upstream version 3.2
2188
  re_dfa_t *dfa = preg->buffer;
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
2189
  bin_tree_t *tree, *branch = NULL;
2190
  tree = parse_branch (regexp, preg, token, syntax, nest, err);
2191
  if (BE (*err != REG_NOERROR && tree == NULL, 0))
2192
    return NULL;
2193
2194
  while (token->type == OP_ALT)
2195
    {
2196
      fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2197
      if (token->type != OP_ALT && token->type != END_OF_RE
2198
	  && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2199
	{
2200
	  branch = parse_branch (regexp, preg, token, syntax, nest, err);
2201
	  if (BE (*err != REG_NOERROR && branch == NULL, 0))
2202
	    return NULL;
2203
	}
2204
      else
2205
	branch = NULL;
2206
      tree = create_tree (dfa, tree, branch, OP_ALT);
2207
      if (BE (tree == NULL, 0))
2208
	{
2209
	  *err = REG_ESPACE;
2210
	  return NULL;
2211
	}
2212
    }
2213
  return tree;
2214
}
2215
2216
/* This function build the following tree, from regular expression
2217
   <exp1><exp2>:
2218
	CAT
2219
	/ \
2220
       /   \
2221
   <exp1> <exp2>
2222
2223
   CAT means concatenation.  */
2224
2225
static bin_tree_t *
2226
parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2227
	      reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2228
{
2229
  bin_tree_t *tree, *expr;
1.3.2 by Colin Watson
Import upstream version 3.2
2230
  re_dfa_t *dfa = preg->buffer;
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
2231
  tree = parse_expression (regexp, preg, token, syntax, nest, err);
2232
  if (BE (*err != REG_NOERROR && tree == NULL, 0))
2233
    return NULL;
2234
2235
  while (token->type != OP_ALT && token->type != END_OF_RE
2236
	 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2237
    {
2238
      expr = parse_expression (regexp, preg, token, syntax, nest, err);
2239
      if (BE (*err != REG_NOERROR && expr == NULL, 0))
2240
	{
1.3.1 by Colin Watson
Import upstream version 3.1
2241
	  if (tree != NULL)
2242
	    postorder (tree, free_tree, NULL);
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
2243
	  return NULL;
2244
	}
2245
      if (tree != NULL && expr != NULL)
2246
	{
1.3.1 by Colin Watson
Import upstream version 3.1
2247
	  bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT);
2248
	  if (newtree == NULL)
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
2249
	    {
1.3.1 by Colin Watson
Import upstream version 3.1
2250
	      postorder (expr, free_tree, NULL);
2251
	      postorder (tree, free_tree, NULL);
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
2252
	      *err = REG_ESPACE;
2253
	      return NULL;
2254
	    }
1.3.1 by Colin Watson
Import upstream version 3.1
2255
	  tree = newtree;
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
2256
	}
2257
      else if (tree == NULL)
2258
	tree = expr;
2259
      /* Otherwise expr == NULL, we don't need to create new tree.  */
2260
    }
2261
  return tree;
2262
}
2263
2264
/* This function build the following tree, from regular expression a*:
2265
	 *
2266
	 |
2267
	 a
2268
*/
2269
2270
static bin_tree_t *
2271
parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2272
		  reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2273
{
1.3.2 by Colin Watson
Import upstream version 3.2
2274
  re_dfa_t *dfa = preg->buffer;
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
2275
  bin_tree_t *tree;
2276
  switch (token->type)
2277
    {
2278
    case CHARACTER:
2279
      tree = create_token_tree (dfa, NULL, NULL, token);
2280
      if (BE (tree == NULL, 0))
2281
	{
2282
	  *err = REG_ESPACE;
2283
	  return NULL;
2284
	}
2285
#ifdef RE_ENABLE_I18N
2286
      if (dfa->mb_cur_max > 1)
2287
	{
2288
	  while (!re_string_eoi (regexp)
2289
		 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2290
	    {
2291
	      bin_tree_t *mbc_remain;
2292
	      fetch_token (token, regexp, syntax);
2293
	      mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2294
	      tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2295
	      if (BE (mbc_remain == NULL || tree == NULL, 0))
2296
		{
2297
		  *err = REG_ESPACE;
2298
		  return NULL;
2299
		}
2300
	    }
2301
	}
2302
#endif
2303
      break;
2304
    case OP_OPEN_SUBEXP:
2305
      tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2306
      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2307
	return NULL;
2308
      break;
2309
    case OP_OPEN_BRACKET:
2310
      tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2311
      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2312
	return NULL;
2313
      break;
2314
    case OP_BACK_REF:
2315
      if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2316
	{
2317
	  *err = REG_ESUBREG;
2318
	  return NULL;
2319
	}
2320
      dfa->used_bkref_map |= 1 << token->opr.idx;
2321
      tree = create_token_tree (dfa, NULL, NULL, token);
2322
      if (BE (tree == NULL, 0))
2323
	{
2324
	  *err = REG_ESPACE;
2325
	  return NULL;
2326
	}
2327
      ++dfa->nbackref;
2328
      dfa->has_mb_node = 1;
2329
      break;
2330
    case OP_OPEN_DUP_NUM:
2331
      if (syntax & RE_CONTEXT_INVALID_DUP)
2332
	{
2333
	  *err = REG_BADRPT;
2334
	  return NULL;
2335
	}
2336
      /* FALLTHROUGH */
2337
    case OP_DUP_ASTERISK:
2338
    case OP_DUP_PLUS:
2339
    case OP_DUP_QUESTION:
2340
      if (syntax & RE_CONTEXT_INVALID_OPS)
2341
	{
2342
	  *err = REG_BADRPT;
2343
	  return NULL;
2344
	}
2345
      else if (syntax & RE_CONTEXT_INDEP_OPS)
2346
	{
2347
	  fetch_token (token, regexp, syntax);
2348
	  return parse_expression (regexp, preg, token, syntax, nest, err);
2349
	}
2350
      /* else fall through  */
2351
    case OP_CLOSE_SUBEXP:
2352
      if ((token->type == OP_CLOSE_SUBEXP) &&
2353
	  !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2354
	{
2355
	  *err = REG_ERPAREN;
2356
	  return NULL;
2357
	}
2358
      /* else fall through  */
2359
    case OP_CLOSE_DUP_NUM:
2360
      /* We treat it as a normal character.  */
2361
2362
      /* Then we can these characters as normal characters.  */
2363
      token->type = CHARACTER;
2364
      /* mb_partial and word_char bits should be initialized already
2365
	 by peek_token.  */
2366
      tree = create_token_tree (dfa, NULL, NULL, token);
2367
      if (BE (tree == NULL, 0))
2368
	{
2369
	  *err = REG_ESPACE;
2370
	  return NULL;
2371
	}
2372
      break;
2373
    case ANCHOR:
2374
      if ((token->opr.ctx_type
2375
	   & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2376
	  && dfa->word_ops_used == 0)
2377
	init_word_char (dfa);
2378
      if (token->opr.ctx_type == WORD_DELIM
1.2.3 by Colin Watson
Import upstream version 2.2
2379
	  || token->opr.ctx_type == NOT_WORD_DELIM)
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
2380
	{
2381
	  bin_tree_t *tree_first, *tree_last;
2382
	  if (token->opr.ctx_type == WORD_DELIM)
2383
	    {
2384
	      token->opr.ctx_type = WORD_FIRST;
2385
	      tree_first = create_token_tree (dfa, NULL, NULL, token);
2386
	      token->opr.ctx_type = WORD_LAST;
1.2.3 by Colin Watson
Import upstream version 2.2
2387
	    }
2388
	  else
2389
	    {
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
2390
	      token->opr.ctx_type = INSIDE_WORD;
2391
	      tree_first = create_token_tree (dfa, NULL, NULL, token);
2392
	      token->opr.ctx_type = INSIDE_NOTWORD;
1.2.3 by Colin Watson
Import upstream version 2.2
2393
	    }
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
2394
	  tree_last = create_token_tree (dfa, NULL, NULL, token);
2395
	  tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2396
	  if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2397
	    {
2398
	      *err = REG_ESPACE;
2399
	      return NULL;
2400
	    }
2401
	}
2402
      else
2403
	{
2404
	  tree = create_token_tree (dfa, NULL, NULL, token);
2405
	  if (BE (tree == NULL, 0))
2406
	    {
2407
	      *err = REG_ESPACE;
2408
	      return NULL;
2409
	    }
2410
	}
2411
      /* We must return here, since ANCHORs can't be followed
2412
	 by repetition operators.
2413
	 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2414
	     it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2415
      fetch_token (token, regexp, syntax);
2416
      return tree;
2417
    case OP_PERIOD:
2418
      tree = create_token_tree (dfa, NULL, NULL, token);
2419
      if (BE (tree == NULL, 0))
2420
	{
2421
	  *err = REG_ESPACE;
2422
	  return NULL;
2423
	}
2424
      if (dfa->mb_cur_max > 1)
2425
	dfa->has_mb_node = 1;
2426
      break;
2427
    case OP_WORD:
2428
    case OP_NOTWORD:
2429
      tree = build_charclass_op (dfa, regexp->trans,
1.3.2 by Colin Watson
Import upstream version 3.2
2430
				 "alnum",
2431
				 "_",
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
2432
				 token->type == OP_NOTWORD, err);
2433
      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2434
	return NULL;
2435
      break;
2436
    case OP_SPACE:
2437
    case OP_NOTSPACE:
2438
      tree = build_charclass_op (dfa, regexp->trans,
1.3.2 by Colin Watson
Import upstream version 3.2
2439
				 "space",
2440
				 "",
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
2441
				 token->type == OP_NOTSPACE, err);
2442
      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2443
	return NULL;
2444
      break;
2445
    case OP_ALT:
2446
    case END_OF_RE:
2447
      return NULL;
2448
    case BACK_SLASH:
2449
      *err = REG_EESCAPE;
2450
      return NULL;
2451
    default:
2452
      /* Must not happen?  */
2453
#ifdef DEBUG
2454
      assert (0);
2455
#endif
2456
      return NULL;
2457
    }
2458
  fetch_token (token, regexp, syntax);
2459
2460
  while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2461
	 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2462
    {
2463
      tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2464
      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2465
	return NULL;
2466
      /* In BRE consecutive duplications are not allowed.  */
2467
      if ((syntax & RE_CONTEXT_INVALID_DUP)
2468
	  && (token->type == OP_DUP_ASTERISK
2469
	      || token->type == OP_OPEN_DUP_NUM))
2470
	{
2471
	  *err = REG_BADRPT;
2472
	  return NULL;
2473
	}
2474
    }
2475
2476
  return tree;
2477
}
2478
2479
/* This function build the following tree, from regular expression
2480
   (<reg_exp>):
2481
	 SUBEXP
2482
	    |
2483
	<reg_exp>
2484
*/
2485
2486
static bin_tree_t *
2487
parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2488
	       reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2489
{
1.3.2 by Colin Watson
Import upstream version 3.2
2490
  re_dfa_t *dfa = preg->buffer;
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
2491
  bin_tree_t *tree;
2492
  size_t cur_nsub;
2493
  cur_nsub = preg->re_nsub++;
2494
2495
  fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2496
2497
  /* The subexpression may be a null string.  */
2498
  if (token->type == OP_CLOSE_SUBEXP)
2499
    tree = NULL;
2500
  else
2501
    {
2502
      tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2503
      if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
1.3.1 by Colin Watson
Import upstream version 3.1
2504
	{
2505
	  if (tree != NULL)
2506
	    postorder (tree, free_tree, NULL);
2507
	  *err = REG_EPAREN;
2508
	}
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
2509
      if (BE (*err != REG_NOERROR, 0))
2510
	return NULL;
2511
    }
2512
2513
  if (cur_nsub <= '9' - '1')
2514
    dfa->completed_bkref_map |= 1 << cur_nsub;
2515
2516
  tree = create_tree (dfa, tree, NULL, SUBEXP);
2517
  if (BE (tree == NULL, 0))
2518
    {
2519
      *err = REG_ESPACE;
2520
      return NULL;
2521
    }
2522
  tree->token.opr.idx = cur_nsub;
2523
  return tree;
2524
}
2525
2526
/* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2527
2528
static bin_tree_t *
2529
parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2530
	      re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2531
{
2532
  bin_tree_t *tree = NULL, *old_tree = NULL;
2533
  Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2534
  re_token_t start_token = *token;
2535
2536
  if (token->type == OP_OPEN_DUP_NUM)
2537
    {
2538
      end = 0;
2539
      start = fetch_number (regexp, token, syntax);
2540
      if (start == REG_MISSING)
2541
	{
2542
	  if (token->type == CHARACTER && token->opr.c == ',')
2543
	    start = 0; /* We treat "{,m}" as "{0,m}".  */
2544
	  else
2545
	    {
2546
	      *err = REG_BADBR; /* <re>{} is invalid.  */
2547
	      return NULL;
2548
	    }
2549
	}
2550
      if (BE (start != REG_ERROR, 1))
2551
	{
2552
	  /* We treat "{n}" as "{n,n}".  */
2553
	  end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2554
		 : ((token->type == CHARACTER && token->opr.c == ',')
2555
		    ? fetch_number (regexp, token, syntax) : REG_ERROR));
2556
	}
2557
      if (BE (start == REG_ERROR || end == REG_ERROR, 0))
2558
	{
2559
	  /* Invalid sequence.  */
2560
	  if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2561
	    {
2562
	      if (token->type == END_OF_RE)
2563
		*err = REG_EBRACE;
2564
	      else
2565
		*err = REG_BADBR;
2566
2567
	      return NULL;
2568
	    }
2569
2570
	  /* If the syntax bit is set, rollback.  */
2571
	  re_string_set_index (regexp, start_idx);
2572
	  *token = start_token;
2573
	  token->type = CHARACTER;
2574
	  /* mb_partial and word_char bits should be already initialized by
2575
	     peek_token.  */
2576
	  return elem;
2577
	}
2578
1.2.3 by Colin Watson
Import upstream version 2.2
2579
      if (BE ((end != REG_MISSING && start > end)
2580
	      || token->type != OP_CLOSE_DUP_NUM, 0))
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
2581
	{
2582
	  /* First number greater than second.  */
2583
	  *err = REG_BADBR;
2584
	  return NULL;
2585
	}
1.3.2 by Colin Watson
Import upstream version 3.2
2586
2587
      if (BE (RE_DUP_MAX < (end == REG_MISSING ? start : end), 0))
2588
	{
2589
	  *err = REG_ESIZE;
2590
	  return NULL;
2591
	}
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
2592
    }
2593
  else
2594
    {
2595
      start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2596
      end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2597
    }
2598
2599
  fetch_token (token, regexp, syntax);
2600
2601
  if (BE (elem == NULL, 0))
2602
    return NULL;
2603
  if (BE (start == 0 && end == 0, 0))
2604
    {
2605
      postorder (elem, free_tree, NULL);
2606
      return NULL;
2607
    }
2608
2609
  /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2610
  if (BE (start > 0, 0))
2611
    {
2612
      tree = elem;
2613
      for (i = 2; i <= start; ++i)
2614
	{
2615
	  elem = duplicate_tree (elem, dfa);
2616
	  tree = create_tree (dfa, tree, elem, CONCAT);
2617
	  if (BE (elem == NULL || tree == NULL, 0))
2618
	    goto parse_dup_op_espace;
2619
	}
2620
2621
      if (start == end)
2622
	return tree;
2623
2624
      /* Duplicate ELEM before it is marked optional.  */
2625
      elem = duplicate_tree (elem, dfa);
2626
      old_tree = tree;
2627
    }
2628
  else
2629
    old_tree = NULL;
2630
2631
  if (elem->token.type == SUBEXP)
1.3.2 by Colin Watson
Import upstream version 3.2
2632
    {
2633
      uintptr_t subidx = elem->token.opr.idx;
2634
      postorder (elem, mark_opt_subexp, (void *) subidx);
2635
    }
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
2636
2637
  tree = create_tree (dfa, elem, NULL,
2638
		      (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2639
  if (BE (tree == NULL, 0))
2640
    goto parse_dup_op_espace;
2641
1.2.3 by Colin Watson
Import upstream version 2.2
2642
/* From gnulib's "intprops.h":
2643
   True if the arithmetic type T is signed.  */
2644
#define TYPE_SIGNED(t) (! ((t) 0 < (t) -1))
2645
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
2646
  /* This loop is actually executed only when end != REG_MISSING,
2647
     to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2648
     already created the start+1-th copy.  */
1.2.3 by Colin Watson
Import upstream version 2.2
2649
  if (TYPE_SIGNED (Idx) || end != REG_MISSING)
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
2650
    for (i = start + 2; i <= end; ++i)
2651
      {
2652
	elem = duplicate_tree (elem, dfa);
2653
	tree = create_tree (dfa, tree, elem, CONCAT);
2654
	if (BE (elem == NULL || tree == NULL, 0))
2655
	  goto parse_dup_op_espace;
2656
2657
	tree = create_tree (dfa, tree, NULL, OP_ALT);
2658
	if (BE (tree == NULL, 0))
2659
	  goto parse_dup_op_espace;
2660
      }
2661
2662
  if (old_tree)
2663
    tree = create_tree (dfa, old_tree, tree, CONCAT);
2664
2665
  return tree;
2666
2667
 parse_dup_op_espace:
2668
  *err = REG_ESPACE;
2669
  return NULL;
2670
}
2671
2672
/* Size of the names for collating symbol/equivalence_class/character_class.
2673
   I'm not sure, but maybe enough.  */
2674
#define BRACKET_NAME_BUF_SIZE 32
2675
2676
#ifndef _LIBC
2677
  /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2678
     Build the range expression which starts from START_ELEM, and ends
2679
     at END_ELEM.  The result are written to MBCSET and SBCSET.
2680
     RANGE_ALLOC is the allocated size of mbcset->range_starts, and
1.3.1 by Colin Watson
Import upstream version 3.1
2681
     mbcset->range_ends, is a pointer argument since we may
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
2682
     update it.  */
2683
2684
static reg_errcode_t
2685
internal_function
2686
# ifdef RE_ENABLE_I18N
1.2.4 by Otavio Salvador
Import upstream version 2.3
2687
build_range_exp (const reg_syntax_t syntax,
2688
                 bitset_t sbcset,
2689
                 re_charset_t *mbcset,
2690
                 Idx *range_alloc,
2691
                 const bracket_elem_t *start_elem,
2692
                 const bracket_elem_t *end_elem)
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
2693
# else /* not RE_ENABLE_I18N */
1.2.4 by Otavio Salvador
Import upstream version 2.3
2694
build_range_exp (const reg_syntax_t syntax,
2695
                 bitset_t sbcset,
2696
                 const bracket_elem_t *start_elem,
2697
                 const bracket_elem_t *end_elem)
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
2698
# endif /* not RE_ENABLE_I18N */
2699
{
2700
  unsigned int start_ch, end_ch;
2701
  /* Equivalence Classes and Character Classes can't be a range start/end.  */
2702
  if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2703
	  || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2704
	  0))
2705
    return REG_ERANGE;
2706
2707
  /* We can handle no multi character collating elements without libc
2708
     support.  */
2709
  if (BE ((start_elem->type == COLL_SYM
2710
	   && strlen ((char *) start_elem->opr.name) > 1)
2711
	  || (end_elem->type == COLL_SYM
2712
	      && strlen ((char *) end_elem->opr.name) > 1), 0))
2713
    return REG_ECOLLATE;
2714
2715
# ifdef RE_ENABLE_I18N
2716
  {
2717
    wchar_t wc;
2718
    wint_t start_wc;
2719
    wint_t end_wc;
2720
2721
    start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2722
		: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2723
		   : 0));
2724
    end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2725
	      : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2726
		 : 0));
2727
    start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2728
		? __btowc (start_ch) : start_elem->opr.wch);
2729
    end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2730
	      ? __btowc (end_ch) : end_elem->opr.wch);
2731
    if (start_wc == WEOF || end_wc == WEOF)
2732
      return REG_ECOLLATE;
1.3.2 by Colin Watson
Import upstream version 3.2
2733
    else if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_wc > end_wc, 0))
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
2734
      return REG_ERANGE;
2735
2736
    /* Got valid collation sequence values, add them as a new entry.
2737
       However, for !_LIBC we have no collation elements: if the
2738
       character set is single byte, the single byte character set
2739
       that we build below suffices.  parse_bracket_exp passes
2740
       no MBCSET if dfa->mb_cur_max == 1.  */
2741
    if (mbcset)
2742
      {
1.2.3 by Colin Watson
Import upstream version 2.2
2743
	/* Check the space of the arrays.  */
2744
	if (BE (*range_alloc == mbcset->nranges, 0))
2745
	  {
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
2746
	    /* There is not enough space, need realloc.  */
2747
	    wchar_t *new_array_start, *new_array_end;
2748
	    Idx new_nranges;
2749
2750
	    /* +1 in case of mbcset->nranges is 0.  */
2751
	    new_nranges = 2 * mbcset->nranges + 1;
2752
	    /* Use realloc since mbcset->range_starts and mbcset->range_ends
2753
	       are NULL if *range_alloc == 0.  */
2754
	    new_array_start = re_realloc (mbcset->range_starts, wchar_t,
1.2.3 by Colin Watson
Import upstream version 2.2
2755
					  new_nranges);
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
2756
	    new_array_end = re_realloc (mbcset->range_ends, wchar_t,
1.2.3 by Colin Watson
Import upstream version 2.2
2757
					new_nranges);
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
2758
2759
	    if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2760
	      return REG_ESPACE;
2761
2762
	    mbcset->range_starts = new_array_start;
2763
	    mbcset->range_ends = new_array_end;
2764
	    *range_alloc = new_nranges;
1.2.3 by Colin Watson
Import upstream version 2.2
2765
	  }
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
2766
1.2.3 by Colin Watson
Import upstream version 2.2
2767
	mbcset->range_starts[mbcset->nranges] = start_wc;
2768
	mbcset->range_ends[mbcset->nranges++] = end_wc;
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
2769
      }
2770
2771
    /* Build the table for single byte characters.  */
2772
    for (wc = 0; wc < SBC_MAX; ++wc)
2773
      {
1.3.2 by Colin Watson
Import upstream version 3.2
2774
	if (start_wc <= wc && wc <= end_wc)
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
2775
	  bitset_set (sbcset, wc);
2776
      }
2777
  }
2778
# else /* not RE_ENABLE_I18N */
2779
  {
2780
    unsigned int ch;
2781
    start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2782
		: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2783
		   : 0));
2784
    end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2785
	      : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2786
		 : 0));
2787
    if (start_ch > end_ch)
2788
      return REG_ERANGE;
2789
    /* Build the table for single byte characters.  */
2790
    for (ch = 0; ch < SBC_MAX; ++ch)
2791
      if (start_ch <= ch  && ch <= end_ch)
2792
	bitset_set (sbcset, ch);
2793
  }
2794
# endif /* not RE_ENABLE_I18N */
2795
  return REG_NOERROR;
2796
}
2797
#endif /* not _LIBC */
2798
2799
#ifndef _LIBC
2800
/* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2801
   Build the collating element which is represented by NAME.
2802
   The result are written to MBCSET and SBCSET.
2803
   COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2804
   pointer argument since we may update it.  */
2805
2806
static reg_errcode_t
2807
internal_function
2808
# ifdef RE_ENABLE_I18N
1.3.1 by Colin Watson
Import upstream version 3.1
2809
build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2810
			Idx *coll_sym_alloc, const unsigned char *name)
2811
# else /* not RE_ENABLE_I18N */
2812
build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2813
# endif /* not RE_ENABLE_I18N */
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
2814
{
2815
  size_t name_len = strlen ((const char *) name);
2816
  if (BE (name_len != 1, 0))
2817
    return REG_ECOLLATE;
2818
  else
2819
    {
2820
      bitset_set (sbcset, name[0]);
2821
      return REG_NOERROR;
2822
    }
2823
}
2824
#endif /* not _LIBC */
2825
2826
/* This function parse bracket expression like "[abc]", "[a-c]",
2827
   "[[.a-a.]]" etc.  */
2828
2829
static bin_tree_t *
2830
parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2831
		   reg_syntax_t syntax, reg_errcode_t *err)
2832
{
2833
#ifdef _LIBC
2834
  const unsigned char *collseqmb;
2835
  const char *collseqwc;
2836
  uint32_t nrules;
2837
  int32_t table_size;
2838
  const int32_t *symb_table;
2839
  const unsigned char *extra;
2840
1.3.1 by Colin Watson
Import upstream version 3.1
2841
  /* Local function for parse_bracket_exp used in _LIBC environment.
2842
     Seek the collating symbol entry corresponding to NAME.
1.3.2 by Colin Watson
Import upstream version 3.2
2843
     Return the index of the symbol in the SYMB_TABLE,
2844
     or -1 if not found.  */
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
2845
2846
  auto inline int32_t
1.3.2 by Colin Watson
Import upstream version 3.2
2847
  __attribute__ ((always_inline))
2848
  seek_collating_symbol_entry (const unsigned char *name, size_t name_len)
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
2849
    {
1.3.2 by Colin Watson
Import upstream version 3.2
2850
      int32_t elem;
2851
2852
      for (elem = 0; elem < table_size; elem++)
2853
	if (symb_table[2 * elem] != 0)
2854
	  {
2855
	    int32_t idx = symb_table[2 * elem + 1];
2856
	    /* Skip the name of collating element name.  */
2857
	    idx += 1 + extra[idx];
2858
	    if (/* Compare the length of the name.  */
2859
		name_len == extra[idx]
2860
		/* Compare the name.  */
2861
		&& memcmp (name, &extra[idx + 1], name_len) == 0)
2862
	      /* Yep, this is the entry.  */
2863
	      return elem;
2864
	  }
2865
      return -1;
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
2866
    }
2867
1.2.3 by Colin Watson
Import upstream version 2.2
2868
  /* Local function for parse_bracket_exp used in _LIBC environment.
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
2869
     Look up the collation sequence value of BR_ELEM.
2870
     Return the value if succeeded, UINT_MAX otherwise.  */
2871
2872
  auto inline unsigned int
1.3.2 by Colin Watson
Import upstream version 3.2
2873
  __attribute__ ((always_inline))
2874
  lookup_collation_sequence_value (bracket_elem_t *br_elem)
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
2875
    {
2876
      if (br_elem->type == SB_CHAR)
2877
	{
2878
	  /*
2879
	  if (MB_CUR_MAX == 1)
2880
	  */
2881
	  if (nrules == 0)
2882
	    return collseqmb[br_elem->opr.ch];
2883
	  else
2884
	    {
2885
	      wint_t wc = __btowc (br_elem->opr.ch);
2886
	      return __collseq_table_lookup (collseqwc, wc);
2887
	    }
2888
	}
2889
      else if (br_elem->type == MB_CHAR)
2890
	{
1.2.3 by Colin Watson
Import upstream version 2.2
2891
	  if (nrules != 0)
2892
	    return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
2893
	}
2894
      else if (br_elem->type == COLL_SYM)
2895
	{
2896
	  size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2897
	  if (nrules != 0)
2898
	    {
2899
	      int32_t elem, idx;
2900
	      elem = seek_collating_symbol_entry (br_elem->opr.name,
2901
						  sym_name_len);
1.3.2 by Colin Watson
Import upstream version 3.2
2902
	      if (elem != -1)
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
2903
		{
2904
		  /* We found the entry.  */
2905
		  idx = symb_table[2 * elem + 1];
2906
		  /* Skip the name of collating element name.  */
2907
		  idx += 1 + extra[idx];
2908
		  /* Skip the byte sequence of the collating element.  */
2909
		  idx += 1 + extra[idx];
2910
		  /* Adjust for the alignment.  */
2911
		  idx = (idx + 3) & ~3;
2912
		  /* Skip the multibyte collation sequence value.  */
2913
		  idx += sizeof (unsigned int);
2914
		  /* Skip the wide char sequence of the collating element.  */
2915
		  idx += sizeof (unsigned int) *
2916
		    (1 + *(unsigned int *) (extra + idx));
2917
		  /* Return the collation sequence value.  */
2918
		  return *(unsigned int *) (extra + idx);
2919
		}
1.3.2 by Colin Watson
Import upstream version 3.2
2920
	      else if (sym_name_len == 1)
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
2921
		{
2922
		  /* No valid character.  Match it as a single byte
2923
		     character.  */
2924
		  return collseqmb[br_elem->opr.name[0]];
2925
		}
2926
	    }
2927
	  else if (sym_name_len == 1)
2928
	    return collseqmb[br_elem->opr.name[0]];
2929
	}
2930
      return UINT_MAX;
2931
    }
2932
1.3.1 by Colin Watson
Import upstream version 3.1
2933
  /* Local function for parse_bracket_exp used in _LIBC environment.
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
2934
     Build the range expression which starts from START_ELEM, and ends
2935
     at END_ELEM.  The result are written to MBCSET and SBCSET.
2936
     RANGE_ALLOC is the allocated size of mbcset->range_starts, and
1.3.1 by Colin Watson
Import upstream version 3.1
2937
     mbcset->range_ends, is a pointer argument since we may
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
2938
     update it.  */
2939
2940
  auto inline reg_errcode_t
1.3.2 by Colin Watson
Import upstream version 3.2
2941
  __attribute__ ((always_inline))
2942
  build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2943
		   bracket_elem_t *start_elem, bracket_elem_t *end_elem)
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
2944
    {
2945
      unsigned int ch;
2946
      uint32_t start_collseq;
2947
      uint32_t end_collseq;
2948
2949
      /* Equivalence Classes and Character Classes can't be a range
2950
	 start/end.  */
2951
      if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2952
	      || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2953
	      0))
2954
	return REG_ERANGE;
2955
1.3.2 by Colin Watson
Import upstream version 3.2
2956
      /* FIXME: Implement rational ranges here, too.  */
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
2957
      start_collseq = lookup_collation_sequence_value (start_elem);
2958
      end_collseq = lookup_collation_sequence_value (end_elem);
2959
      /* Check start/end collation sequence values.  */
2960
      if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2961
	return REG_ECOLLATE;
2962
      if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2963
	return REG_ERANGE;
2964
2965
      /* Got valid collation sequence values, add them as a new entry.
2966
	 However, if we have no collation elements, and the character set
2967
	 is single byte, the single byte character set that we
2968
	 build below suffices. */
2969
      if (nrules > 0 || dfa->mb_cur_max > 1)
2970
	{
1.2.3 by Colin Watson
Import upstream version 2.2
2971
	  /* Check the space of the arrays.  */
2972
	  if (BE (*range_alloc == mbcset->nranges, 0))
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
2973
	    {
2974
	      /* There is not enough space, need realloc.  */
2975
	      uint32_t *new_array_start;
2976
	      uint32_t *new_array_end;
2977
	      Idx new_nranges;
2978
2979
	      /* +1 in case of mbcset->nranges is 0.  */
2980
	      new_nranges = 2 * mbcset->nranges + 1;
2981
	      new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2982
					    new_nranges);
2983
	      new_array_end = re_realloc (mbcset->range_ends, uint32_t,
1.2.3 by Colin Watson
Import upstream version 2.2
2984
					  new_nranges);
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
2985
2986
	      if (BE (new_array_start == NULL || new_array_end == NULL, 0))
1.2.3 by Colin Watson
Import upstream version 2.2
2987
		return REG_ESPACE;
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
2988
2989
	      mbcset->range_starts = new_array_start;
2990
	      mbcset->range_ends = new_array_end;
2991
	      *range_alloc = new_nranges;
2992
	    }
2993
1.2.3 by Colin Watson
Import upstream version 2.2
2994
	  mbcset->range_starts[mbcset->nranges] = start_collseq;
2995
	  mbcset->range_ends[mbcset->nranges++] = end_collseq;
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
2996
	}
2997
2998
      /* Build the table for single byte characters.  */
2999
      for (ch = 0; ch < SBC_MAX; ch++)
3000
	{
3001
	  uint32_t ch_collseq;
3002
	  /*
3003
	  if (MB_CUR_MAX == 1)
3004
	  */
3005
	  if (nrules == 0)
3006
	    ch_collseq = collseqmb[ch];
3007
	  else
3008
	    ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
3009
	  if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
3010
	    bitset_set (sbcset, ch);
3011
	}
3012
      return REG_NOERROR;
3013
    }
3014
1.3.1 by Colin Watson
Import upstream version 3.1
3015
  /* Local function for parse_bracket_exp used in _LIBC environment.
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
3016
     Build the collating element which is represented by NAME.
3017
     The result are written to MBCSET and SBCSET.
3018
     COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
1.3.1 by Colin Watson
Import upstream version 3.1
3019
     pointer argument since we may update it.  */
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
3020
3021
  auto inline reg_errcode_t
1.3.2 by Colin Watson
Import upstream version 3.2
3022
  __attribute__ ((always_inline))
3023
  build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
3024
			  Idx *coll_sym_alloc, const unsigned char *name)
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
3025
    {
3026
      int32_t elem, idx;
3027
      size_t name_len = strlen ((const char *) name);
3028
      if (nrules != 0)
3029
	{
3030
	  elem = seek_collating_symbol_entry (name, name_len);
1.3.2 by Colin Watson
Import upstream version 3.2
3031
	  if (elem != -1)
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
3032
	    {
3033
	      /* We found the entry.  */
3034
	      idx = symb_table[2 * elem + 1];
3035
	      /* Skip the name of collating element name.  */
3036
	      idx += 1 + extra[idx];
3037
	    }
1.3.2 by Colin Watson
Import upstream version 3.2
3038
	  else if (name_len == 1)
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
3039
	    {
3040
	      /* No valid character, treat it as a normal
3041
		 character.  */
3042
	      bitset_set (sbcset, name[0]);
3043
	      return REG_NOERROR;
3044
	    }
3045
	  else
3046
	    return REG_ECOLLATE;
3047
3048
	  /* Got valid collation sequence, add it as a new entry.  */
3049
	  /* Check the space of the arrays.  */
3050
	  if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3051
	    {
3052
	      /* Not enough, realloc it.  */
3053
	      /* +1 in case of mbcset->ncoll_syms is 0.  */
3054
	      Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3055
	      /* Use realloc since mbcset->coll_syms is NULL
3056
		 if *alloc == 0.  */
3057
	      int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3058
						   new_coll_sym_alloc);
3059
	      if (BE (new_coll_syms == NULL, 0))
3060
		return REG_ESPACE;
3061
	      mbcset->coll_syms = new_coll_syms;
3062
	      *coll_sym_alloc = new_coll_sym_alloc;
3063
	    }
3064
	  mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3065
	  return REG_NOERROR;
3066
	}
3067
      else
3068
	{
3069
	  if (BE (name_len != 1, 0))
3070
	    return REG_ECOLLATE;
3071
	  else
3072
	    {
3073
	      bitset_set (sbcset, name[0]);
3074
	      return REG_NOERROR;
3075
	    }
3076
	}
3077
    }
3078
#endif
3079
3080
  re_token_t br_token;
3081
  re_bitset_ptr_t sbcset;
3082
#ifdef RE_ENABLE_I18N
3083
  re_charset_t *mbcset;
3084
  Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3085
  Idx equiv_class_alloc = 0, char_class_alloc = 0;
3086
#endif /* not RE_ENABLE_I18N */
3087
  bool non_match = false;
3088
  bin_tree_t *work_tree;
3089
  int token_len;
3090
  bool first_round = true;
3091
#ifdef _LIBC
3092
  collseqmb = (const unsigned char *)
3093
    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3094
  nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3095
  if (nrules)
3096
    {
3097
      /*
3098
      if (MB_CUR_MAX > 1)
3099
      */
3100
      collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3101
      table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3102
      symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3103
						  _NL_COLLATE_SYMB_TABLEMB);
3104
      extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3105
						   _NL_COLLATE_SYMB_EXTRAMB);
3106
    }
3107
#endif
3108
  sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3109
#ifdef RE_ENABLE_I18N
3110
  mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3111
#endif /* RE_ENABLE_I18N */
3112
#ifdef RE_ENABLE_I18N
3113
  if (BE (sbcset == NULL || mbcset == NULL, 0))
3114
#else
3115
  if (BE (sbcset == NULL, 0))
3116
#endif /* RE_ENABLE_I18N */
3117
    {
1.3.1 by Colin Watson
Import upstream version 3.1
3118
      re_free (sbcset);
3119
#ifdef RE_ENABLE_I18N
3120
      re_free (mbcset);
3121
#endif
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
3122
      *err = REG_ESPACE;
3123
      return NULL;
3124
    }
3125
3126
  token_len = peek_token_bracket (token, regexp, syntax);
3127
  if (BE (token->type == END_OF_RE, 0))
3128
    {
3129
      *err = REG_BADPAT;
3130
      goto parse_bracket_exp_free_return;
3131
    }
3132
  if (token->type == OP_NON_MATCH_LIST)
3133
    {
3134
#ifdef RE_ENABLE_I18N
3135
      mbcset->non_match = 1;
3136
#endif /* not RE_ENABLE_I18N */
3137
      non_match = true;
3138
      if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3139
	bitset_set (sbcset, '\n');
3140
      re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3141
      token_len = peek_token_bracket (token, regexp, syntax);
3142
      if (BE (token->type == END_OF_RE, 0))
3143
	{
3144
	  *err = REG_BADPAT;
3145
	  goto parse_bracket_exp_free_return;
3146
	}
3147
    }
3148
3149
  /* We treat the first ']' as a normal character.  */
3150
  if (token->type == OP_CLOSE_BRACKET)
3151
    token->type = CHARACTER;
3152
3153
  while (1)
3154
    {
3155
      bracket_elem_t start_elem, end_elem;
3156
      unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3157
      unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3158
      reg_errcode_t ret;
3159
      int token_len2 = 0;
3160
      bool is_range_exp = false;
3161
      re_token_t token2;
3162
3163
      start_elem.opr.name = start_name_buf;
3164
      ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3165
				   syntax, first_round);
3166
      if (BE (ret != REG_NOERROR, 0))
3167
	{
3168
	  *err = ret;
3169
	  goto parse_bracket_exp_free_return;
3170
	}
3171
      first_round = false;
3172
3173
      /* Get information about the next token.  We need it in any case.  */
3174
      token_len = peek_token_bracket (token, regexp, syntax);
3175
3176
      /* Do not check for ranges if we know they are not allowed.  */
3177
      if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3178
	{
3179
	  if (BE (token->type == END_OF_RE, 0))
3180
	    {
3181
	      *err = REG_EBRACK;
3182
	      goto parse_bracket_exp_free_return;
3183
	    }
3184
	  if (token->type == OP_CHARSET_RANGE)
3185
	    {
3186
	      re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3187
	      token_len2 = peek_token_bracket (&token2, regexp, syntax);
3188
	      if (BE (token2.type == END_OF_RE, 0))
3189
		{
3190
		  *err = REG_EBRACK;
3191
		  goto parse_bracket_exp_free_return;
3192
		}
3193
	      if (token2.type == OP_CLOSE_BRACKET)
3194
		{
3195
		  /* We treat the last '-' as a normal character.  */
3196
		  re_string_skip_bytes (regexp, -token_len);
3197
		  token->type = CHARACTER;
3198
		}
3199
	      else
3200
		is_range_exp = true;
3201
	    }
3202
	}
3203
3204
      if (is_range_exp == true)
3205
	{
3206
	  end_elem.opr.name = end_name_buf;
3207
	  ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3208
				       dfa, syntax, true);
3209
	  if (BE (ret != REG_NOERROR, 0))
3210
	    {
3211
	      *err = ret;
3212
	      goto parse_bracket_exp_free_return;
3213
	    }
3214
3215
	  token_len = peek_token_bracket (token, regexp, syntax);
3216
3217
#ifdef _LIBC
3218
	  *err = build_range_exp (sbcset, mbcset, &range_alloc,
3219
				  &start_elem, &end_elem);
3220
#else
3221
# ifdef RE_ENABLE_I18N
1.2.4 by Otavio Salvador
Import upstream version 2.3
3222
	  *err = build_range_exp (syntax, sbcset,
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
3223
				  dfa->mb_cur_max > 1 ? mbcset : NULL,
3224
				  &range_alloc, &start_elem, &end_elem);
3225
# else
1.2.4 by Otavio Salvador
Import upstream version 2.3
3226
	  *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
3227
# endif
3228
#endif /* RE_ENABLE_I18N */
3229
	  if (BE (*err != REG_NOERROR, 0))
3230
	    goto parse_bracket_exp_free_return;
3231
	}
3232
      else
3233
	{
3234
	  switch (start_elem.type)
3235
	    {
3236
	    case SB_CHAR:
3237
	      bitset_set (sbcset, start_elem.opr.ch);
3238
	      break;
3239
#ifdef RE_ENABLE_I18N
3240
	    case MB_CHAR:
3241
	      /* Check whether the array has enough space.  */
3242
	      if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3243
		{
3244
		  wchar_t *new_mbchars;
3245
		  /* Not enough, realloc it.  */
3246
		  /* +1 in case of mbcset->nmbchars is 0.  */
3247
		  mbchar_alloc = 2 * mbcset->nmbchars + 1;
3248
		  /* Use realloc since array is NULL if *alloc == 0.  */
3249
		  new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3250
					    mbchar_alloc);
3251
		  if (BE (new_mbchars == NULL, 0))
3252
		    goto parse_bracket_exp_espace;
3253
		  mbcset->mbchars = new_mbchars;
3254
		}
3255
	      mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3256
	      break;
3257
#endif /* RE_ENABLE_I18N */
3258
	    case EQUIV_CLASS:
3259
	      *err = build_equiv_class (sbcset,
3260
#ifdef RE_ENABLE_I18N
3261
					mbcset, &equiv_class_alloc,
3262
#endif /* RE_ENABLE_I18N */
3263
					start_elem.opr.name);
3264
	      if (BE (*err != REG_NOERROR, 0))
3265
		goto parse_bracket_exp_free_return;
3266
	      break;
3267
	    case COLL_SYM:
3268
	      *err = build_collating_symbol (sbcset,
3269
#ifdef RE_ENABLE_I18N
3270
					     mbcset, &coll_sym_alloc,
3271
#endif /* RE_ENABLE_I18N */
3272
					     start_elem.opr.name);
3273
	      if (BE (*err != REG_NOERROR, 0))
3274
		goto parse_bracket_exp_free_return;
3275
	      break;
3276
	    case CHAR_CLASS:
3277
	      *err = build_charclass (regexp->trans, sbcset,
3278
#ifdef RE_ENABLE_I18N
3279
				      mbcset, &char_class_alloc,
3280
#endif /* RE_ENABLE_I18N */
1.3.2 by Colin Watson
Import upstream version 3.2
3281
				      (const char *) start_elem.opr.name,
3282
				      syntax);
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
3283
	      if (BE (*err != REG_NOERROR, 0))
3284
	       goto parse_bracket_exp_free_return;
3285
	      break;
3286
	    default:
3287
	      assert (0);
3288
	      break;
3289
	    }
3290
	}
3291
      if (BE (token->type == END_OF_RE, 0))
3292
	{
3293
	  *err = REG_EBRACK;
3294
	  goto parse_bracket_exp_free_return;
3295
	}
3296
      if (token->type == OP_CLOSE_BRACKET)
3297
	break;
3298
    }
3299
3300
  re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3301
3302
  /* If it is non-matching list.  */
3303
  if (non_match)
3304
    bitset_not (sbcset);
3305
3306
#ifdef RE_ENABLE_I18N
3307
  /* Ensure only single byte characters are set.  */
3308
  if (dfa->mb_cur_max > 1)
3309
    bitset_mask (sbcset, dfa->sb_char);
3310
3311
  if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3312
      || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3313
						     || mbcset->non_match)))
3314
    {
3315
      bin_tree_t *mbc_tree;
3316
      int sbc_idx;
3317
      /* Build a tree for complex bracket.  */
3318
      dfa->has_mb_node = 1;
3319
      br_token.type = COMPLEX_BRACKET;
3320
      br_token.opr.mbcset = mbcset;
3321
      mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3322
      if (BE (mbc_tree == NULL, 0))
3323
	goto parse_bracket_exp_espace;
3324
      for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3325
	if (sbcset[sbc_idx])
3326
	  break;
3327
      /* If there are no bits set in sbcset, there is no point
3328
	 of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3329
      if (sbc_idx < BITSET_WORDS)
3330
	{
1.2.3 by Colin Watson
Import upstream version 2.2
3331
	  /* Build a tree for simple bracket.  */
3332
	  br_token.type = SIMPLE_BRACKET;
3333
	  br_token.opr.sbcset = sbcset;
3334
	  work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3335
	  if (BE (work_tree == NULL, 0))
3336
	    goto parse_bracket_exp_espace;
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
3337
1.2.3 by Colin Watson
Import upstream version 2.2
3338
	  /* Then join them by ALT node.  */
3339
	  work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3340
	  if (BE (work_tree == NULL, 0))
3341
	    goto parse_bracket_exp_espace;
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
3342
	}
3343
      else
3344
	{
3345
	  re_free (sbcset);
3346
	  work_tree = mbc_tree;
3347
	}
3348
    }
3349
  else
3350
#endif /* not RE_ENABLE_I18N */
3351
    {
3352
#ifdef RE_ENABLE_I18N
3353
      free_charset (mbcset);
3354
#endif
3355
      /* Build a tree for simple bracket.  */
3356
      br_token.type = SIMPLE_BRACKET;
3357
      br_token.opr.sbcset = sbcset;
3358
      work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3359
      if (BE (work_tree == NULL, 0))
1.2.3 by Colin Watson
Import upstream version 2.2
3360
	goto parse_bracket_exp_espace;
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
3361
    }
3362
  return work_tree;
3363
3364
 parse_bracket_exp_espace:
3365
  *err = REG_ESPACE;
3366
 parse_bracket_exp_free_return:
3367
  re_free (sbcset);
3368
#ifdef RE_ENABLE_I18N
3369
  free_charset (mbcset);
3370
#endif /* RE_ENABLE_I18N */
3371
  return NULL;
3372
}
3373
3374
/* Parse an element in the bracket expression.  */
3375
3376
static reg_errcode_t
3377
parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3378
		       re_token_t *token, int token_len, re_dfa_t *dfa,
3379
		       reg_syntax_t syntax, bool accept_hyphen)
3380
{
3381
#ifdef RE_ENABLE_I18N
3382
  int cur_char_size;
3383
  cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3384
  if (cur_char_size > 1)
3385
    {
3386
      elem->type = MB_CHAR;
3387
      elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3388
      re_string_skip_bytes (regexp, cur_char_size);
3389
      return REG_NOERROR;
3390
    }
3391
#endif /* RE_ENABLE_I18N */
3392
  re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3393
  if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3394
      || token->type == OP_OPEN_EQUIV_CLASS)
3395
    return parse_bracket_symbol (elem, regexp, token);
3396
  if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3397
    {
3398
      /* A '-' must only appear as anything but a range indicator before
3399
	 the closing bracket.  Everything else is an error.  */
3400
      re_token_t token2;
3401
      (void) peek_token_bracket (&token2, regexp, syntax);
3402
      if (token2.type != OP_CLOSE_BRACKET)
3403
	/* The actual error value is not standardized since this whole
3404
	   case is undefined.  But ERANGE makes good sense.  */
3405
	return REG_ERANGE;
3406
    }
3407
  elem->type = SB_CHAR;
3408
  elem->opr.ch = token->opr.c;
3409
  return REG_NOERROR;
3410
}
3411
3412
/* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3413
   such as [:<character_class>:], [.<collating_element>.], and
3414
   [=<equivalent_class>=].  */
3415
3416
static reg_errcode_t
3417
parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3418
		      re_token_t *token)
3419
{
3420
  unsigned char ch, delim = token->opr.c;
3421
  int i = 0;
3422
  if (re_string_eoi(regexp))
3423
    return REG_EBRACK;
3424
  for (;; ++i)
3425
    {
3426
      if (i >= BRACKET_NAME_BUF_SIZE)
3427
	return REG_EBRACK;
3428
      if (token->type == OP_OPEN_CHAR_CLASS)
3429
	ch = re_string_fetch_byte_case (regexp);
3430
      else
3431
	ch = re_string_fetch_byte (regexp);
3432
      if (re_string_eoi(regexp))
3433
	return REG_EBRACK;
3434
      if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3435
	break;
3436
      elem->opr.name[i] = ch;
3437
    }
3438
  re_string_skip_bytes (regexp, 1);
3439
  elem->opr.name[i] = '\0';
3440
  switch (token->type)
3441
    {
3442
    case OP_OPEN_COLL_ELEM:
3443
      elem->type = COLL_SYM;
3444
      break;
3445
    case OP_OPEN_EQUIV_CLASS:
3446
      elem->type = EQUIV_CLASS;
3447
      break;
3448
    case OP_OPEN_CHAR_CLASS:
3449
      elem->type = CHAR_CLASS;
3450
      break;
3451
    default:
3452
      break;
3453
    }
3454
  return REG_NOERROR;
3455
}
3456
3457
  /* Helper function for parse_bracket_exp.
3458
     Build the equivalence class which is represented by NAME.
3459
     The result are written to MBCSET and SBCSET.
3460
     EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
1.3.1 by Colin Watson
Import upstream version 3.1
3461
     is a pointer argument since we may update it.  */
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
3462
3463
static reg_errcode_t
3464
#ifdef RE_ENABLE_I18N
3465
build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3466
		   Idx *equiv_class_alloc, const unsigned char *name)
3467
#else /* not RE_ENABLE_I18N */
3468
build_equiv_class (bitset_t sbcset, const unsigned char *name)
3469
#endif /* not RE_ENABLE_I18N */
3470
{
3471
#ifdef _LIBC
3472
  uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3473
  if (nrules != 0)
3474
    {
3475
      const int32_t *table, *indirect;
3476
      const unsigned char *weights, *extra, *cp;
3477
      unsigned char char_buf[2];
3478
      int32_t idx1, idx2;
3479
      unsigned int ch;
3480
      size_t len;
3481
      /* This #include defines a local function!  */
3482
# include <locale/weight.h>
3483
      /* Calculate the index for equivalence class.  */
3484
      cp = name;
3485
      table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3486
      weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3487
					       _NL_COLLATE_WEIGHTMB);
3488
      extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3489
						   _NL_COLLATE_EXTRAMB);
3490
      indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3491
						_NL_COLLATE_INDIRECTMB);
1.3.1 by Colin Watson
Import upstream version 3.1
3492
      idx1 = findidx (&cp, -1);
3493
      if (BE (idx1 == 0 || *cp != '\0', 0))
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
3494
	/* This isn't a valid character.  */
3495
	return REG_ECOLLATE;
3496
1.3.1 by Colin Watson
Import upstream version 3.1
3497
      /* Build single byte matching table for this equivalence class.  */
1.2.3 by Colin Watson
Import upstream version 2.2
3498
      len = weights[idx1 & 0xffffff];
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
3499
      for (ch = 0; ch < SBC_MAX; ++ch)
3500
	{
3501
	  char_buf[0] = ch;
3502
	  cp = char_buf;
1.3.1 by Colin Watson
Import upstream version 3.1
3503
	  idx2 = findidx (&cp, 1);
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
3504
/*
3505
	  idx2 = table[ch];
3506
*/
3507
	  if (idx2 == 0)
3508
	    /* This isn't a valid character.  */
3509
	    continue;
1.2.3 by Colin Watson
Import upstream version 2.2
3510
	  /* Compare only if the length matches and the collation rule
3511
	     index is the same.  */
3512
	  if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
3513
	    {
3514
	      int cnt = 0;
1.2.3 by Colin Watson
Import upstream version 2.2
3515
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
3516
	      while (cnt <= len &&
1.2.3 by Colin Watson
Import upstream version 2.2
3517
		     weights[(idx1 & 0xffffff) + 1 + cnt]
3518
		     == weights[(idx2 & 0xffffff) + 1 + cnt])
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
3519
		++cnt;
3520
3521
	      if (cnt > len)
3522
		bitset_set (sbcset, ch);
3523
	    }
3524
	}
3525
      /* Check whether the array has enough space.  */
3526
      if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3527
	{
3528
	  /* Not enough, realloc it.  */
3529
	  /* +1 in case of mbcset->nequiv_classes is 0.  */
3530
	  Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3531
	  /* Use realloc since the array is NULL if *alloc == 0.  */
3532
	  int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3533
						   int32_t,
3534
						   new_equiv_class_alloc);
3535
	  if (BE (new_equiv_classes == NULL, 0))
3536
	    return REG_ESPACE;
3537
	  mbcset->equiv_classes = new_equiv_classes;
3538
	  *equiv_class_alloc = new_equiv_class_alloc;
3539
	}
3540
      mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3541
    }
3542
  else
3543
#endif /* _LIBC */
3544
    {
3545
      if (BE (strlen ((const char *) name) != 1, 0))
3546
	return REG_ECOLLATE;
3547
      bitset_set (sbcset, *name);
3548
    }
3549
  return REG_NOERROR;
3550
}
3551
3552
  /* Helper function for parse_bracket_exp.
3553
     Build the character class which is represented by NAME.
3554
     The result are written to MBCSET and SBCSET.
3555
     CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
1.3.1 by Colin Watson
Import upstream version 3.1
3556
     is a pointer argument since we may update it.  */
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
3557
3558
static reg_errcode_t
3559
#ifdef RE_ENABLE_I18N
3560
build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3561
		 re_charset_t *mbcset, Idx *char_class_alloc,
1.3.2 by Colin Watson
Import upstream version 3.2
3562
		 const char *class_name, reg_syntax_t syntax)
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
3563
#else /* not RE_ENABLE_I18N */
3564
build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
1.3.2 by Colin Watson
Import upstream version 3.2
3565
		 const char *class_name, reg_syntax_t syntax)
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
3566
#endif /* not RE_ENABLE_I18N */
3567
{
3568
  int i;
1.3.2 by Colin Watson
Import upstream version 3.2
3569
  const char *name = class_name;
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
3570
3571
  /* In case of REG_ICASE "upper" and "lower" match the both of
3572
     upper and lower cases.  */
3573
  if ((syntax & RE_ICASE)
3574
      && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3575
    name = "alpha";
3576
3577
#ifdef RE_ENABLE_I18N
3578
  /* Check the space of the arrays.  */
3579
  if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3580
    {
3581
      /* Not enough, realloc it.  */
3582
      /* +1 in case of mbcset->nchar_classes is 0.  */
3583
      Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3584
      /* Use realloc since array is NULL if *alloc == 0.  */
3585
      wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3586
					       new_char_class_alloc);
3587
      if (BE (new_char_classes == NULL, 0))
3588
	return REG_ESPACE;
3589
      mbcset->char_classes = new_char_classes;
3590
      *char_class_alloc = new_char_class_alloc;
3591
    }
3592
  mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3593
#endif /* RE_ENABLE_I18N */
3594
3595
#define BUILD_CHARCLASS_LOOP(ctype_func)	\
3596
  do {						\
3597
    if (BE (trans != NULL, 0))			\
3598
      {						\
3599
	for (i = 0; i < SBC_MAX; ++i)		\
3600
	  if (ctype_func (i))			\
3601
	    bitset_set (sbcset, trans[i]);	\
3602
      }						\
3603
    else					\
3604
      {						\
3605
	for (i = 0; i < SBC_MAX; ++i)		\
3606
	  if (ctype_func (i))			\
3607
	    bitset_set (sbcset, i);		\
3608
      }						\
3609
  } while (0)
3610
3611
  if (strcmp (name, "alnum") == 0)
3612
    BUILD_CHARCLASS_LOOP (isalnum);
3613
  else if (strcmp (name, "cntrl") == 0)
3614
    BUILD_CHARCLASS_LOOP (iscntrl);
3615
  else if (strcmp (name, "lower") == 0)
3616
    BUILD_CHARCLASS_LOOP (islower);
3617
  else if (strcmp (name, "space") == 0)
3618
    BUILD_CHARCLASS_LOOP (isspace);
3619
  else if (strcmp (name, "alpha") == 0)
3620
    BUILD_CHARCLASS_LOOP (isalpha);
3621
  else if (strcmp (name, "digit") == 0)
3622
    BUILD_CHARCLASS_LOOP (isdigit);
3623
  else if (strcmp (name, "print") == 0)
3624
    BUILD_CHARCLASS_LOOP (isprint);
3625
  else if (strcmp (name, "upper") == 0)
3626
    BUILD_CHARCLASS_LOOP (isupper);
3627
  else if (strcmp (name, "blank") == 0)
3628
    BUILD_CHARCLASS_LOOP (isblank);
3629
  else if (strcmp (name, "graph") == 0)
3630
    BUILD_CHARCLASS_LOOP (isgraph);
3631
  else if (strcmp (name, "punct") == 0)
3632
    BUILD_CHARCLASS_LOOP (ispunct);
3633
  else if (strcmp (name, "xdigit") == 0)
3634
    BUILD_CHARCLASS_LOOP (isxdigit);
3635
  else
3636
    return REG_ECTYPE;
3637
3638
  return REG_NOERROR;
3639
}
3640
3641
static bin_tree_t *
3642
build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
1.3.2 by Colin Watson
Import upstream version 3.2
3643
		    const char *class_name,
3644
		    const char *extra, bool non_match,
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
3645
		    reg_errcode_t *err)
3646
{
3647
  re_bitset_ptr_t sbcset;
3648
#ifdef RE_ENABLE_I18N
3649
  re_charset_t *mbcset;
3650
  Idx alloc = 0;
3651
#endif /* not RE_ENABLE_I18N */
3652
  reg_errcode_t ret;
3653
  re_token_t br_token;
3654
  bin_tree_t *tree;
3655
3656
  sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3657
#ifdef RE_ENABLE_I18N
3658
  mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3659
#endif /* RE_ENABLE_I18N */
3660
3661
#ifdef RE_ENABLE_I18N
3662
  if (BE (sbcset == NULL || mbcset == NULL, 0))
3663
#else /* not RE_ENABLE_I18N */
3664
  if (BE (sbcset == NULL, 0))
3665
#endif /* not RE_ENABLE_I18N */
3666
    {
3667
      *err = REG_ESPACE;
3668
      return NULL;
3669
    }
3670
3671
  if (non_match)
3672
    {
3673
#ifdef RE_ENABLE_I18N
3674
      mbcset->non_match = 1;
3675
#endif /* not RE_ENABLE_I18N */
3676
    }
3677
3678
  /* We don't care the syntax in this case.  */
3679
  ret = build_charclass (trans, sbcset,
3680
#ifdef RE_ENABLE_I18N
3681
			 mbcset, &alloc,
3682
#endif /* RE_ENABLE_I18N */
3683
			 class_name, 0);
3684
3685
  if (BE (ret != REG_NOERROR, 0))
3686
    {
3687
      re_free (sbcset);
3688
#ifdef RE_ENABLE_I18N
3689
      free_charset (mbcset);
3690
#endif /* RE_ENABLE_I18N */
3691
      *err = ret;
3692
      return NULL;
3693
    }
3694
  /* \w match '_' also.  */
3695
  for (; *extra; extra++)
3696
    bitset_set (sbcset, *extra);
3697
3698
  /* If it is non-matching list.  */
3699
  if (non_match)
3700
    bitset_not (sbcset);
3701
3702
#ifdef RE_ENABLE_I18N
3703
  /* Ensure only single byte characters are set.  */
3704
  if (dfa->mb_cur_max > 1)
3705
    bitset_mask (sbcset, dfa->sb_char);
3706
#endif
3707
3708
  /* Build a tree for simple bracket.  */
3709
  br_token.type = SIMPLE_BRACKET;
3710
  br_token.opr.sbcset = sbcset;
3711
  tree = create_token_tree (dfa, NULL, NULL, &br_token);
3712
  if (BE (tree == NULL, 0))
3713
    goto build_word_op_espace;
3714
3715
#ifdef RE_ENABLE_I18N
3716
  if (dfa->mb_cur_max > 1)
3717
    {
3718
      bin_tree_t *mbc_tree;
3719
      /* Build a tree for complex bracket.  */
3720
      br_token.type = COMPLEX_BRACKET;
3721
      br_token.opr.mbcset = mbcset;
3722
      dfa->has_mb_node = 1;
3723
      mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3724
      if (BE (mbc_tree == NULL, 0))
3725
	goto build_word_op_espace;
3726
      /* Then join them by ALT node.  */
3727
      tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3728
      if (BE (mbc_tree != NULL, 1))
3729
	return tree;
3730
    }
3731
  else
3732
    {
3733
      free_charset (mbcset);
3734
      return tree;
3735
    }
3736
#else /* not RE_ENABLE_I18N */
3737
  return tree;
3738
#endif /* not RE_ENABLE_I18N */
3739
3740
 build_word_op_espace:
3741
  re_free (sbcset);
3742
#ifdef RE_ENABLE_I18N
3743
  free_charset (mbcset);
3744
#endif /* RE_ENABLE_I18N */
3745
  *err = REG_ESPACE;
3746
  return NULL;
3747
}
3748
3749
/* This is intended for the expressions like "a{1,3}".
1.3.1 by Colin Watson
Import upstream version 3.1
3750
   Fetch a number from 'input', and return the number.
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
3751
   Return REG_MISSING if the number field is empty like "{,1}".
1.3.2 by Colin Watson
Import upstream version 3.2
3752
   Return RE_DUP_MAX + 1 if the number field is too large.
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
3753
   Return REG_ERROR if an error occurred.  */
3754
3755
static Idx
3756
fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3757
{
3758
  Idx num = REG_MISSING;
3759
  unsigned char c;
3760
  while (1)
3761
    {
3762
      fetch_token (token, input, syntax);
3763
      c = token->opr.c;
3764
      if (BE (token->type == END_OF_RE, 0))
3765
	return REG_ERROR;
3766
      if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3767
	break;
3768
      num = ((token->type != CHARACTER || c < '0' || '9' < c
3769
	      || num == REG_ERROR)
3770
	     ? REG_ERROR
1.3.2 by Colin Watson
Import upstream version 3.2
3771
	     : num == REG_MISSING
3772
	     ? c - '0'
3773
	     : MIN (RE_DUP_MAX + 1, num * 10 + c - '0'));
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
3774
    }
3775
  return num;
3776
}
3777

3778
#ifdef RE_ENABLE_I18N
3779
static void
3780
free_charset (re_charset_t *cset)
3781
{
3782
  re_free (cset->mbchars);
3783
# ifdef _LIBC
3784
  re_free (cset->coll_syms);
3785
  re_free (cset->equiv_classes);
3786
  re_free (cset->range_starts);
3787
  re_free (cset->range_ends);
3788
# endif
3789
  re_free (cset->char_classes);
3790
  re_free (cset);
3791
}
3792
#endif /* RE_ENABLE_I18N */
3793

3794
/* Functions for binary tree operation.  */
3795
3796
/* Create a tree node.  */
3797
3798
static bin_tree_t *
3799
create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3800
	     re_token_type_t type)
3801
{
3802
  re_token_t t;
3803
  t.type = type;
3804
  return create_token_tree (dfa, left, right, &t);
3805
}
3806
3807
static bin_tree_t *
3808
create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3809
		   const re_token_t *token)
3810
{
3811
  bin_tree_t *tree;
3812
  if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3813
    {
3814
      bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3815
3816
      if (storage == NULL)
3817
	return NULL;
3818
      storage->next = dfa->str_tree_storage;
3819
      dfa->str_tree_storage = storage;
3820
      dfa->str_tree_storage_idx = 0;
3821
    }
3822
  tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3823
3824
  tree->parent = NULL;
3825
  tree->left = left;
3826
  tree->right = right;
3827
  tree->token = *token;
3828
  tree->token.duplicated = 0;
3829
  tree->token.opt_subexp = 0;
3830
  tree->first = NULL;
3831
  tree->next = NULL;
3832
  tree->node_idx = REG_MISSING;
3833
3834
  if (left != NULL)
3835
    left->parent = tree;
3836
  if (right != NULL)
3837
    right->parent = tree;
3838
  return tree;
3839
}
3840
3841
/* Mark the tree SRC as an optional subexpression.
3842
   To be called from preorder or postorder.  */
3843
3844
static reg_errcode_t
3845
mark_opt_subexp (void *extra, bin_tree_t *node)
3846
{
1.3.2 by Colin Watson
Import upstream version 3.2
3847
  Idx idx = (uintptr_t) extra;
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
3848
  if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3849
    node->token.opt_subexp = 1;
3850
3851
  return REG_NOERROR;
3852
}
3853
3854
/* Free the allocated memory inside NODE. */
3855
3856
static void
3857
free_token (re_token_t *node)
3858
{
3859
#ifdef RE_ENABLE_I18N
3860
  if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3861
    free_charset (node->opr.mbcset);
3862
  else
3863
#endif /* RE_ENABLE_I18N */
3864
    if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3865
      re_free (node->opr.sbcset);
3866
}
3867
3868
/* Worker function for tree walking.  Free the allocated memory inside NODE
3869
   and its children. */
3870
3871
static reg_errcode_t
3872
free_tree (void *extra, bin_tree_t *node)
3873
{
3874
  free_token (&node->token);
3875
  return REG_NOERROR;
3876
}
3877
3878
3879
/* Duplicate the node SRC, and return new node.  This is a preorder
3880
   visit similar to the one implemented by the generic visitor, but
3881
   we need more infrastructure to maintain two parallel trees --- so,
3882
   it's easier to duplicate.  */
3883
3884
static bin_tree_t *
3885
duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3886
{
3887
  const bin_tree_t *node;
3888
  bin_tree_t *dup_root;
3889
  bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3890
3891
  for (node = root; ; )
3892
    {
3893
      /* Create a new tree and link it back to the current parent.  */
3894
      *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3895
      if (*p_new == NULL)
3896
	return NULL;
3897
      (*p_new)->parent = dup_node;
3898
      (*p_new)->token.duplicated = 1;
3899
      dup_node = *p_new;
3900
3901
      /* Go to the left node, or up and to the right.  */
3902
      if (node->left)
3903
	{
3904
	  node = node->left;
3905
	  p_new = &dup_node->left;
3906
	}
3907
      else
3908
	{
3909
	  const bin_tree_t *prev = NULL;
3910
	  while (node->right == prev || node->right == NULL)
3911
	    {
3912
	      prev = node;
3913
	      node = node->parent;
3914
	      dup_node = dup_node->parent;
3915
	      if (!node)
1.2.3 by Colin Watson
Import upstream version 2.2
3916
		return dup_root;
1.1.4 by Otavio Salvador
Import upstream version 1.8.8.git.2009.06.03
3917
	    }
3918
	  node = node->right;
3919
	  p_new = &dup_node->right;
3920
	}
3921
    }
3922
}