1
/* Extended regular expression matching and search library,
3
(Implements POSIX draft P1003.2/D11.2, except for some of the
4
internationalization features.)
5
Copyright (C) 1993, 94, 95, 96, 97, 98, 99 Free Software Foundation, Inc.
7
The GNU C Library is free software; you can redistribute it and/or
8
modify it under the terms of the GNU Library General Public License as
9
published by the Free Software Foundation; either version 2 of the
10
License, or (at your option) any later version.
12
The GNU C Library is distributed in the hope that it will be useful,
13
but WITHOUT ANY WARRANTY; without even the implied warranty of
14
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
Library General Public License for more details.
17
You should have received a copy of the GNU Library General Public
18
License along with the GNU C Library; see the file COPYING.LIB. If not,
19
write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20
Boston, MA 02111-1307, USA. */
22
/* AIX requires this to be the first thing in the file. */
23
#if defined _AIX && !defined REGEX_MALLOC
35
# if defined __GNUC__ || (defined __STDC__ && __STDC__)
36
# define PARAMS(args) args
38
# define PARAMS(args) ()
40
#endif /* Not PARAMS. */
42
#if defined STDC_HEADERS && !defined emacs
45
/* We need this for `regex.h', and perhaps for the Emacs include files. */
46
# include <sys/types.h>
49
#define WIDE_CHAR_SUPPORT (HAVE_WCTYPE_H && HAVE_WCHAR_H && HAVE_BTOWC)
51
/* For platform which support the ISO C amendement 1 functionality we
52
support user defined character classes. */
53
#if defined _LIBC || WIDE_CHAR_SUPPORT
54
/* Solaris 2.5 has a bug: <wchar.h> must be included before <wctype.h>. */
60
/* We have to keep the namespace clean. */
61
# define regfree(preg) __regfree (preg)
62
# define regexec(pr, st, nm, pm, ef) __regexec (pr, st, nm, pm, ef)
63
# define regcomp(preg, pattern, cflags) __regcomp (preg, pattern, cflags)
64
# define regerror(errcode, preg, errbuf, errbuf_size) \
65
__regerror(errcode, preg, errbuf, errbuf_size)
66
# define re_set_registers(bu, re, nu, st, en) \
67
__re_set_registers (bu, re, nu, st, en)
68
# define re_match_2(bufp, string1, size1, string2, size2, pos, regs, stop) \
69
__re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
70
# define re_match(bufp, string, size, pos, regs) \
71
__re_match (bufp, string, size, pos, regs)
72
# define re_search(bufp, string, size, startpos, range, regs) \
73
__re_search (bufp, string, size, startpos, range, regs)
74
# define re_compile_pattern(pattern, length, bufp) \
75
__re_compile_pattern (pattern, length, bufp)
76
# define re_set_syntax(syntax) __re_set_syntax (syntax)
77
# define re_search_2(bufp, st1, s1, st2, s2, startpos, range, regs, stop) \
78
__re_search_2 (bufp, st1, s1, st2, s2, startpos, range, regs, stop)
79
# define re_compile_fastmap(bufp) __re_compile_fastmap (bufp)
84
/* This is for other GNU distributions with internationalized messages. */
85
#if HAVE_LIBINTL_H || defined _LIBC
88
# define gettext(msgid) (msgid)
92
/* This define is so xgettext can find the internationalizable
94
# define gettext_noop(String) String
97
/* The `emacs' switch turns on certain matching commands
98
that make sense only in Emacs. */
105
#else /* not emacs */
107
/* If we are not linking with Emacs proper,
108
we can't use the relocating allocator
109
even if config.h says that we can. */
112
# if defined STDC_HEADERS || defined _LIBC
121
/* When used in Emacs's lib-src, we need to get bzero and bcopy somehow.
122
If nothing else has been done, use the method below. */
123
# ifdef INHIBIT_STRING_HEADER
124
# if !(defined HAVE_BZERO && defined HAVE_BCOPY)
125
# if !defined bzero && !defined bcopy
126
# undef INHIBIT_STRING_HEADER
131
/* This is the normal way of making sure we have a bcopy and a bzero.
132
This is used in most programs--a few other programs avoid this
133
by defining INHIBIT_STRING_HEADER. */
134
# ifndef INHIBIT_STRING_HEADER
135
# if defined HAVE_STRING_H || defined STDC_HEADERS || defined _LIBC
139
# define bzero(s, n) (memset (s, '\0', n), (s))
141
# define bzero(s, n) __bzero (s, n)
145
# include <strings.h>
147
# define memcmp(s1, s2, n) bcmp (s1, s2, n)
150
# define memcpy(d, s, n) (bcopy (s, d, n), (d))
155
/* Define the syntax stuff for \<, \>, etc. */
157
/* This must be nonzero for the wordchar and notwordchar pattern
158
commands in re_match_2. */
163
# ifdef SWITCH_ENUM_BUG
164
# define SWITCH_ENUM_CAST(x) ((int)(x))
166
# define SWITCH_ENUM_CAST(x) (x)
169
/* How many characters in the character set. */
170
# define CHAR_SET_SIZE 256
174
extern char *re_syntax_table;
176
# else /* not SYNTAX_TABLE */
178
static char re_syntax_table[CHAR_SET_SIZE];
189
bzero (re_syntax_table, sizeof re_syntax_table);
191
for (c = 'a'; c <= 'z'; c++)
192
re_syntax_table[c] = Sword;
194
for (c = 'A'; c <= 'Z'; c++)
195
re_syntax_table[c] = Sword;
197
for (c = '0'; c <= '9'; c++)
198
re_syntax_table[c] = Sword;
200
re_syntax_table['_'] = Sword;
205
# endif /* not SYNTAX_TABLE */
207
# define SYNTAX(c) re_syntax_table[c]
209
#endif /* not emacs */
211
/* Get the interface, including the syntax bits. */
214
/* isalpha etc. are used for the character classes. */
217
/* Jim Meyering writes:
219
"... Some ctype macros are valid only for character codes that
220
isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when
221
using /bin/cc or gcc but without giving an ansi option). So, all
222
ctype uses should be through macros like ISPRINT... If
223
STDC_HEADERS is defined, then autoconf has verified that the ctype
224
macros don't need to be guarded with references to isascii. ...
225
Defining isascii to 1 should let any compiler worth its salt
226
eliminate the && through constant folding."
227
Solaris defines some of these symbols so we must undefine them first. */
230
#if defined STDC_HEADERS || (!defined isascii && !defined HAVE_ISASCII)
231
# define ISASCII(c) 1
233
# define ISASCII(c) isascii(c)
237
# define ISBLANK(c) (ISASCII (c) && isblank (c))
239
# define ISBLANK(c) ((c) == ' ' || (c) == '\t')
242
# define ISGRAPH(c) (ISASCII (c) && isgraph (c))
244
# define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c))
248
#define ISPRINT(c) (ISASCII (c) && isprint (c))
249
#define ISDIGIT(c) (ISASCII (c) && isdigit (c))
250
#define ISALNUM(c) (ISASCII (c) && isalnum (c))
251
#define ISALPHA(c) (ISASCII (c) && isalpha (c))
252
#define ISCNTRL(c) (ISASCII (c) && iscntrl (c))
253
#define ISLOWER(c) (ISASCII (c) && islower (c))
254
#define ISPUNCT(c) (ISASCII (c) && ispunct (c))
255
#define ISSPACE(c) (ISASCII (c) && isspace (c))
256
#define ISUPPER(c) (ISASCII (c) && isupper (c))
257
#define ISXDIGIT(c) (ISASCII (c) && isxdigit (c))
260
# define TOLOWER(c) _tolower(c)
262
# define TOLOWER(c) tolower(c)
266
# define NULL (void *)0
269
/* We remove any previous definition of `SIGN_EXTEND_CHAR',
270
since ours (we hope) works properly with all combinations of
271
machines, compilers, `char' and `unsigned char' argument types.
272
(Per Bothner suggested the basic approach.) */
273
#undef SIGN_EXTEND_CHAR
275
# define SIGN_EXTEND_CHAR(c) ((signed char) (c))
276
#else /* not __STDC__ */
277
/* As in Harbison and Steele. */
278
# define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128)
281
/* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we
282
use `alloca' instead of `malloc'. This is because using malloc in
283
re_search* or re_match* could cause memory leaks when C-g is used in
284
Emacs; also, malloc is slower and causes storage fragmentation. On
285
the other hand, malloc is more portable, and easier to debug.
287
Because we sometimes use alloca, some routines have to be macros,
288
not functions -- `alloca'-allocated space disappears at the end of the
289
function it is called in. */
293
# define REGEX_ALLOCATE malloc
294
# define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize)
295
# define REGEX_FREE free
297
#else /* not REGEX_MALLOC */
299
/* Emacs already defines alloca, sometimes. */
302
/* Make alloca work the best possible way. */
304
# define alloca __builtin_alloca
305
# else /* not __GNUC__ */
308
# endif /* HAVE_ALLOCA_H */
309
# endif /* not __GNUC__ */
311
# endif /* not alloca */
313
# define REGEX_ALLOCATE alloca
315
/* Assumes a `char *destination' variable. */
316
# define REGEX_REALLOCATE(source, osize, nsize) \
317
(destination = (char *) alloca (nsize), \
318
memcpy (destination, source, osize))
320
/* No need to do anything to free, after alloca. */
321
# define REGEX_FREE(arg) ((void)0) /* Do nothing! But inhibit gcc warning. */
323
#endif /* not REGEX_MALLOC */
325
/* Define how to allocate the failure stack. */
327
#if defined REL_ALLOC && defined REGEX_MALLOC
329
# define REGEX_ALLOCATE_STACK(size) \
330
r_alloc (&failure_stack_ptr, (size))
331
# define REGEX_REALLOCATE_STACK(source, osize, nsize) \
332
r_re_alloc (&failure_stack_ptr, (nsize))
333
# define REGEX_FREE_STACK(ptr) \
334
r_alloc_free (&failure_stack_ptr)
336
#else /* not using relocating allocator */
340
# define REGEX_ALLOCATE_STACK malloc
341
# define REGEX_REALLOCATE_STACK(source, osize, nsize) realloc (source, nsize)
342
# define REGEX_FREE_STACK free
344
# else /* not REGEX_MALLOC */
346
# define REGEX_ALLOCATE_STACK alloca
348
# define REGEX_REALLOCATE_STACK(source, osize, nsize) \
349
REGEX_REALLOCATE (source, osize, nsize)
350
/* No need to explicitly free anything. */
351
# define REGEX_FREE_STACK(arg)
353
# endif /* not REGEX_MALLOC */
354
#endif /* not using relocating allocator */
357
/* True if `size1' is non-NULL and PTR is pointing anywhere inside
358
`string1' or just past its end. This works if PTR is NULL, which is
360
#define FIRST_STRING_P(ptr) \
361
(size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
363
/* (Re)Allocate N items of type T using malloc, or fail. */
364
#define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t)))
365
#define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t)))
366
#define RETALLOC_IF(addr, n, t) \
367
if (addr) RETALLOC((addr), (n), t); else (addr) = TALLOC ((n), t)
368
#define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t)))
370
#define BYTEWIDTH 8 /* In bits. */
372
#define STREQ(s1, s2) ((strcmp (s1, s2) == 0))
376
#define MAX(a, b) ((a) > (b) ? (a) : (b))
377
#define MIN(a, b) ((a) < (b) ? (a) : (b))
379
typedef char boolean;
383
static int re_match_2_internal PARAMS ((struct re_pattern_buffer *bufp,
384
const char *string1, int size1,
385
const char *string2, int size2,
387
struct re_registers *regs,
390
/* These are the command codes that appear in compiled regular
391
expressions. Some opcodes are followed by argument bytes. A
392
command code can specify any interpretation whatsoever for its
393
arguments. Zero bytes may appear in the compiled regular expression. */
399
/* Succeed right away--no more backtracking. */
402
/* Followed by one byte giving n, then by n literal bytes. */
405
/* Matches any (more or less) character. */
408
/* Matches any one char belonging to specified set. First
409
following byte is number of bitmap bytes. Then come bytes
410
for a bitmap saying which chars are in. Bits in each byte
411
are ordered low-bit-first. A character is in the set if its
412
bit is 1. A character too large to have a bit in the map is
413
automatically not in the set. */
416
/* Same parameters as charset, but match any character that is
417
not one of those specified. */
420
/* Start remembering the text that is matched, for storing in a
421
register. Followed by one byte with the register number, in
422
the range 0 to one less than the pattern buffer's re_nsub
423
field. Then followed by one byte with the number of groups
424
inner to this one. (This last has to be part of the
425
start_memory only because we need it in the on_failure_jump
429
/* Stop remembering the text that is matched and store it in a
430
memory register. Followed by one byte with the register
431
number, in the range 0 to one less than `re_nsub' in the
432
pattern buffer, and one byte with the number of inner groups,
433
just like `start_memory'. (We need the number of inner
434
groups here because we don't have any easy way of finding the
435
corresponding start_memory when we're at a stop_memory.) */
438
/* Match a duplicate of something remembered. Followed by one
439
byte containing the register number. */
442
/* Fail unless at beginning of line. */
445
/* Fail unless at end of line. */
448
/* Succeeds if at beginning of buffer (if emacs) or at beginning
449
of string to be matched (if not). */
452
/* Analogously, for end of buffer/string. */
455
/* Followed by two byte relative address to which to jump. */
458
/* Same as jump, but marks the end of an alternative. */
461
/* Followed by two-byte relative address of place to resume at
462
in case of failure. */
465
/* Like on_failure_jump, but pushes a placeholder instead of the
466
current string position when executed. */
467
on_failure_keep_string_jump,
469
/* Throw away latest failure point and then jump to following
470
two-byte relative address. */
473
/* Change to pop_failure_jump if know won't have to backtrack to
474
match; otherwise change to jump. This is used to jump
475
back to the beginning of a repeat. If what follows this jump
476
clearly won't match what the repeat does, such that we can be
477
sure that there is no use backtracking out of repetitions
478
already matched, then we change it to a pop_failure_jump.
479
Followed by two-byte address. */
482
/* Jump to following two-byte address, and push a dummy failure
483
point. This failure point will be thrown away if an attempt
484
is made to use it for a failure. A `+' construct makes this
485
before the first repeat. Also used as an intermediary kind
486
of jump when compiling an alternative. */
489
/* Push a dummy failure point and continue. Used at the end of
493
/* Followed by two-byte relative address and two-byte number n.
494
After matching N times, jump to the address upon failure. */
497
/* Followed by two-byte relative address, and two-byte number n.
498
Jump to the address N times, then fail. */
501
/* Set the following two-byte relative address to the
502
subsequent two-byte number. The address *includes* the two
506
wordchar, /* Matches any word-constituent character. */
507
notwordchar, /* Matches any char that is not a word-constituent. */
509
wordbeg, /* Succeeds if at word beginning. */
510
wordend, /* Succeeds if at word end. */
512
wordbound, /* Succeeds if at a word boundary. */
513
notwordbound /* Succeeds if not at a word boundary. */
516
,before_dot, /* Succeeds if before point. */
517
at_dot, /* Succeeds if at point. */
518
after_dot, /* Succeeds if after point. */
520
/* Matches any character whose syntax is specified. Followed by
521
a byte which contains a syntax code, e.g., Sword. */
524
/* Matches any character whose syntax is not that specified. */
529
/* Common operations on the compiled pattern. */
531
/* Store NUMBER in two contiguous bytes starting at DESTINATION. */
533
#define STORE_NUMBER(destination, number) \
535
(destination)[0] = (number) & 0377; \
536
(destination)[1] = (number) >> 8; \
539
/* Same as STORE_NUMBER, except increment DESTINATION to
540
the byte after where the number is stored. Therefore, DESTINATION
541
must be an lvalue. */
543
#define STORE_NUMBER_AND_INCR(destination, number) \
545
STORE_NUMBER (destination, number); \
546
(destination) += 2; \
549
/* Put into DESTINATION a number stored in two contiguous bytes starting
552
#define EXTRACT_NUMBER(destination, source) \
554
(destination) = *(source) & 0377; \
555
(destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8; \
559
static void extract_number _RE_ARGS ((int *dest, unsigned char *source));
561
extract_number (dest, source)
563
unsigned char *source;
565
int temp = SIGN_EXTEND_CHAR (*(source + 1));
566
*dest = *source & 0377;
570
# ifndef EXTRACT_MACROS /* To debug the macros. */
571
# undef EXTRACT_NUMBER
572
# define EXTRACT_NUMBER(dest, src) extract_number (&dest, src)
573
# endif /* not EXTRACT_MACROS */
577
/* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
578
SOURCE must be an lvalue. */
580
#define EXTRACT_NUMBER_AND_INCR(destination, source) \
582
EXTRACT_NUMBER (destination, source); \
587
static void extract_number_and_incr _RE_ARGS ((int *destination,
588
unsigned char **source));
590
extract_number_and_incr (destination, source)
592
unsigned char **source;
594
extract_number (destination, *source);
598
# ifndef EXTRACT_MACROS
599
# undef EXTRACT_NUMBER_AND_INCR
600
# define EXTRACT_NUMBER_AND_INCR(dest, src) \
601
extract_number_and_incr (&dest, &src)
602
# endif /* not EXTRACT_MACROS */
606
/* If DEBUG is defined, Regex prints many voluminous messages about what
607
it is doing (if the variable `debug' is nonzero). If linked with the
608
main program in `iregex.c', you can enter patterns and strings
609
interactively. And if linked with the main program in `main.c' and
610
the other test files, you can run the already-written tests. */
614
/* We use standard I/O for debugging. */
617
/* It is useful to test things that ``must'' be true when debugging. */
620
static int debug = 0;
622
# define DEBUG_STATEMENT(e) e
623
# define DEBUG_PRINT1(x) if (debug) printf (x)
624
# define DEBUG_PRINT2(x1, x2) if (debug) printf (x1, x2)
625
# define DEBUG_PRINT3(x1, x2, x3) if (debug) printf (x1, x2, x3)
626
# define DEBUG_PRINT4(x1, x2, x3, x4) if (debug) printf (x1, x2, x3, x4)
627
# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \
628
if (debug) print_partial_compiled_pattern (s, e)
629
# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
630
if (debug) print_double_string (w, s1, sz1, s2, sz2)
633
/* Print the fastmap in human-readable form. */
636
print_fastmap (fastmap)
639
unsigned was_a_range = 0;
642
while (i < (1 << BYTEWIDTH))
648
while (i < (1 << BYTEWIDTH) && fastmap[i])
664
/* Print a compiled pattern string in human-readable form, starting at
665
the START pointer into it and ending just before the pointer END. */
668
print_partial_compiled_pattern (start, end)
669
unsigned char *start;
674
unsigned char *p = start;
675
unsigned char *pend = end;
683
/* Loop over pattern commands. */
686
printf ("%d:\t", p - start);
688
switch ((re_opcode_t) *p++)
696
printf ("/exactn/%d", mcnt);
707
printf ("/start_memory/%d/%d", mcnt, *p++);
712
printf ("/stop_memory/%d/%d", mcnt, *p++);
716
printf ("/duplicate/%d", *p++);
726
register int c, last = -100;
727
register int in_range = 0;
729
printf ("/charset [%s",
730
(re_opcode_t) *(p - 1) == charset_not ? "^" : "");
732
assert (p + *p < pend);
734
for (c = 0; c < 256; c++)
736
&& (p[1 + (c/8)] & (1 << (c % 8))))
738
/* Are we starting a range? */
739
if (last + 1 == c && ! in_range)
744
/* Have we broken a range? */
745
else if (last + 1 != c && in_range)
774
case on_failure_jump:
775
extract_number_and_incr (&mcnt, &p);
776
printf ("/on_failure_jump to %d", p + mcnt - start);
779
case on_failure_keep_string_jump:
780
extract_number_and_incr (&mcnt, &p);
781
printf ("/on_failure_keep_string_jump to %d", p + mcnt - start);
784
case dummy_failure_jump:
785
extract_number_and_incr (&mcnt, &p);
786
printf ("/dummy_failure_jump to %d", p + mcnt - start);
789
case push_dummy_failure:
790
printf ("/push_dummy_failure");
794
extract_number_and_incr (&mcnt, &p);
795
printf ("/maybe_pop_jump to %d", p + mcnt - start);
798
case pop_failure_jump:
799
extract_number_and_incr (&mcnt, &p);
800
printf ("/pop_failure_jump to %d", p + mcnt - start);
804
extract_number_and_incr (&mcnt, &p);
805
printf ("/jump_past_alt to %d", p + mcnt - start);
809
extract_number_and_incr (&mcnt, &p);
810
printf ("/jump to %d", p + mcnt - start);
814
extract_number_and_incr (&mcnt, &p);
816
extract_number_and_incr (&mcnt2, &p);
817
printf ("/succeed_n to %d, %d times", p1 - start, mcnt2);
821
extract_number_and_incr (&mcnt, &p);
823
extract_number_and_incr (&mcnt2, &p);
824
printf ("/jump_n to %d, %d times", p1 - start, mcnt2);
828
extract_number_and_incr (&mcnt, &p);
830
extract_number_and_incr (&mcnt2, &p);
831
printf ("/set_number_at location %d to %d", p1 - start, mcnt2);
835
printf ("/wordbound");
839
printf ("/notwordbound");
851
printf ("/before_dot");
859
printf ("/after_dot");
863
printf ("/syntaxspec");
865
printf ("/%d", mcnt);
869
printf ("/notsyntaxspec");
871
printf ("/%d", mcnt);
876
printf ("/wordchar");
880
printf ("/notwordchar");
892
printf ("?%d", *(p-1));
898
printf ("%d:\tend of pattern.\n", p - start);
903
print_compiled_pattern (bufp)
904
struct re_pattern_buffer *bufp;
906
unsigned char *buffer = bufp->buffer;
908
print_partial_compiled_pattern (buffer, buffer + bufp->used);
909
printf ("%ld bytes used/%ld bytes allocated.\n",
910
bufp->used, bufp->allocated);
912
if (bufp->fastmap_accurate && bufp->fastmap)
914
printf ("fastmap: ");
915
print_fastmap (bufp->fastmap);
918
printf ("re_nsub: %d\t", bufp->re_nsub);
919
printf ("regs_alloc: %d\t", bufp->regs_allocated);
920
printf ("can_be_null: %d\t", bufp->can_be_null);
921
printf ("newline_anchor: %d\n", bufp->newline_anchor);
922
printf ("no_sub: %d\t", bufp->no_sub);
923
printf ("not_bol: %d\t", bufp->not_bol);
924
printf ("not_eol: %d\t", bufp->not_eol);
925
printf ("syntax: %lx\n", bufp->syntax);
926
/* Perhaps we should print the translate table? */
931
print_double_string (where, string1, size1, string2, size2)
944
if (FIRST_STRING_P (where))
946
for (this_char = where - string1; this_char < size1; this_char++)
947
putchar (string1[this_char]);
952
for (this_char = where - string2; this_char < size2; this_char++)
953
putchar (string2[this_char]);
964
#else /* not DEBUG */
969
# define DEBUG_STATEMENT(e)
970
# define DEBUG_PRINT1(x)
971
# define DEBUG_PRINT2(x1, x2)
972
# define DEBUG_PRINT3(x1, x2, x3)
973
# define DEBUG_PRINT4(x1, x2, x3, x4)
974
# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
975
# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
977
#endif /* not DEBUG */
979
/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
980
also be assigned to arbitrarily: each pattern buffer stores its own
981
syntax, so it can be changed between regex compilations. */
982
/* This has no initializer because initialized variables in Emacs
983
become read-only after dumping. */
984
reg_syntax_t re_syntax_options;
987
/* Specify the precise syntax of regexps for compilation. This provides
988
for compatibility for various utilities which historically have
989
different, incompatible syntaxes.
991
The argument SYNTAX is a bit mask comprised of the various bits
992
defined in regex.h. We return the old syntax. */
995
re_set_syntax (syntax)
998
reg_syntax_t ret = re_syntax_options;
1000
re_syntax_options = syntax;
1002
if (syntax & RE_DEBUG)
1004
else if (debug) /* was on but now is not */
1010
weak_alias (__re_set_syntax, re_set_syntax)
1013
/* This table gives an error message for each of the error codes listed
1014
in regex.h. Obviously the order here has to be same as there.
1015
POSIX doesn't require that we do anything for REG_NOERROR,
1016
but why not be nice? */
1018
static const char *re_error_msgid[] =
1020
gettext_noop ("Success"), /* REG_NOERROR */
1021
gettext_noop ("No match"), /* REG_NOMATCH */
1022
gettext_noop ("Invalid regular expression"), /* REG_BADPAT */
1023
gettext_noop ("Invalid collation character"), /* REG_ECOLLATE */
1024
gettext_noop ("Invalid character class name"), /* REG_ECTYPE */
1025
gettext_noop ("Trailing backslash"), /* REG_EESCAPE */
1026
gettext_noop ("Invalid back reference"), /* REG_ESUBREG */
1027
gettext_noop ("Unmatched [ or [^"), /* REG_EBRACK */
1028
gettext_noop ("Unmatched ( or \\("), /* REG_EPAREN */
1029
gettext_noop ("Unmatched \\{"), /* REG_EBRACE */
1030
gettext_noop ("Invalid content of \\{\\}"), /* REG_BADBR */
1031
gettext_noop ("Invalid range end"), /* REG_ERANGE */
1032
gettext_noop ("Memory exhausted"), /* REG_ESPACE */
1033
gettext_noop ("Invalid preceding regular expression"), /* REG_BADRPT */
1034
gettext_noop ("Premature end of regular expression"), /* REG_EEND */
1035
gettext_noop ("Regular expression too big"), /* REG_ESIZE */
1036
gettext_noop ("Unmatched ) or \\)"), /* REG_ERPAREN */
1039
/* Avoiding alloca during matching, to placate r_alloc. */
1041
/* Define MATCH_MAY_ALLOCATE unless we need to make sure that the
1042
searching and matching functions should not call alloca. On some
1043
systems, alloca is implemented in terms of malloc, and if we're
1044
using the relocating allocator routines, then malloc could cause a
1045
relocation, which might (if the strings being searched are in the
1046
ralloc heap) shift the data out from underneath the regexp
1049
Here's another reason to avoid allocation: Emacs
1050
processes input from X in a signal handler; processing X input may
1051
call malloc; if input arrives while a matching routine is calling
1052
malloc, then we're scrod. But Emacs can't just block input while
1053
calling matching routines; then we don't notice interrupts when
1054
they come in. So, Emacs blocks input around all regexp calls
1055
except the matching calls, which it leaves unprotected, in the
1056
faith that they will not malloc. */
1058
/* Normally, this is fine. */
1059
#define MATCH_MAY_ALLOCATE
1061
/* When using GNU C, we are not REALLY using the C alloca, no matter
1062
what config.h may say. So don't take precautions for it. */
1067
/* The match routines may not allocate if (1) they would do it with malloc
1068
and (2) it's not safe for them to use malloc.
1069
Note that if REL_ALLOC is defined, matching would not use malloc for the
1070
failure stack, but we would still use it for the register vectors;
1071
so REL_ALLOC should not affect this. */
1072
#if (defined C_ALLOCA || defined REGEX_MALLOC) && defined emacs
1073
# undef MATCH_MAY_ALLOCATE
1077
/* Failure stack declarations and macros; both re_compile_fastmap and
1078
re_match_2 use a failure stack. These have to be macros because of
1079
REGEX_ALLOCATE_STACK. */
1082
/* Number of failure points for which to initially allocate space
1083
when matching. If this number is exceeded, we allocate more
1084
space, so it is not a hard limit. */
1085
#ifndef INIT_FAILURE_ALLOC
1086
# define INIT_FAILURE_ALLOC 5
1089
/* Roughly the maximum number of failure points on the stack. Would be
1090
exactly that if always used MAX_FAILURE_ITEMS items each time we failed.
1091
This is a variable only so users of regex can assign to it; we never
1092
change it ourselves. */
1096
# if defined MATCH_MAY_ALLOCATE
1097
/* 4400 was enough to cause a crash on Alpha OSF/1,
1098
whose default stack limit is 2mb. */
1099
long int re_max_failures = 4000;
1101
long int re_max_failures = 2000;
1104
union fail_stack_elt
1106
unsigned char *pointer;
1110
typedef union fail_stack_elt fail_stack_elt_t;
1114
fail_stack_elt_t *stack;
1115
unsigned long int size;
1116
unsigned long int avail; /* Offset of next open position. */
1119
#else /* not INT_IS_16BIT */
1121
# if defined MATCH_MAY_ALLOCATE
1122
/* 4400 was enough to cause a crash on Alpha OSF/1,
1123
whose default stack limit is 2mb. */
1124
int re_max_failures = 20000;
1126
int re_max_failures = 2000;
1129
union fail_stack_elt
1131
unsigned char *pointer;
1135
typedef union fail_stack_elt fail_stack_elt_t;
1139
fail_stack_elt_t *stack;
1141
unsigned avail; /* Offset of next open position. */
1144
#endif /* INT_IS_16BIT */
1146
#define FAIL_STACK_EMPTY() (fail_stack.avail == 0)
1147
#define FAIL_STACK_PTR_EMPTY() (fail_stack_ptr->avail == 0)
1148
#define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size)
1151
/* Define macros to initialize and free the failure stack.
1152
Do `return -2' if the alloc fails. */
1154
#ifdef MATCH_MAY_ALLOCATE
1155
# define INIT_FAIL_STACK() \
1157
fail_stack.stack = (fail_stack_elt_t *) \
1158
REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * sizeof (fail_stack_elt_t)); \
1160
if (fail_stack.stack == NULL) \
1163
fail_stack.size = INIT_FAILURE_ALLOC; \
1164
fail_stack.avail = 0; \
1167
# define RESET_FAIL_STACK() REGEX_FREE_STACK (fail_stack.stack)
1169
# define INIT_FAIL_STACK() \
1171
fail_stack.avail = 0; \
1174
# define RESET_FAIL_STACK()
1178
/* Double the size of FAIL_STACK, up to approximately `re_max_failures' items.
1180
Return 1 if succeeds, and 0 if either ran out of memory
1181
allocating space for it or it was already too large.
1183
REGEX_REALLOCATE_STACK requires `destination' be declared. */
1185
#define DOUBLE_FAIL_STACK(fail_stack) \
1186
((fail_stack).size > (unsigned) (re_max_failures * MAX_FAILURE_ITEMS) \
1188
: ((fail_stack).stack = (fail_stack_elt_t *) \
1189
REGEX_REALLOCATE_STACK ((fail_stack).stack, \
1190
(fail_stack).size * sizeof (fail_stack_elt_t), \
1191
((fail_stack).size << 1) * sizeof (fail_stack_elt_t)), \
1193
(fail_stack).stack == NULL \
1195
: ((fail_stack).size <<= 1, \
1199
/* Push pointer POINTER on FAIL_STACK.
1200
Return 1 if was able to do so and 0 if ran out of memory allocating
1202
#define PUSH_PATTERN_OP(POINTER, FAIL_STACK) \
1203
((FAIL_STACK_FULL () \
1204
&& !DOUBLE_FAIL_STACK (FAIL_STACK)) \
1206
: ((FAIL_STACK).stack[(FAIL_STACK).avail++].pointer = POINTER, \
1209
/* Push a pointer value onto the failure stack.
1210
Assumes the variable `fail_stack'. Probably should only
1211
be called from within `PUSH_FAILURE_POINT'. */
1212
#define PUSH_FAILURE_POINTER(item) \
1213
fail_stack.stack[fail_stack.avail++].pointer = (unsigned char *) (item)
1215
/* This pushes an integer-valued item onto the failure stack.
1216
Assumes the variable `fail_stack'. Probably should only
1217
be called from within `PUSH_FAILURE_POINT'. */
1218
#define PUSH_FAILURE_INT(item) \
1219
fail_stack.stack[fail_stack.avail++].integer = (item)
1221
/* Push a fail_stack_elt_t value onto the failure stack.
1222
Assumes the variable `fail_stack'. Probably should only
1223
be called from within `PUSH_FAILURE_POINT'. */
1224
#define PUSH_FAILURE_ELT(item) \
1225
fail_stack.stack[fail_stack.avail++] = (item)
1227
/* These three POP... operations complement the three PUSH... operations.
1228
All assume that `fail_stack' is nonempty. */
1229
#define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer
1230
#define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer
1231
#define POP_FAILURE_ELT() fail_stack.stack[--fail_stack.avail]
1233
/* Used to omit pushing failure point id's when we're not debugging. */
1235
# define DEBUG_PUSH PUSH_FAILURE_INT
1236
# define DEBUG_POP(item_addr) *(item_addr) = POP_FAILURE_INT ()
1238
# define DEBUG_PUSH(item)
1239
# define DEBUG_POP(item_addr)
1243
/* Push the information about the state we will need
1244
if we ever fail back to it.
1246
Requires variables fail_stack, regstart, regend, reg_info, and
1247
num_regs_pushed be declared. DOUBLE_FAIL_STACK requires `destination'
1250
Does `return FAILURE_CODE' if runs out of memory. */
1252
#define PUSH_FAILURE_POINT(pattern_place, string_place, failure_code) \
1254
char *destination; \
1255
/* Must be int, so when we don't save any registers, the arithmetic \
1256
of 0 + -1 isn't done as unsigned. */ \
1257
/* Can't be int, since there is not a shred of a guarantee that int \
1258
is wide enough to hold a value of something to which pointer can \
1260
active_reg_t this_reg; \
1262
DEBUG_STATEMENT (failure_id++); \
1263
DEBUG_STATEMENT (nfailure_points_pushed++); \
1264
DEBUG_PRINT2 ("\nPUSH_FAILURE_POINT #%u:\n", failure_id); \
1265
DEBUG_PRINT2 (" Before push, next avail: %d\n", (fail_stack).avail);\
1266
DEBUG_PRINT2 (" size: %d\n", (fail_stack).size);\
1268
DEBUG_PRINT2 (" slots needed: %ld\n", NUM_FAILURE_ITEMS); \
1269
DEBUG_PRINT2 (" available: %d\n", REMAINING_AVAIL_SLOTS); \
1271
/* Ensure we have enough space allocated for what we will push. */ \
1272
while (REMAINING_AVAIL_SLOTS < NUM_FAILURE_ITEMS) \
1274
if (!DOUBLE_FAIL_STACK (fail_stack)) \
1275
return failure_code; \
1277
DEBUG_PRINT2 ("\n Doubled stack; size now: %d\n", \
1278
(fail_stack).size); \
1279
DEBUG_PRINT2 (" slots available: %d\n", REMAINING_AVAIL_SLOTS);\
1282
/* Push the info, starting with the registers. */ \
1283
DEBUG_PRINT1 ("\n"); \
1286
for (this_reg = lowest_active_reg; this_reg <= highest_active_reg; \
1289
DEBUG_PRINT2 (" Pushing reg: %lu\n", this_reg); \
1290
DEBUG_STATEMENT (num_regs_pushed++); \
1292
DEBUG_PRINT2 (" start: %p\n", regstart[this_reg]); \
1293
PUSH_FAILURE_POINTER (regstart[this_reg]); \
1295
DEBUG_PRINT2 (" end: %p\n", regend[this_reg]); \
1296
PUSH_FAILURE_POINTER (regend[this_reg]); \
1298
DEBUG_PRINT2 (" info: %p\n ", \
1299
reg_info[this_reg].word.pointer); \
1300
DEBUG_PRINT2 (" match_null=%d", \
1301
REG_MATCH_NULL_STRING_P (reg_info[this_reg])); \
1302
DEBUG_PRINT2 (" active=%d", IS_ACTIVE (reg_info[this_reg])); \
1303
DEBUG_PRINT2 (" matched_something=%d", \
1304
MATCHED_SOMETHING (reg_info[this_reg])); \
1305
DEBUG_PRINT2 (" ever_matched=%d", \
1306
EVER_MATCHED_SOMETHING (reg_info[this_reg])); \
1307
DEBUG_PRINT1 ("\n"); \
1308
PUSH_FAILURE_ELT (reg_info[this_reg].word); \
1311
DEBUG_PRINT2 (" Pushing low active reg: %ld\n", lowest_active_reg);\
1312
PUSH_FAILURE_INT (lowest_active_reg); \
1314
DEBUG_PRINT2 (" Pushing high active reg: %ld\n", highest_active_reg);\
1315
PUSH_FAILURE_INT (highest_active_reg); \
1317
DEBUG_PRINT2 (" Pushing pattern %p:\n", pattern_place); \
1318
DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern_place, pend); \
1319
PUSH_FAILURE_POINTER (pattern_place); \
1321
DEBUG_PRINT2 (" Pushing string %p: `", string_place); \
1322
DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, \
1324
DEBUG_PRINT1 ("'\n"); \
1325
PUSH_FAILURE_POINTER (string_place); \
1327
DEBUG_PRINT2 (" Pushing failure id: %u\n", failure_id); \
1328
DEBUG_PUSH (failure_id); \
1331
/* This is the number of items that are pushed and popped on the stack
1332
for each register. */
1333
#define NUM_REG_ITEMS 3
1335
/* Individual items aside from the registers. */
1337
# define NUM_NONREG_ITEMS 5 /* Includes failure point id. */
1339
# define NUM_NONREG_ITEMS 4
1342
/* We push at most this many items on the stack. */
1343
/* We used to use (num_regs - 1), which is the number of registers
1344
this regexp will save; but that was changed to 5
1345
to avoid stack overflow for a regexp with lots of parens. */
1346
#define MAX_FAILURE_ITEMS (5 * NUM_REG_ITEMS + NUM_NONREG_ITEMS)
1348
/* We actually push this many items. */
1349
#define NUM_FAILURE_ITEMS \
1351
? 0 : highest_active_reg - lowest_active_reg + 1) \
1355
/* How many items can still be added to the stack without overflowing it. */
1356
#define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail)
1359
/* Pops what PUSH_FAIL_STACK pushes.
1361
We restore into the parameters, all of which should be lvalues:
1362
STR -- the saved data position.
1363
PAT -- the saved pattern position.
1364
LOW_REG, HIGH_REG -- the highest and lowest active registers.
1365
REGSTART, REGEND -- arrays of string positions.
1366
REG_INFO -- array of information about each subexpression.
1368
Also assumes the variables `fail_stack' and (if debugging), `bufp',
1369
`pend', `string1', `size1', `string2', and `size2'. */
1371
#define POP_FAILURE_POINT(str, pat, low_reg, high_reg, regstart, regend, reg_info)\
1373
DEBUG_STATEMENT (unsigned failure_id;) \
1374
active_reg_t this_reg; \
1375
const unsigned char *string_temp; \
1377
assert (!FAIL_STACK_EMPTY ()); \
1379
/* Remove failure points and point to how many regs pushed. */ \
1380
DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \
1381
DEBUG_PRINT2 (" Before pop, next avail: %d\n", fail_stack.avail); \
1382
DEBUG_PRINT2 (" size: %d\n", fail_stack.size); \
1384
assert (fail_stack.avail >= NUM_NONREG_ITEMS); \
1386
DEBUG_POP (&failure_id); \
1387
DEBUG_PRINT2 (" Popping failure id: %u\n", failure_id); \
1389
/* If the saved string location is NULL, it came from an \
1390
on_failure_keep_string_jump opcode, and we want to throw away the \
1391
saved NULL, thus retaining our current position in the string. */ \
1392
string_temp = POP_FAILURE_POINTER (); \
1393
if (string_temp != NULL) \
1394
str = (const char *) string_temp; \
1396
DEBUG_PRINT2 (" Popping string %p: `", str); \
1397
DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \
1398
DEBUG_PRINT1 ("'\n"); \
1400
pat = (unsigned char *) POP_FAILURE_POINTER (); \
1401
DEBUG_PRINT2 (" Popping pattern %p:\n", pat); \
1402
DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \
1404
/* Restore register info. */ \
1405
high_reg = (active_reg_t) POP_FAILURE_INT (); \
1406
DEBUG_PRINT2 (" Popping high active reg: %ld\n", high_reg); \
1408
low_reg = (active_reg_t) POP_FAILURE_INT (); \
1409
DEBUG_PRINT2 (" Popping low active reg: %ld\n", low_reg); \
1412
for (this_reg = high_reg; this_reg >= low_reg; this_reg--) \
1414
DEBUG_PRINT2 (" Popping reg: %ld\n", this_reg); \
1416
reg_info[this_reg].word = POP_FAILURE_ELT (); \
1417
DEBUG_PRINT2 (" info: %p\n", \
1418
reg_info[this_reg].word.pointer); \
1420
regend[this_reg] = (const char *) POP_FAILURE_POINTER (); \
1421
DEBUG_PRINT2 (" end: %p\n", regend[this_reg]); \
1423
regstart[this_reg] = (const char *) POP_FAILURE_POINTER (); \
1424
DEBUG_PRINT2 (" start: %p\n", regstart[this_reg]); \
1428
for (this_reg = highest_active_reg; this_reg > high_reg; this_reg--) \
1430
reg_info[this_reg].word.integer = 0; \
1431
regend[this_reg] = 0; \
1432
regstart[this_reg] = 0; \
1434
highest_active_reg = high_reg; \
1437
set_regs_matched_done = 0; \
1438
DEBUG_STATEMENT (nfailure_points_popped++); \
1439
} /* POP_FAILURE_POINT */
1443
/* Structure for per-register (a.k.a. per-group) information.
1444
Other register information, such as the
1445
starting and ending positions (which are addresses), and the list of
1446
inner groups (which is a bits list) are maintained in separate
1449
We are making a (strictly speaking) nonportable assumption here: that
1450
the compiler will pack our bit fields into something that fits into
1451
the type of `word', i.e., is something that fits into one item on the
1455
/* Declarations and macros for re_match_2. */
1459
fail_stack_elt_t word;
1462
/* This field is one if this group can match the empty string,
1463
zero if not. If not yet determined, `MATCH_NULL_UNSET_VALUE'. */
1464
#define MATCH_NULL_UNSET_VALUE 3
1465
unsigned match_null_string_p : 2;
1466
unsigned is_active : 1;
1467
unsigned matched_something : 1;
1468
unsigned ever_matched_something : 1;
1470
} register_info_type;
1472
#define REG_MATCH_NULL_STRING_P(R) ((R).bits.match_null_string_p)
1473
#define IS_ACTIVE(R) ((R).bits.is_active)
1474
#define MATCHED_SOMETHING(R) ((R).bits.matched_something)
1475
#define EVER_MATCHED_SOMETHING(R) ((R).bits.ever_matched_something)
1478
/* Call this when have matched a real character; it sets `matched' flags
1479
for the subexpressions which we are currently inside. Also records
1480
that those subexprs have matched. */
1481
#define SET_REGS_MATCHED() \
1484
if (!set_regs_matched_done) \
1487
set_regs_matched_done = 1; \
1488
for (r = lowest_active_reg; r <= highest_active_reg; r++) \
1490
MATCHED_SOMETHING (reg_info[r]) \
1491
= EVER_MATCHED_SOMETHING (reg_info[r]) \
1498
/* Registers are set to a sentinel when they haven't yet matched. */
1499
static char reg_unset_dummy;
1500
#define REG_UNSET_VALUE (®_unset_dummy)
1501
#define REG_UNSET(e) ((e) == REG_UNSET_VALUE)
1503
/* Subroutine declarations and macros for regex_compile. */
1505
static reg_errcode_t regex_compile _RE_ARGS ((const char *pattern, size_t size,
1506
reg_syntax_t syntax,
1507
struct re_pattern_buffer *bufp));
1508
static void store_op1 _RE_ARGS ((re_opcode_t op, unsigned char *loc, int arg));
1509
static void store_op2 _RE_ARGS ((re_opcode_t op, unsigned char *loc,
1510
int arg1, int arg2));
1511
static void insert_op1 _RE_ARGS ((re_opcode_t op, unsigned char *loc,
1512
int arg, unsigned char *end));
1513
static void insert_op2 _RE_ARGS ((re_opcode_t op, unsigned char *loc,
1514
int arg1, int arg2, unsigned char *end));
1515
static boolean at_begline_loc_p _RE_ARGS ((const char *pattern, const char *p,
1516
reg_syntax_t syntax));
1517
static boolean at_endline_loc_p _RE_ARGS ((const char *p, const char *pend,
1518
reg_syntax_t syntax));
1519
static reg_errcode_t compile_range _RE_ARGS ((const char **p_ptr,
1522
reg_syntax_t syntax,
1525
/* Fetch the next character in the uncompiled pattern---translating it
1526
if necessary. Also cast from a signed character in the constant
1527
string passed to us by the user to an unsigned char that we can use
1528
as an array index (in, e.g., `translate'). */
1530
# define PATFETCH(c) \
1531
do {if (p == pend) return REG_EEND; \
1532
c = (unsigned char) *p++; \
1533
if (translate) c = (unsigned char) translate[c]; \
1537
/* Fetch the next character in the uncompiled pattern, with no
1539
#define PATFETCH_RAW(c) \
1540
do {if (p == pend) return REG_EEND; \
1541
c = (unsigned char) *p++; \
1544
/* Go backwards one character in the pattern. */
1545
#define PATUNFETCH p--
1548
/* If `translate' is non-null, return translate[D], else just D. We
1549
cast the subscript to translate because some data is declared as
1550
`char *', to avoid warnings when a string constant is passed. But
1551
when we use a character as a subscript we must make it unsigned. */
1553
# define TRANSLATE(d) \
1554
(translate ? (char) translate[(unsigned char) (d)] : (d))
1558
/* Macros for outputting the compiled pattern into `buffer'. */
1560
/* If the buffer isn't allocated when it comes in, use this. */
1561
#define INIT_BUF_SIZE 32
1563
/* Make sure we have at least N more bytes of space in buffer. */
1564
#define GET_BUFFER_SPACE(n) \
1565
while ((unsigned long) (b - bufp->buffer + (n)) > bufp->allocated) \
1568
/* Make sure we have one more byte of buffer space and then add C to it. */
1569
#define BUF_PUSH(c) \
1571
GET_BUFFER_SPACE (1); \
1572
*b++ = (unsigned char) (c); \
1576
/* Ensure we have two more bytes of buffer space and then append C1 and C2. */
1577
#define BUF_PUSH_2(c1, c2) \
1579
GET_BUFFER_SPACE (2); \
1580
*b++ = (unsigned char) (c1); \
1581
*b++ = (unsigned char) (c2); \
1585
/* As with BUF_PUSH_2, except for three bytes. */
1586
#define BUF_PUSH_3(c1, c2, c3) \
1588
GET_BUFFER_SPACE (3); \
1589
*b++ = (unsigned char) (c1); \
1590
*b++ = (unsigned char) (c2); \
1591
*b++ = (unsigned char) (c3); \
1595
/* Store a jump with opcode OP at LOC to location TO. We store a
1596
relative address offset by the three bytes the jump itself occupies. */
1597
#define STORE_JUMP(op, loc, to) \
1598
store_op1 (op, loc, (int) ((to) - (loc) - 3))
1600
/* Likewise, for a two-argument jump. */
1601
#define STORE_JUMP2(op, loc, to, arg) \
1602
store_op2 (op, loc, (int) ((to) - (loc) - 3), arg)
1604
/* Like `STORE_JUMP', but for inserting. Assume `b' is the buffer end. */
1605
#define INSERT_JUMP(op, loc, to) \
1606
insert_op1 (op, loc, (int) ((to) - (loc) - 3), b)
1608
/* Like `STORE_JUMP2', but for inserting. Assume `b' is the buffer end. */
1609
#define INSERT_JUMP2(op, loc, to, arg) \
1610
insert_op2 (op, loc, (int) ((to) - (loc) - 3), arg, b)
1613
/* This is not an arbitrary limit: the arguments which represent offsets
1614
into the pattern are two bytes long. So if 2^16 bytes turns out to
1615
be too small, many things would have to change. */
1616
/* Any other compiler which, like MSC, has allocation limit below 2^16
1617
bytes will have to use approach similar to what was done below for
1618
MSC and drop MAX_BUF_SIZE a bit. Otherwise you may end up
1619
reallocating to 0 bytes. Such thing is not going to work too well.
1620
You have been warned!! */
1621
#if defined _MSC_VER && !defined WIN32
1622
/* Microsoft C 16-bit versions limit malloc to approx 65512 bytes.
1623
The REALLOC define eliminates a flurry of conversion warnings,
1624
but is not required. */
1625
# define MAX_BUF_SIZE 65500L
1626
# define REALLOC(p,s) realloc ((p), (size_t) (s))
1628
# define MAX_BUF_SIZE (1L << 16)
1629
# define REALLOC(p,s) realloc ((p), (s))
1632
/* Extend the buffer by twice its current size via realloc and
1633
reset the pointers that pointed into the old block to point to the
1634
correct places in the new one. If extending the buffer results in it
1635
being larger than MAX_BUF_SIZE, then flag memory exhausted. */
1636
#define EXTEND_BUFFER() \
1638
unsigned char *old_buffer = bufp->buffer; \
1639
if (bufp->allocated == MAX_BUF_SIZE) \
1641
bufp->allocated <<= 1; \
1642
if (bufp->allocated > MAX_BUF_SIZE) \
1643
bufp->allocated = MAX_BUF_SIZE; \
1644
bufp->buffer = (unsigned char *) REALLOC (bufp->buffer, bufp->allocated);\
1645
if (bufp->buffer == NULL) \
1646
return REG_ESPACE; \
1647
/* If the buffer moved, move all the pointers into it. */ \
1648
if (old_buffer != bufp->buffer) \
1650
b = (b - old_buffer) + bufp->buffer; \
1651
begalt = (begalt - old_buffer) + bufp->buffer; \
1652
if (fixup_alt_jump) \
1653
fixup_alt_jump = (fixup_alt_jump - old_buffer) + bufp->buffer;\
1655
laststart = (laststart - old_buffer) + bufp->buffer; \
1656
if (pending_exact) \
1657
pending_exact = (pending_exact - old_buffer) + bufp->buffer; \
1662
/* Since we have one byte reserved for the register number argument to
1663
{start,stop}_memory, the maximum number of groups we can report
1664
things about is what fits in that byte. */
1665
#define MAX_REGNUM 255
1667
/* But patterns can have more than `MAX_REGNUM' registers. We just
1668
ignore the excess. */
1669
typedef unsigned regnum_t;
1672
/* Macros for the compile stack. */
1674
/* Since offsets can go either forwards or backwards, this type needs to
1675
be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */
1676
/* int may be not enough when sizeof(int) == 2. */
1677
typedef long pattern_offset_t;
1681
pattern_offset_t begalt_offset;
1682
pattern_offset_t fixup_alt_jump;
1683
pattern_offset_t inner_group_offset;
1684
pattern_offset_t laststart_offset;
1686
} compile_stack_elt_t;
1691
compile_stack_elt_t *stack;
1693
unsigned avail; /* Offset of next open position. */
1694
} compile_stack_type;
1697
#define INIT_COMPILE_STACK_SIZE 32
1699
#define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
1700
#define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)
1702
/* The next available element. */
1703
#define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
1706
/* Set the bit for character C in a list. */
1707
#define SET_LIST_BIT(c) \
1708
(b[((unsigned char) (c)) / BYTEWIDTH] \
1709
|= 1 << (((unsigned char) c) % BYTEWIDTH))
1712
/* Get the next unsigned number in the uncompiled pattern. */
1713
#define GET_UNSIGNED_NUMBER(num) \
1717
while (ISDIGIT (c)) \
1721
num = num * 10 + c - '0'; \
1729
#if defined _LIBC || WIDE_CHAR_SUPPORT
1730
/* The GNU C library provides support for user-defined character classes
1731
and the functions from ISO C amendement 1. */
1732
# ifdef CHARCLASS_NAME_MAX
1733
# define CHAR_CLASS_MAX_LENGTH CHARCLASS_NAME_MAX
1735
/* This shouldn't happen but some implementation might still have this
1736
problem. Use a reasonable default value. */
1737
# define CHAR_CLASS_MAX_LENGTH 256
1741
# define IS_CHAR_CLASS(string) __wctype (string)
1743
# define IS_CHAR_CLASS(string) wctype (string)
1746
# define CHAR_CLASS_MAX_LENGTH 6 /* Namely, `xdigit'. */
1748
# define IS_CHAR_CLASS(string) \
1749
(STREQ (string, "alpha") || STREQ (string, "upper") \
1750
|| STREQ (string, "lower") || STREQ (string, "digit") \
1751
|| STREQ (string, "alnum") || STREQ (string, "xdigit") \
1752
|| STREQ (string, "space") || STREQ (string, "print") \
1753
|| STREQ (string, "punct") || STREQ (string, "graph") \
1754
|| STREQ (string, "cntrl") || STREQ (string, "blank"))
1757
#ifndef MATCH_MAY_ALLOCATE
1759
/* If we cannot allocate large objects within re_match_2_internal,
1760
we make the fail stack and register vectors global.
1761
The fail stack, we grow to the maximum size when a regexp
1763
The register vectors, we adjust in size each time we
1764
compile a regexp, according to the number of registers it needs. */
1766
static fail_stack_type fail_stack;
1768
/* Size with which the following vectors are currently allocated.
1769
That is so we can make them bigger as needed,
1770
but never make them smaller. */
1771
static int regs_allocated_size;
1773
static const char ** regstart, ** regend;
1774
static const char ** old_regstart, ** old_regend;
1775
static const char **best_regstart, **best_regend;
1776
static register_info_type *reg_info;
1777
static const char **reg_dummy;
1778
static register_info_type *reg_info_dummy;
1780
/* Make the register vectors big enough for NUM_REGS registers,
1781
but don't make them smaller. */
1784
regex_grow_registers (num_regs)
1787
if (num_regs > regs_allocated_size)
1789
RETALLOC_IF (regstart, num_regs, const char *);
1790
RETALLOC_IF (regend, num_regs, const char *);
1791
RETALLOC_IF (old_regstart, num_regs, const char *);
1792
RETALLOC_IF (old_regend, num_regs, const char *);
1793
RETALLOC_IF (best_regstart, num_regs, const char *);
1794
RETALLOC_IF (best_regend, num_regs, const char *);
1795
RETALLOC_IF (reg_info, num_regs, register_info_type);
1796
RETALLOC_IF (reg_dummy, num_regs, const char *);
1797
RETALLOC_IF (reg_info_dummy, num_regs, register_info_type);
1799
regs_allocated_size = num_regs;
1803
#endif /* not MATCH_MAY_ALLOCATE */
1805
static boolean group_in_compile_stack _RE_ARGS ((compile_stack_type
1809
/* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
1810
Returns one of error codes defined in `regex.h', or zero for success.
1812
Assumes the `allocated' (and perhaps `buffer') and `translate'
1813
fields are set in BUFP on entry.
1815
If it succeeds, results are put in BUFP (if it returns an error, the
1816
contents of BUFP are undefined):
1817
`buffer' is the compiled pattern;
1818
`syntax' is set to SYNTAX;
1819
`used' is set to the length of the compiled pattern;
1820
`fastmap_accurate' is zero;
1821
`re_nsub' is the number of subexpressions in PATTERN;
1822
`not_bol' and `not_eol' are zero;
1824
The `fastmap' and `newline_anchor' fields are neither
1825
examined nor set. */
1827
/* Return, freeing storage we allocated. */
1828
#define FREE_STACK_RETURN(value) \
1829
return (free (compile_stack.stack), value)
1831
static reg_errcode_t
1832
regex_compile (pattern, size, syntax, bufp)
1833
const char *pattern;
1835
reg_syntax_t syntax;
1836
struct re_pattern_buffer *bufp;
1838
/* We fetch characters from PATTERN here. Even though PATTERN is
1839
`char *' (i.e., signed), we declare these variables as unsigned, so
1840
they can be reliably used as array indices. */
1841
register unsigned char c, c1;
1843
/* A random temporary spot in PATTERN. */
1846
/* Points to the end of the buffer, where we should append. */
1847
register unsigned char *b;
1849
/* Keeps track of unclosed groups. */
1850
compile_stack_type compile_stack;
1852
/* Points to the current (ending) position in the pattern. */
1853
const char *p = pattern;
1854
const char *pend = pattern + size;
1856
/* How to translate the characters in the pattern. */
1857
RE_TRANSLATE_TYPE translate = bufp->translate;
1859
/* Address of the count-byte of the most recently inserted `exactn'
1860
command. This makes it possible to tell if a new exact-match
1861
character can be added to that command or if the character requires
1862
a new `exactn' command. */
1863
unsigned char *pending_exact = 0;
1865
/* Address of start of the most recently finished expression.
1866
This tells, e.g., postfix * where to find the start of its
1867
operand. Reset at the beginning of groups and alternatives. */
1868
unsigned char *laststart = 0;
1870
/* Address of beginning of regexp, or inside of last group. */
1871
unsigned char *begalt;
1873
/* Place in the uncompiled pattern (i.e., the {) to
1874
which to go back if the interval is invalid. */
1875
const char *beg_interval;
1877
/* Address of the place where a forward jump should go to the end of
1878
the containing expression. Each alternative of an `or' -- except the
1879
last -- ends with a forward jump of this sort. */
1880
unsigned char *fixup_alt_jump = 0;
1882
/* Counts open-groups as they are encountered. Remembered for the
1883
matching close-group on the compile stack, so the same register
1884
number is put in the stop_memory as the start_memory. */
1885
regnum_t regnum = 0;
1888
DEBUG_PRINT1 ("\nCompiling pattern: ");
1891
unsigned debug_count;
1893
for (debug_count = 0; debug_count < size; debug_count++)
1894
putchar (pattern[debug_count]);
1899
/* Initialize the compile stack. */
1900
compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
1901
if (compile_stack.stack == NULL)
1904
compile_stack.size = INIT_COMPILE_STACK_SIZE;
1905
compile_stack.avail = 0;
1907
/* Initialize the pattern buffer. */
1908
bufp->syntax = syntax;
1909
bufp->fastmap_accurate = 0;
1910
bufp->not_bol = bufp->not_eol = 0;
1912
/* Set `used' to zero, so that if we return an error, the pattern
1913
printer (for debugging) will think there's no pattern. We reset it
1917
/* Always count groups, whether or not bufp->no_sub is set. */
1920
#if !defined emacs && !defined SYNTAX_TABLE
1921
/* Initialize the syntax table. */
1922
init_syntax_once ();
1925
if (bufp->allocated == 0)
1928
{ /* If zero allocated, but buffer is non-null, try to realloc
1929
enough space. This loses if buffer's address is bogus, but
1930
that is the user's responsibility. */
1931
RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char);
1934
{ /* Caller did not allocate a buffer. Do it for them. */
1935
bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char);
1937
if (!bufp->buffer) FREE_STACK_RETURN (REG_ESPACE);
1939
bufp->allocated = INIT_BUF_SIZE;
1942
begalt = b = bufp->buffer;
1944
/* Loop through the uncompiled pattern until we're at the end. */
1953
if ( /* If at start of pattern, it's an operator. */
1955
/* If context independent, it's an operator. */
1956
|| syntax & RE_CONTEXT_INDEP_ANCHORS
1957
/* Otherwise, depends on what's come before. */
1958
|| at_begline_loc_p (pattern, p, syntax))
1968
if ( /* If at end of pattern, it's an operator. */
1970
/* If context independent, it's an operator. */
1971
|| syntax & RE_CONTEXT_INDEP_ANCHORS
1972
/* Otherwise, depends on what's next. */
1973
|| at_endline_loc_p (p, pend, syntax))
1983
if ((syntax & RE_BK_PLUS_QM)
1984
|| (syntax & RE_LIMITED_OPS))
1988
/* If there is no previous pattern... */
1991
if (syntax & RE_CONTEXT_INVALID_OPS)
1992
FREE_STACK_RETURN (REG_BADRPT);
1993
else if (!(syntax & RE_CONTEXT_INDEP_OPS))
1998
/* Are we optimizing this jump? */
1999
boolean keep_string_p = false;
2001
/* 1 means zero (many) matches is allowed. */
2002
char zero_times_ok = 0, many_times_ok = 0;
2004
/* If there is a sequence of repetition chars, collapse it
2005
down to just one (the right one). We can't combine
2006
interval operators with these because of, e.g., `a{2}*',
2007
which should only match an even number of `a's. */
2011
zero_times_ok |= c != '+';
2012
many_times_ok |= c != '?';
2020
|| (!(syntax & RE_BK_PLUS_QM) && (c == '+' || c == '?')))
2023
else if (syntax & RE_BK_PLUS_QM && c == '\\')
2025
if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2028
if (!(c1 == '+' || c1 == '?'))
2043
/* If we get here, we found another repeat character. */
2046
/* Star, etc. applied to an empty pattern is equivalent
2047
to an empty pattern. */
2051
/* Now we know whether or not zero matches is allowed
2052
and also whether or not two or more matches is allowed. */
2054
{ /* More than one repetition is allowed, so put in at the
2055
end a backward relative jump from `b' to before the next
2056
jump we're going to put in below (which jumps from
2057
laststart to after this jump).
2059
But if we are at the `*' in the exact sequence `.*\n',
2060
insert an unconditional jump backwards to the .,
2061
instead of the beginning of the loop. This way we only
2062
push a failure point once, instead of every time
2063
through the loop. */
2064
assert (p - 1 > pattern);
2066
/* Allocate the space for the jump. */
2067
GET_BUFFER_SPACE (3);
2069
/* We know we are not at the first character of the pattern,
2070
because laststart was nonzero. And we've already
2071
incremented `p', by the way, to be the character after
2072
the `*'. Do we have to do something analogous here
2073
for null bytes, because of RE_DOT_NOT_NULL? */
2074
if (TRANSLATE (*(p - 2)) == TRANSLATE ('.')
2076
&& p < pend && TRANSLATE (*p) == TRANSLATE ('\n')
2077
&& !(syntax & RE_DOT_NEWLINE))
2078
{ /* We have .*\n. */
2079
STORE_JUMP (jump, b, laststart);
2080
keep_string_p = true;
2083
/* Anything else. */
2084
STORE_JUMP (maybe_pop_jump, b, laststart - 3);
2086
/* We've added more stuff to the buffer. */
2090
/* On failure, jump from laststart to b + 3, which will be the
2091
end of the buffer after this jump is inserted. */
2092
GET_BUFFER_SPACE (3);
2093
INSERT_JUMP (keep_string_p ? on_failure_keep_string_jump
2101
/* At least one repetition is required, so insert a
2102
`dummy_failure_jump' before the initial
2103
`on_failure_jump' instruction of the loop. This
2104
effects a skip over that instruction the first time
2105
we hit that loop. */
2106
GET_BUFFER_SPACE (3);
2107
INSERT_JUMP (dummy_failure_jump, laststart, laststart + 6);
2122
boolean had_char_class = false;
2124
if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2126
/* Ensure that we have enough space to push a charset: the
2127
opcode, the length count, and the bitset; 34 bytes in all. */
2128
GET_BUFFER_SPACE (34);
2132
/* We test `*p == '^' twice, instead of using an if
2133
statement, so we only need one BUF_PUSH. */
2134
BUF_PUSH (*p == '^' ? charset_not : charset);
2138
/* Remember the first position in the bracket expression. */
2141
/* Push the number of bytes in the bitmap. */
2142
BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
2144
/* Clear the whole map. */
2145
bzero (b, (1 << BYTEWIDTH) / BYTEWIDTH);
2147
/* charset_not matches newline according to a syntax bit. */
2148
if ((re_opcode_t) b[-2] == charset_not
2149
&& (syntax & RE_HAT_LISTS_NOT_NEWLINE))
2150
SET_LIST_BIT ('\n');
2152
/* Read in characters and ranges, setting map bits. */
2155
if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2159
/* \ might escape characters inside [...] and [^...]. */
2160
if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
2162
if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2169
/* Could be the end of the bracket expression. If it's
2170
not (i.e., when the bracket expression is `[]' so
2171
far), the ']' character bit gets set way below. */
2172
if (c == ']' && p != p1 + 1)
2175
/* Look ahead to see if it's a range when the last thing
2176
was a character class. */
2177
if (had_char_class && c == '-' && *p != ']')
2178
FREE_STACK_RETURN (REG_ERANGE);
2180
/* Look ahead to see if it's a range when the last thing
2181
was a character: if this is a hyphen not at the
2182
beginning or the end of a list, then it's the range
2185
&& !(p - 2 >= pattern && p[-2] == '[')
2186
&& !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^')
2190
= compile_range (&p, pend, translate, syntax, b);
2191
if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
2194
else if (p[0] == '-' && p[1] != ']')
2195
{ /* This handles ranges made up of characters only. */
2198
/* Move past the `-'. */
2201
ret = compile_range (&p, pend, translate, syntax, b);
2202
if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
2205
/* See if we're at the beginning of a possible character
2208
else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
2209
{ /* Leave room for the null. */
2210
char str[CHAR_CLASS_MAX_LENGTH + 1];
2215
/* If pattern is `[[:'. */
2216
if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2221
if ((c == ':' && *p == ']') || p == pend)
2223
if (c1 < CHAR_CLASS_MAX_LENGTH)
2226
/* This is in any case an invalid class name. */
2231
/* If isn't a word bracketed by `[:' and `:]':
2232
undo the ending character, the letters, and leave
2233
the leading `:' and `[' (but set bits for them). */
2234
if (c == ':' && *p == ']')
2236
#if defined _LIBC || WIDE_CHAR_SUPPORT
2237
boolean is_lower = STREQ (str, "lower");
2238
boolean is_upper = STREQ (str, "upper");
2242
wt = IS_CHAR_CLASS (str);
2244
FREE_STACK_RETURN (REG_ECTYPE);
2246
/* Throw away the ] at the end of the character
2250
if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2252
for (ch = 0; ch < 1 << BYTEWIDTH; ++ch)
2255
if (__iswctype (__btowc (ch), wt))
2258
if (iswctype (btowc (ch), wt))
2262
if (translate && (is_upper || is_lower)
2263
&& (ISUPPER (ch) || ISLOWER (ch)))
2267
had_char_class = true;
2270
boolean is_alnum = STREQ (str, "alnum");
2271
boolean is_alpha = STREQ (str, "alpha");
2272
boolean is_blank = STREQ (str, "blank");
2273
boolean is_cntrl = STREQ (str, "cntrl");
2274
boolean is_digit = STREQ (str, "digit");
2275
boolean is_graph = STREQ (str, "graph");
2276
boolean is_lower = STREQ (str, "lower");
2277
boolean is_print = STREQ (str, "print");
2278
boolean is_punct = STREQ (str, "punct");
2279
boolean is_space = STREQ (str, "space");
2280
boolean is_upper = STREQ (str, "upper");
2281
boolean is_xdigit = STREQ (str, "xdigit");
2283
if (!IS_CHAR_CLASS (str))
2284
FREE_STACK_RETURN (REG_ECTYPE);
2286
/* Throw away the ] at the end of the character
2290
if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2292
for (ch = 0; ch < 1 << BYTEWIDTH; ch++)
2294
/* This was split into 3 if's to
2295
avoid an arbitrary limit in some compiler. */
2296
if ( (is_alnum && ISALNUM (ch))
2297
|| (is_alpha && ISALPHA (ch))
2298
|| (is_blank && ISBLANK (ch))
2299
|| (is_cntrl && ISCNTRL (ch)))
2301
if ( (is_digit && ISDIGIT (ch))
2302
|| (is_graph && ISGRAPH (ch))
2303
|| (is_lower && ISLOWER (ch))
2304
|| (is_print && ISPRINT (ch)))
2306
if ( (is_punct && ISPUNCT (ch))
2307
|| (is_space && ISSPACE (ch))
2308
|| (is_upper && ISUPPER (ch))
2309
|| (is_xdigit && ISXDIGIT (ch)))
2311
if ( translate && (is_upper || is_lower)
2312
&& (ISUPPER (ch) || ISLOWER (ch)))
2315
had_char_class = true;
2316
#endif /* libc || wctype.h */
2325
had_char_class = false;
2330
had_char_class = false;
2335
/* Discard any (non)matching list bytes that are all 0 at the
2336
end of the map. Decrease the map-length byte too. */
2337
while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
2345
if (syntax & RE_NO_BK_PARENS)
2352
if (syntax & RE_NO_BK_PARENS)
2359
if (syntax & RE_NEWLINE_ALT)
2366
if (syntax & RE_NO_BK_VBAR)
2373
if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES)
2374
goto handle_interval;
2380
if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2382
/* Do not translate the character after the \, so that we can
2383
distinguish, e.g., \B from \b, even if we normally would
2384
translate, e.g., B to b. */
2390
if (syntax & RE_NO_BK_PARENS)
2391
goto normal_backslash;
2397
if (COMPILE_STACK_FULL)
2399
RETALLOC (compile_stack.stack, compile_stack.size << 1,
2400
compile_stack_elt_t);
2401
if (compile_stack.stack == NULL) return REG_ESPACE;
2403
compile_stack.size <<= 1;
2406
/* These are the values to restore when we hit end of this
2407
group. They are all relative offsets, so that if the
2408
whole pattern moves because of realloc, they will still
2410
COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer;
2411
COMPILE_STACK_TOP.fixup_alt_jump
2412
= fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0;
2413
COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer;
2414
COMPILE_STACK_TOP.regnum = regnum;
2416
/* We will eventually replace the 0 with the number of
2417
groups inner to this one. But do not push a
2418
start_memory for groups beyond the last one we can
2419
represent in the compiled pattern. */
2420
if (regnum <= MAX_REGNUM)
2422
COMPILE_STACK_TOP.inner_group_offset = b - bufp->buffer + 2;
2423
BUF_PUSH_3 (start_memory, regnum, 0);
2426
compile_stack.avail++;
2431
/* If we've reached MAX_REGNUM groups, then this open
2432
won't actually generate any code, so we'll have to
2433
clear pending_exact explicitly. */
2439
if (syntax & RE_NO_BK_PARENS) goto normal_backslash;
2441
if (COMPILE_STACK_EMPTY)
2443
if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
2444
goto normal_backslash;
2446
FREE_STACK_RETURN (REG_ERPAREN);
2451
{ /* Push a dummy failure point at the end of the
2452
alternative for a possible future
2453
`pop_failure_jump' to pop. See comments at
2454
`push_dummy_failure' in `re_match_2'. */
2455
BUF_PUSH (push_dummy_failure);
2457
/* We allocated space for this jump when we assigned
2458
to `fixup_alt_jump', in the `handle_alt' case below. */
2459
STORE_JUMP (jump_past_alt, fixup_alt_jump, b - 1);
2462
/* See similar code for backslashed left paren above. */
2463
if (COMPILE_STACK_EMPTY)
2465
if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
2468
FREE_STACK_RETURN (REG_ERPAREN);
2471
/* Since we just checked for an empty stack above, this
2472
``can't happen''. */
2473
assert (compile_stack.avail != 0);
2475
/* We don't just want to restore into `regnum', because
2476
later groups should continue to be numbered higher,
2477
as in `(ab)c(de)' -- the second group is #2. */
2478
regnum_t this_group_regnum;
2480
compile_stack.avail--;
2481
begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset;
2483
= COMPILE_STACK_TOP.fixup_alt_jump
2484
? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1
2486
laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset;
2487
this_group_regnum = COMPILE_STACK_TOP.regnum;
2488
/* If we've reached MAX_REGNUM groups, then this open
2489
won't actually generate any code, so we'll have to
2490
clear pending_exact explicitly. */
2493
/* We're at the end of the group, so now we know how many
2494
groups were inside this one. */
2495
if (this_group_regnum <= MAX_REGNUM)
2497
unsigned char *inner_group_loc
2498
= bufp->buffer + COMPILE_STACK_TOP.inner_group_offset;
2500
*inner_group_loc = regnum - this_group_regnum;
2501
BUF_PUSH_3 (stop_memory, this_group_regnum,
2502
regnum - this_group_regnum);
2508
case '|': /* `\|'. */
2509
if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR)
2510
goto normal_backslash;
2512
if (syntax & RE_LIMITED_OPS)
2515
/* Insert before the previous alternative a jump which
2516
jumps to this alternative if the former fails. */
2517
GET_BUFFER_SPACE (3);
2518
INSERT_JUMP (on_failure_jump, begalt, b + 6);
2522
/* The alternative before this one has a jump after it
2523
which gets executed if it gets matched. Adjust that
2524
jump so it will jump to this alternative's analogous
2525
jump (put in below, which in turn will jump to the next
2526
(if any) alternative's such jump, etc.). The last such
2527
jump jumps to the correct final destination. A picture:
2533
If we are at `b', then fixup_alt_jump right now points to a
2534
three-byte space after `a'. We'll put in the jump, set
2535
fixup_alt_jump to right after `b', and leave behind three
2536
bytes which we'll fill in when we get to after `c'. */
2539
STORE_JUMP (jump_past_alt, fixup_alt_jump, b);
2541
/* Mark and leave space for a jump after this alternative,
2542
to be filled in later either by next alternative or
2543
when know we're at the end of a series of alternatives. */
2545
GET_BUFFER_SPACE (3);
2554
/* If \{ is a literal. */
2555
if (!(syntax & RE_INTERVALS)
2556
/* If we're at `\{' and it's not the open-interval
2558
|| ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
2559
|| (p - 2 == pattern && p == pend))
2560
goto normal_backslash;
2564
/* If got here, then the syntax allows intervals. */
2566
/* At least (most) this many matches must be made. */
2567
int lower_bound = -1, upper_bound = -1;
2569
beg_interval = p - 1;
2573
if (syntax & RE_NO_BK_BRACES)
2574
goto unfetch_interval;
2576
FREE_STACK_RETURN (REG_EBRACE);
2579
GET_UNSIGNED_NUMBER (lower_bound);
2583
GET_UNSIGNED_NUMBER (upper_bound);
2584
if (upper_bound < 0) upper_bound = RE_DUP_MAX;
2587
/* Interval such as `{1}' => match exactly once. */
2588
upper_bound = lower_bound;
2590
if (lower_bound < 0 || upper_bound > RE_DUP_MAX
2591
|| lower_bound > upper_bound)
2593
if (syntax & RE_NO_BK_BRACES)
2594
goto unfetch_interval;
2596
FREE_STACK_RETURN (REG_BADBR);
2599
if (!(syntax & RE_NO_BK_BRACES))
2601
if (c != '\\') FREE_STACK_RETURN (REG_EBRACE);
2608
if (syntax & RE_NO_BK_BRACES)
2609
goto unfetch_interval;
2611
FREE_STACK_RETURN (REG_BADBR);
2614
/* We just parsed a valid interval. */
2616
/* If it's invalid to have no preceding re. */
2619
if (syntax & RE_CONTEXT_INVALID_OPS)
2620
FREE_STACK_RETURN (REG_BADRPT);
2621
else if (syntax & RE_CONTEXT_INDEP_OPS)
2624
goto unfetch_interval;
2627
/* If the upper bound is zero, don't want to succeed at
2628
all; jump from `laststart' to `b + 3', which will be
2629
the end of the buffer after we insert the jump. */
2630
if (upper_bound == 0)
2632
GET_BUFFER_SPACE (3);
2633
INSERT_JUMP (jump, laststart, b + 3);
2637
/* Otherwise, we have a nontrivial interval. When
2638
we're all done, the pattern will look like:
2639
set_number_at <jump count> <upper bound>
2640
set_number_at <succeed_n count> <lower bound>
2641
succeed_n <after jump addr> <succeed_n count>
2643
jump_n <succeed_n addr> <jump count>
2644
(The upper bound and `jump_n' are omitted if
2645
`upper_bound' is 1, though.) */
2647
{ /* If the upper bound is > 1, we need to insert
2648
more at the end of the loop. */
2649
unsigned nbytes = 10 + (upper_bound > 1) * 10;
2651
GET_BUFFER_SPACE (nbytes);
2653
/* Initialize lower bound of the `succeed_n', even
2654
though it will be set during matching by its
2655
attendant `set_number_at' (inserted next),
2656
because `re_compile_fastmap' needs to know.
2657
Jump to the `jump_n' we might insert below. */
2658
INSERT_JUMP2 (succeed_n, laststart,
2659
b + 5 + (upper_bound > 1) * 5,
2663
/* Code to initialize the lower bound. Insert
2664
before the `succeed_n'. The `5' is the last two
2665
bytes of this `set_number_at', plus 3 bytes of
2666
the following `succeed_n'. */
2667
insert_op2 (set_number_at, laststart, 5, lower_bound, b);
2670
if (upper_bound > 1)
2671
{ /* More than one repetition is allowed, so
2672
append a backward jump to the `succeed_n'
2673
that starts this interval.
2675
When we've reached this during matching,
2676
we'll have matched the interval once, so
2677
jump back only `upper_bound - 1' times. */
2678
STORE_JUMP2 (jump_n, b, laststart + 5,
2682
/* The location we want to set is the second
2683
parameter of the `jump_n'; that is `b-2' as
2684
an absolute address. `laststart' will be
2685
the `set_number_at' we're about to insert;
2686
`laststart+3' the number to set, the source
2687
for the relative address. But we are
2688
inserting into the middle of the pattern --
2689
so everything is getting moved up by 5.
2690
Conclusion: (b - 2) - (laststart + 3) + 5,
2691
i.e., b - laststart.
2693
We insert this at the beginning of the loop
2694
so that if we fail during matching, we'll
2695
reinitialize the bounds. */
2696
insert_op2 (set_number_at, laststart, b - laststart,
2697
upper_bound - 1, b);
2702
beg_interval = NULL;
2707
/* If an invalid interval, match the characters as literals. */
2708
assert (beg_interval);
2710
beg_interval = NULL;
2712
/* normal_char and normal_backslash need `c'. */
2715
if (!(syntax & RE_NO_BK_BRACES))
2717
if (p > pattern && p[-1] == '\\')
2718
goto normal_backslash;
2723
/* There is no way to specify the before_dot and after_dot
2724
operators. rms says this is ok. --karl */
2732
BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]);
2738
BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]);
2744
if (syntax & RE_NO_GNU_OPS)
2747
BUF_PUSH (wordchar);
2752
if (syntax & RE_NO_GNU_OPS)
2755
BUF_PUSH (notwordchar);
2760
if (syntax & RE_NO_GNU_OPS)
2766
if (syntax & RE_NO_GNU_OPS)
2772
if (syntax & RE_NO_GNU_OPS)
2774
BUF_PUSH (wordbound);
2778
if (syntax & RE_NO_GNU_OPS)
2780
BUF_PUSH (notwordbound);
2784
if (syntax & RE_NO_GNU_OPS)
2790
if (syntax & RE_NO_GNU_OPS)
2795
case '1': case '2': case '3': case '4': case '5':
2796
case '6': case '7': case '8': case '9':
2797
if (syntax & RE_NO_BK_REFS)
2803
FREE_STACK_RETURN (REG_ESUBREG);
2805
/* Can't back reference to a subexpression if inside of it. */
2806
if (group_in_compile_stack (compile_stack, (regnum_t) c1))
2810
BUF_PUSH_2 (duplicate, c1);
2816
if (syntax & RE_BK_PLUS_QM)
2819
goto normal_backslash;
2823
/* You might think it would be useful for \ to mean
2824
not to translate; but if we don't translate it
2825
it will never match anything. */
2833
/* Expects the character in `c'. */
2835
/* If no exactn currently being built. */
2838
/* If last exactn not at current position. */
2839
|| pending_exact + *pending_exact + 1 != b
2841
/* We have only one byte following the exactn for the count. */
2842
|| *pending_exact == (1 << BYTEWIDTH) - 1
2844
/* If followed by a repetition operator. */
2845
|| *p == '*' || *p == '^'
2846
|| ((syntax & RE_BK_PLUS_QM)
2847
? *p == '\\' && (p[1] == '+' || p[1] == '?')
2848
: (*p == '+' || *p == '?'))
2849
|| ((syntax & RE_INTERVALS)
2850
&& ((syntax & RE_NO_BK_BRACES)
2852
: (p[0] == '\\' && p[1] == '{'))))
2854
/* Start building a new exactn. */
2858
BUF_PUSH_2 (exactn, 0);
2859
pending_exact = b - 1;
2866
} /* while p != pend */
2869
/* Through the pattern now. */
2872
STORE_JUMP (jump_past_alt, fixup_alt_jump, b);
2874
if (!COMPILE_STACK_EMPTY)
2875
FREE_STACK_RETURN (REG_EPAREN);
2877
/* If we don't want backtracking, force success
2878
the first time we reach the end of the compiled pattern. */
2879
if (syntax & RE_NO_POSIX_BACKTRACKING)
2882
free (compile_stack.stack);
2884
/* We have succeeded; set the length of the buffer. */
2885
bufp->used = b - bufp->buffer;
2890
DEBUG_PRINT1 ("\nCompiled pattern: \n");
2891
print_compiled_pattern (bufp);
2895
#ifndef MATCH_MAY_ALLOCATE
2896
/* Initialize the failure stack to the largest possible stack. This
2897
isn't necessary unless we're trying to avoid calling alloca in
2898
the search and match routines. */
2900
int num_regs = bufp->re_nsub + 1;
2902
/* Since DOUBLE_FAIL_STACK refuses to double only if the current size
2903
is strictly greater than re_max_failures, the largest possible stack
2904
is 2 * re_max_failures failure points. */
2905
if (fail_stack.size < (2 * re_max_failures * MAX_FAILURE_ITEMS))
2907
fail_stack.size = (2 * re_max_failures * MAX_FAILURE_ITEMS);
2910
if (! fail_stack.stack)
2912
= (fail_stack_elt_t *) xmalloc (fail_stack.size
2913
* sizeof (fail_stack_elt_t));
2916
= (fail_stack_elt_t *) xrealloc (fail_stack.stack,
2918
* sizeof (fail_stack_elt_t)));
2919
# else /* not emacs */
2920
if (! fail_stack.stack)
2922
= (fail_stack_elt_t *) malloc (fail_stack.size
2923
* sizeof (fail_stack_elt_t));
2926
= (fail_stack_elt_t *) realloc (fail_stack.stack,
2928
* sizeof (fail_stack_elt_t)));
2929
# endif /* not emacs */
2932
regex_grow_registers (num_regs);
2934
#endif /* not MATCH_MAY_ALLOCATE */
2937
} /* regex_compile */
2939
/* Subroutines for `regex_compile'. */
2941
/* Store OP at LOC followed by two-byte integer parameter ARG. */
2944
store_op1 (op, loc, arg)
2949
*loc = (unsigned char) op;
2950
STORE_NUMBER (loc + 1, arg);
2954
/* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */
2957
store_op2 (op, loc, arg1, arg2)
2962
*loc = (unsigned char) op;
2963
STORE_NUMBER (loc + 1, arg1);
2964
STORE_NUMBER (loc + 3, arg2);
2968
/* Copy the bytes from LOC to END to open up three bytes of space at LOC
2969
for OP followed by two-byte integer parameter ARG. */
2972
insert_op1 (op, loc, arg, end)
2978
register unsigned char *pfrom = end;
2979
register unsigned char *pto = end + 3;
2981
while (pfrom != loc)
2984
store_op1 (op, loc, arg);
2988
/* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */
2991
insert_op2 (op, loc, arg1, arg2, end)
2997
register unsigned char *pfrom = end;
2998
register unsigned char *pto = end + 5;
3000
while (pfrom != loc)
3003
store_op2 (op, loc, arg1, arg2);
3007
/* P points to just after a ^ in PATTERN. Return true if that ^ comes
3008
after an alternative or a begin-subexpression. We assume there is at
3009
least one character before the ^. */
3012
at_begline_loc_p (pattern, p, syntax)
3013
const char *pattern, *p;
3014
reg_syntax_t syntax;
3016
const char *prev = p - 2;
3017
boolean prev_prev_backslash = prev > pattern && prev[-1] == '\\';
3020
/* After a subexpression? */
3021
(*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash))
3022
/* After an alternative? */
3023
|| (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash));
3027
/* The dual of at_begline_loc_p. This one is for $. We assume there is
3028
at least one character after the $, i.e., `P < PEND'. */
3031
at_endline_loc_p (p, pend, syntax)
3032
const char *p, *pend;
3033
reg_syntax_t syntax;
3035
const char *next = p;
3036
boolean next_backslash = *next == '\\';
3037
const char *next_next = p + 1 < pend ? p + 1 : 0;
3040
/* Before a subexpression? */
3041
(syntax & RE_NO_BK_PARENS ? *next == ')'
3042
: next_backslash && next_next && *next_next == ')')
3043
/* Before an alternative? */
3044
|| (syntax & RE_NO_BK_VBAR ? *next == '|'
3045
: next_backslash && next_next && *next_next == '|');
3049
/* Returns true if REGNUM is in one of COMPILE_STACK's elements and
3050
false if it's not. */
3053
group_in_compile_stack (compile_stack, regnum)
3054
compile_stack_type compile_stack;
3059
for (this_element = compile_stack.avail - 1;
3062
if (compile_stack.stack[this_element].regnum == regnum)
3069
/* Read the ending character of a range (in a bracket expression) from the
3070
uncompiled pattern *P_PTR (which ends at PEND). We assume the
3071
starting character is in `P[-2]'. (`P[-1]' is the character `-'.)
3072
Then we set the translation of all bits between the starting and
3073
ending characters (inclusive) in the compiled pattern B.
3075
Return an error code.
3077
We use these short variable names so we can use the same macros as
3078
`regex_compile' itself. */
3080
static reg_errcode_t
3081
compile_range (p_ptr, pend, translate, syntax, b)
3082
const char **p_ptr, *pend;
3083
RE_TRANSLATE_TYPE translate;
3084
reg_syntax_t syntax;
3089
const char *p = *p_ptr;
3090
unsigned int range_start, range_end;
3095
/* Even though the pattern is a signed `char *', we need to fetch
3096
with unsigned char *'s; if the high bit of the pattern character
3097
is set, the range endpoints will be negative if we fetch using a
3100
We also want to fetch the endpoints without translating them; the
3101
appropriate translation is done in the bit-setting loop below. */
3102
/* The SVR4 compiler on the 3B2 had trouble with unsigned const char *. */
3103
range_start = ((const unsigned char *) p)[-2];
3104
range_end = ((const unsigned char *) p)[0];
3106
/* Have to increment the pointer into the pattern string, so the
3107
caller isn't still at the ending character. */
3110
/* If the start is after the end, the range is empty. */
3111
if (range_start > range_end)
3112
return syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR;
3114
/* Here we see why `this_char' has to be larger than an `unsigned
3115
char' -- the range is inclusive, so if `range_end' == 0xff
3116
(assuming 8-bit characters), we would otherwise go into an infinite
3117
loop, since all characters <= 0xff. */
3118
for (this_char = range_start; this_char <= range_end; this_char++)
3120
SET_LIST_BIT (TRANSLATE (this_char));
3126
/* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
3127
BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
3128
characters can start a string that matches the pattern. This fastmap
3129
is used by re_search to skip quickly over impossible starting points.
3131
The caller must supply the address of a (1 << BYTEWIDTH)-byte data
3132
area as BUFP->fastmap.
3134
We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
3137
Returns 0 if we succeed, -2 if an internal error. */
3140
re_compile_fastmap (bufp)
3141
struct re_pattern_buffer *bufp;
3144
#ifdef MATCH_MAY_ALLOCATE
3145
fail_stack_type fail_stack;
3147
#ifndef REGEX_MALLOC
3151
register char *fastmap = bufp->fastmap;
3152
unsigned char *pattern = bufp->buffer;
3153
unsigned char *p = pattern;
3154
register unsigned char *pend = pattern + bufp->used;
3157
/* This holds the pointer to the failure stack, when
3158
it is allocated relocatably. */
3159
fail_stack_elt_t *failure_stack_ptr;
3162
/* Assume that each path through the pattern can be null until
3163
proven otherwise. We set this false at the bottom of switch
3164
statement, to which we get only if a particular path doesn't
3165
match the empty string. */
3166
boolean path_can_be_null = true;
3168
/* We aren't doing a `succeed_n' to begin with. */
3169
boolean succeed_n_p = false;
3171
assert (fastmap != NULL && p != NULL);
3174
bzero (fastmap, 1 << BYTEWIDTH); /* Assume nothing's valid. */
3175
bufp->fastmap_accurate = 1; /* It will be when we're done. */
3176
bufp->can_be_null = 0;
3180
if (p == pend || *p == succeed)
3182
/* We have reached the (effective) end of pattern. */
3183
if (!FAIL_STACK_EMPTY ())
3185
bufp->can_be_null |= path_can_be_null;
3187
/* Reset for next path. */
3188
path_can_be_null = true;
3190
p = fail_stack.stack[--fail_stack.avail].pointer;
3198
/* We should never be about to go beyond the end of the pattern. */
3201
switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
3204
/* I guess the idea here is to simply not bother with a fastmap
3205
if a backreference is used, since it's too hard to figure out
3206
the fastmap for the corresponding group. Setting
3207
`can_be_null' stops `re_search_2' from using the fastmap, so
3208
that is all we do. */
3210
bufp->can_be_null = 1;
3214
/* Following are the cases which match a character. These end
3223
for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
3224
if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))
3230
/* Chars beyond end of map must be allowed. */
3231
for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++)
3234
for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
3235
if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))))
3241
for (j = 0; j < (1 << BYTEWIDTH); j++)
3242
if (SYNTAX (j) == Sword)
3248
for (j = 0; j < (1 << BYTEWIDTH); j++)
3249
if (SYNTAX (j) != Sword)
3256
int fastmap_newline = fastmap['\n'];
3258
/* `.' matches anything ... */
3259
for (j = 0; j < (1 << BYTEWIDTH); j++)
3262
/* ... except perhaps newline. */
3263
if (!(bufp->syntax & RE_DOT_NEWLINE))
3264
fastmap['\n'] = fastmap_newline;
3266
/* Return if we have already set `can_be_null'; if we have,
3267
then the fastmap is irrelevant. Something's wrong here. */
3268
else if (bufp->can_be_null)
3271
/* Otherwise, have to check alternative paths. */
3278
for (j = 0; j < (1 << BYTEWIDTH); j++)
3279
if (SYNTAX (j) == (enum syntaxcode) k)
3286
for (j = 0; j < (1 << BYTEWIDTH); j++)
3287
if (SYNTAX (j) != (enum syntaxcode) k)
3292
/* All cases after this match the empty string. These end with
3312
case push_dummy_failure:
3317
case pop_failure_jump:
3318
case maybe_pop_jump:
3321
case dummy_failure_jump:
3322
EXTRACT_NUMBER_AND_INCR (j, p);
3327
/* Jump backward implies we just went through the body of a
3328
loop and matched nothing. Opcode jumped to should be
3329
`on_failure_jump' or `succeed_n'. Just treat it like an
3330
ordinary jump. For a * loop, it has pushed its failure
3331
point already; if so, discard that as redundant. */
3332
if ((re_opcode_t) *p != on_failure_jump
3333
&& (re_opcode_t) *p != succeed_n)
3337
EXTRACT_NUMBER_AND_INCR (j, p);
3340
/* If what's on the stack is where we are now, pop it. */
3341
if (!FAIL_STACK_EMPTY ()
3342
&& fail_stack.stack[fail_stack.avail - 1].pointer == p)
3348
case on_failure_jump:
3349
case on_failure_keep_string_jump:
3350
handle_on_failure_jump:
3351
EXTRACT_NUMBER_AND_INCR (j, p);
3353
/* For some patterns, e.g., `(a?)?', `p+j' here points to the
3354
end of the pattern. We don't want to push such a point,
3355
since when we restore it above, entering the switch will
3356
increment `p' past the end of the pattern. We don't need
3357
to push such a point since we obviously won't find any more
3358
fastmap entries beyond `pend'. Such a pattern can match
3359
the null string, though. */
3362
if (!PUSH_PATTERN_OP (p + j, fail_stack))
3364
RESET_FAIL_STACK ();
3369
bufp->can_be_null = 1;
3373
EXTRACT_NUMBER_AND_INCR (k, p); /* Skip the n. */
3374
succeed_n_p = false;
3381
/* Get to the number of times to succeed. */
3384
/* Increment p past the n for when k != 0. */
3385
EXTRACT_NUMBER_AND_INCR (k, p);
3389
succeed_n_p = true; /* Spaghetti code alert. */
3390
goto handle_on_failure_jump;
3407
abort (); /* We have listed all the cases. */
3410
/* Getting here means we have found the possible starting
3411
characters for one path of the pattern -- and that the empty
3412
string does not match. We need not follow this path further.
3413
Instead, look at the next alternative (remembered on the
3414
stack), or quit if no more. The test at the top of the loop
3415
does these things. */
3416
path_can_be_null = false;
3420
/* Set `can_be_null' for the last path (also the first path, if the
3421
pattern is empty). */
3422
bufp->can_be_null |= path_can_be_null;
3425
RESET_FAIL_STACK ();
3427
} /* re_compile_fastmap */
3429
weak_alias (__re_compile_fastmap, re_compile_fastmap)
3432
/* Set REGS to hold NUM_REGS registers, storing them in STARTS and
3433
ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
3434
this memory for recording register information. STARTS and ENDS
3435
must be allocated using the malloc library routine, and must each
3436
be at least NUM_REGS * sizeof (regoff_t) bytes long.
3438
If NUM_REGS == 0, then subsequent matches should allocate their own
3441
Unless this function is called, the first search or match using
3442
PATTERN_BUFFER will allocate its own register data, without
3443
freeing the old data. */
3446
re_set_registers (bufp, regs, num_regs, starts, ends)
3447
struct re_pattern_buffer *bufp;
3448
struct re_registers *regs;
3450
regoff_t *starts, *ends;
3454
bufp->regs_allocated = REGS_REALLOCATE;
3455
regs->num_regs = num_regs;
3456
regs->start = starts;
3461
bufp->regs_allocated = REGS_UNALLOCATED;
3463
regs->start = regs->end = (regoff_t *) 0;
3467
weak_alias (__re_set_registers, re_set_registers)
3470
/* Searching routines. */
3472
/* Like re_search_2, below, but only one string is specified, and
3473
doesn't let you say where to stop matching. */
3476
re_search (bufp, string, size, startpos, range, regs)
3477
struct re_pattern_buffer *bufp;
3479
int size, startpos, range;
3480
struct re_registers *regs;
3482
return re_search_2 (bufp, NULL, 0, string, size, startpos, range,
3486
weak_alias (__re_search, re_search)
3490
/* Using the compiled pattern in BUFP->buffer, first tries to match the
3491
virtual concatenation of STRING1 and STRING2, starting first at index
3492
STARTPOS, then at STARTPOS + 1, and so on.
3494
STRING1 and STRING2 have length SIZE1 and SIZE2, respectively.
3496
RANGE is how far to scan while trying to match. RANGE = 0 means try
3497
only at STARTPOS; in general, the last start tried is STARTPOS +
3500
In REGS, return the indices of the virtual concatenation of STRING1
3501
and STRING2 that matched the entire BUFP->buffer and its contained
3504
Do not consider matching one past the index STOP in the virtual
3505
concatenation of STRING1 and STRING2.
3507
We return either the position in the strings at which the match was
3508
found, -1 if no match, or -2 if error (such as failure
3512
re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop)
3513
struct re_pattern_buffer *bufp;
3514
const char *string1, *string2;
3518
struct re_registers *regs;
3522
register char *fastmap = bufp->fastmap;
3523
register RE_TRANSLATE_TYPE translate = bufp->translate;
3524
int total_size = size1 + size2;
3525
int endpos = startpos + range;
3527
/* Check for out-of-range STARTPOS. */
3528
if (startpos < 0 || startpos > total_size)
3531
/* Fix up RANGE if it might eventually take us outside
3532
the virtual concatenation of STRING1 and STRING2.
3533
Make sure we won't move STARTPOS below 0 or above TOTAL_SIZE. */
3535
range = 0 - startpos;
3536
else if (endpos > total_size)
3537
range = total_size - startpos;
3539
/* If the search isn't to be a backwards one, don't waste time in a
3540
search for a pattern that must be anchored. */
3541
if (bufp->used > 0 && range > 0
3542
&& ((re_opcode_t) bufp->buffer[0] == begbuf
3543
/* `begline' is like `begbuf' if it cannot match at newlines. */
3544
|| ((re_opcode_t) bufp->buffer[0] == begline
3545
&& !bufp->newline_anchor)))
3554
/* In a forward search for something that starts with \=.
3555
don't keep searching past point. */
3556
if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == at_dot && range > 0)
3558
range = PT - startpos;
3564
/* Update the fastmap now if not correct already. */
3565
if (fastmap && !bufp->fastmap_accurate)
3566
if (re_compile_fastmap (bufp) == -2)
3569
/* Loop through the string, looking for a place to start matching. */
3572
/* If a fastmap is supplied, skip quickly over characters that
3573
cannot be the start of a match. If the pattern can match the
3574
null string, however, we don't need to skip characters; we want
3575
the first null string. */
3576
if (fastmap && startpos < total_size && !bufp->can_be_null)
3578
if (range > 0) /* Searching forwards. */
3580
register const char *d;
3581
register int lim = 0;
3584
if (startpos < size1 && startpos + range >= size1)
3585
lim = range - (size1 - startpos);
3587
d = (startpos >= size1 ? string2 - size1 : string1) + startpos;
3589
/* Written out as an if-else to avoid testing `translate'
3593
&& !fastmap[(unsigned char)
3594
translate[(unsigned char) *d++]])
3597
while (range > lim && !fastmap[(unsigned char) *d++])
3600
startpos += irange - range;
3602
else /* Searching backwards. */
3604
register char c = (size1 == 0 || startpos >= size1
3605
? string2[startpos - size1]
3606
: string1[startpos]);
3608
if (!fastmap[(unsigned char) TRANSLATE (c)])
3613
/* If can't match the null string, and that's all we have left, fail. */
3614
if (range >= 0 && startpos == total_size && fastmap
3615
&& !bufp->can_be_null)
3618
val = re_match_2_internal (bufp, string1, size1, string2, size2,
3619
startpos, regs, stop);
3620
#ifndef REGEX_MALLOC
3649
weak_alias (__re_search_2, re_search_2)
3652
/* This converts PTR, a pointer into one of the search strings `string1'
3653
and `string2' into an offset from the beginning of that string. */
3654
#define POINTER_TO_OFFSET(ptr) \
3655
(FIRST_STRING_P (ptr) \
3656
? ((regoff_t) ((ptr) - string1)) \
3657
: ((regoff_t) ((ptr) - string2 + size1)))
3659
/* Macros for dealing with the split strings in re_match_2. */
3661
#define MATCHING_IN_FIRST_STRING (dend == end_match_1)
3663
/* Call before fetching a character with *d. This switches over to
3664
string2 if necessary. */
3665
#define PREFETCH() \
3668
/* End of string2 => fail. */ \
3669
if (dend == end_match_2) \
3671
/* End of string1 => advance to string2. */ \
3673
dend = end_match_2; \
3677
/* Test if at very beginning or at very end of the virtual concatenation
3678
of `string1' and `string2'. If only one string, it's `string2'. */
3679
#define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2)
3680
#define AT_STRINGS_END(d) ((d) == end2)
3683
/* Test if D points to a character which is word-constituent. We have
3684
two special cases to check for: if past the end of string1, look at
3685
the first character in string2; and if before the beginning of
3686
string2, look at the last character in string1. */
3687
#define WORDCHAR_P(d) \
3688
(SYNTAX ((d) == end1 ? *string2 \
3689
: (d) == string2 - 1 ? *(end1 - 1) : *(d)) \
3692
/* Disabled due to a compiler bug -- see comment at case wordbound */
3694
/* Test if the character before D and the one at D differ with respect
3695
to being word-constituent. */
3696
#define AT_WORD_BOUNDARY(d) \
3697
(AT_STRINGS_BEG (d) || AT_STRINGS_END (d) \
3698
|| WORDCHAR_P (d - 1) != WORDCHAR_P (d))
3701
/* Free everything we malloc. */
3702
#ifdef MATCH_MAY_ALLOCATE
3703
# define FREE_VAR(var) if (var) REGEX_FREE (var); var = NULL
3704
# define FREE_VARIABLES() \
3706
REGEX_FREE_STACK (fail_stack.stack); \
3707
FREE_VAR (regstart); \
3708
FREE_VAR (regend); \
3709
FREE_VAR (old_regstart); \
3710
FREE_VAR (old_regend); \
3711
FREE_VAR (best_regstart); \
3712
FREE_VAR (best_regend); \
3713
FREE_VAR (reg_info); \
3714
FREE_VAR (reg_dummy); \
3715
FREE_VAR (reg_info_dummy); \
3718
# define FREE_VARIABLES() ((void)0) /* Do nothing! But inhibit gcc warning. */
3719
#endif /* not MATCH_MAY_ALLOCATE */
3721
/* These values must meet several constraints. They must not be valid
3722
register values; since we have a limit of 255 registers (because
3723
we use only one byte in the pattern for the register number), we can
3724
use numbers larger than 255. They must differ by 1, because of
3725
NUM_FAILURE_ITEMS above. And the value for the lowest register must
3726
be larger than the value for the highest register, so we do not try
3727
to actually save any registers when none are active. */
3728
#define NO_HIGHEST_ACTIVE_REG (1 << BYTEWIDTH)
3729
#define NO_LOWEST_ACTIVE_REG (NO_HIGHEST_ACTIVE_REG + 1)
3731
/* Matching routines. */
3733
#ifndef emacs /* Emacs never uses this. */
3734
/* re_match is like re_match_2 except it takes only a single string. */
3737
re_match (bufp, string, size, pos, regs)
3738
struct re_pattern_buffer *bufp;
3741
struct re_registers *regs;
3743
int result = re_match_2_internal (bufp, NULL, 0, string, size,
3745
# ifndef REGEX_MALLOC
3753
weak_alias (__re_match, re_match)
3755
#endif /* not emacs */
3757
static boolean group_match_null_string_p _RE_ARGS ((unsigned char **p,
3759
register_info_type *reg_info));
3760
static boolean alt_match_null_string_p _RE_ARGS ((unsigned char *p,
3762
register_info_type *reg_info));
3763
static boolean common_op_match_null_string_p _RE_ARGS ((unsigned char **p,
3765
register_info_type *reg_info));
3766
static int bcmp_translate _RE_ARGS ((const char *s1, const char *s2,
3767
int len, char *translate));
3769
/* re_match_2 matches the compiled pattern in BUFP against the
3770
the (virtual) concatenation of STRING1 and STRING2 (of length SIZE1
3771
and SIZE2, respectively). We start matching at POS, and stop
3774
If REGS is non-null and the `no_sub' field of BUFP is nonzero, we
3775
store offsets for the substring each group matched in REGS. See the
3776
documentation for exactly how many groups we fill.
3778
We return -1 if no match, -2 if an internal error (such as the
3779
failure stack overflowing). Otherwise, we return the length of the
3780
matched substring. */
3783
re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
3784
struct re_pattern_buffer *bufp;
3785
const char *string1, *string2;
3788
struct re_registers *regs;
3791
int result = re_match_2_internal (bufp, string1, size1, string2, size2,
3793
#ifndef REGEX_MALLOC
3801
weak_alias (__re_match_2, re_match_2)
3804
/* This is a separate function so that we can force an alloca cleanup
3807
re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
3808
struct re_pattern_buffer *bufp;
3809
const char *string1, *string2;
3812
struct re_registers *regs;
3815
/* General temporaries. */
3819
/* Just past the end of the corresponding string. */
3820
const char *end1, *end2;
3822
/* Pointers into string1 and string2, just past the last characters in
3823
each to consider matching. */
3824
const char *end_match_1, *end_match_2;
3826
/* Where we are in the data, and the end of the current string. */
3827
const char *d, *dend;
3829
/* Where we are in the pattern, and the end of the pattern. */
3830
unsigned char *p = bufp->buffer;
3831
register unsigned char *pend = p + bufp->used;
3833
/* Mark the opcode just after a start_memory, so we can test for an
3834
empty subpattern when we get to the stop_memory. */
3835
unsigned char *just_past_start_mem = 0;
3837
/* We use this to map every character in the string. */
3838
RE_TRANSLATE_TYPE translate = bufp->translate;
3840
/* Failure point stack. Each place that can handle a failure further
3841
down the line pushes a failure point on this stack. It consists of
3842
restart, regend, and reg_info for all registers corresponding to
3843
the subexpressions we're currently inside, plus the number of such
3844
registers, and, finally, two char *'s. The first char * is where
3845
to resume scanning the pattern; the second one is where to resume
3846
scanning the strings. If the latter is zero, the failure point is
3847
a ``dummy''; if a failure happens and the failure point is a dummy,
3848
it gets discarded and the next next one is tried. */
3849
#ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
3850
fail_stack_type fail_stack;
3853
static unsigned failure_id = 0;
3854
unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0;
3858
/* This holds the pointer to the failure stack, when
3859
it is allocated relocatably. */
3860
fail_stack_elt_t *failure_stack_ptr;
3863
/* We fill all the registers internally, independent of what we
3864
return, for use in backreferences. The number here includes
3865
an element for register zero. */
3866
size_t num_regs = bufp->re_nsub + 1;
3868
/* The currently active registers. */
3869
active_reg_t lowest_active_reg = NO_LOWEST_ACTIVE_REG;
3870
active_reg_t highest_active_reg = NO_HIGHEST_ACTIVE_REG;
3872
/* Information on the contents of registers. These are pointers into
3873
the input strings; they record just what was matched (on this
3874
attempt) by a subexpression part of the pattern, that is, the
3875
regnum-th regstart pointer points to where in the pattern we began
3876
matching and the regnum-th regend points to right after where we
3877
stopped matching the regnum-th subexpression. (The zeroth register
3878
keeps track of what the whole pattern matches.) */
3879
#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
3880
const char **regstart, **regend;
3883
/* If a group that's operated upon by a repetition operator fails to
3884
match anything, then the register for its start will need to be
3885
restored because it will have been set to wherever in the string we
3886
are when we last see its open-group operator. Similarly for a
3888
#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
3889
const char **old_regstart, **old_regend;
3892
/* The is_active field of reg_info helps us keep track of which (possibly
3893
nested) subexpressions we are currently in. The matched_something
3894
field of reg_info[reg_num] helps us tell whether or not we have
3895
matched any of the pattern so far this time through the reg_num-th
3896
subexpression. These two fields get reset each time through any
3897
loop their register is in. */
3898
#ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
3899
register_info_type *reg_info;
3902
/* The following record the register info as found in the above
3903
variables when we find a match better than any we've seen before.
3904
This happens as we backtrack through the failure points, which in
3905
turn happens only if we have not yet matched the entire string. */
3906
unsigned best_regs_set = false;
3907
#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
3908
const char **best_regstart, **best_regend;
3911
/* Logically, this is `best_regend[0]'. But we don't want to have to
3912
allocate space for that if we're not allocating space for anything
3913
else (see below). Also, we never need info about register 0 for
3914
any of the other register vectors, and it seems rather a kludge to
3915
treat `best_regend' differently than the rest. So we keep track of
3916
the end of the best match so far in a separate variable. We
3917
initialize this to NULL so that when we backtrack the first time
3918
and need to test it, it's not garbage. */
3919
const char *match_end = NULL;
3921
/* This helps SET_REGS_MATCHED avoid doing redundant work. */
3922
int set_regs_matched_done = 0;
3924
/* Used when we pop values we don't care about. */
3925
#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
3926
const char **reg_dummy;
3927
register_info_type *reg_info_dummy;
3931
/* Counts the total number of registers pushed. */
3932
unsigned num_regs_pushed = 0;
3935
DEBUG_PRINT1 ("\n\nEntering re_match_2.\n");
3939
#ifdef MATCH_MAY_ALLOCATE
3940
/* Do not bother to initialize all the register variables if there are
3941
no groups in the pattern, as it takes a fair amount of time. If
3942
there are groups, we include space for register 0 (the whole
3943
pattern), even though we never use it, since it simplifies the
3944
array indexing. We should fix this. */
3947
regstart = REGEX_TALLOC (num_regs, const char *);
3948
regend = REGEX_TALLOC (num_regs, const char *);
3949
old_regstart = REGEX_TALLOC (num_regs, const char *);
3950
old_regend = REGEX_TALLOC (num_regs, const char *);
3951
best_regstart = REGEX_TALLOC (num_regs, const char *);
3952
best_regend = REGEX_TALLOC (num_regs, const char *);
3953
reg_info = REGEX_TALLOC (num_regs, register_info_type);
3954
reg_dummy = REGEX_TALLOC (num_regs, const char *);
3955
reg_info_dummy = REGEX_TALLOC (num_regs, register_info_type);
3957
if (!(regstart && regend && old_regstart && old_regend && reg_info
3958
&& best_regstart && best_regend && reg_dummy && reg_info_dummy))
3966
/* We must initialize all our variables to NULL, so that
3967
`FREE_VARIABLES' doesn't try to free them. */
3968
regstart = regend = old_regstart = old_regend = best_regstart
3969
= best_regend = reg_dummy = NULL;
3970
reg_info = reg_info_dummy = (register_info_type *) NULL;
3972
#endif /* MATCH_MAY_ALLOCATE */
3974
/* The starting position is bogus. */
3975
if (pos < 0 || pos > size1 + size2)
3981
/* Initialize subexpression text positions to -1 to mark ones that no
3982
start_memory/stop_memory has been seen for. Also initialize the
3983
register information struct. */
3984
for (mcnt = 1; (unsigned) mcnt < num_regs; mcnt++)
3986
regstart[mcnt] = regend[mcnt]
3987
= old_regstart[mcnt] = old_regend[mcnt] = REG_UNSET_VALUE;
3989
REG_MATCH_NULL_STRING_P (reg_info[mcnt]) = MATCH_NULL_UNSET_VALUE;
3990
IS_ACTIVE (reg_info[mcnt]) = 0;
3991
MATCHED_SOMETHING (reg_info[mcnt]) = 0;
3992
EVER_MATCHED_SOMETHING (reg_info[mcnt]) = 0;
3995
/* We move `string1' into `string2' if the latter's empty -- but not if
3996
`string1' is null. */
3997
if (size2 == 0 && string1 != NULL)
4004
end1 = string1 + size1;
4005
end2 = string2 + size2;
4007
/* Compute where to stop matching, within the two strings. */
4010
end_match_1 = string1 + stop;
4011
end_match_2 = string2;
4016
end_match_2 = string2 + stop - size1;
4019
/* `p' scans through the pattern as `d' scans through the data.
4020
`dend' is the end of the input string that `d' points within. `d'
4021
is advanced into the following input string whenever necessary, but
4022
this happens before fetching; therefore, at the beginning of the
4023
loop, `d' can be pointing at the end of a string, but it cannot
4025
if (size1 > 0 && pos <= size1)
4032
d = string2 + pos - size1;
4036
DEBUG_PRINT1 ("The compiled pattern is:\n");
4037
DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend);
4038
DEBUG_PRINT1 ("The string to match is: `");
4039
DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2);
4040
DEBUG_PRINT1 ("'\n");
4042
/* This loops over pattern commands. It exits by returning from the
4043
function if the match is complete, or it drops through if the match
4044
fails at this starting point in the input data. */
4048
DEBUG_PRINT2 ("\n%p: ", p);
4050
DEBUG_PRINT2 ("\n0x%x: ", p);
4054
{ /* End of pattern means we might have succeeded. */
4055
DEBUG_PRINT1 ("end of pattern ... ");
4057
/* If we haven't matched the entire string, and we want the
4058
longest match, try backtracking. */
4059
if (d != end_match_2)
4061
/* 1 if this match ends in the same string (string1 or string2)
4062
as the best previous match. */
4063
boolean same_str_p = (FIRST_STRING_P (match_end)
4064
== MATCHING_IN_FIRST_STRING);
4065
/* 1 if this match is the best seen so far. */
4066
boolean best_match_p;
4068
/* AIX compiler got confused when this was combined
4069
with the previous declaration. */
4071
best_match_p = d > match_end;
4073
best_match_p = !MATCHING_IN_FIRST_STRING;
4075
DEBUG_PRINT1 ("backtracking.\n");
4077
if (!FAIL_STACK_EMPTY ())
4078
{ /* More failure points to try. */
4080
/* If exceeds best match so far, save it. */
4081
if (!best_regs_set || best_match_p)
4083
best_regs_set = true;
4086
DEBUG_PRINT1 ("\nSAVING match as best so far.\n");
4088
for (mcnt = 1; (unsigned) mcnt < num_regs; mcnt++)
4090
best_regstart[mcnt] = regstart[mcnt];
4091
best_regend[mcnt] = regend[mcnt];
4097
/* If no failure points, don't restore garbage. And if
4098
last match is real best match, don't restore second
4100
else if (best_regs_set && !best_match_p)
4103
/* Restore best match. It may happen that `dend ==
4104
end_match_1' while the restored d is in string2.
4105
For example, the pattern `x.*y.*z' against the
4106
strings `x-' and `y-z-', if the two strings are
4107
not consecutive in memory. */
4108
DEBUG_PRINT1 ("Restoring best registers.\n");
4111
dend = ((d >= string1 && d <= end1)
4112
? end_match_1 : end_match_2);
4114
for (mcnt = 1; (unsigned) mcnt < num_regs; mcnt++)
4116
regstart[mcnt] = best_regstart[mcnt];
4117
regend[mcnt] = best_regend[mcnt];
4120
} /* d != end_match_2 */
4123
DEBUG_PRINT1 ("Accepting match.\n");
4125
/* If caller wants register contents data back, do it. */
4126
if (regs && !bufp->no_sub)
4128
/* Have the register data arrays been allocated? */
4129
if (bufp->regs_allocated == REGS_UNALLOCATED)
4130
{ /* No. So allocate them with malloc. We need one
4131
extra element beyond `num_regs' for the `-1' marker
4133
regs->num_regs = MAX (RE_NREGS, num_regs + 1);
4134
regs->start = TALLOC (regs->num_regs, regoff_t);
4135
regs->end = TALLOC (regs->num_regs, regoff_t);
4136
if (regs->start == NULL || regs->end == NULL)
4141
bufp->regs_allocated = REGS_REALLOCATE;
4143
else if (bufp->regs_allocated == REGS_REALLOCATE)
4144
{ /* Yes. If we need more elements than were already
4145
allocated, reallocate them. If we need fewer, just
4147
if (regs->num_regs < num_regs + 1)
4149
regs->num_regs = num_regs + 1;
4150
RETALLOC (regs->start, regs->num_regs, regoff_t);
4151
RETALLOC (regs->end, regs->num_regs, regoff_t);
4152
if (regs->start == NULL || regs->end == NULL)
4161
/* These braces fend off a "empty body in an else-statement"
4162
warning under GCC when assert expands to nothing. */
4163
assert (bufp->regs_allocated == REGS_FIXED);
4166
/* Convert the pointer data in `regstart' and `regend' to
4167
indices. Register zero has to be set differently,
4168
since we haven't kept track of any info for it. */
4169
if (regs->num_regs > 0)
4171
regs->start[0] = pos;
4172
regs->end[0] = (MATCHING_IN_FIRST_STRING
4173
? ((regoff_t) (d - string1))
4174
: ((regoff_t) (d - string2 + size1)));
4177
/* Go through the first `min (num_regs, regs->num_regs)'
4178
registers, since that is all we initialized. */
4179
for (mcnt = 1; (unsigned) mcnt < MIN (num_regs, regs->num_regs);
4182
if (REG_UNSET (regstart[mcnt]) || REG_UNSET (regend[mcnt]))
4183
regs->start[mcnt] = regs->end[mcnt] = -1;
4187
= (regoff_t) POINTER_TO_OFFSET (regstart[mcnt]);
4189
= (regoff_t) POINTER_TO_OFFSET (regend[mcnt]);
4193
/* If the regs structure we return has more elements than
4194
were in the pattern, set the extra elements to -1. If
4195
we (re)allocated the registers, this is the case,
4196
because we always allocate enough to have at least one
4198
for (mcnt = num_regs; (unsigned) mcnt < regs->num_regs; mcnt++)
4199
regs->start[mcnt] = regs->end[mcnt] = -1;
4200
} /* regs && !bufp->no_sub */
4202
DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n",
4203
nfailure_points_pushed, nfailure_points_popped,
4204
nfailure_points_pushed - nfailure_points_popped);
4205
DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed);
4207
mcnt = d - pos - (MATCHING_IN_FIRST_STRING
4211
DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt);
4217
/* Otherwise match next pattern command. */
4218
switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
4220
/* Ignore these. Used to ignore the n of succeed_n's which
4221
currently have n == 0. */
4223
DEBUG_PRINT1 ("EXECUTING no_op.\n");
4227
DEBUG_PRINT1 ("EXECUTING succeed.\n");
4230
/* Match the next n pattern characters exactly. The following
4231
byte in the pattern defines n, and the n bytes after that
4232
are the characters to match. */
4235
DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt);
4237
/* This is written out as an if-else so we don't waste time
4238
testing `translate' inside the loop. */
4244
if ((unsigned char) translate[(unsigned char) *d++]
4245
!= (unsigned char) *p++)
4255
if (*d++ != (char) *p++) goto fail;
4259
SET_REGS_MATCHED ();
4263
/* Match any character except possibly a newline or a null. */
4265
DEBUG_PRINT1 ("EXECUTING anychar.\n");
4269
if ((!(bufp->syntax & RE_DOT_NEWLINE) && TRANSLATE (*d) == '\n')
4270
|| (bufp->syntax & RE_DOT_NOT_NULL && TRANSLATE (*d) == '\000'))
4273
SET_REGS_MATCHED ();
4274
DEBUG_PRINT2 (" Matched `%d'.\n", *d);
4282
register unsigned char c;
4283
boolean not = (re_opcode_t) *(p - 1) == charset_not;
4285
DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : "");
4288
c = TRANSLATE (*d); /* The character to match. */
4290
/* Cast to `unsigned' instead of `unsigned char' in case the
4291
bit list is a full 32 bytes long. */
4292
if (c < (unsigned) (*p * BYTEWIDTH)
4293
&& p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
4298
if (!not) goto fail;
4300
SET_REGS_MATCHED ();
4306
/* The beginning of a group is represented by start_memory.
4307
The arguments are the register number in the next byte, and the
4308
number of groups inner to this one in the next. The text
4309
matched within the group is recorded (in the internal
4310
registers data structure) under the register number. */
4312
DEBUG_PRINT3 ("EXECUTING start_memory %d (%d):\n", *p, p[1]);
4314
/* Find out if this group can match the empty string. */
4315
p1 = p; /* To send to group_match_null_string_p. */
4317
if (REG_MATCH_NULL_STRING_P (reg_info[*p]) == MATCH_NULL_UNSET_VALUE)
4318
REG_MATCH_NULL_STRING_P (reg_info[*p])
4319
= group_match_null_string_p (&p1, pend, reg_info);
4321
/* Save the position in the string where we were the last time
4322
we were at this open-group operator in case the group is
4323
operated upon by a repetition operator, e.g., with `(a*)*b'
4324
against `ab'; then we want to ignore where we are now in
4325
the string in case this attempt to match fails. */
4326
old_regstart[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p])
4327
? REG_UNSET (regstart[*p]) ? d : regstart[*p]
4329
DEBUG_PRINT2 (" old_regstart: %d\n",
4330
POINTER_TO_OFFSET (old_regstart[*p]));
4333
DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p]));
4335
IS_ACTIVE (reg_info[*p]) = 1;
4336
MATCHED_SOMETHING (reg_info[*p]) = 0;
4338
/* Clear this whenever we change the register activity status. */
4339
set_regs_matched_done = 0;
4341
/* This is the new highest active register. */
4342
highest_active_reg = *p;
4344
/* If nothing was active before, this is the new lowest active
4346
if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
4347
lowest_active_reg = *p;
4349
/* Move past the register number and inner group count. */
4351
just_past_start_mem = p;
4356
/* The stop_memory opcode represents the end of a group. Its
4357
arguments are the same as start_memory's: the register
4358
number, and the number of inner groups. */
4360
DEBUG_PRINT3 ("EXECUTING stop_memory %d (%d):\n", *p, p[1]);
4362
/* We need to save the string position the last time we were at
4363
this close-group operator in case the group is operated
4364
upon by a repetition operator, e.g., with `((a*)*(b*)*)*'
4365
against `aba'; then we want to ignore where we are now in
4366
the string in case this attempt to match fails. */
4367
old_regend[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p])
4368
? REG_UNSET (regend[*p]) ? d : regend[*p]
4370
DEBUG_PRINT2 (" old_regend: %d\n",
4371
POINTER_TO_OFFSET (old_regend[*p]));
4374
DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p]));
4376
/* This register isn't active anymore. */
4377
IS_ACTIVE (reg_info[*p]) = 0;
4379
/* Clear this whenever we change the register activity status. */
4380
set_regs_matched_done = 0;
4382
/* If this was the only register active, nothing is active
4384
if (lowest_active_reg == highest_active_reg)
4386
lowest_active_reg = NO_LOWEST_ACTIVE_REG;
4387
highest_active_reg = NO_HIGHEST_ACTIVE_REG;
4390
{ /* We must scan for the new highest active register, since
4391
it isn't necessarily one less than now: consider
4392
(a(b)c(d(e)f)g). When group 3 ends, after the f), the
4393
new highest active register is 1. */
4394
unsigned char r = *p - 1;
4395
while (r > 0 && !IS_ACTIVE (reg_info[r]))
4398
/* If we end up at register zero, that means that we saved
4399
the registers as the result of an `on_failure_jump', not
4400
a `start_memory', and we jumped to past the innermost
4401
`stop_memory'. For example, in ((.)*) we save
4402
registers 1 and 2 as a result of the *, but when we pop
4403
back to the second ), we are at the stop_memory 1.
4404
Thus, nothing is active. */
4407
lowest_active_reg = NO_LOWEST_ACTIVE_REG;
4408
highest_active_reg = NO_HIGHEST_ACTIVE_REG;
4411
highest_active_reg = r;
4414
/* If just failed to match something this time around with a
4415
group that's operated on by a repetition operator, try to
4416
force exit from the ``loop'', and restore the register
4417
information for this group that we had before trying this
4419
if ((!MATCHED_SOMETHING (reg_info[*p])
4420
|| just_past_start_mem == p - 1)
4423
boolean is_a_jump_n = false;
4427
switch ((re_opcode_t) *p1++)
4431
case pop_failure_jump:
4432
case maybe_pop_jump:
4434
case dummy_failure_jump:
4435
EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4445
/* If the next operation is a jump backwards in the pattern
4446
to an on_failure_jump right before the start_memory
4447
corresponding to this stop_memory, exit from the loop
4448
by forcing a failure after pushing on the stack the
4449
on_failure_jump's jump in the pattern, and d. */
4450
if (mcnt < 0 && (re_opcode_t) *p1 == on_failure_jump
4451
&& (re_opcode_t) p1[3] == start_memory && p1[4] == *p)
4453
/* If this group ever matched anything, then restore
4454
what its registers were before trying this last
4455
failed match, e.g., with `(a*)*b' against `ab' for
4456
regstart[1], and, e.g., with `((a*)*(b*)*)*'
4457
against `aba' for regend[3].
4459
Also restore the registers for inner groups for,
4460
e.g., `((a*)(b*))*' against `aba' (register 3 would
4461
otherwise get trashed). */
4463
if (EVER_MATCHED_SOMETHING (reg_info[*p]))
4467
EVER_MATCHED_SOMETHING (reg_info[*p]) = 0;
4469
/* Restore this and inner groups' (if any) registers. */
4470
for (r = *p; r < (unsigned) *p + (unsigned) *(p + 1);
4473
regstart[r] = old_regstart[r];
4475
/* xx why this test? */
4476
if (old_regend[r] >= regstart[r])
4477
regend[r] = old_regend[r];
4481
EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4482
PUSH_FAILURE_POINT (p1 + mcnt, d, -2);
4488
/* Move past the register number and the inner group count. */
4493
/* \<digit> has been turned into a `duplicate' command which is
4494
followed by the numeric value of <digit> as the register number. */
4497
register const char *d2, *dend2;
4498
int regno = *p++; /* Get which register to match against. */
4499
DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno);
4501
/* Can't back reference a group which we've never matched. */
4502
if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno]))
4505
/* Where in input to try to start matching. */
4506
d2 = regstart[regno];
4508
/* Where to stop matching; if both the place to start and
4509
the place to stop matching are in the same string, then
4510
set to the place to stop, otherwise, for now have to use
4511
the end of the first string. */
4513
dend2 = ((FIRST_STRING_P (regstart[regno])
4514
== FIRST_STRING_P (regend[regno]))
4515
? regend[regno] : end_match_1);
4518
/* If necessary, advance to next segment in register
4522
if (dend2 == end_match_2) break;
4523
if (dend2 == regend[regno]) break;
4525
/* End of string1 => advance to string2. */
4527
dend2 = regend[regno];
4529
/* At end of register contents => success */
4530
if (d2 == dend2) break;
4532
/* If necessary, advance to next segment in data. */
4535
/* How many characters left in this segment to match. */
4538
/* Want how many consecutive characters we can match in
4539
one shot, so, if necessary, adjust the count. */
4540
if (mcnt > dend2 - d2)
4543
/* Compare that many; failure if mismatch, else move
4546
? bcmp_translate (d, d2, mcnt, translate)
4547
: memcmp (d, d2, mcnt))
4549
d += mcnt, d2 += mcnt;
4551
/* Do this because we've match some characters. */
4552
SET_REGS_MATCHED ();
4558
/* begline matches the empty string at the beginning of the string
4559
(unless `not_bol' is set in `bufp'), and, if
4560
`newline_anchor' is set, after newlines. */
4562
DEBUG_PRINT1 ("EXECUTING begline.\n");
4564
if (AT_STRINGS_BEG (d))
4566
if (!bufp->not_bol) break;
4568
else if (d[-1] == '\n' && bufp->newline_anchor)
4572
/* In all other cases, we fail. */
4576
/* endline is the dual of begline. */
4578
DEBUG_PRINT1 ("EXECUTING endline.\n");
4580
if (AT_STRINGS_END (d))
4582
if (!bufp->not_eol) break;
4585
/* We have to ``prefetch'' the next character. */
4586
else if ((d == end1 ? *string2 : *d) == '\n'
4587
&& bufp->newline_anchor)
4594
/* Match at the very beginning of the data. */
4596
DEBUG_PRINT1 ("EXECUTING begbuf.\n");
4597
if (AT_STRINGS_BEG (d))
4602
/* Match at the very end of the data. */
4604
DEBUG_PRINT1 ("EXECUTING endbuf.\n");
4605
if (AT_STRINGS_END (d))
4610
/* on_failure_keep_string_jump is used to optimize `.*\n'. It
4611
pushes NULL as the value for the string on the stack. Then
4612
`pop_failure_point' will keep the current value for the
4613
string, instead of restoring it. To see why, consider
4614
matching `foo\nbar' against `.*\n'. The .* matches the foo;
4615
then the . fails against the \n. But the next thing we want
4616
to do is match the \n against the \n; if we restored the
4617
string value, we would be back at the foo.
4619
Because this is used only in specific cases, we don't need to
4620
check all the things that `on_failure_jump' does, to make
4621
sure the right things get saved on the stack. Hence we don't
4622
share its code. The only reason to push anything on the
4623
stack at all is that otherwise we would have to change
4624
`anychar's code to do something besides goto fail in this
4625
case; that seems worse than this. */
4626
case on_failure_keep_string_jump:
4627
DEBUG_PRINT1 ("EXECUTING on_failure_keep_string_jump");
4629
EXTRACT_NUMBER_AND_INCR (mcnt, p);
4631
DEBUG_PRINT3 (" %d (to %p):\n", mcnt, p + mcnt);
4633
DEBUG_PRINT3 (" %d (to 0x%x):\n", mcnt, p + mcnt);
4636
PUSH_FAILURE_POINT (p + mcnt, NULL, -2);
4640
/* Uses of on_failure_jump:
4642
Each alternative starts with an on_failure_jump that points
4643
to the beginning of the next alternative. Each alternative
4644
except the last ends with a jump that in effect jumps past
4645
the rest of the alternatives. (They really jump to the
4646
ending jump of the following alternative, because tensioning
4647
these jumps is a hassle.)
4649
Repeats start with an on_failure_jump that points past both
4650
the repetition text and either the following jump or
4651
pop_failure_jump back to this on_failure_jump. */
4652
case on_failure_jump:
4654
DEBUG_PRINT1 ("EXECUTING on_failure_jump");
4656
EXTRACT_NUMBER_AND_INCR (mcnt, p);
4658
DEBUG_PRINT3 (" %d (to %p)", mcnt, p + mcnt);
4660
DEBUG_PRINT3 (" %d (to 0x%x)", mcnt, p + mcnt);
4663
/* If this on_failure_jump comes right before a group (i.e.,
4664
the original * applied to a group), save the information
4665
for that group and all inner ones, so that if we fail back
4666
to this point, the group's information will be correct.
4667
For example, in \(a*\)*\1, we need the preceding group,
4668
and in \(zz\(a*\)b*\)\2, we need the inner group. */
4670
/* We can't use `p' to check ahead because we push
4671
a failure point to `p + mcnt' after we do this. */
4674
/* We need to skip no_op's before we look for the
4675
start_memory in case this on_failure_jump is happening as
4676
the result of a completed succeed_n, as in \(a\)\{1,3\}b\1
4678
while (p1 < pend && (re_opcode_t) *p1 == no_op)
4681
if (p1 < pend && (re_opcode_t) *p1 == start_memory)
4683
/* We have a new highest active register now. This will
4684
get reset at the start_memory we are about to get to,
4685
but we will have saved all the registers relevant to
4686
this repetition op, as described above. */
4687
highest_active_reg = *(p1 + 1) + *(p1 + 2);
4688
if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
4689
lowest_active_reg = *(p1 + 1);
4692
DEBUG_PRINT1 (":\n");
4693
PUSH_FAILURE_POINT (p + mcnt, d, -2);
4697
/* A smart repeat ends with `maybe_pop_jump'.
4698
We change it to either `pop_failure_jump' or `jump'. */
4699
case maybe_pop_jump:
4700
EXTRACT_NUMBER_AND_INCR (mcnt, p);
4701
DEBUG_PRINT2 ("EXECUTING maybe_pop_jump %d.\n", mcnt);
4703
register unsigned char *p2 = p;
4705
/* Compare the beginning of the repeat with what in the
4706
pattern follows its end. If we can establish that there
4707
is nothing that they would both match, i.e., that we
4708
would have to backtrack because of (as in, e.g., `a*a')
4709
then we can change to pop_failure_jump, because we'll
4710
never have to backtrack.
4712
This is not true in the case of alternatives: in
4713
`(a|ab)*' we do need to backtrack to the `ab' alternative
4714
(e.g., if the string was `ab'). But instead of trying to
4715
detect that here, the alternative has put on a dummy
4716
failure point which is what we will end up popping. */
4718
/* Skip over open/close-group commands.
4719
If what follows this loop is a ...+ construct,
4720
look at what begins its body, since we will have to
4721
match at least one of that. */
4725
&& ((re_opcode_t) *p2 == stop_memory
4726
|| (re_opcode_t) *p2 == start_memory))
4728
else if (p2 + 6 < pend
4729
&& (re_opcode_t) *p2 == dummy_failure_jump)
4736
/* p1[0] ... p1[2] are the `on_failure_jump' corresponding
4737
to the `maybe_finalize_jump' of this case. Examine what
4740
/* If we're at the end of the pattern, we can change. */
4743
/* Consider what happens when matching ":\(.*\)"
4744
against ":/". I don't really understand this code
4746
p[-3] = (unsigned char) pop_failure_jump;
4748
(" End of pattern: change to `pop_failure_jump'.\n");
4751
else if ((re_opcode_t) *p2 == exactn
4752
|| (bufp->newline_anchor && (re_opcode_t) *p2 == endline))
4754
register unsigned char c
4755
= *p2 == (unsigned char) endline ? '\n' : p2[2];
4757
if ((re_opcode_t) p1[3] == exactn && p1[5] != c)
4759
p[-3] = (unsigned char) pop_failure_jump;
4760
DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n",
4764
else if ((re_opcode_t) p1[3] == charset
4765
|| (re_opcode_t) p1[3] == charset_not)
4767
int not = (re_opcode_t) p1[3] == charset_not;
4769
if (c < (unsigned char) (p1[4] * BYTEWIDTH)
4770
&& p1[5 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
4773
/* `not' is equal to 1 if c would match, which means
4774
that we can't change to pop_failure_jump. */
4777
p[-3] = (unsigned char) pop_failure_jump;
4778
DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
4782
else if ((re_opcode_t) *p2 == charset)
4785
register unsigned char c
4786
= *p2 == (unsigned char) endline ? '\n' : p2[2];
4790
if ((re_opcode_t) p1[3] == exactn
4791
&& ! ((int) p2[1] * BYTEWIDTH > (int) p1[5]
4792
&& (p2[2 + p1[5] / BYTEWIDTH]
4793
& (1 << (p1[5] % BYTEWIDTH)))))
4795
if ((re_opcode_t) p1[3] == exactn
4796
&& ! ((int) p2[1] * BYTEWIDTH > (int) p1[4]
4797
&& (p2[2 + p1[4] / BYTEWIDTH]
4798
& (1 << (p1[4] % BYTEWIDTH)))))
4801
p[-3] = (unsigned char) pop_failure_jump;
4802
DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n",
4806
else if ((re_opcode_t) p1[3] == charset_not)
4809
/* We win if the charset_not inside the loop
4810
lists every character listed in the charset after. */
4811
for (idx = 0; idx < (int) p2[1]; idx++)
4812
if (! (p2[2 + idx] == 0
4813
|| (idx < (int) p1[4]
4814
&& ((p2[2 + idx] & ~ p1[5 + idx]) == 0))))
4819
p[-3] = (unsigned char) pop_failure_jump;
4820
DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
4823
else if ((re_opcode_t) p1[3] == charset)
4826
/* We win if the charset inside the loop
4827
has no overlap with the one after the loop. */
4829
idx < (int) p2[1] && idx < (int) p1[4];
4831
if ((p2[2 + idx] & p1[5 + idx]) != 0)
4834
if (idx == p2[1] || idx == p1[4])
4836
p[-3] = (unsigned char) pop_failure_jump;
4837
DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
4842
p -= 2; /* Point at relative address again. */
4843
if ((re_opcode_t) p[-1] != pop_failure_jump)
4845
p[-1] = (unsigned char) jump;
4846
DEBUG_PRINT1 (" Match => jump.\n");
4847
goto unconditional_jump;
4849
/* Note fall through. */
4852
/* The end of a simple repeat has a pop_failure_jump back to
4853
its matching on_failure_jump, where the latter will push a
4854
failure point. The pop_failure_jump takes off failure
4855
points put on by this pop_failure_jump's matching
4856
on_failure_jump; we got through the pattern to here from the
4857
matching on_failure_jump, so didn't fail. */
4858
case pop_failure_jump:
4860
/* We need to pass separate storage for the lowest and
4861
highest registers, even though we don't care about the
4862
actual values. Otherwise, we will restore only one
4863
register from the stack, since lowest will == highest in
4864
`pop_failure_point'. */
4865
active_reg_t dummy_low_reg, dummy_high_reg;
4866
unsigned char *pdummy;
4869
DEBUG_PRINT1 ("EXECUTING pop_failure_jump.\n");
4870
POP_FAILURE_POINT (sdummy, pdummy,
4871
dummy_low_reg, dummy_high_reg,
4872
reg_dummy, reg_dummy, reg_info_dummy);
4874
/* Note fall through. */
4878
DEBUG_PRINT2 ("\n%p: ", p);
4880
DEBUG_PRINT2 ("\n0x%x: ", p);
4882
/* Note fall through. */
4884
/* Unconditionally jump (without popping any failure points). */
4886
EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */
4887
DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt);
4888
p += mcnt; /* Do the jump. */
4890
DEBUG_PRINT2 ("(to %p).\n", p);
4892
DEBUG_PRINT2 ("(to 0x%x).\n", p);
4897
/* We need this opcode so we can detect where alternatives end
4898
in `group_match_null_string_p' et al. */
4900
DEBUG_PRINT1 ("EXECUTING jump_past_alt.\n");
4901
goto unconditional_jump;
4904
/* Normally, the on_failure_jump pushes a failure point, which
4905
then gets popped at pop_failure_jump. We will end up at
4906
pop_failure_jump, also, and with a pattern of, say, `a+', we
4907
are skipping over the on_failure_jump, so we have to push
4908
something meaningless for pop_failure_jump to pop. */
4909
case dummy_failure_jump:
4910
DEBUG_PRINT1 ("EXECUTING dummy_failure_jump.\n");
4911
/* It doesn't matter what we push for the string here. What
4912
the code at `fail' tests is the value for the pattern. */
4913
PUSH_FAILURE_POINT (NULL, NULL, -2);
4914
goto unconditional_jump;
4917
/* At the end of an alternative, we need to push a dummy failure
4918
point in case we are followed by a `pop_failure_jump', because
4919
we don't want the failure point for the alternative to be
4920
popped. For example, matching `(a|ab)*' against `aab'
4921
requires that we match the `ab' alternative. */
4922
case push_dummy_failure:
4923
DEBUG_PRINT1 ("EXECUTING push_dummy_failure.\n");
4924
/* See comments just above at `dummy_failure_jump' about the
4926
PUSH_FAILURE_POINT (NULL, NULL, -2);
4929
/* Have to succeed matching what follows at least n times.
4930
After that, handle like `on_failure_jump'. */
4932
EXTRACT_NUMBER (mcnt, p + 2);
4933
DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt);
4936
/* Originally, this is how many times we HAVE to succeed. */
4941
STORE_NUMBER_AND_INCR (p, mcnt);
4943
DEBUG_PRINT3 (" Setting %p to %d.\n", p - 2, mcnt);
4945
DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p - 2, mcnt);
4951
DEBUG_PRINT2 (" Setting two bytes from %p to no_op.\n", p+2);
4953
DEBUG_PRINT2 (" Setting two bytes from 0x%x to no_op.\n", p+2);
4955
p[2] = (unsigned char) no_op;
4956
p[3] = (unsigned char) no_op;
4962
EXTRACT_NUMBER (mcnt, p + 2);
4963
DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt);
4965
/* Originally, this is how many times we CAN jump. */
4969
STORE_NUMBER (p + 2, mcnt);
4971
DEBUG_PRINT3 (" Setting %p to %d.\n", p + 2, mcnt);
4973
DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p + 2, mcnt);
4975
goto unconditional_jump;
4977
/* If don't have to jump any more, skip over the rest of command. */
4984
DEBUG_PRINT1 ("EXECUTING set_number_at.\n");
4986
EXTRACT_NUMBER_AND_INCR (mcnt, p);
4988
EXTRACT_NUMBER_AND_INCR (mcnt, p);
4990
DEBUG_PRINT3 (" Setting %p to %d.\n", p1, mcnt);
4992
DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p1, mcnt);
4994
STORE_NUMBER (p1, mcnt);
4999
/* The DEC Alpha C compiler 3.x generates incorrect code for the
5000
test WORDCHAR_P (d - 1) != WORDCHAR_P (d) in the expansion of
5001
AT_WORD_BOUNDARY, so this code is disabled. Expanding the
5002
macro and introducing temporary variables works around the bug. */
5005
DEBUG_PRINT1 ("EXECUTING wordbound.\n");
5006
if (AT_WORD_BOUNDARY (d))
5011
DEBUG_PRINT1 ("EXECUTING notwordbound.\n");
5012
if (AT_WORD_BOUNDARY (d))
5018
boolean prevchar, thischar;
5020
DEBUG_PRINT1 ("EXECUTING wordbound.\n");
5021
if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d))
5024
prevchar = WORDCHAR_P (d - 1);
5025
thischar = WORDCHAR_P (d);
5026
if (prevchar != thischar)
5033
boolean prevchar, thischar;
5035
DEBUG_PRINT1 ("EXECUTING notwordbound.\n");
5036
if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d))
5039
prevchar = WORDCHAR_P (d - 1);
5040
thischar = WORDCHAR_P (d);
5041
if (prevchar != thischar)
5048
DEBUG_PRINT1 ("EXECUTING wordbeg.\n");
5049
if (WORDCHAR_P (d) && (AT_STRINGS_BEG (d) || !WORDCHAR_P (d - 1)))
5054
DEBUG_PRINT1 ("EXECUTING wordend.\n");
5055
if (!AT_STRINGS_BEG (d) && WORDCHAR_P (d - 1)
5056
&& (!WORDCHAR_P (d) || AT_STRINGS_END (d)))
5062
DEBUG_PRINT1 ("EXECUTING before_dot.\n");
5063
if (PTR_CHAR_POS ((unsigned char *) d) >= point)
5068
DEBUG_PRINT1 ("EXECUTING at_dot.\n");
5069
if (PTR_CHAR_POS ((unsigned char *) d) != point)
5074
DEBUG_PRINT1 ("EXECUTING after_dot.\n");
5075
if (PTR_CHAR_POS ((unsigned char *) d) <= point)
5080
DEBUG_PRINT2 ("EXECUTING syntaxspec %d.\n", mcnt);
5085
DEBUG_PRINT1 ("EXECUTING Emacs wordchar.\n");
5089
/* Can't use *d++ here; SYNTAX may be an unsafe macro. */
5091
if (SYNTAX (d[-1]) != (enum syntaxcode) mcnt)
5093
SET_REGS_MATCHED ();
5097
DEBUG_PRINT2 ("EXECUTING notsyntaxspec %d.\n", mcnt);
5099
goto matchnotsyntax;
5102
DEBUG_PRINT1 ("EXECUTING Emacs notwordchar.\n");
5106
/* Can't use *d++ here; SYNTAX may be an unsafe macro. */
5108
if (SYNTAX (d[-1]) == (enum syntaxcode) mcnt)
5110
SET_REGS_MATCHED ();
5113
#else /* not emacs */
5115
DEBUG_PRINT1 ("EXECUTING non-Emacs wordchar.\n");
5117
if (!WORDCHAR_P (d))
5119
SET_REGS_MATCHED ();
5124
DEBUG_PRINT1 ("EXECUTING non-Emacs notwordchar.\n");
5128
SET_REGS_MATCHED ();
5131
#endif /* not emacs */
5136
continue; /* Successfully executed one pattern command; keep going. */
5139
/* We goto here if a matching operation fails. */
5141
if (!FAIL_STACK_EMPTY ())
5142
{ /* A restart point is known. Restore to that state. */
5143
DEBUG_PRINT1 ("\nFAIL:\n");
5144
POP_FAILURE_POINT (d, p,
5145
lowest_active_reg, highest_active_reg,
5146
regstart, regend, reg_info);
5148
/* If this failure point is a dummy, try the next one. */
5152
/* If we failed to the end of the pattern, don't examine *p. */
5156
boolean is_a_jump_n = false;
5158
/* If failed to a backwards jump that's part of a repetition
5159
loop, need to pop this failure point and use the next one. */
5160
switch ((re_opcode_t) *p)
5164
case maybe_pop_jump:
5165
case pop_failure_jump:
5168
EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5171
if ((is_a_jump_n && (re_opcode_t) *p1 == succeed_n)
5173
&& (re_opcode_t) *p1 == on_failure_jump))
5181
if (d >= string1 && d <= end1)
5185
break; /* Matching at this starting point really fails. */
5189
goto restore_best_regs;
5193
return -1; /* Failure to match. */
5196
/* Subroutine definitions for re_match_2. */
5199
/* We are passed P pointing to a register number after a start_memory.
5201
Return true if the pattern up to the corresponding stop_memory can
5202
match the empty string, and false otherwise.
5204
If we find the matching stop_memory, sets P to point to one past its number.
5205
Otherwise, sets P to an undefined byte less than or equal to END.
5207
We don't handle duplicates properly (yet). */
5210
group_match_null_string_p (p, end, reg_info)
5211
unsigned char **p, *end;
5212
register_info_type *reg_info;
5215
/* Point to after the args to the start_memory. */
5216
unsigned char *p1 = *p + 2;
5220
/* Skip over opcodes that can match nothing, and return true or
5221
false, as appropriate, when we get to one that can't, or to the
5222
matching stop_memory. */
5224
switch ((re_opcode_t) *p1)
5226
/* Could be either a loop or a series of alternatives. */
5227
case on_failure_jump:
5229
EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5231
/* If the next operation is not a jump backwards in the
5236
/* Go through the on_failure_jumps of the alternatives,
5237
seeing if any of the alternatives cannot match nothing.
5238
The last alternative starts with only a jump,
5239
whereas the rest start with on_failure_jump and end
5240
with a jump, e.g., here is the pattern for `a|b|c':
5242
/on_failure_jump/0/6/exactn/1/a/jump_past_alt/0/6
5243
/on_failure_jump/0/6/exactn/1/b/jump_past_alt/0/3
5246
So, we have to first go through the first (n-1)
5247
alternatives and then deal with the last one separately. */
5250
/* Deal with the first (n-1) alternatives, which start
5251
with an on_failure_jump (see above) that jumps to right
5252
past a jump_past_alt. */
5254
while ((re_opcode_t) p1[mcnt-3] == jump_past_alt)
5256
/* `mcnt' holds how many bytes long the alternative
5257
is, including the ending `jump_past_alt' and
5260
if (!alt_match_null_string_p (p1, p1 + mcnt - 3,
5264
/* Move to right after this alternative, including the
5268
/* Break if it's the beginning of an n-th alternative
5269
that doesn't begin with an on_failure_jump. */
5270
if ((re_opcode_t) *p1 != on_failure_jump)
5273
/* Still have to check that it's not an n-th
5274
alternative that starts with an on_failure_jump. */
5276
EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5277
if ((re_opcode_t) p1[mcnt-3] != jump_past_alt)
5279
/* Get to the beginning of the n-th alternative. */
5285
/* Deal with the last alternative: go back and get number
5286
of the `jump_past_alt' just before it. `mcnt' contains
5287
the length of the alternative. */
5288
EXTRACT_NUMBER (mcnt, p1 - 2);
5290
if (!alt_match_null_string_p (p1, p1 + mcnt, reg_info))
5293
p1 += mcnt; /* Get past the n-th alternative. */
5299
assert (p1[1] == **p);
5305
if (!common_op_match_null_string_p (&p1, end, reg_info))
5308
} /* while p1 < end */
5311
} /* group_match_null_string_p */
5314
/* Similar to group_match_null_string_p, but doesn't deal with alternatives:
5315
It expects P to be the first byte of a single alternative and END one
5316
byte past the last. The alternative can contain groups. */
5319
alt_match_null_string_p (p, end, reg_info)
5320
unsigned char *p, *end;
5321
register_info_type *reg_info;
5324
unsigned char *p1 = p;
5328
/* Skip over opcodes that can match nothing, and break when we get
5329
to one that can't. */
5331
switch ((re_opcode_t) *p1)
5334
case on_failure_jump:
5336
EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5341
if (!common_op_match_null_string_p (&p1, end, reg_info))
5344
} /* while p1 < end */
5347
} /* alt_match_null_string_p */
5350
/* Deals with the ops common to group_match_null_string_p and
5351
alt_match_null_string_p.
5353
Sets P to one after the op and its arguments, if any. */
5356
common_op_match_null_string_p (p, end, reg_info)
5357
unsigned char **p, *end;
5358
register_info_type *reg_info;
5363
unsigned char *p1 = *p;
5365
switch ((re_opcode_t) *p1++)
5385
assert (reg_no > 0 && reg_no <= MAX_REGNUM);
5386
ret = group_match_null_string_p (&p1, end, reg_info);
5388
/* Have to set this here in case we're checking a group which
5389
contains a group and a back reference to it. */
5391
if (REG_MATCH_NULL_STRING_P (reg_info[reg_no]) == MATCH_NULL_UNSET_VALUE)
5392
REG_MATCH_NULL_STRING_P (reg_info[reg_no]) = ret;
5398
/* If this is an optimized succeed_n for zero times, make the jump. */
5400
EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5408
/* Get to the number of times to succeed. */
5410
EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5415
EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5423
if (!REG_MATCH_NULL_STRING_P (reg_info[*p1]))
5431
/* All other opcodes mean we cannot match the empty string. */
5437
} /* common_op_match_null_string_p */
5440
/* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN
5441
bytes; nonzero otherwise. */
5444
bcmp_translate (s1, s2, len, translate)
5445
const char *s1, *s2;
5447
RE_TRANSLATE_TYPE translate;
5449
register const unsigned char *p1 = (const unsigned char *) s1;
5450
register const unsigned char *p2 = (const unsigned char *) s2;
5453
if (translate[*p1++] != translate[*p2++]) return 1;
5459
/* Entry points for GNU code. */
5461
/* re_compile_pattern is the GNU regular expression compiler: it
5462
compiles PATTERN (of length SIZE) and puts the result in BUFP.
5463
Returns 0 if the pattern was valid, otherwise an error string.
5465
Assumes the `allocated' (and perhaps `buffer') and `translate' fields
5466
are set in BUFP on entry.
5468
We call regex_compile to do the actual compilation. */
5471
re_compile_pattern (pattern, length, bufp)
5472
const char *pattern;
5474
struct re_pattern_buffer *bufp;
5478
/* GNU code is written to assume at least RE_NREGS registers will be set
5479
(and at least one extra will be -1). */
5480
bufp->regs_allocated = REGS_UNALLOCATED;
5482
/* And GNU code determines whether or not to get register information
5483
by passing null for the REGS argument to re_match, etc., not by
5487
/* Match anchors at newline. */
5488
bufp->newline_anchor = 1;
5490
ret = regex_compile (pattern, length, re_syntax_options, bufp);
5494
return gettext (re_error_msgid[(int) ret]);
5497
weak_alias (__re_compile_pattern, re_compile_pattern)
5500
/* Entry points compatible with 4.2 BSD regex library. We don't define
5501
them unless specifically requested. */
5503
#if defined _REGEX_RE_COMP || defined _LIBC
5505
/* BSD has one and only one pattern buffer. */
5506
static struct re_pattern_buffer re_comp_buf;
5510
/* Make these definitions weak in libc, so POSIX programs can redefine
5511
these names if they don't use our functions, and still use
5512
regcomp/regexec below without link errors. */
5522
if (!re_comp_buf.buffer)
5523
return gettext ("No previous regular expression");
5527
if (!re_comp_buf.buffer)
5529
re_comp_buf.buffer = (unsigned char *) malloc (200);
5530
if (re_comp_buf.buffer == NULL)
5531
return (char *) gettext (re_error_msgid[(int) REG_ESPACE]);
5532
re_comp_buf.allocated = 200;
5534
re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH);
5535
if (re_comp_buf.fastmap == NULL)
5536
return (char *) gettext (re_error_msgid[(int) REG_ESPACE]);
5539
/* Since `re_exec' always passes NULL for the `regs' argument, we
5540
don't need to initialize the pattern buffer fields which affect it. */
5542
/* Match anchors at newlines. */
5543
re_comp_buf.newline_anchor = 1;
5545
ret = regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf);
5550
/* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
5551
return (char *) gettext (re_error_msgid[(int) ret]);
5562
const int len = strlen (s);
5564
0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0);
5567
#endif /* _REGEX_RE_COMP */
5569
/* POSIX.2 functions. Don't define these for Emacs. */
5573
/* regcomp takes a regular expression as a string and compiles it.
5575
PREG is a regex_t *. We do not expect any fields to be initialized,
5576
since POSIX says we shouldn't. Thus, we set
5578
`buffer' to the compiled pattern;
5579
`used' to the length of the compiled pattern;
5580
`syntax' to RE_SYNTAX_POSIX_EXTENDED if the
5581
REG_EXTENDED bit in CFLAGS is set; otherwise, to
5582
RE_SYNTAX_POSIX_BASIC;
5583
`newline_anchor' to REG_NEWLINE being set in CFLAGS;
5584
`fastmap' to an allocated space for the fastmap;
5585
`fastmap_accurate' to zero;
5586
`re_nsub' to the number of subexpressions in PATTERN.
5588
PATTERN is the address of the pattern string.
5590
CFLAGS is a series of bits which affect compilation.
5592
If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
5593
use POSIX basic syntax.
5595
If REG_NEWLINE is set, then . and [^...] don't match newline.
5596
Also, regexec will try a match beginning after every newline.
5598
If REG_ICASE is set, then we considers upper- and lowercase
5599
versions of letters to be equivalent when matching.
5601
If REG_NOSUB is set, then when PREG is passed to regexec, that
5602
routine will report only success or failure, and nothing about the
5605
It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
5606
the return codes and their meanings.) */
5609
regcomp (preg, pattern, cflags)
5611
const char *pattern;
5616
= (cflags & REG_EXTENDED) ?
5617
RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
5619
/* regex_compile will allocate the space for the compiled pattern. */
5621
preg->allocated = 0;
5624
/* Try to allocate space for the fastmap. */
5625
preg->fastmap = (char *) malloc (1 << BYTEWIDTH);
5627
if (cflags & REG_ICASE)
5632
= (RE_TRANSLATE_TYPE) malloc (CHAR_SET_SIZE
5633
* sizeof (*(RE_TRANSLATE_TYPE)0));
5634
if (preg->translate == NULL)
5635
return (int) REG_ESPACE;
5637
/* Map uppercase characters to corresponding lowercase ones. */
5638
for (i = 0; i < CHAR_SET_SIZE; i++)
5639
preg->translate[i] = ISUPPER (i) ? TOLOWER (i) : i;
5642
preg->translate = NULL;
5644
/* If REG_NEWLINE is set, newlines are treated differently. */
5645
if (cflags & REG_NEWLINE)
5646
{ /* REG_NEWLINE implies neither . nor [^...] match newline. */
5647
syntax &= ~RE_DOT_NEWLINE;
5648
syntax |= RE_HAT_LISTS_NOT_NEWLINE;
5649
/* It also changes the matching behavior. */
5650
preg->newline_anchor = 1;
5653
preg->newline_anchor = 0;
5655
preg->no_sub = !!(cflags & REG_NOSUB);
5657
/* POSIX says a null character in the pattern terminates it, so we
5658
can use strlen here in compiling the pattern. */
5659
ret = regex_compile (pattern, strlen (pattern), syntax, preg);
5661
/* POSIX doesn't distinguish between an unmatched open-group and an
5662
unmatched close-group: both are REG_EPAREN. */
5663
if (ret == REG_ERPAREN) ret = REG_EPAREN;
5665
if (ret == REG_NOERROR && preg->fastmap)
5667
/* Compute the fastmap now, since regexec cannot modify the pattern
5669
if (re_compile_fastmap (preg) == -2)
5671
/* Some error occured while computing the fastmap, just forget
5673
free (preg->fastmap);
5674
preg->fastmap = NULL;
5681
weak_alias (__regcomp, regcomp)
5685
/* regexec searches for a given pattern, specified by PREG, in the
5688
If NMATCH is zero or REG_NOSUB was set in the cflags argument to
5689
`regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
5690
least NMATCH elements, and we set them to the offsets of the
5691
corresponding matched substrings.
5693
EFLAGS specifies `execution flags' which affect matching: if
5694
REG_NOTBOL is set, then ^ does not match at the beginning of the
5695
string; if REG_NOTEOL is set, then $ does not match at the end.
5697
We return 0 if we find a match and REG_NOMATCH if not. */
5700
regexec (preg, string, nmatch, pmatch, eflags)
5701
const regex_t *preg;
5704
regmatch_t pmatch[];
5708
struct re_registers regs;
5709
regex_t private_preg;
5710
int len = strlen (string);
5711
boolean want_reg_info = !preg->no_sub && nmatch > 0;
5713
private_preg = *preg;
5715
private_preg.not_bol = !!(eflags & REG_NOTBOL);
5716
private_preg.not_eol = !!(eflags & REG_NOTEOL);
5718
/* The user has told us exactly how many registers to return
5719
information about, via `nmatch'. We have to pass that on to the
5720
matching routines. */
5721
private_preg.regs_allocated = REGS_FIXED;
5725
regs.num_regs = nmatch;
5726
regs.start = TALLOC (nmatch * 2, regoff_t);
5727
if (regs.start == NULL)
5728
return (int) REG_NOMATCH;
5729
regs.end = regs.start + nmatch;
5732
/* Perform the searching operation. */
5733
ret = re_search (&private_preg, string, len,
5734
/* start: */ 0, /* range: */ len,
5735
want_reg_info ? ®s : (struct re_registers *) 0);
5737
/* Copy the register information to the POSIX structure. */
5744
for (r = 0; r < nmatch; r++)
5746
pmatch[r].rm_so = regs.start[r];
5747
pmatch[r].rm_eo = regs.end[r];
5751
/* If we needed the temporary register info, free the space now. */
5755
/* We want zero return to mean success, unlike `re_search'. */
5756
return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH;
5759
weak_alias (__regexec, regexec)
5763
/* Returns a message corresponding to an error code, ERRCODE, returned
5764
from either regcomp or regexec. We don't use PREG here. */
5767
regerror (errcode, preg, errbuf, errbuf_size)
5769
const regex_t *preg;
5777
|| errcode >= (int) (sizeof (re_error_msgid)
5778
/ sizeof (re_error_msgid[0])))
5779
/* Only error codes returned by the rest of the code should be passed
5780
to this routine. If we are given anything else, or if other regex
5781
code generates an invalid error code, then the program has a bug.
5782
Dump core so we can fix it. */
5785
msg = gettext (re_error_msgid[errcode]);
5787
msg_size = strlen (msg) + 1; /* Includes the null. */
5789
if (errbuf_size != 0)
5791
if (msg_size > errbuf_size)
5793
#if defined HAVE_MEMPCPY || defined _LIBC
5794
*((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
5796
memcpy (errbuf, msg, errbuf_size - 1);
5797
errbuf[errbuf_size - 1] = 0;
5801
memcpy (errbuf, msg, msg_size);
5807
weak_alias (__regerror, regerror)
5811
/* Free dynamically allocated space used by PREG. */
5817
if (preg->buffer != NULL)
5818
free (preg->buffer);
5819
preg->buffer = NULL;
5821
preg->allocated = 0;
5824
if (preg->fastmap != NULL)
5825
free (preg->fastmap);
5826
preg->fastmap = NULL;
5827
preg->fastmap_accurate = 0;
5829
if (preg->translate != NULL)
5830
free (preg->translate);
5831
preg->translate = NULL;
5834
weak_alias (__regfree, regfree)
5837
#endif /* not emacs */