2
* regcomp and regexec -- regsub and regerror are elsewhere
4
* Copyright (c) 1986 by University of Toronto.
5
* Written by Henry Spencer. Not derived from licensed software.
7
* Permission is granted to anyone to use this software for any
8
* purpose on any computer system, and to redistribute it freely,
9
* subject to the following restrictions:
11
* 1. The author is not responsible for the consequences of use of
12
* this software, no matter how awful, even if they arise from defects in it.
15
* 2. The origin of this software must not be misrepresented, either
16
* by explicit claim or by omission.
18
* 3. Altered versions must be plainly marked as such, and must not
19
* be misrepresented as being the original software.
21
* Beware that some of this code is subtly aware of the way operator
22
* precedence is structured in regular expressions. Serious changes in
23
* regular-expression syntax might require a total rethink.
25
* *** NOTE: this code has been altered slightly for use in Tcl. ***
26
* Slightly modified by David MacKenzie to undo most of the changes for TCL.
27
* Added regexec2 with notbol parameter. -- 4/19/99 Mark Nudelman
43
* The "internal use only" fields in regexp.h are present to pass info from
44
* compile to execute that permits the execute phase to run lots faster on
45
* simple cases. They are:
47
* regstart char that must begin a match; '\0' if none obvious
48
* reganch is the match anchored (at beginning-of-line only)?
49
* regmust string (pointer into program) that match must include, or NULL
50
* regmlen length of regmust string
52
* Regstart and reganch permit very fast decisions on suitable starting points
53
* for a match, cutting down the work a lot. Regmust permits fast rejection
54
* of lines that cannot possibly match. The regmust tests are costly enough
55
* that regcomp() supplies a regmust only if the r.e. contains something
56
* potentially expensive (at present, the only such thing detected is * or +
57
* at the start of the r.e., which can involve a lot of backup). Regmlen is
58
* supplied because the test in regexec() needs it and regcomp() is
59
* computing it anyway.
63
* Structure for regexp "program". This is essentially a linear encoding
64
* of a nondeterministic finite-state machine (aka syntax charts or
65
* "railroad normal form" in parsing technology). Each node is an opcode
66
* plus a "next" pointer, possibly plus an operand. "Next" pointers of
67
* all nodes except BRANCH implement concatenation; a "next" pointer with
68
* a BRANCH on both ends of it is connecting two alternatives. (Here we
69
* have one of the subtle syntax dependencies: an individual BRANCH (as
70
* opposed to a collection of them) is never concatenated with anything
71
* because of operator precedence.) The operand of some types of node is
72
* a literal string; for others, it is a node leading into a sub-FSM. In
73
* particular, the operand of a BRANCH node is the first node of the branch.
74
* (NB this is *not* a tree structure: the tail of the branch connects
75
* to the thing following the set of BRANCHes.) The opcodes are:
78
/*
 * Opcode table.  Each entry gives the opcode name, its numeric value,
 * the kind of operand that follows the node (none, a literal string,
 * or a sub-node), and what the node matches.
 */
80
#define END 0 /* Operand: none. End of program. */
81
#define BOL 1 /* Operand: none. Match "" at beginning of line. */
82
#define EOL 2 /* Operand: none. Match "" at end of line. */
83
#define ANY 3 /* Operand: none. Match any one character. */
84
#define ANYOF 4 /* Operand: string. Match any character in this string. */
85
#define ANYBUT 5 /* Operand: string. Match any character not in this string. */
86
#define BRANCH 6 /* Operand: node. Match this alternative, or the next... */
87
#define BACK 7 /* Operand: none. Match "", "next" ptr points backward. */
88
#define EXACTLY 8 /* Operand: string. Match this string. */
89
#define NOTHING 9 /* Operand: none. Match empty string. */
90
#define STAR 10 /* Operand: node. Match this (simple) thing 0 or more times. */
91
#define PLUS 11 /* Operand: node. Match this (simple) thing 1 or more times. */
92
#define OPEN 20 /* Operand: none. Mark this point in input as start of #n (n = opcode - OPEN). */
93
/* OPEN+1 is number 1, etc. */
94
#define CLOSE 30 /* Operand: none. Analogous to OPEN (n = opcode - CLOSE). */
99
* BRANCH The set of branches constituting a single choice are hooked
100
* together with their "next" pointers, since precedence prevents
101
* anything being concatenated to any individual branch. The
102
* "next" pointer of the last BRANCH in a choice points to the
103
* thing following the whole choice. This is also where the
104
* final "next" pointer of each individual branch points; each
105
* branch starts with the operand node of a BRANCH node.
107
* BACK Normal "next" pointers all implicitly point forward; BACK
108
* exists to make loop structures possible.
110
* STAR,PLUS '?', and complex '*' and '+', are implemented as circular
111
* BRANCH structures using BACK. Simple cases (one character
112
* per match) are implemented with STAR and PLUS for speed
113
* and to minimize recursive plunges.
115
* OPEN,CLOSE ...are numbered at compile time.
119
* A node is one char of opcode followed by two chars of "next" pointer.
120
* "Next" pointers are stored as two 8-bit pieces, high order first. The
121
* value is a positive offset from the opcode of the node containing it.
122
* An operand, if any, simply follows the node. (Note that much of the
123
* code generation knows about this implicit relationship.)
125
* Using two bytes for the "next" pointer is vast overkill for most things,
126
* but allows patterns to get big without disasters.
129
/* Decode a node's "next" offset: byte 1 holds the high-order 8 bits, byte 2 the low-order 8 bits. */
#define NEXT(p) ((((p)[1] & 0377) << 8) + ((p)[2] & 0377))
130
/* Address of a node's operand: it starts right after the 3-byte header (opcode + two "next" bytes). */
#define OPERAND(p) (&(p)[3])
133
* See regmagic.h for one further detail of program structure.
138
* Utility definitions.
141
/* Fetch the character at p as a non-negative int by reading it through an unsigned char pointer. */
#define UCHARAT(p) ((int)((unsigned char *)(p))[0])
143
/*
 * Alternative UCHARAT: fetch the character at p and mask it with CHARBITS.
 * NOTE(review): CHARBITS is defined outside this view, and this second
 * definition is presumably selected by a preprocessor conditional that is
 * not visible here -- confirm.
 */
#define UCHARAT(p) ((int)*(p)&CHARBITS)
146
/*
 * FAIL - report message m through regerror() and return NULL from the
 * enclosing function.
 *
 * The body is wrapped in do { } while (0) so that the expansion is exactly
 * one statement: "if (x) FAIL(m); else ..." then parses as intended instead
 * of leaving a stray ';' between the block and the else.
 */
#define FAIL(m) do { regerror(m); return(NULL); } while (0)
147
/* Nonzero when c is one of the repetition operators '*', '+' or '?'. */
#define ISMULT(c) (!((c) != '*' && (c) != '+' && (c) != '?'))
148
/*
 * Characters that have special meaning in a regular expression.  regatom
 * passes this set to strcspn() to measure a run of ordinary characters.
 */
#define META "^$.[()|?+*\\"
151
* Flags to be passed up and down.
153
#define HASWIDTH 0x01 /* Set when the construct is known never to match the null string. */
154
#define SIMPLE 0x02 /* Set when the construct is simple enough to be a STAR/PLUS operand. */
155
#define SPSTART 0x04 /* Set when the construct starts with * or +. */
156
#define WORST 0x00 /* No flags set: the worst case. */
159
* Global work variables for regcomp().
161
static char *regparse; /* Input-scan pointer: current position in the pattern text. */
162
static int regnpar; /* () count. */
163
static char regdummy; /* Its address is the "do not emit code" marker for regcode. */
164
static char *regcode; /* Code-emit pointer; &regdummy = don't. */
165
static long regsize; /* Code size; checked against the size limit and used to size the allocation in regcomp. */
168
* The first byte of the regexp internal "program" is actually this magic
169
* number; the start node begins in the second byte.
175
* Forward declarations for regcomp()'s friends.
178
#define STATIC static /* Expands to the storage class used for the helper declarations below. */
181
STATIC char *regbranch(); /* One alternative of an | operator. */
182
STATIC char *regpiece(); /* Something followed by possible [*+?]. */
183
STATIC char *regatom(); /* The lowest level. */
184
STATIC char *regnode(); /* Emit a node. */
185
STATIC char *regnext(); /* Dig the "next" pointer out of a node. */
187
STATIC void reginsert(); /* Insert an operator in front of already-emitted operand. */
188
STATIC void regtail(); /* Set the next-pointer at the end of a node chain. */
189
STATIC void regoptail(); /* regtail on operand of first argument; nop if operandless. */
191
STATIC int strcspn(); /* Fallback for C libraries that do not have strcspn(). */
195
- regcomp - compile a regular expression into internal code
197
* We can't allocate space until we know how big the compiled form will be,
198
* but we can't compile it (and thus know how big it is) until we've got a
199
* place to put the code. So we cheat: we compile it twice, once with code
200
* generation turned off and size counting turned on, and once "for real".
201
* This also means that we don't allocate space until we are sure that the
202
* thing really will compile successfully, and we never have to move the
203
* code and thus invalidate pointers into it. (Note that it has to be in
204
* one piece because free() must be able to free it all.)
206
* Beware that the optimization-preparation code in here knows about some
207
* of the structure of the compiled regexp.
215
register char *longest;
220
FAIL("NULL argument");
222
/* First pass: determine size, legality. */
228
if (reg(0, &flags) == NULL)
231
/* Small enough for pointer-storage convention? */
232
if (regsize >= 32767L) /* Probably could be 65535L. */
233
FAIL("regexp too big");
235
/* Allocate space. */
236
r = (regexp *)malloc(sizeof(regexp) + (unsigned)regsize);
238
FAIL("out of space");
240
/* Second pass: emit code. */
243
regcode = r->program;
245
if (reg(0, &flags) == NULL)
248
/* Dig out information for optimizations. */
249
r->regstart = '\0'; /* Worst-case defaults. */
253
scan = r->program+1; /* First BRANCH. */
254
if (OP(regnext(scan)) == END) { /* Only one top-level choice. */
255
scan = OPERAND(scan);
257
/* Starting-point info. */
258
if (OP(scan) == EXACTLY)
259
r->regstart = *OPERAND(scan);
260
else if (OP(scan) == BOL)
264
* If there's something expensive in the r.e., find the
265
* longest literal string that must appear and make it the
266
* regmust. Resolve ties in favor of later strings, since
267
* the regstart check works with the beginning of the r.e.
268
* and avoiding duplication strengthens checking. Not a
269
* strong reason, but sufficient in the absence of others.
274
for (; scan != NULL; scan = regnext(scan))
275
if (OP(scan) == EXACTLY && ((int) strlen(OPERAND(scan))) >= len) {
276
longest = OPERAND(scan);
277
len = strlen(OPERAND(scan));
279
r->regmust = longest;
288
- reg - regular expression, i.e. main body or parenthesized thing
290
* Caller must absorb opening parenthesis.
292
* Combining parenthesis handling with the base level of regular expression
293
* is a trifle forced, but the need to tie the tails of the branches to what
294
* follows makes it hard to avoid.
298
int paren; /* Parenthesized? */
303
register char *ender;
304
register int parno = 0;
307
*flagp = HASWIDTH; /* Tentatively. */
309
/* Make an OPEN node, if parenthesized. */
311
if (regnpar >= NSUBEXP)
315
ret = regnode(OPEN+parno);
319
/* Pick up the branches, linking them together. */
320
br = regbranch(&flags);
324
regtail(ret, br); /* OPEN -> first. */
327
if (!(flags&HASWIDTH))
329
*flagp |= flags&SPSTART;
330
while (*regparse == '|') {
332
br = regbranch(&flags);
335
regtail(ret, br); /* BRANCH -> BRANCH. */
336
if (!(flags&HASWIDTH))
338
*flagp |= flags&SPSTART;
341
/* Make a closing node, and hook it on the end. */
342
ender = regnode((paren) ? CLOSE+parno : END);
345
/* Hook the tails of the branches to the closing node. */
346
for (br = ret; br != NULL; br = regnext(br))
347
regoptail(br, ender);
349
/* Check for proper termination. */
350
if (paren && *regparse++ != ')') {
351
FAIL("unmatched ()");
352
} else if (!paren && *regparse != '\0') {
353
if (*regparse == ')') {
354
FAIL("unmatched ()");
356
FAIL("junk on end"); /* "Can't happen". */
364
- regbranch - one alternative of an | operator
366
* Implements the concatenation operator.
373
register char *chain;
374
register char *latest;
377
*flagp = WORST; /* Tentatively. */
379
ret = regnode(BRANCH);
381
while (*regparse != '\0' && *regparse != '|' && *regparse != ')') {
382
latest = regpiece(&flags);
385
*flagp |= flags&HASWIDTH;
386
if (chain == NULL) /* First piece. */
387
*flagp |= flags&SPSTART;
389
regtail(chain, latest);
392
if (chain == NULL) /* Loop ran zero times. */
393
(void) regnode(NOTHING);
399
- regpiece - something followed by possible [*+?]
401
* Note that the branching code sequences used for ? and the general cases
402
* of * and + are somewhat optimized: they use the same NOTHING node as
403
* both the endmarker for their branch list and the body of the last branch.
404
* It might seem that this node could be dispensed with entirely, but the
405
* endmarker role is not redundant.
416
ret = regatom(&flags);
426
if (!(flags&HASWIDTH) && op != '?')
427
FAIL("*+ operand could be empty");
428
*flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
430
if (op == '*' && (flags&SIMPLE))
431
reginsert(STAR, ret);
432
else if (op == '*') {
433
/* Emit x* as (x&|), where & means "self". */
434
reginsert(BRANCH, ret); /* Either x */
435
regoptail(ret, regnode(BACK)); /* and loop */
436
regoptail(ret, ret); /* back */
437
regtail(ret, regnode(BRANCH)); /* or */
438
regtail(ret, regnode(NOTHING)); /* null. */
439
} else if (op == '+' && (flags&SIMPLE))
440
reginsert(PLUS, ret);
441
else if (op == '+') {
442
/* Emit x+ as x(&|), where & means "self". */
443
next = regnode(BRANCH); /* Either */
445
regtail(regnode(BACK), ret); /* loop back */
446
regtail(next, regnode(BRANCH)); /* or */
447
regtail(ret, regnode(NOTHING)); /* null. */
448
} else if (op == '?') {
449
/* Emit x? as (x|) */
450
reginsert(BRANCH, ret); /* Either x */
451
regtail(ret, regnode(BRANCH)); /* or */
452
next = regnode(NOTHING); /* null. */
454
regoptail(ret, next);
457
if (ISMULT(*regparse))
464
- regatom - the lowest level
466
* Optimization: gobbles an entire sequence of ordinary characters so that
467
* it can turn them into a single node, which is smaller to store and
468
* faster to run. Backslashed characters are exceptions, each becoming a
469
* separate node; the code is simpler that way and it's not worth fixing.
478
*flagp = WORST; /* Tentatively. */
480
switch (*regparse++) {
489
*flagp |= HASWIDTH|SIMPLE;
493
register int classend;
495
if (*regparse == '^') { /* Complement of range. */
496
ret = regnode(ANYBUT);
499
ret = regnode(ANYOF);
500
if (*regparse == ']' || *regparse == '-')
502
while (*regparse != '\0' && *regparse != ']') {
503
if (*regparse == '-') {
505
if (*regparse == ']' || *regparse == '\0')
508
clss = UCHARAT(regparse-2)+1;
509
classend = UCHARAT(regparse);
510
if (clss > classend+1)
511
FAIL("invalid [] range");
512
for (; clss <= classend; clss++)
520
if (*regparse != ']')
521
FAIL("unmatched []");
523
*flagp |= HASWIDTH|SIMPLE;
527
ret = reg(1, &flags);
530
*flagp |= flags&(HASWIDTH|SPSTART);
535
FAIL("internal urp"); /* Supposed to be caught earlier. */
541
FAIL("?+* follows nothing");
545
if (*regparse == '\0')
547
ret = regnode(EXACTLY);
550
*flagp |= HASWIDTH|SIMPLE;
557
len = strcspn(regparse, META);
559
FAIL("internal disaster");
560
ender = *(regparse+len);
561
if (len > 1 && ISMULT(ender))
562
len--; /* Back off clear of ?+* operand. */
566
ret = regnode(EXACTLY);
580
- regnode - emit a node
582
static char * /* Location. */
590
if (ret == &regdummy) {
597
*ptr++ = '\0'; /* Null "next" pointer. */
605
- regc - emit (if appropriate) a byte of code
611
if (regcode != &regdummy)
618
- reginsert - insert an operator in front of already-emitted operand
620
* Means relocating the operand.
629
register char *place;
631
if (regcode == &regdummy) {
642
place = opnd; /* Op node, where operand used to be. */
649
- regtail - set the next-pointer at the end of a node chain
663
/* Find last node. */
666
temp = regnext(scan);
672
if (OP(scan) == BACK)
676
*(scan+1) = (offset>>8)&0377;
677
*(scan+2) = offset&0377;
681
- regoptail - regtail on operand of first argument; nop if operandless
688
/* "Operandless" and "op != BRANCH" are synonymous in practice. */
689
if (p == NULL || p == &regdummy || OP(p) != BRANCH)
691
regtail(OPERAND(p), val);
695
* regexec and friends
699
* Global work variables for regexec().
701
static char *reginput; /* String-input pointer: current position in the string being matched. */
702
static char *regbol; /* Beginning of input, for ^ check. */
703
static char **regstartp; /* Pointer to startp array (set from prog->startp in regtry). */
704
static char **regendp; /* Ditto for endp (set from prog->endp in regtry). */
710
STATIC int regmatch(); /* Main matching routine: 0 failure, 1 success. */
711
STATIC int regrepeat(); /* Repeatedly match something simple, report how many. */
716
STATIC char *regprop(); /* Printable representation of opcode. */
720
- regexec - match a regexp against a string
723
regexec2(prog, string, notbol)
724
register regexp *prog;
725
register char *string;
731
if (prog == NULL || string == NULL) {
732
regerror("NULL parameter");
736
/* Check validity of program. */
737
if (UCHARAT(prog->program) != MAGIC) {
738
regerror("corrupted program");
742
/* If there is a "must appear" string, look for it. */
743
if (prog->regmust != NULL) {
745
while ((s = strchr(s, prog->regmust[0])) != NULL) {
746
if (strncmp(s, prog->regmust, prog->regmlen) == 0)
747
break; /* Found it. */
750
if (s == NULL) /* Not present. */
754
/* Mark beginning of line for ^ . */
760
/* Simplest case: anchored match need be tried only once. */
762
return(regtry(prog, string));
764
/* Messy cases: unanchored match. */
766
if (prog->regstart != '\0')
767
/* We know what char it must start with. */
768
while ((s = strchr(s, prog->regstart)) != NULL) {
774
/* We don't -- general case. */
778
} while (*s++ != '\0');
785
regexec(prog, string)
786
register regexp *prog;
787
register char *string;
789
return regexec2(prog, string, 0);
793
- regtry - try match at specific point
795
static int /* 0 failure, 1 success */
805
regstartp = prog->startp;
806
regendp = prog->endp;
810
for (i = NSUBEXP; i > 0; i--) {
814
if (regmatch(prog->program + 1)) {
815
prog->startp[0] = string;
816
prog->endp[0] = reginput;
823
- regmatch - main matching routine
825
* Conceptually the strategy is simple: check to see whether the current
826
* node matches, call self recursively to see whether the rest matches,
827
* and then act accordingly. In practice we make some effort to avoid
828
* recursion, in particular by going through "ordinary" nodes (that don't
829
* need to know whether the rest of the match failed) by a loop instead of by recursion.
832
static int /* 0 failure, 1 success */
836
register char *scan; /* Current node. */
837
char *next; /* Next node. */
841
if (scan != NULL && regnarrate)
842
fprintf(stderr, "%s(\n", regprop(scan));
844
while (scan != NULL) {
847
fprintf(stderr, "%s...\n", regprop(scan));
849
next = regnext(scan);
853
if (reginput != regbol)
857
if (*reginput != '\0')
861
if (*reginput == '\0')
869
opnd = OPERAND(scan);
870
/* Inline the first character, for speed. */
871
if (*opnd != *reginput)
874
if (len > 1 && strncmp(opnd, reginput, len) != 0)
880
if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == NULL)
885
if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != NULL)
905
no = OP(scan) - OPEN;
908
if (regmatch(next)) {
910
* Don't set startp if some later
911
* invocation of the same parentheses already has.
914
if (regstartp[no] == NULL)
915
regstartp[no] = save;
934
no = OP(scan) - CLOSE;
937
if (regmatch(next)) {
939
* Don't set endp if some later
940
* invocation of the same parentheses already has.
943
if (regendp[no] == NULL)
954
if (OP(next) != BRANCH) /* No choice. */
955
next = OPERAND(scan); /* Avoid recursion. */
959
if (regmatch(OPERAND(scan)))
962
scan = regnext(scan);
963
} while (scan != NULL && OP(scan) == BRANCH);
972
register char nextch;
978
* Lookahead to avoid useless match attempts
979
* when we know what character comes next.
982
if (OP(next) == EXACTLY)
983
nextch = *OPERAND(next);
984
min = (OP(scan) == STAR) ? 0 : 1;
986
no = regrepeat(OPERAND(scan));
988
/* If it could work, try it. */
989
if (nextch == '\0' || *reginput == nextch)
992
/* Couldn't or didn't -- back up. */
994
reginput = save + no;
1001
return(1); /* Success! */
1005
regerror("memory corruption");
1015
* We get here only if there's trouble -- normally "case END" is
1016
* the terminating point.
1018
regerror("corrupted pointers");
1023
- regrepeat - repeatedly match something simple, report how many
1029
register int count = 0;
1030
register char *scan;
1031
register char *opnd;
1037
count = strlen(scan);
1041
while (*opnd == *scan) {
1047
while (*scan != '\0' && strchr(opnd, *scan) != NULL) {
1053
while (*scan != '\0' && strchr(opnd, *scan) == NULL) {
1058
default: /* Oh dear. Called inappropriately. */
1059
regerror("internal foulup");
1060
count = 0; /* Best compromise. */
1069
- regnext - dig the "next" pointer out of a node
1075
register int offset;
1092
STATIC char *regprop(); /* Printable representation of opcode; called by regdump below. */
1095
- regdump - dump a regexp onto stdout in vaguely comprehensible form
1102
register char op = EXACTLY; /* Arbitrary non-END op. */
1103
register char *next;
1107
while (op != END) { /* While that wasn't END last time... */
1109
printf("%2d%s", s-r->program, regprop(s)); /* Where, what. */
1111
if (next == NULL) /* Next ptr. */
1114
printf("(%d)", (s-r->program)+(next-s));
1116
if (op == ANYOF || op == ANYBUT || op == EXACTLY) {
1117
/* Literal string, where present. */
1118
while (*s != '\0') {
1127
/* Header fields of interest. */
1128
if (r->regstart != '\0')
1129
printf("start `%c' ", r->regstart);
1131
printf("anchored ");
1132
if (r->regmust != NULL)
1133
printf("must have \"%s\"", r->regmust);
1138
- regprop - printable representation of opcode
1145
static char buf[50];
1147
(void) strcpy(buf, ":");
1189
sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN);
1201
sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE);
1211
regerror("corrupted opcode");
1215
(void) strcat(buf, p);
1221
* The following is provided for those people who do not have strcspn() in
1222
* their C libraries. They should get off their butts and do something
1223
* about it; at least one public-domain implementation of those (highly
1224
* useful) string routines has been published on Usenet.
1228
* strcspn - find length of initial segment of s1 consisting entirely
1229
* of characters not from s2
1237
register char *scan1;
1238
register char *scan2;
1242
for (scan1 = s1; *scan1 != '\0'; scan1++) {
1243
for (scan2 = s2; *scan2 != '\0';) /* ++ moved down. */
1244
if (*scan1 == *scan2++)