2
* This code is derived from OpenBSD's libc/regex, original license follows:
4
* Copyright (c) 1992, 1993, 1994 Henry Spencer.
5
* Copyright (c) 1992, 1993, 1994
6
* The Regents of the University of California. All rights reserved.
8
* This code is derived from software contributed to Berkeley by
11
* Redistribution and use in source and binary forms, with or without
12
* modification, are permitted provided that the following conditions
14
* 1. Redistributions of source code must retain the above copyright
15
* notice, this list of conditions and the following disclaimer.
16
* 2. Redistributions in binary form must reproduce the above copyright
17
* notice, this list of conditions and the following disclaimer in the
18
* documentation and/or other materials provided with the distribution.
19
* 3. Neither the name of the University nor the names of its contributors
20
* may be used to endorse or promote products derived from this software
21
* without specific prior written permission.
23
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35
* @(#)engine.c 8.5 (Berkeley) 3/20/94
39
* The matching engine and friends. This file is #included by regexec.c
40
* after suitable #defines of a variety of macros used herein, so that
41
* different state representations can be used without duplicating masses
46
#define matcher smatcher
49
#define dissect sdissect
50
#define backref sbackref
58
#define matcher lmatcher
61
#define dissect ldissect
62
#define backref lbackref
70
/* another structure passed up and down to avoid zillions of parameters */
74
llvm_regmatch_t *pmatch; /* [nsub+1] (0 element unused) */
75
char *offp; /* offsets work from here */
76
char *beginp; /* start of string -- virtual NUL precedes */
77
char *endp; /* end of string -- virtual NUL here */
78
char *coldp; /* can be no match starting before here */
79
char **lastpos; /* [nplus+1] */
81
states st; /* current states */
82
states fresh; /* states for a fresh start */
83
states tmp; /* temporary */
84
states empty; /* empty set of states */
87
static int matcher(struct re_guts *, char *, size_t, llvm_regmatch_t[], int);
88
static char *dissect(struct match *, char *, char *, sopno, sopno);
89
static char *backref(struct match *, char *, char *, sopno, sopno, sopno, int);
90
static char *fast(struct match *, char *, char *, sopno, sopno);
91
static char *slow(struct match *, char *, char *, sopno, sopno);
92
static states step(struct re_guts *, sopno, sopno, states, int, states);
93
#define MAX_RECURSION 100
96
#define BOLEOL (BOL+2)
97
#define NOTHING (BOL+3)
100
#define CODEMAX (BOL+5) /* highest code used */
101
#define NONCHAR(c) ((c) > CHAR_MAX)
102
#define NNONCHAR (CODEMAX-CHAR_MAX)
104
static void print(struct match *, char *, states, int, FILE *);
107
static void at(struct match *, char *, char *, char *, sopno, sopno);
110
static char *pchar(int);
114
#define SP(t, s, c) print(m, t, s, c, stdout)
115
#define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2)
116
#define NOTE(str) { if (m->eflags®_TRACE) (void)printf("=%s\n", (str)); }
119
#define SP(t, s, c) /* nothing */
120
#define AT(t, p1, p2, s1, s2) /* nothing */
121
#define NOTE(s) /* nothing */
125
- matcher - the actual matching engine
127
static int /* 0 success, REG_NOMATCH failure */
128
matcher(struct re_guts *g, char *string, size_t nmatch, llvm_regmatch_t pmatch[],
134
struct match *m = &mv;
136
const sopno gf = g->firststate+1; /* +1 for OEND */
137
const sopno gl = g->laststate;
141
/* simplify the situation where possible */
142
if (g->cflags®_NOSUB)
144
if (eflags®_STARTEND) {
145
start = string + pmatch[0].rm_so;
146
stop = string + pmatch[0].rm_eo;
149
stop = start + strlen(start);
154
/* prescreening; this does wonders for this rather slow code */
155
if (g->must != NULL) {
156
for (dp = start; dp < stop; dp++)
157
if (*dp == g->must[0] && stop - dp >= g->mlen &&
158
memcmp(dp, g->must, (size_t)g->mlen) == 0)
160
if (dp == stop) /* we didn't find g->must */
164
/* match struct setup */
179
/* this loop does only one repetition except for backrefs */
181
endp = fast(m, start, stop, gf, gl);
182
if (endp == NULL) { /* a miss */
188
if (nmatch == 0 && !g->backrefs)
189
break; /* no further info needed */
192
assert(m->coldp != NULL);
194
NOTE("finding start");
195
endp = slow(m, m->coldp, stop, gf, gl);
198
assert(m->coldp < m->endp);
201
if (nmatch == 1 && !g->backrefs)
202
break; /* no further info needed */
204
/* oh my, he wants the subexpressions... */
205
if (m->pmatch == NULL)
206
m->pmatch = (llvm_regmatch_t *)malloc((m->g->nsub + 1) *
207
sizeof(llvm_regmatch_t));
208
if (m->pmatch == NULL) {
212
for (i = 1; i <= m->g->nsub; i++)
213
m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1;
214
if (!g->backrefs && !(m->eflags®_BACKR)) {
216
dp = dissect(m, m->coldp, endp, gf, gl);
218
if (g->nplus > 0 && m->lastpos == NULL)
219
m->lastpos = (char **)malloc((g->nplus+1) *
221
if (g->nplus > 0 && m->lastpos == NULL) {
226
NOTE("backref dissect");
227
dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0);
232
/* uh-oh... we couldn't find a subexpression-level match */
233
assert(g->backrefs); /* must be back references doing it */
234
assert(g->nplus == 0 || m->lastpos != NULL);
236
if (dp != NULL || endp <= m->coldp)
239
endp = slow(m, m->coldp, endp-1, gf, gl);
242
/* try it on a shorter possibility */
244
for (i = 1; i <= m->g->nsub; i++) {
245
assert(m->pmatch[i].rm_so == -1);
246
assert(m->pmatch[i].rm_eo == -1);
249
NOTE("backoff dissect");
250
dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0);
252
assert(dp == NULL || dp == endp);
253
if (dp != NULL) /* found a shorter one */
256
/* despite initial appearances, there is no match here */
258
if (m->coldp == stop)
260
start = m->coldp + 1; /* recycle starting later */
263
/* fill in the details if requested */
265
pmatch[0].rm_so = m->coldp - m->offp;
266
pmatch[0].rm_eo = endp - m->offp;
269
assert(m->pmatch != NULL);
270
for (i = 1; i < nmatch; i++)
272
pmatch[i] = m->pmatch[i];
274
pmatch[i].rm_so = -1;
275
pmatch[i].rm_eo = -1;
279
if (m->pmatch != NULL)
280
free((char *)m->pmatch);
281
if (m->lastpos != NULL)
282
free((char *)m->lastpos);
288
- dissect - figure out what matched what, no back references
290
static char * /* == stop (success) always */
291
dissect(struct match *m, char *start, char *stop, sopno startst, sopno stopst)
294
sopno ss; /* start sop of current subRE */
295
sopno es; /* end sop of current subRE */
296
char *sp; /* start of string matched by it */
297
char *stp; /* string matched by it cannot pass here */
298
char *rest; /* start of rest of string */
299
char *tail; /* string unmatched by rest of RE */
300
sopno ssub; /* start sop of subsubRE */
301
sopno esub; /* end sop of subsubRE */
302
char *ssp; /* start of string matched by subsubRE */
303
char *sep; /* end of string matched by subsubRE */
304
char *oldssp; /* previous ssp */
306
AT("diss", start, stop, startst, stopst);
308
for (ss = startst; ss < stopst; ss = es) {
309
/* identify end of subRE */
311
switch (OP(m->g->strip[es])) {
314
es += OPND(m->g->strip[es]);
317
while (OP(m->g->strip[es]) != O_CH)
318
es += OPND(m->g->strip[es]);
323
/* figure out what it matched */
324
switch (OP(m->g->strip[ss])) {
344
/* cases where length of match is hard to find */
348
/* how long could this one be? */
349
rest = slow(m, sp, stp, ss, es);
350
assert(rest != NULL); /* it did match */
351
/* could the rest match the rest? */
352
tail = slow(m, rest, stop, es, stopst);
355
/* no -- try a shorter match for this one */
357
assert(stp >= sp); /* it did work */
361
/* did innards match? */
362
if (slow(m, sp, rest, ssub, esub) != NULL) {
363
char *dp = dissect(m, sp, rest, ssub, esub);
364
(void)dp; /* avoid warning if assertions off */
373
/* how long could this one be? */
374
rest = slow(m, sp, stp, ss, es);
375
assert(rest != NULL); /* it did match */
376
/* could the rest match the rest? */
377
tail = slow(m, rest, stop, es, stopst);
380
/* no -- try a shorter match for this one */
382
assert(stp >= sp); /* it did work */
388
for (;;) { /* find last match of innards */
389
sep = slow(m, ssp, rest, ssub, esub);
390
if (sep == NULL || sep == ssp)
391
break; /* failed or matched null */
392
oldssp = ssp; /* on to next try */
396
/* last successful match */
400
assert(sep == rest); /* must exhaust substring */
401
assert(slow(m, ssp, sep, ssub, esub) == rest);
403
char *dp = dissect(m, ssp, sep, ssub, esub);
404
(void)dp; /* avoid warning if assertions off */
412
/* how long could this one be? */
413
rest = slow(m, sp, stp, ss, es);
414
assert(rest != NULL); /* it did match */
415
/* could the rest match the rest? */
416
tail = slow(m, rest, stop, es, stopst);
419
/* no -- try a shorter match for this one */
421
assert(stp >= sp); /* it did work */
424
esub = ss + OPND(m->g->strip[ss]) - 1;
425
assert(OP(m->g->strip[esub]) == OOR1);
426
for (;;) { /* find first matching branch */
427
if (slow(m, sp, rest, ssub, esub) == rest)
428
break; /* it matched all of it */
429
/* that one missed, try next one */
430
assert(OP(m->g->strip[esub]) == OOR1);
432
assert(OP(m->g->strip[esub]) == OOR2);
434
esub += OPND(m->g->strip[esub]);
435
if (OP(m->g->strip[esub]) == OOR2)
438
assert(OP(m->g->strip[esub]) == O_CH);
441
char *dp = dissect(m, sp, rest, ssub, esub);
442
(void)dp; /* avoid warning if assertions off */
455
i = OPND(m->g->strip[ss]);
456
assert(0 < i && i <= m->g->nsub);
457
m->pmatch[i].rm_so = sp - m->offp;
460
i = OPND(m->g->strip[ss]);
461
assert(0 < i && i <= m->g->nsub);
462
m->pmatch[i].rm_eo = sp - m->offp;
475
- backref - figure out what matched what, figuring in back references
477
static char * /* == stop (success) or NULL (failure) */
478
backref(struct match *m, char *start, char *stop, sopno startst, sopno stopst,
479
sopno lev, int rec) /* PLUS nesting level */
482
sopno ss; /* start sop of current subRE */
483
char *sp; /* start of string matched by it */
484
sopno ssub; /* start sop of subsubRE */
485
sopno esub; /* end sop of subsubRE */
486
char *ssp; /* start of string matched by subsubRE */
491
llvm_regoff_t offsave;
494
AT("back", start, stop, startst, stopst);
497
/* get as far as we can with easy stuff */
499
for (ss = startst; !hard && ss < stopst; ss++)
500
switch (OP(s = m->g->strip[ss])) {
502
if (sp == stop || *sp++ != (char)OPND(s))
511
cs = &m->g->sets[OPND(s)];
512
if (sp == stop || !CHIN(cs, *sp++))
516
if ( (sp == m->beginp && !(m->eflags®_NOTBOL)) ||
517
(sp < m->endp && *(sp-1) == '\n' &&
518
(m->g->cflags®_NEWLINE)) )
524
if ( (sp == m->endp && !(m->eflags®_NOTEOL)) ||
525
(sp < m->endp && *sp == '\n' &&
526
(m->g->cflags®_NEWLINE)) )
532
if (( (sp == m->beginp && !(m->eflags®_NOTBOL)) ||
533
(sp < m->endp && *(sp-1) == '\n' &&
534
(m->g->cflags®_NEWLINE)) ||
536
!ISWORD(*(sp-1))) ) &&
537
(sp < m->endp && ISWORD(*sp)) )
543
if (( (sp == m->endp && !(m->eflags®_NOTEOL)) ||
544
(sp < m->endp && *sp == '\n' &&
545
(m->g->cflags®_NEWLINE)) ||
546
(sp < m->endp && !ISWORD(*sp)) ) &&
547
(sp > m->beginp && ISWORD(*(sp-1))) )
554
case OOR1: /* matches null but needs to skip */
558
assert(OP(s) == OOR2);
560
} while (OP(s = m->g->strip[ss]) != O_CH);
561
/* note that the ss++ gets us past the O_CH */
563
default: /* have to make a choice */
567
if (!hard) { /* that was it! */
572
ss--; /* adjust for the for's final increment */
575
AT("hard", sp, stop, ss, stopst);
578
case OBACK_: /* the vilest depths */
580
assert(0 < i && i <= m->g->nsub);
581
if (m->pmatch[i].rm_eo == -1)
583
assert(m->pmatch[i].rm_so != -1);
584
len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so;
585
if (len == 0 && rec++ > MAX_RECURSION)
587
assert(stop - m->beginp >= len);
589
return(NULL); /* not enough left to match */
590
ssp = m->offp + m->pmatch[i].rm_so;
591
if (memcmp(sp, ssp, len) != 0)
593
while (m->g->strip[ss] != SOP(O_BACK, i))
595
return(backref(m, sp+len, stop, ss+1, stopst, lev, rec));
597
case OQUEST_: /* to null or not */
598
dp = backref(m, sp, stop, ss+1, stopst, lev, rec);
600
return(dp); /* not */
601
return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev, rec));
604
assert(m->lastpos != NULL);
605
assert(lev+1 <= m->g->nplus);
606
m->lastpos[lev+1] = sp;
607
return(backref(m, sp, stop, ss+1, stopst, lev+1, rec));
610
if (sp == m->lastpos[lev]) /* last pass matched null */
611
return(backref(m, sp, stop, ss+1, stopst, lev-1, rec));
612
/* try another pass */
613
m->lastpos[lev] = sp;
614
dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev, rec);
616
return(backref(m, sp, stop, ss+1, stopst, lev-1, rec));
620
case OCH_: /* find the right one, if any */
622
esub = ss + OPND(s) - 1;
623
assert(OP(m->g->strip[esub]) == OOR1);
624
for (;;) { /* find first matching branch */
625
dp = backref(m, sp, stop, ssub, esub, lev, rec);
628
/* that one missed, try next one */
629
if (OP(m->g->strip[esub]) == O_CH)
630
return(NULL); /* there is none */
632
assert(OP(m->g->strip[esub]) == OOR2);
634
esub += OPND(m->g->strip[esub]);
635
if (OP(m->g->strip[esub]) == OOR2)
638
assert(OP(m->g->strip[esub]) == O_CH);
641
case OLPAREN: /* must undo assignment if rest fails */
643
assert(0 < i && i <= m->g->nsub);
644
offsave = m->pmatch[i].rm_so;
645
m->pmatch[i].rm_so = sp - m->offp;
646
dp = backref(m, sp, stop, ss+1, stopst, lev, rec);
649
m->pmatch[i].rm_so = offsave;
652
case ORPAREN: /* must undo assignment if rest fails */
654
assert(0 < i && i <= m->g->nsub);
655
offsave = m->pmatch[i].rm_eo;
656
m->pmatch[i].rm_eo = sp - m->offp;
657
dp = backref(m, sp, stop, ss+1, stopst, lev, rec);
660
m->pmatch[i].rm_eo = offsave;
675
- fast - step through the string at top speed
677
static char * /* where tentative match ended, or NULL */
678
fast(struct match *m, char *start, char *stop, sopno startst, sopno stopst)
681
states fresh = m->fresh;
684
int c = (start == m->beginp) ? OUT : *(start-1);
685
int lastc; /* previous c */
688
char *coldp; /* last p after which no match was underway */
692
st = step(m->g, startst, stopst, st, NOTHING, st);
699
c = (p == m->endp) ? OUT : *p;
703
/* is there an EOL and/or BOL between lastc and c? */
706
if ( (lastc == '\n' && m->g->cflags®_NEWLINE) ||
707
(lastc == OUT && !(m->eflags®_NOTBOL)) ) {
711
if ( (c == '\n' && m->g->cflags®_NEWLINE) ||
712
(c == OUT && !(m->eflags®_NOTEOL)) ) {
713
flagch = (flagch == BOL) ? BOLEOL : EOL;
718
st = step(m->g, startst, stopst, st, flagch, st);
722
/* how about a word boundary? */
723
if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
724
(c != OUT && ISWORD(c)) ) {
727
if ( (lastc != OUT && ISWORD(lastc)) &&
728
(flagch == EOL || (c != OUT && !ISWORD(c))) ) {
731
if (flagch == BOW || flagch == EOW) {
732
st = step(m->g, startst, stopst, st, flagch, st);
737
if (ISSET(st, stopst) || p == stop)
738
break; /* NOTE BREAK OUT */
740
/* no, we must deal with this character */
744
st = step(m->g, startst, stopst, tmp, c, st);
746
assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
750
assert(coldp != NULL);
752
if (ISSET(st, stopst))
759
- slow - step through the string more deliberately
761
static char * /* where it ended */
762
slow(struct match *m, char *start, char *stop, sopno startst, sopno stopst)
765
states empty = m->empty;
768
int c = (start == m->beginp) ? OUT : *(start-1);
769
int lastc; /* previous c */
772
char *matchp; /* last p at which a match ended */
774
AT("slow", start, stop, startst, stopst);
777
SP("sstart", st, *p);
778
st = step(m->g, startst, stopst, st, NOTHING, st);
783
c = (p == m->endp) ? OUT : *p;
785
/* is there an EOL and/or BOL between lastc and c? */
788
if ( (lastc == '\n' && m->g->cflags®_NEWLINE) ||
789
(lastc == OUT && !(m->eflags®_NOTBOL)) ) {
793
if ( (c == '\n' && m->g->cflags®_NEWLINE) ||
794
(c == OUT && !(m->eflags®_NOTEOL)) ) {
795
flagch = (flagch == BOL) ? BOLEOL : EOL;
800
st = step(m->g, startst, stopst, st, flagch, st);
801
SP("sboleol", st, c);
804
/* how about a word boundary? */
805
if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
806
(c != OUT && ISWORD(c)) ) {
809
if ( (lastc != OUT && ISWORD(lastc)) &&
810
(flagch == EOL || (c != OUT && !ISWORD(c))) ) {
813
if (flagch == BOW || flagch == EOW) {
814
st = step(m->g, startst, stopst, st, flagch, st);
815
SP("sboweow", st, c);
819
if (ISSET(st, stopst))
821
if (EQ(st, empty) || p == stop)
822
break; /* NOTE BREAK OUT */
824
/* no, we must deal with this character */
828
st = step(m->g, startst, stopst, tmp, c, st);
830
assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
839
- step - map set of states reachable before char to set reachable after
842
step(struct re_guts *g,
843
sopno start, /* start state within strip */
844
sopno stop, /* state after stop state within strip */
845
states bef, /* states reachable before */
846
int ch, /* character or NONCHAR code */
847
states aft) /* states already known reachable after */
852
onestate here; /* note, macros know this name */
856
for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) {
860
assert(pc == stop-1);
863
/* only characters can match */
864
assert(!NONCHAR(ch) || ch != (char)OPND(s));
865
if (ch == (char)OPND(s))
869
if (ch == BOL || ch == BOLEOL)
873
if (ch == EOL || ch == BOLEOL)
889
cs = &g->sets[OPND(s)];
890
if (!NONCHAR(ch) && CHIN(cs, ch))
893
case OBACK_: /* ignored here */
897
case OPLUS_: /* forward, this is just an empty */
900
case O_PLUS: /* both forward and back */
902
i = ISSETBACK(aft, OPND(s));
903
BACK(aft, aft, OPND(s));
904
if (!i && ISSETBACK(aft, OPND(s))) {
905
/* oho, must reconsider loop body */
910
case OQUEST_: /* two branches, both forward */
912
FWD(aft, aft, OPND(s));
914
case O_QUEST: /* just an empty */
917
case OLPAREN: /* not significant here */
921
case OCH_: /* mark the first two branches */
923
assert(OP(g->strip[pc+OPND(s)]) == OOR2);
924
FWD(aft, aft, OPND(s));
926
case OOR1: /* done a branch, find the O_CH */
927
if (ISSTATEIN(aft, here)) {
929
OP(s = g->strip[pc+look]) != O_CH;
931
assert(OP(s) == OOR2);
935
case OOR2: /* propagate OCH_'s marking */
937
if (OP(g->strip[pc+OPND(s)]) != O_CH) {
938
assert(OP(g->strip[pc+OPND(s)]) == OOR2);
939
FWD(aft, aft, OPND(s));
942
case O_CH: /* just empty */
945
default: /* ooooops... */
956
- print - print a set of states
959
print(struct match *m, char *caption, states st, int ch, FILE *d)
961
struct re_guts *g = m->g;
965
if (!(m->eflags®_TRACE))
968
(void)fprintf(d, "%s", caption);
970
(void)fprintf(d, " %s", pchar(ch));
971
for (i = 0; i < g->nstates; i++)
973
(void)fprintf(d, "%s%d", (first) ? "\t" : ", ", i);
976
(void)fprintf(d, "\n");
980
- at - print current situation
983
at(struct match *m, char *title, char *start, char *stop, sopno startst,
986
if (!(m->eflags®_TRACE))
989
(void)printf("%s %s-", title, pchar(*start));
990
(void)printf("%s ", pchar(*stop));
991
(void)printf("%ld-%ld\n", (long)startst, (long)stopst);
995
#define PCHARDONE /* never again */
997
- pchar - make a character printable
999
* Is this identical to regchar() over in debug.c? Well, yes. But a
1000
* duplicate here avoids having a debugging-capable regexec.o tied to
1001
* a matching debug.o, and this is convenient. It all disappears in
1002
* the non-debug compilation anyway, so it doesn't matter much.
1004
static char * /* -> representation */
1007
static char pbuf[10];
1009
if (isprint(ch) || ch == ' ')
1010
(void)snprintf(pbuf, sizeof pbuf, "%c", ch);
1012
(void)snprintf(pbuf, sizeof pbuf, "\\%o", ch);