1
/*=========================================================================
3
Program: Insight Segmentation & Registration Toolkit
4
Module: $RCSfile: cmRegularExpression.cxx,v $
6
Date: $Date: 2001/04/27 12:01:17 $
7
Version: $Revision: 1.4 $
9
Copyright (c) 2001 Insight Consortium
12
Redistribution and use in source and binary forms, with or without
13
modification, are permitted provided that the following conditions are met:
15
* Redistributions of source code must retain the above copyright notice,
16
this list of conditions and the following disclaimer.
18
* Redistributions in binary form must reproduce the above copyright notice,
19
this list of conditions and the following disclaimer in the documentation
20
and/or other materials provided with the distribution.
22
* The name of the Insight Consortium, nor the names of any consortium members,
23
nor of any contributors, may be used to endorse or promote products derived
24
from this software without specific prior written permission.
26
* Modified source versions must be plainly marked as such, and must not be
27
misrepresented as being the original software.
29
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS ``AS IS''
30
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
31
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
32
ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE FOR
33
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
34
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
35
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
36
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
37
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
38
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
40
=========================================================================*/
42
// Copyright (C) 1991 Texas Instruments Incorporated.
44
// Permission is granted to any individual or institution to use, copy, modify,
45
// and distribute this software, provided that this complete copyright and
46
// permission notice is maintained, intact, in all copies and supporting
49
// Texas Instruments Incorporated provides this software "as is" without
50
// express or implied warranty.
53
// Created: MNF 06/13/89 Initial Design and Implementation
54
// Updated: LGO 08/09/89 Inherit from Generic
55
// Updated: MBN 09/07/89 Added conditional exception handling
56
// Updated: MBN 12/15/89 Sprinkled "const" qualifiers all over the place!
57
// Updated: DLS 03/22/91 New lite version
60
#include "cmRegularExpression.h" // Include class specification
61
#include "cmStandardIncludes.h"
64
// cmRegularExpression -- Copies the given regular expression.
65
cmRegularExpression::cmRegularExpression (const cmRegularExpression& rxp) {
67
this->progsize = rxp.progsize; // Copy regular expression size
68
this->program = new char[this->progsize]; // Allocate storage
69
for(ind=this->progsize; ind-- != 0;) // Copy regular expresion
70
this->program[ind] = rxp.program[ind];
71
this->startp[0] = rxp.startp[0]; // Copy pointers into last
72
this->endp[0] = rxp.endp[0]; // Successful "find" operation
73
this->regmust = rxp.regmust; // Copy field
74
if (rxp.regmust != NULL) {
75
char* dum = rxp.program;
77
while (dum != rxp.regmust) {
81
this->regmust = this->program + ind;
83
this->regstart = rxp.regstart; // Copy starting index
84
this->reganch = rxp.reganch; // Copy remaining private data
85
this->regmlen = rxp.regmlen; // Copy remaining private data
88
// operator== -- Returns true if two regular expressions have the same
89
// compiled program for pattern matching.
90
bool cmRegularExpression::operator== (const cmRegularExpression& rxp) const {
91
if (this != &rxp) { // Same address?
92
int ind = this->progsize; // Get regular expression size
93
if (ind != rxp.progsize) // If different size regexp
94
return false; // Return failure
95
while(ind-- != 0) // Else while still characters
96
if(this->program[ind] != rxp.program[ind]) // If regexp are different
97
return false; // Return failure
99
return true; // Else same, return success
103
// deep_equal -- Returns true if have the same compiled regular expressions
104
// and the same start and end pointers.
106
bool cmRegularExpression::deep_equal (const cmRegularExpression& rxp) const {
107
int ind = this->progsize; // Get regular expression size
108
if (ind != rxp.progsize) // If different size regexp
109
return false; // Return failure
110
while(ind-- != 0) // Else while still characters
111
if(this->program[ind] != rxp.program[ind]) // If regexp are different
112
return false; // Return failure
113
return (this->startp[0] == rxp.startp[0] && // Else if same start/end ptrs,
114
this->endp[0] == rxp.endp[0]); // Return true
117
// The remaining code in this file is derived from the regular expression code
118
// whose copyright statement appears below. It has been changed to work
119
// with the class concepts of C++ and COOL.
124
* Copyright (c) 1986 by University of Toronto.
125
* Written by Henry Spencer. Not derived from licensed software.
127
* Permission is granted to anyone to use this software for any
128
* purpose on any computer system, and to redistribute it freely,
129
* subject to the following restrictions:
131
* 1. The author is not responsible for the consequences of use of
132
* this software, no matter how awful, even if they arise
133
* from defects in it.
135
* 2. The origin of this software must not be misrepresented, either
136
* by explicit claim or by omission.
138
* 3. Altered versions must be plainly marked as such, and must not
139
* be misrepresented as being the original software.
141
* Beware that some of this code is subtly aware of the way operator
142
* precedence is structured in regular expressions. Serious changes in
143
* regular-expression syntax might require a total rethink.
147
* The "internal use only" fields in regexp.h are present to pass info from
148
* compile to execute that permits the execute phase to run lots faster on
149
* simple cases. They are:
151
* regstart char that must begin a match; '\0' if none obvious
152
* reganch is the match anchored (at beginning-of-line only)?
153
* regmust string (pointer into program) that match must include, or NULL
154
* regmlen length of regmust string
156
* Regstart and reganch permit very fast decisions on suitable starting points
157
* for a match, cutting down the work a lot. Regmust permits fast rejection
158
* of lines that cannot possibly match. The regmust tests are costly enough
159
* that compile() supplies a regmust only if the r.e. contains something
160
* potentially expensive (at present, the only such thing detected is * or +
161
* at the start of the r.e., which can involve a lot of backup). Regmlen is
162
* supplied because the test in find() needs it and compile() is computing
167
* Structure for regexp "program". This is essentially a linear encoding
168
* of a nondeterministic finite-state machine (aka syntax charts or
169
* "railroad normal form" in parsing technology). Each node is an opcode
170
* plus a "next" pointer, possibly plus an operand. "Next" pointers of
171
* all nodes except BRANCH implement concatenation; a "next" pointer with
172
* a BRANCH on both ends of it is connecting two alternatives. (Here we
173
* have one of the subtle syntax dependencies: an individual BRANCH (as
174
* opposed to a collection of them) is never concatenated with anything
175
* because of operator precedence.) The operand of some types of node is
176
* a literal string; for others, it is a node leading into a sub-FSM. In
177
* particular, the operand of a BRANCH node is the first node of the branch.
178
* (NB this is *not* a tree structure: the tail of the branch connects
179
* to the thing following the set of BRANCHes.) The opcodes are:
182
// definition number opnd? meaning
183
#define END 0 // no End of program.
184
#define BOL 1 // no Match "" at beginning of line.
185
#define EOL 2 // no Match "" at end of line.
186
#define ANY 3 // no Match any one character.
187
#define ANYOF 4 // str Match any character in this string.
188
#define ANYBUT 5 // str Match any character not in this
190
#define BRANCH 6 // node Match this alternative, or the
192
#define BACK 7 // no Match "", "next" ptr points backward.
193
#define EXACTLY 8 // str Match this string.
194
#define NOTHING 9 // no Match empty string.
195
#define STAR 10 // node Match this (simple) thing 0 or more
197
#define PLUS 11 // node Match this (simple) thing 1 or more
199
#define OPEN 20 // no Mark this point in input as start of
201
// OPEN+1 is number 1, etc.
202
#define CLOSE 30 // no Analogous to OPEN.
207
* BRANCH The set of branches constituting a single choice are hooked
208
* together with their "next" pointers, since precedence prevents
209
* anything being concatenated to any individual branch. The
210
* "next" pointer of the last BRANCH in a choice points to the
211
* thing following the whole choice. This is also where the
212
* final "next" pointer of each individual branch points; each
213
* branch starts with the operand node of a BRANCH node.
215
* BACK Normal "next" pointers all implicitly point forward; BACK
216
* exists to make loop structures possible.
218
* STAR,PLUS '?', and complex '*' and '+', are implemented as circular
219
* BRANCH structures using BACK. Simple cases (one character
220
* per match) are implemented with STAR and PLUS for speed
221
* and to minimize recursive plunges.
223
* OPEN,CLOSE ...are numbered at compile time.
227
* A node is one char of opcode followed by two chars of "next" pointer.
228
* "Next" pointers are stored as two 8-bit pieces, high order first. The
229
* value is a positive offset from the opcode of the node containing it.
230
* An operand, if any, simply follows the node. (Note that much of the
231
* code generation knows about this implicit relationship.)
233
* Using two bytes for the "next" pointer is vast overkill for most things,
234
* but allows patterns to get big without disasters.
238
#define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
239
#define OPERAND(p) ((p) + 3)
241
const unsigned char MAGIC = 0234;
243
* Utility definitions.
246
#define UCHARAT(p) ((const unsigned char*)(p))[0]
249
#define FAIL(m) { regerror(m); return(NULL); }
250
#define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?')
251
#define META "^$.[()|?+*\\"
255
* Flags to be passed up and down.
257
#define HASWIDTH 01 // Known never to match null string.
258
#define SIMPLE 02 // Simple enough to be STAR/PLUS operand.
259
#define SPSTART 04 // Starts with * or +.
260
#define WORST 0 // Worst case.
264
/////////////////////////////////////////////////////////////////////////
266
// COMPILE AND ASSOCIATED FUNCTIONS
268
/////////////////////////////////////////////////////////////////////////
272
* Global work variables for compile().
274
static const char* regparse; // Input-scan pointer.
275
static int regnpar; // () count.
276
static char regdummy;
277
static char* regcode; // Code-emit pointer; ®dummy = don't.
278
static long regsize; // Code size.
281
* Forward declarations for compile()'s friends.
284
// #define static static
286
static char* reg (int, int*);
287
static char* regbranch (int*);
288
static char* regpiece (int*);
289
static char* regatom (int*);
290
static char* regnode (char);
291
static const char* regnext (register const char*);
292
static char* regnext (register char*);
293
static void regc (unsigned char);
294
static void reginsert (char, char*);
295
static void regtail (char*, const char*);
296
static void regoptail (char*, const char*);
299
static int strcspn ();
305
* We can't allocate space until we know how big the compiled form will be,
306
* but we can't compile it (and thus know how big it is) until we've got a
307
* place to put the code. So we cheat: we compile it twice, once with code
308
* generation turned off and size counting turned on, and once "for real".
309
* This also means that we don't allocate space until we are sure that the
310
* thing really will compile successfully, and we never have to move the
311
* code and thus invalidate pointers into it. (Note that it has to be in
312
* one piece because free() must be able to free it all.)
314
* Beware that the optimization-preparation code in here knows about some
315
* of the structure of the compiled regexp.
319
// compile -- compile a regular expression into internal code
320
// for later pattern matching.
322
void cmRegularExpression::compile (const char* exp) {
323
register const char* scan;
324
register const char* longest;
325
register unsigned long len;
329
//RAISE Error, SYM(cmRegularExpression), SYM(No_Expr),
330
printf ("cmRegularExpression::compile(): No expression supplied.\n");
334
// First pass: determine size, legality.
342
printf ("cmRegularExpression::compile(): Error in compile.\n");
345
this->startp[0] = this->endp[0] = this->searchstring = NULL;
347
// Small enough for pointer-storage convention?
348
if (regsize >= 32767L) { // Probably could be 65535L.
349
//RAISE Error, SYM(cmRegularExpression), SYM(Expr_Too_Big),
350
printf ("cmRegularExpression::compile(): Expression too big.\n");
356
if (this->program != NULL) delete [] this->program;
358
this->program = new char[regsize];
359
this->progsize = (int) regsize;
361
if (this->program == NULL) {
362
//RAISE Error, SYM(cmRegularExpression), SYM(Out_Of_Memory),
363
printf ("cmRegularExpression::compile(): Out of memory.\n");
367
// Second pass: emit code.
370
regcode = this->program;
374
// Dig out information for optimizations.
375
this->regstart = '\0'; // Worst-case defaults.
377
this->regmust = NULL;
379
scan = this->program + 1; // First BRANCH.
380
if (OP(regnext(scan)) == END) { // Only one top-level choice.
381
scan = OPERAND(scan);
383
// Starting-point info.
384
if (OP(scan) == EXACTLY)
385
this->regstart = *OPERAND(scan);
386
else if (OP(scan) == BOL)
390
// If there's something expensive in the r.e., find the longest
391
// literal string that must appear and make it the regmust. Resolve
392
// ties in favor of later strings, since the regstart check works
393
// with the beginning of the r.e. and avoiding duplication
394
// strengthens checking. Not a strong reason, but sufficient in the
395
// absence of others.
397
if (flags & SPSTART) {
400
for (; scan != NULL; scan = regnext(scan))
401
if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= len) {
402
longest = OPERAND(scan);
403
len = strlen(OPERAND(scan));
405
this->regmust = longest;
413
- reg - regular expression, i.e. main body or parenthesized thing
415
* Caller must absorb opening parenthesis.
417
* Combining parenthesis handling with the base level of regular expression
418
* is a trifle forced, but the need to tie the tails of the branches to what
419
* follows makes it hard to avoid.
421
static char* reg (int paren, int *flagp) {
424
register char* ender;
425
register int parno =0;
428
*flagp = HASWIDTH; // Tentatively.
430
// Make an OPEN node, if parenthesized.
432
if (regnpar >= NSUBEXP) {
433
//RAISE Error, SYM(cmRegularExpression), SYM(Too_Many_Parens),
434
printf ("cmRegularExpression::compile(): Too many parentheses.\n");
439
ret = regnode(OPEN + parno);
444
// Pick up the branches, linking them together.
445
br = regbranch(&flags);
449
regtail(ret, br); // OPEN -> first.
452
if (!(flags & HASWIDTH))
454
*flagp |= flags & SPSTART;
455
while (*regparse == '|') {
457
br = regbranch(&flags);
460
regtail(ret, br); // BRANCH -> BRANCH.
461
if (!(flags & HASWIDTH))
463
*flagp |= flags & SPSTART;
466
// Make a closing node, and hook it on the end.
467
ender = regnode((paren) ? CLOSE + parno : END);
470
// Hook the tails of the branches to the closing node.
471
for (br = ret; br != NULL; br = regnext(br))
472
regoptail(br, ender);
474
// Check for proper termination.
475
if (paren && *regparse++ != ')') {
476
//RAISE Error, SYM(cmRegularExpression), SYM(Unmatched_Parens),
477
printf ("cmRegularExpression::compile(): Unmatched parentheses.\n");
480
else if (!paren && *regparse != '\0') {
481
if (*regparse == ')') {
482
//RAISE Error, SYM(cmRegularExpression), SYM(Unmatched_Parens),
483
printf ("cmRegularExpression::compile(): Unmatched parentheses.\n");
487
//RAISE Error, SYM(cmRegularExpression), SYM(Internal_Error),
488
printf ("cmRegularExpression::compile(): Internal error.\n");
498
- regbranch - one alternative of an | operator
500
* Implements the concatenation operator.
502
static char* regbranch (int *flagp) {
504
register char* chain;
505
register char* latest;
508
*flagp = WORST; // Tentatively.
510
ret = regnode(BRANCH);
512
while (*regparse != '\0' && *regparse != '|' && *regparse != ')') {
513
latest = regpiece(&flags);
516
*flagp |= flags & HASWIDTH;
517
if (chain == NULL) // First piece.
518
*flagp |= flags & SPSTART;
520
regtail(chain, latest);
523
if (chain == NULL) // Loop ran zero times.
531
- regpiece - something followed by possible [*+?]
533
* Note that the branching code sequences used for ? and the general cases
534
* of * and + are somewhat optimized: they use the same NOTHING node as
535
* both the endmarker for their branch list and the body of the last branch.
536
* It might seem that this node could be dispensed with entirely, but the
537
* endmarker role is not redundant.
539
static char* regpiece (int *flagp) {
545
ret = regatom(&flags);
555
if (!(flags & HASWIDTH) && op != '?') {
556
//RAISE Error, SYM(cmRegularExpression), SYM(Empty_Operand),
557
printf ("cmRegularExpression::compile() : *+ operand could be empty.\n");
560
*flagp = (op != '+') ? (WORST | SPSTART) : (WORST | HASWIDTH);
562
if (op == '*' && (flags & SIMPLE))
563
reginsert(STAR, ret);
564
else if (op == '*') {
565
// Emit x* as (x&|), where & means "self".
566
reginsert(BRANCH, ret); // Either x
567
regoptail(ret, regnode(BACK)); // and loop
568
regoptail(ret, ret); // back
569
regtail(ret, regnode(BRANCH)); // or
570
regtail(ret, regnode(NOTHING)); // null.
572
else if (op == '+' && (flags & SIMPLE))
573
reginsert(PLUS, ret);
574
else if (op == '+') {
575
// Emit x+ as x(&|), where & means "self".
576
next = regnode(BRANCH); // Either
578
regtail(regnode(BACK), ret); // loop back
579
regtail(next, regnode(BRANCH)); // or
580
regtail(ret, regnode(NOTHING)); // null.
582
else if (op == '?') {
584
reginsert(BRANCH, ret); // Either x
585
regtail(ret, regnode(BRANCH)); // or
586
next = regnode(NOTHING);// null.
588
regoptail(ret, next);
591
if (ISMULT(*regparse)) {
592
//RAISE Error, SYM(cmRegularExpression), SYM(Nested_Operand),
593
printf ("cmRegularExpression::compile(): Nested *?+.\n");
601
- regatom - the lowest level
603
* Optimization: gobbles an entire sequence of ordinary characters so that
604
* it can turn them into a single node, which is smaller to store and
605
* faster to run. Backslashed characters are exceptions, each becoming a
606
* separate node; the code is simpler that way and it's not worth fixing.
608
static char* regatom (int *flagp) {
612
*flagp = WORST; // Tentatively.
614
switch (*regparse++) {
623
*flagp |= HASWIDTH | SIMPLE;
626
register int rxpclass;
627
register int rxpclassend;
629
if (*regparse == '^') { // Complement of range.
630
ret = regnode(ANYBUT);
634
ret = regnode(ANYOF);
635
if (*regparse == ']' || *regparse == '-')
637
while (*regparse != '\0' && *regparse != ']') {
638
if (*regparse == '-') {
640
if (*regparse == ']' || *regparse == '\0')
643
rxpclass = UCHARAT(regparse - 2) + 1;
644
rxpclassend = UCHARAT(regparse);
645
if (rxpclass > rxpclassend + 1) {
646
//RAISE Error, SYM(cmRegularExpression), SYM(Invalid_Range),
647
printf ("cmRegularExpression::compile(): Invalid range in [].\n");
650
for (; rxpclass <= rxpclassend; rxpclass++)
659
if (*regparse != ']') {
660
//RAISE Error, SYM(cmRegularExpression), SYM(Unmatched_Bracket),
661
printf ("cmRegularExpression::compile(): Unmatched [].\n");
665
*flagp |= HASWIDTH | SIMPLE;
669
ret = reg(1, &flags);
672
*flagp |= flags & (HASWIDTH | SPSTART);
677
//RAISE Error, SYM(cmRegularExpression), SYM(Internal_Error),
678
printf ("cmRegularExpression::compile(): Internal error.\n"); // Never here
683
//RAISE Error, SYM(cmRegularExpression), SYM(No_Operand),
684
printf ("cmRegularExpression::compile(): ?+* follows nothing.\n");
687
if (*regparse == '\0') {
688
//RAISE Error, SYM(cmRegularExpression), SYM(Trailing_Backslash),
689
printf ("cmRegularExpression::compile(): Trailing backslash.\n");
692
ret = regnode(EXACTLY);
695
*flagp |= HASWIDTH | SIMPLE;
702
len = strcspn(regparse, META);
704
//RAISE Error, SYM(cmRegularExpression), SYM(Internal_Error),
705
printf ("cmRegularExpression::compile(): Internal error.\n");
708
ender = *(regparse + len);
709
if (len > 1 && ISMULT(ender))
710
len--; // Back off clear of ?+* operand.
714
ret = regnode(EXACTLY);
728
- regnode - emit a node
731
static char* regnode (char op) {
736
if (ret == ®dummy) {
743
*ptr++ = '\0'; // Null "next" pointer.
752
- regc - emit (if appropriate) a byte of code
754
static void regc (unsigned char b) {
755
if (regcode != ®dummy)
763
- reginsert - insert an operator in front of already-emitted operand
765
* Means relocating the operand.
767
static void reginsert (char op, char* opnd) {
770
register char* place;
772
if (regcode == ®dummy) {
783
place = opnd; // Op node, where operand used to be.
791
- regtail - set the next-pointer at the end of a node chain
793
static void regtail (char* p, const char* val) {
804
temp = regnext(scan);
810
if (OP(scan) == BACK)
811
offset = (const char*)scan - val;
814
*(scan + 1) = (offset >> 8) & 0377;
815
*(scan + 2) = offset & 0377;
820
- regoptail - regtail on operand of first argument; nop if operandless
822
static void regoptail (char* p, const char* val) {
823
// "Operandless" and "op != BRANCH" are synonymous in practice.
824
if (p == NULL || p == ®dummy || OP(p) != BRANCH)
826
regtail(OPERAND(p), val);
831
////////////////////////////////////////////////////////////////////////
835
////////////////////////////////////////////////////////////////////////
839
* Global work variables for find().
841
static const char* reginput; // String-input pointer.
842
static const char* regbol; // Beginning of input, for ^ check.
843
static const char* *regstartp; // Pointer to startp array.
844
static const char* *regendp; // Ditto for endp.
849
static int regtry (const char*, const char* *,
850
const char* *, const char*);
851
static int regmatch (const char*);
852
static int regrepeat (const char*);
857
static char* regprop ();
860
bool cmRegularExpression::find (std::string const& s)
862
return find(s.c_str());
867
// find -- Matches the regular expression to the given string.
868
// Returns true if found, and sets start and end indexes accordingly.
870
bool cmRegularExpression::find (const char* string) {
871
register const char* s;
873
this->searchstring = string;
875
// Check validity of program.
876
if (!this->program || UCHARAT(this->program) != MAGIC) {
877
//RAISE Error, SYM(cmRegularExpression), SYM(Internal_Error),
878
printf ("cmRegularExpression::find(): Compiled regular expression corrupted.\n");
882
// If there is a "must appear" string, look for it.
883
if (this->regmust != NULL) {
885
while ((s = strchr(s, this->regmust[0])) != NULL) {
886
if (strncmp(s, this->regmust, this->regmlen) == 0)
890
if (s == NULL) // Not present.
894
// Mark beginning of line for ^ .
897
// Simplest case: anchored match need be tried only once.
899
return (regtry(string, this->startp, this->endp, this->program) != 0);
901
// Messy cases: unanchored match.
903
if (this->regstart != '\0')
904
// We know what char it must start with.
905
while ((s = strchr(s, this->regstart)) != NULL) {
906
if (regtry(s, this->startp, this->endp, this->program))
912
// We don't -- general case.
914
if (regtry(s, this->startp, this->endp, this->program))
916
} while (*s++ != '\0');
924
- regtry - try match at specific point
927
static int regtry (const char* string, const char* *start,
928
const char* *end, const char* prog) {
930
register const char* *sp1;
931
register const char* *ep;
939
for (i = NSUBEXP; i > 0; i--) {
943
if (regmatch(prog + 1)) {
954
- regmatch - main matching routine
956
* Conceptually the strategy is simple: check to see whether the current
957
* node matches, call self recursively to see whether the rest matches,
958
* and then act accordingly. In practice we make some effort to avoid
959
* recursion, in particular by going through "ordinary" nodes (that don't
960
* need to know whether the rest of the match failed) by a loop instead of
962
* 0 failure, 1 success
964
static int regmatch (const char* prog) {
965
register const char* scan; // Current node.
966
const char* next; // Next node.
970
while (scan != NULL) {
972
next = regnext(scan);
976
if (reginput != regbol)
980
if (*reginput != '\0')
984
if (*reginput == '\0')
990
register const char* opnd;
992
opnd = OPERAND(scan);
993
// Inline the first character, for speed.
994
if (*opnd != *reginput)
997
if (len > 1 && strncmp(opnd, reginput, len) != 0)
1003
if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == NULL)
1008
if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != NULL)
1026
register const char* save;
1028
no = OP(scan) - OPEN;
1031
if (regmatch(next)) {
1034
// Don't set startp if some later invocation of the
1035
// same parentheses already has.
1037
if (regstartp[no] == NULL)
1038
regstartp[no] = save;
1055
register const char* save;
1057
no = OP(scan) - CLOSE;
1060
if (regmatch(next)) {
1063
// Don't set endp if some later invocation of the
1064
// same parentheses already has.
1066
if (regendp[no] == NULL)
1076
register const char* save;
1078
if (OP(next) != BRANCH) // No choice.
1079
next = OPERAND(scan); // Avoid recursion.
1083
if (regmatch(OPERAND(scan)))
1086
scan = regnext(scan);
1087
} while (scan != NULL && OP(scan) == BRANCH);
1095
register char nextch;
1097
register const char* save;
1098
register int min_no;
1101
// Lookahead to avoid useless match attempts when we know
1102
// what character comes next.
1105
if (OP(next) == EXACTLY)
1106
nextch = *OPERAND(next);
1107
min_no = (OP(scan) == STAR) ? 0 : 1;
1109
no = regrepeat(OPERAND(scan));
1110
while (no >= min_no) {
1111
// If it could work, try it.
1112
if (nextch == '\0' || *reginput == nextch)
1115
// Couldn't or didn't -- back up.
1117
reginput = save + no;
1123
return (1); // Success!
1126
//RAISE Error, SYM(cmRegularExpression), SYM(Internal_Error),
1127
printf ("cmRegularExpression::find(): Internal error -- memory corrupted.\n");
1134
// We get here only if there's trouble -- normally "case END" is the
1135
// terminating point.
1137
//RAISE Error, SYM(cmRegularExpression), SYM(Internal_Error),
1138
printf ("cmRegularExpression::find(): Internal error -- corrupted pointers.\n");
1144
- regrepeat - repeatedly match something simple, report how many
1146
static int regrepeat (const char* p) {
1147
register int count = 0;
1148
register const char* scan;
1149
register const char* opnd;
1155
count = strlen(scan);
1159
while (*opnd == *scan) {
1165
while (*scan != '\0' && strchr(opnd, *scan) != NULL) {
1171
while (*scan != '\0' && strchr(opnd, *scan) == NULL) {
1176
default: // Oh dear. Called inappropriately.
1177
//RAISE Error, SYM(cmRegularExpression), SYM(Internal_Error),
1178
printf ("cm RegularExpression::find(): Internal error.\n");
1187
- regnext - dig the "next" pointer out of a node
1189
static const char* regnext (register const char* p) {
1190
register int offset;
1200
return (p - offset);
1202
return (p + offset);
1206
static char* regnext (register char* p) {
1207
register int offset;
1217
return (p - offset);
1219
return (p + offset);