1
// Scintilla source code edit control
3
** Regular expression search library.
7
* regex - Regular expression pattern matching and replacement
9
* By: Ozan S. Yigit (oz)
10
* Dept. of Computer Science
13
* Original code available from http://www.cs.yorku.ca/~oz/
14
* Translation to C++ by Neil Hodgson neilh@scintilla.org
15
* Removed all use of register.
16
* Converted to modern function prototypes.
17
* Put all global/static variables into an object so this code can be
18
* used from multiple threads, etc.
20
* These routines are the PUBLIC DOMAIN equivalents of regex
21
* routines as found in 4.nBSD UN*X, with minor extensions.
23
* These routines are derived from various implementations found
24
* in software tools books, and Conroy's grep. They are NOT derived
25
* from licensed/restricted software.
26
* For more interesting/academic/complicated implementations,
27
* see Henry Spencer's regexp routines, or GNU Emacs pattern
30
* Modification history removed.
33
* RESearch::Compile: compile a regular expression into a NFA.
35
* const char *RESearch::Compile(const char *pat, int length,
36
* bool caseSensitive, bool posix)
38
* Returns a short error string if they fail.
40
* RESearch::Execute: execute the NFA to match a pattern.
42
* int RESearch::Execute(characterIndexer &ci, int lp, int endp)
44
* RESearch::Substitute: substitute the matched portions in a new string.
46
* int RESearch::Substitute(CharacterIndexer &ci, char *src, char *dst)
48
* re_fail: failure routine for RESearch::Execute. (no longer used)
50
* void re_fail(char *msg, char op)
52
* Regular Expressions:
54
* [1] char matches itself, unless it is a special
55
* character (metachar): . \ [ ] * + ^ $
56
* and ( ) if posix option.
58
* [2] . matches any character.
60
* [3] \ matches the character following it, except:
61
* - \a, \b, \f, \n, \t, \v match the
62
* corresponding C escape char;
63
* - if not in posix mode, when followed by a
64
* left or right round bracket (see [7]);
65
* - when followed by a digit 1 to 9 (see [8]);
66
* - when followed by a left or right angle bracket
68
* It is used as an escape character for all
69
* other meta-characters, and itself. When used
70
* in a set ([4]), it is treated as an ordinary
71
* character (except for escape chars).
73
* [4] [set] matches one of the characters in the set.
74
* If the first character in the set is "^",
75
* it matches a character NOT in the set, i.e.
76
* complements the set. A shorthand S-E (start-end)
77
* is used to specify a set of characters S upto
78
* E, inclusive. The special characters "]" and
79
* "-" have no special meaning if they appear
80
* as the first chars in the set. To include both,
81
* put - first: [-]A-Z]:
82
* [-]|] matches these 2 chars,
83
* []-|] matches from ] to | chars.
86
* [a-z] any lowercase alpha
88
* [^-]] any char except - and ]
90
* [^A-Z] any char except uppercase
95
* [5] * any regular expression form [1] to [4], followed by
96
* closure char (*) matches zero or more matches of
99
* [6] + same as [5], except it matches one or more.
101
* [7] a regular expression in the form [1] to [10], enclosed
102
* as \(form\) (or (form) with posix flag) matches what
103
* form matches. The enclosure creates a set of tags,
104
* used for [8] and for pattern substitution.
105
* The tagged forms are numbered starting from 1.
107
* [8] a \ followed by a digit 1 to 9 matches whatever a
108
* previously tagged regular expression ([7]) matched.
110
* [9] \< a regular expression starting with a \< construct
111
* \> and/or ending with a \> construct, restricts the
112
* pattern matching to the beginning of a word, and/or
113
* the end of a word. A word is defined to be a character
114
* string beginning and/or ending with the characters
115
* A-Z a-z 0-9 and _. It must also be preceded and/or
116
* followed by any character outside those mentioned.
118
* [10] a composite regular expression xy where x and y
119
* are in the form [1] to [10] matches the longest
120
* match of x followed by a match for y.
122
* [11] ^ a regular expression starting with a ^ character
123
* $ and/or ending with a $ character, restricts the
124
* pattern matching to the beginning of the line,
125
* or the end of line. [anchors] Elsewhere in the
126
* pattern, ^ and $ are treated as ordinary characters.
131
* HCR's Hugh Redelmeier has been most helpful in various
132
* stages of development. He convinced me to include BOW
133
* and EOW constructs, originally invented by Rob Pike at
134
* the University of Toronto.
137
* Software tools Kernighan & Plauger
138
* Software tools in Pascal Kernighan & Plauger
139
* Grep [rsx-11 C dist] David Conroy
140
* ed - text editor Un*x Programmer's Manual
141
* Advanced editing on Un*x B. W. Kernighan
142
* RegExp routines Henry Spencer
146
* This implementation uses a bit-set representation for character
147
* classes for speed and compactness. Each character is represented
148
* by one bit in a 256-bit block. Thus, CCL always takes a
149
* constant 32 bytes in the internal nfa, and RESearch::Execute does a single
150
* bit comparison to locate the character in the set.
155
* compile: CHR f CHR o CLO CHR o END CLO ANY END END
156
* matches: fo foo fooo foobar fobar foxx ...
158
* pattern: fo[ob]a[rz]
159
* compile: CHR f CHR o CCL bitset CHR a CCL bitset END
160
* matches: fobar fooar fobaz fooaz
163
* compile: CHR f CHR o CHR o CHR \ CLO CHR \ END END
164
* matches: foo\ foo\\ foo\\\ ...
166
* pattern: \(foo\)[1-3]\1 (same as foo[1-3]foo)
167
* compile: BOT 1 CHR f CHR o CHR o EOT 1 CCL bitset REF 1 END
168
* matches: foo1foo foo2foo foo3foo
170
* pattern: \(fo.*\)-\1
171
* compile: BOT 1 CHR f CHR o CLO ANY END EOT 1 CHR - REF 1 END
172
* matches: foo-foo fo-fo fob-fob foobar-foobar ...
175
#include "CharClassify.h"
176
#include "RESearch.h"
178
// Shut up annoying Visual C++ warnings:
180
#pragma warning(disable: 4514)
201
* The following defines are not meant to be changeable.
202
* They are for readability only.
207
const char bitarr[] = {1,2,4,8,16,32,64,'\200'};
209
#define badpat(x) (*nfa = END, x)
212
* Character classification table for word boundary operators BOW
213
* and EOW is passed in by the creator of this object (Scintilla
214
* Document). The Document default state is that word chars are:
218
RESearch::RESearch(CharClassify *charClassTable) {
219
charClass = charClassTable;
223
RESearch::~RESearch() {
227
void RESearch::Init() {
228
sta = NOP; /* status of lastpat */
230
for (int i=0; i<MAXTAG; i++)
232
for (int j=0; j<BITBLK; j++)
236
void RESearch::Clear() {
237
for (int i=0; i<MAXTAG; i++) {
245
bool RESearch::GrabMatches(CharacterIndexer &ci) {
247
for (unsigned int i=0; i<MAXTAG; i++) {
248
if ((bopat[i] != NOTFOUND) && (eopat[i] != NOTFOUND)) {
249
unsigned int len = eopat[i] - bopat[i];
250
pat[i] = new char[len + 1];
252
for (unsigned int j=0; j<len; j++)
253
pat[i][j] = ci.CharAt(bopat[i] + j);
263
void RESearch::ChSet(char c) {
264
bittab[((c) & BLKIND) >> 3] |= bitarr[(c) & BITIND];
267
void RESearch::ChSetWithCase(char c, bool caseSensitive) {
271
if ((c >= 'a') && (c <= 'z')) {
273
ChSet(static_cast<char>(c - 'a' + 'A'));
274
} else if ((c >= 'A') && (c <= 'Z')) {
276
ChSet(static_cast<char>(c - 'A' + 'a'));
283
const char escapeValue(char ch) {
285
case 'a': return '\a';
286
case 'b': return '\b';
287
case 'f': return '\f';
288
case 'n': return '\n';
289
case 'r': return '\r';
290
case 't': return '\t';
291
case 'v': return '\v';
296
const char *RESearch::Compile(const char *pat, int length, bool caseSensitive, bool posix) {
297
char *mp=nfa; /* nfa pointer */
298
char *lp; /* saved pointer */
299
char *sp=nfa; /* another one */
300
char *mpMax = mp + MAXNFA - BITBLK - 10;
302
int tagi = 0; /* tag stack index */
303
int tagc = 1; /* actual tag count */
306
char mask; /* xor mask -CCL/NCL */
313
return badpat("No previous regular expression");
316
const char *p=pat; /* pattern pointer */
317
for (int i=0; i<length; i++, p++) {
319
return badpat("Pattern too long");
323
case '.': /* match any char */
327
case '^': /* match beginning */
336
case '$': /* match endofline */
345
case '[': /* match char class */
356
if (*p == '-') { /* real dash */
360
if (*p == ']') { /* real brace */
364
while (*p && *p != ']') {
365
if (*p == '-' && *(p+1) && *(p+1) != ']') {
372
ChSetWithCase(static_cast<char>(c1++), caseSensitive);
374
} else if (*p == '\\' && *(p+1)) {
377
char escape = escapeValue(*p);
379
ChSetWithCase(escape, caseSensitive);
381
ChSetWithCase(*p, caseSensitive);
386
ChSetWithCase(*p++, caseSensitive);
390
return badpat("Missing ]");
392
for (n = 0; n < BITBLK; bittab[n++] = (char) 0)
393
*mp++ = static_cast<char>(mask ^ bittab[n]);
397
case '*': /* match 0 or more... */
398
case '+': /* match 1 or more... */
400
return badpat("Empty closure");
401
lp = sp; /* previous opcode */
402
if (*lp == CLO) /* equivalence... */
412
return badpat("Illegal closure");
418
for (sp = mp; lp < sp; lp++)
430
case '\\': /* tags, backrefs... */
439
return badpat("Null pattern inside \\<\\>");
452
if (tagi > 0 && tagstk[tagi] == n)
453
return badpat("Cyclical reference");
455
*mp++ = static_cast<char>(REF);
456
*mp++ = static_cast<char>(n);
459
return badpat("Undetermined reference");
469
*mp++ = escapeValue(*p);
472
if (!posix && *p == '(') {
474
tagstk[++tagi] = tagc;
476
*mp++ = static_cast<char>(tagc++);
479
return badpat("Too many \\(\\) pairs");
480
} else if (!posix && *p == ')') {
482
return badpat("Null pattern inside \\(\\)");
484
*mp++ = static_cast<char>(EOT);
485
*mp++ = static_cast<char>(tagstk[tagi--]);
488
return badpat("Unmatched \\)");
496
default : /* an ordinary char */
497
if (posix && *p == '(') {
499
tagstk[++tagi] = tagc;
501
*mp++ = static_cast<char>(tagc++);
504
return badpat("Too many () pairs");
505
} else if (posix && *p == ')') {
507
return badpat("Null pattern inside ()");
509
*mp++ = static_cast<char>(EOT);
510
*mp++ = static_cast<char>(tagstk[tagi--]);
513
return badpat("Unmatched )");
514
} else if (caseSensitive) {
520
ChSetWithCase(*p, false);
521
for (n = 0; n < BITBLK; bittab[n++] = (char) 0)
522
*mp++ = static_cast<char>(mask ^ bittab[n]);
529
return badpat((posix ? "Unmatched (" : "Unmatched \\("));
537
* execute nfa to find a match.
539
* special cases: (nfa[0])
541
* Match only once, starting from the
544
* First locate the character without
545
* calling PMatch, and if found, call
546
* PMatch for the remaining string.
548
* RESearch::Compile failed, poor luser did not
549
* check for it. Fail fast.
551
* If a match is found, bopat[0] and eopat[0] are set
552
* to the beginning and the end of the matched fragment,
557
int RESearch::Execute(CharacterIndexer &ci, int lp, int endp) {
569
case BOL: /* anchored: match from BOL only */
570
ep = PMatch(ci, lp, endp, ap);
572
case EOL: /* just searching for end of line normal path doesn't work */
573
if (*(ap+1) == END) {
580
case CHR: /* ordinary char: locate it fast */
582
while ((lp < endp) && (ci.CharAt(lp) != c))
584
if (lp >= endp) /* if EOS, fail, else fall thru. */
586
default: /* regular matching all the way. */
588
ep = PMatch(ci, lp, endp, ap);
594
case END: /* munged automaton. fail always */
606
* PMatch: internal routine for the hard part
608
* This code is partly snarfed from an early grep written by
609
* David Conroy. The backref and tag stuff, and various other
610
* innovations are by oz.
612
* special case optimizations: (nfa[n], nfa[n+1])
614
* We KNOW .* will match everything upto the
615
* end of line. Thus, directly go to the end of
616
* line, without recursive PMatch calls. As in
617
* the other closure cases, the remaining pattern
618
* must be matched by moving backwards on the
619
* string recursively, to find a match for xy
620
* (x is ".*" and y is the remaining pattern)
621
* where the match satisfies the LONGEST match for
622
* x followed by a match for y.
624
* We can again scan the string forward for the
625
* single char and at the point of failure, we
626
* execute the remaining nfa recursively, same as
629
* At the end of a successful match, bopat[n] and eopat[n]
630
* are set to the beginning and end of subpatterns matched
631
* by tagged expressions (n = 1 to 9).
634
extern void re_fail(char *,char);
636
#define isinset(x,y) ((x)[((y)&BLKIND)>>3] & bitarr[(y)&BITIND])
639
* skip values for CLO XXX to skip past the closure
642
#define ANYSKIP 2 /* [CLO] ANY END */
643
#define CHRSKIP 3 /* [CLO] CHR chr END */
644
#define CCLSKIP 34 /* [CLO] CCL 32 bytes END */
646
int RESearch::PMatch(CharacterIndexer &ci, int lp, int endp, char *ap) {
648
int e; /* extra pointer for CLO */
649
int bp; /* beginning of subpat... */
650
int ep; /* ending of subpat... */
651
int are; /* to save the line ptr. */
653
while ((op = *ap++) != END)
657
if (ci.CharAt(lp++) != *ap++)
685
if (lp!=bol && iswordc(ci.CharAt(lp-1)) || !iswordc(ci.CharAt(lp)))
689
if (lp==bol || !iswordc(ci.CharAt(lp-1)) || iswordc(ci.CharAt(lp)))
697
if (ci.CharAt(bp++) != ci.CharAt(lp++))
711
while ((lp < endp) && (c == ci.CharAt(lp)))
716
while ((lp < endp) && isinset(ap+1,ci.CharAt(lp)))
722
//re_fail("closure: bad nfa.", *ap);
729
if ((e = PMatch(ci, lp, endp, ap)) != NOTFOUND)
735
//re_fail("RESearch::Execute: bad nfa.", static_cast<char>(op));
742
* RESearch::Substitute:
743
* substitute the matched portions of the src in dst.
745
* & substitute the entire matched pattern.
747
* \digit substitute a subpattern, with the given tag number.
748
* Tags are numbered from 1 to 9. If the particular
749
* tagged subpattern does not exist, null is substituted.
751
int RESearch::Substitute(CharacterIndexer &ci, char *src, char *dst) {
757
if (!*src || !bopat[0])
760
while ((c = *src++) != 0) {
769
if (c >= '0' && c <= '9') {
779
if ((bp = bopat[pin]) != 0 && (ep = eopat[pin]) != 0) {
780
while (ci.CharAt(bp) && bp < ep)
781
*dst++ = ci.CharAt(bp++);