1
// Scintilla source code edit control
3
** Regular expression search library.
7
* regex - Regular expression pattern matching and replacement
9
* By: Ozan S. Yigit (oz)
10
* Dept. of Computer Science
13
* Original code available from http://www.cs.yorku.ca/~oz/
14
* Translation to C++ by Neil Hodgson neilh@scintilla.org
15
* Removed all use of register.
16
* Converted to modern function prototypes.
17
* Put all global/static variables into an object so this code can be
18
* used from multiple threads, etc.
19
* Some extensions by Philippe Lhoste PhiLho(a)GMX.net
20
* '?' extensions by Michael Mullin masmullin@gmail.com
22
* These routines are the PUBLIC DOMAIN equivalents of regex
23
* routines as found in 4.nBSD UN*X, with minor extensions.
25
* These routines are derived from various implementations found
26
* in software tools books, and Conroy's grep. They are NOT derived
27
* from licensed/restricted software.
28
* For more interesting/academic/complicated implementations,
29
* see Henry Spencer's regexp routines, or GNU Emacs pattern
32
* Modification history removed.
35
* RESearch::Compile: compile a regular expression into a NFA.
37
* const char *RESearch::Compile(const char *pattern, int length,
38
* bool caseSensitive, bool posix)
40
* Returns a short error string if they fail.
42
* RESearch::Execute: execute the NFA to match a pattern.
44
* int RESearch::Execute(characterIndexer &ci, int lp, int endp)
46
* re_fail: failure routine for RESearch::Execute. (no longer used)
48
* void re_fail(char *msg, char op)
50
* Regular Expressions:
52
* [1] char matches itself, unless it is a special
53
* character (metachar): . \ [ ] * + ? ^ $
54
* and ( ) if posix option.
56
* [2] . matches any character.
58
* [3] \ matches the character following it, except:
59
* - \a, \b, \f, \n, \r, \t, \v match the corresponding C
60
* escape char, respectively BEL, BS, FF, LF, CR, TAB and VT;
61
* Note that \r and \n are never matched because Scintilla
62
* regex searches are made line per line
63
* (stripped of end-of-line chars).
64
* - if not in posix mode, when followed by a
65
* left or right round bracket (see [8]);
66
* - when followed by a digit 1 to 9 (see [9]);
67
* - when followed by a left or right angle bracket
69
* - when followed by d, D, s, S, w or W (see [11]);
70
* - when followed by x and two hexa digits (see [12].
71
* Backslash is used as an escape character for all
72
* other meta-characters, and itself.
74
* [4] [set] matches one of the characters in the set.
75
* If the first character in the set is "^",
76
* it matches the characters NOT in the set, i.e.
77
* complements the set. A shorthand S-E (start dash end)
78
* is used to specify a set of characters S up to
79
* E, inclusive. S and E must be characters, otherwise
80
* the dash is taken literally (eg. in expression [\d-a]).
81
* The special characters "]" and "-" have no special
82
* meaning if they appear as the first chars in the set.
83
* To include both, put - first: [-]A-Z]
84
* (or just backslash them).
87
* [-]|] matches these 3 chars,
89
* []-|] matches from ] to | chars
91
* [a-z] any lowercase alpha
93
* [^-]] any char except - and ]
95
* [^A-Z] any char except uppercase
100
* [5] * any regular expression form [1] to [4]
101
* (except [8], [9] and [10] forms of [3]),
102
* followed by closure char (*)
103
* matches zero or more matches of that form.
105
* [6] + same as [5], except it matches one or more.
107
* [5-6] Both [5] and [6] are greedy (they match as much as possible).
108
* Unless they are followed by the 'lazy' quantifier (?)
109
* In which case both [5] and [6] try to match as little as possible
111
* [7] ? same as [5] except it matches zero or one.
113
* [8] a regular expression in the form [1] to [13], enclosed
114
* as \(form\) (or (form) with posix flag) matches what
115
* form matches. The enclosure creates a set of tags,
116
* used for [9] and for pattern substitution.
117
* The tagged forms are numbered starting from 1.
119
* [9] a \ followed by a digit 1 to 9 matches whatever a
120
* previously tagged regular expression ([8]) matched.
122
* [10] \< a regular expression starting with a \< construct
123
* \> and/or ending with a \> construct, restricts the
124
* pattern matching to the beginning of a word, and/or
125
* the end of a word. A word is defined to be a character
126
* string beginning and/or ending with the characters
127
* A-Z a-z 0-9 and _. Scintilla extends this definition
128
* by user setting. The word must also be preceded and/or
129
* followed by any character outside those mentioned.
131
* [11] \l a backslash followed by d, D, s, S, w or W,
132
* becomes a character class (both inside and
135
* D: any char except decimal digits
136
* s: whitespace (space, \t \n \r \f \v)
137
* S: any char except whitespace (see above)
138
* w: alphanumeric & underscore (changed by user setting)
139
* W: any char except alphanumeric & underscore (see above)
141
* [12] \xHH a backslash followed by x and two hexa digits,
142
* becomes the character whose Ascii code is equal
143
* to these digits. If not followed by two digits,
144
* it is 'x' char itself.
146
* [13] a composite regular expression xy where x and y
147
* are in the form [1] to [12] matches the longest
148
* match of x followed by a match for y.
150
* [14] ^ a regular expression starting with a ^ character
151
* $ and/or ending with a $ character, restricts the
152
* pattern matching to the beginning of the line,
153
* or the end of line. [anchors] Elsewhere in the
154
* pattern, ^ and $ are treated as ordinary characters.
159
* HCR's Hugh Redelmeier has been most helpful in various
160
* stages of development. He convinced me to include BOW
161
* and EOW constructs, originally invented by Rob Pike at
162
* the University of Toronto.
165
* Software tools Kernighan & Plauger
166
* Software tools in Pascal Kernighan & Plauger
167
* Grep [rsx-11 C dist] David Conroy
168
* ed - text editor Un*x Programmer's Manual
169
* Advanced editing on Un*x B. W. Kernighan
170
* RegExp routines Henry Spencer
174
* This implementation uses a bit-set representation for character
175
* classes for speed and compactness. Each character is represented
176
* by one bit in a 256-bit block. Thus, CCL always takes a
177
* constant 32 bytes in the internal nfa, and RESearch::Execute does a single
178
* bit comparison to locate the character in the set.
183
* compile: CHR f CHR o CLO CHR o END CLO ANY END END
184
* matches: fo foo fooo foobar fobar foxx ...
186
* pattern: fo[ob]a[rz]
187
* compile: CHR f CHR o CCL bitset CHR a CCL bitset END
188
* matches: fobar fooar fobaz fooaz
191
* compile: CHR f CHR o CHR o CHR \ CLO CHR \ END END
192
* matches: foo\ foo\\ foo\\\ ...
194
* pattern: \(foo\)[1-3]\1 (same as foo[1-3]foo)
195
* compile: BOT 1 CHR f CHR o CHR o EOT 1 CCL bitset REF 1 END
196
* matches: foo1foo foo2foo foo3foo
198
* pattern: \(fo.*\)-\1
199
* compile: BOT 1 CHR f CHR o CLO ANY END EOT 1 CHR - REF 1 END
200
* matches: foo-foo fo-fo fob-fob foobar-foobar ...
208
#include "CharClassify.h"
209
#include "RESearch.h"
212
using namespace Scintilla;
229
#define CLQ 12 /* 0 to 1 closure */
230
#define LCLO 13 /* lazy closure */
235
* The following defines are not meant to be changeable.
236
* They are for readability only.
241
const char bitarr[] = { 1, 2, 4, 8, 16, 32, 64, '\200' };
243
#define badpat(x) (*nfa = END, x)
246
* Character classification table for word boundary operators BOW
247
* and EOW is passed in by the creator of this object (Scintilla
248
* Document). The Document default state is that word chars are:
249
* 0-9, a-z, A-Z and _
252
RESearch::RESearch(CharClassify *charClassTable) {
254
charClass = charClassTable;
255
sta = NOP; /* status of lastpat */
257
std::fill(bittab, bittab + BITBLK, 0);
258
std::fill(tagstk, tagstk + MAXTAG, 0);
259
std::fill(nfa, nfa + MAXNFA, 0);
263
RESearch::~RESearch() {
267
void RESearch::Clear() {
268
for (int i = 0; i < MAXTAG; i++) {
275
void RESearch::GrabMatches(CharacterIndexer &ci) {
276
for (unsigned int i = 0; i < MAXTAG; i++) {
277
if ((bopat[i] != NOTFOUND) && (eopat[i] != NOTFOUND)) {
278
unsigned int len = eopat[i] - bopat[i];
280
for (unsigned int j = 0; j < len; j++)
281
pat[i][j] = ci.CharAt(bopat[i] + j);
286
void RESearch::ChSet(unsigned char c) {
287
bittab[((c) & BLKIND) >> 3] |= bitarr[(c) & BITIND];
290
void RESearch::ChSetWithCase(unsigned char c, bool caseSensitive) {
294
if ((c >= 'a') && (c <= 'z')) {
296
ChSet(static_cast<unsigned char>(c - 'a' + 'A'));
297
} else if ((c >= 'A') && (c <= 'Z')) {
299
ChSet(static_cast<unsigned char>(c - 'A' + 'a'));
306
unsigned char escapeValue(unsigned char ch) {
308
case 'a': return '\a';
309
case 'b': return '\b';
310
case 'f': return '\f';
311
case 'n': return '\n';
312
case 'r': return '\r';
313
case 't': return '\t';
314
case 'v': return '\v';
319
static int GetHexaChar(unsigned char hd1, unsigned char hd2) {
321
if (hd1 >= '0' && hd1 <= '9') {
322
hexValue += 16 * (hd1 - '0');
323
} else if (hd1 >= 'A' && hd1 <= 'F') {
324
hexValue += 16 * (hd1 - 'A' + 10);
325
} else if (hd1 >= 'a' && hd1 <= 'f') {
326
hexValue += 16 * (hd1 - 'a' + 10);
330
if (hd2 >= '0' && hd2 <= '9') {
331
hexValue += hd2 - '0';
332
} else if (hd2 >= 'A' && hd2 <= 'F') {
333
hexValue += hd2 - 'A' + 10;
334
} else if (hd2 >= 'a' && hd2 <= 'f') {
335
hexValue += hd2 - 'a' + 10;
343
* Called when the parser finds a backslash not followed
344
* by a valid expression (like \( in non-Posix mode).
345
* @param pattern: pointer on the char after the backslash.
346
* @param incr: (out) number of chars to skip after expression evaluation.
347
* @return the char if it resolves to a simple char,
348
* or -1 for a char class. In this case, bittab is changed.
350
int RESearch::GetBackslashExpression(
353
// Since error reporting is primitive and messages are not used anyway,
354
// I choose to interpret unexpected syntax in a logical way instead
355
// of reporting errors. Otherwise, we can stick on, eg., PCRE behavior.
356
incr = 0; // Most of the time, will skip the char "naturally".
359
unsigned char bsc = *pattern;
362
result = '\\'; // \ at end of pattern, take it literally
374
result = escapeValue(bsc);
377
unsigned char hd1 = *(pattern + 1);
378
unsigned char hd2 = *(pattern + 2);
379
int hexValue = GetHexaChar(hd1, hd2);
382
incr = 2; // Must skip the digits
384
result = 'x'; // \x without 2 digits: see it as 'x'
389
for (c = '0'; c <= '9'; c++) {
390
ChSet(static_cast<unsigned char>(c));
394
for (c = 0; c < MAXCHR; c++) {
395
if (c < '0' || c > '9') {
396
ChSet(static_cast<unsigned char>(c));
409
for (c = 0; c < MAXCHR; c++) {
410
if (c != ' ' && !(c >= 0x09 && c <= 0x0D)) {
411
ChSet(static_cast<unsigned char>(c));
416
for (c = 0; c < MAXCHR; c++) {
417
if (iswordc(static_cast<unsigned char>(c))) {
418
ChSet(static_cast<unsigned char>(c));
423
for (c = 0; c < MAXCHR; c++) {
424
if (!iswordc(static_cast<unsigned char>(c))) {
425
ChSet(static_cast<unsigned char>(c));
435
const char *RESearch::Compile(const char *pattern, int length, bool caseSensitive, bool posix) {
436
char *mp=nfa; /* nfa pointer */
437
char *lp; /* saved pointer */
438
char *sp=nfa; /* another one */
439
char *mpMax = mp + MAXNFA - BITBLK - 10;
441
int tagi = 0; /* tag stack index */
442
int tagc = 1; /* actual tag count */
445
char mask; /* xor mask -CCL/NCL */
446
int c1, c2, prevChar;
448
if (!pattern || !length) {
452
return badpat("No previous regular expression");
456
const char *p=pattern; /* pattern pointer */
457
for (int i=0; i<length; i++, p++) {
459
return badpat("Pattern too long");
463
case '.': /* match any char */
467
case '^': /* match beginning */
476
case '$': /* match endofline */
485
case '[': /* match char class */
498
if (*p == '-') { /* real dash */
503
if (*p == ']') { /* real brace */
508
while (*p && *p != ']') {
511
// Previous def. was a char class like \d, take dash literally
518
c2 = static_cast<unsigned char>(*++p);
520
if (!*(p+1)) { // End of RE
521
return badpat("Missing ]");
526
c2 = GetBackslashExpression(p, incr);
530
// Convention: \c (c is any char) is case sensitive, whatever the option
531
ChSet(static_cast<unsigned char>(c2));
534
// bittab is already changed
540
// Char after dash is char class like \d, take dash literally
544
// Put all chars between c1 and c2 included in the char set
546
ChSetWithCase(static_cast<unsigned char>(c1++), caseSensitive);
550
// Dash before the ], take it literally
555
return badpat("Missing ]");
557
} else if (*p == '\\' && *(p+1)) {
561
int c = GetBackslashExpression(p, incr);
565
// Convention: \c (c is any char) is case sensitive, whatever the option
566
ChSet(static_cast<unsigned char>(c));
569
// bittab is already changed
573
prevChar = static_cast<unsigned char>(*p);
574
ChSetWithCase(*p, caseSensitive);
580
return badpat("Missing ]");
582
for (n = 0; n < BITBLK; bittab[n++] = 0)
583
*mp++ = static_cast<char>(mask ^ bittab[n]);
587
case '*': /* match 0 or more... */
588
case '+': /* match 1 or more... */
591
return badpat("Empty closure");
592
lp = sp; /* previous opcode */
593
if (*lp == CLO || *lp == LCLO) /* equivalence... */
603
return badpat("Illegal closure");
609
for (sp = mp; lp < sp; lp++)
618
if (*p == '?') *mp = CLQ;
619
else if (*(p+1) == '?') *mp = LCLO;
625
case '\\': /* tags, backrefs... */
633
return badpat("Null pattern inside \\<\\>");
646
if (tagi > 0 && tagstk[tagi] == n)
647
return badpat("Cyclical reference");
649
*mp++ = static_cast<char>(REF);
650
*mp++ = static_cast<char>(n);
652
return badpat("Undetermined reference");
656
if (!posix && *p == '(') {
658
tagstk[++tagi] = tagc;
660
*mp++ = static_cast<char>(tagc++);
662
return badpat("Too many \\(\\) pairs");
664
} else if (!posix && *p == ')') {
666
return badpat("Null pattern inside \\(\\)");
668
*mp++ = static_cast<char>(EOT);
669
*mp++ = static_cast<char>(tagstk[tagi--]);
671
return badpat("Unmatched \\)");
675
int c = GetBackslashExpression(p, incr);
680
*mp++ = static_cast<unsigned char>(c);
684
for (n = 0; n < BITBLK; bittab[n++] = 0)
685
*mp++ = static_cast<char>(mask ^ bittab[n]);
691
default : /* an ordinary char */
692
if (posix && *p == '(') {
694
tagstk[++tagi] = tagc;
696
*mp++ = static_cast<char>(tagc++);
698
return badpat("Too many () pairs");
700
} else if (posix && *p == ')') {
702
return badpat("Null pattern inside ()");
704
*mp++ = static_cast<char>(EOT);
705
*mp++ = static_cast<char>(tagstk[tagi--]);
707
return badpat("Unmatched )");
710
unsigned char c = *p;
712
c = '\\'; // We take it as raw backslash
713
if (caseSensitive || !iswordc(c)) {
719
ChSetWithCase(c, false);
720
for (n = 0; n < BITBLK; bittab[n++] = 0)
721
*mp++ = static_cast<char>(mask ^ bittab[n]);
729
return badpat((posix ? "Unmatched (" : "Unmatched \\("));
737
* execute nfa to find a match.
739
* special cases: (nfa[0])
741
* Match only once, starting from the
744
* First locate the character without
745
* calling PMatch, and if found, call
746
* PMatch for the remaining string.
748
* RESearch::Compile failed, poor luser did not
749
* check for it. Fail fast.
751
* If a match is found, bopat[0] and eopat[0] are set
752
* to the beginning and the end of the matched fragment,
756
int RESearch::Execute(CharacterIndexer &ci, int lp, int endp) {
768
case BOL: /* anchored: match from BOL only */
769
ep = PMatch(ci, lp, endp, ap);
771
case EOL: /* just searching for end of line normal path doesn't work */
772
if (*(ap+1) == END) {
779
case CHR: /* ordinary char: locate it fast */
781
while ((lp < endp) && (static_cast<unsigned char>(ci.CharAt(lp)) != c))
783
if (lp >= endp) /* if EOS, fail, else fall through. */
785
default: /* regular matching all the way. */
787
ep = PMatch(ci, lp, endp, ap);
793
case END: /* munged automaton. fail always */
805
* PMatch: internal routine for the hard part
807
* This code is partly snarfed from an early grep written by
808
* David Conroy. The backref and tag stuff, and various other
809
* innovations are by oz.
811
* special case optimizations: (nfa[n], nfa[n+1])
813
* We KNOW .* will match everything up to the
814
* end of line. Thus, directly go to the end of
815
* line, without recursive PMatch calls. As in
816
* the other closure cases, the remaining pattern
817
* must be matched by moving backwards on the
818
* string recursively, to find a match for xy
819
* (x is ".*" and y is the remaining pattern)
820
* where the match satisfies the LONGEST match for
821
* x followed by a match for y.
823
* We can again scan the string forward for the
824
* single char and at the point of failure, we
825
* execute the remaining nfa recursively, same as
828
* At the end of a successful match, bopat[n] and eopat[n]
829
* are set to the beginning and end of subpatterns matched
830
* by tagged expressions (n = 1 to 9).
833
extern void re_fail(char *,char);
835
#define isinset(x,y) ((x)[((y)&BLKIND)>>3] & bitarr[(y)&BITIND])
838
* skip values for CLO XXX to skip past the closure
841
#define ANYSKIP 2 /* [CLO] ANY END */
842
#define CHRSKIP 3 /* [CLO] CHR chr END */
843
#define CCLSKIP 34 /* [CLO] CCL 32 bytes END */
845
int RESearch::PMatch(CharacterIndexer &ci, int lp, int endp, char *ap) {
847
int e; /* extra pointer for CLO */
848
int bp; /* beginning of subpat... */
849
int ep; /* ending of subpat... */
850
int are; /* to save the line ptr. */
851
int llp; /* lazy lp for LCLO */
853
while ((op = *ap++) != END)
857
if (ci.CharAt(lp++) != *ap++)
881
bopat[static_cast<int>(*ap++)] = lp;
884
eopat[static_cast<int>(*ap++)] = lp;
887
if ((lp!=bol && iswordc(ci.CharAt(lp-1))) || !iswordc(ci.CharAt(lp)))
891
if (lp==bol || !iswordc(ci.CharAt(lp-1)) || iswordc(ci.CharAt(lp)))
899
if (ci.CharAt(bp++) != ci.CharAt(lp++))
909
if (op == CLO || op == LCLO)
919
if (op == CLO || op == LCLO)
920
while ((lp < endp) && (c == ci.CharAt(lp)))
922
else if ((lp < endp) && (c == ci.CharAt(lp)))
927
while ((lp < endp) && isinset(ap+1,ci.CharAt(lp)))
933
//re_fail("closure: bad nfa.", *ap);
942
if ((q = PMatch(ci, llp, endp, ap)) != NOTFOUND) {
945
if (op != LCLO) return e;
947
if (*ap == END) return e;
951
PMatch(ci, lp, endp, ap);
954
//re_fail("RESearch::Execute: bad nfa.", static_cast<char>(op));