3
* This file is #included by regcomp.c.
5
* Copyright (c) 1998, 1999 Henry Spencer. All rights reserved.
7
* Development of this software was funded, in part, by Cray Research Inc.,
8
* UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
9
* Corporation, none of whom are responsible for the results. The author
12
* Redistribution and use in source and binary forms -- with or without
13
* modification -- are permitted for any purpose, provided that
14
* redistributions in source form retain this entire copyright notice and
15
* indicate the origin and nature of any modifications.
17
* I'd appreciate being given credit for this package in the documentation
18
* of software which uses it, but that is not a requirement.
20
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
21
* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
22
* AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
23
* HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
24
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
26
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
28
* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
29
* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31
* $PostgreSQL: pgsql/src/backend/regex/regc_lex.c,v 1.4 2003-11-29 19:51:55 pgsql Exp $
35
/* scanning macros (know about v) */
36
#define ATEOS() (v->now >= v->stop)
37
#define HAVE(n) (v->stop - v->now >= (n))
38
#define NEXT1(c) (!ATEOS() && *v->now == CHR(c))
39
#define NEXT2(a,b) (HAVE(2) && *v->now == CHR(a) && *(v->now+1) == CHR(b))
40
#define NEXT3(a,b,c) (HAVE(3) && *v->now == CHR(a) && \
41
*(v->now+1) == CHR(b) && \
42
*(v->now+2) == CHR(c))
43
#define SET(c) (v->nexttype = (c))
44
#define SETV(c, n) (v->nexttype = (c), v->nextvalue = (n))
45
#define RET(c) return (SET(c), 1)
46
#define RETV(c, n) return (SETV(c, n), 1)
47
#define FAILW(e) return (ERR(e), 0) /* ERR does SET(EOS) */
48
#define LASTTYPE(t) (v->lasttype == (t))
50
/* lexical contexts */
51
#define L_ERE 1 /* mainline ERE/ARE */
52
#define L_BRE 2 /* mainline BRE */
53
#define L_Q 3 /* REG_QUOTE */
54
#define L_EBND 4 /* ERE/ARE bound */
55
#define L_BBND 5 /* BRE bound */
56
#define L_BRACK 6 /* brackets */
57
#define L_CEL 7 /* collating element */
58
#define L_ECL 8 /* equivalence class */
59
#define L_CCL 9 /* character class */
60
#define INTOCON(c) (v->lexcon = (c))
61
#define INCON(con) (v->lexcon == (con))
63
/* construct pointer past end of chr array */
64
#define ENDOF(array) ((array) + sizeof(array)/sizeof(chr))
67
* lexstart - set up lexical stuff, scan leading options
70
lexstart(struct vars * v)
72
prefixes(v); /* may turn on new type bits etc. */
75
if (v->cflags & REG_QUOTE)
77
assert(!(v->cflags & (REG_ADVANCED | REG_EXPANDED | REG_NEWLINE)));
80
else if (v->cflags & REG_EXTENDED)
82
assert(!(v->cflags & REG_QUOTE));
87
assert(!(v->cflags & (REG_QUOTE | REG_ADVF)));
91
v->nexttype = EMPTY; /* remember we were at the start */
92
next(v); /* set up the first token */
96
* prefixes - implement various special prefixes
99
prefixes(struct vars * v)
101
/* literal string doesn't get any of this stuff */
102
if (v->cflags & REG_QUOTE)
105
/* initial "***" gets special things */
106
if (HAVE(4) && NEXT3('*', '*', '*'))
107
switch (*(v->now + 3))
109
case CHR('?'): /* "***?" error, msg shows version */
111
return; /* proceed no further */
113
case CHR('='): /* "***=" shifts to literal string */
115
v->cflags |= REG_QUOTE;
116
v->cflags &= ~(REG_ADVANCED | REG_EXPANDED | REG_NEWLINE);
118
return; /* and there can be no more prefixes */
120
case CHR(':'): /* "***:" shifts to AREs */
122
v->cflags |= REG_ADVANCED;
125
default: /* otherwise *** is just an error */
131
/* BREs and EREs don't get embedded options */
132
if ((v->cflags & REG_ADVANCED) != REG_ADVANCED)
135
/* embedded options (AREs only) */
136
if (HAVE(3) && NEXT2('(', '?') && iscalpha(*(v->now + 2)))
140
for (; !ATEOS() && iscalpha(*v->now); v->now++)
143
case CHR('b'): /* BREs (but why???) */
144
v->cflags &= ~(REG_ADVANCED | REG_QUOTE);
146
case CHR('c'): /* case sensitive */
147
v->cflags &= ~REG_ICASE;
149
case CHR('e'): /* plain EREs */
150
v->cflags |= REG_EXTENDED;
151
v->cflags &= ~(REG_ADVF | REG_QUOTE);
153
case CHR('i'): /* case insensitive */
154
v->cflags |= REG_ICASE;
156
case CHR('m'): /* Perloid synonym for n */
157
case CHR('n'): /* \n affects ^ $ . [^ */
158
v->cflags |= REG_NEWLINE;
160
case CHR('p'): /* ~Perl, \n affects . [^ */
161
v->cflags |= REG_NLSTOP;
162
v->cflags &= ~REG_NLANCH;
164
case CHR('q'): /* literal string */
165
v->cflags |= REG_QUOTE;
166
v->cflags &= ~REG_ADVANCED;
168
case CHR('s'): /* single line, \n ordinary */
169
v->cflags &= ~REG_NEWLINE;
171
case CHR('t'): /* tight syntax */
172
v->cflags &= ~REG_EXPANDED;
174
case CHR('w'): /* weird, \n affects ^ $ only */
175
v->cflags &= ~REG_NLSTOP;
176
v->cflags |= REG_NLANCH;
178
case CHR('x'): /* expanded syntax */
179
v->cflags |= REG_EXPANDED;
191
if (v->cflags & REG_QUOTE)
192
v->cflags &= ~(REG_EXPANDED | REG_NEWLINE);
197
* lexnest - "call a subroutine", interpolating string at the lexical level
199
* Note, this is not a very general facility. There are a number of
200
* implicit assumptions about what sorts of strings can be subroutines.
203
lexnest(struct vars * v,
204
chr *beginp, /* start of interpolation */
205
chr *endp) /* one past end of interpolation */
207
assert(v->savenow == NULL); /* only one level of nesting */
209
v->savestop = v->stop;
215
* string constants to interpolate as expansions of things like \d
217
static chr backd[] = { /* \d */
218
CHR('['), CHR('['), CHR(':'),
219
CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
220
CHR(':'), CHR(']'), CHR(']')
222
static chr backD[] = { /* \D */
223
CHR('['), CHR('^'), CHR('['), CHR(':'),
224
CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
225
CHR(':'), CHR(']'), CHR(']')
227
static chr brbackd[] = { /* \d within brackets */
229
CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
232
static chr backs[] = { /* \s */
233
CHR('['), CHR('['), CHR(':'),
234
CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
235
CHR(':'), CHR(']'), CHR(']')
237
static chr backS[] = { /* \S */
238
CHR('['), CHR('^'), CHR('['), CHR(':'),
239
CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
240
CHR(':'), CHR(']'), CHR(']')
242
static chr brbacks[] = { /* \s within brackets */
244
CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
247
static chr backw[] = { /* \w */
248
CHR('['), CHR('['), CHR(':'),
249
CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
250
CHR(':'), CHR(']'), CHR('_'), CHR(']')
252
static chr backW[] = { /* \W */
253
CHR('['), CHR('^'), CHR('['), CHR(':'),
254
CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
255
CHR(':'), CHR(']'), CHR('_'), CHR(']')
257
static chr brbackw[] = { /* \w within brackets */
259
CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
260
CHR(':'), CHR(']'), CHR('_')
264
* lexword - interpolate a bracket expression for word characters
265
* Possibly ought to inquire whether there is a "word" character class.
268
lexword(struct vars * v)
270
lexnest(v, backw, ENDOF(backw));
274
* next - get next token
276
static int /* 1 normal, 0 failure */
277
next(struct vars * v)
281
/* errors yield an infinite sequence of failures */
283
return 0; /* the error has set nexttype to EOS */
285
/* remember flavor of last token */
286
v->lasttype = v->nexttype;
289
if (v->nexttype == EMPTY && (v->cflags & REG_BOSONLY))
291
/* at start of a REG_BOSONLY RE */
292
RETV(SBEGIN, 0); /* same as \A */
295
/* if we're nested and we've hit end, return to outer level */
296
if (v->savenow != NULL && ATEOS())
299
v->stop = v->savestop;
300
v->savenow = v->savestop = NULL;
303
/* skip white space etc. if appropriate (not in literal or []) */
304
if (v->cflags & REG_EXPANDED)
315
/* handle EOS, depending on context */
339
/* okay, time to actually get a character */
342
/* deal with the easy contexts, punt EREs to code below */
345
case L_BRE: /* punt BREs to separate function */
346
return brenext(v, c);
348
case L_ERE: /* see below */
350
case L_Q: /* literal strings are easy */
353
case L_BBND: /* bounds are fairly simple */
367
RETV(DIGIT, (chr) DIGITVAL(c));
372
case CHR('}'): /* ERE bound ends with } */
376
if ((v->cflags & REG_ADVF) && NEXT1('?'))
387
case CHR('\\'): /* BRE bound ends with \} */
388
if (INCON(L_BBND) && NEXT1('}'))
403
case L_BRACK: /* brackets are not too hard */
411
INTOCON((v->cflags & REG_EXTENDED) ?
418
if (!(v->cflags & REG_ADVF))
423
(DISCARD) lexescape(v);
425
{ /* not all escapes okay here */
430
switch (v->nextvalue)
433
lexnest(v, brbackd, ENDOF(brbackd));
436
lexnest(v, brbacks, ENDOF(brbacks));
439
lexnest(v, brbackw, ENDOF(brbackw));
445
/* lexnest done, back up and try again */
446
v->nexttype = v->lasttype;
450
/* not one of the acceptable escapes */
454
if (LASTTYPE('[') || NEXT1(']'))
466
/* might or might not be locale-specific */
492
case L_CEL: /* collating elements are easy */
493
if (c == CHR('.') && NEXT1(']'))
502
case L_ECL: /* ditto equivalence classes */
503
if (c == CHR('=') && NEXT1(']'))
512
case L_CCL: /* ditto character classes */
513
if (c == CHR(':') && NEXT1(']'))
527
/* that got rid of everything except EREs and AREs */
528
assert(INCON(L_ERE));
530
/* deal with EREs and AREs, except for backslashes */
537
if ((v->cflags & REG_ADVF) && NEXT1('?'))
546
if ((v->cflags & REG_ADVF) && NEXT1('?'))
555
if ((v->cflags & REG_ADVF) && NEXT1('?'))
563
case CHR('{'): /* bounds start or plain character */
564
if (v->cflags & REG_EXPANDED)
566
if (ATEOS() || !iscdigit(*v->now))
580
case CHR('('): /* parenthesis, or advanced extension */
581
if ((v->cflags & REG_ADVF) && NEXT1('?'))
587
case CHR(':'): /* non-capturing paren */
590
case CHR('#'): /* comment */
591
while (!ATEOS() && *v->now != CHR(')'))
595
assert(v->nexttype == v->lasttype);
598
case CHR('='): /* positive lookahead */
599
NOTE(REG_ULOOKAHEAD);
602
case CHR('!'): /* negative lookahead */
603
NOTE(REG_ULOOKAHEAD);
612
if (v->cflags & REG_NOSUB)
613
RETV('(', 0); /* all parens non-capturing */
622
case CHR('['): /* easy except for [[:<:]] and [[:>:]] */
623
if (HAVE(6) && *(v->now + 0) == CHR('[') &&
624
*(v->now + 1) == CHR(':') &&
625
(*(v->now + 2) == CHR('<') ||
626
*(v->now + 2) == CHR('>')) &&
627
*(v->now + 3) == CHR(':') &&
628
*(v->now + 4) == CHR(']') &&
629
*(v->now + 5) == CHR(']'))
634
RET((c == CHR('<')) ? '<' : '>');
653
case CHR('\\'): /* mostly punt backslashes to code below */
657
default: /* ordinary character */
662
/* ERE/ARE backslash handling; backslash already eaten */
664
if (!(v->cflags & REG_ADVF))
665
{ /* only AREs have non-trivial escapes */
666
if (iscalnum(*v->now))
671
RETV(PLAIN, *v->now++);
673
(DISCARD) lexescape(v);
676
if (v->nexttype == CCLASS)
677
{ /* fudge at lexical level */
678
switch (v->nextvalue)
681
lexnest(v, backd, ENDOF(backd));
684
lexnest(v, backD, ENDOF(backD));
687
lexnest(v, backs, ENDOF(backs));
690
lexnest(v, backS, ENDOF(backS));
693
lexnest(v, backw, ENDOF(backw));
696
lexnest(v, backW, ENDOF(backW));
703
/* lexnest done, back up and try again */
704
v->nexttype = v->lasttype;
707
/* otherwise, lexescape has already done the work */
712
* lexescape - parse an ARE backslash escape (backslash already eaten)
713
* Note slightly nonstandard use of the CCLASS type code.
715
static int /* not actually used, but convenient for
717
lexescape(struct vars * v)
720
static chr alert[] = {
721
CHR('a'), CHR('l'), CHR('e'), CHR('r'), CHR('t')
724
CHR('E'), CHR('S'), CHR('C')
728
assert(v->cflags & REG_ADVF);
739
RETV(PLAIN, chrnamed(v, alert, ENDOF(alert), CHR('\007')));
745
RETV(PLAIN, CHR('\b'));
748
RETV(PLAIN, CHR('\\'));
754
RETV(PLAIN, (chr) (*v->now++ & 037));
766
RETV(PLAIN, chrnamed(v, esc, ENDOF(esc), CHR('\033')));
769
RETV(PLAIN, CHR('\f'));
778
RETV(PLAIN, CHR('\n'));
781
RETV(PLAIN, CHR('\r'));
792
RETV(PLAIN, CHR('\t'));
795
c = lexdigits(v, 16, 4, 4);
801
c = lexdigits(v, 16, 8, 8);
807
RETV(PLAIN, CHR('\v'));
819
c = lexdigits(v, 16, 1, 255); /* REs >255 long outside
846
v->now--; /* put first digit back */
847
c = lexdigits(v, 10, 1, 255); /* REs >255 long outside
851
/* ugly heuristic (first test is "exactly 1 digit?") */
852
if (v->now - save == 0 || (int) c <= v->nsubexp)
855
RETV(BACKREF, (chr) c);
857
/* oops, doesn't look like it's a backref after all... */
859
/* and fall through into octal number */
862
v->now--; /* put first digit back */
863
c = lexdigits(v, 8, 1, 3);
870
FAILW(REG_EESCAPE); /* unknown alphabetic escape */
877
* lexdigits - slurp up digits and return chr value
879
static chr /* chr value; errors signalled via ERR */
880
lexdigits(struct vars * v,
885
uchr n; /* unsigned to avoid overflow misbehavior */
889
const uchr ub = (uchr) base;
892
for (len = 0; len < maxlen && !ATEOS(); len++)
934
v->now--; /* oops, not a digit at all */
940
{ /* not a plausible digit */
945
break; /* NOTE BREAK OUT */
946
n = n * ub + (uchr) d;
955
* brenext - get next BRE token
957
* This is much like EREs except for all the stupid backslashes and the
958
* context-dependency of some things.
960
static int /* 1 normal, 0 failure */
961
brenext(struct vars * v,
969
if (LASTTYPE(EMPTY) || LASTTYPE('(') || LASTTYPE('^'))
974
if (HAVE(6) && *(v->now + 0) == CHR('[') &&
975
*(v->now + 1) == CHR(':') &&
976
(*(v->now + 2) == CHR('<') ||
977
*(v->now + 2) == CHR('>')) &&
978
*(v->now + 3) == CHR(':') &&
979
*(v->now + 4) == CHR(']') &&
980
*(v->now + 5) == CHR(']'))
985
RET((c == CHR('<')) ? '<' : '>');
1009
if (v->cflags & REG_EXPANDED)
1013
if (NEXT2('\\', ')'))
1021
break; /* see below */
1027
assert(c == CHR('\\'));
1047
NOTE(REG_UNONPOSIX);
1051
NOTE(REG_UNONPOSIX);
1064
RETV(BACKREF, (chr) DIGITVAL(c));
1080
* skip - skip white space and comments in expanded form
1083
skip(struct vars * v)
1085
chr *start = v->now;
1087
assert(v->cflags & REG_EXPANDED);
1091
while (!ATEOS() && iscspace(*v->now))
1093
if (ATEOS() || *v->now != CHR('#'))
1094
break; /* NOTE BREAK OUT */
1096
while (!ATEOS() && *v->now != CHR('\n'))
1098
/* leave the newline to be picked up by the iscspace loop */
1101
if (v->now != start)
1102
NOTE(REG_UNONPOSIX);
1106
* newline - return the chr for a newline
1108
* This helps confine use of CHR to this source file.
1117
* chrnamed - return the chr known by a given (chr string) name
1119
* The code is a bit clumsy, but this routine gets only such specialized
1120
* use that it hardly matters.
1123
chrnamed(struct vars * v,
1124
chr *startp, /* start of name */
1125
chr *endp, /* just past end of name */
1126
chr lastresort) /* what to return if name lookup fails */
1135
c = element(v, startp, endp);
1140
return (chr) lastresort;
1142
cv = range(v, c, c, 0);
1144
return (chr) lastresort;