~vcs-imports/mammoth-replicator/trunk

« back to all changes in this revision

Viewing changes to src/backend/regex/regc_lex.c

  • Committer: alvherre
  • Date: 2005-12-16 21:24:52 UTC
  • Revision ID: svn-v4:db760fc0-0f08-0410-9d63-cc6633f64896:trunk:1
Initial import of the REL8_0_3 sources from the Pgsql CVS repository.

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
/*
 
2
 * lexical analyzer
 
3
 * This file is #included by regcomp.c.
 
4
 *
 
5
 * Copyright (c) 1998, 1999 Henry Spencer.      All rights reserved.
 
6
 *
 
7
 * Development of this software was funded, in part, by Cray Research Inc.,
 
8
 * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
 
9
 * Corporation, none of whom are responsible for the results.  The author
 
10
 * thanks all of them.
 
11
 *
 
12
 * Redistribution and use in source and binary forms -- with or without
 
13
 * modification -- are permitted for any purpose, provided that
 
14
 * redistributions in source form retain this entire copyright notice and
 
15
 * indicate the origin and nature of any modifications.
 
16
 *
 
17
 * I'd appreciate being given credit for this package in the documentation
 
18
 * of software which uses it, but that is not a requirement.
 
19
 *
 
20
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
 
21
 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
 
22
 * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
 
23
 * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 
24
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 
25
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 
26
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 
27
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 
28
 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
 
29
 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
30
 *
 
31
 * $PostgreSQL: pgsql/src/backend/regex/regc_lex.c,v 1.4 2003-11-29 19:51:55 pgsql Exp $
 
32
 *
 
33
 */
 
34
 
 
35
/*
 * Scanning helpers.
 *
 * Every macro below refers to a variable named "v", which must be in scope
 * wherever the macro is used.  RET, RETV and FAILW expand to a complete
 * "return" statement, so they also end the calling function.
 */
#define ATEOS()			(v->now >= v->stop)
#define HAVE(n)			((v->stop - v->now) >= (n))
#define NEXT1(c)		(!ATEOS() && v->now[0] == CHR(c))
#define NEXT2(a,b)		(HAVE(2) && v->now[0] == CHR(a) && v->now[1] == CHR(b))
#define NEXT3(a,b,c)	(HAVE(3) && \
						 v->now[0] == CHR(a) && \
						 v->now[1] == CHR(b) && \
						 v->now[2] == CHR(c))
#define SET(c)			(v->nexttype = (c))
#define SETV(c, n)		(v->nexttype = (c), v->nextvalue = (n))
#define RET(c)			return (SET(c), 1)
#define RETV(c, n)		return (SETV(c, n), 1)
#define FAILW(e)		return (ERR(e), 0)		/* ERR does SET(EOS) */
#define LASTTYPE(t)		(v->lasttype == (t))
 
49
 
 
50
/*
 * Lexical contexts.  The current one is kept in v->lexcon; INTOCON()
 * switches to a context and INCON() tests for one.
 */
#define L_ERE	1		/* mainline ERE/ARE */
#define L_BRE	2		/* mainline BRE */
#define L_Q		3		/* REG_QUOTE */
#define L_EBND	4		/* ERE/ARE bound */
#define L_BBND	5		/* BRE bound */
#define L_BRACK 6		/* brackets */
#define L_CEL	7		/* collating element */
#define L_ECL	8		/* equivalence class */
#define L_CCL	9		/* character class */
#define INTOCON(c)		(v->lexcon = (c))
#define INCON(con)		(v->lexcon == (con))

/* yield a pointer just past the last element of a chr array */
#define ENDOF(array)	((array) + (sizeof(array) / sizeof(chr)))
 
65
 
 
66
/*
 
67
 * lexstart - set up lexical stuff, scan leading options
 
68
 */
 
69
static void
 
70
lexstart(struct vars * v)
 
71
{
 
72
        prefixes(v);                            /* may turn on new type bits etc. */
 
73
        NOERR();
 
74
 
 
75
        if (v->cflags & REG_QUOTE)
 
76
        {
 
77
                assert(!(v->cflags & (REG_ADVANCED | REG_EXPANDED | REG_NEWLINE)));
 
78
                INTOCON(L_Q);
 
79
        }
 
80
        else if (v->cflags & REG_EXTENDED)
 
81
        {
 
82
                assert(!(v->cflags & REG_QUOTE));
 
83
                INTOCON(L_ERE);
 
84
        }
 
85
        else
 
86
        {
 
87
                assert(!(v->cflags & (REG_QUOTE | REG_ADVF)));
 
88
                INTOCON(L_BRE);
 
89
        }
 
90
 
 
91
        v->nexttype = EMPTY;            /* remember we were at the start */
 
92
        next(v);                                        /* set up the first token */
 
93
}
 
94
 
 
95
/*
 
96
 * prefixes - implement various special prefixes
 
97
 */
 
98
static void
 
99
prefixes(struct vars * v)
 
100
{
 
101
        /* literal string doesn't get any of this stuff */
 
102
        if (v->cflags & REG_QUOTE)
 
103
                return;
 
104
 
 
105
        /* initial "***" gets special things */
 
106
        if (HAVE(4) && NEXT3('*', '*', '*'))
 
107
                switch (*(v->now + 3))
 
108
                {
 
109
                        case CHR('?'):          /* "***?" error, msg shows version */
 
110
                                ERR(REG_BADPAT);
 
111
                                return;                 /* proceed no further */
 
112
                                break;
 
113
                        case CHR('='):          /* "***=" shifts to literal string */
 
114
                                NOTE(REG_UNONPOSIX);
 
115
                                v->cflags |= REG_QUOTE;
 
116
                                v->cflags &= ~(REG_ADVANCED | REG_EXPANDED | REG_NEWLINE);
 
117
                                v->now += 4;
 
118
                                return;                 /* and there can be no more prefixes */
 
119
                                break;
 
120
                        case CHR(':'):          /* "***:" shifts to AREs */
 
121
                                NOTE(REG_UNONPOSIX);
 
122
                                v->cflags |= REG_ADVANCED;
 
123
                                v->now += 4;
 
124
                                break;
 
125
                        default:                        /* otherwise *** is just an error */
 
126
                                ERR(REG_BADRPT);
 
127
                                return;
 
128
                                break;
 
129
                }
 
130
 
 
131
        /* BREs and EREs don't get embedded options */
 
132
        if ((v->cflags & REG_ADVANCED) != REG_ADVANCED)
 
133
                return;
 
134
 
 
135
        /* embedded options (AREs only) */
 
136
        if (HAVE(3) && NEXT2('(', '?') && iscalpha(*(v->now + 2)))
 
137
        {
 
138
                NOTE(REG_UNONPOSIX);
 
139
                v->now += 2;
 
140
                for (; !ATEOS() && iscalpha(*v->now); v->now++)
 
141
                        switch (*v->now)
 
142
                        {
 
143
                                case CHR('b'):  /* BREs (but why???) */
 
144
                                        v->cflags &= ~(REG_ADVANCED | REG_QUOTE);
 
145
                                        break;
 
146
                                case CHR('c'):  /* case sensitive */
 
147
                                        v->cflags &= ~REG_ICASE;
 
148
                                        break;
 
149
                                case CHR('e'):  /* plain EREs */
 
150
                                        v->cflags |= REG_EXTENDED;
 
151
                                        v->cflags &= ~(REG_ADVF | REG_QUOTE);
 
152
                                        break;
 
153
                                case CHR('i'):  /* case insensitive */
 
154
                                        v->cflags |= REG_ICASE;
 
155
                                        break;
 
156
                                case CHR('m'):  /* Perloid synonym for n */
 
157
                                case CHR('n'):  /* \n affects ^ $ . [^ */
 
158
                                        v->cflags |= REG_NEWLINE;
 
159
                                        break;
 
160
                                case CHR('p'):  /* ~Perl, \n affects . [^ */
 
161
                                        v->cflags |= REG_NLSTOP;
 
162
                                        v->cflags &= ~REG_NLANCH;
 
163
                                        break;
 
164
                                case CHR('q'):  /* literal string */
 
165
                                        v->cflags |= REG_QUOTE;
 
166
                                        v->cflags &= ~REG_ADVANCED;
 
167
                                        break;
 
168
                                case CHR('s'):  /* single line, \n ordinary */
 
169
                                        v->cflags &= ~REG_NEWLINE;
 
170
                                        break;
 
171
                                case CHR('t'):  /* tight syntax */
 
172
                                        v->cflags &= ~REG_EXPANDED;
 
173
                                        break;
 
174
                                case CHR('w'):  /* weird, \n affects ^ $ only */
 
175
                                        v->cflags &= ~REG_NLSTOP;
 
176
                                        v->cflags |= REG_NLANCH;
 
177
                                        break;
 
178
                                case CHR('x'):  /* expanded syntax */
 
179
                                        v->cflags |= REG_EXPANDED;
 
180
                                        break;
 
181
                                default:
 
182
                                        ERR(REG_BADOPT);
 
183
                                        return;
 
184
                        }
 
185
                if (!NEXT1(')'))
 
186
                {
 
187
                        ERR(REG_BADOPT);
 
188
                        return;
 
189
                }
 
190
                v->now++;
 
191
                if (v->cflags & REG_QUOTE)
 
192
                        v->cflags &= ~(REG_EXPANDED | REG_NEWLINE);
 
193
        }
 
194
}
 
195
 
 
196
/*
 
197
 * lexnest - "call a subroutine", interpolating string at the lexical level
 
198
 *
 
199
 * Note, this is not a very general facility.  There are a number of
 
200
 * implicit assumptions about what sorts of strings can be subroutines.
 
201
 */
 
202
static void
 
203
lexnest(struct vars * v,
 
204
                chr *beginp,                    /* start of interpolation */
 
205
                chr *endp)                              /* one past end of interpolation */
 
206
{
 
207
        assert(v->savenow == NULL); /* only one level of nesting */
 
208
        v->savenow = v->now;
 
209
        v->savestop = v->stop;
 
210
        v->now = beginp;
 
211
        v->stop = endp;
 
212
}
 
213
 
 
214
/*
 
215
 * string constants to interpolate as expansions of things like \d
 
216
 */
 
217
static chr      backd[] = {                     /* \d */
 
218
        CHR('['), CHR('['), CHR(':'),
 
219
        CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
 
220
        CHR(':'), CHR(']'), CHR(']')
 
221
};
 
222
static chr      backD[] = {                     /* \D */
 
223
        CHR('['), CHR('^'), CHR('['), CHR(':'),
 
224
        CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
 
225
        CHR(':'), CHR(']'), CHR(']')
 
226
};
 
227
static chr      brbackd[] = {           /* \d within brackets */
 
228
        CHR('['), CHR(':'),
 
229
        CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
 
230
        CHR(':'), CHR(']')
 
231
};
 
232
static chr      backs[] = {                     /* \s */
 
233
        CHR('['), CHR('['), CHR(':'),
 
234
        CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
 
235
        CHR(':'), CHR(']'), CHR(']')
 
236
};
 
237
static chr      backS[] = {                     /* \S */
 
238
        CHR('['), CHR('^'), CHR('['), CHR(':'),
 
239
        CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
 
240
        CHR(':'), CHR(']'), CHR(']')
 
241
};
 
242
static chr      brbacks[] = {           /* \s within brackets */
 
243
        CHR('['), CHR(':'),
 
244
        CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
 
245
        CHR(':'), CHR(']')
 
246
};
 
247
static chr      backw[] = {                     /* \w */
 
248
        CHR('['), CHR('['), CHR(':'),
 
249
        CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
 
250
        CHR(':'), CHR(']'), CHR('_'), CHR(']')
 
251
};
 
252
static chr      backW[] = {                     /* \W */
 
253
        CHR('['), CHR('^'), CHR('['), CHR(':'),
 
254
        CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
 
255
        CHR(':'), CHR(']'), CHR('_'), CHR(']')
 
256
};
 
257
static chr      brbackw[] = {           /* \w within brackets */
 
258
        CHR('['), CHR(':'),
 
259
        CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
 
260
        CHR(':'), CHR(']'), CHR('_')
 
261
};
 
262
 
 
263
/*
 
264
 * lexword - interpolate a bracket expression for word characters
 
265
 * Possibly ought to inquire whether there is a "word" character class.
 
266
 */
 
267
static void
 
268
lexword(struct vars * v)
 
269
{
 
270
        lexnest(v, backw, ENDOF(backw));
 
271
}
 
272
 
 
273
/*
 
274
 * next - get next token
 
275
 */
 
276
static int                                              /* 1 normal, 0 failure */
 
277
next(struct vars * v)
 
278
{
 
279
        chr                     c;
 
280
 
 
281
        /* errors yield an infinite sequence of failures */
 
282
        if (ISERR())
 
283
                return 0;                               /* the error has set nexttype to EOS */
 
284
 
 
285
        /* remember flavor of last token */
 
286
        v->lasttype = v->nexttype;
 
287
 
 
288
        /* REG_BOSONLY */
 
289
        if (v->nexttype == EMPTY && (v->cflags & REG_BOSONLY))
 
290
        {
 
291
                /* at start of a REG_BOSONLY RE */
 
292
                RETV(SBEGIN, 0);                /* same as \A */
 
293
        }
 
294
 
 
295
        /* if we're nested and we've hit end, return to outer level */
 
296
        if (v->savenow != NULL && ATEOS())
 
297
        {
 
298
                v->now = v->savenow;
 
299
                v->stop = v->savestop;
 
300
                v->savenow = v->savestop = NULL;
 
301
        }
 
302
 
 
303
        /* skip white space etc. if appropriate (not in literal or []) */
 
304
        if (v->cflags & REG_EXPANDED)
 
305
                switch (v->lexcon)
 
306
                {
 
307
                        case L_ERE:
 
308
                        case L_BRE:
 
309
                        case L_EBND:
 
310
                        case L_BBND:
 
311
                                skip(v);
 
312
                                break;
 
313
                }
 
314
 
 
315
        /* handle EOS, depending on context */
 
316
        if (ATEOS())
 
317
        {
 
318
                switch (v->lexcon)
 
319
                {
 
320
                        case L_ERE:
 
321
                        case L_BRE:
 
322
                        case L_Q:
 
323
                                RET(EOS);
 
324
                                break;
 
325
                        case L_EBND:
 
326
                        case L_BBND:
 
327
                                FAILW(REG_EBRACE);
 
328
                                break;
 
329
                        case L_BRACK:
 
330
                        case L_CEL:
 
331
                        case L_ECL:
 
332
                        case L_CCL:
 
333
                                FAILW(REG_EBRACK);
 
334
                                break;
 
335
                }
 
336
                assert(NOTREACHED);
 
337
        }
 
338
 
 
339
        /* okay, time to actually get a character */
 
340
        c = *v->now++;
 
341
 
 
342
        /* deal with the easy contexts, punt EREs to code below */
 
343
        switch (v->lexcon)
 
344
        {
 
345
                case L_BRE:                             /* punt BREs to separate function */
 
346
                        return brenext(v, c);
 
347
                        break;
 
348
                case L_ERE:                             /* see below */
 
349
                        break;
 
350
                case L_Q:                               /* literal strings are easy */
 
351
                        RETV(PLAIN, c);
 
352
                        break;
 
353
                case L_BBND:                    /* bounds are fairly simple */
 
354
                case L_EBND:
 
355
                        switch (c)
 
356
                        {
 
357
                                case CHR('0'):
 
358
                                case CHR('1'):
 
359
                                case CHR('2'):
 
360
                                case CHR('3'):
 
361
                                case CHR('4'):
 
362
                                case CHR('5'):
 
363
                                case CHR('6'):
 
364
                                case CHR('7'):
 
365
                                case CHR('8'):
 
366
                                case CHR('9'):
 
367
                                        RETV(DIGIT, (chr) DIGITVAL(c));
 
368
                                        break;
 
369
                                case CHR(','):
 
370
                                        RET(',');
 
371
                                        break;
 
372
                                case CHR('}'):  /* ERE bound ends with } */
 
373
                                        if (INCON(L_EBND))
 
374
                                        {
 
375
                                                INTOCON(L_ERE);
 
376
                                                if ((v->cflags & REG_ADVF) && NEXT1('?'))
 
377
                                                {
 
378
                                                        v->now++;
 
379
                                                        NOTE(REG_UNONPOSIX);
 
380
                                                        RETV('}', 0);
 
381
                                                }
 
382
                                                RETV('}', 1);
 
383
                                        }
 
384
                                        else
 
385
                                                FAILW(REG_BADBR);
 
386
                                        break;
 
387
                                case CHR('\\'): /* BRE bound ends with \} */
 
388
                                        if (INCON(L_BBND) && NEXT1('}'))
 
389
                                        {
 
390
                                                v->now++;
 
391
                                                INTOCON(L_BRE);
 
392
                                                RET('}');
 
393
                                        }
 
394
                                        else
 
395
                                                FAILW(REG_BADBR);
 
396
                                        break;
 
397
                                default:
 
398
                                        FAILW(REG_BADBR);
 
399
                                        break;
 
400
                        }
 
401
                        assert(NOTREACHED);
 
402
                        break;
 
403
                case L_BRACK:                   /* brackets are not too hard */
 
404
                        switch (c)
 
405
                        {
 
406
                                case CHR(']'):
 
407
                                        if (LASTTYPE('['))
 
408
                                                RETV(PLAIN, c);
 
409
                                        else
 
410
                                        {
 
411
                                                INTOCON((v->cflags & REG_EXTENDED) ?
 
412
                                                                L_ERE : L_BRE);
 
413
                                                RET(']');
 
414
                                        }
 
415
                                        break;
 
416
                                case CHR('\\'):
 
417
                                        NOTE(REG_UBBS);
 
418
                                        if (!(v->cflags & REG_ADVF))
 
419
                                                RETV(PLAIN, c);
 
420
                                        NOTE(REG_UNONPOSIX);
 
421
                                        if (ATEOS())
 
422
                                                FAILW(REG_EESCAPE);
 
423
                                        (DISCARD) lexescape(v);
 
424
                                        switch (v->nexttype)
 
425
                                        {                       /* not all escapes okay here */
 
426
                                                case PLAIN:
 
427
                                                        return 1;
 
428
                                                        break;
 
429
                                                case CCLASS:
 
430
                                                        switch (v->nextvalue)
 
431
                                                        {
 
432
                                                                case 'd':
 
433
                                                                        lexnest(v, brbackd, ENDOF(brbackd));
 
434
                                                                        break;
 
435
                                                                case 's':
 
436
                                                                        lexnest(v, brbacks, ENDOF(brbacks));
 
437
                                                                        break;
 
438
                                                                case 'w':
 
439
                                                                        lexnest(v, brbackw, ENDOF(brbackw));
 
440
                                                                        break;
 
441
                                                                default:
 
442
                                                                        FAILW(REG_EESCAPE);
 
443
                                                                        break;
 
444
                                                        }
 
445
                                                        /* lexnest done, back up and try again */
 
446
                                                        v->nexttype = v->lasttype;
 
447
                                                        return next(v);
 
448
                                                        break;
 
449
                                        }
 
450
                                        /* not one of the acceptable escapes */
 
451
                                        FAILW(REG_EESCAPE);
 
452
                                        break;
 
453
                                case CHR('-'):
 
454
                                        if (LASTTYPE('[') || NEXT1(']'))
 
455
                                                RETV(PLAIN, c);
 
456
                                        else
 
457
                                                RETV(RANGE, c);
 
458
                                        break;
 
459
                                case CHR('['):
 
460
                                        if (ATEOS())
 
461
                                                FAILW(REG_EBRACK);
 
462
                                        switch (*v->now++)
 
463
                                        {
 
464
                                                case CHR('.'):
 
465
                                                        INTOCON(L_CEL);
 
466
                                                        /* might or might not be locale-specific */
 
467
                                                        RET(COLLEL);
 
468
                                                        break;
 
469
                                                case CHR('='):
 
470
                                                        INTOCON(L_ECL);
 
471
                                                        NOTE(REG_ULOCALE);
 
472
                                                        RET(ECLASS);
 
473
                                                        break;
 
474
                                                case CHR(':'):
 
475
                                                        INTOCON(L_CCL);
 
476
                                                        NOTE(REG_ULOCALE);
 
477
                                                        RET(CCLASS);
 
478
                                                        break;
 
479
                                                default:                /* oops */
 
480
                                                        v->now--;
 
481
                                                        RETV(PLAIN, c);
 
482
                                                        break;
 
483
                                        }
 
484
                                        assert(NOTREACHED);
 
485
                                        break;
 
486
                                default:
 
487
                                        RETV(PLAIN, c);
 
488
                                        break;
 
489
                        }
 
490
                        assert(NOTREACHED);
 
491
                        break;
 
492
                case L_CEL:                             /* collating elements are easy */
 
493
                        if (c == CHR('.') && NEXT1(']'))
 
494
                        {
 
495
                                v->now++;
 
496
                                INTOCON(L_BRACK);
 
497
                                RETV(END, '.');
 
498
                        }
 
499
                        else
 
500
                                RETV(PLAIN, c);
 
501
                        break;
 
502
                case L_ECL:                             /* ditto equivalence classes */
 
503
                        if (c == CHR('=') && NEXT1(']'))
 
504
                        {
 
505
                                v->now++;
 
506
                                INTOCON(L_BRACK);
 
507
                                RETV(END, '=');
 
508
                        }
 
509
                        else
 
510
                                RETV(PLAIN, c);
 
511
                        break;
 
512
                case L_CCL:                             /* ditto character classes */
 
513
                        if (c == CHR(':') && NEXT1(']'))
 
514
                        {
 
515
                                v->now++;
 
516
                                INTOCON(L_BRACK);
 
517
                                RETV(END, ':');
 
518
                        }
 
519
                        else
 
520
                                RETV(PLAIN, c);
 
521
                        break;
 
522
                default:
 
523
                        assert(NOTREACHED);
 
524
                        break;
 
525
        }
 
526
 
 
527
        /* that got rid of everything except EREs and AREs */
 
528
        assert(INCON(L_ERE));
 
529
 
 
530
        /* deal with EREs and AREs, except for backslashes */
 
531
        switch (c)
 
532
        {
 
533
                case CHR('|'):
 
534
                        RET('|');
 
535
                        break;
 
536
                case CHR('*'):
 
537
                        if ((v->cflags & REG_ADVF) && NEXT1('?'))
 
538
                        {
 
539
                                v->now++;
 
540
                                NOTE(REG_UNONPOSIX);
 
541
                                RETV('*', 0);
 
542
                        }
 
543
                        RETV('*', 1);
 
544
                        break;
 
545
                case CHR('+'):
 
546
                        if ((v->cflags & REG_ADVF) && NEXT1('?'))
 
547
                        {
 
548
                                v->now++;
 
549
                                NOTE(REG_UNONPOSIX);
 
550
                                RETV('+', 0);
 
551
                        }
 
552
                        RETV('+', 1);
 
553
                        break;
 
554
                case CHR('?'):
 
555
                        if ((v->cflags & REG_ADVF) && NEXT1('?'))
 
556
                        {
 
557
                                v->now++;
 
558
                                NOTE(REG_UNONPOSIX);
 
559
                                RETV('?', 0);
 
560
                        }
 
561
                        RETV('?', 1);
 
562
                        break;
 
563
                case CHR('{'):                  /* bounds start or plain character */
 
564
                        if (v->cflags & REG_EXPANDED)
 
565
                                skip(v);
 
566
                        if (ATEOS() || !iscdigit(*v->now))
 
567
                        {
 
568
                                NOTE(REG_UBRACES);
 
569
                                NOTE(REG_UUNSPEC);
 
570
                                RETV(PLAIN, c);
 
571
                        }
 
572
                        else
 
573
                        {
 
574
                                NOTE(REG_UBOUNDS);
 
575
                                INTOCON(L_EBND);
 
576
                                RET('{');
 
577
                        }
 
578
                        assert(NOTREACHED);
 
579
                        break;
 
580
                case CHR('('):                  /* parenthesis, or advanced extension */
 
581
                        if ((v->cflags & REG_ADVF) && NEXT1('?'))
 
582
                        {
 
583
                                NOTE(REG_UNONPOSIX);
 
584
                                v->now++;
 
585
                                switch (*v->now++)
 
586
                                {
 
587
                                        case CHR(':'):          /* non-capturing paren */
 
588
                                                RETV('(', 0);
 
589
                                                break;
 
590
                                        case CHR('#'):          /* comment */
 
591
                                                while (!ATEOS() && *v->now != CHR(')'))
 
592
                                                        v->now++;
 
593
                                                if (!ATEOS())
 
594
                                                        v->now++;
 
595
                                                assert(v->nexttype == v->lasttype);
 
596
                                                return next(v);
 
597
                                                break;
 
598
                                        case CHR('='):          /* positive lookahead */
 
599
                                                NOTE(REG_ULOOKAHEAD);
 
600
                                                RETV(LACON, 1);
 
601
                                                break;
 
602
                                        case CHR('!'):          /* negative lookahead */
 
603
                                                NOTE(REG_ULOOKAHEAD);
 
604
                                                RETV(LACON, 0);
 
605
                                                break;
 
606
                                        default:
 
607
                                                FAILW(REG_BADRPT);
 
608
                                                break;
 
609
                                }
 
610
                                assert(NOTREACHED);
 
611
                        }
 
612
                        if (v->cflags & REG_NOSUB)
 
613
                                RETV('(', 0);   /* all parens non-capturing */
 
614
                        else
 
615
                                RETV('(', 1);
 
616
                        break;
 
617
                case CHR(')'):
 
618
                        if (LASTTYPE('('))
 
619
                                NOTE(REG_UUNSPEC);
 
620
                        RETV(')', c);
 
621
                        break;
 
622
                case CHR('['):                  /* easy except for [[:<:]] and [[:>:]] */
 
623
                        if (HAVE(6) && *(v->now + 0) == CHR('[') &&
 
624
                                *(v->now + 1) == CHR(':') &&
 
625
                                (*(v->now + 2) == CHR('<') ||
 
626
                                 *(v->now + 2) == CHR('>')) &&
 
627
                                *(v->now + 3) == CHR(':') &&
 
628
                                *(v->now + 4) == CHR(']') &&
 
629
                                *(v->now + 5) == CHR(']'))
 
630
                        {
 
631
                                c = *(v->now + 2);
 
632
                                v->now += 6;
 
633
                                NOTE(REG_UNONPOSIX);
 
634
                                RET((c == CHR('<')) ? '<' : '>');
 
635
                        }
 
636
                        INTOCON(L_BRACK);
 
637
                        if (NEXT1('^'))
 
638
                        {
 
639
                                v->now++;
 
640
                                RETV('[', 0);
 
641
                        }
 
642
                        RETV('[', 1);
 
643
                        break;
 
644
                case CHR('.'):
 
645
                        RET('.');
 
646
                        break;
 
647
                case CHR('^'):
 
648
                        RET('^');
 
649
                        break;
 
650
                case CHR('$'):
 
651
                        RET('$');
 
652
                        break;
 
653
                case CHR('\\'): /* mostly punt backslashes to code below */
 
654
                        if (ATEOS())
 
655
                                FAILW(REG_EESCAPE);
 
656
                        break;
 
657
                default:                                /* ordinary character */
 
658
                        RETV(PLAIN, c);
 
659
                        break;
 
660
        }
 
661
 
 
662
        /* ERE/ARE backslash handling; backslash already eaten */
 
663
        assert(!ATEOS());
 
664
        if (!(v->cflags & REG_ADVF))
 
665
        {                                                       /* only AREs have non-trivial escapes */
 
666
                if (iscalnum(*v->now))
 
667
                {
 
668
                        NOTE(REG_UBSALNUM);
 
669
                        NOTE(REG_UUNSPEC);
 
670
                }
 
671
                RETV(PLAIN, *v->now++);
 
672
        }
 
673
        (DISCARD) lexescape(v);
 
674
        if (ISERR())
 
675
                FAILW(REG_EESCAPE);
 
676
        if (v->nexttype == CCLASS)
 
677
        {                                                       /* fudge at lexical level */
 
678
                switch (v->nextvalue)
 
679
                {
 
680
                        case 'd':
 
681
                                lexnest(v, backd, ENDOF(backd));
 
682
                                break;
 
683
                        case 'D':
 
684
                                lexnest(v, backD, ENDOF(backD));
 
685
                                break;
 
686
                        case 's':
 
687
                                lexnest(v, backs, ENDOF(backs));
 
688
                                break;
 
689
                        case 'S':
 
690
                                lexnest(v, backS, ENDOF(backS));
 
691
                                break;
 
692
                        case 'w':
 
693
                                lexnest(v, backw, ENDOF(backw));
 
694
                                break;
 
695
                        case 'W':
 
696
                                lexnest(v, backW, ENDOF(backW));
 
697
                                break;
 
698
                        default:
 
699
                                assert(NOTREACHED);
 
700
                                FAILW(REG_ASSERT);
 
701
                                break;
 
702
                }
 
703
                /* lexnest done, back up and try again */
 
704
                v->nexttype = v->lasttype;
 
705
                return next(v);
 
706
        }
 
707
        /* otherwise, lexescape has already done the work */
 
708
        return !ISERR();
 
709
}
 
710
 
 
711
/*
 
712
 * lexescape - parse an ARE backslash escape (backslash already eaten)
 
713
 * Note slightly nonstandard use of the CCLASS type code.
 
714
 */
 
715
static int                                              /* not actually used, but convenient for
 
716
                                                                 * RETV */
 
717
lexescape(struct vars * v)
 
718
{
 
719
        chr                     c;
 
720
        static chr      alert[] = {
 
721
                CHR('a'), CHR('l'), CHR('e'), CHR('r'), CHR('t')
 
722
        };
 
723
        static chr      esc[] = {
 
724
                CHR('E'), CHR('S'), CHR('C')
 
725
        };
 
726
        chr                *save;
 
727
 
 
728
        assert(v->cflags & REG_ADVF);
 
729
 
 
730
        assert(!ATEOS());
 
731
        c = *v->now++;
 
732
        if (!iscalnum(c))
 
733
                RETV(PLAIN, c);
 
734
 
 
735
        NOTE(REG_UNONPOSIX);
 
736
        switch (c)
 
737
        {
 
738
                case CHR('a'):
 
739
                        RETV(PLAIN, chrnamed(v, alert, ENDOF(alert), CHR('\007')));
 
740
                        break;
 
741
                case CHR('A'):
 
742
                        RETV(SBEGIN, 0);
 
743
                        break;
 
744
                case CHR('b'):
 
745
                        RETV(PLAIN, CHR('\b'));
 
746
                        break;
 
747
                case CHR('B'):
 
748
                        RETV(PLAIN, CHR('\\'));
 
749
                        break;
 
750
                case CHR('c'):
 
751
                        NOTE(REG_UUNPORT);
 
752
                        if (ATEOS())
 
753
                                FAILW(REG_EESCAPE);
 
754
                        RETV(PLAIN, (chr) (*v->now++ & 037));
 
755
                        break;
 
756
                case CHR('d'):
 
757
                        NOTE(REG_ULOCALE);
 
758
                        RETV(CCLASS, 'd');
 
759
                        break;
 
760
                case CHR('D'):
 
761
                        NOTE(REG_ULOCALE);
 
762
                        RETV(CCLASS, 'D');
 
763
                        break;
 
764
                case CHR('e'):
 
765
                        NOTE(REG_UUNPORT);
 
766
                        RETV(PLAIN, chrnamed(v, esc, ENDOF(esc), CHR('\033')));
 
767
                        break;
 
768
                case CHR('f'):
 
769
                        RETV(PLAIN, CHR('\f'));
 
770
                        break;
 
771
                case CHR('m'):
 
772
                        RET('<');
 
773
                        break;
 
774
                case CHR('M'):
 
775
                        RET('>');
 
776
                        break;
 
777
                case CHR('n'):
 
778
                        RETV(PLAIN, CHR('\n'));
 
779
                        break;
 
780
                case CHR('r'):
 
781
                        RETV(PLAIN, CHR('\r'));
 
782
                        break;
 
783
                case CHR('s'):
 
784
                        NOTE(REG_ULOCALE);
 
785
                        RETV(CCLASS, 's');
 
786
                        break;
 
787
                case CHR('S'):
 
788
                        NOTE(REG_ULOCALE);
 
789
                        RETV(CCLASS, 'S');
 
790
                        break;
 
791
                case CHR('t'):
 
792
                        RETV(PLAIN, CHR('\t'));
 
793
                        break;
 
794
                case CHR('u'):
 
795
                        c = lexdigits(v, 16, 4, 4);
 
796
                        if (ISERR())
 
797
                                FAILW(REG_EESCAPE);
 
798
                        RETV(PLAIN, c);
 
799
                        break;
 
800
                case CHR('U'):
 
801
                        c = lexdigits(v, 16, 8, 8);
 
802
                        if (ISERR())
 
803
                                FAILW(REG_EESCAPE);
 
804
                        RETV(PLAIN, c);
 
805
                        break;
 
806
                case CHR('v'):
 
807
                        RETV(PLAIN, CHR('\v'));
 
808
                        break;
 
809
                case CHR('w'):
 
810
                        NOTE(REG_ULOCALE);
 
811
                        RETV(CCLASS, 'w');
 
812
                        break;
 
813
                case CHR('W'):
 
814
                        NOTE(REG_ULOCALE);
 
815
                        RETV(CCLASS, 'W');
 
816
                        break;
 
817
                case CHR('x'):
 
818
                        NOTE(REG_UUNPORT);
 
819
                        c = lexdigits(v, 16, 1, 255);           /* REs >255 long outside
 
820
                                                                                                 * spec */
 
821
                        if (ISERR())
 
822
                                FAILW(REG_EESCAPE);
 
823
                        RETV(PLAIN, c);
 
824
                        break;
 
825
                case CHR('y'):
 
826
                        NOTE(REG_ULOCALE);
 
827
                        RETV(WBDRY, 0);
 
828
                        break;
 
829
                case CHR('Y'):
 
830
                        NOTE(REG_ULOCALE);
 
831
                        RETV(NWBDRY, 0);
 
832
                        break;
 
833
                case CHR('Z'):
 
834
                        RETV(SEND, 0);
 
835
                        break;
 
836
                case CHR('1'):
 
837
                case CHR('2'):
 
838
                case CHR('3'):
 
839
                case CHR('4'):
 
840
                case CHR('5'):
 
841
                case CHR('6'):
 
842
                case CHR('7'):
 
843
                case CHR('8'):
 
844
                case CHR('9'):
 
845
                        save = v->now;
 
846
                        v->now--;                       /* put first digit back */
 
847
                        c = lexdigits(v, 10, 1, 255);           /* REs >255 long outside
 
848
                                                                                                 * spec */
 
849
                        if (ISERR())
 
850
                                FAILW(REG_EESCAPE);
 
851
                        /* ugly heuristic (first test is "exactly 1 digit?") */
 
852
                        if (v->now - save == 0 || (int) c <= v->nsubexp)
 
853
                        {
 
854
                                NOTE(REG_UBACKREF);
 
855
                                RETV(BACKREF, (chr) c);
 
856
                        }
 
857
                        /* oops, doesn't look like it's a backref after all... */
 
858
                        v->now = save;
 
859
                        /* and fall through into octal number */
 
860
                case CHR('0'):
 
861
                        NOTE(REG_UUNPORT);
 
862
                        v->now--;                       /* put first digit back */
 
863
                        c = lexdigits(v, 8, 1, 3);
 
864
                        if (ISERR())
 
865
                                FAILW(REG_EESCAPE);
 
866
                        RETV(PLAIN, c);
 
867
                        break;
 
868
                default:
 
869
                        assert(iscalpha(c));
 
870
                        FAILW(REG_EESCAPE); /* unknown alphabetic escape */
 
871
                        break;
 
872
        }
 
873
        assert(NOTREACHED);
 
874
}
 
875
 
 
876
/*
 
877
 * lexdigits - slurp up digits and return chr value
 
878
 */
 
879
static chr                                              /* chr value; errors signalled via ERR */
 
880
lexdigits(struct vars * v,
 
881
                  int base,
 
882
                  int minlen,
 
883
                  int maxlen)
 
884
{
 
885
        uchr            n;                              /* unsigned to avoid overflow misbehavior */
 
886
        int                     len;
 
887
        chr                     c;
 
888
        int                     d;
 
889
        const uchr      ub = (uchr) base;
 
890
 
 
891
        n = 0;
 
892
        for (len = 0; len < maxlen && !ATEOS(); len++)
 
893
        {
 
894
                c = *v->now++;
 
895
                switch (c)
 
896
                {
 
897
                        case CHR('0'):
 
898
                        case CHR('1'):
 
899
                        case CHR('2'):
 
900
                        case CHR('3'):
 
901
                        case CHR('4'):
 
902
                        case CHR('5'):
 
903
                        case CHR('6'):
 
904
                        case CHR('7'):
 
905
                        case CHR('8'):
 
906
                        case CHR('9'):
 
907
                                d = DIGITVAL(c);
 
908
                                break;
 
909
                        case CHR('a'):
 
910
                        case CHR('A'):
 
911
                                d = 10;
 
912
                                break;
 
913
                        case CHR('b'):
 
914
                        case CHR('B'):
 
915
                                d = 11;
 
916
                                break;
 
917
                        case CHR('c'):
 
918
                        case CHR('C'):
 
919
                                d = 12;
 
920
                                break;
 
921
                        case CHR('d'):
 
922
                        case CHR('D'):
 
923
                                d = 13;
 
924
                                break;
 
925
                        case CHR('e'):
 
926
                        case CHR('E'):
 
927
                                d = 14;
 
928
                                break;
 
929
                        case CHR('f'):
 
930
                        case CHR('F'):
 
931
                                d = 15;
 
932
                                break;
 
933
                        default:
 
934
                                v->now--;               /* oops, not a digit at all */
 
935
                                d = -1;
 
936
                                break;
 
937
                }
 
938
 
 
939
                if (d >= base)
 
940
                {                                               /* not a plausible digit */
 
941
                        v->now--;
 
942
                        d = -1;
 
943
                }
 
944
                if (d < 0)
 
945
                        break;                          /* NOTE BREAK OUT */
 
946
                n = n * ub + (uchr) d;
 
947
        }
 
948
        if (len < minlen)
 
949
                ERR(REG_EESCAPE);
 
950
 
 
951
        return (chr) n;
 
952
}
 
953
 
 
954
/*
 
955
 * brenext - get next BRE token
 
956
 *
 
957
 * This is much like EREs except for all the stupid backslashes and the
 
958
 * context-dependency of some things.
 
959
 */
 
960
static int                                              /* 1 normal, 0 failure */
 
961
brenext(struct vars * v,
 
962
                chr pc)
 
963
{
 
964
        chr                     c = (chr) pc;
 
965
 
 
966
        switch (c)
 
967
        {
 
968
                case CHR('*'):
 
969
                        if (LASTTYPE(EMPTY) || LASTTYPE('(') || LASTTYPE('^'))
 
970
                                RETV(PLAIN, c);
 
971
                        RET('*');
 
972
                        break;
 
973
                case CHR('['):
 
974
                        if (HAVE(6) && *(v->now + 0) == CHR('[') &&
 
975
                                *(v->now + 1) == CHR(':') &&
 
976
                                (*(v->now + 2) == CHR('<') ||
 
977
                                 *(v->now + 2) == CHR('>')) &&
 
978
                                *(v->now + 3) == CHR(':') &&
 
979
                                *(v->now + 4) == CHR(']') &&
 
980
                                *(v->now + 5) == CHR(']'))
 
981
                        {
 
982
                                c = *(v->now + 2);
 
983
                                v->now += 6;
 
984
                                NOTE(REG_UNONPOSIX);
 
985
                                RET((c == CHR('<')) ? '<' : '>');
 
986
                        }
 
987
                        INTOCON(L_BRACK);
 
988
                        if (NEXT1('^'))
 
989
                        {
 
990
                                v->now++;
 
991
                                RETV('[', 0);
 
992
                        }
 
993
                        RETV('[', 1);
 
994
                        break;
 
995
                case CHR('.'):
 
996
                        RET('.');
 
997
                        break;
 
998
                case CHR('^'):
 
999
                        if (LASTTYPE(EMPTY))
 
1000
                                RET('^');
 
1001
                        if (LASTTYPE('('))
 
1002
                        {
 
1003
                                NOTE(REG_UUNSPEC);
 
1004
                                RET('^');
 
1005
                        }
 
1006
                        RETV(PLAIN, c);
 
1007
                        break;
 
1008
                case CHR('$'):
 
1009
                        if (v->cflags & REG_EXPANDED)
 
1010
                                skip(v);
 
1011
                        if (ATEOS())
 
1012
                                RET('$');
 
1013
                        if (NEXT2('\\', ')'))
 
1014
                        {
 
1015
                                NOTE(REG_UUNSPEC);
 
1016
                                RET('$');
 
1017
                        }
 
1018
                        RETV(PLAIN, c);
 
1019
                        break;
 
1020
                case CHR('\\'):
 
1021
                        break;                          /* see below */
 
1022
                default:
 
1023
                        RETV(PLAIN, c);
 
1024
                        break;
 
1025
        }
 
1026
 
 
1027
        assert(c == CHR('\\'));
 
1028
 
 
1029
        if (ATEOS())
 
1030
                FAILW(REG_EESCAPE);
 
1031
 
 
1032
        c = *v->now++;
 
1033
        switch (c)
 
1034
        {
 
1035
                case CHR('{'):
 
1036
                        INTOCON(L_BBND);
 
1037
                        NOTE(REG_UBOUNDS);
 
1038
                        RET('{');
 
1039
                        break;
 
1040
                case CHR('('):
 
1041
                        RETV('(', 1);
 
1042
                        break;
 
1043
                case CHR(')'):
 
1044
                        RETV(')', c);
 
1045
                        break;
 
1046
                case CHR('<'):
 
1047
                        NOTE(REG_UNONPOSIX);
 
1048
                        RET('<');
 
1049
                        break;
 
1050
                case CHR('>'):
 
1051
                        NOTE(REG_UNONPOSIX);
 
1052
                        RET('>');
 
1053
                        break;
 
1054
                case CHR('1'):
 
1055
                case CHR('2'):
 
1056
                case CHR('3'):
 
1057
                case CHR('4'):
 
1058
                case CHR('5'):
 
1059
                case CHR('6'):
 
1060
                case CHR('7'):
 
1061
                case CHR('8'):
 
1062
                case CHR('9'):
 
1063
                        NOTE(REG_UBACKREF);
 
1064
                        RETV(BACKREF, (chr) DIGITVAL(c));
 
1065
                        break;
 
1066
                default:
 
1067
                        if (iscalnum(c))
 
1068
                        {
 
1069
                                NOTE(REG_UBSALNUM);
 
1070
                                NOTE(REG_UUNSPEC);
 
1071
                        }
 
1072
                        RETV(PLAIN, c);
 
1073
                        break;
 
1074
        }
 
1075
 
 
1076
        assert(NOTREACHED);
 
1077
}
 
1078
 
 
1079
/*
 
1080
 * skip - skip white space and comments in expanded form
 
1081
 */
 
1082
static void
 
1083
skip(struct vars * v)
 
1084
{
 
1085
        chr                *start = v->now;
 
1086
 
 
1087
        assert(v->cflags & REG_EXPANDED);
 
1088
 
 
1089
        for (;;)
 
1090
        {
 
1091
                while (!ATEOS() && iscspace(*v->now))
 
1092
                        v->now++;
 
1093
                if (ATEOS() || *v->now != CHR('#'))
 
1094
                        break;                          /* NOTE BREAK OUT */
 
1095
                assert(NEXT1('#'));
 
1096
                while (!ATEOS() && *v->now != CHR('\n'))
 
1097
                        v->now++;
 
1098
                /* leave the newline to be picked up by the iscspace loop */
 
1099
        }
 
1100
 
 
1101
        if (v->now != start)
 
1102
                NOTE(REG_UNONPOSIX);
 
1103
}
 
1104
 
 
1105
/*
 
1106
 * newline - return the chr for a newline
 
1107
 *
 
1108
 * This helps confine use of CHR to this source file.
 
1109
 */
 
1110
static chr
 
1111
newline(void)
 
1112
{
 
1113
        return CHR('\n');
 
1114
}
 
1115
 
 
1116
/*
 
1117
 * chrnamed - return the chr known by a given (chr string) name
 
1118
 *
 
1119
 * The code is a bit clumsy, but this routine gets only such specialized
 
1120
 * use that it hardly matters.
 
1121
 */
 
1122
static chr
 
1123
chrnamed(struct vars * v,
 
1124
                 chr *startp,                   /* start of name */
 
1125
                 chr *endp,                             /* just past end of name */
 
1126
                 chr lastresort)                /* what to return if name lookup fails */
 
1127
{
 
1128
        celt            c;
 
1129
        int                     errsave;
 
1130
        int                     e;
 
1131
        struct cvec *cv;
 
1132
 
 
1133
        errsave = v->err;
 
1134
        v->err = 0;
 
1135
        c = element(v, startp, endp);
 
1136
        e = v->err;
 
1137
        v->err = errsave;
 
1138
 
 
1139
        if (e != 0)
 
1140
                return (chr) lastresort;
 
1141
 
 
1142
        cv = range(v, c, c, 0);
 
1143
        if (cv->nchrs == 0)
 
1144
                return (chr) lastresort;
 
1145
        return cv->chrs[0];
 
1146
}