/*
 * Provenance: ~vcs-imports/mammoth-replicator/trunk, revision 1 by alvherre.
 * Initial import of the REL8_0_3 sources from the Pgsql CVS repository.
 */
/*
 * lexical analyzer
 * This file is #included by regcomp.c.
 *
 * Copyright (c) 1998, 1999 Henry Spencer.	All rights reserved.
 *
 * Development of this software was funded, in part, by Cray Research Inc.,
 * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
 * Corporation, none of whom are responsible for the results.  The author
 * thanks all of them.
 *
 * Redistribution and use in source and binary forms -- with or without
 * modification -- are permitted for any purpose, provided that
 * redistributions in source form retain this entire copyright notice and
 * indicate the origin and nature of any modifications.
 *
 * I'd appreciate being given credit for this package in the documentation
 * of software which uses it, but that is not a requirement.
 *
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
 * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
 * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * $PostgreSQL: pgsql/src/backend/regex/regc_lex.c,v 1.4 2003-11-29 19:51:55 pgsql Exp $
 *
 */

/*
 * scanning macros (know about v)
 *
 * Every macro here expands to code that refers to a variable named "v" in
 * the enclosing scope and scans the range [v->now, v->stop).
 *
 * ATEOS()		true when no input is left
 * HAVE(n)		true when at least n chrs are left
 * NEXT1/2/3	true when the next 1/2/3 chrs are exactly the given ones
 *				(length is checked first, so no chr past v->stop is read)
 * SET/SETV		record the type (and value) of the token just scanned
 * RET/RETV		record the token and return 1 from the enclosing function
 * FAILW(e)		report error e and return 0 from the enclosing function
 * LASTTYPE(t)	true when the previously returned token had type t
 */
#define ATEOS()		(v->now >= v->stop)
#define HAVE(n)		(v->stop - v->now >= (n))
#define NEXT1(c)	(!ATEOS() && *v->now == CHR(c))
#define NEXT2(a,b)	(HAVE(2) && *v->now == CHR(a) && *(v->now+1) == CHR(b))
#define NEXT3(a,b,c)	(HAVE(3) && *v->now == CHR(a) && \
						*(v->now+1) == CHR(b) && \
						*(v->now+2) == CHR(c))
#define SET(c)		(v->nexttype = (c))
#define SETV(c, n)	(v->nexttype = (c), v->nextvalue = (n))
#define RET(c)		return (SET(c), 1)
#define RETV(c, n)	return (SETV(c, n), 1)
#define FAILW(e)	return (ERR(e), 0)	/* ERR does SET(EOS) */
#define LASTTYPE(t) (v->lasttype == (t))

/*
 * lexical contexts
 *
 * v->lexcon holds one of these values; it selects which set of scanning
 * rules next() applies to the upcoming input.  INTOCON() switches context,
 * INCON() tests the current one.
 */
#define L_ERE	1				/* mainline ERE/ARE */
#define L_BRE	2				/* mainline BRE */
#define L_Q 3					/* REG_QUOTE */
#define L_EBND	4				/* ERE/ARE bound */
#define L_BBND	5				/* BRE bound */
#define L_BRACK 6				/* brackets */
#define L_CEL	7				/* collating element */
#define L_ECL	8				/* equivalence class */
#define L_CCL	9				/* character class */
#define INTOCON(c)	(v->lexcon = (c))
#define INCON(con)	(v->lexcon == (con))

/*
 * construct pointer past end of chr array
 *
 * Relies on sizeof(array), so the argument must be a real array of chr
 * (not a pointer to one).
 */
#define ENDOF(array)	((array) + sizeof(array)/sizeof(chr))

66
/*
67
 * lexstart - set up lexical stuff, scan leading options
68
 */
69
static void
70
lexstart(struct vars * v)
71
{
72
	prefixes(v);				/* may turn on new type bits etc. */
73
	NOERR();
74
75
	if (v->cflags & REG_QUOTE)
76
	{
77
		assert(!(v->cflags & (REG_ADVANCED | REG_EXPANDED | REG_NEWLINE)));
78
		INTOCON(L_Q);
79
	}
80
	else if (v->cflags & REG_EXTENDED)
81
	{
82
		assert(!(v->cflags & REG_QUOTE));
83
		INTOCON(L_ERE);
84
	}
85
	else
86
	{
87
		assert(!(v->cflags & (REG_QUOTE | REG_ADVF)));
88
		INTOCON(L_BRE);
89
	}
90
91
	v->nexttype = EMPTY;		/* remember we were at the start */
92
	next(v);					/* set up the first token */
93
}
94
95
/*
96
 * prefixes - implement various special prefixes
97
 */
98
static void
99
prefixes(struct vars * v)
100
{
101
	/* literal string doesn't get any of this stuff */
102
	if (v->cflags & REG_QUOTE)
103
		return;
104
105
	/* initial "***" gets special things */
106
	if (HAVE(4) && NEXT3('*', '*', '*'))
107
		switch (*(v->now + 3))
108
		{
109
			case CHR('?'):		/* "***?" error, msg shows version */
110
				ERR(REG_BADPAT);
111
				return;			/* proceed no further */
112
				break;
113
			case CHR('='):		/* "***=" shifts to literal string */
114
				NOTE(REG_UNONPOSIX);
115
				v->cflags |= REG_QUOTE;
116
				v->cflags &= ~(REG_ADVANCED | REG_EXPANDED | REG_NEWLINE);
117
				v->now += 4;
118
				return;			/* and there can be no more prefixes */
119
				break;
120
			case CHR(':'):		/* "***:" shifts to AREs */
121
				NOTE(REG_UNONPOSIX);
122
				v->cflags |= REG_ADVANCED;
123
				v->now += 4;
124
				break;
125
			default:			/* otherwise *** is just an error */
126
				ERR(REG_BADRPT);
127
				return;
128
				break;
129
		}
130
131
	/* BREs and EREs don't get embedded options */
132
	if ((v->cflags & REG_ADVANCED) != REG_ADVANCED)
133
		return;
134
135
	/* embedded options (AREs only) */
136
	if (HAVE(3) && NEXT2('(', '?') && iscalpha(*(v->now + 2)))
137
	{
138
		NOTE(REG_UNONPOSIX);
139
		v->now += 2;
140
		for (; !ATEOS() && iscalpha(*v->now); v->now++)
141
			switch (*v->now)
142
			{
143
				case CHR('b'):	/* BREs (but why???) */
144
					v->cflags &= ~(REG_ADVANCED | REG_QUOTE);
145
					break;
146
				case CHR('c'):	/* case sensitive */
147
					v->cflags &= ~REG_ICASE;
148
					break;
149
				case CHR('e'):	/* plain EREs */
150
					v->cflags |= REG_EXTENDED;
151
					v->cflags &= ~(REG_ADVF | REG_QUOTE);
152
					break;
153
				case CHR('i'):	/* case insensitive */
154
					v->cflags |= REG_ICASE;
155
					break;
156
				case CHR('m'):	/* Perloid synonym for n */
157
				case CHR('n'):	/* \n affects ^ $ . [^ */
158
					v->cflags |= REG_NEWLINE;
159
					break;
160
				case CHR('p'):	/* ~Perl, \n affects . [^ */
161
					v->cflags |= REG_NLSTOP;
162
					v->cflags &= ~REG_NLANCH;
163
					break;
164
				case CHR('q'):	/* literal string */
165
					v->cflags |= REG_QUOTE;
166
					v->cflags &= ~REG_ADVANCED;
167
					break;
168
				case CHR('s'):	/* single line, \n ordinary */
169
					v->cflags &= ~REG_NEWLINE;
170
					break;
171
				case CHR('t'):	/* tight syntax */
172
					v->cflags &= ~REG_EXPANDED;
173
					break;
174
				case CHR('w'):	/* weird, \n affects ^ $ only */
175
					v->cflags &= ~REG_NLSTOP;
176
					v->cflags |= REG_NLANCH;
177
					break;
178
				case CHR('x'):	/* expanded syntax */
179
					v->cflags |= REG_EXPANDED;
180
					break;
181
				default:
182
					ERR(REG_BADOPT);
183
					return;
184
			}
185
		if (!NEXT1(')'))
186
		{
187
			ERR(REG_BADOPT);
188
			return;
189
		}
190
		v->now++;
191
		if (v->cflags & REG_QUOTE)
192
			v->cflags &= ~(REG_EXPANDED | REG_NEWLINE);
193
	}
194
}
195
196
/*
197
 * lexnest - "call a subroutine", interpolating string at the lexical level
198
 *
199
 * Note, this is not a very general facility.  There are a number of
200
 * implicit assumptions about what sorts of strings can be subroutines.
201
 */
202
static void
203
lexnest(struct vars * v,
204
		chr *beginp,			/* start of interpolation */
205
		chr *endp)				/* one past end of interpolation */
206
{
207
	assert(v->savenow == NULL); /* only one level of nesting */
208
	v->savenow = v->now;
209
	v->savestop = v->stop;
210
	v->now = beginp;
211
	v->stop = endp;
212
}
213
214
/*
215
 * string constants to interpolate as expansions of things like \d
216
 */
217
static chr	backd[] = {			/* \d */
218
	CHR('['), CHR('['), CHR(':'),
219
	CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
220
	CHR(':'), CHR(']'), CHR(']')
221
};
222
static chr	backD[] = {			/* \D */
223
	CHR('['), CHR('^'), CHR('['), CHR(':'),
224
	CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
225
	CHR(':'), CHR(']'), CHR(']')
226
};
227
static chr	brbackd[] = {		/* \d within brackets */
228
	CHR('['), CHR(':'),
229
	CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
230
	CHR(':'), CHR(']')
231
};
232
static chr	backs[] = {			/* \s */
233
	CHR('['), CHR('['), CHR(':'),
234
	CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
235
	CHR(':'), CHR(']'), CHR(']')
236
};
237
static chr	backS[] = {			/* \S */
238
	CHR('['), CHR('^'), CHR('['), CHR(':'),
239
	CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
240
	CHR(':'), CHR(']'), CHR(']')
241
};
242
static chr	brbacks[] = {		/* \s within brackets */
243
	CHR('['), CHR(':'),
244
	CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
245
	CHR(':'), CHR(']')
246
};
247
static chr	backw[] = {			/* \w */
248
	CHR('['), CHR('['), CHR(':'),
249
	CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
250
	CHR(':'), CHR(']'), CHR('_'), CHR(']')
251
};
252
static chr	backW[] = {			/* \W */
253
	CHR('['), CHR('^'), CHR('['), CHR(':'),
254
	CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
255
	CHR(':'), CHR(']'), CHR('_'), CHR(']')
256
};
257
static chr	brbackw[] = {		/* \w within brackets */
258
	CHR('['), CHR(':'),
259
	CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
260
	CHR(':'), CHR(']'), CHR('_')
261
};
262
263
/*
264
 * lexword - interpolate a bracket expression for word characters
265
 * Possibly ought to inquire whether there is a "word" character class.
266
 */
267
static void
268
lexword(struct vars * v)
269
{
270
	lexnest(v, backw, ENDOF(backw));
271
}
272
273
/*
274
 * next - get next token
275
 */
276
static int						/* 1 normal, 0 failure */
277
next(struct vars * v)
278
{
279
	chr			c;
280
281
	/* errors yield an infinite sequence of failures */
282
	if (ISERR())
283
		return 0;				/* the error has set nexttype to EOS */
284
285
	/* remember flavor of last token */
286
	v->lasttype = v->nexttype;
287
288
	/* REG_BOSONLY */
289
	if (v->nexttype == EMPTY && (v->cflags & REG_BOSONLY))
290
	{
291
		/* at start of a REG_BOSONLY RE */
292
		RETV(SBEGIN, 0);		/* same as \A */
293
	}
294
295
	/* if we're nested and we've hit end, return to outer level */
296
	if (v->savenow != NULL && ATEOS())
297
	{
298
		v->now = v->savenow;
299
		v->stop = v->savestop;
300
		v->savenow = v->savestop = NULL;
301
	}
302
303
	/* skip white space etc. if appropriate (not in literal or []) */
304
	if (v->cflags & REG_EXPANDED)
305
		switch (v->lexcon)
306
		{
307
			case L_ERE:
308
			case L_BRE:
309
			case L_EBND:
310
			case L_BBND:
311
				skip(v);
312
				break;
313
		}
314
315
	/* handle EOS, depending on context */
316
	if (ATEOS())
317
	{
318
		switch (v->lexcon)
319
		{
320
			case L_ERE:
321
			case L_BRE:
322
			case L_Q:
323
				RET(EOS);
324
				break;
325
			case L_EBND:
326
			case L_BBND:
327
				FAILW(REG_EBRACE);
328
				break;
329
			case L_BRACK:
330
			case L_CEL:
331
			case L_ECL:
332
			case L_CCL:
333
				FAILW(REG_EBRACK);
334
				break;
335
		}
336
		assert(NOTREACHED);
337
	}
338
339
	/* okay, time to actually get a character */
340
	c = *v->now++;
341
342
	/* deal with the easy contexts, punt EREs to code below */
343
	switch (v->lexcon)
344
	{
345
		case L_BRE:				/* punt BREs to separate function */
346
			return brenext(v, c);
347
			break;
348
		case L_ERE:				/* see below */
349
			break;
350
		case L_Q:				/* literal strings are easy */
351
			RETV(PLAIN, c);
352
			break;
353
		case L_BBND:			/* bounds are fairly simple */
354
		case L_EBND:
355
			switch (c)
356
			{
357
				case CHR('0'):
358
				case CHR('1'):
359
				case CHR('2'):
360
				case CHR('3'):
361
				case CHR('4'):
362
				case CHR('5'):
363
				case CHR('6'):
364
				case CHR('7'):
365
				case CHR('8'):
366
				case CHR('9'):
367
					RETV(DIGIT, (chr) DIGITVAL(c));
368
					break;
369
				case CHR(','):
370
					RET(',');
371
					break;
372
				case CHR('}'):	/* ERE bound ends with } */
373
					if (INCON(L_EBND))
374
					{
375
						INTOCON(L_ERE);
376
						if ((v->cflags & REG_ADVF) && NEXT1('?'))
377
						{
378
							v->now++;
379
							NOTE(REG_UNONPOSIX);
380
							RETV('}', 0);
381
						}
382
						RETV('}', 1);
383
					}
384
					else
385
						FAILW(REG_BADBR);
386
					break;
387
				case CHR('\\'):	/* BRE bound ends with \} */
388
					if (INCON(L_BBND) && NEXT1('}'))
389
					{
390
						v->now++;
391
						INTOCON(L_BRE);
392
						RET('}');
393
					}
394
					else
395
						FAILW(REG_BADBR);
396
					break;
397
				default:
398
					FAILW(REG_BADBR);
399
					break;
400
			}
401
			assert(NOTREACHED);
402
			break;
403
		case L_BRACK:			/* brackets are not too hard */
404
			switch (c)
405
			{
406
				case CHR(']'):
407
					if (LASTTYPE('['))
408
						RETV(PLAIN, c);
409
					else
410
					{
411
						INTOCON((v->cflags & REG_EXTENDED) ?
412
								L_ERE : L_BRE);
413
						RET(']');
414
					}
415
					break;
416
				case CHR('\\'):
417
					NOTE(REG_UBBS);
418
					if (!(v->cflags & REG_ADVF))
419
						RETV(PLAIN, c);
420
					NOTE(REG_UNONPOSIX);
421
					if (ATEOS())
422
						FAILW(REG_EESCAPE);
423
					(DISCARD) lexescape(v);
424
					switch (v->nexttype)
425
					{			/* not all escapes okay here */
426
						case PLAIN:
427
							return 1;
428
							break;
429
						case CCLASS:
430
							switch (v->nextvalue)
431
							{
432
								case 'd':
433
									lexnest(v, brbackd, ENDOF(brbackd));
434
									break;
435
								case 's':
436
									lexnest(v, brbacks, ENDOF(brbacks));
437
									break;
438
								case 'w':
439
									lexnest(v, brbackw, ENDOF(brbackw));
440
									break;
441
								default:
442
									FAILW(REG_EESCAPE);
443
									break;
444
							}
445
							/* lexnest done, back up and try again */
446
							v->nexttype = v->lasttype;
447
							return next(v);
448
							break;
449
					}
450
					/* not one of the acceptable escapes */
451
					FAILW(REG_EESCAPE);
452
					break;
453
				case CHR('-'):
454
					if (LASTTYPE('[') || NEXT1(']'))
455
						RETV(PLAIN, c);
456
					else
457
						RETV(RANGE, c);
458
					break;
459
				case CHR('['):
460
					if (ATEOS())
461
						FAILW(REG_EBRACK);
462
					switch (*v->now++)
463
					{
464
						case CHR('.'):
465
							INTOCON(L_CEL);
466
							/* might or might not be locale-specific */
467
							RET(COLLEL);
468
							break;
469
						case CHR('='):
470
							INTOCON(L_ECL);
471
							NOTE(REG_ULOCALE);
472
							RET(ECLASS);
473
							break;
474
						case CHR(':'):
475
							INTOCON(L_CCL);
476
							NOTE(REG_ULOCALE);
477
							RET(CCLASS);
478
							break;
479
						default:		/* oops */
480
							v->now--;
481
							RETV(PLAIN, c);
482
							break;
483
					}
484
					assert(NOTREACHED);
485
					break;
486
				default:
487
					RETV(PLAIN, c);
488
					break;
489
			}
490
			assert(NOTREACHED);
491
			break;
492
		case L_CEL:				/* collating elements are easy */
493
			if (c == CHR('.') && NEXT1(']'))
494
			{
495
				v->now++;
496
				INTOCON(L_BRACK);
497
				RETV(END, '.');
498
			}
499
			else
500
				RETV(PLAIN, c);
501
			break;
502
		case L_ECL:				/* ditto equivalence classes */
503
			if (c == CHR('=') && NEXT1(']'))
504
			{
505
				v->now++;
506
				INTOCON(L_BRACK);
507
				RETV(END, '=');
508
			}
509
			else
510
				RETV(PLAIN, c);
511
			break;
512
		case L_CCL:				/* ditto character classes */
513
			if (c == CHR(':') && NEXT1(']'))
514
			{
515
				v->now++;
516
				INTOCON(L_BRACK);
517
				RETV(END, ':');
518
			}
519
			else
520
				RETV(PLAIN, c);
521
			break;
522
		default:
523
			assert(NOTREACHED);
524
			break;
525
	}
526
527
	/* that got rid of everything except EREs and AREs */
528
	assert(INCON(L_ERE));
529
530
	/* deal with EREs and AREs, except for backslashes */
531
	switch (c)
532
	{
533
		case CHR('|'):
534
			RET('|');
535
			break;
536
		case CHR('*'):
537
			if ((v->cflags & REG_ADVF) && NEXT1('?'))
538
			{
539
				v->now++;
540
				NOTE(REG_UNONPOSIX);
541
				RETV('*', 0);
542
			}
543
			RETV('*', 1);
544
			break;
545
		case CHR('+'):
546
			if ((v->cflags & REG_ADVF) && NEXT1('?'))
547
			{
548
				v->now++;
549
				NOTE(REG_UNONPOSIX);
550
				RETV('+', 0);
551
			}
552
			RETV('+', 1);
553
			break;
554
		case CHR('?'):
555
			if ((v->cflags & REG_ADVF) && NEXT1('?'))
556
			{
557
				v->now++;
558
				NOTE(REG_UNONPOSIX);
559
				RETV('?', 0);
560
			}
561
			RETV('?', 1);
562
			break;
563
		case CHR('{'):			/* bounds start or plain character */
564
			if (v->cflags & REG_EXPANDED)
565
				skip(v);
566
			if (ATEOS() || !iscdigit(*v->now))
567
			{
568
				NOTE(REG_UBRACES);
569
				NOTE(REG_UUNSPEC);
570
				RETV(PLAIN, c);
571
			}
572
			else
573
			{
574
				NOTE(REG_UBOUNDS);
575
				INTOCON(L_EBND);
576
				RET('{');
577
			}
578
			assert(NOTREACHED);
579
			break;
580
		case CHR('('):			/* parenthesis, or advanced extension */
581
			if ((v->cflags & REG_ADVF) && NEXT1('?'))
582
			{
583
				NOTE(REG_UNONPOSIX);
584
				v->now++;
585
				switch (*v->now++)
586
				{
587
					case CHR(':'):		/* non-capturing paren */
588
						RETV('(', 0);
589
						break;
590
					case CHR('#'):		/* comment */
591
						while (!ATEOS() && *v->now != CHR(')'))
592
							v->now++;
593
						if (!ATEOS())
594
							v->now++;
595
						assert(v->nexttype == v->lasttype);
596
						return next(v);
597
						break;
598
					case CHR('='):		/* positive lookahead */
599
						NOTE(REG_ULOOKAHEAD);
600
						RETV(LACON, 1);
601
						break;
602
					case CHR('!'):		/* negative lookahead */
603
						NOTE(REG_ULOOKAHEAD);
604
						RETV(LACON, 0);
605
						break;
606
					default:
607
						FAILW(REG_BADRPT);
608
						break;
609
				}
610
				assert(NOTREACHED);
611
			}
612
			if (v->cflags & REG_NOSUB)
613
				RETV('(', 0);	/* all parens non-capturing */
614
			else
615
				RETV('(', 1);
616
			break;
617
		case CHR(')'):
618
			if (LASTTYPE('('))
619
				NOTE(REG_UUNSPEC);
620
			RETV(')', c);
621
			break;
622
		case CHR('['):			/* easy except for [[:<:]] and [[:>:]] */
623
			if (HAVE(6) && *(v->now + 0) == CHR('[') &&
624
				*(v->now + 1) == CHR(':') &&
625
				(*(v->now + 2) == CHR('<') ||
626
				 *(v->now + 2) == CHR('>')) &&
627
				*(v->now + 3) == CHR(':') &&
628
				*(v->now + 4) == CHR(']') &&
629
				*(v->now + 5) == CHR(']'))
630
			{
631
				c = *(v->now + 2);
632
				v->now += 6;
633
				NOTE(REG_UNONPOSIX);
634
				RET((c == CHR('<')) ? '<' : '>');
635
			}
636
			INTOCON(L_BRACK);
637
			if (NEXT1('^'))
638
			{
639
				v->now++;
640
				RETV('[', 0);
641
			}
642
			RETV('[', 1);
643
			break;
644
		case CHR('.'):
645
			RET('.');
646
			break;
647
		case CHR('^'):
648
			RET('^');
649
			break;
650
		case CHR('$'):
651
			RET('$');
652
			break;
653
		case CHR('\\'): /* mostly punt backslashes to code below */
654
			if (ATEOS())
655
				FAILW(REG_EESCAPE);
656
			break;
657
		default:				/* ordinary character */
658
			RETV(PLAIN, c);
659
			break;
660
	}
661
662
	/* ERE/ARE backslash handling; backslash already eaten */
663
	assert(!ATEOS());
664
	if (!(v->cflags & REG_ADVF))
665
	{							/* only AREs have non-trivial escapes */
666
		if (iscalnum(*v->now))
667
		{
668
			NOTE(REG_UBSALNUM);
669
			NOTE(REG_UUNSPEC);
670
		}
671
		RETV(PLAIN, *v->now++);
672
	}
673
	(DISCARD) lexescape(v);
674
	if (ISERR())
675
		FAILW(REG_EESCAPE);
676
	if (v->nexttype == CCLASS)
677
	{							/* fudge at lexical level */
678
		switch (v->nextvalue)
679
		{
680
			case 'd':
681
				lexnest(v, backd, ENDOF(backd));
682
				break;
683
			case 'D':
684
				lexnest(v, backD, ENDOF(backD));
685
				break;
686
			case 's':
687
				lexnest(v, backs, ENDOF(backs));
688
				break;
689
			case 'S':
690
				lexnest(v, backS, ENDOF(backS));
691
				break;
692
			case 'w':
693
				lexnest(v, backw, ENDOF(backw));
694
				break;
695
			case 'W':
696
				lexnest(v, backW, ENDOF(backW));
697
				break;
698
			default:
699
				assert(NOTREACHED);
700
				FAILW(REG_ASSERT);
701
				break;
702
		}
703
		/* lexnest done, back up and try again */
704
		v->nexttype = v->lasttype;
705
		return next(v);
706
	}
707
	/* otherwise, lexescape has already done the work */
708
	return !ISERR();
709
}
710
711
/*
712
 * lexescape - parse an ARE backslash escape (backslash already eaten)
713
 * Note slightly nonstandard use of the CCLASS type code.
714
 */
715
static int						/* not actually used, but convenient for
716
								 * RETV */
717
lexescape(struct vars * v)
718
{
719
	chr			c;
720
	static chr	alert[] = {
721
		CHR('a'), CHR('l'), CHR('e'), CHR('r'), CHR('t')
722
	};
723
	static chr	esc[] = {
724
		CHR('E'), CHR('S'), CHR('C')
725
	};
726
	chr		   *save;
727
728
	assert(v->cflags & REG_ADVF);
729
730
	assert(!ATEOS());
731
	c = *v->now++;
732
	if (!iscalnum(c))
733
		RETV(PLAIN, c);
734
735
	NOTE(REG_UNONPOSIX);
736
	switch (c)
737
	{
738
		case CHR('a'):
739
			RETV(PLAIN, chrnamed(v, alert, ENDOF(alert), CHR('\007')));
740
			break;
741
		case CHR('A'):
742
			RETV(SBEGIN, 0);
743
			break;
744
		case CHR('b'):
745
			RETV(PLAIN, CHR('\b'));
746
			break;
747
		case CHR('B'):
748
			RETV(PLAIN, CHR('\\'));
749
			break;
750
		case CHR('c'):
751
			NOTE(REG_UUNPORT);
752
			if (ATEOS())
753
				FAILW(REG_EESCAPE);
754
			RETV(PLAIN, (chr) (*v->now++ & 037));
755
			break;
756
		case CHR('d'):
757
			NOTE(REG_ULOCALE);
758
			RETV(CCLASS, 'd');
759
			break;
760
		case CHR('D'):
761
			NOTE(REG_ULOCALE);
762
			RETV(CCLASS, 'D');
763
			break;
764
		case CHR('e'):
765
			NOTE(REG_UUNPORT);
766
			RETV(PLAIN, chrnamed(v, esc, ENDOF(esc), CHR('\033')));
767
			break;
768
		case CHR('f'):
769
			RETV(PLAIN, CHR('\f'));
770
			break;
771
		case CHR('m'):
772
			RET('<');
773
			break;
774
		case CHR('M'):
775
			RET('>');
776
			break;
777
		case CHR('n'):
778
			RETV(PLAIN, CHR('\n'));
779
			break;
780
		case CHR('r'):
781
			RETV(PLAIN, CHR('\r'));
782
			break;
783
		case CHR('s'):
784
			NOTE(REG_ULOCALE);
785
			RETV(CCLASS, 's');
786
			break;
787
		case CHR('S'):
788
			NOTE(REG_ULOCALE);
789
			RETV(CCLASS, 'S');
790
			break;
791
		case CHR('t'):
792
			RETV(PLAIN, CHR('\t'));
793
			break;
794
		case CHR('u'):
795
			c = lexdigits(v, 16, 4, 4);
796
			if (ISERR())
797
				FAILW(REG_EESCAPE);
798
			RETV(PLAIN, c);
799
			break;
800
		case CHR('U'):
801
			c = lexdigits(v, 16, 8, 8);
802
			if (ISERR())
803
				FAILW(REG_EESCAPE);
804
			RETV(PLAIN, c);
805
			break;
806
		case CHR('v'):
807
			RETV(PLAIN, CHR('\v'));
808
			break;
809
		case CHR('w'):
810
			NOTE(REG_ULOCALE);
811
			RETV(CCLASS, 'w');
812
			break;
813
		case CHR('W'):
814
			NOTE(REG_ULOCALE);
815
			RETV(CCLASS, 'W');
816
			break;
817
		case CHR('x'):
818
			NOTE(REG_UUNPORT);
819
			c = lexdigits(v, 16, 1, 255);		/* REs >255 long outside
820
												 * spec */
821
			if (ISERR())
822
				FAILW(REG_EESCAPE);
823
			RETV(PLAIN, c);
824
			break;
825
		case CHR('y'):
826
			NOTE(REG_ULOCALE);
827
			RETV(WBDRY, 0);
828
			break;
829
		case CHR('Y'):
830
			NOTE(REG_ULOCALE);
831
			RETV(NWBDRY, 0);
832
			break;
833
		case CHR('Z'):
834
			RETV(SEND, 0);
835
			break;
836
		case CHR('1'):
837
		case CHR('2'):
838
		case CHR('3'):
839
		case CHR('4'):
840
		case CHR('5'):
841
		case CHR('6'):
842
		case CHR('7'):
843
		case CHR('8'):
844
		case CHR('9'):
845
			save = v->now;
846
			v->now--;			/* put first digit back */
847
			c = lexdigits(v, 10, 1, 255);		/* REs >255 long outside
848
												 * spec */
849
			if (ISERR())
850
				FAILW(REG_EESCAPE);
851
			/* ugly heuristic (first test is "exactly 1 digit?") */
852
			if (v->now - save == 0 || (int) c <= v->nsubexp)
853
			{
854
				NOTE(REG_UBACKREF);
855
				RETV(BACKREF, (chr) c);
856
			}
857
			/* oops, doesn't look like it's a backref after all... */
858
			v->now = save;
859
			/* and fall through into octal number */
860
		case CHR('0'):
861
			NOTE(REG_UUNPORT);
862
			v->now--;			/* put first digit back */
863
			c = lexdigits(v, 8, 1, 3);
864
			if (ISERR())
865
				FAILW(REG_EESCAPE);
866
			RETV(PLAIN, c);
867
			break;
868
		default:
869
			assert(iscalpha(c));
870
			FAILW(REG_EESCAPE); /* unknown alphabetic escape */
871
			break;
872
	}
873
	assert(NOTREACHED);
874
}
875
876
/*
877
 * lexdigits - slurp up digits and return chr value
878
 */
879
static chr						/* chr value; errors signalled via ERR */
880
lexdigits(struct vars * v,
881
		  int base,
882
		  int minlen,
883
		  int maxlen)
884
{
885
	uchr		n;				/* unsigned to avoid overflow misbehavior */
886
	int			len;
887
	chr			c;
888
	int			d;
889
	const uchr	ub = (uchr) base;
890
891
	n = 0;
892
	for (len = 0; len < maxlen && !ATEOS(); len++)
893
	{
894
		c = *v->now++;
895
		switch (c)
896
		{
897
			case CHR('0'):
898
			case CHR('1'):
899
			case CHR('2'):
900
			case CHR('3'):
901
			case CHR('4'):
902
			case CHR('5'):
903
			case CHR('6'):
904
			case CHR('7'):
905
			case CHR('8'):
906
			case CHR('9'):
907
				d = DIGITVAL(c);
908
				break;
909
			case CHR('a'):
910
			case CHR('A'):
911
				d = 10;
912
				break;
913
			case CHR('b'):
914
			case CHR('B'):
915
				d = 11;
916
				break;
917
			case CHR('c'):
918
			case CHR('C'):
919
				d = 12;
920
				break;
921
			case CHR('d'):
922
			case CHR('D'):
923
				d = 13;
924
				break;
925
			case CHR('e'):
926
			case CHR('E'):
927
				d = 14;
928
				break;
929
			case CHR('f'):
930
			case CHR('F'):
931
				d = 15;
932
				break;
933
			default:
934
				v->now--;		/* oops, not a digit at all */
935
				d = -1;
936
				break;
937
		}
938
939
		if (d >= base)
940
		{						/* not a plausible digit */
941
			v->now--;
942
			d = -1;
943
		}
944
		if (d < 0)
945
			break;				/* NOTE BREAK OUT */
946
		n = n * ub + (uchr) d;
947
	}
948
	if (len < minlen)
949
		ERR(REG_EESCAPE);
950
951
	return (chr) n;
952
}
953
954
/*
955
 * brenext - get next BRE token
956
 *
957
 * This is much like EREs except for all the stupid backslashes and the
958
 * context-dependency of some things.
959
 */
960
static int						/* 1 normal, 0 failure */
961
brenext(struct vars * v,
962
		chr pc)
963
{
964
	chr			c = (chr) pc;
965
966
	switch (c)
967
	{
968
		case CHR('*'):
969
			if (LASTTYPE(EMPTY) || LASTTYPE('(') || LASTTYPE('^'))
970
				RETV(PLAIN, c);
971
			RET('*');
972
			break;
973
		case CHR('['):
974
			if (HAVE(6) && *(v->now + 0) == CHR('[') &&
975
				*(v->now + 1) == CHR(':') &&
976
				(*(v->now + 2) == CHR('<') ||
977
				 *(v->now + 2) == CHR('>')) &&
978
				*(v->now + 3) == CHR(':') &&
979
				*(v->now + 4) == CHR(']') &&
980
				*(v->now + 5) == CHR(']'))
981
			{
982
				c = *(v->now + 2);
983
				v->now += 6;
984
				NOTE(REG_UNONPOSIX);
985
				RET((c == CHR('<')) ? '<' : '>');
986
			}
987
			INTOCON(L_BRACK);
988
			if (NEXT1('^'))
989
			{
990
				v->now++;
991
				RETV('[', 0);
992
			}
993
			RETV('[', 1);
994
			break;
995
		case CHR('.'):
996
			RET('.');
997
			break;
998
		case CHR('^'):
999
			if (LASTTYPE(EMPTY))
1000
				RET('^');
1001
			if (LASTTYPE('('))
1002
			{
1003
				NOTE(REG_UUNSPEC);
1004
				RET('^');
1005
			}
1006
			RETV(PLAIN, c);
1007
			break;
1008
		case CHR('$'):
1009
			if (v->cflags & REG_EXPANDED)
1010
				skip(v);
1011
			if (ATEOS())
1012
				RET('$');
1013
			if (NEXT2('\\', ')'))
1014
			{
1015
				NOTE(REG_UUNSPEC);
1016
				RET('$');
1017
			}
1018
			RETV(PLAIN, c);
1019
			break;
1020
		case CHR('\\'):
1021
			break;				/* see below */
1022
		default:
1023
			RETV(PLAIN, c);
1024
			break;
1025
	}
1026
1027
	assert(c == CHR('\\'));
1028
1029
	if (ATEOS())
1030
		FAILW(REG_EESCAPE);
1031
1032
	c = *v->now++;
1033
	switch (c)
1034
	{
1035
		case CHR('{'):
1036
			INTOCON(L_BBND);
1037
			NOTE(REG_UBOUNDS);
1038
			RET('{');
1039
			break;
1040
		case CHR('('):
1041
			RETV('(', 1);
1042
			break;
1043
		case CHR(')'):
1044
			RETV(')', c);
1045
			break;
1046
		case CHR('<'):
1047
			NOTE(REG_UNONPOSIX);
1048
			RET('<');
1049
			break;
1050
		case CHR('>'):
1051
			NOTE(REG_UNONPOSIX);
1052
			RET('>');
1053
			break;
1054
		case CHR('1'):
1055
		case CHR('2'):
1056
		case CHR('3'):
1057
		case CHR('4'):
1058
		case CHR('5'):
1059
		case CHR('6'):
1060
		case CHR('7'):
1061
		case CHR('8'):
1062
		case CHR('9'):
1063
			NOTE(REG_UBACKREF);
1064
			RETV(BACKREF, (chr) DIGITVAL(c));
1065
			break;
1066
		default:
1067
			if (iscalnum(c))
1068
			{
1069
				NOTE(REG_UBSALNUM);
1070
				NOTE(REG_UUNSPEC);
1071
			}
1072
			RETV(PLAIN, c);
1073
			break;
1074
	}
1075
1076
	assert(NOTREACHED);
1077
}
1078
1079
/*
1080
 * skip - skip white space and comments in expanded form
1081
 */
1082
static void
1083
skip(struct vars * v)
1084
{
1085
	chr		   *start = v->now;
1086
1087
	assert(v->cflags & REG_EXPANDED);
1088
1089
	for (;;)
1090
	{
1091
		while (!ATEOS() && iscspace(*v->now))
1092
			v->now++;
1093
		if (ATEOS() || *v->now != CHR('#'))
1094
			break;				/* NOTE BREAK OUT */
1095
		assert(NEXT1('#'));
1096
		while (!ATEOS() && *v->now != CHR('\n'))
1097
			v->now++;
1098
		/* leave the newline to be picked up by the iscspace loop */
1099
	}
1100
1101
	if (v->now != start)
1102
		NOTE(REG_UNONPOSIX);
1103
}
1104
1105
/*
1106
 * newline - return the chr for a newline
1107
 *
1108
 * This helps confine use of CHR to this source file.
1109
 */
1110
static chr
1111
newline(void)
1112
{
1113
	return CHR('\n');
1114
}
1115
1116
/*
1117
 * chrnamed - return the chr known by a given (chr string) name
1118
 *
1119
 * The code is a bit clumsy, but this routine gets only such specialized
1120
 * use that it hardly matters.
1121
 */
1122
static chr
1123
chrnamed(struct vars * v,
1124
		 chr *startp,			/* start of name */
1125
		 chr *endp,				/* just past end of name */
1126
		 chr lastresort)		/* what to return if name lookup fails */
1127
{
1128
	celt		c;
1129
	int			errsave;
1130
	int			e;
1131
	struct cvec *cv;
1132
1133
	errsave = v->err;
1134
	v->err = 0;
1135
	c = element(v, startp, endp);
1136
	e = v->err;
1137
	v->err = errsave;
1138
1139
	if (e != 0)
1140
		return (chr) lastresort;
1141
1142
	cv = range(v, c, c, 0);
1143
	if (cv->nchrs == 0)
1144
		return (chr) lastresort;
1145
	return cv->chrs[0];
1146
}