1
/* $Cambridge: exim/exim-src/src/pcre/pcre_compile.c,v 1.1 2005/06/15 08:57:10 ph10 Exp $ */
3
/*************************************************
4
* Perl-Compatible Regular Expressions *
5
*************************************************/
7
/* PCRE is a library of functions to support regular expressions whose syntax
8
and semantics are as close as possible to those of the Perl 5 language.
10
Written by Philip Hazel
11
Copyright (c) 1997-2005 University of Cambridge
13
-----------------------------------------------------------------------------
14
Redistribution and use in source and binary forms, with or without
15
modification, are permitted provided that the following conditions are met:
17
* Redistributions of source code must retain the above copyright notice,
18
this list of conditions and the following disclaimer.
20
* Redistributions in binary form must reproduce the above copyright
21
notice, this list of conditions and the following disclaimer in the
22
documentation and/or other materials provided with the distribution.
24
* Neither the name of the University of Cambridge nor the names of its
25
contributors may be used to endorse or promote products derived from
26
this software without specific prior written permission.
28
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
29
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
32
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
33
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
34
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
35
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
36
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38
POSSIBILITY OF SUCH DAMAGE.
39
-----------------------------------------------------------------------------
*/
43
/* This module contains the external function pcre_compile(), along with
44
supporting internal functions that are not used by other modules. */
47
#include "pcre_internal.h"
50
/*************************************************
51
* Code parameters and static tables *
52
*************************************************/
54
/* Maximum number of items on the nested bracket stacks at compile time. This
55
applies to the nesting of all kinds of parentheses. It does not limit
56
un-nested, non-capturing parentheses. This number can be made bigger if
57
necessary - it is used to dimension one int and one unsigned char vector at
compile time. */
60
#define BRASTACK_SIZE 200
63
/* Table for handling escaped characters in the range '0'-'z'. Positive returns
64
are simple data values; negative values are for special things like \d and so
65
on. Zero means further processing is needed (for things like \x), or the escape
is invalid. */
68
#if !EBCDIC /* This is the "normal" table for ASCII systems */
69
static const short int escapes[] = {
70
0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
71
0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
72
'@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
73
0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
74
-ESC_P, -ESC_Q, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
75
-ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
76
'`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
77
0, 0, 0, 0, 0, 0, ESC_n, 0, /* h - o */
78
-ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, 0, -ESC_w, /* p - w */
79
0, 0, -ESC_z /* x - z */
82
#else /* This is the "abnormal" table for EBCDIC systems */
83
static const short int escapes[] = {
84
/* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
85
/* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
86
/* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
87
/* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
88
/* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
89
/* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
90
/* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
91
/* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
92
/* 88 */ 0, 0, 0, '{', 0, 0, 0, 0,
93
/* 90 */ 0, 0, 0, 'l', 0, ESC_n, 0, -ESC_p,
94
/* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
95
/* A0 */ 0, '~', -ESC_s, ESC_tee, 0, 0, -ESC_w, 0,
96
/* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
97
/* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
98
/* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
99
/* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
100
/* C8 */ 0, 0, 0, 0, 0, 0, 0, 0,
101
/* D0 */ '}', 0, 0, 0, 0, 0, 0, -ESC_P,
102
/* D8 */-ESC_Q, 0, 0, 0, 0, 0, 0, 0,
103
/* E0 */ '\\', 0, -ESC_S, 0, 0, 0, -ESC_W, -ESC_X,
104
/* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
105
/* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
106
/* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
111
/* Tables of names of POSIX character classes and their lengths. The list is
112
terminated by a zero length entry. The first three must be alpha, lower, upper,
113
as this is assumed for handling case independence. */
115
/* POSIX class names. The order matters: entry N here lines up with entry N of
posix_name_lengths and with the N-th row of posix_class_maps. */

static const char *const posix_names[] = {
  "alpha", "lower", "upper",
  "alnum", "ascii", "blank", "cntrl", "digit", "graph",
  "print", "punct", "space", "word",  "xdigit" };
120
/* Length of each name in posix_names, in the same order. The final zero is the
end-of-list marker. */

static const uschar posix_name_lengths[] = {
  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
123
/* Table of class bit maps for each POSIX class; up to three may be combined
124
to form the class. The table for [:blank:] is dynamically modified to remove
125
the vertical space characters. */
127
static const int posix_class_maps[] = {
128
cbit_lower, cbit_upper, -1, /* alpha */
129
cbit_lower, -1, -1, /* lower */
130
cbit_upper, -1, -1, /* upper */
131
cbit_digit, cbit_lower, cbit_upper, /* alnum */
132
cbit_print, cbit_cntrl, -1, /* ascii */
133
cbit_space, -1, -1, /* blank - a GNU extension */
134
cbit_cntrl, -1, -1, /* cntrl */
135
cbit_digit, -1, -1, /* digit */
136
cbit_graph, -1, -1, /* graph */
137
cbit_print, -1, -1, /* print */
138
cbit_punct, -1, -1, /* punct */
139
cbit_space, -1, -1, /* space */
140
cbit_word, -1, -1, /* word - a Perl extension */
141
cbit_xdigit,-1, -1 /* xdigit */
145
/* The texts of compile-time error messages. These are "char *" because they
146
are passed to the outside world. */
148
static const char *error_texts[] = {
150
"\\ at end of pattern",
151
"\\c at end of pattern",
152
"unrecognized character follows \\",
153
"numbers out of order in {} quantifier",
155
"number too big in {} quantifier",
156
"missing terminating ] for character class",
157
"invalid escape sequence in character class",
158
"range out of order in character class",
161
"operand of unlimited repeat could match the empty string",
162
"internal error: unexpected repeat",
163
"unrecognized character after (?",
164
"POSIX named classes are supported only within a class",
167
"reference to non-existent subpattern",
168
"erroffset passed as NULL",
169
"unknown option bit(s) set",
170
"missing ) after comment",
171
"parentheses nested too deeply",
173
"regular expression too large",
174
"failed to get memory",
175
"unmatched parentheses",
176
"internal error: code overflow",
177
"unrecognized character after (?<",
179
"lookbehind assertion is not fixed length",
180
"malformed number after (?(",
181
"conditional group contains more than two branches",
182
"assertion expected after (?(",
183
"(?R or (?digits must be followed by )",
185
"unknown POSIX class name",
186
"POSIX collating elements are not supported",
187
"this version of PCRE is not compiled with PCRE_UTF8 support",
189
"character value in \\x{...} sequence is too large",
191
"invalid condition (?(0)",
192
"\\C not allowed in lookbehind assertion",
193
"PCRE does not support \\L, \\l, \\N, \\U, or \\u",
194
"number after (?C is > 255",
195
"closing ) for (?C expected",
197
"recursive call could loop indefinitely",
198
"unrecognized character after (?P",
199
"syntax error after (?P",
200
"two named groups have the same name",
201
"invalid UTF-8 string",
203
"support for \\P, \\p, and \\X has not been compiled",
204
"malformed \\P or \\p sequence",
205
"unknown property name after \\P or \\p"
209
/* Table to identify digits and hex digits. This is used when compiling
210
patterns. Note that the tables in chartables are dependent on the locale, and
211
may mark arbitrary characters as digits - but the PCRE compiling code expects
212
to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
213
a private table here. It costs 256 bytes, but it is a lot faster than doing
214
character value tests (at least in some simple cases I timed), and in some
215
applications one wants PCRE to compile efficiently as well as match
efficiently.

For convenience, we use the same bit definitions as in chartables:

0x04 decimal digit
0x08 hexadecimal digit
223
Then we can use ctype_digit and ctype_xdigit in the code. */
225
#if !EBCDIC /* This is the "normal" case, for ASCII systems */
226
/* ASCII digit table, indexed by character value (256 entries). Entries for
'0'-'9' are 0x0c, entries for 'A'-'F' and 'a'-'f' are 0x08, and every other
entry is zero. The compiling code tests these entries with ctype_digit and
ctype_xdigit. */

static const unsigned char digitab[] =
  {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  32- '  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ( - /  */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  */
  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /*  8 - ?  */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  @ - G  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H - O  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  P - W  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  X - _  */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  ` - g  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h - o  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  p - w  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  x -127 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
261
#else /* This is the "abnormal" case, for EBCDIC systems */
262
/* EBCDIC digit table, indexed by character value (256 entries). The non-zero
entries are 0x08 at 0x81-0x86 and 0xC1-0xC6 (the rows labelled "- g" and
"- G"), and 0x0c at 0xF0-0xF9 (the rows labelled "0 - 7" and "8 -"). The
compiling code tests these entries with ctype_digit and ctype_xdigit. The
right-hand column in the row comments is the hex value of the row start. */

static const unsigned char digitab[] =
  {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 10 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  32- 39 20 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 30 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  64- 71 40 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- 95    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "     */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g  80 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p  90 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x  A0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 B0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191    */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  { - G  C0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  } - P  D0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  \ - X  E0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239    */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  F0 */
  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255    */
297
/* Character-type flags for EBCDIC, indexed by character value (256 entries).
In the EBCDIC build check_escape() tests (ebcdic_chartab[c] & 0x0E) to decide
whether the character after a backslash is alphameric. */

static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
  0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*   0-  7 */
  0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */
  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  16- 23 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  32- 39 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63 */
  0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  64- 71 */
  0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
  0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- 95 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
  0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "  */
  0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143 */
  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159 */
  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175 */
  0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 */
  0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
  0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /*  { - G  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207 */
  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  } - P  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223 */
  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /*  \ - X  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239 */
  0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /*  0 - 7  */
  0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255 */
333
/* Definition to allow mutual recursion */
336
compile_regex(int, int, int *, uschar **, const uschar **, int *, BOOL, int,
337
int *, int *, branch_chain *, compile_data *);
341
/*************************************************
343
*************************************************/
345
/* This function is called when a \ has been encountered. It either returns a
346
positive value for a simple escape such as \n, or a negative value which
347
encodes one of the more complicated things such as \d. When UTF-8 is enabled,
348
a positive value greater than 255 may be returned. On entry, ptr is pointing at
349
the \. On exit, it is on the final character of the escape sequence.
352
ptrptr points to the pattern position pointer
353
errorcodeptr points to the errorcode variable
354
bracount number of previous extracting brackets
355
options the options bits
356
isclass TRUE if inside a character class
358
Returns: zero or positive => a data character
359
negative => a special escape sequence
360
on error, errorcodeptr is set
*/
364
check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
365
int options, BOOL isclass)
367
const uschar *ptr = *ptrptr;
370
/* If backslash is at the end of the pattern, it's an error. */
373
if (c == 0) *errorcodeptr = ERR1;
375
/* Non-alphamerics are literals. For digits or letters, do an initial lookup in
376
a table. A non-zero result is something that can be returned immediately.
377
Otherwise further processing may be required. */
379
#if !EBCDIC /* ASCII coding */
380
else if (c < '0' || c > 'z') {} /* Not alphameric */
381
else if ((i = escapes[c - '0']) != 0) c = i;
383
#else /* EBCDIC coding */
384
else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */
385
else if ((i = escapes[c - 0x48]) != 0) c = i;
388
/* Escapes that need further processing, or are illegal. */
392
const uschar *oldptr;
395
/* A number of Perl escapes are not handled by PCRE. We give an explicit
error. */
403
*errorcodeptr = ERR37;
406
/* The handling of escape sequences consisting of a string of digits
407
starting with one that is not zero is not straightforward. By experiment,
408
the way Perl works seems to be as follows:
410
Outside a character class, the digits are read as a decimal number. If the
411
number is less than 10, or if there are that many previous extracting
412
left brackets, then it is a back reference. Otherwise, up to three octal
413
digits are read to form an escaped byte. Thus \123 is likely to be octal
414
123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
415
value is greater than 377, the least significant 8 bits are taken. Inside a
416
character class, \ followed by a digit is always an octal number. */
418
case '1': case '2': case '3': case '4': case '5':
419
case '6': case '7': case '8': case '9':
425
while ((digitab[ptr[1]] & ctype_digit) != 0)
426
c = c * 10 + *(++ptr) - '0';
427
if (c < 10 || c <= bracount)
432
ptr = oldptr; /* Put the pointer back and fall through */
435
/* Handle an octal number following \. If the first digit is 8 or 9, Perl
436
generates a binary zero byte and treats the digit as a following literal.
437
Thus we have to pull back the pointer by one. */
439
if ((c = *ptr) >= '8')
446
/* \0 always starts an octal number, but we may drop through to here with a
447
larger first octal digit. */
451
while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
452
c = c * 8 + *(++ptr) - '0';
453
c &= 255; /* Take least significant 8 bits */
456
/* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number
457
which can be greater than 0xff, but only if the ddd are hex digits. */
461
if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)
463
const uschar *pt = ptr + 2;
464
register int count = 0;
466
while ((digitab[*pt] & ctype_xdigit) != 0)
470
#if !EBCDIC /* ASCII coding */
471
if (cc >= 'a') cc -= 32; /* Convert to upper case */
472
c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
473
#else /* EBCDIC coding */
474
if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
475
c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
480
if (c < 0 || count > 8) *errorcodeptr = ERR34;
484
/* If the sequence of hex digits does not end with '}', then we don't
485
recognize this construct; fall through to the normal \x handling. */
489
/* Read just a single hex char */
492
while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
494
int cc; /* Some compilers don't like ++ */
495
cc = *(++ptr); /* in initializers */
496
#if !EBCDIC /* ASCII coding */
497
if (cc >= 'a') cc -= 32; /* Convert to upper case */
498
c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
499
#else /* EBCDIC coding */
500
if (cc <= 'z') cc += 64; /* Convert to upper case */
501
c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
506
/* Other special escapes not starting with a digit are straightforward */
512
*errorcodeptr = ERR2;
516
/* A letter is upper-cased; then the 0x40 bit is flipped. This coding
517
is ASCII-specific, but then the whole concept of \cx is ASCII-specific.
518
(However, an EBCDIC equivalent has now been added.) */
520
#if !EBCDIC /* ASCII coding */
521
if (c >= 'a' && c <= 'z') c -= 32;
523
#else /* EBCDIC coding */
524
if (c >= 'a' && c <= 'z') c += 64;
529
/* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
530
other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
531
for Perl compatibility, it is a literal. This code looks a bit odd, but
532
there used to be some cases other than the default, and there may be again
533
in future, so I haven't "optimized" it. */
536
if ((options & PCRE_EXTRA) != 0) switch(c)
539
*errorcodeptr = ERR3;
553
/*************************************************
555
*************************************************/
557
/* This function is called after \P or \p has been encountered, provided that
558
PCRE is compiled with support for Unicode properties. On entry, ptrptr is
559
pointing at the P or p. On exit, it is pointing at the final character of the
escape sequence.
563
ptrptr points to the pattern position pointer
564
negptr points to a boolean that is set TRUE for negation else FALSE
565
errorcodeptr points to the error code variable
567
Returns: value from ucp_type_table, or -1 for an invalid type
571
get_ucp(const uschar **ptrptr, BOOL *negptr, int *errorcodeptr)
574
const uschar *ptr = *ptrptr;
578
if (c == 0) goto ERROR_RETURN;
582
/* \P or \p can be followed by a one- or two-character name in {}, optionally
583
preceded by ^ for negation. */
592
for (i = 0; i <= 2; i++)
595
if (c == 0) goto ERROR_RETURN;
599
if (c !='}') /* Try to distinguish error cases */
601
while (*(++ptr) != 0 && *ptr != '}');
602
if (*ptr == '}') goto UNKNOWN_RETURN; else goto ERROR_RETURN;
607
/* Otherwise there is just one following character */
617
/* Search for a recognized property name using binary chop */
620
top = _pcre_utt_size;
625
c = strcmp(name, _pcre_utt[i].name);
626
if (c == 0) return _pcre_utt[i].value;
627
if (c > 0) bot = i + 1; else top = i;
631
*errorcodeptr = ERR47;
636
*errorcodeptr = ERR46;
645
/*************************************************
646
* Check for counted repeat *
647
*************************************************/
649
/* This function is called when a '{' is encountered in a place where it might
650
start a quantifier. It looks ahead to see if it really is a quantifier or not.
651
It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
652
where the ddds are digits.
655
p pointer to the first char after '{'
657
Returns: TRUE or FALSE
661
is_counted_repeat(const uschar *p)
663
if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
664
while ((digitab[*p] & ctype_digit) != 0) p++;
665
if (*p == '}') return TRUE;
667
if (*p++ != ',') return FALSE;
668
if (*p == '}') return TRUE;
670
if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
671
while ((digitab[*p] & ctype_digit) != 0) p++;
678
/*************************************************
679
* Read repeat counts *
680
*************************************************/
682
/* Read an item of the form {n,m} and return the values. This is called only
683
after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
684
so the syntax is guaranteed to be correct, but we need to check the values.
687
p pointer to first char after '{'
688
minp pointer to int for min
689
maxp pointer to int for max
690
returned as -1 if no max
691
errorcodeptr points to error code variable
693
Returns: pointer to '}' on success;
694
current ptr on error, with errorcodeptr set non-zero
697
static const uschar *
698
read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
703
while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
705
if (*p == '}') max = min; else
710
while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
713
*errorcodeptr = ERR4;
719
/* Do paranoid checks, then fill in the required variables, and pass back the
720
pointer to the terminating '}'. */
722
if (min > 65535 || max > 65535)
723
*errorcodeptr = ERR5;
734
/*************************************************
735
* Find first significant op code *
736
*************************************************/
738
/* This is called by several functions that scan a compiled expression looking
739
for a fixed first character, or an anchoring op code etc. It skips over things
740
that do not influence this. For some calls, a change of option is important.
741
For some calls, it makes sense to skip negative forward and all backward
742
assertions, and also the \b assertion; for others it does not.
745
code pointer to the start of the group
746
options pointer to external options
747
optbit the option bit whose changing is significant, or
zero if none are
749
skipassert TRUE if certain assertions are to be skipped
751
Returns: pointer to the first significant opcode
755
first_significant_code(const uschar *code, int *options, int optbit,
763
if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
764
*options = (int)code[1];
770
case OP_ASSERTBACK_NOT:
771
if (!skipassert) return code;
772
do code += GET(code, 1); while (*code == OP_ALT);
773
code += _pcre_OP_lengths[*code];
776
case OP_WORD_BOUNDARY:
777
case OP_NOT_WORD_BOUNDARY:
778
if (!skipassert) return code;
784
code += _pcre_OP_lengths[*code];
791
/* Control never reaches here */
797
/*************************************************
798
* Find the fixed length of a pattern *
799
*************************************************/
801
/* Scan a pattern and compute the fixed length of subject that will match it,
802
if the length is fixed. This is needed for dealing with backward assertions.
803
In UTF8 mode, the result is in characters rather than bytes.
806
code points to the start of the pattern (the bracket)
807
options the compiling options
809
Returns: the fixed length, or -1 if there is no fixed length,
810
or -2 if \C was encountered
814
find_fixedlength(uschar *code, int options)
818
register int branchlength = 0;
819
register uschar *cc = code + 1 + LINK_SIZE;
821
/* Scan along the opcodes for this branch. If we get to the end of the
822
branch, check the length against that of the other branches. */
827
register int op = *cc;
828
if (op >= OP_BRA) op = OP_BRA;
835
d = find_fixedlength(cc, options);
838
do cc += GET(cc, 1); while (*cc == OP_ALT);
842
/* Reached end of a branch; if it's a ket it is the end of a nested
843
call. If it's ALT it is an alternation in a nested call. If it is
844
END it's the end of the outer call. All can be handled by the same code. */
851
if (length < 0) length = branchlength;
852
else if (length != branchlength) return -1;
853
if (*cc != OP_ALT) return length;
858
/* Skip over assertive subpatterns */
863
case OP_ASSERTBACK_NOT:
864
do cc += GET(cc, 1); while (*cc == OP_ALT);
867
/* Skip over things that don't match chars */
880
case OP_NOT_WORD_BOUNDARY:
881
case OP_WORD_BOUNDARY:
882
cc += _pcre_OP_lengths[*cc];
885
/* Handle literal characters */
892
if ((options & PCRE_UTF8) != 0)
894
while ((*cc & 0xc0) == 0x80) cc++;
899
/* Handle exact repetitions. The count is already in characters, but we
900
need to skip over a multibyte character in UTF8 mode. */
903
branchlength += GET2(cc,1);
906
if ((options & PCRE_UTF8) != 0)
908
while((*cc & 0x80) == 0x80) cc++;
914
branchlength += GET2(cc,1);
918
/* Handle single-char matchers */
927
case OP_NOT_WHITESPACE:
929
case OP_NOT_WORDCHAR:
936
/* The single-byte matcher isn't allowed */
941
/* Check a class for variable quantification */
945
cc += GET(cc, 1) - 33;
963
if (GET2(cc,1) != GET2(cc,3)) return -1;
964
branchlength += GET2(cc,1);
973
/* Anything else is variable length */
979
/* Control never gets here */
985
/*************************************************
986
* Scan compiled regex for numbered bracket *
987
*************************************************/
989
/* This little function scans through a compiled pattern until it finds a
990
capturing bracket with the given number.
993
code points to start of expression
994
utf8 TRUE in UTF-8 mode
995
number the required bracket number
997
Returns: pointer to the opcode for the bracket, or NULL if not found
1000
static const uschar *
1001
find_bracket(const uschar *code, BOOL utf8, int number)
1003
#ifndef SUPPORT_UTF8
1004
utf8 = utf8; /* Stop pedantic compilers complaining */
1009
register int c = *code;
1010
if (c == OP_END) return NULL;
1011
else if (c > OP_BRA)
1014
if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE);
1015
if (n == number) return (uschar *)code;
1016
code += _pcre_OP_lengths[OP_BRA];
1020
code += _pcre_OP_lengths[c];
1024
/* In UTF-8 mode, opcodes that are followed by a character may be followed
1025
by a multi-byte character. The length in the table is a minimum, so we have
1026
to scan along to skip the extra bytes. All opcodes are less than 128, so we
1027
can use relatively efficient code. */
1042
while ((*code & 0xc0) == 0x80) code++;
1045
/* XCLASS is used for classes that cannot be represented just by a bit
1046
map. This includes negated single high-valued characters. The length in
1047
the table is zero; the actual length is stored in the compiled code. */
1050
code += GET(code, 1) + 1;
1060
/*************************************************
1061
* Scan compiled regex for recursion reference *
1062
*************************************************/
1064
/* This little function scans through a compiled pattern until it finds an
1065
instance of OP_RECURSE.
1068
code points to start of expression
1069
utf8 TRUE in UTF-8 mode
1071
Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1074
static const uschar *
1075
find_recurse(const uschar *code, BOOL utf8)
1077
#ifndef SUPPORT_UTF8
1078
utf8 = utf8; /* Stop pedantic compilers complaining */
1083
register int c = *code;
1084
if (c == OP_END) return NULL;
1085
else if (c == OP_RECURSE) return code;
1086
else if (c > OP_BRA)
1088
code += _pcre_OP_lengths[OP_BRA];
1092
code += _pcre_OP_lengths[c];
1096
/* In UTF-8 mode, opcodes that are followed by a character may be followed
1097
by a multi-byte character. The length in the table is a minimum, so we have
1098
to scan along to skip the extra bytes. All opcodes are less than 128, so we
1099
can use relatively efficient code. */
1114
while ((*code & 0xc0) == 0x80) code++;
1117
/* XCLASS is used for classes that cannot be represented just by a bit
1118
map. This includes negated single high-valued characters. The length in
1119
the table is zero; the actual length is stored in the compiled code. */
1122
code += GET(code, 1) + 1;
1132
/*************************************************
1133
* Scan compiled branch for non-emptiness *
1134
*************************************************/
1136
/* This function scans through a branch of a compiled pattern to see whether it
1137
can match the empty string or not. It is called only from could_be_empty()
1138
below. Note that first_significant_code() skips over assertions. If we hit an
1139
unclosed bracket, we return "empty" - this means we've struck an inner bracket
1140
whose current branch will already have been scanned.
1143
code points to start of search
1144
endcode points to where to stop
1145
utf8 TRUE if in UTF8 mode
1147
Returns: TRUE if what is matched could be empty
1151
could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1154
for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0, TRUE);
1156
code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1158
const uschar *ccode;
1165
if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1167
/* Scan a closed bracket */
1169
empty_branch = FALSE;
1172
if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1173
empty_branch = TRUE;
1174
code += GET(code, 1);
1176
while (*code == OP_ALT);
1177
if (!empty_branch) return FALSE; /* All branches are non-empty */
1178
code += 1 + LINK_SIZE;
1184
/* Check for quantifiers after a class */
1188
ccode = code + GET(code, 1);
1189
goto CHECK_CLASS_REPEAT;
1202
case OP_CRSTAR: /* These could be empty; continue */
1208
default: /* Non-repeat => class must match */
1209
case OP_CRPLUS: /* These repeats aren't empty */
1215
if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1220
/* Opcodes that must match a character */
1227
case OP_NOT_WHITESPACE:
1229
case OP_NOT_WORDCHAR:
1243
case OP_TYPEMINPLUS:
1255
/* In UTF-8 mode, STAR, MINSTAR, QUERY, MINQUERY, UPTO, and MINUPTO may be
1256
followed by a multibyte character */
1265
if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1276
/*************************************************
1277
* Scan compiled regex for non-emptiness *
1278
*************************************************/
1280
/* This function is called to check for left recursive calls. We want to check
1281
the current branch of the current pattern to see if it could match the empty
1282
string. If it could, we must look outwards for branches at other levels,
1283
stopping when we pass beyond the bracket which is the subject of the recursion.
1286
code points to start of the recursion
1287
endcode points to where to stop (current RECURSE item)
1288
bcptr points to the chain of current (unclosed) branch starts
1289
utf8 TRUE if in UTF-8 mode
1291
Returns: TRUE if what is matched could be empty
1295
could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1298
while (bcptr != NULL && bcptr->current >= code)
1300
if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1301
bcptr = bcptr->outer;
1308
/*************************************************
1309
* Check for POSIX class syntax *
1310
*************************************************/
1312
/* This function is called when the sequence "[:" or "[." or "[=" is
1313
encountered in a character class. It checks whether this is followed by an
1314
optional ^ and then a sequence of letters, terminated by a matching ":]" or
1318
ptr pointer to the initial [
1319
endptr where to return the end pointer
1320
cd pointer to compile data
1322
Returns: TRUE or FALSE
1326
check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1328
int terminator; /* Don't combine these lines; the Solaris cc */
1329
terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1330
if (*(++ptr) == '^') ptr++;
1331
while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1332
if (*ptr == terminator && ptr[1] == ']')
1343
/*************************************************
1344
* Check POSIX class name *
1345
*************************************************/
1347
/* This function is called to check the name given in a POSIX-style class entry
1351
ptr points to the first letter
1352
len the length of the name
1354
Returns: a value representing the name, or -1 if unknown
1358
check_posix_name(const uschar *ptr, int len)
1360
register int yield = 0;
1361
while (posix_name_lengths[yield] != 0)
1363
if (len == posix_name_lengths[yield] &&
1364
strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1371
/*************************************************
1372
* Adjust OP_RECURSE items in repeated group *
1373
*************************************************/
1375
/* OP_RECURSE items contain an offset from the start of the regex to the group
1376
that is referenced. This means that groups can be replicated for fixed
1377
repetition simply by copying (because the recursion is allowed to refer to
1378
earlier groups that are outside the current group). However, when a group is
1379
optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1380
it, after it has been compiled. This means that any OP_RECURSE items within it
1381
that refer to the group itself or any contained groups have to have their
1382
offsets adjusted. That is the job of this function. Before it is called, the
1383
partially compiled regex must be temporarily terminated with OP_END.
1386
group points to the start of the group
1387
adjust the amount by which the group is to be moved
1388
utf8 TRUE in UTF-8 mode
1389
cd contains pointers to tables etc.
1395
adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd)
1397
uschar *ptr = group;
1398
while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1400
int offset = GET(ptr, 1);
1401
if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1402
ptr += 1 + LINK_SIZE;
1408
/*************************************************
1409
* Insert an automatic callout point *
1410
*************************************************/
1412
/* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1413
callout points before each pattern item.
1416
code current code pointer
1417
ptr current pattern pointer
1418
cd pointers to tables etc
1420
Returns: new code pointer
1424
auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1426
*code++ = OP_CALLOUT;
1428
PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1429
PUT(code, LINK_SIZE, 0); /* Default length */
1430
return code + 2*LINK_SIZE;
1435
/*************************************************
1436
* Complete a callout item *
1437
*************************************************/
1439
/* A callout item contains the length of the next item in the pattern, which
1440
we can't fill in till after we have reached the relevant point. This is used
1441
for both automatic and manual callouts.
1444
previous_callout points to previous callout item
1445
ptr current pattern pointer
1446
cd pointers to tables etc
1452
complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1454
int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1455
PUT(previous_callout, 2 + LINK_SIZE, length);
1461
/*************************************************
1462
* Get othercase range *
1463
*************************************************/
1465
/* This function is passed the start and end of a class range, in UTF-8 mode
1466
with UCP support. It searches up the characters, looking for internal ranges of
1467
characters in the "other" case. Each call returns the next one, updating the
1471
cptr points to starting character value; updated
1473
ocptr where to put start of othercase range
1474
odptr where to put end of othercase range
1476
Yield: TRUE when range returned; FALSE when no more
1480
get_othercase_range(int *cptr, int d, int *ocptr, int *odptr)
1482
int c, chartype, othercase, next;
1484
for (c = *cptr; c <= d; c++)
1486
if (_pcre_ucp_findchar(c, &chartype, &othercase) == ucp_L && othercase != 0)
1490
if (c > d) return FALSE;
1493
next = othercase + 1;
1495
for (++c; c <= d; c++)
1497
if (_pcre_ucp_findchar(c, &chartype, &othercase) != ucp_L ||
1508
#endif /* SUPPORT_UCP */
1511
/*************************************************
1512
* Compile one branch *
1513
*************************************************/
1515
/* Scan the pattern, compiling it into the code vector. If the options are
1516
changed during the branch, the pointer is used to change the external options
1520
optionsptr pointer to the option bits
1521
brackets points to number of extracting brackets used
1522
codeptr points to the pointer to the current code point
1523
ptrptr points to the current pattern pointer
1524
errorcodeptr points to error code variable
1525
firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
1526
reqbyteptr set to the last literal character required, else < 0
1527
bcptr points to current branch chain
1528
cd contains pointers to tables etc.
1530
Returns: TRUE on success
1531
FALSE, with *errorcodeptr set non-zero on error
1535
compile_branch(int *optionsptr, int *brackets, uschar **codeptr,
1536
const uschar **ptrptr, int *errorcodeptr, int *firstbyteptr,
1537
int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
1539
int repeat_type, op_type;
1540
int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
1542
int greedy_default, greedy_non_default;
1543
int firstbyte, reqbyte;
1544
int zeroreqbyte, zerofirstbyte;
1545
int req_caseopt, reqvary, tempreqvary;
1547
int options = *optionsptr;
1548
int after_manual_callout = 0;
1550
register uschar *code = *codeptr;
1552
BOOL inescq = FALSE;
1553
BOOL groupsetfirstbyte = FALSE;
1554
const uschar *ptr = *ptrptr;
1555
const uschar *tempptr;
1556
uschar *previous = NULL;
1557
uschar *previous_callout = NULL;
1558
uschar classbits[32];
1562
BOOL utf8 = (options & PCRE_UTF8) != 0;
1563
uschar *class_utf8data;
1564
uschar utf8_char[6];
1569
/* Set up the default and non-default settings for greediness */
1571
greedy_default = ((options & PCRE_UNGREEDY) != 0);
1572
greedy_non_default = greedy_default ^ 1;
1574
/* Initialize no first byte, no required byte. REQ_UNSET means "no char
1575
matching encountered yet". It gets changed to REQ_NONE if we hit something that
1576
matches a non-fixed char first char; reqbyte just remains unset if we never
1579
When we hit a repeat whose minimum is zero, we may have to adjust these values
1580
to take the zero repeat into account. This is implemented by setting them to
1581
zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
1582
item types that can be repeated set these backoff variables appropriately. */
1584
firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
1586
/* The variable req_caseopt contains either the REQ_CASELESS value or zero,
1587
according to the current setting of the caseless flag. REQ_CASELESS is a bit
1588
value > 255. It is added into the firstbyte or reqbyte variables to record the
1589
case status of the value. This is used only for ASCII characters. */
1591
req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
1593
/* Switch on next character until the end of the branch */
1598
BOOL possessive_quantifier;
1600
int class_charcount;
1610
/* Next byte in the pattern */
1614
/* If in \Q...\E, check for the end; if not, we have a literal */
1616
if (inescq && c != 0)
1618
if (c == '\\' && ptr[1] == 'E')
1626
if (previous_callout != NULL)
1628
complete_callout(previous_callout, ptr, cd);
1629
previous_callout = NULL;
1631
if ((options & PCRE_AUTO_CALLOUT) != 0)
1633
previous_callout = code;
1634
code = auto_callout(code, ptr, cd);
1640
/* Fill in length of a previous callout, except when the next thing is
1643
is_quantifier = c == '*' || c == '+' || c == '?' ||
1644
(c == '{' && is_counted_repeat(ptr+1));
1646
if (!is_quantifier && previous_callout != NULL &&
1647
after_manual_callout-- <= 0)
1649
complete_callout(previous_callout, ptr, cd);
1650
previous_callout = NULL;
1653
/* In extended mode, skip white space and comments */
1655
if ((options & PCRE_EXTENDED) != 0)
1657
if ((cd->ctypes[c] & ctype_space) != 0) continue;
1660
/* The space before the ; is to avoid a warning on a silly compiler
1661
on the Macintosh. */
1662
while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
1663
if (c != 0) continue; /* Else fall through to handle end of string */
1667
/* No auto callout for quantifiers. */
1669
if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
1671
previous_callout = code;
1672
code = auto_callout(code, ptr, cd);
1677
/* The branch terminates at end of string, |, or ). */
1682
*firstbyteptr = firstbyte;
1683
*reqbyteptr = reqbyte;
1688
/* Handle single-character metacharacters. In multiline mode, ^ disables
1689
the setting of any following char as a first character. */
1692
if ((options & PCRE_MULTILINE) != 0)
1694
if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1705
/* There can never be a first char if '.' is first, whatever happens about
1706
repeats. The value of reqbyte doesn't change either. */
1709
if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1710
zerofirstbyte = firstbyte;
1711
zeroreqbyte = reqbyte;
1716
/* Character classes. If the included characters are all < 255 in value, we
1717
build a 32-byte bitmap of the permitted characters, except in the special
1718
case where there is only one such character. For negated classes, we build
1719
the map as usual, then invert it at the end. However, we use a different
1720
opcode so that data characters > 255 can be handled correctly.
1722
If the class contains characters outside the 0-255 range, a different
1723
opcode is compiled. It may optionally have a bit map for characters < 256,
1724
but those above are explicitly listed afterwards. A flag byte tells
1725
whether the bitmap is present, and whether this is a negated class or not.
1731
/* PCRE supports POSIX class stuff inside a class. Perl gives an error if
1732
they are encountered at the top level, so we'll do that too. */
1734
if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1735
check_posix_syntax(ptr, &tempptr, cd))
1737
*errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
1741
/* If the first character is '^', set the negation flag and skip it. */
1743
if ((c = *(++ptr)) == '^')
1745
negate_class = TRUE;
1750
negate_class = FALSE;
1753
/* Keep a count of chars with values < 256 so that we can optimize the case
1754
of just a single character (as long as it's < 256). For higher valued UTF-8
1755
characters, we don't yet do any optimization. */
1757
class_charcount = 0;
1758
class_lastchar = -1;
1761
class_utf8 = FALSE; /* No chars >= 256 */
1762
class_utf8data = code + LINK_SIZE + 34; /* For UTF-8 items */
1765
/* Initialize the 32-char bit map to all zeros. We have to build the
1766
map in a temporary bit of store, in case the class contains only 1
1767
character (< 256), because in that case the compiled code doesn't use the
1770
memset(classbits, 0, 32 * sizeof(uschar));
1772
/* Process characters until ] is reached. By writing this as a "do" it
1773
means that an initial ] is taken as a data character. The first pass
1774
through the regex checked the overall syntax, so we don't need to be very
1775
strict here. At the start of the loop, c contains the first byte of the
1781
if (utf8 && c > 127)
1782
{ /* Braces are required because the */
1783
GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
1787
/* Inside \Q...\E everything is literal except \E */
1791
if (c == '\\' && ptr[1] == 'E')
1797
else goto LONE_SINGLE_CHARACTER;
1800
/* Handle POSIX class names. Perl allows a negation extension of the
1801
form [:^name:]. A square bracket that doesn't match the syntax is
1802
treated as a literal. We also recognize the POSIX constructions
1803
[.ch.] and [=ch=] ("collating elements") and fault them, as Perl
1807
(ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1808
check_posix_syntax(ptr, &tempptr, cd))
1810
BOOL local_negate = FALSE;
1812
register const uschar *cbits = cd->cbits;
1816
*errorcodeptr = ERR31;
1823
local_negate = TRUE;
1827
posix_class = check_posix_name(ptr, tempptr - ptr);
1828
if (posix_class < 0)
1830
*errorcodeptr = ERR30;
1834
/* If matching is caseless, upper and lower are converted to
1835
alpha. This relies on the fact that the class table starts with
1836
alpha, lower, upper as the first 3 entries. */
1838
if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
1841
/* Or into the map we are building up to 3 of the static class
1842
tables, or their negations. The [:blank:] class sets up the same
1843
chars as the [:space:] class (all white space). We remove the vertical
1844
white space chars afterwards. */
1847
for (i = 0; i < 3; i++)
1849
BOOL blankclass = strncmp((char *)ptr, "blank", 5) == 0;
1850
int taboffset = posix_class_maps[posix_class + i];
1851
if (taboffset < 0) break;
1855
for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+taboffset];
1857
for (c = 0; c < 32; c++) classbits[c] &= ~cbits[c+taboffset];
1858
if (blankclass) classbits[1] |= 0x3c;
1862
for (c = 0; c < 32; c++) classbits[c] |= cbits[c+taboffset];
1863
if (blankclass) classbits[1] &= ~0x3c;
1868
class_charcount = 10; /* Set > 1; assumes more than 1 per class */
1869
continue; /* End of POSIX syntax handling */
1872
/* Backslash may introduce a single character, or it may introduce one
1873
of the specials, which just set a flag. Escaped items are checked for
1874
validity in the pre-compiling pass. The sequence \b is a special case.
1875
Inside a class (and only there) it is treated as backspace. Elsewhere
1876
it marks a word boundary. Other escapes have preset maps ready to
1877
or into the one we are building. We assume they have more than one
1878
character in them, so set class_charcount bigger than one. */
1882
c = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);
1884
if (-c == ESC_b) c = '\b'; /* \b is backspace in a class */
1885
else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
1886
else if (-c == ESC_Q) /* Handle start of quoted string */
1888
if (ptr[1] == '\\' && ptr[2] == 'E')
1890
ptr += 2; /* avoid empty string */
1898
register const uschar *cbits = cd->cbits;
1899
class_charcount += 2; /* Greater than 1 is what matters */
1903
for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
1907
for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
1911
for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
1915
for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
1919
for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
1920
classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
1924
for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
1925
classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
1933
int property = get_ucp(&ptr, &negated, errorcodeptr);
1934
if (property < 0) goto FAILED;
1936
*class_utf8data++ = ((-c == ESC_p) != negated)?
1937
XCL_PROP : XCL_NOTPROP;
1938
*class_utf8data++ = property;
1939
class_charcount -= 2; /* Not a < 256 character */
1944
/* Unrecognized escapes are faulted if PCRE is running in its
1945
strict mode. By default, for compatibility with Perl, they are
1946
treated as literals. */
1949
if ((options & PCRE_EXTRA) != 0)
1951
*errorcodeptr = ERR7;
1954
c = *ptr; /* The final character */
1955
class_charcount -= 2; /* Undo the default count from above */
1959
/* Fall through if we have a single character (c >= 0). This may be
1960
> 256 in UTF-8 mode. */
1962
} /* End of backslash handling */
1964
/* A single character may be followed by '-' to form a range. However,
1965
Perl does not permit ']' to be the end of the range. A '-' character
1966
here is treated as a literal. */
1968
if (ptr[1] == '-' && ptr[2] != ']')
1975
{ /* Braces are required because the */
1976
GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
1980
d = *ptr; /* Not UTF-8 mode */
1982
/* The second part of a range can be a single-character escape, but
1983
not any of the other escapes. Perl 5.6 treats a hyphen as a literal
1984
in such circumstances. */
1988
const uschar *oldptr = ptr;
1989
d = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);
1991
/* \b is backspace; \X is literal X; any other special means the '-'
1996
if (d == -ESC_b) d = '\b';
1997
else if (d == -ESC_X) d = 'X'; else
2000
goto LONE_SINGLE_CHARACTER; /* A few lines below */
2005
/* The check that the two values are in the correct order happens in
2006
the pre-pass. Optimize one-character ranges */
2008
if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
2010
/* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
2011
matching, we have to use an XCLASS with extra data items. Caseless
2012
matching for characters > 127 is available only if UCP support is
2016
if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
2020
/* With UCP support, we can find the other case equivalents of
2021
the relevant characters. There may be several ranges. Optimize how
2022
they fit with the basic range. */
2025
if ((options & PCRE_CASELESS) != 0)
2030
while (get_othercase_range(&cc, origd, &occ, &ocd))
2032
if (occ >= c && ocd <= d) continue; /* Skip embedded ranges */
2034
if (occ < c && ocd >= c - 1) /* Extend the basic range */
2035
{ /* if there is overlap, */
2036
c = occ; /* noting that if occ < c */
2037
continue; /* we can't have ocd > d */
2038
} /* because a subrange is */
2039
if (ocd > d && occ <= d + 1) /* always shorter than */
2040
{ /* the basic range. */
2047
*class_utf8data++ = XCL_SINGLE;
2051
*class_utf8data++ = XCL_RANGE;
2052
class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
2054
class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
2057
#endif /* SUPPORT_UCP */
2059
/* Now record the original range, possibly modified for UCP caseless
2060
overlapping ranges. */
2062
*class_utf8data++ = XCL_RANGE;
2063
class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2064
class_utf8data += _pcre_ord2utf8(d, class_utf8data);
2066
/* With UCP support, we are done. Without UCP support, there is no
2067
caseless matching for UTF-8 characters > 127; we can use the bit map
2068
for the smaller ones. */
2071
continue; /* With next character in the class */
2073
if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
2075
/* Adjust upper limit and fall through to set up the map */
2079
#endif /* SUPPORT_UCP */
2081
#endif /* SUPPORT_UTF8 */
2083
/* We use the bit map for all cases when not in UTF-8 mode; else
2084
ranges that lie entirely within 0-127 when there is UCP support; else
2085
for partial ranges without UCP support. */
2089
classbits[c/8] |= (1 << (c&7));
2090
if ((options & PCRE_CASELESS) != 0)
2092
int uc = cd->fcc[c]; /* flip case */
2093
classbits[uc/8] |= (1 << (uc&7));
2095
class_charcount++; /* in case a one-char range */
2099
continue; /* Go get the next char in the class */
2102
/* Handle a lone single character - we can get here for a normal
2103
non-escape char, or after \ that introduces a single character or for an
2104
apparent range that isn't. */
2106
LONE_SINGLE_CHARACTER:
2108
/* Handle a character that cannot go in the bit map */
2111
if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
2114
*class_utf8data++ = XCL_SINGLE;
2115
class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2118
if ((options & PCRE_CASELESS) != 0)
2122
if (_pcre_ucp_findchar(c, &chartype, &othercase) >= 0 &&
2125
*class_utf8data++ = XCL_SINGLE;
2126
class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
2129
#endif /* SUPPORT_UCP */
2133
#endif /* SUPPORT_UTF8 */
2135
/* Handle a single-byte character */
2137
classbits[c/8] |= (1 << (c&7));
2138
if ((options & PCRE_CASELESS) != 0)
2140
c = cd->fcc[c]; /* flip case */
2141
classbits[c/8] |= (1 << (c&7));
2148
/* Loop until ']' reached; the check for end of string happens inside the
2149
loop. This "while" is the end of the "do" above. */
2151
while ((c = *(++ptr)) != ']' || inescq);
2153
/* If class_charcount is 1, we saw precisely one character whose value is
2154
less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
2155
can optimize the negative case only if there were no characters >= 128
2156
because OP_NOT and the related opcodes like OP_NOTSTAR operate on
2157
single-bytes only. This is an historical hangover. Maybe one day we can
2158
tidy these opcodes to handle multi-byte characters.
2160
The optimization throws away the bit map. We turn the item into a
2161
1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
2162
that OP_NOT does not support multibyte characters. In the positive case, it
2163
can cause firstbyte to be set. Otherwise, there can be no first char if
2164
this item is first, whatever repeat count may follow. In the case of
2165
reqbyte, save the previous value for reinstating. */
2168
if (class_charcount == 1 &&
2170
(!class_utf8 && (!negate_class || class_lastchar < 128))))
2173
if (class_charcount == 1)
2176
zeroreqbyte = reqbyte;
2178
/* The OP_NOT opcode works on one-byte characters only. */
2182
if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2183
zerofirstbyte = firstbyte;
2185
*code++ = class_lastchar;
2189
/* For a single, positive character, get the value into mcbuffer, and
2190
then we can handle this with the normal one-character code. */
2193
if (utf8 && class_lastchar > 127)
2194
mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
2198
mcbuffer[0] = class_lastchar;
2202
} /* End of 1-char optimization */
2204
/* The general case - not the one-char optimization. If this is the first
2205
thing in the branch, there can be no first char setting, whatever the
2206
repeat count. Any reqbyte setting must remain unchanged after any kind of
2209
if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2210
zerofirstbyte = firstbyte;
2211
zeroreqbyte = reqbyte;
2213
/* If there are characters with values > 255, we have to compile an
2214
extended class, with its own opcode. If there are no characters < 256,
2215
we can omit the bitmap. */
2220
*class_utf8data++ = XCL_END; /* Marks the end of extra data */
2221
*code++ = OP_XCLASS;
2223
*code = negate_class? XCL_NOT : 0;
2225
/* If the map is required, install it, and move on to the end of
2228
if (class_charcount > 0)
2231
memcpy(code, classbits, 32);
2232
code = class_utf8data;
2235
/* If the map is not required, slide down the extra data. */
2239
int len = class_utf8data - (code + 33);
2240
memmove(code + 1, code + 33, len);
2244
/* Now fill in the complete length of the item */
2246
PUT(previous, 1, code - previous);
2247
break; /* End of class handling */
2251
/* If there are no characters > 255, negate the 32-byte map if necessary,
2252
and copy it into the code vector. If this is the first thing in the branch,
2253
there can be no first char setting, whatever the repeat count. Any reqbyte
2254
setting must remain unchanged after any kind of repeat. */
2258
*code++ = OP_NCLASS;
2259
for (c = 0; c < 32; c++) code[c] = ~classbits[c];
2264
memcpy(code, classbits, 32);
2269
/* Various kinds of repeat; '{' is not necessarily a quantifier, but this
2270
has been tested above. */
2273
if (!is_quantifier) goto NORMAL_CHAR;
2274
ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
2275
if (*errorcodeptr != 0) goto FAILED;
2293
if (previous == NULL)
2295
*errorcodeptr = ERR9;
2299
if (repeat_min == 0)
2301
firstbyte = zerofirstbyte; /* Adjust for zero repeat */
2302
reqbyte = zeroreqbyte; /* Ditto */
2305
/* Remember whether this is a variable length repeat */
2307
reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
2309
op_type = 0; /* Default single-char op codes */
2310
possessive_quantifier = FALSE; /* Default not possessive quantifier */
2312
/* Save start of previous item, in case we have to move it up to make space
2313
for an inserted OP_ONCE for the additional '+' extension. */
2315
tempcode = previous;
2317
/* If the next character is '+', we have a possessive quantifier. This
2318
implies greediness, whatever the setting of the PCRE_UNGREEDY option.
2319
If the next character is '?' this is a minimizing repeat, by default,
2320
but if PCRE_UNGREEDY is set, it works the other way round. We change the
2321
repeat type to the non-default. */
2325
repeat_type = 0; /* Force greedy */
2326
possessive_quantifier = TRUE;
2329
else if (ptr[1] == '?')
2331
repeat_type = greedy_non_default;
2334
else repeat_type = greedy_default;
2336
/* If previous was a recursion, we need to wrap it inside brackets so that
2337
it can be replicated if necessary. */
2339
if (*previous == OP_RECURSE)
2341
memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);
2342
code += 1 + LINK_SIZE;
2344
PUT(previous, 1, code - previous);
2346
PUT(code, 1, code - previous);
2347
code += 1 + LINK_SIZE;
2350
/* If previous was a character match, abolish the item and generate a
2351
repeat item instead. If a char item has a minimum of more than one, ensure
2352
that it is set in reqbyte - it might not be if a sequence such as x{3} is
2353
the first thing in a branch because the x will have gone into firstbyte
2356
if (*previous == OP_CHAR || *previous == OP_CHARNC)
2358
/* Deal with UTF-8 characters that take up more than one byte. It's
2359
easier to write this out separately than try to macrify it. Use c to
2360
hold the length of the character in bytes, plus 0x80 to flag that it's a
2361
length rather than a small character. */
2364
if (utf8 && (code[-1] & 0x80) != 0)
2366
uschar *lastchar = code - 1;
2367
while((*lastchar & 0xc0) == 0x80) lastchar--;
2368
c = code - lastchar; /* Length of UTF-8 character */
2369
memcpy(utf8_char, lastchar, c); /* Save the char */
2370
c |= 0x80; /* Flag c as a length */
2375
/* Handle the case of a single byte - either with no UTF8 support, or
2376
with UTF-8 disabled, or for a UTF-8 character < 128. */
2380
if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
2383
goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
2386
/* If previous was a single negated character ([^a] or similar), we use
2387
one of the special opcodes, replacing it. The code is shared with single-
2388
character repeats by setting op_type to add a suitable offset into
2389
repeat_type. OP_NOT is currently used only for single-byte chars. */
2391
else if (*previous == OP_NOT)
2393
op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
2395
goto OUTPUT_SINGLE_REPEAT;
2398
/* If previous was a character type match (\d or similar), abolish it and
2399
create a suitable repeat item. The code is shared with single-character
2400
repeats by setting op_type to add a suitable offset into repeat_type. Note
2401
that the Unicode property types will be present only when SUPPORT_UCP is
2402
defined, but we don't wrap the little bits of code here because it just
2403
makes it horribly messy. */
2405
else if (*previous < OP_EODN)
2409
op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
2412
OUTPUT_SINGLE_REPEAT:
2413
prop_type = (*previous == OP_PROP || *previous == OP_NOTPROP)?
2417
code = previous; /* Usually overwrite previous item */
2419
/* If the maximum is zero then the minimum must also be zero; Perl allows
2420
this case, so we do too - by simply omitting the item altogether. */
2422
if (repeat_max == 0) goto END_REPEAT;
2424
/* All real repeats make it impossible to handle partial matching (maybe
2425
one day we will be able to remove this restriction). */
2427
if (repeat_max != 1) cd->nopartial = TRUE;
2429
/* Combine the op_type with the repeat_type */
2431
repeat_type += op_type;
2433
/* A minimum of zero is handled either as the special case * or ?, or as
2434
an UPTO, with the maximum given. */
2436
if (repeat_min == 0)
2438
if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
2439
else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
2442
*code++ = OP_UPTO + repeat_type;
2443
PUT2INC(code, 0, repeat_max);
2447
/* A repeat minimum of 1 is optimized into some special cases. If the
2448
maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
2449
left in place and, if the maximum is greater than 1, we use OP_UPTO with
2450
one less than the maximum. */
2452
else if (repeat_min == 1)
2454
if (repeat_max == -1)
2455
*code++ = OP_PLUS + repeat_type;
2458
code = oldcode; /* leave previous item in place */
2459
if (repeat_max == 1) goto END_REPEAT;
2460
*code++ = OP_UPTO + repeat_type;
2461
PUT2INC(code, 0, repeat_max - 1);
2465
/* The case {n,n} is just an EXACT, while the general case {n,m} is
2466
handled as an EXACT followed by an UPTO. */
2470
*code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
2471
PUT2INC(code, 0, repeat_min);
2473
/* If the maximum is unlimited, insert an OP_STAR. Before doing so,
2474
we have to insert the character for the previous code. For a repeated
2475
Unicode property match, there is an extra byte that defines the
2476
required property. In UTF-8 mode, long characters have their length in
2477
c, with the 0x80 bit as a flag. */
2482
if (utf8 && c >= 128)
2484
memcpy(code, utf8_char, c & 7);
2491
if (prop_type >= 0) *code++ = prop_type;
2493
*code++ = OP_STAR + repeat_type;
2496
/* Else insert an UPTO if the max is greater than the min, again
2497
preceded by the character, for the previously inserted code. */
2499
else if (repeat_max != repeat_min)
2502
if (utf8 && c >= 128)
2504
memcpy(code, utf8_char, c & 7);
2510
if (prop_type >= 0) *code++ = prop_type;
2511
repeat_max -= repeat_min;
2512
*code++ = OP_UPTO + repeat_type;
2513
PUT2INC(code, 0, repeat_max);
2517
/* The character or character type itself comes last in all cases. */
2520
if (utf8 && c >= 128)
2522
memcpy(code, utf8_char, c & 7);
2529
/* For a repeated Unicode property match, there is an extra byte that
2530
defines the required property. */
2533
if (prop_type >= 0) *code++ = prop_type;
2537
/* If previous was a character class or a back reference, we put the repeat
2538
stuff after it, but just skip the item if the repeat was {0,0}. */
2540
else if (*previous == OP_CLASS ||
2541
*previous == OP_NCLASS ||
2543
*previous == OP_XCLASS ||
2545
*previous == OP_REF)
2547
if (repeat_max == 0)
2553
/* All real repeats make it impossible to handle partial matching (maybe
2554
one day we will be able to remove this restriction). */
2556
if (repeat_max != 1) cd->nopartial = TRUE;
2558
if (repeat_min == 0 && repeat_max == -1)
2559
*code++ = OP_CRSTAR + repeat_type;
2560
else if (repeat_min == 1 && repeat_max == -1)
2561
*code++ = OP_CRPLUS + repeat_type;
2562
else if (repeat_min == 0 && repeat_max == 1)
2563
*code++ = OP_CRQUERY + repeat_type;
2566
*code++ = OP_CRRANGE + repeat_type;
2567
PUT2INC(code, 0, repeat_min);
2568
if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
2569
PUT2INC(code, 0, repeat_max);
2573
/* If previous was a bracket group, we may have to replicate it in certain
2576
else if (*previous >= OP_BRA || *previous == OP_ONCE ||
2577
*previous == OP_COND)
2581
int len = code - previous;
2582
uschar *bralink = NULL;
2584
/* If the maximum repeat count is unlimited, find the end of the bracket
2585
by scanning through from the start, and compute the offset back to it
2586
from the current code pointer. There may be an OP_OPT setting following
2587
the final KET, so we can't find the end just by going back from the code
2590
if (repeat_max == -1)
2592
register uschar *ket = previous;
2593
do ket += GET(ket, 1); while (*ket != OP_KET);
2594
ketoffset = code - ket;
2597
/* The case of a zero minimum is special because of the need to stick
2598
OP_BRAZERO in front of it, and because the group appears once in the
2599
data, whereas in other cases it appears the minimum number of times. For
2600
this reason, it is simplest to treat this case separately, as otherwise
2601
the code gets far too messy. There are several special subcases when the
2604
if (repeat_min == 0)
2606
/* If the maximum is also zero, we just omit the group from the output
2609
if (repeat_max == 0)
2615
/* If the maximum is 1 or unlimited, we just have to stick in the
2616
BRAZERO and do no more at this point. However, we do need to adjust
2617
any OP_RECURSE calls inside the group that refer to the group itself or
2618
any internal group, because the offset is from the start of the whole
2619
regex. Temporarily terminate the pattern while doing this. */
2621
if (repeat_max <= 1)
2624
adjust_recurse(previous, 1, utf8, cd);
2625
memmove(previous+1, previous, len);
2627
*previous++ = OP_BRAZERO + repeat_type;
2630
/* If the maximum is greater than 1 and limited, we have to replicate
2631
in a nested fashion, sticking OP_BRAZERO before each set of brackets.
2632
The first one has to be handled carefully because it's the original
2633
copy, which has to be moved up. The remainder can be handled by code
2634
that is common with the non-zero minimum case below. We have to
2635
adjust the value of repeat_max, since one less copy is required. Once
2636
again, we may have to adjust any OP_RECURSE calls inside the group. */
2642
adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd);
2643
memmove(previous + 2 + LINK_SIZE, previous, len);
2644
code += 2 + LINK_SIZE;
2645
*previous++ = OP_BRAZERO + repeat_type;
2646
*previous++ = OP_BRA;
2648
/* We chain together the bracket offset fields that have to be
2649
filled in later when the ends of the brackets are reached. */
2651
offset = (bralink == NULL)? 0 : previous - bralink;
2653
PUTINC(previous, 0, offset);
2659
/* If the minimum is greater than zero, replicate the group as many
2660
times as necessary, and adjust the maximum to the number of subsequent
2661
copies that we need. If we set a first char from the group, and didn't
2662
set a required char, copy the latter from the former. */
2668
if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
2669
for (i = 1; i < repeat_min; i++)
2671
memcpy(code, previous, len);
2675
if (repeat_max > 0) repeat_max -= repeat_min;
2678
/* This code is common to both the zero and non-zero minimum cases. If
2679
the maximum is limited, it replicates the group in a nested fashion,
2680
remembering the bracket starts on a stack. In the case of a zero minimum,
2681
the first one was set up above. In all cases the repeat_max now specifies
2682
the number of additional copies needed. */
2684
if (repeat_max >= 0)
2686
for (i = repeat_max - 1; i >= 0; i--)
2688
*code++ = OP_BRAZERO + repeat_type;
2690
/* All but the final copy start a new nesting, maintaining the
2691
chain of brackets outstanding. */
2697
offset = (bralink == NULL)? 0 : code - bralink;
2699
PUTINC(code, 0, offset);
2702
memcpy(code, previous, len);
2706
/* Now chain through the pending brackets, and fill in their length
2707
fields (which are holding the chain links pro tem). */
2709
while (bralink != NULL)
2712
int offset = code - bralink + 1;
2713
uschar *bra = code - offset;
2714
oldlinkoffset = GET(bra, 1);
2715
bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
2717
PUTINC(code, 0, offset);
2718
PUT(bra, 1, offset);
2722
/* If the maximum is unlimited, set a repeater in the final copy. We
2723
can't just offset backwards from the current code point, because we
2724
don't know if there's been an options resetting after the ket. The
2725
correct offset was computed above. */
2727
else code[-ketoffset] = OP_KETRMAX + repeat_type;
2730
/* Else there's some kind of shambles */
2734
*errorcodeptr = ERR11;
2738
/* If the character following a repeat is '+', we wrap the entire repeated
2739
item inside OP_ONCE brackets. This is just syntactic sugar, taken from
2740
Sun's Java package. The repeated item starts at tempcode, not at previous,
2741
which might be the first part of a string whose (former) last char we
2742
repeated. However, we don't support '+' after a greediness '?'. */
2744
if (possessive_quantifier)
2746
int len = code - tempcode;
2747
memmove(tempcode + 1+LINK_SIZE, tempcode, len);
2748
code += 1 + LINK_SIZE;
2749
len += 1 + LINK_SIZE;
2750
tempcode[0] = OP_ONCE;
2752
PUTINC(code, 0, len);
2753
PUT(tempcode, 1, len);
2756
/* In all cases we no longer have a previous item. We also set the
2757
"follows varying string" flag for subsequently encountered reqbytes if
2758
it isn't already set and we have just passed a varying length item. */
2762
cd->req_varyopt |= reqvary;
2766
/* Start of nested bracket sub-expression, or comment or lookahead or
2767
lookbehind or option setting or condition. First deal with special things
2768
that can come after a bracket; all are introduced by ?, and the appearance
2769
of any of them means that this is not a referencing group. They were
2770
checked for validity in the first pass over the string, so we don't have to
2771
check for syntax errors here. */
2774
newoptions = options;
2777
if (*(++ptr) == '?')
2784
case '#': /* Comment; skip to ket */
2786
while (*ptr != ')') ptr++;
2789
case ':': /* Non-extracting bracket */
2795
bravalue = OP_COND; /* Conditional group */
2797
/* Condition to test for recursion */
2801
code[1+LINK_SIZE] = OP_CREF;
2802
PUT2(code, 2+LINK_SIZE, CREF_RECURSE);
2807
/* Condition to test for a numbered subpattern match. We know that
2808
if a digit follows ( then there will just be digits until ) because
2809
the syntax was checked in the first pass. */
2811
/* Use bitwise &, not logical &&: only the ctype_digit bit of the digitab
entry is to be tested, as in the other digitab checks elsewhere in this file.
With && any non-zero table entry would have been accepted as a digit. */
else if ((digitab[ptr[1]] & ctype_digit) != 0)
2813
int condref; /* Don't amalgamate; some compilers */
2814
condref = *(++ptr) - '0'; /* grumble at autoincrement in declaration */
2815
while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
2818
*errorcodeptr = ERR35;
2822
code[1+LINK_SIZE] = OP_CREF;
2823
PUT2(code, 2+LINK_SIZE, condref);
2826
/* For conditions that are assertions, we just fall through, having
2827
set bravalue above. */
2830
case '=': /* Positive lookahead */
2831
bravalue = OP_ASSERT;
2835
case '!': /* Negative lookahead */
2836
bravalue = OP_ASSERT_NOT;
2840
case '<': /* Lookbehinds */
2843
case '=': /* Positive lookbehind */
2844
bravalue = OP_ASSERTBACK;
2848
case '!': /* Negative lookbehind */
2849
bravalue = OP_ASSERTBACK_NOT;
2855
case '>': /* One-time brackets */
2860
case 'C': /* Callout - may be followed by digits; */
2861
previous_callout = code; /* Save for later completion */
2862
after_manual_callout = 1; /* Skip one item before completing */
2863
*code++ = OP_CALLOUT; /* Already checked that the terminating */
2864
{ /* closing parenthesis is present. */
2866
while ((digitab[*(++ptr)] & ctype_digit) != 0)
2867
n = n * 10 + *ptr - '0';
2870
*errorcodeptr = ERR38;
2874
PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
2875
PUT(code, LINK_SIZE, 0); /* Default length */
2876
code += 2 * LINK_SIZE;
2881
case 'P': /* Named subpattern handling */
2882
if (*(++ptr) == '<') /* Definition */
2885
uschar *slot = cd->name_table;
2886
const uschar *name; /* Don't amalgamate; some compilers */
2887
name = ++ptr; /* grumble at autoincrement in declaration */
2889
while (*ptr++ != '>');
2890
namelen = ptr - name - 1;
2892
for (i = 0; i < cd->names_found; i++)
2894
int crc = memcmp(name, slot+2, namelen);
2897
if (slot[2+namelen] == 0)
2899
*errorcodeptr = ERR43;
2902
crc = -1; /* Current name is substring */
2906
memmove(slot + cd->name_entry_size, slot,
2907
(cd->names_found - i) * cd->name_entry_size);
2910
slot += cd->name_entry_size;
2913
PUT2(slot, 0, *brackets + 1);
2914
memcpy(slot + 2, name, namelen);
2915
slot[2+namelen] = 0;
2917
goto NUMBERED_GROUP;
2920
if (*ptr == '=' || *ptr == '>') /* Reference or recursion */
2924
const uschar *name = ptr;
2925
uschar *slot = cd->name_table;
2927
while (*ptr != ')') ptr++;
2928
namelen = ptr - name;
2930
for (i = 0; i < cd->names_found; i++)
2932
/* The table entry must end exactly where the reference name ends; comparing
only namelen bytes would let a reference to "ab" match an entry named "abc".
The terminator test is reached only after namelen bytes compared equal, so it
reads within the zero-terminated name stored in the entry. */
if (strncmp((char *)name, (char *)slot+2, namelen) == 0 &&
    slot[2+namelen] == 0) break;
2933
slot += cd->name_entry_size;
2935
if (i >= cd->names_found)
2937
*errorcodeptr = ERR15;
2941
recno = GET2(slot, 0);
2943
if (type == '>') goto HANDLE_RECURSION; /* A few lines below */
2945
/* Back reference */
2949
PUT2INC(code, 0, recno);
2950
cd->backref_map |= (recno < 32)? (1 << recno) : 1;
2951
if (recno > cd->top_backref) cd->top_backref = recno;
2955
/* Should never happen */
2958
case 'R': /* Pattern recursion */
2959
ptr++; /* Same as (?0) */
2962
/* Recursion or "subroutine" call */
2964
case '0': case '1': case '2': case '3': case '4':
2965
case '5': case '6': case '7': case '8': case '9':
2967
const uschar *called;
2969
while((digitab[*ptr] & ctype_digit) != 0)
2970
recno = recno * 10 + *ptr++ - '0';
2972
/* Come here from code above that handles a named recursion */
2978
/* Find the bracket that is being referenced. Temporarily end the
2979
regex in case it doesn't exist. */
2982
called = (recno == 0)?
2983
cd->start_code : find_bracket(cd->start_code, utf8, recno);
2987
*errorcodeptr = ERR15;
2991
/* If the subpattern is still open, this is a recursive call. We
2992
check to see if this is a left recursion that could loop for ever,
2993
and diagnose that case. */
2995
if (GET(called, 1) == 0 && could_be_empty(called, code, bcptr, utf8))
2997
*errorcodeptr = ERR40;
3001
/* Insert the recursion/subroutine item */
3004
PUT(code, 1, called - cd->start_code);
3005
code += 1 + LINK_SIZE;
3009
/* Character after (? not specially recognized */
3011
default: /* Option setting */
3015
while (*ptr != ')' && *ptr != ':')
3019
case '-': optset = &unset; break;
3021
case 'i': *optset |= PCRE_CASELESS; break;
3022
case 'm': *optset |= PCRE_MULTILINE; break;
3023
case 's': *optset |= PCRE_DOTALL; break;
3024
case 'x': *optset |= PCRE_EXTENDED; break;
3025
case 'U': *optset |= PCRE_UNGREEDY; break;
3026
case 'X': *optset |= PCRE_EXTRA; break;
3030
/* Set up the changed option bits, but don't change anything yet. */
3032
newoptions = (options | set) & (~unset);
3034
/* If the options ended with ')' this is not the start of a nested
3035
group with option changes, so the options change at this level. Compile
3036
code to change the ims options if this setting actually changes any of
3037
them. We also pass the new setting back so that it can be put at the
3038
start of any following branches, and when this group ends (if we are in
3039
a group), a resetting item can be compiled.
3041
Note that if this item is right at the start of the pattern, the
3042
options will have been abstracted and made global, so there will be no
3043
change to compile. */
3047
if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
3050
*code++ = newoptions & PCRE_IMS;
3053
/* Change options at this level, and pass them back for use
3054
in subsequent branches. Reset the greedy defaults and the case
3055
value for firstbyte and reqbyte. */
3057
*optionsptr = options = newoptions;
3058
greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
3059
greedy_non_default = greedy_default ^ 1;
3060
req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
3062
previous = NULL; /* This item can't be repeated */
3063
continue; /* It is complete */
3066
/* If the options ended with ':' we are heading into a nested group
3067
with possible change of options. Such groups are non-capturing and are
3068
not assertions of any kind. All we need to do is skip over the ':';
3069
the newoptions value is handled below. */
3076
/* If PCRE_NO_AUTO_CAPTURE is set, all unadorned brackets become
3077
non-capturing and behave like (?:...) brackets */
3079
else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
3084
/* Else we have a referencing group; adjust the opcode. If the bracket
3085
number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and
3086
arrange for the true number to follow later, in an OP_BRANUMBER item. */
3091
if (++(*brackets) > EXTRACT_BASIC_MAX)
3093
bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;
3094
code[1+LINK_SIZE] = OP_BRANUMBER;
3095
PUT2(code, 2+LINK_SIZE, *brackets);
3098
else bravalue = OP_BRA + *brackets;
3101
/* Process nested bracketed re. Assertions may not be repeated, but other
3102
kinds can be. We copy code into a non-register variable in order to be able
3103
to pass its address because some compilers complain otherwise. Pass in a
3104
new setting for the ims options if they have changed. */
3106
previous = (bravalue >= OP_ONCE)? code : NULL;
3109
tempreqvary = cd->req_varyopt; /* Save value before bracket */
3112
newoptions, /* The complete new option state */
3113
options & PCRE_IMS, /* The previous ims option state */
3114
brackets, /* Extracting bracket count */
3115
&tempcode, /* Where to put code (updated) */
3116
&ptr, /* Input pointer (updated) */
3117
errorcodeptr, /* Where to put an error message */
3118
(bravalue == OP_ASSERTBACK ||
3119
bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
3120
skipbytes, /* Skip over OP_COND/OP_BRANUMBER */
3121
&subfirstbyte, /* For possible first char */
3122
&subreqbyte, /* For possible last char */
3123
bcptr, /* Current branch chain */
3124
cd)) /* Tables block */
3127
/* At the end of compiling, code is still pointing to the start of the
3128
group, while tempcode has been updated to point past the end of the group
3129
and any option resetting that may follow it. The pattern pointer (ptr)
3130
is on the bracket. */
3132
/* If this is a conditional bracket, check that there are no more than
3133
two branches in the group. */
3135
else if (bravalue == OP_COND)
3144
while (*tc != OP_KET);
3148
*errorcodeptr = ERR27;
3152
/* If there is just one branch, we must not make use of its firstbyte or
3153
reqbyte, because this is equivalent to an empty second branch. */
3155
if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
3158
/* Handle updating of the required and first characters. Update for normal
3159
brackets of all kinds, and conditions with two branches (see code above).
3160
If the bracket is followed by a quantifier with zero repeat, we have to
3161
back off. Hence the definition of zeroreqbyte and zerofirstbyte outside the
3162
main loop so that they can be accessed for the back off. */
3164
zeroreqbyte = reqbyte;
3165
zerofirstbyte = firstbyte;
3166
groupsetfirstbyte = FALSE;
3168
if (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_COND)
3170
/* If we have not yet set a firstbyte in this branch, take it from the
3171
subpattern, remembering that it was set here so that a repeat of more
3172
than one can replicate it as reqbyte if necessary. If the subpattern has
3173
no firstbyte, set "none" for the whole branch. In both cases, a zero
3174
repeat forces firstbyte to "none". */
3176
if (firstbyte == REQ_UNSET)
3178
if (subfirstbyte >= 0)
3180
firstbyte = subfirstbyte;
3181
groupsetfirstbyte = TRUE;
3183
else firstbyte = REQ_NONE;
3184
zerofirstbyte = REQ_NONE;
3187
/* If firstbyte was previously set, convert the subpattern's firstbyte
3188
into reqbyte if there wasn't one, using the vary flag that was in
3189
existence beforehand. */
3191
else if (subfirstbyte >= 0 && subreqbyte < 0)
3192
subreqbyte = subfirstbyte | tempreqvary;
3194
/* If the subpattern set a required byte (or set a first byte that isn't
3195
really the first byte - see above), set it. */
3197
if (subreqbyte >= 0) reqbyte = subreqbyte;
3200
/* For a forward assertion, we take the reqbyte, if set. This can be
3201
helpful if the pattern that follows the assertion doesn't set a different
3202
char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
3203
for an assertion, however, because it leads to incorrect effects for patterns
3204
such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
3205
of a firstbyte. This is overcome by a scan at the end if there's no
3206
firstbyte, looking for an asserted first char. */
3208
else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
3210
/* Now update the main code pointer to the end of the group. */
3214
/* Error if hit end of pattern */
3218
*errorcodeptr = ERR14;
3223
/* Check \ for being a real metacharacter; if not, fall through and handle
3224
it as a data character at the start of a string. Escape items are checked
3225
for validity in the pre-compiling pass. */
3229
c = check_escape(&ptr, errorcodeptr, *brackets, options, FALSE);
3231
/* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
3232
are arranged to be the negation of the corresponding OP_values. For the
3233
back references, the values are ESC_REF plus the reference number. Only
3234
back references and those types that consume a character may be repeated.
3235
We can test for values between ESC_b and ESC_Z for the latter; this may
3236
have to change if any new ones are ever created. */
3240
if (-c == ESC_Q) /* Handle start of quoted string */
3242
if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
3247
/* For metasequences that actually match a character, we disable the
3248
setting of a first character if it hasn't already been set. */
3250
if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
3251
firstbyte = REQ_NONE;
3253
/* Set values to reset to if this is followed by a zero repeat. */
3255
zerofirstbyte = firstbyte;
3256
zeroreqbyte = reqbyte;
3258
/* Back references are handled specially */
3262
int number = -c - ESC_REF;
3265
PUT2INC(code, 0, number);
3268
/* So are Unicode property matches, if supported. We know that get_ucp
3269
won't fail because it was tested in the pre-pass. */
3272
else if (-c == ESC_P || -c == ESC_p)
3275
int value = get_ucp(&ptr, &negated, errorcodeptr);
3277
*code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
3282
/* For the rest, we can obtain the OP value by negating the escape
3287
previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
3293
/* We have a data character whose value is in c. In UTF-8 mode it may have
3294
a value > 127. We set its representation in the length/buffer, and then
3295
handle it as a data character. */
3298
if (utf8 && c > 127)
3299
mclength = _pcre_ord2utf8(c, mcbuffer);
3310
/* Handle a literal character. It is guaranteed not to be whitespace or #
3311
when the extended flag is set. If we are in UTF-8 mode, it may be a
3312
multi-byte literal character. */
3320
if (utf8 && (c & 0xc0) == 0xc0)
3322
while ((ptr[1] & 0xc0) == 0x80)
3323
mcbuffer[mclength++] = *(++ptr);
3327
/* At this point we have the character's bytes in mcbuffer, and the length
3328
in mclength. When not in UTF-8 mode, the length is always 1. */
3332
*code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
3333
for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
3335
/* Set the first and required bytes appropriately. If no previous first
3336
byte, set it from this character, but revert to none on a zero repeat.
3337
Otherwise, leave the firstbyte value alone, and don't change it on a zero
3340
if (firstbyte == REQ_UNSET)
3342
zerofirstbyte = REQ_NONE;
3343
zeroreqbyte = reqbyte;
3345
/* If the character is more than one byte long, we can set firstbyte
3346
only if it is not to be matched caselessly. */
3348
if (mclength == 1 || req_caseopt == 0)
3350
firstbyte = mcbuffer[0] | req_caseopt;
3351
if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
3353
else firstbyte = reqbyte = REQ_NONE;
3356
/* firstbyte was previously set; we can set reqbyte only if the length is
3357
1 or the matching is caseful. */
3361
zerofirstbyte = firstbyte;
3362
zeroreqbyte = reqbyte;
3363
if (mclength == 1 || req_caseopt == 0)
3364
reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
3367
break; /* End of literal character handling */
3369
} /* end of big loop */
3371
/* Control never reaches here by falling through, only by a goto for all the
3372
error states. Pass back the position in the pattern so that it can be displayed
3373
to the user for diagnosing the error. */
3383
/*************************************************
3384
* Compile sequence of alternatives *
3385
*************************************************/
3387
/* On entry, ptr is pointing past the bracket character, but on return
3388
it points to the closing bracket, or vertical bar, or end of string.
3389
The code variable is pointing at the byte into which the BRA operator has been
3390
stored. If the ims options are changed at the start (for a (?ims: group) or
3391
during any branch, we need to insert an OP_OPT item at the start of every
3392
following branch to ensure they get set correctly at run time, and also pass
3393
the new options into every subsequent branch compile.
3396
options option bits, including any changes for this subpattern
3397
oldims previous settings of ims option bits
3398
brackets -> int containing the number of extracting brackets used
3399
codeptr -> the address of the current code pointer
3400
ptrptr -> the address of the current pattern pointer
3401
errorcodeptr -> pointer to error code variable
3402
lookbehind TRUE if this is a lookbehind assertion
3403
skipbytes skip this many bytes at start (for OP_COND, OP_BRANUMBER)
3404
firstbyteptr place to put the first required character, or a negative number
3405
reqbyteptr place to put the last required character, or a negative number
3406
bcptr pointer to the chain of currently open branches
3407
cd points to the data block with tables pointers etc.
3409
Returns: TRUE on success
3413
compile_regex(int options, int oldims, int *brackets, uschar **codeptr,
3414
const uschar **ptrptr, int *errorcodeptr, BOOL lookbehind, int skipbytes,
3415
int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
3417
/* Compiles the alternatives of one bracketed group by calling
compile_branch() for each branch, and hands the group's first-byte and
required-byte values back through firstbyteptr and reqbyteptr. */
const uschar *ptr = *ptrptr;
3418
uschar *code = *codeptr;
3419
uschar *last_branch = code;
3420
uschar *start_bracket = code;
3421
uschar *reverse_count = NULL;
3422
int firstbyte, reqbyte;
3423
int branchfirstbyte, branchreqbyte;
3429
firstbyte = reqbyte = REQ_UNSET;
3431
/* Offset is set zero to mark that this bracket is still open */
3434
/* Step over the opening item (one opcode byte plus a LINK_SIZE link field)
and over the skipbytes that the caller asked to be left at the start. */
code += 1 + LINK_SIZE + skipbytes;
3436
/* Loop for each alternative branch */
3440
/* Handle a change of ims options at the start of the branch */
3442
if ((options & PCRE_IMS) != oldims)
3445
*code++ = options & PCRE_IMS;
3448
/* Set up dummy OP_REVERSE if lookbehind assertion */
3452
*code++ = OP_REVERSE;
3453
/* Remember where the OP_REVERSE length field lives; it is filled in by the
PUT(reverse_count, ...) further down once the branch length is known. */
reverse_count = code;
3457
/* Now compile the branch */
3459
if (!compile_branch(&options, brackets, &code, &ptr, errorcodeptr,
3460
&branchfirstbyte, &branchreqbyte, &bc, cd))
3466
/* If this is the first branch, the firstbyte and reqbyte values for the
3467
branch become the values for the regex. */
3469
if (*last_branch != OP_ALT)
3471
firstbyte = branchfirstbyte;
3472
reqbyte = branchreqbyte;
3475
/* If this is not the first branch, the first char and reqbyte have to
3476
match the values from all the previous branches, except that if the previous
3477
value for reqbyte didn't have REQ_VARY set, it can still match, and we set
3478
REQ_VARY for the regex. */
3482
/* If we previously had a firstbyte, but it doesn't match the new branch,
3483
we have to abandon the firstbyte for the regex, but if there was previously
3484
no reqbyte, it takes on the value of the old firstbyte. */
3486
if (firstbyte >= 0 && firstbyte != branchfirstbyte)
3488
if (reqbyte < 0) reqbyte = firstbyte;
3489
firstbyte = REQ_NONE;
3492
/* If we (now or from before) have no firstbyte, a firstbyte from the
3493
branch becomes a reqbyte if there isn't a branch reqbyte. */
3495
if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
3496
branchreqbyte = branchfirstbyte;
3498
/* Now ensure that the reqbytes match */
3500
/* The comparison ignores the REQ_VARY flag on both sides. */
if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
3502
else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
3505
/* If lookbehind, check that this branch matches a fixed-length string,
3506
and put the length into the OP_REVERSE item. Temporarily mark the end of
3507
the branch with OP_END. */
3513
length = find_fixedlength(last_branch, options);
3514
DPRINTF(("fixed length = %d\n", length));
3517
/* A result of -2 from find_fixedlength() selects ERR36; otherwise ERR25. */
*errorcodeptr = (length == -2)? ERR36 : ERR25;
3521
PUT(reverse_count, 0, length);
3524
/* Reached end of expression, either ')' or end of pattern. Go back through
3525
the alternative branches and reverse the chain of offsets, with the field in
3526
the BRA item now becoming an offset to the first alternative. If there are
3527
no alternatives, it points to the end of the group. The length in the
3528
terminating ket is always the length of the whole bracketed item. If any of
3529
the ims options were changed inside the group, compile a resetting op-code
3530
following, except at the very end of the pattern. Return leaving the pointer
3531
at the terminating char. */
3535
int length = code - last_branch;
3538
/* Save the backward link before overwriting it with the forward length,
then use the saved value to step back to the previous branch. */
int prev_length = GET(last_branch, 1);
3539
PUT(last_branch, 1, length);
3540
length = prev_length;
3541
last_branch -= length;
3545
/* Fill in the ket */
3548
PUT(code, 1, code - start_bracket);
3549
code += 1 + LINK_SIZE;
3551
/* Resetting option if needed */
3553
if ((options & PCRE_IMS) != oldims && *ptr == ')')
3559
/* Set values to pass back */
3563
*firstbyteptr = firstbyte;
3564
*reqbyteptr = reqbyte;
3568
/* Another branch follows; insert an "or" node. Its length field points back
3569
to the previous branch while the bracket remains open. At the end the chain
3570
is reversed. It's done like this so that the start of the bracket has a
3571
zero offset until it is closed, making it possible to detect recursion. */
3574
PUT(code, 1, code - last_branch);
3575
bc.current = last_branch = code;
3576
code += 1 + LINK_SIZE;
3579
/* Control never reaches here */
3585
/*************************************************
3586
* Check for anchored expression *
3587
*************************************************/
3589
/* Try to find out if this is an anchored regular expression. Consider each
3590
alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
3591
all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
3592
it's anchored. However, if this is a multiline pattern, then only OP_SOD
3593
counts, since OP_CIRC can match in the middle.
3595
We can also consider a regex to be anchored if OP_SOM starts all its branches.
3596
This is the code for \G, which means "match at start of match position, taking
3597
into account the match offset".
3599
A branch is also implicitly anchored if it starts with .* and DOTALL is set,
3600
because that will try the rest of the pattern at all possible matching points,
3601
so there is no point trying again.... er ....
3603
.... except when the .* appears inside capturing parentheses, and there is a
3604
subsequent back reference to those parentheses. We haven't enough information
3605
to catch that case precisely.
3607
At first, the best we could do was to detect when .* was in capturing brackets
3608
and the highest back reference was greater than or equal to that level.
3609
However, by keeping a bitmap of the first 31 back references, we can catch some
3610
of the more common cases more precisely.
3613
code points to start of expression (the bracket)
3614
options points to the options setting
3615
bracket_map a bitmap of which brackets we are inside while testing; this
3616
handles up to substring 31; after that we just have to take
3617
the less precise approach
3618
backref_map the back reference bitmap
3620
Returns: TRUE or FALSE
3624
is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
3625
unsigned int backref_map)
3628
/* Examines the first significant item of each alternative of the group at
code, recursing into nested brackets, and returns FALSE as soon as one
alternative is found that is not anchored. */
const uschar *scode =
3629
first_significant_code(code + 1+LINK_SIZE, options, PCRE_MULTILINE, FALSE);
3630
register int op = *scode;
3632
/* Capturing brackets */
3638
/* Bracket numbers above EXTRACT_BASIC_MAX are read from the 2-byte field at
offset 2+LINK_SIZE of the item. */
if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
3639
/* Add this bracket to the map; numbers of 32 or more all share bit 0.
NOTE(review): 1 << op with op == 31 shifts into the sign bit of int; an
unsigned constant would avoid that - confirm and change separately. */
new_map = bracket_map | ((op < 32)? (1 << op) : 1);
3640
if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
3643
/* Other brackets */
3645
else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
3647
if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
3650
/* .* is not anchored unless DOTALL is set and it isn't in brackets that
3651
are or may be referenced. */
3653
else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
3654
(*options & PCRE_DOTALL) != 0)
3656
/* Fails if the repeated type is not OP_ANY, or if any bracket we are
currently inside is marked in the back reference map. */
if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
3659
/* Check for explicit anchoring */
3661
/* OP_CIRC is accepted only when PCRE_MULTILINE is not set. */
else if (op != OP_SOD && op != OP_SOM &&
3662
((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
3664
/* Advance by the link value stored at offset 1 of the current item. */
code += GET(code, 1);
3666
while (*code == OP_ALT); /* Loop for each alternative */
3672
/*************************************************
3673
* Check for starting with ^ or .* *
3674
*************************************************/
3676
/* This is called to find out if every branch starts with ^ or .* so that
3677
"first char" processing can be done to speed things up in multiline
3678
matching and for non-DOTALL patterns that start with .* (which must start at
3679
the beginning or after \n). As in the case of is_anchored() (see above), we
3680
have to take account of back references to capturing brackets that contain .*
3681
because in that case we can't make the assumption.
3684
code points to start of expression (the bracket)
3685
bracket_map a bitmap of which brackets we are inside while testing; this
3686
handles up to substring 31; after that we just have to take
3687
the less precise approach
3688
backref_map the back reference bitmap
3690
Returns: TRUE or FALSE
3694
is_startline(const uschar *code, unsigned int bracket_map,
3695
unsigned int backref_map)
3698
/* Examines the first significant item of each alternative of the group at
code, recursing into nested brackets, and returns FALSE as soon as one
alternative starts with something other than a circumflex or an acceptable
.* item. */
const uschar *scode = first_significant_code(code + 1+LINK_SIZE, NULL, 0,
3700
register int op = *scode;
3702
/* Capturing brackets */
3708
/* Bracket numbers above EXTRACT_BASIC_MAX are read from the 2-byte field at
offset 2+LINK_SIZE of the item. */
if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
3709
/* Add this bracket to the map; numbers of 32 or more all share bit 0.
NOTE(review): 1 << op with op == 31 shifts into the sign bit of int; an
unsigned constant would avoid that - confirm and change separately. */
new_map = bracket_map | ((op < 32)? (1 << op) : 1);
3710
if (!is_startline(scode, new_map, backref_map)) return FALSE;
3713
/* Other brackets */
3715
else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
3716
{ if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
3718
/* .* means "start at start or after \n" if it isn't in brackets that
3719
may be referenced. */
3721
else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
3723
/* Fails if the repeated type is not OP_ANY, or if any bracket we are
currently inside is marked in the back reference map. */
if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
3726
/* Check for explicit circumflex */
3728
else if (op != OP_CIRC) return FALSE;
3730
/* Move on to the next alternative */
3732
code += GET(code, 1);
3734
while (*code == OP_ALT); /* Loop for each alternative */
3740
/*************************************************
3741
* Check for asserted fixed first char *
3742
*************************************************/
3744
/* During compilation, the "first char" settings from forward assertions are
3745
discarded, because they can cause conflicts with actual literals that follow.
3746
However, if we end up without a first char setting for an unanchored pattern,
3747
it is worth scanning the regex to see if there is an initial asserted first
3748
char. If all branches start with the same asserted char, or with a bracket all
3749
of whose alternatives start with the same asserted char (recurse ad lib), then
3750
we return that char, otherwise -1.
3753
code points to start of expression (the bracket)
3754
options pointer to the options (used to check casing changes)
3755
inassert TRUE if in an assertion
3757
Returns: -1 or the fixed first char
3761
find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
3763
/* Scans each alternative of the group at code for a fixed first character
that is asserted; c holds the candidate found so far, with -1 meaning that
none has been found yet. */
register int c = -1;
3766
const uschar *scode =
3767
first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
3768
register int op = *scode;
3770
/* Every opcode at or above OP_BRA is folded onto OP_BRA, presumably so that
a single case can handle all of them. */
if (op >= OP_BRA) op = OP_BRA;
3781
/* Recurse into the nested group; inassert is TRUE for the recursive call
only when that group is an OP_ASSERT. */
if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
3783
/* The first value found becomes the candidate; a later, different value
means there is no common first char. */
if (c < 0) c = d; else if (c != d) return -1;
3786
case OP_EXACT: /* Fall through */
3793
/* A character item counts only when we are inside an assertion. */
if (!inassert) return -1;
3797
/* Flag the candidate as caseless if that option is currently set. */
if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
3799
else if (c != scode[1]) return -1;
3803
/* Advance by the link value stored at offset 1 of the current item. */
code += GET(code, 1);
3805
while (*code == OP_ALT);
3811
/*************************************************
3812
* Compile a Regular Expression *
3813
*************************************************/
3815
/* This function takes a string and returns a pointer to a block of store
3816
holding a compiled version of the expression. The original API for this
3817
function had no error code return variable; it is retained for backwards
3818
compatibility. The new function is given a new name.
3821
pattern the regular expression
3822
options various option bits
3823
errorcodeptr pointer to error code variable (pcre_compile2() only)
3824
can be NULL if you don't want a code value
3825
errorptr pointer to pointer to error text
3826
erroroffset ptr offset in pattern where error was detected
3827
tables pointer to character tables or NULL
3829
Returns: pointer to compiled data block, or NULL on error,
3830
with errorptr and erroroffset set
3834
pcre_compile(const char *pattern, int options, const char **errorptr,
3835
int *erroroffset, const unsigned char *tables)
3837
return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
3842
pcre_compile2(const char *pattern, int options, int *errorcodeptr,
3843
const char **errorptr, int *erroroffset, const unsigned char *tables)
3846
int length = 1 + LINK_SIZE; /* For initial BRA plus length */
3847
int c, firstbyte, reqbyte;
3849
int branch_extra = 0;
3850
int branch_newextra;
3851
int item_count = -1;
3853
int max_name_size = 0;
3854
int lastitemlength = 0;
3860
BOOL inescq = FALSE;
3861
unsigned int brastackptr = 0;
3864
const uschar *codestart;
3866
compile_data compile_block;
3867
int brastack[BRASTACK_SIZE];
3868
uschar bralenstack[BRASTACK_SIZE];
3870
/* We can't pass back an error message if errorptr is NULL; I guess the best we
3871
can do is just return NULL, but we can set a code value if there is a code
3874
if (errorptr == NULL)
3876
if (errorcodeptr != NULL) *errorcodeptr = 99;
3881
if (errorcodeptr != NULL) *errorcodeptr = ERR0;
3883
/* However, we can give a message for this error */
3885
if (erroroffset == NULL)
3888
goto PCRE_EARLY_ERROR_RETURN;
3893
/* Can't support UTF8 unless PCRE has been compiled to include the code. */
3896
utf8 = (options & PCRE_UTF8) != 0;
3897
if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
3898
(*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
3901
goto PCRE_EARLY_ERROR_RETURN;
3904
if ((options & PCRE_UTF8) != 0)
3907
goto PCRE_EARLY_ERROR_RETURN;
3911
if ((options & ~PUBLIC_OPTIONS) != 0)
3914
goto PCRE_EARLY_ERROR_RETURN;
3917
/* Set up pointers to the individual character tables */
3919
if (tables == NULL) tables = _pcre_default_tables;
3920
compile_block.lcc = tables + lcc_offset;
3921
compile_block.fcc = tables + fcc_offset;
3922
compile_block.cbits = tables + cbits_offset;
3923
compile_block.ctypes = tables + ctypes_offset;
3925
/* Maximum back reference and backref bitmap. This is updated for numeric
3926
references during the first pass, but for named references during the actual
3927
compile pass. The bitmap records up to 31 back references to help in deciding
3928
whether (.*) can be treated as anchored or not. */
3930
compile_block.top_backref = 0;
3931
compile_block.backref_map = 0;
3933
/* Reflect pattern for debugging output */
3935
DPRINTF(("------------------------------------------------------------------\n"));
3936
DPRINTF(("%s\n", pattern));
3938
/* The first thing to do is to make a pass over the pattern to compute the
3939
amount of store required to hold the compiled code. This does not have to be
3940
perfect as long as errors are overestimates. At the same time we can detect any
3941
flag settings right at the start, and extract them. Make an attempt to correct
3942
for any counted white space if an "extended" flag setting appears late in the
3943
pattern. We can't be so clever for #-comments. */
3945
ptr = (const uschar *)(pattern - 1);
3946
while ((c = *(++ptr)) != 0)
3953
/* If we are inside a \Q...\E sequence, all chars are literal */
3957
if ((options & PCRE_AUTO_CALLOUT) != 0) length += 2 + 2*LINK_SIZE;
3961
/* Otherwise, first check for ignored whitespace and comments */
3963
if ((options & PCRE_EXTENDED) != 0)
3965
if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
3968
/* The space before the ; is to avoid a warning on a silly compiler
3969
on the Macintosh. */
3970
while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
3976
item_count++; /* Is zero for the first non-comment item */
3978
/* Allow space for auto callout before every item except quantifiers. */
3980
if ((options & PCRE_AUTO_CALLOUT) != 0 &&
3981
c != '*' && c != '+' && c != '?' &&
3982
(c != '{' || !is_counted_repeat(ptr + 1)))
3983
length += 2 + 2*LINK_SIZE;
3987
/* A backslashed item may be an escaped data character or it may be a
3991
c = check_escape(&ptr, &errorcode, bracount, options, FALSE);
3992
if (errorcode != 0) goto PCRE_ERROR_RETURN;
3994
lastitemlength = 1; /* Default length of last item for repeats */
3996
if (c >= 0) /* Data character */
3998
length += 2; /* For a one-byte character */
4001
if (utf8 && c > 127)
4004
for (i = 0; i < _pcre_utf8_table1_size; i++)
4005
if (c <= _pcre_utf8_table1[i]) break;
4007
lastitemlength += i;
4014
/* If \Q, enter "literal" mode */
4022
/* \X is supported only if Unicode property support is compiled */
4028
goto PCRE_ERROR_RETURN;
4032
/* \P and \p are for Unicode properties, but only when the support has
4033
been compiled. Each item needs 2 bytes. */
4035
else if (-c == ESC_P || -c == ESC_p)
4041
if (get_ucp(&ptr, &negated, &errorcode) < 0) goto PCRE_ERROR_RETURN;
4045
goto PCRE_ERROR_RETURN;
4049
/* Other escapes need one byte */
4053
/* A back reference needs an additional 2 bytes, plus either one or 5
4054
bytes for a repeat. We also need to keep the value of the highest
4059
int refnum = -c - ESC_REF;
4060
compile_block.backref_map |= (refnum < 32)? (1 << refnum) : 1;
4061
if (refnum > compile_block.top_backref)
4062
compile_block.top_backref = refnum;
4063
length += 2; /* For single back reference */
4064
if (ptr[1] == '{' && is_counted_repeat(ptr+2))
4066
ptr = read_repeat_counts(ptr+2, &min, &max, &errorcode);
4067
if (errorcode != 0) goto PCRE_ERROR_RETURN;
4068
if ((min == 0 && (max == 1 || max == -1)) ||
4069
(min == 1 && max == -1))
4072
if (ptr[1] == '?') ptr++;
4077
case '^': /* Single-byte metacharacters */
4084
case '*': /* These repeats won't be after brackets; */
4085
case '+': /* those are handled separately */
4088
goto POSESSIVE; /* A few lines below */
4090
/* This covers the cases of braced repeats after a single char, metachar,
4091
class, or back reference. */
4094
if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
4095
ptr = read_repeat_counts(ptr+1, &min, &max, &errorcode);
4096
if (errorcode != 0) goto PCRE_ERROR_RETURN;
4098
/* These special cases just insert one extra opcode */
4100
if ((min == 0 && (max == 1 || max == -1)) ||
4101
(min == 1 && max == -1))
4104
/* These cases might insert additional copies of a preceding character. */
4110
length -= lastitemlength; /* Uncount the original char or metachar */
4111
if (min > 0) length += 3 + lastitemlength;
4113
length += lastitemlength + ((max > 0)? 3 : 1);
4116
if (ptr[1] == '?') ptr++; /* Needs no extra length */
4118
POSESSIVE: /* Test for possessive quantifier */
4122
length += 2 + 2*LINK_SIZE; /* Allow for atomic brackets */
4126
/* An alternation contains an offset to the next branch or ket. If any ims
4127
options changed in the previous branch(es), and/or if we are in a
4128
lookbehind assertion, extra space will be needed at the start of the
4129
branch. This is handled by branch_extra. */
4132
length += 1 + LINK_SIZE + branch_extra;
4135
/* A character class uses 33 characters provided that all the character
4136
values are less than 256. Otherwise, it uses a bit map for low valued
4137
characters, and individual items for others. Don't worry about character
4138
types that aren't allowed in classes - they'll get picked up during the
4139
compile. A character class that contains only one single-byte character
4140
uses 2 or 3 bytes, depending on whether it is negated or not. Notice this
4141
where we can. (In UTF-8 mode we can do this only for chars < 128.) */
4144
if (*(++ptr) == '^')
4146
class_optcount = 10; /* Greater than one */
4149
else class_optcount = 0;
4155
/* Written as a "do" so that an initial ']' is taken as data */
4159
/* Inside \Q...\E everything is literal except \E */
4163
if (*ptr != '\\' || ptr[1] != 'E') goto GET_ONE_CHARACTER;
4169
/* Outside \Q...\E, check for escapes */
4173
c = check_escape(&ptr, &errorcode, bracount, options, TRUE);
4174
if (errorcode != 0) goto PCRE_ERROR_RETURN;
4176
/* \b is backspace inside a class; \X is literal */
4178
if (-c == ESC_b) c = '\b';
4179
else if (-c == ESC_X) c = 'X';
4181
/* \Q enters quoting mode */
4183
else if (-c == ESC_Q)
4189
/* Handle escapes that turn into characters */
4191
if (c >= 0) goto NON_SPECIAL_CHARACTER;
4193
/* Escapes that are meta-things. The normal ones just affect the
4194
bit map, but Unicode properties require an XCLASS extended item. */
4198
class_optcount = 10; /* \d, \s etc; make sure > 1 */
4200
if (-c == ESC_p || -c == ESC_P)
4205
length += LINK_SIZE + 2;
4213
/* Check the syntax for POSIX stuff. The bits we actually handle are
4214
checked during the real compile phase. */
4216
else if (*ptr == '[' && check_posix_syntax(ptr, &ptr, &compile_block))
4219
class_optcount = 10; /* Make sure > 1 */
4222
/* Anything else increments the possible optimization count. We have to
4223
detect ranges here so that we can compute the number of extra ranges for
4224
caseless wide characters when UCP support is available. If there are wide
4225
characters, we are going to have to use an XCLASS, even for single
4238
GETCHARLEN(c, ptr, extra);
4246
/* Come here from handling \ above when it escapes to a char value */
4248
NON_SPECIAL_CHARACTER:
4254
uschar const *hyptr = ptr++;
4258
d = check_escape(&ptr, &errorcode, bracount, options, TRUE);
4259
if (errorcode != 0) goto PCRE_ERROR_RETURN;
4260
if (-d == ESC_b) d = '\b'; /* backspace */
4261
else if (-d == ESC_X) d = 'X'; /* literal X in a class */
4263
else if (ptr[1] != 0 && ptr[1] != ']')
4270
GETCHARLEN(d, ptr, extra);
4277
if (d < 0) ptr = hyptr; /* go back to hyphen as data */
4280
/* If d >= 0 we have a range. In UTF-8 mode, if the end is > 255, or >
4281
127 for caseless matching, we will need to use an XCLASS. */
4285
class_optcount = 10; /* Ensure > 1 */
4289
goto PCRE_ERROR_RETURN;
4293
if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
4296
if (!class_utf8) /* Allow for XCLASS overhead */
4299
length += LINK_SIZE + 2;
4303
/* If we have UCP support, find out how many extra ranges are
4304
needed to map the other case of characters within this range. We
4305
have to mimic the range optimization here, because extending the
4306
range upwards might push d over a boundary that makes it use
4307
another byte in the UTF-8 representation. */
4309
if ((options & PCRE_CASELESS) != 0)
4314
while (get_othercase_range(&cc, origd, &occ, &ocd))
4316
if (occ >= c && ocd <= d) continue; /* Skip embedded */
4318
if (occ < c && ocd >= c - 1) /* Extend the basic range */
4319
{ /* if there is overlap, */
4320
c = occ; /* noting that if occ < c */
4321
continue; /* we can't have ocd > d */
4322
} /* because a subrange is */
4323
if (ocd > d && occ <= d + 1) /* always shorter than */
4324
{ /* the basic range. */
4329
/* An extra item is needed */
4331
length += 1 + _pcre_ord2utf8(occ, buffer) +
4332
((occ == ocd)? 0 : _pcre_ord2utf8(ocd, buffer));
4335
#endif /* SUPPORT_UCP */
4337
/* The length of the (possibly extended) range */
4339
length += 1 + _pcre_ord2utf8(c, buffer) + _pcre_ord2utf8(d, buffer);
4341
#endif /* SUPPORT_UTF8 */
4345
/* We have a single character. There is nothing to be done unless we
4346
are in UTF-8 mode. If the char is > 255, or 127 when caseless, we must
4347
allow for an XCL_SINGLE item, doubled for caselessness if there is UCP
4353
if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
4356
class_optcount = 10; /* Ensure > 1 */
4357
if (!class_utf8) /* Allow for XCLASS overhead */
4360
length += LINK_SIZE + 2;
4363
length += (((options & PCRE_CASELESS) != 0)? 2 : 1) *
4364
(1 + _pcre_ord2utf8(c, buffer));
4365
#else /* SUPPORT_UCP */
4366
length += 1 + _pcre_ord2utf8(c, buffer);
4367
#endif /* SUPPORT_UCP */
4369
#endif /* SUPPORT_UTF8 */
4373
while (*(++ptr) != 0 && (inescq || *ptr != ']')); /* Concludes "do" above */
4375
if (*ptr == 0) /* Missing terminating ']' */
4378
goto PCRE_ERROR_RETURN;
4381
/* We can optimize when there was only one optimizable character. Repeats
4382
for positive and negated single one-byte chars are handled by the general
4383
code. Here, we handle repeats for the class opcodes. */
4385
if (class_optcount == 1) length += 3; else
4389
/* A repeat needs either 1 or 5 bytes. If it is a possessive quantifier,
4390
we also need extra for wrapping the whole thing in a sub-pattern. */
4392
if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2))
4394
ptr = read_repeat_counts(ptr+2, &min, &max, &errorcode);
4395
if (errorcode != 0) goto PCRE_ERROR_RETURN;
4396
if ((min == 0 && (max == 1 || max == -1)) ||
4397
(min == 1 && max == -1))
4403
length += 2 + 2*LINK_SIZE;
4405
else if (ptr[1] == '?') ptr++;
4410
/* Brackets may be genuine groups or special things */
4413
branch_newextra = 0;
4414
bracket_length = 1 + LINK_SIZE;
4416
/* Handle special forms of bracket, which all start (? */
4425
/* Skip over comments entirely */
4428
while (*ptr != 0 && *ptr != ')') ptr++;
4432
goto PCRE_ERROR_RETURN;
4436
/* Non-referencing groups and lookaheads just move the pointer on, and
4437
then behave like a non-special bracket, except that they don't increment
4438
the count of extracting brackets. Ditto for the "once only" bracket,
4439
which is in Perl from version 5.005. */
4448
/* (?R) specifies a recursive call to the regex, which is an extension
4449
to provide the facility which can be obtained by (?p{perl-code}) in
4450
Perl 5.6. In Perl 5.8 this has become (??{perl-code}).
4452
From PCRE 4.00, items such as (?3) specify subroutine-like "calls" to
4453
the appropriate numbered brackets. This includes both recursive and
4454
non-recursive calls. (?R) is now synonymous with (?0). */
4459
case '0': case '1': case '2': case '3': case '4':
4460
case '5': case '6': case '7': case '8': case '9':
4463
while ((digitab[*(++ptr)] & ctype_digit) != 0);
4467
goto PCRE_ERROR_RETURN;
4469
length += 1 + LINK_SIZE;
4471
/* If this item is quantified, it will get wrapped inside brackets so
4472
as to use the code for quantified brackets. We jump down and use the
4473
code that handles this for real brackets. */
4475
if (ptr[1] == '+' || ptr[1] == '*' || ptr[1] == '?' || ptr[1] == '{')
4477
length += 2 + 2 * LINK_SIZE; /* to make bracketed */
4478
duplength = 5 + 3 * LINK_SIZE;
4479
goto HANDLE_QUANTIFIED_BRACKETS;
4483
/* (?C) is an extension which provides "callout" - to provide a bit of
4484
the functionality of the Perl (?{...}) feature. An optional number may
4485
follow (default is zero). */
4489
while ((digitab[*(++ptr)] & ctype_digit) != 0);
4493
goto PCRE_ERROR_RETURN;
4495
length += 2 + 2*LINK_SIZE;
4498
/* Named subpatterns are an extension copied from Python */
4504
const uschar *p; /* Don't amalgamate; some compilers */
4505
p = ++ptr; /* grumble at autoincrement in declaration */
4506
while ((compile_block.ctypes[*ptr] & ctype_word) != 0) ptr++;
4510
goto PCRE_ERROR_RETURN;
4513
if (ptr - p > max_name_size) max_name_size = (ptr - p);
4517
if (*ptr == '=' || *ptr == '>')
4519
while ((compile_block.ctypes[*(++ptr)] & ctype_word) != 0);
4523
goto PCRE_ERROR_RETURN;
4528
/* Unknown character after (?P */
4531
goto PCRE_ERROR_RETURN;
4533
/* Lookbehinds are in Perl from version 5.005 */
4537
if (*ptr == '=' || *ptr == '!')
4539
branch_newextra = 1 + LINK_SIZE;
4540
length += 1 + LINK_SIZE; /* For the first branch */
4544
goto PCRE_ERROR_RETURN;
4546
/* Conditionals are in Perl from version 5.005. The bracket must either
4547
be followed by a number (for bracket reference) or by an assertion
4548
group, or (a PCRE extension) by 'R' for a recursion test. */
4551
if (ptr[3] == 'R' && ptr[4] == ')')
4556
else if ((digitab[ptr[3]] & ctype_digit) != 0)
4560
while ((digitab[*ptr] & ctype_digit) != 0) ptr++;
4564
goto PCRE_ERROR_RETURN;
4567
else /* An assertion must follow */
4569
ptr++; /* Can treat like ':' as far as spacing is concerned */
4570
if (ptr[2] != '?' ||
4571
(ptr[3] != '=' && ptr[3] != '!' && ptr[3] != '<') )
4573
ptr += 2; /* To get right offset in message */
4575
goto PCRE_ERROR_RETURN;
4580
/* Else loop checking valid options until ) is met. Anything else is an
4581
error. If we are without any brackets, i.e. at top level, the settings
4582
act as if specified in the options, so massage the options immediately.
4583
This is for backward compatibility with Perl 5.004. */
4596
*optset |= PCRE_CASELESS;
4600
*optset |= PCRE_MULTILINE;
4604
*optset |= PCRE_DOTALL;
4608
*optset |= PCRE_EXTENDED;
4612
*optset |= PCRE_EXTRA;
4616
*optset |= PCRE_UNGREEDY;
4623
/* A termination by ')' indicates an options-setting-only item; if
4624
this is at the very start of the pattern (indicated by item_count
4625
being zero), we use it to set the global options. This is helpful
4626
when analyzing the pattern for first characters, etc. Otherwise
4627
nothing is done here and it is handled during the compiling
4630
[Historical note: Up to Perl 5.8, options settings at top level
4631
were always global settings, wherever they appeared in the pattern.
4632
That is, they were equivalent to an external setting. From 5.8
4633
onwards, they apply only to what follows (which is what you might
4637
if (item_count == 0)
4639
options = (options | set) & (~unset);
4640
set = unset = 0; /* To save length */
4641
item_count--; /* To allow for several */
4646
/* A termination by ':' indicates the start of a nested group with
4647
the given options set. This is again handled at compile time, but
4648
we must allow for compiled space if any of the ims options are
4649
set. We also have to allow for resetting space at the end of
4650
the group, which is why 4 is added to the length and not just 2.
4651
If there are several changes of options within the same group, this
4652
will lead to an over-estimate on the length, but this shouldn't
4653
matter very much. We also have to allow for resetting options at
4654
the start of any alternations, which we do by setting
4655
branch_newextra to 2. Finally, we record whether the case-dependent
4656
flag ever changes within the regex. This is used by the "required
4660
if (((set|unset) & PCRE_IMS) != 0)
4663
branch_newextra = 2;
4664
if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED;
4668
/* Unrecognized option character */
4672
goto PCRE_ERROR_RETURN;
4676
/* If we hit a closing bracket, that's it - this is a freestanding
4677
option-setting. We need to ensure that branch_extra is updated if
4678
necessary. The only values branch_newextra can have here are 0 or 2.
4679
If the value is 2, then branch_extra must either be 2 or 3+LINK_SIZE, depending
4680
on whether this is a lookbehind group or not. */
4685
if (branch_newextra == 2 &&
4686
(branch_extra == 0 || branch_extra == 1+LINK_SIZE))
4687
branch_extra += branch_newextra;
4691
/* If options were terminated by ':' control comes here. Fall through
4692
to handle the group below. */
4696
/* Extracting brackets must be counted so we can process escapes in a
4697
Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to
4698
need an additional 3 bytes of store per extracting bracket. However, if
4699
PCRE_NO_AUTO_CAPTURE is set, unadorned brackets become non-capturing, so we
4700
must leave the count alone (it will always be zero). */
4702
else if ((options & PCRE_NO_AUTO_CAPTURE) == 0)
4705
if (bracount > EXTRACT_BASIC_MAX) bracket_length += 3;
4708
/* Save length for computing whole length at end if there's a repeat that
4709
requires duplication of the group. Also save the current value of
4710
branch_extra, and start the new group with the new value. If non-zero, this
4711
will either be 2 for a (?imsx: group, or 1+LINK_SIZE for a lookbehind assertion. */
4713
if (brastackptr >= sizeof(brastack)/sizeof(int))
4716
goto PCRE_ERROR_RETURN;
4719
bralenstack[brastackptr] = branch_extra;
4720
branch_extra = branch_newextra;
4722
brastack[brastackptr++] = length;
4723
length += bracket_length;
4726
/* Handle ket. Look for subsequent max/min; for certain sets of values we
4727
have to replicate this bracket up to that many times. If brastackptr is
4728
0 this is an unmatched bracket which will generate an error, but take care
4729
not to try to access brastack[-1] when computing the length and restoring
4730
the branch_extra value. */
4733
length += 1 + LINK_SIZE;
4734
if (brastackptr > 0)
4736
duplength = length - brastack[--brastackptr];
4737
branch_extra = bralenstack[brastackptr];
4741
/* The following code is also used when a recursion such as (?3) is
4742
followed by a quantifier, because in that case, it has to be wrapped inside
4743
brackets so that the quantifier works. The value of duplength must be
4744
set before arrival. */
4746
HANDLE_QUANTIFIED_BRACKETS:
4748
/* Leave ptr at the final char; for read_repeat_counts this happens
4749
automatically; for the others we need an increment. */
4751
if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2))
4753
ptr = read_repeat_counts(ptr+2, &min, &max, &errorcode);
4754
if (errorcode != 0) goto PCRE_ERROR_RETURN;
4756
else if (c == '*') { min = 0; max = -1; ptr++; }
4757
else if (c == '+') { min = 1; max = -1; ptr++; }
4758
else if (c == '?') { min = 0; max = 1; ptr++; }
4759
else { min = 1; max = 1; }
4761
/* If the minimum is zero, we have to allow for an OP_BRAZERO before the
4762
group, and if the maximum is greater than zero, we have to replicate
4763
maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
4769
if (max > 0) length += (max - 1) * (duplength + 3 + 2*LINK_SIZE);
4772
/* When the minimum is greater than zero, we have to replicate up to
4773
minval-1 times, with no additions required in the copies. Then, if there
4774
is a limited maximum we have to replicate up to maxval-1 times allowing
4775
for a BRAZERO item before each optional copy and nesting brackets for all
4776
but one of the optional copies. */
4780
length += (min - 1) * duplength;
4781
if (max > min) /* Need this test as max=-1 means no limit */
4782
length += (max - min) * (duplength + 3 + 2*LINK_SIZE)
4783
- (2 + 2*LINK_SIZE);
4786
/* Allow space for once brackets for "possessive quantifier" */
4791
length += 2 + 2*LINK_SIZE;
4795
/* Non-special character. It won't be space or # in extended mode, so it is
4796
always a genuine character. If we are in a \Q...\E sequence, check for the
4797
end; if not, we have a literal. */
4802
if (inescq && c == '\\' && ptr[1] == 'E')
4809
length += 2; /* For a one-byte character */
4810
lastitemlength = 1; /* Default length of last item for repeats */
4812
/* In UTF-8 mode, check for additional bytes. */
4815
if (utf8 && (c & 0xc0) == 0xc0)
4817
while ((ptr[1] & 0xc0) == 0x80) /* Can't flow over the end */
4818
{ /* because the end is marked */
4819
lastitemlength++; /* by a zero byte. */
4830
length += 2 + LINK_SIZE; /* For final KET and END */
4832
if ((options & PCRE_AUTO_CALLOUT) != 0)
4833
length += 2 + 2*LINK_SIZE; /* For final callout */
4835
if (length > MAX_PATTERN_SIZE)
4838
goto PCRE_EARLY_ERROR_RETURN;
4841
/* Compute the size of data block needed and get it, either from malloc or
4842
externally provided function. */
4844
size = length + sizeof(real_pcre) + name_count * (max_name_size + 3);
4845
re = (real_pcre *)(pcre_malloc)(size);
4850
goto PCRE_EARLY_ERROR_RETURN;
4853
/* Put in the magic number, and save the sizes, options, and character table
4854
pointer. NULL is used for the default character tables. The nullpad field is at
4855
the end; it's there to help in the case when a regex compiled on a system with
4856
4-byte pointers is run on another with 8-byte pointers. */
4858
re->magic_number = MAGIC_NUMBER;
4860
re->options = options;
4862
re->name_table_offset = sizeof(real_pcre);
4863
re->name_entry_size = max_name_size + 3;
4864
re->name_count = name_count;
4866
re->tables = (tables == _pcre_default_tables)? NULL : tables;
4869
/* The starting points of the name/number translation table and of the code are
4870
passed around in the compile data block. */
4872
compile_block.names_found = 0;
4873
compile_block.name_entry_size = max_name_size + 3;
4874
compile_block.name_table = (uschar *)re + re->name_table_offset;
4875
codestart = compile_block.name_table + re->name_entry_size * re->name_count;
4876
compile_block.start_code = codestart;
4877
compile_block.start_pattern = (const uschar *)pattern;
4878
compile_block.req_varyopt = 0;
4879
compile_block.nopartial = FALSE;
4881
/* Set up a starting, non-extracting bracket, then compile the expression. On
4882
error, errorcode will be set non-zero, so we don't need to look at the result
4883
of the function here. */
4885
ptr = (const uschar *)pattern;
4886
code = (uschar *)codestart;
4889
(void)compile_regex(options, options & PCRE_IMS, &bracount, &code, &ptr,
4890
&errorcode, FALSE, 0, &firstbyte, &reqbyte, NULL, &compile_block);
4891
re->top_bracket = bracount;
4892
re->top_backref = compile_block.top_backref;
4894
if (compile_block.nopartial) re->options |= PCRE_NOPARTIAL;
4896
/* If not reached end of pattern on success, there's an excess bracket. */
4898
if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
4900
/* Fill in the terminating state and check for disastrous overflow, but
4901
if debugging, leave the test till after things are printed out. */
4906
if (code - codestart > length) errorcode = ERR23;
4909
/* Give an error if there's back reference to a non-existent capturing
4912
if (re->top_backref > re->top_bracket) errorcode = ERR15;
4914
/* Failed to compile, or error while post-processing */
4920
*erroroffset = ptr - (const uschar *)pattern;
4921
PCRE_EARLY_ERROR_RETURN:
4922
*errorptr = error_texts[errorcode];
4923
if (errorcodeptr != NULL) *errorcodeptr = errorcode;
4927
/* If the anchored option was not passed, set the flag if we can determine that
4928
the pattern is anchored by virtue of ^ characters or \A or anything else (such
4929
as starting with .* when DOTALL is set).
4931
Otherwise, if we know what the first character has to be, save it, because that
4932
speeds up unanchored matches no end. If not, see if we can set the
4933
PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
4934
start with ^, and also when all branches start with .* for non-DOTALL matches.
4937
if ((options & PCRE_ANCHORED) == 0)
4939
int temp_options = options;
4940
if (is_anchored(codestart, &temp_options, 0, compile_block.backref_map))
4941
re->options |= PCRE_ANCHORED;
4945
firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
4946
if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */
4948
int ch = firstbyte & 255;
4949
re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
4950
compile_block.fcc[ch] == ch)? ch : firstbyte;
4951
re->options |= PCRE_FIRSTSET;
4953
else if (is_startline(codestart, 0, compile_block.backref_map))
4954
re->options |= PCRE_STARTLINE;
4958
/* For an anchored pattern, we use the "required byte" only if it follows a
4959
variable length item in the regex. Remove the caseless flag for non-caseable
4963
((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
4965
int ch = reqbyte & 255;
4966
re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
4967
compile_block.fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
4968
re->options |= PCRE_REQCHSET;
4971
/* Print out the compiled data for debugging */
4975
printf("Length = %d top_bracket = %d top_backref = %d\n",
4976
length, re->top_bracket, re->top_backref);
4978
if (re->options != 0)
4980
printf("%s%s%s%s%s%s%s%s%s%s\n",
4981
((re->options & PCRE_NOPARTIAL) != 0)? "nopartial " : "",
4982
((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
4983
((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
4984
((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "",
4985
((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
4986
((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
4987
((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
4988
((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
4989
((re->options & PCRE_EXTRA) != 0)? "extra " : "",
4990
((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
4993
if ((re->options & PCRE_FIRSTSET) != 0)
4995
int ch = re->first_byte & 255;
4996
const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)? "" : " (caseless)";
4997
if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
4998
else printf("First char = \\x%02x%s\n", ch, caseless);
5001
if ((re->options & PCRE_REQCHSET) != 0)
5003
int ch = re->req_byte & 255;
5004
const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)? "" : " (caseless)";
5005
if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
5006
else printf("Req char = \\x%02x%s\n", ch, caseless);
5009
_pcre_printint(re, stdout);
5011
/* This check is done here in the debugging case so that the code that
5012
was compiled can be seen. */
5014
if (code - codestart > length)
5017
*errorptr = error_texts[ERR23];
5018
*erroroffset = ptr - (uschar *)pattern;
5019
if (errorcodeptr != NULL) *errorcodeptr = ERR23;
5027
/* End of pcre_compile.c */