1
/*************************************************
2
* Perl-Compatible Regular Expressions *
3
*************************************************/
6
This is a library of functions to support regular expressions whose syntax
7
and semantics are as close as possible to those of the Perl 5 language. See
8
the file Tech.Notes for some information on the internals.
10
Written by: Philip Hazel <ph10@cam.ac.uk>
12
Copyright (c) 1997-2004 University of Cambridge
14
-----------------------------------------------------------------------------
15
Redistribution and use in source and binary forms, with or without
16
modification, are permitted provided that the following conditions are met:
18
* Redistributions of source code must retain the above copyright notice,
19
this list of conditions and the following disclaimer.
21
* Redistributions in binary form must reproduce the above copyright
22
notice, this list of conditions and the following disclaimer in the
23
documentation and/or other materials provided with the distribution.
25
* Neither the name of the University of Cambridge nor the names of its
26
contributors may be used to endorse or promote products derived from
27
this software without specific prior written permission.
29
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
30
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
31
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
32
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
33
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
34
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
35
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
36
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
37
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
38
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
39
POSSIBILITY OF SUCH DAMAGE.
40
-----------------------------------------------------------------------------
44
/* Define DEBUG to get debugging output on stdout. */
47
/* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef
48
inline, and there are *still* stupid compilers about that don't like indented
49
pre-processor statements. I suppose it's only been 10 years... */
52
#define DPRINTF(p) printf p
54
#define DPRINTF(p) /*nothing*/
57
/* Include the internals header, which itself includes "config.h", the Standard
58
C headers, and the external pcre header. */
62
/* If Unicode Property support is wanted, include a private copy of the
63
function that does it, and the table that translates names to numbers. */
67
#include "ucptypetable.c"
70
/* Maximum number of items on the nested bracket stacks at compile time. This
71
applies to the nesting of all kinds of parentheses. It does not limit
72
un-nested, non-capturing parentheses. This number can be made bigger if
73
necessary - it is used to dimension one int and one unsigned char vector at compile time. */
76
#define BRASTACK_SIZE 200
79
/* Maximum number of ints of offset to save on the stack for recursive calls.
80
If the offset vector is bigger, malloc is used. This should be a multiple of 3,
81
because the offset vector is always a multiple of 3 long. */
83
#define REC_STACK_SAVE_MAX 30
86
/* The maximum remaining length of subject we are prepared to search for a req_byte match. */
89
#define REQ_BYTE_MAX 1000
92
/* Table of sizes for the fixed-length opcodes. It's defined in a macro so that
93
the definition is next to the definition of the opcodes in internal.h. */
95
static const uschar OP_lengths[] = { OP_LENGTHS };
97
/* Min and max values for the common repeats; for the maxima, 0 => infinity */
99
static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
100
static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
102
/* Table for handling escaped characters in the range '0'-'z'. Positive returns
103
are simple data values; negative values are for special things like \d and so
104
on. Zero means further processing is needed (for things like \x), or the escape is invalid. */
107
#if !EBCDIC /* This is the "normal" table for ASCII systems */
108
static const short int escapes[] = {
109
0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
110
0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
111
'@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
112
0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
113
-ESC_P, -ESC_Q, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
114
-ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
115
'`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
116
0, 0, 0, 0, 0, 0, ESC_n, 0, /* h - o */
117
-ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, 0, -ESC_w, /* p - w */
118
0, 0, -ESC_z /* x - z */
121
#else /* This is the "abnormal" table for EBCDIC systems */
122
static const short int escapes[] = {
123
/* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
124
/* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
125
/* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
126
/* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
127
/* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
128
/* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
129
/* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
130
/* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
131
/* 88 */ 0, 0, 0, '{', 0, 0, 0, 0,
132
/* 90 */ 0, 0, 0, 'l', 0, ESC_n, 0, -ESC_p,
133
/* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
134
/* A0 */ 0, '~', -ESC_s, ESC_tee, 0, 0, -ESC_w, 0,
135
/* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
136
/* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
137
/* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
138
/* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
139
/* C8 */ 0, 0, 0, 0, 0, 0, 0, 0,
140
/* D0 */ '}', 0, 0, 0, 0, 0, 0, -ESC_P,
141
/* D8 */-ESC_Q, 0, 0, 0, 0, 0, 0, 0,
142
/* E0 */ '\\', 0, -ESC_S, 0, 0, 0, -ESC_W, -ESC_X,
143
/* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
144
/* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
145
/* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
150
/* Tables of names of POSIX character classes and their lengths. The list is
151
terminated by a zero length entry. The first three must be alpha, lower, upper,
152
as this is assumed for handling case independence. */
154
/* posix_names[] and posix_name_lengths[] are parallel: entry i of the length
table is the length of entry i of the name table. The length table has one
extra trailing entry, 0, which marks the end of the list. */
static const char *const posix_names[] = {
155
"alpha", "lower", "upper",
156
"alnum", "ascii", "blank", "cntrl", "digit", "graph",
157
"print", "punct", "space", "word", "xdigit" };
159
static const uschar posix_name_lengths[] = {
160
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 }; /* final 0 = terminator */
162
/* Table of class bit maps for each POSIX class; up to three may be combined
163
to form the class. The table for [:blank:] is dynamically modified to remove
164
the vertical space characters. */
166
static const int posix_class_maps[] = {
167
cbit_lower, cbit_upper, -1, /* alpha */
168
cbit_lower, -1, -1, /* lower */
169
cbit_upper, -1, -1, /* upper */
170
cbit_digit, cbit_lower, cbit_upper, /* alnum */
171
cbit_print, cbit_cntrl, -1, /* ascii */
172
cbit_space, -1, -1, /* blank - a GNU extension */
173
cbit_cntrl, -1, -1, /* cntrl */
174
cbit_digit, -1, -1, /* digit */
175
cbit_graph, -1, -1, /* graph */
176
cbit_print, -1, -1, /* print */
177
cbit_punct, -1, -1, /* punct */
178
cbit_space, -1, -1, /* space */
179
cbit_word, -1, -1, /* word - a Perl extension */
180
cbit_xdigit,-1, -1 /* xdigit */
183
/* Table to identify digits and hex digits. This is used when compiling
184
patterns. Note that the tables in chartables are dependent on the locale, and
185
may mark arbitrary characters as digits - but the PCRE compiling code expects
186
to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
187
a private table here. It costs 256 bytes, but it is a lot faster than doing
188
character value tests (at least in some simple cases I timed), and in some
189
applications one wants PCRE to compile efficiently as well as match efficiently.
192
For convenience, we use the same bit definitions as in chartables:
195
0x04 decimal digit
0x08 hexadecimal digit
197
Then we can use ctype_digit and ctype_xdigit in the code. */
199
#if !EBCDIC /* This is the "normal" case, for ASCII systems */
200
static const unsigned char digitab[] =
202
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
203
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
204
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
205
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
206
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
207
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
208
0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
209
0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
210
0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
211
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
212
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
213
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
214
0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
215
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
216
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
217
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
218
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
219
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
220
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
221
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
222
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
223
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
224
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
225
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
226
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
227
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
228
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
229
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
230
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
231
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
232
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
233
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
235
#else /* This is the "abnormal" case, for EBCDIC systems */
236
/* EBCDIC version of the private digit table. Entries 0x81-0x86 (a-f) and
0xC1-0xC6 (A-F) carry 0x08 only; entries 0xF0-0xF9 (0-9) carry 0x0c. All other
entries are zero. */
static const unsigned char digitab[] =
238
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
239
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
240
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
241
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
242
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
243
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
244
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
245
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
246
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
247
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
248
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
249
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
250
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
251
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
252
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
253
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
254
0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
255
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
256
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
257
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
258
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
259
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
260
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
261
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
262
0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
263
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
264
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
265
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
266
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
267
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
268
0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
269
0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
271
/* Private partial copy of the character-type table for EBCDIC builds.
check_escape() masks an entry with 0x0E to decide whether the character after
a backslash is alphameric: in this table letters carry 0x12 or 0x1a and digits
carry 0x1c, so all of them give a non-zero result. */
static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
272
0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
273
0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
274
0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
275
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
276
0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
277
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
278
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
279
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
280
0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
281
0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
282
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
283
0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
284
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
285
0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
286
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
287
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
288
0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
289
0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
290
0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
291
0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
292
0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
293
0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
294
0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
295
0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
296
0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
297
0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
298
0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
299
0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
300
0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
301
0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
302
0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
303
0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
307
/* Definition to allow mutual recursion */
310
compile_regex(int, int, int *, uschar **, const uschar **, const char **,
311
BOOL, int, int *, int *, branch_chain *, compile_data *);
313
/* Structure for building a chain of data that actually lives on the
314
stack, for holding the values of the subject pointer at the start of each
315
subpattern, so as to detect when an empty string has been matched by a
316
subpattern - to break infinite loops. When NO_RECURSE is set, these blocks
317
are on the heap, not on the stack. */
319
typedef struct eptrblock {
320
struct eptrblock *epb_prev;
321
const uschar *epb_saved_eptr;
324
/* Flag bits for the match() function */
326
#define match_condassert 0x01 /* Called to check a condition assertion */
327
#define match_isgroup 0x02 /* Set if start of bracketed group */
329
/* Non-error returns from the match() function. Error returns are externally
330
defined PCRE_ERROR_xxx codes, which are all negative. */
332
#define MATCH_MATCH 1
333
#define MATCH_NOMATCH 0
337
/*************************************************
339
*************************************************/
341
/* PCRE is thread-clean and doesn't use any global variables in the normal
342
sense. However, it calls memory allocation and free functions via the four
343
indirections below, and it can optionally do callouts. These values can be
344
changed by the caller, but are shared between all threads. However, when
345
compiling for Virtual Pascal, things are done differently (see pcre.in). */
349
extern "C" void *(*pcre_malloc)(size_t) = malloc;
350
extern "C" void (*pcre_free)(void *) = free;
351
extern "C" void *(*pcre_stack_malloc)(size_t) = malloc;
352
extern "C" void (*pcre_stack_free)(void *) = free;
353
extern "C" int (*pcre_callout)(pcre_callout_block *) = NULL;
355
void *(*pcre_malloc)(size_t) = malloc;
356
void (*pcre_free)(void *) = free;
357
void *(*pcre_stack_malloc)(size_t) = malloc;
358
void (*pcre_stack_free)(void *) = free;
359
int (*pcre_callout)(pcre_callout_block *) = NULL;
364
/*************************************************
365
* Macros and tables for character handling *
366
*************************************************/
368
/* When UTF-8 encoding is being used, a character is no longer just a single
369
byte. The macros for character handling generate simple sequences when used in
370
byte-mode, and more complicated ones for UTF-8 characters. */
373
#define GETCHAR(c, eptr) c = *eptr;
374
#define GETCHARINC(c, eptr) c = *eptr++;
375
#define GETCHARINCTEST(c, eptr) c = *eptr++;
376
#define GETCHARLEN(c, eptr, len) c = *eptr;
377
#define BACKCHAR(eptr)
379
#else /* SUPPORT_UTF8 */
381
/* Get the next UTF-8 character, not advancing the pointer. This is called when
382
we know we are in UTF-8 mode. */
384
#define GETCHAR(c, eptr) \
386
if ((c & 0xc0) == 0xc0) \
389
int gcaa = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
391
c = (c & utf8_table3[gcaa]) << gcss; \
392
for (gcii = 1; gcii <= gcaa; gcii++) \
395
c |= (eptr[gcii] & 0x3f) << gcss; \
399
/* Get the next UTF-8 character, advancing the pointer. This is called when we
400
know we are in UTF-8 mode. */
402
#define GETCHARINC(c, eptr) \
404
if ((c & 0xc0) == 0xc0) \
406
int gcaa = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
408
c = (c & utf8_table3[gcaa]) << gcss; \
412
c |= (*eptr++ & 0x3f) << gcss; \
416
/* Get the next character, testing for UTF-8 mode, and advancing the pointer */
418
#define GETCHARINCTEST(c, eptr) \
420
if (md->utf8 && (c & 0xc0) == 0xc0) \
422
int gcaa = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
424
c = (c & utf8_table3[gcaa]) << gcss; \
428
c |= (*eptr++ & 0x3f) << gcss; \
432
/* Get the next UTF-8 character, not advancing the pointer, incrementing length
433
if there are extra bytes. This is called when we know we are in UTF-8 mode. */
435
#define GETCHARLEN(c, eptr, len) \
437
if ((c & 0xc0) == 0xc0) \
440
int gcaa = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
442
c = (c & utf8_table3[gcaa]) << gcss; \
443
for (gcii = 1; gcii <= gcaa; gcii++) \
446
c |= (eptr[gcii] & 0x3f) << gcss; \
451
/* If the pointer is not at the start of a character, move it back until
452
it is. Called only in UTF-8 mode. */
454
#define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr--;
460
/*************************************************
461
* Default character tables *
462
*************************************************/
464
/* A default set of character tables is included in the PCRE binary. Its source
465
is built by the maketables auxiliary program, which uses the default C ctypes
466
functions, and put in the file chartables.c. These tables are used by PCRE
467
whenever the caller of pcre_compile() does not provide an alternate set of tables. */
470
#include "chartables.c"
475
/*************************************************
476
* Tables for UTF-8 support *
477
*************************************************/
479
/* These are the breakpoints for different numbers of bytes in a UTF-8 character. */
482
/* utf8_table1[i] is the largest character value that can be encoded with i
additional bytes (i+1 bytes in total). ord2utf8() scans this table for the
first entry that is >= the value being encoded. */
static const int utf8_table1[] =
483
{ 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff};
485
/* These are the indicator bits and the mask for the data bits to set in the
486
first byte of a character, indexed by the number of additional bytes. */
488
static const int utf8_table2[] = { 0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
489
static const int utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
491
/* Table of the number of extra characters, indexed by the first character
492
masked with 0x3f. The highest number for a valid UTF-8 character is in fact 0x3d. */
495
/* 64 entries, indexed by (first byte & 0x3f) as done in the GETCHAR macros,
which consult the table only when the top two bits of the first byte are set.
Each entry is the number of additional bytes that follow the first byte. */
static const uschar utf8_table4[] = {
496
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* index 0x00-0x0f: first byte 0xc0-0xcf */
497
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* index 0x10-0x1f: first byte 0xd0-0xdf */
498
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* index 0x20-0x2f: first byte 0xe0-0xef */
499
3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 }; /* index 0x30-0x3f: first byte 0xf0-0xff */
502
/*************************************************
503
* Convert character value to UTF-8 *
504
*************************************************/
506
/* This function takes an integer value in the range 0 - 0x7fffffff
507
and encodes it as a UTF-8 character in 1 to 6 bytes.
510
cvalue the character value
511
buffer pointer to buffer for result - at least 6 bytes long
513
Returns: number of bytes placed in the buffer
517
ord2utf8(int cvalue, uschar *buffer)
520
for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
521
if (cvalue <= utf8_table1[i]) break;
523
for (j = i; j > 0; j--)
525
*buffer-- = 0x80 | (cvalue & 0x3f);
528
*buffer = utf8_table2[i] | cvalue;
535
/*************************************************
536
* Print compiled regex *
537
*************************************************/
539
/* The code for doing this is held in a separate file that is also included in
540
pcretest.c. It defines a function called print_internals(). */
543
#include "printint.c"
548
/*************************************************
549
* Return version string *
550
*************************************************/
552
#define STRING(a) # a
553
#define XSTRING(s) STRING(s)
558
return XSTRING(PCRE_MAJOR) "." XSTRING(PCRE_MINOR) " " XSTRING(PCRE_DATE);
564
/*************************************************
565
* Flip bytes in an integer *
566
*************************************************/
568
/* This function is called when the magic number in a regex doesn't match in
569
order to flip its bytes to see if we are dealing with a pattern that was
570
compiled on a host of different endianness. If so, this function is used to
571
flip other byte values.
574
value the number to flip
575
(the width is fixed by the function used: byteflip2() takes a pcre_uint16, byteflip4() takes a pcre_uint32)
577
Returns: the flipped value
581
byteflip2(pcre_uint16 value)
583
return ((value & 0x00ff) << 8) |
584
((value & 0xff00) >> 8);
588
byteflip4(pcre_uint32 value)
590
return ((value & 0x000000ff) << 24) |
591
((value & 0x0000ff00) << 8) |
592
((value & 0x00ff0000) >> 8) |
593
((value & 0xff000000) >> 24);
596
/*************************************************
597
* Test for a byte-flipped compiled regex *
598
*************************************************/
600
/* This function is called from pcre_exec() and also from pcre_fullinfo(). Its
601
job is to test whether the regex is byte-flipped - that is, it was compiled on
602
a system of opposite endianness. The function is called only when the native
603
MAGIC_NUMBER test fails. If the regex is indeed flipped, we flip all the
604
relevant values into a different data block, and return it.
607
re points to the regex
608
internal_re points to a new regex block
609
study points to study data, or NULL
610
internal_study points to a new study block
612
Returns: the new block if it is indeed a byte-flipped regex
617
try_flipped(const real_pcre *re, real_pcre *internal_re,
618
const pcre_study_data *study, pcre_study_data *internal_study)
620
if (byteflip4(re->magic_number) != MAGIC_NUMBER)
623
*internal_re = *re; /* To copy other fields */
624
internal_re->size = byteflip4(re->size);
625
internal_re->options = byteflip4(re->options);
626
internal_re->top_bracket = byteflip2(re->top_bracket);
627
internal_re->top_backref = byteflip2(re->top_backref);
628
internal_re->first_byte = byteflip2(re->first_byte);
629
internal_re->req_byte = byteflip2(re->req_byte);
630
internal_re->name_table_offset = byteflip2(re->name_table_offset);
631
internal_re->name_entry_size = byteflip2(re->name_entry_size);
632
internal_re->name_count = byteflip2(re->name_count);
636
*internal_study = *study; /* To copy other fields */
637
internal_study->size = byteflip4(study->size);
638
internal_study->options = byteflip4(study->options);
646
/*************************************************
647
* (Obsolete) Return info about compiled pattern *
648
*************************************************/
650
/* This is the original "info" function. It picks potentially useful data out
651
of the private structure, but its interface was too rigid. It remains for
652
backwards compatibility. The public options are passed back in an int - though
653
the re->options field has been expanded to a long int, all the public options
654
are at the low end of it, and so even on 16-bit systems this will still be OK.
655
Therefore, I haven't changed the API for pcre_info().
658
argument_re points to compiled code
659
optptr where to pass back the options
660
first_byte where to pass back the first character,
661
or -1 if multiline and all branches start ^,
664
Returns: number of capturing subpatterns
665
or negative values on error
669
pcre_info(const pcre *argument_re, int *optptr, int *first_byte)
671
real_pcre internal_re;
672
const real_pcre *re = (const real_pcre *)argument_re;
673
if (re == NULL) return PCRE_ERROR_NULL;
674
if (re->magic_number != MAGIC_NUMBER)
676
re = try_flipped(re, &internal_re, NULL, NULL);
677
if (re == NULL) return PCRE_ERROR_BADMAGIC;
679
if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_OPTIONS);
680
if (first_byte != NULL)
681
*first_byte = ((re->options & PCRE_FIRSTSET) != 0)? re->first_byte :
682
((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
683
return re->top_bracket;
688
/*************************************************
689
* Return info about compiled pattern *
690
*************************************************/
692
/* This is a newer "info" function which has an extensible interface so
693
that additional items can be added compatibly.
696
argument_re points to compiled code
697
extra_data points to extra data, or NULL
698
what what information is required
699
where where to put the information
701
Returns: 0 if data returned, negative on error
705
pcre_fullinfo(const pcre *argument_re, const pcre_extra *extra_data, int what,
708
real_pcre internal_re;
709
pcre_study_data internal_study;
710
const real_pcre *re = (const real_pcre *)argument_re;
711
const pcre_study_data *study = NULL;
713
if (re == NULL || where == NULL) return PCRE_ERROR_NULL;
715
if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_STUDY_DATA) != 0)
716
study = (const pcre_study_data *)extra_data->study_data;
718
if (re->magic_number != MAGIC_NUMBER)
720
re = try_flipped(re, &internal_re, study, &internal_study);
721
if (re == NULL) return PCRE_ERROR_BADMAGIC;
722
if (study != NULL) study = &internal_study;
727
case PCRE_INFO_OPTIONS:
728
*((unsigned long int *)where) = re->options & PUBLIC_OPTIONS;
732
*((size_t *)where) = re->size;
735
case PCRE_INFO_STUDYSIZE:
736
*((size_t *)where) = (study == NULL)? 0 : study->size;
739
case PCRE_INFO_CAPTURECOUNT:
740
*((int *)where) = re->top_bracket;
743
case PCRE_INFO_BACKREFMAX:
744
*((int *)where) = re->top_backref;
747
case PCRE_INFO_FIRSTBYTE:
749
((re->options & PCRE_FIRSTSET) != 0)? re->first_byte :
750
((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
753
/* Make sure we pass back the pointer to the bit vector in the external
754
block, not the internal copy (with flipped integer fields). */
756
case PCRE_INFO_FIRSTTABLE:
757
*((const uschar **)where) =
758
(study != NULL && (study->options & PCRE_STUDY_MAPPED) != 0)?
759
((const pcre_study_data *)extra_data->study_data)->start_bits : NULL;
762
case PCRE_INFO_LASTLITERAL:
764
((re->options & PCRE_REQCHSET) != 0)? re->req_byte : -1;
767
case PCRE_INFO_NAMEENTRYSIZE:
768
*((int *)where) = re->name_entry_size;
771
case PCRE_INFO_NAMECOUNT:
772
*((int *)where) = re->name_count;
775
case PCRE_INFO_NAMETABLE:
776
*((const uschar **)where) = (const uschar *)re + re->name_table_offset;
779
case PCRE_INFO_DEFAULT_TABLES:
780
*((const uschar **)where) = (const uschar *)pcre_default_tables;
783
default: return PCRE_ERROR_BADOPTION;
791
/*************************************************
792
* Return info about what features are configured *
793
*************************************************/
795
/* This is a function which has an extensible interface so that additional items
796
can be added compatibly.
799
what what information is required
800
where where to put the information
802
Returns: 0 if data returned, negative on error
806
pcre_config(int what, void *where)
810
case PCRE_CONFIG_UTF8:
818
case PCRE_CONFIG_UNICODE_PROPERTIES:
826
case PCRE_CONFIG_NEWLINE:
827
*((int *)where) = NEWLINE;
830
case PCRE_CONFIG_LINK_SIZE:
831
*((int *)where) = LINK_SIZE;
834
case PCRE_CONFIG_POSIX_MALLOC_THRESHOLD:
835
*((int *)where) = POSIX_MALLOC_THRESHOLD;
838
case PCRE_CONFIG_MATCH_LIMIT:
839
*((unsigned int *)where) = MATCH_LIMIT;
842
case PCRE_CONFIG_STACKRECURSE:
850
default: return PCRE_ERROR_BADOPTION;
859
/*************************************************
860
* Debugging function to print chars *
861
*************************************************/
863
/* Print a sequence of chars in printable format, stopping at the end of the
864
subject if requested.
867
p points to characters
868
length number to print
869
is_subject TRUE if printing from within md->start_subject
870
md pointer to matching data block, if is_subject is TRUE
876
pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
879
if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
881
if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
888
/*************************************************
890
*************************************************/
892
/* This function is called when a \ has been encountered. It either returns a
893
positive value for a simple escape such as \n, or a negative value which
894
encodes one of the more complicated things such as \d. When UTF-8 is enabled,
895
a positive value greater than 255 may be returned. On entry, ptr is pointing at
896
the \. On exit, it is on the final character of the escape sequence.
899
ptrptr points to the pattern position pointer
900
errorptr points to the pointer to the error message
901
bracount number of previous extracting brackets
902
options the options bits
903
isclass TRUE if inside a character class
905
Returns: zero or positive => a data character
906
negative => a special escape sequence
907
on error, errorptr is set
911
check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
912
int options, BOOL isclass)
914
const uschar *ptr = *ptrptr;
917
/* If backslash is at the end of the pattern, it's an error. */
920
if (c == 0) *errorptr = ERR1;
922
/* Non-alphamerics are literals. For digits or letters, do an initial lookup in
923
a table. A non-zero result is something that can be returned immediately.
924
Otherwise further processing may be required. */
926
#if !EBCDIC /* ASCII coding */
927
else if (c < '0' || c > 'z') {} /* Not alphameric */
928
else if ((i = escapes[c - '0']) != 0) c = i;
930
#else /* EBCDIC coding */
931
else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */
932
else if ((i = escapes[c - 0x48]) != 0) c = i;
935
/* Escapes that need further processing, or are illegal. */
939
const uschar *oldptr;
942
/* A number of Perl escapes are not handled by PCRE. We give an explicit
953
/* The handling of escape sequences consisting of a string of digits
954
starting with one that is not zero is not straightforward. By experiment,
955
the way Perl works seems to be as follows:
957
Outside a character class, the digits are read as a decimal number. If the
958
number is less than 10, or if there are that many previous extracting
959
left brackets, then it is a back reference. Otherwise, up to three octal
960
digits are read to form an escaped byte. Thus \123 is likely to be octal
961
123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
962
value is greater than 377, the least significant 8 bits are taken. Inside a
963
character class, \ followed by a digit is always an octal number. */
965
case '1': case '2': case '3': case '4': case '5':
966
case '6': case '7': case '8': case '9':
972
while ((digitab[ptr[1]] & ctype_digit) != 0)
973
c = c * 10 + *(++ptr) - '0';
974
if (c < 10 || c <= bracount)
979
ptr = oldptr; /* Put the pointer back and fall through */
982
/* Handle an octal number following \. If the first digit is 8 or 9, Perl
983
generates a binary zero byte and treats the digit as a following literal.
984
Thus we have to pull back the pointer by one. */
986
if ((c = *ptr) >= '8')
993
/* \0 always starts an octal number, but we may drop through to here with a
994
larger first octal digit. */
998
while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
999
c = c * 8 + *(++ptr) - '0';
1000
c &= 255; /* Take least significant 8 bits */
1003
/* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number
1004
which can be greater than 0xff, but only if the ddd are hex digits. */
1008
if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)
1010
const uschar *pt = ptr + 2;
1011
register int count = 0;
1013
while ((digitab[*pt] & ctype_xdigit) != 0)
1017
#if !EBCDIC /* ASCII coding */
1018
if (cc >= 'a') cc -= 32; /* Convert to upper case */
1019
c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
1020
#else /* EBCDIC coding */
1021
if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
1022
c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
1027
if (c < 0 || count > 8) *errorptr = ERR34;
1031
/* If the sequence of hex digits does not end with '}', then we don't
1032
recognize this construct; fall through to the normal \x handling. */
1036
/* Read just a single-byte hex-defined char (at most two hex digits) */
1039
while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
1041
int cc; /* Some compilers don't like ++ */
1042
cc = *(++ptr); /* in initializers */
1043
#if !EBCDIC /* ASCII coding */
1044
if (cc >= 'a') cc -= 32; /* Convert to upper case */
1045
c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
1046
#else /* EBCDIC coding */
1047
if (cc <= 'z') cc += 64; /* Convert to upper case */
1048
c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
1053
/* Other special escapes not starting with a digit are straightforward */
1063
/* A letter is upper-cased; then the 0x40 bit is flipped. This coding
1064
is ASCII-specific, but then the whole concept of \cx is ASCII-specific.
1065
(However, an EBCDIC equivalent has now been added.) */
1067
#if !EBCDIC /* ASCII coding */
1068
if (c >= 'a' && c <= 'z') c -= 32;
1070
#else /* EBCDIC coding */
1071
if (c >= 'a' && c <= 'z') c += 64;
1076
/* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
1077
other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
1078
for Perl compatibility, it is a literal. This code looks a bit odd, but
1079
there used to be some cases other than the default, and there may be again
1080
in future, so I haven't "optimized" it. */
1083
if ((options & PCRE_EXTRA) != 0) switch(c)
1100
/*************************************************
1101
* Handle \P and \p *
1102
*************************************************/
1104
/* This function is called after \P or \p has been encountered, provided that
1105
PCRE is compiled with support for Unicode properties. On entry, ptrptr is
1106
pointing at the P or p. On exit, it is pointing at the final character of the
1110
ptrptr points to the pattern position pointer
1111
negptr points to a boolean that is set TRUE for negation else FALSE
1112
errorptr points to the pointer to the error message
1114
Returns: value from ucp_type_table, or -1 for an invalid type
1118
get_ucp(const uschar **ptrptr, BOOL *negptr, const char **errorptr)
1121
const uschar *ptr = *ptrptr;
1125
if (c == 0) goto ERROR_RETURN;
1129
/* \P or \p can be followed by a one- or two-character name in {}, optionally
1130
preceded by ^ for negation. */
1139
for (i = 0; i <= 2; i++)
1142
if (c == 0) goto ERROR_RETURN;
1143
if (c == '}') break;
1146
if (c !='}') /* Try to distinguish error cases */
1148
while (*(++ptr) != 0 && *ptr != '}');
1149
if (*ptr == '}') goto UNKNOWN_RETURN; else goto ERROR_RETURN;
1154
/* Otherwise there is just one following character */
1164
/* Search for a recognized property name using binary chop */
1167
top = sizeof(utt)/sizeof(ucp_type_table);
1172
c = strcmp(name, utt[i].name);
1173
if (c == 0) return utt[i].value;
1174
if (c > 0) bot = i + 1; else top = i;
1192
/*************************************************
1193
* Check for counted repeat *
1194
*************************************************/
1196
/* This function is called when a '{' is encountered in a place where it might
1197
start a quantifier. It looks ahead to see if it really is a quantifier or not.
1198
It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
1199
where the ddds are digits.
1202
p pointer to the first char after '{'
1204
Returns: TRUE or FALSE
1208
is_counted_repeat(const uschar *p)
1210
if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
1211
while ((digitab[*p] & ctype_digit) != 0) p++;
1212
if (*p == '}') return TRUE;
1214
if (*p++ != ',') return FALSE;
1215
if (*p == '}') return TRUE;
1217
if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
1218
while ((digitab[*p] & ctype_digit) != 0) p++;
1225
/*************************************************
1226
* Read repeat counts *
1227
*************************************************/
1229
/* Read an item of the form {n,m} and return the values. This is called only
1230
after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1231
so the syntax is guaranteed to be correct, but we need to check the values.
1234
p pointer to first char after '{'
1235
minp pointer to int for min
1236
maxp pointer to int for max
1237
returned as -1 if no max
1238
errorptr points to pointer to error message
1240
Returns: pointer to '}' on success;
1241
current ptr on error, with errorptr set
1244
static const uschar *
1245
read_repeat_counts(const uschar *p, int *minp, int *maxp, const char **errorptr)
1250
/* Read the minimum value and do a paranoid check: a negative value indicates
1251
an integer overflow. */
1253
while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
1254
if (min < 0 || min > 65535)
1260
/* Read the maximum value if there is one, and again do a paranoid check on its size.
1261
Also, max must not be less than min. */
1263
if (*p == '}') max = min; else
1268
while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
1269
if (max < 0 || max > 65535)
1282
/* Fill in the required variables, and pass back the pointer to the terminating
1292
/*************************************************
1293
* Find first significant op code *
1294
*************************************************/
1296
/* This is called by several functions that scan a compiled expression looking
1297
for a fixed first character, or an anchoring op code etc. It skips over things
1298
that do not influence this. For some calls, a change of option is important.
1299
For some calls, it makes sense to skip negative forward and all backward
1300
assertions, and also the \b assertion; for others it does not.
1303
code pointer to the start of the group
1304
options pointer to external options
1305
optbit the option bit whose changing is significant, or
1307
skipassert TRUE if certain assertions are to be skipped
1309
Returns: pointer to the first significant opcode
1312
static const uschar*
1313
first_significant_code(const uschar *code, int *options, int optbit,
1321
if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1322
*options = (int)code[1];
1328
case OP_ASSERTBACK_NOT:
1329
if (!skipassert) return code;
1330
do code += GET(code, 1); while (*code == OP_ALT);
1331
code += OP_lengths[*code];
1334
case OP_WORD_BOUNDARY:
1335
case OP_NOT_WORD_BOUNDARY:
1336
if (!skipassert) return code;
1342
code += OP_lengths[*code];
1349
/* Control never reaches here */
1355
/*************************************************
1356
* Find the fixed length of a pattern *
1357
*************************************************/
1359
/* Scan a pattern and compute the fixed length of subject that will match it,
1360
if the length is fixed. This is needed for dealing with backward assertions.
1361
In UTF8 mode, the result is in characters rather than bytes.
1364
code points to the start of the pattern (the bracket)
1365
options the compiling options
1367
Returns: the fixed length, or -1 if there is no fixed length,
1368
or -2 if \C was encountered
1372
find_fixedlength(uschar *code, int options)
1376
register int branchlength = 0;
1377
register uschar *cc = code + 1 + LINK_SIZE;
1379
/* Scan along the opcodes for this branch. If we get to the end of the
1380
branch, check the length against that of the other branches. */
1385
register int op = *cc;
1386
if (op >= OP_BRA) op = OP_BRA;
1393
d = find_fixedlength(cc, options);
1394
if (d < 0) return d;
1396
do cc += GET(cc, 1); while (*cc == OP_ALT);
1397
cc += 1 + LINK_SIZE;
1400
/* Reached end of a branch; if it's a ket it is the end of a nested
1401
call. If it's ALT it is an alternation in a nested call. If it is
1402
END it's the end of the outer call. All can be handled by the same code. */
1409
if (length < 0) length = branchlength;
1410
else if (length != branchlength) return -1;
1411
if (*cc != OP_ALT) return length;
1412
cc += 1 + LINK_SIZE;
1416
/* Skip over assertive subpatterns */
1421
case OP_ASSERTBACK_NOT:
1422
do cc += GET(cc, 1); while (*cc == OP_ALT);
1425
/* Skip over things that don't match chars */
1438
case OP_NOT_WORD_BOUNDARY:
1439
case OP_WORD_BOUNDARY:
1440
cc += OP_lengths[*cc];
1443
/* Handle literal characters */
1450
if ((options & PCRE_UTF8) != 0)
1452
while ((*cc & 0xc0) == 0x80) cc++;
1457
/* Handle exact repetitions. The count is already in characters, but we
1458
need to skip over a multibyte character in UTF8 mode. */
1461
branchlength += GET2(cc,1);
1464
if ((options & PCRE_UTF8) != 0)
1466
while((*cc & 0x80) == 0x80) cc++;
1472
branchlength += GET2(cc,1);
1476
/* Handle single-char matchers */
1485
case OP_NOT_WHITESPACE:
1487
case OP_NOT_WORDCHAR:
1494
/* The single-byte matcher isn't allowed */
1499
/* Check a class for variable quantification */
1503
cc += GET(cc, 1) - 33;
1521
if (GET2(cc,1) != GET2(cc,3)) return -1;
1522
branchlength += GET2(cc,1);
1531
/* Anything else is variable length */
1537
/* Control never gets here */
1543
/*************************************************
1544
* Scan compiled regex for numbered bracket *
1545
*************************************************/
1547
/* This little function scans through a compiled pattern until it finds a
1548
capturing bracket with the given number.
1551
code points to start of expression
1552
utf8 TRUE in UTF-8 mode
1553
number the required bracket number
1555
Returns: pointer to the opcode for the bracket, or NULL if not found
1558
static const uschar *
1559
find_bracket(const uschar *code, BOOL utf8, int number)
1561
#ifndef SUPPORT_UTF8
1562
utf8 = utf8; /* Stop pedantic compilers complaining */
1567
register int c = *code;
1568
if (c == OP_END) return NULL;
1569
else if (c > OP_BRA)
1572
if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE);
1573
if (n == number) return (uschar *)code;
1574
code += OP_lengths[OP_BRA];
1578
code += OP_lengths[c];
1582
/* In UTF-8 mode, opcodes that are followed by a character may be followed
1583
by a multi-byte character. The length in the table is a minimum, so we have
1584
to scan along to skip the extra bytes. All opcodes are less than 128, so we
1585
can use relatively efficient code. */
1600
while ((*code & 0xc0) == 0x80) code++;
1603
/* XCLASS is used for classes that cannot be represented just by a bit
1604
map. This includes negated single high-valued characters. The length in
1605
the table is zero; the actual length is stored in the compiled code. */
1608
code += GET(code, 1) + 1;
1618
/*************************************************
1619
* Scan compiled regex for recursion reference *
1620
*************************************************/
1622
/* This little function scans through a compiled pattern until it finds an
1623
instance of OP_RECURSE.
1626
code points to start of expression
1627
utf8 TRUE in UTF-8 mode
1629
Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1632
static const uschar *
1633
find_recurse(const uschar *code, BOOL utf8)
1635
#ifndef SUPPORT_UTF8
1636
utf8 = utf8; /* Stop pedantic compilers complaining */
1641
register int c = *code;
1642
if (c == OP_END) return NULL;
1643
else if (c == OP_RECURSE) return code;
1644
else if (c > OP_BRA)
1646
code += OP_lengths[OP_BRA];
1650
code += OP_lengths[c];
1654
/* In UTF-8 mode, opcodes that are followed by a character may be followed
1655
by a multi-byte character. The length in the table is a minimum, so we have
1656
to scan along to skip the extra bytes. All opcodes are less than 128, so we
1657
can use relatively efficient code. */
1672
while ((*code & 0xc0) == 0x80) code++;
1675
/* XCLASS is used for classes that cannot be represented just by a bit
1676
map. This includes negated single high-valued characters. The length in
1677
the table is zero; the actual length is stored in the compiled code. */
1680
code += GET(code, 1) + 1;
1690
/*************************************************
1691
* Scan compiled branch for non-emptiness *
1692
*************************************************/
1694
/* This function scans through a branch of a compiled pattern to see whether it
1695
can match the empty string or not. It is called only from could_be_empty()
1696
below. Note that first_significant_code() skips over assertions. If we hit an
1697
unclosed bracket, we return "empty" - this means we've struck an inner bracket
1698
whose current branch will already have been scanned.
1701
code points to start of search
1702
endcode points to where to stop
1703
utf8 TRUE if in UTF8 mode
1705
Returns: TRUE if what is matched could be empty
1709
could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1712
for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0, TRUE);
1714
code = first_significant_code(code + OP_lengths[c], NULL, 0, TRUE))
1716
const uschar *ccode;
1723
if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1725
/* Scan a closed bracket */
1727
empty_branch = FALSE;
1730
if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1731
empty_branch = TRUE;
1732
code += GET(code, 1);
1734
while (*code == OP_ALT);
1735
if (!empty_branch) return FALSE; /* All branches are non-empty */
1736
code += 1 + LINK_SIZE;
1742
/* Check for quantifiers after a class */
1746
ccode = code + GET(code, 1);
1747
goto CHECK_CLASS_REPEAT;
1760
case OP_CRSTAR: /* These could be empty; continue */
1766
default: /* Non-repeat => class must match */
1767
case OP_CRPLUS: /* These repeats aren't empty */
1773
if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1778
/* Opcodes that must match a character */
1785
case OP_NOT_WHITESPACE:
1787
case OP_NOT_WORDCHAR:
1801
case OP_TYPEMINPLUS:
1813
/* In UTF-8 mode, STAR, MINSTAR, QUERY, MINQUERY, UPTO, and MINUPTO may be
1814
followed by a multibyte character */
1823
if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1834
/*************************************************
1835
* Scan compiled regex for non-emptiness *
1836
*************************************************/
1838
/* This function is called to check for left recursive calls. We want to check
1839
the current branch of the current pattern to see if it could match the empty
1840
string. If it could, we must look outwards for branches at other levels,
1841
stopping when we pass beyond the bracket which is the subject of the recursion.
1844
code points to start of the recursion
1845
endcode points to where to stop (current RECURSE item)
1846
bcptr points to the chain of current (unclosed) branch starts
1847
utf8 TRUE if in UTF-8 mode
1849
Returns: TRUE if what is matched could be empty
1853
could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1856
while (bcptr != NULL && bcptr->current >= code)
1858
if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1859
bcptr = bcptr->outer;
1866
/*************************************************
1867
* Check for POSIX class syntax *
1868
*************************************************/
1870
/* This function is called when the sequence "[:" or "[." or "[=" is
1871
encountered in a character class. It checks whether this is followed by an
1872
optional ^ and then a sequence of letters, terminated by a matching ":]" or
1876
ptr pointer to the initial [
1877
endptr where to return the end pointer
1878
cd pointer to compile data
1880
Returns: TRUE or FALSE
1884
check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1886
int terminator; /* Don't combine these lines; the Solaris cc */
1887
terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1888
if (*(++ptr) == '^') ptr++;
1889
while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1890
if (*ptr == terminator && ptr[1] == ']')
1901
/*************************************************
1902
* Check POSIX class name *
1903
*************************************************/
1905
/* This function is called to check the name given in a POSIX-style class entry
1909
ptr points to the first letter
1910
len the length of the name
1912
Returns: a value representing the name, or -1 if unknown
1916
check_posix_name(const uschar *ptr, int len)
1918
register int yield = 0;
1919
while (posix_name_lengths[yield] != 0)
1921
if (len == posix_name_lengths[yield] &&
1922
strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1929
/*************************************************
1930
* Adjust OP_RECURSE items in repeated group *
1931
*************************************************/
1933
/* OP_RECURSE items contain an offset from the start of the regex to the group
1934
that is referenced. This means that groups can be replicated for fixed
1935
repetition simply by copying (because the recursion is allowed to refer to
1936
earlier groups that are outside the current group). However, when a group is
1937
optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1938
it, after it has been compiled. This means that any OP_RECURSE items within it
1939
that refer to the group itself or any contained groups have to have their
1940
offsets adjusted. That is the job of this function. Before it is called, the
1941
partially compiled regex must be temporarily terminated with OP_END.
1944
group points to the start of the group
1945
adjust the amount by which the group is to be moved
1946
utf8 TRUE in UTF-8 mode
1947
cd contains pointers to tables etc.
1953
adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd)
1955
uschar *ptr = group;
1956
while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1958
int offset = GET(ptr, 1);
1959
if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1960
ptr += 1 + LINK_SIZE;
1966
/*************************************************
1967
* Insert an automatic callout point *
1968
*************************************************/
1970
/* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1971
callout points before each pattern item.
1974
code current code pointer
1975
ptr current pattern pointer
1976
cd pointers to tables etc
1978
Returns: new code pointer
1982
auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1984
*code++ = OP_CALLOUT;
1986
PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1987
PUT(code, LINK_SIZE, 0); /* Default length */
1988
return code + 2*LINK_SIZE;
1993
/*************************************************
1994
* Complete a callout item *
1995
*************************************************/
1997
/* A callout item contains the length of the next item in the pattern, which
1998
we can't fill in till after we have reached the relevant point. This is used
1999
for both automatic and manual callouts.
2002
previous_callout points to previous callout item
2003
ptr current pattern pointer
2004
cd pointers to tables etc
2010
complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
2012
int length = ptr - cd->start_pattern - GET(previous_callout, 2);
2013
PUT(previous_callout, 2 + LINK_SIZE, length);
2019
/*************************************************
2020
* Get othercase range *
2021
*************************************************/
2023
/* This function is passed the start and end of a class range, in UTF-8 mode
2024
with UCP support. It searches up the characters, looking for internal ranges of
2025
characters in the "other" case. Each call returns the next one, updating the
2029
cptr points to starting character value; updated
2031
ocptr where to put start of othercase range
2032
odptr where to put end of othercase range
2034
Yield: TRUE when range returned; FALSE when no more
2038
get_othercase_range(int *cptr, int d, int *ocptr, int *odptr)
2040
int c, chartype, othercase, next;
2042
for (c = *cptr; c <= d; c++)
2044
if (ucp_findchar(c, &chartype, &othercase) == ucp_L && othercase != 0) break;
2047
if (c > d) return FALSE;
2050
next = othercase + 1;
2052
for (++c; c <= d; c++)
2054
if (ucp_findchar(c, &chartype, &othercase) != ucp_L || othercase != next)
2064
#endif /* SUPPORT_UCP */
2067
/*************************************************
2068
* Compile one branch *
2069
*************************************************/
2071
/* Scan the pattern, compiling it into the code vector. If the options are
2072
changed during the branch, the pointer is used to change the external options
2076
optionsptr pointer to the option bits
2077
brackets points to number of extracting brackets used
2078
codeptr points to the pointer to the current code point
2079
ptrptr points to the current pattern pointer
2080
errorptr points to pointer to error message
2081
firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2082
reqbyteptr set to the last literal character required, else < 0
2083
bcptr points to current branch chain
2084
cd contains pointers to tables etc.
2086
Returns: TRUE on success
2087
FALSE, with *errorptr set on error
2091
compile_branch(int *optionsptr, int *brackets, uschar **codeptr,
2092
const uschar **ptrptr, const char **errorptr, int *firstbyteptr,
2093
int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
2095
int repeat_type, op_type;
2096
int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2098
int greedy_default, greedy_non_default;
2099
int firstbyte, reqbyte;
2100
int zeroreqbyte, zerofirstbyte;
2101
int req_caseopt, reqvary, tempreqvary;
2103
int options = *optionsptr;
2104
int after_manual_callout = 0;
2106
register uschar *code = *codeptr;
2108
BOOL inescq = FALSE;
2109
BOOL groupsetfirstbyte = FALSE;
2110
const uschar *ptr = *ptrptr;
2111
const uschar *tempptr;
2112
uschar *previous = NULL;
2113
uschar *previous_callout = NULL;
2114
uschar classbits[32];
2118
BOOL utf8 = (options & PCRE_UTF8) != 0;
2119
uschar *class_utf8data;
2120
uschar utf8_char[6];
2125
/* Set up the default and non-default settings for greediness */
2127
greedy_default = ((options & PCRE_UNGREEDY) != 0);
2128
greedy_non_default = greedy_default ^ 1;
2130
/* Initialize no first byte, no required byte. REQ_UNSET means "no char
2131
matching encountered yet". It gets changed to REQ_NONE if we hit something that
2132
matches a non-fixed first char; reqbyte just remains unset if we never
2135
When we hit a repeat whose minimum is zero, we may have to adjust these values
2136
to take the zero repeat into account. This is implemented by setting them to
2137
zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2138
item types that can be repeated set these backoff variables appropriately. */
2140
firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2142
/* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2143
according to the current setting of the caseless flag. REQ_CASELESS is a bit
2144
value > 255. It is added into the firstbyte or reqbyte variables to record the
2145
case status of the value. This is used only for ASCII characters. */
2147
req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2149
/* Switch on next character until the end of the branch */
2154
BOOL possessive_quantifier;
2156
int class_charcount;
2166
/* Next byte in the pattern */
2170
/* If in \Q...\E, check for the end; if not, we have a literal */
2172
if (inescq && c != 0)
2174
if (c == '\\' && ptr[1] == 'E')
2182
if (previous_callout != NULL)
2184
complete_callout(previous_callout, ptr, cd);
2185
previous_callout = NULL;
2187
if ((options & PCRE_AUTO_CALLOUT) != 0)
2189
previous_callout = code;
2190
code = auto_callout(code, ptr, cd);
2196
/* Fill in length of a previous callout, except when the next thing is
2199
is_quantifier = c == '*' || c == '+' || c == '?' ||
2200
(c == '{' && is_counted_repeat(ptr+1));
2202
if (!is_quantifier && previous_callout != NULL &&
2203
after_manual_callout-- <= 0)
2205
complete_callout(previous_callout, ptr, cd);
2206
previous_callout = NULL;
2209
/* In extended mode, skip white space and comments */
2211
if ((options & PCRE_EXTENDED) != 0)
2213
if ((cd->ctypes[c] & ctype_space) != 0) continue;
2216
/* The space before the ; is to avoid a warning on a silly compiler
2217
on the Macintosh. */
2218
while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
2219
if (c != 0) continue; /* Else fall through to handle end of string */
2223
/* No auto callout for quantifiers. */
2225
if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2227
previous_callout = code;
2228
code = auto_callout(code, ptr, cd);
2233
/* The branch terminates at end of string, |, or ). */
2238
*firstbyteptr = firstbyte;
2239
*reqbyteptr = reqbyte;
2244
/* Handle single-character metacharacters. In multiline mode, ^ disables
2245
the setting of any following char as a first character. */
2248
if ((options & PCRE_MULTILINE) != 0)
2250
if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2261
/* There can never be a first char if '.' is first, whatever happens about
2262
repeats. The value of reqbyte doesn't change either. */
2265
if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2266
zerofirstbyte = firstbyte;
2267
zeroreqbyte = reqbyte;
2272
/* Character classes. If the included characters are all < 256 in value, we
2273
build a 32-byte bitmap of the permitted characters, except in the special
2274
case where there is only one such character. For negated classes, we build
2275
the map as usual, then invert it at the end. However, we use a different
2276
opcode so that data characters > 255 can be handled correctly.
2278
If the class contains characters outside the 0-255 range, a different
2279
opcode is compiled. It may optionally have a bit map for characters < 256,
2280
but those above are explicitly listed afterwards. A flag byte tells
2281
whether the bitmap is present, and whether this is a negated class or not.
2287
/* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2288
they are encountered at the top level, so we'll do that too. */
2290
if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2291
check_posix_syntax(ptr, &tempptr, cd))
2293
*errorptr = (ptr[1] == ':')? ERR13 : ERR31;
2297
/* If the first character is '^', set the negation flag and skip it. */
2299
if ((c = *(++ptr)) == '^')
2301
negate_class = TRUE;
2306
negate_class = FALSE;
2309
/* Keep a count of chars with values < 256 so that we can optimize the case
2310
of just a single character (as long as it's < 256). For higher valued UTF-8
2311
characters, we don't yet do any optimization. */
2313
class_charcount = 0;
2314
class_lastchar = -1;
2317
class_utf8 = FALSE; /* No chars >= 256 */
2318
class_utf8data = code + LINK_SIZE + 34; /* For UTF-8 items */
2321
/* Initialize the 32-char bit map to all zeros. We have to build the
2322
map in a temporary bit of store, in case the class contains only 1
2323
character (< 256), because in that case the compiled code doesn't use the
2326
memset(classbits, 0, 32 * sizeof(uschar));
2328
/* Process characters until ] is reached. By writing this as a "do" it
2329
means that an initial ] is taken as a data character. The first pass
2330
through the regex checked the overall syntax, so we don't need to be very
2331
strict here. At the start of the loop, c contains the first byte of the
2337
if (utf8 && c > 127)
2338
{ /* Braces are required because the */
2339
GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2343
/* Inside \Q...\E everything is literal except \E */
2347
if (c == '\\' && ptr[1] == 'E')
2353
else goto LONE_SINGLE_CHARACTER;
2356
/* Handle POSIX class names. Perl allows a negation extension of the
2357
form [:^name:]. A square bracket that doesn't match the syntax is
2358
treated as a literal. We also recognize the POSIX constructions
2359
[.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2363
(ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2364
check_posix_syntax(ptr, &tempptr, cd))
2366
BOOL local_negate = FALSE;
2368
register const uschar *cbits = cd->cbits;
2379
local_negate = TRUE;
2383
posix_class = check_posix_name(ptr, tempptr - ptr);
2384
if (posix_class < 0)
2390
/* If matching is caseless, upper and lower are converted to
2391
alpha. This relies on the fact that the class table starts with
2392
alpha, lower, upper as the first 3 entries. */
2394
if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2397
/* Or into the map we are building up to 3 of the static class
2398
tables, or their negations. The [:blank:] class sets up the same
2399
chars as the [:space:] class (all white space). We remove the vertical
2400
white space chars afterwards. */
2403
for (i = 0; i < 3; i++)
2405
BOOL blankclass = strncmp((char *)ptr, "blank", 5) == 0;
2406
int taboffset = posix_class_maps[posix_class + i];
2407
if (taboffset < 0) break;
2411
for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+taboffset];
2413
for (c = 0; c < 32; c++) classbits[c] &= ~cbits[c+taboffset];
2414
if (blankclass) classbits[1] |= 0x3c;
2418
for (c = 0; c < 32; c++) classbits[c] |= cbits[c+taboffset];
2419
if (blankclass) classbits[1] &= ~0x3c;
2424
class_charcount = 10; /* Set > 1; assumes more than 1 per class */
2425
continue; /* End of POSIX syntax handling */
2428
/* Backslash may introduce a single character, or it may introduce one
2429
of the specials, which just set a flag. Escaped items are checked for
2430
validity in the pre-compiling pass. The sequence \b is a special case.
2431
Inside a class (and only there) it is treated as backspace. Elsewhere
2432
it marks a word boundary. Other escapes have preset maps ready to
2433
or into the one we are building. We assume they have more than one
2434
character in them, so set class_charcount bigger than one. */
2438
c = check_escape(&ptr, errorptr, *brackets, options, TRUE);
2440
if (-c == ESC_b) c = '\b'; /* \b is backslash in a class */
2441
else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
2442
else if (-c == ESC_Q) /* Handle start of quoted string */
2444
if (ptr[1] == '\\' && ptr[2] == 'E')
2446
ptr += 2; /* avoid empty string */
2454
register const uschar *cbits = cd->cbits;
2455
class_charcount += 2; /* Greater than 1 is what matters */
2459
for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2463
for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2467
for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2471
for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2475
for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2476
classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2480
for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2481
classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
2489
int property = get_ucp(&ptr, &negated, errorptr);
2490
if (property < 0) goto FAILED;
2492
*class_utf8data++ = ((-c == ESC_p) != negated)?
2493
XCL_PROP : XCL_NOTPROP;
2494
*class_utf8data++ = property;
2495
class_charcount -= 2; /* Not a < 256 character */
2500
/* Unrecognized escapes are faulted if PCRE is running in its
2501
strict mode. By default, for compatibility with Perl, they are
2502
treated as literals. */
2505
if ((options & PCRE_EXTRA) != 0)
2510
c = *ptr; /* The final character */
2511
class_charcount -= 2; /* Undo the default count from above */
2515
/* Fall through if we have a single character (c >= 0). This may be
2516
> 256 in UTF-8 mode. */
2518
} /* End of backslash handling */
2520
/* A single character may be followed by '-' to form a range. However,
2521
Perl does not permit ']' to be the end of the range. A '-' character
2522
here is treated as a literal. */
2524
if (ptr[1] == '-' && ptr[2] != ']')
2531
{ /* Braces are required because the */
2532
GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
2536
d = *ptr; /* Not UTF-8 mode */
2538
/* The second part of a range can be a single-character escape, but
2539
not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2540
in such circumstances. */
2544
const uschar *oldptr = ptr;
2545
d = check_escape(&ptr, errorptr, *brackets, options, TRUE);
2547
/* \b is backspace; \X is literal X; any other special means the '-'
2552
if (d == -ESC_b) d = '\b';
2553
else if (d == -ESC_X) d = 'X'; else
2556
goto LONE_SINGLE_CHARACTER; /* A few lines below */
2561
/* The check that the two values are in the correct order happens in
2562
the pre-pass. Optimize one-character ranges */
2564
if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
2566
/* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
2567
matching, we have to use an XCLASS with extra data items. Caseless
2568
matching for characters > 127 is available only if UCP support is
2572
if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
2576
/* With UCP support, we can find the other case equivalents of
2577
the relevant characters. There may be several ranges. Optimize how
2578
they fit with the basic range. */
2581
if ((options & PCRE_CASELESS) != 0)
2586
while (get_othercase_range(&cc, origd, &occ, &ocd))
2588
if (occ >= c && ocd <= d) continue; /* Skip embedded ranges */
2590
if (occ < c && ocd >= c - 1) /* Extend the basic range */
2591
{ /* if there is overlap, */
2592
c = occ; /* noting that if occ < c */
2593
continue; /* we can't have ocd > d */
2594
} /* because a subrange is */
2595
if (ocd > d && occ <= d + 1) /* always shorter than */
2596
{ /* the basic range. */
2603
*class_utf8data++ = XCL_SINGLE;
2607
*class_utf8data++ = XCL_RANGE;
2608
class_utf8data += ord2utf8(occ, class_utf8data);
2610
class_utf8data += ord2utf8(ocd, class_utf8data);
2613
#endif /* SUPPORT_UCP */
2615
/* Now record the original range, possibly modified for UCP caseless
2616
overlapping ranges. */
2618
*class_utf8data++ = XCL_RANGE;
2619
class_utf8data += ord2utf8(c, class_utf8data);
2620
class_utf8data += ord2utf8(d, class_utf8data);
2622
/* With UCP support, we are done. Without UCP support, there is no
2623
caseless matching for UTF-8 characters > 127; we can use the bit map
2624
for the smaller ones. */
2627
continue; /* With next character in the class */
2629
if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
2631
/* Adjust upper limit and fall through to set up the map */
2635
#endif /* SUPPORT_UCP */
2637
#endif /* SUPPORT_UTF8 */
2639
/* We use the bit map for all cases when not in UTF-8 mode; else
2640
ranges that lie entirely within 0-127 when there is UCP support; else
2641
for partial ranges without UCP support. */
2645
classbits[c/8] |= (1 << (c&7));
2646
if ((options & PCRE_CASELESS) != 0)
2648
int uc = cd->fcc[c]; /* flip case */
2649
classbits[uc/8] |= (1 << (uc&7));
2651
class_charcount++; /* in case a one-char range */
2655
continue; /* Go get the next char in the class */
2658
/* Handle a lone single character - we can get here for a normal
2659
non-escape char, or after \ that introduces a single character or for an
2660
apparent range that isn't. */
2662
LONE_SINGLE_CHARACTER:
2664
/* Handle a character that cannot go in the bit map */
2667
if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
2670
*class_utf8data++ = XCL_SINGLE;
2671
class_utf8data += ord2utf8(c, class_utf8data);
2674
if ((options & PCRE_CASELESS) != 0)
2678
if (ucp_findchar(c, &chartype, &othercase) >= 0 && othercase > 0)
2680
*class_utf8data++ = XCL_SINGLE;
2681
class_utf8data += ord2utf8(othercase, class_utf8data);
2684
#endif /* SUPPORT_UCP */
2688
#endif /* SUPPORT_UTF8 */
2690
/* Handle a single-byte character */
2692
classbits[c/8] |= (1 << (c&7));
2693
if ((options & PCRE_CASELESS) != 0)
2695
c = cd->fcc[c]; /* flip case */
2696
classbits[c/8] |= (1 << (c&7));
2703
/* Loop until ']' reached; the check for end of string happens inside the
2704
loop. This "while" is the end of the "do" above. */
2706
while ((c = *(++ptr)) != ']' || inescq);
2708
/* If class_charcount is 1, we saw precisely one character whose value is
2709
less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
2710
can optimize the negative case only if there were no characters >= 128
2711
because OP_NOT and the related opcodes like OP_NOTSTAR operate on
2712
single-bytes only. This is an historical hangover. Maybe one day we can
2713
tidy these opcodes to handle multi-byte characters.
2715
The optimization throws away the bit map. We turn the item into a
2716
1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
2717
that OP_NOT does not support multibyte characters. In the positive case, it
2718
can cause firstbyte to be set. Otherwise, there can be no first char if
2719
this item is first, whatever repeat count may follow. In the case of
2720
reqbyte, save the previous value for reinstating. */
2723
if (class_charcount == 1 &&
2725
(!class_utf8 && (!negate_class || class_lastchar < 128))))
2728
if (class_charcount == 1)
2731
zeroreqbyte = reqbyte;
2733
/* The OP_NOT opcode works on one-byte characters only. */
2737
if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2738
zerofirstbyte = firstbyte;
2740
*code++ = class_lastchar;
2744
/* For a single, positive character, get the value into mcbuffer, and
2745
then we can handle this with the normal one-character code. */
2748
if (utf8 && class_lastchar > 127)
2749
mclength = ord2utf8(class_lastchar, mcbuffer);
2753
mcbuffer[0] = class_lastchar;
2757
} /* End of 1-char optimization */
2759
/* The general case - not the one-char optimization. If this is the first
2760
thing in the branch, there can be no first char setting, whatever the
2761
repeat count. Any reqbyte setting must remain unchanged after any kind of
2764
if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2765
zerofirstbyte = firstbyte;
2766
zeroreqbyte = reqbyte;
2768
/* If there are characters with values > 255, we have to compile an
2769
extended class, with its own opcode. If there are no characters < 256,
2770
we can omit the bitmap. */
2775
*class_utf8data++ = XCL_END; /* Marks the end of extra data */
2776
*code++ = OP_XCLASS;
2778
*code = negate_class? XCL_NOT : 0;
2780
/* If the map is required, install it, and move on to the end of
2783
if (class_charcount > 0)
2786
memcpy(code, classbits, 32);
2787
code = class_utf8data;
2790
/* If the map is not required, slide down the extra data. */
2794
int len = class_utf8data - (code + 33);
2795
memmove(code + 1, code + 33, len);
2799
/* Now fill in the complete length of the item */
2801
PUT(previous, 1, code - previous);
2802
break; /* End of class handling */
2806
/* If there are no characters > 255, negate the 32-byte map if necessary,
2807
and copy it into the code vector. If this is the first thing in the branch,
2808
there can be no first char setting, whatever the repeat count. Any reqbyte
2809
setting must remain unchanged after any kind of repeat. */
2813
*code++ = OP_NCLASS;
2814
for (c = 0; c < 32; c++) code[c] = ~classbits[c];
2819
memcpy(code, classbits, 32);
2824
/* Various kinds of repeat; '{' is not necessarily a quantifier, but this
2825
has been tested above. */
2828
if (!is_quantifier) goto NORMAL_CHAR;
2829
ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr);
2830
if (*errorptr != NULL) goto FAILED;
2848
if (previous == NULL)
2854
if (repeat_min == 0)
2856
firstbyte = zerofirstbyte; /* Adjust for zero repeat */
2857
reqbyte = zeroreqbyte; /* Ditto */
2860
/* Remember whether this is a variable length repeat */
2862
reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
2864
op_type = 0; /* Default single-char op codes */
2865
possessive_quantifier = FALSE; /* Default not possessive quantifier */
2867
/* Save start of previous item, in case we have to move it up to make space
2868
for an inserted OP_ONCE for the additional '+' extension. */
2870
tempcode = previous;
2872
/* If the next character is '+', we have a possessive quantifier. This
2873
implies greediness, whatever the setting of the PCRE_UNGREEDY option.
2874
If the next character is '?' this is a minimizing repeat, by default,
2875
but if PCRE_UNGREEDY is set, it works the other way round. We change the
2876
repeat type to the non-default. */
2880
repeat_type = 0; /* Force greedy */
2881
possessive_quantifier = TRUE;
2884
else if (ptr[1] == '?')
2886
repeat_type = greedy_non_default;
2889
else repeat_type = greedy_default;
2891
/* If previous was a recursion, we need to wrap it inside brackets so that
2892
it can be replicated if necessary. */
2894
if (*previous == OP_RECURSE)
2896
memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);
2897
code += 1 + LINK_SIZE;
2899
PUT(previous, 1, code - previous);
2901
PUT(code, 1, code - previous);
2902
code += 1 + LINK_SIZE;
2905
/* If previous was a character match, abolish the item and generate a
2906
repeat item instead. If a char item has a minimum of more than one, ensure
2907
that it is set in reqbyte - it might not be if a sequence such as x{3} is
2908
the first thing in a branch because the x will have gone into firstbyte
2911
if (*previous == OP_CHAR || *previous == OP_CHARNC)
2913
/* Deal with UTF-8 characters that take up more than one byte. It's
2914
easier to write this out separately than try to macrify it. Use c to
2915
hold the length of the character in bytes, plus 0x80 to flag that it's a
2916
length rather than a small character. */
2919
if (utf8 && (code[-1] & 0x80) != 0)
2921
uschar *lastchar = code - 1;
2922
while((*lastchar & 0xc0) == 0x80) lastchar--;
2923
c = code - lastchar; /* Length of UTF-8 character */
2924
memcpy(utf8_char, lastchar, c); /* Save the char */
2925
c |= 0x80; /* Flag c as a length */
2930
/* Handle the case of a single byte - either with no UTF8 support, or
2931
with UTF-8 disabled, or for a UTF-8 character < 128. */
2935
if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
2938
goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
2941
/* If previous was a single negated character ([^a] or similar), we use
2942
one of the special opcodes, replacing it. The code is shared with single-
2943
character repeats by setting op_type to add a suitable offset into
2944
repeat_type. OP_NOT is currently used only for single-byte chars. */
2946
else if (*previous == OP_NOT)
2948
op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
2950
goto OUTPUT_SINGLE_REPEAT;
2953
/* If previous was a character type match (\d or similar), abolish it and
2954
create a suitable repeat item. The code is shared with single-character
2955
repeats by setting op_type to add a suitable offset into repeat_type. Note
2956
that the Unicode property types will be present only when SUPPORT_UCP is
2957
defined, but we don't wrap the little bits of code here because it just
2958
makes it horribly messy. */
2960
else if (*previous < OP_EODN)
2964
op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
2967
OUTPUT_SINGLE_REPEAT:
2968
prop_type = (*previous == OP_PROP || *previous == OP_NOTPROP)?
2972
code = previous; /* Usually overwrite previous item */
2974
/* If the maximum is zero then the minimum must also be zero; Perl allows
2975
this case, so we do too - by simply omitting the item altogether. */
2977
if (repeat_max == 0) goto END_REPEAT;
2979
/* All real repeats make it impossible to handle partial matching (maybe
2980
one day we will be able to remove this restriction). */
2982
if (repeat_max != 1) cd->nopartial = TRUE;
2984
/* Combine the op_type with the repeat_type */
2986
repeat_type += op_type;
2988
/* A minimum of zero is handled either as the special case * or ?, or as
2989
an UPTO, with the maximum given. */
2991
if (repeat_min == 0)
2993
if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
2994
else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
2997
*code++ = OP_UPTO + repeat_type;
2998
PUT2INC(code, 0, repeat_max);
3002
/* A repeat minimum of 1 is optimized into some special cases. If the
3003
maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3004
left in place and, if the maximum is greater than 1, we use OP_UPTO with
3005
one less than the maximum. */
3007
else if (repeat_min == 1)
3009
if (repeat_max == -1)
3010
*code++ = OP_PLUS + repeat_type;
3013
code = oldcode; /* leave previous item in place */
3014
if (repeat_max == 1) goto END_REPEAT;
3015
*code++ = OP_UPTO + repeat_type;
3016
PUT2INC(code, 0, repeat_max - 1);
3020
/* The case {n,n} is just an EXACT, while the general case {n,m} is
3021
handled as an EXACT followed by an UPTO. */
3025
*code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3026
PUT2INC(code, 0, repeat_min);
3028
/* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3029
we have to insert the character for the previous code. For a repeated
3030
Unicode property match, there is an extra byte that defines the
3031
required property. In UTF-8 mode, long characters have their length in
3032
c, with the 0x80 bit as a flag. */
3037
if (utf8 && c >= 128)
3039
memcpy(code, utf8_char, c & 7);
3046
if (prop_type >= 0) *code++ = prop_type;
3048
*code++ = OP_STAR + repeat_type;
3051
/* Else insert an UPTO if the max is greater than the min, again
3052
preceded by the character, for the previously inserted code. */
3054
else if (repeat_max != repeat_min)
3057
if (utf8 && c >= 128)
3059
memcpy(code, utf8_char, c & 7);
3065
if (prop_type >= 0) *code++ = prop_type;
3066
repeat_max -= repeat_min;
3067
*code++ = OP_UPTO + repeat_type;
3068
PUT2INC(code, 0, repeat_max);
3072
/* The character or character type itself comes last in all cases. */
3075
if (utf8 && c >= 128)
3077
memcpy(code, utf8_char, c & 7);
3084
/* For a repeated Unicode property match, there is an extra byte that
3085
defines the required property. */
3088
if (prop_type >= 0) *code++ = prop_type;
3092
/* If previous was a character class or a back reference, we put the repeat
3093
stuff after it, but just skip the item if the repeat was {0,0}. */
3095
else if (*previous == OP_CLASS ||
3096
*previous == OP_NCLASS ||
3098
*previous == OP_XCLASS ||
3100
*previous == OP_REF)
3102
if (repeat_max == 0)
3108
/* All real repeats make it impossible to handle partial matching (maybe
3109
one day we will be able to remove this restriction). */
3111
if (repeat_max != 1) cd->nopartial = TRUE;
3113
if (repeat_min == 0 && repeat_max == -1)
3114
*code++ = OP_CRSTAR + repeat_type;
3115
else if (repeat_min == 1 && repeat_max == -1)
3116
*code++ = OP_CRPLUS + repeat_type;
3117
else if (repeat_min == 0 && repeat_max == 1)
3118
*code++ = OP_CRQUERY + repeat_type;
3121
*code++ = OP_CRRANGE + repeat_type;
3122
PUT2INC(code, 0, repeat_min);
3123
if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
3124
PUT2INC(code, 0, repeat_max);
3128
/* If previous was a bracket group, we may have to replicate it in certain
3131
else if (*previous >= OP_BRA || *previous == OP_ONCE ||
3132
*previous == OP_COND)
3136
int len = code - previous;
3137
uschar *bralink = NULL;
3139
/* If the maximum repeat count is unlimited, find the end of the bracket
3140
by scanning through from the start, and compute the offset back to it
3141
from the current code pointer. There may be an OP_OPT setting following
3142
the final KET, so we can't find the end just by going back from the code
3145
if (repeat_max == -1)
3147
register uschar *ket = previous;
3148
do ket += GET(ket, 1); while (*ket != OP_KET);
3149
ketoffset = code - ket;
3152
/* The case of a zero minimum is special because of the need to stick
3153
OP_BRAZERO in front of it, and because the group appears once in the
3154
data, whereas in other cases it appears the minimum number of times. For
3155
this reason, it is simplest to treat this case separately, as otherwise
3156
the code gets far too messy. There are several special subcases when the
3159
if (repeat_min == 0)
3161
/* If the maximum is also zero, we just omit the group from the output
3164
if (repeat_max == 0)
3170
/* If the maximum is 1 or unlimited, we just have to stick in the
3171
BRAZERO and do no more at this point. However, we do need to adjust
3172
any OP_RECURSE calls inside the group that refer to the group itself or
3173
any internal group, because the offset is from the start of the whole
3174
regex. Temporarily terminate the pattern while doing this. */
3176
if (repeat_max <= 1)
3179
adjust_recurse(previous, 1, utf8, cd);
3180
memmove(previous+1, previous, len);
3182
*previous++ = OP_BRAZERO + repeat_type;
3185
/* If the maximum is greater than 1 and limited, we have to replicate
3186
in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3187
The first one has to be handled carefully because it's the original
3188
copy, which has to be moved up. The remainder can be handled by code
3189
that is common with the non-zero minimum case below. We have to
3190
adjust the value of repeat_max, since one less copy is required. Once
3191
again, we may have to adjust any OP_RECURSE calls inside the group. */
3197
adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd);
3198
memmove(previous + 2 + LINK_SIZE, previous, len);
3199
code += 2 + LINK_SIZE;
3200
*previous++ = OP_BRAZERO + repeat_type;
3201
*previous++ = OP_BRA;
3203
/* We chain together the bracket offset fields that have to be
3204
filled in later when the ends of the brackets are reached. */
3206
offset = (bralink == NULL)? 0 : previous - bralink;
3208
PUTINC(previous, 0, offset);
3214
/* If the minimum is greater than zero, replicate the group as many
3215
times as necessary, and adjust the maximum to the number of subsequent
3216
copies that we need. If we set a first char from the group, and didn't
3217
set a required char, copy the latter from the former. */
3223
if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3224
for (i = 1; i < repeat_min; i++)
3226
memcpy(code, previous, len);
3230
if (repeat_max > 0) repeat_max -= repeat_min;
3233
/* This code is common to both the zero and non-zero minimum cases. If
3234
the maximum is limited, it replicates the group in a nested fashion,
3235
remembering the bracket starts on a stack. In the case of a zero minimum,
3236
the first one was set up above. In all cases the repeat_max now specifies
3237
the number of additional copies needed. */
3239
if (repeat_max >= 0)
3241
for (i = repeat_max - 1; i >= 0; i--)
3243
*code++ = OP_BRAZERO + repeat_type;
3245
/* All but the final copy start a new nesting, maintaining the
3246
chain of brackets outstanding. */
3252
offset = (bralink == NULL)? 0 : code - bralink;
3254
PUTINC(code, 0, offset);
3257
memcpy(code, previous, len);
3261
/* Now chain through the pending brackets, and fill in their length
3262
fields (which are holding the chain links pro tem). */
3264
while (bralink != NULL)
3267
int offset = code - bralink + 1;
3268
uschar *bra = code - offset;
3269
oldlinkoffset = GET(bra, 1);
3270
bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
3272
PUTINC(code, 0, offset);
3273
PUT(bra, 1, offset);
3277
/* If the maximum is unlimited, set a repeater in the final copy. We
3278
can't just offset backwards from the current code point, because we
3279
don't know if there's been an options resetting after the ket. The
3280
correct offset was computed above. */
3282
else code[-ketoffset] = OP_KETRMAX + repeat_type;
3285
/* Else there's some kind of shambles */
3293
/* If the character following a repeat is '+', we wrap the entire repeated
3294
item inside OP_ONCE brackets. This is just syntactic sugar, taken from
3295
Sun's Java package. The repeated item starts at tempcode, not at previous,
3296
which might be the first part of a string whose (former) last char we
3297
repeated. However, we don't support '+' after a greediness '?'. */
3299
if (possessive_quantifier)
3301
int len = code - tempcode;
3302
memmove(tempcode + 1+LINK_SIZE, tempcode, len);
3303
code += 1 + LINK_SIZE;
3304
len += 1 + LINK_SIZE;
3305
tempcode[0] = OP_ONCE;
3307
PUTINC(code, 0, len);
3308
PUT(tempcode, 1, len);
3311
/* In all cases we no longer have a previous item. We also set the
3312
"follows varying string" flag for subsequently encountered reqbytes if
3313
it isn't already set and we have just passed a varying length item. */
3317
cd->req_varyopt |= reqvary;
3321
/* Start of nested bracket sub-expression, or comment or lookahead or
3322
lookbehind or option setting or condition. First deal with special things
3323
that can come after a bracket; all are introduced by ?, and the appearance
3324
of any of them means that this is not a referencing group. They were
3325
checked for validity in the first pass over the string, so we don't have to
3326
check for syntax errors here. */
3329
newoptions = options;
3332
if (*(++ptr) == '?')
3339
case '#': /* Comment; skip to ket */
3341
while (*ptr != ')') ptr++;
3344
case ':': /* Non-extracting bracket */
3350
bravalue = OP_COND; /* Conditional group */
3352
/* Condition to test for recursion */
3356
code[1+LINK_SIZE] = OP_CREF;
3357
PUT2(code, 2+LINK_SIZE, CREF_RECURSE);
3362
/* Condition to test for a numbered subpattern match. We know that
3363
if a digit follows ( then there will just be digits until ) because
3364
the syntax was checked in the first pass. */
3366
else if ((digitab[ptr[1]] && ctype_digit) != 0)
3368
int condref; /* Don't amalgamate; some compilers */
3369
condref = *(++ptr) - '0'; /* grumble at autoincrement in declaration */
3370
while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
3377
code[1+LINK_SIZE] = OP_CREF;
3378
PUT2(code, 2+LINK_SIZE, condref);
3381
/* For conditions that are assertions, we just fall through, having
3382
set bravalue above. */
3385
case '=': /* Positive lookahead */
3386
bravalue = OP_ASSERT;
3390
case '!': /* Negative lookahead */
3391
bravalue = OP_ASSERT_NOT;
3395
case '<': /* Lookbehinds */
3398
case '=': /* Positive lookbehind */
3399
bravalue = OP_ASSERTBACK;
3403
case '!': /* Negative lookbehind */
3404
bravalue = OP_ASSERTBACK_NOT;
3410
case '>': /* One-time brackets */
3415
case 'C': /* Callout - may be followed by digits; */
3416
previous_callout = code; /* Save for later completion */
3417
after_manual_callout = 1; /* Skip one item before completing */
3418
*code++ = OP_CALLOUT; /* Already checked that the terminating */
3419
{ /* closing parenthesis is present. */
3421
while ((digitab[*(++ptr)] & ctype_digit) != 0)
3422
n = n * 10 + *ptr - '0';
3429
PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
3430
PUT(code, LINK_SIZE, 0); /* Default length */
3431
code += 2 * LINK_SIZE;
3436
case 'P': /* Named subpattern handling */
3437
if (*(++ptr) == '<') /* Definition */
3440
uschar *slot = cd->name_table;
3441
const uschar *name; /* Don't amalgamate; some compilers */
3442
name = ++ptr; /* grumble at autoincrement in declaration */
3444
while (*ptr++ != '>');
3445
namelen = ptr - name - 1;
3447
for (i = 0; i < cd->names_found; i++)
3449
int crc = memcmp(name, slot+2, namelen);
3452
if (slot[2+namelen] == 0)
3457
crc = -1; /* Current name is substring */
3461
memmove(slot + cd->name_entry_size, slot,
3462
(cd->names_found - i) * cd->name_entry_size);
3465
slot += cd->name_entry_size;
3468
PUT2(slot, 0, *brackets + 1);
3469
memcpy(slot + 2, name, namelen);
3470
slot[2+namelen] = 0;
3472
goto NUMBERED_GROUP;
3475
if (*ptr == '=' || *ptr == '>') /* Reference or recursion */
3479
const uschar *name = ptr;
3480
uschar *slot = cd->name_table;
3482
while (*ptr != ')') ptr++;
3483
namelen = ptr - name;
3485
for (i = 0; i < cd->names_found; i++)
3487
if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
3488
slot += cd->name_entry_size;
3490
if (i >= cd->names_found)
3496
recno = GET2(slot, 0);
3498
if (type == '>') goto HANDLE_RECURSION; /* A few lines below */
3500
/* Back reference */
3504
PUT2INC(code, 0, recno);
3505
cd->backref_map |= (recno < 32)? (1 << recno) : 1;
3506
if (recno > cd->top_backref) cd->top_backref = recno;
3510
/* Should never happen */
3513
case 'R': /* Pattern recursion */
3514
ptr++; /* Same as (?0) */
3517
/* Recursion or "subroutine" call */
3519
case '0': case '1': case '2': case '3': case '4':
3520
case '5': case '6': case '7': case '8': case '9':
3522
const uschar *called;
3524
while((digitab[*ptr] & ctype_digit) != 0)
3525
recno = recno * 10 + *ptr++ - '0';
3527
/* Come here from code above that handles a named recursion */
3533
/* Find the bracket that is being referenced. Temporarily end the
3534
regex in case it doesn't exist. */
3537
called = (recno == 0)?
3538
cd->start_code : find_bracket(cd->start_code, utf8, recno);
3546
/* If the subpattern is still open, this is a recursive call. We
3547
check to see if this is a left recursion that could loop for ever,
3548
and diagnose that case. */
3550
if (GET(called, 1) == 0 && could_be_empty(called, code, bcptr, utf8))
3556
/* Insert the recursion/subroutine item */
3559
PUT(code, 1, called - cd->start_code);
3560
code += 1 + LINK_SIZE;
3564
/* Character after (? not specially recognized */
3566
default: /* Option setting */
3570
while (*ptr != ')' && *ptr != ':')
3574
case '-': optset = &unset; break;
3576
case 'i': *optset |= PCRE_CASELESS; break;
3577
case 'm': *optset |= PCRE_MULTILINE; break;
3578
case 's': *optset |= PCRE_DOTALL; break;
3579
case 'x': *optset |= PCRE_EXTENDED; break;
3580
case 'U': *optset |= PCRE_UNGREEDY; break;
3581
case 'X': *optset |= PCRE_EXTRA; break;
3585
/* Set up the changed option bits, but don't change anything yet. */
3587
newoptions = (options | set) & (~unset);
3589
/* If the options ended with ')' this is not the start of a nested
3590
group with option changes, so the options change at this level. Compile
3591
code to change the ims options if this setting actually changes any of
3592
them. We also pass the new setting back so that it can be put at the
3593
start of any following branches, and when this group ends (if we are in
3594
a group), a resetting item can be compiled.
3596
Note that if this item is right at the start of the pattern, the
3597
options will have been abstracted and made global, so there will be no
3598
change to compile. */
3602
if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
3605
*code++ = newoptions & PCRE_IMS;
3608
/* Change options at this level, and pass them back for use
3609
in subsequent branches. Reset the greedy defaults and the case
3610
value for firstbyte and reqbyte. */
3612
*optionsptr = options = newoptions;
3613
greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
3614
greedy_non_default = greedy_default ^ 1;
3615
req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
3617
previous = NULL; /* This item can't be repeated */
3618
continue; /* It is complete */
3621
/* If the options ended with ':' we are heading into a nested group
3622
with possible change of options. Such groups are non-capturing and are
3623
not assertions of any kind. All we need to do is skip over the ':';
3624
the newoptions value is handled below. */
3631
/* If PCRE_NO_AUTO_CAPTURE is set, all unadorned brackets become
3632
non-capturing and behave like (?:...) brackets */
3634
else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
3639
/* Else we have a referencing group; adjust the opcode. If the bracket
3640
number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and
3641
arrange for the true number to follow later, in an OP_BRANUMBER item. */
3646
if (++(*brackets) > EXTRACT_BASIC_MAX)
3648
bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;
3649
code[1+LINK_SIZE] = OP_BRANUMBER;
3650
PUT2(code, 2+LINK_SIZE, *brackets);
3653
else bravalue = OP_BRA + *brackets;
3656
/* Process nested bracketed re. Assertions may not be repeated, but other
3657
kinds can be. We copy code into a non-register variable in order to be able
3658
to pass its address because some compilers complain otherwise. Pass in a
3659
new setting for the ims options if they have changed. */
3661
previous = (bravalue >= OP_ONCE)? code : NULL;
3664
tempreqvary = cd->req_varyopt; /* Save value before bracket */
3667
newoptions, /* The complete new option state */
3668
options & PCRE_IMS, /* The previous ims option state */
3669
brackets, /* Extracting bracket count */
3670
&tempcode, /* Where to put code (updated) */
3671
&ptr, /* Input pointer (updated) */
3672
errorptr, /* Where to put an error message */
3673
(bravalue == OP_ASSERTBACK ||
3674
bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
3675
skipbytes, /* Skip over OP_COND/OP_BRANUMBER */
3676
&subfirstbyte, /* For possible first char */
3677
&subreqbyte, /* For possible last char */
3678
bcptr, /* Current branch chain */
3679
cd)) /* Tables block */
3682
/* At the end of compiling, code is still pointing to the start of the
3683
group, while tempcode has been updated to point past the end of the group
3684
and any option resetting that may follow it. The pattern pointer (ptr)
3685
is on the bracket. */
3687
/* If this is a conditional bracket, check that there are no more than
3688
two branches in the group. */
3690
else if (bravalue == OP_COND)
3699
while (*tc != OP_KET);
3707
/* If there is just one branch, we must not make use of its firstbyte or
3708
reqbyte, because this is equivalent to an empty second branch. */
3710
if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
3713
/* Handle updating of the required and first characters. Update for normal
3714
brackets of all kinds, and conditions with two branches (see code above).
3715
If the bracket is followed by a quantifier with zero repeat, we have to
3716
back off. Hence the definition of zeroreqbyte and zerofirstbyte outside the
3717
main loop so that they can be accessed for the back off. */
3719
zeroreqbyte = reqbyte;
3720
zerofirstbyte = firstbyte;
3721
groupsetfirstbyte = FALSE;
3723
if (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_COND)
3725
/* If we have not yet set a firstbyte in this branch, take it from the
3726
subpattern, remembering that it was set here so that a repeat of more
3727
than one can replicate it as reqbyte if necessary. If the subpattern has
3728
no firstbyte, set "none" for the whole branch. In both cases, a zero
3729
repeat forces firstbyte to "none". */
3731
if (firstbyte == REQ_UNSET)
3733
if (subfirstbyte >= 0)
3735
firstbyte = subfirstbyte;
3736
groupsetfirstbyte = TRUE;
3738
else firstbyte = REQ_NONE;
3739
zerofirstbyte = REQ_NONE;
3742
/* If firstbyte was previously set, convert the subpattern's firstbyte
3743
into reqbyte if there wasn't one, using the vary flag that was in
3744
existence beforehand. */
3746
else if (subfirstbyte >= 0 && subreqbyte < 0)
3747
subreqbyte = subfirstbyte | tempreqvary;
3749
/* If the subpattern set a required byte (or set a first byte that isn't
3750
really the first byte - see above), set it. */
3752
if (subreqbyte >= 0) reqbyte = subreqbyte;
3755
/* For a forward assertion, we take the reqbyte, if set. This can be
3756
helpful if the pattern that follows the assertion doesn't set a different
3757
char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
3758
for an assertion, however, because it leads to an incorrect effect for patterns
3759
such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
3760
of a firstbyte. This is overcome by a scan at the end if there's no
3761
firstbyte, looking for an asserted first char. */
3763
else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
3765
/* Now update the main code pointer to the end of the group. */
3769
/* Error if hit end of pattern */
3778
/* Check \ for being a real metacharacter; if not, fall through and handle
3779
it as a data character at the start of a string. Escape items are checked
3780
for validity in the pre-compiling pass. */
3784
c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
3786
/* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
3787
are arranged to be the negation of the corresponding OP_values. For the
3788
back references, the values are ESC_REF plus the reference number. Only
3789
back references and those types that consume a character may be repeated.
3790
We can test for values between ESC_b and ESC_Z for the latter; this may
3791
have to change if any new ones are ever created. */
3795
if (-c == ESC_Q) /* Handle start of quoted string */
3797
if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
3802
/* For metasequences that actually match a character, we disable the
3803
setting of a first character if it hasn't already been set. */
3805
if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
3806
firstbyte = REQ_NONE;
3808
/* Set values to reset to if this is followed by a zero repeat. */
3810
zerofirstbyte = firstbyte;
3811
zeroreqbyte = reqbyte;
3813
/* Back references are handled specially */
3817
int number = -c - ESC_REF;
3820
PUT2INC(code, 0, number);
3823
/* So are Unicode property matches, if supported. We know that get_ucp
3824
won't fail because it was tested in the pre-pass. */
3827
else if (-c == ESC_P || -c == ESC_p)
3830
int value = get_ucp(&ptr, &negated, errorptr);
3832
*code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
3837
/* For the rest, we can obtain the OP value by negating the escape
3842
previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
3848
/* We have a data character whose value is in c. In UTF-8 mode it may have
3849
a value > 127. We set its representation in the length/buffer, and then
3850
handle it as a data character. */
3853
if (utf8 && c > 127)
3854
mclength = ord2utf8(c, mcbuffer);
3865
/* Handle a literal character. It is guaranteed not to be whitespace or #
3866
when the extended flag is set. If we are in UTF-8 mode, it may be a
3867
multi-byte literal character. */
3875
if (utf8 && (c & 0xc0) == 0xc0)
3877
while ((ptr[1] & 0xc0) == 0x80)
3878
mcbuffer[mclength++] = *(++ptr);
3882
/* At this point we have the character's bytes in mcbuffer, and the length
3883
in mclength. When not in UTF-8 mode, the length is always 1. */
3887
*code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
3888
for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
3890
/* Set the first and required bytes appropriately. If no previous first
3891
byte, set it from this character, but revert to none on a zero repeat.
3892
Otherwise, leave the firstbyte value alone, and don't change it on a zero
3895
if (firstbyte == REQ_UNSET)
3897
zerofirstbyte = REQ_NONE;
3898
zeroreqbyte = reqbyte;
3900
/* If the character is more than one byte long, we can set firstbyte
3901
only if it is not to be matched caselessly. */
3903
if (mclength == 1 || req_caseopt == 0)
3905
firstbyte = mcbuffer[0] | req_caseopt;
3906
if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
3908
else firstbyte = reqbyte = REQ_NONE;
3911
/* firstbyte was previously set; we can set reqbyte only if the length is
3912
1 or the matching is caseful. */
3916
zerofirstbyte = firstbyte;
3917
zeroreqbyte = reqbyte;
3918
if (mclength == 1 || req_caseopt == 0)
3919
reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
3922
break; /* End of literal character handling */
3924
} /* end of big loop */
3926
/* Control never reaches here by falling through, only by a goto for all the
3927
error states. Pass back the position in the pattern so that it can be displayed
3928
to the user for diagnosing the error. */
3938
/*************************************************
3939
* Compile sequence of alternatives *
3940
*************************************************/
3942
/* On entry, ptr is pointing past the bracket character, but on return
3943
it points to the closing bracket, or vertical bar, or end of string.
3944
The code variable is pointing at the byte into which the BRA operator has been
3945
stored. If the ims options are changed at the start (for a (?ims: group) or
3946
during any branch, we need to insert an OP_OPT item at the start of every
3947
following branch to ensure they get set correctly at run time, and also pass
3948
the new options into every subsequent branch compile.
3951
options option bits, including any changes for this subpattern
3952
oldims previous settings of ims option bits
3953
brackets -> int containing the number of extracting brackets used
3954
codeptr -> the address of the current code pointer
3955
ptrptr -> the address of the current pattern pointer
3956
errorptr -> pointer to error message
3957
lookbehind TRUE if this is a lookbehind assertion
3958
skipbytes skip this many bytes at start (for OP_COND, OP_BRANUMBER)
3959
firstbyteptr place to put the first required character, or a negative number
3960
reqbyteptr place to put the last required character, or a negative number
3961
bcptr pointer to the chain of currently open branches
3962
cd points to the data block with tables pointers etc.
3964
Returns: TRUE on success
3968
compile_regex(int options, int oldims, int *brackets, uschar **codeptr,
3969
const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int skipbytes,
3970
int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
3972
const uschar *ptr = *ptrptr;
3973
uschar *code = *codeptr;
3974
uschar *last_branch = code;
3975
uschar *start_bracket = code;
3976
uschar *reverse_count = NULL;
3977
int firstbyte, reqbyte;
3978
int branchfirstbyte, branchreqbyte;
3984
firstbyte = reqbyte = REQ_UNSET;
3986
/* Offset is set zero to mark that this bracket is still open */
3989
code += 1 + LINK_SIZE + skipbytes;
3991
/* Loop for each alternative branch */
3995
/* Handle a change of ims options at the start of the branch */
3997
if ((options & PCRE_IMS) != oldims)
4000
*code++ = options & PCRE_IMS;
4003
/* Set up dummy OP_REVERSE if lookbehind assertion */
4007
*code++ = OP_REVERSE;
4008
reverse_count = code;
4012
/* Now compile the branch */
4014
if (!compile_branch(&options, brackets, &code, &ptr, errorptr,
4015
&branchfirstbyte, &branchreqbyte, &bc, cd))
4021
/* If this is the first branch, the firstbyte and reqbyte values for the
4022
branch become the values for the regex. */
4024
if (*last_branch != OP_ALT)
4026
firstbyte = branchfirstbyte;
4027
reqbyte = branchreqbyte;
4030
/* If this is not the first branch, the first char and reqbyte have to
4031
match the values from all the previous branches, except that if the previous
4032
value for reqbyte didn't have REQ_VARY set, it can still match, and we set
4033
REQ_VARY for the regex. */
4037
/* If we previously had a firstbyte, but it doesn't match the new branch,
4038
we have to abandon the firstbyte for the regex, but if there was previously
4039
no reqbyte, it takes on the value of the old firstbyte. */
4041
if (firstbyte >= 0 && firstbyte != branchfirstbyte)
4043
if (reqbyte < 0) reqbyte = firstbyte;
4044
firstbyte = REQ_NONE;
4047
/* If we (now or from before) have no firstbyte, a firstbyte from the
4048
branch becomes a reqbyte if there isn't a branch reqbyte. */
4050
if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
4051
branchreqbyte = branchfirstbyte;
4053
/* Now ensure that the reqbytes match */
4055
if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
4057
else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
4060
/* If lookbehind, check that this branch matches a fixed-length string,
4061
and put the length into the OP_REVERSE item. Temporarily mark the end of
4062
the branch with OP_END. */
4068
length = find_fixedlength(last_branch, options);
4069
DPRINTF(("fixed length = %d\n", length));
4072
*errorptr = (length == -2)? ERR36 : ERR25;
4076
PUT(reverse_count, 0, length);
4079
/* Reached end of expression, either ')' or end of pattern. Go back through
4080
the alternative branches and reverse the chain of offsets, with the field in
4081
the BRA item now becoming an offset to the first alternative. If there are
4082
no alternatives, it points to the end of the group. The length in the
4083
terminating ket is always the length of the whole bracketed item. If any of
4084
the ims options were changed inside the group, compile a resetting op-code
4085
following, except at the very end of the pattern. Return leaving the pointer
4086
at the terminating char. */
4090
int length = code - last_branch;
4093
int prev_length = GET(last_branch, 1);
4094
PUT(last_branch, 1, length);
4095
length = prev_length;
4096
last_branch -= length;
4100
/* Fill in the ket */
4103
PUT(code, 1, code - start_bracket);
4104
code += 1 + LINK_SIZE;
4106
/* Resetting option if needed */
4108
if ((options & PCRE_IMS) != oldims && *ptr == ')')
4114
/* Set values to pass back */
4118
*firstbyteptr = firstbyte;
4119
*reqbyteptr = reqbyte;
4123
/* Another branch follows; insert an "or" node. Its length field points back
4124
to the previous branch while the bracket remains open. At the end the chain
4125
is reversed. It's done like this so that the start of the bracket has a
4126
zero offset until it is closed, making it possible to detect recursion. */
4129
PUT(code, 1, code - last_branch);
4130
bc.current = last_branch = code;
4131
code += 1 + LINK_SIZE;
4134
/* Control never reaches here */
4140
/*************************************************
4141
* Check for anchored expression *
4142
*************************************************/
4144
/* Try to find out if this is an anchored regular expression. Consider each
4145
alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
4146
all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
4147
it's anchored. However, if this is a multiline pattern, then only OP_SOD
4148
counts, since OP_CIRC can match in the middle.
4150
We can also consider a regex to be anchored if OP_SOM starts all its branches.
4151
This is the code for \G, which means "match at start of match position, taking
4152
into account the match offset".
4154
A branch is also implicitly anchored if it starts with .* and DOTALL is set,
4155
because that will try the rest of the pattern at all possible matching points,
4156
so there is no point trying again.... er ....
4158
.... except when the .* appears inside capturing parentheses, and there is a
4159
subsequent back reference to those parentheses. We haven't enough information
4160
to catch that case precisely.
4162
At first, the best we could do was to detect when .* was in capturing brackets
4163
and the highest back reference was greater than or equal to that level.
4164
However, by keeping a bitmap of the first 31 back references, we can catch some
4165
of the more common cases more precisely.
4168
code points to start of expression (the bracket)
4169
options points to the options setting
4170
bracket_map a bitmap of which brackets we are inside while testing; this
4171
handles up to substring 31; after that we just have to take
4172
the less precise approach
4173
backref_map the back reference bitmap
4175
Returns: TRUE or FALSE
4179
is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
4180
unsigned int backref_map)
4183
const uschar *scode =
4184
first_significant_code(code + 1+LINK_SIZE, options, PCRE_MULTILINE, FALSE);
4185
register int op = *scode;
4187
/* Capturing brackets */
4193
if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
4194
new_map = bracket_map | ((op < 32)? (1 << op) : 1);
4195
if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
4198
/* Other brackets */
4200
else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
4202
if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
4205
/* .* is not anchored unless DOTALL is set and it isn't in brackets that
4206
are or may be referenced. */
4208
else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
4209
(*options & PCRE_DOTALL) != 0)
4211
if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
4214
/* Check for explicit anchoring */
4216
else if (op != OP_SOD && op != OP_SOM &&
4217
((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
4219
code += GET(code, 1);
4221
while (*code == OP_ALT); /* Loop for each alternative */
4227
/*************************************************
4228
* Check for starting with ^ or .* *
4229
*************************************************/
4231
/* This is called to find out if every branch starts with ^ or .* so that
4232
"first char" processing can be done to speed things up in multiline
4233
matching and for non-DOTALL patterns that start with .* (which must start at
4234
the beginning or after \n). As in the case of is_anchored() (see above), we
4235
have to take account of back references to capturing brackets that contain .*
4236
because in that case we can't make the assumption.
4239
code points to start of expression (the bracket)
4240
bracket_map a bitmap of which brackets we are inside while testing; this
4241
handles up to substring 31; after that we just have to take
4242
the less precise approach
4243
backref_map the back reference bitmap
4245
Returns: TRUE or FALSE
4249
is_startline(const uschar *code, unsigned int bracket_map,
4250
unsigned int backref_map)
4253
const uschar *scode = first_significant_code(code + 1+LINK_SIZE, NULL, 0,
4255
register int op = *scode;
4257
/* Capturing brackets */
4263
if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
4264
new_map = bracket_map | ((op < 32)? (1 << op) : 1);
4265
if (!is_startline(scode, new_map, backref_map)) return FALSE;
4268
/* Other brackets */
4270
else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
4271
{ if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
4273
/* .* means "start at start or after \n" if it isn't in brackets that
4274
may be referenced. */
4276
else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
4278
if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
4281
/* Check for explicit circumflex */
4283
else if (op != OP_CIRC) return FALSE;
4285
/* Move on to the next alternative */
4287
code += GET(code, 1);
4289
while (*code == OP_ALT); /* Loop for each alternative */
4295
/*************************************************
4296
* Check for asserted fixed first char *
4297
*************************************************/
4299
/* During compilation, the "first char" settings from forward assertions are
4300
discarded, because they can cause conflicts with actual literals that follow.
4301
However, if we end up without a first char setting for an unanchored pattern,
4302
it is worth scanning the regex to see if there is an initial asserted first
4303
char. If all branches start with the same asserted char, or with a bracket all
4304
of whose alternatives start with the same asserted char (recurse ad lib), then
4305
we return that char, otherwise -1.
4308
code points to start of expression (the bracket)
4309
options pointer to the options (used to check casing changes)
4310
inassert TRUE if in an assertion
4312
Returns: -1 or the fixed first char
4316
find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
4318
register int c = -1;
4321
const uschar *scode =
4322
first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
4323
register int op = *scode;
4325
if (op >= OP_BRA) op = OP_BRA;
4336
if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
4338
if (c < 0) c = d; else if (c != d) return -1;
4341
case OP_EXACT: /* Fall through */
4348
if (!inassert) return -1;
4352
if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
4354
else if (c != scode[1]) return -1;
4358
code += GET(code, 1);
4360
while (*code == OP_ALT);
4368
/*************************************************
4369
* Validate a UTF-8 string *
4370
*************************************************/
4372
/* This function is called (optionally) at the start of compile or match, to
4373
validate that a supposed UTF-8 string is actually valid. The early check means
4374
that subsequent code can assume it is dealing with a valid string. The check
4375
can be turned off for maximum performance, but then the consequences of supplying
4376
an invalid string are undefined.
4379
string points to the string
4380
length length of string, or -1 if the string is zero-terminated
4382
Returns: < 0 if the string is a valid UTF-8 string
4383
>= 0 otherwise; the value is the offset of the bad byte
4387
valid_utf8(const uschar *string, int length)
4389
register const uschar *p;
4393
for (p = string; *p != 0; p++);
4394
length = p - string;
4397
for (p = string; length-- > 0; p++)
4400
register int c = *p;
4401
if (c < 128) continue;
4402
if ((c & 0xc0) != 0xc0) return p - string;
4403
ab = utf8_table4[c & 0x3f]; /* Number of additional bytes */
4404
if (length < ab) return p - string;
4407
/* Check top bits in the second byte */
4408
if ((*(++p) & 0xc0) != 0x80) return p - string;
4410
/* Check for overlong sequences for each different length */
4413
/* Check for xx00 000x */
4415
if ((c & 0x3e) == 0) return p - string;
4416
continue; /* We know there aren't any more bytes to check */
4418
/* Check for 1110 0000, xx0x xxxx */
4420
if (c == 0xe0 && (*p & 0x20) == 0) return p - string;
4423
/* Check for 1111 0000, xx00 xxxx */
4425
if (c == 0xf0 && (*p & 0x30) == 0) return p - string;
4428
/* Check for 1111 1000, xx00 0xxx */
4430
if (c == 0xf8 && (*p & 0x38) == 0) return p - string;
4433
/* Check for leading 0xfe or 0xff, and then for 1111 1100, xx00 00xx */
4435
if (c == 0xfe || c == 0xff ||
4436
(c == 0xfc && (*p & 0x3c) == 0)) return p - string;
4440
/* Check for valid bytes after the 2nd, if any; all must start 10 */
4443
if ((*(++p) & 0xc0) != 0x80) return p - string;
4453
/*************************************************
4454
* Compile a Regular Expression *
4455
*************************************************/
4457
/* This function takes a string and returns a pointer to a block of store
4458
holding a compiled version of the expression.
4461
pattern the regular expression
4462
options various option bits
4463
errorptr pointer to pointer to error text
4464
erroroffset ptr offset in pattern where error was detected
4465
tables pointer to character tables or NULL
4467
Returns: pointer to compiled data block, or NULL on error,
4468
with errorptr and erroroffset set
4472
pcre_compile(const char *pattern, int options, const char **errorptr,
4473
int *erroroffset, const unsigned char *tables)
4476
int length = 1 + LINK_SIZE; /* For initial BRA plus length */
4477
int c, firstbyte, reqbyte;
4479
int branch_extra = 0;
4480
int branch_newextra;
4481
int item_count = -1;
4483
int max_name_size = 0;
4484
int lastitemlength = 0;
4489
BOOL inescq = FALSE;
4490
unsigned int brastackptr = 0;
4493
const uschar *codestart;
4495
compile_data compile_block;
4496
int brastack[BRASTACK_SIZE];
4497
uschar bralenstack[BRASTACK_SIZE];
4499
/* We can't pass back an error message if errorptr is NULL; I guess the best we
4500
can do is just return NULL. */
4502
if (errorptr == NULL) return NULL;
4505
/* However, we can give a message for this error */
4507
if (erroroffset == NULL)
4514
/* Can't support UTF8 unless PCRE has been compiled to include the code. */
4517
utf8 = (options & PCRE_UTF8) != 0;
4518
if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
4519
(*erroroffset = valid_utf8((uschar *)pattern, -1)) >= 0)
4525
if ((options & PCRE_UTF8) != 0)
4532
if ((options & ~PUBLIC_OPTIONS) != 0)
4538
/* Set up pointers to the individual character tables */
4540
if (tables == NULL) tables = pcre_default_tables;
4541
compile_block.lcc = tables + lcc_offset;
4542
compile_block.fcc = tables + fcc_offset;
4543
compile_block.cbits = tables + cbits_offset;
4544
compile_block.ctypes = tables + ctypes_offset;
4546
/* Maximum back reference and backref bitmap. This is updated for numeric
4547
references during the first pass, but for named references during the actual
4548
compile pass. The bitmap records up to 31 back references to help in deciding
4549
whether (.*) can be treated as anchored or not. */
4551
compile_block.top_backref = 0;
4552
compile_block.backref_map = 0;
4554
/* Reflect pattern for debugging output */
4556
DPRINTF(("------------------------------------------------------------------\n"));
4557
DPRINTF(("%s\n", pattern));
4559
/* The first thing to do is to make a pass over the pattern to compute the
4560
amount of store required to hold the compiled code. This does not have to be
4561
perfect as long as errors are overestimates. At the same time we can detect any
4562
flag settings right at the start, and extract them. Make an attempt to correct
4563
for any counted white space if an "extended" flag setting appears late in the
4564
pattern. We can't be so clever for #-comments. */
4566
ptr = (const uschar *)(pattern - 1);
4567
while ((c = *(++ptr)) != 0)
4574
/* If we are inside a \Q...\E sequence, all chars are literal */
4578
if ((options & PCRE_AUTO_CALLOUT) != 0) length += 2 + 2*LINK_SIZE;
4582
/* Otherwise, first check for ignored whitespace and comments */
4584
if ((options & PCRE_EXTENDED) != 0)
4586
if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
4589
/* The space before the ; is to avoid a warning on a silly compiler
4590
on the Macintosh. */
4591
while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
4597
item_count++; /* Is zero for the first non-comment item */
4599
/* Allow space for auto callout before every item except quantifiers. */
4601
if ((options & PCRE_AUTO_CALLOUT) != 0 &&
4602
c != '*' && c != '+' && c != '?' &&
4603
(c != '{' || !is_counted_repeat(ptr + 1)))
4604
length += 2 + 2*LINK_SIZE;
4608
/* A backslashed item may be an escaped data character or it may be a
4612
c = check_escape(&ptr, errorptr, bracount, options, FALSE);
4613
if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4615
lastitemlength = 1; /* Default length of last item for repeats */
4617
if (c >= 0) /* Data character */
4619
length += 2; /* For a one-byte character */
4622
if (utf8 && c > 127)
4625
for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
4626
if (c <= utf8_table1[i]) break;
4628
lastitemlength += i;
4635
/* If \Q, enter "literal" mode */
4643
/* \X is supported only if Unicode property support is compiled */
4649
goto PCRE_ERROR_RETURN;
4653
/* \P and \p are for Unicode properties, but only when the support has
4654
been compiled. Each item needs 2 bytes. */
4656
else if (-c == ESC_P || -c == ESC_p)
4662
if (get_ucp(&ptr, &negated, errorptr) < 0) goto PCRE_ERROR_RETURN;
4666
goto PCRE_ERROR_RETURN;
4670
/* Other escapes need one byte */
4674
/* A back reference needs an additional 2 bytes, plus either one or 5
4675
bytes for a repeat. We also need to keep the value of the highest
4680
int refnum = -c - ESC_REF;
4681
compile_block.backref_map |= (refnum < 32)? (1 << refnum) : 1;
4682
if (refnum > compile_block.top_backref)
4683
compile_block.top_backref = refnum;
4684
length += 2; /* For single back reference */
4685
if (ptr[1] == '{' && is_counted_repeat(ptr+2))
4687
ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
4688
if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4689
if ((min == 0 && (max == 1 || max == -1)) ||
4690
(min == 1 && max == -1))
4693
if (ptr[1] == '?') ptr++;
4698
case '^': /* Single-byte metacharacters */
4705
case '*': /* These repeats won't be after brackets; */
4706
case '+': /* those are handled separately */
4709
goto POSESSIVE; /* A few lines below */
4711
/* This covers the cases of braced repeats after a single char, metachar,
4712
class, or back reference. */
4715
if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
4716
ptr = read_repeat_counts(ptr+1, &min, &max, errorptr);
4717
if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4719
/* These special cases just insert one extra opcode */
4721
if ((min == 0 && (max == 1 || max == -1)) ||
4722
(min == 1 && max == -1))
4725
/* These cases might insert additional copies of a preceding character. */
4731
length -= lastitemlength; /* Uncount the original char or metachar */
4732
if (min > 0) length += 3 + lastitemlength;
4734
length += lastitemlength + ((max > 0)? 3 : 1);
4737
if (ptr[1] == '?') ptr++; /* Needs no extra length */
4739
POSESSIVE: /* Test for possessive quantifier */
4743
length += 2 + 2*LINK_SIZE; /* Allow for atomic brackets */
4747
/* An alternation contains an offset to the next branch or ket. If any ims
4748
options changed in the previous branch(es), and/or if we are in a
4749
lookbehind assertion, extra space will be needed at the start of the
4750
branch. This is handled by branch_extra. */
4753
length += 1 + LINK_SIZE + branch_extra;
4756
/* A character class uses 33 characters provided that all the character
4757
values are less than 256. Otherwise, it uses a bit map for low valued
4758
characters, and individual items for others. Don't worry about character
4759
types that aren't allowed in classes - they'll get picked up during the
4760
compile. A character class that contains only one single-byte character
4761
uses 2 or 3 bytes, depending on whether it is negated or not. Notice this
4762
where we can. (In UTF-8 mode we can do this only for chars < 128.) */
4765
if (*(++ptr) == '^')
4767
class_optcount = 10; /* Greater than one */
4770
else class_optcount = 0;
4776
/* Written as a "do" so that an initial ']' is taken as data */
4780
/* Inside \Q...\E everything is literal except \E */
4784
if (*ptr != '\\' || ptr[1] != 'E') goto GET_ONE_CHARACTER;
4790
/* Outside \Q...\E, check for escapes */
4794
c = check_escape(&ptr, errorptr, bracount, options, TRUE);
4795
if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4797
/* \b is backspace inside a class; \X is literal */
4799
if (-c == ESC_b) c = '\b';
4800
else if (-c == ESC_X) c = 'X';
4802
/* \Q enters quoting mode */
4804
else if (-c == ESC_Q)
4810
/* Handle escapes that turn into characters */
4812
if (c >= 0) goto NON_SPECIAL_CHARACTER;
4814
/* Escapes that are meta-things. The normal ones just affect the
4815
bit map, but Unicode properties require an XCLASS extended item. */
4819
class_optcount = 10; /* \d, \s etc; make sure > 1 */
4821
if (-c == ESC_p || -c == ESC_P)
4826
length += LINK_SIZE + 2;
4834
/* Check the syntax for POSIX stuff. The bits we actually handle are
4835
checked during the real compile phase. */
4837
else if (*ptr == '[' && check_posix_syntax(ptr, &ptr, &compile_block))
4840
class_optcount = 10; /* Make sure > 1 */
4843
/* Anything else increments the possible optimization count. We have to
4844
detect ranges here so that we can compute the number of extra ranges for
4845
caseless wide characters when UCP support is available. If there are wide
4846
characters, we are going to have to use an XCLASS, even for single
4859
GETCHARLEN(c, ptr, extra);
4867
/* Come here from handling \ above when it escapes to a char value */
4869
NON_SPECIAL_CHARACTER:
4875
uschar const *hyptr = ptr++;
4879
d = check_escape(&ptr, errorptr, bracount, options, TRUE);
4880
if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4881
if (-d == ESC_b) d = '\b'; /* backspace */
4882
else if (-d == ESC_X) d = 'X'; /* literal X in a class */
4884
else if (ptr[1] != 0 && ptr[1] != ']')
4891
GETCHARLEN(d, ptr, extra);
4898
if (d < 0) ptr = hyptr; /* go back to hyphen as data */
4901
/* If d >= 0 we have a range. In UTF-8 mode, if the end is > 255, or >
4902
127 for caseless matching, we will need to use an XCLASS. */
4906
class_optcount = 10; /* Ensure > 1 */
4910
goto PCRE_ERROR_RETURN;
4914
if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
4917
if (!class_utf8) /* Allow for XCLASS overhead */
4920
length += LINK_SIZE + 2;
4924
/* If we have UCP support, find out how many extra ranges are
4925
needed to map the other case of characters within this range. We
4926
have to mimic the range optimization here, because extending the
4927
range upwards might push d over a boundary that makes it use
4928
another byte in the UTF-8 representation. */
4930
if ((options & PCRE_CASELESS) != 0)
4935
while (get_othercase_range(&cc, origd, &occ, &ocd))
4937
if (occ >= c && ocd <= d) continue; /* Skip embedded */
4939
if (occ < c && ocd >= c - 1) /* Extend the basic range */
4940
{ /* if there is overlap, */
4941
c = occ; /* noting that if occ < c */
4942
continue; /* we can't have ocd > d */
4943
} /* because a subrange is */
4944
if (ocd > d && occ <= d + 1) /* always shorter than */
4945
{ /* the basic range. */
4950
/* An extra item is needed */
4952
length += 1 + ord2utf8(occ, buffer) +
4953
((occ == ocd)? 0 : ord2utf8(ocd, buffer));
4956
#endif /* SUPPORT_UCP */
4958
/* The length of the (possibly extended) range */
4960
length += 1 + ord2utf8(c, buffer) + ord2utf8(d, buffer);
4962
#endif /* SUPPORT_UTF8 */
4966
/* We have a single character. There is nothing to be done unless we
4967
are in UTF-8 mode. If the char is > 255, or > 127 when caseless, we must
4968
allow for an XCL_SINGLE item, doubled for caselessness if there is UCP
4974
if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
4977
class_optcount = 10; /* Ensure > 1 */
4978
if (!class_utf8) /* Allow for XCLASS overhead */
4981
length += LINK_SIZE + 2;
4984
length += (((options & PCRE_CASELESS) != 0)? 2 : 1) *
4985
(1 + ord2utf8(c, buffer));
4986
#else /* SUPPORT_UCP */
4987
length += 1 + ord2utf8(c, buffer);
4988
#endif /* SUPPORT_UCP */
4990
#endif /* SUPPORT_UTF8 */
4994
while (*(++ptr) != 0 && (inescq || *ptr != ']')); /* Concludes "do" above */
4996
if (*ptr == 0) /* Missing terminating ']' */
4999
goto PCRE_ERROR_RETURN;
5002
/* We can optimize when there was only one optimizable character. Repeats
5003
for positive and negated single one-byte chars are handled by the general
5004
code. Here, we handle repeats for the class opcodes. */
5006
if (class_optcount == 1) length += 3; else
5010
/* A repeat needs either 1 or 5 bytes. If it is a possessive quantifier,
5011
we also need extra for wrapping the whole thing in a sub-pattern. */
5013
if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2))
5015
ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
5016
if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
5017
if ((min == 0 && (max == 1 || max == -1)) ||
5018
(min == 1 && max == -1))
5024
length += 2 + 2*LINK_SIZE;
5026
else if (ptr[1] == '?') ptr++;
5031
/* Brackets may be genuine groups or special things */
5034
branch_newextra = 0;
5035
bracket_length = 1 + LINK_SIZE;
5037
/* Handle special forms of bracket, which all start (? */
5046
/* Skip over comments entirely */
5049
while (*ptr != 0 && *ptr != ')') ptr++;
5053
goto PCRE_ERROR_RETURN;
5057
/* Non-referencing groups and lookaheads just move the pointer on, and
5058
then behave like a non-special bracket, except that they don't increment
5059
the count of extracting brackets. Ditto for the "once only" bracket,
5060
which is in Perl from version 5.005. */
5069
/* (?R) specifies a recursive call to the regex, which is an extension
5070
to provide the facility which can be obtained by (?p{perl-code}) in
5071
Perl 5.6. In Perl 5.8 this has become (??{perl-code}).
5073
From PCRE 4.00, items such as (?3) specify subroutine-like "calls" to
5074
the appropriate numbered brackets. This includes both recursive and
5075
non-recursive calls. (?R) is now synonymous with (?0). */
5080
case '0': case '1': case '2': case '3': case '4':
5081
case '5': case '6': case '7': case '8': case '9':
5084
while ((digitab[*(++ptr)] & ctype_digit) != 0);
5088
goto PCRE_ERROR_RETURN;
5090
length += 1 + LINK_SIZE;
5092
/* If this item is quantified, it will get wrapped inside brackets so
5093
as to use the code for quantified brackets. We jump down and use the
5094
code that handles this for real brackets. */
5096
if (ptr[1] == '+' || ptr[1] == '*' || ptr[1] == '?' || ptr[1] == '{')
5098
length += 2 + 2 * LINK_SIZE; /* to make bracketed */
5099
duplength = 5 + 3 * LINK_SIZE;
5100
goto HANDLE_QUANTIFIED_BRACKETS;
5104
/* (?C) is an extension which provides "callout" - to provide a bit of
5105
the functionality of the Perl (?{...}) feature. An optional number may
5106
follow (default is zero). */
5110
while ((digitab[*(++ptr)] & ctype_digit) != 0);
5114
goto PCRE_ERROR_RETURN;
5116
length += 2 + 2*LINK_SIZE;
5119
/* Named subpatterns are an extension copied from Python */
5125
const uschar *p; /* Don't amalgamate; some compilers */
5126
p = ++ptr; /* grumble at autoincrement in declaration */
5127
while ((compile_block.ctypes[*ptr] & ctype_word) != 0) ptr++;
5131
goto PCRE_ERROR_RETURN;
5134
if (ptr - p > max_name_size) max_name_size = (ptr - p);
5138
if (*ptr == '=' || *ptr == '>')
5140
while ((compile_block.ctypes[*(++ptr)] & ctype_word) != 0);
5144
goto PCRE_ERROR_RETURN;
5149
/* Unknown character after (?P */
5152
goto PCRE_ERROR_RETURN;
5154
/* Lookbehinds are in Perl from version 5.005 */
5158
if (*ptr == '=' || *ptr == '!')
5160
branch_newextra = 1 + LINK_SIZE;
5161
length += 1 + LINK_SIZE; /* For the first branch */
5165
goto PCRE_ERROR_RETURN;
5167
/* Conditionals are in Perl from version 5.005. The bracket must either
5168
be followed by a number (for bracket reference) or by an assertion
5169
group, or (a PCRE extension) by 'R' for a recursion test. */
5172
if (ptr[3] == 'R' && ptr[4] == ')')
5177
else if ((digitab[ptr[3]] & ctype_digit) != 0)
5181
while ((digitab[*ptr] & ctype_digit) != 0) ptr++;
5185
goto PCRE_ERROR_RETURN;
5188
else /* An assertion must follow */
5190
ptr++; /* Can treat like ':' as far as spacing is concerned */
5191
if (ptr[2] != '?' ||
5192
(ptr[3] != '=' && ptr[3] != '!' && ptr[3] != '<') )
5194
ptr += 2; /* To get right offset in message */
5196
goto PCRE_ERROR_RETURN;
5201
/* Else loop checking valid options until ) is met. Anything else is an
5202
error. If we are without any brackets, i.e. at top level, the settings
5203
act as if specified in the options, so massage the options immediately.
5204
This is for backward compatibility with Perl 5.004. */
5217
*optset |= PCRE_CASELESS;
5221
*optset |= PCRE_MULTILINE;
5225
*optset |= PCRE_DOTALL;
5229
*optset |= PCRE_EXTENDED;
5233
*optset |= PCRE_EXTRA;
5237
*optset |= PCRE_UNGREEDY;
5244
/* A termination by ')' indicates an options-setting-only item; if
5245
this is at the very start of the pattern (indicated by item_count
5246
being zero), we use it to set the global options. This is helpful
5247
when analyzing the pattern for first characters, etc. Otherwise
5248
nothing is done here and it is handled during the compiling
5251
[Historical note: Up to Perl 5.8, options settings at top level
5252
were always global settings, wherever they appeared in the pattern.
5253
That is, they were equivalent to an external setting. From 5.8
5254
onwards, they apply only to what follows (which is what you might
5258
if (item_count == 0)
5260
options = (options | set) & (~unset);
5261
set = unset = 0; /* To save length */
5262
item_count--; /* To allow for several */
5267
/* A termination by ':' indicates the start of a nested group with
5268
the given options set. This is again handled at compile time, but
5269
we must allow for compiled space if any of the ims options are
5270
set. We also have to allow for resetting space at the end of
5271
the group, which is why 4 is added to the length and not just 2.
5272
If there are several changes of options within the same group, this
5273
will lead to an over-estimate on the length, but this shouldn't
5274
matter very much. We also have to allow for resetting options at
5275
the start of any alternations, which we do by setting
5276
branch_newextra to 2. Finally, we record whether the case-dependent
5277
flag ever changes within the regex. This is used by the "required
5281
if (((set|unset) & PCRE_IMS) != 0)
5284
branch_newextra = 2;
5285
if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED;
5289
/* Unrecognized option character */
5293
goto PCRE_ERROR_RETURN;
5297
/* If we hit a closing bracket, that's it - this is a freestanding
5298
option-setting. We need to ensure that branch_extra is updated if
5299
necessary. The only values branch_newextra can have here are 0 or 2.
5300
If the value is 2, then branch_extra must either be 2 or 5, depending
5301
on whether this is a lookbehind group or not. */
5306
if (branch_newextra == 2 &&
5307
(branch_extra == 0 || branch_extra == 1+LINK_SIZE))
5308
branch_extra += branch_newextra;
5312
/* If options were terminated by ':' control comes here. Fall through
5313
to handle the group below. */
5317
/* Extracting brackets must be counted so we can process escapes in a
5318
Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to
5319
need an additional 3 bytes of store per extracting bracket. However, if
5320
PCRE_NO_AUTO_CAPTURE is set, unadorned brackets become non-capturing, so we
5321
must leave the count alone (it will always be zero). */
5323
else if ((options & PCRE_NO_AUTO_CAPTURE) == 0)
5326
if (bracount > EXTRACT_BASIC_MAX) bracket_length += 3;
5329
/* Save length for computing whole length at end if there's a repeat that
5330
requires duplication of the group. Also save the current value of
5331
branch_extra, and start the new group with the new value. If non-zero, this
5332
will either be 2 for a (?imsx: group, or 3 for a lookbehind assertion. */
5334
if (brastackptr >= sizeof(brastack)/sizeof(int))
5337
goto PCRE_ERROR_RETURN;
5340
bralenstack[brastackptr] = branch_extra;
5341
branch_extra = branch_newextra;
5343
brastack[brastackptr++] = length;
5344
length += bracket_length;
5347
/* Handle ket. Look for subsequent max/min; for certain sets of values we
5348
have to replicate this bracket up to that many times. If brastackptr is
5349
0 this is an unmatched bracket which will generate an error, but take care
5350
not to try to access brastack[-1] when computing the length and restoring
5351
the branch_extra value. */
5354
length += 1 + LINK_SIZE;
5355
if (brastackptr > 0)
5357
duplength = length - brastack[--brastackptr];
5358
branch_extra = bralenstack[brastackptr];
5362
/* The following code is also used when a recursion such as (?3) is
5363
followed by a quantifier, because in that case, it has to be wrapped inside
5364
brackets so that the quantifier works. The value of duplength must be
5365
set before arrival. */
5367
HANDLE_QUANTIFIED_BRACKETS:
5369
/* Leave ptr at the final char; for read_repeat_counts this happens
5370
automatically; for the others we need an increment. */
5372
if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2))
5374
ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
5375
if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
5377
else if (c == '*') { min = 0; max = -1; ptr++; }
5378
else if (c == '+') { min = 1; max = -1; ptr++; }
5379
else if (c == '?') { min = 0; max = 1; ptr++; }
5380
else { min = 1; max = 1; }
5382
/* If the minimum is zero, we have to allow for an OP_BRAZERO before the
5383
group, and if the maximum is greater than zero, we have to replicate
5384
maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
5390
if (max > 0) length += (max - 1) * (duplength + 3 + 2*LINK_SIZE);
5393
/* When the minimum is greater than zero, we have to replicate up to
5394
minval-1 times, with no additions required in the copies. Then, if there
5395
is a limited maximum we have to replicate up to maxval-1 times allowing
5396
for a BRAZERO item before each optional copy and nesting brackets for all
5397
but one of the optional copies. */
5401
length += (min - 1) * duplength;
5402
if (max > min) /* Need this test as max=-1 means no limit */
5403
length += (max - min) * (duplength + 3 + 2*LINK_SIZE)
5404
- (2 + 2*LINK_SIZE);
5407
/* Allow space for once brackets for "possessive quantifier" */
5412
length += 2 + 2*LINK_SIZE;
5416
/* Non-special character. It won't be space or # in extended mode, so it is
5417
always a genuine character. If we are in a \Q...\E sequence, check for the
5418
end; if not, we have a literal. */
5423
if (inescq && c == '\\' && ptr[1] == 'E')
5430
length += 2; /* For a one-byte character */
5431
lastitemlength = 1; /* Default length of last item for repeats */
5433
/* In UTF-8 mode, check for additional bytes. */
5436
if (utf8 && (c & 0xc0) == 0xc0)
5438
while ((ptr[1] & 0xc0) == 0x80) /* Can't flow over the end */
5439
{ /* because the end is marked */
5440
lastitemlength++; /* by a zero byte. */
5451
length += 2 + LINK_SIZE; /* For final KET and END */
5453
if ((options & PCRE_AUTO_CALLOUT) != 0)
5454
length += 2 + 2*LINK_SIZE; /* For final callout */
5456
if (length > MAX_PATTERN_SIZE)
5462
/* Compute the size of data block needed and get it, either from malloc or
5463
externally provided function. */
5465
size = length + sizeof(real_pcre) + name_count * (max_name_size + 3);
5466
re = (real_pcre *)(pcre_malloc)(size);
5474
/* Put in the magic number, and save the sizes, options, and character table
5475
pointer. NULL is used for the default character tables. The nullpad field is at
5476
the end; it's there to help in the case when a regex compiled on a system with
5477
4-byte pointers is run on another with 8-byte pointers. */
5479
re->magic_number = MAGIC_NUMBER;
5481
re->options = options;
5482
re->dummy1 = re->dummy2 = 0;
5483
re->name_table_offset = sizeof(real_pcre);
5484
re->name_entry_size = max_name_size + 3;
5485
re->name_count = name_count;
5486
re->tables = (tables == pcre_default_tables)? NULL : tables;
5489
/* The starting points of the name/number translation table and of the code are
5490
passed around in the compile data block. */
5492
compile_block.names_found = 0;
5493
compile_block.name_entry_size = max_name_size + 3;
5494
compile_block.name_table = (uschar *)re + re->name_table_offset;
5495
codestart = compile_block.name_table + re->name_entry_size * re->name_count;
5496
compile_block.start_code = codestart;
5497
compile_block.start_pattern = (const uschar *)pattern;
5498
compile_block.req_varyopt = 0;
5499
compile_block.nopartial = FALSE;
5501
/* Set up a starting, non-extracting bracket, then compile the expression. On
5502
error, *errorptr will be set non-NULL, so we don't need to look at the result
5503
of the function here. */
5505
ptr = (const uschar *)pattern;
5506
code = (uschar *)codestart;
5509
(void)compile_regex(options, options & PCRE_IMS, &bracount, &code, &ptr,
5510
errorptr, FALSE, 0, &firstbyte, &reqbyte, NULL, &compile_block);
5511
re->top_bracket = bracount;
5512
re->top_backref = compile_block.top_backref;
5514
if (compile_block.nopartial) re->options |= PCRE_NOPARTIAL;
5516
/* If not reached end of pattern on success, there's an excess bracket. */
5518
if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
5520
/* Fill in the terminating state and check for disastrous overflow, but
5521
if debugging, leave the test till after things are printed out. */
5526
if (code - codestart > length) *errorptr = ERR23;
5529
/* Give an error if there's a back reference to a non-existent capturing
5532
if (re->top_backref > re->top_bracket) *errorptr = ERR15;
5534
/* Failed to compile, or error while post-processing */
5536
if (*errorptr != NULL)
5540
*erroroffset = ptr - (const uschar *)pattern;
5544
/* If the anchored option was not passed, set the flag if we can determine that
5545
the pattern is anchored by virtue of ^ characters or \A or anything else (such
5546
as starting with .* when DOTALL is set).
5548
Otherwise, if we know what the first character has to be, save it, because that
5549
speeds up unanchored matches no end. If not, see if we can set the
5550
PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
5551
start with ^, and also when all branches start with .* for non-DOTALL matches.
5554
if ((options & PCRE_ANCHORED) == 0)
5556
int temp_options = options;
5557
if (is_anchored(codestart, &temp_options, 0, compile_block.backref_map))
5558
re->options |= PCRE_ANCHORED;
5562
firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
5563
if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */
5565
int ch = firstbyte & 255;
5566
re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
5567
compile_block.fcc[ch] == ch)? ch : firstbyte;
5568
re->options |= PCRE_FIRSTSET;
5570
else if (is_startline(codestart, 0, compile_block.backref_map))
5571
re->options |= PCRE_STARTLINE;
5575
/* For an anchored pattern, we use the "required byte" only if it follows a
5576
variable length item in the regex. Remove the caseless flag for non-caseable
5580
((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
5582
int ch = reqbyte & 255;
5583
re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
5584
compile_block.fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
5585
re->options |= PCRE_REQCHSET;
5588
/* Print out the compiled data for debugging */
5592
printf("Length = %d top_bracket = %d top_backref = %d\n",
5593
length, re->top_bracket, re->top_backref);
5595
if (re->options != 0)
5597
printf("%s%s%s%s%s%s%s%s%s%s\n",
5598
((re->options & PCRE_NOPARTIAL) != 0)? "nopartial " : "",
5599
((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
5600
((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
5601
((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "",
5602
((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
5603
((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
5604
((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
5605
((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
5606
((re->options & PCRE_EXTRA) != 0)? "extra " : "",
5607
((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
5610
if ((re->options & PCRE_FIRSTSET) != 0)
5612
int ch = re->first_byte & 255;
5613
const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)? "" : " (caseless)";
5614
if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
5615
else printf("First char = \\x%02x%s\n", ch, caseless);
5618
if ((re->options & PCRE_REQCHSET) != 0)
5620
int ch = re->req_byte & 255;
5621
const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)? "" : " (caseless)";
5622
if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
5623
else printf("Req char = \\x%02x%s\n", ch, caseless);
5626
print_internals(re, stdout);
5628
/* This check is done here in the debugging case so that the code that
5629
was compiled can be seen. */
5631
if (code - codestart > length)
5635
*erroroffset = ptr - (uschar *)pattern;
5645
/*************************************************
5646
* Match a back-reference *
5647
*************************************************/
5649
/* If a back reference hasn't been set, the length that is passed is greater
5650
than the number of characters left in the string, so the match fails.
5653
offset index into the offset vector
5654
eptr points into the subject
5655
length length to be matched
5656
md points to match data block
5659
Returns: TRUE if matched
5663
match_ref(int offset, register const uschar *eptr, int length, match_data *md,
5664
unsigned long int ims)
5666
const uschar *p = md->start_subject + md->offset_vector[offset];
5669
if (eptr >= md->end_subject)
5670
printf("matching subject <null>");
5673
printf("matching subject ");
5674
pchars(eptr, length, TRUE, md);
5676
printf(" against backref ");
5677
pchars(p, length, FALSE, md);
5681
/* Always fail if not enough characters left */
5683
if (length > md->end_subject - eptr) return FALSE;
5685
/* Separate the caseless case for speed */
5687
if ((ims & PCRE_CASELESS) != 0)
5689
while (length-- > 0)
5690
if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
5693
{ while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
5700
/*************************************************
5701
* Match character against an XCLASS *
5702
*************************************************/
5704
/* This function is called from within the XCLASS code below, to match a
5705
character against an extended class which might match values > 255.
5709
data points to the flag byte of the XCLASS data
5711
Returns: TRUE if character matches, else FALSE
5715
match_xclass(int c, const uschar *data)
5718
BOOL negated = (*data & XCL_NOT) != 0;
5720
/* Character values < 256 are matched against a bitmap, if one is present. If
5721
not, we still carry on, because there may be ranges that start below 256 in the
5726
if ((*data & XCL_MAP) != 0 && (data[1 + c/8] & (1 << (c&7))) != 0)
5727
return !negated; /* char found */
5730
/* First skip the bit map if present. Then match against the list of Unicode
5731
properties or large chars or ranges that end with a large char. We won't ever
5732
encounter XCL_PROP or XCL_NOTPROP when UCP support is not compiled. */
5734
if ((*data++ & XCL_MAP) != 0) data += 32;
5736
while ((t = *data++) != XCL_END)
5739
if (t == XCL_SINGLE)
5741
GETCHARINC(x, data);
5742
if (c == x) return !negated;
5744
else if (t == XCL_RANGE)
5746
GETCHARINC(x, data);
5747
GETCHARINC(y, data);
5748
if (c >= x && c <= y) return !negated;
5752
else /* XCL_PROP & XCL_NOTPROP */
5754
int chartype, othercase;
5755
int rqdtype = *data++;
5756
int category = ucp_findchar(c, &chartype, &othercase);
5759
if ((rqdtype - 128 == category) == (t == XCL_PROP)) return !negated;
5763
if ((rqdtype == chartype) == (t == XCL_PROP)) return !negated;
5766
#endif /* SUPPORT_UCP */
5769
return negated; /* char did not match */
5774
/***************************************************************************
5775
****************************************************************************
5776
RECURSION IN THE match() FUNCTION
5778
The match() function is highly recursive. Some regular expressions can cause
5779
it to recurse thousands of times. I was writing for Unix, so I just let it
5780
call itself recursively. This uses the stack for saving everything that has
5781
to be saved for a recursive call. On Unix, the stack can be large, and this
5784
It turns out that on non-Unix systems there are problems with programs that
5785
use a lot of stack. (This despite the fact that every last chip has oodles
5786
of memory these days, and techniques for extending the stack have been known
5787
for decades.) So....
5789
There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
5790
calls by keeping local variables that need to be preserved in blocks of memory
5791
obtained from malloc instead of on the stack. Macros are used to
5792
achieve this so that the actual code doesn't look very different to what it
5794
****************************************************************************
5795
***************************************************************************/
5798
/* These versions of the macros use the stack, as normal */
5801
#define REGISTER register
5802
#define RMATCH(rx,ra,rb,rc,rd,re,rf,rg) rx = match(ra,rb,rc,rd,re,rf,rg)
5803
#define RRETURN(ra) return ra
5807
/* These versions of the macros manage a private stack on the heap. Note
5808
that the rd argument of RMATCH isn't actually used. It's the md argument of
5809
match(), which never changes. */
5813
#define RMATCH(rx,ra,rb,rc,rd,re,rf,rg)\
5815
heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
5816
if (setjmp(frame->Xwhere) == 0)\
5818
newframe->Xeptr = ra;\
5819
newframe->Xecode = rb;\
5820
newframe->Xoffset_top = rc;\
5821
newframe->Xims = re;\
5822
newframe->Xeptrb = rf;\
5823
newframe->Xflags = rg;\
5824
newframe->Xprevframe = frame;\
5826
DPRINTF(("restarting from line %d\n", __LINE__));\
5831
DPRINTF(("longjumped back to line %d\n", __LINE__));\
5832
frame = md->thisframe;\
5833
rx = frame->Xresult;\
5837
#define RRETURN(ra)\
5839
heapframe *newframe = frame;\
5840
frame = newframe->Xprevframe;\
5841
(pcre_stack_free)(newframe);\
5844
frame->Xresult = ra;\
5845
md->thisframe = frame;\
5846
longjmp(frame->Xwhere, 1);\
5852
/* Structure for remembering the local variables in a private frame */
5854
typedef struct heapframe {
5855
struct heapframe *Xprevframe;
5857
/* Function arguments that may change */
5859
const uschar *Xeptr;
5860
const uschar *Xecode;
5866
/* Function local variables */
5868
const uschar *Xcallpat;
5869
const uschar *Xcharptr;
5870
const uschar *Xdata;
5871
const uschar *Xnext;
5873
const uschar *Xprev;
5874
const uschar *Xsaved_eptr;
5876
recursion_info Xnew_recursive;
5883
unsigned long int Xoriginal_ims;
5887
int Xprop_fail_result;
5890
int Xprop_othercase;
5891
int Xprop_test_against;
5892
int *Xprop_test_variable;
5904
int Xsave_capture_last;
5905
int Xsave_offset1, Xsave_offset2, Xsave_offset3;
5906
int Xstacksave[REC_STACK_SAVE_MAX];
5910
/* Place to pass back result, and where to jump back to */
5920
/***************************************************************************
5921
***************************************************************************/
5925
/*************************************************
5926
* Match from current position *
5927
*************************************************/
5929
/* On entry ecode points to the first opcode, and eptr to the first character
5930
in the subject string, while eptrb holds the value of eptr at the start of the
5931
last bracketed group - used for breaking infinite loops matching zero-length
5932
strings. This function is called recursively in many circumstances. Whenever it
5933
returns a negative (error) response, the outer incarnation must also return the
5936
Performance note: It might be tempting to extract commonly used fields from the
5937
md structure (e.g. utf8, end_subject) into individual variables to improve
5938
performance. Tests using gcc on a SPARC disproved this; in the first case, it
5939
made performance worse.
5942
eptr pointer in subject
5943
ecode position in code
5944
offset_top current top pointer
5945
md pointer to "static" info for the match
5946
ims current /i, /m, and /s options
5947
eptrb pointer to chain of blocks containing eptr at start of
5948
brackets - for testing for empty matches
5950
match_condassert - this is an assertion condition
5951
match_isgroup - this is the start of a bracketed group
5953
Returns: MATCH_MATCH if matched ) these values are >= 0
5954
MATCH_NOMATCH if failed to match )
5955
a negative PCRE_ERROR_xxx value if aborted by an error condition
5956
(e.g. stopped by recursion limit)
5960
match(REGISTER const uschar *eptr, REGISTER const uschar *ecode,
5961
int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
5964
/* These variables do not need to be preserved over recursion in this function,
5965
so they can be ordinary variables in all cases. Mark them with "register"
5966
because they are used a lot in loops. */
5968
register int rrc; /* Returns from recursive calls */
5969
register int i; /* Used for loops not involving calls to RMATCH() */
5970
register int c; /* Character values not kept over RMATCH() calls */
5972
/* When recursion is not being used, all "local" variables that have to be
5973
preserved over calls to RMATCH() are part of a "frame" which is obtained from
5974
heap storage. Set up the top-level frame here; others are obtained from the
5975
heap whenever RMATCH() does a "recursion". See the macro definitions above. */
5978
heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
5979
frame->Xprevframe = NULL; /* Marks the top level */
5981
/* Copy in the original argument variables */
5983
frame->Xeptr = eptr;
5984
frame->Xecode = ecode;
5985
frame->Xoffset_top = offset_top;
5987
frame->Xeptrb = eptrb;
5988
frame->Xflags = flags;
5990
/* This is the point that control jumps back to in order to effect "recursion" */
5994
/* Macros make the argument variables come from the current frame */
5996
#define eptr frame->Xeptr
5997
#define ecode frame->Xecode
5998
#define offset_top frame->Xoffset_top
5999
#define ims frame->Xims
6000
#define eptrb frame->Xeptrb
6001
#define flags frame->Xflags
6003
/* Ditto for the local variables */
6006
#define charptr frame->Xcharptr
6008
#define callpat frame->Xcallpat
6009
#define data frame->Xdata
6010
#define next frame->Xnext
6011
#define pp frame->Xpp
6012
#define prev frame->Xprev
6013
#define saved_eptr frame->Xsaved_eptr
6015
#define new_recursive frame->Xnew_recursive
6017
#define cur_is_word frame->Xcur_is_word
6018
#define condition frame->Xcondition
6019
#define minimize frame->Xminimize
6020
#define prev_is_word frame->Xprev_is_word
6022
#define original_ims frame->Xoriginal_ims
6025
#define prop_type frame->Xprop_type
6026
#define prop_fail_result frame->Xprop_fail_result
6027
#define prop_category frame->Xprop_category
6028
#define prop_chartype frame->Xprop_chartype
6029
#define prop_othercase frame->Xprop_othercase
6030
#define prop_test_against frame->Xprop_test_against
6031
#define prop_test_variable frame->Xprop_test_variable
6034
#define ctype frame->Xctype
6035
#define fc frame->Xfc
6036
#define fi frame->Xfi
6037
#define length frame->Xlength
6038
#define max frame->Xmax
6039
#define min frame->Xmin
6040
#define number frame->Xnumber
6041
#define offset frame->Xoffset
6042
#define op frame->Xop
6043
#define save_capture_last frame->Xsave_capture_last
6044
#define save_offset1 frame->Xsave_offset1
6045
#define save_offset2 frame->Xsave_offset2
6046
#define save_offset3 frame->Xsave_offset3
6047
#define stacksave frame->Xstacksave
6049
#define newptrb frame->Xnewptrb
6051
/* When recursion is being used, local variables are allocated on the stack and
6052
get preserved during recursion in the normal way. In this environment, fi and
6053
i, and fc and c, can be the same variables. */
6060
#ifdef SUPPORT_UTF8 /* Many of these variables are used ony */
6061
const uschar *charptr; /* small blocks of the code. My normal */
6062
#endif /* style of coding would have declared */
6063
const uschar *callpat; /* them within each of those blocks. */
6064
const uschar *data; /* However, in order to accommodate the */
6065
const uschar *next; /* version of this code that uses an */
6066
const uschar *pp; /* external "stack" implemented on the */
6067
const uschar *prev; /* heap, it is easier to declare them */
6068
const uschar *saved_eptr; /* all here, so the declarations can */
6069
/* be cut out in a block. The only */
6070
recursion_info new_recursive; /* declarations within blocks below are */
6071
/* for variables that do not have to */
6072
BOOL cur_is_word; /* be preserved over a recursive call */
6073
BOOL condition; /* to RMATCH(). */
6077
unsigned long int original_ims;
6081
int prop_fail_result;
6085
int prop_test_against;
6086
int *prop_test_variable;
6096
int save_capture_last;
6097
int save_offset1, save_offset2, save_offset3;
6098
int stacksave[REC_STACK_SAVE_MAX];
6103
/* These statements are here to stop the compiler complaining about uninitialized
6107
prop_fail_result = 0;
6108
prop_test_against = 0;
6109
prop_test_variable = NULL;
6112
/* OK, now we can get on with the real code of the function. Recursion is
6113
specified by the macros RMATCH and RRETURN. When NO_RECURSE is *not* defined,
6114
these just turn into a recursive call to match() and a "return", respectively.
6115
However, RMATCH isn't like a function call because it's quite a complicated
6116
macro. It has to be used in one particular way. This shouldn't, however, impact
6117
performance when true recursion is being used. */
6119
if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
6121
original_ims = ims; /* Save for resetting on ')' */
6123
/* At the start of a bracketed group, add the current subject pointer to the
6124
stack of such pointers, to be re-instated at the end of the group when we hit
6125
the closing ket. When match() is called in other circumstances, we don't add to
6128
if ((flags & match_isgroup) != 0)
6130
newptrb.epb_prev = eptrb;
6131
newptrb.epb_saved_eptr = eptr;
6135
/* Now start processing the operations. */
6142
/* For partial matching, remember if we ever hit the end of the subject after
6143
matching at least one subject character. */
6146
eptr >= md->end_subject &&
6147
eptr > md->start_match)
6150
/* Opening capturing bracket. If there is space in the offset vector, save
6151
the current subject position in the working slot at the top of the vector. We
6152
mustn't change the current values of the data slot, because they may be set
6153
from a previous iteration of this group, and be referred to by a reference
6156
If the bracket fails to match, we need to restore this value and also the
6157
values of the final offsets, in case they were set by a previous iteration of
6160
If there isn't enough space in the offset vector, treat this as if it were a
6161
non-capturing bracket. Don't worry about setting the flag for the error case
6162
here; that is handled in the code for KET. */
6166
number = op - OP_BRA;
6168
/* For extended extraction brackets (large number), we have to fish out the
6169
number from a dummy opcode at the start. */
6171
if (number > EXTRACT_BASIC_MAX)
6172
number = GET2(ecode, 2+LINK_SIZE);
6173
offset = number << 1;
6176
printf("start bracket %d subject=", number);
6177
pchars(eptr, 16, TRUE, md);
6181
if (offset < md->offset_max)
6183
save_offset1 = md->offset_vector[offset];
6184
save_offset2 = md->offset_vector[offset+1];
6185
save_offset3 = md->offset_vector[md->offset_end - number];
6186
save_capture_last = md->capture_last;
6188
DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
6189
md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
6193
RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
6195
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6196
md->capture_last = save_capture_last;
6197
ecode += GET(ecode, 1);
6199
while (*ecode == OP_ALT);
6201
DPRINTF(("bracket %d failed\n", number));
6203
md->offset_vector[offset] = save_offset1;
6204
md->offset_vector[offset+1] = save_offset2;
6205
md->offset_vector[md->offset_end - number] = save_offset3;
6207
RRETURN(MATCH_NOMATCH);
6210
/* Insufficient room for saving captured contents */
6215
/* Other types of node can be handled by a switch */
6219
case OP_BRA: /* Non-capturing bracket: optimized */
6220
DPRINTF(("start bracket 0\n"));
6223
RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
6225
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6226
ecode += GET(ecode, 1);
6228
while (*ecode == OP_ALT);
6229
DPRINTF(("bracket 0 failed\n"));
6230
RRETURN(MATCH_NOMATCH);
6232
/* Conditional group: compilation checked that there are no more than
6233
two branches. If the condition is false, skipping the first branch takes us
6234
past the end if there is only one branch, but that's OK because that is
6235
exactly what going to the ket would do. */
6238
if (ecode[LINK_SIZE+1] == OP_CREF) /* Condition extract or recurse test */
6240
offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
6241
condition = (offset == CREF_RECURSE * 2)?
6242
(md->recursive != NULL) :
6243
(offset < offset_top && md->offset_vector[offset] >= 0);
6244
RMATCH(rrc, eptr, ecode + (condition?
6245
(LINK_SIZE + 4) : (LINK_SIZE + 1 + GET(ecode, 1))),
6246
offset_top, md, ims, eptrb, match_isgroup);
6250
/* The condition is an assertion. Call match() to evaluate it - setting
6251
match_condassert in the final (flags) argument causes it to stop at the end of an assertion. */
6255
RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
6256
match_condassert | match_isgroup);
6257
if (rrc == MATCH_MATCH)
6259
ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE+2);
6260
while (*ecode == OP_ALT) ecode += GET(ecode, 1);
6262
else if (rrc != MATCH_NOMATCH)
6264
RRETURN(rrc); /* Need braces because of following else */
6266
else ecode += GET(ecode, 1);
6267
RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
6271
/* Control never reaches here */
6273
/* Skip over conditional reference or large extraction number data if
6281
/* End of the pattern. If we are in a recursion, we should restore the
6282
offsets appropriately and continue from after the call. */
6285
if (md->recursive != NULL && md->recursive->group_num == 0)
6287
recursion_info *rec = md->recursive;
6288
DPRINTF(("Hit the end in a (?0) recursion\n"));
6289
md->recursive = rec->prevrec;
6290
memmove(md->offset_vector, rec->offset_save,
6291
rec->saved_max * sizeof(int));
6292
md->start_match = rec->save_start;
6294
ecode = rec->after_call;
6298
/* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
6299
string - backtracking will then try other alternatives, if any. */
6301
if (md->notempty && eptr == md->start_match) RRETURN(MATCH_NOMATCH);
6302
md->end_match_ptr = eptr; /* Record where we ended */
6303
md->end_offset_top = offset_top; /* and how many extracts were taken */
6304
RRETURN(MATCH_MATCH);
6306
/* Change option settings */
6311
DPRINTF(("ims set to %02lx\n", ims));
6314
/* Assertion brackets. Check the alternative branches in turn - the
6315
matching won't pass the KET for an assertion. If any one branch matches,
6316
the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
6317
start of each branch to move the current point backwards, so the code at
6318
this level is identical to the lookahead case. */
6324
RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
6326
if (rrc == MATCH_MATCH) break;
6327
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6328
ecode += GET(ecode, 1);
6330
while (*ecode == OP_ALT);
6331
if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
6333
/* If checking an assertion for a condition, return MATCH_MATCH. */
6335
if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
6337
/* Continue from after the assertion, updating the offsets high water
6338
mark, since extracts may have been taken during the assertion. */
6340
do ecode += GET(ecode,1); while (*ecode == OP_ALT);
6341
ecode += 1 + LINK_SIZE;
6342
offset_top = md->end_offset_top;
6345
/* Negative assertion: all branches must fail to match */
6348
case OP_ASSERTBACK_NOT:
6351
RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
6353
if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
6354
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6355
ecode += GET(ecode,1);
6357
while (*ecode == OP_ALT);
6359
if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
6361
ecode += 1 + LINK_SIZE;
6364
/* Move the subject pointer back. This occurs only at the start of
6365
each branch of a lookbehind assertion. If we are too close to the start to
6366
move back, this match function fails. When working with UTF-8 we move
6367
back a number of characters, not bytes. */
6374
for (i = 0; i < c; i++)
6377
if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
6384
/* No UTF-8 support, or not in UTF-8 mode: count is byte count */
6387
eptr -= GET(ecode,1);
6388
if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
6391
/* Skip to next op code */
6393
ecode += 1 + LINK_SIZE;
6396
/* The callout item calls an external function, if one is provided, passing
6397
details of the match so far. This is mainly for debugging, though the
6398
function is able to force a failure. */
6401
if (pcre_callout != NULL)
6403
pcre_callout_block cb;
6404
cb.version = 1; /* Version 1 of the callout block */
6405
cb.callout_number = ecode[1];
6406
cb.offset_vector = md->offset_vector;
6407
cb.subject = (const char *)md->start_subject;
6408
cb.subject_length = md->end_subject - md->start_subject;
6409
cb.start_match = md->start_match - md->start_subject;
6410
cb.current_position = eptr - md->start_subject;
6411
cb.pattern_position = GET(ecode, 2);
6412
cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
6413
cb.capture_top = offset_top/2;
6414
cb.capture_last = md->capture_last;
6415
cb.callout_data = md->callout_data;
6416
if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
6417
if (rrc < 0) RRETURN(rrc);
6419
ecode += 2 + 2*LINK_SIZE;
6422
/* Recursion either matches the current regex, or some subexpression. The
6423
offset data is the offset to the starting bracket from the start of the
6424
whole pattern. (This is so that it works from duplicated subpatterns.)
6426
If there are any capturing brackets started but not finished, we have to
6427
save their starting points and reinstate them after the recursion. However,
6428
we don't know how many such there are (offset_top records the completed
6429
total) so we just have to save all the potential data. There may be up to
6430
65535 such values, which is too large to put on the stack, but using malloc
6431
for small numbers seems expensive. As a compromise, the stack is used when
6432
there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
6433
is used. If the malloc fails, the offsets cannot be saved, so the recursion
6434
is not attempted and PCRE_ERROR_NOMEMORY is returned from this level
6435
immediately.
6437
There are also other values that have to be saved. We use a chained
6438
sequence of blocks that actually live on the stack. Thanks to Robin Houston
6439
for the original version of this logic. */
6443
callpat = md->start_code + GET(ecode, 1);
6444
new_recursive.group_num = *callpat - OP_BRA;
6446
/* For extended extraction brackets (large number), we have to fish out
6447
the number from a dummy opcode at the start. */
6449
if (new_recursive.group_num > EXTRACT_BASIC_MAX)
6450
new_recursive.group_num = GET2(callpat, 2+LINK_SIZE);
6452
/* Add to "recursing stack" */
6454
new_recursive.prevrec = md->recursive;
6455
md->recursive = &new_recursive;
6457
/* Find where to continue from afterwards */
6459
ecode += 1 + LINK_SIZE;
6460
new_recursive.after_call = ecode;
6462
/* Now save the offset data. */
6464
new_recursive.saved_max = md->offset_end;
6465
if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
6466
new_recursive.offset_save = stacksave;
6469
new_recursive.offset_save =
6470
(int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
6471
if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
6474
memcpy(new_recursive.offset_save, md->offset_vector,
6475
new_recursive.saved_max * sizeof(int));
6476
new_recursive.save_start = md->start_match;
6477
md->start_match = eptr;
6479
/* OK, now we can do the recursion. For each top-level alternative we
6480
restore the offset and recursion data. */
6482
DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
6485
RMATCH(rrc, eptr, callpat + 1 + LINK_SIZE, offset_top, md, ims,
6486
eptrb, match_isgroup);
6487
if (rrc == MATCH_MATCH)
6489
md->recursive = new_recursive.prevrec;
6490
if (new_recursive.offset_save != stacksave)
6491
(pcre_free)(new_recursive.offset_save);
6492
RRETURN(MATCH_MATCH);
6494
else if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6496
md->recursive = &new_recursive;
6497
memcpy(md->offset_vector, new_recursive.offset_save,
6498
new_recursive.saved_max * sizeof(int));
6499
callpat += GET(callpat, 1);
6501
while (*callpat == OP_ALT);
6503
DPRINTF(("Recursion didn't match\n"));
6504
md->recursive = new_recursive.prevrec;
6505
if (new_recursive.offset_save != stacksave)
6506
(pcre_free)(new_recursive.offset_save);
6507
RRETURN(MATCH_NOMATCH);
6509
/* Control never reaches here */
6511
/* "Once" brackets are like assertion brackets except that after a match,
6512
the point in the subject string is not moved back. Thus there can never be
6513
a move back into the brackets. Friedl calls these "atomic" subpatterns.
6514
Check the alternative branches in turn - the matching won't pass the KET
6515
for this kind of subpattern. If any one branch matches, we carry on as at
6516
the end of a normal bracket, leaving the subject pointer. */
6525
RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims,
6526
eptrb, match_isgroup);
6527
if (rrc == MATCH_MATCH) break;
6528
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6529
ecode += GET(ecode,1);
6531
while (*ecode == OP_ALT);
6533
/* If hit the end of the group (which could be repeated), fail */
6535
if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
6537
/* Continue as from after the assertion, updating the offsets high water
6538
mark, since extracts may have been taken. */
6540
do ecode += GET(ecode,1); while (*ecode == OP_ALT);
6542
offset_top = md->end_offset_top;
6543
eptr = md->end_match_ptr;
6545
/* For a non-repeating ket, just continue at this level. This also
6546
happens for a repeating ket if no characters were matched in the group.
6547
This is the forcible breaking of infinite loops as implemented in Perl
6548
5.005. If there is an options reset, it will get obeyed in the normal
6549
course of events. */
6551
if (*ecode == OP_KET || eptr == saved_eptr)
6553
ecode += 1+LINK_SIZE;
6557
/* The repeating kets try the rest of the pattern or restart from the
6558
preceding bracket, in the appropriate order. We need to reset any options
6559
that changed within the bracket before re-running it, so check the next opcode. */
6562
if (ecode[1+LINK_SIZE] == OP_OPT)
6564
ims = (ims & ~PCRE_IMS) | ecode[4];
6565
DPRINTF(("ims set to %02lx at group repeat\n", ims));
6568
if (*ecode == OP_KETRMIN)
6570
RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0);
6571
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6572
RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
6573
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6575
else /* OP_KETRMAX */
6577
RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
6578
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6579
RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
6580
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6583
RRETURN(MATCH_NOMATCH);
6585
/* An alternation is the end of a branch; scan along to find the end of the
6586
bracketed group and go to there. */
6589
do ecode += GET(ecode,1); while (*ecode == OP_ALT);
6592
/* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
6593
that it may occur zero times. It may repeat infinitely, or not at all -
6594
i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
6595
repeat limits are compiled as a number of copies, with the optional ones
6596
preceded by BRAZERO or BRAMINZERO. */
6601
RMATCH(rrc, eptr, next, offset_top, md, ims, eptrb, match_isgroup);
6602
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6603
do next += GET(next,1); while (*next == OP_ALT);
6604
ecode = next + 1+LINK_SIZE;
6611
do next += GET(next,1); while (*next == OP_ALT);
6612
RMATCH(rrc, eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb,
6614
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6619
/* End of a group, repeated or non-repeating. If we are at the end of
6620
an assertion "group", stop matching and return MATCH_MATCH, but record the
6621
current high water mark for use by positive assertions. Do this also
6622
for the "once" (not-backing-up, i.e. atomic) groups. */
6628
prev = ecode - GET(ecode, 1);
6629
saved_eptr = eptrb->epb_saved_eptr;
6631
/* Back up the stack of bracket start pointers. */
6633
eptrb = eptrb->epb_prev;
6635
if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
6636
*prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
6639
md->end_match_ptr = eptr; /* For ONCE */
6640
md->end_offset_top = offset_top;
6641
RRETURN(MATCH_MATCH);
6644
/* In all other cases except a conditional group we have to check the
6645
group number back at the start and if necessary complete handling an
6646
extraction by setting the offsets and bumping the high water mark. */
6648
if (*prev != OP_COND)
6650
number = *prev - OP_BRA;
6652
/* For extended extraction brackets (large number), we have to fish out
6653
the number from a dummy opcode at the start. */
6655
if (number > EXTRACT_BASIC_MAX) number = GET2(prev, 2+LINK_SIZE);
6656
offset = number << 1;
6659
printf("end bracket %d", number);
6663
/* Test for a numbered group. This includes groups called as a result
6664
of recursion. Note that whole-pattern recursion is coded as a recurse
6665
into group 0, so it won't be picked up here. Instead, we catch it when
6666
the OP_END is reached. */
6670
md->capture_last = number;
6671
if (offset >= md->offset_max) md->offset_overflow = TRUE; else
6673
md->offset_vector[offset] =
6674
md->offset_vector[md->offset_end - number];
6675
md->offset_vector[offset+1] = eptr - md->start_subject;
6676
if (offset_top <= offset) offset_top = offset + 2;
6679
/* Handle a recursively called group. Restore the offsets
6680
appropriately and continue from after the call. */
6682
if (md->recursive != NULL && md->recursive->group_num == number)
6684
recursion_info *rec = md->recursive;
6685
DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
6686
md->recursive = rec->prevrec;
6687
md->start_match = rec->save_start;
6688
memcpy(md->offset_vector, rec->offset_save,
6689
rec->saved_max * sizeof(int));
6690
ecode = rec->after_call;
6697
/* Reset the value of the ims flags, in case they got changed during the group. */
6701
DPRINTF(("ims reset to %02lx\n", ims));
6703
/* For a non-repeating ket, just continue at this level. This also
6704
happens for a repeating ket if no characters were matched in the group.
6705
This is the forcible breaking of infinite loops as implemented in Perl
6706
5.005. If there is an options reset, it will get obeyed in the normal
6707
course of events. */
6709
if (*ecode == OP_KET || eptr == saved_eptr)
6711
ecode += 1 + LINK_SIZE;
6715
/* The repeating kets try the rest of the pattern or restart from the
6716
preceding bracket, in the appropriate order. */
6718
if (*ecode == OP_KETRMIN)
6720
RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
6721
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6722
RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
6723
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6725
else /* OP_KETRMAX */
6727
RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
6728
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6729
RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
6730
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6734
RRETURN(MATCH_NOMATCH);
6736
/* Start of subject unless notbol, or after internal newline if multiline */
6739
if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
6740
if ((ims & PCRE_MULTILINE) != 0)
6742
if (eptr != md->start_subject && eptr[-1] != NEWLINE)
6743
RRETURN(MATCH_NOMATCH);
6747
/* ... else fall through */
6749
/* Start of subject assertion */
6752
if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
6756
/* Start of match assertion */
6759
if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
6763
/* Assert before internal newline if multiline, or before a terminating
6764
newline unless endonly is set, else end of subject unless noteol is set. */
6767
if ((ims & PCRE_MULTILINE) != 0)
6769
if (eptr < md->end_subject)
6770
{ if (*eptr != NEWLINE) RRETURN(MATCH_NOMATCH); }
6772
{ if (md->noteol) RRETURN(MATCH_NOMATCH); }
6778
if (md->noteol) RRETURN(MATCH_NOMATCH);
6781
if (eptr < md->end_subject - 1 ||
6782
(eptr == md->end_subject - 1 && *eptr != NEWLINE))
6783
RRETURN(MATCH_NOMATCH);
6788
/* ... else fall through */
6790
/* End of subject assertion (\z) */
6793
if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
6797
/* End of subject or ending \n assertion (\Z) */
6800
if (eptr < md->end_subject - 1 ||
6801
(eptr == md->end_subject - 1 && *eptr != NEWLINE)) RRETURN(MATCH_NOMATCH);
6805
/* Word boundary assertions */
6807
case OP_NOT_WORD_BOUNDARY:
6808
case OP_WORD_BOUNDARY:
6811
/* Find out if the previous and current characters are "word" characters.
6812
It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
6813
be "non-word" characters. */
6818
if (eptr == md->start_subject) prev_is_word = FALSE; else
6820
const uschar *lastptr = eptr - 1;
6821
while((*lastptr & 0xc0) == 0x80) lastptr--;
6822
GETCHAR(c, lastptr);
6823
prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
6825
if (eptr >= md->end_subject) cur_is_word = FALSE; else
6828
cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
6834
/* More streamlined when not in UTF-8 mode */
6837
prev_is_word = (eptr != md->start_subject) &&
6838
((md->ctypes[eptr[-1]] & ctype_word) != 0);
6839
cur_is_word = (eptr < md->end_subject) &&
6840
((md->ctypes[*eptr] & ctype_word) != 0);
6843
/* Now see if the situation is what we want */
6845
if ((*ecode++ == OP_WORD_BOUNDARY)?
6846
cur_is_word == prev_is_word : cur_is_word != prev_is_word)
6847
RRETURN(MATCH_NOMATCH);
6851
/* Match a single character type; inline for speed */
6854
if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == NEWLINE)
6855
RRETURN(MATCH_NOMATCH);
6856
if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
6859
while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
6864
/* Match a single byte, even in UTF-8 mode. This opcode really does match
6865
any byte, even newline, independent of the setting of PCRE_DOTALL. */
6868
if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
6873
if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6874
GETCHARINCTEST(c, eptr);
6879
(md->ctypes[c] & ctype_digit) != 0
6881
RRETURN(MATCH_NOMATCH);
6886
if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6887
GETCHARINCTEST(c, eptr);
6892
(md->ctypes[c] & ctype_digit) == 0
6894
RRETURN(MATCH_NOMATCH);
6898
case OP_NOT_WHITESPACE:
6899
if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6900
GETCHARINCTEST(c, eptr);
6905
(md->ctypes[c] & ctype_space) != 0
6907
RRETURN(MATCH_NOMATCH);
6912
if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6913
GETCHARINCTEST(c, eptr);
6918
(md->ctypes[c] & ctype_space) == 0
6920
RRETURN(MATCH_NOMATCH);
6924
case OP_NOT_WORDCHAR:
6925
if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6926
GETCHARINCTEST(c, eptr);
6931
(md->ctypes[c] & ctype_word) != 0
6933
RRETURN(MATCH_NOMATCH);
6938
if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6939
GETCHARINCTEST(c, eptr);
6944
(md->ctypes[c] & ctype_word) == 0
6946
RRETURN(MATCH_NOMATCH);
6951
/* Check the next character by Unicode property. We will get here only
6952
if the support is in the binary; otherwise a compile-time error occurs. */
6956
if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6957
GETCHARINCTEST(c, eptr);
6959
int chartype, rqdtype;
6961
int category = ucp_findchar(c, &chartype, &othercase);
6963
rqdtype = *(++ecode);
6968
if ((rqdtype - 128 != category) == (op == OP_PROP))
6969
RRETURN(MATCH_NOMATCH);
6973
if ((rqdtype != chartype) == (op == OP_PROP))
6974
RRETURN(MATCH_NOMATCH);
6979
/* Match an extended Unicode sequence. We will get here only if the support
6980
is in the binary; otherwise a compile-time error occurs. */
6983
if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6984
GETCHARINCTEST(c, eptr);
6988
int category = ucp_findchar(c, &chartype, &othercase);
6989
if (category == ucp_M) RRETURN(MATCH_NOMATCH);
6990
while (eptr < md->end_subject)
6993
if (!md->utf8) c = *eptr; else
6995
GETCHARLEN(c, eptr, len);
6997
category = ucp_findchar(c, &chartype, &othercase);
6998
if (category != ucp_M) break;
7007
/* Match a back reference, possibly repeatedly. Look past the end of the
7008
item to see if there is repeat information following. The code is similar
7009
to that for character classes, but repeated for efficiency. Then obey
7010
similar code to character type repeats - written out again for speed.
7011
However, if the referenced string is the empty string, always treat
7012
it as matched, any number of times (otherwise there could be infinite loops). */
7017
offset = GET2(ecode, 1) << 1; /* Doubled ref number */
7018
ecode += 3; /* Advance past item */
7020
/* If the reference is unset, set the length to be longer than the amount
7021
of subject left; this ensures that every attempt at a match fails. We
7022
can't just fail here, because of the possibility of quantifiers with zero minima. */
7025
length = (offset >= offset_top || md->offset_vector[offset] < 0)?
7026
md->end_subject - eptr + 1 :
7027
md->offset_vector[offset+1] - md->offset_vector[offset];
7029
/* Set up for repetition, or handle the non-repeated case */
7039
c = *ecode++ - OP_CRSTAR;
7040
minimize = (c & 1) != 0;
7041
min = rep_min[c]; /* Pick up values from tables; */
7042
max = rep_max[c]; /* zero for max => infinity */
7043
if (max == 0) max = INT_MAX;
7048
minimize = (*ecode == OP_CRMINRANGE);
7049
min = GET2(ecode, 1);
7050
max = GET2(ecode, 3);
7051
if (max == 0) max = INT_MAX;
7055
default: /* No repeat follows */
7056
if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
7058
continue; /* With the main loop */
7061
/* If the length of the reference is zero, just continue with the main loop. */
7064
if (length == 0) continue;
7066
/* First, ensure the minimum number of matches are present. We get back
7067
the length of the reference string explicitly rather than passing the
7068
address of eptr, so that eptr can be a register variable. */
7070
for (i = 1; i <= min; i++)
7072
if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
7076
/* If min = max, continue at the same level without recursion.
7077
They are not both allowed to be zero. */
7079
if (min == max) continue;
7081
/* If minimizing, keep trying and advancing the pointer */
7085
for (fi = min;; fi++)
7087
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7088
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7089
if (fi >= max || !match_ref(offset, eptr, length, md, ims))
7090
RRETURN(MATCH_NOMATCH);
7093
/* Control never gets here */
7096
/* If maximizing, find the longest string and work backwards */
7101
for (i = min; i < max; i++)
7103
if (!match_ref(offset, eptr, length, md, ims)) break;
7108
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7109
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7112
RRETURN(MATCH_NOMATCH);
7115
/* Control never gets here */
7119
/* Match a bit-mapped character class, possibly repeatedly. This op code is
7120
used when all the characters in the class have values in the range 0-255,
7121
and either the matching is caseful, or the characters are in the range
7122
0-127 when UTF-8 processing is enabled. The only difference between
7123
OP_CLASS and OP_NCLASS occurs when a data character outside the range is encountered.
7126
First, look past the end of the item to see if there is repeat information
7127
following. Then obey similar code to character type repeats - written out again for speed. */
7133
data = ecode + 1; /* Save for matching */
7134
ecode += 33; /* Advance past the item */
7144
c = *ecode++ - OP_CRSTAR;
7145
minimize = (c & 1) != 0;
7146
min = rep_min[c]; /* Pick up values from tables; */
7147
max = rep_max[c]; /* zero for max => infinity */
7148
if (max == 0) max = INT_MAX;
7153
minimize = (*ecode == OP_CRMINRANGE);
7154
min = GET2(ecode, 1);
7155
max = GET2(ecode, 3);
7156
if (max == 0) max = INT_MAX;
7160
default: /* No repeat follows */
7165
/* First, ensure the minimum number of matches are present. */
7171
for (i = 1; i <= min; i++)
7173
if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7174
GETCHARINC(c, eptr);
7177
if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
7181
if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
7187
/* Not UTF-8 mode */
7189
for (i = 1; i <= min; i++)
7191
if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7193
if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
7197
/* If max == min we can continue with the main loop without the need to recurse. */
7200
if (min == max) continue;
7202
/* If minimizing, keep testing the rest of the expression and advancing
7203
the pointer while it matches the class. */
7211
for (fi = min;; fi++)
7213
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7214
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7215
if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7216
GETCHARINC(c, eptr);
7219
if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
7223
if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
7229
/* Not UTF-8 mode */
7231
for (fi = min;; fi++)
7233
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7234
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7235
if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7237
if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
7240
/* Control never gets here */
7243
/* If maximizing, find the longest possible run, then work backwards. */
7253
for (i = min; i < max; i++)
7256
if (eptr >= md->end_subject) break;
7257
GETCHARLEN(c, eptr, len);
7260
if (op == OP_CLASS) break;
7264
if ((data[c/8] & (1 << (c&7))) == 0) break;
7270
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7271
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7272
if (eptr-- == pp) break; /* Stop if tried at original pos */
7278
/* Not UTF-8 mode */
7280
for (i = min; i < max; i++)
7282
if (eptr >= md->end_subject) break;
7284
if ((data[c/8] & (1 << (c&7))) == 0) break;
7289
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7291
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7295
RRETURN(MATCH_NOMATCH);
7298
/* Control never gets here */
7301
/* Match an extended character class. This opcode is encountered only
7302
in UTF-8 mode, because that's the only time it is compiled. */
7307
data = ecode + 1 + LINK_SIZE; /* Save for matching */
7308
ecode += GET(ecode, 1); /* Advance past the item */
7318
c = *ecode++ - OP_CRSTAR;
7319
minimize = (c & 1) != 0;
7320
min = rep_min[c]; /* Pick up values from tables; */
7321
max = rep_max[c]; /* zero for max => infinity */
7322
if (max == 0) max = INT_MAX;
7327
minimize = (*ecode == OP_CRMINRANGE);
7328
min = GET2(ecode, 1);
7329
max = GET2(ecode, 3);
7330
if (max == 0) max = INT_MAX;
7334
default: /* No repeat follows */
7339
/* First, ensure the minimum number of matches are present. */
7341
for (i = 1; i <= min; i++)
7343
if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7344
GETCHARINC(c, eptr);
7345
if (!match_xclass(c, data)) RRETURN(MATCH_NOMATCH);
7348
/* If max == min we can continue with the main loop without the need to recurse. */
7351
if (min == max) continue;
7353
/* If minimizing, keep testing the rest of the expression and advancing
7354
the pointer while it matches the class. */
7358
for (fi = min;; fi++)
7360
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7361
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7362
if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7363
GETCHARINC(c, eptr);
7364
if (!match_xclass(c, data)) RRETURN(MATCH_NOMATCH);
7366
/* Control never gets here */
7369
/* If maximizing, find the longest possible run, then work backwards. */
7374
for (i = min; i < max; i++)
7377
if (eptr >= md->end_subject) break;
7378
GETCHARLEN(c, eptr, len);
7379
if (!match_xclass(c, data)) break;
7384
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7385
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7386
if (eptr-- == pp) break; /* Stop if tried at original pos */
7389
RRETURN(MATCH_NOMATCH);
7392
/* Control never gets here */
7394
#endif /* End of XCLASS */
7396
/* Match a single character, casefully */
7404
GETCHARLEN(fc, ecode, length);
7405
if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
7406
while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
7411
/* Non-UTF-8 mode */
7413
if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
7414
if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
7419
/* Match a single character, caselessly */
7427
GETCHARLEN(fc, ecode, length);
7429
if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
7431
/* If the pattern character's value is < 128, we have only one byte, and
7432
can use the fast lookup table. */
7436
if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
7439
/* Otherwise we must pick up the subject character */
7444
GETCHARINC(dc, eptr);
7447
/* If we have Unicode property support, we can use it to test the other
7448
case of the character, if there is one. The result of ucp_findchar() is
7449
< 0 if the char isn't found, and othercase is returned as zero if there isn't one. */
7457
if (ucp_findchar(fc, &chartype, &othercase) < 0 || dc != othercase)
7459
RRETURN(MATCH_NOMATCH);
7464
#endif /* SUPPORT_UTF8 */
7466
/* Non-UTF-8 mode */
7468
if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
7469
if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
7474
/* Match a single character repeatedly; different opcodes share code. */
7477
min = max = GET2(ecode, 1);
7484
max = GET2(ecode, 1);
7485
minimize = *ecode == OP_MINUPTO;
7495
c = *ecode++ - OP_STAR;
7496
minimize = (c & 1) != 0;
7497
min = rep_min[c]; /* Pick up values from tables; */
7498
max = rep_max[c]; /* zero for max => infinity */
7499
if (max == 0) max = INT_MAX;
7501
/* Common code for all repeated single-character matches. We can give
7502
up quickly if there are fewer than the minimum number of characters left in the subject. */
7511
GETCHARLEN(fc, ecode, length);
7512
if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
7515
/* Handle multibyte character matching specially here. There is
7516
support for caseless matching if UCP support is present. */
7526
if ((ims & PCRE_CASELESS) != 0 &&
7527
ucp_findchar(fc, &chartype, &othercase) >= 0 &&
7529
oclength = ord2utf8(othercase, occhars);
7530
#endif /* SUPPORT_UCP */
7532
for (i = 1; i <= min; i++)
7534
if (memcmp(eptr, charptr, length) == 0) eptr += length;
7535
/* Need braces because of following else */
7536
else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
7539
if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
7544
if (min == max) continue;
7548
for (fi = min;; fi++)
7550
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7551
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7552
if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7553
if (memcmp(eptr, charptr, length) == 0) eptr += length;
7554
/* Need braces because of following else */
7555
else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
7558
if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
7562
/* Control never gets here */
7567
for (i = min; i < max; i++)
7569
if (eptr > md->end_subject - length) break;
7570
if (memcmp(eptr, charptr, length) == 0) eptr += length;
7571
else if (oclength == 0) break;
7574
if (memcmp(eptr, occhars, oclength) != 0) break;
7580
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7581
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7584
RRETURN(MATCH_NOMATCH);
7586
/* Control never gets here */
7589
/* If the length of a UTF-8 character is 1, we fall through here, and
7590
obey the code as for non-UTF-8 characters below, though in this case the
7591
value of fc will always be < 128. */
7594
#endif /* SUPPORT_UTF8 */
7596
/* When not in UTF-8 mode, load a single-byte character. */
7598
if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
7602
/* The value of fc at this point is always less than 256, though we may or
7603
may not be in UTF-8 mode. The code is duplicated for the caseless and
7604
caseful cases, for speed, since matching characters is likely to be quite
7605
common. First, ensure the minimum number of matches are present. If min =
7606
max, continue at the same level without recursing. Otherwise, if
7607
minimizing, keep trying the rest of the expression and advancing one
7608
matching character if failing, up to the maximum. Alternatively, if
7609
maximizing, find the maximum number of characters and work backwards. */
7611
DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
7614
if ((ims & PCRE_CASELESS) != 0)
7617
for (i = 1; i <= min; i++)
7618
if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
7619
if (min == max) continue;
7622
for (fi = min;; fi++)
7624
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7625
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7626
if (fi >= max || eptr >= md->end_subject ||
7627
fc != md->lcc[*eptr++])
7628
RRETURN(MATCH_NOMATCH);
7630
/* Control never gets here */
7635
for (i = min; i < max; i++)
7637
if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
7642
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7644
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7646
RRETURN(MATCH_NOMATCH);
7648
/* Control never gets here */
7651
/* Caseful comparisons (includes all multi-byte characters) */
7655
for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
7656
if (min == max) continue;
7659
for (fi = min;; fi++)
7661
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7662
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7663
if (fi >= max || eptr >= md->end_subject || fc != *eptr++)
7664
RRETURN(MATCH_NOMATCH);
7666
/* Control never gets here */
7671
for (i = min; i < max; i++)
7673
if (eptr >= md->end_subject || fc != *eptr) break;
7678
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7680
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7682
RRETURN(MATCH_NOMATCH);
7685
/* Control never gets here */
7687
/* Match a negated single one-byte character. The character we are
7688
checking can be multibyte. */
7691
if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7693
GETCHARINCTEST(c, eptr);
7694
if ((ims & PCRE_CASELESS) != 0)
7700
if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
7704
if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
7708
/* Match a negated single one-byte character repeatedly. This is almost a
7709
repeat of the code for a repeated single character, but I haven't found a
7710
nice way of commoning these up that doesn't require a test of the
7711
positive/negative option for each character match. Maybe that wouldn't add
7712
very much to the time taken, but character matching *is* what this is all about. */
7716
min = max = GET2(ecode, 1);
7723
max = GET2(ecode, 1);
7724
minimize = *ecode == OP_NOTMINUPTO;
7733
case OP_NOTMINQUERY:
7734
c = *ecode++ - OP_NOTSTAR;
7735
minimize = (c & 1) != 0;
7736
min = rep_min[c]; /* Pick up values from tables; */
7737
max = rep_max[c]; /* zero for max => infinity */
7738
if (max == 0) max = INT_MAX;
7740
/* Common code for all repeated single-byte matches. We can give up quickly
7741
if there are fewer than the minimum number of bytes left in the subject. */
7745
if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
7748
/* The code is duplicated for the caseless and caseful cases, for speed,
7749
since matching characters is likely to be quite common. First, ensure the
7750
minimum number of matches are present. If min = max, continue at the same
7751
level without recursing. Otherwise, if minimizing, keep trying the rest of
7752
the expression and advancing one matching character if failing, up to the
7753
maximum. Alternatively, if maximizing, find the maximum number of
7754
characters and work backwards. */
7756
DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
7759
if ((ims & PCRE_CASELESS) != 0)
7768
for (i = 1; i <= min; i++)
7770
GETCHARINC(d, eptr);
7771
if (d < 256) d = md->lcc[d];
7772
if (fc == d) RRETURN(MATCH_NOMATCH);
7778
/* Not UTF-8 mode */
7780
for (i = 1; i <= min; i++)
7781
if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
7784
if (min == max) continue;
7793
for (fi = min;; fi++)
7795
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7796
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7797
GETCHARINC(d, eptr);
7798
if (d < 256) d = md->lcc[d];
7799
if (fi >= max || eptr >= md->end_subject || fc == d)
7800
RRETURN(MATCH_NOMATCH);
7805
/* Not UTF-8 mode */
7807
for (fi = min;; fi++)
7809
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7810
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7811
if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++])
7812
RRETURN(MATCH_NOMATCH);
7815
/* Control never gets here */
7829
for (i = min; i < max; i++)
7832
if (eptr >= md->end_subject) break;
7833
GETCHARLEN(d, eptr, len);
7834
if (d < 256) d = md->lcc[d];
7840
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7841
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7842
if (eptr-- == pp) break; /* Stop if tried at original pos */
7848
/* Not UTF-8 mode */
7850
for (i = min; i < max; i++)
7852
if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
7857
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7858
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7863
RRETURN(MATCH_NOMATCH);
7865
/* Control never gets here */
7868
/* Caseful comparisons */
7877
for (i = 1; i <= min; i++)
7879
GETCHARINC(d, eptr);
7880
if (fc == d) RRETURN(MATCH_NOMATCH);
7885
/* Not UTF-8 mode */
7887
for (i = 1; i <= min; i++)
7888
if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
7891
if (min == max) continue;
7900
for (fi = min;; fi++)
7902
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7903
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7904
GETCHARINC(d, eptr);
7905
if (fi >= max || eptr >= md->end_subject || fc == d)
7906
RRETURN(MATCH_NOMATCH);
7911
/* Not UTF-8 mode */
7913
for (fi = min;; fi++)
7915
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7916
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7917
if (fi >= max || eptr >= md->end_subject || fc == *eptr++)
7918
RRETURN(MATCH_NOMATCH);
7921
/* Control never gets here */
7935
for (i = min; i < max; i++)
7938
if (eptr >= md->end_subject) break;
7939
GETCHARLEN(d, eptr, len);
7945
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7946
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7947
if (eptr-- == pp) break; /* Stop if tried at original pos */
7953
/* Not UTF-8 mode */
7955
for (i = min; i < max; i++)
7957
if (eptr >= md->end_subject || fc == *eptr) break;
7962
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7963
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7968
RRETURN(MATCH_NOMATCH);
7971
/* Control never gets here */
7973
/* Match a single character type repeatedly; several different opcodes
7974
share code. This is very similar to the code for single characters, but we
7975
repeat it in the interests of efficiency. */
7978
min = max = GET2(ecode, 1);
7984
case OP_TYPEMINUPTO:
7986
max = GET2(ecode, 1);
7987
minimize = *ecode == OP_TYPEMINUPTO;
7992
case OP_TYPEMINSTAR:
7994
case OP_TYPEMINPLUS:
7996
case OP_TYPEMINQUERY:
7997
c = *ecode++ - OP_TYPESTAR;
7998
minimize = (c & 1) != 0;
7999
min = rep_min[c]; /* Pick up values from tables; */
8000
max = rep_max[c]; /* zero for max => infinity */
8001
if (max == 0) max = INT_MAX;
8003
/* Common code for all repeated single character type matches. Note that
8004
in UTF-8 mode, '.' matches a character of any length, but for the other
8005
character types, the valid characters are all one-byte long. */
8008
ctype = *ecode++; /* Code for the character type */
8011
if (ctype == OP_PROP || ctype == OP_NOTPROP)
8013
prop_fail_result = ctype == OP_NOTPROP;
8014
prop_type = *ecode++;
8015
if (prop_type >= 128)
8017
prop_test_against = prop_type - 128;
8018
prop_test_variable = &prop_category;
8022
prop_test_against = prop_type;
8023
prop_test_variable = &prop_chartype;
8026
else prop_type = -1;
8029
/* First, ensure the minimum number of matches are present. Use inline
8030
code for maximizing the speed, and do the type test once at the start
8031
(i.e. keep it out of the loop). Also we can test that there are at least
8032
the minimum number of bytes before we start. This isn't as effective in
8033
UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
8034
is tidier. Also separate the UCP code, which can be the same for both UTF-8
8035
and single-bytes. */
8037
if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
8043
for (i = 1; i <= min; i++)
8045
GETCHARINC(c, eptr);
8046
prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8047
if ((*prop_test_variable == prop_test_against) == prop_fail_result)
8048
RRETURN(MATCH_NOMATCH);
8052
/* Match extended Unicode sequences. We will get here only if the
8053
support is in the binary; otherwise a compile-time error occurs. */
8055
else if (ctype == OP_EXTUNI)
8057
for (i = 1; i <= min; i++)
8059
GETCHARINCTEST(c, eptr);
8060
prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8061
if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
8062
while (eptr < md->end_subject)
8065
if (!md->utf8) c = *eptr; else
8067
GETCHARLEN(c, eptr, len);
8069
prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8070
if (prop_category != ucp_M) break;
8077
#endif /* SUPPORT_UCP */
8079
/* Handle all other cases when the coding is UTF-8 */
8082
if (md->utf8) switch(ctype)
8085
for (i = 1; i <= min; i++)
8087
if (eptr >= md->end_subject ||
8088
(*eptr++ == NEWLINE && (ims & PCRE_DOTALL) == 0))
8089
RRETURN(MATCH_NOMATCH);
8090
while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
8099
for (i = 1; i <= min; i++)
8101
if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
8102
GETCHARINC(c, eptr);
8103
if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
8104
RRETURN(MATCH_NOMATCH);
8109
for (i = 1; i <= min; i++)
8111
if (eptr >= md->end_subject ||
8112
*eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
8113
RRETURN(MATCH_NOMATCH);
8114
/* No need to skip more bytes - we know it's a 1-byte character */
8118
case OP_NOT_WHITESPACE:
8119
for (i = 1; i <= min; i++)
8121
if (eptr >= md->end_subject ||
8122
(*eptr < 128 && (md->ctypes[*eptr++] & ctype_space) != 0))
8123
RRETURN(MATCH_NOMATCH);
8124
while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
8129
for (i = 1; i <= min; i++)
8131
if (eptr >= md->end_subject ||
8132
*eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
8133
RRETURN(MATCH_NOMATCH);
8134
/* No need to skip more bytes - we know it's a 1-byte character */
8138
case OP_NOT_WORDCHAR:
8139
for (i = 1; i <= min; i++)
8141
if (eptr >= md->end_subject ||
8142
(*eptr < 128 && (md->ctypes[*eptr++] & ctype_word) != 0))
8143
RRETURN(MATCH_NOMATCH);
8144
while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
8149
for (i = 1; i <= min; i++)
8151
if (eptr >= md->end_subject ||
8152
*eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
8153
RRETURN(MATCH_NOMATCH);
8154
/* No need to skip more bytes - we know it's a 1-byte character */
8159
RRETURN(PCRE_ERROR_INTERNAL);
8160
} /* End switch(ctype) */
8163
#endif /* SUPPORT_UTF8 */
8165
/* Code for the non-UTF-8 case for minimum matching of operators other
8166
than OP_PROP and OP_NOTPROP. */
8171
if ((ims & PCRE_DOTALL) == 0)
8173
for (i = 1; i <= min; i++)
8174
if (*eptr++ == NEWLINE) RRETURN(MATCH_NOMATCH);
8184
for (i = 1; i <= min; i++)
8185
if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
8189
for (i = 1; i <= min; i++)
8190
if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
8193
case OP_NOT_WHITESPACE:
8194
for (i = 1; i <= min; i++)
8195
if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
8199
for (i = 1; i <= min; i++)
8200
if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
8203
case OP_NOT_WORDCHAR:
8204
for (i = 1; i <= min; i++)
8205
if ((md->ctypes[*eptr++] & ctype_word) != 0)
8206
RRETURN(MATCH_NOMATCH);
8210
for (i = 1; i <= min; i++)
8211
if ((md->ctypes[*eptr++] & ctype_word) == 0)
8212
RRETURN(MATCH_NOMATCH);
8216
RRETURN(PCRE_ERROR_INTERNAL);
8220
/* If min = max, continue at the same level without recursing */
8222
if (min == max) continue;
8224
/* If minimizing, we have to test the rest of the pattern before each
8225
subsequent match. Again, separate the UTF-8 case for speed, and also
8226
separate the UCP cases. */
8233
for (fi = min;; fi++)
8235
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8236
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8237
if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
8238
GETCHARINC(c, eptr);
8239
prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8240
if ((*prop_test_variable == prop_test_against) == prop_fail_result)
8241
RRETURN(MATCH_NOMATCH);
8245
/* Match extended Unicode sequences. We will get here only if the
8246
support is in the binary; otherwise a compile-time error occurs. */
8248
else if (ctype == OP_EXTUNI)
8250
for (fi = min;; fi++)
8252
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8253
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8254
if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
8255
GETCHARINCTEST(c, eptr);
8256
prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8257
if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
8258
while (eptr < md->end_subject)
8261
if (!md->utf8) c = *eptr; else
8263
GETCHARLEN(c, eptr, len);
8265
prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8266
if (prop_category != ucp_M) break;
8273
#endif /* SUPPORT_UCP */
8279
for (fi = min;; fi++)
8281
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8282
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8283
if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
8285
GETCHARINC(c, eptr);
8289
if ((ims & PCRE_DOTALL) == 0 && c == NEWLINE) RRETURN(MATCH_NOMATCH);
8296
if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
8297
RRETURN(MATCH_NOMATCH);
8301
if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
8302
RRETURN(MATCH_NOMATCH);
8305
case OP_NOT_WHITESPACE:
8306
if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
8307
RRETURN(MATCH_NOMATCH);
8311
if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
8312
RRETURN(MATCH_NOMATCH);
8315
case OP_NOT_WORDCHAR:
8316
if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
8317
RRETURN(MATCH_NOMATCH);
8321
/* \w while minimizing in UTF-8 mode: a character >= 256 can never be a word
character and must not be used to index ctypes[]; anything else fails when its
ctype_word bit is clear. Uses || like the sibling \d and \s tests (the original
&& indexed the table out of range and never rejected values below 256). */
if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
8322
RRETURN(MATCH_NOMATCH);
8326
RRETURN(PCRE_ERROR_INTERNAL);
8332
/* Not UTF-8 mode */
8334
for (fi = min;; fi++)
8336
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8337
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8338
if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
8343
if ((ims & PCRE_DOTALL) == 0 && c == NEWLINE) RRETURN(MATCH_NOMATCH);
8350
if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
8354
if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
8357
case OP_NOT_WHITESPACE:
8358
if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
8362
if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
8365
case OP_NOT_WORDCHAR:
8366
if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
8370
if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
8374
RRETURN(PCRE_ERROR_INTERNAL);
8378
/* Control never gets here */
8381
/* If maximizing it is worth using inline code for speed, doing the type
8382
test once at the start (i.e. keep it out of the loop). Again, keep the
8383
UTF-8 and UCP stuff separate. */
8387
pp = eptr; /* Remember where we started */
8392
for (i = min; i < max; i++)
8395
if (eptr >= md->end_subject) break;
8396
GETCHARLEN(c, eptr, len);
8397
prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8398
if ((*prop_test_variable == prop_test_against) == prop_fail_result)
8403
/* eptr is now past the end of the maximum run */
8407
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8408
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8409
if (eptr-- == pp) break; /* Stop if tried at original pos */
8414
/* Match extended Unicode sequences. We will get here only if the
8415
support is in the binary; otherwise a compile-time error occurs. */
8417
else if (ctype == OP_EXTUNI)
8419
for (i = min; i < max; i++)
8421
if (eptr >= md->end_subject) break;
8422
GETCHARINCTEST(c, eptr);
8423
prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8424
if (prop_category == ucp_M) break;
8425
while (eptr < md->end_subject)
8428
if (!md->utf8) c = *eptr; else
8430
GETCHARLEN(c, eptr, len);
8432
prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8433
if (prop_category != ucp_M) break;
8438
/* eptr is now past the end of the maximum run */
8442
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8443
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8444
if (eptr-- == pp) break; /* Stop if tried at original pos */
8445
for (;;) /* Move back over one extended */
8449
if (!md->utf8) c = *eptr; else
8451
GETCHARLEN(c, eptr, len);
8453
prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8454
if (prop_category != ucp_M) break;
8461
#endif /* SUPPORT_UCP */
8472
/* Special code is required for UTF8, but when the maximum is unlimited
8473
we don't need it, so we repeat the non-UTF8 code. This is probably
8474
worth it, because .* is quite a common idiom. */
8478
if ((ims & PCRE_DOTALL) == 0)
8480
for (i = min; i < max; i++)
8482
if (eptr >= md->end_subject || *eptr == NEWLINE) break;
8484
while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
8489
for (i = min; i < max; i++)
8492
while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
8497
/* Handle unlimited UTF-8 repeat */
8501
if ((ims & PCRE_DOTALL) == 0)
8503
for (i = min; i < max; i++)
8505
if (eptr >= md->end_subject || *eptr == NEWLINE) break;
8513
if (c > md->end_subject - eptr) c = md->end_subject - eptr;
8519
/* The byte case is the same as non-UTF8 */
8523
if (c > md->end_subject - eptr) c = md->end_subject - eptr;
8528
for (i = min; i < max; i++)
8531
if (eptr >= md->end_subject) break;
8532
GETCHARLEN(c, eptr, len);
8533
if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
8539
for (i = min; i < max; i++)
8542
if (eptr >= md->end_subject) break;
8543
GETCHARLEN(c, eptr, len);
8544
if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
8549
case OP_NOT_WHITESPACE:
8550
for (i = min; i < max; i++)
8553
if (eptr >= md->end_subject) break;
8554
GETCHARLEN(c, eptr, len);
8555
if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
8561
for (i = min; i < max; i++)
8564
if (eptr >= md->end_subject) break;
8565
GETCHARLEN(c, eptr, len);
8566
if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
8571
case OP_NOT_WORDCHAR:
8572
for (i = min; i < max; i++)
8575
if (eptr >= md->end_subject) break;
8576
GETCHARLEN(c, eptr, len);
8577
if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
8583
for (i = min; i < max; i++)
8586
if (eptr >= md->end_subject) break;
8587
GETCHARLEN(c, eptr, len);
8588
if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
8594
RRETURN(PCRE_ERROR_INTERNAL);
8597
/* eptr is now past the end of the maximum run */
8601
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8602
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8603
if (eptr-- == pp) break; /* Stop if tried at original pos */
8610
/* Not UTF-8 mode */
8615
if ((ims & PCRE_DOTALL) == 0)
8617
for (i = min; i < max; i++)
8619
if (eptr >= md->end_subject || *eptr == NEWLINE) break;
8624
/* For DOTALL case, fall through and treat as \C */
8628
if (c > md->end_subject - eptr) c = md->end_subject - eptr;
8633
for (i = min; i < max; i++)
8635
if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
8642
for (i = min; i < max; i++)
8644
if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
8650
case OP_NOT_WHITESPACE:
8651
for (i = min; i < max; i++)
8653
if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
8660
for (i = min; i < max; i++)
8662
if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
8668
case OP_NOT_WORDCHAR:
8669
for (i = min; i < max; i++)
8671
if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
8678
for (i = min; i < max; i++)
8680
if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
8687
RRETURN(PCRE_ERROR_INTERNAL);
8690
/* eptr is now past the end of the maximum run */
8694
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8696
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8700
/* Get here if we can't make it match with any permitted repetitions */
8702
RRETURN(MATCH_NOMATCH);
8704
/* Control never gets here */
8706
/* There's been some horrible disaster. Since all codes > OP_BRA are
8707
for capturing brackets, and there shouldn't be any gaps between 0 and
8708
OP_BRA, arrival here can only mean there is something seriously wrong
8709
in the code above or the OP_xxx definitions. */
8712
DPRINTF(("Unknown opcode %d\n", *ecode));
8713
RRETURN(PCRE_ERROR_UNKNOWN_NODE);
8716
/* Do not stick any code in here without much thought; it is assumed
8717
that "continue" in the code above comes out to here to repeat the main loop. */
8720
} /* End of main loop */
8721
/* Control never reaches here */
8725
/***************************************************************************
8726
****************************************************************************
8727
RECURSION IN THE match() FUNCTION
8729
Undefine all the macros that were defined above to handle this. */
8747
#undef new_recursive
8763
#undef save_capture_last
8773
/* These two are defined as macros in both cases */
8778
/***************************************************************************
8779
***************************************************************************/
8783
/*************************************************
8784
* Execute a Regular Expression *
8785
*************************************************/
8787
/* This function applies a compiled re to a subject string and picks out
8788
portions of the string if it matches. Two elements in the vector are set for
8789
each substring: the offsets to the start and end of the substring.
8792
argument_re points to the compiled expression
8793
extra_data points to extra data or is NULL
8794
subject points to the subject string
8795
length length of subject string (may contain binary zeros)
8796
start_offset where to start in the subject string
options option bits
8798
offsets points to a vector of ints to be filled in with offsets
8799
offsetcount the number of elements in the vector
8801
Returns: > 0 => success; value is the number of elements filled in
8802
= 0 => success, but offsets is not big enough
8803
-1 => failed to match
8804
< -1 => some kind of unexpected problem */
8808
pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
8809
const char *subject, int length, int start_offset, int options, int *offsets,
8812
int rc, resetcount, ocount;
8813
int first_byte = -1;
8816
unsigned long int ims = 0;
8817
BOOL using_temporary_offsets = FALSE;
8820
BOOL first_byte_caseless = FALSE;
8821
BOOL req_byte_caseless = FALSE;
8822
match_data match_block;
8823
const uschar *tables;
8824
const uschar *start_bits = NULL;
8825
const uschar *start_match = (const uschar *)subject + start_offset;
8826
const uschar *end_subject;
8827
const uschar *req_byte_ptr = start_match - 1;
8829
pcre_study_data internal_study;
8830
const pcre_study_data *study;
8832
real_pcre internal_re;
8833
const real_pcre *external_re = (const real_pcre *)argument_re;
8834
const real_pcre *re = external_re;
8836
/* Plausibility checks */
8838
if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
8839
if (re == NULL || subject == NULL ||
8840
(offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
8841
if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
8843
/* Fish out the optional data from the extra_data structure, first setting
8844
the default values. */
8847
match_block.match_limit = MATCH_LIMIT;
8848
match_block.callout_data = NULL;
8850
/* The table pointer is always in native byte order. */
8852
tables = external_re->tables;
8854
if (extra_data != NULL)
8856
register unsigned int flags = extra_data->flags;
8857
if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
8858
study = (const pcre_study_data *)extra_data->study_data;
8859
if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
8860
match_block.match_limit = extra_data->match_limit;
8861
if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
8862
match_block.callout_data = extra_data->callout_data;
8863
if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
8866
/* If the exec call supplied NULL for tables, use the inbuilt ones. This
8867
is a feature that makes it possible to save compiled regex and re-use them
8868
in other programs later. */
8870
if (tables == NULL) tables = pcre_default_tables;
8872
/* Check that the first field in the block is the magic number. If it is not,
8873
test for a regex that was compiled on a host of opposite endianness. If this is
8874
the case, flipped values are put in internal_re and internal_study if there was study data too. */
8877
if (re->magic_number != MAGIC_NUMBER)
8879
re = try_flipped(re, &internal_re, study, &internal_study);
8880
if (re == NULL) return PCRE_ERROR_BADMAGIC;
8881
if (study != NULL) study = &internal_study;
8884
/* Set up other data */
8886
anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
8887
startline = (re->options & PCRE_STARTLINE) != 0;
8889
/* The code starts after the real_pcre block and the capture name table. */
8891
match_block.start_code = (const uschar *)external_re + re->name_table_offset +
8892
re->name_count * re->name_entry_size;
8894
match_block.start_subject = (const uschar *)subject;
8895
match_block.start_offset = start_offset;
8896
match_block.end_subject = match_block.start_subject + length;
8897
end_subject = match_block.end_subject;
8899
match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
8900
match_block.utf8 = (re->options & PCRE_UTF8) != 0;
8902
match_block.notbol = (options & PCRE_NOTBOL) != 0;
8903
match_block.noteol = (options & PCRE_NOTEOL) != 0;
8904
match_block.notempty = (options & PCRE_NOTEMPTY) != 0;
8905
match_block.partial = (options & PCRE_PARTIAL) != 0;
8906
match_block.hitend = FALSE;
8908
match_block.recursive = NULL; /* No recursion at top level */
8910
match_block.lcc = tables + lcc_offset;
8911
match_block.ctypes = tables + ctypes_offset;
8913
/* Partial matching is supported only for a restricted set of regexes at the moment. */
8916
if (match_block.partial && (re->options & PCRE_NOPARTIAL) != 0)
8917
return PCRE_ERROR_BADPARTIAL;
8919
/* Check a UTF-8 string if required. Unfortunately there's no way of passing
8920
back the character offset. */
8923
if (match_block.utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
8925
if (valid_utf8((uschar *)subject, length) >= 0)
8926
return PCRE_ERROR_BADUTF8;
8927
if (start_offset > 0 && start_offset < length)
8929
int tb = ((uschar *)subject)[start_offset];
8933
if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
8939
/* The ims options can vary during the matching as a result of the presence
8940
of (?ims) items in the pattern. They are kept in a local variable so that
8941
restoring at the exit of a group is easy. */
8943
ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
8945
/* If the expression has got more back references than the offsets supplied can
8946
hold, we get a temporary chunk of working store to use during the matching.
8947
Otherwise, we can use the vector supplied, rounding down its size to a multiple of 3. */
8950
ocount = offsetcount - (offsetcount % 3);
8952
if (re->top_backref > 0 && re->top_backref >= ocount/3)
8954
ocount = re->top_backref * 3 + 3;
8955
match_block.offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
8956
if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
8957
using_temporary_offsets = TRUE;
8958
DPRINTF(("Got memory to hold back references\n"));
8960
else match_block.offset_vector = offsets;
8962
match_block.offset_end = ocount;
8963
match_block.offset_max = (2*ocount)/3;
8964
match_block.offset_overflow = FALSE;
8965
match_block.capture_last = -1;
8967
/* Compute the minimum number of offsets that we need to reset each time. Doing
8968
this makes a huge difference to execution time when there aren't many brackets in the pattern. */
8971
resetcount = 2 + re->top_bracket * 2;
8972
if (resetcount > offsetcount) resetcount = ocount;
8974
/* Reset the working variable associated with each extraction. These should
8975
never be used unless previously set, but they get saved and restored, and so we
8976
initialize them to avoid reading uninitialized locations. */
8978
if (match_block.offset_vector != NULL)
8980
register int *iptr = match_block.offset_vector + ocount;
8981
register int *iend = iptr - resetcount/2 + 1;
8982
while (--iptr >= iend) *iptr = -1;
8985
/* Set up the first character to match, if available. The first_byte value is
8986
never set for an anchored regular expression, but the anchoring may be forced
8987
at run time, so we have to test for anchoring. The first char may be unset for
8988
an unanchored pattern, of course. If there's no first char and the pattern was
8989
studied, there may be a bitmap of possible first characters. */
8993
if ((re->options & PCRE_FIRSTSET) != 0)
8995
first_byte = re->first_byte & 255;
8996
if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
8997
first_byte = match_block.lcc[first_byte];
9000
if (!startline && study != NULL &&
9001
(study->options & PCRE_STUDY_MAPPED) != 0)
9002
start_bits = study->start_bits;
9005
/* For anchored or unanchored matches, there may be a "last known required character" set. */
9008
if ((re->options & PCRE_REQCHSET) != 0)
9010
req_byte = re->req_byte & 255;
9011
req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
9012
req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
9015
/* Loop for handling unanchored repeated matching attempts; for anchored regexes
9016
the loop runs just once. */
9020
/* Reset the maximum number of extractions we might see. */
9022
if (match_block.offset_vector != NULL)
9024
register int *iptr = match_block.offset_vector;
9025
register int *iend = iptr + resetcount;
9026
while (iptr < iend) *iptr++ = -1;
9029
/* Advance to a unique first char if possible */
9031
if (first_byte >= 0)
9033
if (first_byte_caseless)
9034
while (start_match < end_subject &&
9035
match_block.lcc[*start_match] != first_byte)
9038
while (start_match < end_subject && *start_match != first_byte)
9042
/* Or to just after \n for a multiline match if possible */
9046
if (start_match > match_block.start_subject + start_offset)
9048
while (start_match < end_subject && start_match[-1] != NEWLINE)
9053
/* Or to a non-unique first char after study */
9055
else if (start_bits != NULL)
9057
while (start_match < end_subject)
9059
register unsigned int c = *start_match;
9060
if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
9064
#ifdef DEBUG /* Sigh. Some compilers never learn. */
9065
printf(">>>> Match against: ");
9066
pchars(start_match, end_subject - start_match, TRUE, &match_block);
9070
/* If req_byte is set, we know that that character must appear in the subject
9071
for the match to succeed. If the first character is set, req_byte must be
9072
later in the subject; otherwise the test starts at the match point. This
9073
optimization can save a huge amount of backtracking in patterns with nested
9074
unlimited repeats that aren't going to match. Writing separate code for
9075
cased/caseless versions makes it go faster, as does using an autoincrement
9076
and backing off on a match.
9078
HOWEVER: when the subject string is very, very long, searching to its end can
9079
take a long time, and give bad performance on quite ordinary patterns. This
9080
showed up when somebody was matching /^C/ on a 32-megabyte string... so we
9081
don't do this when the string is sufficiently long.
9083
ALSO: this processing is disabled when partial matching is requested.
*/
9086
if (req_byte >= 0 &&
9087
end_subject - start_match < REQ_BYTE_MAX &&
9088
!match_block.partial)
9090
register const uschar *p = start_match + ((first_byte >= 0)? 1 : 0);
9092
/* We don't need to repeat the search if we haven't yet reached the
9093
place we found it at last time. */
9095
if (p > req_byte_ptr)
9097
if (req_byte_caseless)
9099
while (p < end_subject)
9101
register int pp = *p++;
9102
if (pp == req_byte || pp == req_byte2) { p--; break; }
9107
while (p < end_subject)
9109
if (*p++ == req_byte) { p--; break; }
9113
/* If we can't find the required character, break the matching loop */
9115
if (p >= end_subject) break;
9117
/* If we have found the required character, save the point where we
9118
found it, so that we don't search again next time round the loop if
9119
the start hasn't passed this character yet. */
9125
/* When a match occurs, substrings will be set for all internal extractions;
9126
we just need to set up the whole thing as substring 0 before returning. If
9127
there were too many extractions, set the return code to zero. In the case
9128
where we had to get some local store to hold offsets for backreferences, copy
9129
those back references that we can. In this case there need not be overflow
9130
if certain parts of the pattern were not used. */
9132
match_block.start_match = start_match;
9133
match_block.match_call_count = 0;
9135
rc = match(start_match, match_block.start_code, 2, &match_block, ims, NULL,
9138
if (rc == MATCH_NOMATCH)
9142
if (match_block.utf8)
9143
while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
9149
if (rc != MATCH_MATCH)
9151
DPRINTF((">>>> error: returning %d\n", rc));
9155
/* We have a match! Copy the offset information from temporary store if
necessary. */
9158
if (using_temporary_offsets)
9160
if (offsetcount >= 4)
9162
memcpy(offsets + 2, match_block.offset_vector + 2,
9163
(offsetcount - 2) * sizeof(int));
9164
DPRINTF(("Copied offsets from temporary memory\n"));
9166
if (match_block.end_offset_top > offsetcount)
9167
match_block.offset_overflow = TRUE;
9169
DPRINTF(("Freeing temporary memory\n"));
9170
(pcre_free)(match_block.offset_vector);
9173
rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
9175
if (offsetcount < 2) rc = 0; else
9177
offsets[0] = start_match - match_block.start_subject;
9178
offsets[1] = match_block.end_match_ptr - match_block.start_subject;
9181
DPRINTF((">>>> returning %d\n", rc));
9185
/* This "while" is the end of the "do" above */
9187
while (!anchored && start_match <= end_subject);
9189
if (using_temporary_offsets)
9191
DPRINTF(("Freeing temporary memory\n"));
9192
(pcre_free)(match_block.offset_vector);
9195
if (match_block.partial && match_block.hitend)
9197
DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
9198
return PCRE_ERROR_PARTIAL;
9202
DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
9203
return PCRE_ERROR_NOMATCH;