1
/*************************************************
2
* Perl-Compatible Regular Expressions *
3
*************************************************/
6
This is a library of functions to support regular expressions whose syntax
7
and semantics are as close as possible to those of the Perl 5 language. See
8
the file Tech.Notes for some information on the internals.
10
Written by: Philip Hazel <ph10@cam.ac.uk>
12
Copyright (c) 1997-2004 University of Cambridge
14
-----------------------------------------------------------------------------
15
Redistribution and use in source and binary forms, with or without
16
modification, are permitted provided that the following conditions are met:
18
* Redistributions of source code must retain the above copyright notice,
19
this list of conditions and the following disclaimer.
21
* Redistributions in binary form must reproduce the above copyright
22
notice, this list of conditions and the following disclaimer in the
23
documentation and/or other materials provided with the distribution.
25
* Neither the name of the University of Cambridge nor the names of its
26
contributors may be used to endorse or promote products derived from
27
this software without specific prior written permission.
29
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
30
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
31
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
32
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
33
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
34
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
35
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
36
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
37
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
38
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
39
POSSIBILITY OF SUCH DAMAGE.
40
-----------------------------------------------------------------------------
44
/* Define DEBUG to get debugging output on stdout. */
47
/* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef
48
inline, and there are *still* stupid compilers about that don't like indented
49
pre-processor statements. I suppose it's only been 10 years... */
52
#define DPRINTF(p) printf p
54
#define DPRINTF(p) /*nothing*/
57
/* Include the internals header, which itself includes "config.h", the Standard
58
C headers, and the external pcre header. */
60
#include "pcreinternal.h"
62
/* If Unicode Property support is wanted, include a private copy of the
63
function that does it, and the table that translates names to numbers. */
67
#include "ucptypetable.c"
70
/* Maximum number of items on the nested bracket stacks at compile time. This
71
applies to the nesting of all kinds of parentheses. It does not limit
72
un-nested, non-capturing parentheses. This number can be made bigger if
73
necessary - it is used to dimension one int and one unsigned char vector at
76
#define BRASTACK_SIZE 200
79
/* Maximum number of ints of offset to save on the stack for recursive calls.
80
If the offset vector is bigger, malloc is used. This should be a multiple of 3,
81
because the offset vector is always a multiple of 3 long. */
83
#define REC_STACK_SAVE_MAX 30
86
/* The maximum remaining length of subject we are prepared to search for a
89
#define REQ_BYTE_MAX 1000
92
/* Table of sizes for the fixed-length opcodes. It's defined in a macro so that
93
the definition is next to the definition of the opcodes in internal.h. */
95
static const uschar OP_lengths[] = { OP_LENGTHS };
97
/* Min and max values for the common repeats; for the maxima, 0 => infinity */
99
static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
100
static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
102
/* Table for handling escaped characters in the range '0'-'z'. Positive returns
103
are simple data values; negative values are for special things like \d and so
104
on. Zero means further processing is needed (for things like \x), or the escape
107
#if !EBCDIC /* This is the "normal" table for ASCII systems */
108
static const short int escapes[] = {
109
0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
110
0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
111
'@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
112
0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
113
-ESC_P, -ESC_Q, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
114
-ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
115
'`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
116
0, 0, 0, 0, 0, 0, ESC_n, 0, /* h - o */
117
-ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, 0, -ESC_w, /* p - w */
118
0, 0, -ESC_z /* x - z */
121
#else /* This is the "abnormal" table for EBCDIC systems */
122
static const short int escapes[] = {
123
/* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
124
/* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
125
/* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
126
/* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
127
/* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
128
/* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
129
/* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
130
/* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
131
/* 88 */ 0, 0, 0, '{', 0, 0, 0, 0,
132
/* 90 */ 0, 0, 0, 'l', 0, ESC_n, 0, -ESC_p,
133
/* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
134
/* A0 */ 0, '~', -ESC_s, ESC_tee, 0, 0, -ESC_w, 0,
135
/* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
136
/* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
137
/* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
138
/* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
139
/* C8 */ 0, 0, 0, 0, 0, 0, 0, 0,
140
/* D0 */ '}', 0, 0, 0, 0, 0, 0, -ESC_P,
141
/* D8 */-ESC_Q, 0, 0, 0, 0, 0, 0, 0,
142
/* E0 */ '\\', 0, -ESC_S, 0, 0, 0, -ESC_W, -ESC_X,
143
/* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
144
/* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
145
/* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
150
/* Tables of names of POSIX character classes and their lengths. The list is
151
terminated by a zero length entry. The first three must be alpha, lower, upper,
152
as this is assumed for handling case independence. */
154
static const char *const posix_names[] = {
155
"alpha", "lower", "upper",
156
"alnum", "ascii", "blank", "cntrl", "digit", "graph",
157
"print", "punct", "space", "word", "xdigit" };
159
static const uschar posix_name_lengths[] = {
160
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
162
/* Table of class bit maps for each POSIX class; up to three may be combined
163
to form the class. The table for [:blank:] is dynamically modified to remove
164
the vertical space characters. */
166
static const int posix_class_maps[] = {
167
cbit_lower, cbit_upper, -1, /* alpha */
168
cbit_lower, -1, -1, /* lower */
169
cbit_upper, -1, -1, /* upper */
170
cbit_digit, cbit_lower, cbit_upper, /* alnum */
171
cbit_print, cbit_cntrl, -1, /* ascii */
172
cbit_space, -1, -1, /* blank - a GNU extension */
173
cbit_cntrl, -1, -1, /* cntrl */
174
cbit_digit, -1, -1, /* digit */
175
cbit_graph, -1, -1, /* graph */
176
cbit_print, -1, -1, /* print */
177
cbit_punct, -1, -1, /* punct */
178
cbit_space, -1, -1, /* space */
179
cbit_word, -1, -1, /* word - a Perl extension */
180
cbit_xdigit,-1, -1 /* xdigit */
183
/* Table to identify digits and hex digits. This is used when compiling
184
patterns. Note that the tables in chartables are dependent on the locale, and
185
may mark arbitrary characters as digits - but the PCRE compiling code expects
186
to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
187
a private table here. It costs 256 bytes, but it is a lot faster than doing
188
character value tests (at least in some simple cases I timed), and in some
189
applications one wants PCRE to compile efficiently as well as match
192
For convenience, we use the same bit definitions as in chartables:
195
0x08 hexadecimal digit
197
Then we can use ctype_digit and ctype_xdigit in the code. */
199
#if !EBCDIC /* This is the "normal" case, for ASCII systems */
200
static const unsigned char digitab[] =
202
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
203
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
204
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
205
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
206
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
207
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
208
0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
209
0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
210
0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
211
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
212
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
213
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
214
0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
215
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
216
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
217
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
218
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
219
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
220
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
221
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
222
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
223
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
224
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
225
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
226
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
227
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
228
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
229
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
230
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
231
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
232
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
233
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
235
#else /* This is the "abnormal" case, for EBCDIC systems */
236
/* EBCDIC variant of the private digit / hex-digit table, indexed by character
code. Rows are 8 entries each; the trailing comment on a row gives the code
range it covers (and, on alternate rows, the hex value of its first index).
Entries for '0'-'9' hold 0x0c; entries for a-f and A-F hold 0x08; all other
entries are zero. */
static const unsigned char digitab[] =
238
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
239
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
240
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
241
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
242
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
243
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
244
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
245
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
246
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 64- 71 40 */
247
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
248
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
249
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
250
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
251
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
252
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
253
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
254
0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
255
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
256
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
257
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
258
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
259
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
260
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
261
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
262
0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
263
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
264
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
265
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
266
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
267
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
268
0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
269
0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
271
/* Private per-character flag table for EBCDIC systems, indexed by character
code, 8 entries per row. check_escape() masks an entry with 0x0E to decide
whether the character following a backslash is alphameric. NOTE(review): the
individual bit meanings presumably mirror those used in chartables (0x08 being
"hexadecimal digit") - confirm against the internals header. */
static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
272
0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
273
0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
274
0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
275
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
276
0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
277
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
278
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
279
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
280
0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 64- 71 */
281
0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
282
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
283
0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
284
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
285
0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
286
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
287
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
288
0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
289
0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
290
0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
291
0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
292
0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
293
0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
294
0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
295
0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
296
0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
297
0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
298
0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
299
0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
300
0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
301
0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
302
0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
303
0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
307
/* Definition to allow mutual recursion */
310
compile_regex(int, int, int *, uschar **, const uschar **, const char **,
311
BOOL, int, int *, int *, branch_chain *, compile_data *);
313
/* Structure for building a chain of data that actually lives on the
314
stack, for holding the values of the subject pointer at the start of each
315
subpattern, so as to detect when an empty string has been matched by a
316
subpattern - to break infinite loops. When NO_RECURSE is set, these blocks
317
are on the heap, not on the stack. */
319
typedef struct eptrblock {
320
struct eptrblock *epb_prev;
321
const uschar *epb_saved_eptr;
324
/* Flag bits for the match() function */
326
#define match_condassert 0x01 /* Called to check a condition assertion */
327
#define match_isgroup 0x02 /* Set if start of bracketed group */
329
/* Non-error returns from the match() function. Error returns are externally
330
defined PCRE_ERROR_xxx codes, which are all negative. */
332
#define MATCH_MATCH 1
333
#define MATCH_NOMATCH 0
337
/*************************************************
339
*************************************************/
341
/* PCRE is thread-clean and doesn't use any global variables in the normal
342
sense. However, it calls memory allocation and free functions via the four
343
indirections below, and it can optionally do callouts. These values can be
344
changed by the caller, but are shared between all threads. However, when
345
compiling for Virtual Pascal, things are done differently (see pcre.in). */
349
extern "C" void *(*pcre_malloc)(size_t) = malloc;
350
extern "C" void (*pcre_free)(void *) = free;
351
extern "C" void *(*pcre_stack_malloc)(size_t) = malloc;
352
extern "C" void (*pcre_stack_free)(void *) = free;
353
extern "C" int (*pcre_callout)(pcre_callout_block *) = NULL;
355
void *(*pcre_malloc)(size_t) = malloc;
356
void (*pcre_free)(void *) = free;
357
void *(*pcre_stack_malloc)(size_t) = malloc;
358
void (*pcre_stack_free)(void *) = free;
359
int (*pcre_callout)(pcre_callout_block *) = NULL;
364
/*************************************************
365
* Macros and tables for character handling *
366
*************************************************/
368
/* When UTF-8 encoding is being used, a character is no longer just a single
369
byte. The macros for character handling generate simple sequences when used in
370
byte-mode, and more complicated ones for UTF-8 characters. */
373
/* Byte-mode versions of the character access macros: a character is exactly
one byte, so each macro is a plain load (with a post-increment of the pointer
for the INC forms). GETCHARLEN leaves len untouched and BACKCHAR expands to
nothing, because there are never any extra bytes to account for. Every
non-empty expansion ends in its own semicolon, so a call site needs none.
The parameters are parenthesized so that an argument which is an expression
(for example "p + 1") is dereferenced as a whole rather than being split by
operator precedence. */

#define GETCHAR(c, eptr) (c) = *(eptr);
#define GETCHARINC(c, eptr) (c) = *(eptr)++;
#define GETCHARINCTEST(c, eptr) (c) = *(eptr)++;
#define GETCHARLEN(c, eptr, len) (c) = *(eptr);
#define BACKCHAR(eptr)
379
#else /* SUPPORT_UTF8 */
381
/* Get the next UTF-8 character, not advancing the pointer. This is called when
382
we know we are in UTF-8 mode. */
384
#define GETCHAR(c, eptr) \
386
if ((c & 0xc0) == 0xc0) \
389
int gcaa = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
391
c = (c & utf8_table3[gcaa]) << gcss; \
392
for (gcii = 1; gcii <= gcaa; gcii++) \
395
c |= (eptr[gcii] & 0x3f) << gcss; \
399
/* Get the next UTF-8 character, advancing the pointer. This is called when we
400
know we are in UTF-8 mode. */
402
#define GETCHARINC(c, eptr) \
404
if ((c & 0xc0) == 0xc0) \
406
int gcaa = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
408
c = (c & utf8_table3[gcaa]) << gcss; \
412
c |= (*eptr++ & 0x3f) << gcss; \
416
/* Get the next character, testing for UTF-8 mode, and advancing the pointer */
418
#define GETCHARINCTEST(c, eptr) \
420
if (md->utf8 && (c & 0xc0) == 0xc0) \
422
int gcaa = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
424
c = (c & utf8_table3[gcaa]) << gcss; \
428
c |= (*eptr++ & 0x3f) << gcss; \
432
/* Get the next UTF-8 character, not advancing the pointer, incrementing length
433
if there are extra bytes. This is called when we know we are in UTF-8 mode. */
435
#define GETCHARLEN(c, eptr, len) \
437
if ((c & 0xc0) == 0xc0) \
440
int gcaa = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
442
c = (c & utf8_table3[gcaa]) << gcss; \
443
for (gcii = 1; gcii <= gcaa; gcii++) \
446
c |= (eptr[gcii] & 0x3f) << gcss; \
451
/* If the pointer is not at the start of a character, move it back until
452
it is. Called only in UTF-8 mode. */
454
/* Steps the pointer back while it addresses a byte of the form 10xxxxxx (top
two bits 0x80). The expansion ends in its own semicolon. The parameter is
parenthesized so that an argument such as "*q" decrements the pointed-to
pointer instead of being re-associated as "*(q--)". */
#define BACKCHAR(eptr) while ((*(eptr) & 0xc0) == 0x80) (eptr)--;
460
/*************************************************
461
* Default character tables *
462
*************************************************/
464
/* A default set of character tables is included in the PCRE binary. Its source
465
is built by the maketables auxiliary program, which uses the default C ctypes
466
functions, and put in the file chartables.c. These tables are used by PCRE
467
whenever the caller of pcre_compile() does not provide an alternate set of
470
#include "chartables.c"
475
/*************************************************
476
* Tables for UTF-8 support *
477
*************************************************/
479
/* These are the breakpoints for different numbers of bytes in a UTF-8
482
static const int utf8_table1[] =
483
{ 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff};
485
/* These are the indicator bits and the mask for the data bits to set in the
486
first byte of a character, indexed by the number of additional bytes. */
488
static const int utf8_table2[] = { 0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
489
static const int utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
491
/* Table of the number of extra characters, indexed by the first character
492
masked with 0x3f. The highest number for a valid UTF-8 character is in fact
495
static const uschar utf8_table4[] = {
496
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
497
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
498
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
499
3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
502
/*************************************************
503
* Convert character value to UTF-8 *
504
*************************************************/
506
/* This function takes an integer value in the range 0 - 0x7fffffff
507
and encodes it as a UTF-8 character in 1 to 6 bytes.
510
cvalue the character value
511
buffer pointer to buffer for result - at least 6 bytes long
513
Returns: number of characters placed in the buffer
517
ord2utf8(int cvalue, uschar *buffer)
520
for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
521
if (cvalue <= utf8_table1[i]) break;
523
for (j = i; j > 0; j--)
525
*buffer-- = 0x80 | (cvalue & 0x3f);
528
*buffer = utf8_table2[i] | cvalue;
535
/*************************************************
536
* Print compiled regex *
537
*************************************************/
539
/* The code for doing this is held in a separate file that is also included in
540
pcretest.c. It defines a function called print_internals(). */
543
#include "printint.c"
548
/*************************************************
549
* Return version string *
550
*************************************************/
552
#define STRING(a) # a
553
#define XSTRING(s) STRING(s)
558
return XSTRING(PCRE_MAJOR) "." XSTRING(PCRE_MINOR) " " XSTRING(PCRE_DATE);
564
/*************************************************
565
* Flip bytes in an integer *
566
*************************************************/
568
/* This function is called when the magic number in a regex doesn't match in
569
order to flip its bytes to see if we are dealing with a pattern that was
570
compiled on a host of different endianness. If so, this function is used to
571
flip other byte values.
574
value the number to flip
575
n the number of bytes to flip (assumed to be 2 or 4)
577
Returns: the flipped value
581
/* Reverses the byte order of the low 2 or low 4 bytes of value. */
byteflip(long int value, int n)
583
/* Two-byte quantity: exchange the low byte and the next byte up. */
if (n == 2) return ((value & 0x00ff) << 8) | ((value & 0xff00) >> 8);
584
/* Every other n is handled as a four-byte quantity: byte 0 <-> byte 3 and
byte 1 <-> byte 2. NOTE(review): callers in this file pass sizeof(field); if a
field is wider than four bytes (re->size is handed back through a size_t
pointer elsewhere in this file), only its low 32 bits are rearranged here -
confirm that this is acceptable on the target platforms. */
return ((value & 0x000000ff) << 24) |
585
((value & 0x0000ff00) << 8) |
586
((value & 0x00ff0000) >> 8) |
587
((value & 0xff000000) >> 24);
592
/*************************************************
593
* Test for a byte-flipped compiled regex *
594
*************************************************/
596
/* This function is called from pcre_exec() and also from pcre_fullinfo(). Its
597
job is to test whether the regex is byte-flipped - that is, it was compiled on
598
a system of opposite endianness. The function is called only when the native
599
MAGIC_NUMBER test fails. If the regex is indeed flipped, we flip all the
600
relevant values into a different data block, and return it.
603
re points to the regex
604
study points to study data, or NULL
605
internal_re points to a new regex block
606
internal_study points to a new study block
608
Returns: the new block if it is indeed a byte-flipped regex
613
try_flipped(const real_pcre *re, real_pcre *internal_re,
614
const pcre_study_data *study, pcre_study_data *internal_study)
616
if (byteflip(re->magic_number, sizeof(re->magic_number)) != MAGIC_NUMBER)
619
*internal_re = *re; /* To copy other fields */
620
internal_re->size = byteflip(re->size, sizeof(re->size));
621
internal_re->options = byteflip(re->options, sizeof(re->options));
622
internal_re->top_bracket = byteflip(re->top_bracket, sizeof(re->top_bracket));
623
internal_re->top_backref = byteflip(re->top_backref, sizeof(re->top_backref));
624
internal_re->first_byte = byteflip(re->first_byte, sizeof(re->first_byte));
625
internal_re->req_byte = byteflip(re->req_byte, sizeof(re->req_byte));
626
internal_re->name_table_offset = byteflip(re->name_table_offset,
627
sizeof(re->name_table_offset));
628
internal_re->name_entry_size = byteflip(re->name_entry_size,
629
sizeof(re->name_entry_size));
630
internal_re->name_count = byteflip(re->name_count, sizeof(re->name_count));
634
*internal_study = *study; /* To copy other fields */
635
internal_study->size = byteflip(study->size, sizeof(study->size));
636
internal_study->options = byteflip(study->options, sizeof(study->options));
644
/*************************************************
645
* (Obsolete) Return info about compiled pattern *
646
*************************************************/
648
/* This is the original "info" function. It picks potentially useful data out
649
of the private structure, but its interface was too rigid. It remains for
650
backwards compatibility. The public options are passed back in an int - though
651
the re->options field has been expanded to a long int, all the public options
652
are at the low end of it, and so even on 16-bit systems this will still be OK.
653
Therefore, I haven't changed the API for pcre_info().
656
argument_re points to compiled code
657
optptr where to pass back the options
658
first_byte where to pass back the first character,
659
or -1 if multiline and all branches start ^,
662
Returns: number of capturing subpatterns
663
or negative values on error
667
/* Legacy query: optionally stores the public option bits and the first-byte
information through the supplied pointers, and returns re->top_bracket. */
pcre_info(const pcre *argument_re, int *optptr, int *first_byte)
669
/* Scratch block handed to try_flipped() when the magic number does not match. */
real_pcre internal_re;
670
const real_pcre *re = (const real_pcre *)argument_re;
671
if (re == NULL) return PCRE_ERROR_NULL;
672
/* A magic-number mismatch may mean the pattern was compiled on a host of the
opposite endianness, so try the byte-flipped interpretation (no study data is
involved here) before reporting a bad magic number. */
if (re->magic_number != MAGIC_NUMBER)
674
re = try_flipped(re, &internal_re, NULL, NULL);
675
if (re == NULL) return PCRE_ERROR_BADMAGIC;
677
/* Both output pointers are optional; only the public option bits are exposed. */
if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_OPTIONS);
678
if (first_byte != NULL)
679
/* The recorded first byte if PCRE_FIRSTSET is set; otherwise -1 when
PCRE_STARTLINE is set, and -2 when neither flag is set. */
*first_byte = ((re->options & PCRE_FIRSTSET) != 0)? re->first_byte :
680
((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
681
return re->top_bracket;
686
/*************************************************
687
* Return info about compiled pattern *
688
*************************************************/
690
/* This is a newer "info" function which has an extensible interface so
691
that additional items can be added compatibly.
694
argument_re points to compiled code
695
extra_data points to extra data, or NULL
696
what what information is required
697
where where to put the information
699
Returns: 0 if data returned, negative on error
703
pcre_fullinfo(const pcre *argument_re, const pcre_extra *extra_data, int what,
706
real_pcre internal_re;
707
pcre_study_data internal_study;
708
const real_pcre *re = (const real_pcre *)argument_re;
709
const pcre_study_data *study = NULL;
711
if (re == NULL || where == NULL) return PCRE_ERROR_NULL;
713
if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_STUDY_DATA) != 0)
714
study = (const pcre_study_data *)extra_data->study_data;
716
if (re->magic_number != MAGIC_NUMBER)
718
re = try_flipped(re, &internal_re, study, &internal_study);
719
if (re == NULL) return PCRE_ERROR_BADMAGIC;
720
if (study != NULL) study = &internal_study;
725
case PCRE_INFO_OPTIONS:
726
*((unsigned long int *)where) = re->options & PUBLIC_OPTIONS;
730
*((size_t *)where) = re->size;
733
case PCRE_INFO_STUDYSIZE:
734
*((size_t *)where) = (study == NULL)? 0 : study->size;
737
case PCRE_INFO_CAPTURECOUNT:
738
*((int *)where) = re->top_bracket;
741
case PCRE_INFO_BACKREFMAX:
742
*((int *)where) = re->top_backref;
745
case PCRE_INFO_FIRSTBYTE:
747
((re->options & PCRE_FIRSTSET) != 0)? re->first_byte :
748
((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
751
/* Make sure we pass back the pointer to the bit vector in the external
752
block, not the internal copy (with flipped integer fields). */
754
case PCRE_INFO_FIRSTTABLE:
755
*((const uschar **)where) =
756
(study != NULL && (study->options & PCRE_STUDY_MAPPED) != 0)?
757
((const pcre_study_data *)extra_data->study_data)->start_bits : NULL;
760
case PCRE_INFO_LASTLITERAL:
762
((re->options & PCRE_REQCHSET) != 0)? re->req_byte : -1;
765
case PCRE_INFO_NAMEENTRYSIZE:
766
*((int *)where) = re->name_entry_size;
769
case PCRE_INFO_NAMECOUNT:
770
*((int *)where) = re->name_count;
773
case PCRE_INFO_NAMETABLE:
774
*((const uschar **)where) = (const uschar *)re + re->name_table_offset;
777
case PCRE_INFO_DEFAULT_TABLES:
778
*((const uschar **)where) = (const uschar *)pcre_default_tables;
781
default: return PCRE_ERROR_BADOPTION;
789
/*************************************************
790
* Return info about what features are configured *
791
*************************************************/
793
/* This is a function which has an extensible interface so that additional items
794
can be added compatibly.
797
what what information is required
798
where where to put the information
800
Returns: 0 if data returned, negative on error
804
pcre_config(int what, void *where)
808
case PCRE_CONFIG_UTF8:
816
case PCRE_CONFIG_UNICODE_PROPERTIES:
824
case PCRE_CONFIG_NEWLINE:
825
*((int *)where) = NEWLINE;
828
case PCRE_CONFIG_LINK_SIZE:
829
*((int *)where) = LINK_SIZE;
832
case PCRE_CONFIG_POSIX_MALLOC_THRESHOLD:
833
*((int *)where) = POSIX_MALLOC_THRESHOLD;
836
case PCRE_CONFIG_MATCH_LIMIT:
837
*((unsigned int *)where) = MATCH_LIMIT;
840
case PCRE_CONFIG_STACKRECURSE:
848
default: return PCRE_ERROR_BADOPTION;
857
/*************************************************
858
* Debugging function to print chars *
859
*************************************************/
861
/* Print a sequence of chars in printable format, stopping at the end of the
862
subject if requested.
865
p points to characters
866
length number to print
867
is_subject TRUE if printing from within md->start_subject
868
md pointer to matching data block, if is_subject is TRUE
874
pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
877
if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
879
if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
886
/*************************************************
888
*************************************************/
890
/* This function is called when a \ has been encountered. It either returns a
891
positive value for a simple escape such as \n, or a negative value which
892
encodes one of the more complicated things such as \d. When UTF-8 is enabled,
893
a positive value greater than 255 may be returned. On entry, ptr is pointing at
894
the \. On exit, it is on the final character of the escape sequence.
897
ptrptr points to the pattern position pointer
898
errorptr points to the pointer to the error message
899
bracount number of previous extracting brackets
900
options the options bits
901
isclass TRUE if inside a character class
903
Returns: zero or positive => a data character
904
negative => a special escape sequence
905
on error, errorptr is set
909
check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
910
int options, BOOL isclass)
912
const uschar *ptr = *ptrptr;
915
/* If backslash is at the end of the pattern, it's an error. */
918
if (c == 0) *errorptr = ERR1;
920
/* Non-alphamerics are literals. For digits or letters, do an initial lookup in
921
a table. A non-zero result is something that can be returned immediately.
922
Otherwise further processing may be required. */
924
#if !EBCDIC /* ASCII coding */
925
else if (c < '0' || c > 'z') {} /* Not alphameric */
926
else if ((i = escapes[c - '0']) != 0) c = i;
928
#else /* EBCDIC coding */
929
else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */
930
else if ((i = escapes[c - 0x48]) != 0) c = i;
933
/* Escapes that need further processing, or are illegal. */
937
const uschar *oldptr;
940
/* A number of Perl escapes are not handled by PCRE. We give an explicit
951
/* The handling of escape sequences consisting of a string of digits
952
starting with one that is not zero is not straightforward. By experiment,
953
the way Perl works seems to be as follows:
955
Outside a character class, the digits are read as a decimal number. If the
956
number is less than 10, or if there are that many previous extracting
957
left brackets, then it is a back reference. Otherwise, up to three octal
958
digits are read to form an escaped byte. Thus \123 is likely to be octal
959
123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
960
value is greater than 377, the least significant 8 bits are taken. Inside a
961
character class, \ followed by a digit is always an octal number. */
963
case '1': case '2': case '3': case '4': case '5':
964
case '6': case '7': case '8': case '9':
970
while ((digitab[ptr[1]] & ctype_digit) != 0)
971
c = c * 10 + *(++ptr) - '0';
972
if (c < 10 || c <= bracount)
977
ptr = oldptr; /* Put the pointer back and fall through */
980
/* Handle an octal number following \. If the first digit is 8 or 9, Perl
981
generates a binary zero byte and treats the digit as a following literal.
982
Thus we have to pull back the pointer by one. */
984
if ((c = *ptr) >= '8')
991
/* \0 always starts an octal number, but we may drop through to here with a
992
larger first octal digit. */
996
while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
997
c = c * 8 + *(++ptr) - '0';
998
c &= 255; /* Take least significant 8 bits */
1001
/* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number
1002
which can be greater than 0xff, but only if the ddd are hex digits. */
1006
if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)
1008
const uschar *pt = ptr + 2;
1009
register int count = 0;
1011
while ((digitab[*pt] & ctype_xdigit) != 0)
1015
#if !EBCDIC /* ASCII coding */
1016
if (cc >= 'a') cc -= 32; /* Convert to upper case */
1017
c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
1018
#else /* EBCDIC coding */
1019
if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
1020
c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
1025
if (c < 0 || count > 8) *errorptr = ERR34;
1029
/* If the sequence of hex digits does not end with '}', then we don't
1030
recognize this construct; fall through to the normal \x handling. */
1034
/* Read just a single char specified by hex digits */
1037
while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
1039
int cc; /* Some compilers don't like ++ */
1040
cc = *(++ptr); /* in initializers */
1041
#if !EBCDIC /* ASCII coding */
1042
if (cc >= 'a') cc -= 32; /* Convert to upper case */
1043
c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
1044
#else /* EBCDIC coding */
1045
if (cc <= 'z') cc += 64; /* Convert to upper case */
1046
c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
1051
/* Other special escapes not starting with a digit are straightforward */
1061
/* A letter is upper-cased; then the 0x40 bit is flipped. This coding
1062
is ASCII-specific, but then the whole concept of \cx is ASCII-specific.
1063
(However, an EBCDIC equivalent has now been added.) */
1065
#if !EBCDIC /* ASCII coding */
1066
if (c >= 'a' && c <= 'z') c -= 32;
1068
#else /* EBCDIC coding */
1069
if (c >= 'a' && c <= 'z') c += 64;
1074
/* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
1075
other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
1076
for Perl compatibility, it is a literal. This code looks a bit odd, but
1077
there used to be some cases other than the default, and there may be again
1078
in future, so I haven't "optimized" it. */
1081
if ((options & PCRE_EXTRA) != 0) switch(c)
1098
/*************************************************
1099
* Handle \P and \p *
1100
*************************************************/
1102
/* This function is called after \P or \p has been encountered, provided that
1103
PCRE is compiled with support for Unicode properties. On entry, ptrptr is
1104
pointing at the P or p. On exit, it is pointing at the final character of the
1108
ptrptr points to the pattern position pointer
1109
negptr points to a boolean that is set TRUE for negation else FALSE
1110
errorptr points to the pointer to the error message
1112
Returns: value from ucp_type_table, or -1 for an invalid type
1116
get_ucp(const uschar **ptrptr, BOOL *negptr, const char **errorptr)
1119
const uschar *ptr = *ptrptr;
1123
if (c == 0) goto ERROR_RETURN;
1127
/* \P or \p can be followed by a one- or two-character name in {}, optionally
1128
preceded by ^ for negation. */
1137
for (i = 0; i <= 2; i++)
1140
if (c == 0) goto ERROR_RETURN;
1141
if (c == '}') break;
1144
if (c !='}') /* Try to distinguish error cases */
1146
while (*(++ptr) != 0 && *ptr != '}');
1147
if (*ptr == '}') goto UNKNOWN_RETURN; else goto ERROR_RETURN;
1152
/* Otherwise there is just one following character */
1162
/* Search for a recognized property name using binary chop */
1165
top = sizeof(utt)/sizeof(ucp_type_table);
1170
c = strcmp(name, utt[i].name);
1171
if (c == 0) return utt[i].value;
1172
if (c > 0) bot = i + 1; else top = i;
1190
/*************************************************
1191
* Check for counted repeat *
1192
*************************************************/
1194
/* This function is called when a '{' is encountered in a place where it might
1195
start a quantifier. It looks ahead to see if it really is a quantifier or not.
1196
It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
1197
where the ddds are digits.
1200
p pointer to the first char after '{'
1202
Returns: TRUE or FALSE
1206
is_counted_repeat(const uschar *p)
1208
if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
1209
while ((digitab[*p] & ctype_digit) != 0) p++;
1210
if (*p == '}') return TRUE;
1212
if (*p++ != ',') return FALSE;
1213
if (*p == '}') return TRUE;
1215
if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
1216
while ((digitab[*p] & ctype_digit) != 0) p++;
1223
/*************************************************
1224
* Read repeat counts *
1225
*************************************************/
1227
/* Read an item of the form {n,m} and return the values. This is called only
1228
after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1229
so the syntax is guaranteed to be correct, but we need to check the values.
1232
p pointer to first char after '{'
1233
minp pointer to int for min
1234
maxp pointer to int for max
1235
returned as -1 if no max
1236
errorptr points to pointer to error message
1238
Returns: pointer to '}' on success;
1239
current ptr on error, with errorptr set
1242
static const uschar *
1243
read_repeat_counts(const uschar *p, int *minp, int *maxp, const char **errorptr)
1248
while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
1250
if (*p == '}') max = min; else
1255
while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
1264
/* Do paranoid checks, then fill in the required variables, and pass back the
1265
pointer to the terminating '}'. */
1267
if (min > 65535 || max > 65535)
1279
/*************************************************
1280
* Find first significant op code *
1281
*************************************************/
1283
/* This is called by several functions that scan a compiled expression looking
1284
for a fixed first character, or an anchoring op code etc. It skips over things
1285
that do not influence this. For some calls, a change of option is important.
1286
For some calls, it makes sense to skip negative forward and all backward
1287
assertions, and also the \b assertion; for others it does not.
1290
code pointer to the start of the group
1291
options pointer to external options
1292
optbit the option bit whose changing is significant, or
1294
skipassert TRUE if certain assertions are to be skipped
1296
Returns: pointer to the first significant opcode
1299
static const uschar*
1300
first_significant_code(const uschar *code, int *options, int optbit,
1308
if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1309
*options = (int)code[1];
1315
case OP_ASSERTBACK_NOT:
1316
if (!skipassert) return code;
1317
do code += GET(code, 1); while (*code == OP_ALT);
1318
code += OP_lengths[*code];
1321
case OP_WORD_BOUNDARY:
1322
case OP_NOT_WORD_BOUNDARY:
1323
if (!skipassert) return code;
1329
code += OP_lengths[*code];
1336
/* Control never reaches here */
1342
/*************************************************
1343
* Find the fixed length of a pattern *
1344
*************************************************/
1346
/* Scan a pattern and compute the fixed length of subject that will match it,
1347
if the length is fixed. This is needed for dealing with backward assertions.
1348
In UTF8 mode, the result is in characters rather than bytes.
1351
code points to the start of the pattern (the bracket)
1352
options the compiling options
1354
Returns: the fixed length, or -1 if there is no fixed length,
1355
or -2 if \C was encountered
1359
find_fixedlength(uschar *code, int options)
1363
register int branchlength = 0;
1364
register uschar *cc = code + 1 + LINK_SIZE;
1366
/* Scan along the opcodes for this branch. If we get to the end of the
1367
branch, check the length against that of the other branches. */
1372
register int op = *cc;
1373
if (op >= OP_BRA) op = OP_BRA;
1380
d = find_fixedlength(cc, options);
1381
if (d < 0) return d;
1383
do cc += GET(cc, 1); while (*cc == OP_ALT);
1384
cc += 1 + LINK_SIZE;
1387
/* Reached end of a branch; if it's a ket it is the end of a nested
1388
call. If it's ALT it is an alternation in a nested call. If it is
1389
END it's the end of the outer call. All can be handled by the same code. */
1396
if (length < 0) length = branchlength;
1397
else if (length != branchlength) return -1;
1398
if (*cc != OP_ALT) return length;
1399
cc += 1 + LINK_SIZE;
1403
/* Skip over assertive subpatterns */
1408
case OP_ASSERTBACK_NOT:
1409
do cc += GET(cc, 1); while (*cc == OP_ALT);
1412
/* Skip over things that don't match chars */
1425
case OP_NOT_WORD_BOUNDARY:
1426
case OP_WORD_BOUNDARY:
1427
cc += OP_lengths[*cc];
1430
/* Handle literal characters */
1437
if ((options & PCRE_UTF8) != 0)
1439
while ((*cc & 0xc0) == 0x80) cc++;
1444
/* Handle exact repetitions. The count is already in characters, but we
1445
need to skip over a multibyte character in UTF8 mode. */
1448
branchlength += GET2(cc,1);
1451
if ((options & PCRE_UTF8) != 0)
1453
while((*cc & 0x80) == 0x80) cc++;
1459
branchlength += GET2(cc,1);
1463
/* Handle single-char matchers */
1472
case OP_NOT_WHITESPACE:
1474
case OP_NOT_WORDCHAR:
1481
/* The single-byte matcher isn't allowed */
1486
/* Check a class for variable quantification */
1490
cc += GET(cc, 1) - 33;
1508
if (GET2(cc,1) != GET2(cc,3)) return -1;
1509
branchlength += GET2(cc,1);
1518
/* Anything else is variable length */
1524
/* Control never gets here */
1530
/*************************************************
1531
* Scan compiled regex for numbered bracket *
1532
*************************************************/
1534
/* This little function scans through a compiled pattern until it finds a
1535
capturing bracket with the given number.
1538
code points to start of expression
1539
utf8 TRUE in UTF-8 mode
1540
number the required bracket number
1542
Returns: pointer to the opcode for the bracket, or NULL if not found
1545
static const uschar *
1546
find_bracket(const uschar *code, BOOL utf8, int number)
1548
#ifndef SUPPORT_UTF8
1549
utf8 = utf8; /* Stop pedantic compilers complaining */
1554
register int c = *code;
1555
if (c == OP_END) return NULL;
1556
else if (c > OP_BRA)
1559
if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE);
1560
if (n == number) return (uschar *)code;
1561
code += OP_lengths[OP_BRA];
1565
code += OP_lengths[c];
1569
/* In UTF-8 mode, opcodes that are followed by a character may be followed
1570
by a multi-byte character. The length in the table is a minimum, so we have
1571
to scan along to skip the extra bytes. All opcodes are less than 128, so we
1572
can use relatively efficient code. */
1587
while ((*code & 0xc0) == 0x80) code++;
1590
/* XCLASS is used for classes that cannot be represented just by a bit
1591
map. This includes negated single high-valued characters. The length in
1592
the table is zero; the actual length is stored in the compiled code. */
1595
code += GET(code, 1) + 1;
1605
/*************************************************
1606
* Scan compiled regex for recursion reference *
1607
*************************************************/
1609
/* This little function scans through a compiled pattern until it finds an
1610
instance of OP_RECURSE.
1613
code points to start of expression
1614
utf8 TRUE in UTF-8 mode
1616
Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1619
static const uschar *
1620
find_recurse(const uschar *code, BOOL utf8)
1622
#ifndef SUPPORT_UTF8
1623
utf8 = utf8; /* Stop pedantic compilers complaining */
1628
register int c = *code;
1629
if (c == OP_END) return NULL;
1630
else if (c == OP_RECURSE) return code;
1631
else if (c > OP_BRA)
1633
code += OP_lengths[OP_BRA];
1637
code += OP_lengths[c];
1641
/* In UTF-8 mode, opcodes that are followed by a character may be followed
1642
by a multi-byte character. The length in the table is a minimum, so we have
1643
to scan along to skip the extra bytes. All opcodes are less than 128, so we
1644
can use relatively efficient code. */
1659
while ((*code & 0xc0) == 0x80) code++;
1662
/* XCLASS is used for classes that cannot be represented just by a bit
1663
map. This includes negated single high-valued characters. The length in
1664
the table is zero; the actual length is stored in the compiled code. */
1667
code += GET(code, 1) + 1;
1677
/*************************************************
1678
* Scan compiled branch for non-emptiness *
1679
*************************************************/
1681
/* This function scans through a branch of a compiled pattern to see whether it
1682
can match the empty string or not. It is called only from could_be_empty()
1683
below. Note that first_significant_code() skips over assertions. If we hit an
1684
unclosed bracket, we return "empty" - this means we've struck an inner bracket
1685
whose current branch will already have been scanned.
1688
code points to start of search
1689
endcode points to where to stop
1690
utf8 TRUE if in UTF8 mode
1692
Returns: TRUE if what is matched could be empty
1696
could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1699
for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0, TRUE);
1701
code = first_significant_code(code + OP_lengths[c], NULL, 0, TRUE))
1703
const uschar *ccode;
1710
if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1712
/* Scan a closed bracket */
1714
empty_branch = FALSE;
1717
if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1718
empty_branch = TRUE;
1719
code += GET(code, 1);
1721
while (*code == OP_ALT);
1722
if (!empty_branch) return FALSE; /* All branches are non-empty */
1723
code += 1 + LINK_SIZE;
1729
/* Check for quantifiers after a class */
1733
ccode = code + GET(code, 1);
1734
goto CHECK_CLASS_REPEAT;
1747
case OP_CRSTAR: /* These could be empty; continue */
1753
default: /* Non-repeat => class must match */
1754
case OP_CRPLUS: /* These repeats aren't empty */
1760
if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1765
/* Opcodes that must match a character */
1772
case OP_NOT_WHITESPACE:
1774
case OP_NOT_WORDCHAR:
1788
case OP_TYPEMINPLUS:
1800
/* In UTF-8 mode, STAR, MINSTAR, QUERY, MINQUERY, UPTO, and MINUPTO may be
1801
followed by a multibyte character */
1810
if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1821
/*************************************************
1822
* Scan compiled regex for non-emptiness *
1823
*************************************************/
1825
/* This function is called to check for left recursive calls. We want to check
1826
the current branch of the current pattern to see if it could match the empty
1827
string. If it could, we must look outwards for branches at other levels,
1828
stopping when we pass beyond the bracket which is the subject of the recursion.
1831
code points to start of the recursion
1832
endcode points to where to stop (current RECURSE item)
1833
bcptr points to the chain of current (unclosed) branch starts
1834
utf8 TRUE if in UTF-8 mode
1836
Returns: TRUE if what is matched could be empty
1840
could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1843
while (bcptr != NULL && bcptr->current >= code)
1845
if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1846
bcptr = bcptr->outer;
1853
/*************************************************
1854
* Check for POSIX class syntax *
1855
*************************************************/
1857
/* This function is called when the sequence "[:" or "[." or "[=" is
1858
encountered in a character class. It checks whether this is followed by an
1859
optional ^ and then a sequence of letters, terminated by a matching ":]" or
1863
ptr pointer to the initial [
1864
endptr where to return the end pointer
1865
cd pointer to compile data
1867
Returns: TRUE or FALSE
1871
check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1873
int terminator; /* Don't combine these lines; the Solaris cc */
1874
terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1875
if (*(++ptr) == '^') ptr++;
1876
while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1877
if (*ptr == terminator && ptr[1] == ']')
1888
/*************************************************
1889
* Check POSIX class name *
1890
*************************************************/
1892
/* This function is called to check the name given in a POSIX-style class entry
1896
ptr points to the first letter
1897
len the length of the name
1899
Returns: a value representing the name, or -1 if unknown
1903
check_posix_name(const uschar *ptr, int len)
1905
register int yield = 0;
1906
while (posix_name_lengths[yield] != 0)
1908
if (len == posix_name_lengths[yield] &&
1909
strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1916
/*************************************************
1917
* Adjust OP_RECURSE items in repeated group *
1918
*************************************************/
1920
/* OP_RECURSE items contain an offset from the start of the regex to the group
1921
that is referenced. This means that groups can be replicated for fixed
1922
repetition simply by copying (because the recursion is allowed to refer to
1923
earlier groups that are outside the current group). However, when a group is
1924
optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1925
it, after it has been compiled. This means that any OP_RECURSE items within it
1926
that refer to the group itself or any contained groups have to have their
1927
offsets adjusted. That is the job of this function. Before it is called, the
1928
partially compiled regex must be temporarily terminated with OP_END.
1931
group points to the start of the group
1932
adjust the amount by which the group is to be moved
1933
utf8 TRUE in UTF-8 mode
1934
cd contains pointers to tables etc.
1940
adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd)
1942
uschar *ptr = group;
1943
while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1945
int offset = GET(ptr, 1);
1946
if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1947
ptr += 1 + LINK_SIZE;
1953
/*************************************************
1954
* Insert an automatic callout point *
1955
*************************************************/
1957
/* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1958
callout points before each pattern item.
1961
code current code pointer
1962
ptr current pattern pointer
1963
cd pointers to tables etc
1965
Returns: new code pointer
1969
auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1971
*code++ = OP_CALLOUT;
1973
PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1974
PUT(code, LINK_SIZE, 0); /* Default length */
1975
return code + 2*LINK_SIZE;
1980
/*************************************************
1981
* Complete a callout item *
1982
*************************************************/
1984
/* A callout item contains the length of the next item in the pattern, which
1985
we can't fill in till after we have reached the relevant point. This is used
1986
for both automatic and manual callouts.
1989
previous_callout points to previous callout item
1990
ptr current pattern pointer
1991
cd pointers to tables etc
1997
complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1999
int length = ptr - cd->start_pattern - GET(previous_callout, 2);
2000
PUT(previous_callout, 2 + LINK_SIZE, length);
2006
/*************************************************
2007
* Get othercase range *
2008
*************************************************/
2010
/* This function is passed the start and end of a class range, in UTF-8 mode
2011
with UCP support. It searches up the characters, looking for internal ranges of
2012
characters in the "other" case. Each call returns the next one, updating the
2016
cptr points to starting character value; updated
2018
ocptr where to put start of othercase range
2019
odptr where to put end of othercase range
2021
Yield: TRUE when range returned; FALSE when no more
2025
get_othercase_range(int *cptr, int d, int *ocptr, int *odptr)
2027
int c, chartype, othercase, next;
2029
for (c = *cptr; c <= d; c++)
2031
if (ucp_findchar(c, &chartype, &othercase) == ucp_L && othercase != 0) break;
2034
if (c > d) return FALSE;
2037
next = othercase + 1;
2039
for (++c; c <= d; c++)
2041
if (ucp_findchar(c, &chartype, &othercase) != ucp_L || othercase != next)
2051
#endif /* SUPPORT_UCP */
2054
/*************************************************
2055
* Compile one branch *
2056
*************************************************/
2058
/* Scan the pattern, compiling it into the code vector. If the options are
2059
changed during the branch, the pointer is used to change the external options
2063
optionsptr pointer to the option bits
2064
brackets points to number of extracting brackets used
2065
codeptr points to the pointer to the current code point
2066
ptrptr points to the current pattern pointer
2067
errorptr points to pointer to error message
2068
firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2069
reqbyteptr set to the last literal character required, else < 0
2070
bcptr points to current branch chain
2071
cd contains pointers to tables etc.
2073
Returns: TRUE on success
2074
FALSE, with *errorptr set on error
2078
compile_branch(int *optionsptr, int *brackets, uschar **codeptr,
2079
const uschar **ptrptr, const char **errorptr, int *firstbyteptr,
2080
int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
2082
int repeat_type, op_type;
2083
int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2085
int greedy_default, greedy_non_default;
2086
int firstbyte, reqbyte;
2087
int zeroreqbyte, zerofirstbyte;
2088
int req_caseopt, reqvary, tempreqvary;
2090
int options = *optionsptr;
2091
int after_manual_callout = 0;
2093
register uschar *code = *codeptr;
2095
BOOL inescq = FALSE;
2096
BOOL groupsetfirstbyte = FALSE;
2097
const uschar *ptr = *ptrptr;
2098
const uschar *tempptr;
2099
uschar *previous = NULL;
2100
uschar *previous_callout = NULL;
2101
uschar classbits[32];
2105
BOOL utf8 = (options & PCRE_UTF8) != 0;
2106
uschar *class_utf8data;
2107
uschar utf8_char[6];
2112
/* Set up the default and non-default settings for greediness */
2114
greedy_default = ((options & PCRE_UNGREEDY) != 0);
2115
greedy_non_default = greedy_default ^ 1;
2117
/* Initialize no first byte, no required byte. REQ_UNSET means "no char
2118
matching encountered yet". It gets changed to REQ_NONE if we hit something that
2119
matches a non-fixed first char; reqbyte just remains unset if we never
2122
When we hit a repeat whose minimum is zero, we may have to adjust these values
2123
to take the zero repeat into account. This is implemented by setting them to
2124
zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2125
item types that can be repeated set these backoff variables appropriately. */
2127
firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2129
/* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2130
according to the current setting of the caseless flag. REQ_CASELESS is a bit
2131
value > 255. It is added into the firstbyte or reqbyte variables to record the
2132
case status of the value. This is used only for ASCII characters. */
2134
req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2136
/* Switch on next character until the end of the branch */
2141
BOOL possessive_quantifier;
2143
int class_charcount;
2153
/* Next byte in the pattern */
2157
/* If in \Q...\E, check for the end; if not, we have a literal */
2159
if (inescq && c != 0)
2161
if (c == '\\' && ptr[1] == 'E')
2169
if (previous_callout != NULL)
2171
complete_callout(previous_callout, ptr, cd);
2172
previous_callout = NULL;
2174
if ((options & PCRE_AUTO_CALLOUT) != 0)
2176
previous_callout = code;
2177
code = auto_callout(code, ptr, cd);
2183
/* Fill in length of a previous callout, except when the next thing is
2186
is_quantifier = c == '*' || c == '+' || c == '?' ||
2187
(c == '{' && is_counted_repeat(ptr+1));
2189
if (!is_quantifier && previous_callout != NULL &&
2190
after_manual_callout-- <= 0)
2192
complete_callout(previous_callout, ptr, cd);
2193
previous_callout = NULL;
2196
/* In extended mode, skip white space and comments */
2198
if ((options & PCRE_EXTENDED) != 0)
2200
if ((cd->ctypes[c] & ctype_space) != 0) continue;
2203
/* The space before the ; is to avoid a warning on a silly compiler
2204
on the Macintosh. */
2205
while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
2206
if (c != 0) continue; /* Else fall through to handle end of string */
2210
/* No auto callout for quantifiers. */
2212
if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2214
previous_callout = code;
2215
code = auto_callout(code, ptr, cd);
2220
/* The branch terminates at end of string, |, or ). */
2225
*firstbyteptr = firstbyte;
2226
*reqbyteptr = reqbyte;
2231
/* Handle single-character metacharacters. In multiline mode, ^ disables
2232
the setting of any following char as a first character. */
2235
if ((options & PCRE_MULTILINE) != 0)
2237
if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2248
/* There can never be a first char if '.' is first, whatever happens about
2249
repeats. The value of reqbyte doesn't change either. */
2252
if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2253
zerofirstbyte = firstbyte;
2254
zeroreqbyte = reqbyte;
2259
/* Character classes. If the included characters are all < 256 in value, we
2260
build a 32-byte bitmap of the permitted characters, except in the special
2261
case where there is only one such character. For negated classes, we build
2262
the map as usual, then invert it at the end. However, we use a different
2263
opcode so that data characters > 255 can be handled correctly.
2265
If the class contains characters outside the 0-255 range, a different
2266
opcode is compiled. It may optionally have a bit map for characters < 256,
2267
but those above are explicitly listed afterwards. A flag byte tells
2268
whether the bitmap is present, and whether this is a negated class or not.
2274
/* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2275
they are encountered at the top level, so we'll do that too. */
2277
if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2278
check_posix_syntax(ptr, &tempptr, cd))
2280
*errorptr = (ptr[1] == ':')? ERR13 : ERR31;
2284
/* If the first character is '^', set the negation flag and skip it. */
2286
if ((c = *(++ptr)) == '^')
2288
negate_class = TRUE;
2293
negate_class = FALSE;
2296
/* Keep a count of chars with values < 256 so that we can optimize the case
2297
of just a single character (as long as it's < 256). For higher valued UTF-8
2298
characters, we don't yet do any optimization. */
2300
class_charcount = 0;
2301
class_lastchar = -1;
2304
class_utf8 = FALSE; /* No chars >= 256 */
2305
class_utf8data = code + LINK_SIZE + 34; /* For UTF-8 items */
2308
/* Initialize the 32-char bit map to all zeros. We have to build the
2309
map in a temporary bit of store, in case the class contains only 1
2310
character (< 256), because in that case the compiled code doesn't use the
2313
memset(classbits, 0, 32 * sizeof(uschar));
2315
/* Process characters until ] is reached. By writing this as a "do" it
2316
means that an initial ] is taken as a data character. The first pass
2317
through the regex checked the overall syntax, so we don't need to be very
2318
strict here. At the start of the loop, c contains the first byte of the
2324
if (utf8 && c > 127)
2325
{ /* Braces are required because the */
2326
GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2330
/* Inside \Q...\E everything is literal except \E */
2334
if (c == '\\' && ptr[1] == 'E')
2340
else goto LONE_SINGLE_CHARACTER;
2343
/* Handle POSIX class names. Perl allows a negation extension of the
2344
form [:^name:]. A square bracket that doesn't match the syntax is
2345
treated as a literal. We also recognize the POSIX constructions
2346
[.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2350
(ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2351
check_posix_syntax(ptr, &tempptr, cd))
2353
BOOL local_negate = FALSE;
2355
register const uschar *cbits = cd->cbits;
2366
local_negate = TRUE;
2370
posix_class = check_posix_name(ptr, tempptr - ptr);
2371
if (posix_class < 0)
2377
/* If matching is caseless, upper and lower are converted to
2378
alpha. This relies on the fact that the class table starts with
2379
alpha, lower, upper as the first 3 entries. */
2381
if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2384
/* Or into the map we are building up to 3 of the static class
2385
tables, or their negations. The [:blank:] class sets up the same
2386
chars as the [:space:] class (all white space). We remove the vertical
2387
white space chars afterwards. */
2390
for (i = 0; i < 3; i++)
2392
BOOL blankclass = strncmp((char *)ptr, "blank", 5) == 0;
2393
int taboffset = posix_class_maps[posix_class + i];
2394
if (taboffset < 0) break;
2398
for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+taboffset];
2400
for (c = 0; c < 32; c++) classbits[c] &= ~cbits[c+taboffset];
2401
if (blankclass) classbits[1] |= 0x3c;
2405
for (c = 0; c < 32; c++) classbits[c] |= cbits[c+taboffset];
2406
if (blankclass) classbits[1] &= ~0x3c;
2411
class_charcount = 10; /* Set > 1; assumes more than 1 per class */
2412
continue; /* End of POSIX syntax handling */
2415
/* Backslash may introduce a single character, or it may introduce one
2416
of the specials, which just set a flag. Escaped items are checked for
2417
validity in the pre-compiling pass. The sequence \b is a special case.
2418
Inside a class (and only there) it is treated as backspace. Elsewhere
2419
it marks a word boundary. Other escapes have preset maps ready to
2420
or into the one we are building. We assume they have more than one
2421
character in them, so set class_charcount bigger than one. */
2425
c = check_escape(&ptr, errorptr, *brackets, options, TRUE);
2427
if (-c == ESC_b) c = '\b'; /* \b is backslash in a class */
2428
else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
2429
else if (-c == ESC_Q) /* Handle start of quoted string */
2431
if (ptr[1] == '\\' && ptr[2] == 'E')
2433
ptr += 2; /* avoid empty string */
2441
register const uschar *cbits = cd->cbits;
2442
class_charcount += 2; /* Greater than 1 is what matters */
2446
for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2450
for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2454
for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2458
for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2462
for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2463
classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2467
for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2468
classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
2476
int property = get_ucp(&ptr, &negated, errorptr);
2477
if (property < 0) goto FAILED;
2479
*class_utf8data++ = ((-c == ESC_p) != negated)?
2480
XCL_PROP : XCL_NOTPROP;
2481
*class_utf8data++ = property;
2482
class_charcount -= 2; /* Not a < 256 character */
2487
/* Unrecognized escapes are faulted if PCRE is running in its
2488
strict mode. By default, for compatibility with Perl, they are
2489
treated as literals. */
2492
if ((options & PCRE_EXTRA) != 0)
2497
c = *ptr; /* The final character */
2498
class_charcount -= 2; /* Undo the default count from above */
2502
/* Fall through if we have a single character (c >= 0). This may be
2503
> 256 in UTF-8 mode. */
2505
} /* End of backslash handling */
2507
/* A single character may be followed by '-' to form a range. However,
2508
Perl does not permit ']' to be the end of the range. A '-' character
2509
here is treated as a literal. */
2511
if (ptr[1] == '-' && ptr[2] != ']')
2518
{ /* Braces are required because the */
2519
GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
2523
d = *ptr; /* Not UTF-8 mode */
2525
/* The second part of a range can be a single-character escape, but
2526
not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2527
in such circumstances. */
2531
const uschar *oldptr = ptr;
2532
d = check_escape(&ptr, errorptr, *brackets, options, TRUE);
2534
/* \b is backspace; \X is literal X; any other special means the '-'
2539
if (d == -ESC_b) d = '\b';
2540
else if (d == -ESC_X) d = 'X'; else
2543
goto LONE_SINGLE_CHARACTER; /* A few lines below */
2548
/* The check that the two values are in the correct order happens in
2549
the pre-pass. Optimize one-character ranges */
2551
if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
2553
/* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
2554
matching, we have to use an XCLASS with extra data items. Caseless
2555
matching for characters > 127 is available only if UCP support is
2559
if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
2563
/* With UCP support, we can find the other case equivalents of
2564
the relevant characters. There may be several ranges. Optimize how
2565
they fit with the basic range. */
2568
if ((options & PCRE_CASELESS) != 0)
2573
while (get_othercase_range(&cc, origd, &occ, &ocd))
2575
if (occ >= c && ocd <= d) continue; /* Skip embedded ranges */
2577
if (occ < c && ocd >= c - 1) /* Extend the basic range */
2578
{ /* if there is overlap, */
2579
c = occ; /* noting that if occ < c */
2580
continue; /* we can't have ocd > d */
2581
} /* because a subrange is */
2582
if (ocd > d && occ <= d + 1) /* always shorter than */
2583
{ /* the basic range. */
2590
*class_utf8data++ = XCL_SINGLE;
2594
*class_utf8data++ = XCL_RANGE;
2595
class_utf8data += ord2utf8(occ, class_utf8data);
2597
class_utf8data += ord2utf8(ocd, class_utf8data);
2600
#endif /* SUPPORT_UCP */
2602
/* Now record the original range, possibly modified for UCP caseless
2603
overlapping ranges. */
2605
*class_utf8data++ = XCL_RANGE;
2606
class_utf8data += ord2utf8(c, class_utf8data);
2607
class_utf8data += ord2utf8(d, class_utf8data);
2609
/* With UCP support, we are done. Without UCP support, there is no
2610
caseless matching for UTF-8 characters > 127; we can use the bit map
2611
for the smaller ones. */
2614
continue; /* With next character in the class */
2616
if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
2618
/* Adjust upper limit and fall through to set up the map */
2622
#endif /* SUPPORT_UCP */
2624
#endif /* SUPPORT_UTF8 */
2626
/* We use the bit map for all cases when not in UTF-8 mode; else
2627
ranges that lie entirely within 0-127 when there is UCP support; else
2628
for partial ranges without UCP support. */
2632
classbits[c/8] |= (1 << (c&7));
2633
if ((options & PCRE_CASELESS) != 0)
2635
int uc = cd->fcc[c]; /* flip case */
2636
classbits[uc/8] |= (1 << (uc&7));
2638
class_charcount++; /* in case a one-char range */
2642
continue; /* Go get the next char in the class */
2645
/* Handle a lone single character - we can get here for a normal
2646
non-escape char, or after \ that introduces a single character or for an
2647
apparent range that isn't. */
2649
LONE_SINGLE_CHARACTER:
2651
/* Handle a character that cannot go in the bit map */
2654
if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
2657
*class_utf8data++ = XCL_SINGLE;
2658
class_utf8data += ord2utf8(c, class_utf8data);
2661
if ((options & PCRE_CASELESS) != 0)
2665
if (ucp_findchar(c, &chartype, &othercase) >= 0 && othercase > 0)
2667
*class_utf8data++ = XCL_SINGLE;
2668
class_utf8data += ord2utf8(othercase, class_utf8data);
2671
#endif /* SUPPORT_UCP */
2675
#endif /* SUPPORT_UTF8 */
2677
/* Handle a single-byte character */
2679
classbits[c/8] |= (1 << (c&7));
2680
if ((options & PCRE_CASELESS) != 0)
2682
c = cd->fcc[c]; /* flip case */
2683
classbits[c/8] |= (1 << (c&7));
2690
/* Loop until ']' reached; the check for end of string happens inside the
2691
loop. This "while" is the end of the "do" above. */
2693
while ((c = *(++ptr)) != ']' || inescq);
2695
/* If class_charcount is 1, we saw precisely one character whose value is
2696
less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
2697
can optimize the negative case only if there were no characters >= 128
2698
because OP_NOT and the related opcodes like OP_NOTSTAR operate on
2699
single-bytes only. This is an historical hangover. Maybe one day we can
2700
tidy these opcodes to handle multi-byte characters.
2702
The optimization throws away the bit map. We turn the item into a
2703
1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
2704
that OP_NOT does not support multibyte characters. In the positive case, it
2705
can cause firstbyte to be set. Otherwise, there can be no first char if
2706
this item is first, whatever repeat count may follow. In the case of
2707
reqbyte, save the previous value for reinstating. */
2710
if (class_charcount == 1 &&
2712
(!class_utf8 && (!negate_class || class_lastchar < 128))))
2715
if (class_charcount == 1)
2718
zeroreqbyte = reqbyte;
2720
/* The OP_NOT opcode works on one-byte characters only. */
2724
if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2725
zerofirstbyte = firstbyte;
2727
*code++ = class_lastchar;
2731
/* For a single, positive character, get the value into mcbuffer, and
2732
then we can handle this with the normal one-character code. */
2735
if (utf8 && class_lastchar > 127)
2736
mclength = ord2utf8(class_lastchar, mcbuffer);
2740
mcbuffer[0] = class_lastchar;
2744
} /* End of 1-char optimization */
2746
/* The general case - not the one-char optimization. If this is the first
2747
thing in the branch, there can be no first char setting, whatever the
2748
repeat count. Any reqbyte setting must remain unchanged after any kind of
2751
if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2752
zerofirstbyte = firstbyte;
2753
zeroreqbyte = reqbyte;
2755
/* If there are characters with values > 255, we have to compile an
2756
extended class, with its own opcode. If there are no characters < 256,
2757
we can omit the bitmap. */
2762
*class_utf8data++ = XCL_END; /* Marks the end of extra data */
2763
*code++ = OP_XCLASS;
2765
*code = negate_class? XCL_NOT : 0;
2767
/* If the map is required, install it, and move on to the end of
2770
if (class_charcount > 0)
2773
memcpy(code, classbits, 32);
2774
code = class_utf8data;
2777
/* If the map is not required, slide down the extra data. */
2781
int len = class_utf8data - (code + 33);
2782
memmove(code + 1, code + 33, len);
2786
/* Now fill in the complete length of the item */
2788
PUT(previous, 1, code - previous);
2789
break; /* End of class handling */
2793
/* If there are no characters > 255, negate the 32-byte map if necessary,
2794
and copy it into the code vector. If this is the first thing in the branch,
2795
there can be no first char setting, whatever the repeat count. Any reqbyte
2796
setting must remain unchanged after any kind of repeat. */
2800
*code++ = OP_NCLASS;
2801
for (c = 0; c < 32; c++) code[c] = ~classbits[c];
2806
memcpy(code, classbits, 32);
2811
/* Various kinds of repeat; '{' is not necessarily a quantifier, but this
2812
has been tested above. */
2815
if (!is_quantifier) goto NORMAL_CHAR;
2816
ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr);
2817
if (*errorptr != NULL) goto FAILED;
2835
if (previous == NULL)
2841
if (repeat_min == 0)
2843
firstbyte = zerofirstbyte; /* Adjust for zero repeat */
2844
reqbyte = zeroreqbyte; /* Ditto */
2847
/* Remember whether this is a variable length repeat */
2849
reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
2851
op_type = 0; /* Default single-char op codes */
2852
possessive_quantifier = FALSE; /* Default not possessive quantifier */
2854
/* Save start of previous item, in case we have to move it up to make space
2855
for an inserted OP_ONCE for the additional '+' extension. */
2857
tempcode = previous;
2859
/* If the next character is '+', we have a possessive quantifier. This
2860
implies greediness, whatever the setting of the PCRE_UNGREEDY option.
2861
If the next character is '?' this is a minimizing repeat, by default,
2862
but if PCRE_UNGREEDY is set, it works the other way round. We change the
2863
repeat type to the non-default. */
2867
repeat_type = 0; /* Force greedy */
2868
possessive_quantifier = TRUE;
2871
else if (ptr[1] == '?')
2873
repeat_type = greedy_non_default;
2876
else repeat_type = greedy_default;
2878
/* If previous was a recursion, we need to wrap it inside brackets so that
2879
it can be replicated if necessary. */
2881
if (*previous == OP_RECURSE)
2883
memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);
2884
code += 1 + LINK_SIZE;
2886
PUT(previous, 1, code - previous);
2888
PUT(code, 1, code - previous);
2889
code += 1 + LINK_SIZE;
2892
/* If previous was a character match, abolish the item and generate a
2893
repeat item instead. If a char item has a minimum of more than one, ensure
2894
that it is set in reqbyte - it might not be if a sequence such as x{3} is
2895
the first thing in a branch because the x will have gone into firstbyte
2898
if (*previous == OP_CHAR || *previous == OP_CHARNC)
2900
/* Deal with UTF-8 characters that take up more than one byte. It's
2901
easier to write this out separately than try to macrify it. Use c to
2902
hold the length of the character in bytes, plus 0x80 to flag that it's a
2903
length rather than a small character. */
2906
if (utf8 && (code[-1] & 0x80) != 0)
2908
uschar *lastchar = code - 1;
2909
while((*lastchar & 0xc0) == 0x80) lastchar--;
2910
c = code - lastchar; /* Length of UTF-8 character */
2911
memcpy(utf8_char, lastchar, c); /* Save the char */
2912
c |= 0x80; /* Flag c as a length */
2917
/* Handle the case of a single byte - either with no UTF8 support, or
2918
with UTF-8 disabled, or for a UTF-8 character < 128. */
2922
if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
2925
goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
2928
/* If previous was a single negated character ([^a] or similar), we use
2929
one of the special opcodes, replacing it. The code is shared with single-
2930
character repeats by setting op_type to add a suitable offset into
2931
repeat_type. OP_NOT is currently used only for single-byte chars. */
2933
else if (*previous == OP_NOT)
2935
op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
2937
goto OUTPUT_SINGLE_REPEAT;
2940
/* If previous was a character type match (\d or similar), abolish it and
2941
create a suitable repeat item. The code is shared with single-character
2942
repeats by setting op_type to add a suitable offset into repeat_type. Note
2943
that the Unicode property types will be present only when SUPPORT_UCP is
2944
defined, but we don't wrap the little bits of code here because it just
2945
makes it horribly messy. */
2947
else if (*previous < OP_EODN)
2951
op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
2954
OUTPUT_SINGLE_REPEAT:
2955
prop_type = (*previous == OP_PROP || *previous == OP_NOTPROP)?
2959
code = previous; /* Usually overwrite previous item */
2961
/* If the maximum is zero then the minimum must also be zero; Perl allows
2962
this case, so we do too - by simply omitting the item altogether. */
2964
if (repeat_max == 0) goto END_REPEAT;
2966
/* All real repeats make it impossible to handle partial matching (maybe
2967
one day we will be able to remove this restriction). */
2969
if (repeat_max != 1) cd->nopartial = TRUE;
2971
/* Combine the op_type with the repeat_type */
2973
repeat_type += op_type;
2975
/* A minimum of zero is handled either as the special case * or ?, or as
2976
an UPTO, with the maximum given. */
2978
if (repeat_min == 0)
2980
if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
2981
else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
2984
*code++ = OP_UPTO + repeat_type;
2985
PUT2INC(code, 0, repeat_max);
2989
/* A repeat minimum of 1 is optimized into some special cases. If the
2990
maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
2991
left in place and, if the maximum is greater than 1, we use OP_UPTO with
2992
one less than the maximum. */
2994
else if (repeat_min == 1)
2996
if (repeat_max == -1)
2997
*code++ = OP_PLUS + repeat_type;
3000
code = oldcode; /* leave previous item in place */
3001
if (repeat_max == 1) goto END_REPEAT;
3002
*code++ = OP_UPTO + repeat_type;
3003
PUT2INC(code, 0, repeat_max - 1);
3007
/* The case {n,n} is just an EXACT, while the general case {n,m} is
3008
handled as an EXACT followed by an UPTO. */
3012
*code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3013
PUT2INC(code, 0, repeat_min);
3015
/* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3016
we have to insert the character for the previous code. For a repeated
3017
Unicode property match, there is an extra byte that defines the
3018
required property. In UTF-8 mode, long characters have their length in
3019
c, with the 0x80 bit as a flag. */
3024
if (utf8 && c >= 128)
3026
memcpy(code, utf8_char, c & 7);
3033
if (prop_type >= 0) *code++ = prop_type;
3035
*code++ = OP_STAR + repeat_type;
3038
/* Else insert an UPTO if the max is greater than the min, again
3039
preceded by the character, for the previously inserted code. */
3041
else if (repeat_max != repeat_min)
3044
if (utf8 && c >= 128)
3046
memcpy(code, utf8_char, c & 7);
3052
if (prop_type >= 0) *code++ = prop_type;
3053
repeat_max -= repeat_min;
3054
*code++ = OP_UPTO + repeat_type;
3055
PUT2INC(code, 0, repeat_max);
3059
/* The character or character type itself comes last in all cases. */
3062
if (utf8 && c >= 128)
3064
memcpy(code, utf8_char, c & 7);
3071
/* For a repeated Unicode property match, there is an extra byte that
3072
defines the required property. */
3075
if (prop_type >= 0) *code++ = prop_type;
3079
/* If previous was a character class or a back reference, we put the repeat
3080
stuff after it, but just skip the item if the repeat was {0,0}. */
3082
else if (*previous == OP_CLASS ||
3083
*previous == OP_NCLASS ||
3085
*previous == OP_XCLASS ||
3087
*previous == OP_REF)
3089
if (repeat_max == 0)
3095
/* All real repeats make it impossible to handle partial matching (maybe
3096
one day we will be able to remove this restriction). */
3098
if (repeat_max != 1) cd->nopartial = TRUE;
3100
if (repeat_min == 0 && repeat_max == -1)
3101
*code++ = OP_CRSTAR + repeat_type;
3102
else if (repeat_min == 1 && repeat_max == -1)
3103
*code++ = OP_CRPLUS + repeat_type;
3104
else if (repeat_min == 0 && repeat_max == 1)
3105
*code++ = OP_CRQUERY + repeat_type;
3108
*code++ = OP_CRRANGE + repeat_type;
3109
PUT2INC(code, 0, repeat_min);
3110
if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
3111
PUT2INC(code, 0, repeat_max);
3115
/* If previous was a bracket group, we may have to replicate it in certain
3118
else if (*previous >= OP_BRA || *previous == OP_ONCE ||
3119
*previous == OP_COND)
3123
int len = code - previous;
3124
uschar *bralink = NULL;
3126
/* If the maximum repeat count is unlimited, find the end of the bracket
3127
by scanning through from the start, and compute the offset back to it
3128
from the current code pointer. There may be an OP_OPT setting following
3129
the final KET, so we can't find the end just by going back from the code
3132
if (repeat_max == -1)
3134
register uschar *ket = previous;
3135
do ket += GET(ket, 1); while (*ket != OP_KET);
3136
ketoffset = code - ket;
3139
/* The case of a zero minimum is special because of the need to stick
3140
OP_BRAZERO in front of it, and because the group appears once in the
3141
data, whereas in other cases it appears the minimum number of times. For
3142
this reason, it is simplest to treat this case separately, as otherwise
3143
the code gets far too messy. There are several special subcases when the
3146
if (repeat_min == 0)
3148
/* If the maximum is also zero, we just omit the group from the output
3151
if (repeat_max == 0)
3157
/* If the maximum is 1 or unlimited, we just have to stick in the
3158
BRAZERO and do no more at this point. However, we do need to adjust
3159
any OP_RECURSE calls inside the group that refer to the group itself or
3160
any internal group, because the offset is from the start of the whole
3161
regex. Temporarily terminate the pattern while doing this. */
3163
if (repeat_max <= 1)
3166
adjust_recurse(previous, 1, utf8, cd);
3167
memmove(previous+1, previous, len);
3169
*previous++ = OP_BRAZERO + repeat_type;
3172
/* If the maximum is greater than 1 and limited, we have to replicate
3173
in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3174
The first one has to be handled carefully because it's the original
3175
copy, which has to be moved up. The remainder can be handled by code
3176
that is common with the non-zero minimum case below. We have to
3177
adjust the value of repeat_max, since one less copy is required. Once
3178
again, we may have to adjust any OP_RECURSE calls inside the group. */
3184
adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd);
3185
memmove(previous + 2 + LINK_SIZE, previous, len);
3186
code += 2 + LINK_SIZE;
3187
*previous++ = OP_BRAZERO + repeat_type;
3188
*previous++ = OP_BRA;
3190
/* We chain together the bracket offset fields that have to be
3191
filled in later when the ends of the brackets are reached. */
3193
offset = (bralink == NULL)? 0 : previous - bralink;
3195
PUTINC(previous, 0, offset);
3201
/* If the minimum is greater than zero, replicate the group as many
3202
times as necessary, and adjust the maximum to the number of subsequent
3203
copies that we need. If we set a first char from the group, and didn't
3204
set a required char, copy the latter from the former. */
3210
if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3211
for (i = 1; i < repeat_min; i++)
3213
memcpy(code, previous, len);
3217
if (repeat_max > 0) repeat_max -= repeat_min;
3220
/* This code is common to both the zero and non-zero minimum cases. If
3221
the maximum is limited, it replicates the group in a nested fashion,
3222
remembering the bracket starts on a stack. In the case of a zero minimum,
3223
the first one was set up above. In all cases the repeat_max now specifies
3224
the number of additional copies needed. */
3226
if (repeat_max >= 0)
3228
for (i = repeat_max - 1; i >= 0; i--)
3230
*code++ = OP_BRAZERO + repeat_type;
3232
/* All but the final copy start a new nesting, maintaining the
3233
chain of brackets outstanding. */
3239
offset = (bralink == NULL)? 0 : code - bralink;
3241
PUTINC(code, 0, offset);
3244
memcpy(code, previous, len);
3248
/* Now chain through the pending brackets, and fill in their length
3249
fields (which are holding the chain links pro tem). */
3251
while (bralink != NULL)
3254
int offset = code - bralink + 1;
3255
uschar *bra = code - offset;
3256
oldlinkoffset = GET(bra, 1);
3257
bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
3259
PUTINC(code, 0, offset);
3260
PUT(bra, 1, offset);
3264
/* If the maximum is unlimited, set a repeater in the final copy. We
3265
can't just offset backwards from the current code point, because we
3266
don't know if there's been an options resetting after the ket. The
3267
correct offset was computed above. */
3269
else code[-ketoffset] = OP_KETRMAX + repeat_type;
3272
/* Else there's some kind of shambles */
3280
/* If the character following a repeat is '+', we wrap the entire repeated
3281
item inside OP_ONCE brackets. This is just syntactic sugar, taken from
3282
Sun's Java package. The repeated item starts at tempcode, not at previous,
3283
which might be the first part of a string whose (former) last char we
3284
repeated. However, we don't support '+' after a greediness '?'. */
3286
if (possessive_quantifier)
3288
int len = code - tempcode;
3289
memmove(tempcode + 1+LINK_SIZE, tempcode, len);
3290
code += 1 + LINK_SIZE;
3291
len += 1 + LINK_SIZE;
3292
tempcode[0] = OP_ONCE;
3294
PUTINC(code, 0, len);
3295
PUT(tempcode, 1, len);
3298
/* In all cases we no longer have a previous item. We also set the
3299
"follows varying string" flag for subsequently encountered reqbytes if
3300
it isn't already set and we have just passed a varying length item. */
3304
cd->req_varyopt |= reqvary;
3308
/* Start of nested bracket sub-expression, or comment or lookahead or
3309
lookbehind or option setting or condition. First deal with special things
3310
that can come after a bracket; all are introduced by ?, and the appearance
3311
of any of them means that this is not a referencing group. They were
3312
checked for validity in the first pass over the string, so we don't have to
3313
check for syntax errors here. */
3316
newoptions = options;
3319
if (*(++ptr) == '?')
3326
case '#': /* Comment; skip to ket */
3328
while (*ptr != ')') ptr++;
3331
case ':': /* Non-extracting bracket */
3337
bravalue = OP_COND; /* Conditional group */
3339
/* Condition to test for recursion */
3343
code[1+LINK_SIZE] = OP_CREF;
3344
PUT2(code, 2+LINK_SIZE, CREF_RECURSE);
3349
/* Condition to test for a numbered subpattern match. We know that
3350
if a digit follows ( then there will just be digits until ) because
3351
the syntax was checked in the first pass. */
3353
else if ((digitab[ptr[1]] && ctype_digit) != 0)
3355
int condref; /* Don't amalgamate; some compilers */
3356
condref = *(++ptr) - '0'; /* grumble at autoincrement in declaration */
3357
while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
3364
code[1+LINK_SIZE] = OP_CREF;
3365
PUT2(code, 2+LINK_SIZE, condref);
3368
/* For conditions that are assertions, we just fall through, having
3369
set bravalue above. */
3372
case '=': /* Positive lookahead */
3373
bravalue = OP_ASSERT;
3377
case '!': /* Negative lookahead */
3378
bravalue = OP_ASSERT_NOT;
3382
case '<': /* Lookbehinds */
3385
case '=': /* Positive lookbehind */
3386
bravalue = OP_ASSERTBACK;
3390
case '!': /* Negative lookbehind */
3391
bravalue = OP_ASSERTBACK_NOT;
3397
case '>': /* One-time brackets */
3402
case 'C': /* Callout - may be followed by digits; */
3403
previous_callout = code; /* Save for later completion */
3404
after_manual_callout = 1; /* Skip one item before completing */
3405
*code++ = OP_CALLOUT; /* Already checked that the terminating */
3406
{ /* closing parenthesis is present. */
3408
while ((digitab[*(++ptr)] & ctype_digit) != 0)
3409
n = n * 10 + *ptr - '0';
3416
PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
3417
PUT(code, LINK_SIZE, 0); /* Default length */
3418
code += 2 * LINK_SIZE;
3423
case 'P': /* Named subpattern handling */
3424
if (*(++ptr) == '<') /* Definition */
3427
uschar *slot = cd->name_table;
3428
const uschar *name; /* Don't amalgamate; some compilers */
3429
name = ++ptr; /* grumble at autoincrement in declaration */
3431
while (*ptr++ != '>');
3432
namelen = ptr - name - 1;
3434
for (i = 0; i < cd->names_found; i++)
3436
int crc = memcmp(name, slot+2, namelen);
3439
if (slot[2+namelen] == 0)
3444
crc = -1; /* Current name is substring */
3448
memmove(slot + cd->name_entry_size, slot,
3449
(cd->names_found - i) * cd->name_entry_size);
3452
slot += cd->name_entry_size;
3455
PUT2(slot, 0, *brackets + 1);
3456
memcpy(slot + 2, name, namelen);
3457
slot[2+namelen] = 0;
3459
goto NUMBERED_GROUP;
3462
if (*ptr == '=' || *ptr == '>') /* Reference or recursion */
3466
const uschar *name = ptr;
3467
uschar *slot = cd->name_table;
3469
while (*ptr != ')') ptr++;
3470
namelen = ptr - name;
3472
for (i = 0; i < cd->names_found; i++)
3474
if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
3475
slot += cd->name_entry_size;
3477
if (i >= cd->names_found)
3483
recno = GET2(slot, 0);
3485
if (type == '>') goto HANDLE_RECURSION; /* A few lines below */
3487
/* Back reference */
3491
PUT2INC(code, 0, recno);
3492
cd->backref_map |= (recno < 32)? (1 << recno) : 1;
3493
if (recno > cd->top_backref) cd->top_backref = recno;
3497
/* Should never happen */
3500
case 'R': /* Pattern recursion */
3501
ptr++; /* Same as (?0) */
3504
/* Recursion or "subroutine" call */
3506
case '0': case '1': case '2': case '3': case '4':
3507
case '5': case '6': case '7': case '8': case '9':
3509
const uschar *called;
3511
while((digitab[*ptr] & ctype_digit) != 0)
3512
recno = recno * 10 + *ptr++ - '0';
3514
/* Come here from code above that handles a named recursion */
3520
/* Find the bracket that is being referenced. Temporarily end the
3521
regex in case it doesn't exist. */
3524
called = (recno == 0)?
3525
cd->start_code : find_bracket(cd->start_code, utf8, recno);
3533
/* If the subpattern is still open, this is a recursive call. We
3534
check to see if this is a left recursion that could loop for ever,
3535
and diagnose that case. */
3537
if (GET(called, 1) == 0 && could_be_empty(called, code, bcptr, utf8))
3543
/* Insert the recursion/subroutine item */
3546
PUT(code, 1, called - cd->start_code);
3547
code += 1 + LINK_SIZE;
3551
/* Character after (? not specially recognized */
3553
default: /* Option setting */
3557
while (*ptr != ')' && *ptr != ':')
3561
case '-': optset = &unset; break;
3563
case 'i': *optset |= PCRE_CASELESS; break;
3564
case 'm': *optset |= PCRE_MULTILINE; break;
3565
case 's': *optset |= PCRE_DOTALL; break;
3566
case 'x': *optset |= PCRE_EXTENDED; break;
3567
case 'U': *optset |= PCRE_UNGREEDY; break;
3568
case 'X': *optset |= PCRE_EXTRA; break;
3572
/* Set up the changed option bits, but don't change anything yet. */
3574
newoptions = (options | set) & (~unset);
3576
/* If the options ended with ')' this is not the start of a nested
3577
group with option changes, so the options change at this level. Compile
3578
code to change the ims options if this setting actually changes any of
3579
them. We also pass the new setting back so that it can be put at the
3580
start of any following branches, and when this group ends (if we are in
3581
a group), a resetting item can be compiled.
3583
Note that if this item is right at the start of the pattern, the
3584
options will have been abstracted and made global, so there will be no
3585
change to compile. */
3589
if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
3592
*code++ = newoptions & PCRE_IMS;
3595
/* Change options at this level, and pass them back for use
3596
in subsequent branches. Reset the greedy defaults and the case
3597
value for firstbyte and reqbyte. */
3599
*optionsptr = options = newoptions;
3600
greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
3601
greedy_non_default = greedy_default ^ 1;
3602
req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
3604
previous = NULL; /* This item can't be repeated */
3605
continue; /* It is complete */
3608
/* If the options ended with ':' we are heading into a nested group
3609
with possible change of options. Such groups are non-capturing and are
3610
not assertions of any kind. All we need to do is skip over the ':';
3611
the newoptions value is handled below. */
3618
/* If PCRE_NO_AUTO_CAPTURE is set, all unadorned brackets become
3619
non-capturing and behave like (?:...) brackets */
3621
else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
3626
/* Else we have a referencing group; adjust the opcode. If the bracket
3627
number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and
3628
arrange for the true number to follow later, in an OP_BRANUMBER item. */
3633
if (++(*brackets) > EXTRACT_BASIC_MAX)
3635
bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;
3636
code[1+LINK_SIZE] = OP_BRANUMBER;
3637
PUT2(code, 2+LINK_SIZE, *brackets);
3640
else bravalue = OP_BRA + *brackets;
3643
/* Process nested bracketed re. Assertions may not be repeated, but other
3644
kinds can be. We copy code into a non-register variable in order to be able
3645
to pass its address because some compilers complain otherwise. Pass in a
3646
new setting for the ims options if they have changed. */
3648
previous = (bravalue >= OP_ONCE)? code : NULL;
3651
tempreqvary = cd->req_varyopt; /* Save value before bracket */
3654
newoptions, /* The complete new option state */
3655
options & PCRE_IMS, /* The previous ims option state */
3656
brackets, /* Extracting bracket count */
3657
&tempcode, /* Where to put code (updated) */
3658
&ptr, /* Input pointer (updated) */
3659
errorptr, /* Where to put an error message */
3660
(bravalue == OP_ASSERTBACK ||
3661
bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
3662
skipbytes, /* Skip over OP_COND/OP_BRANUMBER */
3663
&subfirstbyte, /* For possible first char */
3664
&subreqbyte, /* For possible last char */
3665
bcptr, /* Current branch chain */
3666
cd)) /* Tables block */
3669
/* At the end of compiling, code is still pointing to the start of the
3670
group, while tempcode has been updated to point past the end of the group
3671
and any option resetting that may follow it. The pattern pointer (ptr)
3672
is on the bracket. */
3674
/* If this is a conditional bracket, check that there are no more than
3675
two branches in the group. */
3677
else if (bravalue == OP_COND)
3686
while (*tc != OP_KET);
3694
/* If there is just one branch, we must not make use of its firstbyte or
3695
reqbyte, because this is equivalent to an empty second branch. */
3697
if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
3700
/* Handle updating of the required and first characters. Update for normal
3701
brackets of all kinds, and conditions with two branches (see code above).
3702
If the bracket is followed by a quantifier with zero repeat, we have to
3703
back off. Hence the definition of zeroreqbyte and zerofirstbyte outside the
3704
main loop so that they can be accessed for the back off. */
3706
zeroreqbyte = reqbyte;
3707
zerofirstbyte = firstbyte;
3708
groupsetfirstbyte = FALSE;
3710
if (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_COND)
3712
/* If we have not yet set a firstbyte in this branch, take it from the
3713
subpattern, remembering that it was set here so that a repeat of more
3714
than one can replicate it as reqbyte if necessary. If the subpattern has
3715
no firstbyte, set "none" for the whole branch. In both cases, a zero
3716
repeat forces firstbyte to "none". */
3718
if (firstbyte == REQ_UNSET)
3720
if (subfirstbyte >= 0)
3722
firstbyte = subfirstbyte;
3723
groupsetfirstbyte = TRUE;
3725
else firstbyte = REQ_NONE;
3726
zerofirstbyte = REQ_NONE;
3729
/* If firstbyte was previously set, convert the subpattern's firstbyte
3730
into reqbyte if there wasn't one, using the vary flag that was in
3731
existence beforehand. */
3733
else if (subfirstbyte >= 0 && subreqbyte < 0)
3734
subreqbyte = subfirstbyte | tempreqvary;
3736
/* If the subpattern set a required byte (or set a first byte that isn't
3737
really the first byte - see above), set it. */
3739
if (subreqbyte >= 0) reqbyte = subreqbyte;
3742
/* For a forward assertion, we take the reqbyte, if set. This can be
3743
helpful if the pattern that follows the assertion doesn't set a different
3744
char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
3745
for an assertion, however, because it leads to incorrect effects for patterns
3746
such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
3747
of a firstbyte. This is overcome by a scan at the end if there's no
3748
firstbyte, looking for an asserted first char. */
3750
else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
3752
/* Now update the main code pointer to the end of the group. */
3756
/* Error if hit end of pattern */
3765
/* Check \ for being a real metacharacter; if not, fall through and handle
3766
it as a data character at the start of a string. Escape items are checked
3767
for validity in the pre-compiling pass. */
3771
c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
3773
/* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
3774
are arranged to be the negation of the corresponding OP_values. For the
3775
back references, the values are ESC_REF plus the reference number. Only
3776
back references and those types that consume a character may be repeated.
3777
We can test for values between ESC_b and ESC_Z for the latter; this may
3778
have to change if any new ones are ever created. */
3782
if (-c == ESC_Q) /* Handle start of quoted string */
3784
if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
3789
/* For metasequences that actually match a character, we disable the
3790
setting of a first character if it hasn't already been set. */
3792
if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
3793
firstbyte = REQ_NONE;
3795
/* Set values to reset to if this is followed by a zero repeat. */
3797
zerofirstbyte = firstbyte;
3798
zeroreqbyte = reqbyte;
3800
/* Back references are handled specially */
3804
int number = -c - ESC_REF;
3807
PUT2INC(code, 0, number);
3810
/* So are Unicode property matches, if supported. We know that get_ucp
3811
won't fail because it was tested in the pre-pass. */
3814
else if (-c == ESC_P || -c == ESC_p)
3817
int value = get_ucp(&ptr, &negated, errorptr);
3819
*code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
3824
/* For the rest, we can obtain the OP value by negating the escape
3829
previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
3835
/* We have a data character whose value is in c. In UTF-8 mode it may have
3836
a value > 127. We set its representation in the length/buffer, and then
3837
handle it as a data character. */
3840
if (utf8 && c > 127)
3841
mclength = ord2utf8(c, mcbuffer);
3852
/* Handle a literal character. It is guaranteed not to be whitespace or #
3853
when the extended flag is set. If we are in UTF-8 mode, it may be a
3854
multi-byte literal character. */
3862
if (utf8 && (c & 0xc0) == 0xc0)
3864
while ((ptr[1] & 0xc0) == 0x80)
3865
mcbuffer[mclength++] = *(++ptr);
3869
/* At this point we have the character's bytes in mcbuffer, and the length
3870
in mclength. When not in UTF-8 mode, the length is always 1. */
3874
*code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
3875
for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
3877
/* Set the first and required bytes appropriately. If no previous first
3878
byte, set it from this character, but revert to none on a zero repeat.
3879
Otherwise, leave the firstbyte value alone, and don't change it on a zero
3882
if (firstbyte == REQ_UNSET)
3884
zerofirstbyte = REQ_NONE;
3885
zeroreqbyte = reqbyte;
3887
/* If the character is more than one byte long, we can set firstbyte
3888
only if it is not to be matched caselessly. */
3890
if (mclength == 1 || req_caseopt == 0)
3892
firstbyte = mcbuffer[0] | req_caseopt;
3893
if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
3895
else firstbyte = reqbyte = REQ_NONE;
3898
/* firstbyte was previously set; we can set reqbyte only if the length is
3899
1 or the matching is caseful. */
3903
zerofirstbyte = firstbyte;
3904
zeroreqbyte = reqbyte;
3905
if (mclength == 1 || req_caseopt == 0)
3906
reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
3909
break; /* End of literal character handling */
3911
} /* end of big loop */
3913
/* Control never reaches here by falling through, only by a goto for all the
3914
error states. Pass back the position in the pattern so that it can be displayed
3915
to the user for diagnosing the error. */
3925
/*************************************************
3926
* Compile sequence of alternatives *
3927
*************************************************/
3929
/* On entry, ptr is pointing past the bracket character, but on return
3930
it points to the closing bracket, or vertical bar, or end of string.
3931
The code variable is pointing at the byte into which the BRA operator has been
3932
stored. If the ims options are changed at the start (for a (?ims: group) or
3933
during any branch, we need to insert an OP_OPT item at the start of every
3934
following branch to ensure they get set correctly at run time, and also pass
3935
the new options into every subsequent branch compile.
3938
options option bits, including any changes for this subpattern
3939
oldims previous settings of ims option bits
3940
brackets -> int containing the number of extracting brackets used
3941
codeptr -> the address of the current code pointer
3942
ptrptr -> the address of the current pattern pointer
3943
errorptr -> pointer to error message
3944
lookbehind TRUE if this is a lookbehind assertion
3945
skipbytes skip this many bytes at start (for OP_COND, OP_BRANUMBER)
3946
firstbyteptr place to put the first required character, or a negative number
3947
reqbyteptr place to put the last required character, or a negative number
3948
bcptr pointer to the chain of currently open branches
3949
cd points to the data block with tables pointers etc.
3951
Returns: TRUE on success
3955
compile_regex(int options, int oldims, int *brackets, uschar **codeptr,
3956
const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int skipbytes,
3957
int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
3959
const uschar *ptr = *ptrptr;
3960
uschar *code = *codeptr;
3961
uschar *last_branch = code;
3962
uschar *start_bracket = code;
3963
uschar *reverse_count = NULL;
3964
int firstbyte, reqbyte;
3965
int branchfirstbyte, branchreqbyte;
3971
firstbyte = reqbyte = REQ_UNSET;
3973
/* Offset is set zero to mark that this bracket is still open */
3976
code += 1 + LINK_SIZE + skipbytes;
3978
/* Loop for each alternative branch */
3982
/* Handle a change of ims options at the start of the branch */
3984
if ((options & PCRE_IMS) != oldims)
3987
*code++ = options & PCRE_IMS;
3990
/* Set up dummy OP_REVERSE if lookbehind assertion */
3994
*code++ = OP_REVERSE;
3995
reverse_count = code;
3999
/* Now compile the branch */
4001
if (!compile_branch(&options, brackets, &code, &ptr, errorptr,
4002
&branchfirstbyte, &branchreqbyte, &bc, cd))
4008
/* If this is the first branch, the firstbyte and reqbyte values for the
4009
branch become the values for the regex. */
4011
if (*last_branch != OP_ALT)
4013
firstbyte = branchfirstbyte;
4014
reqbyte = branchreqbyte;
4017
/* If this is not the first branch, the first char and reqbyte have to
4018
match the values from all the previous branches, except that if the previous
4019
value for reqbyte didn't have REQ_VARY set, it can still match, and we set
4020
REQ_VARY for the regex. */
4024
/* If we previously had a firstbyte, but it doesn't match the new branch,
4025
we have to abandon the firstbyte for the regex, but if there was previously
4026
no reqbyte, it takes on the value of the old firstbyte. */
4028
if (firstbyte >= 0 && firstbyte != branchfirstbyte)
4030
if (reqbyte < 0) reqbyte = firstbyte;
4031
firstbyte = REQ_NONE;
4034
/* If we (now or from before) have no firstbyte, a firstbyte from the
4035
branch becomes a reqbyte if there isn't a branch reqbyte. */
4037
if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
4038
branchreqbyte = branchfirstbyte;
4040
/* Now ensure that the reqbytes match */
4042
if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
4044
else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
4047
/* If lookbehind, check that this branch matches a fixed-length string,
4048
and put the length into the OP_REVERSE item. Temporarily mark the end of
4049
the branch with OP_END. */
4055
length = find_fixedlength(last_branch, options);
4056
DPRINTF(("fixed length = %d\n", length));
4059
*errorptr = (length == -2)? ERR36 : ERR25;
4063
PUT(reverse_count, 0, length);
4066
/* Reached end of expression, either ')' or end of pattern. Go back through
4067
the alternative branches and reverse the chain of offsets, with the field in
4068
the BRA item now becoming an offset to the first alternative. If there are
4069
no alternatives, it points to the end of the group. The length in the
4070
terminating ket is always the length of the whole bracketed item. If any of
4071
the ims options were changed inside the group, compile a resetting op-code
4072
following, except at the very end of the pattern. Return leaving the pointer
4073
at the terminating char. */
4077
int length = code - last_branch;
4080
int prev_length = GET(last_branch, 1);
4081
PUT(last_branch, 1, length);
4082
length = prev_length;
4083
last_branch -= length;
4087
/* Fill in the ket */
4090
PUT(code, 1, code - start_bracket);
4091
code += 1 + LINK_SIZE;
4093
/* Resetting option if needed */
4095
if ((options & PCRE_IMS) != oldims && *ptr == ')')
4101
/* Set values to pass back */
4105
*firstbyteptr = firstbyte;
4106
*reqbyteptr = reqbyte;
4110
/* Another branch follows; insert an "or" node. Its length field points back
4111
to the previous branch while the bracket remains open. At the end the chain
4112
is reversed. It's done like this so that the start of the bracket has a
4113
zero offset until it is closed, making it possible to detect recursion. */
4116
PUT(code, 1, code - last_branch);
4117
bc.current = last_branch = code;
4118
code += 1 + LINK_SIZE;
4121
/* Control never reaches here */
4127
/*************************************************
4128
* Check for anchored expression *
4129
*************************************************/
4131
/* Try to find out if this is an anchored regular expression. Consider each
4132
alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
4133
all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
4134
it's anchored. However, if this is a multiline pattern, then only OP_SOD
4135
counts, since OP_CIRC can match in the middle.
4137
We can also consider a regex to be anchored if OP_SOM starts all its branches.
4138
This is the code for \G, which means "match at start of match position, taking
4139
into account the match offset".
4141
A branch is also implicitly anchored if it starts with .* and DOTALL is set,
4142
because that will try the rest of the pattern at all possible matching points,
4143
so there is no point trying again.... er ....
4145
.... except when the .* appears inside capturing parentheses, and there is a
4146
subsequent back reference to those parentheses. We haven't enough information
4147
to catch that case precisely.
4149
At first, the best we could do was to detect when .* was in capturing brackets
4150
and the highest back reference was greater than or equal to that level.
4151
However, by keeping a bitmap of the first 31 back references, we can catch some
4152
of the more common cases more precisely.
4155
code points to start of expression (the bracket)
4156
options points to the options setting
4157
bracket_map a bitmap of which brackets we are inside while testing; this
4158
handles up to substring 31; after that we just have to take
4159
the less precise approach
4160
backref_map the back reference bitmap
4162
Returns: TRUE or FALSE
4166
is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
4167
unsigned int backref_map)
4170
const uschar *scode =
4171
first_significant_code(code + 1+LINK_SIZE, options, PCRE_MULTILINE, FALSE);
4172
register int op = *scode;
4174
/* Capturing brackets */
4180
if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
4181
new_map = bracket_map | ((op < 32)? (1 << op) : 1);
4182
if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
4185
/* Other brackets */
4187
else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
4189
if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
4192
/* .* is not anchored unless DOTALL is set and it isn't in brackets that
4193
are or may be referenced. */
4195
else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
4196
(*options & PCRE_DOTALL) != 0)
4198
if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
4201
/* Check for explicit anchoring */
4203
else if (op != OP_SOD && op != OP_SOM &&
4204
((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
4206
code += GET(code, 1);
4208
while (*code == OP_ALT); /* Loop for each alternative */
4214
/*************************************************
4215
* Check for starting with ^ or .* *
4216
*************************************************/
4218
/* This is called to find out if every branch starts with ^ or .* so that
4219
"first char" processing can be done to speed things up in multiline
4220
matching and for non-DOTALL patterns that start with .* (which must start at
4221
the beginning or after \n). As in the case of is_anchored() (see above), we
4222
have to take account of back references to capturing brackets that contain .*
4223
because in that case we can't make the assumption.
4226
code points to start of expression (the bracket)
4227
bracket_map a bitmap of which brackets we are inside while testing; this
4228
handles up to substring 31; after that we just have to take
4229
the less precise approach
4230
backref_map the back reference bitmap
4232
Returns: TRUE or FALSE
4236
is_startline(const uschar *code, unsigned int bracket_map,
4237
unsigned int backref_map)
4240
const uschar *scode = first_significant_code(code + 1+LINK_SIZE, NULL, 0,
4242
register int op = *scode;
4244
/* Capturing brackets */
4250
if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
4251
new_map = bracket_map | ((op < 32)? (1 << op) : 1);
4252
if (!is_startline(scode, new_map, backref_map)) return FALSE;
4255
/* Other brackets */
4257
else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
4258
{ if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
4260
/* .* means "start at start or after \n" if it isn't in brackets that
4261
may be referenced. */
4263
else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
4265
if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
4268
/* Check for explicit circumflex */
4270
else if (op != OP_CIRC) return FALSE;
4272
/* Move on to the next alternative */
4274
code += GET(code, 1);
4276
while (*code == OP_ALT); /* Loop for each alternative */
4282
/*************************************************
4283
* Check for asserted fixed first char *
4284
*************************************************/
4286
/* During compilation, the "first char" settings from forward assertions are
4287
discarded, because they can cause conflicts with actual literals that follow.
4288
However, if we end up without a first char setting for an unanchored pattern,
4289
it is worth scanning the regex to see if there is an initial asserted first
4290
char. If all branches start with the same asserted char, or with a bracket all
4291
of whose alternatives start with the same asserted char (recurse ad lib), then
4292
we return that char, otherwise -1.
4295
code points to start of expression (the bracket)
4296
options pointer to the options (used to check casing changes)
4297
inassert TRUE if in an assertion
4299
Returns: -1 or the fixed first char
4303
find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
4305
register int c = -1;
4308
const uschar *scode =
4309
first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
4310
register int op = *scode;
4312
if (op >= OP_BRA) op = OP_BRA;
4323
if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
4325
if (c < 0) c = d; else if (c != d) return -1;
4328
case OP_EXACT: /* Fall through */
4335
if (!inassert) return -1;
4339
if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
4341
else if (c != scode[1]) return -1;
4345
code += GET(code, 1);
4347
while (*code == OP_ALT);
4355
/*************************************************
4356
* Validate a UTF-8 string *
4357
*************************************************/
4359
/* This function is called (optionally) at the start of compile or match, to
4360
validate that a supposed UTF-8 string is actually valid. The early check means
4361
that subsequent code can assume it is dealing with a valid string. The check
4362
can be turned off for maximum performance, but the consequences of supplying
4363
an invalid string are then undefined.
4366
string points to the string
4367
length length of string, or -1 if the string is zero-terminated
4369
Returns: < 0 if the string is a valid UTF-8 string
4370
>= 0 otherwise; the value is the offset of the bad byte
4374
valid_utf8(const uschar *string, int length)
4376
register const uschar *p;
4380
for (p = string; *p != 0; p++);
4381
length = p - string;
4384
for (p = string; length-- > 0; p++)
4387
register int c = *p;
4388
if (c < 128) continue;
4389
if ((c & 0xc0) != 0xc0) return p - string;
4390
ab = utf8_table4[c & 0x3f]; /* Number of additional bytes */
4391
if (length < ab) return p - string;
4394
/* Check top bits in the second byte */
4395
if ((*(++p) & 0xc0) != 0x80) return p - string;
4397
/* Check for overlong sequences for each different length */
4400
/* Check for xx00 000x */
4402
if ((c & 0x3e) == 0) return p - string;
4403
continue; /* We know there aren't any more bytes to check */
4405
/* Check for 1110 0000, xx0x xxxx */
4407
if (c == 0xe0 && (*p & 0x20) == 0) return p - string;
4410
/* Check for 1111 0000, xx00 xxxx */
4412
if (c == 0xf0 && (*p & 0x30) == 0) return p - string;
4415
/* Check for 1111 1000, xx00 0xxx */
4417
if (c == 0xf8 && (*p & 0x38) == 0) return p - string;
4420
/* Check for leading 0xfe or 0xff, and then for 1111 1100, xx00 00xx */
4422
if (c == 0xfe || c == 0xff ||
4423
(c == 0xfc && (*p & 0x3c) == 0)) return p - string;
4427
/* Check for valid bytes after the 2nd, if any; all must start 10 */
4430
if ((*(++p) & 0xc0) != 0x80) return p - string;
4440
/*************************************************
4441
* Compile a Regular Expression *
4442
*************************************************/
4444
/* This function takes a string and returns a pointer to a block of store
4445
holding a compiled version of the expression.
4448
pattern the regular expression
4449
options various option bits
4450
errorptr pointer to pointer to error text
4451
erroroffset ptr offset in pattern where error was detected
4452
tables pointer to character tables or NULL
4454
Returns: pointer to compiled data block, or NULL on error,
4455
with errorptr and erroroffset set
4459
pcre_compile(const char *pattern, int options, const char **errorptr,
4460
int *erroroffset, const unsigned char *tables)
4463
int length = 1 + LINK_SIZE; /* For initial BRA plus length */
4465
int c, firstbyte, reqbyte;
4467
int branch_extra = 0;
4468
int branch_newextra;
4469
int item_count = -1;
4471
int max_name_size = 0;
4472
int lastitemlength = 0;
4477
BOOL inescq = FALSE;
4478
unsigned int brastackptr = 0;
4481
const uschar *codestart;
4483
compile_data compile_block;
4484
int brastack[BRASTACK_SIZE];
4485
uschar bralenstack[BRASTACK_SIZE];
4487
/* We can't pass back an error message if errorptr is NULL; I guess the best we
4488
can do is just return NULL. */
4490
if (errorptr == NULL) return NULL;
4493
/* However, we can give a message for this error */
4495
if (erroroffset == NULL)
4502
/* Can't support UTF8 unless PCRE has been compiled to include the code. */
4505
utf8 = (options & PCRE_UTF8) != 0;
4506
if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
4507
(*erroroffset = valid_utf8((uschar *)pattern, -1)) >= 0)
4513
if ((options & PCRE_UTF8) != 0)
4520
if ((options & ~PUBLIC_OPTIONS) != 0)
4526
/* Set up pointers to the individual character tables */
4528
if (tables == NULL) tables = pcre_default_tables;
4529
compile_block.lcc = tables + lcc_offset;
4530
compile_block.fcc = tables + fcc_offset;
4531
compile_block.cbits = tables + cbits_offset;
4532
compile_block.ctypes = tables + ctypes_offset;
4534
/* Maximum back reference and backref bitmap. This is updated for numeric
4535
references during the first pass, but for named references during the actual
4536
compile pass. The bitmap records up to 31 back references to help in deciding
4537
whether (.*) can be treated as anchored or not. */
4539
compile_block.top_backref = 0;
4540
compile_block.backref_map = 0;
4542
/* Reflect pattern for debugging output */
4544
DPRINTF(("------------------------------------------------------------------\n"));
4545
DPRINTF(("%s\n", pattern));
4547
/* The first thing to do is to make a pass over the pattern to compute the
4548
amount of store required to hold the compiled code. This does not have to be
4549
perfect as long as errors are overestimates. At the same time we can detect any
4550
flag settings right at the start, and extract them. Make an attempt to correct
4551
for any counted white space if an "extended" flag setting appears late in the
4552
pattern. We can't be so clever for #-comments. */
4554
ptr = (const uschar *)(pattern - 1);
4555
while ((c = *(++ptr)) != 0)
4562
/* If we are inside a \Q...\E sequence, all chars are literal */
4566
if ((options & PCRE_AUTO_CALLOUT) != 0) length += 2 + 2*LINK_SIZE;
4570
/* Otherwise, first check for ignored whitespace and comments */
4572
if ((options & PCRE_EXTENDED) != 0)
4574
if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
4577
/* The space before the ; is to avoid a warning on a silly compiler
4578
on the Macintosh. */
4579
while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
4585
item_count++; /* Is zero for the first non-comment item */
4587
/* Allow space for auto callout before every item except quantifiers. */
4589
if ((options & PCRE_AUTO_CALLOUT) != 0 &&
4590
c != '*' && c != '+' && c != '?' &&
4591
(c != '{' || !is_counted_repeat(ptr + 1)))
4592
length += 2 + 2*LINK_SIZE;
4596
/* A backslashed item may be an escaped data character or it may be a
4600
c = check_escape(&ptr, errorptr, bracount, options, FALSE);
4601
if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4603
lastitemlength = 1; /* Default length of last item for repeats */
4605
if (c >= 0) /* Data character */
4607
length += 2; /* For a one-byte character */
4610
if (utf8 && c > 127)
4613
for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
4614
if (c <= utf8_table1[i]) break;
4616
lastitemlength += i;
4623
/* If \Q, enter "literal" mode */
4631
/* \X is supported only if Unicode property support is compiled */
4637
goto PCRE_ERROR_RETURN;
4641
/* \P and \p are for Unicode properties, but only when the support has
4642
been compiled. Each item needs 2 bytes. */
4644
else if (-c == ESC_P || -c == ESC_p)
4650
if (get_ucp(&ptr, &negated, errorptr) < 0) goto PCRE_ERROR_RETURN;
4654
goto PCRE_ERROR_RETURN;
4658
/* Other escapes need one byte */
4662
/* A back reference needs an additional 2 bytes, plus either one or 5
4663
bytes for a repeat. We also need to keep the value of the highest
4668
int refnum = -c - ESC_REF;
4669
compile_block.backref_map |= (refnum < 32)? (1 << refnum) : 1;
4670
if (refnum > compile_block.top_backref)
4671
compile_block.top_backref = refnum;
4672
length += 2; /* For single back reference */
4673
if (ptr[1] == '{' && is_counted_repeat(ptr+2))
4675
ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
4676
if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4677
if ((min == 0 && (max == 1 || max == -1)) ||
4678
(min == 1 && max == -1))
4681
if (ptr[1] == '?') ptr++;
4686
case '^': /* Single-byte metacharacters */
4693
case '*': /* These repeats won't be after brackets; */
4694
case '+': /* those are handled separately */
4697
goto POSESSIVE; /* A few lines below */
4699
/* This covers the cases of braced repeats after a single char, metachar,
4700
class, or back reference. */
4703
if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
4704
ptr = read_repeat_counts(ptr+1, &min, &max, errorptr);
4705
if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4707
/* These special cases just insert one extra opcode */
4709
if ((min == 0 && (max == 1 || max == -1)) ||
4710
(min == 1 && max == -1))
4713
/* These cases might insert additional copies of a preceding character. */
4719
length -= lastitemlength; /* Uncount the original char or metachar */
4720
if (min > 0) length += 3 + lastitemlength;
4722
length += lastitemlength + ((max > 0)? 3 : 1);
4725
if (ptr[1] == '?') ptr++; /* Needs no extra length */
4727
POSESSIVE: /* Test for possessive quantifier */
4731
length += 2 + 2*LINK_SIZE; /* Allow for atomic brackets */
4735
/* An alternation contains an offset to the next branch or ket. If any ims
4736
options changed in the previous branch(es), and/or if we are in a
4737
lookbehind assertion, extra space will be needed at the start of the
4738
branch. This is handled by branch_extra. */
4741
length += 1 + LINK_SIZE + branch_extra;
4744
/* A character class uses 33 characters provided that all the character
4745
values are less than 256. Otherwise, it uses a bit map for low valued
4746
characters, and individual items for others. Don't worry about character
4747
types that aren't allowed in classes - they'll get picked up during the
4748
compile. A character class that contains only one single-byte character
4749
uses 2 or 3 bytes, depending on whether it is negated or not. Notice this
4750
where we can. (In UTF-8 mode we can do this only for chars < 128.) */
4753
if (*(++ptr) == '^')
4755
class_optcount = 10; /* Greater than one */
4758
else class_optcount = 0;
4764
/* Written as a "do" so that an initial ']' is taken as data */
4768
/* Inside \Q...\E everything is literal except \E */
4772
if (*ptr != '\\' || ptr[1] != 'E') goto GET_ONE_CHARACTER;
4778
/* Outside \Q...\E, check for escapes */
4782
c = check_escape(&ptr, errorptr, bracount, options, TRUE);
4783
if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4785
/* \b is backspace inside a class; \X is literal */
4787
if (-c == ESC_b) c = '\b';
4788
else if (-c == ESC_X) c = 'X';
4790
/* \Q enters quoting mode */
4792
else if (-c == ESC_Q)
4798
/* Handle escapes that turn into characters */
4800
if (c >= 0) goto NON_SPECIAL_CHARACTER;
4802
/* Escapes that are meta-things. The normal ones just affect the
4803
bit map, but Unicode properties require an XCLASS extended item. */
4807
class_optcount = 10; /* \d, \s etc; make sure > 1 */
4809
if (-c == ESC_p || -c == ESC_P)
4814
length += LINK_SIZE + 2;
4822
/* Check the syntax for POSIX stuff. The bits we actually handle are
4823
checked during the real compile phase. */
4825
else if (*ptr == '[' && check_posix_syntax(ptr, &ptr, &compile_block))
4828
class_optcount = 10; /* Make sure > 1 */
4831
/* Anything else increments the possible optimization count. We have to
4832
detect ranges here so that we can compute the number of extra ranges for
4833
caseless wide characters when UCP support is available. If there are wide
4834
characters, we are going to have to use an XCLASS, even for single
4847
GETCHARLEN(c, ptr, extra);
4855
/* Come here from handling \ above when it escapes to a char value */
4857
NON_SPECIAL_CHARACTER:
4863
uschar const *hyptr = ptr++;
4867
d = check_escape(&ptr, errorptr, bracount, options, TRUE);
4868
if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4869
if (-d == ESC_b) d = '\b'; /* backspace */
4870
else if (-d == ESC_X) d = 'X'; /* literal X in a class */
4872
else if (ptr[1] != 0 && ptr[1] != ']')
4879
GETCHARLEN(d, ptr, extra);
4886
if (d < 0) ptr = hyptr; /* go back to hyphen as data */
4889
/* If d >= 0 we have a range. In UTF-8 mode, if the end is > 255, or >
4890
127 for caseless matching, we will need to use an XCLASS. */
4894
class_optcount = 10; /* Ensure > 1 */
4898
goto PCRE_ERROR_RETURN;
4902
if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
4905
if (!class_utf8) /* Allow for XCLASS overhead */
4908
length += LINK_SIZE + 2;
4912
/* If we have UCP support, find out how many extra ranges are
4913
needed to map the other case of characters within this range. We
4914
have to mimic the range optimization here, because extending the
4915
range upwards might push d over a boundary that makes it use
4916
another byte in the UTF-8 representation. */
4918
if ((options & PCRE_CASELESS) != 0)
4923
while (get_othercase_range(&cc, origd, &occ, &ocd))
4925
if (occ >= c && ocd <= d) continue; /* Skip embedded */
4927
if (occ < c && ocd >= c - 1) /* Extend the basic range */
4928
{ /* if there is overlap, */
4929
c = occ; /* noting that if occ < c */
4930
continue; /* we can't have ocd > d */
4931
} /* because a subrange is */
4932
if (ocd > d && occ <= d + 1) /* always shorter than */
4933
{ /* the basic range. */
4938
/* An extra item is needed */
4940
length += 1 + ord2utf8(occ, buffer) +
4941
((occ == ocd)? 0 : ord2utf8(ocd, buffer));
4944
#endif /* SUPPORT_UCP */
4946
/* The length of the (possibly extended) range */
4948
length += 1 + ord2utf8(c, buffer) + ord2utf8(d, buffer);
4950
#endif /* SUPPORT_UTF8 */
4954
/* We have a single character. There is nothing to be done unless we
4955
are in UTF-8 mode. If the char is > 255, or > 127 when caseless, we must
4956
allow for an XCL_SINGLE item, doubled for caselessness if there is UCP
4962
if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
4965
class_optcount = 10; /* Ensure > 1 */
4966
if (!class_utf8) /* Allow for XCLASS overhead */
4969
length += LINK_SIZE + 2;
4972
length += (((options & PCRE_CASELESS) != 0)? 2 : 1) *
4973
(1 + ord2utf8(c, buffer));
4974
#else /* SUPPORT_UCP */
4975
length += 1 + ord2utf8(c, buffer);
4976
#endif /* SUPPORT_UCP */
4978
#endif /* SUPPORT_UTF8 */
4982
while (*(++ptr) != 0 && (inescq || *ptr != ']')); /* Concludes "do" above */
4984
if (*ptr == 0) /* Missing terminating ']' */
4987
goto PCRE_ERROR_RETURN;
4990
/* We can optimize when there was only one optimizable character. Repeats
4991
for positive and negated single one-byte chars are handled by the general
4992
code. Here, we handle repeats for the class opcodes. */
4994
if (class_optcount == 1) length += 3; else
4998
/* A repeat needs either 1 or 5 bytes. If it is a possessive quantifier,
4999
we also need extra for wrapping the whole thing in a sub-pattern. */
5001
if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2))
5003
ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
5004
if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
5005
if ((min == 0 && (max == 1 || max == -1)) ||
5006
(min == 1 && max == -1))
5012
length += 2 + 2*LINK_SIZE;
5014
else if (ptr[1] == '?') ptr++;
5019
/* Brackets may be genuine groups or special things */
5022
branch_newextra = 0;
5023
bracket_length = 1 + LINK_SIZE;
5025
/* Handle special forms of bracket, which all start (? */
5034
/* Skip over comments entirely */
5037
while (*ptr != 0 && *ptr != ')') ptr++;
5041
goto PCRE_ERROR_RETURN;
5045
/* Non-referencing groups and lookaheads just move the pointer on, and
5046
then behave like a non-special bracket, except that they don't increment
5047
the count of extracting brackets. Ditto for the "once only" bracket,
5048
which is in Perl from version 5.005. */
5057
/* (?R) specifies a recursive call to the regex, which is an extension
5058
to provide the facility which can be obtained by (?p{perl-code}) in
5059
Perl 5.6. In Perl 5.8 this has become (??{perl-code}).
5061
From PCRE 4.00, items such as (?3) specify subroutine-like "calls" to
5062
the appropriate numbered brackets. This includes both recursive and
5063
non-recursive calls. (?R) is now synonymous with (?0). */
5068
case '0': case '1': case '2': case '3': case '4':
5069
case '5': case '6': case '7': case '8': case '9':
5072
while ((digitab[*(++ptr)] & ctype_digit) != 0);
5076
goto PCRE_ERROR_RETURN;
5078
length += 1 + LINK_SIZE;
5080
/* If this item is quantified, it will get wrapped inside brackets so
5081
as to use the code for quantified brackets. We jump down and use the
5082
code that handles this for real brackets. */
5084
if (ptr[1] == '+' || ptr[1] == '*' || ptr[1] == '?' || ptr[1] == '{')
5086
length += 2 + 2 * LINK_SIZE; /* to make bracketed */
5087
duplength = 5 + 3 * LINK_SIZE;
5088
goto HANDLE_QUANTIFIED_BRACKETS;
5092
/* (?C) is an extension which provides "callout" - to provide a bit of
5093
the functionality of the Perl (?{...}) feature. An optional number may
5094
follow (default is zero). */
5098
while ((digitab[*(++ptr)] & ctype_digit) != 0);
5102
goto PCRE_ERROR_RETURN;
5104
length += 2 + 2*LINK_SIZE;
5107
/* Named subpatterns are an extension copied from Python */
5113
const uschar *p; /* Don't amalgamate; some compilers */
5114
p = ++ptr; /* grumble at autoincrement in declaration */
5115
while ((compile_block.ctypes[*ptr] & ctype_word) != 0) ptr++;
5119
goto PCRE_ERROR_RETURN;
5122
if (ptr - p > max_name_size) max_name_size = (ptr - p);
5126
if (*ptr == '=' || *ptr == '>')
5128
while ((compile_block.ctypes[*(++ptr)] & ctype_word) != 0);
5132
goto PCRE_ERROR_RETURN;
5137
/* Unknown character after (?P */
5140
goto PCRE_ERROR_RETURN;
5142
/* Lookbehinds are in Perl from version 5.005 */
5146
if (*ptr == '=' || *ptr == '!')
5148
branch_newextra = 1 + LINK_SIZE;
5149
length += 1 + LINK_SIZE; /* For the first branch */
5153
goto PCRE_ERROR_RETURN;
5155
/* Conditionals are in Perl from version 5.005. The bracket must either
5156
be followed by a number (for bracket reference) or by an assertion
5157
group, or (a PCRE extension) by 'R' for a recursion test. */
5160
if (ptr[3] == 'R' && ptr[4] == ')')
5165
else if ((digitab[ptr[3]] & ctype_digit) != 0)
5169
while ((digitab[*ptr] & ctype_digit) != 0) ptr++;
5173
goto PCRE_ERROR_RETURN;
5176
else /* An assertion must follow */
5178
ptr++; /* Can treat like ':' as far as spacing is concerned */
5179
if (ptr[2] != '?' ||
5180
(ptr[3] != '=' && ptr[3] != '!' && ptr[3] != '<') )
5182
ptr += 2; /* To get right offset in message */
5184
goto PCRE_ERROR_RETURN;
5189
/* Else loop checking valid options until ) is met. Anything else is an
5190
error. If we are without any brackets, i.e. at top level, the settings
5191
act as if specified in the options, so massage the options immediately.
5192
This is for backward compatibility with Perl 5.004. */
5205
*optset |= PCRE_CASELESS;
5209
*optset |= PCRE_MULTILINE;
5213
*optset |= PCRE_DOTALL;
5217
*optset |= PCRE_EXTENDED;
5221
*optset |= PCRE_EXTRA;
5225
*optset |= PCRE_UNGREEDY;
5232
/* A termination by ')' indicates an options-setting-only item; if
5233
this is at the very start of the pattern (indicated by item_count
5234
being zero), we use it to set the global options. This is helpful
5235
when analyzing the pattern for first characters, etc. Otherwise
5236
nothing is done here and it is handled during the compiling
5239
[Historical note: Up to Perl 5.8, options settings at top level
5240
were always global settings, wherever they appeared in the pattern.
5241
That is, they were equivalent to an external setting. From 5.8
5242
onwards, they apply only to what follows (which is what you might
5246
if (item_count == 0)
5248
options = (options | set) & (~unset);
5249
set = unset = 0; /* To save length */
5250
item_count--; /* To allow for several */
5255
/* A termination by ':' indicates the start of a nested group with
5256
the given options set. This is again handled at compile time, but
5257
we must allow for compiled space if any of the ims options are
5258
set. We also have to allow for resetting space at the end of
5259
the group, which is why 4 is added to the length and not just 2.
5260
If there are several changes of options within the same group, this
5261
will lead to an over-estimate on the length, but this shouldn't
5262
matter very much. We also have to allow for resetting options at
5263
the start of any alternations, which we do by setting
5264
branch_newextra to 2. Finally, we record whether the case-dependent
5265
flag ever changes within the regex. This is used by the "required
5269
if (((set|unset) & PCRE_IMS) != 0)
5272
branch_newextra = 2;
5273
if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED;
5277
/* Unrecognized option character */
5281
goto PCRE_ERROR_RETURN;
5285
/* If we hit a closing bracket, that's it - this is a freestanding
5286
option-setting. We need to ensure that branch_extra is updated if
5287
necessary. The only values branch_newextra can have here are 0 or 2.
5288
If the value is 2, then branch_extra must either be 2 or 5, depending
5289
on whether this is a lookbehind group or not. */
5294
if (branch_newextra == 2 &&
5295
(branch_extra == 0 || branch_extra == 1+LINK_SIZE))
5296
branch_extra += branch_newextra;
5300
/* If options were terminated by ':' control comes here. Fall through
5301
to handle the group below. */
5305
/* Extracting brackets must be counted so we can process escapes in a
5306
Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to
5307
need an additional 3 bytes of store per extracting bracket. However, if
5308
PCRE_NO_AUTO_CAPTURE is set, unadorned brackets become non-capturing, so we
5309
must leave the count alone (it will always be zero). */
5311
else if ((options & PCRE_NO_AUTO_CAPTURE) == 0)
5314
if (bracount > EXTRACT_BASIC_MAX) bracket_length += 3;
5317
/* Save length for computing whole length at end if there's a repeat that
5318
requires duplication of the group. Also save the current value of
5319
branch_extra, and start the new group with the new value. If non-zero, this
5320
will either be 2 for a (?imsx: group, or 3 for a lookbehind assertion. */
5322
if (brastackptr >= sizeof(brastack)/sizeof(int))
5325
goto PCRE_ERROR_RETURN;
5328
bralenstack[brastackptr] = branch_extra;
5329
branch_extra = branch_newextra;
5331
brastack[brastackptr++] = length;
5332
length += bracket_length;
5335
/* Handle ket. Look for subsequent max/min; for certain sets of values we
5336
have to replicate this bracket up to that many times. If brastackptr is
5337
0 this is an unmatched bracket which will generate an error, but take care
5338
not to try to access brastack[-1] when computing the length and restoring
5339
the branch_extra value. */
5342
length += 1 + LINK_SIZE;
5343
if (brastackptr > 0)
5345
duplength = length - brastack[--brastackptr];
5346
branch_extra = bralenstack[brastackptr];
5350
/* The following code is also used when a recursion such as (?3) is
5351
followed by a quantifier, because in that case, it has to be wrapped inside
5352
brackets so that the quantifier works. The value of duplength must be
5353
set before arrival. */
5355
HANDLE_QUANTIFIED_BRACKETS:
5357
/* Leave ptr at the final char; for read_repeat_counts this happens
5358
automatically; for the others we need an increment. */
5360
if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2))
5362
ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
5363
if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
5365
else if (c == '*') { min = 0; max = -1; ptr++; }
5366
else if (c == '+') { min = 1; max = -1; ptr++; }
5367
else if (c == '?') { min = 0; max = 1; ptr++; }
5368
else { min = 1; max = 1; }
5370
/* If the minimum is zero, we have to allow for an OP_BRAZERO before the
5371
group, and if the maximum is greater than zero, we have to replicate
5372
maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
5378
if (max > 0) length += (max - 1) * (duplength + 3 + 2*LINK_SIZE);
5381
/* When the minimum is greater than zero, we have to replicate up to
5382
minval-1 times, with no additions required in the copies. Then, if there
5383
is a limited maximum we have to replicate up to maxval-1 times allowing
5384
for a BRAZERO item before each optional copy and nesting brackets for all
5385
but one of the optional copies. */
5389
length += (min - 1) * duplength;
5390
if (max > min) /* Need this test as max=-1 means no limit */
5391
length += (max - min) * (duplength + 3 + 2*LINK_SIZE)
5392
- (2 + 2*LINK_SIZE);
5395
/* Allow space for once brackets for "possessive quantifier" */
5400
length += 2 + 2*LINK_SIZE;
5404
/* Non-special character. It won't be space or # in extended mode, so it is
5405
always a genuine character. If we are in a \Q...\E sequence, check for the
5406
end; if not, we have a literal. */
5411
if (inescq && c == '\\' && ptr[1] == 'E')
5418
length += 2; /* For a one-byte character */
5419
lastitemlength = 1; /* Default length of last item for repeats */
5421
/* In UTF-8 mode, check for additional bytes. */
5424
if (utf8 && (c & 0xc0) == 0xc0)
5426
while ((ptr[1] & 0xc0) == 0x80) /* Can't flow over the end */
5427
{ /* because the end is marked */
5428
lastitemlength++; /* by a zero byte. */
5439
length += 2 + LINK_SIZE; /* For final KET and END */
5441
if ((options & PCRE_AUTO_CALLOUT) != 0)
5442
length += 2 + 2*LINK_SIZE; /* For final callout */
5444
if (length > MAX_PATTERN_SIZE)
5450
/* Compute the size of data block needed and get it, either from malloc or
5451
externally provided function. */
5453
size = length + sizeof(real_pcre) + name_count * (max_name_size + 3);
5454
re = (real_pcre *)(pcre_malloc)(size);
5462
/* Put in the magic number, and save the sizes, options, and character table
5463
pointer. NULL is used for the default character tables. The nullpad field is at
5464
the end; it's there to help in the case when a regex compiled on a system with
5465
4-byte pointers is run on another with 8-byte pointers. */
5467
re->magic_number = MAGIC_NUMBER;
5469
re->options = options;
5470
re->dummy1 = re->dummy2 = 0;
5471
re->name_table_offset = sizeof(real_pcre);
5472
re->name_entry_size = max_name_size + 3;
5473
re->name_count = name_count;
5474
re->tables = (tables == pcre_default_tables)? NULL : tables;
5477
/* The starting points of the name/number translation table and of the code are
5478
passed around in the compile data block. */
5480
compile_block.names_found = 0;
5481
compile_block.name_entry_size = max_name_size + 3;
5482
compile_block.name_table = (uschar *)re + re->name_table_offset;
5483
codestart = compile_block.name_table + re->name_entry_size * re->name_count;
5484
compile_block.start_code = codestart;
5485
compile_block.start_pattern = (const uschar *)pattern;
5486
compile_block.req_varyopt = 0;
5487
compile_block.nopartial = FALSE;
5489
/* Set up a starting, non-extracting bracket, then compile the expression. On
5490
error, *errorptr will be set non-NULL, so we don't need to look at the result
5491
of the function here. */
5493
ptr = (const uschar *)pattern;
5494
code = (uschar *)codestart;
5497
(void)compile_regex(options, options & PCRE_IMS, &bracount, &code, &ptr,
5498
errorptr, FALSE, 0, &firstbyte, &reqbyte, NULL, &compile_block);
5499
re->top_bracket = bracount;
5500
re->top_backref = compile_block.top_backref;
5502
if (compile_block.nopartial) re->options |= PCRE_NOPARTIAL;
5504
/* If not reached end of pattern on success, there's an excess bracket. */
5506
if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
5508
/* Fill in the terminating state and check for disastrous overflow, but
5509
if debugging, leave the test till after things are printed out. */
5514
if (code - codestart > length) *errorptr = ERR23;
5517
/* Give an error if there's a back reference to a non-existent capturing
5520
if (re->top_backref > re->top_bracket) *errorptr = ERR15;
5522
/* Failed to compile, or error while post-processing */
5524
if (*errorptr != NULL)
5528
*erroroffset = ptr - (const uschar *)pattern;
5532
/* If the anchored option was not passed, set the flag if we can determine that
5533
the pattern is anchored by virtue of ^ characters or \A or anything else (such
5534
as starting with .* when DOTALL is set).
5536
Otherwise, if we know what the first character has to be, save it, because that
5537
speeds up unanchored matches no end. If not, see if we can set the
5538
PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
5539
start with ^, and also when all branches start with .* for non-DOTALL matches.
5542
if ((options & PCRE_ANCHORED) == 0)
5544
int temp_options = options;
5545
if (is_anchored(codestart, &temp_options, 0, compile_block.backref_map))
5546
re->options |= PCRE_ANCHORED;
5550
firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
5551
if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */
5553
int ch = firstbyte & 255;
5554
re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
5555
compile_block.fcc[ch] == ch)? ch : firstbyte;
5556
re->options |= PCRE_FIRSTSET;
5558
else if (is_startline(codestart, 0, compile_block.backref_map))
5559
re->options |= PCRE_STARTLINE;
5563
/* For an anchored pattern, we use the "required byte" only if it follows a
5564
variable length item in the regex. Remove the caseless flag for non-caseable
5568
((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
5570
int ch = reqbyte & 255;
5571
re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
5572
compile_block.fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
5573
re->options |= PCRE_REQCHSET;
5576
/* Print out the compiled data for debugging */
5580
printf("Length = %d top_bracket = %d top_backref = %d\n",
5581
length, re->top_bracket, re->top_backref);
5583
if (re->options != 0)
5585
printf("%s%s%s%s%s%s%s%s%s%s\n",
5586
((re->options & PCRE_NOPARTIAL) != 0)? "nopartial " : "",
5587
((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
5588
((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
5589
((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "",
5590
((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
5591
((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
5592
((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
5593
((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
5594
((re->options & PCRE_EXTRA) != 0)? "extra " : "",
5595
((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
5598
if ((re->options & PCRE_FIRSTSET) != 0)
5600
int ch = re->first_byte & 255;
5601
const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)? "" : " (caseless)";
5602
if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
5603
else printf("First char = \\x%02x%s\n", ch, caseless);
5606
if ((re->options & PCRE_REQCHSET) != 0)
5608
int ch = re->req_byte & 255;
5609
const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)? "" : " (caseless)";
5610
if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
5611
else printf("Req char = \\x%02x%s\n", ch, caseless);
5614
print_internals(re, stdout);
5616
/* This check is done here in the debugging case so that the code that
5617
was compiled can be seen. */
5619
if (code - codestart > length)
5623
*erroroffset = ptr - (uschar *)pattern;
5633
/*************************************************
5634
* Match a back-reference *
5635
*************************************************/
5637
/* If a back reference hasn't been set, the length that is passed is greater
5638
than the number of characters left in the string, so the match fails.
5641
offset index into the offset vector
5642
eptr points into the subject
5643
length length to be matched
5644
md points to match data block
5647
Returns: TRUE if matched
5651
match_ref(int offset, register const uschar *eptr, int length, match_data *md,
5652
unsigned long int ims)
5654
const uschar *p = md->start_subject + md->offset_vector[offset];
5657
if (eptr >= md->end_subject)
5658
printf("matching subject <null>");
5661
printf("matching subject ");
5662
pchars(eptr, length, TRUE, md);
5664
printf(" against backref ");
5665
pchars(p, length, FALSE, md);
5669
/* Always fail if not enough characters left */
5671
if (length > md->end_subject - eptr) return FALSE;
5673
/* Separate the caseless case for speed */
5675
if ((ims & PCRE_CASELESS) != 0)
5677
while (length-- > 0)
5678
if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
5681
{ while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
5688
/*************************************************
5689
* Match character against an XCLASS *
5690
*************************************************/
5692
/* This function is called from within the XCLASS code below, to match a
5693
character against an extended class which might match values > 255.
5697
data points to the flag byte of the XCLASS data
5699
Returns: TRUE if character matches, else FALSE
5703
match_xclass(int c, const uschar *data)
5706
BOOL negated = (*data & XCL_NOT) != 0;
5708
/* Character values < 256 are matched against a bitmap, if one is present. If
5709
not, we still carry on, because there may be ranges that start below 256 in the
5714
if ((*data & XCL_MAP) != 0 && (data[1 + c/8] & (1 << (c&7))) != 0)
5715
return !negated; /* char found */
5718
/* First skip the bit map if present. Then match against the list of Unicode
5719
properties or large chars or ranges that end with a large char. We won't ever
5720
encounter XCL_PROP or XCL_NOTPROP when UCP support is not compiled. */
5722
if ((*data++ & XCL_MAP) != 0) data += 32;
5724
while ((t = *data++) != XCL_END)
5727
if (t == XCL_SINGLE)
5729
GETCHARINC(x, data);
5730
if (c == x) return !negated;
5732
else if (t == XCL_RANGE)
5734
GETCHARINC(x, data);
5735
GETCHARINC(y, data);
5736
if (c >= x && c <= y) return !negated;
5740
else /* XCL_PROP & XCL_NOTPROP */
5742
int chartype, othercase;
5743
int rqdtype = *data++;
5744
int category = ucp_findchar(c, &chartype, &othercase);
5747
if ((rqdtype - 128 == category) == (t == XCL_PROP)) return !negated;
5751
if ((rqdtype == chartype) == (t == XCL_PROP)) return !negated;
5754
#endif /* SUPPORT_UCP */
5757
return negated; /* char did not match */
5762
/***************************************************************************
5763
****************************************************************************
5764
RECURSION IN THE match() FUNCTION
5766
The match() function is highly recursive. Some regular expressions can cause
5767
it to recurse thousands of times. I was writing for Unix, so I just let it
5768
call itself recursively. This uses the stack for saving everything that has
5769
to be saved for a recursive call. On Unix, the stack can be large, and this
5772
It turns out that on non-Unix systems there are problems with programs that
5773
use a lot of stack. (This despite the fact that every last chip has oodles
5774
of memory these days, and techniques for extending the stack have been known
5775
for decades.) So....
5777
There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
5778
calls by keeping local variables that need to be preserved in blocks of memory
5779
obtained from malloc instead of on the stack. Macros are used to
5780
achieve this so that the actual code doesn't look very different to what it
5782
****************************************************************************
5783
***************************************************************************/
5786
/* These versions of the macros use the stack, as normal */
5789
#define REGISTER register
5790
#define RMATCH(rx,ra,rb,rc,rd,re,rf,rg) rx = match(ra,rb,rc,rd,re,rf,rg)
5791
#define RRETURN(ra) return ra
5795
/* These versions of the macros manage a private stack on the heap. Note
5796
that the rd argument of RMATCH isn't actually used. It's the md argument of
5797
match(), which never changes. */
5801
#define RMATCH(rx,ra,rb,rc,rd,re,rf,rg)\
5803
heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
5804
if (setjmp(frame->Xwhere) == 0)\
5806
newframe->Xeptr = ra;\
5807
newframe->Xecode = rb;\
5808
newframe->Xoffset_top = rc;\
5809
newframe->Xims = re;\
5810
newframe->Xeptrb = rf;\
5811
newframe->Xflags = rg;\
5812
newframe->Xprevframe = frame;\
5814
DPRINTF(("restarting from line %d\n", __LINE__));\
5819
DPRINTF(("longjumped back to line %d\n", __LINE__));\
5820
frame = md->thisframe;\
5821
rx = frame->Xresult;\
5825
#define RRETURN(ra)\
5827
heapframe *newframe = frame;\
5828
frame = newframe->Xprevframe;\
5829
(pcre_stack_free)(newframe);\
5832
frame->Xresult = ra;\
5833
md->thisframe = frame;\
5834
longjmp(frame->Xwhere, 1);\
5840
/* Structure for remembering the local variables in a private frame */
5842
typedef struct heapframe {
5843
struct heapframe *Xprevframe;
5845
/* Function arguments that may change */
5847
const uschar *Xeptr;
5848
const uschar *Xecode;
5854
/* Function local variables */
5856
const uschar *Xcallpat;
5857
const uschar *Xcharptr;
5858
const uschar *Xdata;
5859
const uschar *Xnext;
5861
const uschar *Xprev;
5862
const uschar *Xsaved_eptr;
5864
recursion_info Xnew_recursive;
5871
unsigned long int Xoriginal_ims;
5875
int Xprop_fail_result;
5878
int Xprop_othercase;
5879
int Xprop_test_against;
5880
int *Xprop_test_variable;
5892
int Xsave_capture_last;
5893
int Xsave_offset1, Xsave_offset2, Xsave_offset3;
5894
int Xstacksave[REC_STACK_SAVE_MAX];
5898
/* Place to pass back result, and where to jump back to */
5908
/***************************************************************************
5909
***************************************************************************/
5913
/*************************************************
5914
* Match from current position *
5915
*************************************************/
5917
/* On entry ecode points to the first opcode, and eptr to the first character
5918
in the subject string, while eptrb holds the value of eptr at the start of the
5919
last bracketed group - used for breaking infinite loops matching zero-length
5920
strings. This function is called recursively in many circumstances. Whenever it
5921
returns a negative (error) response, the outer incarnation must also return the
5924
Performance note: It might be tempting to extract commonly used fields from the
5925
md structure (e.g. utf8, end_subject) into individual variables to improve
5926
performance. Tests using gcc on a SPARC disproved this; in the first case, it
5927
made performance worse.
5930
eptr pointer in subject
5931
ecode position in code
5932
offset_top current top pointer
5933
md pointer to "static" info for the match
5934
ims current /i, /m, and /s options
5935
eptrb pointer to chain of blocks containing eptr at start of
5936
brackets - for testing for empty matches
5938
match_condassert - this is an assertion condition
5939
match_isgroup - this is the start of a bracketed group
5941
Returns: MATCH_MATCH if matched ) these values are >= 0
5942
MATCH_NOMATCH if failed to match )
5943
a negative PCRE_ERROR_xxx value if aborted by an error condition
5944
(e.g. stopped by recursion limit)
5948
match(REGISTER const uschar *eptr, REGISTER const uschar *ecode,
5949
int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
5952
/* These variables do not need to be preserved over recursion in this function,
5953
so they can be ordinary variables in all cases. Mark them with "register"
5954
because they are used a lot in loops. */
5956
register int rrc; /* Returns from recursive calls */
5957
register int i; /* Used for loops not involving calls to RMATCH() */
5958
register int c; /* Character values not kept over RMATCH() calls */
5960
/* When recursion is not being used, all "local" variables that have to be
5961
preserved over calls to RMATCH() are part of a "frame" which is obtained from
5962
heap storage. Set up the top-level frame here; others are obtained from the
5963
heap whenever RMATCH() does a "recursion". See the macro definitions above. */
5966
heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
5967
frame->Xprevframe = NULL; /* Marks the top level */
5969
/* Copy in the original argument variables */
5971
frame->Xeptr = eptr;
5972
frame->Xecode = ecode;
5973
frame->Xoffset_top = offset_top;
5975
frame->Xeptrb = eptrb;
5976
frame->Xflags = flags;
5978
/* This is where control jumps back to to effect "recursion" */
5982
/* Macros make the argument variables come from the current frame */
5984
#define eptr frame->Xeptr
5985
#define ecode frame->Xecode
5986
#define offset_top frame->Xoffset_top
5987
#define ims frame->Xims
5988
#define eptrb frame->Xeptrb
5989
#define flags frame->Xflags
5991
/* Ditto for the local variables */
5994
#define charptr frame->Xcharptr
5996
#define callpat frame->Xcallpat
5997
#define data frame->Xdata
5998
#define next frame->Xnext
5999
#define pp frame->Xpp
6000
#define prev frame->Xprev
6001
#define saved_eptr frame->Xsaved_eptr
6003
#define new_recursive frame->Xnew_recursive
6005
#define cur_is_word frame->Xcur_is_word
6006
#define condition frame->Xcondition
6007
#define minimize frame->Xminimize
6008
#define prev_is_word frame->Xprev_is_word
6010
#define original_ims frame->Xoriginal_ims
6013
#define prop_type frame->Xprop_type
6014
#define prop_fail_result frame->Xprop_fail_result
6015
#define prop_category frame->Xprop_category
6016
#define prop_chartype frame->Xprop_chartype
6017
#define prop_othercase frame->Xprop_othercase
6018
#define prop_test_against frame->Xprop_test_against
6019
#define prop_test_variable frame->Xprop_test_variable
6022
#define ctype frame->Xctype
6023
#define fc frame->Xfc
6024
#define fi frame->Xfi
6025
#define length frame->Xlength
6026
#define max frame->Xmax
6027
#define min frame->Xmin
6028
#define number frame->Xnumber
6029
#define offset frame->Xoffset
6030
#define op frame->Xop
6031
#define save_capture_last frame->Xsave_capture_last
6032
#define save_offset1 frame->Xsave_offset1
6033
#define save_offset2 frame->Xsave_offset2
6034
#define save_offset3 frame->Xsave_offset3
6035
#define stacksave frame->Xstacksave
6037
#define newptrb frame->Xnewptrb
6039
/* When recursion is being used, local variables are allocated on the stack and
6040
get preserved during recursion in the normal way. In this environment, fi and
6041
i, and fc and c, can be the same variables. */
6048
#ifdef SUPPORT_UTF8 /* Many of these variables are used ony */
6049
const uschar *charptr; /* small blocks of the code. My normal */
6050
#endif /* style of coding would have declared */
6051
const uschar *callpat; /* them within each of those blocks. */
6052
const uschar *data; /* However, in order to accommodate the */
6053
const uschar *next; /* version of this code that uses an */
6054
const uschar *pp; /* external "stack" implemented on the */
6055
const uschar *prev; /* heap, it is easier to declare them */
6056
const uschar *saved_eptr; /* all here, so the declarations can */
6057
/* be cut out in a block. The only */
6058
recursion_info new_recursive; /* declarations within blocks below are */
6059
/* for variables that do not have to */
6060
BOOL cur_is_word; /* be preserved over a recursive call */
6061
BOOL condition; /* to RMATCH(). */
6065
unsigned long int original_ims;
6069
int prop_fail_result;
6073
int prop_test_against;
6074
int *prop_test_variable;
6084
int save_capture_last;
6085
int save_offset1, save_offset2, save_offset3;
6086
int stacksave[REC_STACK_SAVE_MAX];
6091
/* These statements are here to stop the compiler complaining about uninitialized
6095
prop_fail_result = 0;
6096
prop_test_against = 0;
6097
prop_test_variable = NULL;
6100
/* OK, now we can get on with the real code of the function. Recursion is
6101
specified by the macros RMATCH and RRETURN. When NO_RECURSE is *not* defined,
6102
these just turn into a recursive call to match() and a "return", respectively.
6103
However, RMATCH isn't like a function call because it's quite a complicated
6104
macro. It has to be used in one particular way. This shouldn't, however, impact
6105
performance when true recursion is being used. */
6107
if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
6109
original_ims = ims; /* Save for resetting on ')' */
6111
/* At the start of a bracketed group, add the current subject pointer to the
6112
stack of such pointers, to be re-instated at the end of the group when we hit
6113
the closing ket. When match() is called in other circumstances, we don't add to
6116
if ((flags & match_isgroup) != 0)
6118
newptrb.epb_prev = eptrb;
6119
newptrb.epb_saved_eptr = eptr;
6123
/* Now start processing the operations. */
6130
/* For partial matching, remember if we ever hit the end of the subject after
6131
matching at least one subject character. */
6134
eptr >= md->end_subject &&
6135
eptr > md->start_match)
6138
/* Opening capturing bracket. If there is space in the offset vector, save
6139
the current subject position in the working slot at the top of the vector. We
6140
mustn't change the current values of the data slot, because they may be set
6141
from a previous iteration of this group, and be referred to by a reference
6144
If the bracket fails to match, we need to restore this value and also the
6145
values of the final offsets, in case they were set by a previous iteration of
6148
If there isn't enough space in the offset vector, treat this as if it were a
6149
non-capturing bracket. Don't worry about setting the flag for the error case
6150
here; that is handled in the code for KET. */
6154
number = op - OP_BRA;
6156
/* For extended extraction brackets (large number), we have to fish out the
6157
number from a dummy opcode at the start. */
6159
if (number > EXTRACT_BASIC_MAX)
6160
number = GET2(ecode, 2+LINK_SIZE);
6161
offset = number << 1;
6164
printf("start bracket %d subject=", number);
6165
pchars(eptr, 16, TRUE, md);
6169
if (offset < md->offset_max)
6171
save_offset1 = md->offset_vector[offset];
6172
save_offset2 = md->offset_vector[offset+1];
6173
save_offset3 = md->offset_vector[md->offset_end - number];
6174
save_capture_last = md->capture_last;
6176
DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
6177
md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
6181
RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
6183
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6184
md->capture_last = save_capture_last;
6185
ecode += GET(ecode, 1);
6187
while (*ecode == OP_ALT);
6189
DPRINTF(("bracket %d failed\n", number));
6191
md->offset_vector[offset] = save_offset1;
6192
md->offset_vector[offset+1] = save_offset2;
6193
md->offset_vector[md->offset_end - number] = save_offset3;
6195
RRETURN(MATCH_NOMATCH);
6198
/* Insufficient room for saving captured contents */
6203
/* Other types of node can be handled by a switch */
6207
case OP_BRA: /* Non-capturing bracket: optimized */
6208
DPRINTF(("start bracket 0\n"));
6211
RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
6213
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6214
ecode += GET(ecode, 1);
6216
while (*ecode == OP_ALT);
6217
DPRINTF(("bracket 0 failed\n"));
6218
RRETURN(MATCH_NOMATCH);
6220
/* Conditional group: compilation checked that there are no more than
6221
two branches. If the condition is false, skipping the first branch takes us
6222
past the end if there is only one branch, but that's OK because that is
6223
exactly what going to the ket would do. */
6226
if (ecode[LINK_SIZE+1] == OP_CREF) /* Condition extract or recurse test */
6228
offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
6229
condition = (offset == CREF_RECURSE * 2)?
6230
(md->recursive != NULL) :
6231
(offset < offset_top && md->offset_vector[offset] >= 0);
6232
RMATCH(rrc, eptr, ecode + (condition?
6233
(LINK_SIZE + 4) : (LINK_SIZE + 1 + GET(ecode, 1))),
6234
offset_top, md, ims, eptrb, match_isgroup);
6238
/* The condition is an assertion. Call match() to evaluate it - setting
6239
the final argument TRUE causes it to stop at the end of an assertion. */
6243
RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
6244
match_condassert | match_isgroup);
6245
if (rrc == MATCH_MATCH)
6247
ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE+2);
6248
while (*ecode == OP_ALT) ecode += GET(ecode, 1);
6250
else if (rrc != MATCH_NOMATCH)
6252
RRETURN(rrc); /* Need braces because of following else */
6254
else ecode += GET(ecode, 1);
6255
RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
6259
/* Control never reaches here */
6261
/* Skip over conditional reference or large extraction number data if
6269
/* End of the pattern. If we are in a recursion, we should restore the
6270
offsets appropriately and continue from after the call. */
6273
if (md->recursive != NULL && md->recursive->group_num == 0)
6275
recursion_info *rec = md->recursive;
6276
DPRINTF(("Hit the end in a (?0) recursion\n"));
6277
md->recursive = rec->prevrec;
6278
memmove(md->offset_vector, rec->offset_save,
6279
rec->saved_max * sizeof(int));
6280
md->start_match = rec->save_start;
6282
ecode = rec->after_call;
6286
/* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
6287
string - backtracking will then try other alternatives, if any. */
6289
if (md->notempty && eptr == md->start_match) RRETURN(MATCH_NOMATCH);
6290
md->end_match_ptr = eptr; /* Record where we ended */
6291
md->end_offset_top = offset_top; /* and how many extracts were taken */
6292
RRETURN(MATCH_MATCH);
6294
/* Change option settings */
6299
DPRINTF(("ims set to %02lx\n", ims));
6302
/* Assertion brackets. Check the alternative branches in turn - the
6303
matching won't pass the KET for an assertion. If any one branch matches,
6304
the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
6305
start of each branch to move the current point backwards, so the code at
6306
this level is identical to the lookahead case. */
6312
RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
6314
if (rrc == MATCH_MATCH) break;
6315
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6316
ecode += GET(ecode, 1);
6318
while (*ecode == OP_ALT);
6319
if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
6321
/* If checking an assertion for a condition, return MATCH_MATCH. */
6323
if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
6325
/* Continue from after the assertion, updating the offsets high water
6326
mark, since extracts may have been taken during the assertion. */
6328
do ecode += GET(ecode,1); while (*ecode == OP_ALT);
6329
ecode += 1 + LINK_SIZE;
6330
offset_top = md->end_offset_top;
6333
/* Negative assertion: all branches must fail to match */
6336
case OP_ASSERTBACK_NOT:
6339
RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
6341
if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
6342
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6343
ecode += GET(ecode,1);
6345
while (*ecode == OP_ALT);
6347
if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
6349
ecode += 1 + LINK_SIZE;
6352
/* Move the subject pointer back. This occurs only at the start of
6353
each branch of a lookbehind assertion. If we are too close to the start to
6354
move back, this match function fails. When working with UTF-8 we move
6355
back a number of characters, not bytes. */
6362
for (i = 0; i < c; i++)
6365
if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
6372
/* No UTF-8 support, or not in UTF-8 mode: count is byte count */
6375
eptr -= GET(ecode,1);
6376
if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
6379
/* Skip to next op code */
6381
ecode += 1 + LINK_SIZE;
6384
/* The callout item calls an external function, if one is provided, passing
6385
details of the match so far. This is mainly for debugging, though the
6386
function is able to force a failure. */
6389
if (pcre_callout != NULL)
6391
pcre_callout_block cb;
6392
cb.version = 1; /* Version 1 of the callout block */
6393
cb.callout_number = ecode[1];
6394
cb.offset_vector = md->offset_vector;
6395
cb.subject = (const char *)md->start_subject;
6396
cb.subject_length = md->end_subject - md->start_subject;
6397
cb.start_match = md->start_match - md->start_subject;
6398
cb.current_position = eptr - md->start_subject;
6399
cb.pattern_position = GET(ecode, 2);
6400
cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
6401
cb.capture_top = offset_top/2;
6402
cb.capture_last = md->capture_last;
6403
cb.callout_data = md->callout_data;
6404
if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
6405
if (rrc < 0) RRETURN(rrc);
6407
ecode += 2 + 2*LINK_SIZE;
6410
/* Recursion either matches the current regex, or some subexpression. The
6411
offset data is the offset to the starting bracket from the start of the
6412
whole pattern. (This is so that it works from duplicated subpatterns.)
6414
If there are any capturing brackets started but not finished, we have to
6415
save their starting points and reinstate them after the recursion. However,
6416
we don't know how many such there are (offset_top records the completed
6417
total) so we just have to save all the potential data. There may be up to
6418
65535 such values, which is too large to put on the stack, but using malloc
6419
for small numbers seems expensive. As a compromise, the stack is used when
6420
there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
6421
is used. A problem is what to do if the malloc fails ... there is no way of
6422
returning to the top level with an error. Save the top REC_STACK_SAVE_MAX
6423
values on the stack, and accept that the rest may be wrong.
6425
There are also other values that have to be saved. We use a chained
6426
sequence of blocks that actually live on the stack. Thanks to Robin Houston
6427
for the original version of this logic. */
6431
callpat = md->start_code + GET(ecode, 1);
6432
new_recursive.group_num = *callpat - OP_BRA;
6434
/* For extended extraction brackets (large number), we have to fish out
6435
the number from a dummy opcode at the start. */
6437
if (new_recursive.group_num > EXTRACT_BASIC_MAX)
6438
new_recursive.group_num = GET2(callpat, 2+LINK_SIZE);
6440
/* Add to "recursing stack" */
6442
new_recursive.prevrec = md->recursive;
6443
md->recursive = &new_recursive;
6445
/* Find where to continue from afterwards */
6447
ecode += 1 + LINK_SIZE;
6448
new_recursive.after_call = ecode;
6450
/* Now save the offset data. */
6452
new_recursive.saved_max = md->offset_end;
6453
if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
6454
new_recursive.offset_save = stacksave;
6457
new_recursive.offset_save =
6458
(int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
6459
if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
6462
memcpy(new_recursive.offset_save, md->offset_vector,
6463
new_recursive.saved_max * sizeof(int));
6464
new_recursive.save_start = md->start_match;
6465
md->start_match = eptr;
6467
/* OK, now we can do the recursion. For each top-level alternative we
6468
restore the offset and recursion data. */
6470
DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
6473
RMATCH(rrc, eptr, callpat + 1 + LINK_SIZE, offset_top, md, ims,
6474
eptrb, match_isgroup);
6475
if (rrc == MATCH_MATCH)
6477
md->recursive = new_recursive.prevrec;
6478
if (new_recursive.offset_save != stacksave)
6479
(pcre_free)(new_recursive.offset_save);
6480
RRETURN(MATCH_MATCH);
6482
else if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6484
md->recursive = &new_recursive;
6485
memcpy(md->offset_vector, new_recursive.offset_save,
6486
new_recursive.saved_max * sizeof(int));
6487
callpat += GET(callpat, 1);
6489
while (*callpat == OP_ALT);
6491
DPRINTF(("Recursion didn't match\n"));
6492
md->recursive = new_recursive.prevrec;
6493
if (new_recursive.offset_save != stacksave)
6494
(pcre_free)(new_recursive.offset_save);
6495
RRETURN(MATCH_NOMATCH);
6497
/* Control never reaches here */
6499
/* "Once" brackets are like assertion brackets except that after a match,
6500
the point in the subject string is not moved back. Thus there can never be
6501
a move back into the brackets. Friedl calls these "atomic" subpatterns.
6502
Check the alternative branches in turn - the matching won't pass the KET
6503
for this kind of subpattern. If any one branch matches, we carry on as at
6504
the end of a normal bracket, leaving the subject pointer. */
6513
RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims,
6514
eptrb, match_isgroup);
6515
if (rrc == MATCH_MATCH) break;
6516
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6517
ecode += GET(ecode,1);
6519
while (*ecode == OP_ALT);
6521
/* If hit the end of the group (which could be repeated), fail */
6523
if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
6525
/* Continue as from after the assertion, updating the offsets high water
6526
mark, since extracts may have been taken. */
6528
do ecode += GET(ecode,1); while (*ecode == OP_ALT);
6530
offset_top = md->end_offset_top;
6531
eptr = md->end_match_ptr;
6533
/* For a non-repeating ket, just continue at this level. This also
6534
happens for a repeating ket if no characters were matched in the group.
6535
This is the forcible breaking of infinite loops as implemented in Perl
6536
5.005. If there is an options reset, it will get obeyed in the normal
6537
course of events. */
6539
if (*ecode == OP_KET || eptr == saved_eptr)
6541
ecode += 1+LINK_SIZE;
6545
/* The repeating kets try the rest of the pattern or restart from the
6546
preceding bracket, in the appropriate order. We need to reset any options
6547
that changed within the bracket before re-running it, so check the next opcode. */
6550
if (ecode[1+LINK_SIZE] == OP_OPT)
6552
ims = (ims & ~PCRE_IMS) | ecode[4];
6553
DPRINTF(("ims set to %02lx at group repeat\n", ims));
6556
if (*ecode == OP_KETRMIN)
6558
RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0);
6559
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6560
RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
6561
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6563
else /* OP_KETRMAX */
6565
RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
6566
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6567
RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
6568
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6571
RRETURN(MATCH_NOMATCH);
6573
/* An alternation is the end of a branch; scan along to find the end of the
6574
bracketed group and go to there. */
6577
do ecode += GET(ecode,1); while (*ecode == OP_ALT);
6580
/* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
6581
that it may occur zero times. It may repeat infinitely, or not at all -
6582
i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
6583
repeat limits are compiled as a number of copies, with the optional ones
6584
preceded by BRAZERO or BRAMINZERO. */
6589
RMATCH(rrc, eptr, next, offset_top, md, ims, eptrb, match_isgroup);
6590
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6591
do next += GET(next,1); while (*next == OP_ALT);
6592
ecode = next + 1+LINK_SIZE;
6599
do next += GET(next,1); while (*next == OP_ALT);
6600
RMATCH(rrc, eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb,
6602
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6607
/* End of a group, repeated or non-repeating. If we are at the end of
6608
an assertion "group", stop matching and return MATCH_MATCH, but record the
6609
current high water mark for use by positive assertions. Do this also
6610
for the "once" (atomic) groups, which are never backed up into. */
6616
prev = ecode - GET(ecode, 1);
6617
saved_eptr = eptrb->epb_saved_eptr;
6619
/* Back up the stack of bracket start pointers. */
6621
eptrb = eptrb->epb_prev;
6623
if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
6624
*prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
6627
md->end_match_ptr = eptr; /* For ONCE */
6628
md->end_offset_top = offset_top;
6629
RRETURN(MATCH_MATCH);
6632
/* In all other cases except a conditional group we have to check the
6633
group number back at the start and if necessary complete handling an
6634
extraction by setting the offsets and bumping the high water mark. */
6636
if (*prev != OP_COND)
6638
number = *prev - OP_BRA;
6640
/* For extended extraction brackets (large number), we have to fish out
6641
the number from a dummy opcode at the start. */
6643
if (number > EXTRACT_BASIC_MAX) number = GET2(prev, 2+LINK_SIZE);
6644
offset = number << 1;
6647
printf("end bracket %d", number);
6651
/* Test for a numbered group. This includes groups called as a result
6652
of recursion. Note that whole-pattern recursion is coded as a recurse
6653
into group 0, so it won't be picked up here. Instead, we catch it when
6654
the OP_END is reached. */
6658
md->capture_last = number;
6659
if (offset >= md->offset_max) md->offset_overflow = TRUE; else
6661
md->offset_vector[offset] =
6662
md->offset_vector[md->offset_end - number];
6663
md->offset_vector[offset+1] = eptr - md->start_subject;
6664
if (offset_top <= offset) offset_top = offset + 2;
6667
/* Handle a recursively called group. Restore the offsets
6668
appropriately and continue from after the call. */
6670
if (md->recursive != NULL && md->recursive->group_num == number)
6672
recursion_info *rec = md->recursive;
6673
DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
6674
md->recursive = rec->prevrec;
6675
md->start_match = rec->save_start;
6676
memcpy(md->offset_vector, rec->offset_save,
6677
rec->saved_max * sizeof(int));
6678
ecode = rec->after_call;
6685
/* Reset the value of the ims flags, in case they got changed during the group. */
6689
DPRINTF(("ims reset to %02lx\n", ims));
6691
/* For a non-repeating ket, just continue at this level. This also
6692
happens for a repeating ket if no characters were matched in the group.
6693
This is the forcible breaking of infinite loops as implemented in Perl
6694
5.005. If there is an options reset, it will get obeyed in the normal
6695
course of events. */
6697
if (*ecode == OP_KET || eptr == saved_eptr)
6699
ecode += 1 + LINK_SIZE;
6703
/* The repeating kets try the rest of the pattern or restart from the
6704
preceding bracket, in the appropriate order. */
6706
if (*ecode == OP_KETRMIN)
6708
RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
6709
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6710
RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
6711
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6713
else /* OP_KETRMAX */
6715
RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
6716
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6717
RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
6718
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6722
RRETURN(MATCH_NOMATCH);
6724
/* Start of subject unless notbol, or after internal newline if multiline */
6727
if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
6728
if ((ims & PCRE_MULTILINE) != 0)
6730
if (eptr != md->start_subject && eptr[-1] != NEWLINE)
6731
RRETURN(MATCH_NOMATCH);
6735
/* ... else fall through */
6737
/* Start of subject assertion */
6740
if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
6744
/* Start of match assertion */
6747
if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
6751
/* Assert before internal newline if multiline, or before a terminating
6752
newline unless endonly is set, else end of subject unless noteol is set. */
6755
if ((ims & PCRE_MULTILINE) != 0)
6757
if (eptr < md->end_subject)
6758
{ if (*eptr != NEWLINE) RRETURN(MATCH_NOMATCH); }
6760
{ if (md->noteol) RRETURN(MATCH_NOMATCH); }
6766
if (md->noteol) RRETURN(MATCH_NOMATCH);
6769
if (eptr < md->end_subject - 1 ||
6770
(eptr == md->end_subject - 1 && *eptr != NEWLINE))
6771
RRETURN(MATCH_NOMATCH);
6776
/* ... else fall through */
6778
/* End of subject assertion (\z) */
6781
if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
6785
/* End of subject or ending \n assertion (\Z) */
6788
if (eptr < md->end_subject - 1 ||
6789
(eptr == md->end_subject - 1 && *eptr != NEWLINE)) RRETURN(MATCH_NOMATCH);
6793
/* Word boundary assertions */
6795
case OP_NOT_WORD_BOUNDARY:
6796
case OP_WORD_BOUNDARY:
6799
/* Find out if the previous and current characters are "word" characters.
6800
It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
6801
be "non-word" characters. */
6806
if (eptr == md->start_subject) prev_is_word = FALSE; else
6808
const uschar *lastptr = eptr - 1;
6809
while((*lastptr & 0xc0) == 0x80) lastptr--;
6810
GETCHAR(c, lastptr);
6811
prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
6813
if (eptr >= md->end_subject) cur_is_word = FALSE; else
6816
cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
6822
/* More streamlined when not in UTF-8 mode */
6825
prev_is_word = (eptr != md->start_subject) &&
6826
((md->ctypes[eptr[-1]] & ctype_word) != 0);
6827
cur_is_word = (eptr < md->end_subject) &&
6828
((md->ctypes[*eptr] & ctype_word) != 0);
6831
/* Now see if the situation is what we want */
6833
if ((*ecode++ == OP_WORD_BOUNDARY)?
6834
cur_is_word == prev_is_word : cur_is_word != prev_is_word)
6835
RRETURN(MATCH_NOMATCH);
6839
/* Match a single character type; inline for speed */
6842
if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == NEWLINE)
6843
RRETURN(MATCH_NOMATCH);
6844
if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
6847
while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
6852
/* Match a single byte, even in UTF-8 mode. This opcode really does match
6853
any byte, even newline, independent of the setting of PCRE_DOTALL. */
6856
if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
6861
if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6862
GETCHARINCTEST(c, eptr);
6867
(md->ctypes[c] & ctype_digit) != 0
6869
RRETURN(MATCH_NOMATCH);
6874
if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6875
GETCHARINCTEST(c, eptr);
6880
(md->ctypes[c] & ctype_digit) == 0
6882
RRETURN(MATCH_NOMATCH);
6886
case OP_NOT_WHITESPACE:
6887
if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6888
GETCHARINCTEST(c, eptr);
6893
(md->ctypes[c] & ctype_space) != 0
6895
RRETURN(MATCH_NOMATCH);
6900
if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6901
GETCHARINCTEST(c, eptr);
6906
(md->ctypes[c] & ctype_space) == 0
6908
RRETURN(MATCH_NOMATCH);
6912
case OP_NOT_WORDCHAR:
6913
if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6914
GETCHARINCTEST(c, eptr);
6919
(md->ctypes[c] & ctype_word) != 0
6921
RRETURN(MATCH_NOMATCH);
6926
if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6927
GETCHARINCTEST(c, eptr);
6932
(md->ctypes[c] & ctype_word) == 0
6934
RRETURN(MATCH_NOMATCH);
6939
/* Check the next character by Unicode property. We will get here only
6940
if the support is in the binary; otherwise a compile-time error occurs. */
6944
if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6945
GETCHARINCTEST(c, eptr);
6947
int chartype, rqdtype;
6949
int category = ucp_findchar(c, &chartype, &othercase);
6951
rqdtype = *(++ecode);
6956
if ((rqdtype - 128 != category) == (op == OP_PROP))
6957
RRETURN(MATCH_NOMATCH);
6961
if ((rqdtype != chartype) == (op == OP_PROP))
6962
RRETURN(MATCH_NOMATCH);
6967
/* Match an extended Unicode sequence. We will get here only if the support
6968
is in the binary; otherwise a compile-time error occurs. */
6971
if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6972
GETCHARINCTEST(c, eptr);
6976
int category = ucp_findchar(c, &chartype, &othercase);
6977
if (category == ucp_M) RRETURN(MATCH_NOMATCH);
6978
while (eptr < md->end_subject)
6981
if (!md->utf8) c = *eptr; else
6983
GETCHARLEN(c, eptr, len);
6985
category = ucp_findchar(c, &chartype, &othercase);
6986
if (category != ucp_M) break;
6995
/* Match a back reference, possibly repeatedly. Look past the end of the
6996
item to see if there is repeat information following. The code is similar
6997
to that for character classes, but repeated for efficiency. Then obey
6998
similar code to character type repeats - written out again for speed.
6999
However, if the referenced string is the empty string, always treat
7000
it as matched, any number of times (otherwise there could be infinite loops). */
7005
offset = GET2(ecode, 1) << 1; /* Doubled ref number */
7006
ecode += 3; /* Advance past item */
7008
/* If the reference is unset, set the length to be longer than the amount
7009
of subject left; this ensures that every attempt at a match fails. We
7010
can't just fail here, because of the possibility of quantifiers with a zero minimum. */
7013
length = (offset >= offset_top || md->offset_vector[offset] < 0)?
7014
md->end_subject - eptr + 1 :
7015
md->offset_vector[offset+1] - md->offset_vector[offset];
7017
/* Set up for repetition, or handle the non-repeated case */
7027
c = *ecode++ - OP_CRSTAR;
7028
minimize = (c & 1) != 0;
7029
min = rep_min[c]; /* Pick up values from tables; */
7030
max = rep_max[c]; /* zero for max => infinity */
7031
if (max == 0) max = INT_MAX;
7036
minimize = (*ecode == OP_CRMINRANGE);
7037
min = GET2(ecode, 1);
7038
max = GET2(ecode, 3);
7039
if (max == 0) max = INT_MAX;
7043
default: /* No repeat follows */
7044
if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
7046
continue; /* With the main loop */
7049
/* If the length of the reference is zero, just continue with the main loop. */
7052
if (length == 0) continue;
7054
/* First, ensure the minimum number of matches are present. We get back
7055
the length of the reference string explicitly rather than passing the
7056
address of eptr, so that eptr can be a register variable. */
7058
for (i = 1; i <= min; i++)
7060
if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
7064
/* If min = max, continue at the same level without recursion.
7065
They are not both allowed to be zero. */
7067
if (min == max) continue;
7069
/* If minimizing, keep trying and advancing the pointer */
7073
for (fi = min;; fi++)
7075
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7076
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7077
if (fi >= max || !match_ref(offset, eptr, length, md, ims))
7078
RRETURN(MATCH_NOMATCH);
7081
/* Control never gets here */
7084
/* If maximizing, find the longest string and work backwards */
7089
for (i = min; i < max; i++)
7091
if (!match_ref(offset, eptr, length, md, ims)) break;
7096
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7097
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7100
RRETURN(MATCH_NOMATCH);
7103
/* Control never gets here */
7107
/* Match a bit-mapped character class, possibly repeatedly. This op code is
7108
used when all the characters in the class have values in the range 0-255,
7109
and either the matching is caseful, or the characters are in the range
7110
0-127 when UTF-8 processing is enabled. The only difference between
7111
OP_CLASS and OP_NCLASS occurs when a data character outside the range is
7114
First, look past the end of the item to see if there is repeat information
7115
following. Then obey similar code to character type repeats - written out again for speed. */
7121
data = ecode + 1; /* Save for matching */
7122
ecode += 33; /* Advance past the item */
7132
c = *ecode++ - OP_CRSTAR;
7133
minimize = (c & 1) != 0;
7134
min = rep_min[c]; /* Pick up values from tables; */
7135
max = rep_max[c]; /* zero for max => infinity */
7136
if (max == 0) max = INT_MAX;
7141
minimize = (*ecode == OP_CRMINRANGE);
7142
min = GET2(ecode, 1);
7143
max = GET2(ecode, 3);
7144
if (max == 0) max = INT_MAX;
7148
default: /* No repeat follows */
7153
/* First, ensure the minimum number of matches are present. */
7159
for (i = 1; i <= min; i++)
7161
if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7162
GETCHARINC(c, eptr);
7165
if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
7169
if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
7175
/* Not UTF-8 mode */
7177
for (i = 1; i <= min; i++)
7179
if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7181
if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
7185
/* If max == min we can continue with the main loop without the need to recurse. */
7188
if (min == max) continue;
7190
/* If minimizing, keep testing the rest of the expression and advancing
7191
the pointer while it matches the class. */
7199
for (fi = min;; fi++)
7201
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7202
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7203
if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7204
GETCHARINC(c, eptr);
7207
if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
7211
if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
7217
/* Not UTF-8 mode */
7219
for (fi = min;; fi++)
7221
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7222
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7223
if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7225
if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
7228
/* Control never gets here */
7231
/* If maximizing, find the longest possible run, then work backwards. */
7241
for (i = min; i < max; i++)
7244
if (eptr >= md->end_subject) break;
7245
GETCHARLEN(c, eptr, len);
7248
if (op == OP_CLASS) break;
7252
if ((data[c/8] & (1 << (c&7))) == 0) break;
7258
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7259
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7260
if (eptr-- == pp) break; /* Stop if tried at original pos */
7266
/* Not UTF-8 mode */
7268
for (i = min; i < max; i++)
7270
if (eptr >= md->end_subject) break;
7272
if ((data[c/8] & (1 << (c&7))) == 0) break;
7277
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7279
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7283
RRETURN(MATCH_NOMATCH);
7286
/* Control never gets here */
7289
/* Match an extended character class. This opcode is encountered only
7290
in UTF-8 mode, because that's the only time it is compiled. */
7295
data = ecode + 1 + LINK_SIZE; /* Save for matching */
7296
ecode += GET(ecode, 1); /* Advance past the item */
7306
c = *ecode++ - OP_CRSTAR;
7307
minimize = (c & 1) != 0;
7308
min = rep_min[c]; /* Pick up values from tables; */
7309
max = rep_max[c]; /* zero for max => infinity */
7310
if (max == 0) max = INT_MAX;
7315
minimize = (*ecode == OP_CRMINRANGE);
7316
min = GET2(ecode, 1);
7317
max = GET2(ecode, 3);
7318
if (max == 0) max = INT_MAX;
7322
default: /* No repeat follows */
7327
/* First, ensure the minimum number of matches are present. */
7329
for (i = 1; i <= min; i++)
7331
if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7332
GETCHARINC(c, eptr);
7333
if (!match_xclass(c, data)) RRETURN(MATCH_NOMATCH);
7336
/* If max == min we can continue with the main loop without the need to recurse. */
7339
if (min == max) continue;
7341
/* If minimizing, keep testing the rest of the expression and advancing
7342
the pointer while it matches the class. */
7346
for (fi = min;; fi++)
7348
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7349
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7350
if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7351
GETCHARINC(c, eptr);
7352
if (!match_xclass(c, data)) RRETURN(MATCH_NOMATCH);
7354
/* Control never gets here */
7357
/* If maximizing, find the longest possible run, then work backwards. */
7362
for (i = min; i < max; i++)
7365
if (eptr >= md->end_subject) break;
7366
GETCHARLEN(c, eptr, len);
7367
if (!match_xclass(c, data)) break;
7372
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7373
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7374
if (eptr-- == pp) break; /* Stop if tried at original pos */
7377
RRETURN(MATCH_NOMATCH);
7380
/* Control never gets here */
7382
#endif /* End of XCLASS */
7384
/* Match a single character, casefully */
7392
GETCHARLEN(fc, ecode, length);
7393
if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
7394
while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
7399
/* Non-UTF-8 mode */
7401
if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
7402
if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
7407
/* Match a single character, caselessly */
7415
GETCHARLEN(fc, ecode, length);
7417
if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
7419
/* If the pattern character's value is < 128, we have only one byte, and
7420
can use the fast lookup table. */
7424
if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
7427
/* Otherwise we must pick up the subject character */
7432
GETCHARINC(dc, eptr);
7435
/* If we have Unicode property support, we can use it to test the other
7436
case of the character, if there is one. The result of ucp_findchar() is
7437
< 0 if the char isn't found, and othercase is returned as zero if there
7445
if (ucp_findchar(fc, &chartype, &othercase) < 0 || dc != othercase)
7447
RRETURN(MATCH_NOMATCH);
7452
#endif /* SUPPORT_UTF8 */
7454
/* Non-UTF-8 mode */
7456
if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
7457
if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
7462
/* Match a single character repeatedly; different opcodes share code. */
7465
min = max = GET2(ecode, 1);
7472
max = GET2(ecode, 1);
7473
minimize = *ecode == OP_MINUPTO;
7483
c = *ecode++ - OP_STAR;
7484
minimize = (c & 1) != 0;
7485
min = rep_min[c]; /* Pick up values from tables; */
7486
max = rep_max[c]; /* zero for max => infinity */
7487
if (max == 0) max = INT_MAX;
7489
/* Common code for all repeated single-character matches. We can give
7490
up quickly if there are fewer than the minimum number of characters left in the subject. */
7499
GETCHARLEN(fc, ecode, length);
7500
if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
7503
/* Handle multibyte character matching specially here. There is
7504
support for caseless matching if UCP support is present. */
7514
if ((ims & PCRE_CASELESS) != 0 &&
7515
ucp_findchar(fc, &chartype, &othercase) >= 0 &&
7517
oclength = ord2utf8(othercase, occhars);
7518
#endif /* SUPPORT_UCP */
7520
for (i = 1; i <= min; i++)
7522
if (memcmp(eptr, charptr, length) == 0) eptr += length;
7523
/* Need braces because of following else */
7524
else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
7527
if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
7532
if (min == max) continue;
7536
for (fi = min;; fi++)
7538
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7539
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7540
if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7541
if (memcmp(eptr, charptr, length) == 0) eptr += length;
7542
/* Need braces because of following else */
7543
else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
7546
if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
7550
/* Control never gets here */
7555
for (i = min; i < max; i++)
7557
if (eptr > md->end_subject - length) break;
7558
if (memcmp(eptr, charptr, length) == 0) eptr += length;
7559
else if (oclength == 0) break;
7562
if (memcmp(eptr, occhars, oclength) != 0) break;
7568
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7569
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7572
RRETURN(MATCH_NOMATCH);
7574
/* Control never gets here */
7577
/* If the length of a UTF-8 character is 1, we fall through here, and
7578
obey the code as for non-UTF-8 characters below, though in this case the
7579
value of fc will always be < 128. */
7582
#endif /* SUPPORT_UTF8 */
7584
/* When not in UTF-8 mode, load a single-byte character. */
7586
if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
7590
/* The value of fc at this point is always less than 256, though we may or
7591
may not be in UTF-8 mode. The code is duplicated for the caseless and
7592
caseful cases, for speed, since matching characters is likely to be quite
7593
common. First, ensure the minimum number of matches are present. If min =
7594
max, continue at the same level without recursing. Otherwise, if
7595
minimizing, keep trying the rest of the expression and advancing one
7596
matching character if failing, up to the maximum. Alternatively, if
7597
maximizing, find the maximum number of characters and work backwards. */
7599
DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
7602
if ((ims & PCRE_CASELESS) != 0)
7605
for (i = 1; i <= min; i++)
7606
if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
7607
if (min == max) continue;
7610
for (fi = min;; fi++)
7612
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7613
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7614
if (fi >= max || eptr >= md->end_subject ||
7615
fc != md->lcc[*eptr++])
7616
RRETURN(MATCH_NOMATCH);
7618
/* Control never gets here */
7623
for (i = min; i < max; i++)
7625
if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
7630
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7632
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7634
RRETURN(MATCH_NOMATCH);
7636
/* Control never gets here */
7639
/* Caseful comparisons (includes all multi-byte characters) */
7643
for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
7644
if (min == max) continue;
7647
for (fi = min;; fi++)
7649
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7650
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7651
if (fi >= max || eptr >= md->end_subject || fc != *eptr++)
7652
RRETURN(MATCH_NOMATCH);
7654
/* Control never gets here */
7659
for (i = min; i < max; i++)
7661
if (eptr >= md->end_subject || fc != *eptr) break;
7666
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7668
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7670
RRETURN(MATCH_NOMATCH);
7673
/* Control never gets here */
7675
/* Match a negated single one-byte character. The character we are
7676
checking can be multibyte. */
7679
if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7681
GETCHARINCTEST(c, eptr);
7682
if ((ims & PCRE_CASELESS) != 0)
7688
if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
7692
if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
7696
/* Match a negated single one-byte character repeatedly. This is almost a
7697
repeat of the code for a repeated single character, but I haven't found a
7698
nice way of commoning these up that doesn't require a test of the
7699
positive/negative option for each character match. Maybe that wouldn't add
7700
very much to the time taken, but character matching *is* what this is all
7704
min = max = GET2(ecode, 1);
7711
max = GET2(ecode, 1);
7712
minimize = *ecode == OP_NOTMINUPTO;
7721
case OP_NOTMINQUERY:
7722
c = *ecode++ - OP_NOTSTAR;
7723
minimize = (c & 1) != 0;
7724
min = rep_min[c]; /* Pick up values from tables; */
7725
max = rep_max[c]; /* zero for max => infinity */
7726
if (max == 0) max = INT_MAX;
7728
/* Common code for all repeated single-byte matches. We can give up quickly
7729
if there are fewer than the minimum number of bytes left in the
7733
if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
7736
/* The code is duplicated for the caseless and caseful cases, for speed,
7737
since matching characters is likely to be quite common. First, ensure the
7738
minimum number of matches are present. If min = max, continue at the same
7739
level without recursing. Otherwise, if minimizing, keep trying the rest of
7740
the expression and advancing one matching character if failing, up to the
7741
maximum. Alternatively, if maximizing, find the maximum number of
7742
characters and work backwards. */
7744
DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
7747
if ((ims & PCRE_CASELESS) != 0)
7756
for (i = 1; i <= min; i++)
7758
GETCHARINC(d, eptr);
7759
if (d < 256) d = md->lcc[d];
7760
if (fc == d) RRETURN(MATCH_NOMATCH);
7766
/* Not UTF-8 mode */
7768
for (i = 1; i <= min; i++)
7769
if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
7772
if (min == max) continue;
7781
for (fi = min;; fi++)
7783
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7784
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7785
GETCHARINC(d, eptr);
7786
if (d < 256) d = md->lcc[d];
7787
if (fi >= max || eptr >= md->end_subject || fc == d)
7788
RRETURN(MATCH_NOMATCH);
7793
/* Not UTF-8 mode */
7795
for (fi = min;; fi++)
7797
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7798
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7799
if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++])
7800
RRETURN(MATCH_NOMATCH);
7803
/* Control never gets here */
7817
for (i = min; i < max; i++)
7820
if (eptr >= md->end_subject) break;
7821
GETCHARLEN(d, eptr, len);
7822
if (d < 256) d = md->lcc[d];
7828
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7829
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7830
if (eptr-- == pp) break; /* Stop if tried at original pos */
7836
/* Not UTF-8 mode */
7838
for (i = min; i < max; i++)
7840
if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
7845
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7846
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7851
RRETURN(MATCH_NOMATCH);
7853
/* Control never gets here */
7856
/* Caseful comparisons */
7865
for (i = 1; i <= min; i++)
7867
GETCHARINC(d, eptr);
7868
if (fc == d) RRETURN(MATCH_NOMATCH);
7873
/* Not UTF-8 mode */
7875
for (i = 1; i <= min; i++)
7876
if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
7879
if (min == max) continue;
7888
for (fi = min;; fi++)
7890
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7891
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7892
GETCHARINC(d, eptr);
7893
if (fi >= max || eptr >= md->end_subject || fc == d)
7894
RRETURN(MATCH_NOMATCH);
7899
/* Not UTF-8 mode */
7901
for (fi = min;; fi++)
7903
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7904
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7905
if (fi >= max || eptr >= md->end_subject || fc == *eptr++)
7906
RRETURN(MATCH_NOMATCH);
7909
/* Control never gets here */
7923
for (i = min; i < max; i++)
7926
if (eptr >= md->end_subject) break;
7927
GETCHARLEN(d, eptr, len);
7933
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7934
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7935
if (eptr-- == pp) break; /* Stop if tried at original pos */
7941
/* Not UTF-8 mode */
7943
for (i = min; i < max; i++)
7945
if (eptr >= md->end_subject || fc == *eptr) break;
7950
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7951
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7956
RRETURN(MATCH_NOMATCH);
7959
/* Control never gets here */
7961
/* Match a single character type repeatedly; several different opcodes
7962
share code. This is very similar to the code for single characters, but we
7963
repeat it in the interests of efficiency. */
7966
min = max = GET2(ecode, 1);
7972
case OP_TYPEMINUPTO:
7974
max = GET2(ecode, 1);
7975
minimize = *ecode == OP_TYPEMINUPTO;
7980
case OP_TYPEMINSTAR:
7982
case OP_TYPEMINPLUS:
7984
case OP_TYPEMINQUERY:
7985
c = *ecode++ - OP_TYPESTAR;
7986
minimize = (c & 1) != 0;
7987
min = rep_min[c]; /* Pick up values from tables; */
7988
max = rep_max[c]; /* zero for max => infinity */
7989
if (max == 0) max = INT_MAX;
7991
/* Common code for all repeated single character type matches. Note that
7992
in UTF-8 mode, '.' matches a character of any length, but for the other
7993
character types, the valid characters are all one-byte long. */
7996
ctype = *ecode++; /* Code for the character type */
7999
if (ctype == OP_PROP || ctype == OP_NOTPROP)
8001
prop_fail_result = ctype == OP_NOTPROP;
8002
prop_type = *ecode++;
8003
if (prop_type >= 128)
8005
prop_test_against = prop_type - 128;
8006
prop_test_variable = &prop_category;
8010
prop_test_against = prop_type;
8011
prop_test_variable = &prop_chartype;
8014
else prop_type = -1;
8017
/* First, ensure the minimum number of matches are present. Use inline
8018
code for maximizing the speed, and do the type test once at the start
8019
(i.e. keep it out of the loop). Also we can test that there are at least
8020
the minimum number of bytes before we start. This isn't as effective in
8021
UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
8022
is tidier. Also separate the UCP code, which can be the same for both UTF-8
8023
and single-bytes. */
8025
if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
8031
for (i = 1; i <= min; i++)
8033
GETCHARINC(c, eptr);
8034
prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8035
if ((*prop_test_variable == prop_test_against) == prop_fail_result)
8036
RRETURN(MATCH_NOMATCH);
8040
/* Match extended Unicode sequences. We will get here only if the
8041
support is in the binary; otherwise a compile-time error occurs. */
8043
else if (ctype == OP_EXTUNI)
8045
for (i = 1; i <= min; i++)
8047
GETCHARINCTEST(c, eptr);
8048
prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8049
if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
8050
while (eptr < md->end_subject)
8053
if (!md->utf8) c = *eptr; else
8055
GETCHARLEN(c, eptr, len);
8057
prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8058
if (prop_category != ucp_M) break;
8065
#endif /* SUPPORT_UCP */
8067
/* Handle all other cases when the coding is UTF-8 */
8070
if (md->utf8) switch(ctype)
8073
for (i = 1; i <= min; i++)
8075
if (eptr >= md->end_subject ||
8076
(*eptr++ == NEWLINE && (ims & PCRE_DOTALL) == 0))
8077
RRETURN(MATCH_NOMATCH);
8078
while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
8087
for (i = 1; i <= min; i++)
8089
if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
8090
GETCHARINC(c, eptr);
8091
if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
8092
RRETURN(MATCH_NOMATCH);
8097
for (i = 1; i <= min; i++)
8099
if (eptr >= md->end_subject ||
8100
*eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
8101
RRETURN(MATCH_NOMATCH);
8102
/* No need to skip more bytes - we know it's a 1-byte character */
8106
case OP_NOT_WHITESPACE:
8107
for (i = 1; i <= min; i++)
8109
if (eptr >= md->end_subject ||
8110
(*eptr < 128 && (md->ctypes[*eptr++] & ctype_space) != 0))
8111
RRETURN(MATCH_NOMATCH);
8112
while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
8117
for (i = 1; i <= min; i++)
8119
if (eptr >= md->end_subject ||
8120
*eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
8121
RRETURN(MATCH_NOMATCH);
8122
/* No need to skip more bytes - we know it's a 1-byte character */
8126
case OP_NOT_WORDCHAR:
8127
for (i = 1; i <= min; i++)
8129
if (eptr >= md->end_subject ||
8130
(*eptr < 128 && (md->ctypes[*eptr++] & ctype_word) != 0))
8131
RRETURN(MATCH_NOMATCH);
8132
while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
8137
for (i = 1; i <= min; i++)
8139
if (eptr >= md->end_subject ||
8140
*eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
8141
RRETURN(MATCH_NOMATCH);
8142
/* No need to skip more bytes - we know it's a 1-byte character */
8147
RRETURN(PCRE_ERROR_INTERNAL);
8148
} /* End switch(ctype) */
8151
#endif /* SUPPORT_UTF8 */
8153
/* Code for the non-UTF-8 case for minimum matching of operators other
8154
than OP_PROP and OP_NOTPROP. */
8159
if ((ims & PCRE_DOTALL) == 0)
8161
for (i = 1; i <= min; i++)
8162
if (*eptr++ == NEWLINE) RRETURN(MATCH_NOMATCH);
8172
for (i = 1; i <= min; i++)
8173
if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
8177
for (i = 1; i <= min; i++)
8178
if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
8181
case OP_NOT_WHITESPACE:
8182
for (i = 1; i <= min; i++)
8183
if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
8187
for (i = 1; i <= min; i++)
8188
if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
8191
case OP_NOT_WORDCHAR:
8192
for (i = 1; i <= min; i++)
8193
if ((md->ctypes[*eptr++] & ctype_word) != 0)
8194
RRETURN(MATCH_NOMATCH);
8198
for (i = 1; i <= min; i++)
8199
if ((md->ctypes[*eptr++] & ctype_word) == 0)
8200
RRETURN(MATCH_NOMATCH);
8204
RRETURN(PCRE_ERROR_INTERNAL);
8208
/* If min = max, continue at the same level without recursing */
8210
if (min == max) continue;
8212
/* If minimizing, we have to test the rest of the pattern before each
8213
subsequent match. Again, separate the UTF-8 case for speed, and also
8214
separate the UCP cases. */
8221
for (fi = min;; fi++)
8223
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8224
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8225
if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
8226
GETCHARINC(c, eptr);
8227
prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8228
if ((*prop_test_variable == prop_test_against) == prop_fail_result)
8229
RRETURN(MATCH_NOMATCH);
8233
/* Match extended Unicode sequences. We will get here only if the
8234
support is in the binary; otherwise a compile-time error occurs. */
8236
else if (ctype == OP_EXTUNI)
8238
for (fi = min;; fi++)
8240
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8241
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8242
if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
8243
GETCHARINCTEST(c, eptr);
8244
prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8245
if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
8246
while (eptr < md->end_subject)
8249
if (!md->utf8) c = *eptr; else
8251
GETCHARLEN(c, eptr, len);
8253
prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8254
if (prop_category != ucp_M) break;
8261
#endif /* SUPPORT_UCP */
8267
for (fi = min;; fi++)
8269
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8270
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8271
if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
8273
GETCHARINC(c, eptr);
8277
if ((ims & PCRE_DOTALL) == 0 && c == NEWLINE) RRETURN(MATCH_NOMATCH);
8284
if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
8285
RRETURN(MATCH_NOMATCH);
8289
if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
8290
RRETURN(MATCH_NOMATCH);
8293
case OP_NOT_WHITESPACE:
8294
if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
8295
RRETURN(MATCH_NOMATCH);
8299
if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
8300
RRETURN(MATCH_NOMATCH);
8303
case OP_NOT_WORDCHAR:
8304
if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
8305
RRETURN(MATCH_NOMATCH);
8309
if (c >= 256 && (md->ctypes[c] & ctype_word) == 0)
8310
RRETURN(MATCH_NOMATCH);
8314
RRETURN(PCRE_ERROR_INTERNAL);
8320
/* Not UTF-8 mode */
8322
for (fi = min;; fi++)
8324
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8325
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8326
if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
8331
if ((ims & PCRE_DOTALL) == 0 && c == NEWLINE) RRETURN(MATCH_NOMATCH);
8338
if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
8342
if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
8345
case OP_NOT_WHITESPACE:
8346
if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
8350
if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
8353
case OP_NOT_WORDCHAR:
8354
if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
8358
if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
8362
RRETURN(PCRE_ERROR_INTERNAL);
8366
/* Control never gets here */
8369
/* If maximizing it is worth using inline code for speed, doing the type
8370
test once at the start (i.e. keep it out of the loop). Again, keep the
8371
UTF-8 and UCP stuff separate. */
8375
pp = eptr; /* Remember where we started */
8380
for (i = min; i < max; i++)
8383
if (eptr >= md->end_subject) break;
8384
GETCHARLEN(c, eptr, len);
8385
prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8386
if ((*prop_test_variable == prop_test_against) == prop_fail_result)
8391
/* eptr is now past the end of the maximum run */
8395
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8396
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8397
if (eptr-- == pp) break; /* Stop if tried at original pos */
8402
/* Match extended Unicode sequences. We will get here only if the
8403
support is in the binary; otherwise a compile-time error occurs. */
8405
else if (ctype == OP_EXTUNI)
8407
for (i = min; i < max; i++)
8409
if (eptr >= md->end_subject) break;
8410
GETCHARINCTEST(c, eptr);
8411
prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8412
if (prop_category == ucp_M) break;
8413
while (eptr < md->end_subject)
8416
if (!md->utf8) c = *eptr; else
8418
GETCHARLEN(c, eptr, len);
8420
prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8421
if (prop_category != ucp_M) break;
8426
/* eptr is now past the end of the maximum run */
8430
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8431
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8432
if (eptr-- == pp) break; /* Stop if tried at original pos */
8433
for (;;) /* Move back over one extended */
8437
if (!md->utf8) c = *eptr; else
8439
GETCHARLEN(c, eptr, len);
8441
prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8442
if (prop_category != ucp_M) break;
8449
#endif /* SUPPORT_UCP */
8460
/* Special code is required for UTF8, but when the maximum is unlimited
8461
we don't need it, so we repeat the non-UTF8 code. This is probably
8462
worth it, because .* is quite a common idiom. */
8466
if ((ims & PCRE_DOTALL) == 0)
8468
for (i = min; i < max; i++)
8470
if (eptr >= md->end_subject || *eptr == NEWLINE) break;
8472
while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
8477
for (i = min; i < max; i++)
8480
while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
8485
/* Handle unlimited UTF-8 repeat */
8489
if ((ims & PCRE_DOTALL) == 0)
8491
for (i = min; i < max; i++)
8493
if (eptr >= md->end_subject || *eptr == NEWLINE) break;
8501
if (c > md->end_subject - eptr) c = md->end_subject - eptr;
8507
/* The byte case is the same as non-UTF8 */
8511
if (c > md->end_subject - eptr) c = md->end_subject - eptr;
8516
for (i = min; i < max; i++)
8519
if (eptr >= md->end_subject) break;
8520
GETCHARLEN(c, eptr, len);
8521
if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
8527
for (i = min; i < max; i++)
8530
if (eptr >= md->end_subject) break;
8531
GETCHARLEN(c, eptr, len);
8532
if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
8537
case OP_NOT_WHITESPACE:
8538
for (i = min; i < max; i++)
8541
if (eptr >= md->end_subject) break;
8542
GETCHARLEN(c, eptr, len);
8543
if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
8549
for (i = min; i < max; i++)
8552
if (eptr >= md->end_subject) break;
8553
GETCHARLEN(c, eptr, len);
8554
if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
8559
case OP_NOT_WORDCHAR:
8560
for (i = min; i < max; i++)
8563
if (eptr >= md->end_subject) break;
8564
GETCHARLEN(c, eptr, len);
8565
if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
8571
for (i = min; i < max; i++)
8574
if (eptr >= md->end_subject) break;
8575
GETCHARLEN(c, eptr, len);
8576
if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
8582
RRETURN(PCRE_ERROR_INTERNAL);
8585
/* eptr is now past the end of the maximum run */
8589
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8590
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8591
if (eptr-- == pp) break; /* Stop if tried at original pos */
8598
/* Not UTF-8 mode */
8603
if ((ims & PCRE_DOTALL) == 0)
8605
for (i = min; i < max; i++)
8607
if (eptr >= md->end_subject || *eptr == NEWLINE) break;
8612
/* For DOTALL case, fall through and treat as \C */
8616
if (c > md->end_subject - eptr) c = md->end_subject - eptr;
8621
for (i = min; i < max; i++)
8623
if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
8630
for (i = min; i < max; i++)
8632
if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
8638
case OP_NOT_WHITESPACE:
8639
for (i = min; i < max; i++)
8641
if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
8648
for (i = min; i < max; i++)
8650
if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
8656
case OP_NOT_WORDCHAR:
8657
for (i = min; i < max; i++)
8659
if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
8666
for (i = min; i < max; i++)
8668
if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
8675
RRETURN(PCRE_ERROR_INTERNAL);
8678
/* eptr is now past the end of the maximum run */
8682
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8684
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8688
/* Get here if we can't make it match with any permitted repetitions */
8690
RRETURN(MATCH_NOMATCH);
8692
/* Control never gets here */
8694
/* There's been some horrible disaster. Since all codes > OP_BRA are
8695
for capturing brackets, and there shouldn't be any gaps between 0 and
8696
OP_BRA, arrival here can only mean there is something seriously wrong
8697
in the code above or the OP_xxx definitions. */
8700
DPRINTF(("Unknown opcode %d\n", *ecode));
8701
RRETURN(PCRE_ERROR_UNKNOWN_NODE);
8704
/* Do not stick any code in here without much thought; it is assumed
8705
that "continue" in the code above comes out to here to repeat the main
8708
} /* End of main loop */
8709
/* Control never reaches here */
8713
/***************************************************************************
8714
****************************************************************************
8715
RECURSION IN THE match() FUNCTION
8717
Undefine all the macros that were defined above to handle this. */
8735
#undef new_recursive
8751
#undef save_capture_last
8761
/* These two are defined as macros in both cases */
8766
/***************************************************************************
8767
***************************************************************************/
8771
/*************************************************
8772
* Execute a Regular Expression *
8773
*************************************************/
8775
/* This function applies a compiled re to a subject string and picks out
8776
portions of the string if it matches. Two elements in the vector are set for
8777
each substring: the offsets to the start and end of the substring.
8780
argument_re points to the compiled expression
8781
extra_data points to extra data or is NULL
8782
subject points to the subject string
8783
length length of subject string (may contain binary zeros)
8784
start_offset where to start in the subject string
8786
offsets points to a vector of ints to be filled in with offsets
8787
offsetcount the number of elements in the vector
8789
Returns: > 0 => success; value is the number of elements filled in
8790
= 0 => success, but offsets is not big enough
8791
-1 => failed to match
8792
< -1 => some kind of unexpected problem
8796
pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
8797
const char *subject, int length, int start_offset, int options, int *offsets,
8800
int rc, resetcount, ocount;
8801
int first_byte = -1;
8804
unsigned long int ims = 0;
8805
BOOL using_temporary_offsets = FALSE;
8808
BOOL first_byte_caseless = FALSE;
8809
BOOL req_byte_caseless = FALSE;
8810
match_data match_block;
8811
const uschar *tables;
8812
const uschar *start_bits = NULL;
8813
const uschar *start_match = (const uschar *)subject + start_offset;
8814
const uschar *end_subject;
8815
const uschar *req_byte_ptr = start_match - 1;
8817
pcre_study_data internal_study;
8818
const pcre_study_data *study;
8820
real_pcre internal_re;
8821
const real_pcre *external_re = (const real_pcre *)argument_re;
8822
const real_pcre *re = external_re;
8824
/* Plausibility checks */
8826
if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
8827
if (re == NULL || subject == NULL ||
8828
(offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
8829
if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
8831
/* Fish out the optional data from the extra_data structure, first setting
8832
the default values. */
8835
match_block.match_limit = MATCH_LIMIT;
8836
match_block.callout_data = NULL;
8838
/* The table pointer is always in native byte order. */
8840
tables = external_re->tables;
8842
if (extra_data != NULL)
8844
register unsigned int flags = extra_data->flags;
8845
if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
8846
study = (const pcre_study_data *)extra_data->study_data;
8847
if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
8848
match_block.match_limit = extra_data->match_limit;
8849
if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
8850
match_block.callout_data = extra_data->callout_data;
8851
if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
8854
/* If the exec call supplied NULL for tables, use the inbuilt ones. This
8855
is a feature that makes it possible to save compiled regexes and re-use them
8856
in other programs later. */
8858
if (tables == NULL) tables = pcre_default_tables;
8860
/* Check that the first field in the block is the magic number. If it is not,
8861
test for a regex that was compiled on a host of opposite endianness. If this is
8862
the case, flipped values are put in internal_re and internal_study if there was
8865
if (re->magic_number != MAGIC_NUMBER)
8867
re = try_flipped(re, &internal_re, study, &internal_study);
8868
if (re == NULL) return PCRE_ERROR_BADMAGIC;
8869
if (study != NULL) study = &internal_study;
8872
/* Set up other data */
8874
anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
8875
startline = (re->options & PCRE_STARTLINE) != 0;
8877
/* The code starts after the real_pcre block and the capture name table. */
8879
match_block.start_code = (const uschar *)external_re + re->name_table_offset +
8880
re->name_count * re->name_entry_size;
8882
match_block.start_subject = (const uschar *)subject;
8883
match_block.start_offset = start_offset;
8884
match_block.end_subject = match_block.start_subject + length;
8885
end_subject = match_block.end_subject;
8887
match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
8888
match_block.utf8 = (re->options & PCRE_UTF8) != 0;
8890
match_block.notbol = (options & PCRE_NOTBOL) != 0;
8891
match_block.noteol = (options & PCRE_NOTEOL) != 0;
8892
match_block.notempty = (options & PCRE_NOTEMPTY) != 0;
8893
match_block.partial = (options & PCRE_PARTIAL) != 0;
8894
match_block.hitend = FALSE;
8896
match_block.recursive = NULL; /* No recursion at top level */
8898
match_block.lcc = tables + lcc_offset;
8899
match_block.ctypes = tables + ctypes_offset;
8901
/* Partial matching is supported only for a restricted set of regexes at the
8904
if (match_block.partial && (re->options & PCRE_NOPARTIAL) != 0)
8905
return PCRE_ERROR_BADPARTIAL;
8907
/* Check a UTF-8 string if required. Unfortunately there's no way of passing
8908
back the character offset. */
8911
if (match_block.utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
8913
if (valid_utf8((uschar *)subject, length) >= 0)
8914
return PCRE_ERROR_BADUTF8;
8915
if (start_offset > 0 && start_offset < length)
8917
int tb = ((uschar *)subject)[start_offset];
8921
if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
8927
/* The ims options can vary during the matching as a result of the presence
8928
of (?ims) items in the pattern. They are kept in a local variable so that
8929
restoring at the exit of a group is easy. */
8931
ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
8933
/* If the expression has got more back references than the offsets supplied can
8934
hold, we get a temporary chunk of working store to use during the matching.
8935
Otherwise, we can use the vector supplied, rounding down its size to a multiple
8938
ocount = offsetcount - (offsetcount % 3);
8940
if (re->top_backref > 0 && re->top_backref >= ocount/3)
8942
ocount = re->top_backref * 3 + 3;
8943
match_block.offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
8944
if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
8945
using_temporary_offsets = TRUE;
8946
DPRINTF(("Got memory to hold back references\n"));
8948
else match_block.offset_vector = offsets;
8950
match_block.offset_end = ocount;
8951
match_block.offset_max = (2*ocount)/3;
8952
match_block.offset_overflow = FALSE;
8953
match_block.capture_last = -1;
8955
/* Compute the minimum number of offsets that we need to reset each time. Doing
8956
this makes a huge difference to execution time when there aren't many brackets
8959
resetcount = 2 + re->top_bracket * 2;
8960
if (resetcount > offsetcount) resetcount = ocount;
8962
/* Reset the working variable associated with each extraction. These should
8963
never be used unless previously set, but they get saved and restored, and so we
8964
initialize them to avoid reading uninitialized locations. */
8966
if (match_block.offset_vector != NULL)
8968
register int *iptr = match_block.offset_vector + ocount;
8969
register int *iend = iptr - resetcount/2 + 1;
8970
while (--iptr >= iend) *iptr = -1;
8973
/* Set up the first character to match, if available. The first_byte value is
8974
never set for an anchored regular expression, but the anchoring may be forced
8975
at run time, so we have to test for anchoring. The first char may be unset for
8976
an unanchored pattern, of course. If there's no first char and the pattern was
8977
studied, there may be a bitmap of possible first characters. */
8981
if ((re->options & PCRE_FIRSTSET) != 0)
8983
first_byte = re->first_byte & 255;
8984
if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
8985
first_byte = match_block.lcc[first_byte];
8988
if (!startline && study != NULL &&
8989
(study->options & PCRE_STUDY_MAPPED) != 0)
8990
start_bits = study->start_bits;
8993
/* For anchored or unanchored matches, there may be a "last known required
8996
if ((re->options & PCRE_REQCHSET) != 0)
8998
req_byte = re->req_byte & 255;
8999
req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
9000
req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
9003
/* Loop for handling unanchored repeated matching attempts; for anchored regexes
9004
the loop runs just once. */
9008
/* Reset the maximum number of extractions we might see. */
9010
if (match_block.offset_vector != NULL)
9012
register int *iptr = match_block.offset_vector;
9013
register int *iend = iptr + resetcount;
9014
while (iptr < iend) *iptr++ = -1;
9017
/* Advance to a unique first char if possible */
9019
if (first_byte >= 0)
9021
if (first_byte_caseless)
9022
while (start_match < end_subject &&
9023
match_block.lcc[*start_match] != first_byte)
9026
while (start_match < end_subject && *start_match != first_byte)
9030
/* Or to just after \n for a multiline match if possible */
9034
if (start_match > match_block.start_subject + start_offset)
9036
while (start_match < end_subject && start_match[-1] != NEWLINE)
9041
/* Or to a non-unique first char after study */
9043
else if (start_bits != NULL)
9045
while (start_match < end_subject)
9047
register unsigned int c = *start_match;
9048
if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
9052
#ifdef DEBUG /* Sigh. Some compilers never learn. */
9053
printf(">>>> Match against: ");
9054
pchars(start_match, end_subject - start_match, TRUE, &match_block);
9058
/* If req_byte is set, we know that that character must appear in the subject
9059
for the match to succeed. If the first character is set, req_byte must be
9060
later in the subject; otherwise the test starts at the match point. This
9061
optimization can save a huge amount of backtracking in patterns with nested
9062
unlimited repeats that aren't going to match. Writing separate code for
9063
cased/caseless versions makes it go faster, as does using an autoincrement
9064
and backing off on a match.
9066
HOWEVER: when the subject string is very, very long, searching to its end can
9067
take a long time, and give bad performance on quite ordinary patterns. This
9068
showed up when somebody was matching /^C/ on a 32-megabyte string... so we
9069
don't do this when the string is sufficiently long.
9071
ALSO: this processing is disabled when partial matching is requested. */
9074
if (req_byte >= 0 &&
9075
end_subject - start_match < REQ_BYTE_MAX &&
9076
!match_block.partial)
9078
register const uschar *p = start_match + ((first_byte >= 0)? 1 : 0);
9080
/* We don't need to repeat the search if we haven't yet reached the
9081
place we found it at last time. */
9083
if (p > req_byte_ptr)
9085
if (req_byte_caseless)
9087
while (p < end_subject)
9089
register int pp = *p++;
9090
if (pp == req_byte || pp == req_byte2) { p--; break; }
9095
while (p < end_subject)
9097
if (*p++ == req_byte) { p--; break; }
9101
/* If we can't find the required character, break the matching loop */
9103
if (p >= end_subject) break;
9105
/* If we have found the required character, save the point where we
9106
found it, so that we don't search again next time round the loop if
9107
the start hasn't passed this character yet. */
9113
/* When a match occurs, substrings will be set for all internal extractions;
9114
we just need to set up the whole thing as substring 0 before returning. If
9115
there were too many extractions, set the return code to zero. In the case
9116
where we had to get some local store to hold offsets for backreferences, copy
9117
those back references that we can. In this case there need not be overflow
9118
if certain parts of the pattern were not used. */
9120
match_block.start_match = start_match;
9121
match_block.match_call_count = 0;
9123
rc = match(start_match, match_block.start_code, 2, &match_block, ims, NULL,
9126
if (rc == MATCH_NOMATCH)
9130
if (match_block.utf8)
9131
while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
9137
if (rc != MATCH_MATCH)
9139
DPRINTF((">>>> error: returning %d\n", rc));
9143
/* We have a match! Copy the offset information from temporary store if
necessary */
9146
if (using_temporary_offsets)
9148
if (offsetcount >= 4)
9150
memcpy(offsets + 2, match_block.offset_vector + 2,
9151
(offsetcount - 2) * sizeof(int));
9152
DPRINTF(("Copied offsets from temporary memory\n"));
9154
if (match_block.end_offset_top > offsetcount)
9155
match_block.offset_overflow = TRUE;
9157
DPRINTF(("Freeing temporary memory\n"));
9158
(pcre_free)(match_block.offset_vector);
9161
rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
9163
if (offsetcount < 2) rc = 0; else
9165
offsets[0] = start_match - match_block.start_subject;
9166
offsets[1] = match_block.end_match_ptr - match_block.start_subject;
9169
DPRINTF((">>>> returning %d\n", rc));
9173
/* This "while" is the end of the "do" above */
9175
while (!anchored && start_match <= end_subject);
9177
if (using_temporary_offsets)
9179
DPRINTF(("Freeing temporary memory\n"));
9180
(pcre_free)(match_block.offset_vector);
9183
if (match_block.partial && match_block.hitend)
9185
DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
9186
return PCRE_ERROR_PARTIAL;
9190
DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
9191
return PCRE_ERROR_NOMATCH;