1
/*************************************************
2
* Perl-Compatible Regular Expressions *
3
*************************************************/
6
/* PCRE is a library of functions to support regular expressions whose syntax
7
and semantics are as close as possible to those of the Perl 5 language.
9
Written by Philip Hazel
10
Copyright (c) 1997-2005 University of Cambridge
11
Copyright (c) 2004, 2005 Apple Computer, Inc.
13
-----------------------------------------------------------------------------
14
Redistribution and use in source and binary forms, with or without
15
modification, are permitted provided that the following conditions are met:
17
* Redistributions of source code must retain the above copyright notice,
18
this list of conditions and the following disclaimer.
20
* Redistributions in binary form must reproduce the above copyright
21
notice, this list of conditions and the following disclaimer in the
22
documentation and/or other materials provided with the distribution.
24
* Neither the name of the University of Cambridge nor the names of its
25
contributors may be used to endorse or promote products derived from
26
this software without specific prior written permission.
28
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
29
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
32
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
33
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
34
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
35
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
36
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38
POSSIBILITY OF SUCH DAMAGE.
39
-----------------------------------------------------------------------------
42
/* This header contains definitions that are shared between the different
43
modules, but which are not relevant to the exported API. This includes some
44
functions whose names all begin with "_pcre_". */
46
#ifndef PCRE_INTERNAL_H
47
#define PCRE_INTERNAL_H
49
#include "Assertions.h"
51
/* Added to prevent 64-to-32 shortening warnings when compiling for 64-bit
52
<rdar://problem/4712064> PCRE generates many warnings with -Wshorten-64-to-32 */
53
#if defined(__GNUC__) && defined(__LP64__)
54
/* NOTE(review): this variant expands to TWO statements: the cast expression,
then an ASSERT that names (i) a second time. It is therefore only usable as the
tail of a simple expression statement (e.g. "x = INT_CAST(y);"), where the
caller's ';' terminates the ASSERT. Under an unbraced if/else only the cast is
guarded, and an argument with side effects may be evaluated twice when ASSERT
is active. */
#define INT_CAST(i) (int)(i); ASSERT((i) <= INT_MAX)
56
#define INT_CAST(i) (i)
60
#pragma warning(disable: 4232)
61
#pragma warning(disable: 4244)
64
#define _pcre_OP_lengths kjs_pcre_OP_lengths
65
#define _pcre_default_tables kjs_pcre_default_tables
66
#define _pcre_ord2utf8 kjs_pcre_ord2utf8
67
#define _pcre_printint kjs_pcre_printint
68
#define _pcre_try_flipped kjs_pcre_try_flipped
69
#define _pcre_ucp_findchar kjs_pcre_ucp_findchar
70
#define _pcre_utf8_table1 kjs_pcre_utf8_table1
71
#define _pcre_utf8_table1_size kjs_pcre_utf8_table1_size
72
#define _pcre_utf8_table2 kjs_pcre_utf8_table2
73
#define _pcre_utf8_table3 kjs_pcre_utf8_table3
74
#define _pcre_utf8_table4 kjs_pcre_utf8_table4
75
#define _pcre_utt kjs_pcre_utt
76
#define _pcre_utt_size kjs_pcre_utt_size
77
#define _pcre_valid_utf8 kjs_pcre_valid_utf8
78
#define _pcre_xclass kjs_pcre_xclass
80
/* Define DEBUG to get debugging output on stdout. */
86
/* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef
87
inline, and there are *still* stupid compilers about that don't like indented
88
pre-processor statements, or at least there were when I first wrote this. After
89
all, it had only been about 10 years then... */
92
#define DPRINTF(p) printf p
94
#define DPRINTF(p) /*nothing*/
98
/* Get the definitions provided by running "configure" */
100
#include "pcre-config.h"
102
/* Standard C headers plus the external interface definition. The only time
103
setjmp and stdarg are used is when NO_RECURSE is set. */
115
#define PCRE_DEFINITION /* Win32 __declspec(export) trigger for .dll */
118
/* We need to have types that specify unsigned 16-bit and 32-bit integers. We
119
cannot determine these outside the compilation (e.g. by running a program as
120
part of "configure") because PCRE is often cross-compiled for use on other
121
systems. Instead we make use of the maximum sizes that are available at
122
preprocessor time in standard C environments. */
124
#if USHRT_MAX == 65535
125
typedef unsigned short pcre_uint16;
126
#elif UINT_MAX == 65535
127
typedef unsigned int pcre_uint16;
129
#error Cannot determine a type for 16-bit unsigned integers
132
#if UINT_MAX == 4294967295
133
typedef unsigned int pcre_uint32;
134
#elif ULONG_MAX == 4294967295
135
typedef unsigned long int pcre_uint32;
137
#error Cannot determine a type for 32-bit unsigned integers
140
/* Include the public PCRE header and the definitions of UCP character property
146
/* All character handling must be done as unsigned characters. Otherwise there
147
are problems with top-bit-set characters and functions such as isspace().
148
However, we leave the interface to the outside world as char *, because that
149
should make things easier for callers. We define a short type for unsigned char
150
to save lots of typing. I tried "uchar", but it causes problems on Digital
151
Unix, where it is defined in sys/types, so use "uschar" instead. */
153
typedef unsigned char uschar;
155
/* Use pcre_uchar for always-unsigned version of pcre_char. */
157
typedef pcre_char pcre_uchar;
159
typedef unsigned char pcre_uchar;
162
/* When compiling for use with the Virtual Pascal compiler, these functions
163
need to have their names changed. PCRE must be compiled with the -DVPCOMPAT
164
option on the command line. */
167
#define strncmp(s1,s2,m) _strncmp(s1,s2,m)
168
#define memcpy(d,s,n) _memcpy(d,s,n)
169
#define memmove(d,s,n) _memmove(d,s,n)
170
#define memset(s,c,n) _memset(s,c,n)
173
/* To cope with SunOS4 and other systems that lack memmove() but have bcopy(),
174
define a macro for memmove() if HAVE_MEMMOVE is false, provided that HAVE_BCOPY
175
is set. Otherwise, include an emulating function for those systems that have
176
neither (there are some non-Unix environments where this is the case). This assumes
177
that all calls to memmove are moving strings upwards in store, which is the
181
#undef memmove /* some systems may have a macro */
183
#define memmove(a, b, c) bcopy(b, a, c)
184
#else /* HAVE_BCOPY */
186
pcre_memmove(unsigned char *dest, const unsigned char *src, size_t n)
191
for (i = 0; i < n; ++i) *(--dest) = *(--src);
193
#define memmove(a, b, c) pcre_memmove(a, b, c)
194
#endif /* not HAVE_BCOPY */
195
#endif /* not HAVE_MEMMOVE */
196
#endif /* not VPCOMPAT */
199
/* PCRE keeps offsets in its compiled code as 2-byte quantities (always stored
200
in big-endian order) by default. These are used, for example, to link from the
201
start of a subpattern to its alternatives and its end. The use of 2 bytes per
202
offset limits the size of the compiled regex to around 64K, which is big enough
203
for almost everybody. However, I received a request for an even bigger limit.
204
For this reason, and also to make the code easier to maintain, the storing and
205
loading of offsets from the byte string is now handled by the macros that are
208
The macros are controlled by the value of LINK_SIZE. This defaults to 2 in
209
the config.h file, but can be overridden by using -D on the command line. This
210
is automated on Unix systems via the "configure" command. */
216
(a[(n)+1] = (d) & 255)
219
(((a)[n] << 8) | (a)[(n)+1])
221
#define MAX_PATTERN_SIZE (1 << 16)
227
(a[n] = (d) >> 16), \
228
(a[(n)+1] = (d) >> 8), \
229
(a[(n)+2] = (d) & 255)
232
(((a)[n] << 16) | ((a)[(n)+1] << 8) | (a)[(n)+2])
234
#define MAX_PATTERN_SIZE (1 << 24)
240
(a[n] = (d) >> 24), \
241
(a[(n)+1] = (d) >> 16), \
242
(a[(n)+2] = (d) >> 8), \
243
(a[(n)+3] = (d) & 255)
246
(((a)[n] << 24) | ((a)[(n)+1] << 16) | ((a)[(n)+2] << 8) | (a)[(n)+3])
248
#define MAX_PATTERN_SIZE (1 << 30) /* Keep it positive */
252
#error LINK_SIZE must be either 2, 3, or 4
256
/* Convenience macro defined in terms of the others */
258
/* Store the link value d at a[n] using PUT, then advance the pointer a by
LINK_SIZE. The expansion is wrapped in parentheses (and a is parenthesized) so
that it remains one expression in any context: initializers, function
arguments, or alongside other operators. */
#define PUTINC(a,n,d) (PUT(a,n,d), (a) += LINK_SIZE)
261
/* PCRE uses some other 2-byte quantities that do not change when the size of
262
offsets changes. These are used for repeat counts and for other things such as
263
capturing parenthesis numbers in back references. */
265
#define PUT2(a,n,d) \
270
(((a)[n] << 8) | (a)[(n)+1])
272
/* Store the 2-byte value d at a[n] using PUT2, then advance the pointer a by
two bytes. The expansion is wrapped in parentheses (and a is parenthesized) so
that it remains one expression in any context. */
#define PUT2INC(a,n,d) (PUT2(a,n,d), (a) += 2)
275
/* When UTF-8 encoding is being used, a character is no longer just a single
276
byte. The macros for character handling generate simple sequences when used in
277
byte-mode, and more complicated ones for UTF-8 characters. */
280
#define GETCHAR(c, eptr) c = *eptr;
281
#define GETCHARTEST(c, eptr) c = *eptr;
282
#define GETCHARINC(c, eptr) c = *eptr++;
283
#define GETCHARINCTEST(c, eptr) c = *eptr++;
284
#define GETCHARLEN(c, eptr, len) c = *eptr;
285
#define BACKCHAR(eptr)
287
#else /* SUPPORT_UTF8 */
289
/* Get the next UTF-8 character, not advancing the pointer, incrementing length
290
if there are extra bytes. This is called when we know we are in UTF-8 mode. */
292
#define GETUTF8CHARLEN(c, eptr, len) \
294
if ((c & 0xc0) == 0xc0) \
297
int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
299
c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
300
for (gcii = 1; gcii <= gcaa; gcii++) \
303
c |= (eptr[gcii] & 0x3f) << gcss; \
308
/* Get the next UTF-8 character, advancing the pointer. This is called when we
309
know we are in UTF-8 mode. */
311
#define GETUTF8CHARINC(c, eptr) \
313
if ((c & 0xc0) == 0xc0) \
315
int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
317
c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
321
c |= (*eptr++ & 0x3f) << gcss; \
327
#define LEAD_OFFSET (0xd800 - (0x10000 >> 10))
328
#define SURROGATE_OFFSET (0x10000 - (0xd800 << 10) - 0xdc00)
330
/* True when c, with its low ten bits masked off, equals 0xd800, i.e. when c
is a UTF-16 code unit in the range 0xd800..0xdbff. The argument is evaluated
once. */
#define IS_LEADING_SURROGATE(c) (0xd800 == ((c) & ~0x3ff))
331
/* True when c, with its low ten bits masked off, equals 0xdc00, i.e. when c
is a UTF-16 code unit in the range 0xdc00..0xdfff. The argument is evaluated
once. */
#define IS_TRAILING_SURROGATE(c) (0xdc00 == ((c) & ~0x3ff))
333
#define DECODE_SURROGATE_PAIR(l, t) (((l) << 10) + (t) + SURROGATE_OFFSET)
334
#define LEADING_SURROGATE(c) (LEAD_OFFSET + ((c) >> 10))
335
/* Build a trailing surrogate from the low ten bits of c. Those bits never
overlap the set bits of 0xdc00, so OR-ing them in gives the same value as
adding them. */
#define TRAILING_SURROGATE(c) (((c) & 0x3ff) | 0xdc00)
337
#define GETCHAR(c, eptr) \
339
if (IS_LEADING_SURROGATE(c)) \
340
c = DECODE_SURROGATE_PAIR(c, eptr[1])
342
#define GETCHARTEST(c, eptr) GETCHAR(c, eptr)
344
#define GETCHARINC(c, eptr) \
346
if (IS_LEADING_SURROGATE(c)) \
347
c = DECODE_SURROGATE_PAIR(c, *eptr++)
349
#define GETCHARINCTEST(c, eptr) GETCHARINC(c, eptr)
351
#define GETCHARLEN(c, eptr, len) \
353
if (IS_LEADING_SURROGATE(c)) \
355
c = DECODE_SURROGATE_PAIR(c, eptr[1]); \
359
#define GETCHARLENEND(c, eptr, end, len) \
361
if (IS_LEADING_SURROGATE(c)) \
363
c = DECODE_SURROGATE_PAIR(c, eptr + 1 < end ? eptr[1] : 0); \
367
#define ISMIDCHAR(c) IS_TRAILING_SURROGATE(c)
371
/* Get the next UTF-8 character, not advancing the pointer. This is called when
372
we know we are in UTF-8 mode. */
374
#define GETCHAR(c, eptr) \
376
if ((c & 0xc0) == 0xc0) \
379
int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
381
c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
382
for (gcii = 1; gcii <= gcaa; gcii++) \
385
c |= (eptr[gcii] & 0x3f) << gcss; \
389
/* Get the next UTF-8 character, testing for UTF-8 mode, and not advancing the
392
#define GETCHARTEST(c, eptr) \
394
if (utf8 && (c & 0xc0) == 0xc0) \
397
int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
399
c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
400
for (gcii = 1; gcii <= gcaa; gcii++) \
403
c |= (eptr[gcii] & 0x3f) << gcss; \
407
/* Get the next UTF-8 character, advancing the pointer. This is called when we
408
know we are in UTF-8 mode. */
410
#define GETCHARINC GETUTF8CHARINC
412
/* Get the next character, testing for UTF-8 mode, and advancing the pointer */
414
#define GETCHARINCTEST(c, eptr) \
416
if (utf8 && (c & 0xc0) == 0xc0) \
418
int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
420
c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
424
c |= (*eptr++ & 0x3f) << gcss; \
428
/* Get the next UTF-8 character, not advancing the pointer, and add any extra
bytes to len. Takes the same three arguments as GETUTF8CHARLEN, to which it
forwards, and as the other GETCHARLEN variants in this header. */
#define GETCHARLEN(c, eptr, len) GETUTF8CHARLEN(c, eptr, len)
430
/* Return 1 if not the start of a character. */
432
/* True when the top two bits of the byte c are 10, i.e. c is a UTF-8
continuation byte rather than the first byte of a character. */
#define ISMIDCHAR(c) (0x80 == ((c) & 0xc0))
436
/* If the pointer is not at the start of a character, move it back until
437
it is. Called only in UTF-8 mode. */
439
/* NOTE(review): the expansion is a bare while loop that already ends in ';',
so writing "BACKCHAR(p);" produces an additional empty statement; take care
when it is the body of an unbraced if/else. The argument is substituted
unparenthesized and is evaluated on every loop iteration. */
#define BACKCHAR(eptr) while(ISMIDCHAR(*eptr)) eptr--;
444
/* In case there is no definition of offsetof() provided - though any proper
445
Standard C system should have one. */
448
/* Fallback definition: yields the byte offset of field within p_type by taking
the address of that member through a null p_type pointer and converting the
result to size_t. */
#define offsetof(p_type,field) ((size_t)&(((p_type *)0)->field))
452
/* These are the public options that can change during matching. */
454
#define PCRE_IMS (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL)
456
/* Private options flags start at the most significant end of the four bytes,
457
but skip the top bit so we can use ints for convenience without getting tangled
458
with negative values. The public options defined in pcre.h start at the least
459
significant end. Make sure they don't overlap! */
461
#define PCRE_FIRSTSET 0x40000000 /* first_byte is set */
462
#define PCRE_REQCHSET 0x20000000 /* req_byte is set */
463
#define PCRE_STARTLINE 0x10000000 /* start after \n for multiline */
464
#define PCRE_ICHANGED 0x08000000 /* i option changes within regex */
465
#define PCRE_NOPARTIAL 0x04000000 /* can't use partial with this regex */
467
/* Options for the "extra" block produced by pcre_study(). */
469
#define PCRE_STUDY_MAPPED 0x01 /* a map of starting chars exists */
471
/* Masks for identifying the public options that are permitted at compile
472
time, run time, or study time, respectively. */
474
#define PUBLIC_OPTIONS \
475
(PCRE_CASELESS|PCRE_EXTENDED|PCRE_ANCHORED|PCRE_MULTILINE| \
476
PCRE_DOTALL|PCRE_DOLLAR_ENDONLY|PCRE_EXTRA|PCRE_UNGREEDY|PCRE_UTF8| \
477
PCRE_NO_AUTO_CAPTURE|PCRE_NO_UTF8_CHECK|PCRE_AUTO_CALLOUT|PCRE_FIRSTLINE)
479
#define PUBLIC_EXEC_OPTIONS \
480
(PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NO_UTF8_CHECK| \
483
#define PUBLIC_DFA_EXEC_OPTIONS \
484
(PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NO_UTF8_CHECK| \
485
PCRE_PARTIAL|PCRE_DFA_SHORTEST|PCRE_DFA_RESTART)
487
#define PUBLIC_STUDY_OPTIONS 0 /* None defined */
489
/* Magic number to provide a small check against being handed junk. Also used
490
to detect whether a pattern was compiled on a host of different endianness. */
492
#define MAGIC_NUMBER 0x50435245UL /* 'PCRE' */
494
/* Negative values for the firstchar and reqchar variables */
496
#define REQ_UNSET (-2)
497
#define REQ_NONE (-1)
499
/* The maximum remaining length of subject we are prepared to search for a
502
#define REQ_BYTE_MAX 1000
504
/* Flags added to firstbyte or reqbyte; a "non-literal" item is either a
505
variable-length repeat, or anything other than literal characters. */
507
#define REQ_CASELESS 0x0100 /* indicates caselessness */
508
#define REQ_VARY 0x0200 /* reqbyte followed non-literal item */
510
/* Miscellaneous definitions */
517
/* Escape items that are just an encoding of a particular data value. Note that
518
ESC_n is defined as yet another macro, which is set in config.h to either \n
519
(the default) or \r (which some people want). */
530
#define ESC_n NEWLINE
537
/* We can't officially use ESC_t because it is a POSIX reserved identifier
538
(presumably because of all the others like size_t). */
548
/* These are escaped items that aren't just an encoding of a particular data
549
value such as \n. They must have non-zero values, as check_escape() returns
550
their negation. Also, they must appear in the same order as in the opcode
551
definitions below, up to ESC_z. There's a dummy for OP_ANY because it
552
corresponds to "." rather than an escape sequence. The final one must be
553
ESC_REF as subsequent values are used for \1, \2, \3, etc. There are two
554
tests in the code for an escape greater than ESC_b and less than ESC_Z to
555
detect the types that may be repeated. These are the types that consume
556
characters. If any new escapes are put in between that don't consume a
557
character, that code will have to change. */
559
enum { ESC_A = 1, ESC_G, ESC_B, ESC_b, ESC_D, ESC_d, ESC_S, ESC_s, ESC_W,
560
ESC_w, ESC_dum1, ESC_C, ESC_P, ESC_p, ESC_X, ESC_Z, ESC_z, ESC_E,
563
/* Flag bits and data types for the extended class (OP_XCLASS) for classes that
564
contain UTF-8 characters with values greater than 255. */
566
#define XCL_NOT 0x01 /* Flag: this is a negative class */
567
#define XCL_MAP 0x02 /* Flag: a 32-byte map is present */
569
#define XCL_END 0 /* Marks end of individual items */
570
#define XCL_SINGLE 1 /* Single item (one multibyte char) follows */
571
#define XCL_RANGE 2 /* A range (two multibyte chars) follows */
572
#define XCL_PROP 3 /* Unicode property (one property code) follows */
573
#define XCL_NOTPROP 4 /* Unicode inverted property (ditto) */
576
/* Opcode table: OP_BRA must be last, as all values >= it are used for brackets
577
that extract substrings. Starting from 1 (i.e. after OP_END), the values up to
578
OP_EOD must correspond in order to the list of escapes immediately above.
579
Note that whenever this list is updated, the two macro definitions that follow
580
must also be updated to match. */
583
OP_END, /* 0 End of pattern */
585
/* Values corresponding to backslashed metacharacters */
587
OP_SOD, /* 1 Start of data: \A */
588
OP_SOM, /* 2 Start of match (subject + offset): \G */
589
OP_NOT_WORD_BOUNDARY, /* 3 \B */
590
OP_WORD_BOUNDARY, /* 4 \b */
591
OP_NOT_DIGIT, /* 5 \D */
593
OP_NOT_WHITESPACE, /* 7 \S */
594
OP_WHITESPACE, /* 8 \s */
595
OP_NOT_WORDCHAR, /* 9 \W */
596
OP_WORDCHAR, /* 10 \w */
597
OP_ANY, /* 11 Match any character */
598
OP_ANYBYTE, /* 12 Match any byte (\C); different to OP_ANY for UTF-8 */
599
OP_NOTPROP, /* 13 \P (not Unicode property) */
600
OP_PROP, /* 14 \p (Unicode property) */
601
OP_EXTUNI, /* 15 \X (extended Unicode sequence) */
602
OP_EODN, /* 16 End of data or \n at end of data: \Z. */
603
OP_EOD, /* 17 End of data: \z */
605
OP_OPT, /* 18 Set runtime options */
606
OP_CIRC, /* 19 Start of line - varies with multiline switch */
607
OP_DOLL, /* 20 End of line - varies with multiline switch */
608
OP_CHAR, /* 21 Match one character, casefully */
609
OP_CHARNC, /* 22 Match one character, caselessly */
610
OP_NOT, /* 23 Match anything but the following char */
612
OP_STAR, /* 24 The maximizing and minimizing versions of */
613
OP_MINSTAR, /* 25 all these opcodes must come in pairs, with */
614
OP_PLUS, /* 26 the minimizing one second. */
615
OP_MINPLUS, /* 27 This first set applies to single characters */
617
OP_MINQUERY, /* 29 */
618
OP_UPTO, /* 30 From 0 to n matches */
620
OP_EXACT, /* 32 Exactly n matches */
622
OP_NOTSTAR, /* 33 The maximizing and minimizing versions of */
623
OP_NOTMINSTAR, /* 34 all these opcodes must come in pairs, with */
624
OP_NOTPLUS, /* 35 the minimizing one second. */
625
OP_NOTMINPLUS, /* 36 This set applies to "not" single characters */
626
OP_NOTQUERY, /* 37 */
627
OP_NOTMINQUERY, /* 38 */
628
OP_NOTUPTO, /* 39 From 0 to n matches */
629
OP_NOTMINUPTO, /* 40 */
630
OP_NOTEXACT, /* 41 Exactly n matches */
632
OP_TYPESTAR, /* 42 The maximizing and minimizing versions of */
633
OP_TYPEMINSTAR, /* 43 all these opcodes must come in pairs, with */
634
OP_TYPEPLUS, /* 44 the minimizing one second. These codes must */
635
OP_TYPEMINPLUS, /* 45 be in exactly the same order as those above. */
636
OP_TYPEQUERY, /* 46 This set applies to character types such as \d */
637
OP_TYPEMINQUERY, /* 47 */
638
OP_TYPEUPTO, /* 48 From 0 to n matches */
639
OP_TYPEMINUPTO, /* 49 */
640
OP_TYPEEXACT, /* 50 Exactly n matches */
642
OP_CRSTAR, /* 51 The maximizing and minimizing versions of */
643
OP_CRMINSTAR, /* 52 all these opcodes must come in pairs, with */
644
OP_CRPLUS, /* 53 the minimizing one second. These codes must */
645
OP_CRMINPLUS, /* 54 be in exactly the same order as those above. */
646
OP_CRQUERY, /* 55 These are for character classes and back refs */
647
OP_CRMINQUERY, /* 56 */
648
OP_CRRANGE, /* 57 These are different to the three sets above. */
649
OP_CRMINRANGE, /* 58 */
651
OP_CLASS, /* 59 Match a character class, chars < 256 only */
652
OP_NCLASS, /* 60 Same, but the bitmap was created from a negative
653
class - the difference is relevant only when a UTF-8
654
character > 255 is encountered. */
656
OP_XCLASS, /* 61 Extended class for handling UTF-8 chars within the
657
class. This does both positive and negative. */
659
OP_REF, /* 62 Match a back reference */
660
OP_RECURSE, /* 63 Match a numbered subpattern (possibly recursive) */
661
OP_CALLOUT, /* 64 Call out to external function if provided */
663
OP_ALT, /* 65 Start of alternation */
664
OP_KET, /* 66 End of group that doesn't have an unbounded repeat */
665
OP_KETRMAX, /* 67 These two must remain together and in this */
666
OP_KETRMIN, /* 68 order. They are for groups that repeat for ever. */
668
/* The assertions must come before ONCE and COND */
670
OP_ASSERT, /* 69 Positive lookahead */
671
OP_ASSERT_NOT, /* 70 Negative lookahead */
672
OP_ASSERTBACK, /* 71 Positive lookbehind */
673
OP_ASSERTBACK_NOT, /* 72 Negative lookbehind */
674
OP_REVERSE, /* 73 Move pointer back - used in lookbehind assertions */
676
/* ONCE and COND must come after the assertions, with ONCE first, as there's
677
a test for >= ONCE for a subpattern that isn't an assertion. */
679
OP_ONCE, /* 74 Once matched, don't back up into the subpattern */
680
OP_COND, /* 75 Conditional group */
681
OP_CREF, /* 76 Used to hold an extraction string number (cond ref) */
683
OP_BRAZERO, /* 77 These two must remain together and in this */
684
OP_BRAMINZERO, /* 78 order. */
686
OP_BRANUMBER, /* 79 Used for extracting brackets whose number is greater
687
than can fit into an opcode. */
689
OP_BRA /* 80 This and greater values are used for brackets that
690
extract substrings up to EXTRACT_BASIC_MAX. After
691
that, use is made of OP_BRANUMBER. */
694
/* WARNING WARNING WARNING: There is an implicit assumption in pcre.c and
695
study.c that all opcodes are less than 128 in value. This makes handling UTF-8
696
character sequences easier. */
698
/* The highest extraction number before we have to start using additional
699
bytes. (Originally PCRE didn't have support for extraction counts higher than
700
this number.) The value is limited by the number of opcodes left after OP_BRA,
701
i.e. 255 - OP_BRA. We actually set it a bit lower to leave room for additional
704
#define EXTRACT_BASIC_MAX 100
707
/* This macro defines textual names for all the opcodes. These are used only
708
for debugging. The macro is referenced only in pcre_printint.c. */
710
#define OP_NAME_LIST \
711
"End", "\\A", "\\G", "\\B", "\\b", "\\D", "\\d", \
712
"\\S", "\\s", "\\W", "\\w", "Any", "Anybyte", \
713
"notprop", "prop", "extuni", \
715
"Opt", "^", "$", "char", "charnc", "not", \
716
"*", "*?", "+", "+?", "?", "??", "{", "{", "{", \
717
"*", "*?", "+", "+?", "?", "??", "{", "{", "{", \
718
"*", "*?", "+", "+?", "?", "??", "{", "{", "{", \
719
"*", "*?", "+", "+?", "?", "??", "{", "{", \
720
"class", "nclass", "xclass", "Ref", "Recurse", "Callout", \
721
"Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not", \
722
"AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cond ref",\
723
"Brazero", "Braminzero", "Branumber", "Bra"
726
/* This macro defines the length of fixed length operations in the compiled
727
regex. The lengths are used when searching for specific things, and also in the
728
debugging printing of a compiled regex. We use a macro so that it can be
729
defined close to the definitions of the opcodes themselves.
731
As things have been extended, some of these are no longer fixed lengths, but are
732
minima instead. For example, the length of a single-character repeat may vary
733
in UTF-8 mode. The code that uses this table must know about such things. */
737
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* \A, \G, \B, \b, \D, \d, \S, \s, \W, \w */ \
738
1, 1, /* Any, Anybyte */ \
739
2, 2, 1, /* NOTPROP, PROP, EXTUNI */ \
740
1, 1, 2, 1, 1, /* \Z, \z, Opt, ^, $ */ \
741
2, /* Char - the minimum length */ \
742
2, /* Charnc - the minimum length */ \
744
/* Positive single-char repeats ** These are */ \
745
2, 2, 2, 2, 2, 2, /* *, *?, +, +?, ?, ?? ** minima in */ \
746
4, 4, 4, /* upto, minupto, exact ** UTF-8 mode */ \
747
/* Negative single-char repeats - only for chars < 256 */ \
748
2, 2, 2, 2, 2, 2, /* NOT *, *?, +, +?, ?, ?? */ \
749
4, 4, 4, /* NOT upto, minupto, exact */ \
750
/* Positive type repeats */ \
751
2, 2, 2, 2, 2, 2, /* Type *, *?, +, +?, ?, ?? */ \
752
4, 4, 4, /* Type upto, minupto, exact */ \
753
/* Character class & ref repeats */ \
754
1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */ \
755
5, 5, /* CRRANGE, CRMINRANGE */ \
758
0, /* XCLASS - variable length */ \
760
1+LINK_SIZE, /* RECURSE */ \
761
2+2*LINK_SIZE, /* CALLOUT */ \
762
1+LINK_SIZE, /* Alt */ \
763
1+LINK_SIZE, /* Ket */ \
764
1+LINK_SIZE, /* KetRmax */ \
765
1+LINK_SIZE, /* KetRmin */ \
766
1+LINK_SIZE, /* Assert */ \
767
1+LINK_SIZE, /* Assert not */ \
768
1+LINK_SIZE, /* Assert behind */ \
769
1+LINK_SIZE, /* Assert behind not */ \
770
1+LINK_SIZE, /* Reverse */ \
771
1+LINK_SIZE, /* Once */ \
772
1+LINK_SIZE, /* COND */ \
774
1, 1, /* BRAZERO, BRAMINZERO */ \
776
1+LINK_SIZE /* BRA */ \
779
/* A magic value for OP_CREF to indicate the "in recursion" condition. */
781
#define CREF_RECURSE 0xffff
783
/* Error code numbers. They are given names so that they can more easily be
786
enum { ERR0, ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9,
787
ERR10, ERR11, ERR12, ERR13, ERR14, ERR15, ERR16, ERR17, ERR18, ERR19,
788
ERR20, ERR21, ERR22, ERR23, ERR24, ERR25, ERR26, ERR27, ERR28, ERR29,
789
ERR30, ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39,
790
ERR40, ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47 };
792
/* The real format of the start of the pcre block; the index of names and the
793
code vector run on as long as necessary after the end. We store an explicit
794
offset to the name table so that if a regex is compiled on one host, saved, and
795
then run on another where the size of pointers is different, all might still
796
be well. For the case of compiled-on-4 and run-on-8, we include an extra
797
pointer that is always NULL. For future-proofing, a few dummy fields were
798
originally included - even though you can never get this planning right - but
799
there is only one left now.
802
Because people can now save and re-use compiled patterns, any additions to this
803
structure should be made at the end, and something earlier (e.g. a new
804
flag in the options or one of the dummy fields) should indicate that the new
805
fields are present. Currently PCRE always sets the dummy fields to zero.
809
typedef struct real_pcre {
810
pcre_uint32 magic_number;
811
pcre_uint32 size; /* Total that was malloced */
813
pcre_uint32 dummy1; /* For future use, maybe */
815
pcre_uint16 top_bracket;
816
pcre_uint16 top_backref;
817
pcre_uint16 first_byte;
818
pcre_uint16 req_byte;
819
pcre_uint16 name_table_offset; /* Offset to name table that follows */
820
pcre_uint16 name_entry_size; /* Size of any name items */
821
pcre_uint16 name_count; /* Number of name items */
822
pcre_uint16 ref_count; /* Reference count */
824
const unsigned char *tables; /* Pointer to tables or NULL for std */
825
const unsigned char *nullpad; /* NULL padding */
828
/* The format of the block used to store data from pcre_study(). The same
829
remark (see NOTE above) about extending this structure applies. */
831
typedef struct pcre_study_data {
832
pcre_uint32 size; /* Total that was malloced */
834
uschar start_bits[32];
837
/* Structure for passing "static" information around between the functions
838
doing the compiling, so that they are thread-safe. */
840
typedef struct compile_data {
841
const uschar *lcc; /* Points to lower casing table */
842
const uschar *fcc; /* Points to case-flipping table */
843
const uschar *cbits; /* Points to character type table */
844
const uschar *ctypes; /* Points to table of type maps */
845
const uschar *start_code; /* The start of the compiled code */
846
const pcre_uchar *start_pattern; /* The start of the pattern */
847
uschar *name_table; /* The name/number table */
848
int names_found; /* Number of entries so far */
849
int name_entry_size; /* Size of each entry */
850
int top_backref; /* Maximum back reference */
851
unsigned int backref_map; /* Bitmap of low back refs */
852
int req_varyopt; /* "After variable item" flag for reqbyte */
853
BOOL nopartial; /* Set TRUE if partial won't work */
856
/* Structure for maintaining a chain of pointers to the currently incomplete
857
branches, for testing for left recursion. */
859
typedef struct branch_chain {
860
struct branch_chain *outer;
864
/* Structure for items in a linked list that represents an explicit recursive
865
call within the pattern. */
867
typedef struct recursion_info {
868
struct recursion_info *prevrec; /* Previous recursion record (or NULL) */
869
int group_num; /* Number of group that was called */
870
const uschar *after_call; /* "Return value": points after the call in the expr */
871
const pcre_uchar *save_start; /* Old value of md->start_match */
872
int *offset_save; /* Pointer to start of saved offsets */
873
int saved_max; /* Number of saved offsets */
876
/* When compiling in a mode that doesn't use recursive calls to match(),
877
a structure is used to remember local variables on the heap. It is defined in
878
pcre.c, close to the match() function, so that it is easy to keep it in step
879
with any changes of local variable. However, the pointer to the current frame
880
must be saved in some "static" place over a longjmp(). We declare the
881
structure here so that we can put a pointer in the match_data structure.
882
NOTE: This isn't used for a "normal" compilation of pcre. */
886
/* Structure for passing "static" information around between the functions
887
doing traditional NFA matching, so that they are thread-safe. */
889
typedef struct match_data {
890
unsigned long int match_call_count; /* As it says */
891
unsigned long int match_limit;/* As it says */
892
int *offset_vector; /* Offset vector */
893
int offset_end; /* One past the end */
894
int offset_max; /* The maximum usable for return data */
895
const uschar *lcc; /* Points to lower casing table */
896
const uschar *ctypes; /* Points to table of type maps */
897
BOOL offset_overflow; /* Set if too many extractions */
898
BOOL notbol; /* NOTBOL flag */
899
BOOL noteol; /* NOTEOL flag */
900
BOOL utf8; /* UTF8 flag */
901
BOOL endonly; /* Dollar not before final \n */
902
BOOL notempty; /* Empty string match not wanted */
903
BOOL partial; /* PARTIAL flag */
904
BOOL hitend; /* Hit the end of the subject at some point */
905
const uschar *start_code; /* For use when recursing */
906
const pcre_uchar *start_subject; /* Start of the subject string */
907
const pcre_uchar *end_subject; /* End of the subject string */
908
const pcre_uchar *start_match; /* Start of this match attempt */
909
const pcre_uchar *end_match_ptr; /* Subject position at end match */
910
int end_offset_top; /* Highwater mark at end of match */
911
int capture_last; /* Most recent capture number */
912
int start_offset; /* The start offset value */
913
recursion_info *recursive; /* Linked list of recursion data */
914
void *callout_data; /* To pass back to callouts */
915
struct heapframe *thisframe; /* Used only when compiling for no recursion */
918
/* A similar structure is used for the same purpose by the DFA matching
921
typedef struct dfa_match_data {
922
const uschar *start_code; /* Start of the compiled pattern */
923
const pcre_uchar *start_subject; /* Start of the subject string */
924
const pcre_uchar *end_subject; /* End of subject string */
925
const uschar *tables; /* Character tables */
926
int moptions; /* Match options */
927
int poptions; /* Pattern options */
928
void *callout_data; /* To pass back to callouts */
931
/* Bit definitions for entries in the pcre_ctypes table. */
933
#define ctype_space 0x01
934
#define ctype_letter 0x02
935
#define ctype_digit 0x04
936
#define ctype_xdigit 0x08
937
#define ctype_word 0x10 /* alphameric or '_' */
938
#define ctype_meta 0x80 /* regexp meta char or zero (end pattern) */
940
/* Offsets for the bitmap tables in pcre_cbits. Each table contains a set
941
of bits for a class map. Some classes are built by combining these tables. */
943
#define cbit_space 0 /* [:space:] or \s */
944
#define cbit_xdigit 32 /* [:xdigit:] */
945
#define cbit_digit 64 /* [:digit:] or \d */
946
#define cbit_upper 96 /* [:upper:] */
947
#define cbit_lower 128 /* [:lower:] */
948
#define cbit_word 160 /* [:word:] or \w */
949
#define cbit_graph 192 /* [:graph:] */
950
#define cbit_print 224 /* [:print:] */
951
#define cbit_punct 256 /* [:punct:] */
952
#define cbit_cntrl 288 /* [:cntrl:] */
953
#define cbit_length 320 /* Length of the cbits table */
955
/* Offsets of the various tables from the base tables pointer, and
959
#define fcc_offset 256
960
#define cbits_offset 512
961
#define ctypes_offset (cbits_offset + cbit_length)
962
#define tables_length (ctypes_offset + 256)
964
/* Layout of the UCP type table that translates property names into codes for
965
_pcre_ucp_findchar(). */
973
/* Internal shared data tables. These are tables that are used by more than one
974
of the exported public functions. They have to be "external" in the C sense,
975
but are not part of the PCRE public API. The data for these tables is in the
976
pcre_tables.c module. */
978
extern const int _pcre_utf8_table1[];
979
extern const int _pcre_utf8_table2[];
980
extern const int _pcre_utf8_table3[];
981
extern const uschar _pcre_utf8_table4[];
983
extern const int _pcre_utf8_table1_size;
985
extern const ucp_type_table _pcre_utt[];
986
extern const int _pcre_utt_size;
988
extern const uschar _pcre_default_tables[];
990
extern const uschar _pcre_OP_lengths[];
992
/* Internal shared functions. These are functions that are used by more than
993
one of the exported public functions. They have to be "external" in the C
994
sense, but are not part of the PCRE public API. */
996
extern int _pcre_ord2utf8(int, uschar *);
997
extern real_pcre * _pcre_try_flipped(const real_pcre *, real_pcre *,
998
const pcre_study_data *, pcre_study_data *);
999
extern int _pcre_ucp_findchar(const int, int *, int *);
1000
extern int _pcre_valid_utf8(const uschar *, int);
1001
extern BOOL _pcre_xclass(int, const uschar *);
1005
/* End of pcre_internal.h */