409
409
/* When UTF-8 encoding is being used, a character is no longer just a single
410
410
byte. The macros for character handling generate simple sequences when used in
411
byte-mode, and more complicated ones for UTF-8 characters. BACKCHAR should
412
never be called in byte mode. To make sure it can never even appear when UTF-8
413
support is omitted, we don't even define it. */
411
byte-mode, and more complicated ones for UTF-8 characters. GETCHARLENTEST is
412
not used when UTF-8 is not supported, so it is not defined, and BACKCHAR should
413
never be called in byte mode. To make sure they can never even appear when
414
UTF-8 support is omitted, we don't even define them. */
415
416
#ifndef SUPPORT_UTF8
416
417
/* Byte-mode GETCHAR (SUPPORT_UTF8 not defined): load the byte at eptr into c
without advancing eptr. Both arguments are parenthesized so that an expression
argument such as "p + 1" is dereferenced as a whole rather than being split by
operator precedence. The trailing semicolon is part of the macro, as before. */
#define GETCHAR(c, eptr) (c) = *(eptr);
418
419
/* Byte-mode GETCHARINC (SUPPORT_UTF8 not defined): load the byte at eptr into
c, then advance eptr by one. eptr must be a modifiable lvalue. The arguments
are parenthesized so that an lvalue expression such as "*pp" is the thing that
gets incremented, instead of precedence applying the ++ to "pp". The trailing
semicolon is part of the macro, as before. */
#define GETCHARINC(c, eptr) (c) = *(eptr)++;
419
420
/* Byte-mode GETCHARINCTEST (SUPPORT_UTF8 not defined): performs no UTF-8 test
in this branch; it loads the byte at eptr into c and advances eptr by one.
eptr must be a modifiable lvalue. The arguments are parenthesized so that an
lvalue expression such as "*pp" is incremented as a whole. The trailing
semicolon is part of the macro, as before. */
#define GETCHARINCTEST(c, eptr) (c) = *(eptr)++;
420
421
/* Byte-mode GETCHARLEN (SUPPORT_UTF8 not defined): load the byte at eptr into
c without advancing eptr. The len argument is accepted but left untouched in
this branch. The arguments are parenthesized so that an expression argument
such as "p + 2" is dereferenced as a whole. The trailing semicolon is part of
the macro, as before. */
#define GETCHARLEN(c, eptr, len) (c) = *(eptr);
422
/* #define GETCHARLENTEST(c, eptr, len) */
421
423
/* #define BACKCHAR(eptr) */
423
425
#else /* SUPPORT_UTF8 */
427
/* These macros were originally written in the form of loops that used data
428
from the tables whose names start with _pcre_utf8_table. They were rewritten by
429
a user so as not to use loops, because in some environments this gives a
430
significant performance advantage, and it seems never to do any harm. */
432
/* Base macro to pick up the remaining bytes of a UTF-8 character, not
433
advancing the pointer. */
435
#define GETUTF8(c, eptr) \
437
if ((c & 0x20) == 0) \
438
c = ((c & 0x1f) << 6) | (eptr[1] & 0x3f); \
439
else if ((c & 0x10) == 0) \
440
c = ((c & 0x0f) << 12) | ((eptr[1] & 0x3f) << 6) | (eptr[2] & 0x3f); \
441
else if ((c & 0x08) == 0) \
442
c = ((c & 0x07) << 18) | ((eptr[1] & 0x3f) << 12) | \
443
((eptr[2] & 0x3f) << 6) | (eptr[3] & 0x3f); \
444
else if ((c & 0x04) == 0) \
445
c = ((c & 0x03) << 24) | ((eptr[1] & 0x3f) << 18) | \
446
((eptr[2] & 0x3f) << 12) | ((eptr[3] & 0x3f) << 6) | \
449
c = ((c & 0x01) << 30) | ((eptr[1] & 0x3f) << 24) | \
450
((eptr[2] & 0x3f) << 18) | ((eptr[3] & 0x3f) << 12) | \
451
((eptr[4] & 0x3f) << 6) | (eptr[5] & 0x3f); \
425
454
/* Get the next UTF-8 character, not advancing the pointer. This is called when
426
455
we know we are in UTF-8 mode. */
428
457
#define GETCHAR(c, eptr) \
433
int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
435
c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
436
for (gcii = 1; gcii <= gcaa; gcii++) \
439
c |= (eptr[gcii] & 0x3f) << gcss; \
459
if (c >= 0xc0) GETUTF8(c, eptr);
443
461
/* Get the next UTF-8 character, testing for UTF-8 mode, and not advancing the
446
464
#define GETCHARTEST(c, eptr) \
448
if (utf8 && c >= 0xc0) \
466
if (utf8 && c >= 0xc0) GETUTF8(c, eptr);
468
/* Base macro to pick up the remaining bytes of a UTF-8 character, advancing
471
#define GETUTF8INC(c, eptr) \
451
int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
453
c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
454
for (gcii = 1; gcii <= gcaa; gcii++) \
457
c |= (eptr[gcii] & 0x3f) << gcss; \
473
if ((c & 0x20) == 0) \
474
c = ((c & 0x1f) << 6) | (*eptr++ & 0x3f); \
475
else if ((c & 0x10) == 0) \
477
c = ((c & 0x0f) << 12) | ((*eptr & 0x3f) << 6) | (eptr[1] & 0x3f); \
480
else if ((c & 0x08) == 0) \
482
c = ((c & 0x07) << 18) | ((*eptr & 0x3f) << 12) | \
483
((eptr[1] & 0x3f) << 6) | (eptr[2] & 0x3f); \
486
else if ((c & 0x04) == 0) \
488
c = ((c & 0x03) << 24) | ((*eptr & 0x3f) << 18) | \
489
((eptr[1] & 0x3f) << 12) | ((eptr[2] & 0x3f) << 6) | \
495
c = ((c & 0x01) << 30) | ((*eptr & 0x3f) << 24) | \
496
((eptr[1] & 0x3f) << 18) | ((eptr[2] & 0x3f) << 12) | \
497
((eptr[3] & 0x3f) << 6) | (eptr[4] & 0x3f); \
464
505
#define GETCHARINC(c, eptr) \
468
int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
470
c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
474
c |= (*eptr++ & 0x3f) << gcss; \
507
if (c >= 0xc0) GETUTF8INC(c, eptr);
478
/* Get the next character, testing for UTF-8 mode, and advancing the pointer */
509
/* Get the next character, testing for UTF-8 mode, and advancing the pointer.
510
This is called when we don't know if we are in UTF-8 mode. */
480
512
#define GETCHARINCTEST(c, eptr) \
482
if (utf8 && c >= 0xc0) \
514
if (utf8 && c >= 0xc0) GETUTF8INC(c, eptr);
516
/* Base macro to pick up the remaining bytes of a UTF-8 character, not
517
advancing the pointer, incrementing the length. */
519
#define GETUTF8LEN(c, eptr, len) \
484
int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
486
c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
490
c |= (*eptr++ & 0x3f) << gcss; \
521
if ((c & 0x20) == 0) \
523
c = ((c & 0x1f) << 6) | (eptr[1] & 0x3f); \
526
else if ((c & 0x10) == 0) \
528
c = ((c & 0x0f) << 12) | ((eptr[1] & 0x3f) << 6) | (eptr[2] & 0x3f); \
531
else if ((c & 0x08) == 0) \
533
c = ((c & 0x07) << 18) | ((eptr[1] & 0x3f) << 12) | \
534
((eptr[2] & 0x3f) << 6) | (eptr[3] & 0x3f); \
537
else if ((c & 0x04) == 0) \
539
c = ((c & 0x03) << 24) | ((eptr[1] & 0x3f) << 18) | \
540
((eptr[2] & 0x3f) << 12) | ((eptr[3] & 0x3f) << 6) | \
546
c = ((c & 0x01) << 30) | ((eptr[1] & 0x3f) << 24) | \
547
((eptr[2] & 0x3f) << 18) | ((eptr[3] & 0x3f) << 12) | \
548
((eptr[4] & 0x3f) << 6) | (eptr[5] & 0x3f); \
497
556
#define GETCHARLEN(c, eptr, len) \
502
int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
504
c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
505
for (gcii = 1; gcii <= gcaa; gcii++) \
508
c |= (eptr[gcii] & 0x3f) << gcss; \
558
if (c >= 0xc0) GETUTF8LEN(c, eptr, len);
513
560
/* Get the next UTF-8 character, testing for UTF-8 mode, not advancing the
514
561
pointer, incrementing length if there are extra bytes. This is called when we
515
know we are in UTF-8 mode. */
562
do not know if we are in UTF-8 mode. */
517
564
#define GETCHARLENTEST(c, eptr, len) \
519
if (utf8 && c >= 0xc0) \
522
int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
524
c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
525
for (gcii = 1; gcii <= gcaa; gcii++) \
528
c |= (eptr[gcii] & 0x3f) << gcss; \
566
if (utf8 && c >= 0xc0) GETUTF8LEN(c, eptr, len);
533
568
/* If the pointer is not at the start of a character, move it back until
534
569
it is. This is called only in UTF-8 mode - we don't put a test within the macro
897
933
/* STRING_ names spelled as ordinary string literals. Several names (from
STRING_CR_RIGHTPAR to STRING_UTF8_RIGHTPAR) appear twice below with identical
replacement text; STRING_UCP_RIGHTPAR and STRING_NO_START_OPT_RIGHTPAR appear
once. NOTE(review): the "#else" with the SUPPORT_UTF8 comment that follows
suggests this is the branch used when SUPPORT_UTF8 is not defined - confirm
against the opening #if, which is not visible here. */
#define STRING_DEFINE "DEFINE"
899
#define STRING_CR_RIGHTPAR "CR)"
900
#define STRING_LF_RIGHTPAR "LF)"
901
#define STRING_CRLF_RIGHTPAR "CRLF)"
902
#define STRING_ANY_RIGHTPAR "ANY)"
903
#define STRING_ANYCRLF_RIGHTPAR "ANYCRLF)"
904
#define STRING_BSR_ANYCRLF_RIGHTPAR "BSR_ANYCRLF)"
905
#define STRING_BSR_UNICODE_RIGHTPAR "BSR_UNICODE)"
906
#define STRING_UTF8_RIGHTPAR "UTF8)"
935
#define STRING_CR_RIGHTPAR "CR)"
936
#define STRING_LF_RIGHTPAR "LF)"
937
#define STRING_CRLF_RIGHTPAR "CRLF)"
938
#define STRING_ANY_RIGHTPAR "ANY)"
939
#define STRING_ANYCRLF_RIGHTPAR "ANYCRLF)"
940
#define STRING_BSR_ANYCRLF_RIGHTPAR "BSR_ANYCRLF)"
941
#define STRING_BSR_UNICODE_RIGHTPAR "BSR_UNICODE)"
942
#define STRING_UTF8_RIGHTPAR "UTF8)"
943
#define STRING_UCP_RIGHTPAR "UCP)"
944
#define STRING_NO_START_OPT_RIGHTPAR "NO_START_OPT)"
908
946
#else /* SUPPORT_UTF8 */
1149
1188
/* STRING_ names spelled as sequences of STR_ tokens, one token per character
of the corresponding name (for example STR_C STR_R STR_RIGHT_PARENTHESIS for
STRING_CR_RIGHTPAR). NOTE(review): presumably each STR_ macro expands to a
one-character string literal so that adjacent-literal concatenation yields the
full text - confirm against the STR_ definitions, which are not visible here.
Several names appear twice below with identical replacement text. */
#define STRING_DEFINE STR_D STR_E STR_F STR_I STR_N STR_E
1151
#define STRING_CR_RIGHTPAR STR_C STR_R STR_RIGHT_PARENTHESIS
1152
#define STRING_LF_RIGHTPAR STR_L STR_F STR_RIGHT_PARENTHESIS
1153
#define STRING_CRLF_RIGHTPAR STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS
1154
#define STRING_ANY_RIGHTPAR STR_A STR_N STR_Y STR_RIGHT_PARENTHESIS
1155
#define STRING_ANYCRLF_RIGHTPAR STR_A STR_N STR_Y STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS
1156
#define STRING_BSR_ANYCRLF_RIGHTPAR STR_B STR_S STR_R STR_UNDERSCORE STR_A STR_N STR_Y STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS
1157
#define STRING_BSR_UNICODE_RIGHTPAR STR_B STR_S STR_R STR_UNDERSCORE STR_U STR_N STR_I STR_C STR_O STR_D STR_E STR_RIGHT_PARENTHESIS
1158
#define STRING_UTF8_RIGHTPAR STR_U STR_T STR_F STR_8 STR_RIGHT_PARENTHESIS
1190
#define STRING_CR_RIGHTPAR STR_C STR_R STR_RIGHT_PARENTHESIS
1191
#define STRING_LF_RIGHTPAR STR_L STR_F STR_RIGHT_PARENTHESIS
1192
#define STRING_CRLF_RIGHTPAR STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS
1193
#define STRING_ANY_RIGHTPAR STR_A STR_N STR_Y STR_RIGHT_PARENTHESIS
1194
#define STRING_ANYCRLF_RIGHTPAR STR_A STR_N STR_Y STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS
1195
#define STRING_BSR_ANYCRLF_RIGHTPAR STR_B STR_S STR_R STR_UNDERSCORE STR_A STR_N STR_Y STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS
1196
#define STRING_BSR_UNICODE_RIGHTPAR STR_B STR_S STR_R STR_UNDERSCORE STR_U STR_N STR_I STR_C STR_O STR_D STR_E STR_RIGHT_PARENTHESIS
1197
#define STRING_UTF8_RIGHTPAR STR_U STR_T STR_F STR_8 STR_RIGHT_PARENTHESIS
1198
#define STRING_UCP_RIGHTPAR STR_U STR_C STR_P STR_RIGHT_PARENTHESIS
1199
#define STRING_NO_START_OPT_RIGHTPAR STR_N STR_O STR_UNDERSCORE STR_S STR_T STR_A STR_R STR_T STR_UNDERSCORE STR_O STR_P STR_T STR_RIGHT_PARENTHESIS
1160
1201
#endif /* SUPPORT_UTF8 */
1189
1230
/* Property-type codes PT_ANY through PT_WORD, numbered consecutively from 0
to 8. Each line's own comment says what that code matches. PT_GC and PT_PC are
each defined twice below with the same value; only the comment wording
differs between the two definitions. */
#define PT_ANY 0 /* Any property - matches all chars */
1190
1231
#define PT_LAMP 1 /* L& - the union of Lu, Ll, Lt */
1191
#define PT_GC 2 /* General characteristic (e.g. L) */
1192
#define PT_PC 3 /* Particular characteristic (e.g. Lu) */
1232
#define PT_GC 2 /* Specified general characteristic (e.g. L) */
1233
#define PT_PC 3 /* Specified particular characteristic (e.g. Lu) */
1193
1234
#define PT_SC 4 /* Script (e.g. Han) */
1235
#define PT_ALNUM 5 /* Alphanumeric - the union of L and N */
1236
#define PT_SPACE 6 /* Perl space - Z plus 9,10,12,13 */
1237
#define PT_PXSPACE 7 /* POSIX space - Z plus 9,10,11,12,13 */
1238
#define PT_WORD 8 /* Word - L plus N plus underscore */
1195
1240
/* Flag bits and data types for the extended class (OP_XCLASS) for classes that
1196
1241
contain UTF-8 characters with values greater than 255. */
1207
1252
/* These are escaped items that aren't just an encoding of a particular data
1208
1253
value such as \n. They must have non-zero values, as check_escape() returns
1209
1254
their negation. Also, they must appear in the same order as in the opcode
1210
definitions below, up to ESC_z. There's a dummy for OP_ANY because it
1211
corresponds to "." rather than an escape sequence, and another for OP_ALLANY
1212
(which is used for [^] in JavaScript compatibility mode).
1255
definitions below, up to ESC_z. There's a dummy for OP_ALLANY because it
1256
corresponds to "." in DOTALL mode rather than an escape sequence. It is also
1257
used for [^] in JavaScript compatibility mode. In non-DOTALL mode, "." behaves
1260
The special values ESC_DU, ESC_du, etc. are used instead of ESC_D, ESC_d, etc.
1261
when PCRE_UCP is set, when replacement of \d etc by \p sequences is required.
1262
They must be contiguous, and remain in order so that the replacements can be
1263
looked up from a table.
1214
1265
The final escape must be ESC_REF as subsequent values are used for
1215
1266
backreferences (\1, \2, \3, etc). There are two tests in the code for an escape
1221
1272
enum { ESC_A = 1, ESC_G, ESC_K, ESC_B, ESC_b, ESC_D, ESC_d, ESC_S, ESC_s,
1222
ESC_W, ESC_w, ESC_dum1, ESC_dum2, ESC_C, ESC_P, ESC_p, ESC_R, ESC_H,
1223
ESC_h, ESC_V, ESC_v, ESC_X, ESC_Z, ESC_z, ESC_E, ESC_Q, ESC_g, ESC_k,
1273
ESC_W, ESC_w, ESC_N, ESC_dum, ESC_C, ESC_P, ESC_p, ESC_R, ESC_H,
1274
ESC_h, ESC_V, ESC_v, ESC_X, ESC_Z, ESC_z,
1275
ESC_E, ESC_Q, ESC_g, ESC_k,
1276
ESC_DU, ESC_du, ESC_SU, ESC_su, ESC_WU, ESC_wu,
1227
1279
/* Opcode table: Starting from 1 (i.e. after OP_END), the values up to
1228
1280
OP_EOD must correspond in order to the list of escapes immediately above.
1247
1299
OP_WHITESPACE, /* 9 \s */
1248
1300
OP_NOT_WORDCHAR, /* 10 \W */
1249
1301
OP_WORDCHAR, /* 11 \w */
1250
OP_ANY, /* 12 Match any character (subject to DOTALL) */
1251
OP_ALLANY, /* 13 Match any character (not subject to DOTALL) */
1302
OP_ANY, /* 12 Match any character except newline */
1303
OP_ALLANY, /* 13 Match any character */
1252
1304
OP_ANYBYTE, /* 14 Match any byte (\C); different to OP_ANY for UTF-8 */
1253
1305
OP_NOTPROP, /* 15 \P (not Unicode property) */
1254
1306
OP_PROP, /* 16 \p (Unicode property) */
1379
1431
/* These are backtracking control verbs */
1384
OP_COMMIT, /* 110 */
1433
OP_MARK, /* 107 always has an argument */
1435
OP_PRUNE_ARG, /* 109 same, but with argument */
1437
OP_SKIP_ARG, /* 111 same, but with argument */
1439
OP_THEN_ARG, /* 113 same, but with argument */
1440
OP_COMMIT, /* 114 */
1386
1442
/* These are forced failure and success verbs */
1389
OP_ACCEPT, /* 112 */
1390
OP_CLOSE, /* 113 Used before OP_ACCEPT to close open captures */
1445
OP_ACCEPT, /* 116 */
1446
OP_CLOSE, /* 117 Used before OP_ACCEPT to close open captures */
1392
1448
/* This is used to skip a subpattern with a {0} quantifier */
1394
OP_SKIPZERO, /* 114 */
1450
OP_SKIPZERO, /* 118 */
1396
1452
/* This is not an opcode, but is used to check that tables indexed by opcode
1397
1453
are the correct length, in order to catch updating errors - there have been
1427
1483
"Once", "Bra", "CBra", "Cond", "SBra", "SCBra", "SCond", \
1428
1484
"Cond ref", "Cond nref", "Cond rec", "Cond nrec", "Cond def", \
1429
1485
"Brazero", "Braminzero", \
1430
"*PRUNE", "*SKIP", "*THEN", "*COMMIT", "*FAIL", "*ACCEPT", \
1486
"*MARK", "*PRUNE", "*PRUNE", "*SKIP", "*SKIP", \
1487
"*THEN", "*THEN", "*COMMIT", "*FAIL", "*ACCEPT", \
1431
1488
"Close", "Skip zero"
1493
1550
3, 3, /* RREF, NRREF */ \
1495
1552
1, 1, /* BRAZERO, BRAMINZERO */ \
1496
1, 1, 1, 1, /* PRUNE, SKIP, THEN, COMMIT, */ \
1497
1, 1, 3, 1 /* FAIL, ACCEPT, CLOSE, SKIPZERO */
1553
3, 1, 3, /* MARK, PRUNE, PRUNE_ARG */ \
1554
1, 3, /* SKIP, SKIP_ARG */ \
1555
1+LINK_SIZE, 3+LINK_SIZE, /* THEN, THEN_ARG */ \
1556
1, 1, 1, 3, 1 /* COMMIT, FAIL, ACCEPT, CLOSE, SKIPZERO */
1500
1559
/* A magic value for OP_RREF and OP_NRREF to indicate the "any recursion"
1512
1571
ERR30, ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39,
1513
1572
ERR40, ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49,
1514
1573
ERR50, ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59,
1515
ERR60, ERR61, ERR62, ERR63, ERR64, ERR65, ERRCOUNT };
1574
ERR60, ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68,
1517
1577
/* The real format of the start of the pcre block; the index of names and the
1518
1578
code vector run on as long as necessary after the end. We store an explicit