169
169
/* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
170
170
/* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
171
171
/* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
172
/* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P,
172
/* D0 */ '}', 0, -ESC_K, 0, 0,-ESC_N, 0, -ESC_P,
173
173
/* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
174
174
/* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
175
175
/* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
202
205
static const verbitem verbs[] = {
208
{ 6, OP_ACCEPT, -1 },
209
{ 6, OP_COMMIT, -1 },
212
{ 5, OP_PRUNE, OP_PRUNE_ARG },
213
{ 4, OP_SKIP, OP_SKIP_ARG },
214
{ 4, OP_THEN, OP_THEN_ARG }
212
217
/* Number of entries in the verbs[] table, derived from the array itself. */
static const int verbcount = (int)(sizeof(verbs) / sizeof(verbs[0]));
254
259
cbit_xdigit,-1, 0 /* xdigit */
262
/* Table of substitutes for \d etc when PCRE_UCP is set. The POSIX class
263
substitutes must be in the order of the names, defined above, and there are
264
both positive and negative cases. NULL means no substitute. */
267
static const uschar *substitutes[] = {
268
(uschar *)"\\P{Nd}", /* \D */
269
(uschar *)"\\p{Nd}", /* \d */
270
(uschar *)"\\P{Xsp}", /* \S */ /* NOTE: Xsp is Perl space */
271
(uschar *)"\\p{Xsp}", /* \s */
272
(uschar *)"\\P{Xwd}", /* \W */
273
(uschar *)"\\p{Xwd}" /* \w */
276
static const uschar *posix_substitutes[] = {
277
(uschar *)"\\p{L}", /* alpha */
278
(uschar *)"\\p{Ll}", /* lower */
279
(uschar *)"\\p{Lu}", /* upper */
280
(uschar *)"\\p{Xan}", /* alnum */
282
(uschar *)"\\h", /* blank */
284
(uschar *)"\\p{Nd}", /* digit */
288
(uschar *)"\\p{Xps}", /* space */ /* NOTE: Xps is POSIX space */
289
(uschar *)"\\p{Xwd}", /* word */
292
(uschar *)"\\P{L}", /* ^alpha */
293
(uschar *)"\\P{Ll}", /* ^lower */
294
(uschar *)"\\P{Lu}", /* ^upper */
295
(uschar *)"\\P{Xan}", /* ^alnum */
297
(uschar *)"\\H", /* ^blank */
299
(uschar *)"\\P{Nd}", /* ^digit */
303
(uschar *)"\\P{Xps}", /* ^space */ /* NOTE: Xps is POSIX space */
304
(uschar *)"\\P{Xwd}", /* ^word */
307
/* Total number of entries in posix_substitutes[], positive and negated
cases together. */
#define POSIX_SUBSIZE (sizeof(posix_substitutes) / sizeof(posix_substitutes[0]))
258
310
/* Turn the macro argument into a string literal exactly as written; the
argument is not macro-expanded before being stringified. */
#define STRING(a) # a
259
311
/* Stringify s after it has been macro-expanded: the extra level of
indirection through STRING() lets the argument be expanded first. */
#define XSTRING(s) STRING(s)
318
370
"invalid condition (?(0)\0"
319
371
"\\C not allowed in lookbehind assertion\0"
320
"PCRE does not support \\L, \\l, \\N, \\U, or \\u\0"
372
"PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0"
321
373
"number after (?C is > 255\0"
322
374
"closing ) for (?C expected\0"
343
395
"inconsistent NEWLINE options\0"
344
396
"\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
345
397
"a numbered reference must not be zero\0"
346
"(*VERB) with an argument is not supported\0"
398
"an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0"
348
400
"(*VERB) not recognized\0"
349
401
"number is too big\0"
351
403
"digit expected after (?+\0"
352
404
"] is an invalid data character in JavaScript compatibility mode\0"
354
"different names for subpatterns of the same number are not allowed\0";
406
"different names for subpatterns of the same number are not allowed\0"
407
"(*MARK) must have an argument\0"
408
"this version of PCRE is not compiled with PCRE_UCP support\0"
356
411
/* Table to identify digits and hex digits. This is used when compiling
357
412
patterns. Note that the tables in chartables are dependent on the locale, and
879
/* Perl supports \N{name} for character names, as well as plain \N for "not
880
newline". PCRE does not support \N{name}. */
882
if (c == -ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET)
883
*errorcodeptr = ERR37;
885
/* If PCRE_UCP is set, we change the values for \d etc. */
887
if ((options & PCRE_UCP) != 0 && c <= -ESC_D && c >= -ESC_w)
888
c -= (ESC_DU - ESC_D);
890
/* Set the pointer to the final character before returning. */
1061
1128
if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1063
if (ptr[1] == CHAR_QUESTION_MARK &&
1064
ptr[2] == CHAR_VERTICAL_LINE)
1130
/* Handle specials such as (*SKIP) or (*UTF8) etc. */
1132
if (ptr[1] == CHAR_ASTERISK) ptr += 2;
1134
/* Handle a normal, unnamed capturing parenthesis. */
1136
else if (ptr[1] != CHAR_QUESTION_MARK)
1139
if (name == NULL && *count == lorn) return *count;
1143
/* All cases now have (? at the start. Remember when we are in a group
1144
where the parenthesis numbers are duplicated. */
1146
else if (ptr[2] == CHAR_VERTICAL_LINE)
1067
1149
dup_parens = TRUE;
1070
/* Handle a normal, unnamed capturing parenthesis */
1152
/* Handle comments; all characters are allowed until a ket is reached. */
1072
else if (ptr[1] != CHAR_QUESTION_MARK && ptr[1] != CHAR_ASTERISK)
1154
else if (ptr[2] == CHAR_NUMBER_SIGN)
1075
if (name == NULL && *count == lorn) return *count;
1156
for (ptr += 3; *ptr != 0; ptr++) if (*ptr == CHAR_RIGHT_PARENTHESIS) break;
1079
1160
/* Handle a condition. If it is an assertion, just carry on so that it
1080
1161
is processed as normal. If not, skip to the closing parenthesis of the
1081
condition (there can't be any nested parens. */
1162
condition (there can't be any nested parens). */
1083
1164
else if (ptr[2] == CHAR_LEFT_PARENTHESIS)
1614
1694
/* Otherwise, we can get the item's length from the table, except that for
1615
1695
repeated character types, we have to test for \p and \P, which have an extra
1616
two bytes of parameters. */
1696
two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
1697
must add in its length. */
1709
1797
/* Otherwise, we can get the item's length from the table, except that for
1710
1798
repeated character types, we have to test for \p and \P, which have an extra
1711
two bytes of parameters. */
1799
two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
1800
must add in its length. */
2222
2328
*code++ = OP_CALLOUT;
2224
PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
2225
PUT(code, LINK_SIZE, 0); /* Default length */
2330
PUT(code, 0, (int)(ptr - cd->start_pattern)); /* Pattern offset */
2331
PUT(code, LINK_SIZE, 0); /* Default length */
2226
2332
return code + 2*LINK_SIZE;
2409
/*************************************************
2410
* Check a character and a property *
2411
*************************************************/
2413
/* This function is called by check_auto_possessive() when a property item
2414
is adjacent to a fixed character.
2418
ptype the property type
2419
pdata the data for the type
2420
negated TRUE if it's a negated property (\P or \p{^)
2422
Returns: TRUE if auto-possessifying is OK
2426
check_char_prop(int c, int ptype, int pdata, BOOL negated)
2428
const ucd_record *prop = GET_UCD(c);
2432
return (prop->chartype == ucp_Lu ||
2433
prop->chartype == ucp_Ll ||
2434
prop->chartype == ucp_Lt) == negated;
2437
return (pdata == _pcre_ucp_gentype[prop->chartype]) == negated;
2440
return (pdata == prop->chartype) == negated;
2443
return (pdata == prop->script) == negated;
2445
/* These are specials */
2448
return (_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2449
_pcre_ucp_gentype[prop->chartype] == ucp_N) == negated;
2451
case PT_SPACE: /* Perl space */
2452
return (_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2453
c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2456
case PT_PXSPACE: /* POSIX space */
2457
return (_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2458
c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2459
c == CHAR_FF || c == CHAR_CR)
2463
return (_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2464
_pcre_ucp_gentype[prop->chartype] == ucp_N ||
2465
c == CHAR_UNDERSCORE) == negated;
2300
2469
#endif /* SUPPORT_UCP */
2310
2479
sense to automatically possessify the repeated item.
2313
op_code the repeated op code
2314
this data for this item, depends on the opcode
2482
previous pointer to the repeated opcode
2315
2483
utf8 TRUE in UTF-8 mode
2316
utf8_char used for utf8 character bytes, NULL if not relevant
2317
2484
ptr next character in pattern
2318
2485
options options bits
2319
2486
cd contains pointers to tables etc.
2325
check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
2326
const uschar *ptr, int options, compile_data *cd)
2492
check_auto_possessive(const uschar *previous, BOOL utf8, const uschar *ptr,
2493
int options, compile_data *cd)
2496
int op_code = *previous++;
2330
2498
/* Skip whitespace and comments in extended mode */
2386
2554
strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
2389
/* Now compare the next item with the previous opcode. If the previous is a
2390
positive single character match, "item" either contains the character or, if
2391
"item" is greater than 127 in utf8 mode, the character's bytes are in
2395
/* Handle cases when the next item is a character. */
2557
/* Now compare the next item with the previous opcode. First, handle cases when
2558
the next item is a character. */
2397
2560
if (next >= 0) switch(op_code)
2400
2563
#ifdef SUPPORT_UTF8
2401
if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2564
GETCHARTEST(c, previous);
2403
(void)(utf8_char); /* Keep compiler happy by referencing function argument */
2405
return item != next;
2407
2570
/* For CHARNC (caseless character) we must check the other case. If we have
2408
2571
Unicode property support, we can use it to test the other case of
2424
2589
othercase = NOTACHAR;
2426
return (unsigned int)item != othercase;
2591
return (unsigned int)c != othercase;
2429
2594
#endif /* SUPPORT_UTF8 */
2430
return (item != cd->fcc[next]); /* Non-UTF-8 mode */
2595
return (c != cd->fcc[next]); /* Non-UTF-8 mode */
2432
/* For OP_NOT, "item" must be a single-byte character. */
2597
/* For OP_NOT, its data is always a single-byte character. */
2435
if (item == next) return TRUE;
2600
if ((c = *previous) == next) return TRUE;
2436
2601
if ((options & PCRE_CASELESS) == 0) return FALSE;
2437
2602
#ifdef SUPPORT_UTF8
2445
2610
othercase = NOTACHAR;
2447
return (unsigned int)item == othercase;
2612
return (unsigned int)c == othercase;
2450
2615
#endif /* SUPPORT_UTF8 */
2451
return (item == cd->fcc[next]); /* Non-UTF-8 mode */
2616
return (c == cd->fcc[next]); /* Non-UTF-8 mode */
2618
/* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.
2619
When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
2454
2622
return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2510
return op_code != OP_VSPACE;
2679
return op_code == OP_NOT_VSPACE;
2512
return op_code == OP_VSPACE;
2681
return op_code != OP_NOT_VSPACE;
2686
return check_char_prop(next, previous[0], previous[1], FALSE);
2689
return check_char_prop(next, previous[0], previous[1], TRUE);
2520
/* Handle the case when the next item is \d, \s, etc. */
2697
/* Handle the case when the next item is \d, \s, etc. Note that when PCRE_UCP
2698
is set, \d turns into ESC_du rather than ESC_d, etc., so ESC_d etc. are
2699
generated only when PCRE_UCP is *not* set, that is, when only ASCII
2700
characteristics are recognized. Similarly, the opcodes OP_DIGIT etc. are
2701
replaced by OP_PROP codes when PCRE_UCP is set. */
2522
2703
switch(op_code)
2525
2706
case OP_CHARNC:
2526
2707
#ifdef SUPPORT_UTF8
2527
if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2708
GETCHARTEST(c, previous);
2532
return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2715
return c > 127 || (cd->ctypes[c] & ctype_digit) == 0;
2535
return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2718
return c <= 127 && (cd->ctypes[c] & ctype_digit) != 0;
2538
return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2721
return c > 127 || (cd->ctypes[c] & ctype_space) == 0;
2541
return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2724
return c <= 127 && (cd->ctypes[c] & ctype_space) != 0;
2544
return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2727
return c > 127 || (cd->ctypes[c] & ctype_word) == 0;
2547
return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2730
return c <= 127 && (cd->ctypes[c] & ctype_word) != 0;
2590
2773
return -next == ESC_v;
2776
/* When PCRE_UCP is set, these values get generated for \d etc. Find
2777
their substitutions and process them. The result will always be either
2778
-ESC_p or -ESC_P. Then fall through to process those values. */
2788
int temperrorcode = 0;
2789
ptr = substitutes[-next - ESC_DU];
2790
next = check_escape(&ptr, &temperrorcode, 0, options, FALSE);
2791
if (temperrorcode != 0) return FALSE;
2792
ptr++; /* For compatibility */
2799
int ptype, pdata, errorcodeptr;
2802
ptr--; /* Make ptr point at the p or P */
2803
ptype = get_ucp(&ptr, &negated, &pdata, &errorcodeptr);
2804
if (ptype < 0) return FALSE;
2805
ptr++; /* Point past the final curly ket */
2807
/* If the property item is optional, we have to give up. (When generated
2808
from \d etc by PCRE_UCP, this test will have been applied much earlier,
2809
to the original \d etc. At this point, ptr will point to a zero byte.) */
2811
if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
2812
strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
2815
/* Do the property check. */
2817
return check_char_prop(c, ptype, pdata, (next == -ESC_P) != negated);
2825
/* In principle, support for Unicode properties should be integrated here as
2826
well. It means re-organizing the above code so as to get hold of the property
2827
values before switching on the op-code. However, I wonder how many patterns
2828
combine ASCII \d etc with Unicode properties? (Note that if PCRE_UCP is set,
2829
these op-codes are never generated.) */
2598
2832
return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2599
next == -ESC_h || next == -ESC_v;
2833
next == -ESC_h || next == -ESC_v || next == -ESC_R;
2601
2835
case OP_NOT_DIGIT:
2602
2836
return next == -ESC_d;
2604
2838
case OP_WHITESPACE:
2605
return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2839
return next == -ESC_S || next == -ESC_d || next == -ESC_w || next == -ESC_R;
2607
2841
case OP_NOT_WHITESPACE:
2608
2842
return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2610
2844
case OP_HSPACE:
2611
return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2845
return next == -ESC_S || next == -ESC_H || next == -ESC_d ||
2846
next == -ESC_w || next == -ESC_v || next == -ESC_R;
2613
2848
case OP_NOT_HSPACE:
2614
2849
return next == -ESC_h;
2616
2851
/* Can't have \S in here because VT matches \S (Perl anomaly) */
2617
2853
case OP_VSPACE:
2618
2854
return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2620
2856
case OP_NOT_VSPACE:
2621
return next == -ESC_v;
2857
return next == -ESC_v || next == -ESC_R;
2623
2859
case OP_WORDCHAR:
2624
return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2860
return next == -ESC_W || next == -ESC_s || next == -ESC_h ||
2861
next == -ESC_v || next == -ESC_R;
2626
2863
case OP_NOT_WORDCHAR:
2627
2864
return next == -ESC_w || next == -ESC_d;
3122
3370
if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
3123
3371
posix_class = 0;
3125
/* We build the bit map for the POSIX class in a chunk of local store
3126
because we may be adding and subtracting from it, and we don't want to
3127
subtract bits that may be in the main map already. At the end we or the
3128
result into the bit map that is being built. */
3373
/* When PCRE_UCP is set, some of the POSIX classes are converted to
3374
different escape sequences that use Unicode properties. */
3377
if ((options & PCRE_UCP) != 0)
3379
int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);
3380
if (posix_substitutes[pc] != NULL)
3382
nestptr = tempptr + 1;
3383
ptr = posix_substitutes[pc] - 1;
3388
/* In the non-UCP case, we build the bit map for the POSIX class in a
3389
chunk of local store because we may be adding and subtracting from it,
3390
and we don't want to subtract bits that may be in the main map already.
3391
At the end we or the result into the bit map that is being built. */
3130
3393
posix_class *= 3;
3170
3433
/* Backslash may introduce a single character, or it may introduce one
3171
3434
of the specials, which just set a flag. The sequence \b is a special
3172
case. Inside a class (and only there) it is treated as backspace.
3173
Elsewhere it marks a word boundary. Other escapes have preset maps ready
3174
to 'or' into the one we are building. We assume they have more than one
3175
character in them, so set class_charcount bigger than one. */
3435
case. Inside a class (and only there) it is treated as backspace. We
3436
assume that other escapes have more than one character in them, so set
3437
class_charcount bigger than one. Unrecognized escapes fall through and
3438
are either treated as literal characters (by default), or are faulted if
3439
PCRE_EXTRA is set. */
3177
3441
if (c == CHAR_BACKSLASH)
3179
3443
c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3180
3444
if (*errorcodeptr != 0) goto FAILED;
3182
if (-c == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
3183
else if (-c == ESC_X) c = CHAR_X; /* \X is literal X in a class */
3184
else if (-c == ESC_R) c = CHAR_R; /* \R is literal R in a class */
3446
if (-c == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
3185
3447
else if (-c == ESC_Q) /* Handle start of quoted string */
3187
3449
if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3198
3460
register const uschar *cbits = cd->cbits;
3199
3461
class_charcount += 2; /* Greater than 1 is what matters */
3201
/* Save time by not doing this in the pre-compile phase. */
3203
if (lengthptr == NULL) switch (-c)
3466
case ESC_du: /* These are the values given for \d etc */
3467
case ESC_DU: /* when PCRE_UCP is set. We replace the */
3468
case ESC_wu: /* escape sequence with an appropriate \p */
3469
case ESC_WU: /* or \P to test Unicode properties instead */
3470
case ESC_su: /* of the default ASCII testing. */
3473
ptr = substitutes[-c - ESC_DU] - 1; /* Just before substitute */
3474
class_charcount -= 2; /* Undo! */
3206
3478
for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
3231
3503
classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
3234
default: /* Not recognized; fall through */
3235
break; /* Need "default" setting to stop compiler warning. */
3238
/* In the pre-compile phase, just do the recognition. */
3240
else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
3241
c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
3243
/* We need to deal with \H, \h, \V, and \v in both phases because
3244
they use extra memory. */
3248
3507
SETBIT(classbits, 0x09); /* HT */
3249
3508
SETBIT(classbits, 0x20); /* SPACE */
3250
3509
SETBIT(classbits, 0xa0); /* NBSP */
3370
/* We need to deal with \P and \p in both phases. */
3372
3622
#ifdef SUPPORT_UCP
3373
if (-c == ESC_p || -c == ESC_P)
3377
int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
3378
if (ptype < 0) goto FAILED;
3380
*class_utf8data++ = ((-c == ESC_p) != negated)?
3381
XCL_PROP : XCL_NOTPROP;
3382
*class_utf8data++ = ptype;
3383
*class_utf8data++ = pdata;
3384
class_charcount -= 2; /* Not a < 256 character */
3628
int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
3629
if (ptype < 0) goto FAILED;
3631
*class_utf8data++ = ((-c == ESC_p) != negated)?
3632
XCL_PROP : XCL_NOTPROP;
3633
*class_utf8data++ = ptype;
3634
*class_utf8data++ = pdata;
3635
class_charcount -= 2; /* Not a < 256 character */
3388
/* Unrecognized escapes are faulted if PCRE is running in its
3389
strict mode. By default, for compatibility with Perl, they are
3390
treated as literals. */
3639
/* Unrecognized escapes are faulted if PCRE is running in its
3640
strict mode. By default, for compatibility with Perl, they are
3641
treated as literals. */
3392
if ((options & PCRE_EXTRA) != 0)
3394
*errorcodeptr = ERR7;
3644
if ((options & PCRE_EXTRA) != 0)
3646
*errorcodeptr = ERR7;
3649
class_charcount -= 2; /* Undo the default count from above */
3650
c = *ptr; /* Get the final character and fall through */
3398
class_charcount -= 2; /* Undo the default count from above */
3399
c = *ptr; /* Get the final character and fall through */
3402
3655
/* Fall through if we have a single character (c >= 0). This may be
3466
3719
d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3467
3720
if (*errorcodeptr != 0) goto FAILED;
3469
/* \b is backspace; \X is literal X; \R is literal R; any other
3470
special means the '-' was literal */
3722
/* \b is backspace; any other special means the '-' was literal */
3474
if (d == -ESC_b) d = CHAR_BS;
3475
else if (d == -ESC_X) d = CHAR_X;
3476
else if (d == -ESC_R) d = CHAR_R; else
3726
if (d == -ESC_b) d = CHAR_BS; else
3479
3729
goto LONE_SINGLE_CHARACTER; /* A few lines below */
3642
/* Loop until ']' reached. This "while" is the end of the "do" above. */
3644
while ((c = *(++ptr)) != 0 && (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
3646
if (c == 0) /* Missing terminating ']' */
3892
/* Loop until ']' reached. This "while" is the end of the "do" far above.
3893
If we are at the end of an internal nested string, revert to the outer
3896
while (((c = *(++ptr)) != 0 ||
3898
(ptr = nestptr, nestptr = NULL, c = *(++ptr)) != 0)) &&
3899
(c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
3901
/* Check for missing terminating ']' */
3648
3905
*errorcodeptr = ERR6;
3653
/* This code has been disabled because it would mean that \s counts as
3654
an explicit \r or \n reference, and that's not really what is wanted. Now
3655
we set the flag only if there is a literal "\r" or "\n" in the class. */
3658
/* Remember whether \r or \n are in this class */
3662
if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF;
3666
if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF;
3671
3909
/* If class_charcount is 1, we saw precisely one character whose value is
3672
3910
less than 256. As long as there were no characters >= 128 and there was no
3673
3911
use of \p or \P, in other words, no use of any XCLASS features, we can
3732
3970
/* If there are characters with values > 255, we have to compile an
3733
3971
extended class, with its own opcode, unless there was a negated special
3734
such as \S in the class, because in that case all characters > 255 are in
3735
the class, so any that were explicitly given as well can be ignored. If
3736
(when there are explicit characters > 255 that must be listed) there are no
3737
characters < 256, we can omit the bitmap in the actual compiled code. */
3972
such as \S in the class, and PCRE_UCP is not set, because in that case all
3973
characters > 255 are in the class, so any that were explicitly given as
3974
well can be ignored. If (when there are explicit characters > 255 that must
3975
be listed) there are no characters < 256, we can omit the bitmap in the
3976
actual compiled code. */
3739
3978
#ifdef SUPPORT_UTF8
3740
if (class_utf8 && !should_flip_negation)
3979
if (class_utf8 && (!should_flip_negation || (options & PCRE_UCP) != 0))
3742
3981
*class_utf8data++ = XCL_END; /* Marks the end of extra data */
3743
3982
*code++ = OP_XCLASS;
3766
/* If there are no characters > 255, set the opcode to OP_CLASS or
3767
OP_NCLASS, depending on whether the whole class was negated and whether
3768
there were negative specials such as \S in the class. Then copy the 32-byte
3769
map into the code vector, negating it if necessary. */
4005
/* If there are no characters > 255, or they are all to be included or
4006
excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
4007
whole class was negated and whether there were negative specials such as \S
4008
(non-UCP) in the class. Then copy the 32-byte map into the code vector,
4009
negating it if necessary. */
3771
4011
*code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
3772
4012
if (negate_class)
3891
4131
if (!possessive_quantifier &&
3892
4132
repeat_max < 0 &&
3893
check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
4133
check_auto_possessive(previous, utf8, ptr + 1, options, cd))
3896
4135
repeat_type = 0; /* Force greedy */
3897
4136
possessive_quantifier = TRUE;
3912
4151
c = previous[1];
3913
4152
if (!possessive_quantifier &&
3914
4153
repeat_max < 0 &&
3915
check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
4154
check_auto_possessive(previous, utf8, ptr + 1, options, cd))
3917
4156
repeat_type = 0; /* Force greedy */
3918
4157
possessive_quantifier = TRUE;
3937
4176
if (!possessive_quantifier &&
3938
4177
repeat_max < 0 &&
3939
check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
4178
check_auto_possessive(previous, utf8, ptr + 1, options, cd))
3941
4180
repeat_type = 0; /* Force greedy */
3942
4181
possessive_quantifier = TRUE;
4513
4752
/* First deal with various "verbs" that can be introduced by '*'. */
4515
if (*(++ptr) == CHAR_ASTERISK && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
4754
if (*(++ptr) == CHAR_ASTERISK &&
4755
((cd->ctypes[ptr[1]] & ctype_letter) != 0 || ptr[1] == ':'))
4517
4757
int i, namelen;
4518
4759
const char *vn = verbnames;
4519
const uschar *name = ++ptr;
4760
const uschar *name = ptr + 1;
4761
const uschar *arg = NULL;
4520
4762
previous = NULL;
4521
4763
while ((cd->ctypes[*++ptr] & ctype_letter) != 0) {};
4764
namelen = (int)(ptr - name);
4522
4766
if (*ptr == CHAR_COLON)
4524
*errorcodeptr = ERR59; /* Not supported */
4769
while ((cd->ctypes[*ptr] & (ctype_letter|ctype_digit)) != 0
4770
|| *ptr == '_') ptr++;
4771
arglen = (int)(ptr - arg);
4527
4774
if (*ptr != CHAR_RIGHT_PARENTHESIS)
4529
4776
*errorcodeptr = ERR60;
4532
namelen = ptr - name;
4780
/* Scan the table of verb names */
4533
4782
for (i = 0; i < verbcount; i++)
4535
4784
if (namelen == verbs[i].len &&
4547
4796
PUT2INC(code, 0, oc->number);
4550
*code++ = verbs[i].op;
4800
/* Handle the cases with/without an argument */
4804
if (verbs[i].op < 0) /* Argument is mandatory */
4806
*errorcodeptr = ERR66;
4809
*code++ = verbs[i].op;
4814
if (verbs[i].op_arg < 0) /* Argument is forbidden */
4816
*errorcodeptr = ERR59;
4819
*code++ = verbs[i].op_arg;
4821
memcpy(code, arg, arglen);
4826
break; /* Found verb, exit loop */
4553
4829
vn += verbs[i].len + 1;
4555
if (i < verbcount) continue;
4556
*errorcodeptr = ERR60;
4832
if (i < verbcount) continue; /* Successfully handled a verb */
4833
*errorcodeptr = ERR60; /* Verb not recognized */
4871
PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
4872
PUT(code, LINK_SIZE, 0); /* Default length */
5148
PUT(code, 0, (int)(ptr - cd->start_pattern + 1)); /* Pattern offset */
5149
PUT(code, LINK_SIZE, 0); /* Default length */
4873
5150
code += 2 * LINK_SIZE;
4875
5152
previous = NULL;
5336
5613
} /* End of switch for character following (? */
5337
5614
} /* End of (? handling */
5339
/* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
5340
all unadorned brackets become non-capturing and behave like (?:...)
5616
/* Opening parenthesis not followed by '*' or '?'. If PCRE_NO_AUTO_CAPTURE
5617
is set, all unadorned brackets become non-capturing and behave like (?:...)
5343
5620
else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
5530
5807
/* ===================================================================*/
5531
5808
/* Handle metasequences introduced by \. For ones like \d, the ESC_ values
5532
are arranged to be the negation of the corresponding OP_values. For the
5533
back references, the values are ESC_REF plus the reference number. Only
5534
back references and those types that consume a character may be repeated.
5535
We can test for values between ESC_b and ESC_Z for the latter; this may
5536
have to change if any new ones are ever created. */
5809
are arranged to be the negation of the corresponding OP_values in the
5810
default case when PCRE_UCP is not set. For the back references, the values
5811
are ESC_REF plus the reference number. Only back references and those types
5812
that consume a character may be repeated. We can test for values between
5813
ESC_b and ESC_Z for the latter; this may have to change if any new ones are
5538
5816
case CHAR_BACKSLASH:
5695
5973
/* For the rest (including \X when Unicode properties are supported), we
5696
can obtain the OP value by negating the escape value. */
5974
can obtain the OP value by negating the escape value in the default
5975
situation when PCRE_UCP is not set. When it *is* set, we substitute
5976
Unicode property tests. */
5700
previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
5981
if (-c >= ESC_DU && -c <= ESC_wu)
5983
nestptr = ptr + 1; /* Where to resume */
5984
ptr = substitutes[-c - ESC_DU] - 1; /* Just before substitute */
5989
previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
6059
6349
code - start_bracket);
6060
6350
*start_bracket = OP_ONCE;
6061
6351
code += 1 + LINK_SIZE;
6062
PUT(start_bracket, 1, code - start_bracket);
6352
PUT(start_bracket, 1, (int)(code - start_bracket));
6063
6353
*code = OP_KET;
6064
PUT(code, 1, code - start_bracket);
6354
PUT(code, 1, (int)(code - start_bracket));
6065
6355
code += 1 + LINK_SIZE;
6066
6356
length += 2 + 2*LINK_SIZE;
6506
6796
if (strncmp((char *)(ptr+skipatstart+2), STRING_UTF8_RIGHTPAR, 5) == 0)
6507
6797
{ skipatstart += 7; options |= PCRE_UTF8; continue; }
6798
else if (strncmp((char *)(ptr+skipatstart+2), STRING_UCP_RIGHTPAR, 4) == 0)
6799
{ skipatstart += 6; options |= PCRE_UCP; continue; }
6509
6801
if (strncmp((char *)(ptr+skipatstart+2), STRING_CR_RIGHTPAR, 3) == 0)
6510
6802
{ skipatstart += 5; newnl = PCRE_NEWLINE_CR; }
6745
7049
recno = GET(codestart, offset);
6746
7050
groupptr = _pcre_find_bracket(codestart, utf8, recno);
6747
7051
if (groupptr == NULL) errorcode = ERR53;
6748
else PUT(((uschar *)codestart), offset, groupptr - codestart);
7052
else PUT(((uschar *)codestart), offset, (int)(groupptr - codestart));
6751
7055
/* Give an error if there's back reference to a non-existent capturing