489
489
"too many forward references\0"
490
490
"disallowed Unicode code point (>= 0xd800 && <= 0xdfff)\0"
491
491
"invalid UTF-16 string\0"
493
"name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)\0"
494
"character value in \\u.... sequence is too large\0"
494
497
/* Table to identify digits and hex digits. This is used when compiling
3069
3112
#endif /* SUPPORT_UTF */
3070
3113
return (c != TABLE_GET((unsigned int)next, cd->fcc, next)); /* Non-UTF-8 mode */
3072
/* For OP_NOT and OP_NOTI, the data is always a single-byte character. These
3073
opcodes are not used for multi-byte characters, because they are coded using
3074
an XCLASS instead. */
3077
return (c = *previous) == next;
3117
GETCHARTEST(c, previous);
3080
if ((c = *previous) == next) return TRUE;
3125
GETCHARTEST(c, previous);
3129
if (c == next) return TRUE;
3081
3130
#ifdef SUPPORT_UTF
3084
3133
unsigned int othercase;
3085
3134
if (next < 128) othercase = cd->fcc[next]; else
3086
3135
#ifdef SUPPORT_UCP
3087
othercase = UCD_OTHERCASE(next);
3136
othercase = UCD_OTHERCASE((unsigned int)next);
3089
3138
othercase = NOTACHAR;
3094
3143
#endif /* SUPPORT_UTF */
3095
return (c == (int)(TABLE_GET((unsigned int)next, cd->fcc, next))); /* Non-UTF-8 mode */
3144
return (c == TABLE_GET((unsigned int)next, cd->fcc, next)); /* Non-UTF-8 mode */
3097
3146
/* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.
3098
3147
When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
3101
return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
3150
return next > 255 || (cd->ctypes[next] & ctype_digit) == 0;
3103
3152
case OP_NOT_DIGIT:
3104
return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
3153
return next <= 255 && (cd->ctypes[next] & ctype_digit) != 0;
3106
3155
case OP_WHITESPACE:
3107
return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
3156
return next > 255 || (cd->ctypes[next] & ctype_space) == 0;
3109
3158
case OP_NOT_WHITESPACE:
3110
return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
3159
return next <= 255 && (cd->ctypes[next] & ctype_space) != 0;
3112
3161
case OP_WORDCHAR:
3113
return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
3162
return next > 255 || (cd->ctypes[next] & ctype_word) == 0;
3115
3164
case OP_NOT_WORDCHAR:
3116
return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
3165
return next <= 255 && (cd->ctypes[next] & ctype_word) != 0;
3118
3167
case OP_HSPACE:
3119
3168
case OP_NOT_HSPACE:
3194
return c > 127 || (cd->ctypes[c] & ctype_digit) == 0;
3243
return c > 255 || (cd->ctypes[c] & ctype_digit) == 0;
3197
return c <= 127 && (cd->ctypes[c] & ctype_digit) != 0;
3246
return c <= 255 && (cd->ctypes[c] & ctype_digit) != 0;
3200
return c > 127 || (cd->ctypes[c] & ctype_space) == 0;
3249
return c > 255 || (cd->ctypes[c] & ctype_space) == 0;
3203
return c <= 127 && (cd->ctypes[c] & ctype_space) != 0;
3252
return c <= 255 && (cd->ctypes[c] & ctype_space) != 0;
3206
return c > 127 || (cd->ctypes[c] & ctype_word) == 0;
3255
return c > 255 || (cd->ctypes[c] & ctype_word) == 0;
3209
return c <= 127 && (cd->ctypes[c] & ctype_word) != 0;
3258
return c <= 255 && (cd->ctypes[c] & ctype_word) != 0;
3315
3364
return next == -ESC_d;
3317
3366
case OP_WHITESPACE:
3318
return next == -ESC_S || next == -ESC_d || next == -ESC_w || next == -ESC_R;
3367
return next == -ESC_S || next == -ESC_d || next == -ESC_w;
3320
3369
case OP_NOT_WHITESPACE:
3321
return next == -ESC_s || next == -ESC_h || next == -ESC_v;
3370
return next == -ESC_s || next == -ESC_h || next == -ESC_v || next == -ESC_R;
3323
3372
case OP_HSPACE:
3324
3373
return next == -ESC_S || next == -ESC_H || next == -ESC_d ||
4482
4531
LONE_SINGLE_CHARACTER:
4484
4533
/* Only the value of 1 matters for class_single_char. */
4485
4535
if (class_single_char < 2) class_single_char++;
4487
4537
/* If class_charcount is 1, we saw precisely one character. As long as
4488
there were no negated characters >= 128 and there was no use of \p or \P,
4489
in other words, no use of any XCLASS features, we can optimize.
4491
In UTF-8 mode, we can optimize the negative case only if there were no
4492
characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
4493
operate on single-byte characters only. This is an historical hangover.
4494
Maybe one day we can tidy these opcodes to handle multi-byte characters.
4538
there was no use of \p or \P, in other words, no use of any XCLASS
4539
features, we can optimize.
4496
4541
The optimization throws away the bit map. We turn the item into a
4497
4542
1-character OP_CHAR[I] if it's positive, or OP_NOT[I] if it's negative.
4498
Note that OP_NOT[I] does not support multibyte characters. In the positive
4499
case, it can cause firstchar to be set. Otherwise, there can be no first
4500
char if this item is first, whatever repeat count may follow. In the case
4501
of reqchar, save the previous value for reinstating. */
4543
In the positive case, it can cause firstchar to be set. Otherwise, there
4544
can be no first char if this item is first, whatever repeat count may
4545
follow. In the case of reqchar, save the previous value for reinstating. */
4504
if (class_single_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET
4505
&& (!utf || !negate_class || c < (MAX_VALUE_FOR_SINGLE_CHAR + 1)))
4507
4547
if (class_single_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
4511
4550
zeroreqchar = reqchar;
4513
/* The OP_NOT[I] opcodes work on single characters only. */
4515
4552
if (negate_class)
4517
4554
if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
4518
4555
zerofirstchar = firstchar;
4519
4556
*code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT;
4558
if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
4559
code += PRIV(ord2utf)(c, code);
4776
4818
/* Now handle repetition for the different types of item. */
4778
/* If previous was a character match, abolish the item and generate a
4779
repeat item instead. If a char item has a minimum of more than one, ensure
4780
that it is set in reqchar - it might not be if a sequence such as x{3} is
4781
the first thing in a branch because the x will have gone into firstchar
4820
/* If previous was a character or negated character match, abolish the item
4821
and generate a repeat item instead. If a char item has a minimum of more
4822
than one, ensure that it is set in reqchar - it might not be if a sequence
4823
such as x{3} is the first thing in a branch because the x will have gone
4824
into firstchar instead. */
4784
if (*previous == OP_CHAR || *previous == OP_CHARI)
4826
if (*previous == OP_CHAR || *previous == OP_CHARI
4827
|| *previous == OP_NOT || *previous == OP_NOTI)
4786
op_type = (*previous == OP_CHAR)? 0 : OP_STARI - OP_STAR;
4831
default: /* Make compiler happy. */
4832
case OP_CHAR: op_type = OP_STAR - OP_STAR; break;
4833
case OP_CHARI: op_type = OP_STARI - OP_STAR; break;
4834
case OP_NOT: op_type = OP_NOTSTAR - OP_STAR; break;
4835
case OP_NOTI: op_type = OP_NOTSTARI - OP_STAR; break;
4788
4838
/* Deal with UTF characters that take up more than one character. It's
4789
4839
easier to write this out separately than try to macrify it. Use c to
4806
4856
with UTF disabled, or for a single character UTF character. */
4809
if (repeat_min > 1) reqchar = c | req_caseopt | cd->req_varyopt;
4859
if (*previous <= OP_CHARI && repeat_min > 1)
4860
reqchar = c | req_caseopt | cd->req_varyopt;
4812
4863
/* If the repetition is unlimited, it pays to see if the next thing on
4825
4876
goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
4828
/* If previous was a single negated character ([^a] or similar), we use
4829
one of the special opcodes, replacing it. The code is shared with single-
4830
character repeats by setting opt_type to add a suitable offset into
4831
repeat_type. We can also test for auto-possessification. OP_NOT and OP_NOTI
4832
are currently used only for single-byte chars. */
4834
else if (*previous == OP_NOT || *previous == OP_NOTI)
4836
op_type = ((*previous == OP_NOT)? OP_NOTSTAR : OP_NOTSTARI) - OP_STAR;
4838
if (!possessive_quantifier &&
4840
check_auto_possessive(previous, utf, ptr + 1, options, cd))
4842
repeat_type = 0; /* Force greedy */
4843
possessive_quantifier = TRUE;
4845
goto OUTPUT_SINGLE_REPEAT;
4848
4879
/* If previous was a character type match (\d or similar), abolish it and
4849
4880
create a suitable repeat item. The code is shared with single-character
4850
4881
repeats by setting op_type to add a suitable offset into repeat_type. Note
6836
6872
/* For the rest (including \X when Unicode properties are supported), we
6837
6873
can obtain the OP value by negating the escape value in the default
6838
6874
situation when PCRE_UCP is not set. When it *is* set, we substitute
6839
Unicode property tests. */
6875
Unicode property tests. Note that \b and \B do a one-character
6880
if ((-c == ESC_b || -c == ESC_B) && cd->max_lookbehind == 0)
6881
cd->max_lookbehind = 1;
6843
6882
#ifdef SUPPORT_UCP
6844
6883
if (-c >= ESC_DU && -c <= ESC_wu)
7150
else { PUT(reverse_count, 0, fixed_length); }
7191
if (fixed_length > cd->max_lookbehind)
7192
cd->max_lookbehind = fixed_length;
7193
PUT(reverse_count, 0, fixed_length);
7908
7953
&firstchar, &reqchar, NULL, cd, NULL);
7909
7954
re->top_bracket = cd->bracount;
7910
7955
re->top_backref = cd->top_backref;
7956
re->max_lookbehind = cd->max_lookbehind;
7911
7957
re->flags = cd->external_flags | PCRE_MODE;
7913
7959
if (cd->had_accept) reqchar = REQ_NONE; /* Must disable after (*ACCEPT) */