429
439
* Set a bit and maybe its alternate case *
430
440
*************************************************/
432
/* Given a character, set its bit in the table, and also the bit for the other
433
version of a letter if we are caseless.
442
/* Given a character, set its first byte's bit in the table, and also the
443
corresponding bit for the other version of a letter if we are caseless. In
444
UTF-8 mode, for characters greater than 127, we can only do the caseless thing
445
when Unicode property support is available.
436
448
start_bits points to the bit map
449
p points to the character
438
450
caseless the caseless flag
439
451
cd the block with char table pointers
445
set_table_bit(uschar *start_bits, unsigned int c, BOOL caseless,
448
start_bits[c/8] |= (1 << (c&7));
449
if (caseless && (cd->ctypes[c] & ctype_letter) != 0)
450
start_bits[cd->fcc[c]/8] |= (1 << (cd->fcc[c]&7));
452
utf8 TRUE for UTF-8 mode
454
Returns: pointer after the character
457
static const uschar *
458
set_table_bit(uschar *start_bits, const uschar *p, BOOL caseless,
459
compile_data *cd, BOOL utf8)
473
c = UCD_OTHERCASE(c);
474
(void)_pcre_ord2utf8(c, buff);
482
/* Not UTF-8 mode, or character is not greater than 127. */
484
if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
490
/*************************************************
491
* Set bits for a positive character type *
492
*************************************************/
494
/* This function sets starting bits for a character type. In UTF-8 mode, we can
495
only do a direct setting for bytes less than 128, as otherwise there can be
496
confusion with bytes in the middle of UTF-8 characters. In a "traditional"
497
environment, the tables will only recognize ASCII characters anyway, but in at
498
least one Windows environment, some higher-byte bits were set in the tables.
499
So we deal with that case by considering the UTF-8 encoding.
502
start_bits the starting bitmap
503
cbit_type the type of character wanted
504
table_limit 32 for non-UTF-8; 16 for UTF-8
505
cd the block with char table pointers
511
set_type_bits(uschar *start_bits, int cbit_type, int table_limit,
515
for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];
516
if (table_limit == 32) return;
517
for (c = 128; c < 256; c++)
519
if ((cd->cbits[c/8] & (1 << (c&7))) != 0)
522
(void)_pcre_ord2utf8(c, buff);
529
/*************************************************
530
* Set bits for a negative character type *
531
*************************************************/
533
/* This function sets starting bits for a negative character type such as \D.
534
In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
535
otherwise there can be confusion with bytes in the middle of UTF-8 characters.
536
Unlike in the positive case, where we can set appropriate starting bits for
537
specific high-valued UTF-8 characters, in this case we have to set the bits for
538
all high-valued characters. The lowest is 0xc2, but we overkill by starting at
539
0xc0 (192) for simplicity.
542
start_bits the starting bitmap
543
cbit_type the type of character wanted
544
table_limit 32 for non-UTF-8; 16 for UTF-8
545
cd the block with char table pointers
551
set_nottype_bits(uschar *start_bits, int cbit_type, int table_limit,
555
for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type];
556
if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff;
639
set_table_bit(start_bits, tcode[1], caseless, cd);
643
/* Single character type sets the bits and stops */
736
(void)set_table_bit(start_bits, tcode + 1, caseless, cd, utf8);
740
/* Special spacing and line-terminating items. These recognize specific
741
lists of characters. The difference between VSPACE and ANYNL is that the
742
latter can match the two-character CRLF sequence, but that is not
743
relevant for finding the first character, so their code here is
751
SET_BIT(0xC2); /* For U+00A0 */
752
SET_BIT(0xE1); /* For U+1680, U+180E */
753
SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
754
SET_BIT(0xE3); /* For U+3000 */
768
SET_BIT(0xC2); /* For U+0085 */
769
SET_BIT(0xE2); /* For U+2028, U+2029 */
775
/* Single character types set the bits and stop. Note that if PCRE_UCP
776
is set, we do not see these op codes because \d etc are converted to
777
properties. Therefore, these apply in the case when only characters less
778
than 256 are recognized to match the types. */
645
780
case OP_NOT_DIGIT:
646
for (c = 0; c < 32; c++)
647
start_bits[c] |= ~cd->cbits[c+cbit_digit];
781
set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
648
782
try_next = FALSE;
652
for (c = 0; c < 32; c++)
653
start_bits[c] |= cd->cbits[c+cbit_digit];
786
set_type_bits(start_bits, cbit_digit, table_limit, cd);
654
787
try_next = FALSE;
657
790
/* The cbit_space table has vertical tab as whitespace; we have to
791
ensure it is set as not whitespace. */
660
793
case OP_NOT_WHITESPACE:
661
for (c = 0; c < 32; c++)
663
int d = cd->cbits[c+cbit_space];
664
if (c == 1) d &= ~0x08;
794
set_nottype_bits(start_bits, cbit_space, table_limit, cd);
795
start_bits[1] |= 0x08;
667
796
try_next = FALSE;
670
799
/* The cbit_space table has vertical tab as whitespace; we have to
800
not set it from the table. */
673
802
case OP_WHITESPACE:
674
for (c = 0; c < 32; c++)
676
int d = cd->cbits[c+cbit_space];
677
if (c == 1) d &= ~0x08;
803
c = start_bits[1]; /* Save in case it was already set */
804
set_type_bits(start_bits, cbit_space, table_limit, cd);
805
start_bits[1] = (start_bits[1] & ~0x08) | c;
680
806
try_next = FALSE;
683
809
case OP_NOT_WORDCHAR:
684
for (c = 0; c < 32; c++)
685
start_bits[c] |= ~cd->cbits[c+cbit_word];
810
set_nottype_bits(start_bits, cbit_word, table_limit, cd);
686
811
try_next = FALSE;
689
814
case OP_WORDCHAR:
690
for (c = 0; c < 32; c++)
691
start_bits[c] |= cd->cbits[c+cbit_word];
815
set_type_bits(start_bits, cbit_word, table_limit, cd);
692
816
try_next = FALSE;
720
845
case OP_TYPEPOSQUERY:
858
SET_BIT(0xC2); /* For U+00A0 */
859
SET_BIT(0xE1); /* For U+1680, U+180E */
860
SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
861
SET_BIT(0xE3); /* For U+3000 */
874
SET_BIT(0xC2); /* For U+0085 */
875
SET_BIT(0xE2); /* For U+2028, U+2029 */
727
880
case OP_NOT_DIGIT:
728
for (c = 0; c < 32; c++)
729
start_bits[c] |= ~cd->cbits[c+cbit_digit];
881
set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
733
for (c = 0; c < 32; c++)
734
start_bits[c] |= cd->cbits[c+cbit_digit];
885
set_type_bits(start_bits, cbit_digit, table_limit, cd);
737
888
/* The cbit_space table has vertical tab as whitespace; we have to
889
ensure it gets set as not whitespace. */
740
891
case OP_NOT_WHITESPACE:
741
for (c = 0; c < 32; c++)
743
int d = cd->cbits[c+cbit_space];
744
if (c == 1) d &= ~0x08;
892
set_nottype_bits(start_bits, cbit_space, table_limit, cd);
893
start_bits[1] |= 0x08;
749
896
/* The cbit_space table has vertical tab as whitespace; we have to
752
899
case OP_WHITESPACE:
753
for (c = 0; c < 32; c++)
755
int d = cd->cbits[c+cbit_space];
756
if (c == 1) d &= ~0x08;
900
c = start_bits[1]; /* Save in case it was already set */
901
set_type_bits(start_bits, cbit_space, table_limit, cd);
902
start_bits[1] = (start_bits[1] & ~0x08) | c;
761
905
case OP_NOT_WORDCHAR:
762
for (c = 0; c < 32; c++)
763
start_bits[c] |= ~cd->cbits[c+cbit_word];
906
set_nottype_bits(start_bits, cbit_word, table_limit, cd);
766
909
case OP_WORDCHAR:
767
for (c = 0; c < 32; c++)
768
start_bits[c] |= cd->cbits[c+cbit_word];
910
set_type_bits(start_bits, cbit_word, table_limit, cd);