1
/* $OpenLDAP: pkg/ldap/libraries/liblunicode/ure/ure.c,v 1.17.2.3 2008/02/11 23:26:42 kurt Exp $ */
2
/* This work is part of OpenLDAP Software <http://www.openldap.org/>.
4
* Copyright 1998-2008 The OpenLDAP Foundation.
7
* Redistribution and use in source and binary forms, with or without
8
* modification, are permitted only as authorized by the OpenLDAP
11
* A copy of this license is available in file LICENSE in the
12
* top-level directory of the distribution or, alternatively, at
13
* <http://www.OpenLDAP.org/license.html>.
15
/* Copyright 1997, 1998, 1999 Computing Research Labs,
16
* New Mexico State University
18
* Permission is hereby granted, free of charge, to any person obtaining a
19
* copy of this software and associated documentation files (the "Software"),
20
* to deal in the Software without restriction, including without limitation
21
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
22
* and/or sell copies of the Software, and to permit persons to whom the
23
* Software is furnished to do so, subject to the following conditions:
25
* The above copyright notice and this permission notice shall be included in
26
* all copies or substantial portions of the Software.
28
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
29
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
30
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
31
* THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
32
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
33
* OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
34
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
36
/* $Id: ure.c,v 1.2 1999/09/21 15:47:43 mleisher Exp $" */
40
#include <ac/stdlib.h>
41
#include <ac/string.h>
42
#include <ac/unistd.h>
47
* Flags used internally in the DFA.
49
#define _URE_DFA_CASEFOLD 0x01
50
#define _URE_DFA_BLANKLINE 0x02
52
static unsigned long cclass_flags[] = {
89
* Symbol types for the DFA.
91
#define _URE_ANY_CHAR 1
94
#define _URE_NCCLASS 4
95
#define _URE_BOL_ANCHOR 5
96
#define _URE_EOL_ANCHOR 6
99
* Op codes for converting the NFA to a DFA.
101
#define _URE_SYMBOL 10
102
#define _URE_PAREN 11
103
#define _URE_QUEST 12
110
#define _URE_NOOP 0xffff
112
#define _URE_REGSTART 0x8000
113
#define _URE_REGEND 0x4000
116
* Structure used to handle a compacted range of characters.
124
_ure_range_t *ranges;
135
* This is a general element structure used for expressions and stack
147
* This is a structure used to track a list or a stack of states.
156
* Structure to track the list of unique states for a symbol
165
_ure_stlist_t states;
169
* Structure to hold a single state.
182
* Structure used for keeping lists of states.
185
_ure_state_t *states;
191
* Structure to track pairs of DFA states when equivalent states are
200
* Structure used for constructing the NFA and reducing to a minimal DFA.
202
typedef struct _ure_buffer_t {
210
* Table of unique symbols encountered.
212
_ure_symtab_t *symtab;
217
* Tracks the unique expressions generated for the NFA and when the NFA is
225
* The reduced table of unique groups of NFA states.
227
_ure_statetable_t states;
230
* Tracks states when equivalent states are merged.
248
typedef struct _ure_dfa_t {
254
_ure_dstate_t *states;
261
/*************************************************************************
265
*************************************************************************/
268
_ure_memmove(char *dest, char *src, unsigned long bytes)
277
* Do a memmove using Ye Olde Duff's Device for efficiency.
286
case 7: *--dest = *--src;
287
case 6: *--dest = *--src;
288
case 5: *--dest = *--src;
289
case 4: *--dest = *--src;
290
case 3: *--dest = *--src;
291
case 2: *--dest = *--src;
292
case 1: *--dest = *--src;
295
} else if (src > dest) {
299
case 7: *dest++ = *src++;
300
case 6: *dest++ = *src++;
301
case 5: *dest++ = *src++;
302
case 4: *dest++ = *src++;
303
case 3: *dest++ = *src++;
304
case 2: *dest++ = *src++;
305
case 1: *dest++ = *src++;
312
_ure_push(ucs2_t v, _ure_buffer_t *b)
320
* If the `reducing' parameter is non-zero, check to see if the value
321
* passed is already on the stack.
323
if (b->reducing != 0 && b->expr[v].onstack != 0)
327
if (s->slist_used == s->slist_size) {
328
if (s->slist_size == 0)
329
s->slist = (ucs2_t *) malloc(sizeof(ucs2_t) << 3);
331
s->slist = (ucs2_t *) realloc((char *) s->slist,
332
sizeof(ucs2_t) * (s->slist_size + 8));
335
s->slist[s->slist_used++] = v;
338
* If the `reducing' parameter is non-zero, flag the element as being on
341
if (b->reducing != 0)
342
b->expr[v].onstack = 1;
346
_ure_peek(_ure_buffer_t *b)
348
if (b == 0 || b->stack.slist_used == 0)
351
return b->stack.slist[b->stack.slist_used - 1];
355
_ure_pop(_ure_buffer_t *b)
359
if (b == 0 || b->stack.slist_used == 0)
362
v = b->stack.slist[--b->stack.slist_used];
364
b->expr[v].onstack = 0;
369
/*************************************************************************
371
* Start symbol parse functions.
373
*************************************************************************/
376
* Parse a comma-separated list of integers that represent character
377
* properties. Combine them into a mask that is returned in the `mask'
378
* variable, and return the number of characters consumed.
381
_ure_prop_list(ucs2_t *pp, unsigned long limit, unsigned long *mask,
390
for (m = n = 0; b->error == _URE_OK && sp < ep; sp++) {
393
* Encountered a comma, so select the next character property flag
394
* and reset the number.
396
m |= cclass_flags[n];
398
} else if (*sp >= '0' && *sp <= '9')
400
* Encountered a digit, so start or continue building the cardinal
401
* that represents the character property flag.
403
n = (n * 10) + (*sp - '0');
406
* Encountered something that is not part of the property list.
407
* Indicate that we are done.
412
* If a property number greater than 32 occurs, then there is a
413
* problem. Most likely a missing comma separator.
416
b->error = _URE_INVALID_PROPERTY;
420
m |= cclass_flags[n];
423
* Set the mask that represents the group of character properties.
428
* Return the number of characters consumed.
434
* Collect a hex number with 1 to 4 digits and return the number
435
* of characters used.
438
_ure_hex(ucs2_t *np, unsigned long limit, ucs4_t *n)
447
for (nn = 0, i = 0; i < 4 && sp < ep; i++, sp++) {
448
if (*sp >= '0' && *sp <= '9')
449
nn = (nn << 4) + (*sp - '0');
450
else if (*sp >= 'A' && *sp <= 'F')
451
nn = (nn << 4) + ((*sp - 'A') + 10);
452
else if (*sp >= 'a' && *sp <= 'f')
453
nn = (nn << 4) + ((*sp - 'a') + 10);
456
* Encountered something that is not a hex digit.
462
* Assign the character code collected and return the number of
471
* Insert a range into a character class, removing duplicates and ordering
472
* them in increasing range-start order.
475
_ure_add_range(_ure_ccl_t *ccl, _ure_range_t *r, _ure_buffer_t *b)
482
* If the `casefold' flag is set, then make sure both endpoints of the
483
* range are converted to lower case.
485
if (b->flags & _URE_DFA_CASEFOLD) {
486
r->min_code = _ure_tolower(r->min_code);
487
r->max_code = _ure_tolower(r->max_code);
491
* Swap the range endpoints if they are not in increasing order.
493
if (r->min_code > r->max_code) {
495
r->min_code = r->max_code;
499
for (i = 0, rp = ccl->ranges;
500
i < ccl->ranges_used && r->min_code < rp->min_code; i++, rp++) ;
503
* Check for a duplicate.
505
if (i < ccl->ranges_used &&
506
r->min_code == rp->min_code && r->max_code == rp->max_code)
509
if (ccl->ranges_used == ccl->ranges_size) {
510
if (ccl->ranges_size == 0)
511
ccl->ranges = (_ure_range_t *) malloc(sizeof(_ure_range_t) << 3);
513
ccl->ranges = (_ure_range_t *)
514
realloc((char *) ccl->ranges,
515
sizeof(_ure_range_t) * (ccl->ranges_size + 8));
516
ccl->ranges_size += 8;
519
rp = ccl->ranges + ccl->ranges_used;
521
if (i < ccl->ranges_used)
522
_ure_memmove((char *) (rp + 1), (char *) rp,
523
sizeof(_ure_range_t) * (ccl->ranges_used - i));
526
rp->min_code = r->min_code;
527
rp->max_code = r->max_code;
530
#define _URE_ALPHA_MASK (_URE_UPPER|_URE_LOWER|_URE_OTHERLETTER|\
531
_URE_MODIFIER|_URE_TITLE|_URE_NONSPACING|_URE_COMBINING)
532
#define _URE_ALNUM_MASK (_URE_ALPHA_MASK|_URE_NUMDIGIT)
533
#define _URE_PUNCT_MASK (_URE_DASHPUNCT|_URE_OPENPUNCT|_URE_CLOSEPUNCT|\
535
#define _URE_GRAPH_MASK (_URE_NUMDIGIT|_URE_NUMOTHER|_URE_ALPHA_MASK|\
536
_URE_MATHSYM|_URE_CURRENCYSYM|_URE_OTHERSYM)
537
#define _URE_PRINT_MASK (_URE_GRAPH_MASK|_URE_SPACESEP)
538
#define _URE_SPACE_MASK (_URE_SPACESEP|_URE_LINESEP|_URE_PARASEP)
540
typedef void (*_ure_cclsetup_t)(
550
_ure_cclsetup_t func;
555
_ure_ccl_setup(_ure_symtab_t *sym, unsigned long mask, _ure_buffer_t *b)
561
_ure_space_setup(_ure_symtab_t *sym, unsigned long mask, _ure_buffer_t *b)
568
* Add the additional characters needed for handling isspace().
570
range.min_code = range.max_code = '\t';
571
_ure_add_range(&sym->sym.ccl, &range, b);
572
range.min_code = range.max_code = '\r';
573
_ure_add_range(&sym->sym.ccl, &range, b);
574
range.min_code = range.max_code = '\n';
575
_ure_add_range(&sym->sym.ccl, &range, b);
576
range.min_code = range.max_code = '\f';
577
_ure_add_range(&sym->sym.ccl, &range, b);
578
range.min_code = range.max_code = 0xfeff;
579
_ure_add_range(&sym->sym.ccl, &range, b);
583
_ure_xdigit_setup(_ure_symtab_t *sym, unsigned long mask, _ure_buffer_t *b)
588
* Add the additional characters needed for handling isxdigit().
590
range.min_code = '0';
591
range.max_code = '9';
592
_ure_add_range(&sym->sym.ccl, &range, b);
593
range.min_code = 'A';
594
range.max_code = 'F';
595
_ure_add_range(&sym->sym.ccl, &range, b);
596
range.min_code = 'a';
597
range.max_code = 'f';
598
_ure_add_range(&sym->sym.ccl, &range, b);
601
static _ure_trie_t cclass_trie[] = {
602
{0x003a, 1, 1, 0, 0},
603
{0x0061, 9, 10, 0, 0},
604
{0x0063, 8, 19, 0, 0},
605
{0x0064, 7, 24, 0, 0},
606
{0x0067, 6, 29, 0, 0},
607
{0x006c, 5, 34, 0, 0},
608
{0x0070, 4, 39, 0, 0},
609
{0x0073, 3, 49, 0, 0},
610
{0x0075, 2, 54, 0, 0},
611
{0x0078, 1, 59, 0, 0},
612
{0x006c, 1, 11, 0, 0},
613
{0x006e, 2, 13, 0, 0},
614
{0x0070, 1, 16, 0, 0},
615
{0x0075, 1, 14, 0, 0},
616
{0x006d, 1, 15, 0, 0},
617
{0x003a, 1, 16, _ure_ccl_setup, _URE_ALNUM_MASK},
618
{0x0068, 1, 17, 0, 0},
619
{0x0061, 1, 18, 0, 0},
620
{0x003a, 1, 19, _ure_ccl_setup, _URE_ALPHA_MASK},
621
{0x006e, 1, 20, 0, 0},
622
{0x0074, 1, 21, 0, 0},
623
{0x0072, 1, 22, 0, 0},
624
{0x006c, 1, 23, 0, 0},
625
{0x003a, 1, 24, _ure_ccl_setup, _URE_CNTRL},
626
{0x0069, 1, 25, 0, 0},
627
{0x0067, 1, 26, 0, 0},
628
{0x0069, 1, 27, 0, 0},
629
{0x0074, 1, 28, 0, 0},
630
{0x003a, 1, 29, _ure_ccl_setup, _URE_NUMDIGIT},
631
{0x0072, 1, 30, 0, 0},
632
{0x0061, 1, 31, 0, 0},
633
{0x0070, 1, 32, 0, 0},
634
{0x0068, 1, 33, 0, 0},
635
{0x003a, 1, 34, _ure_ccl_setup, _URE_GRAPH_MASK},
636
{0x006f, 1, 35, 0, 0},
637
{0x0077, 1, 36, 0, 0},
638
{0x0065, 1, 37, 0, 0},
639
{0x0072, 1, 38, 0, 0},
640
{0x003a, 1, 39, _ure_ccl_setup, _URE_LOWER},
641
{0x0072, 2, 41, 0, 0},
642
{0x0075, 1, 45, 0, 0},
643
{0x0069, 1, 42, 0, 0},
644
{0x006e, 1, 43, 0, 0},
645
{0x0074, 1, 44, 0, 0},
646
{0x003a, 1, 45, _ure_ccl_setup, _URE_PRINT_MASK},
647
{0x006e, 1, 46, 0, 0},
648
{0x0063, 1, 47, 0, 0},
649
{0x0074, 1, 48, 0, 0},
650
{0x003a, 1, 49, _ure_ccl_setup, _URE_PUNCT_MASK},
651
{0x0070, 1, 50, 0, 0},
652
{0x0061, 1, 51, 0, 0},
653
{0x0063, 1, 52, 0, 0},
654
{0x0065, 1, 53, 0, 0},
655
{0x003a, 1, 54, _ure_space_setup, _URE_SPACE_MASK},
656
{0x0070, 1, 55, 0, 0},
657
{0x0070, 1, 56, 0, 0},
658
{0x0065, 1, 57, 0, 0},
659
{0x0072, 1, 58, 0, 0},
660
{0x003a, 1, 59, _ure_ccl_setup, _URE_UPPER},
661
{0x0064, 1, 60, 0, 0},
662
{0x0069, 1, 61, 0, 0},
663
{0x0067, 1, 62, 0, 0},
664
{0x0069, 1, 63, 0, 0},
665
{0x0074, 1, 64, 0, 0},
666
{0x003a, 1, 65, _ure_xdigit_setup, 0},
670
* Probe for one of the POSIX colon delimited character classes in the static
674
_ure_posix_ccl(ucs2_t *cp, unsigned long limit, _ure_symtab_t *sym,
683
* If the number of characters left is less than 7, then this cannot be
684
* interpreted as one of the colon delimited classes.
692
for (i = 0; sp < ep && i < 8; i++, sp++) {
695
for (; n > 0 && tp->key != *sp; tp++, n--) ;
700
if (*sp == ':' && (i == 6 || i == 7)) {
705
tp = cclass_trie + tp->next;
710
(*tp->func)(sym, tp->mask, b);
716
* Construct a list of ranges and return the number of characters consumed.
719
_ure_cclass(ucs2_t *cp, unsigned long limit, _ure_symtab_t *symp,
733
symp->type = _URE_NCCLASS;
736
symp->type = _URE_CCLASS;
738
for (last = 0, range_end = 0;
739
b->error == _URE_OK && sp < ep && *sp != ']'; ) {
744
* The EOS was encountered when expecting the reverse solidus
745
* to be followed by the character it is escaping. Set an
746
* error code and return the number of characters consumed up
749
b->error = _URE_UNEXPECTED_EOS;
778
sp += _ure_prop_list(sp, ep - sp, &symp->props, b);
780
* Invert the bit mask of the properties if this is a negated
781
* character class or if 'P' is used to specify a list of
782
* character properties that should *not* match in a
786
symp->props = ~symp->props;
794
((*sp >= '0' && *sp <= '9') ||
795
(*sp >= 'A' && *sp <= 'F') ||
796
(*sp >= 'a' && *sp <= 'f')))
797
sp += _ure_hex(sp, ep - sp, &c);
799
} else if (c == ':') {
801
* Probe for a POSIX colon delimited character class.
804
if ((n = _ure_posix_ccl(sp, ep - sp, symp, b)) == 0)
812
cclp = &symp->sym.ccl;
815
* Check to see if the current character is a low surrogate that needs
816
* to be combined with a preceding high surrogate.
819
if (c >= 0xdc00 && c <= 0xdfff)
821
* Construct the UTF16 character code.
823
c = 0x10000 + (((last & 0x03ff) << 10) | (c & 0x03ff));
826
* Add the isolated high surrogate to the range.
829
range.max_code = last & 0xffff;
831
range.min_code = range.max_code = last & 0xffff;
833
_ure_add_range(cclp, &range, b);
839
* Clear the last character code.
844
* This slightly awkward code handles the different cases needed to
847
if (c >= 0xd800 && c <= 0xdbff) {
849
* If the high surrogate is followed by a range indicator, simply
850
* add it as the range start. Otherwise, save it in case the next
851
* character is a low surrogate.
859
} else if (range_end == 1) {
861
_ure_add_range(cclp, &range, b);
864
range.min_code = range.max_code = c;
869
_ure_add_range(cclp, &range, b);
873
if (sp < ep && *sp == ']')
877
* The parse was not terminated by the character class close symbol
878
* (']'), so set an error code.
880
b->error = _URE_CCLASS_OPEN;
886
* Probe for a low surrogate hex code.
889
_ure_probe_ls(ucs2_t *ls, unsigned long limit, ucs4_t *c)
894
for (i = code = 0, sp = ls, ep = sp + limit; i < 4 && sp < ep; sp++) {
895
if (*sp >= '0' && *sp <= '9')
896
code = (code << 4) + (*sp - '0');
897
else if (*sp >= 'A' && *sp <= 'F')
898
code = (code << 4) + ((*sp - 'A') + 10);
899
else if (*sp >= 'a' && *sp <= 'f')
900
code = (code << 4) + ((*sp - 'a') + 10);
906
return (0xdc00 <= code && code <= 0xdfff) ? sp - ls : 0;
910
_ure_compile_symbol(ucs2_t *sym, unsigned long limit, _ure_symtab_t *symp,
919
if ((c = *sp++) == '\\') {
923
* The EOS was encountered when expecting the reverse solidus to
924
* be followed by the character it is escaping. Set an error code
925
* and return the number of characters consumed up to this point.
927
b->error = _URE_UNEXPECTED_EOS;
935
symp->type = (c == 'p') ? _URE_CCLASS : _URE_NCCLASS;
936
sp += _ure_prop_list(sp, ep - sp, &symp->props, b);
939
symp->type = _URE_CHAR;
940
symp->sym.chr = 0x07;
943
symp->type = _URE_CHAR;
944
symp->sym.chr = 0x08;
947
symp->type = _URE_CHAR;
948
symp->sym.chr = 0x0c;
951
symp->type = _URE_CHAR;
952
symp->sym.chr = 0x0a;
955
symp->type = _URE_CHAR;
956
symp->sym.chr = 0x0d;
959
symp->type = _URE_CHAR;
960
symp->sym.chr = 0x09;
963
symp->type = _URE_CHAR;
964
symp->sym.chr = 0x0b;
971
* Collect between 1 and 4 digits representing a UCS2 code. Fall
972
* through to the next case.
975
((*sp >= '0' && *sp <= '9') ||
976
(*sp >= 'A' && *sp <= 'F') ||
977
(*sp >= 'a' && *sp <= 'f')))
978
sp += _ure_hex(sp, ep - sp, &c);
982
* Simply add an escaped character here.
984
symp->type = _URE_CHAR;
987
} else if (c == '^' || c == '$')
989
* Handle the BOL and EOL anchors. This actually consists simply of
990
* setting a flag that indicates that the user supplied anchor match
991
* function should be called. This needs to be done instead of simply
992
* matching line/paragraph separators because beginning-of-text and
993
* end-of-text tests are needed as well.
995
symp->type = (c == '^') ? _URE_BOL_ANCHOR : _URE_EOL_ANCHOR;
998
* Construct a character class.
1000
sp += _ure_cclass(sp, ep - sp, symp, b);
1002
symp->type = _URE_ANY_CHAR;
1004
symp->type = _URE_CHAR;
1009
* If the symbol type happens to be a character and is a high surrogate,
1010
* then probe forward to see if it is followed by a low surrogate that
1011
* needs to be added.
1013
if (sp < ep && symp->type == _URE_CHAR &&
1014
0xd800 <= symp->sym.chr && symp->sym.chr <= 0xdbff) {
1016
if (0xdc00 <= *sp && *sp <= 0xdfff) {
1017
symp->sym.chr = 0x10000 + (((symp->sym.chr & 0x03ff) << 10) |
1020
} else if (*sp == '\\' && (*(sp + 1) == 'x' || *(sp + 1) == 'X' ||
1021
*(sp + 1) == 'u' || *(sp + 1) == 'U')) {
1022
sp += _ure_probe_ls(sp + 2, ep - (sp + 2), &c);
1023
if (0xdc00 <= c && c <= 0xdfff) {
1025
* Take into account the \[xu] in front of the hex code.
1028
symp->sym.chr = 0x10000 + (((symp->sym.chr & 0x03ff) << 10) |
1035
* Last, make sure any _URE_CHAR type symbols are changed to lower case if
1036
* the `casefold' flag is set.
1038
if ((b->flags & _URE_DFA_CASEFOLD) && symp->type == _URE_CHAR)
1039
symp->sym.chr = _ure_tolower(symp->sym.chr);
1042
* If the symbol constructed is anything other than one of the anchors,
1043
* make sure the _URE_DFA_BLANKLINE flag is removed.
1045
if (symp->type != _URE_BOL_ANCHOR && symp->type != _URE_EOL_ANCHOR)
1046
b->flags &= ~_URE_DFA_BLANKLINE;
1049
* Return the number of characters consumed.
1055
_ure_sym_neq(_ure_symtab_t *a, _ure_symtab_t *b)
1057
if (a->type != b->type || a->mods != b->mods || a->props != b->props)
1060
if (a->type == _URE_CCLASS || a->type == _URE_NCCLASS) {
1061
if (a->sym.ccl.ranges_used != b->sym.ccl.ranges_used)
1063
if (a->sym.ccl.ranges_used > 0 &&
1064
memcmp((char *) a->sym.ccl.ranges, (char *) b->sym.ccl.ranges,
1065
sizeof(_ure_range_t) * a->sym.ccl.ranges_used) != 0)
1067
} else if (a->type == _URE_CHAR && a->sym.chr != b->sym.chr)
1073
* Construct a symbol, but only keep unique symbols.
1076
_ure_make_symbol(ucs2_t *sym, unsigned long limit, unsigned long *consumed,
1080
_ure_symtab_t *sp, symbol;
1083
* Build the next symbol so we can test to see if it is already in the
1086
(void) memset((char *) &symbol, '\0', sizeof(_ure_symtab_t));
1087
*consumed = _ure_compile_symbol(sym, limit, &symbol, b);
1090
* Check to see if the symbol exists.
1092
for (i = 0, sp = b->symtab;
1093
i < b->symtab_used && _ure_sym_neq(&symbol, sp); i++, sp++) ;
1095
if (i < b->symtab_used) {
1097
* Free up any ranges used for the symbol.
1099
if ((symbol.type == _URE_CCLASS || symbol.type == _URE_NCCLASS) &&
1100
symbol.sym.ccl.ranges_size > 0)
1101
free((char *) symbol.sym.ccl.ranges);
1103
return b->symtab[i].id;
1107
* Need to add the new symbol.
1109
if (b->symtab_used == b->symtab_size) {
1110
if (b->symtab_size == 0)
1111
b->symtab = (_ure_symtab_t *) malloc(sizeof(_ure_symtab_t) << 3);
1113
b->symtab = (_ure_symtab_t *)
1114
realloc((char *) b->symtab,
1115
sizeof(_ure_symtab_t) * (b->symtab_size + 8));
1116
sp = b->symtab + b->symtab_size;
1117
(void) memset((char *) sp, '\0', sizeof(_ure_symtab_t) << 3);
1118
b->symtab_size += 8;
1121
symbol.id = b->symtab_used++;
1122
(void) AC_MEMCPY((char *) &b->symtab[symbol.id], (char *) &symbol,
1123
sizeof(_ure_symtab_t));
1128
/*************************************************************************
1130
* End symbol parse functions.
1132
*************************************************************************/
1135
_ure_make_expr(ucs2_t type, ucs2_t lhs, ucs2_t rhs, _ure_buffer_t *b)
1143
* Determine if the expression already exists or not.
1145
for (i = 0; i < b->expr_used; i++) {
1146
if (b->expr[i].type == type && b->expr[i].lhs == lhs &&
1147
b->expr[i].rhs == rhs)
1150
if (i < b->expr_used)
1154
* Need to add a new expression.
1156
if (b->expr_used == b->expr_size) {
1157
if (b->expr_size == 0)
1158
b->expr = (_ure_elt_t *) malloc(sizeof(_ure_elt_t) << 3);
1160
b->expr = (_ure_elt_t *)
1161
realloc((char *) b->expr,
1162
sizeof(_ure_elt_t) * (b->expr_size + 8));
1166
b->expr[b->expr_used].onstack = 0;
1167
b->expr[b->expr_used].type = type;
1168
b->expr[b->expr_used].lhs = lhs;
1169
b->expr[b->expr_used].rhs = rhs;
1171
return b->expr_used++;
1174
static unsigned char spmap[] = {
1175
0x00, 0x00, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00,
1176
0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1177
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1180
#define _ure_isspecial(cc) ((cc) > 0x20 && (cc) < 0x7f && \
1181
(spmap[(cc) >> 3] & (1 << ((cc) & 7))))
1184
* Convert the regular expression into an NFA in a form that will be easy to
1185
* reduce to a DFA. The starting state for the reduction will be returned.
1188
_ure_re2nfa(ucs2_t *re, unsigned long relen, _ure_buffer_t *b)
1190
ucs2_t c, state, top, sym, *sp, *ep;
1197
while (b->error == _URE_OK && sp < ep) {
1201
_ure_push(_URE_PAREN, b);
1205
* Check for the case of too many close parentheses.
1207
if (_ure_peek(b) == _URE_NOOP) {
1208
b->error = _URE_UNBALANCED_GROUP;
1212
while ((top = _ure_peek(b)) == _URE_AND || top == _URE_OR)
1214
* Make an expression with the AND or OR operator and its right
1217
state = _ure_make_expr(_ure_pop(b), _ure_pop(b), state, b);
1220
* Remove the _URE_PAREN off the stack.
1225
state = _ure_make_expr(_URE_STAR, state, _URE_NOOP, b);
1228
state = _ure_make_expr(_URE_PLUS, state, _URE_NOOP, b);
1231
state = _ure_make_expr(_URE_QUEST, state, _URE_NOOP, b);
1234
while ((top = _ure_peek(b)) == _URE_AND || top == _URE_OR)
1236
* Make an expression with the AND or OR operator and its right
1239
state = _ure_make_expr(_ure_pop(b), _ure_pop(b), state, b);
1241
_ure_push(state, b);
1242
_ure_push(_URE_OR, b);
1246
sym = _ure_make_symbol(sp, ep - sp, &used, b);
1248
state = _ure_make_expr(_URE_SYMBOL, sym, _URE_NOOP, b);
1252
if (c != '(' && c != '|' && sp < ep &&
1253
(!_ure_isspecial(*sp) || *sp == '(')) {
1254
_ure_push(state, b);
1255
_ure_push(_URE_AND, b);
1258
while ((top = _ure_peek(b)) == _URE_AND || top == _URE_OR)
1260
* Make an expression with the AND or OR operator and its right
1263
state = _ure_make_expr(_ure_pop(b), _ure_pop(b), state, b);
1265
if (b->stack.slist_used > 0)
1266
b->error = _URE_UNBALANCED_GROUP;
1268
return (b->error == _URE_OK) ? state : _URE_NOOP;
1272
_ure_add_symstate(ucs2_t sym, ucs2_t state, _ure_buffer_t *b)
1278
* Locate the symbol in the symbol table so the state can be added.
1279
* If the symbol doesn't exist, then a real problem exists.
1281
for (i = 0, sp = b->symtab; i < b->symtab_used && sym != sp->id;
1285
* Now find out if the state exists in the symbol's state list.
1287
for (i = 0, stp = sp->states.slist;
1288
i < sp->states.slist_used && state > *stp; i++, stp++) ;
1290
if (i == sp->states.slist_used || state < *stp) {
1292
* Need to add the state in order.
1294
if (sp->states.slist_used == sp->states.slist_size) {
1295
if (sp->states.slist_size == 0)
1296
sp->states.slist = (ucs2_t *) malloc(sizeof(ucs2_t) << 3);
1298
sp->states.slist = (ucs2_t *)
1299
realloc((char *) sp->states.slist,
1300
sizeof(ucs2_t) * (sp->states.slist_size + 8));
1301
sp->states.slist_size += 8;
1303
if (i < sp->states.slist_used)
1304
(void) _ure_memmove((char *) (sp->states.slist + i + 1),
1305
(char *) (sp->states.slist + i),
1306
sizeof(ucs2_t) * (sp->states.slist_used - i));
1307
sp->states.slist[i] = state;
1308
sp->states.slist_used++;
1313
_ure_add_state(ucs2_t nstates, ucs2_t *states, _ure_buffer_t *b)
1318
for (i = 0, sp = b->states.states; i < b->states.states_used; i++, sp++) {
1319
if (sp->st.slist_used == nstates &&
1320
memcmp((char *) states, (char *) sp->st.slist,
1321
sizeof(ucs2_t) * nstates) == 0)
1325
if (i == b->states.states_used) {
1327
* Need to add a new DFA state (set of NFA states).
1329
if (b->states.states_used == b->states.states_size) {
1330
if (b->states.states_size == 0)
1331
b->states.states = (_ure_state_t *)
1332
malloc(sizeof(_ure_state_t) << 3);
1334
b->states.states = (_ure_state_t *)
1335
realloc((char *) b->states.states,
1336
sizeof(_ure_state_t) * (b->states.states_size + 8));
1337
sp = b->states.states + b->states.states_size;
1338
(void) memset((char *) sp, '\0', sizeof(_ure_state_t) << 3);
1339
b->states.states_size += 8;
1342
sp = b->states.states + b->states.states_used++;
1345
if (sp->st.slist_used + nstates > sp->st.slist_size) {
1346
if (sp->st.slist_size == 0)
1347
sp->st.slist = (ucs2_t *)
1348
malloc(sizeof(ucs2_t) * (sp->st.slist_used + nstates));
1350
sp->st.slist = (ucs2_t *)
1351
realloc((char *) sp->st.slist,
1352
sizeof(ucs2_t) * (sp->st.slist_used + nstates));
1353
sp->st.slist_size = sp->st.slist_used + nstates;
1355
sp->st.slist_used = nstates;
1356
(void) AC_MEMCPY((char *) sp->st.slist, (char *) states,
1357
sizeof(ucs2_t) * nstates);
1361
* Return the ID of the DFA state representing a group of NFA states.
1367
_ure_reduce(ucs2_t start, _ure_buffer_t *b)
1369
ucs2_t i, j, state, eval, syms, rhs;
1370
ucs2_t s1, s2, ns1, ns2;
1377
* Add the starting state for the reduction.
1379
_ure_add_state(1, &start, b);
1382
* Process each set of NFA states that get created.
1384
for (i = 0; i < b->states.states_used; i++) {
1385
sp = b->states.states + i;
1388
* Push the current states on the stack.
1390
for (j = 0; j < sp->st.slist_used; j++)
1391
_ure_push(sp->st.slist[j], b);
1394
* Reduce the NFA states.
1396
for (j = sp->accepting = syms = 0; j < b->stack.slist_used; j++) {
1397
state = b->stack.slist[j];
1401
* This inner loop is the iterative equivalent of recursively
1402
* reducing subexpressions generated as a result of a reduction.
1405
switch (b->expr[state].type) {
1407
ns1 = _ure_make_expr(_URE_ONE, _URE_NOOP, _URE_NOOP, b);
1408
_ure_add_symstate(b->expr[state].lhs, ns1, b);
1417
s1 = b->expr[state].lhs;
1418
ns1 = _ure_make_expr(_URE_ONE, _URE_NOOP, _URE_NOOP, b);
1419
state = _ure_make_expr(_URE_OR, ns1, s1, b);
1422
s1 = b->expr[state].lhs;
1423
ns1 = _ure_make_expr(_URE_STAR, s1, _URE_NOOP, b);
1424
state = _ure_make_expr(_URE_AND, s1, ns1, b);
1427
s1 = b->expr[state].lhs;
1428
ns1 = _ure_make_expr(_URE_ONE, _URE_NOOP, _URE_NOOP, b);
1429
ns2 = _ure_make_expr(_URE_PLUS, s1, _URE_NOOP, b);
1430
state = _ure_make_expr(_URE_OR, ns1, ns2, b);
1433
s1 = b->expr[state].lhs;
1434
s2 = b->expr[state].rhs;
1440
s1 = b->expr[state].lhs;
1441
s2 = b->expr[state].rhs;
1442
switch (b->expr[s1].type) {
1444
_ure_add_symstate(b->expr[s1].lhs, s2, b);
1452
ns1 = b->expr[s1].lhs;
1453
ns2 = _ure_make_expr(_URE_AND, ns1, s2, b);
1454
state = _ure_make_expr(_URE_OR, s2, ns2, b);
1457
ns1 = b->expr[s1].lhs;
1458
ns2 = _ure_make_expr(_URE_OR, s2, state, b);
1459
state = _ure_make_expr(_URE_AND, ns1, ns2, b);
1462
ns1 = b->expr[s1].lhs;
1463
ns2 = _ure_make_expr(_URE_AND, ns1, state, b);
1464
state = _ure_make_expr(_URE_OR, s2, ns2, b);
1467
ns1 = b->expr[s1].lhs;
1468
ns2 = b->expr[s1].rhs;
1469
ns1 = _ure_make_expr(_URE_AND, ns1, s2, b);
1470
ns2 = _ure_make_expr(_URE_AND, ns2, s2, b);
1471
state = _ure_make_expr(_URE_OR, ns1, ns2, b);
1474
ns1 = b->expr[s1].lhs;
1475
ns2 = b->expr[s1].rhs;
1476
ns2 = _ure_make_expr(_URE_AND, ns2, s2, b);
1477
state = _ure_make_expr(_URE_AND, ns1, ns2, b);
1485
* Clear the state stack.
1487
while (_ure_pop(b) != _URE_NOOP) ;
1490
* Reset the state pointer because the reduction may have moved it
1491
* during a reallocation.
1493
sp = b->states.states + i;
1496
* Generate the DFA states for the symbols collected during the
1497
* current reduction.
1499
if (sp->trans_used + syms > sp->trans_size) {
1500
if (sp->trans_size == 0)
1501
sp->trans = (_ure_elt_t *)
1502
malloc(sizeof(_ure_elt_t) * (sp->trans_used + syms));
1504
sp->trans = (_ure_elt_t *)
1505
realloc((char *) sp->trans,
1506
sizeof(_ure_elt_t) * (sp->trans_used + syms));
1507
sp->trans_size = sp->trans_used + syms;
1511
* Go through the symbol table and generate the DFA state transitions
1512
* for each symbol that has collected NFA states.
1514
for (j = syms = 0, smp = b->symtab; j < b->symtab_used; j++, smp++) {
1515
sp = b->states.states + i;
1517
if (smp->states.slist_used > 0) {
1518
sp->trans[syms].lhs = smp->id;
1519
rhs = _ure_add_state(smp->states.slist_used,
1520
smp->states.slist, b);
1522
* Reset the state pointer in case the reallocation moves it
1525
sp = b->states.states + i;
1526
sp->trans[syms].rhs = rhs;
1528
smp->states.slist_used = 0;
1534
* Set the number of transitions actually used.
1536
sp->trans_used = syms;
1542
_ure_add_equiv(ucs2_t l, ucs2_t r, _ure_buffer_t *b)
1546
l = b->states.states[l].id;
1547
r = b->states.states[r].id;
1559
* Check to see if the equivalence pair already exists.
1561
for (tmp = 0; tmp < b->equiv_used &&
1562
(b->equiv[tmp].l != l || b->equiv[tmp].r != r);
1565
if (tmp < b->equiv_used)
1568
if (b->equiv_used == b->equiv_size) {
1569
if (b->equiv_size == 0)
1570
b->equiv = (_ure_equiv_t *) malloc(sizeof(_ure_equiv_t) << 3);
1572
b->equiv = (_ure_equiv_t *) realloc((char *) b->equiv,
1573
sizeof(_ure_equiv_t) *
1574
(b->equiv_size + 8));
1577
b->equiv[b->equiv_used].l = l;
1578
b->equiv[b->equiv_used].r = r;
1583
* Merge the DFA states that are equivalent.
1586
_ure_merge_equiv(_ure_buffer_t *b)
1588
ucs2_t i, j, k, eq, done;
1589
_ure_state_t *sp1, *sp2, *ls, *rs;
1591
for (i = 0; i < b->states.states_used; i++) {
1592
sp1 = b->states.states + i;
1595
for (j = 0; j < i; j++) {
1596
sp2 = b->states.states + j;
1600
_ure_add_equiv(i, j, b);
1601
for (eq = 0, done = 0; eq < b->equiv_used; eq++) {
1602
ls = b->states.states + b->equiv[eq].l;
1603
rs = b->states.states + b->equiv[eq].r;
1604
if (ls->accepting != rs->accepting ||
1605
ls->trans_used != rs->trans_used) {
1609
for (k = 0; k < ls->trans_used &&
1610
ls->trans[k].lhs == rs->trans[k].lhs; k++) ;
1611
if (k < ls->trans_used) {
1616
for (k = 0; k < ls->trans_used; k++)
1617
_ure_add_equiv(ls->trans[k].rhs, rs->trans[k].rhs, b);
1622
for (eq = 0; j < i && eq < b->equiv_used; eq++)
1623
b->states.states[b->equiv[eq].r].id =
1624
b->states.states[b->equiv[eq].l].id;
1628
* Renumber the states appropriately.
1630
for (i = eq = 0, sp1 = b->states.states; i < b->states.states_used;
1632
sp1->id = (sp1->id == i) ? eq++ : b->states.states[sp1->id].id;
1635
/*************************************************************************
1639
*************************************************************************/
1642
ure_buffer_create(void)
1646
b = (ure_buffer_t) calloc(1, sizeof(_ure_buffer_t));
1652
ure_buffer_free(ure_buffer_t buf)
1659
if (buf->stack.slist_size > 0)
1660
free((char *) buf->stack.slist);
1662
if (buf->expr_size > 0)
1663
free((char *) buf->expr);
1665
for (i = 0; i < buf->symtab_size; i++) {
1666
if (buf->symtab[i].states.slist_size > 0)
1667
free((char *) buf->symtab[i].states.slist);
1670
if (buf->symtab_size > 0)
1671
free((char *) buf->symtab);
1673
for (i = 0; i < buf->states.states_size; i++) {
1674
if (buf->states.states[i].trans_size > 0)
1675
free((char *) buf->states.states[i].trans);
1676
if (buf->states.states[i].st.slist_size > 0)
1677
free((char *) buf->states.states[i].st.slist);
1680
if (buf->states.states_size > 0)
1681
free((char *) buf->states.states);
1683
if (buf->equiv_size > 0)
1684
free((char *) buf->equiv);
1690
ure_compile(ucs2_t *re, unsigned long relen, int casefold, ure_buffer_t buf)
1698
if (re == 0 || *re == 0 || relen == 0 || buf == 0)
1702
* Reset the various fields of the compilation buffer. Default the flags
1703
* to indicate the presense of the "^$" pattern. If any other pattern
1704
* occurs, then this flag will be removed. This is done to catch this
1705
* special pattern and handle it specially when matching.
1707
buf->flags = _URE_DFA_BLANKLINE | ((casefold) ? _URE_DFA_CASEFOLD : 0);
1709
buf->stack.slist_used = 0;
1712
for (i = 0; i < buf->symtab_used; i++)
1713
buf->symtab[i].states.slist_used = 0;
1714
buf->symtab_used = 0;
1716
for (i = 0; i < buf->states.states_used; i++) {
1717
buf->states.states[i].st.slist_used = 0;
1718
buf->states.states[i].trans_used = 0;
1720
buf->states.states_used = 0;
1723
* Construct the NFA. If this stage returns a 0, then an error occured or
1724
* an empty expression was passed.
1726
if ((state = _ure_re2nfa(re, relen, buf)) == _URE_NOOP)
1730
* Do the expression reduction to get the initial DFA.
1732
_ure_reduce(state, buf);
1735
* Merge all the equivalent DFA states.
1737
_ure_merge_equiv(buf);
1740
* Construct the minimal DFA.
1742
dfa = (ure_dfa_t) malloc(sizeof(_ure_dfa_t));
1743
(void) memset((char *) dfa, '\0', sizeof(_ure_dfa_t));
1745
dfa->flags = buf->flags & (_URE_DFA_CASEFOLD|_URE_DFA_BLANKLINE);
1748
* Free up the NFA state groups and transfer the symbols from the buffer
1751
for (i = 0; i < buf->symtab_size; i++) {
1752
if (buf->symtab[i].states.slist_size > 0)
1753
free((char *) buf->symtab[i].states.slist);
1755
dfa->syms = buf->symtab;
1756
dfa->nsyms = buf->symtab_used;
1758
buf->symtab_used = buf->symtab_size = 0;
1761
* Collect the total number of states and transitions needed for the DFA.
1763
for (i = state = 0, sp = buf->states.states; i < buf->states.states_used;
1765
if (sp->id == state) {
1767
dfa->ntrans += sp->trans_used;
1773
* Allocate enough space for the states and transitions.
1775
dfa->states = (_ure_dstate_t *) malloc(sizeof(_ure_dstate_t) *
1777
dfa->trans = (_ure_trans_t *) malloc(sizeof(_ure_trans_t) * dfa->ntrans);
1780
* Actually transfer the DFA states from the buffer.
1784
for (i = state = 0, sp = buf->states.states; i < buf->states.states_used;
1786
if (sp->id == state) {
1788
dsp->ntrans = sp->trans_used;
1789
dsp->accepting = sp->accepting;
1792
* Add the transitions for the state.
1794
for (j = 0; j < dsp->ntrans; j++, tp++) {
1795
tp->symbol = sp->trans[j].lhs;
1796
tp->next_state = buf->states.states[sp->trans[j].rhs].id;
1808
ure_dfa_free(ure_dfa_t dfa)
1815
for (i = 0; i < dfa->nsyms; i++) {
1816
if ((dfa->syms[i].type == _URE_CCLASS ||
1817
dfa->syms[i].type == _URE_NCCLASS) &&
1818
dfa->syms[i].sym.ccl.ranges_size > 0)
1819
free((char *) dfa->syms[i].sym.ccl.ranges);
1822
free((char *) dfa->syms);
1824
if (dfa->nstates > 0)
1825
free((char *) dfa->states);
1826
if (dfa->ntrans > 0)
1827
free((char *) dfa->trans);
1832
ure_write_dfa(ure_dfa_t dfa, FILE *out)
1834
ucs2_t i, j, k, h, l;
1839
if (dfa == 0 || out == 0)
1843
* Write all the different character classes.
1845
for (i = 0, sym = dfa->syms; i < dfa->nsyms; i++, sym++) {
1846
if (sym->type == _URE_CCLASS || sym->type == _URE_NCCLASS) {
1847
fprintf(out, "C%hd = ", sym->id);
1848
if (sym->sym.ccl.ranges_used > 0) {
1850
if (sym->type == _URE_NCCLASS)
1853
if (sym->props != 0) {
1854
if (sym->type == _URE_NCCLASS)
1855
fprintf(out, "\\P");
1857
fprintf(out, "\\p");
1858
for (k = h = 0; k < 32; k++) {
1859
if (sym->props & (1 << k)) {
1862
fprintf(out, "%hd", k + 1);
1870
for (k = 0, rp = sym->sym.ccl.ranges;
1871
k < sym->sym.ccl.ranges_used; k++, rp++) {
1873
* Check for UTF16 characters.
1875
if (0x10000 <= rp->min_code &&
1876
rp->min_code <= 0x10ffff) {
1877
h = (ucs2_t) (((rp->min_code - 0x10000) >> 10) + 0xd800);
1878
l = (ucs2_t) (((rp->min_code - 0x10000) & 1023) + 0xdc00);
1879
fprintf(out, "\\x%04hX\\x%04hX", h, l);
1881
fprintf(out, "\\x%04lX", rp->min_code & 0xffff);
1882
if (rp->max_code != rp->min_code) {
1884
if (rp->max_code >= 0x10000 &&
1885
rp->max_code <= 0x10ffff) {
1886
h = (ucs2_t) (((rp->max_code - 0x10000) >> 10) + 0xd800);
1887
l = (ucs2_t) (((rp->max_code - 0x10000) & 1023) + 0xdc00);
1888
fprintf(out, "\\x%04hX\\x%04hX", h, l);
1890
fprintf(out, "\\x%04lX", rp->max_code & 0xffff);
1893
if (sym->sym.ccl.ranges_used > 0)
1899
for (i = 0, sp = dfa->states; i < dfa->nstates; i++, sp++) {
1900
fprintf(out, "S%hd = ", i);
1901
if (sp->accepting) {
1906
for (j = 0; j < sp->ntrans; j++) {
1910
sym = dfa->syms + sp->trans[j].symbol;
1911
switch (sym->type) {
1913
if (0x10000 <= sym->sym.chr && sym->sym.chr <= 0x10ffff) {
1915
* Take care of UTF16 characters.
1917
h = (ucs2_t) (((sym->sym.chr - 0x10000) >> 10) + 0xd800);
1918
l = (ucs2_t) (((sym->sym.chr - 0x10000) & 1023) + 0xdc00);
1919
fprintf(out, "\\x%04hX\\x%04hX ", h, l);
1921
fprintf(out, "\\x%04lX ", sym->sym.chr & 0xffff);
1924
fprintf(out, "<any> ");
1926
case _URE_BOL_ANCHOR:
1927
fprintf(out, "<bol-anchor> ");
1929
case _URE_EOL_ANCHOR:
1930
fprintf(out, "<eol-anchor> ");
1934
fprintf(out, "[C%hd] ", sym->id);
1937
fprintf(out, "S%hd", sp->trans[j].next_state);
1938
if (j + 1 < sp->ntrans)
1945
#define _ure_issep(cc) ((cc) == '\n' || (cc) == '\r' || (cc) == 0x2028 ||\
1949
ure_exec(ure_dfa_t dfa, int flags, ucs2_t *text, unsigned long textlen,
1950
unsigned long *match_start, unsigned long *match_end)
1952
int i, j, matched, found, skip;
1953
unsigned long ms, me;
1955
ucs2_t *sp, *ep, *lp;
1960
if (dfa == 0 || text == 0)
1964
* Handle the special case of an empty string matching the "^$" pattern.
1966
if (textlen == 0 && (dfa->flags & _URE_DFA_BLANKLINE)) {
1967
*match_start = *match_end = 0;
1978
for (found = skip = 0; found == 0 && sp < ep; ) {
1983
* Check to see if this is a high surrogate that should be
1984
* combined with a following low surrogate.
1986
if (sp < ep && 0xd800 <= c && c <= 0xdbff &&
1987
0xdc00 <= *sp && *sp <= 0xdfff)
1988
c = 0x10000 + (((c & 0x03ff) << 10) | (*sp++ & 0x03ff));
1991
* Determine if the character is non-spacing and should be skipped.
1993
if (_ure_matches_properties(_URE_NONSPACING, c) &&
1994
(flags & URE_IGNORE_NONSPACING)) {
1999
if (dfa->flags & _URE_DFA_CASEFOLD)
2000
c = _ure_tolower(c);
2003
* See if one of the transitions matches.
2005
for (i = 0, matched = 0; matched == 0 && i < stp->ntrans; i++) {
2006
sym = dfa->syms + stp->trans[i].symbol;
2007
switch (sym->type) {
2009
if ((flags & URE_DOT_MATCHES_SEPARATORS) ||
2014
if (c == sym->sym.chr)
2017
case _URE_BOL_ANCHOR:
2021
} else if (_ure_issep(c)) {
2022
if (c == '\r' && sp < ep && *sp == '\n')
2028
case _URE_EOL_ANCHOR:
2029
if (_ure_issep(c)) {
2031
* Put the pointer back before the separator so the match
2032
* end position will be correct. This case will also
2033
* cause the `sp' pointer to be advanced over the current
2034
* separator once the match end point has been recorded.
2042
if (sym->props != 0)
2043
matched = _ure_matches_properties(sym->props, c);
2044
for (j = 0, rp = sym->sym.ccl.ranges;
2045
j < sym->sym.ccl.ranges_used; j++, rp++) {
2046
if (rp->min_code <= c && c <= rp->max_code)
2049
if (sym->type == _URE_NCCLASS)
2059
stp = dfa->states + stp->trans[i].next_state;
2062
* If the match was an EOL anchor, adjust the pointer past the
2063
* separator that caused the match. The correct match
2064
* position has been recorded already.
2066
if (sym->type == _URE_EOL_ANCHOR) {
2068
* Skip the character that caused the match.
2073
* Handle the infamous CRLF situation.
2075
if (sp < ep && c == '\r' && *sp == '\n')
2082
if (stp->accepting == 0) {
2084
* If the last state was not accepting, then reset
2091
* The last state was accepting, so terminate the matching
2092
* loop to avoid more work.
2095
} else if (sp == ep) {
2096
if (!stp->accepting) {
2098
* This ugly hack is to make sure the end-of-line anchors
2099
* match when the source text hits the end. This is only done
2100
* if the last subexpression matches.
2102
for (i = 0; found == 0 && i < stp->ntrans; i++) {
2103
sym = dfa->syms + stp->trans[i].symbol;
2104
if (sym->type ==_URE_EOL_ANCHOR) {
2105
stp = dfa->states + stp->trans[i].next_state;
2106
if (stp->accepting) {
2115
* Make sure any conditions that match all the way to the end
2116
* of the string match.
2130
return (ms != ~0UL) ? 1 : 0;