111
/* Number of bits in an unsigned char. */
111
/* Number of bits in an unsigned char. */
113
113
# define CHARBITS 8
116
/* First integer value that is greater than any character code. */
116
/* First integer value that is greater than any character code. */
117
117
#define NOTCHAR (1 << CHARBITS)
119
/* INTBITS need not be exact, just a lower bound. */
119
/* INTBITS need not be exact, just a lower bound. */
121
121
# define INTBITS (CHARBITS * sizeof (int))
124
/* Number of ints required to hold a bit for every character. */
124
/* Number of ints required to hold a bit for every character. */
125
125
#define CHARCLASS_INTS ((NOTCHAR + INTBITS - 1) / INTBITS)
127
/* Sets of unsigned characters are stored as bit vectors in arrays of ints. */
127
/* Sets of unsigned characters are stored as bit vectors in arrays of ints. */
128
128
typedef int charclass[CHARCLASS_INTS];
130
130
/* Convert a possibly-signed character to an unsigned character. This is
167
167
The macro SUCCEEDS_IN_CONTEXT determines whether a given constraint
168
168
succeeds in a particular context. Prev is a bitmask of possible
169
169
context values for the previous character, curr is the (single-bit)
170
context value for the lookahead character. */
170
context value for the lookahead character. */
171
171
#define NEWLINE_CONSTRAINT(constraint) (((constraint) >> 8) & 0xf)
172
172
#define LETTER_CONSTRAINT(constraint) (((constraint) >> 4) & 0xf)
173
173
#define OTHER_CONSTRAINT(constraint) ((constraint) & 0xf)
175
175
#define SUCCEEDS_IN_CONTEXT(constraint, prev, curr) \
176
((((curr) & CTX_NONE ? OTHER_CONSTRAINT(constraint) : 0) \
177
| ((curr) & CTX_LETTER ? LETTER_CONSTRAINT(constraint) : 0) \
178
| ((curr) & CTX_NEWLINE ? NEWLINE_CONSTRAINT(constraint) : 0)) & (prev))
176
((((curr) & CTX_NONE ? OTHER_CONSTRAINT (constraint) : 0) \
177
| ((curr) & CTX_LETTER ? LETTER_CONSTRAINT (constraint) : 0) \
178
| ((curr) & CTX_NEWLINE ? NEWLINE_CONSTRAINT (constraint) : 0)) & (prev))
180
/* The following macros give information about what a constraint depends on. */
180
/* The following macros describe what a constraint depends on. */
181
181
#define PREV_NEWLINE_CONSTRAINT(constraint) (((constraint) >> 2) & 0x111)
182
182
#define PREV_LETTER_CONSTRAINT(constraint) (((constraint) >> 1) & 0x111)
183
183
#define PREV_OTHER_CONSTRAINT(constraint) ((constraint) & 0x111)
212
212
end of input; any value of END or less in
213
213
the parse tree is such a symbol. Accepting
214
214
states of the DFA are those that would have
215
a transition on END. */
215
a transition on END. */
217
/* Ordinary character values are terminal symbols that match themselves. */
217
/* Ordinary character values are terminal symbols that match themselves. */
219
219
EMPTY = NOTCHAR, /* EMPTY is a terminal symbol that matches
222
222
BACKREF, /* BACKREF is generated by \<digit>; it
223
223
is not completely handled. If the scanner
224
224
detects a transition on backref, it returns
225
225
a kind of "semi-success" indicating that
226
226
the match will have to be verified with
227
a backtracking matcher. */
227
a backtracking matcher. */
229
229
BEGLINE, /* BEGLINE is a terminal symbol that matches
230
230
the empty string if it is at the beginning
233
233
ENDLINE, /* ENDLINE is a terminal symbol that matches
234
234
the empty string if it is at the end of
237
237
BEGWORD, /* BEGWORD is a terminal symbol that matches
238
238
the empty string if it is at the beginning
241
241
ENDWORD, /* ENDWORD is a terminal symbol that matches
242
242
the empty string if it is at the end of
245
245
LIMWORD, /* LIMWORD is a terminal symbol that matches
246
246
the empty string if it is at the beginning
247
or the end of a word. */
247
or the end of a word. */
249
249
NOTLIMWORD, /* NOTLIMWORD is a terminal symbol that
250
250
matches the empty string if it is not at
251
the beginning or end of a word. */
251
the beginning or end of a word. */
253
253
QMARK, /* QMARK is an operator of one argument that
254
254
matches zero or one occurrences of its
257
257
STAR, /* STAR is an operator of one argument that
258
258
matches the Kleene closure (zero or more
259
occurrences) of its argument. */
259
occurrences) of its argument. */
261
261
PLUS, /* PLUS is an operator of one argument that
262
262
matches the positive closure (one or more
263
occurrences) of its argument. */
263
occurrences) of its argument. */
265
265
REPMN, /* REPMN is a lexical token corresponding
266
266
to the {m,n} construct. REPMN never
267
appears in the compiled token vector. */
267
appears in the compiled token vector. */
269
269
CAT, /* CAT is an operator of two arguments that
270
270
matches the concatenation of its
271
271
arguments. CAT is never returned by the
274
274
OR, /* OR is an operator of two arguments that
275
matches either of its arguments. */
275
matches either of its arguments. */
277
277
LPAREN, /* LPAREN never appears in the parse tree,
278
it is only a lexeme. */
278
it is only a lexeme. */
280
RPAREN, /* RPAREN never appears in the parse tree. */
280
RPAREN, /* RPAREN never appears in the parse tree. */
282
282
ANYCHAR, /* ANYCHAR is a terminal symbol that matches
283
283
any multibyte (or single byte) characters.
292
292
CSET /* CSET and (and any value greater) is a
293
293
terminal symbol that matches any of a
294
class of characters. */
294
class of characters. */
298
298
/* States of the recognizer correspond to sets of positions in the parse
299
299
tree, together with the constraints under which they may be matched.
300
300
So a position is encoded as an index into the parse tree together with
304
size_t index; /* Index into the parse array. */
305
unsigned int constraint; /* Constraint for matching this position. */
304
size_t index; /* Index into the parse array. */
305
unsigned int constraint; /* Constraint for matching this position. */
308
/* Sets of positions are stored as arrays. */
308
/* Sets of positions are stored as arrays. */
311
position *elems; /* Elements of this position set. */
312
size_t nelem; /* Number of elements in this set. */
311
position *elems; /* Elements of this position set. */
312
size_t nelem; /* Number of elements in this set. */
313
313
size_t alloc; /* Number of elements allocated in ELEMS. */
316
/* Sets of leaves are also stored as arrays. */
316
/* Sets of leaves are also stored as arrays. */
319
size_t *elems; /* Elements of this position set. */
320
size_t nelem; /* Number of elements in this set. */
319
size_t *elems; /* Elements of this position set. */
320
size_t nelem; /* Number of elements in this set. */
323
323
/* A state of the dfa consists of a set of positions, some flags,
324
324
and the token value of the lowest-numbered position of the state that
325
contains an END token. */
325
contains an END token. */
328
size_t hash; /* Hash of the positions of this state. */
329
position_set elems; /* Positions this state could match. */
330
unsigned char context; /* Context from previous state. */
328
size_t hash; /* Hash of the positions of this state. */
329
position_set elems; /* Positions this state could match. */
330
unsigned char context; /* Context from previous state. */
331
331
char backref; /* True if this state matches a \<digit>. */
332
unsigned short constraint; /* Constraint for this state to accept. */
333
token first_end; /* Token value of the first END in elems. */
332
unsigned short constraint; /* Constraint for this state to accept. */
333
token first_end; /* Token value of the first END in elems. */
334
334
position_set mbps; /* Positions which can match multibyte
335
characters. e.g. period.
336
These staff are used only if
335
characters, e.g., period.
336
Used only if MB_CUR_MAX > 1. */
340
339
/* States are indexed by state_num values. These are normally
360
359
size_t ncoll_elems; /* Collating elements. */
363
/* A compiled regular expression. */
362
/* A compiled regular expression. */
366
/* Fields filled by the scanner. */
367
charclass *charclasses; /* Array of character sets for CSET tokens. */
368
size_t cindex; /* Index for adding new charclasses. */
369
size_t calloc; /* Number of charclasses currently allocated. */
365
/* Fields filled by the scanner. */
366
charclass *charclasses; /* Array of character sets for CSET tokens. */
367
size_t cindex; /* Index for adding new charclasses. */
368
size_t calloc; /* Number of charclasses allocated. */
371
/* Fields filled by the parser. */
372
token *tokens; /* Postfix parse array. */
373
size_t tindex; /* Index for adding new tokens. */
374
size_t talloc; /* Number of tokens currently allocated. */
370
/* Fields filled by the parser. */
371
token *tokens; /* Postfix parse array. */
372
size_t tindex; /* Index for adding new tokens. */
373
size_t talloc; /* Number of tokens currently allocated. */
375
374
size_t depth; /* Depth required of an evaluation stack
376
375
used for depth-first traversal of the
378
size_t nleaves; /* Number of leaves on the parse tree. */
377
size_t nleaves; /* Number of leaves on the parse tree. */
379
378
size_t nregexps; /* Count of parallel regexps being built
381
380
unsigned int mb_cur_max; /* Cached value of MB_CUR_MAX. */
382
token utf8_anychar_classes[5]; /* To lower ANYCHAR in UTF-8 locales. */
381
token utf8_anychar_classes[5]; /* To lower ANYCHAR in UTF-8 locales. */
384
383
/* The following are used only if MB_CUR_MAX > 1. */
409
408
size_t mbcsets_alloc;
411
/* Fields filled by the state builder. */
412
dfa_state *states; /* States of the dfa. */
413
state_num sindex; /* Index for adding new states. */
414
state_num salloc; /* Number of states currently allocated. */
410
/* Fields filled by the state builder. */
411
dfa_state *states; /* States of the dfa. */
412
state_num sindex; /* Index for adding new states. */
413
state_num salloc; /* Number of states currently allocated. */
416
/* Fields filled by the parse tree->NFA conversion. */
415
/* Fields filled by the parse tree->NFA conversion. */
417
416
position_set *follows; /* Array of follow sets, indexed by position
418
417
index. The follow of a position is the set
419
418
of positions containing characters that
420
419
could conceivably follow a character
421
420
matching the given position in a string
422
421
matching the regexp. Allocated to the
423
maximum possible position index. */
422
maximum possible position index. */
424
423
int searchflag; /* True if we are supposed to build a searching
425
424
as opposed to an exact matcher. A searching
426
425
matcher finds the first and shortest string
427
426
matching a regexp anywhere in the buffer,
428
427
whereas an exact matcher finds the longest
429
428
string matching, but anchored to the
430
beginning of the buffer. */
429
beginning of the buffer. */
432
/* Fields filled by dfaexec. */
431
/* Fields filled by dfaexec. */
433
432
state_num tralloc; /* Number of transition tables that have
435
434
int trcount; /* Number of transition tables that have
436
actually been built. */
435
actually been built. */
437
436
state_num **trans; /* Transition tables for states that can
438
437
never accept. If the transitions for a
439
438
state have not yet been computed, or the
440
439
state could possibly accept, its entry in
441
this table is NULL. */
440
this table is NULL. */
442
441
state_num **realtrans; /* Trans always points to realtrans + 1; this
443
is so trans[-1] can contain NULL. */
442
is so trans[-1] can contain NULL. */
444
443
state_num **fails; /* Transition tables after failing to accept
445
on a state that potentially could do so. */
444
on a state that potentially could do so. */
446
445
int *success; /* Table of acceptance conditions used in
447
dfaexec and computed in build_state. */
446
dfaexec and computed in build_state. */
448
447
state_num *newlines; /* Transitions on newlines. The entry for a
449
448
newline in any transition table is always
450
449
-1 so we can count lines without wasting
451
450
too many cycles. The transition for a
452
451
newline is stored separately and handled
453
452
as a special case. Newline is also used
454
as a sentinel at the end of the buffer. */
453
as a sentinel at the end of the buffer. */
455
454
struct dfamust *musts; /* List of strings, at least one of which
456
455
is known to appear in any r.e. matching
460
/* Some macros for user access to dfa internals. */
459
/* Some macros for user access to dfa internals. */
462
/* ACCEPTING returns true if s could possibly be an accepting state of r. */
461
/* ACCEPTING returns true if s could possibly be an accepting state of r. */
463
462
#define ACCEPTING(s, r) ((r).states[s].constraint)
465
464
/* ACCEPTS_IN_CONTEXT returns true if the given state accepts in the
466
specified context. */
465
specified context. */
467
466
#define ACCEPTS_IN_CONTEXT(prev, curr, state, dfa) \
468
467
SUCCEEDS_IN_CONTEXT ((dfa).states[state].constraint, prev, curr)
814
814
/* Lexical analyzer. All the dross that deals with the obnoxious
815
815
GNU Regex syntax bits is located here. The poor, suffering
816
816
reader is referred to the GNU Regex documentation for the
817
meaning of the @#%!@#%^!@ syntax bits. */
817
meaning of the @#%!@#%^!@ syntax bits. */
819
static char const *lexptr; /* Pointer to next input character. */
820
static size_t lexleft; /* Number of characters remaining. */
821
static token lasttok; /* Previous token returned; initially END. */
822
static int laststart; /* True if we're separated from beginning or (, |
823
only by zero-width characters. */
824
static size_t parens; /* Count of outstanding left parens. */
825
static int minrep, maxrep; /* Repeat counts for {m,n}. */
819
static char const *lexptr; /* Pointer to next input character. */
820
static size_t lexleft; /* Number of characters remaining. */
821
static token lasttok; /* Previous token returned; initially END. */
822
static int laststart; /* True if we're separated from beginning or (,
823
| only by zero-width characters. */
824
static size_t parens; /* Count of outstanding left parens. */
825
static int minrep, maxrep; /* Repeat counts for {m,n}. */
827
827
static int cur_mb_len = 1; /* Length of the multibyte representation of
829
829
/* These variables are used only if (MB_CUR_MAX > 1). */
830
static mbstate_t mbs; /* Mbstate for mbrlen(). */
830
static mbstate_t mbs; /* Mbstate for mbrlen. */
831
831
static wchar_t wctok; /* Wide character representation of the current
832
832
multibyte character. */
833
static unsigned char *mblen_buf; /* Correspond to the input buffer in dfaexec().
834
Each element store the amount of remain
835
byte of corresponding multibyte character
836
in the input string. A element's value
837
is 0 if corresponding character is a
838
single byte character.
839
e.g. input : 'a', <mb(0)>, <mb(1)>, <mb(2)>
840
mblen_buf : 0, 3, 2, 1
842
static wchar_t *inputwcs; /* Wide character representation of input
844
The length of this array is same as
845
the length of input string(char array).
833
static unsigned char *mblen_buf;/* Correspond to the input buffer in dfaexec.
834
Each element stores the number of remaining
835
bytes of the corresponding multibyte
836
character in the input string. A element's
837
value is 0 if the corresponding character is
839
e.g., input : 'a', <mb(0)>, <mb(1)>, <mb(2)>
840
mblen_buf : 0, 3, 2, 1
842
static wchar_t *inputwcs; /* Wide character representation of the input
844
The length of this array is the same as
845
the length of input string (char array).
846
846
inputstring[i] is a single-byte char,
847
or 1st byte of a multibyte char.
848
And inputwcs[i] is the codepoint. */
849
static unsigned char const *buf_begin; /* reference to begin in dfaexec(). */
850
static unsigned char const *buf_end; /* reference to end in dfaexec(). */
847
or the first byte of a multibyte char;
848
inputwcs[i] is the codepoint. */
849
static unsigned char const *buf_begin; /* reference to begin in dfaexec. */
850
static unsigned char const *buf_end; /* reference to end in dfaexec. */
854
/* Note that characters become unsigned here. */
854
/* Note that characters become unsigned here. */
855
855
# define FETCH_WC(c, wc, eoferr) \
1539
1540
/* The above loop should consume at most a backslash
1540
and some other character. */
1541
and some other character. */
1542
return END; /* keeps pedantic compilers happy. */
1543
return END; /* keeps pedantic compilers happy. */
1545
/* Recursive descent parser for regular expressions. */
1546
/* Recursive descent parser for regular expressions. */
1547
static token tok; /* Lookahead token. */
1548
static token tok; /* Lookahead token. */
1548
1549
static size_t depth; /* Current depth of a hypothetical stack
1549
1550
holding deferred productions. This is
1550
1551
used to determine the depth that will be
1551
1552
required of the real stack later on in
1555
1556
addtok_mb (token t, int mbprop)
2114
2115
contains a symbol that matches the empty string in some context, replace
2115
2116
that position with the elements of its follow labeled with an appropriate
2116
2117
constraint. Repeat exhaustively until no funny positions are left.
2117
S->elems must be large enough to hold the result. */
2118
S->elems must be large enough to hold the result. */
2119
2120
epsclosure (position_set * s, struct dfa const *d)
2122
char *visited; /* array of booleans, enough to use char, not int */
2123
char *visited; /* Array of booleans, enough to use char, not int. */
2123
2124
position p, old;
2125
2126
CALLOC (visited, d->tindex);
2275
2276
analysis is conveniently done by a linear scan with the aid of a stack.
2276
2277
Sets are stored as arrays of the elements, obeying a stack-like allocation
2277
2278
scheme; the number of elements in each set deeper in the stack can be
2278
used to determine the address of a particular set's array. */
2279
used to determine the address of a particular set's array. */
2280
2281
dfaanalyze (struct dfa *d, int searchflag)
2282
int *nullable; /* Nullable stack. */
2283
size_t *nfirstpos; /* Element count stack for firstpos sets. */
2284
position *firstpos; /* Array where firstpos elements are stored. */
2285
size_t *nlastpos; /* Element count stack for lastpos sets. */
2286
position *lastpos; /* Array where lastpos elements are stored. */
2287
position_set tmp; /* Temporary set for merging sets. */
2288
position_set merged; /* Result of merging sets. */
2289
int separate_contexts; /* Context wanted by some position. */
2283
int *nullable; /* Nullable stack. */
2284
size_t *nfirstpos; /* Element count stack for firstpos sets. */
2285
position *firstpos; /* Array where firstpos elements are stored. */
2286
size_t *nlastpos; /* Element count stack for lastpos sets. */
2287
position *lastpos; /* Array where lastpos elements are stored. */
2288
position_set tmp; /* Temporary set for merging sets. */
2289
position_set merged; /* Result of merging sets. */
2290
int separate_contexts; /* Context wanted by some position. */
2290
2291
int *o_nullable;
2291
2292
size_t *o_nfirst, *o_nlast;
2292
2293
position *o_firstpos, *o_lastpos;
2387
/* A CAT node is nullable if both arguments are nullable. */
2388
/* A CAT node is nullable if both arguments are nullable. */
2388
2389
nullable[-2] = nullable[-1] && nullable[-2];
2393
/* The firstpos is the union of the firstpos of each argument. */
2394
/* The firstpos is the union of the firstpos of each argument. */
2394
2395
nfirstpos[-2] += nfirstpos[-1];
2397
/* The lastpos is the union of the lastpos of each argument. */
2398
/* The lastpos is the union of the lastpos of each argument. */
2398
2399
nlastpos[-2] += nlastpos[-1];
2401
/* An OR node is nullable if either argument is nullable. */
2402
/* An OR node is nullable if either argument is nullable. */
2402
2403
nullable[-2] = nullable[-1] || nullable[-2];
2408
2409
constructs like \< are treated as nonempty strings here;
2409
2410
an "epsilon closure" effectively makes them nullable later.
2410
2411
Backreferences have to get a real position so we can detect
2411
transitions on them later. But they are nullable. */
2412
transitions on them later. But they are nullable. */
2412
2413
*nullable++ = d->tokens[i] == BACKREF;
2414
/* This position is in its own firstpos and lastpos. */
2415
/* This position is in its own firstpos and lastpos. */
2415
2416
*nfirstpos++ = *nlastpos++ = 1;
2416
2417
--firstpos, --lastpos;
2417
2418
firstpos->index = lastpos->index = i;
2418
2419
firstpos->constraint = lastpos->constraint = NO_CONSTRAINT;
2420
/* Allocate the follow set for this position. */
2421
/* Allocate the follow set for this position. */
2421
2422
alloc_position_set (&d->follows[i], 1);
2425
/* ... balance the above nonsyntactic #ifdef goo... */
2426
/* ... balance the above nonsyntactic #ifdef goo... */
2426
2427
fprintf (stderr, "node %zd:", i);
2427
2428
prtok (d->tokens[i]);
2428
2429
putc ('\n', stderr);
2524
2525
If after comparing with every group there are characters remaining in C,
2525
2526
create a new group labeled with the characters of C and insert this
2526
position in that group. */
2527
position in that group. */
2528
2529
dfastate (state_num s, struct dfa *d, state_num trans[])
2530
leaf_set *grps; /* As many as will ever be needed. */
2531
charclass *labels; /* Labels corresponding to the groups. */
2532
size_t ngrps = 0; /* Number of groups actually used. */
2533
position pos; /* Current position being considered. */
2534
charclass matches; /* Set of matching characters. */
2535
int matchesf; /* True if matches is nonempty. */
2536
charclass intersect; /* Intersection with some label set. */
2537
int intersectf; /* True if intersect is nonempty. */
2538
charclass leftovers; /* Stuff in the label that didn't match. */
2539
int leftoversf; /* True if leftovers is nonempty. */
2540
position_set follows; /* Union of the follows of some group. */
2541
position_set tmp; /* Temporary space for merging sets. */
2542
int possible_contexts; /* Contexts that this group can match. */
2543
int separate_contexts; /* Context that new state wants to know. */
2544
state_num state; /* New state. */
2545
state_num state_newline; /* New state on a newline transition. */
2546
state_num state_letter; /* New state on a letter transition. */
2531
leaf_set *grps; /* As many as will ever be needed. */
2532
charclass *labels; /* Labels corresponding to the groups. */
2533
size_t ngrps = 0; /* Number of groups actually used. */
2534
position pos; /* Current position being considered. */
2535
charclass matches; /* Set of matching characters. */
2536
int matchesf; /* True if matches is nonempty. */
2537
charclass intersect; /* Intersection with some label set. */
2538
int intersectf; /* True if intersect is nonempty. */
2539
charclass leftovers; /* Stuff in the label that didn't match. */
2540
int leftoversf; /* True if leftovers is nonempty. */
2541
position_set follows; /* Union of the follows of some group. */
2542
position_set tmp; /* Temporary space for merging sets. */
2543
int possible_contexts; /* Contexts that this group can match. */
2544
int separate_contexts; /* Context that new state wants to know. */
2545
state_num state; /* New state. */
2546
state_num state_newline; /* New state on a newline transition. */
2547
state_num state_letter; /* New state on a letter transition. */
2547
2548
int next_isnt_1st_byte = 0; /* Flag if we can't add state0. */
2548
2549
size_t i, j, k;
2604
2605
/* If matches contains a single character only, and the current
2605
2606
group's label doesn't contain that character, go on to the
2607
2608
if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR
2608
2609
&& !tstbit (d->tokens[pos.index], labels[j]))
2611
2612
/* Check if this group's label has a nonempty intersection with
2613
2614
intersectf = 0;
2614
2615
for (k = 0; k < CHARCLASS_INTS; ++k)
2615
2616
(intersect[k] = matches[k] & labels[j][k]) ? (intersectf = 1) : 0;
2616
2617
if (!intersectf)
2619
/* It does; now find the set differences both ways. */
2620
/* It does; now find the set differences both ways. */
2620
2621
leftoversf = matchesf = 0;
2621
2622
for (k = 0; k < CHARCLASS_INTS; ++k)
2623
/* Even an optimizing compiler can't know this for sure. */
2624
/* Even an optimizing compiler can't know this for sure. */
2624
2625
int match = matches[k], label = labels[j][k];
2626
2627
(leftovers[k] = ~match & label) ? (leftoversf = 1) : 0;
2627
2628
(matches[k] = match & ~label) ? (matchesf = 1) : 0;
2630
/* If there were leftovers, create a new group labeled with them. */
2631
/* If there were leftovers, create a new group labeled with them. */
2631
2632
if (leftoversf)
2633
2634
copyset (leftovers, labels[ngrps]);
2644
2645
grps[j].elems[grps[j].nelem++] = pos.index;
2646
2647
/* If every character matching the current position has been
2647
accounted for, we're done. */
2648
accounted for, we're done. */
2652
2653
/* If we've passed the last group, and there are still characters
2653
unaccounted for, then we'll have to create a new group. */
2654
unaccounted for, then we'll have to create a new group. */
2654
2655
if (j == ngrps)
2656
2657
copyset (matches, labels[ngrps]);
2735
2736
/* If we are building a searching matcher, throw in the positions
2736
of state 0 as well. */
2737
of state 0 as well. */
2737
2738
if (d->searchflag
2738
2739
&& (!MBS_SUPPORT || (d->mb_cur_max == 1 || !next_isnt_1st_byte)))
2739
2740
for (j = 0; j < d->states[0].elems.nelem; ++j)
2740
2741
insert (d->states[0].elems.elems[j], &follows);
2742
/* Find out if the new state will want any context information. */
2743
/* Find out if the new state will want any context information. */
2743
2744
possible_contexts = charclass_context (labels[i]);
2744
2745
separate_contexts = state_separate_contexts (&follows);
2746
/* Find the state(s) corresponding to the union of the follows. */
2747
/* Find the state(s) corresponding to the union of the follows. */
2747
2748
if ((separate_contexts & possible_contexts) != possible_contexts)
2748
2749
state = state_index (d, &follows, separate_contexts ^ CTX_ANY);
2786
2787
is a non-accepting state, then d->trans[state] points to its table.
2787
2788
If it is an accepting state then d->fails[state] points to its table.
2788
2789
If it has no table at all, then d->trans[state] is NULL.
2789
TODO: Improve this comment, get rid of the unnecessary redundancy. */
2790
TODO: Improve this comment, get rid of the unnecessary redundancy. */
2792
2793
build_state (state_num s, struct dfa *d)
2794
state_num *trans; /* The new transition table. */
2795
state_num *trans; /* The new transition table. */
2797
2798
/* Set an upper limit on the number of transition tables that will ever
2798
2799
exist at once. 1024 is arbitrary. The idea is that the frequently
2799
2800
used transition tables will be quickly rebuilt, whereas the ones that
2800
were only needed once or twice will be cleared away. */
2801
were only needed once or twice will be cleared away. */
2801
2802
if (d->trcount >= 1024)
2803
2804
for (i = 0; i < d->tralloc; ++i)
2872
2873
/* Multibyte character handling sub-routines for dfaexec. */
2874
/* Initial state may encounter the byte which is not a single byte character
2875
nor 1st byte of a multibyte character. But it is incorrect for initial
2876
state to accept such a byte.
2877
For example, in sjis encoding the regular expression like "\\" accepts
2878
the codepoint 0x5c, but should not accept the 2nd byte of the codepoint
2879
0x815c. Then Initial state must skip the bytes which are not a single byte
2880
character nor 1st byte of a multibyte character. */
2875
/* The initial state may encounter a byte which is not a single byte character
2876
nor the first byte of a multibyte character. But it is incorrect for the
2877
initial state to accept such a byte. For example, in Shift JIS the regular
2878
expression "\\" accepts the codepoint 0x5c, but should not accept the second
2879
byte of the codepoint 0x815c. Then the initial state must skip the bytes
2880
that are not a single byte character nor the first byte of a multibyte
2881
2882
#define SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p) \
3012
3013
match_mb_charset (struct dfa *d, state_num s, position pos, size_t idx)
3015
int match; /* Flag which represent that matching succeed. */
3016
int match_len; /* Length of the character (or collating element)
3017
with which this operator match. */
3018
int op_len; /* Length of the operator. */
3016
int match; /* Matching succeeded. */
3017
int match_len; /* Length of the character (or collating element)
3018
with which this operator matches. */
3019
int op_len; /* Length of the operator. */
3019
3020
char buffer[128];
3021
3022
/* Pointer to the structure to which we are currently referring. */
3109
3110
return match ? match_len : 0;
3112
/* Check each of 'd->states[s].mbps.elem' can match or not. Then return the
3113
array which corresponds to 'd->states[s].mbps.elem' and each element of
3114
the array contains the amount of the bytes with which the element can
3116
'idx' is the index from the buf_begin, and it is the current position
3113
/* Check whether each of 'd->states[s].mbps.elem' can match. Then return the
3114
array which corresponds to 'd->states[s].mbps.elem'; each element of the
3115
array contains the number of bytes with which the element can match.
3117
'idx' is the index from buf_begin, and it is the current position
3118
Caller MUST free the array which this function return. */
3120
The caller MUST free the array which this function return. */
3120
3122
check_matching_with_multibyte_ops (struct dfa *d, state_num s, size_t idx)
3144
3146
/* Consume a single character and enumerate all of the positions which can
3145
be next position from the state 's'.
3146
'match_lens' is the input. It can be NULL, but it can also be the output
3147
of check_matching_with_multibyte_ops() for optimization.
3147
be the next position from the state 's'.
3149
'match_lens' is the input. It can be NULL, but it can also be the output
3150
of check_matching_with_multibyte_ops for optimization.
3148
3152
'mbclen' and 'pps' are the output. 'mbclen' is the length of the
3149
character consumed, and 'pps' is the set this function enumerate. */
3153
character consumed, and 'pps' is the set this function enumerates. */
3150
3154
static status_transit_state
3151
3155
transit_state_consume_1char (struct dfa *d, state_num s,
3152
3156
unsigned char const **pp,
3339
3343
If COUNT is non-NULL, increment *COUNT once for each newline processed.
3340
3344
Finally, if BACKREF is non-NULL set *BACKREF to indicate whether we
3341
3345
encountered a back-reference (1) or not (0). The caller may use this
3342
to decide whether to fall back on a backtracking matcher. */
3346
to decide whether to fall back on a backtracking matcher. */
3344
3348
dfaexec (struct dfa *d, char const *begin, char *end,
3345
3349
int allow_nl, size_t *count, int *backref)
3347
state_num s, s1; /* Current state. */
3348
unsigned char const *p; /* Current input character. */
3351
state_num s, s1; /* Current state. */
3352
unsigned char const *p; /* Current input character. */
3349
3353
state_num **trans, *t; /* Copy of d->trans so it can be optimized
3351
3355
unsigned char eol = eolbyte; /* Likewise for eolbyte. */
3352
3356
unsigned char saved_end;
3673
3677
and q->left and q->right p->is : NULL
3675
3679
If there's anything else we recognize in the tree, all four sequences get set
3676
to zero-length sequences. If there's something we don't recognize in the tree,
3677
we just return a zero-length sequence.
3680
to zero-length sequences. If there's something we don't recognize in the
3681
tree, we just return a zero-length sequence.
3679
3683
Break ties in favor of infrequent letters (choosing 'zzz' in preference to
3682
And. . .is it here or someplace that we might ponder "optimizations" such as
3686
And ... is it here or someplace that we might ponder "optimizations" such as
3683
3687
egrep 'psi|epsilon' -> egrep 'psi'
3684
3688
egrep 'pepsi|epsilon' -> egrep 'epsi'
3685
3689
(Yes, we now find "epsi" as a "string