257
257
end of input; any value of END or less in
258
258
the parse tree is such a symbol. Accepting
259
259
states of the DFA are those that would have
260
a transition on END. */
260
a transition on END. This is -1, not some
261
more-negative value, to tweak the speed of
262
comparisons to END. */
262
264
/* Ordinary character values are terminal symbols that match themselves. */
265
/* CSET must come last in the following list of special tokens. Otherwise,
266
the list order matters only for performance. Related special tokens
267
should have nearby values so that code like (t == ANYCHAR || t == MBCSET
268
|| CSET <= t) can be done with a single machine-level comparison. */
264
270
EMPTY = NOTCHAR, /* EMPTY is a terminal symbol that matches
265
271
the empty string. */
267
BACKREF, /* BACKREF is generated by \<digit>
268
or by any other construct that
269
is not completely handled. If the scanner
270
detects a transition on backref, it returns
271
a kind of "semi-success" indicating that
272
the match will have to be verified with
273
a backtracking matcher. */
275
BEGLINE, /* BEGLINE is a terminal symbol that matches
276
the empty string at the beginning of a
279
ENDLINE, /* ENDLINE is a terminal symbol that matches
280
the empty string at the end of a line. */
282
BEGWORD, /* BEGWORD is a terminal symbol that matches
283
the empty string at the beginning of a
286
ENDWORD, /* ENDWORD is a terminal symbol that matches
287
the empty string at the end of a word. */
289
LIMWORD, /* LIMWORD is a terminal symbol that matches
290
the empty string at the beginning or the
293
NOTLIMWORD, /* NOTLIMWORD is a terminal symbol that
294
matches the empty string not at
295
the beginning or end of a word. */
297
273
QMARK, /* QMARK is an operator of one argument that
298
274
matches zero or one occurrences of its
324
300
RPAREN, /* RPAREN never appears in the parse tree. */
302
WCHAR, /* Only returned by lex. wctok contains
303
the wide character representation. */
326
304
ANYCHAR, /* ANYCHAR is a terminal symbol that matches
327
305
a valid multibyte (or single byte) character.
328
306
It is used only if MB_CUR_MAX > 1. */
307
BEG, /* BEG is an initial symbol that matches the
308
beginning of input. */
310
BEGLINE, /* BEGLINE is a terminal symbol that matches
311
the empty string at the beginning of a
314
ENDLINE, /* ENDLINE is a terminal symbol that matches
315
the empty string at the end of a line. */
317
BEGWORD, /* BEGWORD is a terminal symbol that matches
318
the empty string at the beginning of a
321
ENDWORD, /* ENDWORD is a terminal symbol that matches
322
the empty string at the end of a word. */
324
LIMWORD, /* LIMWORD is a terminal symbol that matches
325
the empty string at the beginning or the
328
NOTLIMWORD, /* NOTLIMWORD is a terminal symbol that
329
matches the empty string not at
330
the beginning or end of a word. */
332
BACKREF, /* BACKREF is generated by \<digit>
333
or by any other construct that
334
is not completely handled. If the scanner
335
detects a transition on backref, it returns
336
a kind of "semi-success" indicating that
337
the match will have to be verified with
338
a backtracking matcher. */
330
340
MBCSET, /* MBCSET is similar to CSET, but for
331
341
multibyte characters. */
333
WCHAR, /* Only returned by lex. wctok contains
334
the wide character representation. */
336
344
CSET /* CSET and (and any value greater) is a
337
345
terminal symbol that matches any of a
1845
1856
atom (struct dfa *dfa)
1847
if (dfa->parse.tok == WCHAR)
1858
if ((0 <= dfa->parse.tok && dfa->parse.tok < NOTCHAR)
1859
|| dfa->parse.tok >= CSET
1860
|| dfa->parse.tok == BEG || dfa->parse.tok == BACKREF
1861
|| dfa->parse.tok == BEGLINE || dfa->parse.tok == ENDLINE
1862
|| dfa->parse.tok == BEGWORD || dfa->parse.tok == ENDWORD
1863
|| dfa->parse.tok == LIMWORD || dfa->parse.tok == NOTLIMWORD
1864
|| dfa->parse.tok == ANYCHAR || dfa->parse.tok == MBCSET)
1866
if (dfa->parse.tok == ANYCHAR && dfa->localeinfo.using_utf8)
1868
/* For UTF-8 expand the period to a series of CSETs that define a
1869
valid UTF-8 character. This avoids using the slow multibyte
1870
path. I'm pretty sure it would be both profitable and correct to
1871
do it for any encoding; however, the optimization must be done
1872
manually as it is done above in add_utf8_anychar. So, let's
1873
start with UTF-8: it is the most used, and the structure of the
1874
encoding makes the correctness more obvious. */
1875
add_utf8_anychar (dfa);
1878
addtok (dfa, dfa->parse.tok);
1879
dfa->parse.tok = lex (dfa);
1881
else if (dfa->parse.tok == WCHAR)
1849
1883
if (dfa->lex.wctok == WEOF)
1850
1884
addtok (dfa, BACKREF);
1868
1902
dfa->parse.tok = lex (dfa);
1870
else if (dfa->parse.tok == ANYCHAR && dfa->localeinfo.using_utf8)
1872
/* For UTF-8 expand the period to a series of CSETs that define a valid
1873
UTF-8 character. This avoids using the slow multibyte path. I'm
1874
pretty sure it would be both profitable and correct to do it for
1875
any encoding; however, the optimization must be done manually as
1876
it is done above in add_utf8_anychar. So, let's start with
1877
UTF-8: it is the most used, and the structure of the encoding
1878
makes the correctness more obvious. */
1879
add_utf8_anychar (dfa);
1880
dfa->parse.tok = lex (dfa);
1882
else if ((0 <= dfa->parse.tok && dfa->parse.tok < NOTCHAR)
1883
|| dfa->parse.tok >= CSET || dfa->parse.tok == BACKREF
1884
|| dfa->parse.tok == BEGLINE || dfa->parse.tok == ENDLINE
1885
|| dfa->parse.tok == BEGWORD || dfa->parse.tok == ANYCHAR
1886
|| dfa->parse.tok == MBCSET || dfa->parse.tok == ENDWORD
1887
|| dfa->parse.tok == LIMWORD || dfa->parse.tok == NOTLIMWORD)
1889
addtok (dfa, dfa->parse.tok);
1890
dfa->parse.tok = lex (dfa);
1892
1904
else if (dfa->parse.tok == LPAREN)
1894
1906
dfa->parse.tok = lex (dfa);
2132
2147
merge_constrained (s1, s2, -1, m);
2151
merge2 (position_set *dst, position_set const *src, position_set *m)
2155
for (ptrdiff_t i = 0; i < src->nelem; ++i)
2156
insert (src->elems[i], dst);
2160
merge (src, dst, m);
2135
2165
/* Delete a position from a set. Return the nonzero constraint of the
2136
2166
deleted position, or zero if there was no such position. */
2137
2167
static unsigned int
2352
2382
return separate_contexts;
2387
/* Single token is repeated. It is distinguished from non-repeated. */
2388
OPT_REPEAT = (1 << 0),
2390
/* Multiple tokens are repeated. This flag is on at head of tokens. The
2391
node is not merged. */
2392
OPT_LPAREN = (1 << 1),
2394
/* Multiple branches are joined. The node is not merged. */
2395
OPT_RPAREN = (1 << 2),
2397
/* The node is walked. If the node is found in walking again, OPT_RPAREN
2398
flag is turned on. */
2399
OPT_WALKED = (1 << 3),
2401
/* The node is queued. The node is not queued again. */
2402
OPT_QUEUED = (1 << 4)
2406
merge_nfa_state (struct dfa *d, size_t *queue, ptrdiff_t nqueue, size_t tindex,
2407
char *flags, position_set *merged)
2409
position_set *follows = d->follows;
2410
ptrdiff_t nelem = 0;
2412
for (ptrdiff_t i = 0; i < follows[tindex].nelem; i++)
2414
size_t dindex = follows[tindex].elems[i].index;
2416
/* Skip the node as pruned in future. */
2417
unsigned int iconstraint = follows[tindex].elems[i].constraint;
2418
if (iconstraint == 0)
2421
follows[tindex].elems[nelem++] = follows[tindex].elems[i];
2423
if (tindex < dindex && !(flags[dindex] & OPT_QUEUED))
2425
queue[nqueue++] = dindex;
2426
flags[dindex] |= OPT_QUEUED;
2429
if (flags[dindex] & (OPT_LPAREN | OPT_RPAREN))
2432
for (ptrdiff_t j = i + 1; j < follows[tindex].nelem; j++)
2434
size_t sindex = follows[tindex].elems[j].index;
2436
if (follows[tindex].elems[j].constraint != iconstraint)
2439
if (flags[sindex] & (OPT_LPAREN | OPT_RPAREN))
2442
if (d->tokens[sindex] != d->tokens[dindex])
2445
if ((flags[sindex] ^ flags[dindex]) & OPT_REPEAT)
2448
if (flags[sindex] & OPT_REPEAT)
2449
delete (sindex, &follows[sindex]);
2451
merge2 (&follows[dindex], &follows[sindex], merged);
2453
/* Mark it as pruned in future. */
2454
follows[tindex].elems[j].constraint = 0;
2458
follows[tindex].nelem = nelem;
2464
dfaoptimize (struct dfa *d)
2469
position_set merged0;
2470
position_set *merged;
2471
ptrdiff_t extra_space = d->tindex + sizeof *queue - 1;
2472
extra_space -= extra_space % sizeof *queue;
2474
queue = xmalloc (d->nleaves * sizeof *queue + extra_space);
2475
flags = (char *) (queue + d->nleaves);
2476
memset (flags, 0, d->tindex * sizeof *flags);
2478
for (size_t i = 0; i < d->tindex; i++)
2480
for (ptrdiff_t j = 0; j < d->follows[i].nelem; j++)
2482
if (d->follows[i].elems[j].index == i)
2483
flags[d->follows[i].elems[j].index] |= OPT_REPEAT;
2484
else if (d->follows[i].elems[j].index < i)
2485
flags[d->follows[i].elems[j].index] |= OPT_LPAREN;
2486
else if (flags[d->follows[i].elems[j].index] &= OPT_WALKED)
2487
flags[d->follows[i].elems[j].index] |= OPT_RPAREN;
2489
flags[d->follows[i].elems[j].index] |= OPT_WALKED;
2494
alloc_position_set (merged, d->nleaves);
2497
queue[nqueue++] = 0;
2499
for (ptrdiff_t i = 0; i < nqueue; i++)
2500
nqueue = merge_nfa_state (d, queue, nqueue, queue[i], flags, merged);
2502
free (merged->elems);
2356
2506
/* Perform bottom-up analysis on the parse tree, computing various functions.
2357
2507
Note that at this point, we're pretending constructs like \< are real
2590
/* Get the epsilon closure of the firstpos of the regexp. The result will
2591
be the set of positions of state 0. */
2593
for (size_t i = 0; i < stk[-1].nfirstpos; ++i)
2594
insert (firstpos[i], &merged);
2596
2745
/* For each follow set that is the follow set of a real position, replace
2597
2746
it with its epsilon closure. */
2598
epsclosure (&merged, d);
2747
epsclosure (&d->follows[0], d);
2600
2749
/* Context wanted by some position. */
2601
int separate_contexts = state_separate_contexts (&merged);
2750
int separate_contexts = state_separate_contexts (&d->follows[0]);
2603
2752
/* Build the initial state. */
2604
2753
if (separate_contexts & CTX_NEWLINE)
2605
state_index (d, &merged, CTX_NEWLINE);
2754
state_index (d, &d->follows[0], CTX_NEWLINE);
2606
2755
d->initstate_notbol = d->min_trcount
2607
= state_index (d, &merged, separate_contexts ^ CTX_ANY);
2756
= state_index (d, &d->follows[0], separate_contexts ^ CTX_ANY);
2608
2757
if (separate_contexts & CTX_LETTER)
2609
d->min_trcount = state_index (d, &merged, CTX_LETTER);
2758
d->min_trcount = state_index (d, &d->follows[0], CTX_LETTER);
2610
2759
d->min_trcount++;
2611
2760
d->trcount = 0;