~vcs-imports/gawk/master : revision 983

257

end of input; any value of END or less in

258

the parse tree is such a symbol. Accepting

259

states of the DFA are those that would have

260

a transition on END. */

260

a transition on END. This is -1, not some

261

more-negative value, to tweak the speed of

262

comparisons to END. */

261

263

262

264

/* Ordinary character values are terminal symbols that match themselves. */

265

/* CSET must come last in the following list of special tokens. Otherwise,

266

the list order matters only for performance. Related special tokens

267

should have nearby values so that code like (t == ANYCHAR || t == MBCSET

268

|| CSET <= t) can be done with a single machine-level comparison. */

263

269

264

270

EMPTY = NOTCHAR, /* EMPTY is a terminal symbol that matches

265

271

the empty string. */

266

272

267

BACKREF, /* BACKREF is generated by \<digit>

268

or by any other construct that

269

is not completely handled. If the scanner

270

detects a transition on backref, it returns

271

a kind of "semi-success" indicating that

272

the match will have to be verified with

273

a backtracking matcher. */

274

275

BEGLINE, /* BEGLINE is a terminal symbol that matches

276

the empty string at the beginning of a

277

line. */

278

279

ENDLINE, /* ENDLINE is a terminal symbol that matches

280

the empty string at the end of a line. */

281

282

BEGWORD, /* BEGWORD is a terminal symbol that matches

283

the empty string at the beginning of a

284

word. */

285

286

ENDWORD, /* ENDWORD is a terminal symbol that matches

287

the empty string at the end of a word. */

288

289

LIMWORD, /* LIMWORD is a terminal symbol that matches

290

the empty string at the beginning or the

291

end of a word. */

292

293

NOTLIMWORD, /* NOTLIMWORD is a terminal symbol that

294

matches the empty string not at

295

the beginning or end of a word. */

296

297

273

QMARK, /* QMARK is an operator of one argument that

298

274

matches zero or one occurrences of its

299

275

argument. */

323

299

324

300

RPAREN, /* RPAREN never appears in the parse tree. */

325

301

302

WCHAR, /* Only returned by lex. wctok contains

303

the wide character representation. */

326

304

ANYCHAR, /* ANYCHAR is a terminal symbol that matches

327

305

a valid multibyte (or single byte) character.

328

306

It is used only if MB_CUR_MAX > 1. */

307

BEG, /* BEG is an initial symbol that matches the

308

beginning of input. */

309

310

BEGLINE, /* BEGLINE is a terminal symbol that matches

311

the empty string at the beginning of a

312

line. */

313

314

ENDLINE, /* ENDLINE is a terminal symbol that matches

315

the empty string at the end of a line. */

316

317

BEGWORD, /* BEGWORD is a terminal symbol that matches

318

the empty string at the beginning of a

319

word. */

320

321

ENDWORD, /* ENDWORD is a terminal symbol that matches

322

the empty string at the end of a word. */

323

324

LIMWORD, /* LIMWORD is a terminal symbol that matches

325

the empty string at the beginning or the

326

end of a word. */

327

328

NOTLIMWORD, /* NOTLIMWORD is a terminal symbol that

329

matches the empty string not at

330

the beginning or end of a word. */

331

332

BACKREF, /* BACKREF is generated by \<digit>

333

or by any other construct that

334

is not completely handled. If the scanner

335

detects a transition on backref, it returns

336

a kind of "semi-success" indicating that

337

the match will have to be verified with

338

a backtracking matcher. */

329

339

330

340

MBCSET, /* MBCSET is similar to CSET, but for

331

341

multibyte characters. */

332

342

333

WCHAR, /* Only returned by lex. wctok contains

334

the wide character representation. */

335

343

336

344

CSET /* CSET and (and any value greater) is a

337

345

terminal symbol that matches any of a

660

668

static void

661

669

prtok (token t)

662

670

{

663

if (t < 0)

671

if (t <= END)

664

672

fprintf (stderr, "END");

665

else if (t < NOTCHAR)

673

else if (0 <= t && t < NOTCHAR)

666

674

{

667

675

unsigned int ch = t;

668

676

fprintf (stderr, "0x%02x", ch);

672

680

char const *s;

673

681

switch (t)

674

682

{

683

case BEG:

684

s = "BEG";

685

break;

675

686

case EMPTY:

676

687

s = "EMPTY";

677

688

break;

1844

1855

static void

1845

1856

atom (struct dfa *dfa)

1846

1857

{

1847

if (dfa->parse.tok == WCHAR)

1858

if ((0 <= dfa->parse.tok && dfa->parse.tok < NOTCHAR)

1859

|| dfa->parse.tok >= CSET

1860

|| dfa->parse.tok == BEG || dfa->parse.tok == BACKREF

1861

|| dfa->parse.tok == BEGLINE || dfa->parse.tok == ENDLINE

1862

|| dfa->parse.tok == BEGWORD || dfa->parse.tok == ENDWORD

1863

|| dfa->parse.tok == LIMWORD || dfa->parse.tok == NOTLIMWORD

1864

|| dfa->parse.tok == ANYCHAR || dfa->parse.tok == MBCSET)

1865

{

1866

if (dfa->parse.tok == ANYCHAR && dfa->localeinfo.using_utf8)

1867

{

1868

/* For UTF-8 expand the period to a series of CSETs that define a

1869

valid UTF-8 character. This avoids using the slow multibyte

1870

path. I'm pretty sure it would be both profitable and correct to

1871

do it for any encoding; however, the optimization must be done

1872

manually as it is done above in add_utf8_anychar. So, let's

1873

start with UTF-8: it is the most used, and the structure of the

1874

encoding makes the correctness more obvious. */

1875

add_utf8_anychar (dfa);

1876

}

1877

else

1878

addtok (dfa, dfa->parse.tok);

1879

dfa->parse.tok = lex (dfa);

1880

}

1881

else if (dfa->parse.tok == WCHAR)

1848

1882

{

1849

1883

if (dfa->lex.wctok == WEOF)

1850

1884

addtok (dfa, BACKREF);

1867

1901

1868

1902

dfa->parse.tok = lex (dfa);

1869

1903

}

1870

else if (dfa->parse.tok == ANYCHAR && dfa->localeinfo.using_utf8)

1871

{

1872

/* For UTF-8 expand the period to a series of CSETs that define a valid

1873

UTF-8 character. This avoids using the slow multibyte path. I'm

1874

pretty sure it would be both profitable and correct to do it for

1875

any encoding; however, the optimization must be done manually as

1876

it is done above in add_utf8_anychar. So, let's start with

1877

UTF-8: it is the most used, and the structure of the encoding

1878

makes the correctness more obvious. */

1879

add_utf8_anychar (dfa);

1880

dfa->parse.tok = lex (dfa);

1881

}

1882

else if ((0 <= dfa->parse.tok && dfa->parse.tok < NOTCHAR)

1883

|| dfa->parse.tok >= CSET || dfa->parse.tok == BACKREF

1884

|| dfa->parse.tok == BEGLINE || dfa->parse.tok == ENDLINE

1885

|| dfa->parse.tok == BEGWORD || dfa->parse.tok == ANYCHAR

1886

|| dfa->parse.tok == MBCSET || dfa->parse.tok == ENDWORD

1887

|| dfa->parse.tok == LIMWORD || dfa->parse.tok == NOTLIMWORD)

1888

{

1889

addtok (dfa, dfa->parse.tok);

1890

dfa->parse.tok = lex (dfa);

1891

}

1892

1904

else if (dfa->parse.tok == LPAREN)

1893

1905

{

1894

1906

dfa->parse.tok = lex (dfa);

2014

2026

if (!d->syntax.syntax_bits_set)

2015

2027

dfaerror (_("no syntax specified"));

2016

2028

2029

if (!d->nregexps)

2030

addtok (d, BEG);

2031

2017

2032

d->parse.tok = lex (d);

2018

2033

d->parse.depth = d->depth;

2019

2034

2132

2147

merge_constrained (s1, s2, -1, m);

2133

2148

}

2134

2149

2150

static void

2151

merge2 (position_set *dst, position_set const *src, position_set *m)

2152

{

2153

if (src->nelem < 4)

2154

{

2155

for (ptrdiff_t i = 0; i < src->nelem; ++i)

2156

insert (src->elems[i], dst);

2157

}

2158

else

2159

{

2160

merge (src, dst, m);

2161

copy (m, dst);

2162

}

2163

}

2164

2135

2165

/* Delete a position from a set. Return the nonzero constraint of the

2136

2166

deleted position, or zero if there was no such position. */

2137

2167

static unsigned int

2227

2257

for (state_num j = 0; j < s->nelem; j++)

2228

2258

{

2229

2259

int c = s->elems[j].constraint;

2230

if (d->tokens[s->elems[j].index] < 0)

2260

if (d->tokens[s->elems[j].index] <= END)

2231

2261

{

2232

2262

if (succeeds_in_context (c, context, CTX_ANY))

2233

2263

constraint |= c;

2352

2382

return separate_contexts;

2353

2383

}

2354

2384

2385

enum

2386

{

2387

/* Single token is repeated. It is distinguished from non-repeated. */

2388

OPT_REPEAT = (1 << 0),

2389

2390

/* Multiple tokens are repeated. This flag is on at head of tokens. The

2391

node is not merged. */

2392

OPT_LPAREN = (1 << 1),

2393

2394

/* Multiple branches are joined. The node is not merged. */

2395

OPT_RPAREN = (1 << 2),

2396

2397

/* The node is walked. If the node is found in walking again, OPT_RPAREN

2398

flag is turned on. */

2399

OPT_WALKED = (1 << 3),

2400

2401

/* The node is queued. The node is not queued again. */

2402

OPT_QUEUED = (1 << 4)

2403

};

2404

2405

static ptrdiff_t

2406

merge_nfa_state (struct dfa *d, size_t *queue, ptrdiff_t nqueue, size_t tindex,

2407

char *flags, position_set *merged)

2408

{

2409

position_set *follows = d->follows;

2410

ptrdiff_t nelem = 0;

2411

2412

for (ptrdiff_t i = 0; i < follows[tindex].nelem; i++)

2413

{

2414

size_t dindex = follows[tindex].elems[i].index;

2415

2416

/* Skip the node as pruned in future. */

2417

unsigned int iconstraint = follows[tindex].elems[i].constraint;

2418

if (iconstraint == 0)

2419

continue;

2420

2421

follows[tindex].elems[nelem++] = follows[tindex].elems[i];

2422

2423

if (tindex < dindex && !(flags[dindex] & OPT_QUEUED))

2424

{

2425

queue[nqueue++] = dindex;

2426

flags[dindex] |= OPT_QUEUED;

2427

}

2428

2429

if (flags[dindex] & (OPT_LPAREN | OPT_RPAREN))

2430

continue;

2431

2432

for (ptrdiff_t j = i + 1; j < follows[tindex].nelem; j++)

2433

{

2434

size_t sindex = follows[tindex].elems[j].index;

2435

2436

if (follows[tindex].elems[j].constraint != iconstraint)

2437

continue;

2438

2439

if (flags[sindex] & (OPT_LPAREN | OPT_RPAREN))

2440

continue;

2441

2442

if (d->tokens[sindex] != d->tokens[dindex])

2443

continue;

2444

2445

if ((flags[sindex] ^ flags[dindex]) & OPT_REPEAT)

2446

continue;

2447

2448

if (flags[sindex] & OPT_REPEAT)

2449

delete (sindex, &follows[sindex]);

2450

2451

merge2 (&follows[dindex], &follows[sindex], merged);

2452

2453

/* Mark it as pruned in future. */

2454

follows[tindex].elems[j].constraint = 0;

2455

}

2456

}

2457

2458

follows[tindex].nelem = nelem;

2459

2460

return nqueue;

2461

}

2462

2463

static void

2464

dfaoptimize (struct dfa *d)

2465

{

2466

char *flags;

2467

size_t *queue;

2468

ptrdiff_t nqueue;

2469

position_set merged0;

2470

position_set *merged;

2471

ptrdiff_t extra_space = d->tindex + sizeof *queue - 1;

2472

extra_space -= extra_space % sizeof *queue;

2473

2474

queue = xmalloc (d->nleaves * sizeof *queue + extra_space);

2475

flags = (char *) (queue + d->nleaves);

2476

memset (flags, 0, d->tindex * sizeof *flags);

2477

2478

for (size_t i = 0; i < d->tindex; i++)

2479

{

2480

for (ptrdiff_t j = 0; j < d->follows[i].nelem; j++)

2481

{

2482

if (d->follows[i].elems[j].index == i)

2483

flags[d->follows[i].elems[j].index] |= OPT_REPEAT;

2484

else if (d->follows[i].elems[j].index < i)

2485

flags[d->follows[i].elems[j].index] |= OPT_LPAREN;

2486

else if (flags[d->follows[i].elems[j].index] &= OPT_WALKED)

2487

flags[d->follows[i].elems[j].index] |= OPT_RPAREN;

2488

else

2489

flags[d->follows[i].elems[j].index] |= OPT_WALKED;

2490

}

2491

}

2492

2493

merged = &merged0;

2494

alloc_position_set (merged, d->nleaves);

2495

2496

nqueue = 0;

2497

queue[nqueue++] = 0;

2498

2499

for (ptrdiff_t i = 0; i < nqueue; i++)

2500

nqueue = merge_nfa_state (d, queue, nqueue, queue[i], flags, merged);

2501

2502

free (merged->elems);

2503

free (queue);

2504

}

2355

2505

2356

2506

/* Perform bottom-up analysis on the parse tree, computing various functions.

2357

2507

Note that at this point, we're pretending constructs like \< are real

2427

2577

2428

2578

position_set merged; /* Result of merging sets. */

2429

2579

2580

addtok (d, CAT);

2581

2430

2582

#ifdef DEBUG

2431

2583

fprintf (stderr, "dfaanalyze:\n");

2432

2584

for (size_t i = 0; i < d->tindex; ++i)

2569

2721

#endif

2570

2722

}

2571

2723

2724

dfaoptimize (d);

2725

2572

2726

#ifdef DEBUG

2573

2727

for (size_t i = 0; i < d->tindex; ++i)

2574

2728

if (d->tokens[i] < NOTCHAR || d->tokens[i] == BACKREF

2587

2741

}

2588

2742

#endif

2589

2743

2590

/* Get the epsilon closure of the firstpos of the regexp. The result will

2591

be the set of positions of state 0. */

2592

merged.nelem = 0;

2593

for (size_t i = 0; i < stk[-1].nfirstpos; ++i)

2594

insert (firstpos[i], &merged);

2595

2744

2596

2745

/* For each follow set that is the follow set of a real position, replace

2597

2746

it with its epsilon closure. */

2598

epsclosure (&merged, d);

2747

epsclosure (&d->follows[0], d);

2599

2748

2600

2749

/* Context wanted by some position. */

2601

int separate_contexts = state_separate_contexts (&merged);

2750

int separate_contexts = state_separate_contexts (&d->follows[0]);

2602

2751

2603

2752

/* Build the initial state. */

2604

2753

if (separate_contexts & CTX_NEWLINE)

2605

state_index (d, &merged, CTX_NEWLINE);

2754

state_index (d, &d->follows[0], CTX_NEWLINE);

2606

2755

d->initstate_notbol = d->min_trcount

2607

= state_index (d, &merged, separate_contexts ^ CTX_ANY);

2756

= state_index (d, &d->follows[0], separate_contexts ^ CTX_ANY);

2608

2757

if (separate_contexts & CTX_LETTER)

2609

d->min_trcount = state_index (d, &merged, CTX_LETTER);

2758

d->min_trcount = state_index (d, &d->follows[0], CTX_LETTER);

2610

2759

d->min_trcount++;

2611

2760

d->trcount = 0;

2612

2761

3394

3543

return true;

3395

3544

}

3396

3545

3546

/* Disable use of the superset DFA if it is not likely to help

3547

performance. */

3397

3548

static void

3398

dfaoptimize (struct dfa *d)

3549

maybe_disable_superset_dfa (struct dfa *d)

3399

3550

{

3400

3551

if (!d->localeinfo.using_utf8)

3401

3552

return;

3523

3674

3524

3675

if (dfa_supported (d))

3525

3676

{

3526

dfaoptimize (d);

3677

maybe_disable_superset_dfa (d);

3527

3678

dfaanalyze (d, searchflag);

3528

3679

}

3529

3680

else

3830

3981

bool need_endline = false;

3831

3982

bool case_fold_unibyte = d->syntax.case_fold && MB_CUR_MAX == 1;

3832

3983

3833

for (size_t ri = 0; ri < d->tindex; ++ri)

3984

for (size_t ri = 1; ri + 1 < d->tindex; ri++)

3834

3985

{

3835

3986

token t = d->tokens[ri];

3836

3987

switch (t)