~vcs-imports/gawk/master : revision 408.4.31

62

#include "gettext.h"

63

#define _(str) gettext (str)

64

65

#include "mbsupport.h" /* defines MBS_SUPPORT to 1 or 0, as appropriate */

65

#include "mbsupport.h" /* Define MBS_SUPPORT to 1 or 0, as appropriate. */

66

#if MBS_SUPPORT

67

/* We can handle multibyte strings. */

67

/* We can handle multibyte strings. */

68

# include <wchar.h>

69

# include <wctype.h>

70

#endif

100

#define mbrtowc(a, b, c, d) (-1)

101

#endif

102

103

/* HPUX, define those as macros in sys/param.h */

103

/* HPUX defines these as macros in sys/param.h. */

104

#ifdef setbit

105

# undef setbit

106

#endif

108

# undef clrbit

109

#endif

110

111

/* Number of bits in an unsigned char. */

111

/* Number of bits in an unsigned char. */

112

#ifndef CHARBITS

113

# define CHARBITS 8

114

#endif

115

116

/* First integer value that is greater than any character code. */

116

/* First integer value that is greater than any character code. */

117

#define NOTCHAR (1 << CHARBITS)

118

119

/* INTBITS need not be exact, just a lower bound. */

119

/* INTBITS need not be exact, just a lower bound. */

120

#ifndef INTBITS

121

# define INTBITS (CHARBITS * sizeof (int))

122

#endif

123

124

/* Number of ints required to hold a bit for every character. */

124

/* Number of ints required to hold a bit for every character. */

125

#define CHARCLASS_INTS ((NOTCHAR + INTBITS - 1) / INTBITS)

126

127

/* Sets of unsigned characters are stored as bit vectors in arrays of ints. */

127

/* Sets of unsigned characters are stored as bit vectors in arrays of ints. */

128

typedef int charclass[CHARCLASS_INTS];

129

130

/* Convert a possibly-signed character to an unsigned character. This is

137

}

138

139

/* Contexts tell us whether a character is a newline or a word constituent.

140

Word-constituent characters are those that satisfy iswalnum(), plus '_'.

140

Word-constituent characters are those that satisfy iswalnum, plus '_'.

141

Each character has a single CTX_* value; bitmasks of CTX_* values denote

142

a particular character class.

143

167

The macro SUCCEEDS_IN_CONTEXT determines whether a given constraint

168

succeeds in a particular context. Prev is a bitmask of possible

169

context values for the previous character, curr is the (single-bit)

170

context value for the lookahead character. */

170

context value for the lookahead character. */

171

#define NEWLINE_CONSTRAINT(constraint) (((constraint) >> 8) & 0xf)

172

#define LETTER_CONSTRAINT(constraint) (((constraint) >> 4) & 0xf)

173

#define OTHER_CONSTRAINT(constraint) ((constraint) & 0xf)

174

175

#define SUCCEEDS_IN_CONTEXT(constraint, prev, curr) \

176

((((curr) & CTX_NONE ? OTHER_CONSTRAINT(constraint) : 0) \

177

| ((curr) & CTX_LETTER ? LETTER_CONSTRAINT(constraint) : 0) \

178

| ((curr) & CTX_NEWLINE ? NEWLINE_CONSTRAINT(constraint) : 0)) & (prev))

176

((((curr) & CTX_NONE ? OTHER_CONSTRAINT (constraint) : 0) \

177

| ((curr) & CTX_LETTER ? LETTER_CONSTRAINT (constraint) : 0) \

178

| ((curr) & CTX_NEWLINE ? NEWLINE_CONSTRAINT (constraint) : 0)) & (prev))

179

180

/* The following macros give information about what a constraint depends on. */

180

/* The following macros describe what a constraint depends on. */

181

#define PREV_NEWLINE_CONSTRAINT(constraint) (((constraint) >> 2) & 0x111)

182

#define PREV_LETTER_CONSTRAINT(constraint) (((constraint) >> 1) & 0x111)

183

#define PREV_OTHER_CONSTRAINT(constraint) ((constraint) & 0x111)

190

/* Tokens that match the empty string subject to some constraint actually

191

work by applying that constraint to determine what may follow them,

192

taking into account what has gone before. The following values are

193

the constraints corresponding to the special tokens previously defined. */

193

the constraints corresponding to the special tokens previously defined. */

194

#define NO_CONSTRAINT 0x777

195

#define BEGLINE_CONSTRAINT 0x444

196

#define ENDLINE_CONSTRAINT 0x700

201

202

/* The regexp is parsed into an array of tokens in postfix form. Some tokens

203

are operators and others are terminal symbols. Most (but not all) of these

204

codes are returned by the lexical analyzer. */

204

codes are returned by the lexical analyzer. */

205

206

typedef ptrdiff_t token;

207

212

end of input; any value of END or less in

213

the parse tree is such a symbol. Accepting

214

states of the DFA are those that would have

215

a transition on END. */

215

a transition on END. */

216

217

/* Ordinary character values are terminal symbols that match themselves. */

217

/* Ordinary character values are terminal symbols that match themselves. */

218

219

EMPTY = NOTCHAR, /* EMPTY is a terminal symbol that matches

220

the empty string. */

220

the empty string. */

221

222

BACKREF, /* BACKREF is generated by \<digit>; it

223

is not completely handled. If the scanner

224

detects a transition on backref, it returns

225

a kind of "semi-success" indicating that

226

the match will have to be verified with

227

a backtracking matcher. */

227

a backtracking matcher. */

228

229

BEGLINE, /* BEGLINE is a terminal symbol that matches

230

the empty string if it is at the beginning

231

of a line. */

231

of a line. */

232

233

ENDLINE, /* ENDLINE is a terminal symbol that matches

234

the empty string if it is at the end of

235

a line. */

235

a line. */

236

237

BEGWORD, /* BEGWORD is a terminal symbol that matches

238

the empty string if it is at the beginning

239

of a word. */

239

of a word. */

240

241

ENDWORD, /* ENDWORD is a terminal symbol that matches

242

the empty string if it is at the end of

243

a word. */

243

a word. */

244

245

LIMWORD, /* LIMWORD is a terminal symbol that matches

246

the empty string if it is at the beginning

247

or the end of a word. */

247

or the end of a word. */

248

249

NOTLIMWORD, /* NOTLIMWORD is a terminal symbol that

250

matches the empty string if it is not at

251

the beginning or end of a word. */

251

the beginning or end of a word. */

252

253

QMARK, /* QMARK is an operator of one argument that

254

matches zero or one occurrences of its

255

argument. */

255

argument. */

256

257

STAR, /* STAR is an operator of one argument that

258

matches the Kleene closure (zero or more

259

occurrences) of its argument. */

259

occurrences) of its argument. */

260

261

PLUS, /* PLUS is an operator of one argument that

262

matches the positive closure (one or more

263

occurrences) of its argument. */

263

occurrences) of its argument. */

264

265

REPMN, /* REPMN is a lexical token corresponding

266

to the {m,n} construct. REPMN never

267

appears in the compiled token vector. */

267

appears in the compiled token vector. */

268

269

CAT, /* CAT is an operator of two arguments that

270

matches the concatenation of its

271

arguments. CAT is never returned by the

272

lexical analyzer. */

272

lexical analyzer. */

273

274

OR, /* OR is an operator of two arguments that

275

matches either of its arguments. */

275

matches either of its arguments. */

276

277

LPAREN, /* LPAREN never appears in the parse tree,

278

it is only a lexeme. */

278

it is only a lexeme. */

279

280

RPAREN, /* RPAREN never appears in the parse tree. */

280

RPAREN, /* RPAREN never appears in the parse tree. */

281

282

ANYCHAR, /* ANYCHAR is a terminal symbol that matches

283

any multibyte (or single byte) characters.

291

292

CSET /* CSET and (and any value greater) is a

293

terminal symbol that matches any of a

294

class of characters. */

294

class of characters. */

295

};

296

297

298

/* States of the recognizer correspond to sets of positions in the parse

299

tree, together with the constraints under which they may be matched.

300

So a position is encoded as an index into the parse tree together with

301

a constraint. */

301

a constraint. */

302

typedef struct

303

{

304

size_t index; /* Index into the parse array. */

305

unsigned int constraint; /* Constraint for matching this position. */

304

size_t index; /* Index into the parse array. */

305

unsigned int constraint; /* Constraint for matching this position. */

306

} position;

307

308

/* Sets of positions are stored as arrays. */

308

/* Sets of positions are stored as arrays. */

309

typedef struct

310

{

311

position *elems; /* Elements of this position set. */

312

size_t nelem; /* Number of elements in this set. */

311

position *elems; /* Elements of this position set. */

312

size_t nelem; /* Number of elements in this set. */

313

size_t alloc; /* Number of elements allocated in ELEMS. */

314

} position_set;

315

316

/* Sets of leaves are also stored as arrays. */

316

/* Sets of leaves are also stored as arrays. */

317

typedef struct

318

{

319

size_t *elems; /* Elements of this position set. */

320

size_t nelem; /* Number of elements in this set. */

319

size_t *elems; /* Elements of this position set. */

320

size_t nelem; /* Number of elements in this set. */

321

} leaf_set;

322

323

/* A state of the dfa consists of a set of positions, some flags,

324

and the token value of the lowest-numbered position of the state that

325

contains an END token. */

325

contains an END token. */

326

typedef struct

327

{

328

size_t hash; /* Hash of the positions of this state. */

329

position_set elems; /* Positions this state could match. */

330

unsigned char context; /* Context from previous state. */

328

size_t hash; /* Hash of the positions of this state. */

329

position_set elems; /* Positions this state could match. */

330

unsigned char context; /* Context from previous state. */

331

char backref; /* True if this state matches a \<digit>. */

332

unsigned short constraint; /* Constraint for this state to accept. */

333

token first_end; /* Token value of the first END in elems. */

332

unsigned short constraint; /* Constraint for this state to accept. */

333

token first_end; /* Token value of the first END in elems. */

334

position_set mbps; /* Positions which can match multibyte

335

characters. e.g. period.

336

These staff are used only if

337

MB_CUR_MAX > 1. */

335

characters, e.g., period.

336

Used only if MB_CUR_MAX > 1. */

338

337

} dfa_state;

339

338

340

339

/* States are indexed by state_num values. These are normally

342

341

typedef ptrdiff_t state_num;

343

342

344

343

/* A bracket operator.

345

e.g. [a-c], [[:alpha:]], etc. */

344

e.g., [a-c], [[:alpha:]], etc. */

346

345

struct mb_char_classes

347

346

{

348

347

ptrdiff_t cset;

360

359

size_t ncoll_elems; /* Collating elements. */

361

360

};

362

361

363

/* A compiled regular expression. */

362

/* A compiled regular expression. */

364

363

struct dfa

365

364

{

366

/* Fields filled by the scanner. */

367

charclass *charclasses; /* Array of character sets for CSET tokens. */

368

size_t cindex; /* Index for adding new charclasses. */

369

size_t calloc; /* Number of charclasses currently allocated. */

365

/* Fields filled by the scanner. */

366

charclass *charclasses; /* Array of character sets for CSET tokens. */

367

size_t cindex; /* Index for adding new charclasses. */

368

size_t calloc; /* Number of charclasses allocated. */

370

369

371

/* Fields filled by the parser. */

372

token *tokens; /* Postfix parse array. */

373

size_t tindex; /* Index for adding new tokens. */

374

size_t talloc; /* Number of tokens currently allocated. */

370

/* Fields filled by the parser. */

371

token *tokens; /* Postfix parse array. */

372

size_t tindex; /* Index for adding new tokens. */

373

size_t talloc; /* Number of tokens currently allocated. */

375

374

size_t depth; /* Depth required of an evaluation stack

376

375

used for depth-first traversal of the

377

parse tree. */

378

size_t nleaves; /* Number of leaves on the parse tree. */

376

parse tree. */

377

size_t nleaves; /* Number of leaves on the parse tree. */

379

378

size_t nregexps; /* Count of parallel regexps being built

380

with dfaparse(). */

379

with dfaparse. */

381

380

unsigned int mb_cur_max; /* Cached value of MB_CUR_MAX. */

382

token utf8_anychar_classes[5]; /* To lower ANYCHAR in UTF-8 locales. */

381

token utf8_anychar_classes[5]; /* To lower ANYCHAR in UTF-8 locales. */

383

382

384

383

/* The following are used only if MB_CUR_MAX > 1. */

385

384

408

407

size_t nmbcsets;

409

408

size_t mbcsets_alloc;

410

409

411

/* Fields filled by the state builder. */

412

dfa_state *states; /* States of the dfa. */

413

state_num sindex; /* Index for adding new states. */

414

state_num salloc; /* Number of states currently allocated. */

410

/* Fields filled by the state builder. */

411

dfa_state *states; /* States of the dfa. */

412

state_num sindex; /* Index for adding new states. */

413

state_num salloc; /* Number of states currently allocated. */

415

414

416

/* Fields filled by the parse tree->NFA conversion. */

415

/* Fields filled by the parse tree->NFA conversion. */

417

416

position_set *follows; /* Array of follow sets, indexed by position

418

417

index. The follow of a position is the set

419

418

of positions containing characters that

420

419

could conceivably follow a character

421

420

matching the given position in a string

422

421

matching the regexp. Allocated to the

423

maximum possible position index. */

422

maximum possible position index. */

424

423

int searchflag; /* True if we are supposed to build a searching

425

424

as opposed to an exact matcher. A searching

426

425

matcher finds the first and shortest string

427

426

matching a regexp anywhere in the buffer,

428

427

whereas an exact matcher finds the longest

429

428

string matching, but anchored to the

430

beginning of the buffer. */

429

beginning of the buffer. */

431

430

432

/* Fields filled by dfaexec. */

431

/* Fields filled by dfaexec. */

433

432

state_num tralloc; /* Number of transition tables that have

434

slots so far. */

433

slots so far. */

435

434

int trcount; /* Number of transition tables that have

436

actually been built. */

435

actually been built. */

437

436

state_num **trans; /* Transition tables for states that can

438

437

never accept. If the transitions for a

439

438

state have not yet been computed, or the

440

439

state could possibly accept, its entry in

441

this table is NULL. */

440

this table is NULL. */

442

441

state_num **realtrans; /* Trans always points to realtrans + 1; this

443

is so trans[-1] can contain NULL. */

442

is so trans[-1] can contain NULL. */

444

443

state_num **fails; /* Transition tables after failing to accept

445

on a state that potentially could do so. */

444

on a state that potentially could do so. */

446

445

int *success; /* Table of acceptance conditions used in

447

dfaexec and computed in build_state. */

446

dfaexec and computed in build_state. */

448

447

state_num *newlines; /* Transitions on newlines. The entry for a

449

448

newline in any transition table is always

450

449

-1 so we can count lines without wasting

451

450

too many cycles. The transition for a

452

451

newline is stored separately and handled

453

452

as a special case. Newline is also used

454

as a sentinel at the end of the buffer. */

453

as a sentinel at the end of the buffer. */

455

454

struct dfamust *musts; /* List of strings, at least one of which

456

455

is known to appear in any r.e. matching

457

the dfa. */

456

the dfa. */

458

457

};

459

458

460

/* Some macros for user access to dfa internals. */

459

/* Some macros for user access to dfa internals. */

461

460

462

/* ACCEPTING returns true if s could possibly be an accepting state of r. */

461

/* ACCEPTING returns true if s could possibly be an accepting state of r. */

463

462

#define ACCEPTING(s, r) ((r).states[s].constraint)

464

463

465

464

/* ACCEPTS_IN_CONTEXT returns true if the given state accepts in the

466

specified context. */

465

specified context. */

467

466

#define ACCEPTS_IN_CONTEXT(prev, curr, state, dfa) \

468

467

SUCCEEDS_IN_CONTEXT ((dfa).states[state].constraint, prev, curr)

469

468

492

491

#define MALLOC(p, n) do { (p) = XNMALLOC (n, *(p)); } while (0)

493

492

#define REALLOC(p, n) do {(p) = xnrealloc (p, n, sizeof (*(p))); } while (0)

494

493

495

/* Reallocate an array of type *P if N_ALLOC is <= N_REQUIRED. */

494

/* Reallocate an array of type *P if N_ALLOC is <= N_REQUIRED. */

496

495

#define REALLOC_IF_NECESSARY(p, n_alloc, n_required) \

497

496

do \

498

497

{ \

584

583

}

585

584

#endif /* DEBUG */

586

585

587

/* Stuff pertaining to charclasses. */

586

/* Stuff pertaining to charclasses. */

588

587

589

588

static int

590

589

tstbit (unsigned int b, charclass const c)

631

630

return memcmp (s1, s2, sizeof (charclass)) == 0;

632

631

}

633

632

634

/* A pointer to the current dfa is kept here during parsing. */

633

/* A pointer to the current dfa is kept here during parsing. */

635

634

static struct dfa *dfa;

636

635

637

/* Find the index of charclass s in dfa->charclasses, or allocate a new charclass. */

636

/* Find the index of charclass s in dfa->charclasses, or allocate a

637

new charclass. */

638

static size_t

639

charclass_index (charclass const s)

640

{

649

return i;

650

}

651

652

/* Syntax bits controlling the behavior of the lexical analyzer. */

652

/* Syntax bits controlling the behavior of the lexical analyzer. */

653

static reg_syntax_t syntax_bits, syntax_bits_set;

654

655

/* Flag for case-folding letters into sets. */

655

/* Flag for case-folding letters into sets. */

656

static int case_fold;

657

658

/* End-of-line byte in data. */

661

/* Cache of char-context values. */

662

static int sbit[NOTCHAR];

663

664

/* Set of characters considered letters. */

664

/* Set of characters considered letters. */

665

static charclass letters;

666

667

/* Set of characters that are newline. */

667

/* Set of characters that are newline. */

668

static charclass newline;

669

670

/* Add this to the test for whether a byte is word-constituent, since on

700

return CTX_NONE;

701

}

702

703

/* Entry point to set syntax options. */

703

/* Entry point to set syntax options. */

704

void

705

dfasyntax (reg_syntax_t bits, int fold, unsigned char eol)

706

{

790

791

792

/* UTF-8 encoding allows some optimizations that we can't otherwise

793

assume in a multibyte encoding. */

793

assume in a multibyte encoding. */

794

static inline int

795

using_utf8 (void)

796

{

814

/* Lexical analyzer. All the dross that deals with the obnoxious

815

GNU Regex syntax bits is located here. The poor, suffering

816

reader is referred to the GNU Regex documentation for the

817

meaning of the @#%!@#%^!@ syntax bits. */

817

meaning of the @#%!@#%^!@ syntax bits. */

818

819

static char const *lexptr; /* Pointer to next input character. */

820

static size_t lexleft; /* Number of characters remaining. */

821

static token lasttok; /* Previous token returned; initially END. */

822

static int laststart; /* True if we're separated from beginning or (, |

823

only by zero-width characters. */

824

static size_t parens; /* Count of outstanding left parens. */

825

static int minrep, maxrep; /* Repeat counts for {m,n}. */

819

static char const *lexptr; /* Pointer to next input character. */

820

static size_t lexleft; /* Number of characters remaining. */

821

static token lasttok; /* Previous token returned; initially END. */

822

static int laststart; /* True if we're separated from beginning or (,

823

| only by zero-width characters. */

824

static size_t parens; /* Count of outstanding left parens. */

825

static int minrep, maxrep; /* Repeat counts for {m,n}. */

826

827

static int cur_mb_len = 1; /* Length of the multibyte representation of

828

wctok. */

829

/* These variables are used only if (MB_CUR_MAX > 1). */

830

static mbstate_t mbs; /* Mbstate for mbrlen(). */

830

static mbstate_t mbs; /* Mbstate for mbrlen. */

831

static wchar_t wctok; /* Wide character representation of the current

832

multibyte character. */

833

static unsigned char *mblen_buf; /* Correspond to the input buffer in dfaexec().

834

Each element store the amount of remain

835

byte of corresponding multibyte character

836

in the input string. A element's value

837

is 0 if corresponding character is a

838

single byte character.

839

e.g. input : 'a', <mb(0)>, <mb(1)>, <mb(2)>

840

mblen_buf : 0, 3, 2, 1

841

*/

842

static wchar_t *inputwcs; /* Wide character representation of input

843

string in dfaexec().

844

The length of this array is same as

845

the length of input string(char array).

833

static unsigned char *mblen_buf;/* Correspond to the input buffer in dfaexec.

834

Each element stores the number of remaining

835

bytes of the corresponding multibyte

836

character in the input string. A element's

837

value is 0 if the corresponding character is

838

single-byte.

839

e.g., input : 'a', <mb(0)>, <mb(1)>, <mb(2)>

840

mblen_buf : 0, 3, 2, 1

841

*/

842

static wchar_t *inputwcs; /* Wide character representation of the input

843

string in dfaexec.

844

The length of this array is the same as

845

the length of input string (char array).

846

inputstring[i] is a single-byte char,

847

or 1st byte of a multibyte char.

848

And inputwcs[i] is the codepoint. */

849

static unsigned char const *buf_begin; /* reference to begin in dfaexec(). */

850

static unsigned char const *buf_end; /* reference to end in dfaexec(). */

847

or the first byte of a multibyte char;

848

inputwcs[i] is the codepoint. */

849

static unsigned char const *buf_begin; /* reference to begin in dfaexec. */

850

static unsigned char const *buf_end; /* reference to end in dfaexec. */

851

852

853

#if MBS_SUPPORT

854

/* Note that characters become unsigned here. */

854

/* Note that characters become unsigned here. */

855

# define FETCH_WC(c, wc, eoferr) \

856

do { \

857

if (! lexleft) \

879

(c) = wctob (wc); \

880

} \

881

} \

882

} while(0)

882

} while (0)

883

884

# define FETCH(c, eoferr) \

885

do { \

888

} while (0)

889

890

#else

891

/* Note that characters become unsigned here. */

891

/* Note that characters become unsigned here. */

892

# define FETCH(c, eoferr) \

893

do { \

894

if (! lexleft) \

900

} \

901

(c) = to_uchar (*lexptr++); \

902

--lexleft; \

903

} while(0)

903

} while (0)

904

905

# define FETCH_WC(c, unused, eoferr) FETCH (c, eoferr)

906

914

915

/* The following list maps the names of the Posix named character classes

916

to predicate functions that determine whether a given character is in

917

the class. The leading [ has already been eaten by the lexical analyzer. */

917

the class. The leading [ has already been eaten by the lexical

918

analyzer. */

918

919

struct dfa_ctype

919

920

{

920

921

const char *name;

984

985

dfa->nmbcsets + 1);

985

986

987

/* dfa->multibyte_prop[] hold the index of dfa->mbcsets.

987

We will update dfa->multibyte_prop[] in addtok(), because we can't

988

We will update dfa->multibyte_prop[] in addtok, because we can't

988

989

decide the index in dfa->tokens[]. */

989

990

991

/* Initialize work area. */

1013

1014

/* Note that if we're looking at some other [:...:] construct,

1014

1015

we just treat it as a bunch of ordinary characters. We can do

1015

1016

this because we assume regex has checked for syntax errors before

1016

dfa is ever called. */

1017

dfa is ever called. */

1017

1018

if (c == '[' && (syntax_bits & RE_CHAR_CLASSES))

1018

1019

{

1019

1020

#define BRACKET_BUFFER_SIZE 128

1112

1113

if (c2 == ']')

1113

1114

{

1114

1115

/* In the case [x-], the - is an ordinary hyphen,

1115

which is left in c1, the lookahead character. */

1116

which is left in c1, the lookahead character. */

1116

1117

lexptr -= cur_mb_len;

1117

1118

lexleft += cur_mb_len;

1118

1119

}

1324

1325

1326

case '`':

1326

1327

if (backslash && !(syntax_bits & RE_NO_GNU_OPS))

1327

return lasttok = BEGLINE; /* FIXME: should be beginning of string */

1328

return lasttok = BEGLINE; /* FIXME: should be beginning of string */

1328

1329

goto normal_char;

1329

1330

1331

case '\'':

1537

1538

}

1538

1539

1540

/* The above loop should consume at most a backslash

1540

and some other character. */

1541

and some other character. */

1541

1542

abort ();

1542

return END; /* keeps pedantic compilers happy. */

1543

return END; /* keeps pedantic compilers happy. */

1543

1544

}

1544

1545

/* Recursive descent parser for regular expressions. */

1546

/* Recursive descent parser for regular expressions. */

1546

1547

static token tok; /* Lookahead token. */

1548

static token tok; /* Lookahead token. */

1548

1549

static size_t depth; /* Current depth of a hypothetical stack

1549

1550

holding deferred productions. This is

1550

1551

used to determine the depth that will be

1551

1552

required of the real stack later on in

1552

dfaanalyze(). */

1553

dfaanalyze. */

1553

1554

1555

static void

1555

1556

addtok_mb (token t, int mbprop)

1589

1590

static void addtok_wc (wint_t wc);

1590

1591

1592

/* Add the given token to the parse tree, maintaining the depth count and

1592

updating the maximum depth if necessary. */

1593

updating the maximum depth if necessary. */

1593

1594

static void

1594

1595

addtok (token t)

1595

1596

{

1648

1649

/* We treat a multibyte character as a single atom, so that DFA

1649

1650

can treat a multibyte character as a single expression.

1650

1651

e.g. We construct following tree from "<mb1><mb2>".

1652

e.g., we construct the following tree from "<mb1><mb2>".

1652

1653

<mb1(1st-byte)><mb1(2nd-byte)><CAT><mb1(3rd-byte)><CAT>

1653

1654

<mb2(1st-byte)><mb2(2nd-byte)><CAT><mb2(3rd-byte)><CAT><CAT> */

1654

1655

static void

1764

1765

LPAREN regexp RPAREN

1765

1766

<empty>

1766

1767

The parser builds a parse tree in postfix form in an array of tokens. */

1768

The parser builds a parse tree in postfix form in an array of tokens. */

1768

1769

1770

static void

1770

1771

atom (void)

1820

1821

addtok (EMPTY);

1821

1822

}

1822

1823

/* Return the number of tokens in the given subexpression. */

1824

/* Return the number of tokens in the given subexpression. */

1824

1825

static size_t _GL_ATTRIBUTE_PURE

1825

1826

nsubtoks (size_t tindex)

1826

1827

{

1841

1842

}

1842

1843

}

1843

1844

/* Copy the given subexpression to the top of the tree. */

1845

/* Copy the given subexpression to the top of the tree. */

1845

1846

static void

1846

1847

copytoks (size_t tindex, size_t ntokens)

1847

1848

{

1849

1850

1851

if (MB_CUR_MAX > 1)

1851

1852

for (i = 0; i < ntokens; ++i)

1852

addtok_mb(dfa->tokens[tindex + i], dfa->multibyte_prop[tindex + i]);

1853

addtok_mb (dfa->tokens[tindex + i], dfa->multibyte_prop[tindex + i]);

1853

1854

else

1854

1855

for (i = 0; i < ntokens; ++i)

1855

addtok_mb(dfa->tokens[tindex + i], 3);

1856

addtok_mb (dfa->tokens[tindex + i], 3);

1856

1857

}

1857

1858

1859

static void

1922

1923

1924

/* Main entry point for the parser. S is a string to be parsed, len is the

1924

1925

length of the string, so s can include NUL characters. D is a pointer to

1925

the struct dfa to parse into. */

1926

the struct dfa to parse into. */

1926

1927

void

1927

1928

dfaparse (char const *s, size_t len, struct dfa *d)

1928

1929

{

1958

1959

++d->nregexps;

1959

1960

}

1960

1961

/* Some primitives for operating on sets of positions. */

1962

/* Some primitives for operating on sets of positions. */

1962

1963

/* Copy one set to another; the destination must be large enough. */

1964

/* Copy one set to another; the destination must be large enough. */

1964

1965

static void

1965

1966

copy (position_set const *src, position_set * dst)

1966

1967

{

1980

1981

/* Insert position P in set S. S is maintained in sorted order on

1981

1982

decreasing index. If there is already an entry in S with P.index

1982

1983

then merge (logically-OR) P's constraints into the one in S.

1983

S->elems must point to an array large enough to hold the resulting set. */

1984

S->elems must point to an array large enough to hold the resulting set. */

1984

1985

static void

1985

1986

insert (position p, position_set * s)

1986

1987

{

2010

2011

}

2011

2012

2013

/* Merge two sets of positions into a third. The result is exactly as if

2013

the positions of both sets were inserted into an initially empty set. */

2014

the positions of both sets were inserted into an initially empty set. */

2014

2015

static void

2015

2016

merge (position_set const *s1, position_set const *s2, position_set * m)

2016

2017

{

2034

2035

m->elems[m->nelem++] = s2->elems[j++];

2035

2036

}

2036

2037

/* Delete a position from a set. */

2038

/* Delete a position from a set. */

2038

2039

static void

2039

2040

delete (position p, position_set * s)

2040

2041

{

2050

2051

2052

/* Find the index of the state corresponding to the given position set with

2052

2053

the given preceding context, or create a new state if there is no such

2053

state. Context tells whether we got here on a newline or letter. */

2054

state. Context tells whether we got here on a newline or letter. */

2054

2055

static state_num

2055

2056

state_index (struct dfa *d, position_set const *s, int context)

2056

2057

{

2061

2062

for (i = 0; i < s->nelem; ++i)

2062

2063

hash ^= s->elems[i].index + s->elems[i].constraint;

2063

2064

/* Try to find a state that exactly matches the proposed one. */

2065

/* Try to find a state that exactly matches the proposed one. */

2065

2066

for (i = 0; i < d->sindex; ++i)

2066

2067

{

2067

2068

if (hash != d->states[i].hash || s->nelem != d->states[i].elems.nelem

2076

2077

return i;

2077

2078

}

2078

2079

/* We'll have to create a new state. */

2080

/* We'll have to create a new state. */

2080

2081

REALLOC_IF_NECESSARY (d->states, d->salloc, d->sindex + 1);

2081

2082

d->states[i].hash = hash;

2082

2083

alloc_position_set (&d->states[i].elems, s->nelem);

2114

2115

contains a symbol that matches the empty string in some context, replace

2115

2116

that position with the elements of its follow labeled with an appropriate

2116

2117

constraint. Repeat exhaustively until no funny positions are left.

2117

S->elems must be large enough to hold the result. */

2118

S->elems must be large enough to hold the result. */

2118

2119

static void

2119

2120

epsclosure (position_set * s, struct dfa const *d)

2120

2121

{

2121

2122

size_t i, j;

2122

char *visited; /* array of booleans, enough to use char, not int */

2123

char *visited; /* Array of booleans, enough to use char, not int. */

2123

2124

position p, old;

2124

2125

2126

CALLOC (visited, d->tindex);

2170

2171

p.index = d->follows[old.index].elems[j].index;

2171

2172

insert (p, s);

2172

2173

}

2173

/* Force rescan to start at the beginning. */

2174

/* Force rescan to start at the beginning. */

2174

2175

i = -1;

2175

2176

}

2176

2177

2275

2276

analysis is conveniently done by a linear scan with the aid of a stack.

2276

2277

Sets are stored as arrays of the elements, obeying a stack-like allocation

2277

2278

scheme; the number of elements in each set deeper in the stack can be

2278

used to determine the address of a particular set's array. */

2279

used to determine the address of a particular set's array. */

2279

2280

void

2280

2281

dfaanalyze (struct dfa *d, int searchflag)

2281

2282

{

2282

int *nullable; /* Nullable stack. */

2283

size_t *nfirstpos; /* Element count stack for firstpos sets. */

2284

position *firstpos; /* Array where firstpos elements are stored. */

2285

size_t *nlastpos; /* Element count stack for lastpos sets. */

2286

position *lastpos; /* Array where lastpos elements are stored. */

2287

position_set tmp; /* Temporary set for merging sets. */

2288

position_set merged; /* Result of merging sets. */

2289

int separate_contexts; /* Context wanted by some position. */

2283

int *nullable; /* Nullable stack. */

2284

size_t *nfirstpos; /* Element count stack for firstpos sets. */

2285

position *firstpos; /* Array where firstpos elements are stored. */

2286

size_t *nlastpos; /* Element count stack for lastpos sets. */

2287

position *lastpos; /* Array where lastpos elements are stored. */

2288

position_set tmp; /* Temporary set for merging sets. */

2289

position_set merged; /* Result of merging sets. */

2290

int separate_contexts; /* Context wanted by some position. */

2290

2291

int *o_nullable;

2291

2292

size_t *o_nfirst, *o_nlast;

2292

2293

position *o_firstpos, *o_lastpos;

2324

2325

switch (d->tokens[i])

2325

2326

{

2326

2327

case EMPTY:

2327

/* The empty set is nullable. */

2328

/* The empty set is nullable. */

2328

2329

*nullable++ = 1;

2329

2330

/* The firstpos and lastpos of the empty leaf are both empty. */

2331

/* The firstpos and lastpos of the empty leaf are both empty. */

2331

2332

*nfirstpos++ = *nlastpos++ = 0;

2332

2333

break;

2333

2334

2335

case STAR:

2335

2336

case PLUS:

2336

2337

/* Every element in the firstpos of the argument is in the follow

2337

of every element in the lastpos. */

2338

of every element in the lastpos. */

2338

2339

tmp.nelem = nfirstpos[-1];

2339

2340

tmp.elems = firstpos;

2340

2341

pos = lastpos;

2345

2346

}

2346

2347

2348

case QMARK:

2348

/* A QMARK or STAR node is automatically nullable. */

2349

/* A QMARK or STAR node is automatically nullable. */

2349

2350

if (d->tokens[i] != PLUS)

2350

2351

nullable[-1] = 1;

2351

2352

break;

2352

2353

2354

case CAT:

2354

2355

/* Every element in the firstpos of the second argument is in the

2355

follow of every element in the lastpos of the first argument. */

2356

follow of every element in the lastpos of the first argument. */

2356

2357

tmp.nelem = nfirstpos[-1];

2357

2358

tmp.elems = firstpos;

2358

2359

pos = lastpos + nlastpos[-1];

2363

2364

}

2364

2365

2366

/* The firstpos of a CAT node is the firstpos of the first argument,

2366

union that of the second argument if the first is nullable. */

2367

union that of the second argument if the first is nullable. */

2367

2368

if (nullable[-2])

2368

2369

nfirstpos[-2] += nfirstpos[-1];

2369

2370

else

2371

2372

--nfirstpos;

2372

2373

2374

/* The lastpos of a CAT node is the lastpos of the second argument,

2374

union that of the first argument if the second is nullable. */

2375

union that of the first argument if the second is nullable. */

2375

2376

if (nullable[-1])

2376

2377

nlastpos[-2] += nlastpos[-1];

2377

2378

else

2384

2385

}

2385

2386

--nlastpos;

2386

2387

/* A CAT node is nullable if both arguments are nullable. */

2388

/* A CAT node is nullable if both arguments are nullable. */

2388

2389

nullable[-2] = nullable[-1] && nullable[-2];

2389

2390

--nullable;

2390

2391

break;

2391

2392

2393

case OR:

2393

/* The firstpos is the union of the firstpos of each argument. */

2394

/* The firstpos is the union of the firstpos of each argument. */

2394

2395

nfirstpos[-2] += nfirstpos[-1];

2395

2396

--nfirstpos;

2396

2397

/* The lastpos is the union of the lastpos of each argument. */

2398

/* The lastpos is the union of the lastpos of each argument. */

2398

2399

nlastpos[-2] += nlastpos[-1];

2399

2400

--nlastpos;

2400

2401

/* An OR node is nullable if either argument is nullable. */

2402

/* An OR node is nullable if either argument is nullable. */

2402

2403

nullable[-2] = nullable[-1] || nullable[-2];

2403

2404

--nullable;

2404

2405

break;

2408

2409

constructs like \< are treated as nonempty strings here;

2409

2410

an "epsilon closure" effectively makes them nullable later.

2410

2411

Backreferences have to get a real position so we can detect

2411

transitions on them later. But they are nullable. */

2412

transitions on them later. But they are nullable. */

2412

2413

*nullable++ = d->tokens[i] == BACKREF;

2413

2414

/* This position is in its own firstpos and lastpos. */

2415

/* This position is in its own firstpos and lastpos. */

2415

2416

*nfirstpos++ = *nlastpos++ = 1;

2416

2417

--firstpos, --lastpos;

2417

2418

firstpos->index = lastpos->index = i;

2418

2419

firstpos->constraint = lastpos->constraint = NO_CONSTRAINT;

2419

2420

/* Allocate the follow set for this position. */

2421

/* Allocate the follow set for this position. */

2421

2422

alloc_position_set (&d->follows[i], 1);

2422

2423

break;

2423

2424

}

2424

2425

#ifdef DEBUG

2425

/* ... balance the above nonsyntactic #ifdef goo... */

2426

/* ... balance the above nonsyntactic #ifdef goo... */

2426

2427

fprintf (stderr, "node %zd:", i);

2427

2428

prtok (d->tokens[i]);

2428

2429

putc ('\n', stderr);

2444

2445

}

2445

2446

2447

/* For each follow set that is the follow set of a real position, replace

2447

it with its epsilon closure. */

2448

it with its epsilon closure. */

2448

2449

for (i = 0; i < d->tindex; ++i)

2449

2450

if (d->tokens[i] < NOTCHAR || d->tokens[i] == BACKREF

2450

2451

#if MBS_SUPPORT

2469

2470

}

2470

2471

2472

/* Get the epsilon closure of the firstpos of the regexp. The result will

2472

be the set of positions of state 0. */

2473

be the set of positions of state 0. */

2473

2474

merged.nelem = 0;

2474

2475

for (i = 0; i < nfirstpos[-1]; ++i)

2475

2476

insert (firstpos[i], &merged);

2476

2477

epsclosure (&merged, d);

2477

2478

/* Build the initial state. */

2479

/* Build the initial state. */

2479

2480

d->salloc = 1;

2480

2481

d->sindex = 0;

2481

2482

MALLOC (d->states, d->salloc);

2523

2524

2525

If after comparing with every group there are characters remaining in C,

2525

2526

create a new group labeled with the characters of C and insert this

2526

position in that group. */

2527

position in that group. */

2527

2528

void

2528

2529

dfastate (state_num s, struct dfa *d, state_num trans[])

2529

2530

{

2530

leaf_set *grps; /* As many as will ever be needed. */

2531

charclass *labels; /* Labels corresponding to the groups. */

2532

size_t ngrps = 0; /* Number of groups actually used. */

2533

position pos; /* Current position being considered. */

2534

charclass matches; /* Set of matching characters. */

2535

int matchesf; /* True if matches is nonempty. */

2536

charclass intersect; /* Intersection with some label set. */

2537

int intersectf; /* True if intersect is nonempty. */

2538

charclass leftovers; /* Stuff in the label that didn't match. */

2539

int leftoversf; /* True if leftovers is nonempty. */

2540

position_set follows; /* Union of the follows of some group. */

2541

position_set tmp; /* Temporary space for merging sets. */

2542

int possible_contexts; /* Contexts that this group can match. */

2543

int separate_contexts; /* Context that new state wants to know. */

2544

state_num state; /* New state. */

2545

state_num state_newline; /* New state on a newline transition. */

2546

state_num state_letter; /* New state on a letter transition. */

2531

leaf_set *grps; /* As many as will ever be needed. */

2532

charclass *labels; /* Labels corresponding to the groups. */

2533

size_t ngrps = 0; /* Number of groups actually used. */

2534

position pos; /* Current position being considered. */

2535

charclass matches; /* Set of matching characters. */

2536

int matchesf; /* True if matches is nonempty. */

2537

charclass intersect; /* Intersection with some label set. */

2538

int intersectf; /* True if intersect is nonempty. */

2539

charclass leftovers; /* Stuff in the label that didn't match. */

2540

int leftoversf; /* True if leftovers is nonempty. */

2541

position_set follows; /* Union of the follows of some group. */

2542

position_set tmp; /* Temporary space for merging sets. */

2543

int possible_contexts; /* Contexts that this group can match. */

2544

int separate_contexts; /* Context that new state wants to know. */

2545

state_num state; /* New state. */

2546

state_num state_newline; /* New state on a newline transition. */

2547

state_num state_letter; /* New state on a letter transition. */

2547

2548

int next_isnt_1st_byte = 0; /* Flag if we can't add state0. */

2548

2549

size_t i, j, k;

2549

2550

2576

2577

continue;

2577

2578

2579

/* Some characters may need to be eliminated from matches because

2579

they fail in the current context. */

2580

they fail in the current context. */

2580

2581

if (pos.constraint != NO_CONSTRAINT)

2581

2582

{

2582

2583

if (!SUCCEEDS_IN_CONTEXT (pos.constraint,

2592

2593

for (j = 0; j < CHARCLASS_INTS; ++j)

2593

2594

matches[j] &= letters[j] | newline[j];

2594

2595

/* If there are no characters left, there's no point in going on. */

2596

/* If there are no characters left, there's no point in going on. */

2596

2597

for (j = 0; j < CHARCLASS_INTS && !matches[j]; ++j)

2597

2598

continue;

2598

2599

if (j == CHARCLASS_INTS)

2603

2604

{

2604

2605

/* If matches contains a single character only, and the current

2605

2606

group's label doesn't contain that character, go on to the

2606

next group. */

2607

next group. */

2607

2608

if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR

2608

2609

&& !tstbit (d->tokens[pos.index], labels[j]))

2609

2610

continue;

2610

2611

2612

/* Check if this group's label has a nonempty intersection with

2612

matches. */

2613

matches. */

2613

2614

intersectf = 0;

2614

2615

for (k = 0; k < CHARCLASS_INTS; ++k)

2615

2616

(intersect[k] = matches[k] & labels[j][k]) ? (intersectf = 1) : 0;

2616

2617

if (!intersectf)

2617

2618

continue;

2618

2619

/* It does; now find the set differences both ways. */

2620

/* It does; now find the set differences both ways. */

2620

2621

leftoversf = matchesf = 0;

2621

2622

for (k = 0; k < CHARCLASS_INTS; ++k)

2622

2623

{

2623

/* Even an optimizing compiler can't know this for sure. */

2624

/* Even an optimizing compiler can't know this for sure. */

2624

2625

int match = matches[k], label = labels[j][k];

2625

2626

2627

(leftovers[k] = ~match & label) ? (leftoversf = 1) : 0;

2627

2628

(matches[k] = match & ~label) ? (matchesf = 1) : 0;

2628

2629

}

2629

2630

/* If there were leftovers, create a new group labeled with them. */

2631

/* If there were leftovers, create a new group labeled with them. */

2631

2632

if (leftoversf)

2632

2633

{

2633

2634

copyset (leftovers, labels[ngrps]);

2644

2645

grps[j].elems[grps[j].nelem++] = pos.index;

2645

2646

2647

/* If every character matching the current position has been

2647

accounted for, we're done. */

2648

accounted for, we're done. */

2648

2649

if (!matchesf)

2649

2650

break;

2650

2651

}

2651

2652

2653

/* If we've passed the last group, and there are still characters

2653

unaccounted for, then we'll have to create a new group. */

2654

unaccounted for, then we'll have to create a new group. */

2654

2655

if (j == ngrps)

2655

2656

{

2656

2657

copyset (matches, labels[ngrps]);

2667

2668

2669

/* If we are a searching matcher, the default transition is to a state

2669

2670

containing the positions of state 0, otherwise the default transition

2670

is to fail miserably. */

2671

is to fail miserably. */

2671

2672

if (d->searchflag)

2672

2673

{

2673

/* Find the state(s) corresponding to the positions of state 0. */

2674

/* Find the state(s) corresponding to the positions of state 0. */

2674

2675

copy (&d->states[0].elems, &follows);

2675

2676

separate_contexts = state_separate_contexts (&follows);

2676

2677

state = state_index (d, &follows, separate_contexts ^ CTX_ANY);

2696

2697

follows.nelem = 0;

2697

2698

2699

/* Find the union of the follows of the positions of the group.

2699

This is a hideously inefficient loop. Fix it someday. */

2700

This is a hideously inefficient loop. Fix it someday. */

2700

2701

for (j = 0; j < grps[i].nelem; ++j)

2701

2702

for (k = 0; k < d->follows[grps[i].elems[j]].nelem; ++k)

2702

2703

insert (d->follows[grps[i].elems[j]].elems[k], &follows);

2733

2734

}

2734

2735

2736

/* If we are building a searching matcher, throw in the positions

2736

of state 0 as well. */

2737

of state 0 as well. */

2737

2738

if (d->searchflag

2738

2739

&& (!MBS_SUPPORT || (d->mb_cur_max == 1 || !next_isnt_1st_byte)))

2739

2740

for (j = 0; j < d->states[0].elems.nelem; ++j)

2740

2741

insert (d->states[0].elems.elems[j], &follows);

2741

2742

/* Find out if the new state will want any context information. */

2743

/* Find out if the new state will want any context information. */

2743

2744

possible_contexts = charclass_context (labels[i]);

2744

2745

separate_contexts = state_separate_contexts (&follows);

2745

2746

/* Find the state(s) corresponding to the union of the follows. */

2747

/* Find the state(s) corresponding to the union of the follows. */

2747

2748

if ((separate_contexts & possible_contexts) != possible_contexts)

2748

2749

state = state_index (d, &follows, separate_contexts ^ CTX_ANY);

2749

2750

else

2757

2758

else

2758

2759

state_letter = state;

2759

2760

/* Set the transitions for each character in the current label. */

2761

/* Set the transitions for each character in the current label. */

2761

2762

for (j = 0; j < CHARCLASS_INTS; ++j)

2762

2763

for (k = 0; k < INTBITS; ++k)

2763

2764

if (labels[i][j] & 1 << k)

2786

2787

is a non-accepting state, then d->trans[state] points to its table.

2787

2788

If it is an accepting state then d->fails[state] points to its table.

2788

2789

If it has no table at all, then d->trans[state] is NULL.

2789

TODO: Improve this comment, get rid of the unnecessary redundancy. */

2790

TODO: Improve this comment, get rid of the unnecessary redundancy. */

2790

2791

2792

static void

2792

2793

build_state (state_num s, struct dfa *d)

2793

2794

{

2794

state_num *trans; /* The new transition table. */

2795

state_num *trans; /* The new transition table. */

2795

2796

state_num i;

2796

2797

2798

/* Set an upper limit on the number of transition tables that will ever

2798

2799

exist at once. 1024 is arbitrary. The idea is that the frequently

2799

2800

used transition tables will be quickly rebuilt, whereas the ones that

2800

were only needed once or twice will be cleared away. */

2801

were only needed once or twice will be cleared away. */

2801

2802

if (d->trcount >= 1024)

2802

2803

{

2803

2804

for (i = 0; i < d->tralloc; ++i)

2811

2812

2813

++d->trcount;

2813

2814

/* Set up the success bits for this state. */

2815

/* Set up the success bits for this state. */

2815

2816

d->success[s] = 0;

2816

2817

if (ACCEPTS_IN_CONTEXT (d->states[s].context, CTX_NEWLINE, s, *d))

2817

2818

d->success[s] |= CTX_NEWLINE;

2825

2826

2827

/* Now go through the new transition table, and make sure that the trans

2827

2828

and fail arrays are allocated large enough to hold a pointer for the

2828

largest state mentioned in the table. */

2829

largest state mentioned in the table. */

2829

2830

for (i = 0; i < NOTCHAR; ++i)

2830

2831

if (trans[i] >= d->tralloc)

2831

2832

{

2846

2847

}

2847

2848

2849

/* Keep the newline transition in a special place so we can use it as

2849

a sentinel. */

2850

a sentinel. */

2850

2851

d->newlines[s] = trans[eolbyte];

2851

2852

trans[eolbyte] = -1;

2852

2853

2871

2872

2873

/* Multibyte character handling sub-routines for dfaexec. */

2873

2874

/* Initial state may encounter the byte which is not a single byte character

2875

nor 1st byte of a multibyte character. But it is incorrect for initial

2876

state to accept such a byte.

2877

For example, in sjis encoding the regular expression like "\\" accepts

2878

the codepoint 0x5c, but should not accept the 2nd byte of the codepoint

2879

0x815c. Then Initial state must skip the bytes which are not a single byte

2880

character nor 1st byte of a multibyte character. */

2875

/* The initial state may encounter a byte which is not a single byte character

2876

nor the first byte of a multibyte character. But it is incorrect for the

2877

initial state to accept such a byte. For example, in Shift JIS the regular

2878

expression "\\" accepts the codepoint 0x5c, but should not accept the second

2879

byte of the codepoint 0x815c. Then the initial state must skip the bytes

2880

that are not a single byte character nor the first byte of a multibyte

2881

character. */

2881

2882

#define SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p) \

2882

2883

if (s == 0) \

2883

2884

{ \

2898

2899

realloc_trans_if_necessary (struct dfa *d, state_num new_state)

2899

2900

{

2900

2901

/* Make sure that the trans and fail arrays are allocated large enough

2901

to hold a pointer for the new state. */

2902

to hold a pointer for the new state. */

2902

2903

if (new_state >= d->tralloc)

2903

2904

{

2904

2905

state_num oldalloc = d->tralloc;

2918

2919

}

2919

2920

}

2920

2921

/* Return values of transit_state_singlebyte(), and

2922

/* Return values of transit_state_singlebyte, and

2922

2923

transit_state_consume_1char. */

2923

2924

typedef enum

2924

2925

{

2928

2929

} status_transit_state;

2929

2930

2931

/* Consume a single byte and transit state from 's' to '*next_state'.

2931

This function is almost same as the state transition routin in dfaexec().

2932

This function is almost same as the state transition routin in dfaexec.

2932

2933

But state transition is done just once, otherwise matching succeed or

2933

2934

reach the end of the buffer. */

2934

2935

static status_transit_state

3012

3013

match_mb_charset (struct dfa *d, state_num s, position pos, size_t idx)

3013

3014

{

3014

3015

size_t i;

3015

int match; /* Flag which represent that matching succeed. */

3016

int match_len; /* Length of the character (or collating element)

3017

with which this operator match. */

3018

int op_len; /* Length of the operator. */

3016

int match; /* Matching succeeded. */

3017

int match_len; /* Length of the character (or collating element)

3018

with which this operator matches. */

3019

int op_len; /* Length of the operator. */

3019

3020

char buffer[128];

3020

3021

3022

/* Pointer to the structure to which we are currently referring. */

3109

3110

return match ? match_len : 0;

3110

3111

}

3111

3112

/* Check each of 'd->states[s].mbps.elem' can match or not. Then return the

3113

array which corresponds to 'd->states[s].mbps.elem' and each element of

3114

the array contains the amount of the bytes with which the element can

3115

match.

3116

'idx' is the index from the buf_begin, and it is the current position

3113

/* Check whether each of 'd->states[s].mbps.elem' can match. Then return the

3114

array which corresponds to 'd->states[s].mbps.elem'; each element of the

3115

array contains the number of bytes with which the element can match.

3116

3117

'idx' is the index from buf_begin, and it is the current position

3117

3118

in the buffer.

3118

Caller MUST free the array which this function return. */

3119

3120

The caller MUST free the array which this function return. */

3119

3121

static int *

3120

3122

check_matching_with_multibyte_ops (struct dfa *d, state_num s, size_t idx)

3121

3123

{

3142

3144

}

3143

3145

3144

3146

/* Consume a single character and enumerate all of the positions which can

3145

be next position from the state 's'.

3146

'match_lens' is the input. It can be NULL, but it can also be the output

3147

of check_matching_with_multibyte_ops() for optimization.

3147

be the next position from the state 's'.

3148

3149

'match_lens' is the input. It can be NULL, but it can also be the output

3150

of check_matching_with_multibyte_ops for optimization.

3151

3148

3152

'mbclen' and 'pps' are the output. 'mbclen' is the length of the

3149

character consumed, and 'pps' is the set this function enumerate. */

3153

character consumed, and 'pps' is the set this function enumerates. */

3150

3154

static status_transit_state

3151

3155

transit_state_consume_1char (struct dfa *d, state_num s,

3152

3156

unsigned char const **pp,

3203

3207

transit_state (struct dfa *d, state_num s, unsigned char const **pp)

3204

3208

{

3205

3209

state_num s1;

3206

int mbclen; /* The length of current input multibyte character. */

3210

int mbclen; /* The length of current input multibyte character. */

3207

3211

int maxlen = 0;

3208

3212

size_t i, j;

3209

3213

int *match_lens = NULL;

3339

3343

If COUNT is non-NULL, increment *COUNT once for each newline processed.

3340

3344

Finally, if BACKREF is non-NULL set *BACKREF to indicate whether we

3341

3345

encountered a back-reference (1) or not (0). The caller may use this

3342

to decide whether to fall back on a backtracking matcher. */

3346

to decide whether to fall back on a backtracking matcher. */

3343

3347

char *

3344

3348

dfaexec (struct dfa *d, char const *begin, char *end,

3345

3349

int allow_nl, size_t *count, int *backref)

3346

3350

{

3347

state_num s, s1; /* Current state. */

3348

unsigned char const *p; /* Current input character. */

3351

state_num s, s1; /* Current state. */

3352

unsigned char const *p; /* Current input character. */

3349

3353

state_num **trans, *t; /* Copy of d->trans so it can be optimized

3350

into a register. */

3354

into a register. */

3351

3355

unsigned char eol = eolbyte; /* Likewise for eolbyte. */

3352

3356

unsigned char saved_end;

3353

3357

3446

3450

continue;

3447

3451

}

3448

3452

3449

/* If the previous character was a newline, count it. */

3453

/* If the previous character was a newline, count it. */

3450

3454

if ((char *) p <= end && p[-1] == eol)

3451

3455

{

3452

3456

if (count)

3456

3460

prepare_wc_buf ((const char *) p, end);

3457

3461

}

3458

3462

3459

/* Check if we've run off the end of the buffer. */

3463

/* Check if we've run off the end of the buffer. */

3460

3464

if ((char *) p > end)

3461

3465

{

3462

3466

if (d->mb_cur_max > 1)

3517

3521

}

3518

3522

3519

3523

/* Initialize the components of a dfa that the other routines don't

3520

initialize for themselves. */

3524

initialize for themselves. */

3521

3525

void

3522

3526

dfainit (struct dfa *d)

3523

3527

{

3567

3571

d->mb_cur_max = 1;

3568

3572

}

3569

3573

3570

/* Parse and analyze a single string of the given length. */

3574

/* Parse and analyze a single string of the given length. */

3571

3575

void

3572

3576

dfacomp (char const *s, size_t len, struct dfa *d, int searchflag)

3573

3577

{

3578

3582

dfaanalyze (d, searchflag);

3579

3583

}

3580

3584

3581

/* Free the storage held by the components of a dfa. */

3585

/* Free the storage held by the components of a dfa. */

3582

3586

void

3583

3587

dfafree (struct dfa *d)

3584

3588

{

3673

3677

and q->left and q->right p->is : NULL

3674

3678

3675

3679

If there's anything else we recognize in the tree, all four sequences get set

3676

to zero-length sequences. If there's something we don't recognize in the tree,

3677

we just return a zero-length sequence.

3680

to zero-length sequences. If there's something we don't recognize in the

3681

tree, we just return a zero-length sequence.

3678

3682

3679

3683

Break ties in favor of infrequent letters (choosing 'zzz' in preference to

3680

3684

'aaa')?

3681

3685

3682

And. . .is it here or someplace that we might ponder "optimizations" such as

3686

And ... is it here or someplace that we might ponder "optimizations" such as

3683

3687

egrep 'psi|epsilon' -> egrep 'psi'

3684

3688

egrep 'pepsi|epsilon' -> egrep 'epsi'

3685

3689

(Yes, we now find "epsi" as a "string

3700

3704

3701

3705

Are optimizable r.e.'s likely to be used in real-life situations

3702

3706

(something like 'ab*' is probably unlikely; something like is

3703

'psi|epsilon' is likelier)? */

3707

'psi|epsilon' is likelier)? */

3704

3708

3705

3709

static char *

3706

3710

icatalloc (char *old, char const *new)

3761

3765

return NULL;

3762

3766

}

3763

3767

new[len] = '\0';

3764

/* Is there already something in the list that's new (or longer)? */

3768

/* Is there already something in the list that's new (or longer)? */

3765

3769

for (i = 0; cpp[i] != NULL; ++i)

3766

3770

if (istrstr (cpp[i], new) != NULL)

3767

3771

{

3768

3772

free (new);

3769

3773

return cpp;

3770

3774

}

3771

/* Eliminate any obsoleted strings. */

3775

/* Eliminate any obsoleted strings. */

3772

3776

j = 0;

3773

3777

while (cpp[j] != NULL)

3774

3778

if (istrstr (new, cpp[j]) == NULL)

3781

3785

cpp[j] = cpp[i];

3782

3786

cpp[i] = NULL;

3783

3787

}

3784

/* Add the new string. */

3788

/* Add the new string. */

3785

3789

REALLOC (cpp, i + 2);

3786

3790

cpp[i] = new;

3787

3791

cpp[i + 1] = NULL;

3789

3793

}

3790

3794

3791

3795

/* Given pointers to two strings, return a pointer to an allocated

3792

list of their distinct common substrings. Return NULL if something

3793

seems wild. */

3796

list of their distinct common substrings. Return NULL if something

3797

seems wild. */

3794

3798

static char **

3795

3799

comsubs (char *left, char const *right)

3796

3800

{

3850

3854

}

3851

3855

3852

3856

/* Given two lists of substrings, return a new list giving substrings

3853

common to both. */

3857

common to both. */

3854

3858

static char **

3855

3859

inboth (char **left, char **right)

3856

3860

{

3970

3974

3971

3975

rmp = --mp;

3972

3976

lmp = --mp;

3973

/* Guaranteed to be. Unlikely, but. . . */

3977

/* Guaranteed to be. Unlikely, but ... */

3974

3978

if (!STREQ (lmp->is, rmp->is))

3975

3979

lmp->is[0] = '\0';

3976

3980

/* Left side--easy */

4021

4025

lmp = --mp;

4022

4026

/* In. Everything in left, plus everything in

4023

4027

right, plus concatenation of

4024

left's right and right's left. */

4028

left's right and right's left. */

4025

4029

lmp->in = addlists (lmp->in, rmp->in);

4026

4030

if (lmp->in == NULL)

4027

4031

goto done;