2
* Definitions etc. for regexp(3) routines.
4
* Caveat: this is V8 regexp(3) [actually, a reimplementation thereof],
5
* not the System V one.
11
#define rep_NSUBEXP 10
13
typedef enum rep_regtype {
18
typedef union rep_regsubs {
20
char *startp[rep_NSUBEXP];
21
char *endp[rep_NSUBEXP];
24
repv startp[rep_NSUBEXP];
25
repv endp[rep_NSUBEXP];
29
typedef struct rep_regexp {
33
u_char regstart; /* Internal use only. */
34
char reganch; /* Internal use only. */
35
char *regmust; /* Internal use only. */
36
int regmlen; /* Internal use only. */
37
int regsize; /* actual size of regexp structure */
38
char program[1]; /* Unwarranted chumminess with compiler. */
41
/* Data structure used to save and restore regexp data internally */
42
struct rep_saved_regexp_data {
43
struct rep_saved_regexp_data *next;
49
/* eflags for regexec2() */
50
#define rep_REG_NOTBOL 1 /* start of input isn't start of line */
51
#define rep_REG_NOCASE 2 /* fold upper and lower case */
52
#define rep_REG_1LINE 4 /* for regexec_tx: only search to the
53
end of the line for the start of the
56
#define rep_regexec(p,s) rep_regexec2(p,s,0)
58
extern rep_regexp *rep_regcomp(char *);
59
extern int rep_regexec2(rep_regexp *, char *, int);
60
extern int rep_regmatch_string(rep_regexp *, char *, int);
62
extern int rep_regexp_max_depth;
65
/* Only include the internal stuff if it's explicitly requested, since
66
it comtaminates the namespace.. */
68
#ifdef rep_NEED_REGEXP_INTERNALS
71
* Structure for regexp "program". This is essentially a linear encoding of
72
* a nondeterministic finite-state machine (aka syntax charts or "railroad
73
* normal form" in parsing technology). Each node is an opcode plus a "next"
74
* pointer, possibly plus an operand. "Next" pointers of all nodes except
75
* BRANCH implement concatenation; a "next" pointer with a BRANCH on both
76
* ends of it is connecting two alternatives. (Here we have one of the
77
* subtle syntax dependencies: an individual BRANCH (as opposed to a
78
* collection of them) is never concatenated with anything because of
79
* operator precedence.) The operand of some types of node is a literal
80
* string; for others, it is a node leading into a sub-FSM. In particular,
81
* the operand of a BRANCH node is the first node of the branch. (NB this is
82
* *not* a tree structure: the tail of the branch connects to the thing
83
* following the set of BRANCHes.) The opcodes are:
86
/* definition number opnd? meaning */
87
#define END 0 /* no End of program. */
88
#define BOL 1 /* no Match "" at beginning of line. */
89
#define EOL 2 /* no Match "" at end of line. */
90
#define ANY 3 /* no Match any one character. */
91
#define ANYOF 4 /* str Match any character in this string. */
92
#define ANYBUT 5 /* str Match any character not in this
94
#define BRANCH 6 /* node Match this alternative, or the
96
#define BACK 7 /* no Match "", "next" ptr points backward. */
97
#define EXACTLY 8 /* str Match this string. */
98
#define NOTHING 9 /* no Match empty string. */
99
#define STAR 10 /* node Match this (simple) thing 0 or more
101
#define PLUS 11 /* node Match this (simple) thing 1 or more
103
#define WORD 12 /* no Match alphanumeric or _ char */
104
#define NWORD 13 /* no Match non-(alphanumeric or _) char */
105
#define WSPC 14 /* no Match whitespace char */
106
#define NWSPC 15 /* no Match non-whitespace char */
107
#define DIGI 16 /* no Match digit char */
108
#define NDIGI 17 /* no Match non-digit char */
109
#define WEDGE 18 /* no Match "" at word boundary */
110
#define NWEDGE 19 /* no Match "" not at word boundary */
111
#define OPEN 20 /* no Mark this point in input as start of
113
/* OPEN+1 is number 1, etc. */
114
#define CLOSE 30 /* no Analogous to OPEN. */
115
#define NGSTAR 40 /* node Match this (simple) thing 0 or more
116
times (non-greedily) */
117
#define NGPLUS 41 /* node Match this (simple) thing 1 or more
118
times (non-greedily) */
123
* BRANCH The set of branches constituting a single choice are hooked together
124
* with their "next" pointers, since precedence prevents anything being
125
* concatenated to any individual branch. The "next" pointer of the last
126
* BRANCH in a choice points to the thing following the whole choice. This
127
* is also where the final "next" pointer of each individual branch points;
128
* each branch starts with the operand node of a BRANCH node.
130
* BACK Normal "next" pointers all implicitly point forward; BACK exists to
131
* make loop structures possible.
133
* STAR,PLUS '?', and complex '*' and '+', are implemented as circular
134
* BRANCH structures using BACK. Simple cases (one character per match) are
135
* implemented with STAR and PLUS for speed and to minimize recursive
138
* OPEN,CLOSE ...are numbered at compile time.
142
* A node is one char of opcode followed by two chars of "next" pointer.
143
* "Next" pointers are stored as two 8-bit pieces, high order first. The
144
* value is a positive offset from the opcode of the node containing it. An
145
* operand, if any, simply follows the node. (Note that much of the code
146
* generation knows about this implicit relationship.)
148
* Using two bytes for the "next" pointer is vast overkill for most things, but
149
* allows patterns to get big without disasters.
152
#define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
153
#define OPERAND(p) ((p) + 3)
157
* The first byte of the regexp internal "program" is actually this magic
158
* number; the start node begins in the second byte.
162
#endif /* rep_NEED_REGEXP_INTERNALS */
164
#endif /* REP_REGEXP_H */