2
* Copyright (c) 2004-2005 Sergey Lyubka <valenok@gmail.com>
5
* "THE BEER-WARE LICENSE" (Revision 42):
6
* Sergey Lyubka wrote this file. As long as you retain this notice you
7
* can do whatever you want with this stuff. If we meet some day, and you think
8
* this stuff is worth it, you can buy me a beer in return.
12
* Downloaded Sat Nov 5 17:43:06 CET 2011 at
13
* http://slre.sourceforge.net/1.0/slre.c
24
#include <linux/ctype.h>
25
#endif /* SLRE_TEST */
31
enum {END, BRANCH, ANY, EXACT, ANYOF, ANYBUT, OPEN, CLOSE, BOL, EOL,
32
STAR, PLUS, STARQ, PLUSQ, QUEST, SPACE, NONSPACE, DIGIT};
40
{"END", 0, ""}, /* End of code block or program */
41
{"BRANCH", 2, "oo"}, /* Alternative operator, "|" */
42
{"ANY", 0, ""}, /* Match any character, "." */
43
{"EXACT", 2, "d"}, /* Match exact string */
44
{"ANYOF", 2, "D"}, /* Match any from set, "[]" */
45
{"ANYBUT", 2, "D"}, /* Match any but from set, "[^]"*/
46
{"OPEN ", 1, "i"}, /* Capture start, "(" */
47
{"CLOSE", 1, "i"}, /* Capture end, ")" */
48
{"BOL", 0, ""}, /* Beginning of string, "^" */
49
{"EOL", 0, ""}, /* End of string, "$" */
50
{"STAR", 1, "o"}, /* Match zero or more times "*" */
51
{"PLUS", 1, "o"}, /* Match one or more times, "+" */
52
{"STARQ", 1, "o"}, /* Non-greedy STAR, "*?" */
53
{"PLUSQ", 1, "o"}, /* Non-greedy PLUS, "+?" */
54
{"QUEST", 1, "o"}, /* Match zero or one time, "?" */
55
{"SPACE", 0, ""}, /* Match whitespace, "\s" */
56
{"NONSPACE", 0, ""}, /* Match non-space, "\S" */
57
{"DIGIT", 0, ""} /* Match digit, "\d" */
59
#endif /* SLRE_TEST */
62
* Commands and operands are all unsigned char (1 byte long). All code offsets
63
* are relative to current address, and positive (always point forward). Data
64
* offsets are absolute. Commands with operands:
66
* BRANCH offset1 offset2
67
* Try to match the code block that follows the BRANCH instruction
68
* (code block ends with END). If no match, try to match code block that
69
* starts at offset1. If either of these match, jump to offset2.
71
* EXACT data_offset data_length
72
* Try to match exact string. String is recorded in data section from
73
* data_offset, and has length data_length.
76
* CLOSE capture_number
77
* If the user have passed 'struct cap' array for captures, OPEN
78
* records the beginning of the matched substring (cap->ptr), CLOSE
79
* sets the length (cap->len) for respective capture_number.
84
* *, +, ?, respectively. Try to gobble as much as possible from the
85
* matched buffer, until code block that follows these instructions
86
* matches. When the longest possible string is matched,
89
* STARQ, PLUSQ are non-greedy versions of STAR and PLUS.
92
static const char *meta_chars = "|.^$*+?()[\\";
97
print_character_set(FILE *fp, const unsigned char *p, int len)
101
for (i = 0; i < len; i++) {
103
(void) fputc(',', fp);
107
(void) fprintf(fp, "\\x%02x", p[i]);
109
(void) fprintf(fp, "%s", opcodes[p[i]].name);
110
} else if (isprint(p[i])) {
111
(void) fputc(p[i], fp);
113
(void) fprintf(fp, "\\x%02x", p[i]);
119
slre_dump(const struct slre *r, FILE *fp)
121
int i, j, ch, op, pc;
123
for (pc = 0; pc < r->code_size; pc++) {
126
(void) fprintf(fp, "%3d %s ", pc, opcodes[op].name);
128
for (i = 0; opcodes[op].flags[i] != '\0'; i++)
129
switch (opcodes[op].flags[i]) {
131
(void) fprintf(fp, "%d ", r->code[pc + 1]);
135
(void) fprintf(fp, "%d ",
136
pc + r->code[pc + 1] - i);
140
print_character_set(fp, r->data +
141
r->code[pc + 1], r->code[pc + 2]);
145
(void) fputc('"', fp);
146
for (j = 0; j < r->code[pc + 2]; j++) {
147
ch = r->data[r->code[pc + 1] + j];
149
(void) fputc(ch, fp);
155
(void) fputc('"', fp);
160
(void) fputc('\n', fp);
163
#endif /* SLRE_TEST */
166
set_jump_offset(struct slre *r, int pc, int offset)
168
assert(offset < r->code_size);
170
if (r->code_size - offset > 0xff)
171
r->err_str = "Jump offset is too big";
173
r->code[pc] = (unsigned char) (r->code_size - offset);
177
emit(struct slre *r, int code)
179
if (r->code_size >= (int) (sizeof(r->code) / sizeof(r->code[0])))
180
r->err_str = "RE is too long (code overflow)";
182
r->code[r->code_size++] = (unsigned char) code;
186
store_char_in_data(struct slre *r, int ch)
188
if (r->data_size >= (int) sizeof(r->data))
189
r->err_str = "RE is too long (data overflow)";
191
r->data[r->data_size++] = ch;
195
exact(struct slre *r, const char **re)
197
int old_data_size = r->data_size;
199
while (**re != '\0' && (strchr(meta_chars, **re)) == NULL)
200
store_char_in_data(r, *(*re)++);
203
emit(r, old_data_size);
204
emit(r, r->data_size - old_data_size);
208
get_escape_char(const char **re)
243
anyof(struct slre *r, const char **re)
245
int esc, old_data_size = r->data_size, op = ANYOF;
257
emit(r, old_data_size);
258
emit(r, r->data_size - old_data_size);
263
esc = get_escape_char(re);
264
if ((esc & 0xff) == 0) {
265
store_char_in_data(r, 0);
266
store_char_in_data(r, esc >> 8);
268
store_char_in_data(r, esc);
272
store_char_in_data(r, (*re)[-1]);
276
r->err_str = "No closing ']' bracket";
280
relocate(struct slre *r, int begin, int shift)
283
memmove(r->code + begin + shift, r->code + begin, r->code_size - begin);
284
r->code_size += shift;
288
quantifier(struct slre *r, int prev, int op)
290
if (r->code[prev] == EXACT && r->code[prev + 2] > 1) {
293
emit(r, r->code[prev + 1] + r->code[prev + 2]);
295
prev = r->code_size - 3;
297
relocate(r, prev, 2);
299
set_jump_offset(r, prev + 1, prev);
303
exact_one_char(struct slre *r, int ch)
306
emit(r, r->data_size);
308
store_char_in_data(r, ch);
312
fixup_branch(struct slre *r, int fixup)
316
set_jump_offset(r, fixup, fixup - 2);
321
compile(struct slre *r, const char **re)
323
int op, esc, branch_start, last_op, fixup, cap_no, level;
327
branch_start = last_op = r->code_size;
343
last_op = r->code_size;
347
last_op = r->code_size;
351
last_op = r->code_size;
352
esc = get_escape_char(re);
356
exact_one_char(r, esc);
359
last_op = r->code_size;
360
cap_no = ++r->num_caps;
365
if (*(*re)++ != ')') {
366
r->err_str = "No closing bracket";
375
fixup_branch(r, fixup);
377
r->err_str = "Unbalanced brackets";
385
op = (*re)[-1] == '*' ? STAR : PLUS;
388
op = op == STAR ? STARQ : PLUSQ;
390
quantifier(r, last_op, op);
393
quantifier(r, last_op, QUEST);
396
fixup_branch(r, fixup);
397
relocate(r, branch_start, 3);
398
r->code[branch_start] = BRANCH;
399
set_jump_offset(r, branch_start + 1, branch_start);
400
fixup = branch_start + 2;
401
r->code[fixup] = 0xff;
405
last_op = r->code_size;
412
slre_compile(struct slre *r, const char *re)
415
r->code_size = r->data_size = r->num_caps = r->anchored = 0;
420
emit(r, OPEN); /* This will capture what matches full RE */
426
if (r->code[2] == BRANCH)
433
return (r->err_str == NULL ? 1 : 0);
436
static int match(const struct slre *, int,
437
const char *, int, int *, struct cap *);
440
loop_greedy(const struct slre *r, int pc, const char *s, int len, int *ofs)
442
int saved_offset, matched_offset;
444
saved_offset = matched_offset = *ofs;
446
while (match(r, pc + 2, s, len, ofs, NULL)) {
448
if (match(r, pc + r->code[pc + 1], s, len, ofs, NULL))
449
matched_offset = saved_offset;
453
*ofs = matched_offset;
457
loop_non_greedy(const struct slre *r, int pc, const char *s, int len, int *ofs)
459
int saved_offset = *ofs;
461
while (match(r, pc + 2, s, len, ofs, NULL)) {
463
if (match(r, pc + r->code[pc + 1], s, len, ofs, NULL))
471
is_any_of(const unsigned char *p, int len, const char *s, int *ofs)
477
for (i = 0; i < len; i++)
487
is_any_but(const unsigned char *p, int len, const char *s, int *ofs)
493
for (i = 0; i < len; i++) {
503
match(const struct slre *r, int pc, const char *s, int len,
504
int *ofs, struct cap *caps)
506
int n, saved_offset, res = 1;
508
while (res && r->code[pc] != END) {
510
assert(pc < r->code_size);
511
assert(pc < (int) (sizeof(r->code) / sizeof(r->code[0])));
513
switch (r->code[pc]) {
516
res = match(r, pc + 3, s, len, ofs, caps);
519
res = match(r, pc + r->code[pc + 1],
522
pc += r->code[pc + 2];
526
n = r->code[pc + 2]; /* String length */
527
if (n <= len - *ofs && !memcmp(s + *ofs, r->data +
528
r->code[pc + 1], n)) {
537
if (!match(r, pc + 2, s, len, ofs, caps))
539
pc += r->code[pc + 1];
543
loop_greedy(r, pc, s, len, ofs);
544
pc += r->code[pc + 1];
548
loop_non_greedy(r, pc, s, len, ofs);
549
pc += r->code[pc + 1];
552
res = match(r, pc + 2, s, len, ofs, caps);
556
loop_greedy(r, pc, s, len, ofs);
557
pc += r->code[pc + 1];
560
res = match(r, pc + 2, s, len, ofs, caps);
564
loop_non_greedy(r, pc, s, len, ofs);
565
pc += r->code[pc + 1];
569
if (*ofs < len && isspace(((unsigned char *)s)[*ofs])) {
578
!isspace(((unsigned char *)s)[*ofs])) {
586
if (*ofs < len && isdigit(((unsigned char *)s)[*ofs])) {
603
res = is_any_of(r->data + r->code[pc + 1],
604
r->code[pc + 2], s, ofs);
610
res = is_any_but(r->data + r->code[pc + 1],
611
r->code[pc + 2], s, ofs);
615
res = *ofs == 0 ? 1 : 0;
619
res = *ofs == len ? 1 : 0;
624
caps[r->code[pc + 1]].ptr = s + *ofs;
629
caps[r->code[pc + 1]].len = (s + *ofs) -
630
caps[r->code[pc + 1]].ptr;
637
printf("unknown cmd (%d) at %d\n", r->code[pc], pc);
647
slre_match(const struct slre *r, const char *buf, int len,
650
int i, ofs = 0, res = 0;
653
res = match(r, 0, buf, len, &ofs, caps);
655
for (i = 0; i < len && res == 0; i++) {
657
res = match(r, 0, buf, len, &ofs, caps);
667
int main(int argc, char *argv[])
670
struct cap caps[N_CAPS];
671
unsigned char data[1 * 1024 * 1024];
676
fprintf(stderr, "Usage: %s 'slre' <file>\n", argv[0]);
680
fp = fopen(argv[2], "rb");
682
fprintf(stderr, "Error: cannot open %s:%s\n",
683
argv[2], strerror(errno));
687
if (!slre_compile(&slre, argv[1])) {
688
fprintf(stderr, "Error compiling slre: %s\n", slre.err_str);
692
slre_dump(&slre, stderr);
694
while (fgets(data, sizeof(data), fp) != NULL) {
697
if ((len > 0) && (data[len-1] == '\n')) {
702
printf("Data = \"%s\"\n", data);
704
(void) memset(caps, 0, sizeof(caps));
708
res = slre_match(&slre, data, len, caps);
709
printf("Result [%d]: %d\n", i, res);
711
for (i = 0; i < N_CAPS; i++) {
712
if (caps[i].len > 0) {
713
printf("Substring %d: len=%d [%.*s]\n", i,
715
caps[i].len, caps[i].ptr);
718
printf("----------------------------------------------------\n");
724
#endif /* SLRE_TEST */