2
* Parse a regular expression, and extract a static suffix.
4
* Copyright (C) 2007-2008 Sourcefire, Inc.
8
* This program is free software; you can redistribute it and/or modify
9
* it under the terms of the GNU General Public License version 2 as
10
* published by the Free Software Foundation.
12
* This program is distributed in the hope that it will be useful,
13
* but WITHOUT ANY WARRANTY; without even the implied warranty of
14
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
* GNU General Public License for more details.
17
* You should have received a copy of the GNU General Public License
18
* along with this program; if not, write to the Free Software
19
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
23
#include "clamav-config.h"
36
#include "jsparse/textbuf.h"
37
#include "regex_suffix.h"
38
#define MODULE "regex_suffix: "
46
leaf, /* a character */
47
leaf_class /* character class */
48
/* (x)+ is transformed into (x)*(x) */
59
uint8_t* leaf_class_bitmap;
64
/* --- Prototypes --*/
65
static int build_suffixtree_descend(struct node *n, struct text_buffer *buf, suffix_callback cb, void *cbdata, struct regex_list *regex);
66
/* -----------------*/
68
static uint8_t dot_bitmap[32] = {
69
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
70
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
71
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
72
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
75
static struct node* make_node(enum node_type type, struct node *left, struct node *right)
84
n = cli_malloc(sizeof(*n));
89
n->u.children.left = left;
90
n->u.children.right = right;
98
static struct node *dup_node(struct node *p)
100
struct node *node_left, *node_right;
105
d = cli_malloc(sizeof(*d));
112
d->u.leaf_char = p->u.leaf_char;
115
d->u.leaf_class_bitmap = cli_malloc(32);
116
if(!d->u.leaf_class_bitmap)
118
memcpy(d->u.leaf_class_bitmap, p->u.leaf_class_bitmap, 32);
121
node_left = dup_node(p->u.children.left);
122
node_right = dup_node(p->u.children.right);
123
d->u.children.left = node_left;
124
d->u.children.right = node_right;
126
node_left->parent = d;
128
node_right->parent = d;
134
static struct node *make_charclass(uint8_t *bitmap)
136
struct node *v = cli_malloc(sizeof(*v));
139
v->type = leaf_class;
141
v->u.leaf_class_bitmap = bitmap;
145
static struct node *make_leaf(char c)
147
struct node *v = cli_malloc(sizeof(*v));
156
static void destroy_tree(struct node *n)
164
destroy_tree(n->u.children.left);
165
destroy_tree(n->u.children.right);
168
if(n->u.leaf_class_bitmap != dot_bitmap)
169
free(n->u.leaf_class_bitmap);
178
static uint8_t* parse_char_class(const char *pat, size_t *pos)
180
unsigned char range_start=0;
182
uint8_t* bitmap = cli_malloc(32);
185
if (pat[*pos]=='^') {
186
memset(bitmap,0xFF,32);/*match chars not in brackets*/
190
memset(bitmap,0x00,32);
192
/* literal ] can be first character, so test for it at the end of the loop, for example: []] */
193
if (pat[*pos]=='-' && hasprev) {
195
unsigned char range_end;
200
if (pat[*pos+1]=='.') {
201
/* collating sequence not handled */
203
/* we are parsing the regex for a
204
* filter, be conservative and
205
* tell the filter that anything could
207
while(pat[*pos] != ']') ++*pos;
209
while(pat[*pos] != ']') ++*pos;
213
range_end = pat[*pos];
215
range_end = pat[*pos];
216
for(c=range_start+1;c<=range_end;c++)
217
bitmap[c>>3] ^= 1<<(c&0x7);
220
else if (pat[*pos]=='[' && pat[*pos]==':') {
223
while(pat[*pos] != ']') ++*pos;
225
while(pat[*pos] != ']') ++*pos;
228
bitmap[pat[*pos]>>3] ^= 1<<(pat[*pos]&0x7);
229
range_start = pat[*pos];
233
} while(pat[*pos]!=']');
237
static struct node* parse_regex(const char *p, size_t *last)
239
struct node *v = NULL;
243
while(p[*last] != '$' && p[*last] != '\0') {
247
right = parse_regex(p, last);
248
v = make_node(alternate, v, right);
254
v = make_node(optional, v, NULL);
261
tmp = make_node(optional, v, NULL);
268
/* (x)*(x) => (x)+ */
269
v = make_node(concat, tmp, right);
276
right = parse_regex(p, last);
280
v = make_node(concat, v, right);
285
right = make_charclass(dot_bitmap);
288
v = make_node(concat, v, right);
295
right = make_charclass( parse_char_class(p, last) );
298
v = make_node(concat, v, right);
304
/* next char is escaped, advance pointer
305
* and let fall-through handle it */
308
right = make_leaf(p[*last]);
309
v = make_node(concat, v, right);
319
#define BITMAP_HASSET(b, i) (b[i>>3] & (1<<(i&7)))
321
static int build_suffixtree_ascend(struct node *n, struct text_buffer *buf, struct node *prev, suffix_callback cb, void *cbdata, struct regex_list *regex)
328
textbuffer_putc(buf, '\0');
329
if(cb(cbdata, buf->data, buf->pos-1, regex) < 0)
333
textbuffer_putc(buf, n->u.leaf_char);
339
if (BITMAP_HASSET(n->u.leaf_class_bitmap, i))
342
textbuffer_putc(buf, '\0');
343
if(cb(cbdata, buf->data, buf->pos-1, regex) < 0)
347
/* handle small classes by expanding */
349
if(BITMAP_HASSET(n->u.leaf_class_bitmap, i)) {
352
textbuffer_putc(buf, i);
353
if(build_suffixtree_ascend(n->parent, buf, n, cb, cbdata, regex) < 0)
360
if(prev != n->u.children.left) {
361
if(build_suffixtree_descend(n->u.children.left, buf, cb, cbdata, regex) < 0)
363
/* we're done here, descend will call
364
* ascend if needed */
374
textbuffer_putc(buf, '\0');
375
if(cb(cbdata, buf->data, buf->pos-1, regex) < 0)
384
static int build_suffixtree_descend(struct node *n, struct text_buffer *buf, suffix_callback cb, void *cbdata, struct regex_list *regex)
387
while(n && n->type == concat) {
388
n = n->u.children.right;
392
/* find out end of the regular expression,
393
* if it ends with a static pattern */
396
/* save pos as restart point */
398
if(build_suffixtree_descend(n->u.children.left, buf, cb, cbdata, regex) < 0)
401
if(build_suffixtree_descend(n->u.children.right, buf, cb, cbdata, regex) < 0)
406
textbuffer_putc(buf, '\0');
407
if(cb(cbdata, buf->data, buf->pos-1, regex) < 0)
412
if(build_suffixtree_ascend(n, buf, NULL, cb, cbdata, regex) < 0)
421
int cli_regex2suffix(const char *pattern, regex_t *preg, suffix_callback cb, void *cbdata)
423
struct regex_list regex;
424
struct text_buffer buf;
425
struct node root_node;
433
rc = cli_regcomp(regex.preg, pattern, REG_EXTENDED);
435
size_t buflen = cli_regerror(rc, regex.preg, NULL, 0);
436
char *errbuf = cli_malloc(buflen);
438
cli_regerror(rc, regex.preg, errbuf, buflen);
439
cli_errmsg(MODULE "Error compiling regular expression %s: %s\n", pattern, errbuf);
442
cli_errmsg(MODULE "Error compiling regular expression: %s\n", pattern);
447
regex.pattern = cli_strdup(pattern);
449
n = parse_regex(pattern, &last);
452
memset(&buf, 0, sizeof(buf));
453
memset(&root_node, 0, sizeof(buf));
454
n->parent = &root_node;
456
rc = build_suffixtree_descend(n, &buf, cb, cbdata, ®ex);