~ubuntu-branches/ubuntu/hardy/clamav/hardy-security

« back to all changes in this revision

Viewing changes to libclamav/regex_list.c

  • Committer: Bazaar Package Importer
  • Author(s): Jamie Strandboge
  • Date: 2009-04-30 14:44:26 UTC
  • mfrom: (0.28.3 sid)
  • Revision ID: james.westby@ubuntu.com-20090430144426-933t29chbo6phaa7
Tags: 0.94.dfsg.2-1ubuntu0.3~hardy4
No change rebuild from backports for use with ClamAV 0.94

Show diffs side-by-side

added added

removed removed

Lines of Context:
1
1
/*
2
2
 *  Match a string against a list of patterns/regexes.
3
3
 *
4
 
 *  Copyright (C) 2006-2007 T�r�k Edvin <edwin@clamav.net>
 
4
 *  Copyright (C) 2007-2008 Sourcefire, Inc.
 
5
 *
 
6
 *  Authors: Török Edvin
5
7
 *
6
8
 *  This program is free software; you can redistribute it and/or modify
7
 
 *  it under the terms of the GNU General Public License version 2 as 
 
9
 *  it under the terms of the GNU General Public License version 2 as
8
10
 *  published by the Free Software Foundation.
9
11
 *
10
12
 *  This program is distributed in the hope that it will be useful,
16
18
 *  along with this program; if not, write to the Free Software
17
19
 *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
18
20
 *  MA 02110-1301, USA.
19
 
 *
20
21
 */
21
22
 
22
23
#if HAVE_CONFIG_H
33
34
#endif
34
35
#endif
35
36
 
36
 
 
37
 
/* TODO: when implementation of new version is complete, enable it in CL_EXPERIMENTAL */
38
 
#ifdef CL_EXPERIMENTAL
39
 
/*#define USE_NEW_VERSION*/
40
 
#endif
41
 
 
42
 
#ifndef USE_NEW_VERSION
43
 
/*this is the old version of regex_list.c
44
 
 *reason for redesign: there is only one node type that has to handle all the cases: binary search among children, alternatives list, match.
45
 
 * This design is very error-prone.*/
46
 
 
47
37
#include <stdio.h>
48
38
#include <stdlib.h>
49
39
#include <string.h>
50
40
#include <ctype.h>
 
41
#include <zlib.h>
51
42
 
52
43
#include <limits.h>
53
44
#include <sys/types.h>
 
45
#include <assert.h>
 
46
 
54
47
 
55
48
#include "regex/regex.h"
56
49
 
59
52
#include "others.h"
60
53
#include "regex_list.h"
61
54
#include "matcher-ac.h"
 
55
#include "matcher.h"
62
56
#include "str.h"
63
 
 
64
 
/*Tree*/
65
 
enum token_op_t {OP_CHAR,OP_STDCLASS,OP_CUSTOMCLASS,OP_DOT,OP_LEAF,OP_ROOT,OP_PARCLOSE};
66
 
typedef unsigned char* char_bitmap_p;
67
 
/*
68
 
 *
69
 
 * OP_CHAR: 1 character, c = character
70
 
 * complex stuff:
71
 
 * OP_STDCLASS: standard character class, c = char class, class: 1<<(index into std_class of class name)
72
 
 * OP_CUSTOMCLASS: custom character class, first pointer in ptr array is a pointer to the bitmap table for this class
73
 
 * OP_DOT: single . matching any character except \n
74
 
 * OP_LEAF: this is a leaf node, reinterpret structure
75
 
 */
76
 
struct tree_node {
77
 
        struct tree_node* next;/* next regex/complex sibling, or parent, if no more siblings , can't be NULL except for root node*/
78
 
        unsigned char c;
79
 
        enum token_op_t op;
80
 
        char alternatives;/* number of (non-regex) children of node, i.e. sizeof(children)*/
81
 
        char listend;/* no more siblings, next pointer is pointer to parent*/
82
 
        union {
83
 
                struct tree_node** children;/* alternatives nr. of children, followed by (a null pointer terminated) regex leaf node pointers) */
84
 
                char_bitmap_p* bitmap;
85
 
                struct leaf_info*  leaf;
86
 
        } u;
87
 
};
88
 
 
89
 
struct leaf_info {
90
 
        char* info;/* what does it mean that we reached the leaf...*/
91
 
        regex_t* preg;/* this is NULL if leaf node, and non-regex*/
92
 
};
93
 
 
94
 
/* Character classes */
95
 
static const char* std_class[] = {
96
 
        "[:alnum:]",
97
 
        "[:digit:]",
98
 
        "[:punct:]",
99
 
        "[:alpha:]",
100
 
        "[:graph:]",
101
 
        "[:space:]",
102
 
        "[:blank:]",
103
 
        "[:lower:]", 
104
 
        "[:upper:]",
105
 
        "[:cntrl:]",
106
 
        "[:print:]",
107
 
        "[:xdigit:]"
108
 
        /* don't change the order of these strings, unless you change them in generate_tables.c too, and regenerate the tables*/
109
 
};
110
 
 
111
 
 
112
 
#define STD_CLASS_CNT sizeof(std_class)/sizeof(std_class[0])
113
 
 
114
 
/* generated by contrib/phishing/generate_tables.c */
115
 
static const unsigned char char_class_bitmap[STD_CLASS_CNT][32] = {
116
 
        {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x03, 
117
 
         0xfe, 0xff, 0xff, 0x07, 0xfe, 0xff, 0xff, 0x07, 
118
 
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
119
 
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
120
 
 
121
 
        {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x03, 
122
 
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
123
 
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
124
 
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
125
 
 
126
 
        {0x00, 0x00, 0x00, 0x00, 0xfe, 0xff, 0x00, 0xfc, 
127
 
         0x01, 0x00, 0x00, 0xf8, 0x01, 0x00, 0x00, 0x78, 
128
 
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
129
 
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
130
 
 
131
 
        {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
132
 
         0xfe, 0xff, 0xff, 0x07, 0xfe, 0xff, 0xff, 0x07, 
133
 
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
134
 
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
135
 
 
136
 
        {0x00, 0x00, 0x00, 0x00, 0xfe, 0xff, 0xff, 0xff, 
137
 
         0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f, 
138
 
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
139
 
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
140
 
 
141
 
        {0x00, 0x3e, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 
142
 
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
143
 
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
144
 
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
145
 
 
146
 
        {0x00, 0x02, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 
147
 
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
148
 
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
149
 
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
150
 
 
151
 
        {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
152
 
         0x00, 0x00, 0x00, 0x00, 0xfe, 0xff, 0xff, 0x07, 
153
 
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
154
 
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
155
 
 
156
 
        {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
157
 
         0xfe, 0xff, 0xff, 0x07, 0x00, 0x00, 0x00, 0x00, 
158
 
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
159
 
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
160
 
 
161
 
        {0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 
162
 
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 
163
 
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
164
 
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
165
 
 
166
 
        {0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 
167
 
         0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f, 
168
 
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
169
 
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
170
 
 
171
 
        {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x03, 
172
 
         0x7e, 0x00, 0x00, 0x00, 0x7e, 0x00, 0x00, 0x00, 
173
 
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
174
 
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}
175
 
};
176
 
 
177
 
static const unsigned short int char_class[256] = {
178
 
        0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x260, 0x220, 0x220, 0x220, 0x220, 0x200, 0x200, 
179
 
        0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 
180
 
        0x460, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 
181
 
        0xc13, 0xc13, 0xc13, 0xc13, 0xc13, 0xc13, 0xc13, 0xc13, 0xc13, 0xc13, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 
182
 
        0x414, 0xd19, 0xd19, 0xd19, 0xd19, 0xd19, 0xd19, 0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 
183
 
        0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 0x414, 0x414, 0x414, 0x414, 0x414, 
184
 
        0x414, 0xc99, 0xc99, 0xc99, 0xc99, 0xc99, 0xc99, 0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 
185
 
        0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 0x414, 0x414, 0x414, 0x414, 0x200, 
186
 
        0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 
187
 
        0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 
188
 
        0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 
189
 
        0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 
190
 
        0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 
191
 
        0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 
192
 
        0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 
193
 
        0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000
194
 
};
195
 
 
196
 
static const size_t std_class_cnt =  sizeof(std_class)/sizeof(std_class[0]);
197
 
 
 
57
#include "readdb.h"
 
58
#include "jsparse/textbuf.h"
 
59
#include "regex_suffix.h"
198
60
/* Prototypes */
199
 
static int add_pattern(struct regex_matcher* matcher,const unsigned char* pat,const char* info,int hostOnly);
200
 
static int match_node(struct tree_node* node,const unsigned char* c,size_t len,const char** info);
201
 
static void destroy_tree(struct regex_matcher* matcher);
202
 
static struct tree_node* tree_root_alloc(void);
203
 
static int build_regex_list(struct regex_matcher* matcher);
204
 
static void stack_destroy(struct node_stack* stack);
205
 
 
206
 
#ifndef NDEBUG
207
 
void dump_tree(struct tree_node* root);
208
 
#endif
209
 
 
210
 
#define MATCH_SUCCESS 0 
 
61
static regex_t *new_preg(struct regex_matcher *matcher);
 
62
static size_t reverse_string(char *pattern);
 
63
static int add_pattern_suffix(void *cbdata, const char *suffix, size_t suffix_len, const struct regex_list *regex);
 
64
static int add_static_pattern(struct regex_matcher *matcher, char* pattern);
 
65
/* ---------- */
 
66
 
 
67
/* ----- shift-or filtering -------------- */
 
68
 
 
69
#define BITMAP_CONTAINS(bmap, val) ((bmap)[(val) >> 5] & (1 << ((val) & 0x1f)))
 
70
#define BITMAP_INSERT(bmap, val) ((bmap)[(val) >> 5] |= (1 << ((val) & 0x1f)))
 
71
 
 
72
static void SO_init(struct filter *m)
 
73
{
 
74
        memset(m->B, ~0, sizeof(m->B));
 
75
        memset(m->end, ~0, sizeof(m->end));
 
76
        memset(m->end_fast, ~0, sizeof(m->end_fast));
 
77
}
 
78
 
 
79
/* because we use uint32_t */
 
80
#define MAXSOPATLEN 32
 
81
 
 
82
/* merge another pattern into the filter
 
83
 * add('abc'); add('bcd'); will match [ab][bc][cd] */
 
84
static int SO_preprocess_add(struct filter *m, const unsigned char *pattern, size_t len)
 
85
{
 
86
        uint16_t q;
 
87
        uint8_t j;
 
88
 
 
89
        /* cut length, and make it modulo 2 */
 
90
        if(len > MAXSOPATLEN) {
 
91
                len = MAXSOPATLEN;
 
92
        } else {
 
93
                /* we use 2-grams, must be multiple of 2 */
 
94
                len = len & ~1;
 
95
        }
 
96
        if(!len)
 
97
                return 0;
 
98
 
 
99
        /* Shift-Or like preprocessing */
 
100
        for(j=0;j < len-1;j++) {
 
101
                /* use overlapping 2-grams. We need them overlapping because matching can start at any position */
 
102
                q = cli_readint16( &pattern[j] );
 
103
                m->B[q] &= ~(1 << j);
 
104
        }
 
105
        /* we use variable length patterns, use last character to mark pattern end,
 
106
         * can lead to false positives.*/
 
107
        /* mark that at state j, the q-gram q can end the pattern */
 
108
        if(j) {
 
109
                j--;
 
110
                m->end[q] &= ~(1 << j);
 
111
                m->end_fast[pattern[j+1]] &= ~(1<<j);
 
112
        }
 
113
        return 0;
 
114
}
 
115
 
 
116
/* this is like a FSM, with multiple active states at the same time.
 
117
 * each bit in "state" means an active state, when a char is encountered
 
118
 * we determine what states can remain active.
 
119
 * The FSM transition rules are expressed as bit-masks */
 
120
long SO_search(const struct filter *m, const unsigned char *data, unsigned long len)
 
121
{
 
122
        size_t j;
 
123
        uint32_t state = ~0;
 
124
        const uint32_t *B = m->B;
 
125
        const uint32_t *End = m->end;
 
126
        const uint32_t *EndFast = m->end_fast;
 
127
 
 
128
        /* cut length, and make it modulo 2 */
 
129
        if(len > MAXSOPATLEN) {
 
130
                len = MAXSOPATLEN;
 
131
        } else {
 
132
                /* we use 2-grams, must be multiple of 2 */
 
133
                len = len & ~1;
 
134
        }
 
135
        if(!len) return -1;
 
136
        /* Shift-Or like search algorithm */
 
137
        for(j=0;j < len-1; j++) {
 
138
                const uint16_t q0 = cli_readint16( &data[j] );
 
139
                uint32_t match_end;
 
140
                state = (state << 1) | B[q0];
 
141
                /* state marks with a 0 bit all active states
 
142
                 * End[q0] marks with a 0 bit all states where the q-gram 'q' can end a pattern
 
143
                 * if we got two 0's at matching positions, it means we encountered a pattern's end */
 
144
                match_end = state | EndFast[data[j+1]];
 
145
                if((match_end != 0xffffffff) && (state | End[q0]) !=  0xffffffff) {
 
146
                        /* note: we rely on short-circuit eval here, we only evaluate and fetch End[q0], if
 
147
                         * end_fast has matched. This reduces cache pressure on End[], and allows us to keep the working
 
148
                         * set inside L2 */
 
149
 
 
150
                        /* if state is reachable, and this character can finish a pattern, assume match */
 
151
                        /* to reduce false positives check if qgram can finish the pattern */
 
152
                        /* return position of probable match */
 
153
                        /* find first 0 starting from MSB, the position of that bit as counted from LSB, is the length of the
 
154
                         * longest pattern that could match */
 
155
                        return j >= MAXSOPATLEN  ? j - MAXSOPATLEN : 0;
 
156
                }
 
157
        }
 
158
        /* no match */
 
159
        return -1;
 
160
}
 
161
 
 
162
/* ----------------------------------------------------------- */
 
163
 
 
164
 
 
165
#define MATCH_SUCCESS 0
211
166
#define MATCH_FAILED  -1
212
167
 
213
168
/*
236
191
                realpos++;
237
192
        }
238
193
        while(str[realpos]==' ') realpos++;
239
 
        cli_dbgmsg("calc_pos_with_skip:%s\n",str+realpos);      
 
194
        cli_dbgmsg("calc_pos_with_skip:%s\n",str+realpos);
240
195
        return (pos>0 && !str[realpos]) ? '\0' : str[realpos>0?realpos-1:0];
241
196
}
242
197
 
 
198
static int validate_subdomain(const struct regex_list *regex, const struct pre_fixup_info *pre_fixup, const char *buffer, size_t buffer_len, char *real_url, size_t real_len, char *orig_real_url)
 
199
{
 
200
        char c;
 
201
        size_t match_len;
 
202
 
 
203
        if(!regex || !regex->pattern)
 
204
                return 0;
 
205
        match_len = strlen(regex->pattern);
 
206
        if(((c=get_char_at_pos_with_skip(pre_fixup,buffer,buffer_len+1))==' ' || c=='\0' || c=='/' || c=='?') &&
 
207
                        (match_len == buffer_len || /* full match */
 
208
                         (match_len < buffer_len &&
 
209
                          ((c=get_char_at_pos_with_skip(pre_fixup,buffer,buffer_len-match_len))=='.' || (c==' ')) )
 
210
                         /* subdomain matched*/)) {
 
211
                /* we have an extra / at the end */
 
212
                if(match_len > 0) match_len--;
 
213
                cli_dbgmsg("Got a match: %s with %s\n", buffer, regex->pattern);
 
214
                cli_dbgmsg("Before inserting .: %s\n", orig_real_url);
 
215
                if(real_len >= match_len + 1) {
 
216
                        const size_t pos = real_len - match_len - 1;
 
217
                        if(real_url[pos] != '.') {
 
218
                                /* we need to shift left, and insert a '.'
 
219
                                 * we have an extra '.' at the beginning inserted by get_host to have room,
 
220
                                 * orig_real_url has to be used here, 
 
221
                                 * because we want to overwrite that extra '.' */
 
222
                                size_t orig_real_len = strlen(orig_real_url);
 
223
                                cli_dbgmsg("No dot here:%s\n",real_url+pos);
 
224
                                real_url = orig_real_url;
 
225
                                memmove(real_url, real_url+1, orig_real_len-match_len-1);
 
226
                                real_url[orig_real_len-match_len-1]='.';
 
227
                                cli_dbgmsg("After inserting .: %s\n", real_url);
 
228
                        }
 
229
                }
 
230
                return 1;
 
231
        }
 
232
        cli_dbgmsg("Ignoring false match: %s with %s, mismatched character: %c\n", buffer, regex->pattern, c);
 
233
        return 0;
 
234
}
 
235
 
243
236
/*
244
237
 * @matcher - matcher structure to use
245
238
 * @real_url - href target
253
246
 * Do not send NULL pointers to this function!!
254
247
 *
255
248
 */
256
 
int regex_list_match(struct regex_matcher* matcher,char* real_url,const char* display_url,const struct pre_fixup_info* pre_fixup,int hostOnly,const char** info,int is_whitelist)
 
249
int regex_list_match(struct regex_matcher* matcher,char* real_url,const char* display_url,const struct pre_fixup_info* pre_fixup,int hostOnly,const char **info, int is_whitelist)
257
250
{
258
 
        massert(matcher);
259
 
        massert(real_url);
260
 
        massert(display_url);
261
 
        massert(info);
 
251
        char* orig_real_url = real_url;
 
252
        struct regex_list *regex;
 
253
        size_t real_len, display_len, buffer_len;
 
254
 
 
255
        assert(matcher);
 
256
        assert(real_url);
 
257
        assert(display_url);
 
258
        *info = NULL;
262
259
        if(!matcher->list_inited)
263
260
                return 0;
264
 
        massert(matcher->list_built);
 
261
        assert(matcher->list_built);
 
262
        /* skip initial '.' inserted by get_host */
 
263
        if(real_url[0] == '.') real_url++;
 
264
        if(display_url[0] == '.') display_url++;
 
265
        real_len    = strlen(real_url);
 
266
        display_len = strlen(display_url);
 
267
        buffer_len  = (hostOnly && !is_whitelist) ? real_len + 1 : real_len + display_len + 1 + 1;
 
268
        if(buffer_len < 3) {
 
269
                /* too short, no match possible */
 
270
                return 0;
 
271
        }
265
272
        {
266
 
                size_t real_len    = strlen(real_url);
267
 
                size_t display_len = strlen(display_url);
268
 
                size_t buffer_len  = (hostOnly && !is_whitelist) ? real_len : real_len + display_len + 1 + (is_whitelist ? 1 : 0);
269
 
                char*  buffer = cli_malloc(buffer_len+1);
270
 
                size_t i;
 
273
                char *buffer = cli_malloc(buffer_len+1);
 
274
                char *bufrev;
271
275
                int rc = 0;
272
276
                struct cli_ac_data mdata;
 
277
                struct cli_ac_result *res = NULL;
273
278
 
274
279
                if(!buffer)
275
280
                        return CL_EMEM;
276
281
 
277
282
                strncpy(buffer,real_url,real_len);
278
 
                buffer[real_len]= (!is_whitelist && hostOnly) ? '\0' : ':';
 
283
                buffer[real_len]= (!is_whitelist && hostOnly) ? '/' : ':';
279
284
                if(!hostOnly || is_whitelist) {
280
285
                        strncpy(buffer+real_len+1,display_url,display_len);
281
 
                        if(is_whitelist) 
282
 
                                buffer[buffer_len - 1] = '/';
283
 
                        buffer[buffer_len]=0;
284
286
                }
 
287
                buffer[buffer_len - 1] = '/';
 
288
                buffer[buffer_len]=0;
285
289
                cli_dbgmsg("Looking up in regex_list: %s\n", buffer);
286
290
 
287
 
                if(hostOnly) {
288
 
                        if((rc = cli_ac_initdata(&mdata, 0, AC_DEFAULT_TRACKLEN)))
289
 
                                return rc;
290
 
                        rc = 0;
291
 
 
292
 
                        for(i = 0; i < matcher->root_hosts_cnt; i++) {
293
 
                                /* doesn't need to match terminating \0*/
294
 
                                rc = cli_ac_scanbuff((unsigned char*)buffer,buffer_len,info, &matcher->root_hosts[i] ,&mdata,0,0,0,-1,NULL);
295
 
                                cli_ac_freedata(&mdata);
296
 
                                if(rc) {
297
 
                                        char c;
298
 
                                        const char* matched = strchr(*info,':');        
299
 
                                        const size_t match_len = matched ? strlen(matched+1) : 0;
300
 
                                        if(((c=get_char_at_pos_with_skip(pre_fixup,buffer,buffer_len+1))==' ' || c=='\0' || c=='/' || c=='?') &&
301
 
                                                (match_len == buffer_len || /* full match */
302
 
                                                (match_len < buffer_len &&
303
 
                                                ((c=get_char_at_pos_with_skip(pre_fixup,buffer,buffer_len-match_len))=='.' || (c==' ')) ) 
304
 
                                                /* subdomain matched*/)) {
305
 
 
306
 
                                                cli_dbgmsg("Got a match: %s with %s\n",buffer,*info);
307
 
                                                cli_dbgmsg("Before inserting .: %s\n",real_url);
308
 
                                                if(real_len >= match_len + 1) {
309
 
                                                        real_url[real_len-match_len-1]='.';
310
 
                                                        cli_dbgmsg("After inserting .: %s\n",real_url);
311
 
                                                }
312
 
                                                break;
313
 
                                        }
314
 
                                        cli_dbgmsg("Ignoring false match: %s with %s,%c\n",buffer,*info,c);
315
 
                                        rc=0;
 
291
                if((rc = cli_ac_initdata(&mdata, 0, 0, AC_DEFAULT_TRACKLEN)))
 
292
                        return rc;
 
293
 
 
294
                bufrev = cli_strdup(buffer);
 
295
                if(!bufrev)
 
296
                        return CL_EMEM;
 
297
                reverse_string(bufrev);
 
298
                rc = SO_search(&matcher->filter, (const unsigned char*)bufrev, buffer_len) != -1;
 
299
                if(rc == -1) {
 
300
                        free(buffer);
 
301
                        free(bufrev);
 
302
                        /* filter says this suffix doesn't match.
 
303
                         * The filter has false positives, but no false
 
304
                         * negatives */
 
305
                        return 0;
 
306
                }
 
307
                rc = cli_ac_scanbuff((const unsigned char*)bufrev,buffer_len, NULL, (void*)&regex, &res, &matcher->suffixes,&mdata,0,0,-1,NULL,AC_SCAN_VIR,NULL);
 
308
                free(bufrev);
 
309
                cli_ac_freedata(&mdata);
 
310
 
 
311
                rc = 0;
 
312
                while(res) {
 
313
                        struct cli_ac_result *q;
 
314
                        regex = res->customdata;
 
315
                        while(!rc && regex) {
 
316
                                /* loop over multiple regexes corresponding to
 
317
                                 * this suffix */
 
318
                                if (!regex->preg) {
 
319
                                        /* we matched a static pattern */
 
320
                                        rc = validate_subdomain(regex, pre_fixup, buffer, buffer_len, real_url, real_len, orig_real_url);
 
321
                                } else {
 
322
                                        rc = !cli_regexec(regex->preg, buffer, 0, NULL, 0);
316
323
                                }
 
324
                                if(rc) *info = regex->pattern;
 
325
                                regex = regex->nxt;
317
326
                        }
318
 
                } else
319
 
                        rc = 0;
320
 
    
321
 
                if(!rc) 
322
 
                        rc = match_node(hostOnly ? matcher->root_regex_hostonly : matcher->root_regex,(unsigned char*)buffer,buffer_len,info) == MATCH_SUCCESS ? CL_VIRUS : CL_SUCCESS;
 
327
                        q = res;
 
328
                        res = res->next;
 
329
                        free(q);
 
330
                }
323
331
                free(buffer);
324
332
                if(!rc)
325
333
                        cli_dbgmsg("Lookup result: not in regex list\n");
329
337
        }
330
338
}
331
339
 
332
 
/* node stack */
333
 
#define NODE_STACK_INITIAL 1024
334
 
#define NODE_STACK_GROW    4096
335
 
/* Initialize @stack */
336
 
static int stack_init(struct node_stack* stack)
337
 
{
338
 
        massert(stack);
339
 
 
340
 
        stack->cnt = 0;
341
 
        stack->capacity = NODE_STACK_INITIAL;
342
 
        stack->data = cli_malloc(stack->capacity * sizeof(*stack->data));
343
 
        if(!stack->data)
344
 
                return CL_EMEM;
345
 
        else
346
 
                return CL_SUCCESS;
347
 
}
348
 
 
349
 
/* Reset @stack pointer, but don't realloc */
350
 
static void stack_reset(struct node_stack* stack)
351
 
{
352
 
        massert(stack);
353
 
 
354
 
        stack->cnt = 0;
355
 
}
356
 
 
357
 
/* Push @node on @stack, growing it if necessarry */
358
 
static int stack_push(struct node_stack* stack,struct tree_node* node)
359
 
{
360
 
        massert(stack);
361
 
        massert(stack->data);
362
 
 
363
 
        if(stack->cnt == stack->capacity) {
364
 
                stack->capacity += NODE_STACK_GROW;
365
 
                stack->data = cli_realloc2(stack->data,stack->capacity*sizeof(*stack->data));
366
 
                if(!stack->data)
367
 
                        return CL_EMEM;
368
 
        }
369
 
        stack->data[stack->cnt++] = node;
370
 
        return CL_SUCCESS;
371
 
}
372
 
 
373
 
/* Pops node from @stack, doesn't realloc */
374
 
static struct tree_node* stack_pop(struct node_stack* stack)
375
 
{
376
 
        massert(stack);
377
 
        massert(stack->data);
378
 
        massert(stack->cnt);/*don't pop from empty stack */
379
 
 
380
 
        return stack->cnt ? stack->data[--stack->cnt] : NULL;
381
 
}
382
340
 
383
341
/* Initialization & loading */
384
342
/* Initializes @matcher, allocating necesarry substructures */
386
344
{
387
345
        int rc;
388
346
 
389
 
        massert(matcher);
390
 
        matcher->list_inited = 0;
391
 
        matcher->root_hosts_cnt = 0;
392
 
        matcher->root_hosts = NULL;
393
 
        matcher->root_hosts_cnt = 0;
394
 
 
395
 
        matcher->root_regex = tree_root_alloc();
396
 
        if(!matcher->root_regex) {
397
 
                return CL_EMEM;
398
 
        }
399
 
 
400
 
        matcher->root_regex_hostonly = tree_root_alloc();
401
 
        if(!matcher->root_regex_hostonly) {
402
 
                free(matcher->root_regex);
403
 
                return CL_EMEM;
404
 
        }
405
 
 
406
 
        if(( rc = stack_init(&matcher->node_stack) )) {
407
 
                free(matcher->root_regex_hostonly);
408
 
                free(matcher->root_regex);
409
 
                return rc;
410
 
        }
411
 
        if(( rc = stack_init(&matcher->node_stack_alt) )) {
412
 
                free(matcher->root_regex_hostonly);
413
 
                free(matcher->root_regex);
414
 
                stack_destroy(&matcher->node_stack);
415
 
                return rc;
416
 
        }
 
347
        assert(matcher);
 
348
        memset(matcher, 0, sizeof(*matcher));
417
349
 
418
350
        matcher->list_inited=1;
419
 
        matcher->list_built=1;/* its empty, but pretend its built, so that load_ will realloc root_hosts */
 
351
        matcher->list_built=0;
420
352
        matcher->list_loaded=0;
421
353
 
 
354
        hashtab_init(&matcher->suffix_hash, 10);
 
355
        if((rc = cli_ac_init(&matcher->suffixes, 2, 32))) {
 
356
                return rc;
 
357
        }
 
358
        if((rc = cli_bm_init(&matcher->md5_hashes))) {
 
359
                return rc;
 
360
        }
 
361
        SO_init(&matcher->filter);
 
362
        SO_init(&matcher->md5_filter);
422
363
        return CL_SUCCESS;
423
364
}
424
365
 
425
 
/* inserts @pattern into @root, using ac-matcher 
426
 
 * although the name might be confusing, @pattern is not a regex!*/
427
 
static int add_regex_list_element(struct cli_matcher* root,const char* pattern,char* info)
428
 
{
429
 
       int ret;
430
 
       struct cli_ac_patt *new = cli_calloc(1,sizeof(*new));
431
 
       size_t len,i;
432
 
 
433
 
       if(!new)
434
 
               return CL_EMEM;
435
 
       massert(root);
436
 
       massert(pattern);
437
 
 
438
 
       len = strlen(pattern);
439
 
       /* need not to match \0 too */
440
 
       new->type = 0;
441
 
       new->sigid = 0;
442
 
       new->parts = 0;
443
 
       new->partno = 0;
444
 
       new->mindist = 0;
445
 
       new->maxdist = 0;
446
 
       new->offset = 0;
447
 
       new->target = 0;
448
 
       new->length = len;
449
 
       if(new->length > root->maxpatlen)
450
 
               root->maxpatlen = new->length;
451
 
 
452
 
       new->pattern = cli_malloc(sizeof(new->pattern[0])*len);
453
 
       if(!new->pattern) {
454
 
               free(new);
455
 
               return CL_EMEM;
456
 
       }
457
 
       for(i=0;i<len;i++)
458
 
               new->pattern[i]=pattern[i];/*new->pattern is short int* */
459
 
 
460
 
        
461
 
       new->virname = cli_strdup(info);
462
 
       if((ret = cli_ac_addpatt(root,new))) {
463
 
               free(new->virname);
464
 
               free(new->pattern);
465
 
               free(new);
466
 
               return ret;
467
 
       }
468
 
       return CL_SUCCESS;
469
 
}
470
 
 
471
366
static int functionality_level_check(char* line)
472
367
{
473
368
        char* ptmin;
508
403
                        return CL_EMALFDB;
509
404
                ptmin[-1]='\0';
510
405
                return CL_SUCCESS;
511
 
        }               
 
406
        }
 
407
}
 
408
 
 
409
static int add_hash(struct regex_matcher *matcher, char* pattern, const char fl)
 
410
{
 
411
        int rc;
 
412
        struct cli_bm_patt *pat = cli_calloc(1, sizeof(*pat));
 
413
        if(!pat)
 
414
                return CL_EMEM;
 
415
        pat->pattern = (unsigned char*)cli_hex2str(pattern);
 
416
        if(!pat->pattern)
 
417
                return CL_EMALFDB;
 
418
        pat->length = 16;
 
419
        pat->virname = cli_malloc(1);
 
420
        if(!pat->virname) {
 
421
                free(pat);
 
422
                return CL_EMEM;
 
423
        }
 
424
        *pat->virname = fl;
 
425
        SO_preprocess_add(&matcher->md5_filter, pat->pattern, pat->length);
 
426
        if((rc = cli_bm_addpatt(&matcher->md5_hashes, pat))) {
 
427
                cli_errmsg("add_hash: failed to add BM pattern\n");
 
428
                free(pat->pattern);
 
429
                free(pat->virname);
 
430
                free(pat);
 
431
                return CL_EMALFDB;
 
432
        }
 
433
        return CL_SUCCESS;
512
434
}
513
435
 
514
436
 
515
437
/* Load patterns/regexes from file */
516
 
int load_regex_matcher(struct regex_matcher* matcher,FILE* fd,unsigned int options,int is_whitelist)
 
438
int load_regex_matcher(struct regex_matcher* matcher,FILE* fd,unsigned int options,int is_whitelist,struct cli_dbio *dbio)
517
439
{
518
440
        int rc,line=0;
519
441
        char buffer[FILEBUFF];
520
442
 
521
 
        massert(matcher);
522
 
        massert(fd);
 
443
        assert(matcher);
523
444
 
524
445
        if(matcher->list_inited==-1)
525
446
                return CL_EMALFDB; /* already failed to load */
526
 
/*      if(matcher->list_loaded) {
527
 
                cli_warnmsg("Regex list has already been loaded, ignoring further requests for load\n");
528
 
                return CL_SUCCESS;
529
 
        }*/
530
 
        if(!fd) {
 
447
        if(!fd && !dbio) {
531
448
                cli_errmsg("Unable to load regex list (null file)\n");
532
449
                return CL_EIO;
533
450
        }
540
457
                        fatal_error(matcher);
541
458
                        return rc;
542
459
                }
543
 
                /*atexit(regex_list_done); TODO: destroy this in manager.c */
544
460
        }
545
461
        /*
546
462
         * Regexlist db format (common to .wdb(whitelist) and .pdb(domainlist) files:
562
478
         * If a line in the file doesn't conform to this format, loading fails
563
479
         * 
564
480
         */
565
 
        while(fgets(buffer,FILEBUFF,fd)) {
 
481
        while(cli_dbgets(buffer, FILEBUFF, fd, dbio)) {
566
482
                char* pattern;
567
483
                char* flags;
 
484
                size_t pattern_len;
 
485
 
568
486
                cli_chomp(buffer);
569
487
                if(!*buffer)
570
488
                        continue;/* skip empty lines */
571
489
 
572
 
                if(functionality_level_check(buffer)) 
 
490
                if(functionality_level_check(buffer))
573
491
                        continue;
574
492
 
575
493
                line++;
583
501
                flags = buffer+1;
584
502
                pattern++;
585
503
 
586
 
                if(is_whitelist) {
587
 
                        const size_t pattern_len = strlen(pattern);
588
 
                        if(pattern_len < FILEBUFF) {
589
 
                                pattern[pattern_len] = '/';
590
 
                                pattern[pattern_len+1] = '\0';
591
 
                        }
592
 
                        else {
593
 
                                cli_errmsg("Overlong regex line %d\n",line);
594
 
                                fatal_error(matcher);
595
 
                                return CL_EMALFDB;
596
 
                        }
597
 
                }
598
 
 
599
 
                if((buffer[0] == 'R' && !is_whitelist) || ((buffer[0] == 'X' || buffer[0] == 'Y') && is_whitelist)) {/*regex*/
600
 
                        if(( rc = add_pattern(matcher,(const unsigned char*)pattern,flags, buffer[0] == 'Y') ))
601
 
                                return rc==CL_EMEM ? CL_EMEM : CL_EMALFDB;
602
 
                }
603
 
                else if( ( buffer[0] == 'H' && !is_whitelist) || (buffer[0] == 'M' && is_whitelist)) {/*matches displayed host*/
604
 
                        struct cli_matcher* root;
605
 
                        if(matcher->list_built) {
606
 
                                struct cli_matcher* old_hosts = matcher->root_hosts;
607
 
                                matcher->root_hosts_cnt++;
608
 
 
609
 
                                matcher->root_hosts = cli_realloc(matcher->root_hosts, matcher->root_hosts_cnt * sizeof(*matcher->root_hosts));
610
 
                                if(!matcher->root_hosts) {
611
 
                                        matcher->root_hosts = old_hosts;/* according to manpage this must still be valid*/
612
 
                                        return CL_EMEM;
613
 
                                } 
614
 
 
615
 
                                root = &matcher->root_hosts[matcher->root_hosts_cnt-1];
616
 
                                memset(root, 0, sizeof(struct cli_matcher));
617
 
 
618
 
                                cli_dbgmsg("regex_list: Initialising AC pattern matcher\n");
619
 
                                if((rc = cli_ac_init(root, cli_ac_mindepth, cli_ac_maxdepth))) {
620
 
                                        /* no need to free previously allocated memory here */
621
 
                                        cli_errmsg("regex_list: Can't initialise AC pattern matcher\n");
622
 
                                        return rc;
623
 
                                }
624
 
                                matcher->list_built = 0;
625
 
                        }
626
 
                        else {
627
 
                                root = &matcher->root_hosts[matcher->root_hosts_cnt-1];
628
 
                        }
629
 
                        if(( rc = add_regex_list_element(root,pattern,flags) ))
630
 
                                return rc==CL_EMEM ? CL_EMEM : CL_EMALFDB;
 
504
                pattern_len = strlen(pattern);
 
505
                if(pattern_len < FILEBUFF) {
 
506
                        pattern[pattern_len] = '/';
 
507
                        pattern[pattern_len+1] = '\0';
631
508
                }
632
509
                else {
633
 
                        return CL_EMALFDB;
634
 
                        /* this is useless, we have host, and regex matches
635
 
                        if(( rc = add_regex_list_element(matcher->root_urls,pattern,flags) ))
636
 
                                return rc==CL_EMEM ? CL_EMEM : CL_EMALFDB;*/
 
510
                        cli_errmsg("Overlong regex line %d\n",line);
 
511
                        fatal_error(matcher);
 
512
                        return CL_EMALFDB;
 
513
                }
 
514
 
 
515
                if((buffer[0] == 'R' && !is_whitelist) || ((buffer[0] == 'X' || buffer[0] == 'Y') && is_whitelist)) {
 
516
                        /* regex for hostname*/
 
517
                        if (( rc = regex_list_add_pattern(matcher, pattern) ))
 
518
                                return rc==CL_EMEM ? CL_EMEM : CL_EMALFDB;
 
519
                }
 
520
                else if( ( buffer[0] == 'H' && !is_whitelist) || (buffer[0] == 'M' && is_whitelist)) {
 
521
                        /*matches displayed host*/
 
522
                        if (( rc = add_static_pattern(matcher, pattern) ))
 
523
                                return rc==CL_EMEM ? CL_EMEM : CL_EMALFDB;
 
524
                } else if (buffer[0] == 'U' && !is_whitelist) {
 
525
                        pattern[pattern_len] = '\0';
 
526
                        if (( rc = add_hash(matcher, pattern, flags[0]) )) {
 
527
                                cli_errmsg("Error loading at line: %d\n", line);
 
528
                                return rc==CL_EMEM ? CL_EMEM : CL_EMALFDB;
 
529
                        }
 
530
                } else {
 
531
                        return CL_EMALFDB;
637
532
                }
638
533
        }
639
534
        matcher->list_loaded = 1;
640
 
        if(( rc = build_regex_list(matcher) ))
641
 
                return rc;
642
535
 
643
 
#ifndef NDEBUG
644
 
/*                      dump_tree(matcher->root_regex);*/
645
 
#endif
646
 
        if(!matcher->list_built) {
647
 
                cli_errmsg("Regex list not loaded: build failed!\n");
648
 
                fatal_error(matcher);
649
 
                return CL_EMALFDB;
650
 
        }
651
 
        regex_list_cleanup(matcher);
652
536
        return CL_SUCCESS;
653
537
}
654
538
 
655
539
 
656
 
static struct tree_node ** tree_node_get_children(const struct tree_node* node)
657
 
{
658
 
        return node->op==OP_CUSTOMCLASS ? (node->u.children[1] ? node->u.children+1 : NULL) :node->u.children;
659
 
}
660
 
 
661
540
/* Build the matcher list */
662
 
static int build_regex_list(struct regex_matcher* matcher)
 
541
int cli_build_regex_list(struct regex_matcher* matcher)
663
542
{
664
543
        int rc;
 
544
        if(!matcher)
 
545
                return CL_SUCCESS;
665
546
        if(!matcher->list_inited || !matcher->list_loaded) {
666
547
                cli_errmsg("Regex list not loaded!\n");
667
548
                return -1;/*TODO: better error code */
668
549
        }
669
550
        cli_dbgmsg("Building regex list\n");
670
 
        if(matcher->root_hosts)
671
 
                if(( rc = cli_ac_buildtrie(&matcher->root_hosts[matcher->root_hosts_cnt-1]) ))
672
 
                        return rc;
 
551
        hashtab_free(&matcher->suffix_hash);
 
552
        if(( rc = cli_ac_buildtrie(&matcher->suffixes) ))
 
553
                return rc;
673
554
        matcher->list_built=1;
674
555
 
675
556
        return CL_SUCCESS;
678
559
/* Done with this matcher, free resources */
679
560
void regex_list_done(struct regex_matcher* matcher)
680
561
{
681
 
        massert(matcher);
682
 
 
683
 
        regex_list_cleanup(matcher);
684
 
        if(matcher->list_loaded) {
685
 
                if(matcher->root_hosts) {
686
 
                        size_t i;
687
 
                        for(i=0;i<matcher->root_hosts_cnt;i++) 
688
 
                                cli_ac_free(&matcher->root_hosts[i]);
689
 
                        free(matcher->root_hosts);
690
 
                        matcher->root_hosts=NULL;
691
 
                }
692
 
 
693
 
                matcher->root_hosts_cnt=0;
694
 
                matcher->list_built=0;
695
 
                destroy_tree(matcher);
696
 
                matcher->list_loaded=0;
697
 
        }
 
562
        assert(matcher);
 
563
 
698
564
        if(matcher->list_inited) {
699
 
                matcher->list_inited=0;
700
 
        }
701
 
        stack_destroy(&matcher->node_stack);
702
 
        stack_destroy(&matcher->node_stack_alt);
703
 
}
704
 
 
705
 
/* Tree matcher algorithm */
706
 
struct token_t
707
 
{
708
 
        union {
709
 
                const unsigned char* start;
710
 
                char_bitmap_p  bitmap;
711
 
                unsigned char  c;
712
 
        } u;
713
 
        size_t len;
714
 
        char   type;
715
 
};
716
 
 
717
 
enum {TOKEN_CHAR,TOKEN_DOT,TOKEN_PAR_OPEN,TOKEN_PAR_CLOSE,TOKEN_BRACKET,TOKEN_ALT,TOKEN_REGEX,TOKEN_DONE};
718
 
 
719
 
static const unsigned char* getNextToken(const unsigned char* pat,struct token_t* token)
720
 
{
721
 
        massert(pat);
722
 
        massert(token);
723
 
 
724
 
        switch(*pat) {
725
 
                case '\\':
726
 
                        token->type=TOKEN_CHAR;
727
 
                        token->u.c = *(++pat);
728
 
                        if(islower(token->u.c)) {
729
 
                                /* handle \n, \t, etc. */
730
 
                                char fmt[3] = {'\\', '\0', '\0'};
731
 
                                char c;
732
 
 
733
 
                                fmt[1] = token->u.c;
734
 
                                if(snprintf(&c,1,fmt)!=1) {
735
 
                                        token->type=TOKEN_REGEX;
736
 
                                        token->u.start = pat;
737
 
                                }
738
 
                                else
739
 
                                        token->u.c=c;
740
 
                        }
741
 
                        token->len = 1;
742
 
                        break;
743
 
                case '|':
744
 
                        token->type=TOKEN_ALT;
745
 
                        break;
746
 
                case '*':
747
 
                case '+':
748
 
                case '?':
749
 
                case '{':
750
 
                case '}':
751
 
                        token->type=TOKEN_REGEX;
752
 
                        break;
753
 
                case '[':
754
 
                        {
755
 
                        /*TODO: implement*/
756
 
                        /*see if it is something simple like a list of characters, a range, or negated ...*/
757
 
                        const unsigned char* old=pat++;/* save this in case we change our mind and decide this is too complicated for us to handle*/
758
 
                        unsigned char range_start=0;
759
 
                        int hasprev = 0;
760
 
                        char_bitmap_p bitmap = cli_malloc(32);
761
 
                        if(!bitmap)
762
 
                                return NULL;
763
 
                        if (*pat=='^') {
764
 
                                memset(bitmap,0xFF,32);/*match chars not in brackets*/
765
 
                                pat++;
766
 
                        }
767
 
                        else
768
 
                                memset(bitmap,0x00,32);
769
 
                        do {
770
 
                                /* literal ] can be first character, so test for it at the end of the loop, for example: []] */
771
 
                                if (*pat=='-' && hasprev) {
772
 
                                        /* it is a range*/
773
 
                                        unsigned char range_end;
774
 
                                        unsigned int c;
775
 
                                        massert(range_start);
776
 
                                        pat++;
777
 
                                        if (pat[0]=='[')
778
 
                                                if (pat[1]=='.') {
779
 
                                                        if(pat[2]=='-' && pat[3]=='.' && pat[4]==']')
780
 
                                                                range_end = '-';
781
 
                                                        else {
782
 
                                                                /* this is getting complicated, bail out */
783
 
                                                                cli_warnmsg("confused about collating sequences in regex,bailing out");
784
 
                                                                pat=old;
785
 
                                                                token->type=TOKEN_REGEX;
786
 
                                                                break;
787
 
                                                        }
788
 
                                                }
789
 
                                                else 
790
 
                                                        range_end = *pat;
791
 
                                        else
792
 
                                                range_end = *pat;
793
 
                                        for(c=range_start+1;c<=range_end;c++)
794
 
                                                bitmap[c>>3] ^= 1<<(c&0x7);
795
 
                                        hasprev = 0;
796
 
                                }
797
 
                                else if (pat[0]=='[' && pat[1]==':') {
798
 
                                                        const unsigned char* end;
799
 
                                                        int len,found=-1;
800
 
                                                        size_t i;
801
 
 
802
 
                                                        pat+=2;
803
 
                                                        end=(unsigned char*)strstr((const char*)pat,":]");
804
 
                                                        if(!end) {
805
 
                                                                cli_warnmsg("confused about std char class syntax regex,bailing out");
806
 
                                                                pat=old;
807
 
                                                                token->type=TOKEN_REGEX;
808
 
                                                                break;
809
 
                                                        }
810
 
 
811
 
                                                        len = end-pat;
812
 
                                                        for(i=0;i<std_class_cnt;i++)
813
 
                                                                if(!strncmp((const char*)pat,std_class[i],len)) {
814
 
                                                                        found=i;
815
 
                                                                        break;
816
 
                                                                }
817
 
                                                        if(found!=-1) {
818
 
                                                                for(i=0;i<256;i++)
819
 
                                                                        if(char_class[i]&(1<<found))
820
 
                                                                                bitmap[i>>3] ^= 1<<(i&0x7);
821
 
                                                        }
822
 
                                                        else {
823
 
                                                                /*unknown class*/
824
 
                                                                cli_warnmsg("confused about regex bracket expression, bailing out");
825
 
                                                                pat=old;
826
 
                                                                token->type=TOKEN_REGEX;
827
 
                                                                break;
828
 
                                                        }
829
 
                                                }
830
 
                                else {
831
 
                                        bitmap[*pat>>3] ^= 1<<(*pat&0x7);
832
 
                                        pat++;
833
 
                                        range_start = *pat;
834
 
                                        hasprev = 1;
835
 
                                }
836
 
                        } while(*pat!=']');
837
 
                        /*TODO: see if this bitmap already exists, then reuse*/                 
838
 
                        token->type = TOKEN_BRACKET;
839
 
                        token->u.bitmap = bitmap;
840
 
                        break;
841
 
                        }
842
 
                case ']':
843
 
                        massert(0 && "Encountered ] without matching [");
844
 
                        /* bad state */
845
 
                        break;
846
 
                case '.':
847
 
                        token->type=TOKEN_DOT;
848
 
                        break;
849
 
                case '(':
850
 
                        token->type=TOKEN_PAR_OPEN;
851
 
                        break;
852
 
                case ')':
853
 
                        token->type=TOKEN_PAR_CLOSE;
854
 
                        break;
855
 
                default:
856
 
                        token->type=TOKEN_CHAR;
857
 
                        token->u.c = *pat;
858
 
                        token->len=1;
859
 
                        break;
860
 
        }
861
 
        return ++pat;
862
 
}
863
 
 
864
 
#define INITIAL_ALT_STACK 10
865
 
#define ALT_STACK_GROW 20
866
 
 
867
 
static const unsigned char* find_regex_start(const unsigned char* pat)
868
 
{
869
 
        struct token_t token;
870
 
        /*TODO: find where the regex part begins, for ex:
871
 
         * abcd+, regex begins at 'd'
872
 
         * */
873
 
        const unsigned char* last=NULL;
874
 
        const unsigned char* tmp=NULL;
875
 
        const unsigned char** altpositions = cli_malloc(INITIAL_ALT_STACK*sizeof(*altpositions));
876
 
        size_t altpositions_capacity = INITIAL_ALT_STACK;
877
 
        size_t altpositions_cnt = 0;
878
 
        char lasttype = -1;
879
 
        if(!altpositions)
880
 
                return NULL;
881
 
        massert(pat);
882
 
 
883
 
        /* Try to parse pattern till special regex chars are encountered, that the tree-matcher doesn't handle, like: +,*,{}.
884
 
         * The tricky part is that once we encounter these, the previous 'atom' has to be passed on to the regex matcher, so we have to
885
 
         * back up to the last known good position
886
 
         * Example, if we have: abc(defg)+, then only abc can be handled by tree parser, so we have to return the position of (.
887
 
         * Another example: abc(defg|xyz|oz+|pdo), the last known good position is |, after xyz
888
 
         * TODO: what about open parantheses? maybe once we found a special char, we have top back out before the first (?
889
 
         * */
890
 
        do {    
891
 
                tmp = pat;
892
 
                pat = getNextToken(pat,&token);
893
 
                if(token.type!=TOKEN_REGEX) {
894
 
                        last = tmp;
895
 
                        lasttype = token.type;
896
 
                        if(token.type==TOKEN_BRACKET && token.u.bitmap)
897
 
                                free(token.u.bitmap);
898
 
                        if(token.type==TOKEN_ALT || token.type==TOKEN_PAR_OPEN) {
899
 
                                /* save this position on stack, succesfully parsed till here*/
900
 
                                if(altpositions_cnt && altpositions[altpositions_cnt-1][0]=='|')
901
 
                                        /* encountered another alternate (|) operator, override previous | position stored */
902
 
                                        altpositions[altpositions_cnt-1]=last;
903
 
                                else {
904
 
                                        altpositions[altpositions_cnt++] = last;
905
 
                                        if(altpositions_cnt == altpositions_capacity) {
906
 
                                                altpositions_capacity += ALT_STACK_GROW;
907
 
                                                altpositions = cli_realloc2(altpositions,altpositions_capacity*sizeof(*altpositions));
908
 
                                                if(!altpositions)
909
 
                                                        return NULL;
910
 
                                        }
911
 
                                }
912
 
                        } else if (lasttype==TOKEN_PAR_CLOSE) {
913
 
                                /* remove last stored position from stack, succesfully this last group */
914
 
                                altpositions_cnt--;
915
 
                                massert(altpositions_cnt>0);
916
 
                        }
917
 
                }
918
 
                else {
919
 
                        if(altpositions_cnt)
920
 
                                last = altpositions[0 /*altpositions_cnt-1*/];/*TODO: which index here?, see above TODO... */
921
 
                        /*last stored 'safe' position where no special (+,*,{}) regex chars were encountered*/
922
 
                }
923
 
        } while(*pat && token.type!=TOKEN_REGEX);
924
 
        free(altpositions);
925
 
        return *pat ? last : last+1;
926
 
}
927
 
 
928
 
static struct tree_node* tree_node_alloc(struct tree_node* next,char listend)
929
 
{
930
 
        struct tree_node* node = cli_malloc(sizeof(*node));
931
 
        if(node) {
932
 
                node->alternatives=0;
933
 
                node->next=next;
934
 
                node->listend=listend;
935
 
                node->u.children=NULL;
936
 
        }
937
 
        return node;
938
 
}
939
 
 
940
 
static struct tree_node* tree_root_alloc(void)
941
 
{
942
 
        struct tree_node* root=tree_node_alloc(NULL,1);
943
 
        if(root) {
944
 
                root->op=OP_ROOT;
945
 
                root->c=0;
946
 
                root->next=NULL;
947
 
                root->listend=1;
948
 
        }
949
 
        return root;
950
 
}
951
 
 
952
 
static struct tree_node* tree_node_char_binsearch(const struct tree_node* node,const char csearch,int* left)
953
 
{
954
 
        int right;
955
 
        struct tree_node **children;
956
 
        massert(node);
957
 
        massert(left);
958
 
 
959
 
        children = tree_node_get_children(node);
960
 
        right = node->alternatives-1;
961
 
        *left = 0;
962
 
        if(!node->alternatives)
963
 
                return NULL;
964
 
        massert(children);
965
 
        while(*left<=right) {
966
 
                int mid  = *left+(right-*left)/2;
967
 
                if(children[mid]->c == csearch)
968
 
                        return children[mid]; 
969
 
                else if(children[mid]->c < csearch)
970
 
                        *left=mid+1;
971
 
                else
972
 
                        right=mid-1;
973
 
        }
974
 
        return NULL;
975
 
}
976
 
 
977
 
static struct tree_node* tree_get_next(struct tree_node* node)
978
 
{
979
 
        struct tree_node** children;
980
 
        massert(node);
981
 
        children = tree_node_get_children(node);
982
 
 
983
 
        if(!node->alternatives && children && children[0])
984
 
                return children[0];
985
 
        else if(node->alternatives<=1)
986
 
                return node;
987
 
        else
988
 
                return children[0]->next;
989
 
}
990
 
 
991
 
static size_t tree_node_get_array_size(const struct tree_node* node)
992
 
{
993
 
        massert(node);
994
 
        /* if op is CUSTOMCLASS, then first pointer is pointer to bitmap, so array size is +1 */
995
 
        return (node->alternatives + (node->op==OP_CUSTOMCLASS ? 1 : 0)) * sizeof(node->u.children[0]);
996
 
}
997
 
 
998
 
static struct tree_node* tree_node_char_insert(struct tree_node* node,const char c,int left)
999
 
{
1000
 
        struct tree_node* new, *alt = tree_get_next(node);
1001
 
        struct tree_node **children;
1002
 
        node->alternatives++;
1003
 
        node->u.children = cli_realloc2(node->u.children,tree_node_get_array_size(node));
1004
 
        if(!node->u.children)
1005
 
                return NULL;
1006
 
 
1007
 
        children = node->op==OP_CUSTOMCLASS ? node->u.children+1 : node->u.children;
1008
 
 
1009
 
        new = tree_node_alloc(alt , node == alt );
1010
 
        if(new) {
1011
 
                new->op=OP_CHAR;
1012
 
                new->c=c;
1013
 
        }
1014
 
 
1015
 
        if(node->alternatives-left-1>0)
1016
 
                        memmove(&children[left+1],&children[left],(node->alternatives-left-1)*sizeof(node->u.children[0]));
1017
 
        children[left] = new;   
1018
 
 
1019
 
        return new;
1020
 
}
1021
 
 
1022
 
static void tree_node_insert_nonbin(struct tree_node* node, struct tree_node* new)
1023
 
{
1024
 
        struct tree_node **children;
1025
 
        massert(node);
1026
 
        massert(new);
1027
 
 
1028
 
        children = tree_node_get_children(node);
1029
 
        if(node->alternatives) {
1030
 
                massert(children);
1031
 
                if(children[0]->next == node) {
1032
 
                        int i;
1033
 
                        new->listend = 1;
1034
 
                        for(i=0;i<node->alternatives;i++) {
1035
 
                                children[i]->next = new;
1036
 
                                children[i]->listend = 0;
1037
 
                        }
1038
 
                }
1039
 
                else {
1040
 
                        struct tree_node* p;
1041
 
                        for(p = children[0]->next ; p->next != node ; p = p->next)
1042
 
                                massert(!p->listend);
1043
 
                        new->listend = 1;
1044
 
                        p->listend = 0;
1045
 
                        p->next = new;
1046
 
                }
1047
 
        }
1048
 
        else {
1049
 
                int idx = node->op==OP_CUSTOMCLASS ? 1 : 0;
1050
 
                if(node->u.children)
1051
 
                        if(node->u.children[idx]) {
1052
 
                                node = node->u.children[idx];
1053
 
                                while(node->next && !node->listend)
1054
 
                                        node = node->next;
1055
 
                                node->listend = 0;
1056
 
                                new->next = node->next;
1057
 
                                node->next = new;
1058
 
                                new->listend=1;
1059
 
                                return;
1060
 
                        }
1061
 
                node->u.children = cli_realloc2(node->u.children,sizeof(node->u.children[0])*(2));
1062
 
                if(node->u.children) {
1063
 
                        node->u.children[idx] = new;
1064
 
                }
1065
 
        }
1066
 
}
1067
 
 
1068
 
static unsigned char char_getclass(const unsigned char* bitmap)
1069
 
{
1070
 
        size_t i;
1071
 
        massert(bitmap);
1072
 
 
1073
 
        for(i=0;i<std_class_cnt;i++)
1074
 
                if(!memcmp(bitmap,char_class_bitmap[i],256>>3))
1075
 
                        return i;
1076
 
        return std_class_cnt;
1077
 
}
1078
 
 
1079
 
static void stack_destroy(struct node_stack* stack)
1080
 
{
1081
 
        massert(stack);
1082
 
        if(stack->data)
1083
 
                free(stack->data);
1084
 
        stack->data = NULL;
1085
 
        stack->capacity = 0;
1086
 
}
1087
 
 
1088
 
/* call this after whitelist load is complete, and the tree is no longer going to be modified */
1089
 
void regex_list_cleanup(struct regex_matcher* matcher)
1090
 
{
1091
 
        massert(matcher);
1092
 
 
1093
 
        stack_destroy(&matcher->node_stack);
1094
 
        stack_destroy(&matcher->node_stack_alt);
1095
 
        stack_init(&matcher->node_stack);
1096
 
        stack_init(&matcher->node_stack_alt);
 
565
                size_t i;
 
566
                cli_ac_free(&matcher->suffixes);
 
567
                if(matcher->suffix_regexes) {
 
568
                        for(i=0;i<matcher->suffix_cnt;i++) {
 
569
                                struct regex_list *r = matcher->suffix_regexes[i].head;
 
570
                                while(r) {
 
571
                                        struct regex_list *q = r;
 
572
                                        r = r->nxt;
 
573
                                        free(q->pattern);
 
574
                                        free(q);
 
575
                                }
 
576
                        }
 
577
                        free(matcher->suffix_regexes);
 
578
                        matcher->suffix_regexes = NULL;
 
579
                }
 
580
                if(matcher->all_pregs) {
 
581
                        for(i=0;i<matcher->regex_cnt;i++) {
 
582
                                regex_t *r = matcher->all_pregs[i];
 
583
                                cli_regfree(r);
 
584
                                free(r);
 
585
                        }
 
586
                        free(matcher->all_pregs);
 
587
                }
 
588
                hashtab_free(&matcher->suffix_hash);
 
589
                cli_bm_free(&matcher->md5_hashes);
 
590
        }
1097
591
}
1098
592
 
1099
593
int is_regex_ok(struct regex_matcher* matcher)
1100
594
{
1101
 
        massert(matcher);
 
595
        assert(matcher);
1102
596
        return (!matcher->list_inited || matcher->list_inited!=-1);/* either we don't have a regexlist, or we initialized it successfully */
1103
597
}
1104
598
 
1105
 
/* returns 0 on success, regexec error code otherwise */                                                
1106
 
static int add_pattern(struct regex_matcher* matcher,const unsigned char* pat,const char* info, int hostonly)
1107
 
{
1108
 
        int bol=1;
1109
 
        const unsigned char* pat_end = find_regex_start(pat);
1110
 
        struct token_t token;
1111
 
        struct tree_node* node;
1112
 
        
1113
 
        massert(matcher);
1114
 
 
1115
 
        node = hostonly ? matcher->root_regex_hostonly : matcher->root_regex;
1116
 
 
1117
 
        stack_reset(&matcher->node_stack);
1118
 
        stack_reset(&matcher->node_stack_alt);
1119
 
        stack_push(&matcher->node_stack,node);
1120
 
 
1121
 
        for(;node->op!=OP_LEAF;){
1122
 
                if(pat<pat_end)
1123
 
                        pat  = getNextToken(pat,&token);
1124
 
                else if(*pat) {
1125
 
                        token.type = TOKEN_REGEX;
1126
 
                        token.u.start=pat;
1127
 
                }
1128
 
                else
1129
 
                        token.type = TOKEN_DONE;
1130
 
 
1131
 
                switch(token.type) {
1132
 
                        case TOKEN_CHAR: 
1133
 
                                {
1134
 
                                        /* search for char in tree */
1135
 
                                        int left;
1136
 
                                        struct tree_node* newnode = tree_node_char_binsearch(node,token.u.c,&left);
1137
 
                                        if(newnode)
1138
 
                                                node = newnode;
1139
 
                                        else {
1140
 
                                                /* not found, insert it */
1141
 
                                                node = tree_node_char_insert(node,token.u.c,left);
1142
 
                                        }
1143
 
                                        break;
1144
 
                                }
1145
 
 
1146
 
                        case TOKEN_PAR_OPEN:
1147
 
                                stack_push(&matcher->node_stack_alt,NULL);/* marker */
1148
 
                                stack_push(&matcher->node_stack,node);
1149
 
                                break;
1150
 
 
1151
 
                        case TOKEN_PAR_CLOSE: {
1152
 
                                                      /*TODO: test this!!!*/
1153
 
                                                      struct tree_node* node_alt = node;
1154
 
                                                      node = tree_node_alloc(NULL,1);
1155
 
                                                      node->op=OP_PARCLOSE;
1156
 
                                                      node->c=0;
1157
 
                                                      node->listend=1;
1158
 
                                                      tree_node_insert_nonbin(node_alt,node);
1159
 
                                                      while (( node_alt = stack_pop(&matcher->node_stack_alt) )) {
1160
 
                                                              tree_node_insert_nonbin(node_alt,node);
1161
 
                                                      }
1162
 
                                                      stack_pop(&matcher->node_stack);                                        
1163
 
                                                      break;
1164
 
                                              }
1165
 
 
1166
 
                        case TOKEN_ALT:
1167
 
                                stack_push(&matcher->node_stack_alt,node);
1168
 
                                node = stack_pop(&matcher->node_stack);
1169
 
                                stack_push(&matcher->node_stack,node);
1170
 
                                break;
1171
 
 
1172
 
                        case TOKEN_BRACKET:
1173
 
                                {
1174
 
                                        struct tree_node* new = tree_node_alloc(tree_get_next(node),1);
1175
 
                                        unsigned char charclass = char_getclass(token.u.bitmap);
1176
 
                                        if(charclass == std_class_cnt) {/*not a std char class*/
1177
 
                                                new->op = OP_CUSTOMCLASS;
1178
 
                                                new->u.children = cli_malloc(sizeof(new->u.children[0])*2);
1179
 
                                                if(!new->u.children)
1180
 
                                                        return CL_EMEM;
1181
 
                                                new->u.bitmap[0] = token.u.bitmap;
1182
 
                                                new->u.bitmap[1] = NULL;
1183
 
                                                tree_node_insert_nonbin(node,new);
1184
 
                                                node = new;
1185
 
                                        }
1186
 
                                        else {
1187
 
                                                new->op = OP_STDCLASS;
1188
 
                                                new->c = charclass;
1189
 
                                                tree_node_insert_nonbin(node,new);
1190
 
                                                node=new;
1191
 
                                        }
1192
 
                                        break;
1193
 
                                }
1194
 
 
1195
 
                        case TOKEN_DOT:
1196
 
                                {
1197
 
                                        struct tree_node* new = tree_node_alloc(tree_get_next(node),1);
1198
 
                                        new->op = OP_DOT;
1199
 
                                        tree_node_insert_nonbin(node,new);
1200
 
                                        node=new;
1201
 
                                        break;
1202
 
                                }
1203
 
 
1204
 
                        case TOKEN_REGEX:
1205
 
                        case TOKEN_DONE: {
1206
 
                                                 struct leaf_info* leaf=cli_malloc(sizeof(*leaf));
1207
 
                                                 if(!leaf)
1208
 
                                                         return CL_EMEM;
1209
 
                                                 leaf->info = cli_strdup(info);
1210
 
                                                 if(token.type==TOKEN_REGEX) {
1211
 
                                                         int rc;
1212
 
                                                         struct tree_node* new;
1213
 
                                                         regex_t* preg;
1214
 
                                                         preg=cli_malloc(sizeof(*preg));
1215
 
                                                         if(!preg)
1216
 
                                                                 return CL_EMEM;
1217
 
                                                         rc = cli_regcomp(preg,(const char*)token.u.start,REG_EXTENDED|(bol?0:REG_NOTBOL));
1218
 
                                                         leaf->preg=preg;
1219
 
                                                         if(rc)
1220
 
                                                                 return rc;
1221
 
                                                         new=cli_malloc(sizeof(*new));
1222
 
                                                         if(!new)
1223
 
                                                                 return CL_EMEM;
1224
 
                                                         new->op=OP_LEAF;
1225
 
                                                         new->next=node;
1226
 
                                                         new->alternatives=0;
1227
 
                                                         new->u.leaf=leaf;
1228
 
                                                         new->listend=1;
1229
 
                                                         tree_node_insert_nonbin(node,new);
1230
 
                                                 }
1231
 
                                                 else {
1232
 
                                                         leaf->preg=NULL;
1233
 
                                                         node->alternatives=0;
1234
 
                                                         node->u.leaf=leaf;
1235
 
                                                         node->op=OP_LEAF;
1236
 
                                                 }
1237
 
                                                 return 0;
1238
 
                                         }
1239
 
                }
1240
 
 
1241
 
                bol=0;
1242
 
        }
1243
 
        return 0;
1244
 
}
1245
 
 
1246
 
/* c has to be unsigned char here!! */
1247
 
static int match_node(struct tree_node* node,const unsigned char* c,size_t len,const char** info)
1248
 
{
1249
 
        struct tree_node** children;
1250
 
        int rc;
1251
 
 
1252
 
        massert(node);
1253
 
        massert(c);
1254
 
        massert(info);
1255
 
 
1256
 
        if(!node->u.children)
1257
 
                return MATCH_FAILED;/* tree empty */
1258
 
        *info = NULL;
1259
 
        len++;
1260
 
        c--;
1261
 
        for(;;) {
1262
 
                massert(node);
1263
 
                children = node->u.children;
1264
 
                switch(node->op) {
1265
 
                        case OP_ROOT:
1266
 
                                rc=1;
1267
 
                                break;
1268
 
                        case OP_PARCLOSE:
1269
 
                                /*this isn't a real character, so don't move*/
1270
 
                                c--;
1271
 
                                len++;
1272
 
                                rc=1;
1273
 
                                break;
1274
 
                        case OP_CHAR:
1275
 
                                massert(*c==node->c && "We know this has to match");
1276
 
                                rc = 1;/* *c==node->c;- we know it has matched */
1277
 
                                break;
1278
 
                        case OP_DOT:    
1279
 
                                rc = *c!='\n';
1280
 
                                break;
1281
 
                        case OP_STDCLASS:
1282
 
                                rc = char_class[*c]&(node->c);
1283
 
                                break;
1284
 
                        case OP_CUSTOMCLASS:
1285
 
                        {
1286
 
                                char_bitmap_p bitmap;
1287
 
                                massert(children);
1288
 
                                bitmap = (char_bitmap_p)node->u.bitmap[0];
1289
 
                                children++;
1290
 
                                rc = bitmap[*c>>3]&(1<<(*c&0x7));
1291
 
                                break;
1292
 
                        }
1293
 
                        case OP_LEAF:
1294
 
                        {
1295
 
                                const struct leaf_info* leaf = node->u.leaf;
1296
 
                                /*isleaf = 1;*/
1297
 
                                if(leaf->preg) {
1298
 
                                        rc = !cli_regexec(leaf->preg,(const char*)c,0,NULL,0);
1299
 
                                }
1300
 
                                else  {
1301
 
                                        massert(*c==node->c && "We know this has to match[2]");
1302
 
                                        rc = 1;
1303
 
                                }
1304
 
                                if(rc) {
1305
 
                                        *info = leaf->info;
1306
 
                                        return MATCH_SUCCESS;
1307
 
                                }
1308
 
                                break;
1309
 
                        }
1310
 
                        default:
1311
 
                                /* impossible */
1312
 
                                cli_errmsg("Encountered invalid operator in tree:%d\n",node->op);
1313
 
                                exit(1);
1314
 
                }
1315
 
                len--;
1316
 
                if(!len) rc=0;
1317
 
                c++;
1318
 
                if(rc) {
1319
 
                        const char csearch = *c;
1320
 
                        int left = 0,right = node->alternatives-1;
1321
 
                        int mid;
1322
 
                        /*matched so far, go deeper*/
1323
 
                        /*do a binary search between children */
1324
 
                        massert(children);
1325
 
                        while(left<=right) {
1326
 
                                mid  = left+(right-left)/2;
1327
 
                                if (children[mid]->c == csearch)
1328
 
                                        break;
1329
 
                                else if(children[mid]->c < csearch)
1330
 
                                        left=mid+1;
1331
 
                                else
1332
 
                                        right=mid-1;
1333
 
                        }
1334
 
                        if(left<=right) {
1335
 
                                node = children[mid];
1336
 
                                massert(node);
1337
 
                        }
1338
 
                        else {
1339
 
                                if(node->alternatives) {
1340
 
                                        if(!children[0]->listend) {
1341
 
                                                node = children[0];
1342
 
                                                c++;
1343
 
                                                len--;
1344
 
                                        }
1345
 
                                        while(node && node->listend) {
1346
 
                                                node = node->next;/* climb up */
1347
 
                                                c--;
1348
 
                                                len++;
1349
 
                                        }
1350
 
                                        if(!node || !node->next) 
1351
 
                                                return MATCH_FAILED;/* reached root node */
1352
 
                                        node=node->next;
1353
 
                                        c--;
1354
 
                                        len++;
1355
 
                                }
1356
 
                                else if(node->u.children) {
1357
 
                                        struct tree_node* rewrite_next = NULL;
1358
 
                                        if(node->op==OP_PARCLOSE) 
1359
 
                                                rewrite_next = node;
1360
 
                                        node = children[0];
1361
 
                                        massert(node);
1362
 
                                        massert(node->op!=OP_CHAR);
1363
 
                                        if(rewrite_next)
1364
 
                                                node->next = rewrite_next;/* this node is pointed to by several parent nodes, 
1365
 
                                                                             we need to know 
1366
 
                                                                             from which one we came, so we can find out way back
1367
 
                                                                             should we fail to match somewhere deeper*/
1368
 
                                }
1369
 
                        }
1370
 
                }
1371
 
                else {
1372
 
                        /* this node didn't match, try sibling, or parent (if no more siblings) */
1373
 
                        while(node && node->listend) {
1374
 
                                node = node->next;/* sibling of parent */
1375
 
                                c--;
1376
 
                                len++;
1377
 
                        }
1378
 
                        if(!node || !node->next) /* reached root node, it has no next */
1379
 
                                return MATCH_FAILED;
1380
 
                        else {
1381
 
                                c--;
1382
 
                                len++;
1383
 
                                node=node->next;
1384
 
                        }
1385
 
                }
1386
 
        }
1387
 
        return MATCH_FAILED;
1388
 
}
1389
 
 
1390
 
/* push node on stack, only if it isn't there already */
1391
 
static void stack_push_once(struct node_stack* stack,struct tree_node* node)
1392
 
{
1393
 
        size_t i;
1394
 
        massert(stack);
1395
 
        massert(node);
1396
 
 
1397
 
        for(i=0;i < stack->cnt;i++)
1398
 
                if(stack->data[i]==node)
1399
 
                        return;
1400
 
        stack_push(stack,node);
1401
 
}
1402
 
 
1403
 
static void destroy_tree_internal(struct regex_matcher* matcher,struct tree_node* node)
1404
 
{
1405
 
        struct tree_node **children;
1406
 
        massert(matcher);
1407
 
        massert(node);
1408
 
 
1409
 
        children = tree_node_get_children(node);
1410
 
        if(node->op==OP_LEAF) {
1411
 
                struct leaf_info* leaf = node->u.leaf;
1412
 
                if(node->next && !node->listend)
1413
 
                        destroy_tree_internal(matcher,node->next);
1414
 
                stack_push_once(&matcher->node_stack,(struct tree_node*)node->u.leaf);/* cast to make compiler happy, and to not make another stack implementation for storing void* */
1415
 
                stack_push_once(&matcher->node_stack,node);
1416
 
                if(leaf->preg) {
1417
 
                        cli_regfree(leaf->preg);
1418
 
                        free(leaf->preg);
1419
 
                        leaf->preg=NULL;
1420
 
                }
1421
 
                if(leaf->info) {
1422
 
                        free(leaf->info);
1423
 
                        leaf->info=NULL;
1424
 
                }
1425
 
        /*      return;*/
1426
 
        }
1427
 
        if(node->alternatives) {
1428
 
                int i;
1429
 
                struct tree_node* p;
1430
 
                massert(children);
1431
 
                p = children[0]->op==OP_LEAF ? NULL : children[0]->next;
1432
 
                for(i=0;i<node->alternatives;i++)
1433
 
                        destroy_tree_internal(matcher,children[i]);
1434
 
                if(p && p!=node)
1435
 
                        destroy_tree_internal(matcher,p);/*?? is this ok, or without _internal?*/
1436
 
        }
1437
 
        else {
1438
 
                if(children) {
1439
 
                        if(children[0])
1440
 
                                destroy_tree_internal(matcher,children[0]);             
1441
 
                }
1442
 
        }
1443
 
        if(node->op!=OP_LEAF && node->next && !node->listend)
1444
 
                destroy_tree_internal(matcher,node->next);
1445
 
        if(node->u.children)
1446
 
                stack_push_once(&matcher->node_stack,(struct tree_node*)node->u.children);/* cast to make compiler happy, it isn't really a tree_node* */
1447
 
        if(node->op==OP_CUSTOMCLASS && node->u.children[0]) {
1448
 
                free(node->u.children[0]);
1449
 
                node->u.children[0]=NULL;
1450
 
        }
1451
 
        stack_push_once(&matcher->node_stack,node);
1452
 
}
1453
 
 
1454
 
static void destroy_tree(struct regex_matcher* matcher)
1455
 
{
1456
 
        /* we might have the same node linked by different nodes, so a recursive walk&free doesn't work in all situations,
1457
 
         * i.e. it might double-free, so instead of freeing, just push the nodes on a stack, and later free the nodes in that stack,
1458
 
         * (and push to stack only if it doesn't contain it already*/
1459
 
        massert(matcher);
1460
 
 
1461
 
        stack_reset(&matcher->node_stack);
1462
 
        destroy_tree_internal(matcher,matcher->root_regex);
1463
 
        destroy_tree_internal(matcher,matcher->root_regex_hostonly);
1464
 
        while (matcher->node_stack.cnt) {
1465
 
                struct tree_node* node = stack_pop(&matcher->node_stack);
1466
 
                if(node)
1467
 
                        free(node);
1468
 
        }
1469
 
}
1470
 
#ifndef NDEBUG
1471
 
static void dump_node(struct tree_node* node)
1472
 
{
1473
 
        int i;
1474
 
        struct tree_node* p,**children;
1475
 
        massert(node);
1476
 
        if(node->op==OP_LEAF) {
1477
 
                if(node->u.leaf->preg)
1478
 
                        printf("n%p [label=\"regex\\nleaf\"]",(void*)node);
1479
 
                else
1480
 
                        printf("n%p [label=\"%c\\nleaf\"];\n",(void*)node,node->c);
1481
 
                if(node->next && !node->listend) {
1482
 
                        printf("n%p -> n%p;\n",(void*)node,(void*)node->next);
1483
 
                        dump_node(node->next);
1484
 
                }
1485
 
                return;
1486
 
        }
1487
 
        printf("n%p [label=\"%c\\n%d\\nlistend:%d\"];\n",(void*)node,(node->op==OP_ROOT||node->op==OP_PARCLOSE) ?'@' :node->c,node->op,node->listend);
1488
 
        if(node->next)
1489
 
                printf("n%p -> n%p;\n",(void*)node,(void*)node->next);
1490
 
        printf("n%p -> {",(void*)node);/*using address of node as id*/
1491
 
        children = tree_node_get_children(node);
1492
 
        if(node->alternatives)
1493
 
                massert(children);
1494
 
        for(i=0;i<node->alternatives;i++)
1495
 
                printf("n%p ",(void*)children[i]);
1496
 
        if(node->alternatives && children[0]->op!=OP_LEAF)
1497
 
                for(p=children[0]->next;p!=node;p=p->next)
1498
 
                {
1499
 
                        massert(p);
1500
 
                        printf("n%p ",(void*)p);
1501
 
                        if(p->op==OP_LEAF || p->listend)
1502
 
                                break;
1503
 
                }
1504
 
        if(!node->alternatives && children && children[0])
1505
 
                printf("n%p ",(void*)children[0]);
1506
 
        printf("};\n");
1507
 
        printf("{rank=same;");
1508
 
        for(i=0;i<node->alternatives;i++)
1509
 
                printf("n%p ",(void*)node->u.children[i]);
1510
 
        if(node->alternatives && children[0]->op!=OP_LEAF)
1511
 
                for(p=children[0]->next;p!=node;p=p->next) 
1512
 
                {
1513
 
                        printf("n%p ",(void*)p);        
1514
 
                        if(p->op==OP_LEAF || p->listend)
1515
 
                                break;
1516
 
                }
1517
 
        if(!node->alternatives && children && children[0])
1518
 
                printf("n%p ",(void*)children[0]);
1519
 
        printf("};\n");
1520
 
        for(i=0;i<node->alternatives;i++)
1521
 
                dump_node(children[i]);
1522
 
        if(node->alternatives && children[0]->op!=OP_LEAF)
1523
 
                for(p=children[0]->next;p!=node;p=p->next)
1524
 
                {
1525
 
                        dump_node(p);
1526
 
                        if(p->op==OP_LEAF || p->listend)
1527
 
                                break;
1528
 
                }
1529
 
        if(!node->alternatives && children && children[0])
1530
 
                dump_node(children[0]);
1531
 
}
1532
 
 
1533
 
void dump_tree(struct tree_node* root)
1534
 
{
1535
 
        /*use dot/dotty from graphviz to view it*/
1536
 
        massert(root);
1537
 
        printf("digraph tree {\n");
1538
 
        dump_node(root);
1539
 
        printf("}\n");
1540
 
}
1541
 
#endif
1542
 
 
1543
 
 
1544
 
#else
1545
 
/*------------------------New version of regex_list.c------------------------*/
1546
 
 
1547
 
/* Regex_list.c: 
1548
 
 * A scalable, trie-based implementation for matching against 
1549
 
 * a list of regular expressions.
1550
 
 *
1551
 
 * A trivial way to implement matching against a list of regular expressions 
1552
 
 * would have been to construct a single regular expression, by concatenating 
1553
 
 * the list with the alternate (|) operator.
1554
 
 * BUT a usual DFA implementation of regular expression matching (eg.: GNU libc)
1555
 
 * leads to "state explosion" when there are many (5000+) alternate (|) operators.
1556
 
 * This is the reason for using a trie-based implementation.
1557
 
 *
1558
 
 *
1559
 
 * Design considerations:
1560
 
 *
1561
 
 * Recursive call points: there are situations when match has to be retried on a different sub-trie, or with a different repeat count.
1562
 
 * Alternate operators, and repeat/range operators (+,*,{}) are recursiv call points. When a failure is encountered during a match,
1563
 
 * the function simply returns from the recursive call, and ends up at a failure point (recursive call point).
1564
 
 *
1565
 
 * "go to parent" below actually means, return from recursive call.
1566
 
 *
1567
 
 * fail_action: we need to return to closest failure point (recursive call point),
1568
 
 *  and switch current node to node pointed by fail_action
1569
 
 *
1570
 
 * Node types:
1571
 
 *      OP_ROOT: contains information that applies to the entire trie.
1572
 
 *              it can only appear as root node, and not as child node.
1573
 
 *              On child fail: match has failed
1574
 
 *              This is NOT a recursive call point
1575
 
 *      OP_CHAR_BINARY_SEARCH: chooses a sub-trie, based on current character; 
1576
 
 *                      using binary-search
1577
 
 *                      On fail: go to node indicated by fail_action, or if 
1578
 
 *                              fail_action is NULL, to parent
1579
 
 *                      On child fail: execute fail of current node
1580
 
 *      OP_ALTERNATIVES: try matching each sub-trie, if all fails execute fail
1581
 
 *              action of current node. This is a recursive call point
1582
 
 *      OP_CHAR_REPEAT: repeat specified character a number of times in range:
1583
 
 *              [min_range, max_range]; 
1584
 
 *                      min_range: 0 for * operator
1585
 
 *                                 1 for + operator
1586
 
 *                      max_range: remaining length of current string for *,+ operator
1587
 
 *                      OR: min_range, max_range as specified by the {min,max} operator
1588
 
 *              On fail: fail_action, or parent if NULL
1589
 
 *              On child fail: reduce match repeat count, try again on child, if
1590
 
 *                      repeat count<min_range, execute fail of current node
1591
 
 *              Also has a bitmap on what characters are accepted beyond it,
1592
 
 *              as an optimizations for the case, when a maximum match isn't possible
1593
 
 *              Not recomended to use this when min_range=max_range=1
1594
 
 *              This is a recursive call point
1595
 
 *      OP_DOT_REPEAT: like OP_CHAR_REPEAT but accept any character
1596
 
 *              Not recomended to use this when min_range=max_range=1
1597
 
 *              This is a recursive call point
1598
 
 *      OP_GROUP_START: start of a group "(", also specifies group flags:
1599
 
 *              repeat: is_repeat, min_range, max_range
1600
 
 *              This is a recursive call point if is_repeat
1601
 
 *      OP_GROUP_END: end of group ")"
1602
 
 *      OP_STRCMP: compare with specified string,
1603
 
 *                 it has an array of fail actions, one for each character
1604
 
 *                 default fail action: go to parent
1605
 
 *                 This was introduced from memory- and speed-efficiency
1606
 
 *                 considerations. 
1607
 
 *      OP_CHAR_CLASS_REPEAT: match character with character class
1608
 
 *              min_range, max_range
1609
 
 *              For a single character class min_range=max_range=1
1610
 
 *      OP_MATCH_OK: match has succeeded
1611
 
 *
1612
 
 * TODO: maybe we'll need a more efficient way to choose between character classes.
1613
 
 *       OP_DOT_REPEAT/OP_CHAR_REPEAT needs a more efficient specification of its failure function, instead of using
1614
 
 *       backtracking approach.
1615
 
 *
1616
 
 * The failure function/action is just for optimization, the match algorithms works even without it.
1617
 
 * TODO:In this first draft fail action will always be NULL, in a later version I'll implement fail actions too.
1618
 
 *
1619
 
 *
1620
 
 */ 
1621
 
 
1622
 
#include <string.h>
1623
 
#include "cltypes.h"
1624
 
#include "others.h"
1625
 
 
1626
 
/* offsetof is not ANSI C */
1627
 
#ifndef offsetof
1628
 
#   define offsetof(type,memb) ((size_t)&((type*)0)->memb)
1629
 
#endif
1630
 
 
1631
 
#define container_of(ptr, type, member) ( (type *) ((char *)ptr - offsetof(type, member)) )
1632
 
#define container_of_const(ptr, type, member) ( (type *) ((const char *)ptr - offsetof(type, member)) )
1633
 
 
1634
 
enum trie_node_type {
1635
 
        OP_ROOT,
1636
 
        OP_CHAR_BINARY_SEARCH,
1637
 
        OP_ALTERNATIVES,
1638
 
        OP_CHAR_REPEAT,
1639
 
        OP_DOT_REPEAT,
1640
 
        OP_CHAR_CLASS_REPEAT,
1641
 
        OP_STRCMP,
1642
 
        OP_GROUP_START,
1643
 
        OP_GROUP_END,
1644
 
        OP_MATCH_OK
1645
 
};
1646
 
 
1647
 
 
1648
 
/* the comon definition of a trie node */
1649
 
struct trie_node
1650
 
{
1651
 
        enum trie_node_type type;
1652
 
};
1653
 
 
1654
 
struct trie_node_match {
1655
 
        struct trie_node node;
1656
 
        /* additional match info */
1657
 
};
1658
 
 
1659
 
struct trie_node_root
1660
 
{
1661
 
        struct trie_node node;
1662
 
        struct trie_node* child;
1663
 
};
1664
 
 
1665
 
struct trie_node_binary_search
1666
 
{
1667
 
        struct trie_node node;
1668
 
        uint8_t children_count;/* number of children to search among -1! 255 = 256 children*/   
1669
 
        struct trie_node* fail_action;
1670
 
        unsigned char* char_choices;/* children_count elements */
1671
 
        struct trie_node** children;/*children_count elements */
1672
 
};
1673
 
 
1674
 
struct trie_node_alternatives
1675
 
{
1676
 
        struct trie_node node;
1677
 
        uint32_t alternatives_count;
1678
 
        /* need to support node with lots of alternatives, 
1679
 
         * for a worst-case scenario where each line ends up as a sub-trie of OP_ALTERNATIVES*/
1680
 
        struct trie_node* fail_action;
1681
 
        struct trie_node** children;
1682
 
};
1683
 
 
1684
 
struct trie_node_char_repeat
1685
 
{
1686
 
        struct trie_node node;
1687
 
        unsigned char character;
1688
 
        uint8_t range_min, range_max;/* according to POSIX we need not support more than 255 repetitions*/
1689
 
        struct char_bitmap* bitmap_accept_after;/* bitmap of characters accepted after this, 
1690
 
                                                   to optimize repeat < max_range case; if its NULL
1691
 
                                                   there is no optimization*/
1692
 
        struct trie_node* child;
1693
 
        struct trie_node* fail_action;
1694
 
};
1695
 
 
1696
 
struct trie_node_dot_repeat
1697
 
{
1698
 
        struct trie_node node;
1699
 
        uint8_t range_min, range_max;/* according to POSIX we need not support more than 255 repetitions*/
1700
 
        struct char_bitmap* bitmap_accept_after;/* bitmap of characters accepted after this, 
1701
 
                                                   to optimize repeat < max_range case; if its NULL
1702
 
                                                   there is no optimization*/
1703
 
        struct trie_node* child;
1704
 
        struct trie_node* fail_action;
1705
 
};
1706
 
 
1707
 
struct trie_node_group_start
1708
 
{
1709
 
        struct trie_node node;
1710
 
        uint8_t range_min, range_max;/* if range_min==range_max==1, then this is NOT a repeat, thus not a recursive call point*/
1711
 
        struct trie_node* child;
1712
 
        struct trie_node* fail_action;  
1713
 
};
1714
 
 
1715
 
struct trie_node_group_end
1716
 
{
1717
 
        struct trie_node node;
1718
 
        struct trie_node* child;
1719
 
};
1720
 
 
1721
 
struct trie_node_strcmp
1722
 
{
1723
 
        struct trie_node node;
1724
 
        uint8_t string_length;/* for longer strings a sequence of node_strcmp should be used */
1725
 
        unsigned char* string;
1726
 
        struct trie_node* child;
1727
 
        struct trie_node** fail_actions;/* this has string_length elements, or NULL if no fail_actions are computed */
1728
 
};
1729
 
 
1730
 
struct trie_node_char_class_repeat
1731
 
{
1732
 
        struct trie_node node;
1733
 
        struct char_bitmap* bitmap;
1734
 
        struct char_bitmap* bitmap_accept_after;
1735
 
        uint8_t range_min, range_max;
1736
 
        struct trie_node* child;
1737
 
        struct trie_node* fail_action;
1738
 
};
1739
 
 
1740
 
static inline int bitmap_accepts(const struct char_bitmap* bitmap, const char c)
1741
 
{
1742
 
        /* TODO: check if c is accepted by bitmap */
1743
 
        return 0;
1744
 
}
1745
 
 
1746
 
#define MATCH_FAILED 0
1747
 
#define MATCH_OK     1
1748
 
 
1749
 
#define FAIL_ACTION( fail_node ) (*fail_action = (fail_node), MATCH_FAILED)
1750
 
 
1751
 
 
1752
 
#ifndef MIN
1753
 
#define MIN(a,b) ((a)<(b) ? (a) : (b))
1754
 
#endif
1755
 
 
1756
 
static int match_node(const struct trie_node* node, const unsigned char* text, const unsigned char* text_end, const struct trie_node** fail_action);
1757
 
 
1758
 
static int match_repeat(const unsigned char* text, const unsigned char* text_end, const size_t range_min, const size_t repeat_start, 
1759
 
                const struct char_bitmap* bitmap_accept_after, const struct trie_node* child, const struct trie_node** fail_action,
1760
 
                const struct trie_node* this_fail_action)
1761
 
{
1762
 
        size_t i;
1763
 
        for(i = repeat_start;i > range_min;i--) {
1764
 
                if(!bitmap_accept_after || bitmap_accepts( bitmap_accept_after, text[i-1])) {
1765
 
                        int rc = match_node(child, &text[i], text_end, fail_action);
1766
 
                        /* ignore fail_action for now, we have the bitmap_accepts_after optimization */
1767
 
                        if(rc) {
1768
 
                                return MATCH_OK;
1769
 
                        }
1770
 
                }                                               
1771
 
        }
1772
 
        if(!range_min) {
1773
 
                /* this match is optional, try child only */
1774
 
                int rc = match_node(child, text, text_end, fail_action);
1775
 
                if(rc) {
1776
 
                        return MATCH_OK;
1777
 
                }
1778
 
        }
1779
 
        return FAIL_ACTION(this_fail_action);
1780
 
}
1781
 
 
1782
 
/* text_end points to \0 in text */
1783
 
static int match_node(const struct trie_node* node, const unsigned char* text, const unsigned char* text_end, const struct trie_node** fail_action)
1784
 
{
1785
 
        while(node && text < text_end) {        
1786
 
                switch(node->type) {
1787
 
                        case OP_ROOT:
1788
 
                                {       
1789
 
                                        const struct trie_node_root* root_node = container_of_const(node, const struct trie_node_root, node);
1790
 
                                        node = root_node->child;
1791
 
                                        break;
1792
 
                                }
1793
 
                        case OP_CHAR_BINARY_SEARCH:
1794
 
                                {                                       
1795
 
                                        const struct trie_node_binary_search* bin_node = container_of_const(node, const struct trie_node_binary_search, node);
1796
 
                                        const unsigned char csearch = *text;
1797
 
                                        size_t mid, left = 0, right = bin_node->children_count-1;                                       
1798
 
                                        while(left<=right) {
1799
 
                                                mid = left+(right-left)/2;
1800
 
                                                if(bin_node->char_choices[mid] == csearch)
1801
 
                                                        break;
1802
 
                                                else if(bin_node->char_choices[mid] < csearch)
1803
 
                                                        left = mid+1;
1804
 
                                                else
1805
 
                                                        right = mid-1;
1806
 
                                        }
1807
 
                                        if(left <= right) {
1808
 
                                                /* match successful */
1809
 
                                                node = bin_node->children[mid];
1810
 
                                                ++text;
1811
 
                                        }
1812
 
                                        else {
1813
 
                                                return FAIL_ACTION( bin_node->fail_action );
1814
 
                                        }
1815
 
                                        break;
1816
 
                                }
1817
 
                        case OP_ALTERNATIVES:
1818
 
                                {
1819
 
                                        const struct trie_node_alternatives* alt_node = container_of_const(node, const struct trie_node_alternatives, node);
1820
 
                                        size_t i;
1821
 
                                        *fail_action = NULL;
1822
 
                                        for(i=0;i < alt_node->alternatives_count;i++) {
1823
 
                                                int rc = match_node(alt_node->children[i], text, text_end, fail_action);
1824
 
                                                if(rc) {                                                        
1825
 
                                                        return MATCH_OK;
1826
 
                                                }
1827
 
                                                /* supporting fail_actions is tricky,
1828
 
                                                 *  if we just go to the node specified, what happens if the match fails, and no
1829
 
                                                 *  further fail_action is specified? We should know where to continue the search.
1830
 
                                                 * For now fail_action isn't supported for OP_ALTERNATIVES*/                                            
1831
 
                                        }
1832
 
                                        break;
1833
 
                                }
1834
 
                        case OP_CHAR_REPEAT:
1835
 
                                {
1836
 
                                        const struct trie_node_char_repeat* char_rep_node = container_of_const(node, const struct trie_node_char_repeat, node);
1837
 
                                        const size_t max_len = MIN( text_end - text, char_rep_node->range_max-1);
1838
 
                                        /* todo: what about the 8 bit limitation of range_max, and what about inf (+,*)? */
1839
 
                                        const char caccept = char_rep_node->character;
1840
 
                                        size_t rep;
1841
 
 
1842
 
                                        if(max_len < char_rep_node->range_min)
1843
 
                                                return FAIL_ACTION(char_rep_node->fail_action);
1844
 
 
1845
 
                                        for(rep=0;rep < max_len;rep++) {
1846
 
                                                if(text[rep] != caccept) {
1847
 
                                                        break;
1848
 
                                                }
1849
 
                                        }
1850
 
 
1851
 
                                        return match_repeat(text, text_end, char_rep_node->range_min, rep,
1852
 
                                                        char_rep_node->bitmap_accept_after, char_rep_node->child, fail_action,
1853
 
                                                        char_rep_node->fail_action);
1854
 
                                }
1855
 
                        case OP_DOT_REPEAT:
1856
 
                                {
1857
 
                                        const struct trie_node_dot_repeat* dot_rep_node = container_of_const(node, const struct trie_node_dot_repeat, node);
1858
 
                                        const size_t max_len = MIN( text_end - text, dot_rep_node->range_max-1);
1859
 
                                        /* todo: what about the 8 bit limitation of range_max, and what about inf (+,*)? */
1860
 
 
1861
 
                                        if(max_len < dot_rep_node->range_min)
1862
 
                                                return FAIL_ACTION(dot_rep_node->fail_action);
1863
 
 
1864
 
                                        return match_repeat(text, text_end, dot_rep_node->range_min, max_len,
1865
 
                                                        dot_rep_node->bitmap_accept_after, dot_rep_node->child, fail_action,
1866
 
                                                        dot_rep_node->fail_action);
1867
 
                                }
1868
 
                        case OP_CHAR_CLASS_REPEAT:
1869
 
                                {
1870
 
                                        const struct trie_node_char_class_repeat* class_rep_node = container_of_const(node, const struct trie_node_char_class_repeat, node);
1871
 
                                        const size_t max_len = MIN( text_end - text, class_rep_node->range_max-1);
1872
 
                                        /* todo: what about the 8 bit limitation of range_max, and what about inf (+,*)? */
1873
 
                                        size_t rep;
1874
 
 
1875
 
                                        if(max_len < class_rep_node->range_min)
1876
 
                                                return FAIL_ACTION(class_rep_node->fail_action);
1877
 
 
1878
 
                                        for(rep=0;rep < max_len;rep++) {
1879
 
                                                if(!bitmap_accepts( class_rep_node->bitmap, text[rep])) {
1880
 
                                                        break;
1881
 
                                                }
1882
 
                                        }
1883
 
 
1884
 
                                        return match_repeat(text, text_end, class_rep_node->range_min, rep,
1885
 
                                                        class_rep_node->bitmap_accept_after, class_rep_node->child, fail_action,
1886
 
                                                        class_rep_node->fail_action);
1887
 
                                        break;
1888
 
                                }
1889
 
                        case OP_STRCMP:
1890
 
                                {
1891
 
                                        const struct trie_node_strcmp* strcmp_node = container_of_const(node, const struct trie_node_strcmp, node);
1892
 
                                        size_t i;
1893
 
                                        if(strcmp_node->fail_actions) {
1894
 
                                                const size_t max_len = MIN(strcmp_node->string_length, text_end-text);
1895
 
                                                /* we don't use strncmp, because we need the exact match-fail point */
1896
 
                                                for(i=0;i < max_len;i++) {
1897
 
                                                        if(text[i] != strcmp_node->string[i]) {
1898
 
                                                                return FAIL_ACTION( strcmp_node->fail_actions[i] );
1899
 
                                                        }
1900
 
                                                }
1901
 
                                                if(max_len < strcmp_node->string_length) {
1902
 
                                                        /* failed, because text was shorter */
1903
 
                                                        return FAIL_ACTION( strcmp_node->fail_actions[max_len] );
1904
 
                                                }
1905
 
                                        }
1906
 
                                        else {
1907
 
                                                /* no fail_actions computed, some shortcuts possible on compare */
1908
 
                                                if((text_end - text < strcmp_node->string_length) ||
1909
 
                                                                strncmp((const char*)text, (const char*)strcmp_node->string, strcmp_node->string_length)) {
1910
 
 
1911
 
                                                        return FAIL_ACTION( NULL );
1912
 
                                                }
1913
 
                                        }
1914
 
                                        /* match successful */
1915
 
                                        node = strcmp_node->child;
1916
 
                                        text += strcmp_node->string_length;
1917
 
                                        break;
1918
 
                                }
1919
 
                        case OP_GROUP_START:
1920
 
                                {
1921
 
                                        const struct trie_node_group_start* group_start_node = container_of_const(node, const struct trie_node_group_start, node);
1922
 
                                        /* TODO: implement */
1923
 
                                        break;
1924
 
                                }
1925
 
                        case OP_GROUP_END:
1926
 
                                {                                       
1927
 
                                        const struct trie_node_group_end* group_end_node = container_of_const(node, const struct trie_node_group_end, node);
1928
 
                                        /* TODO: implement */
1929
 
                                        break;
1930
 
                                }
1931
 
                        case OP_MATCH_OK:
1932
 
                                {
1933
 
                                        return MATCH_OK;
1934
 
                                }
1935
 
                }
1936
 
        }
1937
 
        /* if fail_action was NULL, or text ended*/
1938
 
        return MATCH_FAILED;
1939
 
}
1940
 
 
1941
 
#endif
1942
 
 
 
599
static int add_newsuffix(struct regex_matcher *matcher, struct regex_list *info, const char *suffix, size_t len)
 
600
{
 
601
        struct cli_matcher *root = &matcher->suffixes;
 
602
        struct cli_ac_patt *new = cli_calloc(1,sizeof(*new));
 
603
        size_t i;
 
604
        int ret;
 
605
 
 
606
        if(!new)
 
607
                return CL_EMEM;
 
608
        assert(root && suffix);
 
609
 
 
610
        new->rtype = 0;
 
611
        new->type = 0;
 
612
        new->sigid = 0;
 
613
        new->parts = 0;
 
614
        new->partno = 0;
 
615
        new->mindist = 0;
 
616
        new->maxdist = 0;
 
617
        new->offset = 0;
 
618
        new->length = len;
 
619
 
 
620
        new->ch[0] = new->ch[1] |= CLI_MATCH_IGNORE;
 
621
        if(new->length > root->maxpatlen)
 
622
                root->maxpatlen = new->length;
 
623
 
 
624
        new->pattern = cli_malloc(sizeof(new->pattern[0])*len);
 
625
        if(!new->pattern) {
 
626
                free(new);
 
627
                return CL_EMEM;
 
628
        }
 
629
        for(i=0;i<len;i++)
 
630
                new->pattern[i] = suffix[i];/*new->pattern is short int* */
 
631
 
 
632
        new->customdata = info;
 
633
        new->virname = NULL;
 
634
        if((ret = cli_ac_addpatt(root,new))) {
 
635
                free(new->pattern);
 
636
                free(new);
 
637
                return ret;
 
638
        }
 
639
        SO_preprocess_add(&matcher->filter, (const unsigned char*)suffix, len);
 
640
        return CL_SUCCESS;
 
641
}
 
642
 
 
643
#define MODULE "regex_list: "
 
644
/* ------ load a regex, determine suffix, determine suffix2regexlist map ---- */
 
645
 
 
646
static void list_add_tail(struct regex_list_ht *ht, struct regex_list *regex)
 
647
{
 
648
        if(!ht->head)
 
649
                ht->head = regex;
 
650
        if(ht->tail) {
 
651
                ht->tail->nxt = regex;
 
652
        }
 
653
        ht->tail = regex;
 
654
}
 
655
 
 
656
/* returns 0 on success, clamav error code otherwise */
 
657
static int add_pattern_suffix(void *cbdata, const char *suffix, size_t suffix_len, const struct regex_list *iregex)
 
658
{
 
659
        struct regex_matcher *matcher = cbdata;
 
660
        struct regex_list *regex = cli_malloc(sizeof(*regex));
 
661
        const struct element *el;
 
662
 
 
663
        assert(matcher);
 
664
        if(!regex)
 
665
                return CL_EMEM;
 
666
        regex->pattern = iregex->pattern ? cli_strdup(iregex->pattern) : NULL;
 
667
        regex->preg = iregex->preg;
 
668
        regex->nxt = NULL;
 
669
        el = hashtab_find(&matcher->suffix_hash, suffix, suffix_len);
 
670
        /* TODO: what if suffixes are prefixes of eachother and only one will
 
671
         * match? */
 
672
        if(el) {
 
673
                /* existing suffix */
 
674
                assert((size_t)el->data < matcher->suffix_cnt);
 
675
                list_add_tail(&matcher->suffix_regexes[el->data], regex);
 
676
                cli_dbgmsg(MODULE "added new regex to existing suffix %s: %s\n", suffix, regex->pattern);
 
677
        } else {
 
678
                /* new suffix */
 
679
                size_t n = matcher->suffix_cnt++;
 
680
                el = hashtab_insert(&matcher->suffix_hash, suffix, suffix_len, n);
 
681
                matcher->suffix_regexes = cli_realloc(matcher->suffix_regexes, (n+1)*sizeof(*matcher->suffix_regexes));
 
682
                if(!matcher->suffix_regexes)
 
683
                        return CL_EMEM;
 
684
                matcher->suffix_regexes[n].tail = regex;
 
685
                matcher->suffix_regexes[n].head = regex;
 
686
                add_newsuffix(matcher, regex, suffix, suffix_len);
 
687
                cli_dbgmsg(MODULE "added new suffix %s, for regex: %s\n", suffix, regex->pattern);
 
688
        }
 
689
        return 0;
 
690
}
 
691
 
 
692
static size_t reverse_string(char *pattern)
 
693
{
 
694
        size_t len = strlen(pattern);
 
695
        size_t i;
 
696
        for(i=0; i < (len/2); i++) {
 
697
                char aux = pattern[i];
 
698
                pattern[i] = pattern[len-i-1];
 
699
                pattern[len-i-1] = aux;
 
700
        }
 
701
        return len;
 
702
}
 
703
 
 
704
static regex_t *new_preg(struct regex_matcher *matcher)
 
705
{
 
706
        regex_t *r;
 
707
        matcher->all_pregs = cli_realloc(matcher->all_pregs, ++matcher->regex_cnt * sizeof(*matcher->all_pregs));
 
708
        if(!matcher->all_pregs)
 
709
                return NULL;
 
710
        r = cli_malloc(sizeof(*r));
 
711
        if(!r)
 
712
                return NULL;
 
713
        matcher->all_pregs[matcher->regex_cnt-1] = r;
 
714
        return r;
 
715
}
 
716
 
 
717
static int add_static_pattern(struct regex_matcher *matcher, char* pattern)
 
718
{
 
719
        size_t len;
 
720
        struct regex_list regex;
 
721
        int rc;
 
722
 
 
723
        len = reverse_string(pattern);
 
724
        regex.nxt = NULL;
 
725
        regex.pattern = cli_strdup(pattern);
 
726
        regex.preg = NULL;
 
727
        rc = add_pattern_suffix(matcher, pattern, len, &regex);
 
728
        free(regex.pattern);
 
729
        return rc;
 
730
}
 
731
 
 
732
int regex_list_add_pattern(struct regex_matcher *matcher, char *pattern)
 
733
{
 
734
        int rc;
 
735
        regex_t *preg;
 
736
        size_t len;
 
737
        /* we only match the host, so remove useless stuff */
 
738
        const char remove_end[] = "([/?].*)?/";
 
739
        const char remove_end2[] = "([/?].*)/";
 
740
 
 
741
        len = strlen(pattern);
 
742
        if(len > sizeof(remove_end)) {
 
743
                if(strncmp(&pattern[len - sizeof(remove_end)+1], remove_end, sizeof(remove_end)-1) == 0) {
 
744
                        len -= sizeof(remove_end) - 1;
 
745
                        pattern[len++]='/';
 
746
                }
 
747
                if(strncmp(&pattern[len - sizeof(remove_end2)+1], remove_end2, sizeof(remove_end2)-1) == 0) {
 
748
                        len -= sizeof(remove_end2) - 1;
 
749
                        pattern[len++]='/';
 
750
                }
 
751
        }
 
752
        pattern[len] = '\0';
 
753
 
 
754
        preg = new_preg(matcher);
 
755
        if(!preg)
 
756
                return CL_EMEM;
 
757
 
 
758
        rc = cli_regex2suffix(pattern, preg, add_pattern_suffix, (void*)matcher);
 
759
        if(rc) {
 
760
                cli_regfree(preg);
 
761
        }
 
762
 
 
763
        return rc;
 
764
}