~ubuntu-branches/ubuntu/raring/clamav/raring

« back to all changes in this revision

Viewing changes to libclamav/regex_list.c

  • Committer: Bazaar Package Importer
  • Author(s): Stephen Gran
  • Date: 2008-09-05 17:25:34 UTC
  • mfrom: (0.35.1 lenny)
  • Revision ID: james.westby@ubuntu.com-20080905172534-yi3f8fkye1o7u1r3
* New upstream version (closes: #497662, #497773)
  - lots of new options for clamd.conf
  - fixes CVEs CVE-2008-3912, CVE-2008-3913, CVE-2008-3914, and
    CVE-2008-1389
* No longer supports --unzip option, so typo is gone (closes: #496276)
* Translations:
  - sv (thanks Martin Bagge <brother@bsnet.se>) (closes: #491760)

Show diffs side-by-side

added added

removed removed

Lines of Context:
42
42
 
43
43
#include <limits.h>
44
44
#include <sys/types.h>
 
45
#include <assert.h>
 
46
 
45
47
 
46
48
#include "regex/regex.h"
47
49
 
53
55
#include "matcher.h"
54
56
#include "str.h"
55
57
#include "readdb.h"
56
 
 
57
 
/*Tree*/
58
 
enum token_op_t {OP_CHAR,OP_STDCLASS,OP_CUSTOMCLASS,OP_DOT,OP_LEAF,OP_ROOT,OP_PARCLOSE};
59
 
typedef unsigned char* char_bitmap_p;
60
 
/*
61
 
 *
62
 
 * OP_CHAR: 1 character, c = character
63
 
 * complex stuff:
64
 
 * OP_STDCLASS: standard character class, c = char class, class: 1<<(index into std_class of class name)
65
 
 * OP_CUSTOMCLASS: custom character class, first pointer in ptr array is a pointer to the bitmap table for this class
66
 
 * OP_DOT: single . matching any character except \n
67
 
 * OP_LEAF: this is a leaf node, reinterpret structure
68
 
 */
69
 
struct tree_node {
70
 
        struct tree_node* next;/* next regex/complex sibling, or parent, if no more siblings , can't be NULL except for root node*/
71
 
        union {
72
 
                struct tree_node** children;/* alternatives nr. of children, followed by (a null pointer terminated) regex leaf node pointers) */
73
 
                char_bitmap_p* bitmap;
74
 
                struct leaf_info*  leaf;
75
 
        } u;
76
 
        enum token_op_t op;
77
 
        unsigned char c;
78
 
        char alternatives;/* number of (non-regex) children of node, i.e. sizeof(children)*/
79
 
        char listend;/* no more siblings, next pointer is pointer to parent*/
80
 
};
81
 
 
82
 
struct leaf_info {
83
 
        char* info;/* what does it mean that we reached the leaf...*/
84
 
        regex_t* preg;/* this is NULL if leaf node, and non-regex*/
85
 
};
86
 
 
87
 
/* Character classes */
88
 
static const char* std_class[] = {
89
 
        "[:alnum:]",
90
 
        "[:digit:]",
91
 
        "[:punct:]",
92
 
        "[:alpha:]",
93
 
        "[:graph:]",
94
 
        "[:space:]",
95
 
        "[:blank:]",
96
 
        "[:lower:]", 
97
 
        "[:upper:]",
98
 
        "[:cntrl:]",
99
 
        "[:print:]",
100
 
        "[:xdigit:]"
101
 
        /* don't change the order of these strings, unless you change them in generate_tables.c too, and regenerate the tables*/
102
 
};
103
 
 
104
 
 
105
 
#define STD_CLASS_CNT sizeof(std_class)/sizeof(std_class[0])
106
 
 
107
 
/* generated by contrib/phishing/generate_tables.c */
108
 
static const unsigned char char_class_bitmap[STD_CLASS_CNT][32] = {
109
 
        {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x03, 
110
 
         0xfe, 0xff, 0xff, 0x07, 0xfe, 0xff, 0xff, 0x07, 
111
 
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
112
 
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
113
 
 
114
 
        {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x03, 
115
 
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
116
 
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
117
 
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
118
 
 
119
 
        {0x00, 0x00, 0x00, 0x00, 0xfe, 0xff, 0x00, 0xfc, 
120
 
         0x01, 0x00, 0x00, 0xf8, 0x01, 0x00, 0x00, 0x78, 
121
 
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
122
 
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
123
 
 
124
 
        {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
125
 
         0xfe, 0xff, 0xff, 0x07, 0xfe, 0xff, 0xff, 0x07, 
126
 
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
127
 
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
128
 
 
129
 
        {0x00, 0x00, 0x00, 0x00, 0xfe, 0xff, 0xff, 0xff, 
130
 
         0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f, 
131
 
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
132
 
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
133
 
 
134
 
        {0x00, 0x3e, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 
135
 
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
136
 
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
137
 
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
138
 
 
139
 
        {0x00, 0x02, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 
140
 
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
141
 
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
142
 
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
143
 
 
144
 
        {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
145
 
         0x00, 0x00, 0x00, 0x00, 0xfe, 0xff, 0xff, 0x07, 
146
 
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
147
 
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
148
 
 
149
 
        {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
150
 
         0xfe, 0xff, 0xff, 0x07, 0x00, 0x00, 0x00, 0x00, 
151
 
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
152
 
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
153
 
 
154
 
        {0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 
155
 
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 
156
 
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
157
 
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
158
 
 
159
 
        {0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 
160
 
         0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f, 
161
 
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
162
 
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
163
 
 
164
 
        {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x03, 
165
 
         0x7e, 0x00, 0x00, 0x00, 0x7e, 0x00, 0x00, 0x00, 
166
 
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
167
 
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}
168
 
};
169
 
 
170
 
static const unsigned short int char_class[256] = {
171
 
        0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x260, 0x220, 0x220, 0x220, 0x220, 0x200, 0x200, 
172
 
        0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 
173
 
        0x460, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 
174
 
        0xc13, 0xc13, 0xc13, 0xc13, 0xc13, 0xc13, 0xc13, 0xc13, 0xc13, 0xc13, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 
175
 
        0x414, 0xd19, 0xd19, 0xd19, 0xd19, 0xd19, 0xd19, 0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 
176
 
        0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 0x414, 0x414, 0x414, 0x414, 0x414, 
177
 
        0x414, 0xc99, 0xc99, 0xc99, 0xc99, 0xc99, 0xc99, 0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 
178
 
        0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 0x414, 0x414, 0x414, 0x414, 0x200, 
179
 
        0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 
180
 
        0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 
181
 
        0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 
182
 
        0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 
183
 
        0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 
184
 
        0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 
185
 
        0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 
186
 
        0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000
187
 
};
188
 
 
189
 
static const size_t std_class_cnt =  sizeof(std_class)/sizeof(std_class[0]);
190
 
 
 
58
#include "jsparse/textbuf.h"
 
59
#include "regex_suffix.h"
191
60
/* Prototypes */
192
 
static int add_pattern(struct regex_matcher* matcher,const unsigned char* pat,const char* info,int hostOnly);
193
 
static int match_node(struct tree_node* node,const unsigned char* c,size_t len,const char** info);
194
 
static void destroy_tree(struct regex_matcher* matcher);
195
 
static struct tree_node* tree_root_alloc(void);
196
 
static int build_regex_list(struct regex_matcher* matcher);
197
 
static void stack_destroy(struct node_stack* stack);
198
 
 
199
 
#ifndef NDEBUG
200
 
void dump_tree(struct tree_node* root);
201
 
#endif
 
61
static regex_t *new_preg(struct regex_matcher *matcher);
 
62
static size_t reverse_string(char *pattern);
 
63
static int add_pattern_suffix(void *cbdata, const char *suffix, size_t suffix_len, const struct regex_list *regex);
 
64
static int add_static_pattern(struct regex_matcher *matcher, char* pattern);
 
65
/* ---------- */
 
66
 
 
67
/* ----- shift-or filtering -------------- */
 
68
 
 
69
#define BITMAP_CONTAINS(bmap, val) ((bmap)[(val) >> 5] & (1 << ((val) & 0x1f)))
 
70
#define BITMAP_INSERT(bmap, val) ((bmap)[(val) >> 5] |= (1 << ((val) & 0x1f)))
 
71
 
 
72
static void SO_init(struct filter *m)
 
73
{
 
74
        memset(m->B, ~0, sizeof(m->B));
 
75
        memset(m->end, ~0, sizeof(m->end));
 
76
        memset(m->end_fast, ~0, sizeof(m->end_fast));
 
77
}
 
78
 
 
79
/* because we use uint32_t */
 
80
#define MAXSOPATLEN 32
 
81
 
 
82
/* merge another pattern into the filter
 
83
 * add('abc'); add('bcd'); will match [ab][bc][cd] */
 
84
static int SO_preprocess_add(struct filter *m, const unsigned char *pattern, size_t len)
 
85
{
 
86
        uint16_t q;
 
87
        uint8_t j;
 
88
 
 
89
        /* cut length, and make it modulo 2 */
 
90
        if(len > MAXSOPATLEN) {
 
91
                len = MAXSOPATLEN;
 
92
        } else {
 
93
                /* we use 2-grams, must be multiple of 2 */
 
94
                len = len & ~1;
 
95
        }
 
96
        if(!len)
 
97
                return 0;
 
98
 
 
99
        /* Shift-Or like preprocessing */
 
100
        for(j=0;j < len-1;j++) {
 
101
                /* use overlapping 2-grams. We need them overlapping because matching can start at any position */
 
102
                q = cli_readint16( &pattern[j] );
 
103
                m->B[q] &= ~(1 << j);
 
104
        }
 
105
        /* we use variable length patterns, use last character to mark pattern end,
 
106
         * can lead to false positives.*/
 
107
        /* mark that at state j, the q-gram q can end the pattern */
 
108
        if(j) {
 
109
                j--;
 
110
                m->end[q] &= ~(1 << j);
 
111
                m->end_fast[pattern[j+1]] &= ~(1<<j);
 
112
        }
 
113
        return 0;
 
114
}
 
115
 
 
116
/* this is like a FSM, with multiple active states at the same time.
 
117
 * each bit in "state" means an active state, when a char is encountered
 
118
 * we determine what states can remain active.
 
119
 * The FSM transition rules are expressed as bit-masks */
 
120
long SO_search(const struct filter *m, const unsigned char *data, unsigned long len)
 
121
{
 
122
        size_t j;
 
123
        uint32_t state = ~0;
 
124
        const uint32_t *B = m->B;
 
125
        const uint32_t *End = m->end;
 
126
        const uint32_t *EndFast = m->end_fast;
 
127
 
 
128
        /* cut length, and make it modulo 2 */
 
129
        if(len > MAXSOPATLEN) {
 
130
                len = MAXSOPATLEN;
 
131
        } else {
 
132
                /* we use 2-grams, must be multiple of 2 */
 
133
                len = len & ~1;
 
134
        }
 
135
        if(!len) return -1;
 
136
        /* Shift-Or like search algorithm */
 
137
        for(j=0;j < len-1; j++) {
 
138
                const uint16_t q0 = cli_readint16( &data[j] );
 
139
                uint32_t match_end;
 
140
                state = (state << 1) | B[q0];
 
141
                /* state marks with a 0 bit all active states
 
142
                 * End[q0] marks with a 0 bit all states where the q-gram 'q' can end a pattern
 
143
                 * if we got two 0's at matching positions, it means we encountered a pattern's end */
 
144
                match_end = state | EndFast[data[j+1]];
 
145
                if((match_end != 0xffffffff) && (state | End[q0]) !=  0xffffffff) {
 
146
                        /* note: we rely on short-circuit eval here, we only evaluate and fetch End[q0], if
 
147
                         * end_fast has matched. This reduces cache pressure on End[], and allows us to keep the working
 
148
                         * set inside L2 */
 
149
 
 
150
                        /* if state is reachable, and this character can finish a pattern, assume match */
 
151
                        /* to reduce false positives check if qgram can finish the pattern */
 
152
                        /* return position of probable match */
 
153
                        /* find first 0 starting from MSB, the position of that bit as counted from LSB, is the length of the
 
154
                         * longest pattern that could match */
 
155
                        return j >= MAXSOPATLEN  ? j - MAXSOPATLEN : 0;
 
156
                }
 
157
        }
 
158
        /* no match */
 
159
        return -1;
 
160
}
 
161
 
 
162
/* ----------------------------------------------------------- */
 
163
 
202
164
 
203
165
#define MATCH_SUCCESS 0 
204
166
#define MATCH_FAILED  -1
233
195
        return (pos>0 && !str[realpos]) ? '\0' : str[realpos>0?realpos-1:0];
234
196
}
235
197
 
 
198
static int validate_subdomain(const struct regex_list *regex, const struct pre_fixup_info *pre_fixup, const char *buffer, size_t buffer_len, char *real_url, size_t real_len, char *orig_real_url)
 
199
{
 
200
        char c;
 
201
        size_t match_len;
 
202
 
 
203
        if(!regex || !regex->pattern)
 
204
                return 0;
 
205
        match_len = strlen(regex->pattern);
 
206
        if(((c=get_char_at_pos_with_skip(pre_fixup,buffer,buffer_len+1))==' ' || c=='\0' || c=='/' || c=='?') &&
 
207
                        (match_len == buffer_len || /* full match */
 
208
                         (match_len < buffer_len &&
 
209
                          ((c=get_char_at_pos_with_skip(pre_fixup,buffer,buffer_len-match_len))=='.' || (c==' ')) )
 
210
                         /* subdomain matched*/)) {
 
211
                /* we have an extra / at the end */
 
212
                if(match_len > 0) match_len--;
 
213
                cli_dbgmsg("Got a match: %s with %s\n", buffer, regex->pattern);
 
214
                cli_dbgmsg("Before inserting .: %s\n", orig_real_url);
 
215
                if(real_len >= match_len + 1) {
 
216
                        const size_t pos = real_len - match_len - 1;
 
217
                        if(real_url[pos] != '.') {
 
218
                                /* we need to shift left, and insert a '.'
 
219
                                 * we have an extra '.' at the beginning inserted by get_host to have room,
 
220
                                 * orig_real_url has to be used here, 
 
221
                                 * because we want to overwrite that extra '.' */
 
222
                                size_t orig_real_len = strlen(orig_real_url);
 
223
                                cli_dbgmsg("No dot here:%s\n",real_url+pos);
 
224
                                real_url = orig_real_url;
 
225
                                memmove(real_url, real_url+1, orig_real_len-match_len-1);
 
226
                                real_url[orig_real_len-match_len-1]='.';
 
227
                                cli_dbgmsg("After inserting .: %s\n", real_url);
 
228
                        }
 
229
                }
 
230
                return 1;
 
231
        }
 
232
        cli_dbgmsg("Ignoring false match: %s with %s, mismatched character: %c\n", buffer, regex->pattern, c);
 
233
        return 0;
 
234
}
 
235
 
236
236
/*
237
237
 * @matcher - matcher structure to use
238
238
 * @real_url - href target
246
246
 * Do not send NULL pointers to this function!!
247
247
 *
248
248
 */
249
 
int regex_list_match(struct regex_matcher* matcher,char* real_url,const char* display_url,const struct pre_fixup_info* pre_fixup,int hostOnly,const char** info,int is_whitelist)
 
249
int regex_list_match(struct regex_matcher* matcher,char* real_url,const char* display_url,const struct pre_fixup_info* pre_fixup,int hostOnly,const char **info, int is_whitelist)
250
250
{
251
251
        char* orig_real_url = real_url;
252
 
        massert(matcher);
253
 
        massert(real_url);
254
 
        massert(display_url);
255
 
        massert(info);
 
252
        struct regex_list *regex;
 
253
        size_t real_len, display_len, buffer_len;
 
254
 
 
255
        assert(matcher);
 
256
        assert(real_url);
 
257
        assert(display_url);
 
258
        *info = NULL;
256
259
        if(!matcher->list_inited)
257
260
                return 0;
258
 
        massert(matcher->list_built);
 
261
        assert(matcher->list_built);
259
262
        /* skip initial '.' inserted by get_host */
260
263
        if(real_url[0] == '.') real_url++;
261
264
        if(display_url[0] == '.') display_url++;
 
265
        real_len    = strlen(real_url);
 
266
        display_len = strlen(display_url);
 
267
        buffer_len  = (hostOnly && !is_whitelist) ? real_len + 1 : real_len + display_len + 1 + 1;
 
268
        if(buffer_len < 3) {
 
269
                /* too short, no match possible */
 
270
                return 0;
 
271
        }
262
272
        {
263
 
                size_t real_len    = strlen(real_url);
264
 
                size_t display_len = strlen(display_url);
265
 
                size_t buffer_len  = (hostOnly && !is_whitelist) ? real_len : real_len + display_len + 1 + (is_whitelist ? 1 : 0);
266
 
                char*  buffer = cli_malloc(buffer_len+1);
267
 
                size_t i;
 
273
                char *buffer = cli_malloc(buffer_len+1);
 
274
                char *bufrev;
268
275
                int rc = 0;
269
276
                struct cli_ac_data mdata;
 
277
                struct cli_ac_result *res = NULL;
270
278
 
271
279
                if(!buffer)
272
280
                        return CL_EMEM;
273
281
 
274
282
                strncpy(buffer,real_url,real_len);
275
 
                buffer[real_len]= (!is_whitelist && hostOnly) ? '\0' : ':';
 
283
                buffer[real_len]= (!is_whitelist && hostOnly) ? '/' : ':';
276
284
                if(!hostOnly || is_whitelist) {
277
285
                        strncpy(buffer+real_len+1,display_url,display_len);
278
 
                        if(is_whitelist)
279
 
                                buffer[buffer_len - 1] = '/';
280
 
                        buffer[buffer_len]=0;
281
286
                }
 
287
                buffer[buffer_len - 1] = '/';
 
288
                buffer[buffer_len]=0;
282
289
                cli_dbgmsg("Looking up in regex_list: %s\n", buffer);
283
290
 
284
 
                if(hostOnly) {
285
 
                        if((rc = cli_ac_initdata(&mdata, 0, AC_DEFAULT_TRACKLEN)))
286
 
                                return rc;
287
 
                        rc = 0;
288
 
 
289
 
                        for(i = 0; i < matcher->root_hosts_cnt; i++) {
290
 
                                /* doesn't need to match terminating \0*/
291
 
                                rc = cli_ac_scanbuff((unsigned char*)buffer,buffer_len,info, &matcher->root_hosts[i] ,&mdata,0,0,-1,NULL,AC_SCAN_VIR,NULL);
292
 
                                cli_ac_freedata(&mdata);
293
 
                                if(rc) {
294
 
                                        char c;
295
 
                                        const char* matched = strchr(*info,':');
296
 
                                        const size_t match_len = matched ? strlen(matched+1) : 0;
297
 
                                        if(((c=get_char_at_pos_with_skip(pre_fixup,buffer,buffer_len+1))==' ' || c=='\0' || c=='/' || c=='?') &&
298
 
                                                (match_len == buffer_len || /* full match */
299
 
                                                (match_len < buffer_len &&
300
 
                                                ((c=get_char_at_pos_with_skip(pre_fixup,buffer,buffer_len-match_len))=='.' || (c==' ')) )
301
 
                                                /* subdomain matched*/)) {
302
 
 
303
 
                                                cli_dbgmsg("Got a match: %s with %s\n", buffer, *info);
304
 
                                                cli_dbgmsg("Before inserting .: %s\n", orig_real_url);
305
 
                                                if(real_len >= match_len + 1) {
306
 
                                                        const size_t pos = real_len - match_len - 1;
307
 
                                                        if(real_url[pos] != '.') {
308
 
                                                                /* we need to shift left, and insert a '.'
309
 
                                                                 * we have an extra '.' at the beginning inserted by get_host to have room,
310
 
                                                                 * orig_real_url has to be used here, 
311
 
                                                                 * because we want to overwrite that extra '.' */
312
 
                                                                size_t orig_real_len = strlen(orig_real_url);
313
 
                                                                cli_dbgmsg("No dot here:%s\n",real_url+pos);
314
 
                                                                real_url = orig_real_url;
315
 
                                                                memmove(real_url, real_url+1, orig_real_len-match_len-1);
316
 
                                                                real_url[orig_real_len-match_len-1]='.';
317
 
                                                                cli_dbgmsg("After inserting .: %s\n", real_url);
318
 
                                                        }
319
 
                                                }
320
 
                                                break;
321
 
                                        }
322
 
                                        cli_dbgmsg("Ignoring false match: %s with %s, mismatched character: %c\n", buffer, *info, c);
323
 
                                        rc=0;
 
291
                if((rc = cli_ac_initdata(&mdata, 0, 0, AC_DEFAULT_TRACKLEN)))
 
292
                        return rc;
 
293
 
 
294
                bufrev = cli_strdup(buffer);
 
295
                if(!bufrev)
 
296
                        return CL_EMEM;
 
297
                reverse_string(bufrev);
 
298
                rc = SO_search(&matcher->filter, (const unsigned char*)bufrev, buffer_len) != -1;
 
299
                if(rc == -1) {
 
300
                        free(buffer);
 
301
                        free(bufrev);
 
302
                        /* filter says this suffix doesn't match.
 
303
                         * The filter has false positives, but no false
 
304
                         * negatives */
 
305
                        return 0;
 
306
                }
 
307
                rc = cli_ac_scanbuff((const unsigned char*)bufrev,buffer_len, NULL, (void*)&regex, &res, &matcher->suffixes,&mdata,0,0,-1,NULL,AC_SCAN_VIR,NULL);
 
308
                free(bufrev);
 
309
                cli_ac_freedata(&mdata);
 
310
 
 
311
                rc = 0;
 
312
                while(res) {
 
313
                        struct cli_ac_result *q;
 
314
                        regex = res->customdata;
 
315
                        while(!rc && regex) {
 
316
                                /* loop over multiple regexes corresponding to
 
317
                                 * this suffix */
 
318
                                if (!regex->preg) {
 
319
                                        /* we matched a static pattern */
 
320
                                        rc = validate_subdomain(regex, pre_fixup, buffer, buffer_len, real_url, real_len, orig_real_url);
 
321
                                } else {
 
322
                                        rc = !cli_regexec(regex->preg, buffer, 0, NULL, 0);
324
323
                                }
 
324
                                if(rc) *info = regex->pattern;
 
325
                                regex = regex->nxt;
325
326
                        }
326
 
                } else
327
 
                        rc = 0;
328
 
                if(!rc)
329
 
                        rc = match_node(hostOnly ? matcher->root_regex_hostonly : matcher->root_regex,(unsigned char*)buffer,buffer_len,info) == MATCH_SUCCESS ? CL_VIRUS : CL_SUCCESS;
 
327
                        q = res;
 
328
                        res = res->next;
 
329
                        free(q);
 
330
                }
330
331
                free(buffer);
331
332
                if(!rc)
332
333
                        cli_dbgmsg("Lookup result: not in regex list\n");
336
337
        }
337
338
}
338
339
 
339
 
/* node stack */
340
 
#define NODE_STACK_INITIAL 1024
341
 
#define NODE_STACK_GROW    4096
342
 
/* Initialize @stack */
343
 
static int stack_init(struct node_stack* stack)
344
 
{
345
 
        massert(stack);
346
 
 
347
 
        stack->cnt = 0;
348
 
        stack->capacity = NODE_STACK_INITIAL;
349
 
        stack->data = cli_malloc(stack->capacity * sizeof(*stack->data));
350
 
        if(!stack->data)
351
 
                return CL_EMEM;
352
 
        else
353
 
                return CL_SUCCESS;
354
 
}
355
 
 
356
 
/* Reset @stack pointer, but don't realloc */
357
 
static void stack_reset(struct node_stack* stack)
358
 
{
359
 
        massert(stack);
360
 
 
361
 
        stack->cnt = 0;
362
 
}
363
 
 
364
 
/* Push @node on @stack, growing it if necessarry */
365
 
static int stack_push(struct node_stack* stack,struct tree_node* node)
366
 
{
367
 
        massert(stack);
368
 
        massert(stack->data);
369
 
 
370
 
        if(stack->cnt == stack->capacity) {
371
 
                stack->capacity += NODE_STACK_GROW;
372
 
                stack->data = cli_realloc2(stack->data,stack->capacity*sizeof(*stack->data));
373
 
                if(!stack->data)
374
 
                        return CL_EMEM;
375
 
        }
376
 
        stack->data[stack->cnt++] = node;
377
 
        return CL_SUCCESS;
378
 
}
379
 
 
380
 
/* Pops node from @stack, doesn't realloc */
381
 
static struct tree_node* stack_pop(struct node_stack* stack)
382
 
{
383
 
        massert(stack);
384
 
        massert(stack->data);
385
 
        massert(stack->cnt);/*don't pop from empty stack */
386
 
 
387
 
        return stack->cnt ? stack->data[--stack->cnt] : NULL;
388
 
}
389
340
 
390
341
/* Initialization & loading */
391
342
/* Initializes @matcher, allocating necesarry substructures */
393
344
{
394
345
        int rc;
395
346
 
396
 
        massert(matcher);
397
 
        matcher->list_inited = 0;
398
 
        matcher->root_hosts_cnt = 0;
399
 
        matcher->root_hosts = NULL;
400
 
        matcher->root_hosts_cnt = 0;
401
 
 
402
 
        matcher->root_regex = tree_root_alloc();
403
 
        if(!matcher->root_regex) {
404
 
                return CL_EMEM;
405
 
        }
406
 
 
407
 
        matcher->root_regex_hostonly = tree_root_alloc();
408
 
        if(!matcher->root_regex_hostonly) {
409
 
                free(matcher->root_regex);
410
 
                return CL_EMEM;
411
 
        }
412
 
 
413
 
        if(( rc = stack_init(&matcher->node_stack) )) {
414
 
                free(matcher->root_regex_hostonly);
415
 
                free(matcher->root_regex);
416
 
                return rc;
417
 
        }
418
 
        if(( rc = stack_init(&matcher->node_stack_alt) )) {
419
 
                free(matcher->root_regex_hostonly);
420
 
                free(matcher->root_regex);
421
 
                stack_destroy(&matcher->node_stack);
422
 
                return rc;
423
 
        }
 
347
        assert(matcher);
 
348
        memset(matcher, 0, sizeof(*matcher));
424
349
 
425
350
        matcher->list_inited=1;
426
 
        matcher->list_built=1;/* its empty, but pretend its built, so that load_ will realloc root_hosts */
 
351
        matcher->list_built=0;
427
352
        matcher->list_loaded=0;
428
353
 
 
354
        hashtab_init(&matcher->suffix_hash, 10);
 
355
        if((rc = cli_ac_init(&matcher->suffixes, 2, 32))) {
 
356
                return rc;
 
357
        }
 
358
        if((rc = cli_bm_init(&matcher->md5_hashes))) {
 
359
                return rc;
 
360
        }
 
361
        SO_init(&matcher->filter);
 
362
        SO_init(&matcher->md5_filter);
429
363
        return CL_SUCCESS;
430
364
}
431
365
 
432
 
/* inserts @pattern into @root, using ac-matcher 
433
 
 * although the name might be confusing, @pattern is not a regex!*/
434
 
static int add_regex_list_element(struct cli_matcher* root,const char* pattern,char* info)
435
 
{
436
 
       int ret;
437
 
       struct cli_ac_patt *new = cli_calloc(1,sizeof(*new));
438
 
       size_t len,i;
439
 
 
440
 
       if(!new)
441
 
               return CL_EMEM;
442
 
       massert(root);
443
 
       massert(pattern);
444
 
 
445
 
       len = strlen(pattern);
446
 
       /* need not to match \0 too */
447
 
       new->rtype = 0;
448
 
       new->type = 0;
449
 
       new->sigid = 0;
450
 
       new->parts = 0;
451
 
       new->partno = 0;
452
 
       new->mindist = 0;
453
 
       new->maxdist = 0;
454
 
       new->offset = 0;
455
 
       new->target = 0;
456
 
       new->length = len;
457
 
       new->ch[0] = new->ch[1] |= CLI_MATCH_IGNORE;
458
 
       if(new->length > root->maxpatlen)
459
 
               root->maxpatlen = new->length;
460
 
 
461
 
       new->pattern = cli_malloc(sizeof(new->pattern[0])*len);
462
 
       if(!new->pattern) {
463
 
               free(new);
464
 
               return CL_EMEM;
465
 
       }
466
 
       for(i=0;i<len;i++)
467
 
               new->pattern[i]=pattern[i];/*new->pattern is short int* */
468
 
 
469
 
        
470
 
       new->virname = cli_strdup(info);
471
 
       if((ret = cli_ac_addpatt(root,new))) {
472
 
               free(new->virname);
473
 
               free(new->pattern);
474
 
               free(new);
475
 
               return ret;
476
 
       }
477
 
       return CL_SUCCESS;
478
 
}
479
 
 
480
366
static int functionality_level_check(char* line)
481
367
{
482
368
        char* ptmin;
517
403
                        return CL_EMALFDB;
518
404
                ptmin[-1]='\0';
519
405
                return CL_SUCCESS;
520
 
        }               
 
406
        }
 
407
}
 
408
 
 
409
static int add_hash(struct regex_matcher *matcher, char* pattern, const char fl)
 
410
{
 
411
        int rc;
 
412
        struct cli_bm_patt *pat = cli_calloc(1, sizeof(*pat));
 
413
        if(!pat)
 
414
                return CL_EMEM;
 
415
        pat->pattern = (unsigned char*)cli_hex2str(pattern);
 
416
        if(!pat->pattern)
 
417
                return CL_EMALFDB;
 
418
        pat->length = 16;
 
419
        pat->virname = cli_malloc(1);
 
420
        if(!pat->virname) {
 
421
                free(pat);
 
422
                return CL_EMEM;
 
423
        }
 
424
        *pat->virname = fl;
 
425
        SO_preprocess_add(&matcher->md5_filter, pat->pattern, pat->length);
 
426
        if((rc = cli_bm_addpatt(&matcher->md5_hashes, pat))) {
 
427
                cli_errmsg("add_hash: failed to add BM pattern\n");
 
428
                free(pat->pattern);
 
429
                free(pat->virname);
 
430
                free(pat);
 
431
                return CL_EMALFDB;
 
432
        }
 
433
        return CL_SUCCESS;
521
434
}
522
435
 
523
436
 
527
440
        int rc,line=0;
528
441
        char buffer[FILEBUFF];
529
442
 
530
 
        massert(matcher);
 
443
        assert(matcher);
531
444
 
532
445
        if(matcher->list_inited==-1)
533
446
                return CL_EMALFDB; /* already failed to load */
534
 
/*      if(matcher->list_loaded) {
535
 
                cli_warnmsg("Regex list has already been loaded, ignoring further requests for load\n");
536
 
                return CL_SUCCESS;
537
 
        }*/
538
447
        if(!fd && !dbio) {
539
448
                cli_errmsg("Unable to load regex list (null file)\n");
540
449
                return CL_EIO;
548
457
                        fatal_error(matcher);
549
458
                        return rc;
550
459
                }
551
 
                /*atexit(regex_list_done); TODO: destroy this in manager.c */
552
460
        }
553
461
        /*
554
462
         * Regexlist db format (common to .wdb(whitelist) and .pdb(domainlist) files:
573
481
        while(cli_dbgets(buffer, FILEBUFF, fd, dbio)) {
574
482
                char* pattern;
575
483
                char* flags;
 
484
                size_t pattern_len;
 
485
 
576
486
                cli_chomp(buffer);
577
487
                if(!*buffer)
578
488
                        continue;/* skip empty lines */
579
489
 
580
 
                if(functionality_level_check(buffer)) 
 
490
                if(functionality_level_check(buffer))
581
491
                        continue;
582
492
 
583
493
                line++;
591
501
                flags = buffer+1;
592
502
                pattern++;
593
503
 
594
 
                if(is_whitelist) {
595
 
                        const size_t pattern_len = strlen(pattern);
596
 
                        if(pattern_len < FILEBUFF) {
597
 
                                pattern[pattern_len] = '/';
598
 
                                pattern[pattern_len+1] = '\0';
599
 
                        }
600
 
                        else {
601
 
                                cli_errmsg("Overlong regex line %d\n",line);
602
 
                                fatal_error(matcher);
603
 
                                return CL_EMALFDB;
604
 
                        }
605
 
                }
606
 
 
607
 
                if((buffer[0] == 'R' && !is_whitelist) || ((buffer[0] == 'X' || buffer[0] == 'Y') && is_whitelist)) {/*regex*/
608
 
                        if(( rc = add_pattern(matcher,(const unsigned char*)pattern,flags, buffer[0] == 'Y') ))
609
 
                                return rc==CL_EMEM ? CL_EMEM : CL_EMALFDB;
610
 
                }
611
 
                else if( ( buffer[0] == 'H' && !is_whitelist) || (buffer[0] == 'M' && is_whitelist)) {/*matches displayed host*/
612
 
                        struct cli_matcher* root;
613
 
                        if(matcher->list_built) {
614
 
                                struct cli_matcher* old_hosts = matcher->root_hosts;
615
 
                                matcher->root_hosts_cnt++;
616
 
 
617
 
                                matcher->root_hosts = cli_realloc(matcher->root_hosts, matcher->root_hosts_cnt * sizeof(*matcher->root_hosts));
618
 
                                if(!matcher->root_hosts) {
619
 
                                        matcher->root_hosts = old_hosts;/* according to manpage this must still be valid*/
620
 
                                        return CL_EMEM;
621
 
                                } 
622
 
 
623
 
                                root = &matcher->root_hosts[matcher->root_hosts_cnt-1];
624
 
                                memset(root, 0, sizeof(struct cli_matcher));
625
 
 
626
 
                                cli_dbgmsg("regex_list: Initialising AC pattern matcher\n");
627
 
                                if((rc = cli_ac_init(root, cli_ac_mindepth, cli_ac_maxdepth))) {
628
 
                                        /* no need to free previously allocated memory here */
629
 
                                        cli_errmsg("regex_list: Can't initialise AC pattern matcher\n");
630
 
                                        return rc;
631
 
                                }
632
 
                                matcher->list_built = 0;
633
 
                        }
634
 
                        else {
635
 
                                root = &matcher->root_hosts[matcher->root_hosts_cnt-1];
636
 
                        }
637
 
                        if(( rc = add_regex_list_element(root,pattern,flags) ))
638
 
                                return rc==CL_EMEM ? CL_EMEM : CL_EMALFDB;
 
504
                pattern_len = strlen(pattern);
 
505
                if(pattern_len < FILEBUFF) {
 
506
                        pattern[pattern_len] = '/';
 
507
                        pattern[pattern_len+1] = '\0';
639
508
                }
640
509
                else {
641
 
                        return CL_EMALFDB;
642
 
                        /* this is useless, we have host, and regex matches
643
 
                        if(( rc = add_regex_list_element(matcher->root_urls,pattern,flags) ))
644
 
                                return rc==CL_EMEM ? CL_EMEM : CL_EMALFDB;*/
 
510
                        cli_errmsg("Overlong regex line %d\n",line);
 
511
                        fatal_error(matcher);
 
512
                        return CL_EMALFDB;
 
513
                }
 
514
 
 
515
                if((buffer[0] == 'R' && !is_whitelist) || ((buffer[0] == 'X' || buffer[0] == 'Y') && is_whitelist)) {
 
516
                        /* regex for hostname*/
 
517
                        if (( rc = regex_list_add_pattern(matcher, pattern) ))
 
518
                                return rc==CL_EMEM ? CL_EMEM : CL_EMALFDB;
 
519
                }
 
520
                else if( ( buffer[0] == 'H' && !is_whitelist) || (buffer[0] == 'M' && is_whitelist)) {
 
521
                        /*matches displayed host*/
 
522
                        if (( rc = add_static_pattern(matcher, pattern) ))
 
523
                                return rc==CL_EMEM ? CL_EMEM : CL_EMALFDB;
 
524
                } else if (buffer[0] == 'U' && !is_whitelist) {
 
525
                        pattern[pattern_len] = '\0';
 
526
                        if (( rc = add_hash(matcher, pattern, flags[0]) )) {
 
527
                                cli_errmsg("Error loading at line: %d\n", line);
 
528
                                return rc==CL_EMEM ? CL_EMEM : CL_EMALFDB;
 
529
                        }
 
530
                } else {
 
531
                        return CL_EMALFDB;
645
532
                }
646
533
        }
647
534
        matcher->list_loaded = 1;
648
 
        if(( rc = build_regex_list(matcher) ))
649
 
                return rc;
650
535
 
651
 
#ifndef NDEBUG
652
 
/*                      dump_tree(matcher->root_regex);*/
653
 
#endif
654
 
        if(!matcher->list_built) {
655
 
                cli_errmsg("Regex list not loaded: build failed!\n");
656
 
                fatal_error(matcher);
657
 
                return CL_EMALFDB;
658
 
        }
659
 
        regex_list_cleanup(matcher);
660
536
        return CL_SUCCESS;
661
537
}
662
538
 
663
539
 
664
 
static struct tree_node ** tree_node_get_children(const struct tree_node* node)
665
 
{
666
 
        return node->op==OP_CUSTOMCLASS ? (node->u.children[1] ? node->u.children+1 : NULL) :node->u.children;
667
 
}
668
 
 
669
540
/* Build the matcher list */
670
 
static int build_regex_list(struct regex_matcher* matcher)
 
541
int cli_build_regex_list(struct regex_matcher* matcher)
671
542
{
672
543
        int rc;
 
544
        if(!matcher)
 
545
                return CL_SUCCESS;
673
546
        if(!matcher->list_inited || !matcher->list_loaded) {
674
547
                cli_errmsg("Regex list not loaded!\n");
675
548
                return -1;/*TODO: better error code */
676
549
        }
677
550
        cli_dbgmsg("Building regex list\n");
678
 
        if(matcher->root_hosts)
679
 
                if(( rc = cli_ac_buildtrie(&matcher->root_hosts[matcher->root_hosts_cnt-1]) ))
680
 
                        return rc;
 
551
        hashtab_free(&matcher->suffix_hash);
 
552
        if(( rc = cli_ac_buildtrie(&matcher->suffixes) ))
 
553
                return rc;
681
554
        matcher->list_built=1;
682
555
 
683
556
        return CL_SUCCESS;
686
559
/* Done with this matcher, free resources */
687
560
void regex_list_done(struct regex_matcher* matcher)
688
561
{
689
 
        massert(matcher);
690
 
 
691
 
        regex_list_cleanup(matcher);
692
 
        if(matcher->list_loaded) {
693
 
                if(matcher->root_hosts) {
694
 
                        size_t i;
695
 
                        for(i=0;i<matcher->root_hosts_cnt;i++) 
696
 
                                cli_ac_free(&matcher->root_hosts[i]);
697
 
                        free(matcher->root_hosts);
698
 
                        matcher->root_hosts=NULL;
699
 
                }
700
 
 
701
 
                matcher->root_hosts_cnt=0;
 
562
        assert(matcher);
 
563
 
 
564
        if(matcher->list_inited) {
 
565
                size_t i;
 
566
                cli_ac_free(&matcher->suffixes);
 
567
                if(matcher->suffix_regexes) {
 
568
                        for(i=0;i<matcher->suffix_cnt;i++) {
 
569
                                struct regex_list *r = matcher->suffix_regexes[i].head;
 
570
                                while(r) {
 
571
                                        struct regex_list *q = r;
 
572
                                        r = r->nxt;
 
573
                                        free(q->pattern);
 
574
                                        free(q);
 
575
                                }
 
576
                        }
 
577
                        free(matcher->suffix_regexes);
 
578
                        matcher->suffix_regexes = NULL;
 
579
                }
 
580
                if(matcher->all_pregs) {
 
581
                        for(i=0;i<matcher->regex_cnt;i++) {
 
582
                                regex_t *r = matcher->all_pregs[i];
 
583
                                cli_regfree(r);
 
584
                                free(r);
 
585
                        }
 
586
                        free(matcher->all_pregs);
 
587
                }
 
588
                hashtab_free(&matcher->suffix_hash);
 
589
                cli_bm_free(&matcher->md5_hashes);
702
590
                matcher->list_built=0;
703
 
                destroy_tree(matcher);
704
591
                matcher->list_loaded=0;
705
592
        }
706
593
        if(matcher->list_inited) {
707
594
                matcher->list_inited=0;
708
595
        }
709
 
        stack_destroy(&matcher->node_stack);
710
 
        stack_destroy(&matcher->node_stack_alt);
711
 
}
712
 
 
713
 
/* Tree matcher algorithm */
714
 
struct token_t
715
 
{
716
 
        union {
717
 
                const unsigned char* start;
718
 
                char_bitmap_p  bitmap;
719
 
                unsigned char  c;
720
 
        } u;
721
 
        size_t len;
722
 
        char   type;
723
 
};
724
 
 
725
 
enum {TOKEN_CHAR,TOKEN_DOT,TOKEN_PAR_OPEN,TOKEN_PAR_CLOSE,TOKEN_BRACKET,TOKEN_ALT,TOKEN_REGEX,TOKEN_DONE};
726
 
 
727
 
static const unsigned char* getNextToken(const unsigned char* pat,struct token_t* token)
728
 
{
729
 
        massert(pat);
730
 
        massert(token);
731
 
 
732
 
        switch(*pat) {
733
 
                case '\\':
734
 
                        token->type=TOKEN_CHAR;
735
 
                        token->u.c = *(++pat);
736
 
                        if(islower(token->u.c)) {
737
 
                                /* handle \n, \t, etc. */
738
 
                                char fmt[3] = {'\\', '\0', '\0'};
739
 
                                char c;
740
 
 
741
 
                                fmt[1] = token->u.c;
742
 
                                if(snprintf(&c,1,fmt)!=1) {
743
 
                                        token->type=TOKEN_REGEX;
744
 
                                        token->u.start = pat;
745
 
                                }
746
 
                                else
747
 
                                        token->u.c=c;
748
 
                        }
749
 
                        token->len = 1;
750
 
                        break;
751
 
                case '|':
752
 
                        token->type=TOKEN_ALT;
753
 
                        break;
754
 
                case '*':
755
 
                case '+':
756
 
                case '?':
757
 
                case '{':
758
 
                case '}':
759
 
                        token->type=TOKEN_REGEX;
760
 
                        break;
761
 
                case '[':
762
 
                        {
763
 
                        /*TODO: implement*/
764
 
                        /*see if it is something simple like a list of characters, a range, or negated ...*/
765
 
                        const unsigned char* old=pat++;/* save this in case we change our mind and decide this is too complicated for us to handle*/
766
 
                        unsigned char range_start=0;
767
 
                        int hasprev = 0;
768
 
                        char_bitmap_p bitmap = cli_malloc(32);
769
 
                        if(!bitmap)
770
 
                                return NULL;
771
 
                        if (*pat=='^') {
772
 
                                memset(bitmap,0xFF,32);/*match chars not in brackets*/
773
 
                                pat++;
774
 
                        }
775
 
                        else
776
 
                                memset(bitmap,0x00,32);
777
 
                        do {
778
 
                                /* literal ] can be first character, so test for it at the end of the loop, for example: []] */
779
 
                                if (*pat=='-' && hasprev) {
780
 
                                        /* it is a range*/
781
 
                                        unsigned char range_end;
782
 
                                        unsigned int c;
783
 
                                        massert(range_start);
784
 
                                        pat++;
785
 
                                        if (pat[0]=='[')
786
 
                                                if (pat[1]=='.') {
787
 
                                                        if(pat[2]=='-' && pat[3]=='.' && pat[4]==']')
788
 
                                                                range_end = '-';
789
 
                                                        else {
790
 
                                                                /* this is getting complicated, bail out */
791
 
                                                                cli_warnmsg("confused about collating sequences in regex,bailing out");
792
 
                                                                pat=old;
793
 
                                                                token->type=TOKEN_REGEX;
794
 
                                                                break;
795
 
                                                        }
796
 
                                                }
797
 
                                                else 
798
 
                                                        range_end = *pat;
799
 
                                        else
800
 
                                                range_end = *pat;
801
 
                                        for(c=range_start+1;c<=range_end;c++)
802
 
                                                bitmap[c>>3] ^= 1<<(c&0x7);
803
 
                                        hasprev = 0;
804
 
                                }
805
 
                                else if (pat[0]=='[' && pat[1]==':') {
806
 
                                                        const unsigned char* end;
807
 
                                                        int len,found=-1;
808
 
                                                        size_t i;
809
 
 
810
 
                                                        pat+=2;
811
 
                                                        end=(unsigned char*)strstr((const char*)pat,":]");
812
 
                                                        if(!end) {
813
 
                                                                cli_warnmsg("confused about std char class syntax regex,bailing out");
814
 
                                                                pat=old;
815
 
                                                                token->type=TOKEN_REGEX;
816
 
                                                                break;
817
 
                                                        }
818
 
 
819
 
                                                        len = end-pat;
820
 
                                                        for(i=0;i<std_class_cnt;i++)
821
 
                                                                if(!strncmp((const char*)pat,std_class[i],len)) {
822
 
                                                                        found=i;
823
 
                                                                        break;
824
 
                                                                }
825
 
                                                        if(found!=-1) {
826
 
                                                                for(i=0;i<256;i++)
827
 
                                                                        if(char_class[i]&(1<<found))
828
 
                                                                                bitmap[i>>3] ^= 1<<(i&0x7);
829
 
                                                        }
830
 
                                                        else {
831
 
                                                                /*unknown class*/
832
 
                                                                cli_warnmsg("confused about regex bracket expression, bailing out");
833
 
                                                                pat=old;
834
 
                                                                token->type=TOKEN_REGEX;
835
 
                                                                break;
836
 
                                                        }
837
 
                                                }
838
 
                                else {
839
 
                                        bitmap[*pat>>3] ^= 1<<(*pat&0x7);
840
 
                                        pat++;
841
 
                                        range_start = *pat;
842
 
                                        hasprev = 1;
843
 
                                }
844
 
                        } while(*pat!=']');
845
 
                        /*TODO: see if this bitmap already exists, then reuse*/                 
846
 
                        token->type = TOKEN_BRACKET;
847
 
                        token->u.bitmap = bitmap;
848
 
                        break;
849
 
                        }
850
 
                case ']':
851
 
                        massert(0 && "Encountered ] without matching [");
852
 
                        /* bad state */
853
 
                        break;
854
 
                case '.':
855
 
                        token->type=TOKEN_DOT;
856
 
                        break;
857
 
                case '(':
858
 
                        token->type=TOKEN_PAR_OPEN;
859
 
                        break;
860
 
                case ')':
861
 
                        token->type=TOKEN_PAR_CLOSE;
862
 
                        break;
863
 
                default:
864
 
                        token->type=TOKEN_CHAR;
865
 
                        token->u.c = *pat;
866
 
                        token->len=1;
867
 
                        break;
868
 
        }
869
 
        return ++pat;
870
 
}
871
 
 
872
 
#define INITIAL_ALT_STACK 10
873
 
#define ALT_STACK_GROW 20
874
 
 
875
 
static const unsigned char* find_regex_start(const unsigned char* pat)
876
 
{
877
 
        struct token_t token;
878
 
        /*TODO: find where the regex part begins, for ex:
879
 
         * abcd+, regex begins at 'd'
880
 
         * */
881
 
        const unsigned char* last=NULL;
882
 
        const unsigned char* tmp=NULL;
883
 
        const unsigned char** altpositions = cli_malloc(INITIAL_ALT_STACK*sizeof(*altpositions));
884
 
        size_t altpositions_capacity = INITIAL_ALT_STACK;
885
 
        size_t altpositions_cnt = 0;
886
 
        char lasttype = -1;
887
 
        if(!altpositions)
888
 
                return NULL;
889
 
        massert(pat);
890
 
 
891
 
        /* Try to parse pattern till special regex chars are encountered, that the tree-matcher doesn't handle, like: +,*,{}.
892
 
         * The tricky part is that once we encounter these, the previous 'atom' has to be passed on to the regex matcher, so we have to
893
 
         * back up to the last known good position
894
 
         * Example, if we have: abc(defg)+, then only abc can be handled by tree parser, so we have to return the position of (.
895
 
         * Another example: abc(defg|xyz|oz+|pdo), the last known good position is |, after xyz
896
 
         * TODO: what about open parantheses? maybe once we found a special char, we have top back out before the first (?
897
 
         * */
898
 
        do {    
899
 
                tmp = pat;
900
 
                pat = getNextToken(pat,&token);
901
 
                if(token.type!=TOKEN_REGEX) {
902
 
                        last = tmp;
903
 
                        lasttype = token.type;
904
 
                        if(token.type==TOKEN_BRACKET && token.u.bitmap)
905
 
                                free(token.u.bitmap);
906
 
                        if(token.type==TOKEN_ALT || token.type==TOKEN_PAR_OPEN) {
907
 
                                /* save this position on stack, succesfully parsed till here*/
908
 
                                if(altpositions_cnt && altpositions[altpositions_cnt-1][0]=='|')
909
 
                                        /* encountered another alternate (|) operator, override previous | position stored */
910
 
                                        altpositions[altpositions_cnt-1]=last;
911
 
                                else {
912
 
                                        altpositions[altpositions_cnt++] = last;
913
 
                                        if(altpositions_cnt == altpositions_capacity) {
914
 
                                                altpositions_capacity += ALT_STACK_GROW;
915
 
                                                altpositions = cli_realloc2(altpositions,altpositions_capacity*sizeof(*altpositions));
916
 
                                                if(!altpositions)
917
 
                                                        return NULL;
918
 
                                        }
919
 
                                }
920
 
                        } else if (lasttype==TOKEN_PAR_CLOSE) {
921
 
                                /* remove last stored position from stack, succesfully this last group */
922
 
                                altpositions_cnt--;
923
 
                                massert(altpositions_cnt>0);
924
 
                        }
925
 
                }
926
 
                else {
927
 
                        if(altpositions_cnt)
928
 
                                last = altpositions[0 /*altpositions_cnt-1*/];/*TODO: which index here?, see above TODO... */
929
 
                        /*last stored 'safe' position where no special (+,*,{}) regex chars were encountered*/
930
 
                }
931
 
        } while(*pat && token.type!=TOKEN_REGEX);
932
 
        free(altpositions);
933
 
        return *pat ? last : last+1;
934
 
}
935
 
 
936
 
static struct tree_node* tree_node_alloc(struct tree_node* next,char listend)
937
 
{
938
 
        struct tree_node* node = cli_malloc(sizeof(*node));
939
 
        if(node) {
940
 
                node->alternatives=0;
941
 
                node->next=next;
942
 
                node->listend=listend;
943
 
                node->u.children=NULL;
944
 
        }
945
 
        return node;
946
 
}
947
 
 
948
 
static struct tree_node* tree_root_alloc(void)
949
 
{
950
 
        struct tree_node* root=tree_node_alloc(NULL,1);
951
 
        if(root) {
952
 
                root->op=OP_ROOT;
953
 
                root->c=0;
954
 
                root->next=NULL;
955
 
                root->listend=1;
956
 
        }
957
 
        return root;
958
 
}
959
 
 
960
 
static struct tree_node* tree_node_char_binsearch(const struct tree_node* node,const char csearch,int* left)
961
 
{
962
 
        int right;
963
 
        struct tree_node **children;
964
 
        massert(node);
965
 
        massert(left);
966
 
 
967
 
        children = tree_node_get_children(node);
968
 
        right = node->alternatives-1;
969
 
        *left = 0;
970
 
        if(!node->alternatives)
971
 
                return NULL;
972
 
        massert(children);
973
 
        while(*left<=right) {
974
 
                int mid  = *left+(right-*left)/2;
975
 
                if(children[mid]->c == csearch)
976
 
                        return children[mid]; 
977
 
                else if(children[mid]->c < csearch)
978
 
                        *left=mid+1;
979
 
                else
980
 
                        right=mid-1;
981
 
        }
982
 
        return NULL;
983
 
}
984
 
 
985
 
static struct tree_node* tree_get_next(struct tree_node* node)
986
 
{
987
 
        struct tree_node** children;
988
 
        massert(node);
989
 
        children = tree_node_get_children(node);
990
 
 
991
 
        if(!node->alternatives && children && children[0])
992
 
                return children[0];
993
 
        else if(node->alternatives<=1)
994
 
                return node;
995
 
        else
996
 
                return children[0]->next;
997
 
}
998
 
 
999
 
static size_t tree_node_get_array_size(const struct tree_node* node)
1000
 
{
1001
 
        massert(node);
1002
 
        /* if op is CUSTOMCLASS, then first pointer is pointer to bitmap, so array size is +1 */
1003
 
        return (node->alternatives + (node->op==OP_CUSTOMCLASS ? 1 : 0)) * sizeof(node->u.children[0]);
1004
 
}
1005
 
 
1006
 
static struct tree_node* tree_node_char_insert(struct tree_node* node,const char c,int left)
1007
 
{
1008
 
        struct tree_node* new, *alt = tree_get_next(node);
1009
 
        struct tree_node **children;
1010
 
        node->alternatives++;
1011
 
        node->u.children = cli_realloc2(node->u.children,tree_node_get_array_size(node));
1012
 
        if(!node->u.children)
1013
 
                return NULL;
1014
 
 
1015
 
        children = node->op==OP_CUSTOMCLASS ? node->u.children+1 : node->u.children;
1016
 
 
1017
 
        new = tree_node_alloc(alt , node == alt );
1018
 
        if(new) {
1019
 
                new->op=OP_CHAR;
1020
 
                new->c=c;
1021
 
        }
1022
 
 
1023
 
        if(node->alternatives-left-1>0)
1024
 
                        memmove(&children[left+1],&children[left],(node->alternatives-left-1)*sizeof(node->u.children[0]));
1025
 
        children[left] = new;   
1026
 
 
1027
 
        return new;
1028
 
}
1029
 
 
1030
 
static void tree_node_insert_nonbin(struct tree_node* node, struct tree_node* new)
1031
 
{
1032
 
        struct tree_node **children;
1033
 
        massert(node);
1034
 
        massert(new);
1035
 
 
1036
 
        children = tree_node_get_children(node);
1037
 
        if(node->alternatives) {
1038
 
                massert(children);
1039
 
                if(children[0]->next == node) {
1040
 
                        int i;
1041
 
                        new->listend = 1;
1042
 
                        for(i=0;i<node->alternatives;i++) {
1043
 
                                children[i]->next = new;
1044
 
                                children[i]->listend = 0;
1045
 
                        }
1046
 
                }
1047
 
                else {
1048
 
                        struct tree_node* p;
1049
 
                        for(p = children[0]->next ; p->next != node ; p = p->next)
1050
 
                                massert(!p->listend);
1051
 
                        new->listend = 1;
1052
 
                        p->listend = 0;
1053
 
                        p->next = new;
1054
 
                }
1055
 
        }
1056
 
        else {
1057
 
                int idx = node->op==OP_CUSTOMCLASS ? 1 : 0;
1058
 
                if(node->u.children)
1059
 
                        if(node->u.children[idx]) {
1060
 
                                node = node->u.children[idx];
1061
 
                                while(node->next && !node->listend)
1062
 
                                        node = node->next;
1063
 
                                node->listend = 0;
1064
 
                                new->next = node->next;
1065
 
                                node->next = new;
1066
 
                                new->listend=1;
1067
 
                                return;
1068
 
                        }
1069
 
                node->u.children = cli_realloc2(node->u.children,sizeof(node->u.children[0])*(2));
1070
 
                if(node->u.children) {
1071
 
                        node->u.children[idx] = new;
1072
 
                }
1073
 
        }
1074
 
}
1075
 
 
1076
 
static unsigned char char_getclass(const unsigned char* bitmap)
1077
 
{
1078
 
        size_t i;
1079
 
        massert(bitmap);
1080
 
 
1081
 
        for(i=0;i<std_class_cnt;i++)
1082
 
                if(!memcmp(bitmap,char_class_bitmap[i],256>>3))
1083
 
                        return i;
1084
 
        return std_class_cnt;
1085
 
}
1086
 
 
1087
 
static void stack_destroy(struct node_stack* stack)
1088
 
{
1089
 
        massert(stack);
1090
 
        if(stack->data)
1091
 
                free(stack->data);
1092
 
        stack->data = NULL;
1093
 
        stack->capacity = 0;
1094
 
}
1095
 
 
1096
 
/* call this after whitelist load is complete, and the tree is no longer going to be modified */
1097
 
void regex_list_cleanup(struct regex_matcher* matcher)
1098
 
{
1099
 
        massert(matcher);
1100
 
 
1101
 
        stack_destroy(&matcher->node_stack);
1102
 
        stack_destroy(&matcher->node_stack_alt);
1103
 
        stack_init(&matcher->node_stack);
1104
 
        stack_init(&matcher->node_stack_alt);
1105
596
}
1106
597
 
1107
598
int is_regex_ok(struct regex_matcher* matcher)
1108
599
{
1109
 
        massert(matcher);
 
600
        assert(matcher);
1110
601
        return (!matcher->list_inited || matcher->list_inited!=-1);/* either we don't have a regexlist, or we initialized it successfully */
1111
602
}
1112
603
 
1113
 
/* returns 0 on success, regexec error code otherwise */                                                
1114
 
static int add_pattern(struct regex_matcher* matcher,const unsigned char* pat,const char* info, int hostonly)
1115
 
{
1116
 
        int bol=1;
1117
 
        const unsigned char* pat_end = find_regex_start(pat);
1118
 
        struct token_t token;
1119
 
        struct tree_node* node;
1120
 
        
1121
 
        massert(matcher);
1122
 
 
1123
 
        node = hostonly ? matcher->root_regex_hostonly : matcher->root_regex;
1124
 
 
1125
 
        stack_reset(&matcher->node_stack);
1126
 
        stack_reset(&matcher->node_stack_alt);
1127
 
        stack_push(&matcher->node_stack,node);
1128
 
 
1129
 
        for(;node->op!=OP_LEAF;){
1130
 
                if(pat<pat_end)
1131
 
                        pat  = getNextToken(pat,&token);
1132
 
                else if(*pat) {
1133
 
                        token.type = TOKEN_REGEX;
1134
 
                        token.u.start=pat;
1135
 
                }
1136
 
                else
1137
 
                        token.type = TOKEN_DONE;
1138
 
 
1139
 
                switch(token.type) {
1140
 
                        case TOKEN_CHAR: 
1141
 
                                {
1142
 
                                        /* search for char in tree */
1143
 
                                        int left;
1144
 
                                        struct tree_node* newnode = tree_node_char_binsearch(node,token.u.c,&left);
1145
 
                                        if(newnode)
1146
 
                                                node = newnode;
1147
 
                                        else {
1148
 
                                                /* not found, insert it */
1149
 
                                                node = tree_node_char_insert(node,token.u.c,left);
1150
 
                                        }
1151
 
                                        break;
1152
 
                                }
1153
 
 
1154
 
                        case TOKEN_PAR_OPEN:
1155
 
                                stack_push(&matcher->node_stack_alt,NULL);/* marker */
1156
 
                                stack_push(&matcher->node_stack,node);
1157
 
                                break;
1158
 
 
1159
 
                        case TOKEN_PAR_CLOSE: {
1160
 
                                                      /*TODO: test this!!!*/
1161
 
                                                      struct tree_node* node_alt = node;
1162
 
                                                      node = tree_node_alloc(NULL,1);
1163
 
                                                      node->op=OP_PARCLOSE;
1164
 
                                                      node->c=0;
1165
 
                                                      node->listend=1;
1166
 
                                                      tree_node_insert_nonbin(node_alt,node);
1167
 
                                                      while (( node_alt = stack_pop(&matcher->node_stack_alt) )) {
1168
 
                                                              tree_node_insert_nonbin(node_alt,node);
1169
 
                                                      }
1170
 
                                                      stack_pop(&matcher->node_stack);                                        
1171
 
                                                      break;
1172
 
                                              }
1173
 
 
1174
 
                        case TOKEN_ALT:
1175
 
                                stack_push(&matcher->node_stack_alt,node);
1176
 
                                node = stack_pop(&matcher->node_stack);
1177
 
                                stack_push(&matcher->node_stack,node);
1178
 
                                break;
1179
 
 
1180
 
                        case TOKEN_BRACKET:
1181
 
                                {
1182
 
                                        struct tree_node* new = tree_node_alloc(tree_get_next(node),1);
1183
 
                                        unsigned char charclass = char_getclass(token.u.bitmap);
1184
 
                                        if(charclass == std_class_cnt) {/*not a std char class*/
1185
 
                                                new->op = OP_CUSTOMCLASS;
1186
 
                                                new->u.children = cli_malloc(sizeof(new->u.children[0])*2);
1187
 
                                                if(!new->u.children)
1188
 
                                                        return CL_EMEM;
1189
 
                                                new->u.bitmap[0] = token.u.bitmap;
1190
 
                                                new->u.bitmap[1] = NULL;
1191
 
                                                tree_node_insert_nonbin(node,new);
1192
 
                                                node = new;
1193
 
                                        }
1194
 
                                        else {
1195
 
                                                new->op = OP_STDCLASS;
1196
 
                                                new->c = charclass;
1197
 
                                                tree_node_insert_nonbin(node,new);
1198
 
                                                node=new;
1199
 
                                        }
1200
 
                                        break;
1201
 
                                }
1202
 
 
1203
 
                        case TOKEN_DOT:
1204
 
                                {
1205
 
                                        struct tree_node* new = tree_node_alloc(tree_get_next(node),1);
1206
 
                                        new->op = OP_DOT;
1207
 
                                        tree_node_insert_nonbin(node,new);
1208
 
                                        node=new;
1209
 
                                        break;
1210
 
                                }
1211
 
 
1212
 
                        case TOKEN_REGEX:
1213
 
                        case TOKEN_DONE: {
1214
 
                                                 struct leaf_info* leaf=cli_malloc(sizeof(*leaf));
1215
 
                                                 if(!leaf)
1216
 
                                                         return CL_EMEM;
1217
 
                                                 leaf->info = cli_strdup(info);
1218
 
                                                 if(token.type==TOKEN_REGEX) {
1219
 
                                                         int rc;
1220
 
                                                         struct tree_node* new;
1221
 
                                                         regex_t* preg;
1222
 
                                                         preg=cli_malloc(sizeof(*preg));
1223
 
                                                         if(!preg)
1224
 
                                                                 return CL_EMEM;
1225
 
                                                         rc = cli_regcomp(preg,(const char*)token.u.start,REG_EXTENDED|(bol?0:REG_NOTBOL));
1226
 
                                                         leaf->preg=preg;
1227
 
                                                         if(rc)
1228
 
                                                                 return rc;
1229
 
                                                         new=cli_malloc(sizeof(*new));
1230
 
                                                         if(!new)
1231
 
                                                                 return CL_EMEM;
1232
 
                                                         new->op=OP_LEAF;
1233
 
                                                         new->next=node;
1234
 
                                                         new->alternatives=0;
1235
 
                                                         new->u.leaf=leaf;
1236
 
                                                         new->listend=1;
1237
 
                                                         tree_node_insert_nonbin(node,new);
1238
 
                                                 }
1239
 
                                                 else {
1240
 
                                                         leaf->preg=NULL;
1241
 
                                                         node->alternatives=0;
1242
 
                                                         node->u.leaf=leaf;
1243
 
                                                         node->op=OP_LEAF;
1244
 
                                                 }
1245
 
                                                 return 0;
1246
 
                                         }
1247
 
                }
1248
 
 
1249
 
                bol=0;
 
604
static int add_newsuffix(struct regex_matcher *matcher, struct regex_list *info, const char *suffix, size_t len)
 
605
{
 
606
        struct cli_matcher *root = &matcher->suffixes;
 
607
        struct cli_ac_patt *new = cli_calloc(1,sizeof(*new));
 
608
        size_t i;
 
609
        int ret;
 
610
 
 
611
        if(!new)
 
612
                return CL_EMEM;
 
613
        assert(root && suffix);
 
614
 
 
615
        new->rtype = 0;
 
616
        new->type = 0;
 
617
        new->sigid = 0;
 
618
        new->parts = 0;
 
619
        new->partno = 0;
 
620
        new->mindist = 0;
 
621
        new->maxdist = 0;
 
622
        new->offset = 0;
 
623
        new->length = len;
 
624
 
 
625
        new->ch[0] = new->ch[1] |= CLI_MATCH_IGNORE;
 
626
        if(new->length > root->maxpatlen)
 
627
                root->maxpatlen = new->length;
 
628
 
 
629
        new->pattern = cli_malloc(sizeof(new->pattern[0])*len);
 
630
        if(!new->pattern) {
 
631
                free(new);
 
632
                return CL_EMEM;
 
633
        }
 
634
        for(i=0;i<len;i++)
 
635
                new->pattern[i] = suffix[i];/*new->pattern is short int* */
 
636
 
 
637
        new->customdata = info;
 
638
        new->virname = NULL;
 
639
        if((ret = cli_ac_addpatt(root,new))) {
 
640
                free(new->pattern);
 
641
                free(new);
 
642
                return ret;
 
643
        }
 
644
        SO_preprocess_add(&matcher->filter, (const unsigned char*)suffix, len);
 
645
        return CL_SUCCESS;
 
646
}
 
647
 
 
648
#define MODULE "regex_list: "
 
649
/* ------ load a regex, determine suffix, determine suffix2regexlist map ---- */
 
650
 
 
651
static void list_add_tail(struct regex_list_ht *ht, struct regex_list *regex)
 
652
{
 
653
        if(!ht->head)
 
654
                ht->head = regex;
 
655
        if(ht->tail) {
 
656
                ht->tail->nxt = regex;
 
657
        }
 
658
        ht->tail = regex;
 
659
}
 
660
 
 
661
/* returns 0 on success, clamav error code otherwise */
 
662
static int add_pattern_suffix(void *cbdata, const char *suffix, size_t suffix_len, const struct regex_list *iregex)
 
663
{
 
664
        struct regex_matcher *matcher = cbdata;
 
665
        struct regex_list *regex = cli_malloc(sizeof(*regex));
 
666
        const struct element *el;
 
667
 
 
668
        assert(matcher);
 
669
        if(!regex)
 
670
                return CL_EMEM;
 
671
        regex->pattern = iregex->pattern ? cli_strdup(iregex->pattern) : NULL;
 
672
        regex->preg = iregex->preg;
 
673
        regex->nxt = NULL;
 
674
        el = hashtab_find(&matcher->suffix_hash, suffix, suffix_len);
 
675
        /* TODO: what if suffixes are prefixes of eachother and only one will
 
676
         * match? */
 
677
        if(el) {
 
678
                /* existing suffix */
 
679
                assert((size_t)el->data < matcher->suffix_cnt);
 
680
                list_add_tail(&matcher->suffix_regexes[el->data], regex);
 
681
                cli_dbgmsg(MODULE "added new regex to existing suffix %s: %s\n", suffix, regex->pattern);
 
682
        } else {
 
683
                /* new suffix */
 
684
                size_t n = matcher->suffix_cnt++;
 
685
                el = hashtab_insert(&matcher->suffix_hash, suffix, suffix_len, n);
 
686
                matcher->suffix_regexes = cli_realloc(matcher->suffix_regexes, (n+1)*sizeof(*matcher->suffix_regexes));
 
687
                if(!matcher->suffix_regexes)
 
688
                        return CL_EMEM;
 
689
                matcher->suffix_regexes[n].tail = regex;
 
690
                matcher->suffix_regexes[n].head = regex;
 
691
                add_newsuffix(matcher, regex, suffix, suffix_len);
 
692
                cli_dbgmsg(MODULE "added new suffix %s, for regex: %s\n", suffix, regex->pattern);
1250
693
        }
1251
694
        return 0;
1252
695
}
1253
696
 
1254
 
/* c has to be unsigned char here!! */
1255
 
static int match_node(struct tree_node* node,const unsigned char* c,size_t len,const char** info)
1256
 
{
1257
 
        struct tree_node** children;
1258
 
        int rc;
1259
 
 
1260
 
        massert(node);
1261
 
        massert(c);
1262
 
        massert(info);
1263
 
 
1264
 
        if(!node->u.children)
1265
 
                return MATCH_FAILED;/* tree empty */
1266
 
        *info = NULL;
1267
 
        len++;
1268
 
        c--;
1269
 
        for(;;) {
1270
 
                massert(node);
1271
 
                children = node->u.children;
1272
 
                switch(node->op) {
1273
 
                        case OP_ROOT:
1274
 
                                rc=1;
1275
 
                                break;
1276
 
                        case OP_PARCLOSE:
1277
 
                                /*this isn't a real character, so don't move*/
1278
 
                                c--;
1279
 
                                len++;
1280
 
                                rc=1;
1281
 
                                break;
1282
 
                        case OP_CHAR:
1283
 
                                massert(*c==node->c && "We know this has to match");
1284
 
                                rc = 1;/* *c==node->c;- we know it has matched */
1285
 
                                break;
1286
 
                        case OP_DOT:    
1287
 
                                rc = *c!='\n';
1288
 
                                break;
1289
 
                        case OP_STDCLASS:
1290
 
                                rc = char_class[*c]&(node->c);
1291
 
                                break;
1292
 
                        case OP_CUSTOMCLASS:
1293
 
                        {
1294
 
                                char_bitmap_p bitmap;
1295
 
                                massert(children);
1296
 
                                bitmap = (char_bitmap_p)node->u.bitmap[0];
1297
 
                                children++;
1298
 
                                rc = bitmap[*c>>3]&(1<<(*c&0x7));
1299
 
                                break;
1300
 
                        }
1301
 
                        case OP_LEAF:
1302
 
                        {
1303
 
                                const struct leaf_info* leaf = node->u.leaf;
1304
 
                                /*isleaf = 1;*/
1305
 
                                if(leaf->preg) {
1306
 
                                        rc = !cli_regexec(leaf->preg,(const char*)c,0,NULL,0);
1307
 
                                }
1308
 
                                else  {
1309
 
                                        massert(*c==node->c && "We know this has to match[2]");
1310
 
                                        rc = 1;
1311
 
                                }
1312
 
                                if(rc) {
1313
 
                                        *info = leaf->info;
1314
 
                                        return MATCH_SUCCESS;
1315
 
                                }
1316
 
                                break;
1317
 
                        }
1318
 
                        default:
1319
 
                                /* impossible */
1320
 
                                cli_errmsg("Encountered invalid operator in tree:%d\n",node->op);
1321
 
                                exit(1);
1322
 
                }
1323
 
                len--;
1324
 
                if(!len) rc=0;
1325
 
                c++;
1326
 
                if(rc) {
1327
 
                        const char csearch = *c;
1328
 
                        int left = 0,right = node->alternatives-1;
1329
 
                        int mid;
1330
 
                        /*matched so far, go deeper*/
1331
 
                        /*do a binary search between children */
1332
 
                        massert(children);
1333
 
                        while(left<=right) {
1334
 
                                mid  = left+(right-left)/2;
1335
 
                                if (children[mid]->c == csearch)
1336
 
                                        break;
1337
 
                                else if(children[mid]->c < csearch)
1338
 
                                        left=mid+1;
1339
 
                                else
1340
 
                                        right=mid-1;
1341
 
                        }
1342
 
                        if(left<=right) {
1343
 
                                node = children[mid];
1344
 
                                massert(node);
1345
 
                        }
1346
 
                        else {
1347
 
                                if(node->alternatives) {
1348
 
                                        if(!children[0]->listend) {
1349
 
                                                node = children[0];
1350
 
                                                c++;
1351
 
                                                len--;
1352
 
                                        }
1353
 
                                        while(node && node->listend) {
1354
 
                                                node = node->next;/* climb up */
1355
 
                                                c--;
1356
 
                                                len++;
1357
 
                                        }
1358
 
                                        if(!node || !node->next) 
1359
 
                                                return MATCH_FAILED;/* reached root node */
1360
 
                                        node=node->next;
1361
 
                                        c--;
1362
 
                                        len++;
1363
 
                                }
1364
 
                                else if(node->u.children) {
1365
 
                                        struct tree_node* rewrite_next = NULL;
1366
 
                                        if(node->op==OP_PARCLOSE) 
1367
 
                                                rewrite_next = node;
1368
 
                                        node = children[0];
1369
 
                                        massert(node);
1370
 
                                        massert(node->op!=OP_CHAR);
1371
 
                                        if(rewrite_next)
1372
 
                                                node->next = rewrite_next;/* this node is pointed to by several parent nodes, 
1373
 
                                                                             we need to know 
1374
 
                                                                             from which one we came, so we can find out way back
1375
 
                                                                             should we fail to match somewhere deeper*/
1376
 
                                }
1377
 
                        }
1378
 
                }
1379
 
                else {
1380
 
                        /* this node didn't match, try sibling, or parent (if no more siblings) */
1381
 
                        while(node && node->listend) {
1382
 
                                node = node->next;/* sibling of parent */
1383
 
                                c--;
1384
 
                                len++;
1385
 
                        }
1386
 
                        if(!node || !node->next) /* reached root node, it has no next */
1387
 
                                return MATCH_FAILED;
1388
 
                        else {
1389
 
                                c--;
1390
 
                                len++;
1391
 
                                node=node->next;
1392
 
                        }
1393
 
                }
1394
 
        }
1395
 
        return MATCH_FAILED;
1396
 
}
1397
 
 
1398
 
/* push node on stack, only if it isn't there already */
1399
 
static void stack_push_once(struct node_stack* stack,struct tree_node* node)
1400
 
{
 
697
static size_t reverse_string(char *pattern)
 
698
{
 
699
        size_t len = strlen(pattern);
1401
700
        size_t i;
1402
 
        massert(stack);
1403
 
        massert(node);
1404
 
 
1405
 
        for(i=0;i < stack->cnt;i++)
1406
 
                if(stack->data[i]==node)
1407
 
                        return;
1408
 
        stack_push(stack,node);
1409
 
}
1410
 
 
1411
 
static void destroy_tree_internal(struct regex_matcher* matcher,struct tree_node* node)
1412
 
{
1413
 
        struct tree_node **children;
1414
 
        massert(matcher);
1415
 
        massert(node);
1416
 
 
1417
 
        children = tree_node_get_children(node);
1418
 
        if(node->op==OP_LEAF) {
1419
 
                struct leaf_info* leaf = node->u.leaf;
1420
 
                if(node->next && !node->listend)
1421
 
                        destroy_tree_internal(matcher,node->next);
1422
 
                stack_push_once(&matcher->node_stack,(struct tree_node*)node->u.leaf);/* cast to make compiler happy, and to not make another stack implementation for storing void* */
1423
 
                stack_push_once(&matcher->node_stack,node);
1424
 
                if(leaf->preg) {
1425
 
                        cli_regfree(leaf->preg);
1426
 
                        free(leaf->preg);
1427
 
                        leaf->preg=NULL;
1428
 
                }
1429
 
                if(leaf->info) {
1430
 
                        free(leaf->info);
1431
 
                        leaf->info=NULL;
1432
 
                }
1433
 
        /*      return;*/
1434
 
        }
1435
 
        if(node->alternatives) {
1436
 
                int i;
1437
 
                struct tree_node* p;
1438
 
                massert(children);
1439
 
                p = children[0]->op==OP_LEAF ? NULL : children[0]->next;
1440
 
                for(i=0;i<node->alternatives;i++)
1441
 
                        destroy_tree_internal(matcher,children[i]);
1442
 
                if(p && p!=node)
1443
 
                        destroy_tree_internal(matcher,p);/*?? is this ok, or without _internal?*/
1444
 
        }
1445
 
        else {
1446
 
                if(children) {
1447
 
                        if(children[0])
1448
 
                                destroy_tree_internal(matcher,children[0]);             
1449
 
                }
1450
 
        }
1451
 
        if(node->op!=OP_LEAF && node->next && !node->listend)
1452
 
                destroy_tree_internal(matcher,node->next);
1453
 
        if(node->u.children)
1454
 
                stack_push_once(&matcher->node_stack,(struct tree_node*)node->u.children);/* cast to make compiler happy, it isn't really a tree_node* */
1455
 
        if(node->op==OP_CUSTOMCLASS && node->u.children[0]) {
1456
 
                free(node->u.children[0]);
1457
 
                node->u.children[0]=NULL;
1458
 
        }
1459
 
        stack_push_once(&matcher->node_stack,node);
1460
 
}
1461
 
 
1462
 
static void destroy_tree(struct regex_matcher* matcher)
1463
 
{
1464
 
        /* we might have the same node linked by different nodes, so a recursive walk&free doesn't work in all situations,
1465
 
         * i.e. it might double-free, so instead of freeing, just push the nodes on a stack, and later free the nodes in that stack,
1466
 
         * (and push to stack only if it doesn't contain it already*/
1467
 
        massert(matcher);
1468
 
 
1469
 
        stack_reset(&matcher->node_stack);
1470
 
        destroy_tree_internal(matcher,matcher->root_regex);
1471
 
        destroy_tree_internal(matcher,matcher->root_regex_hostonly);
1472
 
        while (matcher->node_stack.cnt) {
1473
 
                struct tree_node* node = stack_pop(&matcher->node_stack);
1474
 
                if(node)
1475
 
                        free(node);
1476
 
        }
1477
 
}
1478
 
#ifndef NDEBUG
1479
 
static void dump_node(struct tree_node* node)
1480
 
{
1481
 
        int i;
1482
 
        struct tree_node* p,**children;
1483
 
        massert(node);
1484
 
        if(node->op==OP_LEAF) {
1485
 
                if(node->u.leaf->preg)
1486
 
                        printf("n%p [label=\"regex\\nleaf\"]",(void*)node);
1487
 
                else
1488
 
                        printf("n%p [label=\"%c\\nleaf\"];\n",(void*)node,node->c);
1489
 
                if(node->next && !node->listend) {
1490
 
                        printf("n%p -> n%p;\n",(void*)node,(void*)node->next);
1491
 
                        dump_node(node->next);
1492
 
                }
1493
 
                return;
1494
 
        }
1495
 
        printf("n%p [label=\"%c\\n%d\\nlistend:%d\"];\n",(void*)node,(node->op==OP_ROOT||node->op==OP_PARCLOSE) ?'@' :node->c,node->op,node->listend);
1496
 
        if(node->next)
1497
 
                printf("n%p -> n%p;\n",(void*)node,(void*)node->next);
1498
 
        printf("n%p -> {",(void*)node);/*using address of node as id*/
1499
 
        children = tree_node_get_children(node);
1500
 
        if(node->alternatives)
1501
 
                massert(children);
1502
 
        for(i=0;i<node->alternatives;i++)
1503
 
                printf("n%p ",(void*)children[i]);
1504
 
        if(node->alternatives && children[0]->op!=OP_LEAF)
1505
 
                for(p=children[0]->next;p!=node;p=p->next)
1506
 
                {
1507
 
                        massert(p);
1508
 
                        printf("n%p ",(void*)p);
1509
 
                        if(p->op==OP_LEAF || p->listend)
1510
 
                                break;
1511
 
                }
1512
 
        if(!node->alternatives && children && children[0])
1513
 
                printf("n%p ",(void*)children[0]);
1514
 
        printf("};\n");
1515
 
        printf("{rank=same;");
1516
 
        for(i=0;i<node->alternatives;i++)
1517
 
                printf("n%p ",(void*)node->u.children[i]);
1518
 
        if(node->alternatives && children[0]->op!=OP_LEAF)
1519
 
                for(p=children[0]->next;p!=node;p=p->next) 
1520
 
                {
1521
 
                        printf("n%p ",(void*)p);        
1522
 
                        if(p->op==OP_LEAF || p->listend)
1523
 
                                break;
1524
 
                }
1525
 
        if(!node->alternatives && children && children[0])
1526
 
                printf("n%p ",(void*)children[0]);
1527
 
        printf("};\n");
1528
 
        for(i=0;i<node->alternatives;i++)
1529
 
                dump_node(children[i]);
1530
 
        if(node->alternatives && children[0]->op!=OP_LEAF)
1531
 
                for(p=children[0]->next;p!=node;p=p->next)
1532
 
                {
1533
 
                        dump_node(p);
1534
 
                        if(p->op==OP_LEAF || p->listend)
1535
 
                                break;
1536
 
                }
1537
 
        if(!node->alternatives && children && children[0])
1538
 
                dump_node(children[0]);
1539
 
}
1540
 
 
1541
 
void dump_tree(struct tree_node* root)
1542
 
{
1543
 
        /*use dot/dotty from graphviz to view it*/
1544
 
        massert(root);
1545
 
        printf("digraph tree {\n");
1546
 
        dump_node(root);
1547
 
        printf("}\n");
1548
 
}
1549
 
#endif
 
701
        for(i=0; i < (len/2); i++) {
 
702
                char aux = pattern[i];
 
703
                pattern[i] = pattern[len-i-1];
 
704
                pattern[len-i-1] = aux;
 
705
        }
 
706
        return len;
 
707
}
 
708
 
 
709
static regex_t *new_preg(struct regex_matcher *matcher)
 
710
{
 
711
        regex_t *r;
 
712
        matcher->all_pregs = cli_realloc(matcher->all_pregs, ++matcher->regex_cnt * sizeof(*matcher->all_pregs));
 
713
        if(!matcher->all_pregs)
 
714
                return NULL;
 
715
        r = cli_malloc(sizeof(*r));
 
716
        if(!r)
 
717
                return NULL;
 
718
        matcher->all_pregs[matcher->regex_cnt-1] = r;
 
719
        return r;
 
720
}
 
721
 
 
722
static int add_static_pattern(struct regex_matcher *matcher, char* pattern)
 
723
{
 
724
        size_t len;
 
725
        struct regex_list regex;
 
726
        int rc;
 
727
 
 
728
        len = reverse_string(pattern);
 
729
        regex.nxt = NULL;
 
730
        regex.pattern = cli_strdup(pattern);
 
731
        regex.preg = NULL;
 
732
        rc = add_pattern_suffix(matcher, pattern, len, &regex);
 
733
        free(regex.pattern);
 
734
        return rc;
 
735
}
 
736
 
 
737
int regex_list_add_pattern(struct regex_matcher *matcher, char *pattern)
 
738
{
 
739
        int rc;
 
740
        regex_t *preg;
 
741
        size_t len;
 
742
        /* we only match the host, so remove useless stuff */
 
743
        const char remove_end[] = "([/?].*)?/";
 
744
        const char remove_end2[] = "([/?].*)/";
 
745
 
 
746
        len = strlen(pattern);
 
747
        if(len > sizeof(remove_end)) {
 
748
                if(strncmp(&pattern[len - sizeof(remove_end)+1], remove_end, sizeof(remove_end)-1) == 0) {
 
749
                        len -= sizeof(remove_end) - 1;
 
750
                        pattern[len++]='/';
 
751
                }
 
752
                if(strncmp(&pattern[len - sizeof(remove_end2)+1], remove_end2, sizeof(remove_end2)-1) == 0) {
 
753
                        len -= sizeof(remove_end2) - 1;
 
754
                        pattern[len++]='/';
 
755
                }
 
756
        }
 
757
        pattern[len] = '\0';
 
758
 
 
759
        preg = new_preg(matcher);
 
760
        if(!preg)
 
761
                return CL_EMEM;
 
762
 
 
763
        rc = cli_regex2suffix(pattern, preg, add_pattern_suffix, (void*)matcher);
 
764
        if(rc) {
 
765
                cli_regfree(preg);
 
766
        }
 
767
 
 
768
        return rc;
 
769
}