53
55
#include "matcher.h"
55
57
#include "readdb.h"
58
enum token_op_t {OP_CHAR,OP_STDCLASS,OP_CUSTOMCLASS,OP_DOT,OP_LEAF,OP_ROOT,OP_PARCLOSE};
59
typedef unsigned char* char_bitmap_p;
62
* OP_CHAR: 1 character, c = character
64
* OP_STDCLASS: standard character class, c = char class, class: 1<<(index into std_class of class name)
65
* OP_CUSTOMCLASS: custom character class, first pointer in ptr array is a pointer to the bitmap table for this class
66
* OP_DOT: single . matching any character except \n
67
* OP_LEAF: this is a leaf node, reinterpret structure
70
struct tree_node* next;/* next regex/complex sibling, or parent, if no more siblings , can't be NULL except for root node*/
72
struct tree_node** children;/* alternatives nr. of children, followed by (a null pointer terminated) regex leaf node pointers) */
73
char_bitmap_p* bitmap;
74
struct leaf_info* leaf;
78
char alternatives;/* number of (non-regex) children of node, i.e. sizeof(children)*/
79
char listend;/* no more siblings, next pointer is pointer to parent*/
83
char* info;/* what does it mean that we reached the leaf...*/
84
regex_t* preg;/* this is NULL if leaf node, and non-regex*/
87
/* Character classes */
88
static const char* std_class[] = {
101
/* don't change the order of these strings, unless you change them in generate_tables.c too, and regenerate the tables*/
105
#define STD_CLASS_CNT sizeof(std_class)/sizeof(std_class[0])
107
/* generated by contrib/phishing/generate_tables.c */
108
static const unsigned char char_class_bitmap[STD_CLASS_CNT][32] = {
109
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x03,
110
0xfe, 0xff, 0xff, 0x07, 0xfe, 0xff, 0xff, 0x07,
111
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
112
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
114
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x03,
115
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
116
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
117
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
119
{0x00, 0x00, 0x00, 0x00, 0xfe, 0xff, 0x00, 0xfc,
120
0x01, 0x00, 0x00, 0xf8, 0x01, 0x00, 0x00, 0x78,
121
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
122
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
124
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
125
0xfe, 0xff, 0xff, 0x07, 0xfe, 0xff, 0xff, 0x07,
126
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
127
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
129
{0x00, 0x00, 0x00, 0x00, 0xfe, 0xff, 0xff, 0xff,
130
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f,
131
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
132
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
134
{0x00, 0x3e, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
135
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
136
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
137
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
139
{0x00, 0x02, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
140
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
141
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
142
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
144
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
145
0x00, 0x00, 0x00, 0x00, 0xfe, 0xff, 0xff, 0x07,
146
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
147
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
149
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
150
0xfe, 0xff, 0xff, 0x07, 0x00, 0x00, 0x00, 0x00,
151
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
152
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
154
{0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00,
155
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80,
156
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
157
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
159
{0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
160
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f,
161
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
162
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
164
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x03,
165
0x7e, 0x00, 0x00, 0x00, 0x7e, 0x00, 0x00, 0x00,
166
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
167
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}
170
static const unsigned short int char_class[256] = {
171
0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x260, 0x220, 0x220, 0x220, 0x220, 0x200, 0x200,
172
0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200,
173
0x460, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414,
174
0xc13, 0xc13, 0xc13, 0xc13, 0xc13, 0xc13, 0xc13, 0xc13, 0xc13, 0xc13, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414,
175
0x414, 0xd19, 0xd19, 0xd19, 0xd19, 0xd19, 0xd19, 0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 0x519,
176
0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 0x414, 0x414, 0x414, 0x414, 0x414,
177
0x414, 0xc99, 0xc99, 0xc99, 0xc99, 0xc99, 0xc99, 0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 0x499,
178
0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 0x414, 0x414, 0x414, 0x414, 0x200,
179
0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
180
0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
181
0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
182
0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
183
0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
184
0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
185
0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
186
0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000
189
static const size_t std_class_cnt = sizeof(std_class)/sizeof(std_class[0]);
58
#include "jsparse/textbuf.h"
59
#include "regex_suffix.h"
192
static int add_pattern(struct regex_matcher* matcher,const unsigned char* pat,const char* info,int hostOnly);
193
static int match_node(struct tree_node* node,const unsigned char* c,size_t len,const char** info);
194
static void destroy_tree(struct regex_matcher* matcher);
195
static struct tree_node* tree_root_alloc(void);
196
static int build_regex_list(struct regex_matcher* matcher);
197
static void stack_destroy(struct node_stack* stack);
200
void dump_tree(struct tree_node* root);
61
static regex_t *new_preg(struct regex_matcher *matcher);
62
static size_t reverse_string(char *pattern);
63
static int add_pattern_suffix(void *cbdata, const char *suffix, size_t suffix_len, const struct regex_list *regex);
64
static int add_static_pattern(struct regex_matcher *matcher, char* pattern);
67
/* ----- shift-or filtering -------------- */
69
#define BITMAP_CONTAINS(bmap, val) ((bmap)[(val) >> 5] & (1 << ((val) & 0x1f)))
70
#define BITMAP_INSERT(bmap, val) ((bmap)[(val) >> 5] |= (1 << ((val) & 0x1f)))
72
static void SO_init(struct filter *m)
74
memset(m->B, ~0, sizeof(m->B));
75
memset(m->end, ~0, sizeof(m->end));
76
memset(m->end_fast, ~0, sizeof(m->end_fast));
79
/* because we use uint32_t */
80
#define MAXSOPATLEN 32
82
/* merge another pattern into the filter
83
* add('abc'); add('bcd'); will match [ab][bc][cd] */
84
static int SO_preprocess_add(struct filter *m, const unsigned char *pattern, size_t len)
89
/* cut length, and make it modulo 2 */
90
if(len > MAXSOPATLEN) {
93
/* we use 2-grams, must be multiple of 2 */
99
/* Shift-Or like preprocessing */
100
for(j=0;j < len-1;j++) {
101
/* use overlapping 2-grams. We need them overlapping because matching can start at any position */
102
q = cli_readint16( &pattern[j] );
103
m->B[q] &= ~(1 << j);
105
/* we use variable length patterns, use last character to mark pattern end,
106
* can lead to false positives.*/
107
/* mark that at state j, the q-gram q can end the pattern */
110
m->end[q] &= ~(1 << j);
111
m->end_fast[pattern[j+1]] &= ~(1<<j);
116
/* this is like a FSM, with multiple active states at the same time.
117
* each bit in "state" means an active state, when a char is encountered
118
* we determine what states can remain active.
119
* The FSM transition rules are expressed as bit-masks */
120
long SO_search(const struct filter *m, const unsigned char *data, unsigned long len)
124
const uint32_t *B = m->B;
125
const uint32_t *End = m->end;
126
const uint32_t *EndFast = m->end_fast;
128
/* cut length, and make it modulo 2 */
129
if(len > MAXSOPATLEN) {
132
/* we use 2-grams, must be multiple of 2 */
136
/* Shift-Or like search algorithm */
137
for(j=0;j < len-1; j++) {
138
const uint16_t q0 = cli_readint16( &data[j] );
140
state = (state << 1) | B[q0];
141
/* state marks with a 0 bit all active states
142
* End[q0] marks with a 0 bit all states where the q-gram 'q' can end a pattern
143
* if we got two 0's at matching positions, it means we encountered a pattern's end */
144
match_end = state | EndFast[data[j+1]];
145
if((match_end != 0xffffffff) && (state | End[q0]) != 0xffffffff) {
146
/* note: we rely on short-circuit eval here, we only evaluate and fetch End[q0], if
147
* end_fast has matched. This reduces cache pressure on End[], and allows us to keep the working
150
/* if state is reachable, and this character can finish a pattern, assume match */
151
/* to reduce false positives check if qgram can finish the pattern */
152
/* return position of probable match */
153
/* find first 0 starting from MSB, the position of that bit as counted from LSB, is the length of the
154
* longest pattern that could match */
155
return j >= MAXSOPATLEN ? j - MAXSOPATLEN : 0;
162
/* ----------------------------------------------------------- */
203
165
#define MATCH_SUCCESS 0
204
166
#define MATCH_FAILED -1
246
246
* Do not send NULL pointers to this function!!
249
int regex_list_match(struct regex_matcher* matcher,char* real_url,const char* display_url,const struct pre_fixup_info* pre_fixup,int hostOnly,const char** info,int is_whitelist)
249
int regex_list_match(struct regex_matcher* matcher,char* real_url,const char* display_url,const struct pre_fixup_info* pre_fixup,int hostOnly,const char **info, int is_whitelist)
251
251
char* orig_real_url = real_url;
254
massert(display_url);
252
struct regex_list *regex;
253
size_t real_len, display_len, buffer_len;
256
259
if(!matcher->list_inited)
258
massert(matcher->list_built);
261
assert(matcher->list_built);
259
262
/* skip initial '.' inserted by get_host */
260
263
if(real_url[0] == '.') real_url++;
261
264
if(display_url[0] == '.') display_url++;
265
real_len = strlen(real_url);
266
display_len = strlen(display_url);
267
buffer_len = (hostOnly && !is_whitelist) ? real_len + 1 : real_len + display_len + 1 + 1;
269
/* too short, no match possible */
263
size_t real_len = strlen(real_url);
264
size_t display_len = strlen(display_url);
265
size_t buffer_len = (hostOnly && !is_whitelist) ? real_len : real_len + display_len + 1 + (is_whitelist ? 1 : 0);
266
char* buffer = cli_malloc(buffer_len+1);
273
char *buffer = cli_malloc(buffer_len+1);
269
276
struct cli_ac_data mdata;
277
struct cli_ac_result *res = NULL;
274
282
strncpy(buffer,real_url,real_len);
275
buffer[real_len]= (!is_whitelist && hostOnly) ? '\0' : ':';
283
buffer[real_len]= (!is_whitelist && hostOnly) ? '/' : ':';
276
284
if(!hostOnly || is_whitelist) {
277
285
strncpy(buffer+real_len+1,display_url,display_len);
279
buffer[buffer_len - 1] = '/';
280
buffer[buffer_len]=0;
287
buffer[buffer_len - 1] = '/';
288
buffer[buffer_len]=0;
282
289
cli_dbgmsg("Looking up in regex_list: %s\n", buffer);
285
if((rc = cli_ac_initdata(&mdata, 0, AC_DEFAULT_TRACKLEN)))
289
for(i = 0; i < matcher->root_hosts_cnt; i++) {
290
/* doesn't need to match terminating \0*/
291
rc = cli_ac_scanbuff((unsigned char*)buffer,buffer_len,info, &matcher->root_hosts[i] ,&mdata,0,0,-1,NULL,AC_SCAN_VIR,NULL);
292
cli_ac_freedata(&mdata);
295
const char* matched = strchr(*info,':');
296
const size_t match_len = matched ? strlen(matched+1) : 0;
297
if(((c=get_char_at_pos_with_skip(pre_fixup,buffer,buffer_len+1))==' ' || c=='\0' || c=='/' || c=='?') &&
298
(match_len == buffer_len || /* full match */
299
(match_len < buffer_len &&
300
((c=get_char_at_pos_with_skip(pre_fixup,buffer,buffer_len-match_len))=='.' || (c==' ')) )
301
/* subdomain matched*/)) {
303
cli_dbgmsg("Got a match: %s with %s\n", buffer, *info);
304
cli_dbgmsg("Before inserting .: %s\n", orig_real_url);
305
if(real_len >= match_len + 1) {
306
const size_t pos = real_len - match_len - 1;
307
if(real_url[pos] != '.') {
308
/* we need to shift left, and insert a '.'
309
* we have an extra '.' at the beginning inserted by get_host to have room,
310
* orig_real_url has to be used here,
311
* because we want to overwrite that extra '.' */
312
size_t orig_real_len = strlen(orig_real_url);
313
cli_dbgmsg("No dot here:%s\n",real_url+pos);
314
real_url = orig_real_url;
315
memmove(real_url, real_url+1, orig_real_len-match_len-1);
316
real_url[orig_real_len-match_len-1]='.';
317
cli_dbgmsg("After inserting .: %s\n", real_url);
322
cli_dbgmsg("Ignoring false match: %s with %s, mismatched character: %c\n", buffer, *info, c);
291
if((rc = cli_ac_initdata(&mdata, 0, 0, AC_DEFAULT_TRACKLEN)))
294
bufrev = cli_strdup(buffer);
297
reverse_string(bufrev);
298
rc = SO_search(&matcher->filter, (const unsigned char*)bufrev, buffer_len) != -1;
302
/* filter says this suffix doesn't match.
303
* The filter has false positives, but no false
307
rc = cli_ac_scanbuff((const unsigned char*)bufrev,buffer_len, NULL, (void*)®ex, &res, &matcher->suffixes,&mdata,0,0,-1,NULL,AC_SCAN_VIR,NULL);
309
cli_ac_freedata(&mdata);
313
struct cli_ac_result *q;
314
regex = res->customdata;
315
while(!rc && regex) {
316
/* loop over multiple regexes corresponding to
319
/* we matched a static pattern */
320
rc = validate_subdomain(regex, pre_fixup, buffer, buffer_len, real_url, real_len, orig_real_url);
322
rc = !cli_regexec(regex->preg, buffer, 0, NULL, 0);
324
if(rc) *info = regex->pattern;
329
rc = match_node(hostOnly ? matcher->root_regex_hostonly : matcher->root_regex,(unsigned char*)buffer,buffer_len,info) == MATCH_SUCCESS ? CL_VIRUS : CL_SUCCESS;
332
333
cli_dbgmsg("Lookup result: not in regex list\n");
686
559
/* Done with this matcher, free resources */
687
560
void regex_list_done(struct regex_matcher* matcher)
691
regex_list_cleanup(matcher);
692
if(matcher->list_loaded) {
693
if(matcher->root_hosts) {
695
for(i=0;i<matcher->root_hosts_cnt;i++)
696
cli_ac_free(&matcher->root_hosts[i]);
697
free(matcher->root_hosts);
698
matcher->root_hosts=NULL;
701
matcher->root_hosts_cnt=0;
564
if(matcher->list_inited) {
566
cli_ac_free(&matcher->suffixes);
567
if(matcher->suffix_regexes) {
568
for(i=0;i<matcher->suffix_cnt;i++) {
569
struct regex_list *r = matcher->suffix_regexes[i].head;
571
struct regex_list *q = r;
577
free(matcher->suffix_regexes);
578
matcher->suffix_regexes = NULL;
580
if(matcher->all_pregs) {
581
for(i=0;i<matcher->regex_cnt;i++) {
582
regex_t *r = matcher->all_pregs[i];
586
free(matcher->all_pregs);
588
hashtab_free(&matcher->suffix_hash);
589
cli_bm_free(&matcher->md5_hashes);
702
590
matcher->list_built=0;
703
destroy_tree(matcher);
704
591
matcher->list_loaded=0;
706
593
if(matcher->list_inited) {
707
594
matcher->list_inited=0;
709
stack_destroy(&matcher->node_stack);
710
stack_destroy(&matcher->node_stack_alt);
713
/* Tree matcher algorithm */
717
const unsigned char* start;
718
char_bitmap_p bitmap;
725
enum {TOKEN_CHAR,TOKEN_DOT,TOKEN_PAR_OPEN,TOKEN_PAR_CLOSE,TOKEN_BRACKET,TOKEN_ALT,TOKEN_REGEX,TOKEN_DONE};
727
static const unsigned char* getNextToken(const unsigned char* pat,struct token_t* token)
734
token->type=TOKEN_CHAR;
735
token->u.c = *(++pat);
736
if(islower(token->u.c)) {
737
/* handle \n, \t, etc. */
738
char fmt[3] = {'\\', '\0', '\0'};
742
if(snprintf(&c,1,fmt)!=1) {
743
token->type=TOKEN_REGEX;
744
token->u.start = pat;
752
token->type=TOKEN_ALT;
759
token->type=TOKEN_REGEX;
764
/*see if it is something simple like a list of characters, a range, or negated ...*/
765
const unsigned char* old=pat++;/* save this in case we change our mind and decide this is too complicated for us to handle*/
766
unsigned char range_start=0;
768
char_bitmap_p bitmap = cli_malloc(32);
772
memset(bitmap,0xFF,32);/*match chars not in brackets*/
776
memset(bitmap,0x00,32);
778
/* literal ] can be first character, so test for it at the end of the loop, for example: []] */
779
if (*pat=='-' && hasprev) {
781
unsigned char range_end;
783
massert(range_start);
787
if(pat[2]=='-' && pat[3]=='.' && pat[4]==']')
790
/* this is getting complicated, bail out */
791
cli_warnmsg("confused about collating sequences in regex,bailing out");
793
token->type=TOKEN_REGEX;
801
for(c=range_start+1;c<=range_end;c++)
802
bitmap[c>>3] ^= 1<<(c&0x7);
805
else if (pat[0]=='[' && pat[1]==':') {
806
const unsigned char* end;
811
end=(unsigned char*)strstr((const char*)pat,":]");
813
cli_warnmsg("confused about std char class syntax regex,bailing out");
815
token->type=TOKEN_REGEX;
820
for(i=0;i<std_class_cnt;i++)
821
if(!strncmp((const char*)pat,std_class[i],len)) {
827
if(char_class[i]&(1<<found))
828
bitmap[i>>3] ^= 1<<(i&0x7);
832
cli_warnmsg("confused about regex bracket expression, bailing out");
834
token->type=TOKEN_REGEX;
839
bitmap[*pat>>3] ^= 1<<(*pat&0x7);
845
/*TODO: see if this bitmap already exists, then reuse*/
846
token->type = TOKEN_BRACKET;
847
token->u.bitmap = bitmap;
851
massert(0 && "Encountered ] without matching [");
855
token->type=TOKEN_DOT;
858
token->type=TOKEN_PAR_OPEN;
861
token->type=TOKEN_PAR_CLOSE;
864
token->type=TOKEN_CHAR;
872
#define INITIAL_ALT_STACK 10
873
#define ALT_STACK_GROW 20
875
static const unsigned char* find_regex_start(const unsigned char* pat)
877
struct token_t token;
878
/*TODO: find where the regex part begins, for ex:
879
* abcd+, regex begins at 'd'
881
const unsigned char* last=NULL;
882
const unsigned char* tmp=NULL;
883
const unsigned char** altpositions = cli_malloc(INITIAL_ALT_STACK*sizeof(*altpositions));
884
size_t altpositions_capacity = INITIAL_ALT_STACK;
885
size_t altpositions_cnt = 0;
891
/* Try to parse pattern till special regex chars are encountered, that the tree-matcher doesn't handle, like: +,*,{}.
892
* The tricky part is that once we encounter these, the previous 'atom' has to be passed on to the regex matcher, so we have to
893
* back up to the last known good position
894
* Example, if we have: abc(defg)+, then only abc can be handled by tree parser, so we have to return the position of (.
895
* Another example: abc(defg|xyz|oz+|pdo), the last known good position is |, after xyz
896
* TODO: what about open parantheses? maybe once we found a special char, we have top back out before the first (?
900
pat = getNextToken(pat,&token);
901
if(token.type!=TOKEN_REGEX) {
903
lasttype = token.type;
904
if(token.type==TOKEN_BRACKET && token.u.bitmap)
905
free(token.u.bitmap);
906
if(token.type==TOKEN_ALT || token.type==TOKEN_PAR_OPEN) {
907
/* save this position on stack, succesfully parsed till here*/
908
if(altpositions_cnt && altpositions[altpositions_cnt-1][0]=='|')
909
/* encountered another alternate (|) operator, override previous | position stored */
910
altpositions[altpositions_cnt-1]=last;
912
altpositions[altpositions_cnt++] = last;
913
if(altpositions_cnt == altpositions_capacity) {
914
altpositions_capacity += ALT_STACK_GROW;
915
altpositions = cli_realloc2(altpositions,altpositions_capacity*sizeof(*altpositions));
920
} else if (lasttype==TOKEN_PAR_CLOSE) {
921
/* remove last stored position from stack, succesfully this last group */
923
massert(altpositions_cnt>0);
928
last = altpositions[0 /*altpositions_cnt-1*/];/*TODO: which index here?, see above TODO... */
929
/*last stored 'safe' position where no special (+,*,{}) regex chars were encountered*/
931
} while(*pat && token.type!=TOKEN_REGEX);
933
return *pat ? last : last+1;
936
static struct tree_node* tree_node_alloc(struct tree_node* next,char listend)
938
struct tree_node* node = cli_malloc(sizeof(*node));
940
node->alternatives=0;
942
node->listend=listend;
943
node->u.children=NULL;
948
static struct tree_node* tree_root_alloc(void)
950
struct tree_node* root=tree_node_alloc(NULL,1);
960
static struct tree_node* tree_node_char_binsearch(const struct tree_node* node,const char csearch,int* left)
963
struct tree_node **children;
967
children = tree_node_get_children(node);
968
right = node->alternatives-1;
970
if(!node->alternatives)
973
while(*left<=right) {
974
int mid = *left+(right-*left)/2;
975
if(children[mid]->c == csearch)
976
return children[mid];
977
else if(children[mid]->c < csearch)
985
static struct tree_node* tree_get_next(struct tree_node* node)
987
struct tree_node** children;
989
children = tree_node_get_children(node);
991
if(!node->alternatives && children && children[0])
993
else if(node->alternatives<=1)
996
return children[0]->next;
999
static size_t tree_node_get_array_size(const struct tree_node* node)
1002
/* if op is CUSTOMCLASS, then first pointer is pointer to bitmap, so array size is +1 */
1003
return (node->alternatives + (node->op==OP_CUSTOMCLASS ? 1 : 0)) * sizeof(node->u.children[0]);
1006
static struct tree_node* tree_node_char_insert(struct tree_node* node,const char c,int left)
1008
struct tree_node* new, *alt = tree_get_next(node);
1009
struct tree_node **children;
1010
node->alternatives++;
1011
node->u.children = cli_realloc2(node->u.children,tree_node_get_array_size(node));
1012
if(!node->u.children)
1015
children = node->op==OP_CUSTOMCLASS ? node->u.children+1 : node->u.children;
1017
new = tree_node_alloc(alt , node == alt );
1023
if(node->alternatives-left-1>0)
1024
memmove(&children[left+1],&children[left],(node->alternatives-left-1)*sizeof(node->u.children[0]));
1025
children[left] = new;
1030
static void tree_node_insert_nonbin(struct tree_node* node, struct tree_node* new)
1032
struct tree_node **children;
1036
children = tree_node_get_children(node);
1037
if(node->alternatives) {
1039
if(children[0]->next == node) {
1042
for(i=0;i<node->alternatives;i++) {
1043
children[i]->next = new;
1044
children[i]->listend = 0;
1048
struct tree_node* p;
1049
for(p = children[0]->next ; p->next != node ; p = p->next)
1050
massert(!p->listend);
1057
int idx = node->op==OP_CUSTOMCLASS ? 1 : 0;
1058
if(node->u.children)
1059
if(node->u.children[idx]) {
1060
node = node->u.children[idx];
1061
while(node->next && !node->listend)
1064
new->next = node->next;
1069
node->u.children = cli_realloc2(node->u.children,sizeof(node->u.children[0])*(2));
1070
if(node->u.children) {
1071
node->u.children[idx] = new;
1076
static unsigned char char_getclass(const unsigned char* bitmap)
1081
for(i=0;i<std_class_cnt;i++)
1082
if(!memcmp(bitmap,char_class_bitmap[i],256>>3))
1084
return std_class_cnt;
1087
static void stack_destroy(struct node_stack* stack)
1093
stack->capacity = 0;
1096
/* call this after whitelist load is complete, and the tree is no longer going to be modified */
1097
void regex_list_cleanup(struct regex_matcher* matcher)
1101
stack_destroy(&matcher->node_stack);
1102
stack_destroy(&matcher->node_stack_alt);
1103
stack_init(&matcher->node_stack);
1104
stack_init(&matcher->node_stack_alt);
1107
598
int is_regex_ok(struct regex_matcher* matcher)
1110
601
return (!matcher->list_inited || matcher->list_inited!=-1);/* either we don't have a regexlist, or we initialized it successfully */
1113
/* returns 0 on success, regexec error code otherwise */
1114
static int add_pattern(struct regex_matcher* matcher,const unsigned char* pat,const char* info, int hostonly)
1117
const unsigned char* pat_end = find_regex_start(pat);
1118
struct token_t token;
1119
struct tree_node* node;
1123
node = hostonly ? matcher->root_regex_hostonly : matcher->root_regex;
1125
stack_reset(&matcher->node_stack);
1126
stack_reset(&matcher->node_stack_alt);
1127
stack_push(&matcher->node_stack,node);
1129
for(;node->op!=OP_LEAF;){
1131
pat = getNextToken(pat,&token);
1133
token.type = TOKEN_REGEX;
1137
token.type = TOKEN_DONE;
1139
switch(token.type) {
1142
/* search for char in tree */
1144
struct tree_node* newnode = tree_node_char_binsearch(node,token.u.c,&left);
1148
/* not found, insert it */
1149
node = tree_node_char_insert(node,token.u.c,left);
1154
case TOKEN_PAR_OPEN:
1155
stack_push(&matcher->node_stack_alt,NULL);/* marker */
1156
stack_push(&matcher->node_stack,node);
1159
case TOKEN_PAR_CLOSE: {
1160
/*TODO: test this!!!*/
1161
struct tree_node* node_alt = node;
1162
node = tree_node_alloc(NULL,1);
1163
node->op=OP_PARCLOSE;
1166
tree_node_insert_nonbin(node_alt,node);
1167
while (( node_alt = stack_pop(&matcher->node_stack_alt) )) {
1168
tree_node_insert_nonbin(node_alt,node);
1170
stack_pop(&matcher->node_stack);
1175
stack_push(&matcher->node_stack_alt,node);
1176
node = stack_pop(&matcher->node_stack);
1177
stack_push(&matcher->node_stack,node);
1182
struct tree_node* new = tree_node_alloc(tree_get_next(node),1);
1183
unsigned char charclass = char_getclass(token.u.bitmap);
1184
if(charclass == std_class_cnt) {/*not a std char class*/
1185
new->op = OP_CUSTOMCLASS;
1186
new->u.children = cli_malloc(sizeof(new->u.children[0])*2);
1187
if(!new->u.children)
1189
new->u.bitmap[0] = token.u.bitmap;
1190
new->u.bitmap[1] = NULL;
1191
tree_node_insert_nonbin(node,new);
1195
new->op = OP_STDCLASS;
1197
tree_node_insert_nonbin(node,new);
1205
struct tree_node* new = tree_node_alloc(tree_get_next(node),1);
1207
tree_node_insert_nonbin(node,new);
1214
struct leaf_info* leaf=cli_malloc(sizeof(*leaf));
1217
leaf->info = cli_strdup(info);
1218
if(token.type==TOKEN_REGEX) {
1220
struct tree_node* new;
1222
preg=cli_malloc(sizeof(*preg));
1225
rc = cli_regcomp(preg,(const char*)token.u.start,REG_EXTENDED|(bol?0:REG_NOTBOL));
1229
new=cli_malloc(sizeof(*new));
1234
new->alternatives=0;
1237
tree_node_insert_nonbin(node,new);
1241
node->alternatives=0;
604
static int add_newsuffix(struct regex_matcher *matcher, struct regex_list *info, const char *suffix, size_t len)
606
struct cli_matcher *root = &matcher->suffixes;
607
struct cli_ac_patt *new = cli_calloc(1,sizeof(*new));
613
assert(root && suffix);
625
new->ch[0] = new->ch[1] |= CLI_MATCH_IGNORE;
626
if(new->length > root->maxpatlen)
627
root->maxpatlen = new->length;
629
new->pattern = cli_malloc(sizeof(new->pattern[0])*len);
635
new->pattern[i] = suffix[i];/*new->pattern is short int* */
637
new->customdata = info;
639
if((ret = cli_ac_addpatt(root,new))) {
644
SO_preprocess_add(&matcher->filter, (const unsigned char*)suffix, len);
648
#define MODULE "regex_list: "
649
/* ------ load a regex, determine suffix, determine suffix2regexlist map ---- */
651
static void list_add_tail(struct regex_list_ht *ht, struct regex_list *regex)
656
ht->tail->nxt = regex;
661
/* returns 0 on success, clamav error code otherwise */
662
static int add_pattern_suffix(void *cbdata, const char *suffix, size_t suffix_len, const struct regex_list *iregex)
664
struct regex_matcher *matcher = cbdata;
665
struct regex_list *regex = cli_malloc(sizeof(*regex));
666
const struct element *el;
671
regex->pattern = iregex->pattern ? cli_strdup(iregex->pattern) : NULL;
672
regex->preg = iregex->preg;
674
el = hashtab_find(&matcher->suffix_hash, suffix, suffix_len);
675
/* TODO: what if suffixes are prefixes of eachother and only one will
678
/* existing suffix */
679
assert((size_t)el->data < matcher->suffix_cnt);
680
list_add_tail(&matcher->suffix_regexes[el->data], regex);
681
cli_dbgmsg(MODULE "added new regex to existing suffix %s: %s\n", suffix, regex->pattern);
684
size_t n = matcher->suffix_cnt++;
685
el = hashtab_insert(&matcher->suffix_hash, suffix, suffix_len, n);
686
matcher->suffix_regexes = cli_realloc(matcher->suffix_regexes, (n+1)*sizeof(*matcher->suffix_regexes));
687
if(!matcher->suffix_regexes)
689
matcher->suffix_regexes[n].tail = regex;
690
matcher->suffix_regexes[n].head = regex;
691
add_newsuffix(matcher, regex, suffix, suffix_len);
692
cli_dbgmsg(MODULE "added new suffix %s, for regex: %s\n", suffix, regex->pattern);
1254
/* c has to be unsigned char here!! */
1255
static int match_node(struct tree_node* node,const unsigned char* c,size_t len,const char** info)
1257
struct tree_node** children;
1264
if(!node->u.children)
1265
return MATCH_FAILED;/* tree empty */
1271
children = node->u.children;
1277
/*this isn't a real character, so don't move*/
1283
massert(*c==node->c && "We know this has to match");
1284
rc = 1;/* *c==node->c;- we know it has matched */
1290
rc = char_class[*c]&(node->c);
1292
case OP_CUSTOMCLASS:
1294
char_bitmap_p bitmap;
1296
bitmap = (char_bitmap_p)node->u.bitmap[0];
1298
rc = bitmap[*c>>3]&(1<<(*c&0x7));
1303
const struct leaf_info* leaf = node->u.leaf;
1306
rc = !cli_regexec(leaf->preg,(const char*)c,0,NULL,0);
1309
massert(*c==node->c && "We know this has to match[2]");
1314
return MATCH_SUCCESS;
1320
cli_errmsg("Encountered invalid operator in tree:%d\n",node->op);
1327
const char csearch = *c;
1328
int left = 0,right = node->alternatives-1;
1330
/*matched so far, go deeper*/
1331
/*do a binary search between children */
1333
while(left<=right) {
1334
mid = left+(right-left)/2;
1335
if (children[mid]->c == csearch)
1337
else if(children[mid]->c < csearch)
1343
node = children[mid];
1347
if(node->alternatives) {
1348
if(!children[0]->listend) {
1353
while(node && node->listend) {
1354
node = node->next;/* climb up */
1358
if(!node || !node->next)
1359
return MATCH_FAILED;/* reached root node */
1364
else if(node->u.children) {
1365
struct tree_node* rewrite_next = NULL;
1366
if(node->op==OP_PARCLOSE)
1367
rewrite_next = node;
1370
massert(node->op!=OP_CHAR);
1372
node->next = rewrite_next;/* this node is pointed to by several parent nodes,
1374
from which one we came, so we can find out way back
1375
should we fail to match somewhere deeper*/
1380
/* this node didn't match, try sibling, or parent (if no more siblings) */
1381
while(node && node->listend) {
1382
node = node->next;/* sibling of parent */
1386
if(!node || !node->next) /* reached root node, it has no next */
1387
return MATCH_FAILED;
1395
return MATCH_FAILED;
1398
/* push node on stack, only if it isn't there already */
1399
static void stack_push_once(struct node_stack* stack,struct tree_node* node)
697
static size_t reverse_string(char *pattern)
699
size_t len = strlen(pattern);
1405
for(i=0;i < stack->cnt;i++)
1406
if(stack->data[i]==node)
1408
stack_push(stack,node);
1411
static void destroy_tree_internal(struct regex_matcher* matcher,struct tree_node* node)
1413
struct tree_node **children;
1417
children = tree_node_get_children(node);
1418
if(node->op==OP_LEAF) {
1419
struct leaf_info* leaf = node->u.leaf;
1420
if(node->next && !node->listend)
1421
destroy_tree_internal(matcher,node->next);
1422
stack_push_once(&matcher->node_stack,(struct tree_node*)node->u.leaf);/* cast to make compiler happy, and to not make another stack implementation for storing void* */
1423
stack_push_once(&matcher->node_stack,node);
1425
cli_regfree(leaf->preg);
1435
if(node->alternatives) {
1437
struct tree_node* p;
1439
p = children[0]->op==OP_LEAF ? NULL : children[0]->next;
1440
for(i=0;i<node->alternatives;i++)
1441
destroy_tree_internal(matcher,children[i]);
1443
destroy_tree_internal(matcher,p);/*?? is this ok, or without _internal?*/
1448
destroy_tree_internal(matcher,children[0]);
1451
if(node->op!=OP_LEAF && node->next && !node->listend)
1452
destroy_tree_internal(matcher,node->next);
1453
if(node->u.children)
1454
stack_push_once(&matcher->node_stack,(struct tree_node*)node->u.children);/* cast to make compiler happy, it isn't really a tree_node* */
1455
if(node->op==OP_CUSTOMCLASS && node->u.children[0]) {
1456
free(node->u.children[0]);
1457
node->u.children[0]=NULL;
1459
stack_push_once(&matcher->node_stack,node);
1462
static void destroy_tree(struct regex_matcher* matcher)
1464
/* we might have the same node linked by different nodes, so a recursive walk&free doesn't work in all situations,
1465
* i.e. it might double-free, so instead of freeing, just push the nodes on a stack, and later free the nodes in that stack,
1466
* (and push to stack only if it doesn't contain it already*/
1469
stack_reset(&matcher->node_stack);
1470
destroy_tree_internal(matcher,matcher->root_regex);
1471
destroy_tree_internal(matcher,matcher->root_regex_hostonly);
1472
while (matcher->node_stack.cnt) {
1473
struct tree_node* node = stack_pop(&matcher->node_stack);
1479
static void dump_node(struct tree_node* node)
1482
struct tree_node* p,**children;
1484
if(node->op==OP_LEAF) {
1485
if(node->u.leaf->preg)
1486
printf("n%p [label=\"regex\\nleaf\"]",(void*)node);
1488
printf("n%p [label=\"%c\\nleaf\"];\n",(void*)node,node->c);
1489
if(node->next && !node->listend) {
1490
printf("n%p -> n%p;\n",(void*)node,(void*)node->next);
1491
dump_node(node->next);
1495
printf("n%p [label=\"%c\\n%d\\nlistend:%d\"];\n",(void*)node,(node->op==OP_ROOT||node->op==OP_PARCLOSE) ?'@' :node->c,node->op,node->listend);
1497
printf("n%p -> n%p;\n",(void*)node,(void*)node->next);
1498
printf("n%p -> {",(void*)node);/*using address of node as id*/
1499
children = tree_node_get_children(node);
1500
if(node->alternatives)
1502
for(i=0;i<node->alternatives;i++)
1503
printf("n%p ",(void*)children[i]);
1504
if(node->alternatives && children[0]->op!=OP_LEAF)
1505
for(p=children[0]->next;p!=node;p=p->next)
1508
printf("n%p ",(void*)p);
1509
if(p->op==OP_LEAF || p->listend)
1512
if(!node->alternatives && children && children[0])
1513
printf("n%p ",(void*)children[0]);
1515
printf("{rank=same;");
1516
for(i=0;i<node->alternatives;i++)
1517
printf("n%p ",(void*)node->u.children[i]);
1518
if(node->alternatives && children[0]->op!=OP_LEAF)
1519
for(p=children[0]->next;p!=node;p=p->next)
1521
printf("n%p ",(void*)p);
1522
if(p->op==OP_LEAF || p->listend)
1525
if(!node->alternatives && children && children[0])
1526
printf("n%p ",(void*)children[0]);
1528
for(i=0;i<node->alternatives;i++)
1529
dump_node(children[i]);
1530
if(node->alternatives && children[0]->op!=OP_LEAF)
1531
for(p=children[0]->next;p!=node;p=p->next)
1534
if(p->op==OP_LEAF || p->listend)
1537
if(!node->alternatives && children && children[0])
1538
dump_node(children[0]);
1541
void dump_tree(struct tree_node* root)
1543
/*use dot/dotty from graphviz to view it*/
1545
printf("digraph tree {\n");
701
for(i=0; i < (len/2); i++) {
702
char aux = pattern[i];
703
pattern[i] = pattern[len-i-1];
704
pattern[len-i-1] = aux;
709
static regex_t *new_preg(struct regex_matcher *matcher)
712
matcher->all_pregs = cli_realloc(matcher->all_pregs, ++matcher->regex_cnt * sizeof(*matcher->all_pregs));
713
if(!matcher->all_pregs)
715
r = cli_malloc(sizeof(*r));
718
matcher->all_pregs[matcher->regex_cnt-1] = r;
722
static int add_static_pattern(struct regex_matcher *matcher, char* pattern)
725
struct regex_list regex;
728
len = reverse_string(pattern);
730
regex.pattern = cli_strdup(pattern);
732
rc = add_pattern_suffix(matcher, pattern, len, ®ex);
737
int regex_list_add_pattern(struct regex_matcher *matcher, char *pattern)
742
/* we only match the host, so remove useless stuff */
743
const char remove_end[] = "([/?].*)?/";
744
const char remove_end2[] = "([/?].*)/";
746
len = strlen(pattern);
747
if(len > sizeof(remove_end)) {
748
if(strncmp(&pattern[len - sizeof(remove_end)+1], remove_end, sizeof(remove_end)-1) == 0) {
749
len -= sizeof(remove_end) - 1;
752
if(strncmp(&pattern[len - sizeof(remove_end2)+1], remove_end2, sizeof(remove_end2)-1) == 0) {
753
len -= sizeof(remove_end2) - 1;
759
preg = new_preg(matcher);
763
rc = cli_regex2suffix(pattern, preg, add_pattern_suffix, (void*)matcher);