~ubuntu-branches/ubuntu/feisty/clamav/feisty

« back to all changes in this revision

Viewing changes to libclamav/regex_list.c

  • Committer: Bazaar Package Importer
  • Author(s): Kees Cook
  • Date: 2007-02-20 10:33:44 UTC
  • mto: This revision was merged to the branch mainline in revision 16.
  • Revision ID: james.westby@ubuntu.com-20070220103344-zgcu2psnx9d98fpa
Tags: upstream-0.90
ImportĀ upstreamĀ versionĀ 0.90

Show diffs side-by-side

added added

removed removed

Lines of Context:
1
1
/*
2
2
 *  Match a string against a list of patterns/regexes.
3
3
 *
4
 
 *  Copyright (C) 2007-2008 Sourcefire, Inc.
5
 
 *
6
 
 *  Authors: Tƶrƶk Edvin
 
4
 *  Copyright (C) 2006 Tļæ½rļæ½k Edvin <edwintorok@gmail.com>
7
5
 *
8
6
 *  This program is free software; you can redistribute it and/or modify
9
 
 *  it under the terms of the GNU General Public License version 2 as
10
 
 *  published by the Free Software Foundation.
 
7
 *  it under the terms of the GNU General Public License as published by
 
8
 *  the Free Software Foundation; either version 2 of the License, or
 
9
 *  (at your option) any later version.
11
10
 *
12
11
 *  This program is distributed in the hope that it will be useful,
13
12
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
18
17
 *  along with this program; if not, write to the Free Software
19
18
 *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
20
19
 *  MA 02110-1301, USA.
 
20
 *
 
21
 *  $Log: regex_list.c,v $
 
22
 *  Revision 1.18  2006/12/19 20:47:45  tkojm
 
23
 *  strict whitelisting
 
24
 *
 
25
 *  Revision 1.17  2006/12/19 20:30:17  tkojm
 
26
 *  fix some compiler warnings
 
27
 *
 
28
 *  Revision 1.16  2006/12/02 00:42:44  tkojm
 
29
 *  add functionality level support for .pdb/.wdb files
 
30
 *
 
31
 *  Revision 1.15  2006/11/15 15:26:54  tkojm
 
32
 *  pattern matcher accuracy improvements
 
33
 *
 
34
 *  Revision 1.14  2006/11/05 18:16:56  acab
 
35
 *  Patch for bug 52 from Edvin
 
36
 *
 
37
 *  Revision 1.13  2006/10/30 17:53:03  tkojm
 
38
 *  apply patch from Edvin reported by Luca
 
39
 *
 
40
 *  Revision 1.12  2006/10/28 15:34:40  tkojm
 
41
 *  .pdb/.wdb files now use colon as delimiter
 
42
 *
 
43
 *  Revision 1.11  2006/10/15 19:16:33  tkojm
 
44
 *  allow loading multiple .pdb/.wdb files
 
45
 *
 
46
 *  Revision 1.10  2006/10/14 23:52:02  tkojm
 
47
 *  code cleanup
 
48
 *
 
49
 *  Revision 1.9  2006/10/10 23:51:49  tkojm
 
50
 *  apply patches for the anti-phish code from Edwin
 
51
 *
 
52
 *  Revision 1.8  2006/10/08 18:55:15  tkojm
 
53
 *  fix crash in phishing code on database reload (Edvin Torok)
 
54
 *
 
55
 *  Revision 1.7  2006/10/07 11:00:46  tkojm
 
56
 *  make the experimental anti-phishing code more thread safe
 
57
 *
 
58
 *  Revision 1.6  2006/09/27 19:14:49  njh
 
59
 *  Fix segfault on Solaris
 
60
 *
 
61
 *  Revision 1.5  2006/09/26 18:55:36  njh
 
62
 *  Fixed portability issues
 
63
 *
 
64
 *  Revision 1.4  2006/09/25 18:27:00  njh
 
65
 *  Fix handling of escaped characters
 
66
 *
 
67
 *  Revision 1.3  2006/09/24 19:28:03  acab
 
68
 *  fixes for type "R" regex handler
 
69
 *
 
70
 *  Revision 1.2  2006/09/16 15:49:27  acab
 
71
 *  phishing: fixed bugs and updated docs
 
72
 *
 
73
 *  Revision 1.1  2006/09/12 19:38:39  acab
 
74
 *  Phishing module merge - libclamav
 
75
 *
 
76
 *  Revision 1.13  2006/09/11 19:25:08  edwin
 
77
 *  Non-printable characters in regex (although they are invalid inside an url, added some support for it).
 
78
 *
 
79
 *  Revision 1.12  2006/08/28 08:43:06  edwin
 
80
 *  Fixed a few minor leaks.
 
81
 *  Valgrind now says:"All heap blocks were freed -- no leaks are possible"
 
82
 *
 
83
 *  Revision 1.11  2006/08/20 21:18:11  edwin
 
84
 *  Added the script used to generate iana_tld.sh
 
85
 *  Added checks for phish_domaincheck_db
 
86
 *  Added phishing module design document from wiki (as discussed with aCaB).
 
87
 *  Updated .wdb/.pdb format documentation (in regex_list.c)
 
88
 *  Fixed some memory leaks in regex_list.c
 
89
 *  IOW: cleanups before the deadline.
 
90
 *  I consider my module to be ready for evaluation now.
 
91
 *
 
92
 *  Revision 1.10  2006/08/20 19:42:02  edwin
 
93
 *  Fix custom character class, and generic regex handling.
 
94
 *
 
95
 *  Revision 1.9  2006/08/19 21:08:47  edwin
 
96
 *  Fixed:Forgot to add form tag handling when it contains images.
 
97
 *  Various fixes to get rid of gcc warnings.
 
98
 *
 
99
 *  Revision 1.8  2006/08/19 09:26:51  edwin
 
100
 *  regex_list.c: Fixed regex alternatives handling (bug discovered with autotests).
 
101
 *  And forgot to commit manager.c last time.
 
102
 *
 
103
 *  Revision 1.7  2006/08/17 20:31:43  edwin
 
104
 *  Disable extracting hrefs from mails in mbox, if: we aren't scanning for phish, and mailfollowurls is off.
 
105
 *  Fix a still reachable leak. Remove unneeded build_regex_list export.
 
106
 *
 
107
 *  Revision 1.6  2006/08/12 14:35:34  edwin
 
108
 *  Fix some compiler warnings.
 
109
 *  Fix an assertion failure in regex_list.
 
110
 *  Interpret display links that start with http|https|ftp, always as an URL.
 
111
 *
 
112
 *  Revision 1.5  2006/08/06 20:27:07  edwin
 
113
 *  New option to enable phish scan for all domains (disabled by default).
 
114
 *  You will now have to run clamscan --phish-scan-alldomains to have any phishes detected.
 
115
 *  Updated phishcheck control flow to better incorporate the domainlist.
 
116
 *  Updated manpage with new options.
 
117
 *
 
118
 *  TODO:there is a still-reachable leak in regex_list.c
 
119
 *
 
120
 *  Revision 1.4  2006/08/01 20:19:15  edwin
 
121
 *  Integrate domainlist check into phishcheck. Warning: enabled by default.
 
122
 *  Regex bracket handling update.
 
123
 *  Better regex paranthesized & alternate expression handling.
 
124
 *
 
125
 *  Revision 1.3  2006/07/31 20:12:30  edwin
 
126
 *  Preliminary support for domain databases (domains to check by phishmodule)
 
127
 *  Better memory allocation failure handling in regex_list
 
128
 *
21
129
 */
22
130
 
23
131
#if HAVE_CONFIG_H
24
132
#include "clamav-config.h"
25
133
#endif
26
134
 
 
135
#ifdef CL_EXPERIMENTAL
 
136
 
 
137
#ifndef CL_DEBUG
 
138
#define NDEBUG
 
139
#endif
 
140
 
27
141
#ifdef CL_THREAD_SAFE
28
142
#ifndef _REENTRANT
29
143
#define _REENTRANT
32
146
 
33
147
#include <stdio.h>
34
148
#include <stdlib.h>
 
149
#include <errno.h>
35
150
#include <string.h>
 
151
#ifdef  HAVE_STRINGS_H
 
152
#include <strings.h>
 
153
#endif
36
154
#include <ctype.h>
37
 
#include <zlib.h>
38
155
 
39
156
#include <limits.h>
40
157
#include <sys/types.h>
41
 
#include <assert.h>
42
 
 
43
 
 
44
 
#include "regex/regex.h"
45
 
 
 
158
 
 
159
#ifdef  HAVE_REGEX_H
 
160
/*#define USE_PCRE*/
 
161
#include <regex.h>
 
162
#endif
 
163
 
 
164
#if defined(HAVE_READDIR_R_3) || defined(HAVE_READDIR_R_2)
 
165
#include <stddef.h>
 
166
#endif
46
167
 
47
168
#include "clamav.h"
48
169
#include "others.h"
 
170
#include "defaults.h"
 
171
#include "str.h"
 
172
#include "filetypes.h"
 
173
#include "mbox.h"
49
174
#include "regex_list.h"
50
175
#include "matcher-ac.h"
51
 
#include "matcher.h"
52
 
#include "str.h"
53
 
#include "readdb.h"
54
 
#include "jsparse/textbuf.h"
55
 
#include "regex_suffix.h"
56
 
#include "default.h"
57
 
#include "hashtab.h"
58
 
 
59
 
#include "mpool.h"
 
176
 
 
177
 
 
178
/*Tree*/
 
179
enum token_op_t {OP_CHAR,OP_STDCLASS,OP_CUSTOMCLASS,OP_DOT,OP_LEAF,OP_ROOT,OP_PARCLOSE};
 
180
typedef unsigned char* char_bitmap_p;
 
181
/*
 
182
 *
 
183
 * OP_CHAR: 1 character, c = character
 
184
 * complex stuff:
 
185
 * OP_STDCLASS: standard character class, c = char class, class: 1<<(index into std_class of class name)
 
186
 * OP_CUSTOMCLASS: custom character class, first pointer in ptr array is a pointer to the bitmap table for this class
 
187
 * OP_DOT: single . matching any character except \n
 
188
 * OP_LEAF: this is a leaf node, reinterpret structure
 
189
 */
 
190
struct tree_node {
 
191
        struct tree_node* next;/* next regex/complex sibling, or parent, if no more siblings , can't be NULL except for root node*/
 
192
        unsigned char c;
 
193
        enum token_op_t op;
 
194
        char alternatives;/* number of (non-regex) children of node, i.e. sizeof(children)*/
 
195
        char listend;/* no more siblings, next pointer is pointer to parent*/
 
196
        union {
 
197
                struct tree_node** children;/* alternatives nr. of children, followed by (a null pointer terminated) regex leaf node pointers) */
 
198
                char_bitmap_p* bitmap;
 
199
                struct leaf_info*  leaf;
 
200
        } u;
 
201
};
 
202
 
 
203
struct leaf_info {
 
204
        char* info;/* what does it mean that we reached the leaf...*/
 
205
        regex_t* preg;/* this is NULL if leaf node, and non-regex*/
 
206
};
 
207
 
 
208
/* Character classes */
 
209
static const char* std_class[] = {
 
210
        "[:alnum:]",
 
211
        "[:digit:]",
 
212
        "[:punct:]",
 
213
        "[:alpha:]",
 
214
        "[:graph:]",
 
215
        "[:space:]",
 
216
        "[:blank:]",
 
217
        "[:lower:]", 
 
218
        "[:upper:]",
 
219
        "[:cntrl:]",
 
220
        "[:print:]",
 
221
        "[:xdigit:]"
 
222
        /* don't change the order of these strings, unless you change them in generate_tables.c too, and regenerate the tables*/
 
223
};
 
224
 
 
225
 
 
226
#define STD_CLASS_CNT sizeof(std_class)/sizeof(std_class[0])
 
227
 
 
228
/* generated by contrib/phishing/generate_tables.c */
 
229
static const unsigned char char_class_bitmap[STD_CLASS_CNT][32] = {
 
230
        {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x03, 
 
231
         0xfe, 0xff, 0xff, 0x07, 0xfe, 0xff, 0xff, 0x07, 
 
232
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
 
233
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
 
234
 
 
235
        {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x03, 
 
236
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
 
237
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
 
238
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
 
239
 
 
240
        {0x00, 0x00, 0x00, 0x00, 0xfe, 0xff, 0x00, 0xfc, 
 
241
         0x01, 0x00, 0x00, 0xf8, 0x01, 0x00, 0x00, 0x78, 
 
242
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
 
243
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
 
244
 
 
245
        {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
 
246
         0xfe, 0xff, 0xff, 0x07, 0xfe, 0xff, 0xff, 0x07, 
 
247
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
 
248
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
 
249
 
 
250
        {0x00, 0x00, 0x00, 0x00, 0xfe, 0xff, 0xff, 0xff, 
 
251
         0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f, 
 
252
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
 
253
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
 
254
 
 
255
        {0x00, 0x3e, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 
 
256
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
 
257
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
 
258
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
 
259
 
 
260
        {0x00, 0x02, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 
 
261
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
 
262
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
 
263
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
 
264
 
 
265
        {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
 
266
         0x00, 0x00, 0x00, 0x00, 0xfe, 0xff, 0xff, 0x07, 
 
267
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
 
268
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
 
269
 
 
270
        {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
 
271
         0xfe, 0xff, 0xff, 0x07, 0x00, 0x00, 0x00, 0x00, 
 
272
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
 
273
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
 
274
 
 
275
        {0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 
 
276
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 
 
277
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
 
278
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
 
279
 
 
280
        {0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 
 
281
         0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f, 
 
282
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
 
283
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
 
284
 
 
285
        {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x03, 
 
286
         0x7e, 0x00, 0x00, 0x00, 0x7e, 0x00, 0x00, 0x00, 
 
287
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
 
288
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}
 
289
};
 
290
 
 
291
static const unsigned short int char_class[256] = {
 
292
        0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x260, 0x220, 0x220, 0x220, 0x220, 0x200, 0x200, 
 
293
        0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 
 
294
        0x460, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 
 
295
        0xc13, 0xc13, 0xc13, 0xc13, 0xc13, 0xc13, 0xc13, 0xc13, 0xc13, 0xc13, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 
 
296
        0x414, 0xd19, 0xd19, 0xd19, 0xd19, 0xd19, 0xd19, 0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 
 
297
        0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 0x414, 0x414, 0x414, 0x414, 0x414, 
 
298
        0x414, 0xc99, 0xc99, 0xc99, 0xc99, 0xc99, 0xc99, 0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 
 
299
        0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 0x414, 0x414, 0x414, 0x414, 0x200, 
 
300
        0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 
 
301
        0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 
 
302
        0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 
 
303
        0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 
 
304
        0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 
 
305
        0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 
 
306
        0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 
 
307
        0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000
 
308
};
 
309
 
 
310
static const size_t std_class_cnt =  sizeof(std_class)/sizeof(std_class[0]);
60
311
 
61
312
/* Prototypes */
62
 
static regex_t *new_preg(struct regex_matcher *matcher);
63
 
static size_t reverse_string(char *pattern);
64
 
static int add_pattern_suffix(void *cbdata, const char *suffix, size_t suffix_len, const struct regex_list *regex);
65
 
static int add_static_pattern(struct regex_matcher *matcher, char* pattern);
66
 
/* ---------- */
67
 
 
68
 
#define MATCH_SUCCESS 0
 
313
static int add_pattern(struct regex_matcher* matcher,const unsigned char* pat,const char* info);
 
314
static int match_node(struct tree_node* node,const unsigned char* c,size_t len,const char** info);
 
315
static void destroy_tree(struct regex_matcher* matcher);
 
316
static struct tree_node* tree_root_alloc(void);
 
317
static int build_regex_list(struct regex_matcher* matcher);
 
318
static void stack_destroy(struct node_stack* stack);
 
319
 
 
320
#ifndef NDEBUG
 
321
void dump_tree(struct tree_node* root);
 
322
#endif
 
323
 
 
324
#define MATCH_SUCCESS 0 
69
325
#define MATCH_FAILED  -1
70
326
 
71
327
/*
78
334
}
79
335
 
80
336
 
81
 
static inline size_t get_char_at_pos_with_skip(const struct pre_fixup_info* info, const char* buffer, size_t pos)
82
 
{
83
 
        const char* str;
84
 
        size_t realpos = 0;
85
 
        if(!info) {
86
 
                return (pos <= strlen(buffer)) ? buffer[pos>0 ? pos-1:0] : '\0';
87
 
        }
88
 
        str = info->pre_displayLink.data;
89
 
        cli_dbgmsg("calc_pos_with_skip: skip:%lu, %lu - %lu \"%s\",\"%s\"\n", pos, info->host_start, info->host_end, str, buffer);
90
 
        pos += info->host_start;
91
 
        while(str[realpos] && !isalnum(str[realpos])) realpos++;
92
 
        for(; str[realpos] && (pos>0); pos--) {
93
 
                while(str[realpos]==' ') realpos++;
94
 
                realpos++;
95
 
        }
96
 
        while(str[realpos]==' ') realpos++;
97
 
        cli_dbgmsg("calc_pos_with_skip:%s\n",str+realpos);
98
 
        return (pos>0 && !str[realpos]) ? '\0' : str[realpos>0?realpos-1:0];
99
 
}
100
 
 
101
 
static int validate_subdomain(const struct regex_list *regex, const struct pre_fixup_info *pre_fixup, const char *buffer, size_t buffer_len, char *real_url, size_t real_len, char *orig_real_url)
102
 
{
103
 
        char c;
104
 
        size_t match_len;
105
 
 
106
 
        if(!regex || !regex->pattern)
107
 
                return 0;
108
 
        match_len = strlen(regex->pattern);
109
 
        if(((c=get_char_at_pos_with_skip(pre_fixup,buffer,buffer_len+1))==' ' || c=='\0' || c=='/' || c=='?') &&
110
 
                        (match_len == buffer_len || /* full match */
111
 
                         (match_len < buffer_len &&
112
 
                          ((c=get_char_at_pos_with_skip(pre_fixup,buffer,buffer_len-match_len))=='.' || (c==' ')) )
113
 
                         /* subdomain matched*/)) {
114
 
                /* we have an extra / at the end */
115
 
                if(match_len > 0) match_len--;
116
 
                cli_dbgmsg("Got a match: %s with %s\n", buffer, regex->pattern);
117
 
                cli_dbgmsg("Before inserting .: %s\n", orig_real_url);
118
 
                if(real_len >= match_len + 1) {
119
 
                        const size_t pos = real_len - match_len - 1;
120
 
                        if(real_url[pos] != '.') {
121
 
                                /* we need to shift left, and insert a '.'
122
 
                                 * we have an extra '.' at the beginning inserted by get_host to have room,
123
 
                                 * orig_real_url has to be used here, 
124
 
                                 * because we want to overwrite that extra '.' */
125
 
                                size_t orig_real_len = strlen(orig_real_url);
126
 
                                cli_dbgmsg("No dot here:%s\n",real_url+pos);
127
 
                                real_url = orig_real_url;
128
 
                                memmove(real_url, real_url+1, orig_real_len-match_len-1);
129
 
                                real_url[orig_real_len-match_len-1]='.';
130
 
                                cli_dbgmsg("After inserting .: %s\n", real_url);
131
 
                        }
132
 
                }
133
 
                return 1;
134
 
        }
135
 
        cli_dbgmsg("Ignoring false match: %s with %s, mismatched character: %c\n", buffer, regex->pattern, c);
136
 
        return 0;
137
 
}
138
 
 
139
337
/*
140
338
 * @matcher - matcher structure to use
141
339
 * @real_url - href target
149
347
 * Do not send NULL pointers to this function!!
150
348
 *
151
349
 */
152
 
int regex_list_match(struct regex_matcher* matcher,char* real_url,const char* display_url,const struct pre_fixup_info* pre_fixup,int hostOnly,const char **info, int is_whitelist)
 
350
int regex_list_match(struct regex_matcher* matcher,const char* real_url,const char* display_url,int hostOnly,const char** info,int is_whitelist)
153
351
{
154
 
        char* orig_real_url = real_url;
155
 
        struct regex_list *regex;
156
 
        size_t real_len, display_len, buffer_len;
157
 
 
158
 
        assert(matcher);
159
 
        assert(real_url);
160
 
        assert(display_url);
161
 
        *info = NULL;
 
352
        massert(matcher);
 
353
        massert(real_url);
 
354
        massert(display_url);
 
355
        massert(info);
162
356
        if(!matcher->list_inited)
163
357
                return 0;
164
 
        assert(matcher->list_built);
165
 
        /* skip initial '.' inserted by get_host */
166
 
        if(real_url[0] == '.') real_url++;
167
 
        if(display_url[0] == '.') display_url++;
168
 
        real_len    = strlen(real_url);
169
 
        display_len = strlen(display_url);
170
 
        buffer_len  = (hostOnly && !is_whitelist) ? real_len + 1 : real_len + display_len + 1 + 1;
171
 
        if(buffer_len < 3) {
172
 
                /* too short, no match possible */
173
 
                return 0;
174
 
        }
 
358
        massert(matcher->list_built);
175
359
        {
176
 
                char *buffer = cli_malloc(buffer_len+1);
177
 
                char *bufrev;
178
 
                int rc = 0, root;
 
360
                size_t real_len    = strlen(real_url);
 
361
                size_t display_len = strlen(display_url);
 
362
                size_t buffer_len  = (hostOnly && !is_whitelist) ? real_len : real_len + display_len + 1 + (is_whitelist ? 1 : 0);
 
363
                char*  buffer = cli_malloc(buffer_len+1);
 
364
                size_t i;
 
365
                int rc = 0;
179
366
                struct cli_ac_data mdata;
180
 
                struct cli_ac_result *res = NULL;
181
367
 
182
368
                if(!buffer)
183
369
                        return CL_EMEM;
184
370
 
185
371
                strncpy(buffer,real_url,real_len);
186
 
                buffer[real_len]= (!is_whitelist && hostOnly) ? '/' : ':';
 
372
                buffer[real_len]= (!is_whitelist && hostOnly) ? '\0' : ':';
187
373
                if(!hostOnly || is_whitelist) {
188
374
                        strncpy(buffer+real_len+1,display_url,display_len);
 
375
                        if(is_whitelist) 
 
376
                                buffer[buffer_len - 1] = '/';
 
377
                        buffer[buffer_len]=0;
189
378
                }
190
 
                buffer[buffer_len - 1] = '/';
191
 
                buffer[buffer_len]=0;
192
379
                cli_dbgmsg("Looking up in regex_list: %s\n", buffer);
193
380
 
194
 
                if((rc = cli_ac_initdata(&mdata, 0, 0, 0, CLI_DEFAULT_AC_TRACKLEN)))
195
 
                        return rc;
196
 
 
197
 
                bufrev = cli_strdup(buffer);
198
 
                if(!bufrev)
199
 
                        return CL_EMEM;
200
 
                reverse_string(bufrev);
201
 
                rc = filter_search(&matcher->filter, (const unsigned char*)bufrev, buffer_len) != -1;
202
 
                if(rc == -1) {
203
 
                        free(buffer);
204
 
                        free(bufrev);
205
 
                        /* filter says this suffix doesn't match.
206
 
                         * The filter has false positives, but no false
207
 
                         * negatives */
208
 
                        return 0;
209
 
                }
210
 
                rc = cli_ac_scanbuff((const unsigned char*)bufrev,buffer_len, NULL, (void*)&regex, &res, &matcher->suffixes,&mdata,0,0,NULL,AC_SCAN_VIR,NULL);
211
 
                free(bufrev);
212
 
                cli_ac_freedata(&mdata);
213
 
 
214
 
                rc = 0;
215
 
                root = matcher->root_regex_idx;
216
 
                while(res || root) {
217
 
                        struct cli_ac_result *q;
218
 
                        if (!res) {
219
 
                            regex = matcher->suffix_regexes[root].head;
220
 
                            root = 0;
221
 
                        } else {
222
 
                            regex = res->customdata;
223
 
                        }
224
 
                        while(!rc && regex) {
225
 
                                /* loop over multiple regexes corresponding to
226
 
                                 * this suffix */
227
 
                                if (!regex->preg) {
228
 
                                        /* we matched a static pattern */
229
 
                                        rc = validate_subdomain(regex, pre_fixup, buffer, buffer_len, real_url, real_len, orig_real_url);
230
 
                                } else {
231
 
                                        rc = !cli_regexec(regex->preg, buffer, 0, NULL, 0);
232
 
                                }
233
 
                                if(rc) *info = regex->pattern;
234
 
                                regex = regex->nxt;
235
 
                        }
236
 
                        if (res) {
237
 
                            q = res;
238
 
                            res = res->next;
239
 
                            free(q);
240
 
                        }
241
 
                }
 
381
                if(hostOnly) {
 
382
                        if((rc = cli_ac_initdata(&mdata, 0, AC_DEFAULT_TRACKLEN)))
 
383
                                return rc;
 
384
                        rc = 0;
 
385
 
 
386
                        for(i = 0; i < matcher->root_hosts_cnt; i++) {
 
387
                                if(( rc = cli_ac_scanbuff((unsigned char*)buffer,buffer_len,info, &matcher->root_hosts[i] ,&mdata,0,0,0,-1,NULL) ))
 
388
                                        break;
 
389
                        }
 
390
                } else
 
391
                        rc = 0;
 
392
    
 
393
                if(!rc && !hostOnly) 
 
394
                        rc = match_node(matcher->root_regex,(unsigned char*)buffer,buffer_len,info) == MATCH_SUCCESS ? CL_VIRUS : CL_SUCCESS;
242
395
                free(buffer);
243
396
                if(!rc)
244
 
                        cli_dbgmsg("Lookup result: not in regex list\n");
245
 
                else
246
 
                        cli_dbgmsg("Lookup result: in regex list\n");
 
397
                        cli_dbgmsg("not in regex list\n");
247
398
                return rc;
248
399
        }
249
400
}
250
401
 
 
402
/* node stack */
 
403
#define NODE_STACK_INITIAL 1024
 
404
#define NODE_STACK_GROW    4096
 
405
/* Initialize @stack */
 
406
static int stack_init(struct node_stack* stack)
 
407
{
 
408
        massert(stack);
 
409
 
 
410
        stack->cnt = 0;
 
411
        stack->capacity = NODE_STACK_INITIAL;
 
412
        stack->data = cli_malloc(stack->capacity * sizeof(*stack->data));
 
413
        if(!stack->data)
 
414
                return CL_EMEM;
 
415
        else
 
416
                return CL_SUCCESS;
 
417
}
 
418
 
 
419
/* Reset @stack pointer, but don't realloc */
 
420
static void stack_reset(struct node_stack* stack)
 
421
{
 
422
        massert(stack);
 
423
 
 
424
        stack->cnt = 0;
 
425
}
 
426
 
 
427
/* Push @node on @stack, growing it if necessarry */
 
428
static inline int stack_push(struct node_stack* stack,struct tree_node* node)
 
429
{
 
430
        massert(stack);
 
431
        massert(stack->data);
 
432
 
 
433
        if(stack->cnt == stack->capacity) {
 
434
                stack->capacity += NODE_STACK_GROW;
 
435
                stack->data = cli_realloc(stack->data,stack->capacity*sizeof(*stack->data));
 
436
                if(!stack->data)
 
437
                        return CL_EMEM;
 
438
        }
 
439
        stack->data[stack->cnt++] = node;
 
440
        return CL_SUCCESS;
 
441
}
 
442
 
 
443
/* Pops node from @stack, doesn't realloc */
 
444
static inline struct tree_node* stack_pop(struct node_stack* stack)
 
445
{
 
446
        massert(stack);
 
447
        massert(stack->data);
 
448
        massert(stack->cnt);/*don't pop from empty stack */
 
449
 
 
450
        return stack->cnt ? stack->data[--stack->cnt] : NULL;
 
451
}
251
452
 
252
453
/* Initialization & loading */
 
454
 
253
455
/* Initializes @matcher, allocating necesarry substructures */
254
 
int init_regex_list(struct regex_matcher* matcher, uint8_t dconf_prefiltering)
 
456
int init_regex_list(struct regex_matcher* matcher)
255
457
{
256
 
#ifdef USE_MPOOL
257
 
        mpool_t *mp = matcher->mempool;
258
 
#endif
259
458
        int rc;
260
 
 
261
 
        assert(matcher);
262
 
        memset(matcher, 0, sizeof(*matcher));
 
459
        /*
 
460
        if(!engine_ok) {
 
461
                cli_dbgmsg("Matcher engine not initialized\n");
 
462
                return CL_ENULLARG;
 
463
        }
 
464
        */
 
465
 
 
466
        massert(matcher);
 
467
        matcher->list_inited = 0;
 
468
        matcher->root_hosts_cnt = 0;
 
469
        matcher->root_hosts = NULL;
 
470
        matcher->root_hosts_cnt = 0;
 
471
 
 
472
        matcher->root_regex = tree_root_alloc();
 
473
        if(!matcher->root_regex) {
 
474
                return CL_EMEM;
 
475
        }
 
476
 
 
477
        if(( rc = stack_init(&matcher->node_stack) )) {
 
478
                free(matcher->root_regex);
 
479
                return rc;
 
480
        }
 
481
        if(( rc = stack_init(&matcher->node_stack_alt) )) {
 
482
                free(matcher->root_regex);
 
483
                stack_destroy(&matcher->node_stack);
 
484
                return rc;
 
485
        }
263
486
 
264
487
        matcher->list_inited=1;
265
 
        matcher->list_built=0;
 
488
        matcher->list_built=1;/* its empty, but pretend its built, so that load_ will realloc root_hosts */
266
489
        matcher->list_loaded=0;
267
 
        cli_hashtab_init(&matcher->suffix_hash, 512);
268
 
#ifdef USE_MPOOL
269
 
        matcher->mempool = mp;
270
 
        matcher->suffixes.mempool = mp;
271
 
        assert(mp && "mempool must be initialized");
272
 
#endif
273
 
        if((rc = cli_ac_init(&matcher->suffixes, 2, 32, dconf_prefiltering))) {
274
 
                return rc;
275
 
        }
276
 
#ifdef USE_MPOOL
277
 
        matcher->sha256_hashes.mempool = mp;
278
 
        matcher->hostkey_prefix.mempool = mp;
279
 
#endif
280
 
        if((rc = cli_bm_init(&matcher->sha256_hashes))) {
281
 
                return rc;
282
 
        }
283
 
        if((rc = cli_bm_init(&matcher->hostkey_prefix))) {
284
 
                return rc;
285
 
        }
286
 
        filter_init(&matcher->filter);
 
490
 
287
491
        return CL_SUCCESS;
288
492
}
289
493
 
 
494
/* inserts @pattern into @root, using ac-matcher 
 
495
 * although the name might be confusing, @pattern is not a regex!*/
 
496
static int add_regex_list_element(struct cli_matcher* root,const char* pattern,char* info)
 
497
{
 
498
       int ret;
 
499
       struct cli_ac_patt *new = cli_calloc(1,sizeof(*new));
 
500
       size_t len,i;
 
501
 
 
502
       if(!new)
 
503
               return CL_EMEM;
 
504
       massert(root);
 
505
       massert(pattern);
 
506
 
 
507
       len = strlen(pattern);
 
508
       new->type = 0;
 
509
       new->sigid = 0;
 
510
       new->parts = 0;
 
511
       new->partno = 0;
 
512
       new->mindist = 0;
 
513
       new->maxdist = 0;
 
514
       new->offset = 0;
 
515
       new->target = 0;
 
516
       new->length = len;
 
517
       if(new->length > root->maxpatlen)
 
518
               root->maxpatlen = new->length;
 
519
 
 
520
       new->pattern = cli_malloc(sizeof(new->pattern[0])*len);
 
521
       if(!new->pattern) {
 
522
               free(new);
 
523
               return CL_EMEM;
 
524
       }
 
525
       for(i=0;i<len;i++)
 
526
               new->pattern[i]=pattern[i];/*new->pattern is short int* */
 
527
 
 
528
       new->virname = strdup(info);
 
529
       if((ret = cli_ac_addpatt(root,new))) {
 
530
               free(new->virname);
 
531
               free(new->pattern);
 
532
               free(new);
 
533
               return ret;
 
534
       }
 
535
       return CL_SUCCESS;
 
536
}
 
537
 
290
538
static int functionality_level_check(char* line)
291
539
{
292
540
        char* ptmin;
319
567
                        max = atoi(ptmax);
320
568
 
321
569
                if(min > cl_retflevel()) {
322
 
                        cli_dbgmsg("regex list line %s not loaded (required f-level: %u)\n",line,(unsigned int)min);
 
570
                        cli_dbgmsg("regex list line %s not loaded (required f-level: %d)\n",line,min);
323
571
                        return CL_EMALFDB; 
324
572
                }
325
573
 
327
575
                        return CL_EMALFDB;
328
576
                ptmin[-1]='\0';
329
577
                return CL_SUCCESS;
330
 
        }
331
 
}
332
 
 
333
 
static int add_hash(struct regex_matcher *matcher, char* pattern, const char fl, int is_prefix)
334
 
{
335
 
        int rc;
336
 
        struct cli_bm_patt *pat = mpool_calloc(matcher->mempool, 1, sizeof(*pat));
337
 
        struct cli_matcher *bm;
338
 
        const char *vname = NULL;
339
 
        if(!pat)
340
 
                return CL_EMEM;
341
 
        pat->pattern = (unsigned char*)cli_mpool_hex2str(matcher->mempool, pattern);
342
 
        if(!pat->pattern)
343
 
                return CL_EMALFDB;
344
 
        pat->length = 32;
345
 
        if (is_prefix) {
346
 
            pat->length=4;
347
 
            bm = &matcher->hostkey_prefix;
348
 
        } else {
349
 
            bm = &matcher->sha256_hashes;
350
 
        }
351
 
 
352
 
        if (!matcher->sha256_pfx_set.keys) {
353
 
            if((rc = cli_hashset_init(&matcher->sha256_pfx_set, 1048576, 90))) {
354
 
                return rc;
355
 
            }
356
 
        }
357
 
 
358
 
        if (fl != 'W' && pat->length == 32 &&
359
 
            cli_hashset_contains(&matcher->sha256_pfx_set, cli_readint32(pat->pattern)) &&
360
 
            cli_bm_scanbuff(pat->pattern, 32, &vname, NULL, &matcher->sha256_hashes,0,NULL,NULL) == CL_VIRUS) {
361
 
            if (*vname == 'W') {
362
 
                /* hash is whitelisted in local.gdb */
363
 
                cli_dbgmsg("Skipping hash %s\n", pattern);
364
 
                mpool_free(matcher->mempool, pat->pattern);
365
 
                mpool_free(matcher->mempool, pat);
366
 
                return CL_SUCCESS;
367
 
            }
368
 
        }
369
 
        pat->virname = mpool_malloc(matcher->mempool, 1);
370
 
        if(!pat->virname) {
371
 
                free(pat);
372
 
                return CL_EMEM;
373
 
        }
374
 
        *pat->virname = fl;
375
 
        cli_hashset_addkey(&matcher->sha256_pfx_set, cli_readint32(pat->pattern));
376
 
        if((rc = cli_bm_addpatt(bm, pat, "*"))) {
377
 
                cli_errmsg("add_hash: failed to add BM pattern\n");
378
 
                free(pat->pattern);
379
 
                free(pat->virname);
380
 
                free(pat);
381
 
                return CL_EMALFDB;
382
 
        }
383
 
        return CL_SUCCESS;
 
578
        }               
384
579
}
385
580
 
386
581
 
387
582
/* Load patterns/regexes from file */
388
 
int load_regex_matcher(struct cl_engine *engine,struct regex_matcher* matcher,FILE* fd,unsigned int *signo,unsigned int options,int is_whitelist,struct cli_dbio *dbio, uint8_t dconf_prefiltering)
 
583
int load_regex_matcher(struct regex_matcher* matcher,FILE* fd,unsigned int options,int is_whitelist)
389
584
{
390
 
        int rc,line=0,entry=0;
 
585
        int rc,line=0;
391
586
        char buffer[FILEBUFF];
392
587
 
393
 
        assert(matcher);
 
588
        massert(matcher);
 
589
        massert(fd);
394
590
 
395
591
        if(matcher->list_inited==-1)
396
592
                return CL_EMALFDB; /* already failed to load */
397
 
        if(!fd && !dbio) {
 
593
/*      if(matcher->list_loaded) {
 
594
                cli_warnmsg("Regex list has already been loaded, ignoring further requests for load\n");
 
595
                return CL_SUCCESS;
 
596
        }*/
 
597
        if(!fd) {
398
598
                cli_errmsg("Unable to load regex list (null file)\n");
399
 
                return CL_ENULLARG;
 
599
                return CL_EIO;
400
600
        }
401
601
 
402
602
        cli_dbgmsg("Loading regex_list\n");
403
603
        if(!matcher->list_inited) {
404
 
                rc = init_regex_list(matcher, dconf_prefiltering);
 
604
                rc = init_regex_list(matcher);
405
605
                if (!matcher->list_inited) {
406
606
                        cli_errmsg("Regex list failed to initialize!\n");
407
607
                        fatal_error(matcher);
408
608
                        return rc;
409
609
                }
 
610
                /*atexit(regex_list_done); TODO: destroy this in manager.c */
410
611
        }
411
612
        /*
412
613
         * Regexlist db format (common to .wdb(whitelist) and .pdb(domainlist) files:
413
614
         * Multiple lines of form, (empty lines are skipped):
414
615
         * Flags RealURL DisplayedURL
415
616
         * Where:
416
 
         * Flags: 
417
 
         *
418
 
         * .pdb files:
419
 
         * R - regex, H - host-only, followed by (optional) 3-digit hexnumber representing 
 
617
         * Flags: R - regex, H - host-only, followed by (optional) 3-digit hexnumber representing 
420
618
         * flags that should be filtered.
421
619
         * [i.e. phishcheck urls.flags that we don't want to be done for this particular host]
422
 
         * 
423
 
         * .wdb files:
424
 
         * X - full URL regex 
425
 
         * Y - host-only regex
426
 
         * M - host simple pattern
 
620
         * Note:Flag filtering only makes sense in .pdb files.
427
621
         *
428
622
         * If a line in the file doesn't conform to this format, loading fails
429
623
         * 
430
624
         */
431
 
        while(cli_dbgets(buffer, FILEBUFF, fd, dbio)) {
 
625
        while(fgets(buffer,FILEBUFF,fd)) {
432
626
                char* pattern;
433
627
                char* flags;
434
 
                size_t pattern_len;
435
 
 
436
628
                cli_chomp(buffer);
437
 
                line++;
438
629
                if(!*buffer)
439
630
                        continue;/* skip empty lines */
440
631
 
441
 
                if(functionality_level_check(buffer))
442
 
                        continue;
443
 
 
444
 
                if(engine->cb_sigload && engine->cb_sigload("phishing", buffer, engine->cb_sigload_ctx)) {
445
 
                        cli_dbgmsg("load_regex_matcher: skipping %s due to callback\n", buffer);
446
 
                        continue;
447
 
                }
448
 
 
449
 
                entry++;
 
632
                if(functionality_level_check(buffer)) 
 
633
                        continue;
 
634
 
 
635
                line++;
450
636
                pattern = strchr(buffer,':');
451
637
                if(!pattern) {
452
638
                        cli_errmsg("Malformed regex list line %d\n",line);
453
639
                        fatal_error(matcher);
454
640
                        return CL_EMALFDB;
455
641
                }
456
 
                /*pattern[0]='\0';*/
 
642
                pattern[0]='\0';
457
643
                flags = buffer+1;
458
644
                pattern++;
459
645
 
460
 
                pattern_len = strlen(pattern);
461
 
                if(pattern_len < FILEBUFF) {
462
 
                        pattern[pattern_len] = '/';
463
 
                        pattern[pattern_len+1] = '\0';
 
646
                if(is_whitelist) {
 
647
                        const size_t pattern_len = strlen(pattern);
 
648
                        if(pattern_len < FILEBUFF) {
 
649
                                pattern[pattern_len] = '/';
 
650
                                pattern[pattern_len+1] = '\0';
 
651
                        }
 
652
                        else {
 
653
                                cli_errmsg("Overlong regex line %d\n",line);
 
654
                                fatal_error(matcher);
 
655
                                return CL_EMALFDB;
 
656
                        }
 
657
                }
 
658
 
 
659
                if((buffer[0] == 'R' && !is_whitelist) || (buffer[0] == 'X' && is_whitelist)) {/*regex*/
 
660
                        if(( rc = add_pattern(matcher,(const unsigned char*)pattern,flags) ))
 
661
                                return rc==CL_EMEM ? CL_EMEM : CL_EMALFDB;
 
662
                }
 
663
                else if( ( buffer[0] == 'H' && !is_whitelist) || (buffer[0] == 'M' && is_whitelist)) {/*matches displayed host*/
 
664
                        if(matcher->list_built) {
 
665
                                struct cli_matcher* old_hosts = matcher->root_hosts;
 
666
                                matcher->root_hosts_cnt++;
 
667
 
 
668
                                matcher->root_hosts = cli_realloc(matcher->root_hosts, matcher->root_hosts_cnt * sizeof(*matcher->root_hosts));
 
669
                                if(!matcher->root_hosts) {
 
670
                                        matcher->root_hosts = old_hosts;/* according to manpage this must still be valid*/
 
671
                                        return CL_EMEM;
 
672
                } 
 
673
                                memset(&matcher->root_hosts[matcher->root_hosts_cnt-1], 0, sizeof(struct cli_matcher));
 
674
                                matcher->root_hosts[matcher->root_hosts_cnt-1].ac_root = cli_calloc(1, sizeof(struct cli_ac_node));
 
675
                                if(!matcher->root_hosts[matcher->root_hosts_cnt-1].ac_root) {
 
676
                                        matcher->root_hosts_cnt--;
 
677
                                        return CL_EMEM;
 
678
                                }
 
679
                                cli_dbgmsg("Increased number of root_hosts in regex_list.c\n");
 
680
                                matcher->list_built = 0;
 
681
                        }
 
682
                        if(( rc = add_regex_list_element(&matcher->root_hosts[matcher->root_hosts_cnt-1],pattern,flags) ))
 
683
                                return rc==CL_EMEM ? CL_EMEM : CL_EMALFDB;
464
684
                }
465
685
                else {
466
 
                        cli_errmsg("Overlong regex line %d\n",line);
467
 
                        fatal_error(matcher);
468
 
                        return CL_EMALFDB;
469
 
                }
470
 
 
471
 
                if((buffer[0] == 'R' && !is_whitelist) || ((buffer[0] == 'X' || buffer[0] == 'Y') && is_whitelist)) {
472
 
                        /* regex for hostname*/
473
 
                        if (( rc = regex_list_add_pattern(matcher, pattern) ))
474
 
                                return rc==CL_EMEM ? CL_EMEM : CL_EMALFDB;
475
 
                }
476
 
                else if( ( buffer[0] == 'H' && !is_whitelist) || (buffer[0] == 'M' && is_whitelist)) {
477
 
                        /*matches displayed host*/
478
 
                        if (( rc = add_static_pattern(matcher, pattern) ))
479
 
                                return rc==CL_EMEM ? CL_EMEM : CL_EMALFDB;
480
 
                } else if (buffer[0] == 'S' && (!is_whitelist || pattern[0]=='W')) {
481
 
                        pattern[pattern_len] = '\0';
482
 
                        if (pattern[0]=='W')
483
 
                            flags[0]='W';
484
 
                        if((pattern[0]=='W' || pattern[0]=='F' || pattern[0]=='P') && pattern[1]==':') {
485
 
                            pattern += 2;
486
 
                            if (( rc = add_hash(matcher, pattern, flags[0], pattern[-2] == 'P') )) {
487
 
                                cli_errmsg("Error loading at line: %d\n", line);
488
 
                                return rc==CL_EMEM ? CL_EMEM : CL_EMALFDB;
489
 
                            }
490
 
                        } else {
491
 
                            cli_errmsg("Error loading line: %d, %c\n", line, *pattern);
492
 
                            return CL_EMALFDB;
493
 
                        }
494
 
                } else {
495
 
                        return CL_EMALFDB;
 
686
                        return CL_EMALFDB;
 
687
                        /* this is useless, we have host, and regex matches
 
688
                        if(( rc = add_regex_list_element(matcher->root_urls,pattern,flags) ))
 
689
                                return rc==CL_EMEM ? CL_EMEM : CL_EMALFDB;*/
496
690
                }
497
691
        }
498
692
        matcher->list_loaded = 1;
499
 
        if(signo)
500
 
            *signo += entry;
 
693
        if(( rc = build_regex_list(matcher) ))
 
694
                return rc;
501
695
 
 
696
#ifndef NDEBUG
 
697
/*                      dump_tree(matcher->root_regex);*/
 
698
#endif
 
699
        if(!matcher->list_built) {
 
700
                cli_errmsg("Regex list not loaded: build failed!\n");
 
701
                fatal_error(matcher);
 
702
                return CL_EMALFDB;
 
703
        }
 
704
        regex_list_cleanup(matcher);
502
705
        return CL_SUCCESS;
503
706
}
504
707
 
 
708
/*
 
709
static void tree_node_merge_nonbin(struct tree_node* into,const struct tree_node* node)
 
710
{
 
711
        massert(into);
 
712
        massert(node);
 
713
 
 
714
        if(node->alternatives){
 
715
                if(node->u.children[0]->next == node) {
 
716
                        *no non-bin alternatives here*
 
717
                }
 
718
                else {
 
719
                        struct tree_node* p;
 
720
                        for(p = node->u.children[0]->next; p->next != node; p = p->next)
 
721
                                tree_node_insert_nonbin(into,p);
 
722
                }
 
723
        }
 
724
        else
 
725
                tree_node_insert_nonbin(into,node->u.children[0]);
 
726
}
 
727
*
 
728
static void tree_node_merge_bin(struct tree_node* into,const struct tree_node* node)
 
729
{
 
730
        if(node->u.children && node->alternatives) {
 
731
                if(!into->alternatives) {
 
732
                        * into has no bin part, just copy+link the node there*
 
733
                        int i;
 
734
                        struct tree_node* next = into->u.children[0];
 
735
                        into->u.children = node->u.children;
 
736
                        into->alternatives = node->alternatives;
 
737
                        for(i=0;i < into->alternatives;i++) {
 
738
                                if(into->u.children[i]->next == node) {
 
739
                                        into->u.children[i]->next = next;
 
740
                                        into->u.children[i]->listend = 0;
 
741
                                }
 
742
                                else {
 
743
                                        struct tree_node* p;
 
744
                                        for(p = into->u.children[0]->next; p->next != node; p = p->next);
 
745
                                        p->listend = 0;
 
746
                                        p->next = next;
 
747
                                }
 
748
                        }
 
749
                }
 
750
                const size_t new_size = tree_node_get_array_size(into) + tree_node_get_array_size(node);
 
751
                struct tree_node** new_children = cli_malloc(sizeof(
 
752
        }
 
753
        * else: no bin part to merge *
 
754
}
 
755
*/
 
756
 
 
757
static struct tree_node ** tree_node_get_children(const struct tree_node* node)
 
758
{
 
759
        return node->op==OP_CUSTOMCLASS ? (node->u.children[1] ? node->u.children+1 : NULL) :node->u.children;
 
760
}
 
761
/* don't do this, it wastes too much memory, and has no benefit
 
762
static void regex_list_dobuild(struct tree_node* called_from,struct tree_node* node)
 
763
{
 
764
        struct tree_node **children;
 
765
        massert(node);
 
766
 
 
767
        children = tree_node_get_children(node);
 
768
        if(node->op!=OP_ROOT)
 
769
                massert(called_from);
 
770
        if(node->op==OP_TMP_PARCLOSE) {
 
771
                const size_t array_size = (node->alternatives +(called_from->op==OP_CUSTOMCLASS ? 1:0))*sizeof(*called_from->u.children);
 
772
                if(node->c)
 
773
                        return;* already processed this common node*
 
774
                else
 
775
                        node->c = 1;
 
776
                * copy children to called_from from this node
 
777
                 * called_from should have 0 alternatives, and a link to this node via ->u.children[0]
 
778
                 * *
 
779
                massert(called_from->alternatives == 0);
 
780
                massert(called_from->u.children);
 
781
                massert(called_from->u.children[0] == node);
 
782
                called_from->u.children = cli_realloc(called_from->u.children,array_size);
 
783
                called_from->u.children = node->u.children;
 
784
                called_from->alternatives = node->alternatives;
 
785
                if(called_from->alternatives) {
 
786
                        * fix parent pointers *
 
787
                        int i;TODO: do a deep copy of children here
 
788
                        struct tree_node **from_children = tree_node_get_children(called_from);
 
789
                        massert(from_children);
 
790
                        for(i=0;i < called_from->alternatives;i++) {
 
791
                                struct tree_node* p;
 
792
                                for(p=from_children[i];p->next != node; p = p->next);
 
793
                                p->next = called_from;
 
794
                        }
 
795
                }
 
796
        }
 
797
 
 
798
        if(node->op==OP_LEAF) 
 
799
        return;
 
800
        else if (node->alternatives) {
 
801
                int i;
 
802
                struct tree_node* p;
 
803
                massert(children);
 
804
                p = children[0]->op==OP_LEAF ? NULL : children[0]->next;
 
805
                for(i=0;i<node->alternatives;i++)
 
806
                        regex_list_dobuild(node,children[i]);
 
807
                if(p && p!=node)
 
808
                        regex_list_dobuild(node,p);
 
809
        } else {
 
810
                if(children) 
 
811
                        if (children[0])
 
812
                                regex_list_dobuild(node,children[0]);
 
813
        }
 
814
        if(node->next && !node->listend)
 
815
                regex_list_dobuild(node,node->next);
 
816
        if(node->op==OP_TMP_PARCLOSE)
 
817
                node->c=0;
 
818
        *free(node);*
 
819
}
 
820
*/
505
821
 
506
822
/* Build the matcher list */
507
 
int cli_build_regex_list(struct regex_matcher* matcher)
 
823
static int build_regex_list(struct regex_matcher* matcher)
508
824
{
509
825
        int rc;
510
 
        if(!matcher)
511
 
                return CL_SUCCESS;
512
826
        if(!matcher->list_inited || !matcher->list_loaded) {
513
827
                cli_errmsg("Regex list not loaded!\n");
514
828
                return -1;/*TODO: better error code */
515
829
        }
516
830
        cli_dbgmsg("Building regex list\n");
517
 
        cli_hashtab_free(&matcher->suffix_hash);
518
 
        if(( rc = cli_ac_buildtrie(&matcher->suffixes) ))
519
 
                return rc;
 
831
        if(matcher->root_hosts)
 
832
                if(( rc = cli_ac_buildtrie(&matcher->root_hosts[matcher->root_hosts_cnt-1]) ))
 
833
                        return rc;
520
834
        matcher->list_built=1;
521
 
        cli_hashset_destroy(&matcher->sha256_pfx_set);
522
835
 
523
836
        return CL_SUCCESS;
524
837
}
526
839
/* Done with this matcher, free resources */
527
840
void regex_list_done(struct regex_matcher* matcher)
528
841
{
529
 
        assert(matcher);
530
 
 
531
 
        if(matcher->list_inited == 1) {
532
 
                size_t i;
533
 
                cli_ac_free(&matcher->suffixes);
534
 
                if(matcher->suffix_regexes) {
535
 
                        for(i=0;i<matcher->suffix_cnt;i++) {
536
 
                                struct regex_list *r = matcher->suffix_regexes[i].head;
537
 
                                while(r) {
538
 
                                        struct regex_list *q = r;
539
 
                                        r = r->nxt;
540
 
                                        free(q->pattern);
541
 
                                        free(q);
542
 
                                }
543
 
                        }
544
 
                        free(matcher->suffix_regexes);
545
 
                        matcher->suffix_regexes = NULL;
546
 
                }
547
 
                if(matcher->all_pregs) {
548
 
                        for(i=0;i<matcher->regex_cnt;i++) {
549
 
                                regex_t *r = matcher->all_pregs[i];
550
 
                                cli_regfree(r);
551
 
                                mpool_free(matcher->mempool, r);
552
 
                        }
553
 
                        mpool_free(matcher->mempool, matcher->all_pregs);
554
 
                }
555
 
                cli_hashtab_free(&matcher->suffix_hash);
556
 
                cli_bm_free(&matcher->sha256_hashes);
557
 
                cli_bm_free(&matcher->hostkey_prefix);
558
 
        }
 
842
        massert(matcher);
 
843
 
 
844
        regex_list_cleanup(matcher);
 
845
        if(matcher->list_loaded) {
 
846
                if(matcher->root_hosts) {
 
847
                        size_t i;
 
848
                        for(i=0;i<matcher->root_hosts_cnt;i++)
 
849
                                cli_ac_free(&matcher->root_hosts[i]);
 
850
                        free(matcher->root_hosts);
 
851
                        matcher->root_hosts=NULL;
 
852
                }
 
853
 
 
854
                matcher->root_hosts_cnt=0;
 
855
                matcher->list_built=0;
 
856
                destroy_tree(matcher);
 
857
                matcher->list_loaded=0;
 
858
        }
 
859
        if(matcher->list_inited) {
 
860
                matcher->list_inited=0;
 
861
        }
 
862
        stack_destroy(&matcher->node_stack);
 
863
        stack_destroy(&matcher->node_stack_alt);
 
864
}
 
865
 
 
866
/* Tree matcher algorithm */
 
867
struct token_t
 
868
{
 
869
        union {
 
870
                const unsigned char* start;
 
871
                char_bitmap_p  bitmap;
 
872
                unsigned char  c;
 
873
        } u;
 
874
        size_t len;
 
875
        char   type;
 
876
};
 
877
 
 
878
enum {TOKEN_CHAR,TOKEN_DOT,TOKEN_PAR_OPEN,TOKEN_PAR_CLOSE,TOKEN_BRACKET,TOKEN_ALT,TOKEN_REGEX,TOKEN_DONE};
 
879
 
 
880
static const unsigned char* getNextToken(const unsigned char* pat,struct token_t* token)
 
881
{
 
882
        massert(pat);
 
883
        massert(token);
 
884
 
 
885
        switch(*pat) {
 
886
                case '\\':
 
887
                        token->type=TOKEN_CHAR;
 
888
                        token->u.c = *(++pat);
 
889
                        if(islower(token->u.c)) {
 
890
                                /* handle \n, \t, etc. */
 
891
                                char fmt[3] = {'\\', '\0', '\0'};
 
892
                                char c;
 
893
 
 
894
                                fmt[1] = token->u.c;
 
895
                                if(snprintf(&c,1,fmt)!=1) {
 
896
                                        token->type=TOKEN_REGEX;
 
897
                                        token->u.start = pat;
 
898
                                }
 
899
                                else
 
900
                                        token->u.c=c;
 
901
                        }
 
902
                        token->len = 1;
 
903
                        break;
 
904
                case '|':
 
905
                        token->type=TOKEN_ALT;
 
906
                        break;
 
907
                case '*':
 
908
                case '+':
 
909
                case '?':
 
910
                case '{':
 
911
                case '}':
 
912
                        token->type=TOKEN_REGEX;
 
913
/*                      massert(0 && "find_regex_start should have forbidden us from finding regex special chars");*/
 
914
                        break;
 
915
                case '[':
 
916
                        {
 
917
                        /*TODO: implement*/
 
918
                        /*see if it is something simple like a list of characters, a range, or negated ...*/
 
919
                        const unsigned char* old=pat++;/* save this in case we change our mind and decide this is too complicated for us to handle*/
 
920
                        unsigned char range_start=0;
 
921
                        int hasprev = 0;
 
922
                        char_bitmap_p bitmap = cli_malloc(32);
 
923
                        if(!bitmap)
 
924
                                return NULL;
 
925
                        if (*pat=='^') {
 
926
                                memset(bitmap,0xFF,32);/*match chars not in brackets*/
 
927
                                pat++;
 
928
                        }
 
929
                        else
 
930
                                memset(bitmap,0x00,32);
 
931
                        do {
 
932
                                /* literal ] can be first character, so test for it at the end of the loop, for example: []] */
 
933
                                if (*pat=='-' && hasprev) {
 
934
                                        /* it is a range*/
 
935
                                        unsigned char range_end;
 
936
                                        unsigned int c;
 
937
                                        massert(range_start);
 
938
                                        pat++;
 
939
                                        if (pat[0]=='[')
 
940
                                                if (pat[1]=='.') {
 
941
                                                        if(pat[2]=='-' && pat[3]=='.' && pat[4]==']')
 
942
                                                                range_end = '-';
 
943
                                                        else {
 
944
                                                                /* this is getting complicated, bail out */
 
945
                                                                cli_warnmsg("confused about collating sequences in regex,bailing out");
 
946
                                                                pat=old;
 
947
                                                                token->type=TOKEN_REGEX;
 
948
                                                                break;
 
949
                                                        }
 
950
                                                }
 
951
                                                else 
 
952
                                                        range_end = *pat;
 
953
                                        else
 
954
                                                range_end = *pat;
 
955
                                        for(c=range_start+1;c<=range_end;c++)
 
956
                                                bitmap[c>>3] ^= 1<<(c&0x7);
 
957
                                        hasprev = 0;
 
958
                                }
 
959
                                else if (pat[0]=='[' && pat[1]==':') {
 
960
                                                        const unsigned char* end;
 
961
                                                        int len,found=-1;
 
962
                                                        size_t i;
 
963
 
 
964
                                                        pat+=2;
 
965
                                                        end=(unsigned char*)strstr((const char*)pat,":]");
 
966
                                                        if(!end) {
 
967
                                                                cli_warnmsg("confused about std char class syntax regex,bailing out");
 
968
                                                                pat=old;
 
969
                                                                token->type=TOKEN_REGEX;
 
970
                                                                break;
 
971
                                                        }
 
972
 
 
973
                                                        len = end-pat;
 
974
                                                        for(i=0;i<std_class_cnt;i++)
 
975
                                                                if(!strncmp((const char*)pat,std_class[i],len)) {
 
976
                                                                        found=i;
 
977
                                                                        break;
 
978
                                                                }
 
979
                                                        if(found!=-1) {
 
980
                                                                for(i=0;i<256;i++)
 
981
                                                                        if(char_class[i]&(1<<found))
 
982
                                                                                bitmap[i>>3] ^= 1<<(i&0x7);
 
983
                                                        }
 
984
                                                        else {
 
985
                                                                /*unknown class*/
 
986
                                                                cli_warnmsg("confused about regex bracket expression, bailing out");
 
987
                                                                pat=old;
 
988
                                                                token->type=TOKEN_REGEX;
 
989
                                                                break;
 
990
                                                        }
 
991
                                                }
 
992
                                else {
 
993
                                        bitmap[*pat>>3] ^= 1<<(*pat&0x7);
 
994
                                        pat++;
 
995
                                        range_start = *pat;
 
996
                                        hasprev = 1;
 
997
                                }
 
998
                        } while(*pat!=']');
 
999
                        /*TODO: see if this bitmap already exists, then reuse*/                 
 
1000
                        token->type = TOKEN_BRACKET;
 
1001
                        token->u.bitmap = bitmap;
 
1002
                        break;
 
1003
                        }
 
1004
                case ']':
 
1005
                        massert(0 && "Encountered ] without matching [");
 
1006
                        /* bad state */
 
1007
                        break;
 
1008
                case '.':
 
1009
                        token->type=TOKEN_DOT;
 
1010
                        break;
 
1011
                case '(':
 
1012
                        token->type=TOKEN_PAR_OPEN;
 
1013
                        break;
 
1014
                case ')':
 
1015
                        token->type=TOKEN_PAR_CLOSE;
 
1016
                        break;
 
1017
                default:
 
1018
                        token->type=TOKEN_CHAR;
 
1019
                        token->u.c = *pat;
 
1020
                        token->len=1;
 
1021
                        break;
 
1022
        }
 
1023
        return ++pat;
 
1024
}
 
1025
 
 
1026
#define INITIAL_ALT_STACK 10
 
1027
#define ALT_STACK_GROW 20
 
1028
 
 
1029
static const unsigned char* find_regex_start(const unsigned char* pat)
 
1030
{
 
1031
        struct token_t token;
 
1032
        /*TODO: find where the regex part begins, for ex:
 
1033
         * abcd+, regex begins at 'd'
 
1034
         * */
 
1035
        const unsigned char* last=NULL;
 
1036
        const unsigned char* tmp=NULL;
 
1037
        const unsigned char** altpositions = cli_malloc(INITIAL_ALT_STACK*sizeof(*altpositions));
 
1038
        size_t altpositions_capacity = INITIAL_ALT_STACK;
 
1039
        size_t altpositions_cnt = 0;
 
1040
        char lasttype = -1;
 
1041
        if(!altpositions)
 
1042
                return NULL;
 
1043
        massert(pat);
 
1044
 
 
1045
        /* Try to parse pattern till special regex chars are encountered, that the tree-matcher doesn't handle, like: +,*,{}.
 
1046
         * The tricky part is that once we encounter these, the previous 'atom' has to be passed on to the regex matcher, so we have to
 
1047
         * back up to the last known good position
 
1048
         * Example, if we have: abc(defg)+, then only abc can be handled by tree parser, so we have to return the position of (.
 
1049
         * Another example: abc(defg|xyz|oz+|pdo), the last known good position is |, after xyz
 
1050
         * TODO: what about open parantheses? maybe once we found a special char, we have top back out before the first (?
 
1051
         * */
 
1052
        do {    
 
1053
                tmp = pat;
 
1054
                pat = getNextToken(pat,&token);
 
1055
                if(token.type!=TOKEN_REGEX) {
 
1056
                        last = tmp;
 
1057
                        lasttype = token.type;
 
1058
                        if(token.type==TOKEN_BRACKET && token.u.bitmap)
 
1059
                                free(token.u.bitmap);
 
1060
                        if(token.type==TOKEN_ALT || token.type==TOKEN_PAR_OPEN) {
 
1061
                                /* save this position on stack, succesfully parsed till here*/
 
1062
                                if(altpositions_cnt && altpositions[altpositions_cnt-1][0]=='|')
 
1063
                                        /* encountered another alternate (|) operator, override previous | position stored */
 
1064
                                        altpositions[altpositions_cnt-1]=last;
 
1065
                                else {
 
1066
                                        altpositions[altpositions_cnt++] = last;
 
1067
                                        if(altpositions_cnt == altpositions_capacity) {
 
1068
                                                altpositions_capacity += ALT_STACK_GROW;
 
1069
                                                altpositions = cli_realloc(altpositions,altpositions_capacity*sizeof(*altpositions));
 
1070
                                                if(!altpositions)
 
1071
                                                        return NULL;
 
1072
                                        }
 
1073
                                }
 
1074
                        } else if (lasttype==TOKEN_PAR_CLOSE) {
 
1075
                                /* remove last stored position from stack, succesfully this last group */
 
1076
                                altpositions_cnt--;
 
1077
                                massert(altpositions_cnt>0);
 
1078
                        }
 
1079
                }
 
1080
                else {
 
1081
                        if(altpositions_cnt)
 
1082
                                last = altpositions[0 /*altpositions_cnt-1*/];/*TODO: which index here?, see above TODO... */
 
1083
                        /*last stored 'safe' position where no special (+,*,{}) regex chars were encountered*/
 
1084
                }
 
1085
        } while(*pat && token.type!=TOKEN_REGEX);
 
1086
        free(altpositions);
 
1087
        return *pat ? last : last+1;
 
1088
}
 
1089
 
 
1090
static struct tree_node* tree_node_alloc(struct tree_node* next,char listend)
 
1091
{
 
1092
        struct tree_node* node = cli_malloc(sizeof(*node));
 
1093
        if(node) {
 
1094
                node->alternatives=0;
 
1095
                node->next=next;
 
1096
                node->listend=listend;
 
1097
                node->u.children=NULL;
 
1098
        }
 
1099
        return node;
 
1100
}
 
1101
 
 
1102
static struct tree_node* tree_root_alloc(void)
 
1103
{
 
1104
        struct tree_node* root=tree_node_alloc(NULL,1);
 
1105
        if(root) {
 
1106
                root->op=OP_ROOT;
 
1107
                root->c=0;
 
1108
                root->next=NULL;
 
1109
                root->listend=1;
 
1110
        }
 
1111
        return root;
 
1112
}
 
1113
 
 
1114
static inline struct tree_node* tree_node_char_binsearch(const struct tree_node* node,const char csearch,int* left)
 
1115
{
 
1116
        int right;
 
1117
        struct tree_node **children;
 
1118
        massert(node);
 
1119
        massert(left);
 
1120
 
 
1121
        children = tree_node_get_children(node);
 
1122
        right = node->alternatives-1;
 
1123
        *left = 0;
 
1124
        if(!node->alternatives)
 
1125
                return NULL;
 
1126
        massert(children);
 
1127
        while(*left<=right) {
 
1128
                int mid  = *left+(right-*left)/2;
 
1129
                if(children[mid]->c == csearch)
 
1130
                        return children[mid]; 
 
1131
                else if(children[mid]->c < csearch)
 
1132
                        *left=mid+1;
 
1133
                else
 
1134
                        right=mid-1;
 
1135
        }
 
1136
        return NULL;
 
1137
}
 
1138
 
 
1139
static inline struct tree_node* tree_get_next(struct tree_node* node)
 
1140
{
 
1141
        struct tree_node** children;
 
1142
        massert(node);
 
1143
        children = tree_node_get_children(node);
 
1144
 
 
1145
        if(!node->alternatives && children && children[0])
 
1146
                return children[0];
 
1147
        else if(node->alternatives<=1)
 
1148
                return node;
 
1149
        else
 
1150
                return children[0]->next;
 
1151
}
 
1152
 
 
1153
static inline size_t tree_node_get_array_size(const struct tree_node* node)
 
1154
{
 
1155
        massert(node);
 
1156
        /* if op is CUSTOMCLASS, then first pointer is pointer to bitmap, so array size is +1 */
 
1157
        return (node->alternatives + (node->op==OP_CUSTOMCLASS ? 1 : 0)) * sizeof(node->u.children[0]);
 
1158
}
 
1159
 
 
1160
static inline struct tree_node* tree_node_char_insert(struct tree_node* node,const char c,int left)
 
1161
{
 
1162
        struct tree_node* new, *alt = tree_get_next(node);
 
1163
        struct tree_node **children;
 
1164
        node->alternatives++;
 
1165
        node->u.children = cli_realloc(node->u.children,tree_node_get_array_size(node));
 
1166
        if(!node->u.children)
 
1167
                return NULL;
 
1168
 
 
1169
        children = node->op==OP_CUSTOMCLASS ? node->u.children+1 : node->u.children;
 
1170
 
 
1171
        new = tree_node_alloc(alt , node == alt );
 
1172
        if(new) {
 
1173
                new->op=OP_CHAR;
 
1174
                new->c=c;
 
1175
        }
 
1176
 
 
1177
        if(node->alternatives-left-1>0)
 
1178
                        memmove(&children[left+1],&children[left],(node->alternatives-left-1)*sizeof(node->u.children[0]));
 
1179
        children[left] = new;   
 
1180
 
 
1181
        return new;
 
1182
}
 
1183
 
 
1184
static inline void tree_node_insert_nonbin(struct tree_node* node, struct tree_node* new)
 
1185
{
 
1186
        struct tree_node **children;
 
1187
        massert(node);
 
1188
        massert(new);
 
1189
 
 
1190
        children = tree_node_get_children(node);
 
1191
        if(node->alternatives) {
 
1192
                massert(children);
 
1193
                if(children[0]->next == node) {
 
1194
                        int i;
 
1195
                        new->listend = 1;
 
1196
                        for(i=0;i<node->alternatives;i++) {
 
1197
                                children[i]->next = new;
 
1198
                                children[i]->listend = 0;
 
1199
                        }
 
1200
                }
 
1201
                else {
 
1202
                        struct tree_node* p;
 
1203
                        for(p = children[0]->next ; p->next != node ; p = p->next)
 
1204
                                massert(!p->listend);
 
1205
                        new->listend = 1;
 
1206
                        p->listend = 0;
 
1207
                        p->next = new;
 
1208
                }
 
1209
        }
 
1210
        else {
 
1211
                int idx = node->op==OP_CUSTOMCLASS ? 1 : 0;
 
1212
                if(node->u.children)
 
1213
                        if(node->u.children[idx]) {
 
1214
                                node = node->u.children[idx];
 
1215
                                while(node->next && !node->listend)
 
1216
                                        node = node->next;
 
1217
                                node->listend = 0;
 
1218
                                node->next = new;
 
1219
                                new->listend=1;
 
1220
                        }
 
1221
                node->u.children = cli_realloc(node->u.children,sizeof(node->u.children[0])*(2));
 
1222
                if(node->u.children) {
 
1223
                        node->u.children[idx] = new;
 
1224
                }
 
1225
        }
 
1226
}
 
1227
 
 
1228
static inline unsigned char char_getclass(const unsigned char* bitmap)
 
1229
{
 
1230
        size_t i;
 
1231
        massert(bitmap);
 
1232
 
 
1233
        for(i=0;i<std_class_cnt;i++)
 
1234
                if(!memcmp(bitmap,char_class_bitmap[i],256>>3))
 
1235
                        return i;
 
1236
        return std_class_cnt;
 
1237
}
 
1238
 
 
1239
static void stack_destroy(struct node_stack* stack)
 
1240
{
 
1241
        massert(stack);
 
1242
        if(stack->data)
 
1243
                free(stack->data);
 
1244
        stack->data = NULL;
 
1245
        stack->capacity = 0;
 
1246
}
 
1247
 
 
1248
/* call this after whitelist load is complete, and the tree is no longer going to be modified */
 
1249
void regex_list_cleanup(struct regex_matcher* matcher)
 
1250
{
 
1251
        massert(matcher);
 
1252
 
 
1253
        stack_destroy(&matcher->node_stack);
 
1254
        stack_destroy(&matcher->node_stack_alt);
 
1255
        stack_init(&matcher->node_stack);
 
1256
        stack_init(&matcher->node_stack_alt);
559
1257
}
560
1258
 
561
1259
int is_regex_ok(struct regex_matcher* matcher)
562
1260
{
563
 
        assert(matcher);
 
1261
        massert(matcher);
564
1262
        return (!matcher->list_inited || matcher->list_inited!=-1);/* either we don't have a regexlist, or we initialized it successfully */
565
1263
}
566
1264
 
567
 
static int add_newsuffix(struct regex_matcher *matcher, struct regex_list *info, const char *suffix, size_t len)
568
 
{
569
 
        struct cli_matcher *root = &matcher->suffixes;
570
 
        struct cli_ac_patt *new = mpool_calloc(matcher->mempool,1,sizeof(*new));
571
 
        size_t i;
572
 
        int ret;
573
 
 
574
 
        if(!new)
575
 
                return CL_EMEM;
576
 
        assert(root && suffix);
577
 
 
578
 
        new->rtype = 0;
579
 
        new->type = 0;
580
 
        new->sigid = 0;
581
 
        new->parts = 0;
582
 
        new->partno = 0;
583
 
        new->mindist = 0;
584
 
        new->maxdist = 0;
585
 
        new->offset_min = CLI_OFF_ANY;
586
 
        new->length = len;
587
 
 
588
 
        new->ch[0] = new->ch[1] |= CLI_MATCH_IGNORE;
589
 
        if(new->length > root->maxpatlen)
590
 
                root->maxpatlen = new->length;
591
 
 
592
 
        new->pattern = mpool_malloc(matcher->mempool, sizeof(new->pattern[0])*len);
593
 
        if(!new->pattern) {
594
 
                mpool_free(matcher->mempool, new);
595
 
                return CL_EMEM;
596
 
        }
597
 
        for(i=0;i<len;i++)
598
 
                new->pattern[i] = suffix[i];/*new->pattern is short int* */
599
 
 
600
 
        new->customdata = info;
601
 
        new->virname = NULL;
602
 
        if((ret = cli_ac_addpatt(root,new))) {
603
 
                mpool_free(matcher->mempool, new->pattern);
604
 
                mpool_free(matcher->mempool, new);
605
 
                return ret;
606
 
        }
607
 
        filter_add_static(&matcher->filter, (const unsigned char*)suffix, len, "regex");
608
 
        return CL_SUCCESS;
609
 
}
610
 
 
611
 
#define MODULE "regex_list: "
612
 
/* ------ load a regex, determine suffix, determine suffix2regexlist map ---- */
613
 
 
614
 
static void list_add_tail(struct regex_list_ht *ht, struct regex_list *regex)
615
 
{
616
 
        if(!ht->head)
617
 
                ht->head = regex;
618
 
        if(ht->tail) {
619
 
                ht->tail->nxt = regex;
620
 
        }
621
 
        ht->tail = regex;
622
 
}
623
 
 
624
 
/* returns 0 on success, clamav error code otherwise */
625
 
static int add_pattern_suffix(void *cbdata, const char *suffix, size_t suffix_len, const struct regex_list *iregex)
626
 
{
627
 
        struct regex_matcher *matcher = cbdata;
628
 
        struct regex_list *regex = cli_malloc(sizeof(*regex));
629
 
        const struct cli_element *el;
630
 
 
631
 
        assert(matcher);
632
 
        if(!regex)
633
 
                return CL_EMEM;
634
 
        regex->pattern = iregex->pattern ? cli_strdup(iregex->pattern) : NULL;
635
 
        regex->preg = iregex->preg;
636
 
        regex->nxt = NULL;
637
 
        el = cli_hashtab_find(&matcher->suffix_hash, suffix, suffix_len);
638
 
        /* TODO: what if suffixes are prefixes of eachother and only one will
639
 
         * match? */
640
 
        if(el) {
641
 
                /* existing suffix */
642
 
                assert((size_t)el->data < matcher->suffix_cnt);
643
 
                list_add_tail(&matcher->suffix_regexes[el->data], regex);
644
 
        } else {
645
 
                /* new suffix */
646
 
                size_t n = matcher->suffix_cnt++;
647
 
                el = cli_hashtab_insert(&matcher->suffix_hash, suffix, suffix_len, n);
648
 
                matcher->suffix_regexes = cli_realloc(matcher->suffix_regexes, (n+1)*sizeof(*matcher->suffix_regexes));
649
 
                if(!matcher->suffix_regexes)
650
 
                        return CL_EMEM;
651
 
                matcher->suffix_regexes[n].tail = regex;
652
 
                matcher->suffix_regexes[n].head = regex;
653
 
                if (suffix[0] == '/' && suffix[1] == '\0')
654
 
                    matcher->root_regex_idx = n;
655
 
                add_newsuffix(matcher, regex, suffix, suffix_len);
 
1265
/* returns 0 on success, regexec error code otherwise */                                                
 
1266
static int add_pattern(struct regex_matcher* matcher,const unsigned char* pat,const char* info)
 
1267
{
 
1268
        int bol=1;
 
1269
        const unsigned char* pat_end = find_regex_start(pat);
 
1270
        struct token_t token;
 
1271
        struct tree_node* node;
 
1272
        
 
1273
        massert(matcher);
 
1274
 
 
1275
        node = matcher->root_regex;
 
1276
 
 
1277
        stack_reset(&matcher->node_stack);
 
1278
        stack_reset(&matcher->node_stack_alt);
 
1279
        stack_push(&matcher->node_stack,node);
 
1280
 
 
1281
        for(;node->op!=OP_LEAF;){
 
1282
                if(pat<pat_end)
 
1283
                        pat  = getNextToken(pat,&token);
 
1284
                else if(*pat) {
 
1285
                        token.type = TOKEN_REGEX;
 
1286
                        token.u.start=pat;
 
1287
                }
 
1288
                else
 
1289
                        token.type = TOKEN_DONE;
 
1290
 
 
1291
                switch(token.type) {
 
1292
                        case TOKEN_CHAR: 
 
1293
                                {
 
1294
                                        /* search for char in tree */
 
1295
                                        int left;
 
1296
                                        struct tree_node* newnode = tree_node_char_binsearch(node,token.u.c,&left);
 
1297
                                        if(newnode)
 
1298
                                                node = newnode;
 
1299
                                        else {
 
1300
                                                /* not found, insert it */
 
1301
                                                node = tree_node_char_insert(node,token.u.c,left);
 
1302
                                        }
 
1303
                                        break;
 
1304
                                }
 
1305
 
 
1306
                        case TOKEN_PAR_OPEN:
 
1307
                                stack_push(&matcher->node_stack_alt,NULL);/* marker */
 
1308
                                stack_push(&matcher->node_stack,node);
 
1309
                                break;
 
1310
 
 
1311
                        case TOKEN_PAR_CLOSE: {
 
1312
                                                      /*TODO: test this!!!*/
 
1313
                                                      struct tree_node* node_alt = node;
 
1314
                                                      node = tree_node_alloc(NULL,1);
 
1315
                                                      node->op=OP_PARCLOSE;
 
1316
                                                      node->c=0;
 
1317
                                                      node->listend=1;
 
1318
                                                      tree_node_insert_nonbin(node_alt,node);
 
1319
                                                      while (( node_alt = stack_pop(&matcher->node_stack_alt) )) {
 
1320
                                                              tree_node_insert_nonbin(node_alt,node);
 
1321
                                                      }
 
1322
                                                      stack_pop(&matcher->node_stack);                                        
 
1323
                                                      break;
 
1324
                                              }
 
1325
 
 
1326
                        case TOKEN_ALT:
 
1327
                                stack_push(&matcher->node_stack_alt,node);
 
1328
                                node = stack_pop(&matcher->node_stack);
 
1329
                                stack_push(&matcher->node_stack,node);
 
1330
                                break;
 
1331
 
 
1332
                        case TOKEN_BRACKET:
 
1333
                                {
 
1334
                                        struct tree_node* new = tree_node_alloc(tree_get_next(node),1);
 
1335
                                        unsigned char charclass = char_getclass(token.u.bitmap);
 
1336
                                        if(charclass == std_class_cnt) {/*not a std char class*/
 
1337
                                                new->op = OP_CUSTOMCLASS;
 
1338
                                                new->u.children = cli_malloc(sizeof(new->u.children[0])*2);
 
1339
                                                if(!new->u.children)
 
1340
                                                        return CL_EMEM;
 
1341
                                                new->u.bitmap[0] = token.u.bitmap;
 
1342
                                                new->u.bitmap[1] = NULL;
 
1343
                                                tree_node_insert_nonbin(node,new);
 
1344
                                                node = new;
 
1345
                                        }
 
1346
                                        else {
 
1347
                                                new->op = OP_STDCLASS;
 
1348
                                                new->c = charclass;
 
1349
                                                tree_node_insert_nonbin(node,new);
 
1350
                                                node=new;
 
1351
                                        }
 
1352
                                        break;
 
1353
                                }
 
1354
 
 
1355
                        case TOKEN_DOT:
 
1356
                                {
 
1357
                                        struct tree_node* new = tree_node_alloc(tree_get_next(node),1);
 
1358
                                        new->op = OP_DOT;
 
1359
                                        tree_node_insert_nonbin(node,new);
 
1360
                                        node=new;
 
1361
                                        break;
 
1362
                                }
 
1363
 
 
1364
                        case TOKEN_REGEX:
 
1365
                        case TOKEN_DONE: {
 
1366
                                                 struct leaf_info* leaf=cli_malloc(sizeof(*leaf));
 
1367
                                                 if(!leaf)
 
1368
                                                         return CL_EMEM;
 
1369
                                                 leaf->info=strdup(info);
 
1370
                                                 if(token.type==TOKEN_REGEX) {
 
1371
                                                         int rc;
 
1372
                                                         struct tree_node* new;
 
1373
                                                         regex_t* preg;
 
1374
                                                         preg=cli_malloc(sizeof(*preg));
 
1375
                                                         if(!preg)
 
1376
                                                                 return CL_EMEM;
 
1377
                                                         rc = regcomp(preg,(const char*)token.u.start,REG_EXTENDED|(bol?0:REG_NOTBOL));
 
1378
                                                         leaf->preg=preg;
 
1379
                                                         if(rc)
 
1380
                                                                 return rc;
 
1381
                                                         new=cli_malloc(sizeof(*new));
 
1382
                                                         if(!new)
 
1383
                                                                 return CL_EMEM;
 
1384
                                                         new->op=OP_LEAF;
 
1385
                                                         new->next=node;
 
1386
                                                         new->alternatives=0;
 
1387
                                                         new->u.leaf=leaf;
 
1388
                                                         new->listend=1;
 
1389
                                                         tree_node_insert_nonbin(node,new);
 
1390
                                                 }
 
1391
                                                 else {
 
1392
                                                         leaf->preg=NULL;
 
1393
                                                         node->alternatives=0;
 
1394
                                                         node->u.leaf=leaf;
 
1395
                                                         node->op=OP_LEAF;
 
1396
                                                 }
 
1397
                                                 return 0;
 
1398
                                         }
 
1399
                }
 
1400
 
 
1401
                bol=0;
656
1402
        }
657
1403
        return 0;
658
1404
}
659
1405
 
660
 
static size_t reverse_string(char *pattern)
661
 
{
662
 
        size_t len = strlen(pattern);
 
1406
/* c has to be unsigned char here!! */
 
1407
static int match_node(struct tree_node* node,const unsigned char* c,size_t len,const char** info)
 
1408
{
 
1409
        struct tree_node** children;
 
1410
        int rc;
 
1411
 
 
1412
        massert(node);
 
1413
        massert(c);
 
1414
        massert(info);
 
1415
 
 
1416
        if(!node->u.children)
 
1417
                return MATCH_FAILED;/* tree empty */
 
1418
        *info = NULL;
 
1419
        len++;
 
1420
        c--;
 
1421
        for(;;) {
 
1422
                massert(node);
 
1423
                children = node->u.children;
 
1424
                switch(node->op) {
 
1425
                        case OP_ROOT:
 
1426
                                rc=1;
 
1427
                                break;
 
1428
                        case OP_PARCLOSE:
 
1429
                                /*this isn't a real character, so don't move*/
 
1430
                                c--;
 
1431
                                len++;
 
1432
                                rc=1;
 
1433
                                break;
 
1434
                        case OP_CHAR:
 
1435
                                massert(*c==node->c && "We know this has to match");
 
1436
                                rc = 1;/* *c==node->c;- we know it has matched */
 
1437
                                break;
 
1438
                        case OP_DOT:    
 
1439
                                rc = *c!='\n';
 
1440
                                break;
 
1441
                        case OP_STDCLASS:
 
1442
                                rc = char_class[*c]&(node->c);
 
1443
                                break;
 
1444
                        case OP_CUSTOMCLASS:
 
1445
                        {
 
1446
                                char_bitmap_p bitmap;
 
1447
                                massert(children);
 
1448
                                bitmap = (char_bitmap_p)node->u.bitmap[0];
 
1449
                                children++;
 
1450
                                rc = bitmap[*c>>3]&(1<<(*c&0x7));
 
1451
                                break;
 
1452
                        }
 
1453
                        case OP_LEAF:
 
1454
                        {
 
1455
                                const struct leaf_info* leaf = node->u.leaf;
 
1456
                                /*isleaf = 1;*/
 
1457
                                if(leaf->preg) {
 
1458
                                        rc = !regexec(leaf->preg,(const char*)c,0,NULL,0);
 
1459
                                }
 
1460
                                else  {
 
1461
                                        massert(*c==node->c && "We know this has to match[2]");
 
1462
                                        rc = 1;
 
1463
                                }
 
1464
                                if(rc) {
 
1465
                                        *info = leaf->info;
 
1466
                                        return MATCH_SUCCESS;
 
1467
                                }
 
1468
                                break;
 
1469
                        }
 
1470
                        default:
 
1471
                                /* impossible */
 
1472
                                cli_errmsg("Encountered invalid operator in tree:%d\n",node->op);
 
1473
                                exit(1);
 
1474
                }
 
1475
                len--;
 
1476
                if(!len) rc=0;
 
1477
                c++;
 
1478
                if(rc) {
 
1479
                        const char csearch = *c;
 
1480
                        int left = 0,right = node->alternatives-1;
 
1481
                        int mid;
 
1482
                        /*matched so far, go deeper*/
 
1483
                        /*do a binary search between children */
 
1484
                        massert(children);
 
1485
                        while(left<=right) {
 
1486
                                mid  = left+(right-left)/2;
 
1487
                                if (children[mid]->c == csearch)
 
1488
                                        break;
 
1489
                                else if(children[mid]->c < csearch)
 
1490
                                        left=mid+1;
 
1491
                                else
 
1492
                                        right=mid-1;
 
1493
                        }
 
1494
                        if(left<=right) {
 
1495
                                node = children[mid];
 
1496
                                massert(node);
 
1497
                        }
 
1498
                        else {
 
1499
                                if(node->alternatives) {
 
1500
                                        if(!children[0]->listend) {
 
1501
                                                node = children[0];
 
1502
                                                c++;
 
1503
                                                len--;
 
1504
                                        }
 
1505
                                        while(node && node->listend) {
 
1506
                                                node = node->next;/* climb up */
 
1507
                                                c--;
 
1508
                                                len++;
 
1509
                                        }
 
1510
                                        if(!node || !node->next) 
 
1511
                                                return MATCH_FAILED;/* reached root node */
 
1512
                                        node=node->next;
 
1513
                                        c--;
 
1514
                                        len++;
 
1515
                                }
 
1516
                                else if(node->u.children) {
 
1517
                                        struct tree_node* rewrite_next = NULL;
 
1518
                                        if(node->op==OP_PARCLOSE) 
 
1519
                                                rewrite_next = node;
 
1520
                                        node = children[0];
 
1521
                                        massert(node);
 
1522
                                        massert(node->op!=OP_CHAR);
 
1523
                                        if(rewrite_next)
 
1524
                                                node->next = rewrite_next;/* this node is pointed to by several parent nodes, 
 
1525
                                                                             we need to know 
 
1526
                                                                             from which one we came, so we can find out way back
 
1527
                                                                             should we fail to match somewhere deeper*/
 
1528
                                }
 
1529
                        }
 
1530
                }
 
1531
                else {
 
1532
                        /* this node didn't match, try sibling, or parent (if no more siblings) */
 
1533
                        while(node && node->listend) {
 
1534
                                node = node->next;/* sibling of parent */
 
1535
                                c--;
 
1536
                                len++;
 
1537
                        }
 
1538
                        if(!node || !node->next) /* reached root node, it has no next */
 
1539
                                return MATCH_FAILED;
 
1540
                        else {
 
1541
                                c--;
 
1542
                                len++;
 
1543
                                node=node->next;
 
1544
                        }
 
1545
                }
 
1546
        }
 
1547
        return MATCH_FAILED;
 
1548
}
 
1549
 
 
1550
/* push node on stack, only if it isn't there already */
 
1551
static inline void stack_push_once(struct node_stack* stack,struct tree_node* node)
 
1552
{
663
1553
        size_t i;
664
 
        for(i=0; i < (len/2); i++) {
665
 
                char aux = pattern[i];
666
 
                pattern[i] = pattern[len-i-1];
667
 
                pattern[len-i-1] = aux;
668
 
        }
669
 
        return len;
670
 
}
671
 
 
672
 
static regex_t *new_preg(struct regex_matcher *matcher)
673
 
{
674
 
        regex_t *r;
675
 
        matcher->all_pregs = mpool_realloc(matcher->mempool, matcher->all_pregs, ++matcher->regex_cnt * sizeof(*matcher->all_pregs));
676
 
        if(!matcher->all_pregs)
677
 
                return NULL;
678
 
        r = mpool_malloc(matcher->mempool, sizeof(*r));
679
 
        if(!r)
680
 
                return NULL;
681
 
        matcher->all_pregs[matcher->regex_cnt-1] = r;
682
 
        return r;
683
 
}
684
 
 
685
 
static int add_static_pattern(struct regex_matcher *matcher, char* pattern)
686
 
{
687
 
        size_t len;
688
 
        struct regex_list regex;
689
 
        int rc;
690
 
 
691
 
        len = reverse_string(pattern);
692
 
        regex.nxt = NULL;
693
 
        regex.pattern = cli_strdup(pattern);
694
 
        regex.preg = NULL;
695
 
        rc = add_pattern_suffix(matcher, pattern, len, &regex);
696
 
        free(regex.pattern);
697
 
        return rc;
698
 
}
699
 
 
700
 
int regex_list_add_pattern(struct regex_matcher *matcher, char *pattern)
701
 
{
702
 
        int rc;
703
 
        regex_t *preg;
704
 
        size_t len;
705
 
        /* we only match the host, so remove useless stuff */
706
 
        const char remove_end[] = "([/?].*)?/";
707
 
        const char remove_end2[] = "([/?].*)/";
708
 
 
709
 
        len = strlen(pattern);
710
 
        if(len > sizeof(remove_end)) {
711
 
                if(strncmp(&pattern[len - sizeof(remove_end)+1], remove_end, sizeof(remove_end)-1) == 0) {
712
 
                        len -= sizeof(remove_end) - 1;
713
 
                        pattern[len++]='/';
714
 
                }
715
 
                if(strncmp(&pattern[len - sizeof(remove_end2)+1], remove_end2, sizeof(remove_end2)-1) == 0) {
716
 
                        len -= sizeof(remove_end2) - 1;
717
 
                        pattern[len++]='/';
718
 
                }
719
 
        }
720
 
        pattern[len] = '\0';
721
 
 
722
 
        preg = new_preg(matcher);
723
 
        if(!preg)
724
 
                return CL_EMEM;
725
 
 
726
 
        rc = cli_regex2suffix(pattern, preg, add_pattern_suffix, (void*)matcher);
727
 
        if(rc) {
728
 
                cli_regfree(preg);
729
 
        }
730
 
 
731
 
        return rc;
732
 
}
 
1554
        massert(stack);
 
1555
        massert(node);
 
1556
 
 
1557
        for(i=0;i < stack->cnt;i++)
 
1558
                if(stack->data[i]==node)
 
1559
                        return;
 
1560
        stack_push(stack,node);
 
1561
}
 
1562
 
 
1563
static void destroy_tree_internal(struct regex_matcher* matcher,struct tree_node* node)
 
1564
{
 
1565
        struct tree_node **children;
 
1566
        massert(matcher);
 
1567
        massert(node);
 
1568
 
 
1569
        children = tree_node_get_children(node);
 
1570
        if(node->op==OP_LEAF) {
 
1571
                struct leaf_info* leaf = node->u.leaf;
 
1572
                if(node->next && !node->listend)
 
1573
                        destroy_tree_internal(matcher,node->next);
 
1574
                stack_push_once(&matcher->node_stack,(struct tree_node*)node->u.leaf);/* cast to make compiler happy, and to not make another stack implementation for storing void* */
 
1575
                stack_push_once(&matcher->node_stack,node);
 
1576
                if(leaf->preg) {
 
1577
                        regfree(leaf->preg);
 
1578
                        free(leaf->preg);
 
1579
                        leaf->preg=NULL;
 
1580
                }
 
1581
                if(leaf->info) {
 
1582
                        free(leaf->info);
 
1583
                        leaf->info=NULL;
 
1584
                }
 
1585
        /*      return;*/
 
1586
        }
 
1587
        if(node->alternatives) {
 
1588
                int i;
 
1589
                struct tree_node* p;
 
1590
                massert(children);
 
1591
                p = children[0]->op==OP_LEAF ? NULL : children[0]->next;
 
1592
                for(i=0;i<node->alternatives;i++)
 
1593
                        destroy_tree_internal(matcher,children[i]);
 
1594
                if(p && p!=node)
 
1595
                        destroy_tree_internal(matcher,p);/*?? is this ok, or without _internal?*/
 
1596
        }
 
1597
        else {
 
1598
                if(children) {
 
1599
                        if(children[0])
 
1600
                                destroy_tree_internal(matcher,children[0]);             
 
1601
                }
 
1602
        }
 
1603
        if(node->op!=OP_LEAF && node->next && !node->listend)
 
1604
                destroy_tree_internal(matcher,node->next);
 
1605
        if(node->u.children)
 
1606
                stack_push_once(&matcher->node_stack,(struct tree_node*)node->u.children);/* cast to make compiler happy, it isn't really a tree_node* */
 
1607
        if(node->op==OP_CUSTOMCLASS && node->u.children[0]) {
 
1608
                free(node->u.children[0]);
 
1609
                node->u.children[0]=NULL;
 
1610
        }
 
1611
        stack_push_once(&matcher->node_stack,node);
 
1612
}
 
1613
 
 
1614
static void destroy_tree(struct regex_matcher* matcher)
 
1615
{
 
1616
        /* we might have the same node linked by different nodes, so a recursive walk&free doesn't work in all situations,
 
1617
         * i.e. it might double-free, so instead of freeing, just push the nodes on a stack, and later free the nodes in that stack,
 
1618
         * (and push to stack only if it doesn't contain it already*/
 
1619
        massert(matcher);
 
1620
 
 
1621
        stack_reset(&matcher->node_stack);
 
1622
        destroy_tree_internal(matcher,matcher->root_regex);
 
1623
        while (matcher->node_stack.cnt) {
 
1624
                struct tree_node* node = stack_pop(&matcher->node_stack);
 
1625
                if(node)
 
1626
                        free(node);
 
1627
        }
 
1628
}
 
1629
#ifndef NDEBUG
 
1630
static void dump_node(struct tree_node* node)
 
1631
{
 
1632
        int i;
 
1633
        struct tree_node* p,**children;
 
1634
        massert(node);
 
1635
        if(node->op==OP_LEAF) {
 
1636
                if(node->u.leaf->preg)
 
1637
                        printf("n%p [label=\"regex\\nleaf\"]",(void*)node);
 
1638
                else
 
1639
                        printf("n%p [label=\"%c\\nleaf\"];\n",(void*)node,node->c);
 
1640
                if(node->next && !node->listend) {
 
1641
                        printf("n%p -> n%p;\n",(void*)node,(void*)node->next);
 
1642
                        dump_node(node->next);
 
1643
                }
 
1644
                return;
 
1645
        }
 
1646
        printf("n%p [label=\"%c\\n%d\\nlistend:%d\"];\n",(void*)node,(node->op==OP_ROOT||node->op==OP_PARCLOSE) ?'@' :node->c,node->op,node->listend);
 
1647
        if(node->next)
 
1648
                printf("n%p -> n%p;\n",(void*)node,(void*)node->next);
 
1649
        printf("n%p -> {",(void*)node);/*using address of node as id*/
 
1650
        children = tree_node_get_children(node);
 
1651
        if(node->alternatives)
 
1652
                massert(children);
 
1653
        for(i=0;i<node->alternatives;i++)
 
1654
                printf("n%p ",(void*)children[i]);
 
1655
        if(node->alternatives && children[0]->op!=OP_LEAF)
 
1656
                for(p=children[0]->next;p!=node;p=p->next)
 
1657
                {
 
1658
                        massert(p);
 
1659
                        printf("n%p ",(void*)p);
 
1660
                        if(p->op==OP_LEAF || p->listend)
 
1661
                                break;
 
1662
                }
 
1663
        if(!node->alternatives && children && children[0])
 
1664
                printf("n%p ",(void*)children[0]);
 
1665
        printf("};\n");
 
1666
        printf("{rank=same;");
 
1667
        for(i=0;i<node->alternatives;i++)
 
1668
                printf("n%p ",(void*)node->u.children[i]);
 
1669
        if(node->alternatives && children[0]->op!=OP_LEAF)
 
1670
                for(p=children[0]->next;p!=node;p=p->next) 
 
1671
                {
 
1672
                        printf("n%p ",(void*)p);        
 
1673
                        if(p->op==OP_LEAF || p->listend)
 
1674
                                break;
 
1675
                }
 
1676
        if(!node->alternatives && children && children[0])
 
1677
                printf("n%p ",(void*)children[0]);
 
1678
        printf("};\n");
 
1679
        for(i=0;i<node->alternatives;i++)
 
1680
                dump_node(children[i]);
 
1681
        if(node->alternatives && children[0]->op!=OP_LEAF)
 
1682
                for(p=children[0]->next;p!=node;p=p->next)
 
1683
                {
 
1684
                        dump_node(p);
 
1685
                        if(p->op==OP_LEAF || p->listend)
 
1686
                                break;
 
1687
                }
 
1688
        if(!node->alternatives && children && children[0])
 
1689
                dump_node(children[0]);
 
1690
}
 
1691
 
 
1692
void dump_tree(struct tree_node* root)
 
1693
{
 
1694
        /*use dot/dotty from graphviz to view it*/
 
1695
        massert(root);
 
1696
        printf("digraph tree {\n");
 
1697
        dump_node(root);
 
1698
        printf("}\n");
 
1699
}
 
1700
#endif
 
1701
 
 
1702
#endif