2
* A fast filter for static patterns.
4
* Copyright (C) 2008 Sourcefire, Inc.
8
* This program is free software; you can redistribute it and/or modify
9
* it under the terms of the GNU General Public License version 2 as
10
* published by the Free Software Foundation.
12
* This program is distributed in the hope that it will be useful,
13
* but WITHOUT ANY WARRANTY; without even the implied warranty of
14
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
* GNU General Public License for more details.
17
* You should have received a copy of the GNU General Public License
18
* along with this program; if not, write to the Free Software
19
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
23
#include "clamav-config.h"
25
#include "filtering.h"
26
#include "matcher-ac.h"
29
#include "perflogging.h"
30
/* ----- shift-or filtering -------------- */
33
* Description of algorithm:
35
* Multiple patterns are added to the filter.
36
* The filter retains an approximation of these patterns, which can lead to
37
* false positive matches, but not false negative matches.
39
* For each position in the filter we retain what qgrams can match at that
40
* position, for example (if we'd use characters as qgrams):
48
* But it also accepts (false positives):
51
* It doesn't however accept:
54
* This is implemented by having a bit-level state-machine with MAXSOPATLEN (=32) states,
55
* each active bit meaning that a state is active.
57
* The states are activated sequentially, eachtransition decision is made
58
* considering if we can accept the character at position X.
59
* Since we can start a match at any position, position 0 is
60
* reactivated each time.
61
* When the last position is activated, the filter reports a match.
62
* If we can't accept the character at position X, the state remains inactive,
63
* and further states aren't activated (unless we activate this state in the
66
* Essentially this is an automaton like this:
68
* /\ (a|b|x) (t|z|a) (u|f|t)
69
* [S1] ---------> [S2] -------> [S3] ---------> [S4] -> match
71
* \_____________________________/
74
* But we are tracking multiple active states at each time (or run N automatons
75
* in parallel if you like, N = number of states).
77
* We can have S3 and S2 active, meaning that if the next character is
78
* acceptable, it transitions to S1,S3 and S4 being active, otherwise it
79
* transitions to S1 being active.
81
* Active states can either be represented as a binary 1 or 0, and using
82
* bit-shifting and masking.
83
* If we choose 1, we must use &, and after shifting always reactivate bit 0.
84
* If we choose 0, we must use |, and after shifting we don't need to do
85
* anything (since by shifting a 0 is implicitly introduced).
87
* This file implements the latter (shift-or) method.
89
* The discussion above considered pattern to be of same length (or truncated to
90
* be so). In reality patterns are of variable length, and we often have short
93
* Thus another bitmap was introduced, meaning that if (end[Q] == set), then
94
* a pattern can end at this position.
95
* Also we would fill the pattern's position filters quite quickly with only 256
96
* choices for a position, so the algorithm uses overlapping qgrams of length 2:
97
* 'abcd' is 3 qgrams: 'ab','bc','cd'
99
* The algorithm is very sensitive to the end[Q] filter, since it can have false
100
* positives due to short patterns!
101
* For optimal performance we need:
102
* - patterns as long as possible
103
* - probability for end[Q] to match low (avoid 0000, and other common case
104
* - choose the most "diverse" subset from a long pattern
106
* diverse = refering to what we are scanning, so that the filter rarely
107
* matches, so this actually means that we *want* to avoid adding more
108
* characters to the filter, if we have 2 patterns:
109
* abxfg, and dalabxpo, it may be preferable to shift the 2nd one so that we
110
* don't add new character at the beginning.
112
* With NDB signatures there are more challenges to overcome:
115
* will make the filter accept:
116
* e8<all-256-values-here>, <all-256-values>00, ... 000000aa
118
* We should delay the pattern end as long as possible, especially if it is 0000
119
* The problem is that now the filter accepts 0000 on position 3, regardless
120
* of what we have on position 1 (even if we have something else than e8), so
121
* we have to be very careful not to allow 0000 on first position too,
122
* otherwise the filter will happily accept 000000000000.
124
* To optimize cache usage there are 2 end filters, one character (fits L1), and one qgram
125
* based (fits L2), both must match for the filter to consider it a match.
130
/*#define DETAILED_DEBUG*/
131
#ifdef DETAILED_DEBUG
132
#define detailed_dbg cli_dbgmsg
134
#define detailed_dbg(...)
137
#define BITMAP_CONTAINS(bmap, val) ((bmap)[(val) >> 5] & (1 << ((val) & 0x1f)))
138
#define BITMAP_INSERT(bmap, val) ((bmap)[(val) >> 5] |= (1 << ((val) & 0x1f)))
140
void filter_init(struct filter *m)
142
memset(m->B, ~0, sizeof(m->B));
143
memset(m->end, ~0, sizeof(m->end));
146
/* because we use uint32_t */
147
#define MAXSOPATLEN 8
149
static inline int filter_isset(const struct filter *m, unsigned pos, uint16_t val)
151
return !(m->B[val] & (1<<pos));
154
static inline void filter_set_atpos(struct filter *m, unsigned pos, uint16_t val)
156
if (!filter_isset(m, pos, val)) {
157
cli_perf_log_count(FILTER_LOAD, pos);
158
m->B[val] &= ~(1<<pos);
163
static inline int filter_end_isset(const struct filter *m, unsigned pos, uint16_t a)
165
return !(m->end[a] & (1<<pos));
168
static inline void filter_set_end(struct filter *m, unsigned pos, uint16_t a)
170
if (!filter_end_isset(m, pos, a)) {
171
cli_perf_log_count(FILTER_END_LOAD, pos);
172
m->end[a] &= ~(1 << pos);
175
#define MAX_CHOICES 8
176
/* just an arbitrary limit, if patterns are longer, we cut
177
* the filter can only use MAXSOPATLEN (32) characters,
178
* this longer buffer is needed so that we can choose the "best" subpattern from
180
#define MAXPATLEN 255
182
/* merge another pattern into the filter
183
* add('abc'); add('bcd'); will match [ab][bc][cd] */
184
int filter_add_static(struct filter *m, const unsigned char *pattern, unsigned long len, const char *name)
188
uint32_t best = 0xffffffff;
189
uint8_t best_pos = 0;
191
cli_perf_log_count(TRIE_ORIG_LEN, len > 8 ? 8 : len);
192
/* TODO: choose best among MAXCHOICES */
194
if(len > MAXPATLEN) {
200
/* we want subsigs to be as long as possible */
203
if (maxlen == 1) maxlen = 2;
206
for(j=0;(best < 100 && j<MAX_CHOICES) || (j < maxlen) ;j++) {
207
uint32_t num = MAXSOPATLEN;
211
for(k=j;k<len-1 && (k-j < MAXSOPATLEN);k++) {
212
q = cli_readint16( &pattern[k] );
213
/* we want to favor subsigs that add as little as
214
* possible to the filter */
215
num += filter_isset(m, k-j, q) ? 0 : MAXSOPATLEN - (k-j);
216
if ((k == j || k == j+1) && (q == 0x0000 || q == 0xffff))
217
num += k==j ? 10000 : 1000;/* bad */
219
/* it is very important to keep the end set small */
220
num += 10*(filter_end_isset(m, k-j-1, q) ? 0 : 1);
221
/* it is very important to have signatures as long as possible
223
num += 5*(MAXSOPATLEN - (k-j));
224
/* if we are lower length than threshold penalize */
227
/* favour longer patterns */
228
num -= (2*MAXSOPATLEN - (k + 1+j))*(k-j)/2;
236
assert(best_pos < len-1);
237
if (pattern[best_pos] == 0 && pattern[best_pos+1] == 0) {
238
detailed_dbg("filter (warning): subsignature begins with zero (static): %s\n", name);
243
if(len > MAXSOPATLEN) {
246
/* Shift-Or like preprocessing */
247
for(j=0;j < len-1;j++) {
248
/* use overlapping little-endian 2-grams. We need them overlapping because matching can start at any position */
249
q = cli_readint16( &pattern[j] );
250
filter_set_atpos(m, j, q);
252
/* we use variable length patterns, use last character to mark pattern end,
253
* can lead to false positives.*/
254
/* mark that at state j, the q-gram q can end the pattern */
257
filter_set_end(m, j, q);
263
/* if non-null i-th character = alt[start + step*i]; start+step*i < end;
265
struct cli_ac_special *alt;
271
static inline unsigned char spec_ith_char(const struct char_spec *spec, unsigned i)
273
const struct cli_ac_special *alt = spec->alt;
275
assert (alt->type == 1);
276
assert (i < alt->num);
282
static const struct char_spec full_range = {NULL, 0,0xff,1};
284
static inline int spec_is_fullrange(const struct char_spec *spec0, const struct char_spec *spec1)
286
return !memcmp(spec0, &full_range, sizeof(full_range)) &&
287
!memcmp(spec1, &full_range, sizeof(full_range));
292
#define MIN(a,b) ((a) < (b) ? (a) : (b))
297
/* try to avoid if possible */
299
avoid_anywhere, /* includes avoid_first! */
300
/* not that bad, but still not best */
305
static inline void get_score(enum badness badness, unsigned i, const struct filter *m, const struct char_spec *spec0, const struct char_spec *spec1, int32_t *score, int32_t *score_end)
308
unsigned k0, k1, num_introduced = 0, num_end_introduced = 0;
334
/* a bit better only */
343
/* at most 256 iterations here, otherwise base would be negative */
344
for(k0=spec0->start;k0 <= spec0->end;k0 += spec0->step) {
345
for(k1=spec1->start;k1 <= spec1->end;k1 += spec1->step) {
346
unsigned char c0 = spec_ith_char(spec0, k0);
347
unsigned char c1 = spec_ith_char(spec1, k1);
348
uint16_t a = c0 | (c1<<8);
349
num_introduced += filter_isset(m, i, a);
350
num_end_introduced += filter_end_isset(m, i, a);
353
*score = base - num_introduced;
354
*score_end = base - num_end_introduced;
355
if (badness == avoid_first && i) {
356
/* what is bad to begin with, is bad at end too */
357
*score_end -= 0x1000;
367
static inline void add_choice(struct choice *choices, unsigned *cnt, unsigned i, unsigned ie, enum badness badness)
369
struct choice *choice;
371
assert(ie < MAXPATLEN);
374
if (*cnt >= MAX_CHOICES)
376
if (badness > avoid_first && *cnt >= (MAX_CHOICES >> 1)) {
378
/* replace very bad picks if we're full */
379
for (j=0;j<*cnt;j++) {
380
if (choices[j].base < badness) {
381
if (i_neg == -1 || choices[j].base < choices[i_neg].base) {
388
choice = &choices[i_neg];
390
choice = &choices[(*cnt)++];
393
choice->len = ie - i + 1;
394
choice->base = badness;
397
static inline int32_t spec_iter(const struct char_spec *spec)
400
return (1 + spec->end - spec->start)/spec->step;
403
int filter_add_acpatt(struct filter *m, const struct cli_ac_patt *pat)
405
unsigned i, j = 0, stop = 0, l=0;
408
struct char_spec chars[MAXPATLEN];
409
enum badness char_badness[MAXPATLEN];
410
unsigned char patc[MAXPATLEN];
412
int32_t best_score = -0x7fffffff;
413
unsigned best_score_i = 0;
414
unsigned best_score_len = 0;
415
struct char_spec *spec0, *spec1;
417
struct choice choices[MAX_CHOICES];
418
unsigned choices_cnt = 0;
419
unsigned prefix_len = pat->prefix_length;
421
j = MIN(prefix_len + pat->length, MAXPATLEN);
423
const uint16_t p = i < prefix_len ? pat->prefix[i] : pat->pattern[i - prefix_len];
424
if ((p&CLI_MATCH_WILDCARD) != CLI_MATCH_CHAR)
426
patc[i] = (uint8_t)p;
429
/* all static, use add_static it has better heuristics for this
431
return filter_add_static(m, patc, j, pat->virname);
433
cli_perf_log_count(TRIE_ORIG_LEN, j > 8 ? 8 : j);
434
/* transform AC characters into our representation */
435
for (i=0;i<j && !stop; i++) {
436
struct char_spec *spec = &chars[i];
437
const uint16_t p = i < prefix_len ? pat->prefix[i] : pat->pattern[i - prefix_len];
439
switch (p & CLI_MATCH_WILDCARD) {
441
spec->start = spec->end = (uint8_t)p;
444
case CLI_MATCH_IGNORE:
449
case CLI_MATCH_SPECIAL:
450
assert(pat->special_table);
451
/* assert(altcnt < pat->alt); */
452
assert(pat->special_table[altcnt]);
453
switch (pat->special_table[altcnt++]->type) {
454
case 1: /* ALT_CHAR */
456
spec->end = pat->special_table[altcnt-1]->num - 1;
458
spec->alt = pat->special_table[altcnt-1];
463
/* TODO: should something be done here?
467
case CLI_MATCH_NIBBLE_HIGH:
468
spec->start = (p & 0xf0);
469
spec->end = spec->start | 0x0f;
472
case CLI_MATCH_NIBBLE_LOW:
473
spec->start = (p & 0xf);
474
spec->end = 0xf0 | spec->start;
478
cli_errmsg("filtering: unknown wildcard character: %d\n", p);
486
cli_warnmsg("Don't know how to create filter for: %s\n",pat->virname);
488
cli_warnmsg("Subpattern too short: %s\n", pat->virname);
494
/* new qgrams added to the filter */
497
num_iter = spec_iter(spec0) * spec_iter(spec1);
499
if (num_iter >= 0x100) {
500
if (num_iter == 0x10000)
501
char_badness[i] = reject;
503
char_badness[i] = avoid_anywhere;
506
enum badness scor = acceptable;
507
for(k0=spec0->start;k0 <= spec0->end;k0 += spec0->step) {
508
for(k1=spec1->start;k1 <= spec1->end;k1 += spec1->step) {
509
unsigned char c0 = spec_ith_char(spec0, k0);
510
unsigned char c1 = spec_ith_char(spec1, k1);
511
if ((!c0 && !c1) || (c0 == 0xff && c1 == 0xff)) {
519
if ((c0 < 32 || c0 > 127) && (c1 < 32 || c1 >127))
523
if (scor == acceptable && binary) {
524
/* slightly favor binary */
527
char_badness[i] = scor;
531
/* try to choose best subpattern */
533
/* calculating the score for all possible i start pos
534
* and all possible length is too slow, so choose best among N choices
536
for (i=0;i<j-1 && choices_cnt < MAX_CHOICES;i++) {
537
enum badness base0 = like, base1 = like;
538
unsigned kend = MIN(j-1, (i + MAXSOPATLEN)&~1), k;
540
/* add 2 scores: pattern with max length, one where we stop at
541
* first negative, and one we stop at last positive, but never
543
assert(kend-1 < j-1);
544
if (char_badness[i] == reject)
546
if ((char_badness[i] == avoid_anywhere || char_badness[i] == avoid_first)
548
/* if we have another choice don't choose this */
550
while ((kend > i+3) && char_badness[kend-1] == reject) kend--;
551
for (k=i;k<kend;k++) {
552
enum badness badness = char_badness[k];
553
if (badness < acceptable) {
554
if (badness == reject) {
555
/* this is a never pick */
559
if (badness == avoid_first && k != i)
561
if (k == i && badness == avoid_anywhere)
562
badness = avoid_first;
566
base0 = MIN(base0, badness);
568
base1 = MIN(base1, badness);
570
add_choice(choices, &choices_cnt, i, kend, base0);
573
/* try subpattern from after the wildcard */
576
/* if score is positive, it replaces a negative choice */
578
for(l=0;l<choices_cnt;l++) {
583
i = choices[l].begin;
584
kend = i + choices[l].len;
587
for(k = i; k < kend-1; k++) {
589
int32_t iscore, score_end;
591
get_score(char_badness[k], p, m, &chars[k], &chars[k+1],
592
&iscore, &score_end);
593
/* give more importance to the score of the characters
594
* at the beginning */
595
/* TODO: tune magic number here */
601
if (score + score_end > best_score) {
602
/* we may have negative scores, so truncating
603
* the pattern could actually get us a higher
605
best_score = score + score_end;
606
best_score_len = p + 2;
608
assert(i + best_score_len <= j);
613
if (best_score <= -0x7fffffff) {
614
cli_warnmsg("filter rejecting %s due to very bad score: %ld\n", pat->virname, (long)best_score);
617
if (choices_cnt == 0) {
618
cli_warnmsg("filter rejecting %s because there are no viable choices", pat->virname);
621
assert(best_score_len >= 2);
622
detailed_dbg("filter %s score: %ld, %u (+ %u)\n", pat->virname, (long)best_score, best_score_i, best_score_len);
623
/* Shift-Or like preprocessing */
624
assert(1 < best_score_len);
625
for (i=0;i < best_score_len-1;i++) {
626
spec0 = &chars[best_score_i + i];
627
spec1 = &chars[best_score_i + i + 1];
628
/* use overlapping little-endian 2-grams, overlapping because match can start
629
* at any position (including odd) */
631
for(k0=spec0->start;k0 <= spec0->end;k0 += spec0->step) {
632
for(k1=spec1->start;k1 <= spec1->end;k1 += spec1->step) {
633
unsigned char c0 = spec_ith_char(spec0, k0);
634
unsigned char c1 = spec_ith_char(spec1, k1);
635
if (!c0 && !c1 && !i) {
636
detailed_dbg("filter (warning): subsignature begins with zero: %s\n",pat->virname);
638
filter_set_atpos(m, i, c0 | (c1<<8));
643
j = best_score_len - 2;
644
for (k0=spec0->start;k0 <= spec0->end;k0 += spec0->step) {
645
for (k1=spec1->start;k1 <= spec1->end;k1 += spec1->step) {
646
unsigned char c0 = spec_ith_char(spec0, k0);
647
unsigned char c1 = spec_ith_char(spec1, k1);
649
detailed_dbg("filter (warning): subsignature ends with zero: %s\n",pat->virname);
651
filter_set_end(m, j, c0 | (c1<<8));
657
static const struct match_len_info {
661
{2,9},{3,9},{2,9},{4,9},{2,9},{3,9},{2,9},{5,9},
662
{2,9},{3,9},{2,9},{4,9},{2,9},{3,9},{2,9},{6,9},
663
{2,9},{3,9},{2,9},{4,9},{2,9},{3,9},{2,9},{5,9},
664
{2,9},{3,9},{2,9},{4,9},{2,9},{3,9},{2,9},{7,9},
665
{2,9},{3,9},{2,9},{4,9},{2,9},{3,9},{2,9},{5,9},
666
{2,9},{3,9},{2,9},{4,9},{2,9},{3,9},{2,9},{6,9},
667
{2,9},{3,9},{2,9},{4,9},{2,9},{3,9},{2,9},{5,9},
668
{2,9},{3,9},{2,9},{4,9},{2,9},{3,9},{2,9},{8,9},
669
{2,9},{3,9},{2,9},{4,9},{2,9},{3,9},{2,9},{5,9},
670
{2,9},{3,9},{2,9},{4,9},{2,9},{3,9},{2,9},{6,9},
671
{2,9},{3,9},{2,9},{4,9},{2,9},{3,9},{2,9},{5,9},
672
{2,9},{3,9},{2,9},{4,9},{2,9},{3,9},{2,9},{7,9},
673
{2,9},{3,9},{2,9},{4,9},{2,9},{3,9},{2,9},{5,9},
674
{2,9},{3,9},{2,9},{4,9},{2,9},{3,9},{2,9},{6,9},
675
{2,9},{3,9},{2,9},{4,9},{2,9},{3,9},{2,9},{5,9},
676
{2,9},{3,9},{2,9},{4,9},{2,9},{3,9},{2,9},{9,9},
677
{2,8},{3,8},{2,8},{4,8},{2,8},{3,8},{2,8},{5,8},
678
{2,8},{3,8},{2,8},{4,8},{2,8},{3,8},{2,8},{6,8},
679
{2,8},{3,8},{2,8},{4,8},{2,8},{3,8},{2,8},{5,8},
680
{2,8},{3,8},{2,8},{4,8},{2,8},{3,8},{2,8},{7,8},
681
{2,8},{3,8},{2,8},{4,8},{2,8},{3,8},{2,8},{5,8},
682
{2,8},{3,8},{2,8},{4,8},{2,8},{3,8},{2,8},{6,8},
683
{2,8},{3,8},{2,8},{4,8},{2,8},{3,8},{2,8},{5,8},
684
{2,8},{3,8},{2,8},{4,8},{2,8},{3,8},{2,8},{8,8},
685
{2,7},{3,7},{2,7},{4,7},{2,7},{3,7},{2,7},{5,7},
686
{2,7},{3,7},{2,7},{4,7},{2,7},{3,7},{2,7},{6,7},
687
{2,7},{3,7},{2,7},{4,7},{2,7},{3,7},{2,7},{5,7},
688
{2,7},{3,7},{2,7},{4,7},{2,7},{3,7},{2,7},{7,7},
689
{2,6},{3,6},{2,6},{4,6},{2,6},{3,6},{2,6},{5,6},
690
{2,6},{3,6},{2,6},{4,6},{2,6},{3,6},{2,6},{6,6},
691
{2,5},{3,5},{2,5},{4,5},{2,5},{3,5},{2,5},{5,5},
692
{2,4},{3,4},{2,4},{4,4},{2,3},{3,3},{2,2},{0,0}
694
/* state 11110011 means that we may have a match of length min 4, max 5 */
696
__hot__ int filter_search_ext(const struct filter *m, const unsigned char *data, unsigned long len, struct filter_match_info *inf)
700
const uint8_t *B = m->B;
701
const uint8_t *End = m->end;
702
uint8_t shortest, longest=0;
704
if (len < 2) return -1;
705
/* look for first match */
706
for (j=0; j < len-1;j++) {
707
uint8_t match_state_end;
708
const uint16_t q0 = cli_readint16( &data[j] );
710
state = (state << 1) | B[q0];
711
match_state_end = state | End[q0];
712
if (match_state_end != 0xff) {
713
inf->first_match = j;
717
/* no match, inf is invalid */
721
/* this is like a FSM, with multiple active states at the same time.
722
* each bit in "state" means an active state, when a char is encountered
723
* we determine what states can remain active.
724
* The FSM transition rules are expressed as bit-masks */
725
long filter_search(const struct filter *m, const unsigned char *data, unsigned long len)
729
const uint8_t *B = m->B;
730
const uint8_t *End = m->end;
732
/* we use 2-grams, must be higher than 1 */
733
if(len < 2) return -1;
734
/* Shift-Or like search algorithm */
735
for(j=0;j < len-1; j++) {
736
const uint16_t q0 = cli_readint16( &data[j] );
738
state = (state << 1) | B[q0];
739
/* state marks with a 0 bit all active states
740
* End[q0] marks with a 0 bit all states where the q-gram 'q' can end a pattern
741
* if we got two 0's at matching positions, it means we encountered a pattern's end */
742
match_end = state | End[q0];
743
if(match_end != 0xff) {
745
/* if state is reachable, and this character can finish a pattern, assume match */
746
/* to reduce false positives check if qgram can finish the pattern */
747
/* return position of probable match */
748
/* find first 0 starting from MSB, the position of that bit as counted from LSB, is the length of the
749
* longest pattern that could match */
750
return j >= MAXSOPATLEN ? j - MAXSOPATLEN : 0;