259
261
###########################################################################
264
# Called in the parent process shortly before forking off child processes.
268
if ($self->{store} && $self->{store}->UNIVERSAL::can('prefork_init')) {
269
$self->{store}->prefork_init;
273
###########################################################################
276
# Called in a child process shortly after being spawned.
277
sub spamd_child_init {
280
if ($self->{store} && $self->{store}->UNIVERSAL::can('spamd_child_init')) {
281
$self->{store}->spamd_child_init;
285
###########################################################################
261
288
sub check_bayes {
262
289
my ($self, $pms, $fulltext, $min, $max) = @_;
386
414
@msgid = $self->get_msgid($msg);
389
foreach $msgid ( @msgid ) {
390
my $seen = $self->{store}->seen_get ($msgid);
417
foreach my $msgid_t ( @msgid ) {
418
my $seen = $self->{store}->seen_get ($msgid_t);
392
420
if (defined ($seen)) {
393
421
if (($seen eq 's' && $isspam) || ($seen eq 'h' && !$isspam)) {
394
dbg("bayes: $msgid already learnt correctly, not learning twice");
422
dbg("bayes: $msgid_t already learnt correctly, not learning twice");
396
424
} elsif ($seen !~ /^[hs]$/) {
397
warn("bayes: db_seen corrupt: value='$seen' for $msgid, ignored");
425
warn("bayes: db_seen corrupt: value='$seen' for $msgid_t, ignored");
399
427
# bug 3704: If the message was already learned, don't try learning it again.
400
428
# this prevents, for instance, manually learning as spam, then autolearning
401
429
# as ham, or visa versa.
402
430
if ($self->{main}->{learn_no_relearn}) {
403
dbg("bayes: $msgid already learnt as opposite, not re-learning");
431
dbg("bayes: $msgid_t already learnt as opposite, not re-learning");
407
dbg("bayes: $msgid already learnt as opposite, forgetting first");
435
dbg("bayes: $msgid_t already learnt as opposite, forgetting first");
409
437
# kluge so that forget() won't untie the db on us ...
410
438
my $orig = $self->{main}->{learn_caller_will_untie};
448
470
my $tokens = $self->tokenize($msg, $msgdata);
451
$self->{store}->multi_tok_count_change(1, 0, $tokens, $msgatime);
453
$self->{store}->multi_tok_count_change(0, 1, $tokens, $msgatime);
472
{ my $timer = $self->{main}->time_method('b_count_change');
474
$self->{store}->nspam_nham_change(1, 0);
475
$self->{store}->multi_tok_count_change(1, 0, $tokens, $msgatime);
477
$self->{store}->nspam_nham_change(0, 1);
478
$self->{store}->multi_tok_count_change(0, 1, $tokens, $msgatime);
456
482
$self->{store}->seen_put ($msgid, ($isspam ? 's' : 'h'));
667
694
dbg("bayes: corpus size: nspam = $ns, nham = $nn");
669
my $msgdata = $self->_get_msgdata_from_permsgstatus ($permsgstatus);
671
my $msgtokens = $self->tokenize($msg, $msgdata);
673
my $tokensdata = $self->{store}->tok_get_all(keys %{$msgtokens});
697
{ my $timer = $self->{main}->time_method('b_tokenize');
698
my $msgdata = $self->_get_msgdata_from_permsgstatus ($permsgstatus);
699
$msgtokens = $self->tokenize($msg, $msgdata);
703
{ my $timer = $self->{main}->time_method('b_tok_get_all');
704
$tokensdata = $self->{store}->tok_get_all(keys %{$msgtokens});
707
my $timer_compute_prob = $self->{main}->time_method('b_comp_prob');
709
my $probabilities_ref =
710
$self->_compute_prob_for_all_tokens($tokensdata, $ns, $nn);
677
713
foreach my $tokendata (@{$tokensdata}) {
714
my $prob = shift(@$probabilities_ref);
715
next unless defined $prob;
678
716
my ($token, $tok_spam, $tok_ham, $atime) = @{$tokendata};
679
my $prob = $self->_compute_prob_for_token($token, $ns, $nn, $tok_spam, $tok_ham);
680
next unless defined $prob;
684
719
spam_count => $tok_spam,
725
my @pw_keys = keys %pw;
690
727
# If none of the tokens were found in the DB, we're going to skip
691
728
# this message...
693
730
dbg("bayes: cannot use bayes on this message; none of the tokens were found in the database");
697
734
my $tcount_total = keys %{$msgtokens};
698
my $tcount_learned = keys %pw;
735
my $tcount_learned = scalar @pw_keys;
700
737
# Figure out the message receive time (used as atime below)
701
738
# If the message atime comes back as being in the future, something's
706
743
$msgatime = $now if ( $msgatime > $now );
708
# now take the $count most significant tokens and calculate probs using
709
# Robinson's formula.
710
my $count = N_SIGNIFICANT_TOKENS;
713
745
my @touch_tokens;
714
746
my $tinfo_spammy = $permsgstatus->{bayes_token_info_spammy} = [];
715
747
my $tinfo_hammy = $permsgstatus->{bayes_token_info_hammy} = [];
717
my %tok_strength = map { $_ => (abs($pw{$_}->{prob} - 0.5)) } keys %pw;
749
my %tok_strength = map( ($_, abs($pw{$_}->{prob} - 0.5)), @pw_keys);
718
750
my $log_each_token = (would_log('dbg', 'bayes') > 1);
720
foreach my $tok (sort {
721
$tok_strength{$b} <=> $tok_strength{$a}
724
if ($count-- < 0) { last; }
725
next if ($tok_strength{$tok} <
726
$Mail::SpamAssassin::Bayes::Combine::MIN_PROB_STRENGTH);
728
my $pw = $pw{$tok}->{prob};
752
# now take the most significant tokens and calculate probs using
753
# Robinson's formula.
755
@pw_keys = sort { $tok_strength{$b} <=> $tok_strength{$a} } @pw_keys;
757
if (@pw_keys > N_SIGNIFICANT_TOKENS) { $#pw_keys = N_SIGNIFICANT_TOKENS - 1 }
760
foreach my $tok (@pw_keys) {
761
next if $tok_strength{$tok} <
762
$Mail::SpamAssassin::Bayes::Combine::MIN_PROB_STRENGTH;
764
my $pw_tok = $pw{$tok};
765
my $pw_prob = $pw_tok->{prob};
730
767
# What's more expensive, scanning headers for HAMMYTOKENS and
731
768
# SPAMMYTOKENS tags that aren't there or collecting data that
732
769
# won't be used? Just collecting the data is certainly simpler.
734
771
my $raw_token = $msgtokens->{$tok} || "(unknown)";
735
my $s = $pw{$tok}->{spam_count};
736
my $n = $pw{$tok}->{ham_count};
737
my $a = $pw{$tok}->{atime};
740
push @$tinfo_hammy, [$raw_token,$pw,$s,$n,$a];
742
push @$tinfo_spammy, [$raw_token,$pw,$s,$n,$a];
772
my $s = $pw_tok->{spam_count};
773
my $n = $pw_tok->{ham_count};
774
my $a = $pw_tok->{atime};
776
push( @{ $pw_prob < 0.5 ? $tinfo_hammy : $tinfo_spammy },
777
[$raw_token, $pw_prob, $s, $n, $a] );
779
push(@sorted, $pw_prob);
747
781
# update the atime on this token, it proved useful
748
782
push(@touch_tokens, $tok);
750
784
if ($log_each_token) {
751
dbg("bayes: token '$raw_token' => $pw");
785
dbg("bayes: token '$raw_token' => $pw_prob");
769
804
# no need to call tok_touch_all unless there were significant
770
805
# tokens and a score was returned
771
806
# we don't really care about the return value here
772
$self->{store}->tok_touch_all(\@touch_tokens, $msgatime);
808
{ my $timer = $self->{main}->time_method('b_tok_touch_all');
809
$self->{store}->tok_touch_all(\@touch_tokens, $msgatime);
812
my $timer_finish = $self->{main}->time_method('b_finish');
774
814
$permsgstatus->{bayes_nspam} = $ns;
775
815
$permsgstatus->{bayes_nham} = $nn;
790
830
dbg("bayes: not scoring message, returning undef");
833
undef $timer_compute_prob; # end a timing section if still running
834
if (!defined $timer_finish) {
835
$timer_finish = $self->{main}->time_method('b_finish');
793
838
# Take any opportunistic actions we can take
794
839
if ($self->{main}->{opportunistic_expire_check_only}) {
795
840
# we're supposed to report on expiry only -- so do the
820
865
$permsgstatus->set_tag ('BAYESTC', $tcount_total);
822
867
$permsgstatus->set_tag ('HAMMYTOKENS', sub {
823
869
$self->bayes_report_make_list
824
($permsgstatus, $permsgstatus->{bayes_token_info_hammy}, shift);
870
($pms, $pms->{bayes_token_info_hammy}, shift);
827
873
$permsgstatus->set_tag ('SPAMMYTOKENS', sub {
828
875
$self->bayes_report_make_list
829
($permsgstatus, $permsgstatus->{bayes_token_info_spammy}, shift);
876
($pms, $pms->{bayes_token_info_spammy}, shift);
832
879
$permsgstatus->set_tag ('TOKENSUMMARY', sub {
833
if ( defined $permsgstatus->{tag_data}{BAYESTC} )
881
if ( defined $pms->{tag_data}{BAYESTC} )
835
my $tcount_neutral = $permsgstatus->{tag_data}{BAYESTCLEARNED}
836
- $permsgstatus->{tag_data}{BAYESTCSPAMMY}
837
- $permsgstatus->{tag_data}{BAYESTCHAMMY};
838
my $tcount_new = $permsgstatus->{tag_data}{BAYESTC}
839
- $permsgstatus->{tag_data}{BAYESTCLEARNED};
883
my $tcount_neutral = $pms->{tag_data}{BAYESTCLEARNED}
884
- $pms->{tag_data}{BAYESTCSPAMMY}
885
- $pms->{tag_data}{BAYESTCHAMMY};
886
my $tcount_new = $pms->{tag_data}{BAYESTC}
887
- $pms->{tag_data}{BAYESTCLEARNED};
840
888
"Tokens: new, $tcount_new; "
841
."hammy, $permsgstatus->{tag_data}{BAYESTCHAMMY}; "
889
."hammy, $pms->{tag_data}{BAYESTCHAMMY}; "
842
890
."neutral, $tcount_neutral; "
843
."spammy, $permsgstatus->{tag_data}{BAYESTCSPAMMY}."
891
."spammy, $pms->{tag_data}{BAYESTCSPAMMY}."
845
893
"Bayes not run.";
930
978
push(@msgid, $msgid);
933
# Use sha1_hex(Date:, last received: and top N bytes of body)
981
# Modified 2012-01-17 per bug 5185 to remove last received from msg_id calculation
983
# Use sha1_hex(Date: and top N bytes of body)
934
984
# where N is MIN(1024 bytes, 1/2 of body length)
936
986
my $date = $msg->get_header("Date");
937
987
$date = "None" if (!defined $date || $date eq ''); # No Date?
939
my @rcvd = $msg->get_header("Received");
940
my $rcvd = $rcvd[$#rcvd];
941
$rcvd = "None" if (!defined $rcvd || $rcvd eq ''); # No Received?
989
#Removed per bug 5185
990
#my @rcvd = $msg->get_header("Received");
991
#my $rcvd = $rcvd[$#rcvd];
992
#$rcvd = "None" if (!defined $rcvd || $rcvd eq ''); # No Received?
943
994
# Make a copy since pristine_body is a reference ...
944
995
my $body = join('', $msg->get_pristine_body());
945
997
if (length($body) > 64) { # Small Body?
946
998
my $keep = ( length $body > 2048 ? 1024 : int(length($body) / 2) );
947
999
substr($body, $keep) = '';
950
unshift(@msgid, sha1_hex($date."\000".$rcvd."\000".$body).'@sa_generated');
1002
#Stripping all CR and LF so that testing midstream from MTA and post delivery don't
1003
#generate different id's simply because of LF<->CR<->CRLF changes.
1004
$body =~ s/[\r\n]//g;
1006
unshift(@msgid, sha1_hex($date."\000".$body).'@sa_generated');
952
1008
return wantarray ? @msgid : $msgid[0];
1353
1409
###########################################################################
1411
# compute the probability that a token is spammish for each token
1412
sub _compute_prob_for_all_tokens {
1413
my ($self, $tokensdata, $ns, $nn) = @_;
1416
return if !$ns || !$nn;
1418
my $threshold = 1; # ignore low-freq tokens below this s+n threshold
1419
if (!USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS) {
1422
if (!$self->{use_hapaxes}) {
1426
foreach my $tokendata (@{$tokensdata}) {
1427
my $s = $tokendata->[1]; # spam count
1428
my $n = $tokendata->[2]; # ham count
1431
no warnings 'uninitialized'; # treat undef as zero in addition
1432
if ($s + $n >= $threshold) {
1433
# ignoring low-freq tokens, also covers the (!$s && !$n) case
1435
# my $ratios = $s / $ns;
1436
# my $ration = $n / $nn;
1437
# $prob = $ratios / ($ration + $ratios);
1439
$prob = ($s * $nn) / ($n * $ns + $s * $nn); # same thing, faster
1441
if (USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS) {
1442
# use Robinson's f(x) equation for low-n tokens, instead of just
1446
($Mail::SpamAssassin::Bayes::Combine::FW_S_DOT_X + ($robn * $prob))
1448
($Mail::SpamAssassin::Bayes::Combine::FW_S_CONSTANT + $robn);
1452
# 'log_raw_counts' is used to log the raw data for the Bayes equations
1453
# during a mass-check, allowing the S and X constants to be optimized
1454
# quickly without requiring re-tokenization of the messages for each
1455
# attempt. There's really no need for this code to be uncommented in
1456
# normal use, however. It has never been publicly documented, so
1457
# commenting it out is fine. ;)
1459
## if ($self->{log_raw_counts}) {
1460
## $self->{raw_counts} .= " s=$s,n=$n ";
1463
push(@probabilities, $prob);
1465
return \@probabilities;
1355
1468
# compute the probability that a token is spammish
1356
1469
sub _compute_prob_for_token {
1357
1470
my ($self, $token, $ns, $nn, $s, $n) = @_;
1359
1472
# we allow the caller to give us the token information, just
1360
1473
# to save a potentially expensive lookup
1361
1474
if (!defined($s) || !defined($n)) {
1362
($s, $n, undef) = $self->{store}->tok_get ($token);
1365
return if ($s == 0 && $n == 0);
1367
if (!USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS) {
1368
return if ($s + $n < 10); # ignore low-freq tokens
1371
if (!$self->{use_hapaxes}) {
1372
return if ($s + $n < 2);
1375
return if ( $ns == 0 || $nn == 0 );
1377
my $ratios = ($s / $ns);
1378
my $ration = ($n / $nn);
1382
if ($ratios == 0 && $ration == 0) {
1383
warn "bayes: oops? ratios == ration == 0";
1386
$prob = ($ratios) / ($ration + $ratios);
1389
if (USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS) {
1390
# use Robinson's f(x) equation for low-n tokens, instead of just
1393
$prob = ($Mail::SpamAssassin::Bayes::Combine::FW_S_DOT_X + ($robn * $prob))
1395
($Mail::SpamAssassin::Bayes::Combine::FW_S_CONSTANT + $robn);
1398
# 'log_raw_counts' is used to log the raw data for the Bayes equations during
1399
# a mass-check, allowing the S and X constants to be optimized quickly
1400
# without requiring re-tokenization of the messages for each attempt. There's
1401
# really no need for this code to be uncommented in normal use, however. It
1402
# has never been publicly documented, so commenting it out is fine. ;)
1404
## if ($self->{log_raw_counts}) {
1405
## $self->{raw_counts} .= " s=$s,n=$n ";
1475
($s, $n, undef) = $self->{store}->tok_get($token);
1477
return if !$s && !$n;
1479
my $probabilities_ref =
1480
$self->_compute_prob_for_all_tokens([ [$token, $s, $n, 0] ], $ns, $nn);
1482
return $probabilities_ref->[0];
1411
1485
###########################################################################