292
###########################################################################
296
my ($self, $params) = @_;
297
my $quiet = $params->{quiet};
299
# do a sanity check here. Wierd things happen if we remain tied
300
# after compiling; for example, spamd will never see that the
301
# number of messages has reached the bayes-scanning threshold.
302
if ($self->{store}->db_readable()) {
303
warn "bayes: oops! still tied to bayes DBs, untying\n" unless $quiet;
304
$self->{store}->untie_db();
308
###########################################################################
310
# read configuration items to control bayes behaviour. Called by
311
# BayesStore::read_db_configs().
312
sub read_db_configs {
315
# use of hapaxes. Set on bayes object, since it controls prob
317
$self->{use_hapaxes} = $self->{conf}->{bayes_use_hapaxes};
319
###########################################################################
322
my ($self,$PMS) = @_;
324
return 0 unless $self->{use_ignores};
326
my $ig_from = $self->{main}->call_plugins ("check_wb_list",
327
{ permsgstatus => $PMS, type => 'from', list => 'bayes_ignore_from' });
328
my $ig_to = $self->{main}->call_plugins ("check_wb_list",
329
{ permsgstatus => $PMS, type => 'to', list => 'bayes_ignore_to' });
331
my $ignore = $ig_from || $ig_to;
333
dbg("bayes: not using bayes, bayes_ignore_from or _to rule") if $ignore;
338
###########################################################################
342
my ($self, $params) = @_;
343
my $isspam = $params->{isspam};
344
my $msg = $params->{msg};
345
my $id = $params->{id};
347
if (!$self->{conf}->{use_bayes}) { return; }
349
my $msgdata = $self->get_body_from_msg ($msg);
353
local $SIG{'__DIE__'}; # do not run user die() traps in here
356
if ($self->{main}->{learn_to_journal}) {
357
# If we're going to learn to journal, we'll try going r/o first...
358
# If that fails for some reason, let's try going r/w. This happens
359
# if the DB doesn't exist yet.
360
$ok = $self->{store}->tie_db_readonly() || $self->{store}->tie_db_writable();
362
$ok = $self->{store}->tie_db_writable();
366
$ret = $self->_learn_trapped ($isspam, $msg, $msgdata, $id);
368
if (!$self->{main}->{learn_caller_will_untie}) {
369
$self->{store}->untie_db();
373
} or do { # if we died, untie the dbs.
374
my $eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat;
375
$self->{store}->untie_db();
376
die "bayes: (in learn) $eval_stat\n";
382
# this function is trapped by the wrapper above
384
my ($self, $isspam, $msg, $msgdata, $msgid) = @_;
385
my @msgid = ( $msgid );
387
if (!defined $msgid) {
388
@msgid = $self->get_msgid($msg);
391
foreach $msgid ( @msgid ) {
392
my $seen = $self->{store}->seen_get ($msgid);
394
if (defined ($seen)) {
395
if (($seen eq 's' && $isspam) || ($seen eq 'h' && !$isspam)) {
396
dbg("bayes: $msgid already learnt correctly, not learning twice");
398
} elsif ($seen !~ /^[hs]$/) {
399
warn("bayes: db_seen corrupt: value='$seen' for $msgid, ignored");
401
# bug 3704: If the message was already learned, don't try learning it again.
402
# this prevents, for instance, manually learning as spam, then autolearning
403
# as ham, or visa versa.
404
if ($self->{main}->{learn_no_relearn}) {
405
dbg("bayes: $msgid already learnt as opposite, not re-learning");
409
dbg("bayes: $msgid already learnt as opposite, forgetting first");
411
# kluge so that forget() won't untie the db on us ...
412
my $orig = $self->{main}->{learn_caller_will_untie};
413
$self->{main}->{learn_caller_will_untie} = 1;
415
my $fatal = !defined $self->{main}->{bayes_scanner}->forget ($msg);
417
# reset the value post-forget() ...
418
$self->{main}->{learn_caller_will_untie} = $orig;
420
# forget() gave us a fatal error, so propagate that up
422
dbg("bayes: forget() returned a fatal error, so learn() will too");
427
# we're only going to have seen this once, so stop if it's been
433
# Now that we're sure we haven't seen this message before ...
437
$self->{store}->nspam_nham_change (1, 0);
439
$self->{store}->nspam_nham_change (0, 1);
442
my $msgatime = $msg->receive_date();
444
# If the message atime comes back as being more than 1 day in the
445
# future, something's messed up and we should revert to current time as
448
$msgatime = time if ( $msgatime - time > 86400 );
450
my $tokens = $self->tokenize($msg, $msgdata);
453
$self->{store}->multi_tok_count_change(1, 0, $tokens, $msgatime);
455
$self->{store}->multi_tok_count_change(0, 1, $tokens, $msgatime);
458
$self->{store}->seen_put ($msgid, ($isspam ? 's' : 'h'));
459
$self->{store}->cleanup();
461
$self->{main}->call_plugins("bayes_learn", { toksref => $tokens,
464
msgatime => $msgatime,
467
dbg("bayes: learned '$msgid', atime: $msgatime");
472
###########################################################################
476
my ($self, $params) = @_;
477
my $msg = $params->{msg};
478
my $id = $params->{id};
480
if (!$self->{conf}->{use_bayes}) { return; }
482
my $msgdata = $self->get_body_from_msg ($msg);
485
# we still tie for writing here, since we write to the seen db
488
local $SIG{'__DIE__'}; # do not run user die() traps in here
491
if ($self->{main}->{learn_to_journal}) {
492
# If we're going to learn to journal, we'll try going r/o first...
493
# If that fails for some reason, let's try going r/w. This happens
494
# if the DB doesn't exist yet.
495
$ok = $self->{store}->tie_db_readonly() || $self->{store}->tie_db_writable();
497
$ok = $self->{store}->tie_db_writable();
501
$ret = $self->_forget_trapped ($msg, $msgdata, $id);
503
if (!$self->{main}->{learn_caller_will_untie}) {
504
$self->{store}->untie_db();
508
} or do { # if we died, untie the dbs.
509
my $eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat;
510
$self->{store}->untie_db();
511
die "bayes: (in forget) $eval_stat\n";
517
# this function is trapped by the wrapper above
518
sub _forget_trapped {
519
my ($self, $msg, $msgdata, $msgid) = @_;
520
my @msgid = ( $msgid );
523
if (!defined $msgid) {
524
@msgid = $self->get_msgid($msg);
527
while( $msgid = shift @msgid ) {
528
my $seen = $self->{store}->seen_get ($msgid);
530
if (defined ($seen)) {
533
} elsif ($seen eq 'h') {
536
dbg("bayes: forget: msgid $msgid seen entry is neither ham nor spam, ignored");
540
# messages should only be learned once, so stop if we find a msgid
541
# which was seen before
545
dbg("bayes: forget: msgid $msgid not learnt, ignored");
549
# This message wasn't learnt before, so return
550
if (!defined $isspam) {
551
dbg("bayes: forget: no msgid from this message has been learnt, skipping message");
555
$self->{store}->nspam_nham_change (-1, 0);
558
$self->{store}->nspam_nham_change (0, -1);
561
my $tokens = $self->tokenize($msg, $msgdata);
564
$self->{store}->multi_tok_count_change (-1, 0, $tokens);
566
$self->{store}->multi_tok_count_change (0, -1, $tokens);
569
$self->{store}->seen_delete ($msgid);
570
$self->{store}->cleanup();
572
$self->{main}->call_plugins("bayes_forget", { toksref => $tokens,
580
###########################################################################
584
my ($self, $params) = @_;
585
if (!$self->{conf}->{use_bayes}) { return 0; }
586
dbg("bayes: bayes journal sync starting");
587
$self->{store}->sync($params);
588
dbg("bayes: bayes journal sync completed");
591
###########################################################################
594
sub learner_expire_old_training {
595
my ($self, $params) = @_;
596
if (!$self->{conf}->{use_bayes}) { return 0; }
597
dbg("bayes: expiry starting");
598
my $timer = $self->{main}->time_method("expire_bayes");
599
$self->{store}->expire_old_tokens($params);
600
dbg("bayes: expiry completed");
603
###########################################################################
606
# Check to make sure we can tie() the DB, and we have enough entries to do a scan
607
# if we're told the caller will untie(), go ahead and leave the db tied.
608
sub learner_is_scan_available {
609
my ($self, $params) = @_;
611
return 0 unless $self->{conf}->{use_bayes};
612
return 0 unless $self->{store}->tie_db_readonly();
614
# We need the DB to stay tied, so if the journal sync occurs, don't untie!
615
my $caller_untie = $self->{main}->{learn_caller_will_untie};
616
$self->{main}->{learn_caller_will_untie} = 1;
618
# Do a journal sync if necessary. Do this before the nspam_nham_get()
619
# call since the sync may cause an update in the number of messages
621
$self->_opportunistic_calls(1);
623
# Reset the variable appropriately
624
$self->{main}->{learn_caller_will_untie} = $caller_untie;
626
my ($ns, $nn) = $self->{store}->nspam_nham_get();
628
if ($ns < $self->{conf}->{bayes_min_spam_num}) {
629
dbg("bayes: not available for scanning, only $ns spam(s) in bayes DB < ".$self->{conf}->{bayes_min_spam_num});
630
if (!$self->{main}->{learn_caller_will_untie}) {
631
$self->{store}->untie_db();
635
if ($nn < $self->{conf}->{bayes_min_ham_num}) {
636
dbg("bayes: not available for scanning, only $nn ham(s) in bayes DB < ".$self->{conf}->{bayes_min_ham_num});
637
if (!$self->{main}->{learn_caller_will_untie}) {
638
$self->{store}->untie_db();
646
###########################################################################
649
my ($self, $permsgstatus, $msg) = @_;
652
return unless $self->{conf}->{use_learner};
654
# When we're doing a scan, we'll guarantee that we'll do the untie,
655
# so override the global setting until we're done.
656
my $caller_untie = $self->{main}->{learn_caller_will_untie};
657
$self->{main}->{learn_caller_will_untie} = 1;
659
goto skip if ($self->{main}->{bayes_scanner}->ignore_message($permsgstatus));
661
goto skip unless $self->learner_is_scan_available();
663
my ($ns, $nn) = $self->{store}->nspam_nham_get();
665
## if ($self->{log_raw_counts}) { # see _compute_prob_for_token()
666
## $self->{raw_counts} = " ns=$ns nn=$nn ";
669
dbg("bayes: corpus size: nspam = $ns, nham = $nn");
671
my $msgdata = $self->_get_msgdata_from_permsgstatus ($permsgstatus);
673
my $msgtokens = $self->tokenize($msg, $msgdata);
675
my $tokensdata = $self->{store}->tok_get_all(keys %{$msgtokens});
679
foreach my $tokendata (@{$tokensdata}) {
680
my ($token, $tok_spam, $tok_ham, $atime) = @{$tokendata};
681
my $prob = $self->_compute_prob_for_token($token, $ns, $nn, $tok_spam, $tok_ham);
682
next unless defined $prob;
686
spam_count => $tok_spam,
687
ham_count => $tok_ham,
692
# If none of the tokens were found in the DB, we're going to skip
695
dbg("bayes: cannot use bayes on this message; none of the tokens were found in the database");
699
my $tcount_total = keys %{$msgtokens};
700
my $tcount_learned = keys %pw;
702
# Figure out the message receive time (used as atime below)
703
# If the message atime comes back as being in the future, something's
704
# messed up and we should revert to current time as a safety measure.
706
my $msgatime = $msg->receive_date();
708
$msgatime = $now if ( $msgatime > $now );
710
# now take the $count most significant tokens and calculate probs using
711
# Robinson's formula.
712
my $count = N_SIGNIFICANT_TOKENS;
716
my $tinfo_spammy = $permsgstatus->{bayes_token_info_spammy} = [];
717
my $tinfo_hammy = $permsgstatus->{bayes_token_info_hammy} = [];
719
my %tok_strength = map { $_ => (abs($pw{$_}->{prob} - 0.5)) } keys %pw;
720
my $log_each_token = (would_log('dbg', 'bayes') > 1);
722
foreach my $tok (sort {
723
$tok_strength{$b} <=> $tok_strength{$a}
726
if ($count-- < 0) { last; }
727
next if ($tok_strength{$tok} <
728
$Mail::SpamAssassin::Bayes::Combine::MIN_PROB_STRENGTH);
730
my $pw = $pw{$tok}->{prob};
732
# What's more expensive, scanning headers for HAMMYTOKENS and
733
# SPAMMYTOKENS tags that aren't there or collecting data that
734
# won't be used? Just collecting the data is certainly simpler.
736
my $raw_token = $msgtokens->{$tok} || "(unknown)";
737
my $s = $pw{$tok}->{spam_count};
738
my $n = $pw{$tok}->{ham_count};
739
my $a = $pw{$tok}->{atime};
742
push @$tinfo_hammy, [$raw_token,$pw,$s,$n,$a];
744
push @$tinfo_spammy, [$raw_token,$pw,$s,$n,$a];
749
# update the atime on this token, it proved useful
750
push(@touch_tokens, $tok);
752
if ($log_each_token) {
753
dbg("bayes: token '$raw_token' => $pw");
757
if (!@sorted || (REQUIRE_SIGNIFICANT_TOKENS_TO_SCORE > 0 &&
758
$#sorted <= REQUIRE_SIGNIFICANT_TOKENS_TO_SCORE))
760
dbg("bayes: cannot use bayes on this message; not enough usable tokens found");
764
$score = Mail::SpamAssassin::Bayes::Combine::combine($ns, $nn, \@sorted);
766
# Couldn't come up with a probability?
767
goto skip unless defined $score;
769
dbg("bayes: score = $score");
771
# no need to call tok_touch_all unless there were significant
772
# tokens and a score was returned
773
# we don't really care about the return value here
774
$self->{store}->tok_touch_all(\@touch_tokens, $msgatime);
776
$permsgstatus->{bayes_nspam} = $ns;
777
$permsgstatus->{bayes_nham} = $nn;
779
## if ($self->{log_raw_counts}) { # see _compute_prob_for_token()
780
## print "#Bayes-Raw-Counts: $self->{raw_counts}\n";
783
$self->{main}->call_plugins("bayes_scan", { toksref => $msgtokens,
786
msgatime => $msgatime,
787
significant_tokens => \@touch_tokens,
791
if (!defined $score) {
792
dbg("bayes: not scoring message, returning undef");
795
# Take any opportunistic actions we can take
796
if ($self->{main}->{opportunistic_expire_check_only}) {
797
# we're supposed to report on expiry only -- so do the
798
# _opportunistic_calls() run for the journal only.
799
$self->_opportunistic_calls(1);
800
$permsgstatus->{bayes_expiry_due} = $self->{store}->expiry_due();
803
$self->_opportunistic_calls();
806
# Do any cleanup we need to do
807
$self->{store}->cleanup();
809
# Reset the value accordingly
810
$self->{main}->{learn_caller_will_untie} = $caller_untie;
812
# If our caller won't untie the db, we need to do it.
813
if (!$caller_untie) {
814
$self->{store}->untie_db();
817
$permsgstatus->set_tag ('BAYESTCHAMMY',
818
($tinfo_hammy ? scalar @{$tinfo_hammy} : 0));
819
$permsgstatus->set_tag ('BAYESTCSPAMMY',
820
($tinfo_spammy ? scalar @{$tinfo_spammy} : 0));
821
$permsgstatus->set_tag ('BAYESTCLEARNED', $tcount_learned);
822
$permsgstatus->set_tag ('BAYESTC', $tcount_total);
824
$permsgstatus->set_tag ('HAMMYTOKENS', sub {
825
$self->bayes_report_make_list
826
($permsgstatus, $permsgstatus->{bayes_token_info_hammy}, shift);
829
$permsgstatus->set_tag ('SPAMMYTOKENS', sub {
830
$self->bayes_report_make_list
831
($permsgstatus, $permsgstatus->{bayes_token_info_spammy}, shift);
834
$permsgstatus->set_tag ('TOKENSUMMARY', sub {
835
if( defined $self->{tag_data}{BAYESTC} )
837
my $tcount_neutral = $permsgstatus->{tag_data}{BAYESTCLEARNED}
838
- $permsgstatus->{tag_data}{BAYESTCSPAMMY}
839
- $permsgstatus->{tag_data}{BAYESTCHAMMY};
840
my $tcount_new = $permsgstatus->{tag_data}{BAYESTC}
841
- $permsgstatus->{tag_data}{BAYESTCLEARNED};
842
"Tokens: new, $tcount_new; "
843
."hammy, $permsgstatus->{tag_data}{BAYESTCHAMMY}; "
844
."neutral, $tcount_neutral; "
845
."spammy, $permsgstatus->{tag_data}{BAYESTCSPAMMY}."
855
###########################################################################
858
sub learner_dump_database {
859
my ($self, $params) = @_;
860
my $magic = $params->{magic};
861
my $toks = $params->{toks};
862
my $regex = $params->{regex};
864
# allow dump to occur even if use_bayes disables everything else ...
865
#return 0 unless $self->{conf}->{use_bayes};
866
return 0 unless $self->{store}->tie_db_readonly();
868
my @vars = $self->{store}->get_storage_variables();
870
my($sb,$ns,$nh,$nt,$le,$oa,$bv,$js,$ad,$er,$na) = @vars;
872
my $template = '%3.3f %10u %10u %10u %s'."\n";
875
printf($template, 0.0, 0, $bv, 0, 'non-token data: bayes db version')
876
or die "Error writing: $!";
877
printf($template, 0.0, 0, $ns, 0, 'non-token data: nspam')
878
or die "Error writing: $!";
879
printf($template, 0.0, 0, $nh, 0, 'non-token data: nham')
880
or die "Error writing: $!";
881
printf($template, 0.0, 0, $nt, 0, 'non-token data: ntokens')
882
or die "Error writing: $!";
883
printf($template, 0.0, 0, $oa, 0, 'non-token data: oldest atime')
884
or die "Error writing: $!";
886
printf($template, 0.0, 0, $na, 0, 'non-token data: newest atime')
887
or die "Error writing: $!";
890
printf($template, 0.0, 0, $sb, 0, 'non-token data: current scan-count')
891
or die "Error writing: $!";
894
printf($template, 0.0, 0, $js, 0, 'non-token data: last journal sync atime')
895
or die "Error writing: $!";
897
printf($template, 0.0, 0, $le, 0, 'non-token data: last expiry atime')
898
or die "Error writing: $!";
900
printf($template, 0.0, 0, $ad, 0, 'non-token data: last expire atime delta')
901
or die "Error writing: $!";
903
printf($template, 0.0, 0, $er, 0, 'non-token data: last expire reduction count')
904
or die "Error writing: $!";
909
# let the store sort out the db_toks
910
$self->{store}->dump_db_toks($template, $regex, @vars);
913
if (!$self->{main}->{learn_caller_will_untie}) {
914
$self->{store}->untie_db();
919
###########################################################################
920
# TODO: these are NOT public, but the test suite needs to call them.
923
my ($self, $msg) = @_;
927
my $msgid = $msg->get_header("Message-Id");
928
if (defined $msgid && $msgid ne '' && $msgid !~ /^\s*<\s*(?:\@sa_generated)?>.*$/) {
929
# remove \r and < and > prefix/suffixes
931
$msgid =~ s/^<//; $msgid =~ s/>.*$//g;
932
push(@msgid, $msgid);
935
# Use sha1_hex(Date:, last received: and top N bytes of body)
936
# where N is MIN(1024 bytes, 1/2 of body length)
938
my $date = $msg->get_header("Date");
939
$date = "None" if (!defined $date || $date eq ''); # No Date?
941
my @rcvd = $msg->get_header("Received");
942
my $rcvd = $rcvd[$#rcvd];
943
$rcvd = "None" if (!defined $rcvd || $rcvd eq ''); # No Received?
945
# Make a copy since pristine_body is a reference ...
946
my $body = join('', $msg->get_pristine_body());
947
if (length($body) > 64) { # Small Body?
948
my $keep = ( length $body > 2048 ? 1024 : int(length($body) / 2) );
949
substr($body, $keep) = '';
952
unshift(@msgid, sha1_hex($date."\000".$rcvd."\000".$body).'@sa_generated');
954
return wantarray ? @msgid : $msgid[0];
957
sub get_body_from_msg {
958
my ($self, $msg) = @_;
961
# I have no idea why this seems to happen. TODO
962
warn "bayes: msg not a ref: '$msg'";
967
Mail::SpamAssassin::PerMsgStatus->new($self->{main}, $msg);
968
$msg->extract_message_metadata ($permsgstatus);
969
my $msgdata = $self->_get_msgdata_from_permsgstatus ($permsgstatus);
970
$permsgstatus->finish();
972
if (!defined $msgdata) {
974
warn "bayes: failed to get body for ".scalar($self->get_msgid($self->{msg}))."\n";
981
sub _get_msgdata_from_permsgstatus {
982
my ($self, $msg) = @_;
985
$msgdata->{bayes_token_body} = $msg->{msg}->get_visible_rendered_body_text_array();
986
$msgdata->{bayes_token_inviz} = $msg->{msg}->get_invisible_rendered_body_text_array();
987
@{$msgdata->{bayes_token_uris}} = $msg->get_uri_list();
991
###########################################################################
993
# The calling functions expect a uniq'ed array of tokens ...
995
my ($self, $msg, $msgdata) = @_;
998
my @tokens = map { $self->_tokenize_line ($_, '', 1) }
999
@{$msgdata->{bayes_token_body}};
1002
push (@tokens, map { $self->_tokenize_line ($_, '', 2) }
1003
@{$msgdata->{bayes_token_uris}});
1005
# add invisible tokens
1006
if (ADD_INVIZ_TOKENS_I_PREFIX) {
1007
push (@tokens, map { $self->_tokenize_line ($_, "I*:", 1) }
1008
@{$msgdata->{bayes_token_inviz}});
1010
if (ADD_INVIZ_TOKENS_NO_PREFIX) {
1011
push (@tokens, map { $self->_tokenize_line ($_, "", 1) }
1012
@{$msgdata->{bayes_token_inviz}});
1015
# Tokenize the headers
1016
my %hdrs = $self->_tokenize_headers ($msg);
1017
while( my($prefix, $value) = each %hdrs ) {
1018
push(@tokens, $self->_tokenize_line ($value, "H$prefix:", 0));
1021
# Go ahead and uniq the array, skip null tokens (can happen sometimes)
1022
# generate an SHA1 hash and take the lower 40 bits as our token
1024
foreach my $token (@tokens) {
1025
next unless length($token); # skip 0 length tokens
1026
$tokens{substr(sha1($token), -5)} = $token;
1029
# return the keys == tokens ...
1033
sub _tokenize_line {
1035
my $tokprefix = $_[2];
1041
# include quotes, .'s and -'s for URIs, and [$,]'s for Nigerian-scam strings,
1042
# and ISO-8859-15 alphas. Do not split on @'s; better results keeping it.
1043
# Some useful tokens: "$31,000,000" "www.clock-speed.net" "f*ck" "Hits!"
1044
tr/-A-Za-z0-9,\@\*\!_'"\$.\241-\377 / /cs;
1046
# DO split on "..." or "--" or "---"; common formatting error resulting in
1047
# hapaxes. Keep the separator itself as a token, though, as long ones can
1048
# be good spamsigns.
1049
s/(\w)(\.{3,6})(\w)/$1 $2 $3/gs;
1050
s/(\w)(\-{2,6})(\w)/$1 $2 $3/gs;
1052
if (IGNORE_TITLE_CASE) {
1053
if ($region == 1 || $region == 2) {
1054
# lower-case Title Case at start of a full-stop-delimited line (as would
1055
# be seen in a Western language).
1056
s/(?:^|\.\s+)([A-Z])([^A-Z]+)(?:\s|$)/ ' '. (lc $1) . $2 . ' ' /ge;
1060
my $magic_re = $self->{store}->get_magic_re();
1062
foreach my $token (split) {
1063
$token =~ s/^[-'"\.,]+//; # trim non-alphanum chars at start or end
1064
$token =~ s/[-'"\.,]+$//; # so we don't get loads of '"foo' tokens
1066
# Skip false magic tokens
1067
# TVD: we need to do a defined() check since SQL doesn't have magic
1068
# tokens, so the SQL BayesStore returns undef. I really want a way
1069
# of optimizing that out, but I haven't come up with anything yet.
1071
next if ( defined $magic_re && $token =~ /$magic_re/ );
1073
# *do* keep 3-byte tokens; there's some solid signs in there
1074
my $len = length($token);
1076
# but extend the stop-list. These are squarely in the gray
1077
# area, and it just slows us down to record them.
1078
# See http://wiki.apache.org/spamassassin/BayesStopList for more info.
1081
($token =~ /^(?:a(?:ble|l(?:ready|l)|n[dy]|re)|b(?:ecause|oth)|c(?:an|ome)|e(?:ach|mail|ven)|f(?:ew|irst|or|rom)|give|h(?:a(?:ve|s)|ttp)|i(?:n(?:formation|to)|t\'s)|just|know|l(?:ike|o(?:ng|ok))|m(?:a(?:de|il(?:(?:ing|to))?|ke|ny)|o(?:re|st)|uch)|n(?:eed|o[tw]|umber)|o(?:ff|n(?:ly|e)|ut|wn)|p(?:eople|lace)|right|s(?:ame|ee|uch)|t(?:h(?:at|is|rough|e)|ime)|using|w(?:eb|h(?:ere|y)|ith(?:out)?|or(?:ld|k))|y(?:ears?|ou(?:(?:\'re|r))?))$/i);
1083
# are we in the body? If so, apply some body-specific breakouts
1084
if ($region == 1 || $region == 2) {
1085
if (CHEW_BODY_MAILADDRS && $token =~ /\S\@\S/i) {
1086
push (@rettokens, $self->_tokenize_mail_addrs ($token));
1088
elsif (CHEW_BODY_URIS && $token =~ /\S\.[a-z]/i) {
1089
push (@rettokens, "UD:".$token); # the full token
1090
my $bit = $token; while ($bit =~ s/^[^\.]+\.(.+)$/$1/gs) {
1091
push (@rettokens, "UD:".$1); # UD = URL domain
1096
# note: do not trim down overlong tokens if they contain '*'. This is
1097
# used as part of split tokens such as "HTo:D*net" indicating that
1098
# the domain ".net" appeared in the To header.
1100
if ($len > MAX_TOKEN_LENGTH && $token !~ /\*/) {
1101
if (TOKENIZE_LONG_8BIT_SEQS_AS_TUPLES && $token =~ /[\xa0-\xff]{2}/) {
1102
# Matt sez: "Could be asian? Autrijus suggested doing character ngrams,
1103
# but I'm doing tuples to keep the dbs small(er)." Sounds like a plan
1105
while ($token =~ s/^(..?)//) {
1106
push (@rettokens, "8:$1");
1111
if (($region == 0 && HDRS_TOKENIZE_LONG_TOKENS_AS_SKIPS)
1112
|| ($region == 1 && BODY_TOKENIZE_LONG_TOKENS_AS_SKIPS)
1113
|| ($region == 2 && URIS_TOKENIZE_LONG_TOKENS_AS_SKIPS))
1115
# if (TOKENIZE_LONG_TOKENS_AS_SKIPS)
1116
# Spambayes trick via Matt: Just retain 7 chars. Do not retain
1117
# the length, it does not help; see my mail to -devel of Nov 20 2002.
1118
# "sk:" stands for "skip".
1119
$token = "sk:".substr($token, 0, 7);
1123
# decompose tokens? do this after shortening long tokens
1124
if ($region == 1 || $region == 2) {
1125
if (DECOMPOSE_BODY_TOKENS) {
1126
if ($token =~ /[^\w:\*]/) {
1127
my $decompd = $token; # "Foo!"
1128
$decompd =~ s/[^\w:\*]//gs;
1129
push (@rettokens, $tokprefix.$decompd); # "Foo"
1132
if ($token =~ /[A-Z]/) {
1133
my $decompd = $token; $decompd = lc $decompd;
1134
push (@rettokens, $tokprefix.$decompd); # "foo!"
1136
if ($token =~ /[^\w:\*]/) {
1137
$decompd =~ s/[^\w:\*]//gs;
1138
push (@rettokens, $tokprefix.$decompd); # "foo"
1144
push (@rettokens, $tokprefix.$token);
1150
sub _tokenize_headers {
1151
my ($self, $msg) = @_;
1156
$user_ignore{lc $_} = 1 for @{$self->{main}->{conf}->{bayes_ignore_headers}};
1158
# get headers in array context
1161
for ($msg->get_all_headers()) {
1162
# first, keep a copy of Received headers, so we can strip down to last 2
1163
if (/^Received:/i) {
1164
push(@rcvdlines, $_);
1167
# and now skip lines for headers we don't want (including all Received)
1168
next if /^${IGNORED_HDRS}:/i;
1169
next if IGNORE_MSGID_TOKENS && /^Message-ID:/i;
1172
push(@hdrs, $msg->get_all_metadata());
1174
# and re-add the last 2 received lines: usually a good source of
1175
# spamware tokens and HELO names.
1176
if ($#rcvdlines >= 0) { push(@hdrs, $rcvdlines[$#rcvdlines]); }
1177
if ($#rcvdlines >= 1) { push(@hdrs, $rcvdlines[$#rcvdlines-1]); }
1181
my ($hdr, $val) = split(/:/, $_, 2);
1183
# remove user-specified headers here, after Received, in case they
1184
# want to ignore that too
1185
next if exists $user_ignore{lc $hdr};
1187
# Prep the header value
1191
# special tokenization for some headers:
1192
if ($hdr =~ /^(?:|X-|Resent-)Message-Id$/i) {
1193
$val = $self->_pre_chew_message_id ($val);
1195
elsif (PRE_CHEW_ADDR_HEADERS && $hdr =~ /^(?:|X-|Resent-)
1196
(?:Return-Path|From|To|Cc|Reply-To|Errors-To|Mail-Followup-To|Sender)$/ix)
1198
$val = $self->_pre_chew_addr_header ($val);
1200
elsif ($hdr eq 'Received') {
1201
$val = $self->_pre_chew_received ($val);
1203
elsif ($hdr eq 'Content-Type') {
1204
$val = $self->_pre_chew_content_type ($val);
1206
elsif ($hdr eq 'MIME-Version') {
1207
$val =~ s/1\.0//; # totally innocuous
1209
elsif ($hdr =~ /^${MARK_PRESENCE_ONLY_HDRS}$/i) {
1210
$val = "1"; # just mark the presence, they create lots of hapaxen
1213
if (MAP_HEADERS_MID) {
1214
if ($hdr =~ /^(?:In-Reply-To|References|Message-ID)$/i) {
1215
$parsed{"*MI"} = $val;
1218
if (MAP_HEADERS_FROMTOCC) {
1219
if ($hdr =~ /^(?:From|To|Cc)$/i) {
1220
$parsed{"*Ad"} = $val;
1223
if (MAP_HEADERS_USERAGENT) {
1224
if ($hdr =~ /^(?:X-Mailer|User-Agent)$/i) {
1225
$parsed{"*UA"} = $val;
1229
# replace hdr name with "compressed" version if possible
1230
if (defined $HEADER_NAME_COMPRESSION{$hdr}) {
1231
$hdr = $HEADER_NAME_COMPRESSION{$hdr};
1234
if (exists $parsed{$hdr}) {
1235
$parsed{$hdr} .= " ".$val;
1237
$parsed{$hdr} = $val;
1239
if (would_log('dbg', 'bayes') > 1) {
1240
dbg("bayes: header tokens for $hdr = \"$parsed{$hdr}\"");
1247
sub _pre_chew_content_type {
1248
my ($self, $val) = @_;
1250
# hopefully this will retain good bits without too many hapaxen
1251
if ($val =~ s/boundary=[\"\'](.*?)[\"\']/ /ig) {
1253
$boundary = '' if !defined $boundary; # avoid a warning
1254
$boundary =~ s/[a-fA-F0-9]/H/gs;
1255
# break up blocks of separator chars so they become their own tokens
1256
$boundary =~ s/([-_\.=]+)/ $1 /gs;
1260
# stop-list words for Content-Type header: these wind up totally gray
1261
$val =~ s/\b(?:text|charset)\b//;
1266
sub _pre_chew_message_id {
1267
my ($self, $val) = @_;
1268
# we can (a) get rid of a lot of hapaxen and (b) increase the token
1269
# specificity by pre-parsing some common formats.
1271
# Outlook Express format:
1272
$val =~ s/<([0-9a-f]{4})[0-9a-f]{4}[0-9a-f]{4}\$
1273
([0-9a-f]{4})[0-9a-f]{4}\$
1274
([0-9a-f]{8})\@(\S+)>/ OEA$1 OEB$2 OEC$3 $4 /gx;
1277
$val =~ s/<[A-Za-z0-9]{7}-[A-Za-z0-9]{6}-0[A-Za-z0-9]\@//;
1280
$val =~ s/<20\d\d[01]\d[0123]\d[012]\d[012345]\d[012345]\d\.
1281
[A-F0-9]{10,12}\@//gx;
1283
# try to split Message-ID segments on probable ID boundaries. Note that
1284
# Outlook message-ids seem to contain a server identifier ID in the last
1285
# 8 bytes before the @. Make sure this becomes its own token, it's a
1286
# great spam-sign for a learning system! Be sure to split on ".".
1287
$val =~ s/[^_A-Za-z0-9]/ /g;
1291
sub _pre_chew_received {
1292
my ($self, $val) = @_;
1294
# Thanks to Dan for these. Trim out "useless" tokens; sendmail-ish IDs
1295
# and valid-format RFC-822/2822 dates
1297
$val =~ s/\swith\sSMTP\sid\sg[\dA-Z]{10,12}\s/ /gs; # Sendmail
1298
$val =~ s/\swith\sESMTP\sid\s[\dA-F]{10,12}\s/ /gs; # Sendmail
1299
$val =~ s/\bid\s[a-zA-Z0-9]{7,20}\b/ /gs; # Sendmail
1300
$val =~ s/\bid\s[A-Za-z0-9]{7}-[A-Za-z0-9]{6}-0[A-Za-z0-9]/ /gs; # exim
1302
$val =~ s/(?:(?:Mon|Tue|Wed|Thu|Fri|Sat|Sun),\s)?
1304
(?:Jan|Feb|Ma[ry]|Apr|Ju[nl]|Aug|Sep|Oct|Nov|Dec)\s
1305
(?:19|20)?[0-9]{2}\s
1306
[0-2][0-9](?:\:[0-5][0-9]){1,2}\s
1307
(?:\s*\(|\)|\s*(?:[+-][0-9]{4})|\s*(?:UT|[A-Z]{2,3}T))*
1310
# IPs: break down to nearest /24, to reduce hapaxes -- EXCEPT for
1311
# IPs in the 10 and 192.168 ranges, they gets lots of significant tokens
1313
# also make a dup with the full IP, as fodder for
1314
# bayes_dump_to_trusted_networks: "H*r:ip*aaa.bbb.ccc.ddd"
1315
$val =~ s{\b(\d{1,3}\.)(\d{1,3}\.)(\d{1,3})(\.\d{1,3})\b}{
1316
if ($2 eq '10' || ($2 eq '192' && $3 eq '168')) {
1318
" ip*".$1.$2.$3.$4." ";
1321
" ip*".$1.$2.$3.$4." ";
1325
# trim these: they turn out as the most common tokens, but with a
1326
# prob of about .5. waste of space!
1327
$val =~ s/\b(?:with|from|for|SMTP|ESMTP)\b/ /g;
1332
sub _pre_chew_addr_header {
1333
my ($self, $val) = @_;
1336
my @addrs = $self->{main}->find_all_addrs_in_line ($val);
1339
push (@toks, $self->_tokenize_mail_addrs ($_));
1341
return join (' ', @toks);
1344
sub _tokenize_mail_addrs {
1345
my ($self, $addr) = @_;
1347
($addr =~ /(.+)\@(.+)$/) or return ();
1349
push(@toks, "U*".$1, "D*".$2);
1350
$_ = $2; while (s/^[^\.]+\.(.+)$/$1/gs) { push(@toks, "D*".$1); }
1355
###########################################################################
1357
# compute the probability that a token is spammish
1358
sub _compute_prob_for_token {
1359
my ($self, $token, $ns, $nn, $s, $n) = @_;
1361
# we allow the caller to give us the token information, just
1362
# to save a potentially expensive lookup
1363
if (!defined($s) || !defined($n)) {
1364
($s, $n, undef) = $self->{store}->tok_get ($token);
1367
return if ($s == 0 && $n == 0);
1369
if (!USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS) {
1370
return if ($s + $n < 10); # ignore low-freq tokens
1373
if (!$self->{use_hapaxes}) {
1374
return if ($s + $n < 2);
1377
return if ( $ns == 0 || $nn == 0 );
1379
my $ratios = ($s / $ns);
1380
my $ration = ($n / $nn);
1384
if ($ratios == 0 && $ration == 0) {
1385
warn "bayes: oops? ratios == ration == 0";
1388
$prob = ($ratios) / ($ration + $ratios);
1391
if (USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS) {
1392
# use Robinson's f(x) equation for low-n tokens, instead of just
1395
$prob = ($Mail::SpamAssassin::Bayes::Combine::FW_S_DOT_X + ($robn * $prob))
1397
($Mail::SpamAssassin::Bayes::Combine::FW_S_CONSTANT + $robn);
1400
# 'log_raw_counts' is used to log the raw data for the Bayes equations during
1401
# a mass-check, allowing the S and X constants to be optimized quickly
1402
# without requiring re-tokenization of the messages for each attempt. There's
1403
# really no need for this code to be uncommented in normal use, however. It
1404
# has never been publicly documented, so commenting it out is fine. ;)
1406
## if ($self->{log_raw_counts}) {
1407
## $self->{raw_counts} .= " s=$s,n=$n ";
1413
###########################################################################
1414
# If a token is neither hammy nor spammy, return 0.
1415
# For a spammy token, return the minimum number of additional ham messages
1416
# it would have had to appear in to no longer be spammy. Hammy tokens
1417
# are handled similarly. That's what the function does (at the time
1418
# of this writing, 31 July 2003, 16:02:55 CDT). It would be slightly
1419
# more useful if it returned the number of /additional/ ham messages
1420
# a spammy token would have to appear in to no longer be spammy but I
1421
# fear that might require the solution to a cubic equation, and I
1422
# just don't have the time for that now.
1424
sub _compute_declassification_distance {
1425
my ($self, $Ns, $Nn, $ns, $nn, $prob) = @_;
1427
return 0 if $ns == 0 && $nn == 0;
1429
if (!USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS) {return 0 if ($ns + $nn < 10);}
1430
if (!$self->{use_hapaxes}) {return 0 if ($ns + $nn < 2);}
1432
return 0 if $Ns == 0 || $Nn == 0;
1433
return 0 if abs( $prob - 0.5 ) <
1434
$Mail::SpamAssassin::Bayes::Combine::MIN_PROB_STRENGTH;
1436
my ($Na,$na,$Nb,$nb) = $prob > 0.5 ? ($Nn,$nn,$Ns,$ns) : ($Ns,$ns,$Nn,$nn);
1437
my $p = 0.5 - $Mail::SpamAssassin::Bayes::Combine::MIN_PROB_STRENGTH;
1439
return int( 1.0 - 1e-6 + $nb * $Na * $p / ($Nb * ( 1 - $p )) ) - $na
1440
unless USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS;
1442
my $s = $Mail::SpamAssassin::Bayes::Combine::FW_S_CONSTANT;
1443
my $sx = $Mail::SpamAssassin::Bayes::Combine::FW_S_DOT_X;
1444
my $a = $Nb * ( 1 - $p );
1445
my $b = $Nb * ( $sx + $nb * ( 1 - $p ) - $p * $s ) - $p * $Na * $nb;
1446
my $c = $Na * $nb * ( $sx - $p * ( $s + $nb ) );
1447
my $discrim = $b * $b - 4 * $a * $c;
1448
my $disc_max_0 = $discrim < 0 ? 0 : $discrim;
1449
my $dd_exact = ( 1.0 - 1e-6 + ( -$b + sqrt( $disc_max_0 ) ) / ( 2*$a ) ) - $na;
1451
# This shouldn't be necessary. Should not be < 1
1452
return $dd_exact < 1 ? 1 : int($dd_exact);
1455
###########################################################################
1457
sub _opportunistic_calls {
1458
my($self, $journal_only) = @_;
1460
# If we're not already tied, abort.
1461
if (!$self->{store}->db_readable()) {
1462
dbg("bayes: opportunistic call attempt failed, DB not readable");
1466
# Is an expire or sync running?
1467
my $running_expire = $self->{store}->get_running_expire_tok();
1468
if ( defined $running_expire && $running_expire+$OPPORTUNISTIC_LOCK_VALID > time() ) {
1469
dbg("bayes: opportunistic call attempt skipped, found fresh running expire magic token");
1473
# handle expiry and syncing
1474
if (!$journal_only && $self->{store}->expiry_due()) {
1475
dbg("bayes: opportunistic call found expiry due");
1477
# sync will bring the DB R/W as necessary, and the expire will remove
1478
# the running_expire token, may untie as well.
1479
$self->{main}->{bayes_scanner}->sync(1,1);
1481
elsif ( $self->{store}->sync_due() ) {
1482
dbg("bayes: opportunistic call found journal sync due");
1484
# sync will bring the DB R/W as necessary, may untie as well
1485
$self->{main}->{bayes_scanner}->sync(1,0);
1487
# We can only remove the running_expire token if we're doing R/W
1488
if ($self->{store}->db_writable()) {
1489
$self->{store}->remove_running_expire_tok();
1496
###########################################################################
1502
my $module = untaint_var($self->{conf}->{bayes_store_module});
1503
$module = 'Mail::SpamAssassin::BayesStore::DBM' if !$module;
1505
dbg("bayes: learner_new self=%s, bayes_store_module=%s", $self,$module);
1507
require '.$module.';
1508
$store = '.$module.'->new($self);
1511
my $eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat;
1512
die "bayes: learner_new $module new() failed: $eval_stat\n";
1515
dbg("bayes: learner_new: got store=%s", $store);
1516
$self->{store} = $store;
1521
###########################################################################
1523
sub bayes_report_make_list {
1524
my ($self, $pms, $info, $param) = @_;
1525
return "Tokens not available." unless defined $info;
1527
my ($limit,$fmt_arg,$more) = split /,/, ($param || '5');
1531
Short => 'Token: \"$t\"',
1532
compact => '$p-$D--$t',
1533
Compact => 'Probability $p -declassification distance $D (\"+\" means > 9) --token: \"$t\"',
1534
medium => '$p-$D-$N--$t',
1535
long => '$p-$d--${h}h-${s}s--${a}d--$t',
1536
Long => 'Probability $p -declassification distance $D --in ${h} ham messages -and ${s} spam messages --${a} days old--token:\"$t\"'
1539
my $raw_fmt = (!$fmt_arg ? '$p-$D--$t' : $formats{$fmt_arg});
1541
return "Invalid format, must be one of: ".join(",",keys %formats)
1542
unless defined $raw_fmt;
1544
my $fmt = '"'.$raw_fmt.'"';
1545
my $amt = $limit < @$info ? $limit : @$info;
1546
return "" unless $amt;
1548
my $ns = $pms->{bayes_nspam};
1549
my $nh = $pms->{bayes_nham};
1550
my $digit = sub { $_[0] > 9 ? "+" : $_[0] };
1554
my($t,$prob,$s,$h,$u) = @$_;
1555
my $a = int(($now - $u)/(3600 * 24));
1556
my $d = $self->_compute_declassification_distance($ns,$nh,$s,$h,$prob);
1557
my $p = sprintf "%.3f", $prob;
1559
my ($c,$o) = $prob < 0.5 ? ($h,$s) : ($s,$h);
1560
my ($D,$S,$H,$C,$O,$N) = map &$digit($_), ($d,$s,$h,$c,$o,$n);
1561
eval $fmt; ## no critic
1562
} @{$info}[0..$amt-1];