~ubuntu-branches/ubuntu/utopic/spamassassin/utopic-updates

« back to all changes in this revision

Viewing changes to lib/Mail/SpamAssassin/Plugin/Bayes.pm

Committer: Package Import Robot
Author(s): Noah Meyerhans
Date: 2014-02-14 22:45:15 UTC
mfrom: (0.8.1) (0.6.2) (5.1.22 sid)
Revision ID: package-import@ubuntu.com-20140214224515-z1es2twos8xh7n2y

Tags: 3.4.0-1

http://bugs.debian.org/738963

http://bugs.debian.org/738872

http://bugs.debian.org/738867

http://bugs.debian.org/738951

http://bugs.debian.org/738974

* New upstream version! (Closes: 738963, 738872, 738867)
* Scrub the environment when switching to the debian-spamd user in
  postinst and cron.daily. (Closes: 738951)
* Enhancements to postinst to better manage ownership of
  /var/lib/spamassassin, via Iain Lane <iain.lane@canonical.com>
  (Closes: 738974)

files added:
.pc/90_pod_cleanup/lib/Mail/SpamAssassin/Plugin

.pc/90_pod_cleanup/lib/Mail/SpamAssassin/Plugin/DCC.pm

.pc/90_pod_cleanup/lib/Mail/SpamAssassin/Plugin/DNSEval.pm

.pc/98_sa-compile-quiet

.pc/98_sa-compile-quiet/lib

.pc/98_sa-compile-quiet/lib/Mail

.pc/98_sa-compile-quiet/lib/Mail/SpamAssassin

.pc/98_sa-compile-quiet/lib/Mail/SpamAssassin/Plugin

.pc/98_sa-compile-quiet/lib/Mail/SpamAssassin/Plugin/BodyRuleBaseExtractor.pm

META.json

debian/patches/98_sa-compile-quiet

lib/Mail/SpamAssassin/BayesStore/Redis.pm

lib/Mail/SpamAssassin/Plugin/AskDNS.pm

lib/Mail/SpamAssassin/Util/TinyRedis.pm

pkgrules/10_hasbase.cf

pkgrules/20_mailspike.cf

rules/v340.pre

t/autolearn.t

t/autolearn_force.t

t/autolearn_force_fail.t

t/basic_lint_without_sandbox.t

t/data/spam/badmime3.txt

t/dnsbl_subtests.t

files removed:
.pc/50_sa-learn_fix_empty_list_handling

.pc/50_sa-learn_fix_empty_list_handling/sa-learn.raw

.pc/60_bug_684709

.pc/60_bug_684709/lib

.pc/60_bug_684709/lib/Mail

.pc/60_bug_684709/lib/Mail/SpamAssassin

.pc/60_bug_684709/lib/Mail/SpamAssassin/Message.pm

.pc/85_disable_SSLv2

.pc/85_disable_SSLv2/spamc

.pc/85_disable_SSLv2/spamc/libspamc.c

.pc/85_disable_SSLv2/spamc/libspamc.h

.pc/85_disable_SSLv2/spamc/spamc.c

.pc/85_disable_SSLv2/spamc/spamc.pod

.pc/85_disable_SSLv2/spamd

.pc/85_disable_SSLv2/spamd/spamd.raw

.pc/90_missing_tld

.pc/90_missing_tld/lib

.pc/90_missing_tld/lib/Mail

.pc/90_missing_tld/lib/Mail/SpamAssassin

.pc/90_missing_tld/lib/Mail/SpamAssassin/Util

.pc/90_missing_tld/lib/Mail/SpamAssassin/Util/RegistrarBoundaries.pm

.pc/90_missing_tld/t

.pc/90_missing_tld/t/uri_text.t

.pc/91_no_rfc_ignorant

.pc/91_no_rfc_ignorant/pkgrules

.pc/91_no_rfc_ignorant/pkgrules/20_dnsbl_tests.cf

.pc/91_no_rfc_ignorant/pkgrules/30_text_de.cf

.pc/91_no_rfc_ignorant/pkgrules/50_scores.cf

.pc/91_no_rfc_ignorant/pkgrules/local.cf

.pc/91_no_rfc_ignorant/rules

.pc/91_no_rfc_ignorant/rules/STATISTICS-set1.txt

.pc/91_no_rfc_ignorant/rules/STATISTICS-set3.txt

.pc/91_no_rfc_ignorant/rules/active.list

.pc/95_bug694504-spamdforkscaling-crash

.pc/95_bug694504-spamdforkscaling-crash/lib

.pc/95_bug694504-spamdforkscaling-crash/lib/Mail

.pc/95_bug694504-spamdforkscaling-crash/lib/Mail/SpamAssassin

.pc/95_bug694504-spamdforkscaling-crash/lib/Mail/SpamAssassin/Logger

.pc/95_bug694504-spamdforkscaling-crash/lib/Mail/SpamAssassin/Logger/Syslog.pm

.pc/95_bug694504-spamdforkscaling-crash/spamd

.pc/95_bug694504-spamdforkscaling-crash/spamd/spamd.raw

.pc/96_disable_njabl

.pc/96_disable_njabl/pkgrules

.pc/96_disable_njabl/pkgrules/20_dnsbl_tests.cf

.pc/96_disable_njabl/pkgrules/30_text_de.cf

.pc/96_disable_njabl/pkgrules/30_text_fr.cf

.pc/96_disable_njabl/pkgrules/30_text_nl.cf

.pc/96_disable_njabl/pkgrules/30_text_pl.cf

.pc/96_disable_njabl/pkgrules/50_scores.cf

.pc/98_bug721565-syntax-5.18

.pc/98_bug721565-syntax-5.18/lib

.pc/98_bug721565-syntax-5.18/lib/Mail

.pc/98_bug721565-syntax-5.18/lib/Mail/SpamAssassin

.pc/98_bug721565-syntax-5.18/lib/Mail/SpamAssassin/AsyncLoop.pm

.pc/98_bug721565-syntax-5.18/lib/Mail/SpamAssassin/Conf

.pc/98_bug721565-syntax-5.18/lib/Mail/SpamAssassin/Conf/Parser.pm

.pc/98_bug721565-syntax-5.18/lib/Mail/SpamAssassin/DnsResolver.pm

.pc/98_bug721565-syntax-5.18/lib/Mail/SpamAssassin/Message.pm

.pc/98_bug721565-syntax-5.18/lib/Mail/SpamAssassin/PerMsgStatus.pm

.pc/98_bug721565-syntax-5.18/lib/Mail/SpamAssassin/Util.pm

debian/patches/50_sa-learn_fix_empty_list_handling

debian/patches/60_bug_684709

debian/patches/85_disable_SSLv2

debian/patches/90_missing_tld

debian/patches/91_no_rfc_ignorant

debian/patches/95_bug694504-spamdforkscaling-crash

debian/patches/96_disable_njabl

debian/patches/98_bug721565-syntax-5.18

rules/STATISTICS-set0.txt

rules/STATISTICS-set1.txt

rules/STATISTICS-set2.txt

rules/STATISTICS-set3.txt

t/dkim2.t

t/spamd_ssl_v2.t

t/spamd_ssl_v23.t

files modified:
.pc/10_change_config_paths/INSTALL

.pc/10_change_config_paths/README

.pc/10_change_config_paths/UPGRADE

.pc/10_change_config_paths/lib/Mail/SpamAssassin/Conf.pm

.pc/10_change_config_paths/lib/Mail/SpamAssassin/Plugin/Test.pm

.pc/10_change_config_paths/lib/spamassassin-run.pod

.pc/10_change_config_paths/sa-compile.raw

.pc/10_change_config_paths/sa-learn.raw

.pc/10_change_config_paths/spamc/spamc.pod

.pc/10_change_config_paths/spamd/spamd.raw

.pc/10_change_config_paths/sql/README

.pc/20_edit_spamc_pod/spamc/spamc.pod

.pc/30_edit_README/README

.pc/55_disable_nagios_epm/sa-check_spamd.raw

.pc/90_pod_cleanup/lib/Mail/SpamAssassin/Conf.pm

.pc/97_bug720499-pod-5.18/sa-check_spamd.raw

.pc/applied-patches

CREDITS

Changes

INSTALL

MANIFEST

MANIFEST.SKIP

META.yml

Makefile.PL

README

UPGRADE

build/mkrules

build/parse-rules-for-masses

debian/changelog

debian/control

debian/patches/10_change_config_paths

debian/patches/20_edit_spamc_pod

debian/patches/90_pod_cleanup

debian/patches/series

debian/sa-compile.postinst

debian/spamassassin.cron.daily

debian/spamassassin.postinst

debian/tests/daemon *

debian/tests/get_spam_points.py *

lib/Mail/SpamAssassin.pm

lib/Mail/SpamAssassin/AICache.pm

lib/Mail/SpamAssassin/ArchiveIterator.pm

lib/Mail/SpamAssassin/AsyncLoop.pm

lib/Mail/SpamAssassin/AutoWhitelist.pm

lib/Mail/SpamAssassin/Bayes/CombineChi.pm

lib/Mail/SpamAssassin/Bayes/CombineNaiveBayes.pm

lib/Mail/SpamAssassin/BayesStore.pm

lib/Mail/SpamAssassin/BayesStore/BDB.pm

lib/Mail/SpamAssassin/BayesStore/DBM.pm

lib/Mail/SpamAssassin/BayesStore/MySQL.pm

lib/Mail/SpamAssassin/BayesStore/PgSQL.pm

lib/Mail/SpamAssassin/BayesStore/SQL.pm

lib/Mail/SpamAssassin/Client.pm

lib/Mail/SpamAssassin/Conf.pm

lib/Mail/SpamAssassin/Conf/LDAP.pm

lib/Mail/SpamAssassin/Conf/Parser.pm

lib/Mail/SpamAssassin/Conf/SQL.pm

lib/Mail/SpamAssassin/Constants.pm

lib/Mail/SpamAssassin/Dns.pm

lib/Mail/SpamAssassin/DnsResolver.pm

lib/Mail/SpamAssassin/HTML.pm

lib/Mail/SpamAssassin/Locales.pm

lib/Mail/SpamAssassin/Logger.pm

lib/Mail/SpamAssassin/Logger/File.pm

lib/Mail/SpamAssassin/Logger/Syslog.pm

lib/Mail/SpamAssassin/MailingList.pm

lib/Mail/SpamAssassin/Message.pm

lib/Mail/SpamAssassin/Message/Metadata.pm

lib/Mail/SpamAssassin/Message/Metadata/Received.pm

lib/Mail/SpamAssassin/Message/Node.pm

lib/Mail/SpamAssassin/NetSet.pm

lib/Mail/SpamAssassin/PerMsgStatus.pm

lib/Mail/SpamAssassin/Plugin.pm

lib/Mail/SpamAssassin/Plugin/ASN.pm

lib/Mail/SpamAssassin/Plugin/AWL.pm

lib/Mail/SpamAssassin/Plugin/AutoLearnThreshold.pm

lib/Mail/SpamAssassin/Plugin/Bayes.pm

lib/Mail/SpamAssassin/Plugin/BodyEval.pm

lib/Mail/SpamAssassin/Plugin/BodyRuleBaseExtractor.pm

lib/Mail/SpamAssassin/Plugin/Check.pm

lib/Mail/SpamAssassin/Plugin/DCC.pm

lib/Mail/SpamAssassin/Plugin/DKIM.pm

lib/Mail/SpamAssassin/Plugin/DNSEval.pm

lib/Mail/SpamAssassin/Plugin/FreeMail.pm

lib/Mail/SpamAssassin/Plugin/Hashcash.pm

lib/Mail/SpamAssassin/Plugin/HeaderEval.pm

lib/Mail/SpamAssassin/Plugin/ImageInfo.pm

lib/Mail/SpamAssassin/Plugin/MIMEEval.pm

lib/Mail/SpamAssassin/Plugin/PhishTag.pm

lib/Mail/SpamAssassin/Plugin/Pyzor.pm

lib/Mail/SpamAssassin/Plugin/Razor2.pm

lib/Mail/SpamAssassin/Plugin/RelayCountry.pm

lib/Mail/SpamAssassin/Plugin/Reuse.pm

lib/Mail/SpamAssassin/Plugin/SPF.pm

lib/Mail/SpamAssassin/Plugin/Test.pm

lib/Mail/SpamAssassin/Plugin/TextCat.pm

lib/Mail/SpamAssassin/Plugin/URIDNSBL.pm

lib/Mail/SpamAssassin/Plugin/URIEval.pm

lib/Mail/SpamAssassin/Plugin/VBounce.pm

lib/Mail/SpamAssassin/Plugin/WLBLEval.pm

lib/Mail/SpamAssassin/Plugin/WhiteListSubject.pm

lib/Mail/SpamAssassin/PluginHandler.pm

lib/Mail/SpamAssassin/Reporter.pm

lib/Mail/SpamAssassin/SQLBasedAddrList.pm

lib/Mail/SpamAssassin/SpamdForkScaling.pm

lib/Mail/SpamAssassin/Timeout.pm

lib/Mail/SpamAssassin/Util.pm

lib/Mail/SpamAssassin/Util/DependencyInfo.pm

lib/Mail/SpamAssassin/Util/Progress.pm

lib/Mail/SpamAssassin/Util/RegistrarBoundaries.pm

lib/Mail/SpamAssassin/Util/ScopedTimer.pm

lib/spamassassin-run.pod

pkgrules/10_default_prefs.cf

pkgrules/20_aux_tlds.cf

pkgrules/20_dnsbl_tests.cf

pkgrules/20_drugs.cf

pkgrules/20_dynrdns.cf

pkgrules/20_fake_helo_tests.cf

pkgrules/20_freemail.cf

pkgrules/20_freemail_domains.cf

pkgrules/20_head_tests.cf

pkgrules/20_html_tests.cf

pkgrules/20_phrases.cf

pkgrules/20_ratware.cf

pkgrules/20_uri_tests.cf

pkgrules/20_vbounce.cf

pkgrules/25_dcc.cf

pkgrules/25_replace.cf

pkgrules/25_spf.cf

pkgrules/25_uribl.cf

pkgrules/30_text_de.cf

pkgrules/30_text_fr.cf

pkgrules/30_text_nl.cf

pkgrules/30_text_pl.cf

pkgrules/30_text_pt_br.cf

pkgrules/50_scores.cf

pkgrules/60_adsp_override_dkim.cf

pkgrules/60_whitelist_dkim.cf

pkgrules/72_active.cf

pkgrules/72_scores.cf

pkgrules/73_sandbox_manual_scores.cf

pkgrules/STATISTICS-set0-72_scores.cf.txt

pkgrules/STATISTICS-set1-72_scores.cf.txt

pkgrules/local.cf

pkgrules/regression_tests.cf

rules/active.list

rules/init.pre

rules/local.cf

rules/regression_tests.cf

sa-awl.raw

sa-check_spamd.raw

sa-compile.raw

sa-learn.raw

sa-update.raw

spamassassin.raw

spamc/libspamc.c

spamc/libspamc.h

spamc/spamc.c

spamc/spamc.pod

spamc/utils.c

spamc/utils.h

spamd/netbsd-rc-script.sh

spamd/spamd.raw

sql/README

sql/README.bayes

sql/awl_mysql.sql

sql/bayes_mysql.sql

sql/userpref_pg.sql

t/SATest.pm

t/bayesbdb.t

t/bayesdbm.t

t/bayesdbm_flock.t

t/bayessdbm.t

t/bayessdbm_seen_delete.t

t/bayessql.t

t/config.dist

t/config_errs.t

t/cross_user_config_leak.t

t/data/01_test_rules.cf

t/dcc.t

t/debug.t

t/dkim.t

t/duplicates.t

t/html_colors.t

t/if_can.t

t/lang_pl_tests.t

t/mimeparse.t

t/mkrules_else.t *

t/priorities.t

t/rcvd_parser.t

t/re_base_extraction.t

t/rule_multiple.t

t/rule_names.t

t/sa_check_spamd.t

t/sa_compile.t

t/spamd_hup.t

t/spamd_maxsize.t

t/spamd_prefork_stress_4.t

t/spamd_protocol_10.t

t/spamd_sql_prefs.t

t/spamd_unix_and_tcp.t

t/spf.t

t/trust_path.t

t/uri_text.t

t/uribl.t

t/uribl_all_types.t

t/uribl_domains_only.t

t/uribl_ips_only.t

t/whitelist_addrs.t

Show diffs side-by-side

added added

removed removed

lib/Mail/SpamAssassin/Plugin/Bayes.pm

249

%{$self} = ();

250

}

251

252

###########################################################################

253

252

254

# Plugin hook.

253

255

# Return this implementation object, for callers that need to know

254

256

# it. TODO: callers shouldn't *need* to know it!

258

260

259

261

###########################################################################

260

262

263

# Plugin hook.

264

# Called in the parent process shortly before forking off child processes.

265

sub prefork_init {

266

my ($self) = @_;

267

268

if ($self->{store} && $self->{store}->UNIVERSAL::can('prefork_init')) {

269

$self->{store}->prefork_init;

270

}

271

}

272

273

###########################################################################

274

275

# Plugin hook.

276

# Called in a child process shortly after being spawned.

277

sub spamd_child_init {

278

my ($self) = @_;

279

280

if ($self->{store} && $self->{store}->UNIVERSAL::can('spamd_child_init')) {

281

$self->{store}->spamd_child_init;

282

}

283

}

284

285

###########################################################################

286

287

# Plugin hook.

261

288

sub check_bayes {

262

289

my ($self, $pms, $fulltext, $min, $max) = @_;

263

290

349

376

350

377

eval {

351

378

local $SIG{'__DIE__'}; # do not run user die() traps in here

379

my $timer = $self->{main}->time_method("b_learn");

352

380

353

381

my $ok;

354

382

if ($self->{main}->{learn_to_journal}) {

386

414

@msgid = $self->get_msgid($msg);

387

415

}

388

416

389

foreach $msgid ( @msgid ) {

390

my $seen = $self->{store}->seen_get ($msgid);

417

foreach my $msgid_t ( @msgid ) {

418

my $seen = $self->{store}->seen_get ($msgid_t);

391

419

392

420

if (defined ($seen)) {

393

421

if (($seen eq 's' && $isspam) || ($seen eq 'h' && !$isspam)) {

394

dbg("bayes: $msgid already learnt correctly, not learning twice");

422

dbg("bayes: $msgid_t already learnt correctly, not learning twice");

395

423

return 0;

396

424

} elsif ($seen !~ /^[hs]$/) {

397

warn("bayes: db_seen corrupt: value='$seen' for $msgid, ignored");

425

warn("bayes: db_seen corrupt: value='$seen' for $msgid_t, ignored");

398

426

} else {

399

427

# bug 3704: If the message was already learned, don't try learning it again.

400

428

# this prevents, for instance, manually learning as spam, then autolearning

401

429

# as ham, or vice versa.

402

430

if ($self->{main}->{learn_no_relearn}) {

403

dbg("bayes: $msgid already learnt as opposite, not re-learning");

431

dbg("bayes: $msgid_t already learnt as opposite, not re-learning");

404

432

return 0;

405

433

}

406

434

407

dbg("bayes: $msgid already learnt as opposite, forgetting first");

435

dbg("bayes: $msgid_t already learnt as opposite, forgetting first");

408

436

409

437

# kluge so that forget() won't untie the db on us ...

410

438

my $orig = $self->{main}->{learn_caller_will_untie};

431

459

# Now that we're sure we haven't seen this message before ...

432

460

$msgid = $msgid[0];

433

461

434

if ($isspam) {

435

$self->{store}->nspam_nham_change (1, 0);

436

} else {

437

$self->{store}->nspam_nham_change (0, 1);

438

}

439

440

462

my $msgatime = $msg->receive_date();

441

463

442

464

# If the message atime comes back as being more than 1 day in the

447

469

448

470

my $tokens = $self->tokenize($msg, $msgdata);

449

471

450

if ($isspam) {

451

$self->{store}->multi_tok_count_change(1, 0, $tokens, $msgatime);

452

} else {

453

$self->{store}->multi_tok_count_change(0, 1, $tokens, $msgatime);

472

{ my $timer = $self->{main}->time_method('b_count_change');

473

if ($isspam) {

474

$self->{store}->nspam_nham_change(1, 0);

475

$self->{store}->multi_tok_count_change(1, 0, $tokens, $msgatime);

476

} else {

477

$self->{store}->nspam_nham_change(0, 1);

478

$self->{store}->multi_tok_count_change(0, 1, $tokens, $msgatime);

479

}

454

480

}

455

481

456

482

$self->{store}->seen_put ($msgid, ($isspam ? 's' : 'h'));

484

510

# synchronously

485

511

eval {

486

512

local $SIG{'__DIE__'}; # do not run user die() traps in here

513

my $timer = $self->{main}->time_method("b_learn");

487

514

488

515

my $ok;

489

516

if ($self->{main}->{learn_to_journal}) {

666

693

667

694

dbg("bayes: corpus size: nspam = $ns, nham = $nn");

668

695

669

my $msgdata = $self->_get_msgdata_from_permsgstatus ($permsgstatus);

670

671

my $msgtokens = $self->tokenize($msg, $msgdata);

672

673

my $tokensdata = $self->{store}->tok_get_all(keys %{$msgtokens});

696

my $msgtokens;

697

{ my $timer = $self->{main}->time_method('b_tokenize');

698

my $msgdata = $self->_get_msgdata_from_permsgstatus ($permsgstatus);

699

$msgtokens = $self->tokenize($msg, $msgdata);

700

}

701

702

my $tokensdata;

703

{ my $timer = $self->{main}->time_method('b_tok_get_all');

704

$tokensdata = $self->{store}->tok_get_all(keys %{$msgtokens});

705

}

706

707

my $timer_compute_prob = $self->{main}->time_method('b_comp_prob');

708

709

my $probabilities_ref =

710

$self->_compute_prob_for_all_tokens($tokensdata, $ns, $nn);

674

711

675

712

my %pw;

676

677

713

foreach my $tokendata (@{$tokensdata}) {

714

my $prob = shift(@$probabilities_ref);

715

next unless defined $prob;

678

716

my ($token, $tok_spam, $tok_ham, $atime) = @{$tokendata};

679

my $prob = $self->_compute_prob_for_token($token, $ns, $nn, $tok_spam, $tok_ham);

680

next unless defined $prob;

681

682

717

$pw{$token} = {

683

718

prob => $prob,

684

719

spam_count => $tok_spam,

687

722

};

688

723

}

689

724

725

my @pw_keys = keys %pw;

726

690

727

# If none of the tokens were found in the DB, we're going to skip

691

728

# this message...

692

if (!keys %pw) {

729

if (!@pw_keys) {

693

730

dbg("bayes: cannot use bayes on this message; none of the tokens were found in the database");

694

731

goto skip;

695

732

}

696

733

697

734

my $tcount_total = keys %{$msgtokens};

698

my $tcount_learned = keys %pw;

735

my $tcount_learned = scalar @pw_keys;

699

736

700

737

# Figure out the message receive time (used as atime below)

701

738

# If the message atime comes back as being in the future, something's

705

742

my $now = time;

706

743

$msgatime = $now if ( $msgatime > $now );

707

744

708

# now take the $count most significant tokens and calculate probs using

709

# Robinson's formula.

710

my $count = N_SIGNIFICANT_TOKENS;

711

my @sorted;

712

713

745

my @touch_tokens;

714

746

my $tinfo_spammy = $permsgstatus->{bayes_token_info_spammy} = [];

715

747

my $tinfo_hammy = $permsgstatus->{bayes_token_info_hammy} = [];

716

748

717

my %tok_strength = map { $_ => (abs($pw{$_}->{prob} - 0.5)) } keys %pw;

749

my %tok_strength = map( ($_, abs($pw{$_}->{prob} - 0.5)), @pw_keys);

718

750

my $log_each_token = (would_log('dbg', 'bayes') > 1);

719

751

720

foreach my $tok (sort {

721

$tok_strength{$b} <=> $tok_strength{$a}

722

} keys %pw)

723

{

724

if ($count-- < 0) { last; }

725

next if ($tok_strength{$tok} <

726

$Mail::SpamAssassin::Bayes::Combine::MIN_PROB_STRENGTH);

727

728

my $pw = $pw{$tok}->{prob};

752

# now take the most significant tokens and calculate probs using

753

# Robinson's formula.

754

755

@pw_keys = sort { $tok_strength{$b} <=> $tok_strength{$a} } @pw_keys;

756

757

if (@pw_keys > N_SIGNIFICANT_TOKENS) { $#pw_keys = N_SIGNIFICANT_TOKENS - 1 }

758

759

my @sorted;

760

foreach my $tok (@pw_keys) {

761

next if $tok_strength{$tok} <

762

$Mail::SpamAssassin::Bayes::Combine::MIN_PROB_STRENGTH;

763

764

my $pw_tok = $pw{$tok};

765

my $pw_prob = $pw_tok->{prob};

729

766

730

767

# What's more expensive, scanning headers for HAMMYTOKENS and

731

768

# SPAMMYTOKENS tags that aren't there or collecting data that

732

769

# won't be used? Just collecting the data is certainly simpler.

733

770

734

771

my $raw_token = $msgtokens->{$tok} || "(unknown)";

735

my $s = $pw{$tok}->{spam_count};

736

my $n = $pw{$tok}->{ham_count};

737

my $a = $pw{$tok}->{atime};

738

739

if ($pw < 0.5) {

740

push @$tinfo_hammy, [$raw_token,$pw,$s,$n,$a];

741

} else {

742

push @$tinfo_spammy, [$raw_token,$pw,$s,$n,$a];

743

}

744

745

push (@sorted, $pw);

772

my $s = $pw_tok->{spam_count};

773

my $n = $pw_tok->{ham_count};

774

my $a = $pw_tok->{atime};

775

776

push( @{ $pw_prob < 0.5 ? $tinfo_hammy : $tinfo_spammy },

777

[$raw_token, $pw_prob, $s, $n, $a] );

778

779

push(@sorted, $pw_prob);

746

780

747

781

# update the atime on this token, it proved useful

748

782

push(@touch_tokens, $tok);

749

783

750

784

if ($log_each_token) {

751

dbg("bayes: token '$raw_token' => $pw");

785

dbg("bayes: token '$raw_token' => $pw_prob");

752

786

}

753

787

}

754

788

760

794

}

761

795

762

796

$score = Mail::SpamAssassin::Bayes::Combine::combine($ns, $nn, \@sorted);

797

undef $timer_compute_prob; # end a timing section

763

798

764

799

# Couldn't come up with a probability?

765

800

goto skip unless defined $score;

769

804

# no need to call tok_touch_all unless there were significant

770

805

# tokens and a score was returned

771

806

# we don't really care about the return value here

772

$self->{store}->tok_touch_all(\@touch_tokens, $msgatime);

807

808

{ my $timer = $self->{main}->time_method('b_tok_touch_all');

809

$self->{store}->tok_touch_all(\@touch_tokens, $msgatime);

810

}

811

812

my $timer_finish = $self->{main}->time_method('b_finish');

773

813

774

814

$permsgstatus->{bayes_nspam} = $ns;

775

815

$permsgstatus->{bayes_nham} = $nn;

790

830

dbg("bayes: not scoring message, returning undef");

791

831

}

792

832

833

undef $timer_compute_prob; # end a timing section if still running

834

if (!defined $timer_finish) {

835

$timer_finish = $self->{main}->time_method('b_finish');

836

}

837

793

838

# Take any opportunistic actions we can take

794

839

if ($self->{main}->{opportunistic_expire_check_only}) {

795

840

# we're supposed to report on expiry only -- so do the

820

865

$permsgstatus->set_tag ('BAYESTC', $tcount_total);

821

866

822

867

$permsgstatus->set_tag ('HAMMYTOKENS', sub {

868

my $pms = shift;

823

869

$self->bayes_report_make_list

824

($permsgstatus, $permsgstatus->{bayes_token_info_hammy}, shift);

870

($pms, $pms->{bayes_token_info_hammy}, shift);

825

871

});

826

872

827

873

$permsgstatus->set_tag ('SPAMMYTOKENS', sub {

874

my $pms = shift;

828

875

$self->bayes_report_make_list

829

($permsgstatus, $permsgstatus->{bayes_token_info_spammy}, shift);

876

($pms, $pms->{bayes_token_info_spammy}, shift);

830

877

});

831

878

832

879

$permsgstatus->set_tag ('TOKENSUMMARY', sub {

833

if ( defined $permsgstatus->{tag_data}{BAYESTC} )

880

my $pms = shift;

881

if ( defined $pms->{tag_data}{BAYESTC} )

834

882

{

835

my $tcount_neutral = $permsgstatus->{tag_data}{BAYESTCLEARNED}

836

- $permsgstatus->{tag_data}{BAYESTCSPAMMY}

837

- $permsgstatus->{tag_data}{BAYESTCHAMMY};

838

my $tcount_new = $permsgstatus->{tag_data}{BAYESTC}

839

- $permsgstatus->{tag_data}{BAYESTCLEARNED};

883

my $tcount_neutral = $pms->{tag_data}{BAYESTCLEARNED}

884

- $pms->{tag_data}{BAYESTCSPAMMY}

885

- $pms->{tag_data}{BAYESTCHAMMY};

886

my $tcount_new = $pms->{tag_data}{BAYESTC}

887

- $pms->{tag_data}{BAYESTCLEARNED};

840

888

"Tokens: new, $tcount_new; "

841

."hammy, $permsgstatus->{tag_data}{BAYESTCHAMMY}; "

889

."hammy, $pms->{tag_data}{BAYESTCHAMMY}; "

842

890

."neutral, $tcount_neutral; "

843

."spammy, $permsgstatus->{tag_data}{BAYESTCSPAMMY}."

891

."spammy, $pms->{tag_data}{BAYESTCSPAMMY}."

844

892

} else {

845

893

"Bayes not run.";

846

894

}

930

978

push(@msgid, $msgid);

931

979

}

932

980

933

# Use sha1_hex(Date:, last received: and top N bytes of body)

981

# Modified 2012-01-17 per bug 5185 to remove last received from msg_id calculation

982

983

# Use sha1_hex(Date: and top N bytes of body)

934

984

# where N is MIN(1024 bytes, 1/2 of body length)

935

985

936

986

my $date = $msg->get_header("Date");

937

987

$date = "None" if (!defined $date || $date eq ''); # No Date?

938

988

939

my @rcvd = $msg->get_header("Received");

940

my $rcvd = $rcvd[$#rcvd];

941

$rcvd = "None" if (!defined $rcvd || $rcvd eq ''); # No Received?

989

#Removed per bug 5185

990

#my @rcvd = $msg->get_header("Received");

991

#my $rcvd = $rcvd[$#rcvd];

992

#$rcvd = "None" if (!defined $rcvd || $rcvd eq ''); # No Received?

942

993

943

994

# Make a copy since pristine_body is a reference ...

944

995

my $body = join('', $msg->get_pristine_body());

996

945

997

if (length($body) > 64) { # Small Body?

946

998

my $keep = ( length $body > 2048 ? 1024 : int(length($body) / 2) );

947

999

substr($body, $keep) = '';

948

1000

}

949

1001

950

unshift(@msgid, sha1_hex($date."\000".$rcvd."\000".$body).'@sa_generated');

1002

#Stripping all CR and LF so that testing midstream from MTA and post delivery don't

1003

#generate different id's simply because of LF<->CR<->CRLF changes.

1004

$body =~ s/[\r\n]//g;

1005

1006

unshift(@msgid, sha1_hex($date."\000".$body).'@sa_generated');

951

1007

952

1008

return wantarray ? @msgid : $msgid[0];

953

1009

}

1352

1408

1353

1409

###########################################################################

1354

1410

1411

# compute the probability that a token is spammish for each token

1412

sub _compute_prob_for_all_tokens {

1413

my ($self, $tokensdata, $ns, $nn) = @_;

1414

my @probabilities;

1415

1416

return if !$ns || !$nn;

1417

1418

my $threshold = 1; # ignore low-freq tokens below this s+n threshold

1419

if (!USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS) {

1420

$threshold = 10;

1421

}

1422

if (!$self->{use_hapaxes}) {

1423

$threshold = 2;

1424

}

1425

1426

foreach my $tokendata (@{$tokensdata}) {

1427

my $s = $tokendata->[1]; # spam count

1428

my $n = $tokendata->[2]; # ham count

1429

my $prob;

1430

1431

no warnings 'uninitialized'; # treat undef as zero in addition

1432

if ($s + $n >= $threshold) {

1433

# ignoring low-freq tokens, also covers the (!$s && !$n) case

1434

1435

# my $ratios = $s / $ns;

1436

# my $ration = $n / $nn;

1437

# $prob = $ratios / ($ration + $ratios);

1438

1439

$prob = ($s * $nn) / ($n * $ns + $s * $nn); # same thing, faster

1440

1441

if (USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS) {

1442

# use Robinson's f(x) equation for low-n tokens, instead of just

1443

# ignoring them

1444

my $robn = $s + $n;

1445

$prob =

1446

($Mail::SpamAssassin::Bayes::Combine::FW_S_DOT_X + ($robn * $prob))

1447

/

1448

($Mail::SpamAssassin::Bayes::Combine::FW_S_CONSTANT + $robn);

1449

}

1450

}

1451

1452

# 'log_raw_counts' is used to log the raw data for the Bayes equations

1453

# during a mass-check, allowing the S and X constants to be optimized

1454

# quickly without requiring re-tokenization of the messages for each

1455

# attempt. There's really no need for this code to be uncommented in

1456

# normal use, however. It has never been publicly documented, so

1457

# commenting it out is fine. ;)

1458

1459

## if ($self->{log_raw_counts}) {

1460

## $self->{raw_counts} .= " s=$s,n=$n ";

1461

## }

1462

1463

push(@probabilities, $prob);

1464

}

1465

return \@probabilities;

1466

}

1467

1355

1468

# compute the probability that a token is spammish

1356

1469

sub _compute_prob_for_token {

1357

1470

my ($self, $token, $ns, $nn, $s, $n) = @_;

1359

1472

# we allow the caller to give us the token information, just

1360

1473

# to save a potentially expensive lookup

1361

1474

if (!defined($s) || !defined($n)) {

1362

($s, $n, undef) = $self->{store}->tok_get ($token);

1363

}

1364

1365

return if ($s == 0 && $n == 0);

1366

1367

if (!USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS) {

1368

return if ($s + $n < 10); # ignore low-freq tokens

1369

}

1370

1371

if (!$self->{use_hapaxes}) {

1372

return if ($s + $n < 2);

1373

}

1374

1375

return if ( $ns == 0 || $nn == 0 );

1376

1377

my $ratios = ($s / $ns);

1378

my $ration = ($n / $nn);

1379

1380

my $prob;

1381

1382

if ($ratios == 0 && $ration == 0) {

1383

warn "bayes: oops? ratios == ration == 0";

1384

return;

1385

} else {

1386

$prob = ($ratios) / ($ration + $ratios);

1387

}

1388

1389

if (USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS) {

1390

# use Robinson's f(x) equation for low-n tokens, instead of just

1391

# ignoring them

1392

my $robn = $s+$n;

1393

$prob = ($Mail::SpamAssassin::Bayes::Combine::FW_S_DOT_X + ($robn * $prob))

1394

/

1395

($Mail::SpamAssassin::Bayes::Combine::FW_S_CONSTANT + $robn);

1396

}

1397

1398

# 'log_raw_counts' is used to log the raw data for the Bayes equations during

1399

# a mass-check, allowing the S and X constants to be optimized quickly

1400

# without requiring re-tokenization of the messages for each attempt. There's

1401

# really no need for this code to be uncommented in normal use, however. It

1402

# has never been publicly documented, so commenting it out is fine. ;)

1403

1404

## if ($self->{log_raw_counts}) {

1405

## $self->{raw_counts} .= " s=$s,n=$n ";

1406

## }

1407

1408

return $prob;

1475

($s, $n, undef) = $self->{store}->tok_get($token);

1476

}

1477

return if !$s && !$n;

1478

1479

my $probabilities_ref =

1480

$self->_compute_prob_for_all_tokens([ [$token, $s, $n, 0] ], $ns, $nn);

1481

1482

return $probabilities_ref->[0];

1409

1483

}

1410

1484

1411

1485

###########################################################################

Older »