~ubuntu-branches/ubuntu/utopic/spamassassin/utopic-proposed

« back to all changes in this revision

Viewing changes to lib/Mail/SpamAssassin/Plugin/Bayes.pm

Committer: Bazaar Package Importer
Author(s): Noah Meyerhans
Date: 2010-01-26 22:53:12 UTC
mfrom: (1.1.13 upstream) (5.1.7 sid)
Revision ID: james.westby@ubuntu.com-20100126225312-wkftb10idc1kz2aq

Tags: 3.3.0-1

* New upstream version.
* Switch to dpkg-source 3.0 (quilt) format

files added:
META.yml

debian/patches/.dpkg-source-applied

debian/patches/10_change_config_paths

debian/patches/20_edit_spamc_pod

debian/patches/30_edit_README

debian/patches/50_sa-learn_fix_empty_list_handling

debian/patches/60_fix-pod

debian/patches/70_fix-whatis

debian/patches/80_fix_man_warnings

debian/patches/81_fix_man_speling

debian/patches/series

debian/source

debian/source/format

lib/Mail/SpamAssassin/BayesStore/BDB.pm

lib/Mail/SpamAssassin/Plugin/FreeMail.pm

lib/Mail/SpamAssassin/Plugin/PhishTag.pm

lib/Mail/SpamAssassin/Plugin/Reuse.pm

lib/Mail/SpamAssassin/Util/ScopedTimer.pm

pkgrules

pkgrules/10_default_prefs.cf

pkgrules/20_advance_fee.cf

pkgrules/20_body_tests.cf

pkgrules/20_compensate.cf

pkgrules/20_dnsbl_tests.cf

pkgrules/20_drugs.cf

pkgrules/20_dynrdns.cf

pkgrules/20_fake_helo_tests.cf

pkgrules/20_freemail.cf

pkgrules/20_freemail_domains.cf

pkgrules/20_head_tests.cf

pkgrules/20_html_tests.cf

pkgrules/20_imageinfo.cf

pkgrules/20_meta_tests.cf

pkgrules/20_net_tests.cf

pkgrules/20_phrases.cf

pkgrules/20_porn.cf

pkgrules/20_ratware.cf

pkgrules/20_uri_tests.cf

pkgrules/20_vbounce.cf

pkgrules/23_bayes.cf

pkgrules/25_accessdb.cf

pkgrules/25_antivirus.cf

pkgrules/25_asn.cf

pkgrules/25_dcc.cf

pkgrules/25_dkim.cf

pkgrules/25_hashcash.cf

pkgrules/25_pyzor.cf

pkgrules/25_razor2.cf

pkgrules/25_replace.cf

pkgrules/25_spf.cf

pkgrules/25_textcat.cf

pkgrules/25_uribl.cf

pkgrules/30_text_de.cf

pkgrules/30_text_fr.cf

pkgrules/30_text_it.cf

pkgrules/30_text_nl.cf

pkgrules/30_text_pl.cf

pkgrules/30_text_pt_br.cf

pkgrules/50_scores.cf

pkgrules/60_adsp_override_dkim.cf

pkgrules/60_awl.cf

pkgrules/60_shortcircuit.cf

pkgrules/60_whitelist.cf

pkgrules/60_whitelist_dkim.cf

pkgrules/60_whitelist_spf.cf

pkgrules/60_whitelist_subject.cf

pkgrules/72_active.cf

rules/v330.pre

sa-awl.raw

sa-check_spamd.raw

t/basic_meta.t

t/bayesbdb.t

t/config_tree_recurse.t

t/cross_user_config_leak.t

t/data/01_test_rules.pre

t/data/dkim

t/data/dkim/test-adsp-11.msg

t/data/dkim/test-adsp-12.msg

t/data/dkim/test-adsp-13.msg

t/data/dkim/test-adsp-14.msg

t/data/dkim/test-adsp-15.msg

t/data/dkim/test-adsp-16.msg

t/data/dkim/test-adsp-17.msg

t/data/dkim/test-adsp-18.msg

t/data/dkim/test-adsp-19.msg

t/data/dkim/test-adsp-20.msg

t/data/dkim/test-adsp-21.msg

t/data/dkim/test-adsp-22.msg

t/data/dkim/test-fail-01.msg

t/data/dkim/test-fail-02.msg

t/data/dkim/test-fail-03.msg

t/data/dkim/test-fail-04.msg

t/data/dkim/test-fail-05.msg

t/data/dkim/test-fail-06.msg

t/data/dkim/test-fail-07.msg

t/data/dkim/test-fail-08.msg

t/data/dkim/test-fail-09.msg

t/data/dkim/test-pass-01.msg

t/data/dkim/test-pass-02.msg

t/data/dkim/test-pass-03.msg

t/data/dkim/test-pass-04.msg

t/data/dkim/test-pass-05.msg

t/data/dkim/test-pass-06.msg

t/data/dkim/test-pass-07.msg

t/data/dkim/test-pass-08.msg

t/data/dkim/test-pass-09.msg

t/data/dkim/test-pass-10.msg

t/data/dkim/test-pass-11.msg

t/data/dkim/test-pass-12.msg

t/data/dkim/test-pass-13.msg

t/data/dkim/test-pass-14.msg

t/data/dkim/test-pass-15.msg

t/data/dkim/test-pass-16.msg

t/data/dkim/test-pass-17.msg

t/data/dkim/test-pass-18.msg

t/data/dkim/test-pass-19.msg

t/data/whitelists/ibm_enews_de

t/data/whitelists/infoworld

t/data/whitelists/netsol_renewal

t/data/whitelists/networkworld

t/db_awl_perms.t

t/dkim2.t

t/if_can.t

t/reuse.t

t/root_spamd_virtual.t

t/root_spamd_x_u.t

t/sa_awl.t

t/sa_check_spamd.t

t/sa_compile.t

t/spamc_bug6176.t

t/spamd_client.t

t/spamd_sql_prefs.t

t/spamd_whitelist_leak.t

t/stop_always_matching_regexps.t

t/timeout.t

files removed:
debian/patches/00list

debian/patches/05_fix_MakeMaker_DESTDIR.dpatch

debian/patches/10_change_config_paths.dpatch

debian/patches/20_edit_spamc_pod.dpatch

debian/patches/21_fix_spamc-undocumented-exit-status-98.dpatch

debian/patches/30_edit_README.dpatch

debian/patches/40_remote_dsbl.dpatch

debian/patches/41_remove_openwhois.dpatch

debian/patches/50_sa-learn_fix_empty_list_handling.dpatch

debian/patches/60_fix-pod.dpatch

debian/patches/70_fix-whatis.dpatch

debian/patches/80_fix_man_warnings.dpatch

debian/patches/81_fix_man_speling.dpatch

debian/patches/90_ipv6_header_support.dpatch

debian/patches/95_fix_FH_DATE_PAST_20XX.dpatch

lib/Mail/SpamAssassin/Plugin/DomainKeys.pm

rules/10_default_prefs.cf

rules/20_advance_fee.cf

rules/20_body_tests.cf

rules/20_compensate.cf

rules/20_dnsbl_tests.cf

rules/20_drugs.cf

rules/20_dynrdns.cf

rules/20_fake_helo_tests.cf

rules/20_head_tests.cf

rules/20_html_tests.cf

rules/20_imageinfo.cf

rules/20_meta_tests.cf

rules/20_net_tests.cf

rules/20_phrases.cf

rules/20_porn.cf

rules/20_ratware.cf

rules/20_uri_tests.cf

rules/20_vbounce.cf

rules/23_bayes.cf

rules/25_accessdb.cf

rules/25_antivirus.cf

rules/25_asn.cf

rules/25_dcc.cf

rules/25_dkim.cf

rules/25_domainkeys.cf

rules/25_hashcash.cf

rules/25_pyzor.cf

rules/25_razor2.cf

rules/25_replace.cf

rules/25_spf.cf

rules/25_textcat.cf

rules/25_uribl.cf

rules/30_text_de.cf

rules/30_text_fr.cf

rules/30_text_it.cf

rules/30_text_nl.cf

rules/30_text_pl.cf

rules/30_text_pt_br.cf

rules/50_scores.cf

rules/60_awl.cf

rules/60_shortcircuit.cf

rules/60_whitelist.cf

rules/60_whitelist_dk.cf

rules/60_whitelist_dkim.cf

rules/60_whitelist_spf.cf

rules/60_whitelist_subject.cf

rules/72_active.cf

rules/72_removed.cf

t/meta.t

t/rule_tests.t

files modified:
CREDITS

Changes

INSTALL

MANIFEST

MANIFEST.SKIP

Makefile.PL

PACKAGING

README

UPGRADE

USAGE

build/mkrules

build/parse-rules-for-masses

build/preprocessor

build/sha1sum.pl

debian/changelog

debian/control

debian/rules

ldap/README

lib/Mail/SpamAssassin.pm

lib/Mail/SpamAssassin/AICache.pm

lib/Mail/SpamAssassin/ArchiveIterator.pm

lib/Mail/SpamAssassin/AsyncLoop.pm

lib/Mail/SpamAssassin/AutoWhitelist.pm

lib/Mail/SpamAssassin/Bayes.pm

lib/Mail/SpamAssassin/Bayes/CombineChi.pm

lib/Mail/SpamAssassin/Bayes/CombineNaiveBayes.pm

lib/Mail/SpamAssassin/BayesStore.pm

lib/Mail/SpamAssassin/BayesStore/DBM.pm

lib/Mail/SpamAssassin/BayesStore/MySQL.pm

lib/Mail/SpamAssassin/BayesStore/PgSQL.pm

lib/Mail/SpamAssassin/BayesStore/SDBM.pm

lib/Mail/SpamAssassin/BayesStore/SQL.pm

lib/Mail/SpamAssassin/Client.pm

lib/Mail/SpamAssassin/Conf.pm

lib/Mail/SpamAssassin/Conf/LDAP.pm

lib/Mail/SpamAssassin/Conf/Parser.pm

lib/Mail/SpamAssassin/Conf/SQL.pm

lib/Mail/SpamAssassin/Constants.pm

lib/Mail/SpamAssassin/DBBasedAddrList.pm

lib/Mail/SpamAssassin/Dns.pm

lib/Mail/SpamAssassin/DnsResolver.pm

lib/Mail/SpamAssassin/HTML.pm

lib/Mail/SpamAssassin/Locales.pm

lib/Mail/SpamAssassin/Locker.pm

lib/Mail/SpamAssassin/Locker/Flock.pm

lib/Mail/SpamAssassin/Locker/UnixNFSSafe.pm

lib/Mail/SpamAssassin/Locker/Win32.pm

lib/Mail/SpamAssassin/Logger.pm

lib/Mail/SpamAssassin/Logger/File.pm

lib/Mail/SpamAssassin/Logger/Stderr.pm

lib/Mail/SpamAssassin/Logger/Syslog.pm

lib/Mail/SpamAssassin/MailingList.pm

lib/Mail/SpamAssassin/Message.pm

lib/Mail/SpamAssassin/Message/Metadata.pm

lib/Mail/SpamAssassin/Message/Metadata/Received.pm

lib/Mail/SpamAssassin/Message/Node.pm

lib/Mail/SpamAssassin/NetSet.pm

lib/Mail/SpamAssassin/PerMsgLearner.pm

lib/Mail/SpamAssassin/PerMsgStatus.pm

lib/Mail/SpamAssassin/PersistentAddrList.pm

lib/Mail/SpamAssassin/Plugin.pm

lib/Mail/SpamAssassin/Plugin/ASN.pm

lib/Mail/SpamAssassin/Plugin/AWL.pm

lib/Mail/SpamAssassin/Plugin/AccessDB.pm

lib/Mail/SpamAssassin/Plugin/AntiVirus.pm

lib/Mail/SpamAssassin/Plugin/AutoLearnThreshold.pm

lib/Mail/SpamAssassin/Plugin/Bayes.pm

lib/Mail/SpamAssassin/Plugin/BodyEval.pm

lib/Mail/SpamAssassin/Plugin/BodyRuleBaseExtractor.pm

lib/Mail/SpamAssassin/Plugin/Check.pm

lib/Mail/SpamAssassin/Plugin/DCC.pm

lib/Mail/SpamAssassin/Plugin/DKIM.pm

lib/Mail/SpamAssassin/Plugin/DNSEval.pm

lib/Mail/SpamAssassin/Plugin/HTMLEval.pm

lib/Mail/SpamAssassin/Plugin/HTTPSMismatch.pm

lib/Mail/SpamAssassin/Plugin/Hashcash.pm

lib/Mail/SpamAssassin/Plugin/HeaderEval.pm

lib/Mail/SpamAssassin/Plugin/ImageInfo.pm

lib/Mail/SpamAssassin/Plugin/MIMEEval.pm

lib/Mail/SpamAssassin/Plugin/MIMEHeader.pm

lib/Mail/SpamAssassin/Plugin/OneLineBodyRuleType.pm

lib/Mail/SpamAssassin/Plugin/Pyzor.pm

lib/Mail/SpamAssassin/Plugin/Razor2.pm

lib/Mail/SpamAssassin/Plugin/RelayCountry.pm

lib/Mail/SpamAssassin/Plugin/RelayEval.pm

lib/Mail/SpamAssassin/Plugin/ReplaceTags.pm

lib/Mail/SpamAssassin/Plugin/Rule2XSBody.pm

lib/Mail/SpamAssassin/Plugin/SPF.pm

lib/Mail/SpamAssassin/Plugin/Shortcircuit.pm

lib/Mail/SpamAssassin/Plugin/SpamCop.pm

lib/Mail/SpamAssassin/Plugin/Test.pm

lib/Mail/SpamAssassin/Plugin/TextCat.pm

lib/Mail/SpamAssassin/Plugin/URIDNSBL.pm

lib/Mail/SpamAssassin/Plugin/URIDetail.pm

lib/Mail/SpamAssassin/Plugin/URIEval.pm

lib/Mail/SpamAssassin/Plugin/VBounce.pm

lib/Mail/SpamAssassin/Plugin/WLBLEval.pm

lib/Mail/SpamAssassin/Plugin/WhiteListSubject.pm

lib/Mail/SpamAssassin/PluginHandler.pm

lib/Mail/SpamAssassin/Reporter.pm

lib/Mail/SpamAssassin/SQLBasedAddrList.pm

lib/Mail/SpamAssassin/SpamdForkScaling.pm

lib/Mail/SpamAssassin/SubProcBackChannel.pm

lib/Mail/SpamAssassin/Timeout.pm

lib/Mail/SpamAssassin/Util.pm

lib/Mail/SpamAssassin/Util/DependencyInfo.pm

lib/Mail/SpamAssassin/Util/Progress.pm

lib/Mail/SpamAssassin/Util/RegistrarBoundaries.pm

lib/Mail/SpamAssassin/Util/TieOneStringHash.pm

lib/spamassassin-run.pod

rules/STATISTICS-set0.txt

rules/STATISTICS-set1.txt

rules/STATISTICS-set2.txt

rules/STATISTICS-set3.txt

rules/active.list

rules/local.cf

rules/user_prefs.template

rules/v310.pre

rules/v312.pre

sa-compile.raw

sa-learn.raw

sa-update.raw

spamassassin.raw

spamassassin.spec

spamc/getopt.c

spamc/libspamc.c

spamc/libspamc.h

spamc/spamc.c

spamc/spamc.h.in

spamc/spamc.pod

spamc/utils.c

spamc/utils.h

spamd-apache2/bin/apache-spamd.pl

spamd-apache2/lib/Mail/SpamAssassin/Spamd.pm

spamd-apache2/lib/Mail/SpamAssassin/Spamd/Apache2.pm

spamd-apache2/lib/Mail/SpamAssassin/Spamd/Apache2/Config.pm

spamd/PROTOCOL

spamd/README

spamd/README.vpopmail

spamd/redhat-rc-script.sh

spamd/spamd.raw

sql/README

sql/README.awl

sql/README.bayes

sql/awl_mysql.sql

sql/awl_pg.sql

sql/bayes_mysql.sql

sql/bayes_pg.sql

t/SATest.pm

t/bayesdbm.t

t/bayesdbm_flock.t

t/bayessdbm.t

t/bayessdbm_seen_delete.t

t/bayessql.t

t/cidrs.t

t/config.dist

t/data/01_test_rules.cf

t/data/Dumpheaders.pm

t/data/spam/004

t/data/spam/008

t/data/spam/011

t/data/spam/dnsbl.eml

t/data/testplugin.pm

t/db_based_whitelist.t

t/db_based_whitelist_ips.t

t/dcc.t

t/debug.t

t/dnsbl.t

t/dnsbl_sc_meta.t

t/duplicates.t

t/get_all_headers.t *

t/get_headers.t

t/html_obfu.t

t/ip_addrs.t

t/lang_lint.t

t/line_endings.t

t/mimeparse.t

t/missing_hb_separator.t

t/mkrules.t

t/razor2.t

t/rcvd_parser.t

t/re_base_extraction.t

t/recreate.t

t/root_spamd_tell.t

t/root_spamd_tell_paranoid.t

t/root_spamd_tell_x.t

t/root_spamd_tell_x_paranoid.t

t/root_spamd_u.t *

t/root_spamd_u_dcc.t *

t/rule_names.t

t/sha1.t

t/shortcircuit.t

t/spamc_H.t *

t/spamc_x_E_R.t

t/spamc_x_e.t *

t/spamd_hup.t

t/spamd_kill_restart.t

t/spamd_kill_restart_rr.t

t/spamd_maxsize.t

t/spamd_prefork_stress.t

t/spamd_prefork_stress_2.t

t/spamd_prefork_stress_3.t

t/spamd_prefork_stress_4.t

t/spamd_protocol_10.t

t/spamd_unix.t

t/spamd_user_rules_leak.t

t/spf.t

t/strip2.t

t/strip_no_subject.t

t/trust_path.t

t/uri.t

t/uri_text.t

t/uribl.t

t/whitelist_addrs.t

t/whitelist_from.t

Show diffs side-by-side

added added

removed removed

lib/Mail/SpamAssassin/Plugin/Bayes.pm

# limitations under the License.

# </@LICENSE>

=head1 NAME

Mail::SpamAssassin::Plugin::Bayes - determine spammishness using a Bayesian classifier

=head1 DESCRIPTION

This is a Bayesian-style probabilistic classifier, using an algorithm based on

the one detailed in Paul Graham's I<A Plan For Spam> paper at:

http://www.paulgraham.com/spam.html

It also incorporates some other aspects taken from Graham Robinson's webpage

on the subject at:

http://radio.weblogs.com/0101454/stories/2002/09/16/spamDetection.html

And the chi-square probability combiner as described here:

http://www.linuxjournal.com/print.php?sid=6467

The results are incorporated into SpamAssassin as the BAYES_* rules.

=head1 METHODS

=over 4

=cut

package Mail::SpamAssassin::Plugin::Bayes;

use Mail::SpamAssassin::Plugin;

use strict;

use warnings;

use bytes;

use vars qw(@ISA);

@ISA = qw(Mail::SpamAssassin::Plugin);

# constructor: register the eval rule

use re 'taint';

BEGIN {

eval { require Digest::SHA; import Digest::SHA qw(sha1 sha1_hex); 1 }

or do { require Digest::SHA1; import Digest::SHA1 qw(sha1 sha1_hex) }

}

use Mail::SpamAssassin;

use Mail::SpamAssassin::Plugin;

use Mail::SpamAssassin::PerMsgStatus;

use Mail::SpamAssassin::Logger;

use Mail::SpamAssassin::Util qw(untaint_var);

# pick ONLY ONE of these combining implementations.

use Mail::SpamAssassin::Bayes::CombineChi;

# use Mail::SpamAssassin::Bayes::CombineNaiveBayes;

our @ISA = qw(Mail::SpamAssassin::Plugin);

use vars qw{

$IGNORED_HDRS

$MARK_PRESENCE_ONLY_HDRS

%HEADER_NAME_COMPRESSION

$OPPORTUNISTIC_LOCK_VALID

};

# Which headers should we scan for tokens? Don't use all of them, as it's easy

# to pick up spurious clues from some. What we now do is use all of them

# *less* these well-known headers; that way we can pick up spammers' tracking

# headers (which are obviously not well-known in advance!).

# Received is handled specially

$IGNORED_HDRS = qr{(?: (?:X-)?Sender # misc noise

|Delivered-To |Delivery-Date

|(?:X-)?Envelope-To

|X-MIME-Auto[Cc]onverted |X-Converted-To-Plain-Text

|Subject # not worth a tiny gain vs. to db size increase

# Date: can provide invalid cues if your spam corpus is

# older/newer than ham

|Date

# List headers: ignore. a spamfiltering mailing list will

# become a nonspam sign.

|X-List|(?:X-)?Mailing-List

|Name|Url)

100

|X-Unsub(?:scribe)?

101

|X-Mailman-Version |X-Been[Tt]here |X-Loop

102

|Mail-Followup-To

103

|X-eGroups-(?:Return|From)

104

|X-MDMailing-List

105

|X-XEmacs-List

106

107

# gatewayed through mailing list (thanks to Allen Smith)

108

|(?:X-)?Resent-(?:From|To|Date)

109

|(?:X-)?Original-(?:From|To|Date)

110

111

# Spamfilter/virus-scanner headers: too easy to chain from

112

# these

113

|X-MailScanner(?:-SpamCheck)?

114

115

|X-Antispam |X-RBL-Warning |X-Mailscanner

116

|X-MDaemon-Deliver-To |X-Virus-Scanned

117

|X-Mass-Check-Id

118

|X-Pyzor |X-DCC-\S{2,25}-Metrics

119

|X-Filtered-B[Yy] |X-Scanned-By |X-Scanner

120

|X-AP-Spam-(?:Score|Status) |X-RIPE-Spam-Status

121

|X-SpamCop-[^:]+

122

|X-SMTPD |(?:X-)?Spam-Apparently-To

123

|SPAM |X-Perlmx-Spam

124

|X-Bogosity

125

126

# some noisy Outlook headers that add no good clues:

127

|Content-Class |Thread-(?:Index|Topic)

128

|X-Original[Aa]rrival[Tt]ime

129

130

# Annotations from IMAP, POP, and MH:

131

132

|Lines |Content-Length

133

|X-UIDL? |X-IMAPbase

134

135

# Annotations from Bugzilla

136

|X-Bugzilla-[^:]+

137

138

# Annotations from VM: (thanks to Allen Smith)

139

140

|Summary-Format|VHeader|v\d-Data|Message-Order)

141

142

# Annotations from Gnus:

143

| X-Gnus-Mail-Source

144

| Xref

145

146

)}x;

147

148

# Note only the presence of these headers, in order to reduce the

149

# hapaxen they generate.

150

$MARK_PRESENCE_ONLY_HDRS = qr{(?: X-Face

151

|X-(?:Gnu-?PG|PGP|GPG)(?:-Key)?-Fingerprint

152

|D(?:KIM|omainKey)-Signature

153

)}ix;

154

155

# tweaks tested as of Nov 18 2002 by jm: see SpamAssassin-devel list archives

156

# for results. The winners are now the default settings.

157

use constant IGNORE_TITLE_CASE => 1;

158

use constant TOKENIZE_LONG_8BIT_SEQS_AS_TUPLES => 1;

159

use constant TOKENIZE_LONG_TOKENS_AS_SKIPS => 1;

160

161

# tweaks of May 12 2003, see SpamAssassin-devel archives again.

162

use constant PRE_CHEW_ADDR_HEADERS => 1;

163

use constant CHEW_BODY_URIS => 1;

164

use constant CHEW_BODY_MAILADDRS => 1;

165

use constant HDRS_TOKENIZE_LONG_TOKENS_AS_SKIPS => 1;

166

use constant BODY_TOKENIZE_LONG_TOKENS_AS_SKIPS => 1;

167

use constant URIS_TOKENIZE_LONG_TOKENS_AS_SKIPS => 0;

168

use constant IGNORE_MSGID_TOKENS => 0;

169

170

# tweaks of 12 March 2004, see bug 2129.

171

use constant DECOMPOSE_BODY_TOKENS => 1;

172

use constant MAP_HEADERS_MID => 1;

173

use constant MAP_HEADERS_FROMTOCC => 1;

174

use constant MAP_HEADERS_USERAGENT => 1;

175

176

# tweaks, see http://issues.apache.org/SpamAssassin/show_bug.cgi?id=3173#c26

177

use constant ADD_INVIZ_TOKENS_I_PREFIX => 1;

178

use constant ADD_INVIZ_TOKENS_NO_PREFIX => 0;

179

180

# We store header-mined tokens in the db with a "HHeaderName:val" format.

181

# some headers may contain lots of gibberish tokens, so allow a little basic

182

# compression by mapping the header name at least here. these are the headers

183

# which appear with the most frequency in my db. note: this doesn't have to

184

# be 2-way (ie. LHSes that map to the same RHS are not a problem), but mixing

185

# tokens from multiple different headers may impact accuracy, so might as well

186

# avoid this if possible. These are the top ones from my corpus, BTW (jm).

187

%HEADER_NAME_COMPRESSION = (

188

'Message-Id' => '*m',

189

'Message-ID' => '*M',

190

'Received' => '*r',

191

'User-Agent' => '*u',

192

'References' => '*f',

193

'In-Reply-To' => '*i',

194

'From' => '*F',

195

'Reply-To' => '*R',

196

'Return-Path' => '*p',

197

'Return-path' => '*rp',

198

'X-Mailer' => '*x',

199

'X-Authentication-Warning' => '*a',

200

'Organization' => '*o',

201

'Organisation' => '*o',

202

'Content-Type' => '*c',

203

'X-Spam-Relays-Trusted' => '*RT',

204

'X-Spam-Relays-Untrusted' => '*RU',

205

);

206

207

# How many seconds should the opportunistic_expire lock be valid?

208

$OPPORTUNISTIC_LOCK_VALID = 300;

209

210

# Should we use the Robinson f(w) equation from

211

# http://radio.weblogs.com/0101454/stories/2002/09/16/spamDetection.html ?

212

# It gives better results, in that scores are more likely to distribute

213

# into the <0.5 range for nonspam and >0.5 for spam.

214

use constant USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS => 1;

215

216

# How many of the most significant tokens should we use for the p(w)

217

# calculation?

218

use constant N_SIGNIFICANT_TOKENS => 150;

219

220

# How many significant tokens are required for a classifier score to

221

# be considered usable?

222

use constant REQUIRE_SIGNIFICANT_TOKENS_TO_SCORE => -1;

223

224

# How long a token should we hold onto? (note: German speakers typically

225

# will require a longer token than English ones.)

226

use constant MAX_TOKEN_LENGTH => 15;

227

228

###########################################################################

229

230

sub new {

231

my $class = shift;

my $mailsaobject = shift;

232

my ($main) = @_;

233

# some boilerplate...

234

$class = ref($class) || $class;

my $self = $class->SUPER::new($mailsaobject);

235

my $self = $class->SUPER::new($main);

236

bless ($self, $class);

237

# the important bit!

238

$self->{main} = $main;

239

$self->{conf} = $main->{conf};

240

$self->{use_ignores} = 1;

241

242

$self->register_eval_rule("check_bayes");

return $self;

}

243

$self;

244

}

245

246

sub finish {

247

my $self = shift;

248

if ($self->{store}) {

249

$self->{store}->untie_db();

250

}

251

%{$self} = ();

252

}

253

254

# Plugin hook.

255

# Return this implementation object, for callers that need to know

256

# it. TODO: callers shouldn't *need* to know it!

257

# used only in test suite to get access to {store}, internal APIs.

258

259

sub learner_get_implementation { return shift; }

260

261

###########################################################################

262

263

sub check_bayes {

264

my ($self, $pms, $fulltext, $min, $max) = @_;

265

266

return 0 if (!$pms->{conf}->{use_learner});

267

return 0 if (!$pms->{conf}->{use_bayes} || !$pms->{conf}->{use_bayes_rules});

268

269

if (!exists ($pms->{bayes_score})) {

$pms->{bayes_score} = $self->{main}->{bayes_scanner}->scan ($pms, $pms->{msg});

270

my $timer = $self->{main}->time_method("check_bayes");

271

$pms->{bayes_score} = $self->scan($pms, $pms->{msg});

272

}

273

274

if (defined $pms->{bayes_score} &&

289

return 0;

290

}

291

292

###########################################################################

293

294

# Plugin hook.

295

sub learner_close {

296

my ($self, $params) = @_;

297

my $quiet = $params->{quiet};

298

299

# do a sanity check here. Wierd things happen if we remain tied

300

# after compiling; for example, spamd will never see that the

301

# number of messages has reached the bayes-scanning threshold.

302

if ($self->{store}->db_readable()) {

303

warn "bayes: oops! still tied to bayes DBs, untying\n" unless $quiet;

304

$self->{store}->untie_db();

305

}

306

}

307

308

###########################################################################

309

310

# read configuration items to control bayes behaviour. Called by

311

# BayesStore::read_db_configs().

312

sub read_db_configs {

313

my ($self) = @_;

314

315

# use of hapaxes. Set on bayes object, since it controls prob

316

# computation.

317

$self->{use_hapaxes} = $self->{conf}->{bayes_use_hapaxes};

318

}

319

###########################################################################

320

321

sub ignore_message {

322

my ($self,$PMS) = @_;

323

324

return 0 unless $self->{use_ignores};

325

326

my $ig_from = $self->{main}->call_plugins ("check_wb_list",

327

{ permsgstatus => $PMS, type => 'from', list => 'bayes_ignore_from' });

328

my $ig_to = $self->{main}->call_plugins ("check_wb_list",

329

{ permsgstatus => $PMS, type => 'to', list => 'bayes_ignore_to' });

330

331

my $ignore = $ig_from || $ig_to;

332

333

dbg("bayes: not using bayes, bayes_ignore_from or _to rule") if $ignore;

334

335

return $ignore;

336

}

337

338

###########################################################################

339

340

# Plugin hook.

341

sub learn_message {

342

my ($self, $params) = @_;

343

my $isspam = $params->{isspam};

344

my $msg = $params->{msg};

345

my $id = $params->{id};

346

347

if (!$self->{conf}->{use_bayes}) { return; }

348

349

my $msgdata = $self->get_body_from_msg ($msg);

350

my $ret;

351

352

eval {

353

local $SIG{'__DIE__'}; # do not run user die() traps in here

354

355

my $ok;

356

if ($self->{main}->{learn_to_journal}) {

357

# If we're going to learn to journal, we'll try going r/o first...

358

# If that fails for some reason, let's try going r/w. This happens

359

# if the DB doesn't exist yet.

360

$ok = $self->{store}->tie_db_readonly() || $self->{store}->tie_db_writable();

361

} else {

362

$ok = $self->{store}->tie_db_writable();

363

}

364

365

if ($ok) {

366

$ret = $self->_learn_trapped ($isspam, $msg, $msgdata, $id);

367

368

if (!$self->{main}->{learn_caller_will_untie}) {

369

$self->{store}->untie_db();

370

}

371

}

372

373

} or do { # if we died, untie the dbs.

374

my $eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat;

375

$self->{store}->untie_db();

376

die "bayes: (in learn) $eval_stat\n";

377

};

378

379

return $ret;

380

}

381

382

# this function is trapped by the wrapper above

383

sub _learn_trapped {

384

my ($self, $isspam, $msg, $msgdata, $msgid) = @_;

385

my @msgid = ( $msgid );

386

387

if (!defined $msgid) {

388

@msgid = $self->get_msgid($msg);

389

}

390

391

foreach $msgid ( @msgid ) {

392

my $seen = $self->{store}->seen_get ($msgid);

393

394

if (defined ($seen)) {

395

if (($seen eq 's' && $isspam) || ($seen eq 'h' && !$isspam)) {

396

dbg("bayes: $msgid already learnt correctly, not learning twice");

397

return 0;

398

} elsif ($seen !~ /^[hs]$/) {

399

warn("bayes: db_seen corrupt: value='$seen' for $msgid, ignored");

400

} else {

401

# bug 3704: If the message was already learned, don't try learning it again.

402

# this prevents, for instance, manually learning as spam, then autolearning

403

# as ham, or visa versa.

404

if ($self->{main}->{learn_no_relearn}) {

405

dbg("bayes: $msgid already learnt as opposite, not re-learning");

406

return 0;

407

}

408

409

dbg("bayes: $msgid already learnt as opposite, forgetting first");

410

411

# kluge so that forget() won't untie the db on us ...

412

my $orig = $self->{main}->{learn_caller_will_untie};

413

$self->{main}->{learn_caller_will_untie} = 1;

414

415

my $fatal = !defined $self->{main}->{bayes_scanner}->forget ($msg);

416

417

# reset the value post-forget() ...

418

$self->{main}->{learn_caller_will_untie} = $orig;

419

420

# forget() gave us a fatal error, so propagate that up

421

if ($fatal) {

422

dbg("bayes: forget() returned a fatal error, so learn() will too");

423

return;

424

}

425

}

426

427

# we're only going to have seen this once, so stop if it's been

428

# seen already

429

last;

430

}

431

}

432

433

# Now that we're sure we haven't seen this message before ...

434

$msgid = $msgid[0];

435

436

if ($isspam) {

437

$self->{store}->nspam_nham_change (1, 0);

438

} else {

439

$self->{store}->nspam_nham_change (0, 1);

440

}

441

442

my $msgatime = $msg->receive_date();

443

444

# If the message atime comes back as being more than 1 day in the

445

# future, something's messed up and we should revert to current time as

446

# a safety measure.

447

448

$msgatime = time if ( $msgatime - time > 86400 );

449

450

my $tokens = $self->tokenize($msg, $msgdata);

451

452

if ($isspam) {

453

$self->{store}->multi_tok_count_change(1, 0, $tokens, $msgatime);

454

} else {

455

$self->{store}->multi_tok_count_change(0, 1, $tokens, $msgatime);

456

}

457

458

$self->{store}->seen_put ($msgid, ($isspam ? 's' : 'h'));

459

$self->{store}->cleanup();

460

461

$self->{main}->call_plugins("bayes_learn", { toksref => $tokens,

462

isspam => $isspam,

463

msgid => $msgid,

464

msgatime => $msgatime,

465

});

466

467

dbg("bayes: learned '$msgid', atime: $msgatime");

468

469

470

}

471

472

###########################################################################

473

474

# Plugin hook.

475

sub forget_message {

476

my ($self, $params) = @_;

477

my $msg = $params->{msg};

478

my $id = $params->{id};

479

480

if (!$self->{conf}->{use_bayes}) { return; }

481

482

my $msgdata = $self->get_body_from_msg ($msg);

483

my $ret;

484

485

# we still tie for writing here, since we write to the seen db

486

# synchronously

487

eval {

488

local $SIG{'__DIE__'}; # do not run user die() traps in here

489

490

my $ok;

491

if ($self->{main}->{learn_to_journal}) {

492

# If we're going to learn to journal, we'll try going r/o first...

493

# If that fails for some reason, let's try going r/w. This happens

494

# if the DB doesn't exist yet.

495

$ok = $self->{store}->tie_db_readonly() || $self->{store}->tie_db_writable();

496

} else {

497

$ok = $self->{store}->tie_db_writable();

498

}

499

500

if ($ok) {

501

$ret = $self->_forget_trapped ($msg, $msgdata, $id);

502

503

if (!$self->{main}->{learn_caller_will_untie}) {

504

$self->{store}->untie_db();

505

}

506

}

507

508

} or do { # if we died, untie the dbs.

509

my $eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat;

510

$self->{store}->untie_db();

511

die "bayes: (in forget) $eval_stat\n";

512

};

513

514

return $ret;

515

}

516

517

# this function is trapped by the wrapper above

518

sub _forget_trapped {

519

my ($self, $msg, $msgdata, $msgid) = @_;

520

my @msgid = ( $msgid );

521

my $isspam;

522

523

if (!defined $msgid) {

524

@msgid = $self->get_msgid($msg);

525

}

526

527

while( $msgid = shift @msgid ) {

528

my $seen = $self->{store}->seen_get ($msgid);

529

530

if (defined ($seen)) {

531

if ($seen eq 's') {

532

$isspam = 1;

533

} elsif ($seen eq 'h') {

534

$isspam = 0;

535

} else {

536

dbg("bayes: forget: msgid $msgid seen entry is neither ham nor spam, ignored");

537

return 0;

538

}

539

540

# messages should only be learned once, so stop if we find a msgid

541

# which was seen before

542

last;

543

}

544

else {

545

dbg("bayes: forget: msgid $msgid not learnt, ignored");

546

}

547

}

548

549

# This message wasn't learnt before, so return

550

if (!defined $isspam) {

551

dbg("bayes: forget: no msgid from this message has been learnt, skipping message");

552

return 0;

553

}

554

elsif ($isspam) {

555

$self->{store}->nspam_nham_change (-1, 0);

556

}

557

else {

558

$self->{store}->nspam_nham_change (0, -1);

559

}

560

561

my $tokens = $self->tokenize($msg, $msgdata);

562

563

if ($isspam) {

564

$self->{store}->multi_tok_count_change (-1, 0, $tokens);

565

} else {

566

$self->{store}->multi_tok_count_change (0, -1, $tokens);

567

}

568

569

$self->{store}->seen_delete ($msgid);

570

$self->{store}->cleanup();

571

572

$self->{main}->call_plugins("bayes_forget", { toksref => $tokens,

573

isspam => $isspam,

574

msgid => $msgid,

575

});

576

577

578

}

579

580

###########################################################################

581

582

# Plugin hook.

583

sub learner_sync {

584

my ($self, $params) = @_;

585

if (!$self->{conf}->{use_bayes}) { return 0; }

586

dbg("bayes: bayes journal sync starting");

587

$self->{store}->sync($params);

588

dbg("bayes: bayes journal sync completed");

589

}

590

591

###########################################################################

592

593

# Plugin hook.

594

sub learner_expire_old_training {

595

my ($self, $params) = @_;

596

if (!$self->{conf}->{use_bayes}) { return 0; }

597

dbg("bayes: expiry starting");

598

my $timer = $self->{main}->time_method("expire_bayes");

599

$self->{store}->expire_old_tokens($params);

600

dbg("bayes: expiry completed");

601

}

602

603

###########################################################################

604

605

# Plugin hook.

606

# Check to make sure we can tie() the DB, and we have enough entries to do a scan

607

# if we're told the caller will untie(), go ahead and leave the db tied.

608

sub learner_is_scan_available {

609

my ($self, $params) = @_;

610

611

return 0 unless $self->{conf}->{use_bayes};

612

return 0 unless $self->{store}->tie_db_readonly();

613

614

# We need the DB to stay tied, so if the journal sync occurs, don't untie!

615

my $caller_untie = $self->{main}->{learn_caller_will_untie};

616

$self->{main}->{learn_caller_will_untie} = 1;

617

618

# Do a journal sync if necessary. Do this before the nspam_nham_get()

619

# call since the sync may cause an update in the number of messages

620

# learnt.

621

$self->_opportunistic_calls(1);

622

623

# Reset the variable appropriately

624

$self->{main}->{learn_caller_will_untie} = $caller_untie;

625

626

my ($ns, $nn) = $self->{store}->nspam_nham_get();

627

628

if ($ns < $self->{conf}->{bayes_min_spam_num}) {

629

dbg("bayes: not available for scanning, only $ns spam(s) in bayes DB < ".$self->{conf}->{bayes_min_spam_num});

630

if (!$self->{main}->{learn_caller_will_untie}) {

631

$self->{store}->untie_db();

632

}

633

return 0;

634

}

635

if ($nn < $self->{conf}->{bayes_min_ham_num}) {

636

dbg("bayes: not available for scanning, only $nn ham(s) in bayes DB < ".$self->{conf}->{bayes_min_ham_num});

637

if (!$self->{main}->{learn_caller_will_untie}) {

638

$self->{store}->untie_db();

639

}

640

return 0;

641

}

642

643

return 1;

644

}

645

646

###########################################################################

647

648

sub scan {

649

my ($self, $permsgstatus, $msg) = @_;

650

my $score;

651

652

return unless $self->{conf}->{use_learner};

653

654

# When we're doing a scan, we'll guarantee that we'll do the untie,

655

# so override the global setting until we're done.

656

my $caller_untie = $self->{main}->{learn_caller_will_untie};

657

$self->{main}->{learn_caller_will_untie} = 1;

658

659

goto skip if ($self->{main}->{bayes_scanner}->ignore_message($permsgstatus));

660

661

goto skip unless $self->learner_is_scan_available();

662

663

my ($ns, $nn) = $self->{store}->nspam_nham_get();

664

665

## if ($self->{log_raw_counts}) { # see _compute_prob_for_token()

666

## $self->{raw_counts} = " ns=$ns nn=$nn ";

667

## }

668

669

dbg("bayes: corpus size: nspam = $ns, nham = $nn");

670

671

my $msgdata = $self->_get_msgdata_from_permsgstatus ($permsgstatus);

672

673

my $msgtokens = $self->tokenize($msg, $msgdata);

674

675

my $tokensdata = $self->{store}->tok_get_all(keys %{$msgtokens});

676

677

my %pw;

678

679

foreach my $tokendata (@{$tokensdata}) {

680

my ($token, $tok_spam, $tok_ham, $atime) = @{$tokendata};

681

my $prob = $self->_compute_prob_for_token($token, $ns, $nn, $tok_spam, $tok_ham);

682

next unless defined $prob;

683

684

$pw{$token} = {

685

prob => $prob,

686

spam_count => $tok_spam,

687

ham_count => $tok_ham,

688

atime => $atime

689

};

690

}

691

692

# If none of the tokens were found in the DB, we're going to skip

693

# this message...

694

if (!keys %pw) {

695

dbg("bayes: cannot use bayes on this message; none of the tokens were found in the database");

696

goto skip;

697

}

698

699

my $tcount_total = keys %{$msgtokens};

700

my $tcount_learned = keys %pw;

701

702

# Figure out the message receive time (used as atime below)

703

# If the message atime comes back as being in the future, something's

704

# messed up and we should revert to current time as a safety measure.

705

706

my $msgatime = $msg->receive_date();

707

my $now = time;

708

$msgatime = $now if ( $msgatime > $now );

709

710

# now take the $count most significant tokens and calculate probs using

711

# Robinson's formula.

712

my $count = N_SIGNIFICANT_TOKENS;

713

my @sorted;

714

715

my @touch_tokens;

716

my $tinfo_spammy = $permsgstatus->{bayes_token_info_spammy} = [];

717

my $tinfo_hammy = $permsgstatus->{bayes_token_info_hammy} = [];

718

719

my %tok_strength = map { $_ => (abs($pw{$_}->{prob} - 0.5)) } keys %pw;

720

my $log_each_token = (would_log('dbg', 'bayes') > 1);

721

722

foreach my $tok (sort {

723

$tok_strength{$b} <=> $tok_strength{$a}

724

} keys %pw)

725

{

726

if ($count-- < 0) { last; }

727

next if ($tok_strength{$tok} <

728

$Mail::SpamAssassin::Bayes::Combine::MIN_PROB_STRENGTH);

729

730

my $pw = $pw{$tok}->{prob};

731

732

# What's more expensive, scanning headers for HAMMYTOKENS and

733

# SPAMMYTOKENS tags that aren't there or collecting data that

734

# won't be used? Just collecting the data is certainly simpler.

735

736

my $raw_token = $msgtokens->{$tok} || "(unknown)";

737

my $s = $pw{$tok}->{spam_count};

738

my $n = $pw{$tok}->{ham_count};

739

my $a = $pw{$tok}->{atime};

740

741

if ($pw < 0.5) {

742

push @$tinfo_hammy, [$raw_token,$pw,$s,$n,$a];

743

} else {

744

push @$tinfo_spammy, [$raw_token,$pw,$s,$n,$a];

745

}

746

747

push (@sorted, $pw);

748

749

# update the atime on this token, it proved useful

750

push(@touch_tokens, $tok);

751

752

if ($log_each_token) {

753

dbg("bayes: token '$raw_token' => $pw");

754

}

755

}

756

757

if (!@sorted || (REQUIRE_SIGNIFICANT_TOKENS_TO_SCORE > 0 &&

758

$#sorted <= REQUIRE_SIGNIFICANT_TOKENS_TO_SCORE))

759

{

760

dbg("bayes: cannot use bayes on this message; not enough usable tokens found");

761

goto skip;

762

}

763

764

$score = Mail::SpamAssassin::Bayes::Combine::combine($ns, $nn, \@sorted);

765

766

# Couldn't come up with a probability?

767

goto skip unless defined $score;

768

769

dbg("bayes: score = $score");

770

771

# no need to call tok_touch_all unless there were significant

772

# tokens and a score was returned

773

# we don't really care about the return value here

774

$self->{store}->tok_touch_all(\@touch_tokens, $msgatime);

775

776

$permsgstatus->{bayes_nspam} = $ns;

777

$permsgstatus->{bayes_nham} = $nn;

778

779

## if ($self->{log_raw_counts}) { # see _compute_prob_for_token()

780

## print "#Bayes-Raw-Counts: $self->{raw_counts}\n";

781

## }

782

783

$self->{main}->call_plugins("bayes_scan", { toksref => $msgtokens,

784

probsref => \%pw,

785

score => $score,

786

msgatime => $msgatime,

787

significant_tokens => \@touch_tokens,

788

});

789

790

skip:

791

if (!defined $score) {

792

dbg("bayes: not scoring message, returning undef");

793

}

794

795

# Take any opportunistic actions we can take

796

if ($self->{main}->{opportunistic_expire_check_only}) {

797

# we're supposed to report on expiry only -- so do the

798

# _opportunistic_calls() run for the journal only.

799

$self->_opportunistic_calls(1);

800

$permsgstatus->{bayes_expiry_due} = $self->{store}->expiry_due();

801

}

802

else {

803

$self->_opportunistic_calls();

804

}

805

806

# Do any cleanup we need to do

807

$self->{store}->cleanup();

808

809

# Reset the value accordingly

810

$self->{main}->{learn_caller_will_untie} = $caller_untie;

811

812

# If our caller won't untie the db, we need to do it.

813

if (!$caller_untie) {

814

$self->{store}->untie_db();

815

}

816

817

$permsgstatus->set_tag ('BAYESTCHAMMY',

818

($tinfo_hammy ? scalar @{$tinfo_hammy} : 0));

819

$permsgstatus->set_tag ('BAYESTCSPAMMY',

820

($tinfo_spammy ? scalar @{$tinfo_spammy} : 0));

821

$permsgstatus->set_tag ('BAYESTCLEARNED', $tcount_learned);

822

$permsgstatus->set_tag ('BAYESTC', $tcount_total);

823

824

$permsgstatus->set_tag ('HAMMYTOKENS', sub {

825

$self->bayes_report_make_list

826

($permsgstatus, $permsgstatus->{bayes_token_info_hammy}, shift);

827

});

828

829

$permsgstatus->set_tag ('SPAMMYTOKENS', sub {

830

$self->bayes_report_make_list

831

($permsgstatus, $permsgstatus->{bayes_token_info_spammy}, shift);

832

});

833

834

$permsgstatus->set_tag ('TOKENSUMMARY', sub {

835

if( defined $self->{tag_data}{BAYESTC} )

836

{

837

my $tcount_neutral = $permsgstatus->{tag_data}{BAYESTCLEARNED}

838

- $permsgstatus->{tag_data}{BAYESTCSPAMMY}

839

- $permsgstatus->{tag_data}{BAYESTCHAMMY};

840

my $tcount_new = $permsgstatus->{tag_data}{BAYESTC}

841

- $permsgstatus->{tag_data}{BAYESTCLEARNED};

842

"Tokens: new, $tcount_new; "

843

."hammy, $permsgstatus->{tag_data}{BAYESTCHAMMY}; "

844

."neutral, $tcount_neutral; "

845

."spammy, $permsgstatus->{tag_data}{BAYESTCSPAMMY}."

846

} else {

847

"Bayes not run.";

848

}

849

});

850

851

852

return $score;

853

}

854

855

###########################################################################

856

857

# Plugin hook.

858

sub learner_dump_database {

859

my ($self, $params) = @_;

860

my $magic = $params->{magic};

861

my $toks = $params->{toks};

862

my $regex = $params->{regex};

863

864

# allow dump to occur even if use_bayes disables everything else ...

865

#return 0 unless $self->{conf}->{use_bayes};

866

return 0 unless $self->{store}->tie_db_readonly();

867

868

my @vars = $self->{store}->get_storage_variables();

869

870

my($sb,$ns,$nh,$nt,$le,$oa,$bv,$js,$ad,$er,$na) = @vars;

871

872

my $template = '%3.3f %10u %10u %10u %s'."\n";

873

874

if ( $magic ) {

875

printf($template, 0.0, 0, $bv, 0, 'non-token data: bayes db version')

876

or die "Error writing: $!";

877

printf($template, 0.0, 0, $ns, 0, 'non-token data: nspam')

878

or die "Error writing: $!";

879

printf($template, 0.0, 0, $nh, 0, 'non-token data: nham')

880

or die "Error writing: $!";

881

printf($template, 0.0, 0, $nt, 0, 'non-token data: ntokens')

882

or die "Error writing: $!";

883

printf($template, 0.0, 0, $oa, 0, 'non-token data: oldest atime')

884

or die "Error writing: $!";

885

if ( $bv >= 2 ) {

886

printf($template, 0.0, 0, $na, 0, 'non-token data: newest atime')

887

or die "Error writing: $!";

888

}

889

if ( $bv < 2 ) {

890

printf($template, 0.0, 0, $sb, 0, 'non-token data: current scan-count')

891

or die "Error writing: $!";

892

}

893

if ( $bv >= 2 ) {

894

printf($template, 0.0, 0, $js, 0, 'non-token data: last journal sync atime')

895

or die "Error writing: $!";

896

}

897

printf($template, 0.0, 0, $le, 0, 'non-token data: last expiry atime')

898

or die "Error writing: $!";

899

if ( $bv >= 2 ) {

900

printf($template, 0.0, 0, $ad, 0, 'non-token data: last expire atime delta')

901

or die "Error writing: $!";

902

903

printf($template, 0.0, 0, $er, 0, 'non-token data: last expire reduction count')

904

or die "Error writing: $!";

905

}

906

}

907

908

if ( $toks ) {

909

# let the store sort out the db_toks

910

$self->{store}->dump_db_toks($template, $regex, @vars);

911

}

912

913

if (!$self->{main}->{learn_caller_will_untie}) {

914

$self->{store}->untie_db();

915

}

916

return 1;

917

}

918

919

###########################################################################

920

# TODO: these are NOT public, but the test suite needs to call them.

921

922

sub get_msgid {

923

my ($self, $msg) = @_;

924

925

my @msgid;

926

927

my $msgid = $msg->get_header("Message-Id");

928

if (defined $msgid && $msgid ne '' && $msgid !~ /^\s*<\s*(?:\@sa_generated)?>.*$/) {

929

# remove \r and < and > prefix/suffixes

930

chomp $msgid;

931

$msgid =~ s/^<//; $msgid =~ s/>.*$//g;

932

push(@msgid, $msgid);

933

}

934

935

# Use sha1_hex(Date:, last received: and top N bytes of body)

936

# where N is MIN(1024 bytes, 1/2 of body length)

937

938

my $date = $msg->get_header("Date");

939

$date = "None" if (!defined $date || $date eq ''); # No Date?

940

941

my @rcvd = $msg->get_header("Received");

942

my $rcvd = $rcvd[$#rcvd];

943

$rcvd = "None" if (!defined $rcvd || $rcvd eq ''); # No Received?

944

945

# Make a copy since pristine_body is a reference ...

946

my $body = join('', $msg->get_pristine_body());

947

if (length($body) > 64) { # Small Body?

948

my $keep = ( length $body > 2048 ? 1024 : int(length($body) / 2) );

949

substr($body, $keep) = '';

950

}

951

952

unshift(@msgid, sha1_hex($date."\000".$rcvd."\000".$body).'@sa_generated');

953

954

return wantarray ? @msgid : $msgid[0];

955

}

956

957

sub get_body_from_msg {

958

my ($self, $msg) = @_;

959

960

if (!ref $msg) {

961

# I have no idea why this seems to happen. TODO

962

warn "bayes: msg not a ref: '$msg'";

963

return { };

964

}

965

966

my $permsgstatus =

967

Mail::SpamAssassin::PerMsgStatus->new($self->{main}, $msg);

968

$msg->extract_message_metadata ($permsgstatus);

969

my $msgdata = $self->_get_msgdata_from_permsgstatus ($permsgstatus);

970

$permsgstatus->finish();

971

972

if (!defined $msgdata) {

973

# why?!

974

warn "bayes: failed to get body for ".scalar($self->get_msgid($self->{msg}))."\n";

975

return { };

976

}

977

978

return $msgdata;

979

}

980

981

sub _get_msgdata_from_permsgstatus {

982

my ($self, $msg) = @_;

983

984

my $msgdata = { };

985

$msgdata->{bayes_token_body} = $msg->{msg}->get_visible_rendered_body_text_array();

986

$msgdata->{bayes_token_inviz} = $msg->{msg}->get_invisible_rendered_body_text_array();

987

@{$msgdata->{bayes_token_uris}} = $msg->get_uri_list();

988

return $msgdata;

989

}

990

991

###########################################################################

992

993

# The calling functions expect a uniq'ed array of tokens ...

994

sub tokenize {

995

my ($self, $msg, $msgdata) = @_;

996

997

# the body

998

my @tokens = map { $self->_tokenize_line ($_, '', 1) }

999

@{$msgdata->{bayes_token_body}};

1000

1001

# the URI list

1002

push (@tokens, map { $self->_tokenize_line ($_, '', 2) }

1003

@{$msgdata->{bayes_token_uris}});

1004

1005

# add invisible tokens

1006

if (ADD_INVIZ_TOKENS_I_PREFIX) {

1007

push (@tokens, map { $self->_tokenize_line ($_, "I*:", 1) }

1008

@{$msgdata->{bayes_token_inviz}});

1009

}

1010

if (ADD_INVIZ_TOKENS_NO_PREFIX) {

1011

push (@tokens, map { $self->_tokenize_line ($_, "", 1) }

1012

@{$msgdata->{bayes_token_inviz}});

1013

}

1014

1015

# Tokenize the headers

1016

my %hdrs = $self->_tokenize_headers ($msg);

1017

while( my($prefix, $value) = each %hdrs ) {

1018

push(@tokens, $self->_tokenize_line ($value, "H$prefix:", 0));

1019

}

1020

1021

# Go ahead and uniq the array, skip null tokens (can happen sometimes)

1022

# generate an SHA1 hash and take the lower 40 bits as our token

1023

my %tokens;

1024

foreach my $token (@tokens) {

1025

next unless length($token); # skip 0 length tokens

1026

$tokens{substr(sha1($token), -5)} = $token;

1027

}

1028

1029

# return the keys == tokens ...

1030

return \%tokens;

1031

}

1032

1033

sub _tokenize_line {

1034

my $self = $_[0];

1035

my $tokprefix = $_[2];

1036

my $region = $_[3];

1037

local ($_) = $_[1];

1038

1039

my @rettokens;

1040

1041

# include quotes, .'s and -'s for URIs, and [$,]'s for Nigerian-scam strings,

1042

# and ISO-8859-15 alphas. Do not split on @'s; better results keeping it.

1043

# Some useful tokens: "$31,000,000" "www.clock-speed.net" "f*ck" "Hits!"

1044

tr/-A-Za-z0-9,\@\*\!_'"\$.\241-\377 / /cs;

1045

1046

# DO split on "..." or "--" or "---"; common formatting error resulting in

1047

# hapaxes. Keep the separator itself as a token, though, as long ones can

1048

# be good spamsigns.

1049

s/(\w)(\.{3,6})(\w)/$1 $2 $3/gs;

1050

s/(\w)(\-{2,6})(\w)/$1 $2 $3/gs;

1051

1052

if (IGNORE_TITLE_CASE) {

1053

if ($region == 1 || $region == 2) {

1054

# lower-case Title Case at start of a full-stop-delimited line (as would

1055

# be seen in a Western language).

1056

s/(?:^|\.\s+)([A-Z])([^A-Z]+)(?:\s|$)/ ' '. (lc $1) . $2 . ' ' /ge;

1057

}

1058

}

1059

1060

my $magic_re = $self->{store}->get_magic_re();

1061

1062

foreach my $token (split) {

1063

$token =~ s/^[-'"\.,]+//; # trim non-alphanum chars at start or end

1064

$token =~ s/[-'"\.,]+$//; # so we don't get loads of '"foo' tokens

1065

1066

# Skip false magic tokens

1067

# TVD: we need to do a defined() check since SQL doesn't have magic

1068

# tokens, so the SQL BayesStore returns undef. I really want a way

1069

# of optimizing that out, but I haven't come up with anything yet.

1070

1071

next if ( defined $magic_re && $token =~ /$magic_re/ );

1072

1073

# *do* keep 3-byte tokens; there's some solid signs in there

1074

my $len = length($token);

1075

1076

# but extend the stop-list. These are squarely in the gray

1077

# area, and it just slows us down to record them.

1078

# See http://wiki.apache.org/spamassassin/BayesStopList for more info.

1079

1080

next if $len < 3 ||

1081

($token =~ /^(?:a(?:ble|l(?:ready|l)|n[dy]|re)|b(?:ecause|oth)|c(?:an|ome)|e(?:ach|mail|ven)|f(?:ew|irst|or|rom)|give|h(?:a(?:ve|s)|ttp)|i(?:n(?:formation|to)|t\'s)|just|know|l(?:ike|o(?:ng|ok))|m(?:a(?:de|il(?:(?:ing|to))?|ke|ny)|o(?:re|st)|uch)|n(?:eed|o[tw]|umber)|o(?:ff|n(?:ly|e)|ut|wn)|p(?:eople|lace)|right|s(?:ame|ee|uch)|t(?:h(?:at|is|rough|e)|ime)|using|w(?:eb|h(?:ere|y)|ith(?:out)?|or(?:ld|k))|y(?:ears?|ou(?:(?:\'re|r))?))$/i);

1082

1083

# are we in the body? If so, apply some body-specific breakouts

1084

if ($region == 1 || $region == 2) {

1085

if (CHEW_BODY_MAILADDRS && $token =~ /\S\@\S/i) {

1086

push (@rettokens, $self->_tokenize_mail_addrs ($token));

1087

}

1088

elsif (CHEW_BODY_URIS && $token =~ /\S\.[a-z]/i) {

1089

push (@rettokens, "UD:".$token); # the full token

1090

my $bit = $token; while ($bit =~ s/^[^\.]+\.(.+)$/$1/gs) {

1091

push (@rettokens, "UD:".$1); # UD = URL domain

1092

}

1093

}

1094

}

1095

1096

# note: do not trim down overlong tokens if they contain '*'. This is

1097

# used as part of split tokens such as "HTo:D*net" indicating that

1098

# the domain ".net" appeared in the To header.

1099

1100

if ($len > MAX_TOKEN_LENGTH && $token !~ /\*/) {

1101

if (TOKENIZE_LONG_8BIT_SEQS_AS_TUPLES && $token =~ /[\xa0-\xff]{2}/) {

1102

# Matt sez: "Could be asian? Autrijus suggested doing character ngrams,

1103

# but I'm doing tuples to keep the dbs small(er)." Sounds like a plan

1104

# to me! (jm)

1105

while ($token =~ s/^(..?)//) {

1106

push (@rettokens, "8:$1");

1107

}

1108

next;

1109

}

1110

1111

if (($region == 0 && HDRS_TOKENIZE_LONG_TOKENS_AS_SKIPS)

1112

|| ($region == 1 && BODY_TOKENIZE_LONG_TOKENS_AS_SKIPS)

1113

|| ($region == 2 && URIS_TOKENIZE_LONG_TOKENS_AS_SKIPS))

1114

{

1115

# if (TOKENIZE_LONG_TOKENS_AS_SKIPS)

1116

# Spambayes trick via Matt: Just retain 7 chars. Do not retain

1117

# the length, it does not help; see my mail to -devel of Nov 20 2002.

1118

# "sk:" stands for "skip".

1119

$token = "sk:".substr($token, 0, 7);

1120

}

1121

}

1122

1123

# decompose tokens? do this after shortening long tokens

1124

if ($region == 1 || $region == 2) {

1125

if (DECOMPOSE_BODY_TOKENS) {

1126

if ($token =~ /[^\w:\*]/) {

1127

my $decompd = $token; # "Foo!"

1128

$decompd =~ s/[^\w:\*]//gs;

1129

push (@rettokens, $tokprefix.$decompd); # "Foo"

1130

}

1131

1132

if ($token =~ /[A-Z]/) {

1133

my $decompd = $token; $decompd = lc $decompd;

1134

push (@rettokens, $tokprefix.$decompd); # "foo!"

1135

1136

if ($token =~ /[^\w:\*]/) {

1137

$decompd =~ s/[^\w:\*]//gs;

1138

push (@rettokens, $tokprefix.$decompd); # "foo"

1139

}

1140

}

1141

}

1142

}

1143

1144

push (@rettokens, $tokprefix.$token);

1145

}

1146

1147

return @rettokens;

1148

}

1149

1150

sub _tokenize_headers {

1151

my ($self, $msg) = @_;

1152

1153

my %parsed;

1154

1155

my %user_ignore;

1156

$user_ignore{lc $_} = 1 for @{$self->{main}->{conf}->{bayes_ignore_headers}};

1157

1158

# get headers in array context

1159

my @hdrs;

1160

my @rcvdlines;

1161

for ($msg->get_all_headers()) {

1162

# first, keep a copy of Received headers, so we can strip down to last 2

1163

if (/^Received:/i) {

1164

push(@rcvdlines, $_);

1165

next;

1166

}

1167

# and now skip lines for headers we don't want (including all Received)

1168

next if /^${IGNORED_HDRS}:/i;

1169

next if IGNORE_MSGID_TOKENS && /^Message-ID:/i;

1170

push(@hdrs, $_);

1171

}

1172

push(@hdrs, $msg->get_all_metadata());

1173

1174

# and re-add the last 2 received lines: usually a good source of

1175

# spamware tokens and HELO names.

1176

if ($#rcvdlines >= 0) { push(@hdrs, $rcvdlines[$#rcvdlines]); }

1177

if ($#rcvdlines >= 1) { push(@hdrs, $rcvdlines[$#rcvdlines-1]); }

1178

1179

for (@hdrs) {

1180

next unless /\S/;

1181

my ($hdr, $val) = split(/:/, $_, 2);

1182

1183

# remove user-specified headers here, after Received, in case they

1184

# want to ignore that too

1185

next if exists $user_ignore{lc $hdr};

1186

1187

# Prep the header value

1188

$val ||= '';

1189

chomp($val);

1190

1191

# special tokenization for some headers:

1192

if ($hdr =~ /^(?:|X-|Resent-)Message-Id$/i) {

1193

$val = $self->_pre_chew_message_id ($val);

1194

}

1195

elsif (PRE_CHEW_ADDR_HEADERS && $hdr =~ /^(?:|X-|Resent-)

1196

1197

{

1198

$val = $self->_pre_chew_addr_header ($val);

1199

}

1200

elsif ($hdr eq 'Received') {

1201

$val = $self->_pre_chew_received ($val);

1202

}

1203

elsif ($hdr eq 'Content-Type') {

1204

$val = $self->_pre_chew_content_type ($val);

1205

}

1206

elsif ($hdr eq 'MIME-Version') {

1207

$val =~ s/1\.0//; # totally innocuous

1208

}

1209

elsif ($hdr =~ /^${MARK_PRESENCE_ONLY_HDRS}$/i) {

1210

$val = "1"; # just mark the presence, they create lots of hapaxen

1211

}

1212

1213

if (MAP_HEADERS_MID) {

1214

if ($hdr =~ /^(?:In-Reply-To|References|Message-ID)$/i) {

1215

$parsed{"*MI"} = $val;

1216

}

1217

}

1218

if (MAP_HEADERS_FROMTOCC) {

1219

if ($hdr =~ /^(?:From|To|Cc)$/i) {

1220

$parsed{"*Ad"} = $val;

1221

}

1222

}

1223

if (MAP_HEADERS_USERAGENT) {

1224

if ($hdr =~ /^(?:X-Mailer|User-Agent)$/i) {

1225

$parsed{"*UA"} = $val;

1226

}

1227

}

1228

1229

# replace hdr name with "compressed" version if possible

1230

if (defined $HEADER_NAME_COMPRESSION{$hdr}) {

1231

$hdr = $HEADER_NAME_COMPRESSION{$hdr};

1232

}

1233

1234

if (exists $parsed{$hdr}) {

1235

$parsed{$hdr} .= " ".$val;

1236

} else {

1237

$parsed{$hdr} = $val;

1238

}

1239

if (would_log('dbg', 'bayes') > 1) {

1240

dbg("bayes: header tokens for $hdr = \"$parsed{$hdr}\"");

1241

}

1242

}

1243

1244

return %parsed;

1245

}

1246

1247

sub _pre_chew_content_type {

1248

my ($self, $val) = @_;

1249

1250

# hopefully this will retain good bits without too many hapaxen

1251

if ($val =~ s/boundary=[\"\'](.*?)[\"\']/ /ig) {

1252

my $boundary = $1;

1253

$boundary = '' if !defined $boundary; # avoid a warning

1254

$boundary =~ s/[a-fA-F0-9]/H/gs;

1255

# break up blocks of separator chars so they become their own tokens

1256

$boundary =~ s/([-_\.=]+)/ $1 /gs;

1257

$val .= $boundary;

1258

}

1259

1260

# stop-list words for Content-Type header: these wind up totally gray

1261

$val =~ s/\b(?:text|charset)\b//;

1262

1263

$val;

1264

}

1265

1266

sub _pre_chew_message_id {

1267

my ($self, $val) = @_;

1268

# we can (a) get rid of a lot of hapaxen and (b) increase the token

1269

# specificity by pre-parsing some common formats.

1270

1271

# Outlook Express format:

1272

$val =~ s/<([0-9a-f]{4})[0-9a-f]{4}[0-9a-f]{4}\$

1273

([0-9a-f]{4})[0-9a-f]{4}\$

1274

([0-9a-f]{8})\@(\S+)>/ OEA$1 OEB$2 OEC$3 $4 /gx;

1275

1276

# Exim:

1277

$val =~ s/<[A-Za-z0-9]{7}-[A-Za-z0-9]{6}-0[A-Za-z0-9]\@//;

1278

1279

# Sendmail:

1280

$val =~ s/<20\d\d[01]\d[0123]\d[012]\d[012345]\d[012345]\d\.

1281

[A-F0-9]{10,12}\@//gx;

1282

1283

# try to split Message-ID segments on probable ID boundaries. Note that

1284

# Outlook message-ids seem to contain a server identifier ID in the last

1285

# 8 bytes before the @. Make sure this becomes its own token, it's a

1286

# great spam-sign for a learning system! Be sure to split on ".".

1287

$val =~ s/[^_A-Za-z0-9]/ /g;

1288

$val;

1289

}

1290

1291

sub _pre_chew_received {

1292

my ($self, $val) = @_;

1293

1294

# Thanks to Dan for these. Trim out "useless" tokens; sendmail-ish IDs

1295

# and valid-format RFC-822/2822 dates

1296

1297

$val =~ s/\swith\sSMTP\sid\sg[\dA-Z]{10,12}\s/ /gs; # Sendmail

1298

$val =~ s/\swith\sESMTP\sid\s[\dA-F]{10,12}\s/ /gs; # Sendmail

1299

$val =~ s/\bid\s[a-zA-Z0-9]{7,20}\b/ /gs; # Sendmail

1300

$val =~ s/\bid\s[A-Za-z0-9]{7}-[A-Za-z0-9]{6}-0[A-Za-z0-9]/ /gs; # exim

1301

1302

$val =~ s/(?:(?:Mon|Tue|Wed|Thu|Fri|Sat|Sun),\s)?

1303

[0-3\s]?[0-9]\s

1304

(?:Jan|Feb|Ma[ry]|Apr|Ju[nl]|Aug|Sep|Oct|Nov|Dec)\s

1305

(?:19|20)?[0-9]{2}\s

1306

[0-2][0-9](?:\:[0-5][0-9]){1,2}\s

1307

(?:\s*$|$|\s*(?:[+-][0-9]{4})|\s*(?:UT|[A-Z]{2,3}T))*

1308

//gx;

1309

1310

# IPs: break down to nearest /24, to reduce hapaxes -- EXCEPT for

1311

# IPs in the 10 and 192.168 ranges, they gets lots of significant tokens

1312

# (on both sides)

1313

# also make a dup with the full IP, as fodder for

1314

# bayes_dump_to_trusted_networks: "H*r:ip*aaa.bbb.ccc.ddd"

1315

$val =~ s{\b(\d{1,3}\.)(\d{1,3}\.)(\d{1,3})(\.\d{1,3})\b}{

1316

if ($2 eq '10' || ($2 eq '192' && $3 eq '168')) {

1317

$1.$2.$3.$4.

1318

" ip*".$1.$2.$3.$4." ";

1319

} else {

1320

$1.$2.$3.

1321

" ip*".$1.$2.$3.$4." ";

1322

}

1323

}gex;

1324

1325

# trim these: they turn out as the most common tokens, but with a

1326

# prob of about .5. waste of space!

1327

$val =~ s/\b(?:with|from|for|SMTP|ESMTP)\b/ /g;

1328

1329

$val;

1330

}

1331

1332

sub _pre_chew_addr_header {

1333

my ($self, $val) = @_;

1334

local ($_);

1335

1336

my @addrs = $self->{main}->find_all_addrs_in_line ($val);

1337

my @toks;

1338

foreach (@addrs) {

1339

push (@toks, $self->_tokenize_mail_addrs ($_));

1340

}

1341

return join (' ', @toks);

1342

}

1343

1344

sub _tokenize_mail_addrs {

1345

my ($self, $addr) = @_;

1346

1347

($addr =~ /(.+)\@(.+)$/) or return ();

1348

my @toks;

1349

push(@toks, "U*".$1, "D*".$2);

1350

$_ = $2; while (s/^[^\.]+\.(.+)$/$1/gs) { push(@toks, "D*".$1); }

1351

return @toks;

1352

}

1353

1354

1355

###########################################################################

1356

1357

# compute the probability that a token is spammish

1358

sub _compute_prob_for_token {

1359

my ($self, $token, $ns, $nn, $s, $n) = @_;

1360

1361

# we allow the caller to give us the token information, just

1362

# to save a potentially expensive lookup

1363

if (!defined($s) || !defined($n)) {

1364

($s, $n, undef) = $self->{store}->tok_get ($token);

1365

}

1366

1367

return if ($s == 0 && $n == 0);

1368

1369

if (!USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS) {

1370

return if ($s + $n < 10); # ignore low-freq tokens

1371

}

1372

1373

if (!$self->{use_hapaxes}) {

1374

return if ($s + $n < 2);

1375

}

1376

1377

return if ( $ns == 0 || $nn == 0 );

1378

1379

my $ratios = ($s / $ns);

1380

my $ration = ($n / $nn);

1381

1382

my $prob;

1383

1384

if ($ratios == 0 && $ration == 0) {

1385

warn "bayes: oops? ratios == ration == 0";

1386

return;

1387

} else {

1388

$prob = ($ratios) / ($ration + $ratios);

1389

}

1390

1391

if (USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS) {

1392

# use Robinson's f(x) equation for low-n tokens, instead of just

1393

# ignoring them

1394

my $robn = $s+$n;

1395

$prob = ($Mail::SpamAssassin::Bayes::Combine::FW_S_DOT_X + ($robn * $prob))

1396

1397

($Mail::SpamAssassin::Bayes::Combine::FW_S_CONSTANT + $robn);

1398

}

1399

1400

# 'log_raw_counts' is used to log the raw data for the Bayes equations during

1401

# a mass-check, allowing the S and X constants to be optimized quickly

1402

# without requiring re-tokenization of the messages for each attempt. There's

1403

# really no need for this code to be uncommented in normal use, however. It

1404

# has never been publicly documented, so commenting it out is fine. ;)

1405

1406

## if ($self->{log_raw_counts}) {

1407

## $self->{raw_counts} .= " s=$s,n=$n ";

1408

## }

1409

1410

return $prob;

1411

}

1412

1413

###########################################################################

1414

# If a token is neither hammy nor spammy, return 0.

1415

# For a spammy token, return the minimum number of additional ham messages

1416

# it would have had to appear in to no longer be spammy. Hammy tokens

1417

# are handled similarly. That's what the function does (at the time

1418

# of this writing, 31 July 2003, 16:02:55 CDT). It would be slightly

1419

# more useful if it returned the number of /additional/ ham messages

1420

# a spammy token would have to appear in to no longer be spammy but I

1421

# fear that might require the solution to a cubic equation, and I

1422

# just don't have the time for that now.

1423

1424

sub _compute_declassification_distance {

1425

my ($self, $Ns, $Nn, $ns, $nn, $prob) = @_;

1426

1427

return 0 if $ns == 0 && $nn == 0;

1428

1429

if (!USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS) {return 0 if ($ns + $nn < 10);}

1430

if (!$self->{use_hapaxes}) {return 0 if ($ns + $nn < 2);}

1431

1432

return 0 if $Ns == 0 || $Nn == 0;

1433

return 0 if abs( $prob - 0.5 ) <

1434

$Mail::SpamAssassin::Bayes::Combine::MIN_PROB_STRENGTH;

1435

1436

my ($Na,$na,$Nb,$nb) = $prob > 0.5 ? ($Nn,$nn,$Ns,$ns) : ($Ns,$ns,$Nn,$nn);

1437

my $p = 0.5 - $Mail::SpamAssassin::Bayes::Combine::MIN_PROB_STRENGTH;

1438

1439

return int( 1.0 - 1e-6 + $nb * $Na * $p / ($Nb * ( 1 - $p )) ) - $na

1440

unless USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS;

1441

1442

my $s = $Mail::SpamAssassin::Bayes::Combine::FW_S_CONSTANT;

1443

my $sx = $Mail::SpamAssassin::Bayes::Combine::FW_S_DOT_X;

1444

my $a = $Nb * ( 1 - $p );

1445

my $b = $Nb * ( $sx + $nb * ( 1 - $p ) - $p * $s ) - $p * $Na * $nb;

1446

my $c = $Na * $nb * ( $sx - $p * ( $s + $nb ) );

1447

my $discrim = $b * $b - 4 * $a * $c;

1448

my $disc_max_0 = $discrim < 0 ? 0 : $discrim;

1449

my $dd_exact = ( 1.0 - 1e-6 + ( -$b + sqrt( $disc_max_0 ) ) / ( 2*$a ) ) - $na;

1450

1451

# This shouldn't be necessary. Should not be < 1

1452

return $dd_exact < 1 ? 1 : int($dd_exact);

1453

}

1454

1455

###########################################################################

1456

1457

sub _opportunistic_calls {

1458

my($self, $journal_only) = @_;

1459

1460

# If we're not already tied, abort.

1461

if (!$self->{store}->db_readable()) {

1462

dbg("bayes: opportunistic call attempt failed, DB not readable");

1463

return;

1464

}

1465

1466

# Is an expire or sync running?

1467

my $running_expire = $self->{store}->get_running_expire_tok();

1468

if ( defined $running_expire && $running_expire+$OPPORTUNISTIC_LOCK_VALID > time() ) {

1469

dbg("bayes: opportunistic call attempt skipped, found fresh running expire magic token");

1470

return;

1471

}

1472

1473

# handle expiry and syncing

1474

if (!$journal_only && $self->{store}->expiry_due()) {

1475

dbg("bayes: opportunistic call found expiry due");

1476

1477

# sync will bring the DB R/W as necessary, and the expire will remove

1478

# the running_expire token, may untie as well.

1479

$self->{main}->{bayes_scanner}->sync(1,1);

1480

}

1481

elsif ( $self->{store}->sync_due() ) {

1482

dbg("bayes: opportunistic call found journal sync due");

1483

1484

# sync will bring the DB R/W as necessary, may untie as well

1485

$self->{main}->{bayes_scanner}->sync(1,0);

1486

1487

# We can only remove the running_expire token if we're doing R/W

1488

if ($self->{store}->db_writable()) {

1489

$self->{store}->remove_running_expire_tok();

1490

}

1491

}

1492

1493

return;

1494

}

1495

1496

###########################################################################

1497

1498

sub learner_new {

1499

my ($self) = @_;

1500

1501

my $store;

1502

my $module = untaint_var($self->{conf}->{bayes_store_module});

1503

$module = 'Mail::SpamAssassin::BayesStore::DBM' if !$module;

1504

1505

dbg("bayes: learner_new self=%s, bayes_store_module=%s", $self,$module);

1506

eval '

1507

require '.$module.';

1508

$store = '.$module.'->new($self);

1509

1510

' or do {

1511

my $eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat;

1512

die "bayes: learner_new $module new() failed: $eval_stat\n";

1513

};

1514

1515

dbg("bayes: learner_new: got store=%s", $store);

1516

$self->{store} = $store;

1517

1518

$self;

1519

}

1520

1521

###########################################################################

1522

1523

sub bayes_report_make_list {

1524

my ($self, $pms, $info, $param) = @_;

1525

return "Tokens not available." unless defined $info;

1526

1527

my ($limit,$fmt_arg,$more) = split /,/, ($param || '5');

1528

1529

my %formats = (

1530

short => '$t',

1531

Short => 'Token: \"$t\"',

1532

compact => '$p-$D--$t',

1533

Compact => 'Probability $p -declassification distance $D (\"+\" means > 9) --token: \"$t\"',

1534

medium => '$p-$D-$N--$t',

1535

long => '$p-$d--${h}h-${s}s--${a}d--$t',

1536

Long => 'Probability $p -declassification distance $D --in ${h} ham messages -and ${s} spam messages --${a} days old--token:\"$t\"'

1537

);

1538

1539

my $raw_fmt = (!$fmt_arg ? '$p-$D--$t' : $formats{$fmt_arg});

1540

1541

return "Invalid format, must be one of: ".join(",",keys %formats)

1542

unless defined $raw_fmt;

1543

1544

my $fmt = '"'.$raw_fmt.'"';

1545

my $amt = $limit < @$info ? $limit : @$info;

1546

return "" unless $amt;

1547

1548

my $ns = $pms->{bayes_nspam};

1549

my $nh = $pms->{bayes_nham};

1550

my $digit = sub { $_[0] > 9 ? "+" : $_[0] };

1551

my $now = time;

1552

1553

join ', ', map {

1554

my($t,$prob,$s,$h,$u) = @$_;

1555

my $a = int(($now - $u)/(3600 * 24));

1556

my $d = $self->_compute_declassification_distance($ns,$nh,$s,$h,$prob);

1557

my $p = sprintf "%.3f", $prob;

1558

my $n = $s + $h;

1559

my ($c,$o) = $prob < 0.5 ? ($h,$s) : ($s,$h);

1560

my ($D,$S,$H,$C,$O,$N) = map &$digit($_), ($d,$s,$h,$c,$o,$n);

1561

eval $fmt; ## no critic

1562

} @{$info}[0..$amt-1];

1563

}

1564

1565

1566

1567

=back

1568

1569

=cut

Older »