3
# Licensed to the Apache Software Foundation (ASF) under one or more
4
# contributor license agreements. See the NOTICE file distributed with
5
# this work for additional information regarding copyright ownership.
6
# The ASF licenses this file to you under the Apache License, Version 2.0
7
# (the "License"); you may not use this file except in compliance with
8
# the License. You may obtain a copy of the License at:
10
# http://www.apache.org/licenses/LICENSE-2.0
12
# Unless required by applicable law or agreed to in writing, software
13
# distributed under the License is distributed on an "AS IS" BASIS,
14
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
# See the License for the specific language governing permissions and
16
# limitations under the License.
28
$spamtest %opt $isspam $forget
29
$messagecount $learnedcount $messagelimit
30
$progress $total_messages $init_results $start_time
31
$synconly $learnprob @targets $bayes_override_path
34
my $PREFIX = '@@PREFIX@@'; # substituted at 'make' time
35
my $DEF_RULES_DIR = '@@DEF_RULES_DIR@@'; # substituted at 'make' time
36
my $LOCAL_RULES_DIR = '@@LOCAL_RULES_DIR@@'; # substituted at 'make' time
38
use lib '@@INSTALLSITELIB@@'; # substituted at 'make' time
40
BEGIN { # see comments in "spamassassin.raw" for doco
41
my @bin = File::Spec->splitpath($0);
42
my $bin = ($bin[0] ? File::Spec->catpath(@bin[0..1]) : $bin[1])
43
|| File::Spec->curdir;
45
if (-e $bin.'/lib/Mail/SpamAssassin.pm'
46
|| !-e '@@INSTALLSITELIB@@/Mail/SpamAssassin.pm' )
49
$searchrelative = 1; # disabled during "make install": REMOVEFORINST
50
if ($searchrelative && $bin eq '../' && -e '../blib/lib/Mail/SpamAssassin.pm')
52
unshift ( @INC, '../blib/lib' );
54
foreach ( qw(lib ../lib/site_perl
55
../lib/spamassassin ../share/spamassassin/lib))
57
my $dir = File::Spec->catdir( $bin, split ( '/', $_ ) );
58
if ( -f File::Spec->catfile( $dir, "Mail", "SpamAssassin.pm" ) )
59
{ unshift ( @INC, $dir ); last; }
65
use Mail::SpamAssassin;
66
use Mail::SpamAssassin::ArchiveIterator;
67
use Mail::SpamAssassin::Message;
68
use Mail::SpamAssassin::PerMsgLearner;
69
use Mail::SpamAssassin::Util::Progress;
70
use Mail::SpamAssassin::Logger;
72
###########################################################################
74
$SIG{PIPE} = 'IGNORE';
76
# used to be CmdLearn::cmd_run() ...
85
Getopt::Long::Configure(
86
qw(bundling no_getopt_compat
87
permute no_auto_abbrev no_ignore_case)
92
'ham|nonspam' => sub { $isspam = 0; },
93
'spam' => sub { $isspam = 1; },
95
'rebuild' => sub { $synconly = 1; warn "The --rebuild option has been deprecated. Please use --sync instead.\n" },
97
'username|u=s' => \$opt{'username'},
98
'configpath|config-file|config-dir|c|C=s' => \$opt{'configpath'},
99
'prefspath|prefs-file|p=s' => \$opt{'prefspath'},
100
'siteconfigpath=s' => \$opt{'siteconfigpath'},
101
'cf=s' => \@{$opt{'cf'}},
103
'folders|f=s' => \$opt{'folders'},
104
'force-expire|expire' => \$opt{'force-expire'},
105
'local|L' => \$opt{'local'},
106
'no-sync|nosync' => \$opt{'nosync'},
107
'showdots' => \$opt{'showdots'},
108
'progress' => \$opt{'progress'},
109
'use-ignores' => \$opt{'use-ignores'},
110
'no-rebuild|norebuild' => sub { $opt{'nosync'} = 1; warn "The --no-rebuild option has been deprecated. Please use --no-sync instead.\n" },
112
'learnprob=f' => \$opt{'learnprob'},
113
'randseed=i' => \$opt{'randseed'},
114
'stopafter=i' => \$opt{'stopafter'},
116
'debug|debug-level|D:s' => \$opt{'debug'},
117
'help|h|?' => \$opt{'help'},
118
'version|V' => \$opt{'version'},
120
'dump:s' => \$opt{'dump'},
121
'import' => \$opt{'import'},
123
'backup' => \$opt{'backup'},
124
'clear' => \$opt{'clear'},
125
'restore=s' => \$opt{'restore'},
127
'dir' => sub { $opt{'old_format'} = 'dir'; },
128
'file' => sub { $opt{'old_format'} = 'file'; },
129
'mbox' => sub { $opt{'format'} = 'mbox'; },
130
'mbx' => sub { $opt{'format'} = 'mbx'; },
131
'single' => sub { $opt{'old_format'} = 'single'; },
133
'db|dbpath=s' => \$bayes_override_path,
134
're|regexp=s' => \$opt{'regexp'},
138
or usage( 0, "Unknown option!" );
140
if ( defined $opt{'help'} ) {
141
usage( 0, "For more information read the manual page" );
143
if ( defined $opt{'version'} ) {
144
print "SpamAssassin version " . Mail::SpamAssassin::Version() . "\n";
148
# set debug areas, if any specified (only useful for command-line tools)
149
if (defined $opt{'debug'}) {
150
$opt{'debug'} ||= 'all';
153
if ( $opt{'force-expire'} ) {
157
if ($opt{'showdots'} && $opt{'progress'}) {
158
print "--showdots and --progress may not be used together, please select just one\n";
162
if ( !defined $isspam
163
&& !defined $synconly
165
&& !defined $opt{'dump'}
166
&& !defined $opt{'import'}
167
&& !defined $opt{'clear'}
168
&& !defined $opt{'backup'}
169
&& !defined $opt{'restore'}
170
&& !defined $opt{'folders'} )
173
"Please select either --spam, --ham, --folders, --forget, --sync, --import,\n--dump, --clear, --backup or --restore"
177
# We need to make sure the journal syncs pre-forget...
178
if ( defined $forget && $opt{'nosync'} ) {
181
"sa-learn warning: --forget requires read/write access to the database, and is incompatible with --no-sync\n";
184
if ( defined $opt{'old_format'} ) {
186
#Format specified in the 2.5x form of --dir, --file, --mbox, --mbx or --single.
187
#Convert it to the new behavior:
188
if ( $opt{'old_format'} eq 'single' ) {
193
my $post_config = '';
195
# kluge to support old check_bayes_db operation
196
# bug 3799: init() will go r/o with the configured DB, and then dbpath needs
197
# to override. Just access the dbpath version via post_config_text.
198
if ( defined $bayes_override_path ) {
199
# Add a default prefix if the path is a directory
200
if ( -d $bayes_override_path ) {
201
$bayes_override_path = File::Spec->catfile( $bayes_override_path, 'bayes' );
204
$post_config .= "bayes_path $bayes_override_path\n";
207
# These options require bayes_scanner, which requires "use_bayes 1", but
208
# that's not necessary for these commands.
209
if (defined $opt{'dump'} || defined $opt{'import'} || defined $opt{'clear'} ||
210
defined $opt{'backup'} || defined $opt{'restore'}) {
211
$post_config .= "use_bayes 1\n";
214
$post_config .= join("\n", @{$opt{'cf'}})."\n";
216
# create the tester factory
217
$spamtest = new Mail::SpamAssassin(
219
rules_filename => $opt{'configpath'},
220
site_rules_filename => $opt{'siteconfigpath'},
221
userprefs_filename => $opt{'prefspath'},
222
username => $opt{'username'},
223
debug => $opt{'debug'},
224
local_tests_only => $opt{'local'},
225
dont_copy_prefs => 1,
227
DEF_RULES_DIR => $DEF_RULES_DIR,
228
LOCAL_RULES_DIR => $LOCAL_RULES_DIR,
229
post_config_text => $post_config,
234
dbg("sa-learn: spamtest initialized");
236
# Bug 6228 hack: bridge the transition gap of moving Bayes.pm into a plugin;
237
# To be resolved more cleanly!!!
238
if ($spamtest->{bayes_scanner}) {
239
foreach my $plugin ( @{ $spamtest->{plugins}->{plugins} } ) {
240
if ($plugin->isa('Mail::SpamAssassin::Plugin::Bayes')) {
241
# copy plugin's "store" object ref one level up!
242
$spamtest->{bayes_scanner}->{store} = $plugin->{store};
247
if (Mail::SpamAssassin::Util::am_running_on_windows()) {
248
binmode(STDIN) or die "cannot set binmode on STDIN: $!"; # bug 4363
249
binmode(STDOUT) or die "cannot set binmode on STDOUT: $!";
252
if ( defined $opt{'dump'} ) {
253
my ( $magic, $toks );
255
if ( $opt{'dump'} eq 'all' || $opt{'dump'} eq '' ) { # show us all tokens!
256
( $magic, $toks ) = ( 1, 1 );
258
elsif ( $opt{'dump'} eq 'magic' ) { # show us magic tokens only
259
( $magic, $toks ) = ( 1, 0 );
261
elsif ( $opt{'dump'} eq 'data' ) { # show us data tokens only
262
( $magic, $toks ) = ( 0, 1 );
264
else { # unknown option
265
warn "Unknown dump option '" . $opt{'dump'} . "'\n";
266
$spamtest->finish_learner();
270
if (!$spamtest->dump_bayes_db( $magic, $toks, $opt{'regexp'}) ) {
271
$spamtest->finish_learner();
272
die "ERROR: Bayes dump returned an error, please re-run with -D for more information\n";
275
$spamtest->finish_learner();
276
# make sure we notice any write errors while flushing output buffer
277
close STDOUT or die "error closing STDOUT: $!";
278
close STDIN or die "error closing STDIN: $!";
282
if ( defined $opt{'import'} ) {
283
my $ret = $spamtest->{bayes_scanner}->{store}->perform_upgrade();
284
$spamtest->finish_learner();
285
# make sure we notice any write errors while flushing output buffer
286
close STDOUT or die "error closing STDOUT: $!";
287
close STDIN or die "error closing STDIN: $!";
291
if (defined $opt{'clear'}) {
292
unless ($spamtest->{bayes_scanner}->{store}->clear_database()) {
293
$spamtest->finish_learner();
294
die "ERROR: Bayes clear returned an error, please re-run with -D for more information\n";
297
$spamtest->finish_learner();
298
# make sure we notice any write errors while flushing output buffer
299
close STDOUT or die "error closing STDOUT: $!";
300
close STDIN or die "error closing STDIN: $!";
304
if (defined $opt{'backup'}) {
305
unless ($spamtest->{bayes_scanner}->{store}->backup_database()) {
306
$spamtest->finish_learner();
307
die "ERROR: Bayes backup returned an error, please re-run with -D for more information\n";
310
$spamtest->finish_learner();
311
# make sure we notice any write errors while flushing output buffer
312
close STDOUT or die "error closing STDOUT: $!";
313
close STDIN or die "error closing STDIN: $!";
317
if (defined $opt{'restore'}) {
319
my $filename = $opt{'restore'};
322
$spamtest->finish_learner();
323
die "ERROR: You must specify a filename to restore.\n";
326
unless ($spamtest->{bayes_scanner}->{store}->restore_database($filename, $opt{'showdots'})) {
327
$spamtest->finish_learner();
328
die "ERROR: Bayes restore returned an error, please re-run with -D for more information\n";
331
$spamtest->finish_learner();
332
# make sure we notice any write errors while flushing output buffer
333
close STDOUT or die "error closing STDOUT: $!";
334
close STDIN or die "error closing STDIN: $!";
338
if ( !$spamtest->{conf}->{use_bayes} ) {
339
warn "ERROR: configuration specifies 'use_bayes 0', sa-learn disabled\n";
343
$spamtest->init_learner(
345
force_expire => $opt{'force-expire'},
346
learn_to_journal => $opt{'nosync'},
348
caller_will_untie => 1
352
$spamtest->{bayes_scanner}{use_ignores} = $opt{'use-ignores'};
355
$spamtest->rebuild_learner_caches(
358
showdots => $opt{'showdots'}
361
$spamtest->finish_learner();
362
# make sure we notice any write errors while flushing output buffer
363
close STDOUT or die "error closing STDOUT: $!";
364
close STDIN or die "error closing STDIN: $!";
368
$messagelimit = $opt{'stopafter'};
369
$learnprob = $opt{'learnprob'};
371
if ( defined $opt{'randseed'} ) {
372
srand( $opt{'randseed'} );
375
# sync the journal first if we're going to go r/w so we make sure to
376
# learn everything before doing anything else.
378
if ( !$opt{nosync} ) {
379
$spamtest->rebuild_learner_caches();
382
# what is the result of the run? will end up being the exit code.
385
# run this lot in an eval block, so we can catch die's and clear
388
$SIG{HUP} = \&killed;
389
$SIG{INT} = \&killed;
390
$SIG{TERM} = \&killed;
392
if ( $opt{folders} ) {
393
open( F, $opt{folders} ) or die "cannot open $opt{folders}: $!";
394
for ($!=0; <F>; $!=0) {
397
if (/^(?:ham|spam):\w*:/) {
398
push ( @targets, $_ );
404
defined $_ || $!==0 or
405
$!==EBADF ? dbg("error reading from $opt{folders}: $!")
406
: die "error reading from $opt{folders}: $!";
407
close(F) or die "error closing $opt{folders}: $!";
410
###########################################################################
411
# Deal with the target listing, and STDIN -> tempfile
413
my $tempfile; # will be defined if stdin -> tempfile
414
push(@targets, @ARGV);
415
@targets = ('-') unless @targets;
417
for(my $elem = 0; $elem <= $#targets; $elem++) {
418
# ArchiveIterator doesn't really like STDIN, so if "-" is specified
419
# as a target, make it a temp file instead.
420
if ( $targets[$elem] =~ /(?:^|:)-$/ ) {
421
if (defined $tempfile) {
422
# uh-oh, stdin specified multiple times?
423
warn "skipping extra stdin target (".$targets[$elem].")\n";
424
splice @targets, $elem, 1;
425
$elem--; # go back to this element again
430
( $tempfile, $handle ) = Mail::SpamAssassin::Util::secure_tmpfile();
431
binmode $handle or die "cannot set binmode on file $tempfile: $!";
433
# avoid slurping the whole file into memory, copy chunk by chunk
435
while ( $nread=sysread(STDIN,$inbuf,16384) )
436
{ print {$handle} $inbuf or die "error writing to $tempfile: $!" }
437
defined $nread or die "error reading from STDIN: $!";
438
close $handle or die "error closing $tempfile: $!";
440
# re-aim the targets at the tempfile instead of STDIN
441
$targets[$elem] =~ s/-$/$tempfile/;
445
# make sure the target list is in the normal AI format
446
if ($targets[$elem] !~ /^[^:]*:[a-z]+:/) {
447
my $item = splice @targets, $elem, 1;
448
target($item); # add back to the list
449
$elem--; # go back to this element again
454
###########################################################################
456
my $iter = new Mail::SpamAssassin::ArchiveIterator(
458
'opt_all' => 0, # skip messages over 250k
459
'opt_want_date' => 0,
463
$iter->set_functions(\&wanted, \&result);
470
# if exit_status isn't already set to non-zero, set it to the reverse of the
471
# run result (0 is bad, 1+ is good -- the opposite of exit status codes)
472
eval { $exit_status ||= ! $iter->run(@targets); };
474
print STDERR "\n" if ($opt{showdots});
475
$progress->final() if ($opt{progress} && $progress);
477
my $phrase = defined $forget ? "Forgot" : "Learned";
478
print "$phrase tokens from $learnedcount message(s) ($messagecount message(s) examined)\n";
480
# If we needed to make a tempfile, go delete it.
481
if (defined $tempfile) {
482
unlink $tempfile or die "cannot unlink temporary file $tempfile: $!";
486
if ($@) { die $@ unless ( $@ =~ /HITLIMIT/ ); }
489
my $eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat;
490
$spamtest->finish_learner();
494
$spamtest->finish_learner();
495
# make sure we notice any write errors while flushing output buffer
496
close STDOUT or die "error closing STDOUT: $!";
497
close STDIN or die "error closing STDIN: $!";
500
###########################################################################
503
$spamtest->finish_learner();
510
my $class = ( $isspam ? "spam" : "ham" );
511
my $format = ( defined( $opt{'format'} ) ? $opt{'format'} : "detect" );
513
push ( @targets, "$class:$format:$target" );
516
###########################################################################
521
return unless $opt{'progress'};
523
$total_messages = $Mail::SpamAssassin::ArchiveIterator::MESSAGES;
525
$progress = Mail::SpamAssassin::Util::Progress->new({total => $total_messages,});
528
###########################################################################
531
my ($class, $result, $time) = @_;
533
# don't open results files until we get here to avoid overwriting files
534
&init_results if !$init_results;
536
$progress->update($messagecount) if ($opt{progress} && $progress);
539
###########################################################################
542
my ( $class, $id, $time, $dataref ) = @_;
544
my $spam = $class eq "s" ? 1 : 0;
546
if ( defined($learnprob) ) {
547
if ( int( rand( 1 / $learnprob ) ) != 0 ) {
548
print STDERR '_' if ( $opt{showdots} );
553
if ( defined($messagelimit) && $learnedcount > $messagelimit ) {
554
$progress->final() if ($opt{progress} && $progress);
559
my $ma = $spamtest->parse($dataref);
561
if ( $ma->get_header("X-Spam-Checker-Version") ) {
562
my $new_ma = $spamtest->parse($spamtest->remove_spamassassin_markup($ma), 1);
567
my $status = $spamtest->learn( $ma, undef, $spam, $forget );
568
my $learned = $status->did_learn();
570
if ( !defined $learned ) { # undef=learning unavailable
571
die "ERROR: the Bayes learn function returned an error, please re-run with -D for more information\n";
573
elsif ( $learned == 1 ) { # 1=message was learned. 0=message wasn't learned
584
print STDERR '.' if ( $opt{showdots} );
588
###########################################################################
591
my ( $verbose, $message ) = @_;
592
my $ver = Mail::SpamAssassin::Version();
593
print "SpamAssassin version $ver\n";
594
pod2usage( -verbose => $verbose, -message => $message, -exitval => 64 );
597
# ---------------------------------------------------------------------------
601
sa-learn - train SpamAssassin's Bayesian classifier
605
B<sa-learn> [options] [file]...
607
B<sa-learn> [options] --dump [ all | data | magic ]
611
--ham Learn messages as ham (non-spam)
612
--spam Learn messages as spam
613
--forget Forget a message
614
--use-ignores Use bayes_ignore_from and bayes_ignore_to
615
--sync Synchronize the database and the journal if needed
616
--force-expire Force a database sync and expiry run
617
--dbpath <path> Allows commandline override (in bayes_path form)
618
for where to read the Bayes DB from
619
--dump [all|data|magic] Display the contents of the Bayes database
620
Takes optional argument for what to display
621
--regexp <re> For dump only, specifies which tokens to
622
dump based on a regular expression.
623
-f file, --folders=file Read list of files/directories from file
624
--dir Ignored; historical compatibility
625
--file Ignored; historical compatibility
626
--mbox Input sources are in mbox format
627
--mbx Input sources are in mbx format
628
--showdots Show progress using dots
629
--progress Show progress using progress bar
630
--no-sync Skip synchronizing the database and journal
632
-L, --local Operate locally, no network accesses
633
--import Migrate data from older version/non DB_File
635
--clear Wipe out existing database
636
--backup Backup, to STDOUT, existing database
637
--restore <filename> Restore a database from filename
638
-u username, --username=username
639
Override username taken from the runtime
640
environment, used with SQL
641
-C path, --configpath=path, --config-file=path
642
Path to standard configuration dir
643
-p prefs, --prefspath=file, --prefs-file=file
644
Set user preferences file
645
--siteconfigpath=path Path for site configs
646
(default: /etc/spamassassin)
647
--cf='config line' Additional line of configuration
648
-D, --debug [area=n,...] Print debugging messages
649
-V, --version Print version
650
-h, --help Print usage message
654
Given a typical selection of your incoming mail classified as spam or ham
655
(non-spam), this tool will feed each mail to SpamAssassin, allowing it
656
to 'learn' what signs are likely to mean spam, and which are likely to
659
Simply run this command once for each of your mail folders, and it will
660
''learn'' from the mail therein.
662
Note that csh-style I<globbing> in the mail folder names is supported;
663
in other words, listing a folder name as C<*> will scan every folder
664
that matches. See C<Mail::SpamAssassin::ArchiveIterator> for more details.
666
SpamAssassin remembers which mail messages it has learnt already, and will not
667
re-learn those messages again, unless you use the B<--forget> option. Messages
668
learnt as spam will have SpamAssassin markup removed, on the fly.
670
If you make a mistake and scan a mail as ham when it is spam, or vice
671
versa, simply rerun this command with the correct classification, and the
672
mistake will be corrected. SpamAssassin will automatically 'forget' the
673
previous indications.
675
Users of C<spamd> who wish to perform training remotely, over a network,
676
should investigate the C<spamc -L> switch.
684
Learn the input message(s) as ham. If you have previously learnt any of the
685
messages as spam, SpamAssassin will forget them first, then re-learn them as
686
ham. Alternatively, if you have previously learnt them as ham, it'll skip them
687
this time around. If the messages have already been filtered through
688
SpamAssassin, the learner will ignore any modifications SpamAssassin may have
693
Learn the input message(s) as spam. If you have previously learnt any of the
694
messages as ham, SpamAssassin will forget them first, then re-learn them as
695
spam. Alternatively, if you have previously learnt them as spam, it'll skip
696
them this time around. If the messages have already been filtered through
697
SpamAssassin, the learner will ignore any modifications SpamAssassin may have
700
=item B<--folders>=I<filename>, B<-f> I<filename>
702
sa-learn will read in the list of folders from the specified file, one folder
703
per line in the file. If the folder is prefixed with C<ham:type:> or C<spam:type:>,
704
sa-learn will learn that folder appropriately, otherwise the folders will be
705
assumed to be of the type specified by B<--ham> or B<--spam>.
707
C<type> above is optional, but is the same as the standard for
708
ArchiveIterator: mbox, mbx, dir, file, or detect (the default if not
713
sa-learn will read in the file(s) containing the emails to be learned,
714
and will process them in mbox format (one or more emails per file).
718
sa-learn will read in the file(s) containing the emails to be learned,
719
and will process them in mbx format (one or more emails per file).
721
=item B<--use-ignores>
723
Don't learn the message if a from address matches configuration file
724
item C<bayes_ignore_from> or a to address matches C<bayes_ignore_to>.
725
The option might be used when learning from a large file of messages
726
from which the hammy spam messages or spammy ham messages have not
731
Synchronize the journal and databases. Upon successfully syncing the
732
database with the entries in the journal, the journal file is removed.
734
=item B<--force-expire>
736
Forces an expiry attempt, regardless of whether it may be necessary
737
or not. Note: This doesn't mean any tokens will actually expire.
738
Please see the EXPIRATION section below.
740
Note: C<--force-expire> also causes the journal data to be synchronized
741
into the Bayes databases.
745
Forget a given message previously learnt.
749
Allows a commandline override of the I<bayes_path> configuration option.
751
=item B<--dump> I<option>
753
Display the contents of the Bayes database. Without an option or with
754
the I<all> option, all magic tokens and data tokens will be displayed.
755
I<magic> will only display magic tokens, and I<data> will only display
758
Can also use the B<--regexp> I<RE> option to specify which tokens to
759
display based on a regular expression.
763
Clear an existing Bayes database by removing all traces of the database.
765
WARNING: This is destructive and should be used with care.
769
Performs a dump of the Bayes database in machine/human readable format.
771
The dump will include token and seen data. It is suitable for input back
772
into the --restore command.
774
=item B<--restore>=I<filename>
776
Performs a restore of the Bayes database defined by I<filename>.
778
WARNING: This is a destructive operation, previous Bayes data will be wiped out.
780
=item B<-h>, B<--help>
782
Print help message and exit.
784
=item B<-u> I<username>, B<--username>=I<username>
786
If specified this username will override the username taken from the runtime
787
environment. You can use this option to specify users in a virtual user
788
configuration when using SQL as the Bayes backend.
790
NOTE: This option will not change to the given I<username>; it will only attempt
791
to act on behalf of that user. Because of this you will need to have proper
792
permissions to be able to change files owned by I<username>. In the case of SQL
793
this generally is not a problem.
795
=item B<-C> I<path>, B<--configpath>=I<path>, B<--config-file>=I<path>
797
Use the specified path for locating the distributed configuration files.
798
Ignore the default directories (usually C</usr/share/spamassassin> or similar).
800
=item B<--siteconfigpath>=I<path>
802
Use the specified path for locating site-specific configuration files. Ignore
803
the default directories (usually C</etc/spamassassin> or similar).
805
=item B<--cf='config line'>
807
Add additional lines of configuration directly from the command-line, parsed
808
after the configuration files are read. Multiple B<--cf> arguments can be
809
used, and each will be considered a separate line of configuration.
811
=item B<-p> I<prefs>, B<--prefspath>=I<prefs>, B<--prefs-file>=I<prefs>
813
Read user score preferences from I<prefs> (usually C<$HOME/.spamassassin/user_prefs>).
817
Prints a progress bar (to STDERR) showing the current progress. In the case
818
where no valid terminal is found this option will behave very much like the
821
=item B<-D> [I<area,...>], B<--debug> [I<area,...>]
823
Produce debugging output. If no areas are listed, all debugging information is
824
printed. Diagnostic output can also be enabled for each area individually;
825
I<area> is the area of the code to instrument. For example, to produce
826
diagnostic output on bayes, learn, and dns, use:
828
spamassassin -D bayes,learn,dns
830
For more information about which areas (also known as channels) are available,
831
please see the documentation at:
833
C<http://wiki.apache.org/spamassassin/DebugChannels>
835
Higher priority informational messages that are suitable for logging in normal
836
circumstances are available with an area of "info".
840
Skip the slow synchronization step which normally takes place after
841
changing database entries. If you plan to learn from many folders in
842
a batch, or to learn many individual messages one-by-one, it is faster
843
to use this switch and run C<sa-learn --sync> once all the folders have
846
Clarification: The state of I<--no-sync> overrides the
847
I<bayes_learn_to_journal> configuration option. If not specified,
848
sa-learn will learn to the database directly. If specified, sa-learn
849
will learn to the journal file.
851
Note: I<--sync> and I<--no-sync> can be specified on the same commandline,
852
which is slightly confusing. In this case, the I<--no-sync> option is
853
ignored since there is no learn operation.
855
=item B<-L>, B<--local>
857
Do not perform any network accesses while learning details about the mail
858
messages. This will speed up the learning process, but may result in a
859
slightly lower accuracy.
861
Note that this is currently ignored, as current versions of SpamAssassin will
862
not perform network access while learning; but future versions may.
866
If you previously used SpamAssassin's Bayesian learner without the C<DB_File>
867
module installed, it will have created files in other formats, such as
868
C<GDBM_File>, C<NDBM_File>, or C<SDBM_File>. This switch allows you to migrate
869
that old data into the C<DB_File> format. It will overwrite any data currently
872
Can also be used with the B<--dbpath> I<path> option to specify the location of
873
the Bayes files to use.
879
There are now multiple backend storage modules available for storing
880
users' Bayesian data. As such, you might want to migrate from one
881
backend to another. Here is a simple procedure for migrating from one
884
Note that if you have individual user databases you will have to
885
perform a similar procedure for each one of them.
889
=item sa-learn --sync
891
This will sync any outstanding journal entries.
893
=item sa-learn --backup > backup.txt
895
This will save all your Bayes data to a plain text file.
897
=item sa-learn --clear
899
This is optional, but good to do to clear out the old database.
903
At this point, if you have multiple databases, you should perform the
904
procedure above for each of them. (i.e. each user's database needs to
905
be backed up before continuing.)
907
=item Switch backends
909
Once you have backed up all databases you can update your
910
configuration for the new database backend. This will involve at least
911
the bayes_store_module config option and may involve some additional
912
config options depending on what is required by the module. (For
913
example, you may need to configure an SQL database.)
915
=item sa-learn --restore backup.txt
917
Again, you need to do this for every database.
921
If you are migrating to SQL you can make use of the -u <username>
922
option in sa-learn to populate each user's database. Otherwise, you
923
must run sa-learn as the user whose database you are restoring.
926
=head1 INTRODUCTION TO BAYESIAN FILTERING
928
(Thanks to Michael Bell for this section!)
930
For a more lengthy description of how this works, go to
931
http://www.paulgraham.com/ and see "A Plan for Spam". It's reasonably
932
readable, even if statistics make me break out in hives.
934
The short semi-inaccurate version: Given training, a spam heuristics engine
935
can take the most "spammy" and "hammy" words and apply probabilistic
936
analysis. Furthermore, once given a basis for the analysis, the engine can
937
continue to learn iteratively by applying both the non-Bayesian and Bayesian
938
rulesets together to create evolving "intelligence".
940
SpamAssassin 2.50 and later supports Bayesian spam analysis, in
941
the form of the BAYES rules. This is a new feature, quite powerful,
942
and is disabled until enough messages have been learnt.
944
The pros of Bayesian spam analysis:
948
=item Can greatly reduce false positives and false negatives.
950
It learns from your mail, so it is tailored to your unique e-mail flow.
952
=item Once it starts learning, it can continue to learn from SpamAssassin
953
and improve over time.
961
=item A decent number of messages are required before results are useful
962
for ham/spam determination.
964
=item It's hard to explain why a message is or isn't marked as spam.
966
For example, a straightforward rule that matches, say, "VIAGRA" is
967
easy to understand. If it generates a false positive or false negative,
968
it is fairly easy to understand why.
970
With Bayesian analysis, it's all probabilities - "because the past says
971
it is likely as this falls into a probabilistic distribution common to past
972
spam in your systems". Tell that to your users! Tell that to the client
973
when he asks "what can I do to change this". (By the way, the answer in
974
this case is "use whitelisting".)
976
=item It will take disk space and memory.
978
The databases it maintains take quite a lot of resources to store and use.
982
=head1 GETTING STARTED
984
Still interested? OK, here are the guidelines for getting this working.
986
First a high-level overview:
990
=item Build a significant sample of both ham and spam.
992
I suggest several thousand of each, placed in SPAM and HAM directories or
993
mailboxes. Yes, you MUST hand-sort this - otherwise the results won't be much
994
better than SpamAssassin on its own. Verify the spamminess/hamminess of EVERY
995
message. You're urged to avoid using a publicly available corpus (sample) -
996
this must be taken from YOUR mail server, if it is to be statistically useful.
997
Otherwise, the results may be pretty skewed.
999
=item Use this tool to teach SpamAssassin about these samples, like so:
1001
sa-learn --spam /path/to/spam/folder
1002
sa-learn --ham /path/to/ham/folder
1005
Let SpamAssassin proceed, learning stuff. When it finds ham and spam
1006
it will add the "interesting tokens" to the database.
1008
=item If you need SpamAssassin to forget about specific messages, use
1009
the B<--forget> option.
1011
This can be applied to either ham or spam that has run through the
1012
B<sa-learn> processes. It's a bit of a hammer, really, lowering the
1013
weighting of the specific tokens in that message (only if that message has
1014
been processed before).
1016
=item Learning from single messages uses a command like this:
1018
sa-learn --ham --no-sync mailmessage
1020
This is handy for binding to a key in your mail user agent. It's very fast, as
1021
all the time-consuming stuff is deferred until you run with the C<--sync>
1024
=item Autolearning is enabled by default
1026
If you don't have a corpus of mail saved to learn, you can let
1027
SpamAssassin automatically learn the mail that you receive. If you are
1028
autolearning from scratch, the amount of mail you receive will determine
1029
how long until the BAYES_* rules are activated.
1033
=head1 EFFECTIVE TRAINING
1035
Learning filters require training to be effective. If you don't train
1036
them, they won't work. In addition, you need to train them with new
1037
messages regularly to keep them up-to-date, or their data will become
1038
stale and impact accuracy.
1040
You need to train with both spam I<and> ham mails. One type of mail
1041
alone will not have any effect.
1043
Note that if your mail folders contain things like forwarded spam,
1044
discussions of spam-catching rules, etc., this will cause trouble. You
1045
should avoid scanning those messages if possible. (An easy way to do this
1046
is to move them aside, into a folder which is not scanned.)
1048
If the messages you are learning from have already been filtered through
1049
SpamAssassin, the learner will compensate for this. In effect, it learns what
1050
each message would look like if you had run C<spamassassin -d> over it in
1053
Another thing to be aware of is that you should typically aim to train
1054
with at least 1000 messages of spam, and 1000 ham messages, if
1055
possible. More is better, but anything over about 5000 messages does not
1056
improve accuracy significantly in our tests.
1058
Be careful that you train from the same source -- for example, if you train
1059
on old spam, but new ham mail, then the classifier will think that
1060
a mail with an old date stamp is likely to be spam.
1062
It's also worth noting that training with a very small quantity of
1063
ham will produce atrocious results. You should aim to train with at
1064
least the same amount (or more if possible!) of ham data as spam.
1066
On an on-going basis, it is best to keep training the filter to make
1067
sure it has fresh data to work from. There are various ways to do
1072
=item 1. Supervised learning
1074
This means keeping a copy of all or most of your mail, separated into spam
1075
and ham piles, and periodically re-training using those. It produces
1076
the best results, but requires more work from you, the user.
1078
(An easy way to do this, by the way, is to create a new folder for
1079
'deleted' messages, and instead of deleting them from other folders,
1080
simply move them in there instead. Then keep all spam in a separate
1081
folder and never delete it. As long as you remember to move misclassified
1082
mails into the correct folder set, it is easy enough to keep up to date.)
1084
=item 2. Unsupervised learning from Bayesian classification
1086
Another way to train is to chain the results of the Bayesian classifier
1087
back into the training, so it reinforces its own decisions. This is only
1088
safe if you then retrain it based on any errors you discover.
1090
SpamAssassin does not support this method, due to experimental results
1091
which strongly indicate that it does not work well, and since Bayes is
1092
only one part of the resulting score presented to the user (while Bayes
1093
may have made the wrong decision about a mail, it may have been overridden
1096
=item 3. Unsupervised learning from SpamAssassin rules
1098
Also called 'auto-learning' in SpamAssassin. Based on statistical
1099
analysis of the SpamAssassin success rates, we can automatically train the
1100
Bayesian database with a certain degree of confidence that our training
1103
It should be supplemented with some supervised training in addition, if
1106
This is the default, but can be turned off by setting the SpamAssassin
1107
configuration parameter C<bayes_auto_learn> to 0.
1109
=item 4. Mistake-based training
1111
This means training on a small number of mails, then only training on
1112
messages that SpamAssassin classifies incorrectly. This works, but it
1113
takes longer to get it right than a full training session would.
1119
B<sa-learn> and the other parts of SpamAssassin's Bayesian learner
1120
use a set of persistent database files to store the learnt tokens, as follows.
1126
The database of tokens, containing the tokens learnt, their count of
1127
occurrences in ham and spam, and the timestamp when the token was last
1130
This database also contains some 'magic' tokens, as follows: the version
1131
number of the database, the number of ham and spam messages learnt, the
1132
number of tokens in the database, and timestamps of: the last journal
1133
sync, the last expiry run, the last expiry token reduction count, the
1134
last expiry timestamp delta, the oldest token timestamp in the database,
1135
and the newest token timestamp in the database.
1137
This is a database file, using C<DB_File>. The database 'version
1138
number' is 0 for databases from 2.5x, 1 for databases from certain 2.6x
1139
development releases, 2 for 2.6x, and 3 for 3.0 and later releases.
1143
A map of Message-Id and some data from headers and body to what that
1144
message was learnt as. This is used so that SpamAssassin can avoid
1145
re-learning a message it has already seen, and so it can reverse the
1146
training if you later decide that message was learnt incorrectly.
1148
This is a database file, using C<DB_File>.
1152
While SpamAssassin is scanning mails, it needs to track which tokens
1153
it uses in its calculations. To avoid the contention of having each
1154
SpamAssassin process attempting to gain write access to the Bayes DB,
1155
the token timestamps are written to a 'journal' file which will later
1156
(either automatically or via C<sa-learn --sync>) be used to synchronize
1159
Also, through the use of C<bayes_learn_to_journal>, or when using the
1160
C<--no-sync> option with sa-learn, the actual learning data will
1161
be placed into the journal for later synchronization. This is typically
1162
useful for high-traffic sites to avoid the same contention as stated
1169
Since SpamAssassin can auto-learn messages, the Bayes database files
1170
could increase perpetually until they fill your disk. To control this,
1171
SpamAssassin performs journal synchronization and bayes expiration
1172
periodically when certain criteria (listed below) are met.
1174
SpamAssassin can sync the journal and expire the DB tokens either
1175
manually or opportunistically. A journal sync is due if I<--sync>
1176
is passed to sa-learn (manual), or if the following is true
1181
=item - bayes_journal_max_size does not equal 0 (a value of 0 means don't sync)
1183
=item - the journal file exists
1191
=item - the journal file has a size greater than bayes_journal_max_size
1199
=item - a journal sync has previously occurred, and at least 1 day has
1200
passed since that sync
1204
Expiry is due if I<--force-expire> is passed to sa-learn (manual),
1205
or if all of the following are true (opportunistic):
1209
=item - the last expire was attempted at least 12hrs ago
1211
=item - bayes_auto_expire does not equal 0
1213
=item - the number of tokens in the DB is > 100,000
1215
=item - the number of tokens in the DB is > bayes_expiry_max_db_size
1217
=item - there is at least a 12 hr difference between the oldest and newest token atimes
1223
If either the manual or opportunistic method causes an expire run
1224
to start, here is the logic that is used:
1228
=item - figure out how many tokens to keep. take the larger of
1229
either bayes_expiry_max_db_size * 75% or 100,000 tokens. therefore, the goal
1230
reduction is number of tokens - number of tokens to keep.
1232
=item - if the reduction number is < 1000 tokens, abort (not worth the effort).
1234
=item - if an expire has been done before, guesstimate the new
1235
atime delta based on the old atime delta. (new_atime_delta =
1236
old_atime_delta * old_reduction_count / goal)
1238
=item - if no expire has been done before, or the last expire looks
1239
"weird", do an estimation pass. The definition of "weird" is:
1243
=item - last expire over 30 days ago
1245
=item - last atime delta was < 12 hrs
1247
=item - last reduction count was < 1000 tokens
1249
=item - estimated new atime delta is < 12 hrs
1251
=item - the difference between the last reduction count and the goal reduction count is > 50%
1257
=head2 ESTIMATION PASS LOGIC
1259
Go through each of the DB's tokens. Starting at 12hrs, calculate
1260
whether or not the token would be expired (based on the difference
1261
between the token's atime and the db's newest token atime) and keep
1262
the count. Work out from 12hrs exponentially by powers of 2. i.e.:
1263
12hrs * 1, 12hrs * 2, 12hrs * 4, 12hrs * 8, and so on, up to 12hrs
1264
* 512 (6144hrs, or 256 days).
1266
The larger the delta, the smaller the number of tokens that will
1267
be expired. Conversely, the number of tokens goes up as the delta
1268
gets smaller. So starting at the largest atime delta, figure out
1269
which delta will expire the most tokens without going above the
1270
goal expiration count. Use this to choose the atime delta to use,
1271
unless one of the following occurs:
1275
=item - the largest atime (smallest reduction count) would expire
1276
too many tokens. this means the learned tokens are mostly old and
1277
there needs to be new tokens learned before an expire can
1280
=item - all of the atime choices result in 0 tokens being removed.
1281
this means the tokens are all newer than 12 hours and there needs
1282
to be new tokens learned before an expire can occur.
1284
=item - the number of tokens that would be removed is < 1000. the
1285
benefit isn't worth the effort. more tokens need to be learned.
1289
If the expire run gets past this point, it will continue to the end.
1290
A new DB is created since the majority of DB libraries don't shrink the
1291
DB file when tokens are removed. So we do the "create new, migrate old
1292
to new, remove old, rename new" shuffle.
1294
=head2 EXPIRY RELATED CONFIGURATION SETTINGS
1298
=item C<bayes_auto_expire> is used to specify whether or not SpamAssassin
1299
ought to opportunistically attempt to expire the Bayes database.
1300
The default is 1 (yes).
1302
=item C<bayes_expiry_max_db_size> specifies both the auto-expire token
1303
count point, as well as the resulting number of tokens after expiry
1304
as described above. The default value is 150,000, which is roughly
1305
equivalent to a 6 MB database file if you're using DB_File.
1307
=item C<bayes_journal_max_size> specifies how large the Bayes
1308
journal will grow before it is opportunistically synced. The
1309
default value is 102400 bytes.
1315
The B<sa-learn> command is part of the B<Mail::SpamAssassin> Perl module.
1316
Install this as a normal Perl module, using C<perl -MCPAN -e shell>,
1323
Mail::SpamAssassin(3)
1324
Mail::SpamAssassin::ArchiveIterator(3)
1326
E<lt>http://www.paulgraham.com/E<gt>
1327
Paul Graham's "A Plan For Spam" paper
1329
E<lt>http://www.linuxjournal.com/article/6467E<gt>
1330
Gary Robinson's f(x) and combining algorithms, as used in SpamAssassin
1332
E<lt>http://www.bgl.nu/~glouis/bogofilter/E<gt>
1333
'Training on error' page. A discussion of various Bayes training regimes,
1334
including 'train on error' and unsupervised training.
1336
=head1 PREREQUISITES
1338
C<Mail::SpamAssassin>
1342
The SpamAssassin(tm) Project E<lt>http://spamassassin.apache.org/E<gt>