3
# bayes-10pcv-driver - run 10-fold cross-validation test on SpamAssassin Bayes
5
# Since Bayesish probability analysis requires training on a corpus, the
6
# traditional SpamAssassin 10-pass cross-validation suite can't be used. Also,
7
# Bayes requires its own ten-pass testing, separately, to judge the effects of
8
# tweaks. So that's what this is.
10
# Before running, you need to create a test corpus, as "cor/spam" and
11
# "cor/ham". Here's how to do this:
14
# SADIR/tools/split_corpora -n 10 -l 2000 -p cor/spam/bucket spf1 spf2 spf3 ...
15
# SADIR/tools/split_corpora -n 10 -l 2000 -p cor/ham/bucket ham1 ham2 ham3 ...
17
# SADIR = top-level directory of SpamAssassin distro
18
# TEST = the directory where the corpus and results are to be written
19
# spfN = mail folders full of spam
20
# hamN = mail folders full of ham
22
# It will produce a directory of results called "results". The most important
23
# are "hist_all": a histogram of scores and frequencies, and "thresholds_all":
24
# the output of analysis of all scores and frequencies from the
25
# bayes-thresholds script.
27
# NOTE: by default you will need *AT LEAST* 2000 messages of each type (ham and spam) to use
28
# this, since bayes will not be activated without 200 messages in the db,
29
# and each fold is run using 10% of the corpus -- and 2000/10 = 200.
31
###########################################################################
36
# Entered when at least one command-line argument was supplied.
# NOTE(review): the body and closing fi of this test are not visible in this
# view, and the bare numeric lines throughout look like line-number residue
# from extraction -- confirm against the upstream script.
if [ "$#" -gt 0 ] ; then
41
# Search $SADIR and $SADIR/masses before the inherited PATH.
# NOTE(review): SADIR and testdir are assigned outside this view.
PATH=$SADIR:$SADIR/masses:$PATH
43
# All output goes under $testdir/results; the scratch configuration
# directory (tmpdir) lives inside that tree.
results=$testdir/results
44
tmpdir=$results/config
46
# Discard anything left over from a previous run.
rm -rf $results $tmpdir
48
# now, just copy in the Bayes ruleset
# (the relative ../rules source path presumably assumes the script is run
# from a directory one level below the distro top -- TODO confirm)
49
mkdir -p $results $tmpdir/rules
50
cp ../rules/23_bayes.cf $tmpdir/rules
51
cp ../rules/50*.cf $tmpdir/rules
53
# tell SpamAssassin to use this path for DBs
54
# TODO: for tests of these settings, read from a test-specific file
# NOTE(review): the two lines below look like the tail of a quoted string
# that is written to 30bayes_path.cf; the command that opens the string is
# not visible in this view.
57
bayes_path $tmpdir/dbs/bayes
62
" > $tmpdir/rules/30bayes_path.cf
67
# Strategy switch, set to 0 here. Code further down tests it against 1:
# when 1, the whole corpus is learned once and each pass forgets its own
# test set; otherwise the learn buckets are learned during every pass.
LEARN_ALL_THEN_FORGET_TEST_SET=0
70
# Backup step: archive the dbs directory under $tmpdir as learned-all.tar.
# The subshell keeps the directory change from affecting the caller.
# NOTE(review): this and the restore step below are presumably the bodies of
# helper functions (restore_dbs is called later in the script) whose headers
# are not visible in this view -- confirm.
echo "Backing up full learned DBs..."
71
( cd $tmpdir; tar cvf learned-all.tar dbs )
74
# Restore step: throw away the current dbs directory and re-extract it
# from learned-all.tar.
echo "Restoring full learned DBs..."
75
( cd $tmpdir; rm -rf dbs; tar xf learned-all.tar )
78
# Learn-everything-first strategy, taken only when the switch is 1.
# NOTE(review): the closing fi and the opening of the subshell that is
# closed further down are not visible in this view.
if [ $LEARN_ALL_THEN_FORGET_TEST_SET = 1 ] ; then
80
# learn the lot, then forget the ones we are testing on each time.
81
# faster than learning from scratch for each fold
83
# note: we use randseed=1 so that every run will always pick the
84
# same messages if --learnprob is used.
# Both learn passes below use --no-sync; one explicit --sync pass follows.
87
echo -n "Learning from all ham buckets..." ; date
88
time sa-learn --ham --randseed=1 --no-sync $learnargs \
89
--showdots --mbox --config-file=$tmpdir/rules $testdir/cor/ham/*
91
echo -n "Learning from all spam buckets..." ; date
92
time sa-learn --spam --randseed=1 --no-sync $learnargs \
93
--showdots --mbox --config-file=$tmpdir/rules $testdir/cor/spam/*
95
time sa-learn --sync $learnargs --config-file=$tmpdir/rules
97
echo -n "Done learning. " ; date
98
# End of the learning subshell: stdout and stderr are merged, shown on the
# terminal and also saved to learn.log.
) 2>&1 | tee $results/learn.log
100
# Save the output of tools/check_bayes_db, run from the parent directory
# against the freshly learned database, as bayes_db.dump.
echo "Dumping bayes DB..."
101
( cd .. ; tools/check_bayes_db --dbpath=$tmpdir/dbs/bayes ) \
102
> $results/bayes_db.dump
110
# Main cross-validation run: one pass per bucket, 1 through 10. Each pass
# assembles learn and test mailboxes, updates the Bayes database, runs
# mass-check on the test mail and analyses the resulting logs.
# NOTE(review): several structural lines of this loop (else, fi, done, the
# subshell opener and some continuation lines) are not visible in this view,
# and the bare numeric lines look like extraction residue -- confirm against
# the upstream script before changing any code here.
echo -n "Starting test..." ; date
111
for bucket in 1 2 3 4 5 6 7 8 9 10 ; do
112
echo -n "Bucket $bucket..." ; date
114
# Every pass after the first begins by calling restore_dbs, which is
# defined outside this view.
if [ $bucket != 1 ] ; then restore_dbs ; fi
116
# Per-pass output directory.
rdir=$results/bucket$bucket
119
# Create or truncate the four per-pass mailbox files (h = ham, s = spam).
: > $rdir/hbucketlearn
120
: > $rdir/sbucketlearn
121
: > $rdir/hbuckettest
122
: > $rdir/sbuckettest
123
# Sort each of the ten corpus buckets into the learn set or the test set.
for subbucket in 1 2 3 4 5 6 7 8 9 10 ; do
125
# type becomes t (test) for the bucket matching this pass when
# TEST_AGAINST_10PC is 1, or for every other bucket when it is 0.
# NOTE(review): the default assignment of type is not visible here;
# presumably it is set to l (learn) just before these tests -- confirm.
[ $TEST_AGAINST_10PC = 1 -a $subbucket = $bucket ] && type=t
126
[ $TEST_AGAINST_10PC = 0 -a $subbucket != $bucket ] && type=t
128
# Learn buckets are appended to the learn mailboxes.
if [ $type = l ] ; then
129
echo "Using bucket for learn: $subbucket ..."
130
cat $testdir/cor/ham/bucket.$subbucket >> $rdir/hbucketlearn
131
cat $testdir/cor/spam/bucket.$subbucket >> $rdir/sbucketlearn
133
# Test buckets are appended to the test mailboxes.
# NOTE(review): the else that separates this from the learn branch is not
# visible in this view.
echo "Using bucket for test: $subbucket ..."
134
cat $testdir/cor/ham/bucket.$subbucket >> $rdir/hbuckettest
135
cat $testdir/cor/spam/bucket.$subbucket >> $rdir/sbuckettest
139
# Database update: with the learn-everything strategy the test mailboxes
# are forgotten; the commands after that learn the learn mailboxes instead.
# NOTE(review): the else between the two alternatives is not visible here.
if [ $LEARN_ALL_THEN_FORGET_TEST_SET = 1 ] ; then
140
echo "Forgetting contents of test ham bucket..."
141
time sa-learn --forget --config-file=$tmpdir/rules --showdots \
142
--mbox $rdir/hbuckettest
144
echo "Forgetting contents of test spam bucket..."
145
time sa-learn --forget --config-file=$tmpdir/rules --showdots \
146
--mbox $rdir/sbuckettest
149
echo "Learning contents of learn ham bucket..."
150
time sa-learn --ham --randseed=1 --no-sync $learnargs \
151
--showdots --mbox --config-file=$tmpdir/rules $rdir/hbucketlearn
153
echo "Learning contents of learn spam bucket..."
154
time sa-learn --spam --randseed=1 --no-sync $learnargs \
155
--showdots --mbox --config-file=$tmpdir/rules $rdir/sbucketlearn
157
time sa-learn --sync $learnargs --config-file=$tmpdir/rules
159
# Keep a per-pass dump of the database next to the other pass results.
echo "Dumping bayes DB..."
160
( cd .. ; sa-learn --dump --dbpath=$tmpdir/dbs/bayes ) \
161
> $rdir/bayes_db.dump
164
# One more sync, this time without $learnargs, before testing starts.
time sa-learn --sync --config-file=$tmpdir/rules
166
if [ $INTERLEAVE_TESTS = 1 ] ; then
167
# now split the ham and spam test bucket into 10 sub-buckets,
168
# so we interleave ham and spam while testing. important for
169
# judging expiry effects
170
# Start with an empty nonspam.log.
: > $rdir/nonspam.log
173
mkdir $rdir/testbuckets
176
# NOTE(review): the remaining arguments of both split_corpora calls sit on
# continuation lines that are not visible in this view.
tools/split_corpora -n 10 -p $rdir/testbuckets/ham \
178
tools/split_corpora -n 10 -p $rdir/testbuckets/spam \
182
# Alternate between the ham and spam test sub-buckets, one pair per step.
# NOTE(review): the output redirections of these mass-check calls are on
# continuation lines that are not visible in this view.
for subbucket in 1 2 3 4 5 6 7 8 9 10 ; do
183
echo "Running mass-check on ham test-bucket $subbucket..."
184
time ./mass-check -c=$tmpdir/rules -p=$tmpdir/rules --showdots \
185
--bayes --mbox $rdir/testbuckets/ham.$subbucket \
188
echo "Running mass-check on spam test-bucket $subbucket..."
189
time ./mass-check -c=$tmpdir/rules -p=$tmpdir/rules --showdots \
190
--bayes --mbox $rdir/testbuckets/spam.$subbucket \
195
# Presumably the non-interleaved alternative: whole ham test mailbox first,
# then the whole spam test mailbox -- the separating else is not visible.
echo "Running mass-check on ham bucket..."
196
time ./mass-check -c=$tmpdir/rules -p=$tmpdir/rules --showdots \
197
--bayes --mbox $rdir/hbuckettest \
200
echo "Running mass-check on spam bucket..."
201
time ./mass-check -c=$tmpdir/rules -p=$tmpdir/rules --showdots \
202
--bayes --mbox $rdir/sbuckettest \
207
# Per-pass analysis: three scripts are each given this pass spam.log and
# nonspam.log. Only the last output target (thresholds.static) is visible.
./bayes-testing/draw-bayes-histogram \
208
$rdir/spam.log $rdir/nonspam.log \
211
./bayes-testing/bayes-thresholds \
212
$rdir/spam.log $rdir/nonspam.log \
215
./bayes-testing/bayes-static-thresholds \
216
$rdir/spam.log $rdir/nonspam.log \
217
> $rdir/thresholds.static
219
# remove these, they are too big.
220
rm -f $rdir/hbucketlearn $rdir/sbucketlearn $rdir/hbuckettest $rdir/sbuckettest
223
echo -n "Done test..." ; date
225
# End of the testing subshell: stdout and stderr are merged, shown on the
# terminal and also saved to test.log.
) 2>&1 | tee $results/test.log
227
# Final aggregation: concatenate the spam.log and nonspam.log files of all
# bucket directories (in glob expansion order) into two combined logs.
cat $results/bucket*/spam.log > $results/spam_all.log
228
cat $results/bucket*/nonspam.log > $results/nonspam_all.log
230
# Run the three analysis scripts on the combined logs. The visible output
# targets are thresholds_all and thresholds.static.
# NOTE(review): the redirect target of the histogram command is on a line
# that is not visible in this view; the file header mentions hist_all.
./bayes-testing/draw-bayes-histogram \
231
$results/spam_all.log $results/nonspam_all.log \
233
./bayes-testing/bayes-thresholds \
234
$results/spam_all.log $results/nonspam_all.log \
235
> $results/thresholds_all
236
./bayes-testing/bayes-static-thresholds \
237
$results/spam_all.log $results/nonspam_all.log \
238
> $results/thresholds.static