3
# Train bogofilter from a ham and spam corpus
5
# Copyright 2003 by Trevor Harrison (trevor-trainbogo@harrison.org)
7
# This file is released under the GPL. See http://www.gnu.org/licenses/gpl.txt
9
# $Id: trainbogo.sh,v 1.6 2004/05/30 20:25:16 m-a Exp $ #
11
# Note: this script has not yet had bogofilter maintainer review.
12
# Security concerned people should not run it if in doubt about its security.
18
echo " trainbogo.sh [options]"
22
echo " Required arguments:"
23
echo " -H hamdir points to directory with all your ham"
24
echo " -S spamdir points to directory with all your spam"
26
echo " Optional arguments:"
27
echo " -s statdir directory where stat and tmp files are created."
28
echo " default is ./stats.tmp"
29
echo " -b pathtobogofilter points to the bogofilter executable,"
30
echo " with any bogofilter options you need."
31
echo " ex. -b \"/usr/local/bin/bogofilter -d /etc/bogodb\""
32
echo " -f force rebuild of ham and spam directory index. Will"
33
echo " cause msgs to be sorted into new order unless"
34
echo " -p and -t are used."
35
echo " -c cleanup statdir when done. (default is not to)"
36
echo " -p rndseed specify the pid.timestamp used to randomize the msgs."
37
echo " ex. -p 5432.1049498805"
38
echo " -m don't test or train bogofilter, just show cached stats."
39
echo " -n don't train bogofilter, just test."
40
echo " -q don't show stats or dots. (quiet)"
49
echo " Train bogofilter from a qmail maildir type ham and spam corpus"
51
echo " This script relies on you having separated your qmail maildir messages into"
52
echo " ham and spam directories. This script randomizes the message order, and"
53
echo " then feeds each message in turn into bogofilter, noting if bogofilter"
54
echo " correctly identified the message as ham or spam. If mis-identified, it"
55
echo " trains bogofilter with that message, and then re-tests to see if bogofilter"
56
echo " correctly identifies the message."
58
echo " When I've used this script on my ham/spam collection, it takes about 4"
59
echo " consecutive executions to get my wordlists to a 0 false positive state."
60
echo " Just because this script reports 0 failed trainings doesn't mean that you"
61
echo " are ready to go. Run the script a second time to make sure. You should"
62
echo " keep running the script until you get 0 misdetections and, of course, 0"
63
echo " retrain failed's."
65
echo " While running, trainbogo.sh will write some dots and dashes to the screen."
67
echo " . = successfully categorized the message."
68
echo " - = failed to categorize the message, and training was turned off (-n)."
69
echo " + = successfully categorized the message after being retrained."
70
echo " f = failed to categorize the message after training."
72
echo " The results of the testing can be found in the statsdir. Log files have"
73
echo " the filename of each message that match the logfile name:"
75
echo " trainbogo.log.[0,1].[success,fail]"
76
echo " 0 = spam message log"
77
echo " 1 = ham message log"
78
echo " success/fail = were/weren't correctly categorized."
85
# Print the arguments only when ${verbose} is non-empty. "$@" is quoted so the
# message is not re-split on whitespace or glob-expanded against the current
# directory (e.g. a message containing "*").
[ -n "${verbose}" ] && echo "$@"
90
# Print the arguments unless ${quiet} is set. "$@" is quoted so the message is
# not re-split on whitespace or glob-expanded against the current directory.
[ -z "${quiet}" ] && echo "$@"
95
[ -z "${quiet}" ] && printf "%s" "$*"
100
verbose "Performing cleanup"
102
# Skip the removals below unless cleanup was requested and the paths are known.
# `||` and `&&` have equal precedence in a shell list and associate left to
# right, so `return` runs when ANY of the three tests succeeds: ${log} is
# empty, ${list} is empty, or ${docleanup} is not "y".
[ -z "${log}" ] || [ -z "${list}" ] || [ "${docleanup}" != "y" ] && return
104
rm -f ${log}.[01].success ${log}.[01].fail \
105
${log}.[01].train.success ${log}.[01].train.fail \
108
# Remove the stats directory only when madestatsdir is "y" and statsdir is
# non-empty. rmdir deletes empty directories only; --ignore-fail-on-non-empty
# (a GNU coreutils option) suppresses the failure if files are still present.
[ "${madestatsdir}" = "y" ] && [ -n "${statsdir}" ] && rmdir --ignore-fail-on-non-empty "${statsdir}"
117
statsdir="${PWD}/stats.tmp/"
118
origstatsdir="${statsdir}"
121
while getopts "H:S:s:b:p:fcmnqvh" optname; do
125
"H") hamdir="$OPTARG" ;;
126
"S") spamdir="$OPTARG" ;;
127
"s") statsdir="$OPTARG" ;;
131
"p") rndseed=$OPTARG ;;
132
"m") dotest= ; dotrain= ;;
141
# Check for required options
142
# -H is mandatory and must name an existing directory. On failure print the
# error and the usage text, then exit non-zero so callers can detect the
# failure (a bare `exit` would report the status of `usage`, normally 0, and
# would be skipped entirely if `usage` returned non-zero).
if [ -z "${hamdir}" ] || [ ! -d "${hamdir}" ]; then
	echo "Missing or bad -H option" >&2
	usage
	exit 1
fi
143
# -S is mandatory and must name an existing directory. On failure print the
# error and the usage text, then exit non-zero so callers can detect the
# failure (a bare `exit` would report the status of `usage`, normally 0, and
# would be skipped entirely if `usage` returned non-zero).
if [ -z "${spamdir}" ] || [ ! -d "${spamdir}" ]; then
	echo "Missing or bad -S option" >&2
	usage
	exit 1
fi
144
# An empty statsdir (e.g. -s '') is rejected: print the error and the usage
# text, then exit non-zero instead of propagating the status of `usage`.
if [ -z "${statsdir}" ]; then
	echo "Bad statsdir option" >&2
	usage
	exit 1
fi
146
# Make the stats dir if it's missing, but only if it's the default stats dir and not user-specified
147
[ "${statsdir}" = "${origstatsdir}" ] && [ ! -d "${statsdir}" ] && mkdir "${statsdir}" && madestatsdir=y
148
# The stats directory must exist by now; report the problem on stderr and exit
# non-zero (a bare `exit` after a successful echo would exit with status 0).
if [ ! -d "${statsdir}" ]; then
	echo "Missing statsdir (-s option)" >&2
	exit 1
fi
150
# check for bogofilter
151
# Resolve the bogofilter executable. ${bf%% *} keeps only the first word of
# ${bf}, dropping any options that follow the program name. `command -v` is the
# POSIX way to look a command up; `which` is an external tool that may be
# missing or return an unreliable exit status. The expansion is quoted so an
# empty name is passed as an empty argument rather than as no argument.
bfbin=$(command -v "${bf%% *}")
152
# $? is the status of the command immediately before this check. Report the
# failure on stderr and exit non-zero rather than with echo's status of 0.
if [ $? -ne 0 ]; then
	echo "Missing bogofilter, not in path? (${bf})" >&2
	exit 1
fi
153
# The resolved path must be an executable file. Report the failure on stderr
# and exit non-zero rather than with echo's status of 0.
if [ ! -x "${bfbin}" ]; then
	echo "Missing or bad bogofilter binary! (${bf})" >&2
	exit 1
fi
155
list="${statsdir}/trainbogo.filenames.txt"
156
log="${statsdir}/trainbogo.log"
159
if [ ! -f "${log}.0.success" ] || [ -n "${dotest}" ] || [ -n "${dotrain}" ] ; then
160
verbose "init log files"
165
>"${log}.0.train.success"
166
>"${log}.0.train.fail"
167
>"${log}.1.train.success"
168
>"${log}.1.train.fail"
171
# First make a randomly sorted list of all the ham and spam files (if needed)
172
if [ ! -f "${list}" ] || [ -n "${dofilelist}" ]; then
173
# MD5 all the spam and ham
175
# Default the shuffle seed to "<pid>.<epoch seconds>" when none was supplied
# with -p.
if [ -z "${rndseed}" ]; then
	rndseed="$$.$(date +%s)"
fi
177
normal "MD5'ing ham and spam corpus, rndseed used: ${rndseed}"
181
for i in "${hamdir}"/* "${spamdir}"/*
183
[ ! -f "${i}" ] && continue
184
md5=$(printf "%s" "${rndseed}${i}" | md5sum | sed "s/ -//")
185
echo "${md5} ${i}" >> "${list}"
188
# Abort when no message files were listed. -s is true only for a file that
# exists and is non-empty, so this also covers the case where nothing was
# appended and the list file does not exist at all; there the old
# `wc -l < file` failed, `[` was left without its left operand, and the check
# was silently skipped. Exit non-zero since nothing can be done.
if [ ! -s "${list}" ]; then
	echo "No files to work on!!!" >&2
	exit 1
fi
190
# This randomizes the file names by sorting on the md5 hash
191
normal "Randomizing ham and spam"
192
sort "${list}" > "${list}.tmp"
193
mv -f "${list}.tmp" "${list}"
196
sed "s/^.\{32\} \(.*\)/\1/" < "${list}" > "${list}.tmp"
197
mv -f "${list}.tmp" "${list}"
199
# Put expected bogofilter error levels in front of each filename
200
# Using @'s for sed's rule delimiter because ${hamdir} can have /'s.
201
# Hopefully there won't be any @'s in the ham/spam dir name.
202
sed "s@^${hamdir}\(.*\)@1 ${hamdir}\\1@g; s@^${spamdir}\(.*\)@0 ${spamdir}\\1@g" < "${list}" > "${list}.tmp"
203
mv -f "${list}.tmp" "${list}"
206
# Read each filename from the filelist and test and train bogofilter.
207
if [ -n "${dotest}" ] || [ -n "${dotrain}" ]; then
208
normal "Training bogofilter"
209
# Each list line is "<expected status> <file name>". -r keeps backslashes in
# the file name literal; without it `read` treats them as escape characters
# and the resulting path no longer matches the file on disk.
(while read -r spamstatus fname
212
# Feed the message to bogofilter on stdin with -v and capture its output.
# ${bf} must stay unquoted: per the -b help text it may hold the executable
# followed by options, which have to be split into separate words.
bogotest=$(${bf} -v < "${fname}")
214
if [ ${spamstatus} -eq ${ret} ]; then # bogofilter detected this message correctly
215
echo "${fname}" >> "${log}.${spamstatus}.success"
220
# Bogofilter failed to detect the msg correctly
221
echo "${fname}" >> "${log}.${spamstatus}.fail"
224
[ -z "${dotrain}" ] && continue
226
# Set the bogofilter option for training
227
if [ ${spamstatus} -eq 0 ]; then
234
${bf} ${bfopt} < "${fname}"
237
bogotest=$(${bf} -v < "${fname}")
240
# Did it train successfully?
241
if [ ${spamstatus} -eq ${ret} ]; then
250
echo "${fname}" >> "${log}.${spamstatus}.train.${testresult}"
257
if [ -z "${quiet}" ]; then
259
total_msg=$(wc -l < "${list}")
261
# Count only regular files, matching the `[ ! -f ... ] && continue` filter used
# when the message list is built. `ls | wc -l` also counted subdirectories and
# miscounted names containing newlines. An empty directory leaves the glob
# unexpanded, which fails the -f test and yields 0.
total_ham_msg=0
for hamfile in "${hamdir}"/*; do
	[ -f "${hamfile}" ] && total_ham_msg=$((total_ham_msg + 1))
done
262
total_ham_success=$(wc -l < "${log}.1.success")
263
total_ham_fail=$(wc -l < "${log}.1.fail")
264
total_ham_train_fail=$(wc -l < "${log}.1.train.fail")
266
# Count only regular files, matching the `[ ! -f ... ] && continue` filter used
# when the message list is built. `ls | wc -l` also counted subdirectories and
# miscounted names containing newlines. An empty directory leaves the glob
# unexpanded, which fails the -f test and yields 0.
total_spam_msg=0
for spamfile in "${spamdir}"/*; do
	[ -f "${spamfile}" ] && total_spam_msg=$((total_spam_msg + 1))
done
267
total_spam_success=$(wc -l < "${log}.0.success")
268
total_spam_fail=$(wc -l < "${log}.0.fail")
269
total_spam_train_fail=$(wc -l < "${log}.0.train.fail")
271
echo "Total messages: ${total_msg}"
273
echo "Total ham: ${total_ham_msg}"
274
echo "Misdetected ham: ${total_ham_fail}"
275
[ -n "${dotrain}" ] && echo " retrain fail: ${total_ham_train_fail}"
277
echo "Total spam: ${total_spam_msg}"
278
echo "Misdetected spam: ${total_spam_fail}"
279
[ -n "${dotrain}" ] && echo " retrain fail: ${total_spam_train_fail}"