3
### Train spambayes on all previously-untrained messages in a mailbox.
5
### This keeps track of messages it's already trained by adding an
6
### X-Spambayes-Trained: header to each one. Then, if you move one to
7
### another folder, it will retrain that message. You would want to run
8
### this from a cron job on your server.
10
"""Usage: %(program)s [OPTIONS] ...
12
Where OPTIONS is one or more of:
16
use the DBM store. A DBM file is larger than the pickle and
17
creating it is slower, but loading it is much faster,
18
especially for large word databases. Recommended for use with
19
sb_filter or any procmail-based filter.
21
use the pickle store. A pickle is smaller and faster to create,
22
but much slower to load. Recommended for use with sb_server and
25
mbox or directory of known good messages (non-spam) to train on.
26
Can be specified more than once.
28
mbox or directory of known spam messages to train on.
29
Can be specified more than once.
31
force training, ignoring the trained header. Use this if you
32
need to rebuild your database from scratch.
36
-n train mail residing in "new" directory, in addition to "cur"
37
directory, which is always trained (Maildir only)
39
-r remove mail which was trained on (Maildir only)
41
-o section:option:value
42
set [section, option] in the options database to value
48
# Maintain compatibility with Python 2.2
51
import sys, os, getopt, email
53
from spambayes import hammie, storage, mboxutils
54
from spambayes.Options import options, get_pathname_option
60
"""Return an email Message object.
62
This works like mboxutils.get_message, except it doesn't junk the
63
headers if there's an error. Doing so would cause a headerless
64
message to be written back out!
68
if isinstance(obj, email.Message.Message):
70
# Create an email Message object.
71
if hasattr(obj, "read"):
74
msg = email.message_from_string(obj)
75
except email.Errors.MessageParseError:
79
def msg_train(h, msg, is_spam, force):
80
"""Train bayes with a single message."""
# h is the trainer object (its train/untrain methods are called below),
# msg the message to train on, is_spam the classification to train it as.
# Callers test the result for truth, so this presumably returns True when
# the message was trained and False when it was skipped -- the return
# statements themselves are not visible in this listing.
82
# XXX: big hack -- why is email.Message unable to represent
83
# multipart/alternative?
85
# Serialise the message once purely as a probe; the result is discarded.
mboxutils.as_string(msg)
87
# We'll be unable to represent this as text :(
91
# spamtxt is the marker value that will be stored in the trained header:
# the configured spam string or the configured ham string, presumably
# selected by is_spam (the selecting condition is not visible here).
spamtxt = options["Headers", "header_spam_string"]
93
spamtxt = options["Headers", "header_ham_string"]
94
# Marker left on the message by an earlier training run, if any.
oldtxt = msg.get(options["Headers", "trained_header_name"])
96
# Train no matter what.
98
# Drop the stale marker so the header is not duplicated when re-added.
del msg[options["Headers", "trained_header_name"]]
99
elif oldtxt == spamtxt:
100
# Skip this one, we've already trained with it.
103
# It's been trained, but as something else. Untrain.
104
del msg[options["Headers", "trained_header_name"]]
105
# "not is_spam": undo the earlier training under the opposite class.
h.untrain(msg, not is_spam)
106
h.train(msg, is_spam)
107
# Record the new classification on the message itself so that a later run
# can recognise it as already trained.
msg.add_header(options["Headers", "trained_header_name"], spamtxt)
111
def maildir_train(h, path, is_spam, force, removetrained):
112
"""Train bayes with all messages from a maildir."""
# path is the directory whose entries are listed and trained (the caller
# passes the "cur" or "new" subdirectory); h, is_spam and force are handed
# straight to msg_train.  removetrained corresponds to the -r option in the
# usage text ("remove mail which was trained on"); the code acting on it is
# not visible in this listing.
114
if loud: print " Reading %s as Maildir" % (path,)
120
host = socket.gethostname()
124
for fn in os.listdir(path):
125
# cfn: the message file being processed.
cfn = os.path.join(path, fn)
126
# tfn: a scratch file in the sibling "tmp" directory (path/../tmp).  Its
# name starts with the current time and pid; the remaining format
# arguments are cut off here -- presumably the counter and host.
tfn = os.path.normpath(os.path.join(path, "..", "tmp",
127
"%d.%d_%d.%s" % (time.time(), pid,
129
# Directories inside the maildir are detected here; presumably they are
# skipped (the branch body is not visible).
if (os.path.isdir(cfn)):
132
# Progress display: every 10th message, overwrite the running count in
# place using a carriage return.
if loud and counter % 10 == 0:
133
sys.stdout.write("\r%6d" % counter)
139
# A message that cannot be parsed is reported and skipped, not fatal.
print "Malformed message: %s. Skipping..." % cfn
141
# Branch taken when msg_train reports that nothing was trained.
if not msg_train(h, msg, is_spam, force):
144
# With the include_trained option off there is presumably nothing to write
# back, since the trained header is not wanted on disk -- TODO confirm
# against the missing branch body.
if not options["Headers", "include_trained"]:
147
# Write the message, as modified by msg_train, to f (presumably opened on
# tfn; the open call is not visible).
f.write(mboxutils.as_string(msg))
149
# Carry the original file's permission bits and timestamps over to the
# rewritten copy.
shutil.copystat(cfn, tfn)
151
# XXX: This will raise an exception on Windows. Do any Windows
152
# people actually use Maildirs?
158
sys.stdout.write("\r%6d" % counter)
159
# Final summary line; the format arguments are on a continuation line that
# is not visible here.
sys.stdout.write("\r Trained %d out of %d messages\n" %
162
def mbox_train(h, path, is_spam, force):
163
"""Train bayes with a Unix mbox"""
165
if loud: print " Reading as Unix mbox"
170
# Open and lock the mailbox. Some systems require it be opened for
171
# writes in order to assert an exclusive lock.
172
f = file(path, "r+b")
173
fcntl.flock(f, fcntl.LOCK_EX)
174
mbox = mailbox.PortableUnixMailbox(f, get_message)
182
print "Malformed message number %d. I can't train on this mbox, sorry." % counter
185
if loud and counter % 10 == 0:
186
sys.stdout.write("\r%6d" % counter)
188
if msg_train(h, msg, is_spam, force):
190
if options["Headers", "include_trained"]:
191
# Write it out with the Unix "From " line
192
outf.write(mboxutils.as_string(msg, True))
194
if options["Headers", "include_trained"]:
197
os.ftruncate(f.fileno(), 0)
200
# If anything goes wrong, don't try to write
201
print "Problem truncating mbox--nothing written"
204
for line in outf.xreadlines():
207
print >> sys.stderr ("Problem writing mbox! Sorry, "
208
"I tried my best, but your mail "
212
fcntl.flock(f, fcntl.LOCK_UN)
215
sys.stdout.write("\r%6d" % counter)
216
sys.stdout.write("\r Trained %d out of %d messages\n" %
219
def mhdir_train(h, path, is_spam, force):
220
"""Train bayes with an mh directory"""
# path is the MH mailbox directory; h, is_spam and force are handed to
# msg_train for every message file found in it.
222
if loud: print " Reading as MH mailbox"
229
# Only entries whose names begin with a digit are treated as messages.
for fn in glob.glob(os.path.join(path, "[0-9]*")):
233
# A single fixed scratch file inside the mailbox directory is used for the
# rewritten copy of each message.
tfn = os.path.join(path, "spambayes.tmp")
234
# Progress display: every 10th message, overwrite the running count in
# place using a carriage return.
if loud and counter % 10 == 0:
235
sys.stdout.write("\r%6d" % counter)
241
# A message that cannot be parsed is reported and skipped, not fatal.
print "Malformed message: %s. Skipping..." % cfn
243
# NOTE(review): the result of msg_train is discarded here, whereas
# maildir_train and mbox_train branch on it.  Confirm that the "Trained"
# count reported below is not inflated by messages msg_train skipped.
msg_train(h, msg, is_spam, force)
245
# With the include_trained option off there is presumably nothing to write
# back -- TODO confirm against the missing branch body.
if not options["Headers", "include_trained"]:
248
# Write the message, as modified by msg_train, to f (presumably opened on
# tfn; the open call is not visible).
f.write(mboxutils.as_string(msg))
250
# Carry the original file's permission bits and timestamps over to the
# rewritten copy.
shutil.copystat(cfn, tfn)
252
# XXX: This will raise an exception on Windows. Do any Windows
253
# people actually use MH directories?
257
sys.stdout.write("\r%6d" % counter)
258
# Final summary line; the format arguments are on a continuation line that
# is not visible here.
sys.stdout.write("\r Trained %d out of %d messages\n" %
261
def train(h, path, is_spam, force, trainnew, removetrained):
# Dispatch to the right trainer based on what path is on disk:
#   regular file                    -> Unix mbox
#   directory with a "cur" subdir   -> Maildir
#   any other directory             -> MH directory
# Raises ValueError when path does not exist or its type cannot be
# determined.
262
if not os.path.exists(path):
263
raise ValueError("Nonexistent path: %s" % path)
264
elif os.path.isfile(path):
265
mbox_train(h, path, is_spam, force)
266
elif os.path.isdir(os.path.join(path, "cur")):
267
# "cur" is always trained (see the -n description in the usage text).
maildir_train(h, os.path.join(path, "cur"), is_spam, force,
270
# "new" is presumably trained only when trainnew is set (the -n option);
# the guarding condition is not visible in this listing.
maildir_train(h, os.path.join(path, "new"), is_spam, force,
272
elif os.path.isdir(path):
273
mhdir_train(h, path, is_spam, force)
275
# Presumably the fall-through branch for anything that is neither a file
# nor a directory (its "else:" is not visible here).
raise ValueError("Unable to determine mailbox type: " + path)
278
def usage(code, msg=''):
279
"""Print usage message and sys.exit(code)."""
# code is the process exit status; msg is an optional explanation printed
# before the usage text.  Everything goes to stderr.  The sys.exit call
# named in the docstring is not visible in this listing.
281
print >> sys.stderr, msg
283
# The module docstring contains a %(program)s placeholder, which is filled
# in from the module's global names.
print >> sys.stderr, __doc__ % globals()
287
"""Main program; parse options and go."""
292
# Flags h, f, q, n and r take no argument; d, p, g, s and o each take one
# (marked by the trailing colon in the option string).
opts, args = getopt.getopt(sys.argv[1:], 'hfqnrd:p:g:s:o:')
293
except getopt.error, msg:
297
# usage() is called with exit status 2 for command-line errors.
usage(2, "No options given")
301
removetrained = False
304
for opt, arg in opts:
320
# Presumably the handler for -o section:option:value; the test on opt is
# not visible in this listing.
options.set_from_cmdline(arg, sys.stderr)
321
# Storage file name and storage type as selected on the command line.
pck, usedb = storage.database_type(opts)
323
usage(2, "Positional arguments not allowed")
326
# Use settings in configuration file.
327
usedb = options["Storage", "persistent_use_database"]
328
pck = get_pathname_option("Storage",
329
"persistent_storage_file")
331
# NOTE(review): "c" looks like a create-if-missing open mode -- confirm
# against hammie.open.
h = hammie.open(pck, usedb, "c")
334
# Ham paths are trained with is_spam=False, spam paths with is_spam=True.
if loud: print "Training ham (%s):" % g
335
train(h, g, False, force, trainnew, removetrained)
340
if loud: print "Training spam (%s):" % s
341
train(h, s, True, force, trainnew, removetrained)
349
if __name__ == "__main__":