~ubuntu-branches/ubuntu/karmic/spambayes/karmic

« back to all changes in this revision

Viewing changes to scripts/sb_mboxtrain.py

Committer: Bazaar Package Importer
Author(s): Jorge Bernal
Date: 2005-04-07 14:02:02 UTC
Revision ID: james.westby@ubuntu.com-20050407140202-mgyh6t7gn2dlrrw5

Tags: upstream-1.0.1

Import upstream version 1.0.1

files added:

CHANGELOG.txt

LICENSE.txt

MANIFEST.in

NEWTRICKS.txt

Outlook2000

Outlook2000/README.txt

Outlook2000/about.html

Outlook2000/addin.py

Outlook2000/config.py

Outlook2000/config_wizard.py

Outlook2000/default_bayes_customize.ini

Outlook2000/dialogs

Outlook2000/dialogs/FolderSelector.py

Outlook2000/dialogs/__init__.py

Outlook2000/dialogs/async_processor.py

Outlook2000/dialogs/dialog_map.py

Outlook2000/dialogs/dlgcore.py

Outlook2000/dialogs/dlgutils.py

Outlook2000/dialogs/opt_processors.py

Outlook2000/dialogs/processors.py

Outlook2000/dialogs/resources

Outlook2000/dialogs/resources/__init__.py

Outlook2000/dialogs/resources/dialogs.h

Outlook2000/dialogs/resources/dialogs.rc

Outlook2000/dialogs/resources/folders.bmp

Outlook2000/dialogs/resources/rc2py.py

Outlook2000/dialogs/resources/rclabels2text.py

Outlook2000/dialogs/resources/rcparser.py

Outlook2000/dialogs/resources/sblogo.bmp

Outlook2000/dialogs/resources/sbwizlogo.bmp

Outlook2000/dialogs/test_dialogs.py

Outlook2000/dialogs/wizard_processors.py

Outlook2000/docs

Outlook2000/docs/configuration.html

Outlook2000/docs/images

Outlook2000/docs/images/field_chooser_after.jpg

Outlook2000/docs/images/field_chooser_new_field.jpg

Outlook2000/docs/images/manager-select.jpg

Outlook2000/docs/images/manager.jpg

Outlook2000/docs/images/python.jpg

Outlook2000/docs/images/sblogo.jpg

Outlook2000/docs/images/span.jpg

Outlook2000/docs/images/training.jpg

Outlook2000/docs/troubleshooting.html

Outlook2000/docs/welcome.html

Outlook2000/export.py

Outlook2000/filter.py

Outlook2000/images

Outlook2000/images/delete_as_spam.bmp

Outlook2000/images/recover_ham.bmp

Outlook2000/installer

Outlook2000/installer/README.txt

Outlook2000/installer/crank.py

Outlook2000/installer/installation_notes.rtf

Outlook2000/installer/spambayes_addin.iss

Outlook2000/installer/spambayes_addin.py

Outlook2000/installer/spambayes_addin.spec

Outlook2000/manager.py

Outlook2000/msgstore.py

Outlook2000/oastats.py

Outlook2000/sandbox

Outlook2000/sandbox/delete_outlook_field.py

Outlook2000/sandbox/dump_profiles.py

Outlook2000/sandbox/dump_props.py

Outlook2000/sandbox/extract_bad_msg_from_log.py

Outlook2000/sandbox/extract_prop.py

Outlook2000/sandbox/find_dupe_props.py

Outlook2000/sandbox/mapi_driver.py

Outlook2000/sandbox/set_read_flag.py

Outlook2000/tester.py

Outlook2000/train.py

PKG-INFO

POP3PROXY.txt

README-DEVEL.txt

README.txt

TESTING.txt

WHAT_IS_NEW.txt

contrib

contrib/BULK.txt

contrib/SmarterHTTPServer.py

contrib/bulkgraph.py

contrib/bulktrain.sh

contrib/findbest.py

contrib/mod_spambayes.py

contrib/muttrc

contrib/nway.py

contrib/procmailrc

contrib/spambayes.el

contrib/spamcounts.py

contrib/tte.py

pspam

pspam/README.txt

pspam/pop.py

pspam/pspam

pspam/pspam/__init__.py

pspam/pspam/database.py

pspam/pspam/folder.py

pspam/pspam/message.py

pspam/pspam/profile.py

pspam/scoremsg.py

pspam/update.py

pspam/vmspam.ini

pspam/zeo.sh

runtest.sh

scripts

scripts/README.txt

scripts/sb_bnfilter.py

scripts/sb_bnserver.py

scripts/sb_chkopts.py

scripts/sb_client.py

scripts/sb_dbexpimp.py

scripts/sb_evoscore.py

scripts/sb_filter.py

scripts/sb_imapfilter.py

scripts/sb_mailsort.py

scripts/sb_mboxtrain.py

scripts/sb_notesfilter.py

scripts/sb_pop3dnd.py

scripts/sb_server.py

scripts/sb_unheader.py

scripts/sb_upload.py

scripts/sb_xmlrpcserver.py

setup.py

spambayes

spambayes/Corpus.py

spambayes/CostCounter.py

spambayes/Dibbler.py

spambayes/FileCorpus.py

spambayes/Histogram.py

spambayes/ImapUI.py

spambayes/Options.py

spambayes/OptionsClass.py

spambayes/ProxyUI.py

spambayes/PyMeldLite.py

spambayes/ServerUI.py

spambayes/Stats.py

spambayes/TestDriver.py

spambayes/TestToolsUI.py

spambayes/Tester.py

spambayes/UserInterface.py

spambayes/Version.py

spambayes/__init__.py

spambayes/cdb.py

spambayes/cdb_classifier.py

spambayes/chi2.py

spambayes/classifier.py

spambayes/compatcsv.py

spambayes/compatheapq.py

spambayes/compatsets.py

spambayes/dbmstorage.py

spambayes/hammie.py

spambayes/hammiebulk.py

spambayes/mboxutils.py

spambayes/message.py

spambayes/msgs.py

spambayes/oe_mailbox.py

spambayes/optimize.py

spambayes/resources

spambayes/resources/__init__.py

spambayes/resources/classify.gif

spambayes/resources/classify_gif.py

spambayes/resources/config.gif

spambayes/resources/config_gif.py

spambayes/resources/helmet.gif

spambayes/resources/helmet_gif.py

spambayes/resources/help.gif

spambayes/resources/help_gif.py

spambayes/resources/message.gif

spambayes/resources/message_gif.py

spambayes/resources/query.gif

spambayes/resources/query_gif.py

spambayes/resources/scanning__init__.py

spambayes/resources/status.gif

spambayes/resources/status_gif.py

spambayes/resources/train.gif

spambayes/resources/train_gif.py

spambayes/resources/ui.html

spambayes/resources/ui.psp

spambayes/resources/ui_html.py

spambayes/resources/ui_psp.py

spambayes/smtpproxy.py

spambayes/storage.py

spambayes/test

spambayes/test/README.txt

spambayes/test/sb_test_support.py

spambayes/test/test_programs.py

spambayes/test/test_sb-server.py

spambayes/test/test_smtpproxy.py

spambayes/test/test_storage.py

spambayes/tokenizer.py

testtools

testtools/cmp.py

testtools/dotest.sh

testtools/es2hs.py

testtools/fpfn.py

testtools/incremental.HOWTO.txt

testtools/incremental.TODO.txt

testtools/incremental.py

testtools/mboxtest.py

testtools/mkgraph.py

testtools/mksets.py

testtools/rates.py

testtools/regimes.py

testtools/simplexloop.py

testtools/sort+group.py

testtools/table.py

testtools/timcv.py

testtools/timtest.py

testtools/weaktest.py

utilities

utilities/HistToGNU.py

utilities/convert_config_file.py

utilities/dump_cdb.py

utilities/extractmessages.py

utilities/hammer.py

utilities/loosecksum.py

utilities/mboxcount.py

utilities/mkreversemap.py

utilities/pop3graph.py

utilities/rebal.py

utilities/split.py

utilities/splitn.py

utilities/splitndirs.py

utilities/which_database.py

windows

windows/README.txt

windows/autoconfigure.py

windows/docs

windows/docs/troubleshooting.html

windows/pop3proxy_service.py

windows/pop3proxy_tray.py

windows/py2exe

windows/py2exe/README.txt

windows/py2exe/setup_all.py

windows/readme_proxy.html

windows/resources

windows/resources/dialogs.h

windows/resources/dialogs.rc

windows/resources/sb-started.ico

windows/resources/sb-stopped.ico

windows/resources/sbicon.ico

windows/spambayes.iss

Show diffs side-by-side

added added

removed removed

scripts/sb_mboxtrain.py

#! /usr/bin/env python

### Train spambayes on all previously-untrained messages in a mailbox.

###

### This keeps track of messages it's already trained by adding an

### X-Spambayes-Trained: header to each one. Then, if you move one to

### another folder, it will retrain that message. You would want to run

### this from a cron job on your server.

"""Usage: %(program)s [OPTIONS] ...

Where OPTIONS is one or more of:

-h

show usage and exit

-d DBNAME

use the DBM store. A DBM file is larger than the pickle and

creating it is slower, but loading it is much faster,

especially for large word databases. Recommended for use with

sb_filter or any procmail-based filter.

-p DBNAME

use the pickle store. A pickle is smaller and faster to create,

but much slower to load. Recommended for use with sb_server and

sb_xmlrpcserver.

-g PATH

mbox or directory of known good messages (non-spam) to train on.

Can be specified more than once.

-s PATH

mbox or directory of known spam messages to train on.

Can be specified more than once.

-f

force training, ignoring the trained header. Use this if you

need to rebuild your database from scratch.

-q

quiet mode; no output

-n train mail residing in "new" directory, in addition to "cur"

directory, which is always trained (Maildir only)

-r remove mail which was trained on (Maildir only)

-o section:option:value

set [section, option] in the options database to value

"""

# Probe for the boolean built-ins: evaluating the names raises NameError
# on interpreters that do not define them.
try:
    True, False
except NameError:
    # Maintain compatibility with Python 2.2
    True, False = 1, 0

import sys, os, getopt, email

import shutil

from spambayes import hammie, storage, mboxutils

from spambayes.Options import options, get_pathname_option

# Name the script was invoked as; usage() substitutes it into the
# module docstring through the %(program)s placeholder.
program = sys.argv[0]

# Progress/status output switch.  On by default; main() turns it off
# when the -q option is given.
loud = True

def get_message(obj):
    """Coerce obj into an email Message object, or None if unparseable.

    obj may already be a Message (returned unchanged), an object with a
    read() method (its contents are read first), or the raw message
    text itself.

    This differs from mboxutils.get_message in that a parse error does
    not result in the headers being thrown away: the callers write the
    message back out, and a headerless message must never be written.
    A MessageParseError is reported by returning None instead.
    """
    if isinstance(obj, email.Message.Message):
        # Already parsed; hand it straight back.
        return obj

    # Reduce file-like input to its text before parsing.
    text = obj
    if hasattr(text, "read"):
        text = text.read()

    try:
        return email.message_from_string(text)
    except email.Errors.MessageParseError:
        return None

def msg_train(h, msg, is_spam, force):
    """Train the classifier on one message and stamp it as trained.

    h         -- trainer object; its train() and untrain() are called
    msg       -- the message to train on; its trained header is updated
    is_spam   -- true to train as spam, false to train as ham
    force     -- true to train even if the message is already stamped

    Returns True when the message was trained, False when it was
    skipped (it cannot be rendered as text, or it already carries the
    stamp for the requested class and force is off).
    """
    # XXX: big hack -- why is email.Message unable to represent
    # multipart/alternative?
    try:
        mboxutils.as_string(msg)
    except TypeError:
        # The message cannot be turned back into text, so leave it be.
        return False

    header = options["Headers", "trained_header_name"]
    if is_spam:
        wanted = options["Headers", "header_spam_string"]
    else:
        wanted = options["Headers", "header_ham_string"]
    previous = msg.get(header)

    if force:
        # Unconditional training: just discard any earlier stamp.
        if previous is not None:
            del msg[header]
    else:
        if previous == wanted:
            # Already trained as this class; nothing to do.
            return False
        if previous is not None:
            # Stamped as the opposite class: undo that training first.
            del msg[header]
            h.untrain(msg, not is_spam)

    h.train(msg, is_spam)
    msg.add_header(header, wanted)
    return True

def maildir_train(h, path, is_spam, force, removetrained):
    """Train bayes with all messages from a maildir.

    h             -- trainer object, passed through to msg_train()
    path          -- directory holding the message files (train() passes
                     the Maildir's "cur" or "new" subdirectory)
    is_spam       -- true to train as spam, false to train as ham
    force         -- true to retrain messages that are already stamped
    removetrained -- true to delete every message that was trained on

    Subdirectories of path are ignored and malformed messages are
    skipped with a notice.  A trained message is deleted when
    removetrained is set; otherwise, if the "include_trained" header
    option is on, it is rewritten in place (through a temporary file in
    the sibling "tmp" directory) so the trained header is saved.
    """
    if loud:
        sys.stdout.write(" Reading %s as Maildir\n" % (path,))

    import time
    import socket

    pid = os.getpid()
    host = socket.gethostname()
    counter = 0
    trained = 0

    for fn in os.listdir(path):
        cfn = os.path.join(path, fn)
        if os.path.isdir(cfn):
            continue
        counter += 1
        if loud and counter % 10 == 0:
            sys.stdout.write("\r%6d" % counter)
            sys.stdout.flush()

        # Close the file even if parsing blows up.
        f = open(cfn, "rb")
        try:
            msg = get_message(f)
        finally:
            f.close()
        if not msg:
            sys.stdout.write("Malformed message: %s. Skipping...\n" % cfn)
            continue
        if not msg_train(h, msg, is_spam, force):
            continue
        trained += 1

        if removetrained:
            # Honour -r whether or not trained headers are being written;
            # rewriting a file that is about to be deleted is pointless.
            os.unlink(cfn)
            continue
        if not options["Headers", "include_trained"]:
            continue

        # Write the stamped message to a uniquely named temporary file
        # and move it over the original.
        tfn = os.path.normpath(os.path.join(path, "..", "tmp",
                                            "%d.%d_%d.%s" % (time.time(), pid,
                                                             counter, host)))
        f = open(tfn, "wb")
        try:
            f.write(mboxutils.as_string(msg))
        finally:
            f.close()
        shutil.copystat(cfn, tfn)

        # XXX: This will raise an exception on Windows.  Do any Windows
        # people actually use Maildirs?
        os.rename(tfn, cfn)

    if loud:
        sys.stdout.write("\r%6d" % counter)
        sys.stdout.write("\r Trained %d out of %d messages\n" %
                         (trained, counter))

def mbox_train(h, path, is_spam, force):
    """Train bayes with a Unix mbox.

    h       -- trainer object, passed through to msg_train()
    path    -- path of the mbox file
    is_spam -- true to train as spam, false to train as ham
    force   -- true to retrain messages that are already stamped

    The mbox is opened read/write and held under an exclusive flock for
    the whole run.  If the "include_trained" header option is on, every
    message is copied to a temporary file as it is processed and the
    mbox is then truncated and rewritten from that copy.  A malformed
    message aborts the run before anything is written.  The lock and
    both files are released on every exit path, including errors.
    """
    if loud:
        sys.stdout.write(" Reading as Unix mbox\n")

    import mailbox
    import fcntl

    counter = 0
    trained = 0

    # Open and lock the mailbox.  Some systems require it be opened for
    # writes in order to assert an exclusive lock.
    f = open(path, "r+b")
    try:
        fcntl.flock(f, fcntl.LOCK_EX)
        mbox = mailbox.PortableUnixMailbox(f, get_message)

        outf = os.tmpfile()
        try:
            for msg in mbox:
                if not msg:
                    sys.stdout.write("Malformed message number %d. I can't "
                                     "train on this mbox, sorry.\n" % counter)
                    return
                counter += 1
                if loud and counter % 10 == 0:
                    sys.stdout.write("\r%6d" % counter)
                    sys.stdout.flush()
                if msg_train(h, msg, is_spam, force):
                    trained += 1
                if options["Headers", "include_trained"]:
                    # Write it out with the Unix "From " line
                    outf.write(mboxutils.as_string(msg, True))

            if options["Headers", "include_trained"]:
                outf.seek(0)
                try:
                    os.ftruncate(f.fileno(), 0)
                    f.seek(0)
                except:
                    # If anything goes wrong, don't try to write
                    sys.stdout.write(
                        "Problem truncating mbox--nothing written\n")
                    raise
                try:
                    for line in outf:
                        f.write(line)
                except:
                    # Warn on stderr, then let the real error propagate.
                    sys.stderr.write("Problem writing mbox! Sorry, "
                                     "I tried my best, but your mail "
                                     "may be corrupted.\n")
                    raise
        finally:
            outf.close()
    finally:
        # Always give up the lock and the file handle, even on the
        # malformed-message return and on errors.
        try:
            fcntl.flock(f, fcntl.LOCK_UN)
        finally:
            f.close()

    if loud:
        sys.stdout.write("\r%6d" % counter)
        sys.stdout.write("\r Trained %d out of %d messages\n" %
                         (trained, counter))

def mhdir_train(h, path, is_spam, force):
    """Train bayes with an mh directory.

    h       -- trainer object, passed through to msg_train()
    path    -- the MH directory; files whose names start with a digit
               are treated as messages
    is_spam -- true to train as spam, false to train as ham
    force   -- true to retrain messages that are already stamped

    Malformed messages are skipped with a notice.  Only messages that
    msg_train() actually trained are counted and, if the
    "include_trained" header option is on, rewritten in place (through
    the temporary file "spambayes.tmp" inside path).
    """
    if loud:
        sys.stdout.write(" Reading as MH mailbox\n")

    import glob

    counter = 0
    trained = 0
    # The scratch file name is the same for every message.
    tfn = os.path.join(path, "spambayes.tmp")

    for cfn in glob.glob(os.path.join(path, "[0-9]*")):
        counter += 1
        if loud and counter % 10 == 0:
            sys.stdout.write("\r%6d" % counter)
            sys.stdout.flush()

        # Close the file even if parsing blows up.
        f = open(cfn, "rb")
        try:
            msg = get_message(f)
        finally:
            f.close()
        if not msg:
            sys.stdout.write("Malformed message: %s. Skipping...\n" % cfn)
            continue

        # Skipped messages (already trained, or not representable as
        # text) must be neither counted nor rewritten.
        if not msg_train(h, msg, is_spam, force):
            continue
        trained += 1
        if not options["Headers", "include_trained"]:
            continue

        f = open(tfn, "wb")
        try:
            f.write(mboxutils.as_string(msg))
        finally:
            f.close()
        shutil.copystat(cfn, tfn)

        # XXX: This will raise an exception on Windows.  Do any Windows
        # people actually use MH directories?
        os.rename(tfn, cfn)

    if loud:
        sys.stdout.write("\r%6d" % counter)
        sys.stdout.write("\r Trained %d out of %d messages\n" %
                         (trained, counter))

def train(h, path, is_spam, force, trainnew, removetrained):
    """Train on the mailbox at path, picking the trainer by its layout.

    A regular file is handled as a Unix mbox, a directory containing a
    "cur" subdirectory as a Maildir (its "new" subdirectory is trained
    as well when trainnew is set), and any other directory as an MH
    mailbox.  removetrained is only passed on to the Maildir trainer.

    Raises ValueError if path does not exist or matches none of these.
    """
    if not os.path.exists(path):
        raise ValueError("Nonexistent path: %s" % path)

    if os.path.isfile(path):
        mbox_train(h, path, is_spam, force)
        return

    curdir = os.path.join(path, "cur")
    if os.path.isdir(curdir):
        maildir_train(h, curdir, is_spam, force, removetrained)
        if trainnew:
            newdir = os.path.join(path, "new")
            maildir_train(h, newdir, is_spam, force, removetrained)
        return

    if os.path.isdir(path):
        mhdir_train(h, path, is_spam, force)
        return

    raise ValueError("Unable to determine mailbox type: " + path)

def usage(code, msg=''):
    """Write the usage text to stderr and exit with status code.

    When msg is non-empty it is written first, followed by a blank
    line.  The usage text is the module docstring with the module
    globals (e.g. program) substituted into it.
    """
    err = sys.stderr
    if msg:
        err.write("%s\n" % (msg,))
        err.write("\n")
    err.write(__doc__ % globals() + "\n")
    sys.exit(code)

def main():
    """Main program; parse options and go.

    Parses the command line (see the module docstring for the options),
    opens the classifier database and trains on every -g (ham) and -s
    (spam) path in the order ham first, then spam.  The database is
    stored only if at least one path was trained on.  Bad options,
    an empty option list or positional arguments end in usage().
    """
    global loud

    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hfqnrd:p:g:s:o:')
    except getopt.error:
        # Fetch the exception through sys.exc_info() so the clause is
        # valid in both the old and the new "except" syntaxes.
        usage(2, sys.exc_info()[1])

    if not opts:
        usage(2, "No options given")

    force = False
    trainnew = False
    removetrained = False
    good = []
    spam = []
    for opt, arg in opts:
        if opt == '-h':
            usage(0)
        elif opt == "-f":
            force = True
        elif opt == "-n":
            trainnew = True
        elif opt == "-q":
            loud = False
        elif opt == '-g':
            good.append(arg)
        elif opt == '-s':
            spam.append(arg)
        elif opt == "-r":
            removetrained = True
        elif opt == '-o':
            options.set_from_cmdline(arg, sys.stderr)
    # The -d / -p options are interpreted by the storage module.
    pck, usedb = storage.database_type(opts)
    if args:
        usage(2, "Positional arguments not allowed")

    if usedb is None:
        # Use settings in configuration file.
        usedb = options["Storage", "persistent_use_database"]
        pck = get_pathname_option("Storage",
                                  "persistent_storage_file")

    h = hammie.open(pck, usedb, "c")

    # Must start out False: with no -g/-s paths nothing below sets it,
    # and the final check would otherwise hit an undefined name.
    save = False

    for g in good:
        if loud:
            sys.stdout.write("Training ham (%s):\n" % g)
        train(h, g, False, force, trainnew, removetrained)
        sys.stdout.flush()
        save = True

    for s in spam:
        if loud:
            sys.stdout.write("Training spam (%s):\n" % s)
        train(h, s, True, force, trainnew, removetrained)
        sys.stdout.flush()
        save = True

    if save:
        h.store()

# Run the trainer only when this file is executed as a script, not
# when it is imported as a module.
if __name__ == "__main__":
    main()

Older »