# This module is part of the Divmod project and is Copyright 2003 Amir Bakhtiar:
# amir@divmod.org. This is free software; you can redistribute it and/or
# modify it under the terms of version 2.1 of the GNU Lesser General Public
# License as published by the Free Software Foundation.
from rfc822 import AddressList
11
from reverend.thomas import Bayes
12
from reverend.splitter import Splitter
15
class EmailClassifier(Bayes):
    """Bayes classifier that derives its tokens from e-mail messages.

    Every ``msg`` argument is used through ``get()`` and ``walk()``, and the
    parts produced by ``walk()`` through ``get_payload()``.
    NOTE(review): presumably an ``email`` package ``Message`` -- confirm
    against the callers.

    Text is tokenized with ``self.splitter.split()``.  ``self.splitter`` is
    not assigned anywhere in this class.
    NOTE(review): presumably supplied by ``Bayes`` or by the code that
    builds the classifier -- confirm.
    """

    def getTokens(self, msg):
        """Return the list of token strings for ``msg``.

        The strings are used as the keys into the table of token counts.
        Header tokens come first, then body tokens, then the tokens that
        are generated from the headers and the structure of the message.
        """
        tokens = self.getHeaderTokens(msg)
        tokens += self.getBodyTokens(msg)
        tokens += self.getMetaTokens(msg)
        return tokens

    def getBodyTokens(self, msg):
        """Return the tokens of the first ``text/plain`` part of ``msg``.

        A message without such a part is tokenized as the empty string, so
        the splitter is never handed ``None``.
        """
        text = self.getTextPlain(msg)
        if text is None:
            text = ''
        return list(self.splitter.split(text))

    def getHeaderTokens(self, msg):
        """Return the tokens of the subject, from, to and cc headers.

        A missing header is replaced by a per-header placeholder word
        ('nosubject', 'fromnoone', 'tonoone', 'ccnoone'), so the absence of
        a header produces a token of its own.
        """
        fields = (
            ('subject', 'nosubject'),
            ('from', 'fromnoone'),
            ('to', 'tonoone'),
            ('cc', 'ccnoone'),
        )
        # Each value is followed by one space so adjacent headers cannot
        # fuse into a single token.
        text = ''.join([msg.get(name, default) + ' '
                        for name, default in fields])
        return list(self.splitter.split(text))

    def _getFirstPayload(self, msg, contentType, decode):
        """Return the payload of the first part whose type is ``contentType``.

        ``decode`` is forwarded to ``get_payload()``.  Returns ``None`` when
        no part of ``msg`` has the requested content type.
        """
        for part in msg.walk():
            # NOTE(review): the statement that computed ``typ`` was missing
            # from the code this was rebuilt from; get_content_type() is an
            # assumption.  It reports 'text/plain' for a part that has no
            # Content-Type header -- confirm that this is the wanted result.
            typ = part.get_content_type()
            if typ and typ.lower() == contentType:
                return part.get_payload(decode=decode)
        return None

    def getTextPlain(self, msg):
        """Return the decoded payload of the first ``text/plain`` part.

        Returns ``None`` when ``msg`` has no ``text/plain`` part.
        """
        return self._getFirstPayload(msg, "text/plain", True)

    def getTextHtml(self, msg):
        """Return the undecoded payload of the first ``text/html`` part.

        Returns ``None`` when ``msg`` has no ``text/html`` part.
        """
        return self._getFirstPayload(msg, "text/html", False)

    def getMetaTokens(self, msg):
        """Return tokens generated from the headers and structure of ``msg``.

        The result holds, in this order:

        * one ``'<header>:<value>'`` token for each of a fixed set of
          headers, with the value ``'None'`` when the header is missing;
        * ``'text_None'`` or ``'text_True'`` and ``'html_None'`` or
          ``'html_True'``, telling whether a plain-text / HTML part exists;
        * ``'to_more_than_5'``, ``'to_more_than_10'``, ``'cc_more_than_5'``
          and ``'cc_more_than_10'`` for crowded recipient lists.
        """
        r = []
        for f in ['Content-type', 'X-Priority', 'X-Mailer',
                  'content-transfer-encoding', 'X-MSMail-Priority']:
            r.append(f + ':' + msg.get(f, 'None'))

        text = self.getTextPlain(msg)
        html = self.getTextHtml(msg)
        for stem, part in zip(['text', 'html'], [text, html]):
            if part is None:
                r.append(stem + '_None')
            else:
                r.append(stem + '_True')

        # NOTE(review): the code this was rebuilt from had a run of missing
        # statements between the '_True' token and the address parsing
        # below.  Nothing has been invented for that gap -- compare with
        # the upstream file before relying on the exact token set.

        # NOTE(review): the '' defaults are an assumption; the statements
        # that read the two headers were missing as well.
        toAddresses = AddressList(msg.get('to', '')).addresslist
        ccAddresses = AddressList(msg.get('cc', '')).addresslist
        for stem, addresses in (('to', toAddresses), ('cc', ccAddresses)):
            for limit in (5, 10):
                if len(addresses) > limit:
                    r.append('%s_more_than_%d' % (stem, limit))
        return r