1
from __future__ import generators
6
from spambayes.tokenizer import tokenize
12
# Module-wide seed, drawn randomly at import time.  The shuffling code
# further down XORs it into the per-directory value passed to random.seed(),
# and the setparms() docstring says it is replaced when a seed is supplied.
SEED = random.randrange(2000000000)
15
# Declares the two instance attributes: 'tag' is what __eq__ compares and
# 'guts' is what gets handed to tokenize().
# NOTE(review): the enclosing class header is not part of this extract.
__slots__ = 'tag', 'guts'
17
# Build an instance from a directory and a file name inside it.
# NOTE(review): only the path construction is visible in this extract; the
# statements that presumably fill in self.tag / self.guts from `path` are
# elided -- confirm against the full file.
# NOTE(review): the parameter name `dir` shadows the builtin dir().
def __init__(self, dir, name):
18
# Plain "/" concatenation is used here rather than os.path.join().
path = dir + "/" + name
25
# Passes the stored self.guts to spambayes.tokenizer.tokenize and returns
# whatever that call returns.
# NOTE(review): the def line this statement belongs to is not part of this
# extract.
return tokenize(self.guts)
27
# Compare msgs by their paths; this is appropriate for sets of msgs.
31
# Equality is decided by the `tag` attribute alone.  `other` must also have
# a `tag` attribute, otherwise the comparison raises AttributeError.
# NOTE(review): set membership also relies on a __hash__ consistent with
# this method; none is visible in this extract -- confirm one is defined.
def __eq__(self, other):
32
return self.tag == other.tag
37
# The iterator yields a stream of Msg objects, taken from a list of directories.
39
# Produces Msg objects for the file names that os.listdir() reports in each
# directory of `directories`.
# NOTE(review): several lines of this class are missing from this extract
# (including the def line of the generator whose loops appear below), so the
# comments here describe only the statements that are visible.
class MsgStream(object):
40
# Instance attributes; the names mirror the constructor parameters.
__slots__ = 'tag', 'directories', 'keep'
42
# directories: iterable of directory paths, walked in order by the loops
# below.  keep: optional (defaults to None); presumably the number of
# messages to retain per directory -- confirm in the full file.
def __init__(self, tag, directories, keep=None):
44
self.directories = directories
52
# First visible loop: yield a Msg for every entry of every directory, in
# whatever order os.listdir() returns them.
for directory in self.directories:
53
for fname in os.listdir(directory):
54
yield Msg(directory, fname)
56
# We only want part of the msgs. Shuffle each directory list, but
57
# in such a way that we'll get the same result each time this is
58
# called on the same directory list.
59
for directory in self.directories:
60
# NOTE(review): the local name `all` shadows the builtin all().
all = os.listdir(directory)
61
# The seed combines the hash of the largest file name with the module-level
# SEED, so the same listing and the same SEED give the same random state.
# NOTE(review): max() raises ValueError when the directory is empty.
# NOTE(review): on interpreters with string-hash randomization, hash() of a
# str differs between processes, so this is only reproducible within one
# process unless PYTHONHASHSEED is fixed -- confirm the intended scope.
random.seed(hash(max(all)) ^ SEED) # reproducible across calls
64
all.sort() # seems to speed access on Win98!
66
# NOTE(review): the loop that binds `fname` for this yield is elided here.
yield Msg(directory, fname)
71
# MsgStream subclass whose `keep` argument is taken from the module globals
# HAMTRAIN / HAMTEST.  Those globals are read when the stream is constructed,
# so they must already be defined (setparms() assigns HAMTRAIN) by then.
class HamStream(MsgStream):
72
def __init__(self, tag, directories, train=0):
74
# NOTE(review): the two base-constructor calls appear back to back only
# because the lines between them are missing from this extract; presumably
# a truthy `train` selects HAMTRAIN and HAMTEST is used otherwise -- confirm
# against the full file.
MsgStream.__init__(self, tag, directories, HAMTRAIN)
76
MsgStream.__init__(self, tag, directories, HAMTEST)
78
# MsgStream subclass whose `keep` argument is taken from the module globals
# SPAMTRAIN / SPAMTEST.  Those globals are read when the stream is
# constructed, so they must already be defined (setparms() assigns SPAMTRAIN)
# by then.
class SpamStream(MsgStream):
79
def __init__(self, tag, directories, train=0):
81
# NOTE(review): the two base-constructor calls appear back to back only
# because the lines between them are missing from this extract; presumably
# a truthy `train` selects SPAMTRAIN and SPAMTEST is used otherwise --
# confirm against the full file.
MsgStream.__init__(self, tag, directories, SPAMTRAIN)
83
MsgStream.__init__(self, tag, directories, SPAMTEST)
85
def setparms(hamtrain, spamtrain, hamtest=None, spamtest=None, seed=None):
86
"""Set HAMTEST/TRAIN and SPAMTEST/TRAIN.
87
If seed is not None, also set SEED.
88
If (ham|spam)test are not set, set to the same as the (ham|spam)train
89
numbers (backwards compat option).
92
global HAMTEST, SPAMTEST, HAMTRAIN, SPAMTRAIN, SEED
93
HAMTRAIN, SPAMTRAIN = hamtrain, spamtrain