3
"""Split an mbox into N random directories of files.
5
Usage: %(program)s [-h] [-g] [-s seed] [-v] -n N sourcembox ... outdirbase
9
Print this help message and exit
12
Do globbing on each sourcepath. This is helpful on Windows, where
13
the native shells don't glob, or when you have more mboxes than
14
your shell allows you to specify on the command line.
17
Seed the random number generator with seed (an integer).
18
By default, use system time at startup to seed.
21
Verbose. Displays a period for each 100 messages parsed.
22
May display other stuff.
25
The number of output directories desired (must be > 1). This is required.
29
The mbox or path to an mbox to split.
32
The base path + name prefix for each of the N output dirs.
33
Output files have names of the form
34
outdirbase + ("%%d/%%d" %% (i, n))
37
%(program)s -s 123 -n5 Data/spam.mbox Data/Spam/Set
39
produces 5 directories, named Data/Spam/Set1 through Data/Spam/Set5. Each
40
contains a random selection of the messages in spam.mbox, and together
41
they contain every message in spam.mbox exactly once. Each has
42
approximately the same number of messages. spam.mbox is not altered. In
43
addition, the seed for the random number generator is forced to 123, so
44
that while the split is random, it's reproducible.
55
from spambayes import mboxutils
60
# Maintain compatibility with Python 2.2
66
def usage(code, msg=''):
67
print >> sys.stderr, __doc__ % globals()
69
print >> sys.stderr, msg
74
opts, args = getopt.getopt(sys.argv[1:], 'hgn:s:v', ['help'])
75
except getopt.error, msg:
82
if opt in ('-h', '--help'):
93
if n is None or n <= 1:
94
usage(1, "an -n value > 1 is required")
97
usage(1, "input mbox name and output base path are required")
98
inputpaths, outputbasepath = args[:-1], args[-1]
100
outdirs = [outputbasepath + ("%d" % i) for i in range(1, n+1)]
102
if not os.path.isdir(dir):
106
for inputpath in inputpaths:
108
inpaths = glob.glob(inputpath)
110
inpaths = [inputpath]
112
for inpath in inpaths:
113
mbox = mboxutils.getmbox(inpath)
115
i = random.randrange(n)
117
#assert astext.endswith('\n')
119
msgfile = open('%s/%d' % (outdirs[i], counter), 'wb')
120
msgfile.write(astext)
123
if counter % 100 == 0:
124
sys.stdout.write('.')
129
print counter, "messages split into", n, "directories"
131
if __name__ == '__main__':