# This script runs a simple benchmark of the python stemmer interface.
#
# For every data file and every stemmer cache size it times
# ``stemmer.stemWords(words)`` with ``timeit`` and prints the best per-call
# time observed.

import timeit

datafiles = ('sampledata/englishvoc.txt', 'sampledata/puttydoc.txt',)

# Values passed as the second argument of ``Stemmer.Stemmer`` (reported as
# ``cacheSize`` in the output).
cache_sizes = (0, 1, 10000, 30000)

# Number of ``stemWords`` calls per timing run.
iteration_counts = (1, 2, 3, 10)

# Number of timing runs per configuration; the fastest one is reported.
repeat_count = 5


def read_words(path):
    """Return all whitespace-separated tokens of the file at *path*, in order."""
    words = []
    # ``with`` guarantees the handle is closed even if reading fails.
    with open(path) as handle:
        for line in handle:
            words.extend(line.split())
    return words


def build_setup(cache_size, datafile):
    """Return the ``timeit`` setup source for one benchmark configuration.

    The generated code imports ``Stemmer``, builds ``stemmer`` with
    *cache_size* as its second constructor argument and loads ``words`` from
    *datafile*.  It runs as ``timeit`` setup, so it is not part of the
    measured time.
    """
    # %r (not '%s') embeds the path as a valid string literal even when it
    # contains quotes or backslashes.
    return r"""
import Stemmer
stemmer = Stemmer.Stemmer('en', %d)
words = []
with open(%r) as handle:
    for line in handle:
        words.extend(line.split())
""" % (cache_size, datafile)


def main():
    """Run the benchmark over every data file / cache size / iteration count."""
    for datafile in datafiles:
        # Loaded here only to report the word count; the timed code loads its
        # own copy inside the setup snippet.
        words = read_words(datafile)
        for cache_size in cache_sizes:
            timer = timeit.Timer(setup=build_setup(cache_size, datafile),
                                 stmt='stemmer.stemWords(words)')
            for iters in iteration_counts:
                # repeat() yields the total time of each run of ``iters``
                # calls; divide to get the time of a single call.
                per_call = [total / iters
                            for total in timer.repeat(repeat_count, iters)]
                print("'%s':words=%d,cacheSize=%d,iters=%d,mintime=%f" % (datafile, len(words), cache_size, iters, min(per_call)))


if __name__ == '__main__':
    main()