# This module is part of the Divmod project and is Copyright 2003 Amir Bakhtiar:
# amir@divmod.org. This is free software; you can redistribute it and/or
# modify it under the terms of version 2.1 of the GNU Lesser General Public
# License as published by the Free Software Foundation.
class BayesData(dict):
    """Per-pool token store: a dict mapping token -> count, plus
    bookkeeping about the pool's name, size and training history."""

    def __init__(self, name='', pool=None):
        self.name = name        # pool name, e.g. 'spam' or '__Corpus__'
        self.training = []      # uids of items trained into this pool
        self.pool = pool
        self.tokenCount = 0     # total token occurrences trained in
        self.trainCount = 0     # number of train() calls on this pool

    def trainedOn(self, item):
        """Return True if `item` (a uid) was used to train this pool."""
        return item in self.training

    def __repr__(self):
        return '<BayesDict: %s, %s tokens>' % (self.name, self.tokenCount)
def __init__(self, tokenizer=None, combiner=None, dataClass=None):
    """Set up an empty classifier.

    tokenizer: object with a tokenize(obj) method; defaults to Tokenizer().
    combiner:  callable(probs, poolName) merging token probabilities into
               one score; defaults to self.robinson.
    dataClass: factory for per-pool storage; defaults to BayesData.
    """
    if dataClass is None:
        self.dataClass = BayesData
    else:
        self.dataClass = dataClass
    # '__Corpus__' aggregates every pool; it lives in self.pools but is
    # excluded from guessing.
    self.corpus = self.dataClass('__Corpus__')
    self.pools = {}
    self.pools['__Corpus__'] = self.corpus
    self.trainCount = 0
    self.dirty = True  # probability cache must be (re)built before guessing
    # The tokenizer takes an object and returns
    # an iterable of token strings.
    if tokenizer is None:
        self._tokenizer = Tokenizer()
    else:
        self._tokenizer = tokenizer
    # The combiner combines probabilities
    if combiner is None:
        self.combiner = self.robinson
    else:
        self.combiner = combiner
def newPool(self, poolName):
    """Create a new pool, without actually doing any
    training. Returns the pool (existing one if already present)."""
    self.dirty = True  # not always true, but it's simple
    return self.pools.setdefault(poolName, self.dataClass(poolName))
def removePool(self, poolName):
    """Delete pool `poolName`; raises KeyError if it does not exist."""
    del self.pools[poolName]
    self.dirty = True
def renamePool(self, poolName, newName):
    """Rename pool `poolName` to `newName`; the pool object is reused
    and its .name attribute updated."""
    self.pools[newName] = self.pools[poolName]
    self.pools[newName].name = newName
    self.removePool(poolName)
    self.dirty = True
def mergePools(self, destPool, sourcePool):
    """Merge an existing pool into another.
    The data from sourcePool is merged into destPool.
    The arguments are the names of the pools to be merged.
    The pool named sourcePool is left intact and you may
    want to call removePool() to get rid of it.
    """
    sp = self.pools[sourcePool]
    dp = self.pools[destPool]
    for tok, count in sp.items():
        dp[tok] = dp.get(tok, 0) + count
        # keep the aggregate occurrence count consistent with _train(),
        # which adds one per token occurrence
        dp.tokenCount += count
    self.dirty = True
def poolData(self, poolName):
    """Return a list of the (token, count) tuples for pool `poolName`."""
    # materialize so callers get a real list (Py3 .items() is a view)
    return list(self.pools[poolName].items())
def poolTokens(self, poolName):
    """Return a list of the tokens in this pool."""
    return [tok for tok, count in self.poolData(poolName)]
def save(self, fname='bayesdata.dat'):
    """Pickle all pools (including '__Corpus__') to `fname`."""
    from pickle import dump  # cPickle is Py2-only
    # 'with' guarantees the handle is closed even if dump raises
    with open(fname, 'wb') as fp:
        dump(self.pools, fp)
def load(self, fname='bayesdata.dat'):
    """Restore pools pickled by save(); rebinds self.corpus to the
    '__Corpus__' pool and marks the cache dirty."""
    from pickle import load  # cPickle is Py2-only
    with open(fname, 'rb') as fp:
        self.pools = load(fp)
    self.corpus = self.pools['__Corpus__']
    self.dirty = True
def poolNames(self):
    """Return a sorted list of Pool names.
    Does not include the system pool '__Corpus__'.
    """
    # Py2 version mutated self.pools.keys(); build a fresh list instead
    return sorted(name for name in self.pools if name != '__Corpus__')
def buildCache(self):
    """Merge corpora and compute probabilities.

    For each pool (except '__Corpus__') build a cache dict mapping
    token -> probability that an item containing the token belongs to
    that pool (Robinson good/bad metric). Only tokens whose probability
    is sufficiently far from the uninformative 0.5 are kept.
    """
    self.cache = {}
    for pname, pool in self.pools.items():
        # skip our special pool
        if pname == '__Corpus__':
            continue

        poolCount = pool.tokenCount
        # tokens trained into all *other* pools; floor of 1 avoids /0
        themCount = max(self.corpus.tokenCount - poolCount, 1)
        cacheDict = self.cache.setdefault(pname, self.dataClass(pname))

        for word, totCount in self.corpus.items():
            # for every word in the corpus
            # check to see if this pool contains this word
            thisCount = float(pool.get(word, 0.0))
            if thisCount == 0.0:
                continue
            otherCount = float(totCount) - thisCount

            if not poolCount:
                goodMetric = 1.0
            else:
                goodMetric = min(1.0, otherCount / poolCount)
            badMetric = min(1.0, thisCount / themCount)
            f = badMetric / (goodMetric + badMetric)

            # PROBABILITY_THRESHOLD
            if abs(f - 0.5) >= 0.1:
                # GOOD_PROB, BAD_PROB: clamp away from 0 and 1
                cacheDict[word] = max(0.0001, min(0.9999, f))
def getTokens(self, obj):
    """By default, we expect obj to be a string and split
    it on word boundaries.

    Note that this does not change the case.
    In some applications you may want to lowercase everything
    so that "king" and "King" generate the same token.

    Override this in your subclass for objects other
    than text.

    Alternatively, you can pass in a tokenizer as part of
    instance creation.
    """
    return self._tokenizer.tokenize(obj)
def getProbs(self, pool, words):
    """Extract the probabilities of tokens in a message.

    Returns (word, probability) pairs for every word present in `pool`,
    sorted with the most significant (highest probability) first."""
    probs = [(word, pool[word]) for word in words if word in pool]
    # Py2 used a cmp lambda; sort descending by probability instead
    probs.sort(key=lambda wp: wp[1], reverse=True)
    return probs
def train(self, pool, item, uid=None):
    """Train Bayes by telling him that item belongs
    in pool. uid is optional and may be used to uniquely
    identify the item that is being trained on.
    """
    tokens = self.getTokens(item)
    pool = self.pools.setdefault(pool, self.dataClass(pool))
    self._train(pool, tokens)
    self.corpus.trainCount += 1
    pool.trainCount += 1
    if uid:
        pool.training.append(uid)
    self.dirty = True
def untrain(self, pool, item, uid=None):
    """Undo a previous train() of `item` in `pool`.
    Silently does nothing if the pool does not exist."""
    tokens = self.getTokens(item)
    pool = self.pools.get(pool, None)
    if not pool:
        return
    self._untrain(pool, tokens)
    # I guess we want to count this as additional training?
    self.corpus.trainCount += 1
    pool.trainCount += 1
    if uid:
        pool.training.remove(uid)
    self.dirty = True
def _train(self, pool, tokens):
    """Add each token occurrence to `pool` and to the global corpus,
    updating both aggregate token counts."""
    wc = 0
    for token in tokens:
        count = pool.get(token, 0)
        pool[token] = count + 1
        count = self.corpus.get(token, 0)
        self.corpus[token] = count + 1
        wc += 1
    pool.tokenCount += wc
    self.corpus.tokenCount += wc
def _untrain(self, pool, tokens):
    """Remove each token occurrence from `pool` and the global corpus.
    Entries whose count reaches zero are deleted; tokens never trained
    are ignored so counts cannot go negative."""
    for token in tokens:
        count = pool.get(token, 0)
        if count:
            if count == 1:
                del pool[token]
            else:
                pool[token] = count - 1
            pool.tokenCount -= 1
        count = self.corpus.get(token, 0)
        if count:
            if count == 1:
                del self.corpus[token]
            else:
                self.corpus[token] = count - 1
            self.corpus.tokenCount -= 1
def trainedOn(self, msg):
    """Return True if `msg` (a uid) was trained into any cached pool."""
    for p in self.cache.values():
        if msg in p.training:
            return True
    return False
def guess(self, msg):
    """Classify `msg`: return a list of (poolName, probability) pairs,
    best match first. Pools matching no token of `msg` are omitted."""
    tokens = set(self.getTokens(msg))  # Py2 sets.Set -> builtin set
    pools = self.poolProbs()

    res = {}
    for pname, pprobs in pools.items():
        p = self.getProbs(pprobs, tokens)
        if len(p) != 0:
            res[pname] = self.combiner(p, pname)
    # descending by combined probability (replaces Py2 cmp sort)
    return sorted(res.items(), key=lambda pr: pr[1], reverse=True)
def robinson(self, probs, ignore):
    """Compute the probability of a message being spam (Robinson's method).
        P = 1 - prod(1-p)^(1/n)
        Q = 1 - prod(p)^(1/n)
        S = (1 + (P-Q)/(P+Q)) / 2
    Courtesy of http://christophe.delord.free.fr/en/index.html

    `probs` is a non-empty list of (token, probability) pairs; `ignore`
    is the pool name, unused by this combiner.
    """
    import math  # file style keeps imports function-local
    nth = 1.0 / len(probs)
    # math.prod replaces the Py2 reduce(operator.mul, ...) idiom
    P = 1.0 - math.prod(1.0 - p for _, p in probs) ** nth
    Q = 1.0 - math.prod(p for _, p in probs) ** nth
    S = (P - Q) / (P + Q)
    return (1 + S) / 2
def robinsonFisher(self, probs, ignore):
    """Compute the probability of a message being spam (Robinson-Fisher method).
        H = C-1( -2.ln(prod(p)), 2*n )
        S = C-1( -2.ln(prod(1-p)), 2*n )
        I = (1 + H - S) / 2
    Courtesy of http://christophe.delord.free.fr/en/index.html

    `probs` is a list of (token, probability) pairs; `ignore` is the
    pool name, unused by this combiner. chi2P is the inverse chi-square
    helper defined elsewhere in this module.
    """
    import math  # file style keeps imports function-local
    n = len(probs)
    # ValueError covers log(0) when the product underflows to zero
    try:
        H = chi2P(-2.0 * math.log(math.prod(p for _, p in probs)), 2 * n)
    except (OverflowError, ValueError):
        H = 0.0
    try:
        S = chi2P(-2.0 * math.log(math.prod(1.0 - p for _, p in probs)), 2 * n)
    except (OverflowError, ValueError):
        S = 0.0
    return (1 + H - S) / 2
def __repr__(self):
    """Debug representation listing every user pool."""
    return '<Bayes: %s>' % [self.pools[p] for p in self.poolNames()]
def __len__(self):
    """Number of distinct tokens in the global corpus."""
    return len(self.corpus)
import re  # required by Tokenizer; module-level import block not visible here


class Tokenizer:
    """A simple regex-based whitespace tokenizer.
    It expects a string and can return all tokens lower-cased
    or in their existing case.
    """

    # one or more word characters, Unicode-aware
    WORD_RE = re.compile(r'\w+', re.U)

    def __init__(self, lower=False):
        # lower: when True, tokenize() yields lower-cased tokens
        self.lower = lower

    def tokenize(self, obj):
        """Yield the word tokens of string `obj`, in order."""
        for match in self.WORD_RE.finditer(obj):
            if self.lower:
                yield match.group().lower()
            else:
                yield match.group()
313
""" return P(chisq >= chi, with df degree of freedom)
319
sum = term = math.exp(-m)
320
for i in range(1, df/2):