~divmod-dev/divmod.org/dangling-1091

« back to all changes in this revision

Viewing changes to Reverend/reverend/thomas.py

Committer: washort
Date: 2005-10-25 18:49:27 UTC
Revision ID: svn-v4:866e43f7-fbfc-0310-8f2a-ec88d1da2979:trunk:2573

import Reverend

files added:
Reverend

Reverend/MANIFEST.in

Reverend/README.txt

Reverend/changelog.txt

Reverend/examples

Reverend/examples/emailtrainer.py

Reverend/examples/readme.txt

Reverend/reverend

Reverend/reverend/__init__.py

Reverend/reverend/guessers

Reverend/reverend/guessers/__init__.py

Reverend/reverend/guessers/email.py

Reverend/reverend/thomas.py

Reverend/reverend/ui

Reverend/reverend/ui/__init__.py

Reverend/reverend/ui/tester.py

Reverend/reverend/ui/trainer.py

Reverend/reverend/ui/util.py

Reverend/setup.py

Show diffs side-by-side

added added

removed removed

Reverend/reverend/thomas.py

# amir@divmod.org. This is free software; you can redistribute it and/or

# modify it under the terms of version 2.1 of the GNU Lesser General Public

# License as published by the Free Software Foundation.

import operator

import string, re

import math

from sets import Set

class BayesData(dict):

def __init__(self, name='', pool=None):

self.name = name

self.training = []

self.pool = pool

self.tokenCount = 0

self.trainCount = 0

def trainedOn(self, item):

return item in self.training

def __repr__(self):

return '<BayesDict: %s, %s tokens>' % (self.name, self.tokenCount)

class Bayes(object):

def __init__(self, tokenizer=None, combiner=None, dataClass=None):

if dataClass is None:

self.dataClass = BayesData

else:

self.dataClass = dataClass

self.corpus = self.dataClass('__Corpus__')

self.pools = {}

self.pools['__Corpus__'] = self.corpus

self.trainCount = 0

self.dirty = True

# The tokenizer takes an object and returns

# a list of strings

if tokenizer is None:

self._tokenizer = Tokenizer()

else:

self._tokenizer = tokenizer

# The combiner combines probabilities

if combiner is None:

self.combiner = self.robinson

else:

self.combiner = combiner

def commit(self):

self.save()

def newPool(self, poolName):

"""Create a new pool, without actually doing any

training.

"""

self.dirty = True # not always true, but it's simple

return self.pools.setdefault(poolName, self.dataClass(poolName))

def removePool(self, poolName):

del(self.pools[poolName])

self.dirty = True

def renamePool(self, poolName, newName):

self.pools[newName] = self.pools[poolName]

self.pools[newName].name = newName

self.removePool(poolName)

self.dirty = True

def mergePools(self, destPool, sourcePool):

"""Merge an existing pool into another.

The data from sourcePool is merged into destPool.

The arguments are the names of the pools to be merged.

The pool named sourcePool is left in tact and you may

want to call removePool() to get rid of it.

"""

sp = self.pools[sourcePool]

dp = self.pools[destPool]

for tok, count in sp.items():

if dp.get(tok):

dp[tok] += count

else:

dp[tok] = count

dp.tokenCount += 1

self.dirty = True

def poolData(self, poolName):

"""Return a list of the (token, count) tuples.

"""

return self.pools[poolName].items()

def poolTokens(self, poolName):

"""Return a list of the tokens in this pool.

"""

return [tok for tok, count in self.poolData(poolName)]

def save(self, fname='bayesdata.dat'):

from cPickle import dump

100

fp = open(fname, 'wb')

101

dump(self.pools, fp)

102

fp.close()

103

104

def load(self, fname='bayesdata.dat'):

105

from cPickle import load

106

fp = open(fname, 'rb')

107

self.pools = load(fp)

108

fp.close()

109

self.corpus = self.pools['__Corpus__']

110

self.dirty = True

111

112

def poolNames(self):

113

"""Return a sorted list of Pool names.

114

Does not include the system pool '__Corpus__'.

115

"""

116

pools = self.pools.keys()

117

pools.remove('__Corpus__')

118

pools = [pool for pool in pools]

119

pools.sort()

120

return pools

121

122

def buildCache(self):

123

""" merges corpora and computes probabilities

124

"""

125

self.cache = {}

126

for pname, pool in self.pools.items():

127

# skip our special pool

128

if pname == '__Corpus__':

129

continue

130

131

poolCount = pool.tokenCount

132

themCount = max(self.corpus.tokenCount - poolCount, 1)

133

cacheDict = self.cache.setdefault(pname, self.dataClass(pname))

134

135

for word, totCount in self.corpus.items():

136

# for every word in the copus

137

# check to see if this pool contains this word

138

thisCount = float(pool.get(word, 0.0))

139

if (thisCount == 0.0):

140

continue

141

otherCount = float(totCount) - thisCount

142

143

if not poolCount:

144

goodMetric = 1.0

145

else:

146

goodMetric = min(1.0, otherCount/poolCount)

147

badMetric = min(1.0, thisCount/themCount)

148

f = badMetric / (goodMetric + badMetric)

149

150

# PROBABILITY_THRESHOLD

151

if abs(f-0.5) >= 0.1 :

152

# GOOD_PROB, BAD_PROB

153

cacheDict[word] = max(0.0001, min(0.9999, f))

154

155

def poolProbs(self):

156

if self.dirty:

157

self.buildCache()

158

self.dirty = False

159

return self.cache

160

161

def getTokens(self, obj):

162

"""By default, we expect obj to be a screen and split

163

it on whitespace.

164

165

Note that this does not change the case.

166

In some applications you may want to lowecase everthing

167

so that "king" and "King" generate the same token.

168

169

Override this in your subclass for objects other

170

than text.

171

172

Alternatively, you can pass in a tokenizer as part of

173

instance creation.

174

"""

175

return self._tokenizer.tokenize(obj)

176

177

def getProbs(self, pool, words):

178

""" extracts the probabilities of tokens in a message

179

"""

180

probs = [(word, pool[word]) for word in words if word in pool]

181

probs.sort(lambda x,y: cmp(y[1],x[1]))

182

return probs[:2048]

183

184

def train(self, pool, item, uid=None):

185

"""Train Bayes by telling him that item belongs

186

in pool. uid is optional and may be used to uniquely

187

identify the item that is being trained on.

188

"""

189

tokens = self.getTokens(item)

190

pool = self.pools.setdefault(pool, self.dataClass(pool))

191

self._train(pool, tokens)

192

self.corpus.trainCount += 1

193

pool.trainCount += 1

194

if uid:

195

pool.training.append(uid)

196

self.dirty = True

197

198

def untrain(self, pool, item, uid=None):

199

tokens = self.getTokens(item)

200

pool = self.pools.get(pool, None)

201

if not pool:

202

return

203

self._untrain(pool, tokens)

204

# I guess we want to count this as additional training?

205

self.corpus.trainCount += 1

206

pool.trainCount += 1

207

if uid:

208

pool.training.remove(uid)

209

self.dirty = True

210

211

def _train(self, pool, tokens):

212

wc = 0

213

for token in tokens:

214

count = pool.get(token, 0)

215

pool[token] = count + 1

216

count = self.corpus.get(token, 0)

217

self.corpus[token] = count + 1

218

wc += 1

219

pool.tokenCount += wc

220

self.corpus.tokenCount += wc

221

222

def _untrain(self, pool, tokens):

223

for token in tokens:

224

count = pool.get(token, 0)

225

if count:

226

if count == 1:

227

del(pool[token])

228

else:

229

pool[token] = count - 1

230

pool.tokenCount -= 1

231

232

count = self.corpus.get(token, 0)

233

if count:

234

if count == 1:

235

del(self.corpus[token])

236

else:

237

self.corpus[token] = count - 1

238

self.corpus.tokenCount -= 1

239

240

def trainedOn(self, msg):

241

for p in self.cache.values():

242

if msg in p.training:

243

return True

244

return False

245

246

def guess(self, msg):

247

tokens = Set(self.getTokens(msg))

248

pools = self.poolProbs()

249

250

res = {}

251

for pname, pprobs in pools.items():

252

p = self.getProbs(pprobs, tokens)

253

if len(p) != 0:

254

res[pname]=self.combiner(p, pname)

255

res = res.items()

256

res.sort(lambda x,y: cmp(y[1], x[1]))

257

return res

258

259

def robinson(self, probs, ignore):

260

""" computes the probability of a message being spam (Robinson's method)

261

P = 1 - prod(1-p)^(1/n)

262

Q = 1 - prod(p)^(1/n)

263

S = (1 + (P-Q)/(P+Q)) / 2

264

Courtesy of http://christophe.delord.free.fr/en/index.html

265

"""

266

267

nth = 1./len(probs)

268

P = 1.0 - reduce(operator.mul, map(lambda p: 1.0-p[1], probs), 1.0) ** nth

269

Q = 1.0 - reduce(operator.mul, map(lambda p: p[1], probs)) ** nth

270

S = (P - Q) / (P + Q)

271

return (1 + S) / 2

272

273

274

def robinsonFisher(self, probs, ignore):

275

""" computes the probability of a message being spam (Robinson-Fisher method)

276

H = C-1( -2.ln(prod(p)), 2*n )

277

S = C-1( -2.ln(prod(1-p)), 2*n )

278

I = (1 + H - S) / 2

279

Courtesy of http://christophe.delord.free.fr/en/index.html

280

"""

281

n = len(probs)

282

try: H = chi2P(-2.0 * math.log(reduce(operator.mul, map(lambda p: p[1], probs), 1.0)), 2*n)

283

except OverflowError: H = 0.0

284

try: S = chi2P(-2.0 * math.log(reduce(operator.mul, map(lambda p: 1.0-p[1], probs), 1.0)), 2*n)

285

except OverflowError: S = 0.0

286

return (1 + H - S) / 2

287

288

def __repr__(self):

289

return '<Bayes: %s>' % [self.pools[p] for p in self.poolNames()]

290

291

def __len__(self):

292

return len(self.corpus)

293

294

class Tokenizer:

295

"""A simple regex-based whitespace tokenizer.

296

It expects a string and can return all tokens lower-cased

297

or in their existing case.

298

"""

299

300

WORD_RE = re.compile('\\w+', re.U)

301

302

def __init__(self, lower=False):

303

self.lower = lower

304

305

def tokenize(self, obj):

306

for match in self.WORD_RE.finditer(obj):

307

if self.lower:

308

yield match.group().lower()

309

else:

310

yield match.group()

311

312

def chi2P(chi, df):

313

""" return P(chisq >= chi, with df degree of freedom)

314

315

df must be even

316

"""

317

assert df & 1 == 0

318

m = chi / 2.0

319

sum = term = math.exp(-m)

320

for i in range(1, df/2):

321

term *= m/i

322

sum += term

323

return min(sum, 1.0)

324

Older »