~divmod-dev/divmod.org/811792-different-metadata-approach

Viewing changes to Reverend/reverend/thomas.py

Committer: exarkun
Date: 2010-07-16 16:31:27 UTC
Revision ID: svn-v4:866e43f7-fbfc-0310-8f2a-ec88d1da2979:trunk:18001

Merge thomas-sets-warning-2996

Author: Ange, exarkun
Reviewer: mithrandi
Fixes: #2996

Avoid using the deprecated `sets` module in Reverend.

files added:
Reverend/reverend/test/test_thomas.py

files modified:
Reverend/reverend/thomas.py

Show diffs side-by-side

added added

removed removed

Reverend/reverend/thomas.py

import operator

import re

import math

from sets import Set

try:

set

except NameError:

# Fall back to the sets module if there's no set builtin yet.

from sets import Set as set

class BayesData(dict):

self.pool = pool

self.tokenCount = 0

self.trainCount = 0

def trainedOn(self, item):

return item in self.training

def __repr__(self):

return '<BayesDict: %s, %s tokens>' % (self.name, self.tokenCount)

class Bayes(object):

def __init__(self, tokenizer=None, combiner=None, dataClass=None):

if dataClass is None:

self.dataClass = BayesData

127

133

# skip our special pool

128

134

if pname == '__Corpus__':

129

135

continue

130

136

131

137

poolCount = pool.tokenCount

132

138

themCount = max(self.corpus.tokenCount - poolCount, 1)

133

139

cacheDict = self.cache.setdefault(pname, self.dataClass(pname))

137

143

# check to see if this pool contains this word

138

144

thisCount = float(pool.get(word, 0.0))

139

145

if (thisCount == 0.0):

140

continue

146

continue

141

147

otherCount = float(totCount) - thisCount

142

148

143

149

if not poolCount:

146

152

goodMetric = min(1.0, otherCount/poolCount)

147

153

badMetric = min(1.0, thisCount/themCount)

148

154

f = badMetric / (goodMetric + badMetric)

149

155

150

156

# PROBABILITY_THRESHOLD

151

157

if abs(f-0.5) >= 0.1 :

152

158

# GOOD_PROB, BAD_PROB

153

159

cacheDict[word] = max(0.0001, min(0.9999, f))

154

160

155

161

def poolProbs(self):

156

162

if self.dirty:

157

163

self.buildCache()

165

171

Note that this does not change the case.

166

172

In some applications you may want to lowercase everything

167

173

so that "king" and "King" generate the same token.

168

174

169

175

Override this in your subclass for objects other

170

176

than text.

171

177

228

234

else:

229

235

pool[token] = count - 1

230

236

pool.tokenCount -= 1

231

237

232

238

count = self.corpus.get(token, 0)

233

239

if count:

234

240

if count == 1:

237

243

self.corpus[token] = count - 1

238

244

self.corpus.tokenCount -= 1

239

245

240

def trainedOn(self, msg):

246

def trainedOn(self, msg):

241

247

for p in self.cache.values():

242

248

if msg in p.training:

243

249

return True

244

250

return False

245

251

246

252

def guess(self, msg):

247

tokens = Set(self.getTokens(msg))

253

tokens = set(self.getTokens(msg))

248

254

pools = self.poolProbs()

249

255

250

256

res = {}

254

260

res[pname]=self.combiner(p, pname)

255

261

res = res.items()

256

262

res.sort(lambda x,y: cmp(y[1], x[1]))

257

return res

263

return res

258

264

259

265

def robinson(self, probs, ignore):

260

266

""" computes the probability of a message being spam (Robinson's method)

263

269

S = (1 + (P-Q)/(P+Q)) / 2

264

270

Courtesy of http://christophe.delord.free.fr/en/index.html

265

271

"""

266

272

267

273

nth = 1./len(probs)

268

274

P = 1.0 - reduce(operator.mul, map(lambda p: 1.0-p[1], probs), 1.0) ** nth

269

275

Q = 1.0 - reduce(operator.mul, map(lambda p: p[1], probs)) ** nth

296

302

It expects a string and can return all tokens lower-cased

297

303

or in their existing case.

298

304

"""

299

305

300

306

WORD_RE = re.compile('\\w+', re.U)

301

307

302

308

def __init__(self, lower=False):

303

309

self.lower = lower

304

310

305

311

def tokenize(self, obj):

306

312

for match in self.WORD_RE.finditer(obj):

307

313

if self.lower:

308

314

yield match.group().lower()

309

315

else:

310

316

yield match.group()

311

317

312

318

def chi2P(chi, df):

313

319

""" return P(chisq >= chi, with df degrees of freedom)

314

320

321

327

term *= m/i

322

328

sum += term

323

329

return min(sum, 1.0)

324

Older »