~ubuntu-branches/ubuntu/precise/weka/precise

* Andrew Mccallum, Kamal Nigam: A Comparison of Event Models for Naive Bayes Text Classification. In: AAAI-98 Workshop on 'Learning for Text Categorization', 1998.

*

* The core equation for this classifier:

*

* P[Ci|D] = (P[D|Ci] x P[Ci]) / P[D] (Bayes rule)

*

* where Ci is class i and D is a document.

*

* BibTeX:

* <pre>

* @inproceedings{Mccallum1998,

* author = {Andrew Mccallum and Kamal Nigam},

* booktitle = {AAAI-98 Workshop on 'Learning for Text Categorization'},

* title = {A Comparison of Event Models for Naive Bayes Text Classification},

* year = {1998}

* }

* </pre>

*

* Valid options are:

* <pre> -D

* If set, classifier is run in debug mode and

* may output additional info to the console</pre>

* @author Andrew Golightly (acg4@cs.waikato.ac.nz)

* @author Bernhard Pfahringer (bernhard@cs.waikato.ac.nz)

* @version $Revision: 1.15 $

public class NaiveBayesMultinomial

extends Classifier

implements WeightedInstancesHandler,TechnicalInformationHandler {

/** for serialization */

static final long serialVersionUID = 5932177440181257085L;

/**

* probability that a word (w) exists in a class (H) (i.e. Pr[w|H])

* The matrix is in the this format: probOfWordGivenClass[class][wordAttribute]

* NOTE: the values are actually the log of Pr[w|H]

protected double[][] m_probOfWordGivenClass;

/** the probability of a class (i.e. Pr[H]) */

protected double[] m_probOfClass;

/** number of unique words */

protected int m_numAttributes;

/** number of class values */

protected int m_numClasses;

/** cache lnFactorial computations */

100

protected double[] m_lnFactorialCache = new double[]{0.0,0.0};

101

102

/** copy of header information for use in toString method */

103

protected Instances m_headerInfo;

104

105

/**

106

* Returns a string describing this classifier

107

* @return a description of the classifier suitable for

108

* displaying in the explorer/experimenter gui

109

110

public String globalInfo() {

111

return

112

"Class for building and using a multinomial Naive Bayes classifier. "

113

+ "For more information see,\n\n"

114

+ getTechnicalInformation().toString() + "\n\n"

115

+ "The core equation for this classifier:\n\n"

116

+ "P[Ci|D] = (P[D|Ci] x P[Ci]) / P[D] (Bayes rule)\n\n"

117

+ "where Ci is class i and D is a document.";

118

}

119

120

/**

121

* Returns an instance of a TechnicalInformation object, containing

122

* detailed information about the technical background of this class,

123

* e.g., paper reference or book this class is based on.

124

125

* @return the technical information about this class

126

127

public TechnicalInformation getTechnicalInformation() {

128

TechnicalInformation result;

129

130

result = new TechnicalInformation(Type.INPROCEEDINGS);

131

result.setValue(Field.AUTHOR, "Andrew Mccallum and Kamal Nigam");

132

result.setValue(Field.YEAR, "1998");

133

result.setValue(Field.TITLE, "A Comparison of Event Models for Naive Bayes Text Classification");

134

result.setValue(Field.BOOKTITLE, "AAAI-98 Workshop on 'Learning for Text Categorization'");

135

136

return result;

137

}

138

139

/**

140

* Returns default capabilities of the classifier.

141

142

* @return the capabilities of this classifier

143

144

public Capabilities getCapabilities() {

145

Capabilities result = super.getCapabilities();

146

147

// attributes

148

result.enable(Capability.NUMERIC_ATTRIBUTES);

149

150

// class

151

result.enable(Capability.NOMINAL_CLASS);

152

result.enable(Capability.MISSING_CLASS_VALUES);

153

154

return result;

155

}

156

157

/**

158

* Generates the classifier.

159

160

* @param instances set of instances serving as training data

161

* @throws Exception if the classifier has not been generated successfully

162

163

public void buildClassifier(Instances instances) throws Exception

164

{

165

// can classifier handle the data?

166

getCapabilities().testWithFail(instances);

167

168

// remove instances with missing class

169

instances = new Instances(instances);

170

instances.deleteWithMissingClass();

171

172

m_headerInfo = new Instances(instances, 0);

173

m_numClasses = instances.numClasses();

174

m_numAttributes = instances.numAttributes();

175

m_probOfWordGivenClass = new double[m_numClasses][];

176

177

178

initialising the matrix of word counts

179

NOTE: Laplace estimator introduced in case a word that does not appear for a class in the

180

training set does so for the test set

181

182

for(int c = 0; c<m_numClasses; c++)

183

{

184

m_probOfWordGivenClass[c] = new double[m_numAttributes];

185

for(int att = 0; att<m_numAttributes; att++)

186

{

187

m_probOfWordGivenClass[c][att] = 1;

188

}

189

}

190

191

//enumerate through the instances

192

Instance instance;

193

int classIndex;

194

double numOccurences;

195

double[] docsPerClass = new double[m_numClasses];

196

double[] wordsPerClass = new double[m_numClasses];

197

198

java.util.Enumeration enumInsts = instances.enumerateInstances();

199

while (enumInsts.hasMoreElements())

200

{

201

instance = (Instance) enumInsts.nextElement();

202

classIndex = (int)instance.value(instance.classIndex());

203

docsPerClass[classIndex] += instance.weight();

204

205

for(int a = 0; a<instance.numValues(); a++)

206

if(instance.index(a) != instance.classIndex())

207

{

208

if(!instance.isMissing(a))

209

{

210

numOccurences = instance.valueSparse(a) * instance.weight();

211

if(numOccurences < 0)

212

throw new Exception("Numeric attribute values must all be greater or equal to zero.");

213

wordsPerClass[classIndex] += numOccurences;

214

m_probOfWordGivenClass[classIndex][instance.index(a)] += numOccurences;

215

}

216

}

217

}

218

219

220

normalising probOfWordGivenClass values

221

and saving each value as the log of each value

222

223

for(int c = 0; c<m_numClasses; c++)

224

for(int v = 0; v<m_numAttributes; v++)

225

m_probOfWordGivenClass[c][v] = Math.log(m_probOfWordGivenClass[c][v] / (wordsPerClass[c] + m_numAttributes - 1));

226

227

228

calculating Pr(H)

229

NOTE: Laplace estimator introduced in case a class does not get mentioned in the set of

230

training instances

231

232

final double numDocs = instances.sumOfWeights() + m_numClasses;

233

m_probOfClass = new double[m_numClasses];

234

for(int h=0; h<m_numClasses; h++)

235

m_probOfClass[h] = (double)(docsPerClass[h] + 1)/numDocs;

236

}

237

238

/**

239

* Calculates the class membership probabilities for the given test

240

* instance.

241

242

* @param instance the instance to be classified

243

* @return predicted class probability distribution

244

* @throws Exception if there is a problem generating the prediction

245

246

public double [] distributionForInstance(Instance instance) throws Exception

247

{

248

double[] probOfClassGivenDoc = new double[m_numClasses];

249

250

//calculate the array of log(Pr[D|C])

251

double[] logDocGivenClass = new double[m_numClasses];

252

for(int h = 0; h<m_numClasses; h++)

253

logDocGivenClass[h] = probOfDocGivenClass(instance, h);

254

255

double max = logDocGivenClass[Utils.maxIndex(logDocGivenClass)];

256

double probOfDoc = 0.0;

257

258

for(int i = 0; i<m_numClasses; i++)

259

{

260

probOfClassGivenDoc[i] = Math.exp(logDocGivenClass[i] - max) * m_probOfClass[i];

261

probOfDoc += probOfClassGivenDoc[i];

262

}

263

264

Utils.normalize(probOfClassGivenDoc,probOfDoc);

265

266

return probOfClassGivenDoc;

267

}

268

269

/**

270

* log(N!) + (for all the words)(log(Pi^ni) - log(ni!))

271

272

* where

273

* N is the total number of words

274

* Pi is the probability of obtaining word i

275

* ni is the number of times the word at index i occurs in the document

276

277

* @param inst The instance to be classified

278

* @param classIndex The index of the class we are calculating the probability with respect to

279

280

* @return The log of the probability of the document occuring given the class

281

282

283

private double probOfDocGivenClass(Instance inst, int classIndex)

284

{

285

double answer = 0;

286

//double totalWords = 0; //no need as we are not calculating the factorial at all.

287

288

double freqOfWordInDoc; //should be double

289

for(int i = 0; i<inst.numValues(); i++)

290

if(inst.index(i) != inst.classIndex())

291

{

292

freqOfWordInDoc = inst.valueSparse(i);

293

//totalWords += freqOfWordInDoc;

294

answer += (freqOfWordInDoc * m_probOfWordGivenClass[classIndex][inst.index(i)]

295

); //- lnFactorial(freqOfWordInDoc));

296

}

297

298

//answer += lnFactorial(totalWords);//The factorial terms don't make

299

//any difference to the classifier's

300

//accuracy, so not needed.

301

302

return answer;

303

}

304

305

/**

306

* Fast computation of ln(n!) for non-negative ints

307

308

* negative ints are passed on to the general gamma-function

309

* based version in weka.core.SpecialFunctions

310

311

* if the current n value is higher than any previous one,

312

* the cache is extended and filled to cover it

313

314

* the common case is reduced to a simple array lookup

315

316

* @param n the integer

317

* @return ln(n!)

318

319

320

public double lnFactorial(int n)

321

{

322

if (n < 0) return weka.core.SpecialFunctions.lnFactorial(n);

323

324

if (m_lnFactorialCache.length <= n) {

325

double[] tmp = new double[n+1];

326

System.arraycopy(m_lnFactorialCache,0,tmp,0,m_lnFactorialCache.length);

327

for(int i = m_lnFactorialCache.length; i < tmp.length; i++)

328

tmp[i] = tmp[i-1] + Math.log(i);

329

m_lnFactorialCache = tmp;

330

}

331

332

return m_lnFactorialCache[n];

333

}

334

335

/**

336

* Returns a string representation of the classifier.

337

338

* @return a string representation of the classifier

339

340

public String toString()

341

{

342

StringBuffer result = new StringBuffer("The independent probability of a class\n--------------------------------------\n");

343

344

for(int c = 0; c<m_numClasses; c++)

345

result.append(m_headerInfo.classAttribute().value(c)).append("\t").append(Double.toString(m_probOfClass[c])).append("\n");

346

347

result.append("\nThe probability of a word given the class\n-----------------------------------------\n\t");

348

349

for(int c = 0; c<m_numClasses; c++)

350

result.append(m_headerInfo.classAttribute().value(c)).append("\t");

351

352

result.append("\n");

353

354

for(int w = 0; w<m_numAttributes; w++)

355

{

356

result.append(m_headerInfo.attribute(w).name()).append("\t");

357

for(int c = 0; c<m_numClasses; c++)

358

result.append(Double.toString(Math.exp(m_probOfWordGivenClass[c][w]))).append("\t");

359

result.append("\n");

360

}

361

362

return result.toString();

363

}

364

365

/**

366

* Main method for testing this class.

367

368

* @param argv the options

369

370

public static void main(String [] argv) {

371

runClassifier(new NaiveBayesMultinomial(), argv);

372

}

373

}

374

Older »