~ubuntu-branches/ubuntu/precise/weka/precise

* Class for wrapping a Clusterer to make it return a distribution and density. Fits normal distributions and discrete distributions within each cluster produced by the wrapped clusterer. Supports the NumberOfClustersRequestable interface only if the wrapped Clusterer does.

* <p/>

* Valid options are: <p/>

* <pre> -M <num>

* minimum allowable standard deviation for normal density computation

* (default 1e-6)</pre>

* <pre> -W <clusterer name>

* Clusterer to wrap.

* (default weka.clusterers.SimpleKMeans)</pre>

* <pre>

* Options specific to clusterer weka.clusterers.SimpleKMeans:

* </pre>

* <pre> -N <num>

* number of clusters. (default = 2).</pre>

* <pre> -S <num>

* random number seed.

* (default 10)</pre>

* Options after "--" are passed on to the base clusterer.

* @author Richard Kirkby (rkirkby@cs.waikato.ac.nz)

* @author Mark Hall (mhall@cs.waikato.ac.nz)

* @author Eibe Frank (eibe@cs.waikato.ac.nz)

* @version $Revision: 1.13 $

public class MakeDensityBasedClusterer

extends DensityBasedClusterer

implements NumberOfClustersRequestable,

OptionHandler,

WeightedInstancesHandler {

/** for serialization */

static final long serialVersionUID = -5643302427972186631L;

/** holds training instances header information */

private Instances m_theInstances;

/** prior probabilities for the fitted clusters */

private double [] m_priors;

/** normal distributions fitted to each numeric attribute in each cluster */

private double [][][] m_modelNormal;

/** discrete distributions fitted to each discrete attribute in each cluster */

private DiscreteEstimator [][] m_model;

/** default minimum standard deviation */

private double m_minStdDev = 1e-6;

/** The clusterer being wrapped */

private Clusterer m_wrappedClusterer = new weka.clusterers.SimpleKMeans();

/** globally replace missing values */

private ReplaceMissingValues m_replaceMissing;

/**

100

* Default constructor.

101

102

103

public MakeDensityBasedClusterer() {

104

super();

105

}

106

107

/**

108

* Contructs a MakeDensityBasedClusterer wrapping a given Clusterer.

109

110

* @param toWrap the clusterer to wrap around

111

112

public MakeDensityBasedClusterer(Clusterer toWrap) {

113

114

setClusterer(toWrap);

115

}

116

117

/**

118

* Returns a string describing classifier

119

* @return a description suitable for

120

* displaying in the explorer/experimenter gui

121

122

public String globalInfo() {

123

return

124

"Class for wrapping a Clusterer to make it return a distribution "

125

+ "and density. Fits normal distributions and discrete distributions "

126

+ "within each cluster produced by the wrapped clusterer. Supports the "

127

+ "NumberOfClustersRequestable interface only if the wrapped Clusterer "

128

+ "does.";

129

}

130

131

/**

132

* String describing default clusterer.

133

134

* @return the default clusterer classname

135

136

protected String defaultClustererString() {

137

return SimpleKMeans.class.getName();

138

}

139

140

/**

141

* Set the number of clusters to generate.

142

143

* @param n the number of clusters to generate

144

* @throws Exception if the wrapped clusterer has not been set, or if

145

* the wrapped clusterer does not implement this facility.

146

147

public void setNumClusters(int n) throws Exception {

148

if (m_wrappedClusterer == null) {

149

throw new Exception("Can't set the number of clusters to generate - "

150

+"no clusterer has been set yet.");

151

}

152

if (!(m_wrappedClusterer instanceof NumberOfClustersRequestable)) {

153

throw new Exception("Can't set the number of clusters to generate - "

154

+"wrapped clusterer does not support this facility.");

155

}

156

157

((NumberOfClustersRequestable)m_wrappedClusterer).setNumClusters(n);

158

}

159

160

/**

161

* Returns default capabilities of the clusterer (i.e., of the wrapper

162

* clusterer).

163

164

* @return the capabilities of this clusterer

165

166

public Capabilities getCapabilities() {

167

if (m_wrappedClusterer != null)

168

return m_wrappedClusterer.getCapabilities();

169

else

170

return super.getCapabilities();

171

}

172

173

/**

174

* Builds a clusterer for a set of instances.

175

176

* @param data the instances to train the clusterer with

177

* @throws Exception if the clusterer hasn't been set or something goes wrong

178

179

public void buildClusterer(Instances data) throws Exception {

180

// can clusterer handle the data?

181

getCapabilities().testWithFail(data);

182

183

m_replaceMissing = new ReplaceMissingValues();

184

m_replaceMissing.setInputFormat(data);

185

data = weka.filters.Filter.useFilter(data, m_replaceMissing);

186

187

m_theInstances = new Instances(data, 0);

188

if (m_wrappedClusterer == null) {

189

throw new Exception("No clusterer has been set");

190

}

191

m_wrappedClusterer.buildClusterer(data);

192

m_model =

193

new DiscreteEstimator[m_wrappedClusterer.numberOfClusters()][data.numAttributes()];

194

m_modelNormal =

195

new double[m_wrappedClusterer.numberOfClusters()][data.numAttributes()][2];

196

double[][] weights = new double[m_wrappedClusterer.numberOfClusters()][data.numAttributes()];

197

m_priors = new double[m_wrappedClusterer.numberOfClusters()];

198

for (int i = 0; i < m_wrappedClusterer.numberOfClusters(); i++) {

199

for (int j = 0; j < data.numAttributes(); j++) {

200

if (data.attribute(j).isNominal()) {

201

m_model[i][j] = new DiscreteEstimator(data.attribute(j).numValues(),

202

true);

203

}

204

}

205

}

206

207

Instance inst = null;

208

209

// Compute mean, etc.

210

int[] clusterIndex = new int[data.numInstances()];

211

for (int i = 0; i < data.numInstances(); i++) {

212

inst = data.instance(i);

213

int cluster = m_wrappedClusterer.clusterInstance(inst);

214

m_priors[cluster] += inst.weight();

215

for (int j = 0; j < data.numAttributes(); j++) {

216

if (!inst.isMissing(j)) {

217

if (data.attribute(j).isNominal()) {

218

m_model[cluster][j].addValue(inst.value(j),inst.weight());

219

} else {

220

m_modelNormal[cluster][j][0] += inst.weight() * inst.value(j);

221

weights[cluster][j] += inst.weight();

222

}

223

}

224

}

225

clusterIndex[i] = cluster;

226

}

227

228

for (int j = 0; j < data.numAttributes(); j++) {

229

if (data.attribute(j).isNumeric()) {

230

for (int i = 0; i < m_wrappedClusterer.numberOfClusters(); i++) {

231

if (weights[i][j] > 0) {

232

m_modelNormal[i][j][0] /= weights[i][j];

233

}

234

}

235

}

236

}

237

238

// Compute standard deviations

239

for (int i = 0; i < data.numInstances(); i++) {

240

inst = data.instance(i);

241

for (int j = 0; j < data.numAttributes(); j++) {

242

if (!inst.isMissing(j)) {

243

if (data.attribute(j).isNumeric()) {

244

double diff = m_modelNormal[clusterIndex[i]][j][0] - inst.value(j);

245

m_modelNormal[clusterIndex[i]][j][1] += inst.weight() * diff * diff;

246

}

247

}

248

}

249

}

250

251

for (int j = 0; j < data.numAttributes(); j++) {

252

if (data.attribute(j).isNumeric()) {

253

for (int i = 0; i < m_wrappedClusterer.numberOfClusters(); i++) {

254

if (weights[i][j] > 0) {

255

m_modelNormal[i][j][1] =

256

Math.sqrt(m_modelNormal[i][j][1] / weights[i][j]);

257

} else if (weights[i][j] <= 0) {

258

m_modelNormal[i][j][1] = Double.MAX_VALUE;

259

}

260

if (m_modelNormal[i][j][1] <= m_minStdDev) {

261

m_modelNormal[i][j][1] = data.attributeStats(j).numericStats.stdDev;

262

if (m_modelNormal[i][j][1] <= m_minStdDev) {

263

m_modelNormal[i][j][1] = m_minStdDev;

264

}

265

}

266

}

267

}

268

}

269

270

Utils.normalize(m_priors);

271

}

272

273

/**

274

* Returns the cluster priors.

275

276

* @return the cluster priors

277

278

public double[] clusterPriors() {

279

280

double[] n = new double[m_priors.length];

281

282

System.arraycopy(m_priors, 0, n, 0, n.length);

283

return n;

284

}

285

286

/**

287

* Computes the log of the conditional density (per cluster) for a given instance.

288

289

* @param inst the instance to compute the density for

290

* @return an array containing the estimated densities

291

* @throws Exception if the density could not be computed

292

* successfully

293

294

public double[] logDensityPerClusterForInstance(Instance inst) throws Exception {

295

296

int i, j;

297

double logprob;

298

double[] wghts = new double[m_wrappedClusterer.numberOfClusters()];

299

300

m_replaceMissing.input(inst);

301

inst = m_replaceMissing.output();

302

303

for (i = 0; i < m_wrappedClusterer.numberOfClusters(); i++) {

304

logprob = 0;

305

for (j = 0; j < inst.numAttributes(); j++) {

306

if (!inst.isMissing(j)) {

307

if (inst.attribute(j).isNominal()) {

308

logprob += Math.log(m_model[i][j].getProbability(inst.value(j)));

309

} else { // numeric attribute

310

logprob += logNormalDens(inst.value(j),

311

m_modelNormal[i][j][0],

312

m_modelNormal[i][j][1]);

313

}

314

}

315

}

316

wghts[i] = logprob;

317

}

318

return wghts;

319

}

320

321

/** Constant for normal distribution. */

322

private static double m_normConst = 0.5 * Math.log(2 * Math.PI);

323

324

/**

325

* Density function of normal distribution.

326

* @param x input value

327

* @param mean mean of distribution

328

* @param stdDev standard deviation of distribution

329

* @return the density

330

331

private double logNormalDens (double x, double mean, double stdDev) {

332

333

double diff = x - mean;

334

335

return - (diff * diff / (2 * stdDev * stdDev)) - m_normConst - Math.log(stdDev);

336

}

337

338

/**

339

* Returns the number of clusters.

340

341

* @return the number of clusters generated for a training dataset.

342

* @throws Exception if number of clusters could not be returned successfully

343

344

public int numberOfClusters() throws Exception {

345

346

return m_wrappedClusterer.numberOfClusters();

347

}

348

349

/**

350

* Returns a description of the clusterer.

351

352

* @return a string containing a description of the clusterer

353

354

public String toString() {

355

StringBuffer text = new StringBuffer();

356

text.append("MakeDensityBasedClusterer: \n\nWrapped clusterer: "

357

+ m_wrappedClusterer.toString());

358

359

text.append("\nFitted estimators (with ML estimates of variance):\n");

360

361

for (int j = 0; j < m_priors.length; j++) {

362

text.append("\nCluster: " + j + " Prior probability: "

363

+ Utils.doubleToString(m_priors[j], 4) + "\n\n");

364

365

for (int i = 0; i < m_model[0].length; i++) {

366

text.append("Attribute: " + m_theInstances.attribute(i).name() + "\n");

367

368

if (m_theInstances.attribute(i).isNominal()) {

369

if (m_model[j][i] != null) {

370

text.append(m_model[j][i].toString());

371

}

372

}

373

else {

374

text.append("Normal Distribution. Mean = "

375

+ Utils.doubleToString(m_modelNormal[j][i][0], 4)

376

+ " StdDev = "

377

+ Utils.doubleToString(m_modelNormal[j][i][1], 4)

378

+ "\n");

379

}

380

}

381

}

382

383

return text.toString();

384

}

385

386

/**

387

* Returns the tip text for this property

388

* @return tip text for this property suitable for

389

* displaying in the explorer/experimenter gui

390

391

public String clustererTipText() {

392

return "the clusterer to wrap";

393

}

394

395

/**

396

* Sets the clusterer to wrap.

397

398

* @param toWrap the clusterer

399

400

public void setClusterer(Clusterer toWrap) {

401

402

m_wrappedClusterer = toWrap;

403

}

404

405

/**

406

* Gets the clusterer being wrapped.

407

408

* @return the clusterer

409

410

public Clusterer getClusterer() {

411

412

return m_wrappedClusterer;

413

}

414

415

/**

416

* Returns the tip text for this property

417

* @return tip text for this property suitable for

418

* displaying in the explorer/experimenter gui

419

420

public String minStdDevTipText() {

421

return "set minimum allowable standard deviation";

422

}

423

424

/**

425

* Set the minimum value for standard deviation when calculating

426

* normal density. Reducing this value can help prevent arithmetic

427

* overflow resulting from multiplying large densities (arising from small

428

* standard deviations) when there are many singleton or near singleton

429

* values.

430

* @param m minimum value for standard deviation

431

432

public void setMinStdDev(double m) {

433

m_minStdDev = m;

434

}

435

436

/**

437

* Get the minimum allowable standard deviation.

438

* @return the minumum allowable standard deviation

439

440

public double getMinStdDev() {

441

return m_minStdDev;

442

}

443

444

/**

445

* Returns an enumeration describing the available options..

446

447

* @return an enumeration of all the available options.

448

449

public Enumeration listOptions() {

450

Vector result = new Vector();

451

452

result.addElement(new Option(

453

"\tminimum allowable standard deviation for normal density computation "

454

+"\n\t(default 1e-6)"

455

,"M",1,"-M <num>"));

456

457

result.addElement(new Option(

458

"\tClusterer to wrap.\n"

459

+ "\t(default " + defaultClustererString() + ")",

460

"W", 1,"-W <clusterer name>"));

461

462

if ((m_wrappedClusterer != null) &&

463

(m_wrappedClusterer instanceof OptionHandler)) {

464

result.addElement(new Option(

465

"",

466

"", 0, "\nOptions specific to clusterer "

467

+ m_wrappedClusterer.getClass().getName() + ":"));

468

Enumeration enu = ((OptionHandler)m_wrappedClusterer).listOptions();

469

while (enu.hasMoreElements()) {

470

result.addElement(enu.nextElement());

471

}

472

}

473

474

return result.elements();

475

}

476

477

/**

478

* Parses a given list of options. <p/>

479

480

481

* Valid options are: <p/>

482

483

* <pre> -M <num>

484

* minimum allowable standard deviation for normal density computation

485

* (default 1e-6)</pre>

486

487

* <pre> -W <clusterer name>

488

* Clusterer to wrap.

489

* (default weka.clusterers.SimpleKMeans)</pre>

490

491

* <pre>

492

* Options specific to clusterer weka.clusterers.SimpleKMeans:

493

* </pre>

494

495

* <pre> -N <num>

496

* number of clusters. (default = 2).</pre>

497

498

* <pre> -S <num>

499

* random number seed.

500

* (default 10)</pre>

501

502

503

504

* @param options the list of options as an array of strings

505

* @throws Exception if an option is not supported

506

507

public void setOptions(String[] options) throws Exception {

508

509

String optionString = Utils.getOption('M', options);

510

if (optionString.length() != 0)

511

setMinStdDev((new Double(optionString)).doubleValue());

512

else

513

setMinStdDev(1e-6);

514

515

String wString = Utils.getOption('W', options);

516

if (wString.length() == 0)

517

wString = defaultClustererString();

518

setClusterer(Clusterer.forName(wString, Utils.partitionOptions(options)));

519

}

520

521

/**

522

* Gets the current settings of the clusterer.

523

524

* @return an array of strings suitable for passing to setOptions()

525

526

public String[] getOptions() {

527

528

String [] clustererOptions = new String [0];

529

if ((m_wrappedClusterer != null) &&

530

(m_wrappedClusterer instanceof OptionHandler)) {

531

clustererOptions = ((OptionHandler)m_wrappedClusterer).getOptions();

532

}

533

String [] options = new String [clustererOptions.length + 5];

534

int current = 0;

535

536

options[current++] = "-M";

537

options[current++] = ""+getMinStdDev();

538

539

if (getClusterer() != null) {

540

options[current++] = "-W";

541

options[current++] = getClusterer().getClass().getName();

542

}

543

options[current++] = "--";

544

545

System.arraycopy(clustererOptions, 0, options, current,

546

clustererOptions.length);

547

current += clustererOptions.length;

548

while (current < options.length) {

549

options[current++] = "";

550

}

551

return options;

552

}

553

554

/**

555

* Main method for testing this class.

556

557

* @param argv the options

558

559

public static void main(String [] argv) {

560

runClusterer(new MakeDensityBasedClusterer(), argv);

561

}

562

}

Older »