~ubuntu-branches/ubuntu/precise/weka/precise

/**
 * A filter that uses a density-based clusterer to generate cluster membership
 * values; filtered instances are composed of these values plus the class
 * attribute (if set in the input data). If a (nominal) class attribute is set,
 * the clusterer is run separately for each class. The class attribute (if set)
 * and any user-specified attributes are ignored during the clustering
 * operation.
 * <p/>
 *
 * Valid options are: <p/>
 *
 * <pre> -W &lt;clusterer name&gt;
 *  Full name of clusterer to use. eg:
 *   weka.clusterers.EM
 *  Additional options after the '--'.
 *  (default: weka.clusterers.EM)</pre>
 *
 * <pre> -I &lt;att1,att2-att4,...&gt;
 *  The range of attributes the clusterer should ignore.
 *  (the class attribute is automatically ignored)</pre>
 *
 * Options after the -- are passed on to the clusterer.
 *
 * @author Mark Hall (mhall@cs.waikato.ac.nz)
 * @author Eibe Frank
 * @version $Revision: 1.14 $
 */

public class ClusterMembership

extends Filter

implements UnsupervisedFilter, OptionHandler {

/** for serialization */

static final long serialVersionUID = 6675702504667714026L;

/** The clusterer */

protected DensityBasedClusterer m_clusterer = new weka.clusterers.EM();

/** Array for storing the clusterers */

protected DensityBasedClusterer[] m_clusterers;

/** Range of attributes to ignore */

protected Range m_ignoreAttributesRange;

/** Filter for removing attributes */

protected Filter m_removeAttributes;

/** The prior probability for each class */

protected double[] m_priors;

/**

* Returns the Capabilities of this filter.

* @return the capabilities of this object

* @see Capabilities

public Capabilities getCapabilities() {

Capabilities result = m_clusterer.getCapabilities();

result.setMinimumNumberInstances(0);

100

101

return result;

102

}

103

104

/**

105

* Returns the Capabilities of this filter, makes sure that the class is

106

* never set (for the clusterer).

107

108

* @param data the data to use for customization

109

* @return the capabilities of this object, based on the data

110

* @see #getCapabilities()

111

112

public Capabilities getCapabilities(Instances data) {

113

Instances newData;

114

115

newData = new Instances(data, 0);

116

newData.setClassIndex(-1);

117

118

return super.getCapabilities(newData);

119

}

120

121

/**

122

* tests the data whether the filter can actually handle it

123

124

* @param instanceInfo the data to test

125

* @throws Exception if the test fails

126

127

protected void testInputFormat(Instances instanceInfo) throws Exception {

128

getCapabilities(instanceInfo).testWithFail(removeIgnored(instanceInfo));

129

}

130

131

/**

132

* Sets the format of the input instances.

133

134

* @param instanceInfo an Instances object containing the input instance

135

* structure (any instances contained in the object are ignored - only the

136

* structure is required).

137

* @return true if the outputFormat may be collected immediately

138

* @throws Exception if the inputFormat can't be set successfully

139

140

public boolean setInputFormat(Instances instanceInfo) throws Exception {

141

142

super.setInputFormat(instanceInfo);

143

m_removeAttributes = null;

144

m_priors = null;

145

146

return false;

147

}

148

149

/**

150

* filters all attributes that should be ignored

151

152

* @param data the data to filter

153

* @return the filtered data

154

* @throws Exception if filtering fails

155

156

protected Instances removeIgnored(Instances data) throws Exception {

157

Instances result = data;

158

159

if (m_ignoreAttributesRange != null || data.classIndex() >= 0) {

160

result = new Instances(data);

161

m_removeAttributes = new Remove();

162

String rangeString = "";

163

if (m_ignoreAttributesRange != null) {

164

rangeString += m_ignoreAttributesRange.getRanges();

165

}

166

if (data.classIndex() >= 0) {

167

if (rangeString.length() > 0) {

168

rangeString += "," + (data.classIndex() + 1);

169

} else {

170

rangeString = "" + (data.classIndex() + 1);

171

}

172

}

173

((Remove) m_removeAttributes).setAttributeIndices(rangeString);

174

((Remove) m_removeAttributes).setInvertSelection(false);

175

m_removeAttributes.setInputFormat(data);

176

result = Filter.useFilter(data, m_removeAttributes);

177

}

178

179

return result;

180

}

181

182

/**

183

* Signify that this batch of input to the filter is finished.

184

185

* @return true if there are instances pending output

186

* @throws IllegalStateException if no input structure has been defined

187

188

public boolean batchFinished() throws Exception {

189

190

if (getInputFormat() == null) {

191

throw new IllegalStateException("No input instance format defined");

192

}

193

194

if (outputFormatPeek() == null) {

195

Instances toFilter = getInputFormat();

196

Instances[] toFilterIgnoringAttributes;

197

198

// Make subsets if class is nominal

199

if ((toFilter.classIndex() >= 0) && toFilter.classAttribute().isNominal()) {

200

toFilterIgnoringAttributes = new Instances[toFilter.numClasses()];

201

for (int i = 0; i < toFilter.numClasses(); i++) {

202

toFilterIgnoringAttributes[i] = new Instances(toFilter, toFilter.numInstances());

203

}

204

for (int i = 0; i < toFilter.numInstances(); i++) {

205

toFilterIgnoringAttributes[(int)toFilter.instance(i).classValue()].add(toFilter.instance(i));

206

}

207

m_priors = new double[toFilter.numClasses()];

208

for (int i = 0; i < toFilter.numClasses(); i++) {

209

toFilterIgnoringAttributes[i].compactify();

210

m_priors[i] = toFilterIgnoringAttributes[i].sumOfWeights();

211

}

212

Utils.normalize(m_priors);

213

} else {

214

toFilterIgnoringAttributes = new Instances[1];

215

toFilterIgnoringAttributes[0] = toFilter;

216

m_priors = new double[1];

217

m_priors[0] = 1;

218

}

219

220

// filter out attributes if necessary

221

for (int i = 0; i < toFilterIgnoringAttributes.length; i++)

222

toFilterIgnoringAttributes[i] = removeIgnored(toFilterIgnoringAttributes[i]);

223

224

// build the clusterers

225

if ((toFilter.classIndex() <= 0) || !toFilter.classAttribute().isNominal()) {

226

m_clusterers = DensityBasedClusterer.makeCopies(m_clusterer, 1);

227

m_clusterers[0].buildClusterer(toFilterIgnoringAttributes[0]);

228

} else {

229

m_clusterers = DensityBasedClusterer.makeCopies(m_clusterer, toFilter.numClasses());

230

for (int i = 0; i < m_clusterers.length; i++) {

231

if (toFilterIgnoringAttributes[i].numInstances() == 0) {

232

m_clusterers[i] = null;

233

} else {

234

m_clusterers[i].buildClusterer(toFilterIgnoringAttributes[i]);

235

}

236

}

237

}

238

239

// create output dataset

240

FastVector attInfo = new FastVector();

241

for (int j = 0; j < m_clusterers.length; j++) {

242

if (m_clusterers[j] != null) {

243

for (int i = 0; i < m_clusterers[j].numberOfClusters(); i++) {

244

attInfo.addElement(new Attribute("pCluster_" + j + "_" + i));

245

}

246

}

247

}

248

if (toFilter.classIndex() >= 0) {

249

attInfo.addElement(toFilter.classAttribute().copy());

250

}

251

attInfo.trimToSize();

252

Instances filtered = new Instances(toFilter.relationName()+"_clusterMembership",

253

attInfo, 0);

254

if (toFilter.classIndex() >= 0) {

255

filtered.setClassIndex(filtered.numAttributes() - 1);

256

}

257

setOutputFormat(filtered);

258

259

// build new dataset

260

for (int i = 0; i < toFilter.numInstances(); i++) {

261

convertInstance(toFilter.instance(i));

262

}

263

}

264

flushInput();

265

266

m_NewBatch = true;

267

return (numPendingOutput() != 0);

268

}

269

270

/**

271

* Input an instance for filtering. Ordinarily the instance is processed

272

* and made available for output immediately. Some filters require all

273

* instances be read before producing output.

274

275

* @param instance the input instance

276

* @return true if the filtered instance may now be

277

* collected with output().

278

* @throws IllegalStateException if no input format has been defined.

279

280

public boolean input(Instance instance) throws Exception {

281

282

if (getInputFormat() == null) {

283

throw new IllegalStateException("No input instance format defined");

284

}

285

if (m_NewBatch) {

286

resetQueue();

287

m_NewBatch = false;

288

}

289

290

if (outputFormatPeek() != null) {

291

convertInstance(instance);

292

return true;

293

}

294

295

bufferInput(instance);

296

return false;

297

}

298

299

/**

300

* Converts logs back to density values.

301

302

* @param j the index of the clusterer

303

* @param in the instance to convert the logs back

304

* @return the densities

305

* @throws Exception if something goes wrong

306

307

protected double[] logs2densities(int j, Instance in) throws Exception {

308

309

double[] logs = m_clusterers[j].logJointDensitiesForInstance(in);

310

311

for (int i = 0; i < logs.length; i++) {

312

logs[i] += Math.log(m_priors[j]);

313

}

314

return logs;

315

}

316

317

/**

318

* Convert a single instance over. The converted instance is added to

319

* the end of the output queue.

320

321

* @param instance the instance to convert

322

* @throws Exception if something goes wrong

323

324

protected void convertInstance(Instance instance) throws Exception {

325

326

// set up values

327

double [] instanceVals = new double[outputFormatPeek().numAttributes()];

328

double [] tempvals;

329

if (instance.classIndex() >= 0) {

330

tempvals = new double[outputFormatPeek().numAttributes() - 1];

331

} else {

332

tempvals = new double[outputFormatPeek().numAttributes()];

333

}

334

int pos = 0;

335

for (int j = 0; j < m_clusterers.length; j++) {

336

if (m_clusterers[j] != null) {

337

double [] probs;

338

if (m_removeAttributes != null) {

339

m_removeAttributes.input(instance);

340

probs = logs2densities(j, m_removeAttributes.output());

341

} else {

342

probs = logs2densities(j, instance);

343

}

344

System.arraycopy(probs, 0, tempvals, pos, probs.length);

345

pos += probs.length;

346

}

347

}

348

tempvals = Utils.logs2probs(tempvals);

349

System.arraycopy(tempvals, 0, instanceVals, 0, tempvals.length);

350

if (instance.classIndex() >= 0) {

351

instanceVals[instanceVals.length - 1] = instance.classValue();

352

}

353

354

push(new Instance(instance.weight(), instanceVals));

355

}

356

357

/**

358

* Returns an enumeration describing the available options.

359

360

* @return an enumeration of all the available options.

361

362

public Enumeration listOptions() {

363

364

Vector newVector = new Vector(2);

365

366

newVector.

367

addElement(new Option("\tFull name of clusterer to use. eg:\n"

368

+ "\t\tweka.clusterers.EM\n"

369

+ "\tAdditional options after the '--'.\n"

370

+ "\t(default: weka.clusterers.EM)",

371

"W", 1, "-W <clusterer name>"));

372

373

newVector.

374

addElement(new Option("\tThe range of attributes the clusterer should ignore."

375

+"\n\t(the class attribute is automatically ignored)",

376

"I", 1,"-I <att1,att2-att4,...>"));

377

378

return newVector.elements();

379

}

380

381

/**

382

* Parses a given list of options. <p/>

383

384

385

* Valid options are: <p/>

386

387

* <pre> -W <clusterer name>

388

* Full name of clusterer to use. eg:

389

* weka.clusterers.EM

390

* Additional options after the '--'.

391

* (default: weka.clusterers.EM)</pre>

392

393

* <pre> -I <att1,att2-att4,...>

394

* The range of attributes the clusterer should ignore.

395

* (the class attribute is automatically ignored)</pre>

396

397

398

399

* Options after the -- are passed on to the clusterer.

400

401

* @param options the list of options as an array of strings

402

* @throws Exception if an option is not supported

403

404

public void setOptions(String[] options) throws Exception {

405

406

String clustererString = Utils.getOption('W', options);

407

if (clustererString.length() == 0)

408

clustererString = weka.clusterers.EM.class.getName();

409

setDensityBasedClusterer((DensityBasedClusterer)Utils.

410

forName(DensityBasedClusterer.class, clustererString,

411

Utils.partitionOptions(options)));

412

413

setIgnoredAttributeIndices(Utils.getOption('I', options));

414

Utils.checkForRemainingOptions(options);

415

}

416

417

/**

418

* Gets the current settings of the filter.

419

420

* @return an array of strings suitable for passing to setOptions

421

422

public String [] getOptions() {

423

424

String [] clustererOptions = new String [0];

425

if ((m_clusterer != null) &&

426

(m_clusterer instanceof OptionHandler)) {

427

clustererOptions = ((OptionHandler)m_clusterer).getOptions();

428

}

429

String [] options = new String [clustererOptions.length + 5];

430

int current = 0;

431

432

if (!getIgnoredAttributeIndices().equals("")) {

433

options[current++] = "-I";

434

options[current++] = getIgnoredAttributeIndices();

435

}

436

437

if (m_clusterer != null) {

438

options[current++] = "-W";

439

options[current++] = getDensityBasedClusterer().getClass().getName();

440

}

441

442

options[current++] = "--";

443

System.arraycopy(clustererOptions, 0, options, current,

444

clustererOptions.length);

445

current += clustererOptions.length;

446

447

while (current < options.length) {

448

options[current++] = "";

449

}

450

return options;

451

}

452

453

/**

454

* Returns a string describing this filter

455

456

* @return a description of the filter suitable for

457

* displaying in the explorer/experimenter gui

458

459

public String globalInfo() {

460

461

return "A filter that uses a density-based clusterer to generate cluster "

462

+ "membership values; filtered instances are composed of these values "

463

+ "plus the class attribute (if set in the input data). If a (nominal) "

464

+ "class attribute is set, the clusterer is run separately for each "

465

+ "class. The class attribute (if set) and any user-specified "

466

+ "attributes are ignored during the clustering operation";

467

}

468

469

/**

470

* Returns a description of this option suitable for display

471

* as a tip text in the gui.

472

473

* @return description of this option

474

475

public String densityBasedClustererTipText() {

476

return "The clusterer that will generate membership values for the instances.";

477

}

478

479

/**

480

* Set the clusterer for use in filtering

481

482

* @param newClusterer the clusterer to use

483

484

public void setDensityBasedClusterer(DensityBasedClusterer newClusterer) {

485

m_clusterer = newClusterer;

486

}

487

488

/**

489

* Get the clusterer used by this filter

490

491

* @return the clusterer used

492

493

public DensityBasedClusterer getDensityBasedClusterer() {

494

return m_clusterer;

495

}

496

497

/**

498

* Returns the tip text for this property

499

500

* @return tip text for this property suitable for

501

* displaying in the explorer/experimenter gui

502

503

public String ignoredAttributeIndicesTipText() {

504

505

return "The range of attributes to be ignored by the clusterer. eg: first-3,5,9-last";

506

}

507

508

/**

509

* Gets ranges of attributes to be ignored.

510

511

* @return a string containing a comma-separated list of ranges

512

513

public String getIgnoredAttributeIndices() {

514

515

if (m_ignoreAttributesRange == null) {

516

return "";

517

} else {

518

return m_ignoreAttributesRange.getRanges();

519

}

520

}

521

522

/**

523

* Sets the ranges of attributes to be ignored. If provided string

524

* is null, no attributes will be ignored.

525

526

* @param rangeList a string representing the list of attributes.

527

* eg: first-3,5,6-last

528

* @throws IllegalArgumentException if an invalid range list is supplied

529

530

public void setIgnoredAttributeIndices(String rangeList) {

531

532

if ((rangeList == null) || (rangeList.length() == 0)) {

533

m_ignoreAttributesRange = null;

534

} else {

535

m_ignoreAttributesRange = new Range();

536

m_ignoreAttributesRange.setRanges(rangeList);

537

}

538

}

539

540

/**

541

* Main method for testing this class.

542

543

* @param argv should contain arguments to the filter: use -h for help

544

545

public static void main(String [] argv) {

546

runFilter(new ClusterMembership(), argv);

547

}

548

}

Older »