/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * OLM.java
 * Copyright (C) 2004 Stijn Lievens
 */
package weka.classifiers.misc;

import weka.classifiers.RandomizableClassifier;
import weka.classifiers.misc.monotone.Coordinates;
import weka.classifiers.misc.monotone.DiscreteDistribution;
import weka.classifiers.misc.monotone.EnumerationIterator;
import weka.classifiers.misc.monotone.InstancesComparator;
import weka.classifiers.misc.monotone.InstancesUtil;
import weka.classifiers.misc.monotone.MultiDimensionalSort;
import weka.core.Attribute;
import weka.core.Capabilities;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.SelectedTag;
import weka.core.Tag;
import weka.core.TechnicalInformation;
import weka.core.TechnicalInformationHandler;
import weka.core.Utils;
import weka.core.Capabilities.Capability;
import weka.core.TechnicalInformation.Field;
import weka.core.TechnicalInformation.Type;
import weka.estimators.DiscreteEstimator;

import java.util.ArrayList;
import java.util.Comparator;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Random;
import java.util.Vector;
/**
 <!-- globalinfo-start -->
 * This class is an implementation of the Ordinal Learning Method<br/>
 * Further information regarding the algorithm and variants can be found in:<br/>
 * <br/>
 * Arie Ben-David (1992). Automatic Generation of Symbolic Multiattribute Ordinal Knowledge-Based DSSs: methodology and Applications. Decision Sciences. 23:1357-1372.<br/>
 * <br/>
 * Lievens, Stijn (2003-2004). Studie en implementatie van instantie-gebaseerde algoritmen voor gesuperviseerd rangschikken..
 * <p/>
 <!-- globalinfo-end -->
 *
 <!-- technical-bibtex-start -->
 * BibTeX:
 * <pre>
 * @article{Ben-David1992,
 *    author = {Arie Ben-David},
 *    journal = {Decision Sciences},
 *    pages = {1357-1372},
 *    title = {Automatic Generation of Symbolic Multiattribute Ordinal Knowledge-Based DSSs: methodology and Applications},
 *    volume = {23},
 *    year = {1992}
 * }
 *
 * @mastersthesis{Lievens2003-2004,
 *    author = {Lievens, Stijn},
 *    school = {Ghent University},
 *    title = {Studie en implementatie van instantie-gebaseerde algoritmen voor gesuperviseerd rangschikken.},
 *    year = {2003-2004}
 * }
 * </pre>
 * <p/>
 <!-- technical-bibtex-end -->
 *
 <!-- options-start -->
 * Valid options are: <p/>
 *
 * <pre> -S <num>
 *  Sets the seed for randomizing the instances
 *  (Default: 1)</pre>
 *
 * <pre> -D
 *  If set, classifier is run in debug mode and
 *  may output additional info to the console</pre>
 *
 * <pre> -C <CL|REG>
 *  Sets the classification type to be used.
 *  (Default: REG)</pre>
 *
 * <pre> -A <MEAN|MED|MAX>
 *  Sets the averaging type used in phase 1 of the classifier.
 *  (Default: MEAN)</pre>
 *
 * <pre> -N <NONE|EUCL|HAM>
 *  If different from NONE, a nearest neighbour rule is fired when the
 *  rule base doesn't contain an example smaller than the instance
 *  to be classified
 *  (Default: NONE).</pre>
 *
 * <pre> -E <MIN|MAX|BOTH>
 *  Sets the extension type, i.e. the rule base to use.
 *  (Default: MIN)</pre>
 *
 * <pre> -sort
 *  If set, the instances are also sorted within the same class
 *  before building the rule bases</pre>
 *
 <!-- options-end -->
 *
 * @author Stijn Lievens (stijn.lievens@ugent.be)
 * @version $Revision: 1.1 $
 */
extends RandomizableClassifier
130
implements TechnicalInformationHandler {
132
/** for serialization */
133
private static final long serialVersionUID = 3722951802290935192L;
136
* Round the real value that is returned by the original algorithm
137
* to the nearest label.
139
public static final int CT_ROUNDED = 0;
142
* No rounding is performed during classification, this is the
143
* classification is done in a regression like way.
145
public static final int CT_REAL = 1;
147
/** the classification types */
148
public static final Tag[] TAGS_CLASSIFICATIONTYPES = {
149
new Tag(CT_ROUNDED, "CL", "Round to nearest label"),
150
new Tag(CT_REAL, "REG", "Regression-like classification")
154
* Use the mean for averaging in phase 1. This is in fact a
155
* non ordinal procedure. The scores used for averaging are the internal
158
public static final int AT_MEAN = 0;
161
* Use the median for averaging in phase 1. The possible values
162
* are in the extended set of labels, this is labels in between the
163
* original labels are possible.
165
public static final int AT_MEDIAN = 1;
168
* Use the mode for averaging in phase 1. The label
169
* that has maximum frequency is used. If there is more
170
* than one label that has maximum frequency, the lowest
173
public static final int AT_MAXPROB = 2;
175
/** the averaging types */
176
public static final Tag[] TAGS_AVERAGINGTYPES = {
177
new Tag(AT_MEAN, "MEAN", "Mean"),
178
new Tag(AT_MEDIAN, "MED","Median"),
179
new Tag(AT_MAXPROB, "MAX", "Max probability")
183
* No nearest neighbour rule will be fired when
184
* classifying an instance for which there is no smaller rule
187
public static final int DT_NONE = -1;
190
* Use the Euclidian distance whenever a nearest neighbour
193
public static final int DT_EUCLID = 0;
196
* Use the Hamming distance, this is the number of
197
* positions in which the instances differ, whenever a
198
* nearest neighbour rule is fired
200
public static final int DT_HAMMING = 1;
202
/** the distance types */
203
public static final Tag[] TAGS_DISTANCETYPES = {
204
new Tag(DT_NONE, "NONE", "No nearest neighbor"),
205
new Tag(DT_EUCLID, "EUCL", "Euclidean"),
206
new Tag(DT_HAMMING, "HAM", "Hamming")
210
* Use only the minimal extension, as in the original algorithm
213
public static final int ET_MIN = 0;
216
* Use only the maximal extension. In this case an algorithm
217
* dual to the original one is performed.
219
public static final int ET_MAX = 1;
222
* Combine both the minimal and maximal extension, and use the
223
* midpoint of the resulting interval as prediction.
225
public static final int ET_BOTH = 2;
227
/** the mode types */
228
public static final Tag[] TAGS_EXTENSIONTYPES = {
229
new Tag(ET_MIN, "MIN", "Minimal extension"),
230
new Tag(ET_MAX, "MAX", "Maximal extension"),
231
new Tag(ET_BOTH, "BOTH", "Minimal and maximal extension")
235
* The training examples, used temporarily.
236
* m_train is cleared after the rule base is built.
238
private Instances m_train;
240
/** Number of classes in the original m_train */
241
private int m_numClasses;
244
* The rule base, should be consistent and contain no
245
* redundant rules. This is the rule base as in the original
246
* algorithm of Ben-David.
248
private Instances m_baseMin;
251
* This is a complentary rule base, using the maximal rather
252
* than the minimal extension.
254
private Instances m_baseMax;
257
* Map used in the method buildClassifier in order to quickly
258
* gather all info needed for phase 1. This is a map containing
259
* (Coordinates, DiscreteEstimator)-pairs.
261
private Map m_estimatedDistributions;
263
/** classification type */
264
private int m_ctype = CT_REAL;
266
/** averaging type */
267
private int m_atype = AT_MEAN;
270
private int m_dtype = DT_EUCLID;
273
private int m_etype = ET_MIN;
276
* Should the instances be sorted such that minimal (resp. maximal)
277
* elements (per class) are treated first when building m_baseMin
280
private boolean m_sort = false;
283
* Returns a string describing the classifier.
284
* @return a description suitable for displaying in the
285
* explorer/experimenter gui
287
public String globalInfo() {
288
return "This class is an implementation of the Ordinal Learning "
290
+ "Further information regarding the algorithm and variants "
291
+ "can be found in:\n\n"
292
+ getTechnicalInformation().toString();
296
* Returns default capabilities of the classifier.
298
* @return the capabilities of this classifier
300
public Capabilities getCapabilities() {
301
Capabilities result = super.getCapabilities();
304
result.enable(Capability.NOMINAL_ATTRIBUTES);
307
result.enable(Capability.NOMINAL_CLASS);
308
result.enable(Capability.MISSING_CLASS_VALUES);
311
result.setMinimumNumberInstances(0);
317
* Returns an instance of a TechnicalInformation object, containing
318
* detailed information about the technical background of this class,
319
* e.g., paper reference or book this class is based on.
321
* @return the technical information about this class
323
public TechnicalInformation getTechnicalInformation() {
324
TechnicalInformation result;
325
TechnicalInformation additional;
327
result = new TechnicalInformation(Type.ARTICLE);
328
result.setValue(Field.AUTHOR, "Arie Ben-David");
329
result.setValue(Field.YEAR, "1992");
330
result.setValue(Field.TITLE, "Automatic Generation of Symbolic Multiattribute Ordinal Knowledge-Based DSSs: methodology and Applications");
331
result.setValue(Field.JOURNAL, "Decision Sciences");
332
result.setValue(Field.PAGES, "1357-1372");
333
result.setValue(Field.VOLUME, "23");
335
additional = result.add(Type.MASTERSTHESIS);
336
additional.setValue(Field.AUTHOR, "Lievens, Stijn");
337
additional.setValue(Field.YEAR, "2003-2004");
338
additional.setValue(Field.TITLE, "Studie en implementatie van instantie-gebaseerde algoritmen voor gesuperviseerd rangschikken.");
339
additional.setValue(Field.SCHOOL, "Ghent University");
345
* Returns the tip text for this property.
347
* @return tip text for this property suitable for
348
* displaying in the explorer/experimenter gui
350
public String classificationTypeTipText() {
351
return "Sets the classification type.";
355
* Sets the classification type.
357
* @param value the classification type to be set.
359
public void setClassificationType(SelectedTag value) {
360
if (value.getTags() == TAGS_CLASSIFICATIONTYPES)
361
m_ctype = value.getSelectedTag().getID();
365
* Gets the classification type.
367
* @return the classification type
369
public SelectedTag getClassificationType() {
370
return new SelectedTag(m_ctype, TAGS_CLASSIFICATIONTYPES);
374
* Returns the tip text for this property.
376
* @return tip text for this property suitable for
377
* displaying in the explorer/experimenter gui
379
public String averagingTypeTipText() {
380
return "Choses the way in which the distributions are averaged in "
381
+ "the first phase of the algorithm.";
385
* Sets the averaging type to use in phase 1 of the algorithm.
387
* @param value the averaging type to use
389
public void setAveragingType(SelectedTag value) {
390
if (value.getTags() == TAGS_AVERAGINGTYPES)
391
m_atype = value.getSelectedTag().getID();
395
* Gets the averaging type.
397
* @return the averaging type
399
public SelectedTag getAveragingType() {
400
return new SelectedTag(m_atype, TAGS_AVERAGINGTYPES);
404
* Returns the tip text for this property.
406
* @return tip text for this property suitable for
407
* displaying in the explorer/experimenter gui
409
public String distanceTypeTipText() {
410
return "Sets the distance that is to be used by the nearest neighbour "
415
* Sets the distance type to be used by a nearest neighbour rule (if any).
417
* @param value the distance type to use
419
public void setDistanceType(SelectedTag value) {
420
if (value.getTags() == TAGS_DISTANCETYPES)
421
m_dtype = value.getSelectedTag().getID();
425
* Gets the distance type used by a nearest neighbour rule (if any).
427
* @return the distance type
429
public SelectedTag getDistanceType() {
430
return new SelectedTag(m_dtype, TAGS_DISTANCETYPES);
434
* Returns the tip text for this property.
436
* @return tip text for this property suitable for
437
* displaying in the explorer/experimenter gui
439
public String extensionTypeTipText() {
440
return "Sets the extension type to use.";
444
* Sets the extension type to use.
445
* The minimal extension is the one used by
446
* Ben-David in the original algorithm. The maximal extension is
447
* a completely dual variant of the minimal extension. When using
448
* both, then the midpoint of the interval determined by both
449
* extensions is returned.
451
* @param value the extension type to use
453
public void setExtensionType(SelectedTag value) {
454
if (value.getTags() == TAGS_EXTENSIONTYPES)
455
m_etype = value.getSelectedTag().getID();
459
* Gets the extension type.
461
* @return the extension type
463
public SelectedTag getExtensionType() {
464
return new SelectedTag(m_etype, TAGS_EXTENSIONTYPES);
468
* Returns the tip text for this property.
470
* @return tip text for this property suitable for
471
* displaying in the explorer/experimenter gui
473
public String sortTipText() {
474
return "If true, the instances are also sorted within the classes "
475
+ "prior to building the rule bases.";
479
* Sets if the instances are to be sorted prior to building the rule bases.
481
* @param sort if <code> true </code> the instances will be sorted
483
public void setSort(boolean sort) {
488
* Returns if the instances are sorted prior to building the rule bases.
490
* @return <code> true </code> if instances are sorted prior to building
491
* the rule bases, <code> false </code> otherwise.
493
public boolean getSort() {
498
* Returns the tip text for this property.
500
* @return tip text for this property suitable for
501
* displaying in the explorer/experimenter gui
503
public String seedTipText() {
504
return "Sets the seed that is used to randomize the instances prior "
505
+ "to building the rule bases";
509
* Return the number of examples in the minimal rule base.
510
* The minimal rule base is the one that corresponds to the
511
* rule base of Ben-David.
513
* @return the number of examples in the minimal rule base
515
public int getSizeRuleBaseMin() {
516
return m_baseMin.numInstances();
520
* Return the number of examples in the maximal rule base.
521
* The maximal rule base is built using an algorithm
522
* dual to that for building the minimal rule base.
524
* @return the number of examples in the maximal rule base
526
public int getSizeRuleBaseMax() {
527
return m_baseMax.numInstances();
531
* Classifies a given instance according to the current settings
534
* @param instance the instance to be classified
535
* @return a <code> double </code> that represents the classification,
536
* this could either be the internal value of a label, when rounding is
537
* on, or a real number.
539
public double classifyInstance(Instance instance) {
540
double classValueMin = -1;
541
double classValueMax = -1;
544
if (m_etype == ET_MIN || m_etype == ET_BOTH) {
545
classValueMin = classifyInstanceMin(instance);
548
if (m_etype == ET_MAX || m_etype == ET_BOTH) {
549
classValueMax = classifyInstanceMax(instance);
554
classValue = classValueMin;
557
classValue = classValueMax;
560
classValue = (classValueMin + classValueMax) / 2;
563
throw new IllegalStateException("Illegal mode type!");
566
// round if necessary and return
567
return (m_ctype == CT_ROUNDED ? Utils.round(classValue) : classValue);
571
* Classify <code> instance </code> using the minimal rule base.
572
* Rounding is never performed, this is the responsability
573
* of <code> classifyInstance </code>.
575
* @param instance the instance to be classified
576
* @return the classification according to the minimal rule base
578
private double classifyInstanceMin(Instance instance) {
579
double classValue = -1;
580
if (m_baseMin == null) {
581
throw new IllegalStateException
582
("Classifier has not yet been built");
585
Iterator it = new EnumerationIterator(m_baseMin.enumerateInstances());
586
while (it.hasNext()) {
587
Instance r = (Instance) it.next();
589
// we assume that rules are ordered in decreasing class value order
590
// so that the first one that we encounter is immediately the
591
// one with the biggest class value
592
if (InstancesUtil.smallerOrEqual(r, instance)) {
593
classValue = r.classValue();
598
// there is no smaller rule in the database
599
if (classValue == -1) {
600
if (m_dtype != DT_NONE) {
601
Instance[] nn = nearestRules(instance, m_baseMin);
603
// XXX for the moment we only use the mean to extract a
604
// classValue; other possibilities might be included later
605
for (int i = 0; i < nn.length; i++) {
606
classValue += nn[i].classValue();
608
classValue /= nn.length;
611
classValue = 0; // minimal class value
615
return classValue; // no rounding!
619
* Classify <code> instance </code> using the maximal rule base.
620
* Rounding is never performed, this is the responsability
621
* of <code> classifyInstance </code>.
623
* @param instance the instance to be classified
624
* @return the classification according to the maximal rule base
626
private double classifyInstanceMax(Instance instance) {
627
double classValue = -1;
628
if (m_baseMax == null) {
629
throw new IllegalStateException
630
("Classifier has not yet been built");
633
Iterator it = new EnumerationIterator(m_baseMax.enumerateInstances());
634
while (it.hasNext()) {
635
Instance r = (Instance) it.next();
637
// we assume that rules are ordered in increasing class value order
638
// so that the first bigger one we encounter will be the one
639
// with the smallest label
640
if (InstancesUtil.smallerOrEqual(instance, r)) {
641
classValue = r.classValue();
646
// there is no bigger rule in the database
647
if (classValue == -1) {
648
if (m_dtype != DT_NONE) {
649
// XXX see remark in classifyInstanceMin
650
Instance[] nn = nearestRules(instance, m_baseMax);
652
for (int i = 0; i < nn.length; i++) {
653
classValue += nn[i].classValue();
655
classValue /= nn.length;
658
classValue = m_numClasses - 1; // maximal label
666
* Find the instances in <code> base </code> that are,
667
* according to the current distance type, closest
668
* to <code> instance </code>.
670
* @param instance the instance around which one looks
671
* @param base the instances to choose from
672
* @return an array of <code> Instance </code> which contains
673
* the instances closest to <code> instance </code>
675
private Instance[] nearestRules(Instance instance, Instances base) {
676
double min = Double.POSITIVE_INFINITY;
678
double[] instanceDouble = InstancesUtil.toDataDouble(instance);
679
ArrayList nn = new ArrayList();
680
Iterator it = new EnumerationIterator(base.enumerateInstances());
681
while(it.hasNext()) {
682
Instance r = (Instance) it.next();
683
double[] rDouble = InstancesUtil.toDataDouble(r);
686
dist = euclidDistance(instanceDouble, rDouble);
689
dist = hammingDistance(instanceDouble, rDouble);
692
throw new IllegalArgumentException("distance type is not valid");
698
} else if (dist == min) {
704
return (Instance[]) nn.toArray(new Instance[0]);
708
* Build the OLM classifier, meaning that the rule bases
711
* @param instances the instances to use for building the rule base
712
* @throws Exception if <code> instances </code> cannot be handled by
715
public void buildClassifier(Instances instances) throws Exception {
716
getCapabilities().testWithFail(instances);
719
m_train = new Instances(instances);
720
m_numClasses = m_train.numClasses();
722
// new dataset in which examples with missing class value are removed
723
m_train.deleteWithMissingClass();
725
// build the Map for the estimatedDistributions
726
m_estimatedDistributions = new HashMap(m_train.numInstances() / 2);
728
// cycle through all instances
729
Iterator it = new EnumerationIterator(m_train.enumerateInstances());
730
while (it.hasNext() == true) {
731
Instance instance = (Instance) it.next();
732
Coordinates c = new Coordinates(instance);
734
// get DiscreteEstimator from the map
735
DiscreteEstimator df =
736
(DiscreteEstimator) m_estimatedDistributions.get(c);
738
// if no DiscreteEstimator is present in the map, create one
740
df = new DiscreteEstimator(instances.numClasses(), 0);
742
df.addValue(instance.classValue(), instance.weight()); // update
743
m_estimatedDistributions.put(c, df); // put back in map
746
// Create the attributes for m_baseMin and m_baseMax.
747
// These are identical to those of m_train, except that the
748
// class is set to 'numeric'
749
// The class attribute is moved to the back
750
FastVector newAtts = new FastVector(m_train.numAttributes());
751
Attribute classAttribute = null;
752
for (int i = 0; i < m_train.numAttributes(); i++) {
753
Attribute att = m_train.attribute(i);
754
if (i != m_train.classIndex()) {
755
newAtts.addElement(att.copy());
757
classAttribute = new Attribute(att.name()); //numeric attribute
760
newAtts.addElement(classAttribute);
762
// original training instances are replaced by an empty set
764
m_train = new Instances(m_train.relationName(), newAtts,
765
m_estimatedDistributions.size());
766
m_train.setClassIndex(m_train.numAttributes() - 1);
769
// We cycle through the map of estimatedDistributions and
770
// create one Instance for each entry in the map, with
771
// a class value that is calculated from the distribution of
773
it = m_estimatedDistributions.keySet().iterator();
774
while(it.hasNext()) {
775
// XXX attValues must be here, otherwise things go wrong
776
double[] attValues = new double[m_train.numAttributes()];
777
Coordinates cc = (Coordinates) it.next();
778
DiscreteEstimator df =
779
(DiscreteEstimator) m_estimatedDistributions.get(cc);
780
cc.getValues(attValues);
783
attValues[attValues.length - 1] = (new DiscreteDistribution(df)).mean();
786
attValues[attValues.length - 1] = (new DiscreteDistribution(df)).median();
789
attValues[attValues.length - 1] = (new DiscreteDistribution(df)).modes()[0];
792
throw new IllegalStateException("Not a valid averaging type");
795
// add the instance, we give it weight one
796
m_train.add(new Instance(1, attValues));
799
if (m_Debug == true) {
800
System.out.println("The dataset after phase 1 :");
801
System.out.println(m_train.toString());
804
/* Shuffle to training instances, to prevent the order dictated
805
* by the map to play an important role in how the rule bases
808
m_train.randomize(new Random(getSeed()));
810
if (m_sort == false) {
811
// sort the instances only in increasing class order
812
m_train.sort(m_train.classIndex());
814
// sort instances completely
815
Comparator[] cc = new Comparator[m_train.numAttributes()];
816
// sort the class, increasing
817
cc[0] = new InstancesComparator(m_train.classIndex());
818
// sort the attributes, decreasing
819
for (int i = 1; i < cc.length; i++) {
820
cc[i] = new InstancesComparator(i - 1, true);
823
// copy instances into an array
824
Instance[] tmp = new Instance[m_train.numInstances()];
825
for (int i = 0; i < tmp.length; i++) {
826
tmp[i] = m_train.instance(i);
828
MultiDimensionalSort.multiDimensionalSort(tmp, cc);
830
// copy sorted array back into Instances
832
for (int i = 0; i < tmp.length; i++) {
837
// phase 2: building the rule bases themselves
839
new Instances(m_train, m_estimatedDistributions.size() / 4);
842
new Instances(m_train, m_estimatedDistributions.size() / 4);
847
* This implements the second phase of the OLM algorithm.
848
* We build the rule base m_baseMin, according to the conflict
849
* resolution mechanism described in the thesis.
851
private void phaseTwoMin() {
853
// loop through instances backwards, this is biggest class labels first
854
for (int i = m_train.numInstances() - 1; i >=0; i--) {
855
Instance e = m_train.instance(i);
857
// if the example is redundant with m_base, we discard it
858
if (isRedundant(e) == false) {
859
// how many examples are redundant if we would add e
860
int[] redundancies = makesRedundant(e);
861
if (redundancies[0] == 1
862
&& causesReversedPreference(e) == false) {
863
// there is one example made redundant be e, and
864
// adding e doesn't cause reversed preferences
865
// so we replace the indicated rule by e
866
m_baseMin.delete(redundancies[1]);
871
if (redundancies[0] == 0) {
872
// adding e causes no redundancies, what about
873
// reversed preferences ?
874
int[] revPref = reversedPreferences(e);
875
if (revPref[0] == 1) {
876
// there would be one reversed preference, we
877
// prefer the example e because it has a lower label
878
m_baseMin.delete(revPref[1]);
883
if (revPref[0] == 0) {
884
// this means: e causes no redundancies and no
885
// reversed preferences. We can simply add it.
894
* This implements the second phase of the OLM algorithm.
895
* We build the rule base m_baseMax .
897
private void phaseTwoMax() {
899
// loop through instances, smallest class labels first
900
for (int i = 0; i < m_train.numInstances(); i++) {
901
Instance e = m_train.instance(i);
903
// if the example is redundant with m_base, we discard it
904
if (isRedundantMax(e) == false) {
905
// how many examples are redundant if we would add e
906
int[] redundancies = makesRedundantMax(e);
907
if (redundancies[0] == 1
908
&& causesReversedPreferenceMax(e) == false) {
909
// there is one example made redundant be e, and
910
// adding e doesn't cause reversed preferences
911
// so we replace the indicated rule by e
912
m_baseMax.delete(redundancies[1]);
917
if (redundancies[0] == 0) {
918
// adding e causes no redundancies, what about
919
// reversed preferences ?
920
int[] revPref = reversedPreferencesMax(e);
921
if (revPref[0] == 1) {
922
// there would be one reversed preference, we
923
// prefer the example e because it has a lower label
924
m_baseMax.delete(revPref[1]);
929
if (revPref[0] == 0) {
930
// this means: e causes no redundancies and no
931
// reversed preferences. We can simply add it.
940
* Returns a string description of the classifier. In debug
941
* mode, the rule bases are added to the string representation
942
* as well. This means that the description can become rather
945
* @return a <code> String </code> describing the classifier.
947
public String toString() {
948
StringBuffer sb = new StringBuffer();
949
sb.append("OLM\n===\n\n");
950
if (m_etype == ET_MIN || m_etype == ET_BOTH) {
951
if (m_baseMin != null) {
952
sb.append("Number of examples in the minimal rule base = "
953
+ m_baseMin.numInstances() + "\n");
955
sb.append("minimal rule base not yet created");
959
if (m_etype == ET_MAX || m_etype == ET_BOTH) {
960
if (m_baseMax != null) {
961
sb.append("Number of examples in the maximal rule base = "
962
+ m_baseMax.numInstances() + "\n");
964
sb.append("maximal rule base not yet created");
968
if (m_Debug == true) {
970
if (m_etype == ET_MIN || m_etype == ET_BOTH) {
971
sb.append("The minimal rule base is \n");
972
if (m_baseMin != null) {
973
sb.append(m_baseMin.toString());
975
sb.append(".... not yet created");
979
if (m_etype == ET_MAX || m_etype == ET_BOTH) {
980
sb.append("The second rule base is \n");
981
if (m_baseMax != null) {
982
sb.append(m_baseMax.toString());
984
sb.append(".... not yet created");
990
return sb.toString();
994
* Is <code> instance </code> redundant wrt the
995
* current <code> m_baseMin </code>.
996
* This mean we are looking if there is an element in
997
* <code> m_baseMin </code> smaller than <code> instance </code>
998
* and with the same class.
1000
* @param instance the instance of which the redundancy needs to be
1002
* @return <code> true </code> if <code> instance </code>
1003
* is redundant, <code> false </code> otherwise
1005
private boolean isRedundant(Instance instance) {
1006
Iterator it = new EnumerationIterator(m_baseMin.enumerateInstances());
1007
while(it.hasNext()) {
1008
Instance r = (Instance) it.next();
1009
if (instance.classValue() == r.classValue()
1010
&& InstancesUtil.smallerOrEqual(r, instance) ) {
1018
* Is <code> instance </code> redundant wrt the
1019
* current <code> m_baseMax </code>.
1020
* This is dual to the previous method, this means that
1021
* <code> instance </code> is redundant if there exists
1022
* and example greater of equal than <code> instance </code>
1023
* with the same class value.
1025
* @param instance the instance of which the redundancy needs to be
1027
* @return <code> true </code> if <code> instance </code>
1028
* is redundant, <code> false </code> otherwise
1030
private boolean isRedundantMax(Instance instance) {
1031
Iterator it = new EnumerationIterator(m_baseMax.enumerateInstances());
1032
while(it.hasNext()) {
1033
Instance r = (Instance) it.next();
1034
if (instance.classValue() == r.classValue()
1035
&& InstancesUtil.smallerOrEqual(instance, r) ) {
1043
* If we were to add <code> instance </code>, how many
1044
* and which elements from <code> m_baseMin </code> would
1045
* be made redundant by <code> instance </code>.
1047
* @param instance the instance of which the influence is to be determined
1048
* @return an array containing two components
1049
* [0] is 0 if instance makes no elements redundant;
1050
* is 1 if there is one rule that is made redundant by instance;
1051
* is 2 if there is more than one rule that is made redundant;
1052
* [1] if [0] == 1, this entry gives the element that is made redundant
1054
private int[] makesRedundant(Instance instance) {
1055
int[] ret = new int[2];
1056
for (int i = 0; i < m_baseMin.numInstances(); i++) {
1057
Instance r = m_baseMin.instance(i);
1058
if (r.classValue() == instance.classValue() &&
1059
InstancesUtil.smallerOrEqual(instance, r)) {
1063
} else { // this means ret[0] == 1
1073
* If we were to add <code> instance </code>, how many
1074
* and which elements from <code> m_baseMax </code> would
1075
* be made redundant by <code> instance </code>.
1076
* This method is simply dual to <code> makesRedundant </code>.
1078
* @param instance the instance to add
1079
* @return an array containing two components
1080
* [0] is 0 if instance makes no elements redundant;
1081
* is 1 if there is one rule that is made redundant by instance;
1082
* is 2 if there is more than one rule that is made redundant;
1083
* [1] if [0] == 1, this entry gives the element that is made redundant
1085
private int[] makesRedundantMax(Instance instance) {
1086
int[] ret = new int[2];
1087
for (int i = 0; i < m_baseMax.numInstances(); i++) {
1088
Instance r = m_baseMax.instance(i);
1089
if (r.classValue() == instance.classValue() &&
1090
InstancesUtil.smallerOrEqual(r, instance)) {
1094
} else { // this means ret[0] == 1
1104
* Checks if adding <code> instance </code> to <code> m_baseMin </code>
1105
* causes reversed preference amongst the elements of <code>
1106
* m_baseMin </code>.
1108
* @param instance the instance of which the influence needs to be
1110
* @return <code> true </code> if adding <code> instance </code>
1111
* to the rule base would cause reversed preference among the rules,
1112
* <code> false </code> otherwise.
1114
private boolean causesReversedPreference(Instance instance) {
1115
Iterator it = new EnumerationIterator(m_baseMin.enumerateInstances());
1116
while (it.hasNext()) {
1117
Instance r = (Instance) it.next();
1118
if (instance.classValue() > r.classValue() &&
1119
InstancesUtil.smallerOrEqual(instance, r)) {
1120
// in the original version of OLM this should not happen
1122
("Should not happen in the original OLM algorithm");
1124
} else if (r.classValue() > instance.classValue() &&
1125
InstancesUtil.smallerOrEqual(r, instance)) {
1133
* Checks if adding <code> instance </code> to <code> m_baseMax </code>
1134
* causes reversed preference amongst the elements of
1135
* <code> m_baseMax </code>.
1137
* @param instance the instance to add
1138
* @return true if adding <code> instance </code> to the rule
1139
* base would cause reversed preference among the rules,
1142
private boolean causesReversedPreferenceMax(Instance instance) {
1143
Iterator it = new EnumerationIterator(m_baseMax.enumerateInstances());
1144
while (it.hasNext()) {
1145
Instance r = (Instance) it.next();
1146
if (instance.classValue() > r.classValue() &&
1147
InstancesUtil.smallerOrEqual(instance, r)) {
1149
} else if (r.classValue() > instance.classValue() &&
1150
InstancesUtil.smallerOrEqual(r, instance)) {
1158
* Find indices of the elements from <code> m_baseMin </code>
1159
* that are inconsistent with instance.
1161
* @param instance the instance of which the influence needs to be
1163
* @return an array containing two components
1164
* [0] is 0 if instance is consistent with all present elements
1165
* is 1 if instance is inconsistent (reversed preference) with
1166
* exactly one element
1167
* is 2 if instance is inconsistent with more than one element
1168
* [1] if [0] == 1, this entry gives the index of the
1169
* element that has reversed preference wrt to
1170
* <code> instance </code>
1172
private int[] reversedPreferences(Instance instance) {
1173
int[] revPreferences = new int[2];
1174
for (int i = 0; i < m_baseMin.numInstances(); i++) {
1175
Instance r = m_baseMin.instance(i);
1176
if (instance.classValue() < r.classValue() &&
1177
InstancesUtil.smallerOrEqual(r, instance) ) {
1178
if (revPreferences[0] == 0) {
1179
revPreferences[0] = 1;
1180
revPreferences[1] = i;
1182
revPreferences[0] = 2;
1183
return revPreferences;
1187
return revPreferences;
1191
* Find indices of the elements from <code> m_baseMin </code>
1192
* that are inconsistent with instance.
1194
* @param instance the instance of which the influence needs to be
1196
* @return an array containing two components
1197
* [0] is 0 if instance is consistent with all present elements
1198
* is 1 if instance is inconsistent (reversed preference) with
1199
* exactly one element
1200
* is 2 if instance is inconsistent with more than one element
1201
* [1] if [0] == 1, this entry gives the index of the
1202
* element that has reversed preference wrt to
1203
* <code> instance </code>
1205
private int[] reversedPreferencesMax(Instance instance) {
1206
int[] revPreferences = new int[2];
1207
for (int i = 0; i < m_baseMax.numInstances(); i++) {
1208
Instance r = m_baseMax.instance(i);
1209
if (instance.classValue() > r.classValue() &&
1210
InstancesUtil.smallerOrEqual(instance, r) ) {
1211
if (revPreferences[0] == 0) {
1212
revPreferences[0] = 1;
1213
revPreferences[1] = i;
1215
revPreferences[0] = 2;
1216
return revPreferences;
1220
return revPreferences;
1224
* Calculates the square of the Euclidian distance between
1225
* two arrays of doubles. The arrays should have the same length.
1227
* @param a1 the first array
1228
* @param a2 the second array
1229
* @return the square of the Euclidian distance between the two
1230
* arrays <code> a1 </code> and <code> a2 </code>
1232
private double euclidDistance(double[] a1, double[] a2) {
1234
for (int i = 0; i < a1.length; i++) {
1235
dist += (a1[i] - a2[i]) * (a1[i] - a2[i]);
1241
* Calculates the Hamming distances between two arrays, this
1242
* means one counts the number of positions in which these
1243
* two array differ. The arrays should be of the same length.
1245
* @param a1 the first array
1246
* @param a2 the second array
1247
* @return the requested Hamming distance
1248
* The Hamming distance between a1 and a2, this is
1249
* the number of positions in which a1 and a2 differ
1251
private int hammingDistance(double[] a1, double[] a2) {
1253
for (int i = 0; i < a1.length; i++) {
1254
dist += (a1[i] == a2[i]) ? 0 : 1;
1260
* Get an enumeration of all available options for this classifier.
1262
* @return an enumeration of available options
1264
public Enumeration listOptions() {
1265
Vector options = new Vector();
1267
Enumeration enm = super.listOptions();
1268
while (enm.hasMoreElements())
1269
options.addElement(enm.nextElement());
1271
String description =
1272
"\tSets the classification type to be used.\n" +
1273
"\t(Default: " + new SelectedTag(CT_REAL, TAGS_CLASSIFICATIONTYPES) + ")";
1274
String synopsis = "-C " + Tag.toOptionList(TAGS_CLASSIFICATIONTYPES);
1276
options.addElement(new Option(description, name, 1, synopsis));
1279
"\tSets the averaging type used in phase 1 of the classifier.\n" +
1280
"\t(Default: " + new SelectedTag(AT_MEAN, TAGS_AVERAGINGTYPES) + ")";
1281
synopsis = "-A " + Tag.toOptionList(TAGS_AVERAGINGTYPES);
1283
options.addElement(new Option(description, name, 1, synopsis));
1286
"\tIf different from " + new SelectedTag(DT_NONE, TAGS_DISTANCETYPES) + ", a nearest neighbour rule is fired when the\n" +
1287
"\trule base doesn't contain an example smaller than the instance\n" +
1288
"\tto be classified\n" +
1289
"\t(Default: " + new SelectedTag(DT_NONE, TAGS_DISTANCETYPES) + ").";
1290
synopsis = "-N " + Tag.toOptionList(TAGS_DISTANCETYPES);
1292
options.addElement(new Option(description, name, 1, synopsis));
1294
description = "\tSets the extension type, i.e. the rule base to use."
1295
+ "\n\t(Default: " + new SelectedTag(ET_MIN, TAGS_EXTENSIONTYPES) + ")";
1296
synopsis = "-E " + Tag.toOptionList(TAGS_EXTENSIONTYPES);
1298
options.addElement(new Option(description, name, 1, synopsis));
1301
"\tIf set, the instances are also sorted within the same class\n"
1302
+ "\tbefore building the rule bases";
1305
options.addElement(new Option(description, name, 0, synopsis));
1307
return options.elements();
1311
* Parses the options for this object. <p/>
1313
<!-- options-start -->
1314
* Valid options are: <p/>
1316
* <pre> -S <num>
1317
* Random number seed.
1321
* If set, classifier is run in debug mode and
1322
* may output additional info to the console</pre>
1324
* <pre> -C <CL|REG>
1325
* Sets the classification type to be used.
1326
* (Default: REG)</pre>
1328
* <pre> -A <MEAN|MED|MAX>
1329
* Sets the averaging type used in phase 1 of the classifier.
1330
* (Default: MEAN)</pre>
1332
* <pre> -N <NONE|EUCL|HAM>
1333
* If different from NONE, a nearest neighbour rule is fired when the
1334
* rule base doesn't contain an example smaller than the instance
1336
* (Default: NONE).</pre>
1338
* <pre> -E <MIN|MAX|BOTH>
1339
* Sets the extension type, i.e. the rule base to use.
1340
* (Default: MIN)</pre>
1343
* If set, the instances are also sorted within the same class
1344
* before building the rule bases</pre>
1346
<!-- options-end -->
1348
* @param options an array of strings containing the options
1349
* @throws Exception if there are options that have invalid arguments.
1351
public void setOptions(String[] options) throws Exception {
1354
// classification type
1355
args = Utils.getOption('C', options);
1356
if (args.length() != 0)
1357
setClassificationType(new SelectedTag(args, TAGS_CLASSIFICATIONTYPES));
1359
setClassificationType(new SelectedTag(CT_REAL, TAGS_CLASSIFICATIONTYPES));
1362
args = Utils.getOption('A', options);
1363
if (args.length() != 0)
1364
setAveragingType(new SelectedTag(args, TAGS_AVERAGINGTYPES));
1366
setAveragingType(new SelectedTag(AT_MEAN, TAGS_AVERAGINGTYPES));
1369
args = Utils.getOption('N', options);
1370
if (args.length() != 0)
1371
setDistanceType(new SelectedTag(args, TAGS_DISTANCETYPES));
1373
setDistanceType(new SelectedTag(DT_NONE, TAGS_DISTANCETYPES));
1376
args = Utils.getOption('E', options);
1377
if (args.length() != 0)
1378
setExtensionType(new SelectedTag(args, TAGS_EXTENSIONTYPES));
1380
setExtensionType(new SelectedTag(ET_MIN, TAGS_EXTENSIONTYPES));
1383
setSort(Utils.getFlag("sort", options));
1385
super.setOptions(options);
1389
* Gets an array of string with the current options of the classifier.
1391
* @return an array suitable as argument for <code> setOptions </code>
1393
public String[] getOptions() {
1398
result = new Vector();
1400
options = super.getOptions();
1401
for (i = 0; i < options.length; i++)
1402
result.add(options[i]);
1404
// classification type
1406
result.add("" + getClassificationType());
1409
result.add("" + getAveragingType());
1412
result.add("" + getDistanceType());
1415
result.add("" + getExtensionType());
1418
result.add("-sort");
1420
return (String[]) result.toArray(new String[result.size()]);
1424
* Main method for testing this class.
1426
* @param args the command line arguments
1428
public static void main(String[] args) {
1429
runClassifier(new OLM(), args);