~ubuntu-branches/ubuntu/precise/weka/precise

/**
 * Performs a Random search in the space of attribute subsets. If no start set is supplied, Random search starts from a random point and reports the best subset found. If a start set is supplied, Random searches randomly for subsets that are as good or better than the start point with the same or fewer attributes. Using RandomSearch in conjunction with a start set containing all attributes equates to the LVF algorithm of Liu and Setiono (ICML-96).
 *
 * For more information see:
 *
 * H. Liu, R. Setiono: A probabilistic approach to feature selection - A filter solution. In: 13th International Conference on Machine Learning, 319-327, 1996.
 *
 * BibTeX:
 * <pre>
 * &#64;inproceedings{Liu1996,
 *    author = {H. Liu and R. Setiono},
 *    booktitle = {13th International Conference on Machine Learning},
 *    pages = {319-327},
 *    title = {A probabilistic approach to feature selection - A filter solution},
 *    year = {1996}
 * }
 * </pre>
 *
 * Valid options are:
 *
 * <pre> -P &lt;start set&gt;
 *  Specify a starting set of attributes.
 *  Eg. 1,3,5-7.
 *  If a start point is supplied,
 *  random search evaluates the start
 *  point and then randomly looks for
 *  subsets that are as good as or better
 *  than the start point with the same
 *  or lower cardinality.</pre>
 *
 * <pre> -F &lt;percent&gt;
 *  Percent of search space to consider.
 *  (default = 25%).</pre>
 *
 * <pre> -V
 *  Output subsets as the search progresses.
 *  (default = false).</pre>
 *
 * @author Mark Hall (mhall@cs.waikato.ac.nz)
 * @version $Revision: 1.17 $
 */

public class RandomSearch

extends ASSearch

implements StartSetHandler, OptionHandler, TechnicalInformationHandler {

/** for serialization */

static final long serialVersionUID = 7479392617377425484L;

/**

100

* holds a starting set as an array of attributes.

101

102

private int[] m_starting;

103

104

/** holds the start set as a range */

105

private Range m_startRange;

106

107

/** the best feature set found during the search */

108

private BitSet m_bestGroup;

109

110

/** the merit of the best subset found */

111

private double m_bestMerit;

112

113

/**

114

* only accept a feature set as being "better" than the best if its

115

* merit is better or equal to the best, and it contains fewer

116

* features than the best (this allows LVF to be implimented).

117

118

private boolean m_onlyConsiderBetterAndSmaller;

119

120

/** does the data have a class */

121

private boolean m_hasClass;

122

123

/** holds the class index */

124

private int m_classIndex;

125

126

/** number of attributes in the data */

127

private int m_numAttribs;

128

129

/** seed for random number generation */

130

private int m_seed;

131

132

/** percentage of the search space to consider */

133

private double m_searchSize;

134

135

/** the number of iterations performed */

136

private int m_iterations;

137

138

/** random number object */

139

private Random m_random;

140

141

/** output new best subsets as the search progresses */

142

private boolean m_verbose;

143

144

/**

145

* Returns a string describing this search method

146

* @return a description of the search suitable for

147

* displaying in the explorer/experimenter gui

148

149

public String globalInfo() {

150

return "RandomSearch : \n\nPerforms a Random search in "

151

+"the space of attribute subsets. If no start set is supplied, Random "

152

+"search starts from a random point and reports the best subset found. "

153

+"If a start set is supplied, Random searches randomly for subsets "

154

+"that are as good or better than the start point with the same or "

155

+"or fewer attributes. Using RandomSearch in conjunction with a start "

156

+"set containing all attributes equates to the LVF algorithm of Liu "

157

+"and Setiono (ICML-96).\n\n"

158

+ "For more information see:\n\n"

159

+ getTechnicalInformation().toString();

160

}

161

162

/**

163

* Returns an instance of a TechnicalInformation object, containing

164

* detailed information about the technical background of this class,

165

* e.g., paper reference or book this class is based on.

166

167

* @return the technical information about this class

168

169

public TechnicalInformation getTechnicalInformation() {

170

TechnicalInformation result;

171

172

result = new TechnicalInformation(Type.INPROCEEDINGS);

173

result.setValue(Field.AUTHOR, "H. Liu and R. Setiono");

174

result.setValue(Field.TITLE, "A probabilistic approach to feature selection - A filter solution");

175

result.setValue(Field.BOOKTITLE, "13th International Conference on Machine Learning");

176

result.setValue(Field.YEAR, "1996");

177

result.setValue(Field.PAGES, "319-327");

178

179

return result;

180

}

181

182

/**

183

* Constructor

184

185

public RandomSearch () {

186

resetOptions();

187

}

188

189

/**

190

* Returns an enumeration describing the available options.

191

* @return an enumeration of all the available options.

192

**/

193

public Enumeration listOptions () {

194

Vector newVector = new Vector(3);

195

196

newVector.addElement(new Option("\tSpecify a starting set of attributes."

197

+ "\n\tEg. 1,3,5-7."

198

+"\n\tIf a start point is supplied,"

199

+"\n\trandom search evaluates the start"

200

+"\n\tpoint and then randomly looks for"

201

+"\n\tsubsets that are as good as or better"

202

+"\n\tthan the start point with the same"

203

+"\n\tor lower cardinality."

204

,"P",1

205

, "-P <start set>"));

206

207

newVector.addElement(new Option("\tPercent of search space to consider."

208

+"\n\t(default = 25%)."

209

, "F", 1

210

, "-F <percent> "));

211

newVector.addElement(new Option("\tOutput subsets as the search progresses."

212

+"\n\t(default = false)."

213

, "V", 0

214

, "-V"));

215

return newVector.elements();

216

}

217

218

/**

219

* Parses a given list of options.

220

221

222

* Valid options are:

223

224

* <pre> -P <start set>

225

* Specify a starting set of attributes.

226

* Eg. 1,3,5-7.

227

* If a start point is supplied,

228

* random search evaluates the start

229

* point and then randomly looks for

230

* subsets that are as good as or better

231

* than the start point with the same

232

* or lower cardinality.</pre>

233

234

* <pre> -F <percent>

235

* Percent of search space to consider.

236

* (default = 25%).</pre>

237

238

* <pre> -V

239

* Output subsets as the search progresses.

240

* (default = false).</pre>

241

242

243

244

* @param options the list of options as an array of strings

245

* @throws Exception if an option is not supported

246

247

**/

248

public void setOptions (String[] options)

249

throws Exception {

250

String optionString;

251

resetOptions();

252

253

optionString = Utils.getOption('P', options);

254

if (optionString.length() != 0) {

255

setStartSet(optionString);

256

}

257

258

optionString = Utils.getOption('F',options);

259

if (optionString.length() != 0) {

260

setSearchPercent((new Double(optionString)).doubleValue());

261

}

262

263

setVerbose(Utils.getFlag('V',options));

264

}

265

266

/**

267

* Gets the current settings of RandomSearch.

268

* @return an array of strings suitable for passing to setOptions()

269

270

public String[] getOptions () {

271

String[] options = new String[5];

272

int current = 0;

273

274

if (m_verbose) {

275

options[current++] = "-V";

276

}

277

278

if (!(getStartSet().equals(""))) {

279

options[current++] = "-P";

280

options[current++] = "" + startSetToString();

281

}

282

283

options[current++] = "-F";

284

options[current++] = "" + getSearchPercent();

285

286

while (current < options.length) {

287

options[current++] = "";

288

}

289

290

return options;

291

}

292

293

/**

294

* Returns the tip text for this property

295

* @return tip text for this property suitable for

296

* displaying in the explorer/experimenter gui

297

298

public String startSetTipText() {

299

return "Set the start point for the search. This is specified as a comma "

300

+"seperated list off attribute indexes starting at 1. It can include "

301

+"ranges. Eg. 1,2,5-9,17. If specified, Random searches for subsets "

302

+"of attributes that are as good as or better than the start set with "

303

+"the same or lower cardinality.";

304

}

305

306

/**

307

* Sets a starting set of attributes for the search. It is the

308

* search method's responsibility to report this start set (if any)

309

* in its toString() method.

310

* @param startSet a string containing a list of attributes (and or ranges),

311

* eg. 1,2,6,10-15. "" indicates no start point.

312

* If a start point is supplied, random search evaluates the

313

* start point and then looks for subsets that are as good as or better

314

* than the start point with the same or lower cardinality.

315

* @throws Exception if start set can't be set.

316

317

public void setStartSet (String startSet) throws Exception {

318

m_startRange.setRanges(startSet);

319

}

320

321

/**

322

* Returns a list of attributes (and or attribute ranges) as a String

323

* @return a list of attributes (and or attribute ranges)

324

325

public String getStartSet () {

326

return m_startRange.getRanges();

327

}

328

329

/**

330

* Returns the tip text for this property

331

* @return tip text for this property suitable for

332

* displaying in the explorer/experimenter gui

333

334

public String verboseTipText() {

335

return "Print progress information. Sends progress info to the terminal "

336

+"as the search progresses.";

337

}

338

339

/**

340

* set whether or not to output new best subsets as the search proceeds

341

* @param v true if output is to be verbose

342

343

public void setVerbose(boolean v) {

344

m_verbose = v;

345

}

346

347

/**

348

* get whether or not output is verbose

349

* @return true if output is set to verbose

350

351

public boolean getVerbose() {

352

return m_verbose;

353

}

354

355

/**

356

* Returns the tip text for this property

357

* @return tip text for this property suitable for

358

* displaying in the explorer/experimenter gui

359

360

public String searchPercentTipText() {

361

return "Percentage of the search space to explore.";

362

}

363

364

/**

365

* set the percentage of the search space to consider

366

* @param p percent of the search space ( 0 < p <= 100)

367

368

public void setSearchPercent(double p) {

369

p = Math.abs(p);

370

if (p == 0) {

371

p = 25;

372

}

373

374

if (p > 100.0) {

375

p = 100;

376

}

377

378

m_searchSize = (p/100.0);

379

}

380

381

/**

382

* get the percentage of the search space to consider

383

* @return the percent of the search space explored

384

385

public double getSearchPercent() {

386

return m_searchSize * 100;

387

}

388

389

/**

390

* converts the array of starting attributes to a string. This is

391

* used by getOptions to return the actual attributes specified

392

* as the starting set. This is better than using m_startRanges.getRanges()

393

* as the same start set can be specified in different ways from the

394

* command line---eg 1,2,3 == 1-3. This is to ensure that stuff that

395

* is stored in a database is comparable.

396

* @return a comma seperated list of individual attribute numbers as a String

397

398

private String startSetToString() {

399

StringBuffer FString = new StringBuffer();

400

boolean didPrint;

401

402

if (m_starting == null) {

403

return getStartSet();

404

}

405

406

for (int i = 0; i < m_starting.length; i++) {

407

didPrint = false;

408

409

if ((m_hasClass == false) ||

410

(m_hasClass == true && i != m_classIndex)) {

411

FString.append((m_starting[i] + 1));

412

didPrint = true;

413

}

414

415

if (i == (m_starting.length - 1)) {

416

FString.append("");

417

}

418

else {

419

if (didPrint) {

420

FString.append(",");

421

}

422

}

423

}

424

425

return FString.toString();

426

}

427

428

/**

429

* prints a description of the search

430

* @return a description of the search as a string

431

432

public String toString() {

433

StringBuffer text = new StringBuffer();

434

435

text.append("\tRandom search.\n\tStart set: ");

436

if (m_starting == null) {

437

text.append("no attributes\n");

438

}

439

else {

440

text.append(startSetToString()+"\n");

441

}

442

text.append("\tNumber of iterations: "+m_iterations+" ("

443

+(m_searchSize * 100.0)+"% of the search space)\n");

444

text.append("\tMerit of best subset found: "

445

+Utils.doubleToString(Math.abs(m_bestMerit),8,3)+"\n");

446

447

return text.toString();

448

}

449

450

/**

451

* Searches the attribute subset space randomly.

452

453

* @param ASEval the attribute evaluator to guide the search

454

* @param data the training instances.

455

* @return an array (not necessarily ordered) of selected attribute indexes

456

* @throws Exception if the search can't be completed

457

458

public int[] search (ASEvaluation ASEval, Instances data)

459

throws Exception {

460

double best_merit;

461

int sizeOfBest = m_numAttribs;

462

BitSet temp;

463

m_bestGroup = new BitSet(m_numAttribs);

464

465

m_onlyConsiderBetterAndSmaller = false;

466

if (!(ASEval instanceof SubsetEvaluator)) {

467

throw new Exception(ASEval.getClass().getName()

468

+ " is not a "

469

+ "Subset evaluator!");

470

}

471

472

m_random = new Random(m_seed);

473

474

if (ASEval instanceof UnsupervisedSubsetEvaluator) {

475

m_hasClass = false;

476

}

477

else {

478

m_hasClass = true;

479

m_classIndex = data.classIndex();

480

}

481

482

SubsetEvaluator ASEvaluator = (SubsetEvaluator)ASEval;

483

m_numAttribs = data.numAttributes();

484

485

m_startRange.setUpper(m_numAttribs-1);

486

if (!(getStartSet().equals(""))) {

487

m_starting = m_startRange.getSelection();

488

}

489

490

// If a starting subset has been supplied, then initialise the bitset

491

if (m_starting != null) {

492

for (int i = 0; i < m_starting.length; i++) {

493

if ((m_starting[i]) != m_classIndex) {

494

m_bestGroup.set(m_starting[i]);

495

}

496

}

497

m_onlyConsiderBetterAndSmaller = true;

498

best_merit = ASEvaluator.evaluateSubset(m_bestGroup);

499

sizeOfBest = countFeatures(m_bestGroup);

500

} else {

501

// do initial random subset

502

m_bestGroup = generateRandomSubset();

503

best_merit = ASEvaluator.evaluateSubset(m_bestGroup);

504

}

505

506

if (m_verbose) {

507

System.out.println("Initial subset ("

508

+Utils.doubleToString(Math.

509

abs(best_merit),8,5)

510

+"): "+printSubset(m_bestGroup));

511

}

512

513

int i;

514

if (m_hasClass) {

515

i = m_numAttribs -1;

516

} else {

517

i = m_numAttribs;

518

}

519

m_iterations = (int)((m_searchSize * Math.pow(2, i)));

520

521

int tempSize;

522

double tempMerit;

523

// main loop

524

for (i=0;i<m_iterations;i++) {

525

temp = generateRandomSubset();

526

if (m_onlyConsiderBetterAndSmaller) {

527

tempSize = countFeatures(temp);

528

if (tempSize <= sizeOfBest) {

529

tempMerit = ASEvaluator.evaluateSubset(temp);

530

if (tempMerit >= best_merit) {

531

sizeOfBest = tempSize;

532

m_bestGroup = temp;

533

best_merit = tempMerit;

534

if (m_verbose) {

535

System.out.print("New best subset ("

536

+Utils.doubleToString(Math.

537

abs(best_merit),8,5)

538

+"): "+printSubset(m_bestGroup) + " :");

539

System.out.println(Utils.

540

doubleToString((((double)i)/

541

((double)m_iterations)*

542

100.0),5,1)

543

+"% done");

544

}

545

}

546

}

547

} else {

548

tempMerit = ASEvaluator.evaluateSubset(temp);

549

if (tempMerit > best_merit) {

550

m_bestGroup = temp;

551

best_merit = tempMerit;

552

if (m_verbose) {

553

System.out.print("New best subset ("

554

+Utils.doubleToString(Math.abs(best_merit),8,5)

555

+"): "+printSubset(m_bestGroup) + " :");

556

System.out.println(Utils.

557

doubleToString((((double)i)/

558

((double)m_iterations)

559

*100.0),5,1)

560

+"% done");

561

}

562

}

563

}

564

}

565

m_bestMerit = best_merit;

566

return attributeList(m_bestGroup);

567

}

568

569

/**

570

* prints a subset as a series of attribute numbers

571

* @param temp the subset to print

572

* @return a subset as a String of attribute numbers

573

574

private String printSubset(BitSet temp) {

575

StringBuffer text = new StringBuffer();

576

577

for (int j=0;j<m_numAttribs;j++) {

578

if (temp.get(j)) {

579

text.append((j+1)+" ");

580

}

581

}

582

return text.toString();

583

}

584

585

/**

586

* converts a BitSet into a list of attribute indexes

587

* @param group the BitSet to convert

588

* @return an array of attribute indexes

589

**/

590

private int[] attributeList (BitSet group) {

591

int count = 0;

592

593

// count how many were selected

594

for (int i = 0; i < m_numAttribs; i++) {

595

if (group.get(i)) {

596

count++;

597

}

598

}

599

600

int[] list = new int[count];

601

count = 0;

602

603

for (int i = 0; i < m_numAttribs; i++) {

604

if (group.get(i)) {

605

list[count++] = i;

606

}

607

}

608

609

return list;

610

}

611

612

/**

613

* generates a random subset

614

* @return a random subset as a BitSet

615

616

private BitSet generateRandomSubset() {

617

BitSet temp = new BitSet(m_numAttribs);

618

double r;

619

620

for (int i=0;i<m_numAttribs;i++) {

621

r = m_random.nextDouble();

622

if (r <= 0.5) {

623

if (m_hasClass && i == m_classIndex) {

624

} else {

625

temp.set(i);

626

}

627

}

628

}

629

return temp;

630

}

631

632

/**

633

* counts the number of features in a subset

634

* @param featureSet the feature set for which to count the features

635

* @return the number of features in the subset

636

637

private int countFeatures(BitSet featureSet) {

638

int count = 0;

639

for (int i=0;i<m_numAttribs;i++) {

640

if (featureSet.get(i)) {

641

count++;

642

}

643

}

644

return count;

645

}

646

647

/**

648

* resets to defaults

649

650

private void resetOptions() {

651

m_starting = null;

652

m_startRange = new Range();

653

m_searchSize = 0.25;

654

m_seed = 1;

655

m_onlyConsiderBetterAndSmaller = false;

656

m_verbose = false;

657

}

658

}

659

Older »