2
* This program is free software; you can redistribute it and/or modify
3
* it under the terms of the GNU General Public License as published by
4
* the Free Software Foundation; either version 2 of the License, or
5
* (at your option) any later version.
7
* This program is distributed in the hope that it will be useful,
8
* but WITHOUT ANY WARRANTY; without even the implied warranty of
9
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10
* GNU General Public License for more details.
12
* You should have received a copy of the GNU General Public License
13
* along with this program; if not, write to the Free Software
14
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19
* Copyright (C) 2002 University of Waikato, Hamilton, New Zealand
23
package weka.filters.supervised.instance;
25
import weka.core.Capabilities;
26
import weka.core.Instance;
27
import weka.core.Instances;
28
import weka.core.Option;
29
import weka.core.OptionHandler;
30
import weka.core.Utils;
31
import weka.core.Capabilities.Capability;
32
import weka.filters.Filter;
33
import weka.filters.SupervisedFilter;
35
import java.util.Collections;
36
import java.util.Enumeration;
37
import java.util.Random;
38
import java.util.Vector;
41
<!-- globalinfo-start -->
42
* Produces a random subsample of a dataset using either sampling with replacement or without replacement.<br/>
43
* The original dataset must fit entirely in memory. The number of instances in the generated dataset may be specified. The dataset must have a nominal class attribute. If not, use the unsupervised version. The filter can be made to maintain the class distribution in the subsample, or to bias the class distribution toward a uniform distribution. When used in batch mode (i.e. in the FilteredClassifier), subsequent batches are NOT resampled.
45
<!-- globalinfo-end -->
47
<!-- options-start -->
48
* Valid options are: <p/>
50
* <pre> -S <num>
51
* Specify the random number seed (default 1)</pre>
53
* <pre> -Z <num>
54
* The size of the output dataset, as a percentage of
55
* the input dataset (default 100)</pre>
57
* <pre> -B <num>
58
* Bias factor towards uniform class distribution.
59
* 0 = distribution in input data -- 1 = uniform distribution.
62
* <pre> -no-replacement
63
* Disables replacement of instances
64
* (default: with replacement)</pre>
67
* Inverts the selection - only available with '-no-replacement'.</pre>
71
* @author Len Trigg (len@reeltwo.com)
72
* @author FracPete (fracpete at waikato dot ac dot nz)
73
* @version $Revision: 1.11 $
77
implements SupervisedFilter, OptionHandler {
79
/** for serialization */
80
static final long serialVersionUID = 7079064953548300681L;
82
/** The subsample size, percent of original set, default 100% */
83
protected double m_SampleSizePercent = 100;
85
/** The random number generator seed */
86
protected int m_RandomSeed = 1;
88
/** The degree of bias towards uniform (nominal) class distribution */
89
protected double m_BiasToUniformClass = 0;
91
/** Whether to perform sampling with replacement or without */
92
protected boolean m_NoReplacement = false;
94
/** Whether to invert the selection (only if instances are drawn WITHOUT
96
* @see #m_NoReplacement */
97
protected boolean m_InvertSelection = false;
100
* Returns a string describing this filter
102
* @return a description of the filter suitable for
103
* displaying in the explorer/experimenter gui
105
public String globalInfo() {
107
"Produces a random subsample of a dataset using either sampling "
108
+ "with replacement or without replacement.\n"
109
+ "The original dataset must "
110
+ "fit entirely in memory. The number of instances in the generated "
111
+ "dataset may be specified. The dataset must have a nominal class "
112
+ "attribute. If not, use the unsupervised version. The filter can be "
113
+ "made to maintain the class distribution in the subsample, or to bias "
114
+ "the class distribution toward a uniform distribution. When used in batch "
115
+ "mode (i.e. in the FilteredClassifier), subsequent batches are NOT resampled.";
119
* Returns an enumeration describing the available options.
121
* @return an enumeration of all the available options.
123
public Enumeration listOptions() {
124
Vector result = new Vector();
126
result.addElement(new Option(
127
"\tSpecify the random number seed (default 1)",
128
"S", 1, "-S <num>"));
130
result.addElement(new Option(
131
"\tThe size of the output dataset, as a percentage of\n"
132
+"\tthe input dataset (default 100)",
133
"Z", 1, "-Z <num>"));
135
result.addElement(new Option(
136
"\tBias factor towards uniform class distribution.\n"
137
+"\t0 = distribution in input data -- 1 = uniform distribution.\n"
139
"B", 1, "-B <num>"));
141
result.addElement(new Option(
142
"\tDisables replacement of instances\n"
143
+"\t(default: with replacement)",
144
"no-replacement", 0, "-no-replacement"));
146
result.addElement(new Option(
147
"\tInverts the selection - only available with '-no-replacement'.",
150
return result.elements();
155
* Parses a given list of options. <p/>
157
<!-- options-start -->
158
* Valid options are: <p/>
160
* <pre> -S <num>
161
* Specify the random number seed (default 1)</pre>
163
* <pre> -Z <num>
164
* The size of the output dataset, as a percentage of
165
* the input dataset (default 100)</pre>
167
* <pre> -B <num>
168
* Bias factor towards uniform class distribution.
169
* 0 = distribution in input data -- 1 = uniform distribution.
172
* <pre> -no-replacement
173
* Disables replacement of instances
174
* (default: with replacement)</pre>
177
* Inverts the selection - only available with '-no-replacement'.</pre>
181
* @param options the list of options as an array of strings
182
* @throws Exception if an option is not supported
184
public void setOptions(String[] options) throws Exception {
187
tmpStr = Utils.getOption('S', options);
188
if (tmpStr.length() != 0)
189
setRandomSeed(Integer.parseInt(tmpStr));
193
tmpStr = Utils.getOption('B', options);
194
if (tmpStr.length() != 0)
195
setBiasToUniformClass(Double.parseDouble(tmpStr));
197
setBiasToUniformClass(0);
199
tmpStr = Utils.getOption('Z', options);
200
if (tmpStr.length() != 0)
201
setSampleSizePercent(Double.parseDouble(tmpStr));
203
setSampleSizePercent(100);
205
setNoReplacement(Utils.getFlag("no-replacement", options));
207
if (getNoReplacement())
208
setInvertSelection(Utils.getFlag('V', options));
210
if (getInputFormat() != null) {
211
setInputFormat(getInputFormat());
216
* Gets the current settings of the filter.
218
* @return an array of strings suitable for passing to setOptions
220
public String [] getOptions() {
221
Vector<String> result;
223
result = new Vector<String>();
226
result.add("" + getBiasToUniformClass());
229
result.add("" + getRandomSeed());
232
result.add("" + getSampleSizePercent());
234
if (getNoReplacement()) {
235
result.add("-no-replacement");
236
if (getInvertSelection())
240
return result.toArray(new String[result.size()]);
244
* Returns the tip text for this property
246
* @return tip text for this property suitable for
247
* displaying in the explorer/experimenter gui
249
public String biasToUniformClassTipText() {
250
return "Whether to use bias towards a uniform class. A value of 0 leaves the class "
251
+ "distribution as-is, a value of 1 ensures the class distribution is "
252
+ "uniform in the output data.";
256
* Gets the bias towards a uniform class. A value of 0 leaves the class
257
* distribution as-is, a value of 1 ensures the class distributions are
258
* uniform in the output data.
260
* @return the current bias
262
public double getBiasToUniformClass() {
263
return m_BiasToUniformClass;
267
* Sets the bias towards a uniform class. A value of 0 leaves the class
268
* distribution as-is, a value of 1 ensures the class distributions are
269
* uniform in the output data.
271
* @param newBiasToUniformClass the new bias value, between 0 and 1.
273
public void setBiasToUniformClass(double newBiasToUniformClass) {
274
m_BiasToUniformClass = newBiasToUniformClass;
278
* Returns the tip text for this property
280
* @return tip text for this property suitable for
281
* displaying in the explorer/experimenter gui
283
public String randomSeedTipText() {
284
return "Sets the random number seed for subsampling.";
288
* Gets the random number seed.
290
* @return the random number seed.
292
public int getRandomSeed() {
297
* Sets the random number seed.
299
* @param newSeed the new random number seed.
301
public void setRandomSeed(int newSeed) {
302
m_RandomSeed = newSeed;
306
* Returns the tip text for this property
308
* @return tip text for this property suitable for
309
* displaying in the explorer/experimenter gui
311
public String sampleSizePercentTipText() {
312
return "The subsample size as a percentage of the original set.";
316
* Gets the subsample size as a percentage of the original set.
318
* @return the subsample size
320
public double getSampleSizePercent() {
321
return m_SampleSizePercent;
325
* Sets the size of the subsample, as a percentage of the original set.
327
* @param newSampleSizePercent the subsample set size, between 0 and 100.
329
public void setSampleSizePercent(double newSampleSizePercent) {
330
m_SampleSizePercent = newSampleSizePercent;
334
* Returns the tip text for this property
336
* @return tip text for this property suitable for
337
* displaying in the explorer/experimenter gui
339
public String noReplacementTipText() {
340
return "Disables the replacement of instances.";
344
* Gets whether instances are drawn with or without replacement.
346
* @return true if the replacement is disabled
348
public boolean getNoReplacement() {
349
return m_NoReplacement;
353
* Sets whether instances are drawn with or with out replacement.
355
* @param value if true then the replacement of instances is disabled
357
public void setNoReplacement(boolean value) {
358
m_NoReplacement = value;
362
* Returns the tip text for this property
364
* @return tip text for this property suitable for
365
* displaying in the explorer/experimenter gui
367
public String invertSelectionTipText() {
368
return "Inverts the selection (only if instances are drawn WITHOUT replacement).";
372
* Gets whether selection is inverted (only if instances are drawn WIHTOUT
375
* @return true if the replacement is disabled
376
* @see #m_NoReplacement
378
public boolean getInvertSelection() {
379
return m_InvertSelection;
383
* Sets whether the selection is inverted (only if instances are drawn WIHTOUT
386
* @param value if true then selection is inverted
388
public void setInvertSelection(boolean value) {
389
m_InvertSelection = value;
393
* Returns the Capabilities of this filter.
395
* @return the capabilities of this object
398
public Capabilities getCapabilities() {
399
Capabilities result = super.getCapabilities();
402
result.enableAllAttributes();
403
result.enable(Capability.MISSING_VALUES);
406
result.enable(Capability.NOMINAL_CLASS);
412
* Sets the format of the input instances.
414
* @param instanceInfo an Instances object containing the input
415
* instance structure (any instances contained in the object are
416
* ignored - only the structure is required).
417
* @return true if the outputFormat may be collected immediately
418
* @throws Exception if the input format can't be set
421
public boolean setInputFormat(Instances instanceInfo)
424
super.setInputFormat(instanceInfo);
425
setOutputFormat(instanceInfo);
430
* Input an instance for filtering. Filter requires all
431
* training instances be read before producing output.
433
* @param instance the input instance
434
* @return true if the filtered instance may now be
435
* collected with output().
436
* @throws IllegalStateException if no input structure has been defined
438
public boolean input(Instance instance) {
440
if (getInputFormat() == null) {
441
throw new IllegalStateException("No input instance format defined");
447
if (isFirstBatchDone()) {
451
bufferInput(instance);
457
* Signify that this batch of input to the filter is finished.
458
* If the filter requires all instances prior to filtering,
459
* output() may now be called to retrieve the filtered instances.
461
* @return true if there are instances pending output
462
* @throws IllegalStateException if no input structure has been defined
464
public boolean batchFinished() {
466
if (getInputFormat() == null) {
467
throw new IllegalStateException("No input instance format defined");
470
if (!isFirstBatchDone()) {
471
// Do the subsample, and clear the input instances.
477
m_FirstBatchDone = true;
478
return (numPendingOutput() != 0);
482
* creates the subsample with replacement
484
* @param random the random number generator to use
485
* @param origSize the original size of the dataset
486
* @param sampleSize the size to generate
487
* @param actualClasses the number of classes found in the data
488
* @param classIndices the indices where classes start
490
public void createSubsampleWithReplacement(Random random, int origSize,
491
int sampleSize, int actualClasses, int[] classIndices) {
493
for (int i = 0; i < sampleSize; i++) {
495
if (random.nextDouble() < m_BiasToUniformClass) {
496
// Pick a random class (of those classes that actually appear)
497
int cIndex = random.nextInt(actualClasses);
498
for (int j = 0, k = 0; j < classIndices.length - 1; j++) {
499
if ((classIndices[j] != classIndices[j + 1])
500
&& (k++ >= cIndex)) {
501
// Pick a random instance of the designated class
502
index = classIndices[j]
503
+ random.nextInt(classIndices[j + 1] - classIndices[j]);
509
index = random.nextInt(origSize);
511
push((Instance) getInputFormat().instance(index).copy());
516
* creates the subsample without replacement
518
* @param random the random number generator to use
519
* @param origSize the original size of the dataset
520
* @param sampleSize the size to generate
521
* @param actualClasses the number of classes found in the data
522
* @param classIndices the indices where classes start
524
public void createSubsampleWithoutReplacement(Random random, int origSize,
525
int sampleSize, int actualClasses, int[] classIndices) {
527
if (sampleSize > origSize) {
528
sampleSize = origSize;
530
"Resampling with replacement can only use percentage <=100% - "
531
+ "Using full dataset!");
534
Vector<Integer>[] indices = new Vector[actualClasses];
535
Vector<Integer>[] indicesNew = new Vector[actualClasses];
537
// generate list of all indices to draw from
538
for (int i = 0; i < actualClasses; i++) {
539
indices[i] = new Vector<Integer>(classIndices[i + 1] - classIndices[i]);
540
indicesNew[i] = new Vector<Integer>(indices[i].capacity());
541
for (int n = classIndices[i]; n < classIndices[i + 1]; n++)
546
int currentSize = origSize;
547
for (int i = 0; i < sampleSize; i++) {
549
if (random.nextDouble() < m_BiasToUniformClass) {
550
// Pick a random class (of those classes that actually appear)
551
int cIndex = random.nextInt(actualClasses);
552
for (int j = 0, k = 0; j < classIndices.length - 1; j++) {
553
if ((classIndices[j] != classIndices[j + 1])
554
&& (k++ >= cIndex)) {
555
// Pick a random instance of the designated class
556
index = random.nextInt(indices[j].size());
557
indicesNew[j].add(indices[j].get(index));
558
indices[j].remove(index);
564
index = random.nextInt(currentSize);
565
for (int n = 0; n < actualClasses; n++) {
566
if (index < indices[n].size()) {
567
indicesNew[n].add(indices[n].get(index));
568
indices[n].remove(index);
572
index -= indices[n].size();
580
if (getInvertSelection()) {
581
indicesNew = indices;
584
for (int i = 0; i < indicesNew.length; i++)
585
Collections.sort(indicesNew[i]);
589
for (int i = 0; i < indicesNew.length; i++) {
590
for (int n = 0; n < indicesNew[i].size(); n++)
591
push((Instance) getInputFormat().instance(indicesNew[i].get(n)).copy());
595
for (int i = 0; i < indices.length; i++) {
597
indicesNew[i].clear();
604
* Creates a subsample of the current set of input instances. The output
605
* instances are pushed onto the output queue for collection.
607
protected void createSubsample() {
608
int origSize = getInputFormat().numInstances();
609
int sampleSize = (int) (origSize * m_SampleSizePercent / 100);
611
// Subsample that takes class distribution into consideration
613
// Sort according to class attribute.
614
getInputFormat().sort(getInputFormat().classIndex());
616
// Create an index of where each class value starts
617
int [] classIndices = new int [getInputFormat().numClasses() + 1];
618
int currentClass = 0;
619
classIndices[currentClass] = 0;
620
for (int i = 0; i < getInputFormat().numInstances(); i++) {
621
Instance current = getInputFormat().instance(i);
622
if (current.classIsMissing()) {
623
for (int j = currentClass + 1; j < classIndices.length; j++) {
627
} else if (current.classValue() != currentClass) {
628
for (int j = currentClass + 1; j <= current.classValue(); j++) {
631
currentClass = (int) current.classValue();
634
if (currentClass <= getInputFormat().numClasses()) {
635
for (int j = currentClass + 1; j < classIndices.length; j++) {
636
classIndices[j] = getInputFormat().numInstances();
640
int actualClasses = 0;
641
for (int i = 0; i < classIndices.length - 1; i++) {
642
if (classIndices[i] != classIndices[i + 1]) {
647
// Create the new sample
648
Random random = new Random(m_RandomSeed);
650
// Convert pending input instances
651
if (getNoReplacement())
652
createSubsampleWithoutReplacement(
653
random, origSize, sampleSize, actualClasses, classIndices);
655
createSubsampleWithReplacement(
656
random, origSize, sampleSize, actualClasses, classIndices);
660
* Main method for testing this class.
662
* @param argv should contain arguments to the filter:
665
public static void main(String [] argv) {
666
runFilter(new Resample(), argv);