2
* This program is free software; you can redistribute it and/or modify
3
* it under the terms of the GNU General Public License as published by
4
* the Free Software Foundation; either version 2 of the License, or
5
* (at your option) any later version.
7
* This program is distributed in the hope that it will be useful,
8
* but WITHOUT ANY WARRANTY; without even the implied warranty of
9
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10
* GNU General Public License for more details.
12
* You should have received a copy of the GNU General Public License
13
* along with this program; if not, write to the Free Software
14
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19
* Copyright (C) 1999 University of Waikato, Hamilton, New Zealand
25
import weka.core.converters.ArffLoader.ArffReader;
26
import weka.core.converters.ConverterUtils.DataSource;
28
import java.io.FileReader;
29
import java.io.IOException;
30
import java.io.Reader;
31
import java.io.Serializable;
32
import java.util.Enumeration;
33
import java.util.Random;
36
* Class for handling an ordered set of weighted instances. <p>
40
* import weka.core.converters.ConverterUtils.DataSource;
43
* // Read all the instances in the file (ARFF, CSV, XRFF, ...)
44
* DataSource source = new DataSource(filename);
45
* Instances instances = source.getDataSet();
47
* // Make the last attribute be the class
48
* instances.setClassIndex(instances.numAttributes() - 1);
50
* // Print header and instances.
51
* System.out.println("\nDataset:\n");
52
* System.out.println(instances);
57
* All methods that change a set of instances are safe, ie. a change
58
* of a set of instances does not affect any other sets of
59
* instances. All methods that change a dataset's attribute
60
* information clone the dataset before it is changed.
62
* @author Eibe Frank (eibe@cs.waikato.ac.nz)
63
* @author Len Trigg (trigg@cs.waikato.ac.nz)
64
* @author FracPete (fracpete at waikato dot ac dot nz)
65
* @version $Revision: 1.73 $
67
public class Instances
68
implements Serializable {
70
/** for serialization */
71
static final long serialVersionUID = -19412345060742748L;
73
/** The filename extension that should be used for arff files */
74
public final static String FILE_EXTENSION = ".arff";
76
/** The filename extension that should be used for bin. serialized instances files */
77
public final static String SERIALIZED_OBJ_FILE_EXTENSION = ".bsi";
79
/** The keyword used to denote the start of an arff header */
80
public final static String ARFF_RELATION = "@relation";
82
/** The keyword used to denote the start of the arff data section */
83
public final static String ARFF_DATA = "@data";
85
/** The dataset's name. */
86
protected /*@spec_public non_null@*/ String m_RelationName;
88
/** The attribute information. */
89
protected /*@spec_public non_null@*/ FastVector m_Attributes;
90
/* public invariant (\forall int i; 0 <= i && i < m_Attributes.size();
91
m_Attributes.elementAt(i) != null);
95
protected /*@spec_public non_null@*/ FastVector m_Instances;
97
/** The class attribute's index */
98
protected int m_ClassIndex;
99
//@ protected invariant classIndex() == m_ClassIndex;
101
/** The lines read so far in case of incremental loading. Since the
102
* StreamTokenizer will be re-initialized with every instance that is read,
103
* we have to keep track of the number of lines read so far.
104
* @see #readInstance(Reader) */
105
protected int m_Lines = 0;
108
* Reads an ARFF file from a reader, and assigns a weight of
109
* one to each instance. Lets the index of the class
110
* attribute be undefined (negative).
112
* @param reader the reader
113
* @throws IOException if the ARFF file is not read
116
public Instances(/*@non_null@*/Reader reader) throws IOException {
117
ArffReader arff = new ArffReader(reader);
118
Instances dataset = arff.getData();
119
initialize(dataset, dataset.numInstances());
120
dataset.copyInstances(0, this, dataset.numInstances());
125
* Reads the header of an ARFF file from a reader and
126
* reserves space for the given number of instances. Lets
127
* the class index be undefined (negative).
129
* @param reader the reader
130
* @param capacity the capacity
131
* @throws IllegalArgumentException if the header is not read successfully
132
* or the capacity is negative.
133
* @throws IOException if there is a problem with the reader.
134
* @deprecated instead of using this method in conjunction with the
135
* <code>readInstance(Reader)</code> method, one should use the
136
* <code>ArffLoader</code> or <code>DataSource</code> class instead.
137
* @see weka.core.converters.ArffLoader
138
* @see weka.core.converters.ConverterUtils.DataSource
140
//@ requires capacity >= 0;
141
//@ ensures classIndex() == -1;
142
@Deprecated public Instances(/*@non_null@*/Reader reader, int capacity)
145
ArffReader arff = new ArffReader(reader, 0);
146
Instances header = arff.getStructure();
147
initialize(header, capacity);
148
m_Lines = arff.getLineNo();
/**
 * Constructor copying all instances and references to
 * the header information from the given set of instances.
 *
 * @param dataset the set to be copied
 */
public Instances(/*@non_null@*/Instances dataset) {
  this(dataset, dataset.numInstances());

  dataset.copyInstances(0, this, dataset.numInstances());
}
/**
 * Constructor creating an empty set of instances. Copies references
 * to the header information from the given set of instances. Sets
 * the capacity of the set of instances to 0 if it is negative.
 *
 * @param dataset the instances from which the header
 * information is to be taken
 * @param capacity the capacity of the new dataset
 */
public Instances(/*@non_null@*/Instances dataset, int capacity) {
  initialize(dataset, capacity);
}
178
* initializes with the header information of the given dataset and sets
179
* the capacity of the set of instances.
181
* @param dataset the dataset to use as template
182
* @param capacity the number of rows to reserve
184
protected void initialize(Instances dataset, int capacity) {
188
// Strings only have to be "shallow" copied because
189
// they can't be modified.
190
m_ClassIndex = dataset.m_ClassIndex;
191
m_RelationName = dataset.m_RelationName;
192
m_Attributes = dataset.m_Attributes;
193
m_Instances = new FastVector(capacity);
/**
 * Creates a new set of instances by copying a
 * subset of another set.
 *
 * @param source the set of instances from which a subset
 * is to be created
 * @param first the index of the first instance to be copied
 * @param toCopy the number of instances to be copied
 * @throws IllegalArgumentException if first and toCopy are out of range
 */
//@ requires 0 <= first;
//@ requires 0 <= toCopy;
//@ requires first + toCopy <= source.numInstances();
public Instances(/*@non_null@*/Instances source, int first, int toCopy) {
  this(source, toCopy);

  // Written as a subtraction so that first + toCopy cannot overflow;
  // a negative toCopy is rejected as the documented contract requires.
  if ((first < 0) || (toCopy < 0)
      || (first > source.numInstances() - toCopy)) {
    throw new IllegalArgumentException("Parameters first and/or toCopy out "+
                                       "of range");
  }
  source.copyInstances(first, this, toCopy);
}
/**
 * Creates an empty set of instances. Uses the given
 * attribute information. Sets the capacity of the set of
 * instances to 0 if it is negative. Given attribute information
 * must not be changed after this constructor has been used.
 * Note that the index of every attribute in <code>attInfo</code> is
 * overwritten with its position in the vector.
 *
 * @param name the name of the relation
 * @param attInfo the attribute information
 * @param capacity the capacity of the set
 */
public Instances(/*@non_null@*/String name,
                 /*@non_null@*/FastVector attInfo, int capacity) {

  m_RelationName = name;
  m_ClassIndex = -1;
  m_Attributes = attInfo;
  for (int i = 0; i < numAttributes(); i++) {
    attribute(i).setIndex(i);
  }
  // honour the documented contract: a negative capacity means "empty"
  m_Instances = new FastVector(Math.max(capacity, 0));
}
243
* Create a copy of the structure, but "cleanse" string types (i.e.
244
* doesn't contain references to the strings seen in the past).
245
* Also cleanses all relational attributes.
247
* @return a copy of the instance structure.
249
public Instances stringFreeStructure() {
251
FastVector atts = (FastVector)m_Attributes.copy();
252
for (int i = 0 ; i < atts.size(); i++) {
253
Attribute att = (Attribute)atts.elementAt(i);
254
if (att.type() == Attribute.STRING) {
255
atts.setElementAt(new Attribute(att.name(), (FastVector)null), i);
256
} else if (att.type() == Attribute.RELATIONAL) {
257
atts.setElementAt(new Attribute(att.name(), new Instances(att.relation(), 0)), i);
260
Instances result = new Instances(relationName(), atts, 0);
261
result.m_ClassIndex = m_ClassIndex;
266
* Adds one instance to the end of the set.
267
* Shallow copies instance before it is added. Increases the
268
* size of the dataset if it is not large enough. Does not
269
* check if the instance is compatible with the dataset.
270
* Note: String or relational values are not transferred.
272
* @param instance the instance to be added
274
public void add(/*@non_null@*/ Instance instance) {
276
Instance newInstance = (Instance)instance.copy();
278
newInstance.setDataset(this);
279
m_Instances.addElement(newInstance);
283
* Returns an attribute.
285
* @param index the attribute's index (index starts with 0)
286
* @return the attribute at the given position
288
//@ requires 0 <= index;
289
//@ requires index < m_Attributes.size();
290
//@ ensures \result != null;
291
public /*@pure@*/ Attribute attribute(int index) {
293
return (Attribute) m_Attributes.elementAt(index);
297
* Returns an attribute given its name. If there is more than
298
* one attribute with the same name, it returns the first one.
299
* Returns null if the attribute can't be found.
301
* @param name the attribute's name
302
* @return the attribute with the given name, null if the
303
* attribute can't be found
305
public /*@pure@*/ Attribute attribute(String name) {
307
for (int i = 0; i < numAttributes(); i++) {
308
if (attribute(i).name().equals(name)) {
316
* Checks for attributes of the given type in the dataset
318
* @param attType the attribute type to look for
319
* @return true if attributes of the given type are present
321
public boolean checkForAttributeType(int attType) {
325
while (i < m_Attributes.size()) {
326
if (attribute(i++).type() == attType) {
334
* Checks for string attributes in the dataset
336
* @return true if string attributes are present, false otherwise
338
public /*@pure@*/ boolean checkForStringAttributes() {
339
return checkForAttributeType(Attribute.STRING);
343
* Checks if the given instance is compatible
344
* with this dataset. Only looks at the size of
345
* the instance and the ranges of the values for
346
* nominal and string attributes.
348
* @param instance the instance to check
349
* @return true if the instance is compatible with the dataset
351
public /*@pure@*/ boolean checkInstance(Instance instance) {
353
if (instance.numAttributes() != numAttributes()) {
356
for (int i = 0; i < numAttributes(); i++) {
357
if (instance.isMissing(i)) {
359
} else if (attribute(i).isNominal() ||
360
attribute(i).isString()) {
361
if (!(Utils.eq(instance.value(i),
362
(double)(int)instance.value(i)))) {
364
} else if (Utils.sm(instance.value(i), 0) ||
365
Utils.gr(instance.value(i),
366
attribute(i).numValues())) {
375
* Returns the class attribute.
377
* @return the class attribute
378
* @throws UnassignedClassException if the class is not set
380
//@ requires classIndex() >= 0;
381
public /*@pure@*/ Attribute classAttribute() {
383
if (m_ClassIndex < 0) {
384
throw new UnassignedClassException("Class index is negative (not set)!");
386
return attribute(m_ClassIndex);
390
* Returns the class attribute's index. Returns negative number
393
* @return the class index as an integer
395
// ensures \result == m_ClassIndex;
396
public /*@pure@*/ int classIndex() {
402
* Compactifies the set of instances. Decreases the capacity of
403
* the set so that it matches the number of instances in the set.
405
public void compactify() {
407
m_Instances.trimToSize();
411
* Removes all instances from the set.
413
public void delete() {
415
m_Instances = new FastVector();
419
* Removes an instance at the given position from the set.
421
* @param index the instance's position (index starts with 0)
423
//@ requires 0 <= index && index < numInstances();
424
public void delete(int index) {
426
m_Instances.removeElementAt(index);
430
* Deletes an attribute at the given position
431
* (0 to numAttributes() - 1). A deep copy of the attribute
432
* information is performed before the attribute is deleted.
434
* @param position the attribute's position (position starts with 0)
435
* @throws IllegalArgumentException if the given index is out of range
436
* or the class attribute is being deleted
438
//@ requires 0 <= position && position < numAttributes();
439
//@ requires position != classIndex();
440
public void deleteAttributeAt(int position) {
442
if ((position < 0) || (position >= m_Attributes.size())) {
443
throw new IllegalArgumentException("Index out of range");
445
if (position == m_ClassIndex) {
446
throw new IllegalArgumentException("Can't delete class attribute");
448
freshAttributeInfo();
449
if (m_ClassIndex > position) {
452
m_Attributes.removeElementAt(position);
453
for (int i = position; i < m_Attributes.size(); i++) {
454
Attribute current = (Attribute)m_Attributes.elementAt(i);
455
current.setIndex(current.index() - 1);
457
for (int i = 0; i < numInstances(); i++) {
458
instance(i).forceDeleteAttributeAt(position);
463
* Deletes all attributes of the given type in the dataset. A deep copy of
464
* the attribute information is performed before an attribute is deleted.
466
* @param attType the attribute type to delete
467
* @throws IllegalArgumentException if attribute couldn't be
468
* successfully deleted (probably because it is the class attribute).
470
public void deleteAttributeType(int attType) {
472
while (i < m_Attributes.size()) {
473
if (attribute(i).type() == attType) {
474
deleteAttributeAt(i);
482
* Deletes all string attributes in the dataset. A deep copy of the attribute
483
* information is performed before an attribute is deleted.
485
* @throws IllegalArgumentException if string attribute couldn't be
486
* successfully deleted (probably because it is the class attribute).
487
* @see #deleteAttributeType(int)
489
public void deleteStringAttributes() {
490
deleteAttributeType(Attribute.STRING);
494
* Removes all instances with missing values for a particular
495
* attribute from the dataset.
497
* @param attIndex the attribute's index (index starts with 0)
499
//@ requires 0 <= attIndex && attIndex < numAttributes();
500
public void deleteWithMissing(int attIndex) {
502
FastVector newInstances = new FastVector(numInstances());
504
for (int i = 0; i < numInstances(); i++) {
505
if (!instance(i).isMissing(attIndex)) {
506
newInstances.addElement(instance(i));
509
m_Instances = newInstances;
513
* Removes all instances with missing values for a particular
514
* attribute from the dataset.
516
* @param att the attribute
518
public void deleteWithMissing(/*@non_null@*/ Attribute att) {
520
deleteWithMissing(att.index());
524
* Removes all instances with a missing class value
527
* @throws UnassignedClassException if class is not set
529
public void deleteWithMissingClass() {
531
if (m_ClassIndex < 0) {
532
throw new UnassignedClassException("Class index is negative (not set)!");
534
deleteWithMissing(m_ClassIndex);
538
* Returns an enumeration of all the attributes.
540
* @return enumeration of all the attributes.
542
public /*@non_null pure@*/ Enumeration enumerateAttributes() {
544
return m_Attributes.elements(m_ClassIndex);
548
* Returns an enumeration of all instances in the dataset.
550
* @return enumeration of all instances in the dataset
552
public /*@non_null pure@*/ Enumeration enumerateInstances() {
554
return m_Instances.elements();
558
* Checks if two headers are equivalent.
560
* @param dataset another dataset
561
* @return true if the header of the given dataset is equivalent
564
public /*@pure@*/ boolean equalHeaders(Instances dataset){
566
// Check class and all attributes
567
if (m_ClassIndex != dataset.m_ClassIndex) {
570
if (m_Attributes.size() != dataset.m_Attributes.size()) {
573
for (int i = 0; i < m_Attributes.size(); i++) {
574
if (!(attribute(i).equals(dataset.attribute(i)))) {
582
* Returns the first instance in the set.
584
* @return the first instance in the set
586
//@ requires numInstances() > 0;
587
public /*@non_null pure@*/ Instance firstInstance() {
589
return (Instance)m_Instances.firstElement();
593
* Returns a random number generator. The initial seed of the random
594
* number generator depends on the given seed and the hash code of
595
* a string representation of a instances chosen based on the given
598
* @param seed the given seed
599
* @return the random number generator
601
public Random getRandomNumberGenerator(long seed) {
603
Random r = new Random(seed);
604
r.setSeed(instance(r.nextInt(numInstances())).toString().hashCode() + seed);
609
* Inserts an attribute at the given position (0 to
610
* numAttributes()) and sets all values to be missing.
611
* Shallow copies the attribute before it is inserted, and performs
612
* a deep copy of the existing attribute information.
614
* @param att the attribute to be inserted
615
* @param position the attribute's position (position starts with 0)
616
* @throws IllegalArgumentException if the given index is out of range
618
//@ requires 0 <= position;
619
//@ requires position <= numAttributes();
620
public void insertAttributeAt(/*@non_null@*/ Attribute att, int position) {
622
if ((position < 0) ||
623
(position > m_Attributes.size())) {
624
throw new IllegalArgumentException("Index out of range");
626
att = (Attribute)att.copy();
627
freshAttributeInfo();
628
att.setIndex(position);
629
m_Attributes.insertElementAt(att, position);
630
for (int i = position + 1; i < m_Attributes.size(); i++) {
631
Attribute current = (Attribute)m_Attributes.elementAt(i);
632
current.setIndex(current.index() + 1);
634
for (int i = 0; i < numInstances(); i++) {
635
instance(i).forceInsertAttributeAt(position);
637
if (m_ClassIndex >= position) {
643
* Returns the instance at the given position.
645
* @param index the instance's index (index starts with 0)
646
* @return the instance at the given position
648
//@ requires 0 <= index;
649
//@ requires index < numInstances();
650
public /*@non_null pure@*/ Instance instance(int index) {
652
return (Instance)m_Instances.elementAt(index);
656
* Returns the kth-smallest attribute value of a numeric attribute.
657
* Note that calling this method will change the order of the data!
659
* @param att the Attribute object
660
* @param k the value of k
661
* @return the kth-smallest value
663
public double kthSmallestValue(Attribute att, int k) {
665
return kthSmallestValue(att.index(), k);
669
* Returns the kth-smallest attribute value of a numeric attribute.
670
* Note that calling this method will change the order of the data!
671
* The number of non-missing values in the data must be as least
672
* as last as k for this to work.
674
* @param attIndex the attribute's index
675
* @param k the value of k
676
* @return the kth-smallest value
678
public double kthSmallestValue(int attIndex, int k) {
680
if (!attribute(attIndex).isNumeric()) {
681
throw new IllegalArgumentException("Instances: attribute must be numeric to compute kth-smallest value.");
686
// move all instances with missing values to end
687
j = numInstances() - 1;
690
if (instance(j).isMissing(attIndex)) {
693
if (instance(i).isMissing(attIndex)) {
701
if ((k < 1) || (k > j+1)) {
702
throw new IllegalArgumentException("Instances: value for k for computing kth-smallest value too large.");
705
return instance(select(attIndex, 0, j, k)).value(attIndex);
709
* Returns the last instance in the set.
711
* @return the last instance in the set
713
//@ requires numInstances() > 0;
714
public /*@non_null pure@*/ Instance lastInstance() {
716
return (Instance)m_Instances.lastElement();
720
* Returns the mean (mode) for a numeric (nominal) attribute as
721
* a floating-point value. Returns 0 if the attribute is neither nominal nor
722
* numeric. If all values are missing it returns zero.
724
* @param attIndex the attribute's index (index starts with 0)
725
* @return the mean or the mode
727
public /*@pure@*/ double meanOrMode(int attIndex) {
729
double result, found;
732
if (attribute(attIndex).isNumeric()) {
734
for (int j = 0; j < numInstances(); j++) {
735
if (!instance(j).isMissing(attIndex)) {
736
found += instance(j).weight();
737
result += instance(j).weight()*instance(j).value(attIndex);
743
return result / found;
745
} else if (attribute(attIndex).isNominal()) {
746
counts = new int[attribute(attIndex).numValues()];
747
for (int j = 0; j < numInstances(); j++) {
748
if (!instance(j).isMissing(attIndex)) {
749
counts[(int) instance(j).value(attIndex)] += instance(j).weight();
752
return (double)Utils.maxIndex(counts);
759
* Returns the mean (mode) for a numeric (nominal) attribute as a
760
* floating-point value. Returns 0 if the attribute is neither
761
* nominal nor numeric. If all values are missing it returns zero.
763
* @param att the attribute
764
* @return the mean or the mode
766
public /*@pure@*/ double meanOrMode(Attribute att) {
768
return meanOrMode(att.index());
772
* Returns the number of attributes.
774
* @return the number of attributes as an integer
776
//@ ensures \result == m_Attributes.size();
777
public /*@pure@*/ int numAttributes() {
779
return m_Attributes.size();
783
* Returns the number of class labels.
785
* @return the number of class labels as an integer if the class
786
* attribute is nominal, 1 otherwise.
787
* @throws UnassignedClassException if the class is not set
789
//@ requires classIndex() >= 0;
790
public /*@pure@*/ int numClasses() {
792
if (m_ClassIndex < 0) {
793
throw new UnassignedClassException("Class index is negative (not set)!");
795
if (!classAttribute().isNominal()) {
798
return classAttribute().numValues();
803
* Returns the number of distinct values of a given attribute.
804
* Returns the number of instances if the attribute is a
805
* string attribute. The value 'missing' is not counted.
807
* @param attIndex the attribute (index starts with 0)
808
* @return the number of distinct values of a given attribute
810
//@ requires 0 <= attIndex;
811
//@ requires attIndex < numAttributes();
812
public /*@pure@*/ int numDistinctValues(int attIndex) {
814
if (attribute(attIndex).isNumeric()) {
815
double [] attVals = attributeToDoubleArray(attIndex);
816
int [] sorted = Utils.sort(attVals);
819
for (int i = 0; i < sorted.length; i++) {
820
Instance current = instance(sorted[i]);
821
if (current.isMissing(attIndex)) {
825
(current.value(attIndex) > prev)) {
826
prev = current.value(attIndex);
832
return attribute(attIndex).numValues();
837
* Returns the number of distinct values of a given attribute.
838
* Returns the number of instances if the attribute is a
839
* string attribute. The value 'missing' is not counted.
841
* @param att the attribute
842
* @return the number of distinct values of a given attribute
844
public /*@pure@*/ int numDistinctValues(/*@non_null@*/Attribute att) {
846
return numDistinctValues(att.index());
850
* Returns the number of instances in the dataset.
852
* @return the number of instances in the dataset as an integer
854
//@ ensures \result == m_Instances.size();
855
public /*@pure@*/ int numInstances() {
857
return m_Instances.size();
861
* Shuffles the instances in the set so that they are ordered
864
* @param random a random number generator
866
public void randomize(Random random) {
868
for (int j = numInstances() - 1; j > 0; j--)
869
swap(j, random.nextInt(j+1));
873
* Reads a single instance from the reader and appends it
874
* to the dataset. Automatically expands the dataset if it
875
* is not large enough to hold the instance. This method does
876
* not check for carriage return at the end of the line.
878
* @param reader the reader
879
* @return false if end of file has been reached
880
* @throws IOException if the information is not read
882
* @deprecated instead of using this method in conjunction with the
883
* <code>readInstance(Reader)</code> method, one should use the
884
* <code>ArffLoader</code> or <code>DataSource</code> class instead.
885
* @see weka.core.converters.ArffLoader
886
* @see weka.core.converters.ConverterUtils.DataSource
888
@Deprecated public boolean readInstance(Reader reader) throws IOException {
890
ArffReader arff = new ArffReader(reader, this, m_Lines, 1);
891
Instance inst = arff.readInstance(arff.getData(), false);
892
m_Lines = arff.getLineNo();
903
* Returns the relation's name.
905
* @return the relation's name as a string
907
//@ ensures \result == m_RelationName;
908
public /*@pure@*/ String relationName() {
910
return m_RelationName;
914
* Renames an attribute. This change only affects this
917
* @param att the attribute's index (index starts with 0)
918
* @param name the new name
920
public void renameAttribute(int att, String name) {
922
Attribute newAtt = attribute(att).copy(name);
923
FastVector newVec = new FastVector(numAttributes());
925
for (int i = 0; i < numAttributes(); i++) {
927
newVec.addElement(newAtt);
929
newVec.addElement(attribute(i));
932
m_Attributes = newVec;
936
* Renames an attribute. This change only affects this
939
* @param att the attribute
940
* @param name the new name
942
public void renameAttribute(Attribute att, String name) {
944
renameAttribute(att.index(), name);
948
* Renames the value of a nominal (or string) attribute value. This
949
* change only affects this dataset.
951
* @param att the attribute's index (index starts with 0)
952
* @param val the value's index (index starts with 0)
953
* @param name the new name
955
public void renameAttributeValue(int att, int val, String name) {
957
Attribute newAtt = (Attribute)attribute(att).copy();
958
FastVector newVec = new FastVector(numAttributes());
960
newAtt.setValue(val, name);
961
for (int i = 0; i < numAttributes(); i++) {
963
newVec.addElement(newAtt);
965
newVec.addElement(attribute(i));
968
m_Attributes = newVec;
972
* Renames the value of a nominal (or string) attribute value. This
973
* change only affects this dataset.
975
* @param att the attribute
976
* @param val the value
977
* @param name the new name
979
public void renameAttributeValue(Attribute att, String val,
982
int v = att.indexOfValue(val);
983
if (v == -1) throw new IllegalArgumentException(val + " not found");
984
renameAttributeValue(att.index(), v, name);
988
* Creates a new dataset of the same size using random sampling
991
* @param random a random number generator
992
* @return the new dataset
994
public Instances resample(Random random) {
996
Instances newData = new Instances(this, numInstances());
997
while (newData.numInstances() < numInstances()) {
998
newData.add(instance(random.nextInt(numInstances())));
1004
* Creates a new dataset of the same size using random sampling
1005
* with replacement according to the current instance weights. The
1006
* weights of the instances in the new dataset are set to one.
1008
* @param random a random number generator
1009
* @return the new dataset
1011
public Instances resampleWithWeights(Random random) {
1013
double [] weights = new double[numInstances()];
1014
for (int i = 0; i < weights.length; i++) {
1015
weights[i] = instance(i).weight();
1017
return resampleWithWeights(random, weights);
1022
* Creates a new dataset of the same size using random sampling
1023
* with replacement according to the given weight vector. The
1024
* weights of the instances in the new dataset are set to one.
1025
* The length of the weight vector has to be the same as the
1026
* number of instances in the dataset, and all weights have to
1029
* @param random a random number generator
1030
* @param weights the weight vector
1031
* @return the new dataset
1032
* @throws IllegalArgumentException if the weights array is of the wrong
1033
* length or contains negative weights.
1035
public Instances resampleWithWeights(Random random,
1038
if (weights.length != numInstances()) {
1039
throw new IllegalArgumentException("weights.length != numInstances.");
1041
Instances newData = new Instances(this, numInstances());
1042
if (numInstances() == 0) {
1045
double[] probabilities = new double[numInstances()];
1046
double sumProbs = 0, sumOfWeights = Utils.sum(weights);
1047
for (int i = 0; i < numInstances(); i++) {
1048
sumProbs += random.nextDouble();
1049
probabilities[i] = sumProbs;
1051
Utils.normalize(probabilities, sumProbs / sumOfWeights);
1053
// Make sure that rounding errors don't mess things up
1054
probabilities[numInstances() - 1] = sumOfWeights;
1055
int k = 0; int l = 0;
1057
while ((k < numInstances() && (l < numInstances()))) {
1058
if (weights[l] < 0) {
1059
throw new IllegalArgumentException("Weights have to be positive.");
1061
sumProbs += weights[l];
1062
while ((k < numInstances()) &&
1063
(probabilities[k] <= sumProbs)) {
1064
newData.add(instance(l));
1065
newData.instance(k).setWeight(1);
1074
* Sets the class attribute.
1076
* @param att attribute to be the class
1078
public void setClass(Attribute att) {
1080
m_ClassIndex = att.index();
1084
* Sets the class index of the set.
1085
* If the class index is negative there is assumed to be no class.
1086
* (ie. it is undefined)
1088
* @param classIndex the new class index (index starts with 0)
1089
* @throws IllegalArgumentException if the class index is too big or < 0
1091
public void setClassIndex(int classIndex) {
1093
if (classIndex >= numAttributes()) {
1094
throw new IllegalArgumentException("Invalid class index: " + classIndex);
1096
m_ClassIndex = classIndex;
1100
* Sets the relation's name.
1102
* @param newName the new relation name.
1104
public void setRelationName(/*@non_null@*/String newName) {
1106
m_RelationName = newName;
1110
* Sorts the instances based on an attribute. For numeric attributes,
1111
* instances are sorted in ascending order. For nominal attributes,
1112
* instances are sorted based on the attribute label ordering
1113
* specified in the header. Instances with missing values for the
1114
* attribute are placed at the end of the dataset.
1116
* @param attIndex the attribute's index (index starts with 0)
1118
public void sort(int attIndex) {
1122
// move all instances with missing values to end
1123
j = numInstances() - 1;
1126
if (instance(j).isMissing(attIndex)) {
1129
if (instance(i).isMissing(attIndex)) {
1136
quickSort(attIndex, 0, j);
1140
* Sorts the instances based on an attribute. For numeric attributes,
1141
* instances are sorted into ascending order. For nominal attributes,
1142
* instances are sorted based on the attribute label ordering
1143
* specified in the header. Instances with missing values for the
1144
* attribute are placed at the end of the dataset.
1146
* @param att the attribute
1148
public void sort(Attribute att) {
1154
* Stratifies a set of instances according to its class values
1155
* if the class attribute is nominal (so that afterwards a
1156
* stratified cross-validation can be performed).
1158
* @param numFolds the number of folds in the cross-validation
1159
* @throws UnassignedClassException if the class is not set
1161
public void stratify(int numFolds) {
1163
if (numFolds <= 0) {
1164
throw new IllegalArgumentException("Number of folds must be greater than 1");
1166
if (m_ClassIndex < 0) {
1167
throw new UnassignedClassException("Class index is negative (not set)!");
1169
if (classAttribute().isNominal()) {
1173
while (index < numInstances()) {
1174
Instance instance1 = instance(index - 1);
1175
for (int j = index; j < numInstances(); j++) {
1176
Instance instance2 = instance(j);
1177
if ((instance1.classValue() == instance2.classValue()) ||
1178
(instance1.classIsMissing() &&
1179
instance2.classIsMissing())) {
1186
stratStep(numFolds);
1191
* Computes the sum of all the instances' weights.
1193
* @return the sum of all the instances' weights as a double
1195
public /*@pure@*/ double sumOfWeights() {
1199
for (int i = 0; i < numInstances(); i++) {
1200
sum += instance(i).weight();
1206
* Creates the test set for one fold of a cross-validation on
1209
* @param numFolds the number of folds in the cross-validation. Must
1210
* be greater than 1.
1211
* @param numFold 0 for the first fold, 1 for the second, ...
1212
* @return the test set as a set of weighted instances
1213
* @throws IllegalArgumentException if the number of folds is less than 2
1214
* or greater than the number of instances.
1216
//@ requires 2 <= numFolds && numFolds < numInstances();
1217
//@ requires 0 <= numFold && numFold < numFolds;
1218
public Instances testCV(int numFolds, int numFold) {
1220
int numInstForFold, first, offset;
1224
throw new IllegalArgumentException("Number of folds must be at least 2!");
1226
if (numFolds > numInstances()) {
1227
throw new IllegalArgumentException("Can't have more folds than instances!");
1229
numInstForFold = numInstances() / numFolds;
1230
if (numFold < numInstances() % numFolds){
1234
offset = numInstances() % numFolds;
1235
test = new Instances(this, numInstForFold);
1236
first = numFold * (numInstances() / numFolds) + offset;
1237
copyInstances(first, test, numInstForFold);
1242
* Returns the dataset as a string in ARFF format. Strings
1243
* are quoted if they contain whitespace characters, or if they
1244
* are a question mark.
1246
* @return the dataset in ARFF format as a string
1248
public String toString() {
1250
StringBuffer text = new StringBuffer();
1252
text.append(ARFF_RELATION).append(" ").
1253
append(Utils.quote(m_RelationName)).append("\n\n");
1254
for (int i = 0; i < numAttributes(); i++) {
1255
text.append(attribute(i)).append("\n");
1257
text.append("\n").append(ARFF_DATA).append("\n");
1259
text.append(stringWithoutHeader());
1260
return text.toString();
1264
* Returns the instances in the dataset as a string in ARFF format. Strings
1265
* are quoted if they contain whitespace characters, or if they
1266
* are a question mark.
1268
* @return the dataset in ARFF format as a string
1270
protected String stringWithoutHeader() {
1272
StringBuffer text = new StringBuffer();
1274
for (int i = 0; i < numInstances(); i++) {
1275
text.append(instance(i));
1276
if (i < numInstances() - 1) {
1280
return text.toString();
1284
* Creates the training set for one fold of a cross-validation
1287
* @param numFolds the number of folds in the cross-validation. Must
1288
* be greater than 1.
1289
* @param numFold 0 for the first fold, 1 for the second, ...
1290
* @return the training set
1291
* @throws IllegalArgumentException if the number of folds is less than 2
1292
* or greater than the number of instances.
1294
//@ requires 2 <= numFolds && numFolds < numInstances();
1295
//@ requires 0 <= numFold && numFold < numFolds;
1296
public Instances trainCV(int numFolds, int numFold) {
1298
int numInstForFold, first, offset;
1302
throw new IllegalArgumentException("Number of folds must be at least 2!");
1304
if (numFolds > numInstances()) {
1305
throw new IllegalArgumentException("Can't have more folds than instances!");
1307
numInstForFold = numInstances() / numFolds;
1308
if (numFold < numInstances() % numFolds) {
1312
offset = numInstances() % numFolds;
1313
train = new Instances(this, numInstances() - numInstForFold);
1314
first = numFold * (numInstances() / numFolds) + offset;
1315
copyInstances(0, train, first);
1316
copyInstances(first + numInstForFold, train,
1317
numInstances() - first - numInstForFold);
1323
* Creates the training set for one fold of a cross-validation
1324
* on the dataset. The data is subsequently randomized based
1325
* on the given random number generator.
1327
* @param numFolds the number of folds in the cross-validation. Must
1328
* be greater than 1.
1329
* @param numFold 0 for the first fold, 1 for the second, ...
1330
* @param random the random number generator
1331
* @return the training set
1332
* @throws IllegalArgumentException if the number of folds is less than 2
1333
* or greater than the number of instances.
1335
//@ requires 2 <= numFolds && numFolds < numInstances();
1336
//@ requires 0 <= numFold && numFold < numFolds;
1337
public Instances trainCV(int numFolds, int numFold, Random random) {
1339
Instances train = trainCV(numFolds, numFold);
1340
train.randomize(random);
1345
* Computes the variance for a numeric attribute.
1347
* @param attIndex the numeric attribute (index starts with 0)
1348
* @return the variance if the attribute is numeric
1349
* @throws IllegalArgumentException if the attribute is not numeric
1351
public /*@pure@*/ double variance(int attIndex) {
1353
double sum = 0, sumSquared = 0, sumOfWeights = 0;
1355
if (!attribute(attIndex).isNumeric()) {
1356
throw new IllegalArgumentException("Can't compute variance because attribute is " +
1359
for (int i = 0; i < numInstances(); i++) {
1360
if (!instance(i).isMissing(attIndex)) {
1361
sum += instance(i).weight() *
1362
instance(i).value(attIndex);
1363
sumSquared += instance(i).weight() *
1364
instance(i).value(attIndex) *
1365
instance(i).value(attIndex);
1366
sumOfWeights += instance(i).weight();
1369
if (sumOfWeights <= 1) {
1372
double result = (sumSquared - (sum * sum / sumOfWeights)) /
1375
// We don't like negative variance
1384
* Computes the variance for a numeric attribute.
1386
* @param att the numeric attribute
1387
* @return the variance if the attribute is numeric
1388
* @throws IllegalArgumentException if the attribute is not numeric
1390
public /*@pure@*/ double variance(Attribute att) {
1392
return variance(att.index());
1396
* Calculates summary statistics on the values that appear in this
1397
* set of instances for a specified attribute.
1399
* @param index the index of the attribute to summarize (index starts with 0)
1400
* @return an AttributeStats object with it's fields calculated.
1402
//@ requires 0 <= index && index < numAttributes();
1403
public AttributeStats attributeStats(int index) {
1405
AttributeStats result = new AttributeStats();
1406
if (attribute(index).isNominal()) {
1407
result.nominalCounts = new int [attribute(index).numValues()];
1409
if (attribute(index).isNumeric()) {
1410
result.numericStats = new weka.experiment.Stats();
1412
result.totalCount = numInstances();
1414
double [] attVals = attributeToDoubleArray(index);
1415
int [] sorted = Utils.sort(attVals);
1416
int currentCount = 0;
1417
double prev = Instance.missingValue();
1418
for (int j = 0; j < numInstances(); j++) {
1419
Instance current = instance(sorted[j]);
1420
if (current.isMissing(index)) {
1421
result.missingCount = numInstances() - j;
1424
if (current.value(index) == prev) {
1427
result.addDistinct(prev, currentCount);
1429
prev = current.value(index);
1432
result.addDistinct(prev, currentCount);
1433
result.distinctCount--; // So we don't count "missing" as a value
1438
* Gets the value of all instances in this dataset for a particular
1439
* attribute. Useful in conjunction with Utils.sort to allow iterating
1440
* through the dataset in sorted order for some attribute.
1442
* @param index the index of the attribute.
1443
* @return an array containing the value of the desired attribute for
1444
* each instance in the dataset.
1446
//@ requires 0 <= index && index < numAttributes();
1447
public /*@pure@*/ double [] attributeToDoubleArray(int index) {
1449
double [] result = new double[numInstances()];
1450
for (int i = 0; i < result.length; i++) {
1451
result[i] = instance(i).value(index);
1457
* Generates a string summarizing the set of instances. Gives a breakdown
1458
* for each attribute indicating the number of missing/discrete/unique
1459
* values and other information.
1461
* @return a string summarizing the dataset
1463
public String toSummaryString() {
1465
StringBuffer result = new StringBuffer();
1466
result.append("Relation Name: ").append(relationName()).append('\n');
1467
result.append("Num Instances: ").append(numInstances()).append('\n');
1468
result.append("Num Attributes: ").append(numAttributes()).append('\n');
1469
result.append('\n');
1471
result.append(Utils.padLeft("", 5)).append(Utils.padRight("Name", 25));
1472
result.append(Utils.padLeft("Type", 5)).append(Utils.padLeft("Nom", 5));
1473
result.append(Utils.padLeft("Int", 5)).append(Utils.padLeft("Real", 5));
1474
result.append(Utils.padLeft("Missing", 12));
1475
result.append(Utils.padLeft("Unique", 12));
1476
result.append(Utils.padLeft("Dist", 6)).append('\n');
1477
for (int i = 0; i < numAttributes(); i++) {
1478
Attribute a = attribute(i);
1479
AttributeStats as = attributeStats(i);
1480
result.append(Utils.padLeft("" + (i + 1), 4)).append(' ');
1481
result.append(Utils.padRight(a.name(), 25)).append(' ');
1484
case Attribute.NOMINAL:
1485
result.append(Utils.padLeft("Nom", 4)).append(' ');
1486
percent = Math.round(100.0 * as.intCount / as.totalCount);
1487
result.append(Utils.padLeft("" + percent, 3)).append("% ");
1488
result.append(Utils.padLeft("" + 0, 3)).append("% ");
1489
percent = Math.round(100.0 * as.realCount / as.totalCount);
1490
result.append(Utils.padLeft("" + percent, 3)).append("% ");
1492
case Attribute.NUMERIC:
1493
result.append(Utils.padLeft("Num", 4)).append(' ');
1494
result.append(Utils.padLeft("" + 0, 3)).append("% ");
1495
percent = Math.round(100.0 * as.intCount / as.totalCount);
1496
result.append(Utils.padLeft("" + percent, 3)).append("% ");
1497
percent = Math.round(100.0 * as.realCount / as.totalCount);
1498
result.append(Utils.padLeft("" + percent, 3)).append("% ");
1500
case Attribute.DATE:
1501
result.append(Utils.padLeft("Dat", 4)).append(' ');
1502
result.append(Utils.padLeft("" + 0, 3)).append("% ");
1503
percent = Math.round(100.0 * as.intCount / as.totalCount);
1504
result.append(Utils.padLeft("" + percent, 3)).append("% ");
1505
percent = Math.round(100.0 * as.realCount / as.totalCount);
1506
result.append(Utils.padLeft("" + percent, 3)).append("% ");
1508
case Attribute.STRING:
1509
result.append(Utils.padLeft("Str", 4)).append(' ');
1510
percent = Math.round(100.0 * as.intCount / as.totalCount);
1511
result.append(Utils.padLeft("" + percent, 3)).append("% ");
1512
result.append(Utils.padLeft("" + 0, 3)).append("% ");
1513
percent = Math.round(100.0 * as.realCount / as.totalCount);
1514
result.append(Utils.padLeft("" + percent, 3)).append("% ");
1516
case Attribute.RELATIONAL:
1517
result.append(Utils.padLeft("Rel", 4)).append(' ');
1518
percent = Math.round(100.0 * as.intCount / as.totalCount);
1519
result.append(Utils.padLeft("" + percent, 3)).append("% ");
1520
result.append(Utils.padLeft("" + 0, 3)).append("% ");
1521
percent = Math.round(100.0 * as.realCount / as.totalCount);
1522
result.append(Utils.padLeft("" + percent, 3)).append("% ");
1525
result.append(Utils.padLeft("???", 4)).append(' ');
1526
result.append(Utils.padLeft("" + 0, 3)).append("% ");
1527
percent = Math.round(100.0 * as.intCount / as.totalCount);
1528
result.append(Utils.padLeft("" + percent, 3)).append("% ");
1529
percent = Math.round(100.0 * as.realCount / as.totalCount);
1530
result.append(Utils.padLeft("" + percent, 3)).append("% ");
1533
result.append(Utils.padLeft("" + as.missingCount, 5)).append(" /");
1534
percent = Math.round(100.0 * as.missingCount / as.totalCount);
1535
result.append(Utils.padLeft("" + percent, 3)).append("% ");
1536
result.append(Utils.padLeft("" + as.uniqueCount, 5)).append(" /");
1537
percent = Math.round(100.0 * as.uniqueCount / as.totalCount);
1538
result.append(Utils.padLeft("" + percent, 3)).append("% ");
1539
result.append(Utils.padLeft("" + as.distinctCount, 5)).append(' ');
1540
result.append('\n');
1542
return result.toString();
1546
* Copies instances from one set to the end of another
1549
* @param from the position of the first instance to be copied
1550
* @param dest the destination for the instances
1551
* @param num the number of instances to be copied
1553
//@ requires 0 <= from && from <= numInstances() - num;
1554
//@ requires 0 <= num;
1555
protected void copyInstances(int from, /*@non_null@*/ Instances dest, int num) {
1557
for (int i = 0; i < num; i++) {
1558
dest.add(instance(from + i));
1563
* Replaces the attribute information by a clone of
1566
protected void freshAttributeInfo() {
1568
m_Attributes = (FastVector) m_Attributes.copyElements();
1572
* Returns string including all instances, their weights and
1573
* their indices in the original dataset.
1575
* @return description of instance and its weight as a string
1577
protected /*@pure@*/ String instancesAndWeights(){
1579
StringBuffer text = new StringBuffer();
1581
for (int i = 0; i < numInstances(); i++) {
1582
text.append(instance(i) + " " + instance(i).weight());
1583
if (i < numInstances() - 1) {
1587
return text.toString();
1591
* Partitions the instances around a pivot. Used by quicksort and
1594
* @param attIndex the attribute's index (index starts with 0)
1595
* @param l the first index of the subset (index starts with 0)
1596
* @param r the last index of the subset (index starts with 0)
1598
* @return the index of the middle element
1600
//@ requires 0 <= attIndex && attIndex < numAttributes();
1601
//@ requires 0 <= left && left <= right && right < numInstances();
1602
protected int partition(int attIndex, int l, int r) {
1604
double pivot = instance((l + r) / 2).value(attIndex);
1607
while ((instance(l).value(attIndex) < pivot) && (l < r)) {
1610
while ((instance(r).value(attIndex) > pivot) && (l < r)) {
1619
if ((l == r) && (instance(r).value(attIndex) > pivot)) {
1627
* Implements quicksort according to Manber's "Introduction to
1630
* @param attIndex the attribute's index (index starts with 0)
1631
* @param left the first index of the subset to be sorted (index starts with 0)
1632
* @param right the last index of the subset to be sorted (index starts with 0)
1634
//@ requires 0 <= attIndex && attIndex < numAttributes();
1635
//@ requires 0 <= first && first <= right && right < numInstances();
1636
protected void quickSort(int attIndex, int left, int right) {
1639
int middle = partition(attIndex, left, right);
1640
quickSort(attIndex, left, middle);
1641
quickSort(attIndex, middle + 1, right);
1646
* Implements computation of the kth-smallest element according
1647
* to Manber's "Introduction to Algorithms".
1649
* @param attIndex the attribute's index (index starts with 0)
1650
* @param left the first index of the subset (index starts with 0)
1651
* @param right the last index of the subset (index starts with 0)
1652
* @param k the value of k
1654
* @return the index of the kth-smallest element
1656
//@ requires 0 <= attIndex && attIndex < numAttributes();
1657
//@ requires 0 <= first && first <= right && right < numInstances();
1658
protected int select(int attIndex, int left, int right, int k) {
1660
if (left == right) {
1663
int middle = partition(attIndex, left, right);
1664
if ((middle - left + 1) >= k) {
1665
return select(attIndex, left, middle, k);
1667
return select(attIndex, middle + 1, right, k - (middle - left + 1));
1673
* Help function needed for stratification of set.
1675
* @param numFolds the number of folds for the stratification
1677
protected void stratStep (int numFolds){
1679
FastVector newVec = new FastVector(m_Instances.capacity());
1682
// create stratified batch
1683
while (newVec.size() < numInstances()) {
1685
while (j < numInstances()) {
1686
newVec.addElement(instance(j));
1691
m_Instances = newVec;
1695
* Swaps two instances in the set.
1697
* @param i the first instance's index (index starts with 0)
1698
* @param j the second instance's index (index starts with 0)
1700
//@ requires 0 <= i && i < numInstances();
1701
//@ requires 0 <= j && j < numInstances();
1702
public void swap(int i, int j){
1704
m_Instances.swap(i, j);
1708
* Merges two sets of Instances together. The resulting set will have
1709
* all the attributes of the first set plus all the attributes of the
1710
* second set. The number of instances in both sets must be the same.
1712
* @param first the first set of Instances
1713
* @param second the second set of Instances
1714
* @return the merged set of Instances
1715
* @throws IllegalArgumentException if the datasets are not the same size
1717
public static Instances mergeInstances(Instances first, Instances second) {
1719
if (first.numInstances() != second.numInstances()) {
1720
throw new IllegalArgumentException("Instance sets must be of the same size");
1723
// Create the vector of merged attributes
1724
FastVector newAttributes = new FastVector();
1725
for (int i = 0; i < first.numAttributes(); i++) {
1726
newAttributes.addElement(first.attribute(i));
1728
for (int i = 0; i < second.numAttributes(); i++) {
1729
newAttributes.addElement(second.attribute(i));
1732
// Create the set of Instances
1733
Instances merged = new Instances(first.relationName() + '_'
1734
+ second.relationName(),
1736
first.numInstances());
1737
// Merge each instance
1738
for (int i = 0; i < first.numInstances(); i++) {
1739
merged.add(first.instance(i).mergeInstance(second.instance(i)));
1745
* Method for testing this class.
1747
* @param argv should contain one element: the name of an ARFF file
1749
//@ requires argv != null;
1750
//@ requires argv.length == 1;
1751
//@ requires argv[0] != null;
1752
public static void test(String [] argv) {
1754
Instances instances, secondInstances, train, test, empty;
1755
Random random = new Random(2);
1758
FastVector testAtts, testVals;
1762
if (argv.length > 1) {
1763
throw (new Exception("Usage: Instances [<filename>]"));
1766
// Creating set of instances from scratch
1767
testVals = new FastVector(2);
1768
testVals.addElement("first_value");
1769
testVals.addElement("second_value");
1770
testAtts = new FastVector(2);
1771
testAtts.addElement(new Attribute("nominal_attribute", testVals));
1772
testAtts.addElement(new Attribute("numeric_attribute"));
1773
instances = new Instances("test_set", testAtts, 10);
1774
instances.add(new Instance(instances.numAttributes()));
1775
instances.add(new Instance(instances.numAttributes()));
1776
instances.add(new Instance(instances.numAttributes()));
1777
instances.setClassIndex(0);
1778
System.out.println("\nSet of instances created from scratch:\n");
1779
System.out.println(instances);
1781
if (argv.length == 1) {
1782
String filename = argv[0];
1783
reader = new FileReader(filename);
1785
// Read first five instances and print them
1786
System.out.println("\nFirst five instances from file:\n");
1787
instances = new Instances(reader, 1);
1788
instances.setClassIndex(instances.numAttributes() - 1);
1790
while ((i < 5) && (instances.readInstance(reader))) {
1793
System.out.println(instances);
1795
// Read all the instances in the file
1796
reader = new FileReader(filename);
1797
instances = new Instances(reader);
1799
// Make the last attribute be the class
1800
instances.setClassIndex(instances.numAttributes() - 1);
1802
// Print header and instances.
1803
System.out.println("\nDataset:\n");
1804
System.out.println(instances);
1805
System.out.println("\nClass index: "+instances.classIndex());
1808
// Test basic methods based on class index.
1809
System.out.println("\nClass name: "+instances.classAttribute().name());
1810
System.out.println("\nClass index: "+instances.classIndex());
1811
System.out.println("\nClass is nominal: " +
1812
instances.classAttribute().isNominal());
1813
System.out.println("\nClass is numeric: " +
1814
instances.classAttribute().isNumeric());
1815
System.out.println("\nClasses:\n");
1816
for (i = 0; i < instances.numClasses(); i++) {
1817
System.out.println(instances.classAttribute().value(i));
1819
System.out.println("\nClass values and labels of instances:\n");
1820
for (i = 0; i < instances.numInstances(); i++) {
1821
Instance inst = instances.instance(i);
1822
System.out.print(inst.classValue() + "\t");
1823
System.out.print(inst.toString(inst.classIndex()));
1824
if (instances.instance(i).classIsMissing()) {
1825
System.out.println("\tis missing");
1827
System.out.println();
1831
// Create random weights.
1832
System.out.println("\nCreating random weights for instances.");
1833
for (i = 0; i < instances.numInstances(); i++) {
1834
instances.instance(i).setWeight(random.nextDouble());
1837
// Print all instances and their weights (and the sum of weights).
1838
System.out.println("\nInstances and their weights:\n");
1839
System.out.println(instances.instancesAndWeights());
1840
System.out.print("\nSum of weights: ");
1841
System.out.println(instances.sumOfWeights());
1843
// Insert an attribute
1844
secondInstances = new Instances(instances);
1845
Attribute testAtt = new Attribute("Inserted");
1846
secondInstances.insertAttributeAt(testAtt, 0);
1847
System.out.println("\nSet with inserted attribute:\n");
1848
System.out.println(secondInstances);
1849
System.out.println("\nClass name: "
1850
+ secondInstances.classAttribute().name());
1852
// Delete the attribute
1853
secondInstances.deleteAttributeAt(0);
1854
System.out.println("\nSet with attribute deleted:\n");
1855
System.out.println(secondInstances);
1856
System.out.println("\nClass name: "
1857
+ secondInstances.classAttribute().name());
1859
// Test if headers are equal
1860
System.out.println("\nHeaders equal: "+
1861
instances.equalHeaders(secondInstances) + "\n");
1863
// Print data in internal format.
1864
System.out.println("\nData (internal values):\n");
1865
for (i = 0; i < instances.numInstances(); i++) {
1866
for (j = 0; j < instances.numAttributes(); j++) {
1867
if (instances.instance(i).isMissing(j)) {
1868
System.out.print("? ");
1870
System.out.print(instances.instance(i).value(j) + " ");
1873
System.out.println();
1876
// Just print header
1877
System.out.println("\nEmpty dataset:\n");
1878
empty = new Instances(instances, 0);
1879
System.out.println(empty);
1880
System.out.println("\nClass name: "+empty.classAttribute().name());
1882
// Create copy and rename an attribute and a value (if possible)
1883
if (empty.classAttribute().isNominal()) {
1884
Instances copy = new Instances(empty, 0);
1885
copy.renameAttribute(copy.classAttribute(), "new_name");
1886
copy.renameAttributeValue(copy.classAttribute(),
1887
copy.classAttribute().value(0),
1889
System.out.println("\nDataset with names changed:\n" + copy);
1890
System.out.println("\nOriginal dataset:\n" + empty);
1893
// Create and prints subset of instances.
1894
start = instances.numInstances() / 4;
1895
num = instances.numInstances() / 2;
1896
System.out.print("\nSubset of dataset: ");
1897
System.out.println(num + " instances from " + (start + 1)
1899
secondInstances = new Instances(instances, start, num);
1900
System.out.println("\nClass name: "
1901
+ secondInstances.classAttribute().name());
1903
// Print all instances and their weights (and the sum of weights).
1904
System.out.println("\nInstances and their weights:\n");
1905
System.out.println(secondInstances.instancesAndWeights());
1906
System.out.print("\nSum of weights: ");
1907
System.out.println(secondInstances.sumOfWeights());
1909
// Create and print training and test sets for 3-fold
1910
// cross-validation.
1911
System.out.println("\nTrain and test folds for 3-fold CV:");
1912
if (instances.classAttribute().isNominal()) {
1913
instances.stratify(3);
1915
for (j = 0; j < 3; j++) {
1916
train = instances.trainCV(3,j, new Random(1));
1917
test = instances.testCV(3,j);
1919
// Print all instances and their weights (and the sum of weights).
1920
System.out.println("\nTrain: ");
1921
System.out.println("\nInstances and their weights:\n");
1922
System.out.println(train.instancesAndWeights());
1923
System.out.print("\nSum of weights: ");
1924
System.out.println(train.sumOfWeights());
1925
System.out.println("\nClass name: "+train.classAttribute().name());
1926
System.out.println("\nTest: ");
1927
System.out.println("\nInstances and their weights:\n");
1928
System.out.println(test.instancesAndWeights());
1929
System.out.print("\nSum of weights: ");
1930
System.out.println(test.sumOfWeights());
1931
System.out.println("\nClass name: "+test.classAttribute().name());
1934
// Randomize instances and print them.
1935
System.out.println("\nRandomized dataset:");
1936
instances.randomize(random);
1938
// Print all instances and their weights (and the sum of weights).
1939
System.out.println("\nInstances and their weights:\n");
1940
System.out.println(instances.instancesAndWeights());
1941
System.out.print("\nSum of weights: ");
1942
System.out.println(instances.sumOfWeights());
1944
// Sort instances according to first attribute and
1946
System.out.print("\nInstances sorted according to first attribute:\n ");
1949
// Print all instances and their weights (and the sum of weights).
1950
System.out.println("\nInstances and their weights:\n");
1951
System.out.println(instances.instancesAndWeights());
1952
System.out.print("\nSum of weights: ");
1953
System.out.println(instances.sumOfWeights());
1954
} catch (Exception e) {
1955
e.printStackTrace();
1960
* Main method for this class. The following calls are possible:
1963
* <code>weka.core.Instances</code> help<br/>
1964
* prints a short list of possible commands.
1967
* <code>weka.core.Instances</code> <filename><br/>
1968
* prints a summary of a set of instances.
1971
* <code>weka.core.Instances</code> merge <filename1> <filename2><br/>
1972
* merges the two datasets (must have same number of instances) and
1973
* outputs the results on stdout.
1976
* <code>weka.core.Instances</code> append <filename1> <filename2><br/>
1977
* appends the second dataset to the first one (must have same headers) and
1978
* outputs the results on stdout.
1981
* <code>weka.core.Instances</code> randomize <seed> <filename><br/>
1982
* randomizes the dataset with the given seed and outputs the result on stdout.
1986
* @param args the commandline parameters
1988
public static void main(String[] args) {
1992
// read from stdin and print statistics
1993
if (args.length == 0) {
1994
DataSource source = new DataSource(System.in);
1995
i = source.getDataSet();
1996
System.out.println(i.toSummaryString());
1998
// read file and print statistics
1999
else if ((args.length == 1) && (!args[0].equals("-h")) && (!args[0].equals("help"))) {
2000
DataSource source = new DataSource(args[0]);
2001
i = source.getDataSet();
2002
System.out.println(i.toSummaryString());
2004
// read two files, merge them and print result to stdout
2005
else if ((args.length == 3) && (args[0].toLowerCase().equals("merge"))) {
2006
DataSource source1 = new DataSource(args[1]);
2007
DataSource source2 = new DataSource(args[2]);
2008
i = Instances.mergeInstances(source1.getDataSet(), source2.getDataSet());
2009
System.out.println(i);
2011
// read two files, append them and print result to stdout
2012
else if ((args.length == 3) && (args[0].toLowerCase().equals("append"))) {
2013
DataSource source1 = new DataSource(args[1]);
2014
DataSource source2 = new DataSource(args[2]);
2015
if (!source1.getStructure().equalHeaders(source2.getStructure()))
2016
throw new Exception("The two datasets have different headers!");
2017
Instances structure = source1.getStructure();
2018
System.out.println(source1.getStructure());
2019
while (source1.hasMoreElements(structure))
2020
System.out.println(source1.nextElement(structure));
2021
structure = source2.getStructure();
2022
while (source2.hasMoreElements(structure))
2023
System.out.println(source2.nextElement(structure));
2025
// read file and seed value, randomize data and print result to stdout
2026
else if ((args.length == 3) && (args[0].toLowerCase().equals("randomize"))) {
2027
DataSource source = new DataSource(args[2]);
2028
i = source.getDataSet();
2029
i.randomize(new Random(Integer.parseInt(args[1])));
2030
System.out.println(i);
2036
+ "\tweka.core.Instances help\n"
2037
+ "\tweka.core.Instances <filename>\n"
2038
+ "\tweka.core.Instances merge <filename1> <filename2>\n"
2039
+ "\tweka.core.Instances append <filename1> <filename2>\n"
2040
+ "\tweka.core.Instances randomize <seed> <filename>\n"
2045
catch (Exception ex) {
2046
ex.printStackTrace();
2047
System.err.println(ex.getMessage());