/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * ArffLoader.java
 * Copyright (C) 2000 University of Waikato, Hamilton, New Zealand
 */
23
package weka.core.converters;
25
import weka.core.Attribute;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.SparseInstance;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StreamTokenizer;
import java.io.StringReader;
import java.net.URL;
import java.text.ParseException;
import java.util.zip.GZIPInputStream;
46
/**
 <!-- globalinfo-start -->
 * Reads a source that is in arff (attribute relation file format) format.
 * <p/>
 <!-- globalinfo-end -->
 *
 * @author Mark Hall (mhall@cs.waikato.ac.nz)
 * @author FracPete (fracpete at waikato dot ac dot nz)
 * @version $Revision: 1.19 $
 */
56
public class ArffLoader
57
extends AbstractFileLoader
58
implements BatchConverter, IncrementalConverter, URLSourcedLoader {
60
/** for serialization */
61
static final long serialVersionUID = 2726929550544048587L;
63
/** the file extension */
64
public static String FILE_EXTENSION = Instances.FILE_EXTENSION;
66
/** the extension for compressed files */
67
public static String FILE_EXTENSION_COMPRESSED = FILE_EXTENSION + ".gz";
70
protected String m_URL = "http://";
72
/** The reader for the source file. */
73
protected transient Reader m_sourceReader = null;
75
/** The parser for the ARFF file */
76
protected transient ArffReader m_ArffReader = null;
79
  /**
   * Reads data from an ARFF file, either in incremental or batch mode. <p/>
   *
   * Typical code for batch usage:
   * <pre>
   * BufferedReader reader = new BufferedReader(new FileReader("/some/where/file.arff"));
   * ArffReader arff = new ArffReader(reader);
   * Instances data = arff.getData();
   * data.setClassIndex(data.numAttributes() - 1);
   * </pre>
   *
   * Typical code for incremental usage:
   * <pre>
   * BufferedReader reader = new BufferedReader(new FileReader("/some/where/file.arff"));
   * ArffReader arff = new ArffReader(reader, 1000);
   * Instances data = arff.getStructure();
   * data.setClassIndex(data.numAttributes() - 1);
   * Instance inst;
   * while ((inst = arff.readInstance(data)) != null) {
   *   data.add(inst);
   * }
   * </pre>
   *
   * @author Eibe Frank (eibe@cs.waikato.ac.nz)
   * @author Len Trigg (trigg@cs.waikato.ac.nz)
   * @author fracpete (fracpete at waikato dot ac dot nz)
   * @version $Revision: 1.19 $
   */
106
public static class ArffReader {
108
/** the tokenizer for reading the stream */
109
protected StreamTokenizer m_Tokenizer;
111
/** Buffer of values for sparse instance */
112
protected double[] m_ValueBuffer;
114
/** Buffer of indices for sparse instance */
115
protected int[] m_IndicesBuffer;
117
/** the actual data */
118
protected Instances m_Data;
120
/** the number of lines read so far */
121
protected int m_Lines;
124
* Reads the data completely from the reader. The data can be accessed
125
* via the <code>getData()</code> method.
127
* @param reader the reader to use
128
* @throws IOException if something goes wrong
131
public ArffReader(Reader reader) throws IOException {
132
m_Tokenizer = new StreamTokenizer(reader);
139
while ((inst = readInstance(m_Data)) != null) {
147
* Reads only the header and reserves the specified space for instances.
148
* Further instances can be read via <code>readInstance()</code>.
150
* @param reader the reader to use
151
* @param capacity the capacity of the new dataset
152
* @throws IOException if something goes wrong
153
* @throws IllegalArgumentException if capacity is negative
154
* @see #getStructure()
155
* @see #readInstance(Instances)
157
/**
 * Reads only the header and reserves the specified space for instances.
 * Further instances can be read via <code>readInstance()</code>.
 *
 * @param reader the reader to use
 * @param capacity the capacity of the new dataset
 * @throws IOException if something goes wrong
 * @throws IllegalArgumentException if capacity is negative
 * @see #getStructure()
 * @see #readInstance(Instances)
 */
public ArffReader(Reader reader, int capacity) throws IOException {
  if (capacity < 0)
    throw new IllegalArgumentException("Capacity has to be positive!");

  m_Tokenizer = new StreamTokenizer(reader);
  initTokenizer();
  readHeader(capacity);
  initBuffers();
}
169
* Reads the data without header according to the specified template.
170
* The data can be accessed via the <code>getData()</code> method.
172
* @param reader the reader to use
173
* @param template the template header
174
* @param lines the lines read so far
175
* @throws IOException if something goes wrong
178
public ArffReader(Reader reader, Instances template, int lines) throws IOException {
179
this(reader, template, lines, 100);
182
while ((inst = readInstance(m_Data)) != null) {
190
* Initializes the reader without reading the header according to the
191
* specified template. The data must be read via the
192
* <code>readInstance()</code> method.
194
* @param reader the reader to use
195
* @param template the template header
196
* @param lines the lines read so far
197
* @param capacity the capacity of the new dataset
198
* @throws IOException if something goes wrong
201
/**
 * Initializes the reader without reading the header according to the
 * specified template. The data must be read via the
 * <code>readInstance()</code> method.
 *
 * @param reader the reader to use
 * @param template the template header
 * @param lines the lines read so far
 * @param capacity the capacity of the new dataset
 * @throws IOException if something goes wrong
 */
public ArffReader(Reader reader, Instances template, int lines, int capacity) throws IOException {
  m_Lines = lines;
  m_Tokenizer = new StreamTokenizer(reader);
  initTokenizer();
  m_Data = new Instances(template, capacity);
  initBuffers();
}
211
* initializes the buffers for sparse instances to be read
213
* @see #m_ValueBuffer
214
* @see #m_IndicesBuffer
216
protected void initBuffers() {
217
m_ValueBuffer = new double[m_Data.numAttributes()];
218
m_IndicesBuffer = new int[m_Data.numAttributes()];
222
* compactifies the data
224
protected void compactify() {
230
* Throws error message with line number and last token read.
232
* @param msg the error message to be thrown
233
* @throws IOException containing the error message
235
protected void errorMessage(String msg) throws IOException {
236
String str = msg + ", read " + m_Tokenizer.toString();
238
int line = Integer.parseInt(str.replaceAll(".* line ", ""));
239
str = str.replaceAll(" line .*", " line " + (m_Lines + line - 1));
241
throw new IOException(str);
245
* returns the current line number
247
* @return the current line number
249
public int getLineNo() {
250
return m_Lines + m_Tokenizer.lineno();
254
* Gets next token, skipping empty lines.
256
* @throws IOException if reading the next token fails
258
protected void getFirstToken() throws IOException {
259
while (m_Tokenizer.nextToken() == StreamTokenizer.TT_EOL) {};
261
if ((m_Tokenizer.ttype == '\'') ||
262
(m_Tokenizer.ttype == '"')) {
263
m_Tokenizer.ttype = StreamTokenizer.TT_WORD;
264
} else if ((m_Tokenizer.ttype == StreamTokenizer.TT_WORD) &&
265
(m_Tokenizer.sval.equals("?"))){
266
m_Tokenizer.ttype = '?';
271
* Gets index, checking for a premature and of line.
273
* @throws IOException if it finds a premature end of line
275
protected void getIndex() throws IOException {
276
if (m_Tokenizer.nextToken() == StreamTokenizer.TT_EOL) {
277
errorMessage("premature end of line");
279
if (m_Tokenizer.ttype == StreamTokenizer.TT_EOF) {
280
errorMessage("premature end of file");
285
* Gets token and checks if its end of line.
287
* @param endOfFileOk whether EOF is OK
288
* @throws IOException if it doesn't find an end of line
290
protected void getLastToken(boolean endOfFileOk) throws IOException {
291
if ((m_Tokenizer.nextToken() != StreamTokenizer.TT_EOL) &&
292
((m_Tokenizer.ttype != StreamTokenizer.TT_EOF) || !endOfFileOk)) {
293
errorMessage("end of line expected");
298
* Gets next token, checking for a premature and of line.
300
* @throws IOException if it finds a premature end of line
302
protected void getNextToken() throws IOException {
303
if (m_Tokenizer.nextToken() == StreamTokenizer.TT_EOL) {
304
errorMessage("premature end of line");
306
if (m_Tokenizer.ttype == StreamTokenizer.TT_EOF) {
307
errorMessage("premature end of file");
308
} else if ((m_Tokenizer.ttype == '\'') ||
309
(m_Tokenizer.ttype == '"')) {
310
m_Tokenizer.ttype = StreamTokenizer.TT_WORD;
311
} else if ((m_Tokenizer.ttype == StreamTokenizer.TT_WORD) &&
312
(m_Tokenizer.sval.equals("?"))){
313
m_Tokenizer.ttype = '?';
318
* Initializes the StreamTokenizer used for reading the ARFF file.
320
protected void initTokenizer(){
321
m_Tokenizer.resetSyntax();
322
m_Tokenizer.whitespaceChars(0, ' ');
323
m_Tokenizer.wordChars(' '+1,'\u00FF');
324
m_Tokenizer.whitespaceChars(',',',');
325
m_Tokenizer.commentChar('%');
326
m_Tokenizer.quoteChar('"');
327
m_Tokenizer.quoteChar('\'');
328
m_Tokenizer.ordinaryChar('{');
329
m_Tokenizer.ordinaryChar('}');
330
m_Tokenizer.eolIsSignificant(true);
334
* Reads a single instance using the tokenizer and returns it.
336
* @param structure the dataset header information, will get updated
337
* in case of string or relational attributes
338
* @return null if end of file has been reached
339
* @throws IOException if the information is not read
342
public Instance readInstance(Instances structure) throws IOException {
343
return readInstance(structure, true);
347
* Reads a single instance using the tokenizer and returns it.
349
* @param structure the dataset header information, will get updated
350
* in case of string or relational attributes
351
* @param flag if method should test for carriage return after
353
* @return null if end of file has been reached
354
* @throws IOException if the information is not read
357
public Instance readInstance(Instances structure, boolean flag) throws IOException {
358
return getInstance(structure, flag);
362
* Reads a single instance using the tokenizer and returns it.
364
* @param structure the dataset header information, will get updated
365
* in case of string or relational attributes
366
* @param flag if method should test for carriage return after
368
* @return null if end of file has been reached
369
* @throws IOException if the information is not read
372
protected Instance getInstance(Instances structure, boolean flag) throws IOException {
375
// Check if any attributes have been declared.
376
if (m_Data.numAttributes() == 0) {
377
errorMessage("no header information available");
380
// Check if end of file reached.
382
if (m_Tokenizer.ttype == StreamTokenizer.TT_EOF) {
387
if (m_Tokenizer.ttype == '{') {
388
return getInstanceSparse(flag);
390
return getInstanceFull(flag);
395
* Reads a single instance using the tokenizer and returns it.
397
* @param flag if method should test for carriage return after
399
* @return null if end of file has been reached
400
* @throws IOException if the information is not read
403
protected Instance getInstanceSparse(boolean flag) throws IOException {
404
int valIndex, numValues = 0, maxIndex = -1;
410
if (m_Tokenizer.ttype == '}') {
416
m_IndicesBuffer[numValues] = Integer.valueOf(m_Tokenizer.sval).intValue();
417
} catch (NumberFormatException e) {
418
errorMessage("index number expected");
420
if (m_IndicesBuffer[numValues] <= maxIndex) {
421
errorMessage("indices have to be ordered");
423
if ((m_IndicesBuffer[numValues] < 0) ||
424
(m_IndicesBuffer[numValues] >= m_Data.numAttributes())) {
425
errorMessage("index out of bounds");
427
maxIndex = m_IndicesBuffer[numValues];
432
// Check if value is missing.
433
if (m_Tokenizer.ttype == '?') {
434
m_ValueBuffer[numValues] = Instance.missingValue();
437
// Check if token is valid.
438
if (m_Tokenizer.ttype != StreamTokenizer.TT_WORD) {
439
errorMessage("not a valid value");
441
switch (m_Data.attribute(m_IndicesBuffer[numValues]).type()) {
442
case Attribute.NOMINAL:
443
// Check if value appears in header.
445
m_Data.attribute(m_IndicesBuffer[numValues]).indexOfValue(m_Tokenizer.sval);
446
if (valIndex == -1) {
447
errorMessage("nominal value not declared in header");
449
m_ValueBuffer[numValues] = (double)valIndex;
451
case Attribute.NUMERIC:
452
// Check if value is really a number.
454
m_ValueBuffer[numValues] = Double.valueOf(m_Tokenizer.sval).
456
} catch (NumberFormatException e) {
457
errorMessage("number expected");
460
case Attribute.STRING:
461
m_ValueBuffer[numValues] =
462
m_Data.attribute(m_IndicesBuffer[numValues]).addStringValue(m_Tokenizer.sval);
466
m_ValueBuffer[numValues] =
467
m_Data.attribute(m_IndicesBuffer[numValues]).parseDate(m_Tokenizer.sval);
468
} catch (ParseException e) {
469
errorMessage("unparseable date: " + m_Tokenizer.sval);
472
case Attribute.RELATIONAL:
474
ArffReader arff = new ArffReader(new StringReader(m_Tokenizer.sval), m_Data.attribute(m_IndicesBuffer[numValues]).relation(), 0);
475
Instances data = arff.getData();
476
m_ValueBuffer[numValues] = m_Data.attribute(m_IndicesBuffer[numValues]).addRelation(data);
478
catch (Exception e) {
479
throw new IOException(e.toString() + " of line " + getLineNo());
483
errorMessage("unknown attribute type in column " + m_IndicesBuffer[numValues]);
493
// Add instance to dataset
494
double[] tempValues = new double[numValues];
495
int[] tempIndices = new int[numValues];
496
System.arraycopy(m_ValueBuffer, 0, tempValues, 0, numValues);
497
System.arraycopy(m_IndicesBuffer, 0, tempIndices, 0, numValues);
498
Instance inst = new SparseInstance(1, tempValues, tempIndices, m_Data.numAttributes());
499
inst.setDataset(m_Data);
505
* Reads a single instance using the tokenizer and returns it.
507
* @param flag if method should test for carriage return after
509
* @return null if end of file has been reached
510
* @throws IOException if the information is not read
513
protected Instance getInstanceFull(boolean flag) throws IOException {
514
double[] instance = new double[m_Data.numAttributes()];
517
// Get values for all attributes.
518
for (int i = 0; i < m_Data.numAttributes(); i++){
524
// Check if value is missing.
525
if (m_Tokenizer.ttype == '?') {
526
instance[i] = Instance.missingValue();
529
// Check if token is valid.
530
if (m_Tokenizer.ttype != StreamTokenizer.TT_WORD) {
531
errorMessage("not a valid value");
533
switch (m_Data.attribute(i).type()) {
534
case Attribute.NOMINAL:
535
// Check if value appears in header.
536
index = m_Data.attribute(i).indexOfValue(m_Tokenizer.sval);
538
errorMessage("nominal value not declared in header");
540
instance[i] = (double)index;
542
case Attribute.NUMERIC:
543
// Check if value is really a number.
545
instance[i] = Double.valueOf(m_Tokenizer.sval).
547
} catch (NumberFormatException e) {
548
errorMessage("number expected");
551
case Attribute.STRING:
552
instance[i] = m_Data.attribute(i).addStringValue(m_Tokenizer.sval);
556
instance[i] = m_Data.attribute(i).parseDate(m_Tokenizer.sval);
557
} catch (ParseException e) {
558
errorMessage("unparseable date: " + m_Tokenizer.sval);
561
case Attribute.RELATIONAL:
563
ArffReader arff = new ArffReader(new StringReader(m_Tokenizer.sval), m_Data.attribute(i).relation(), 0);
564
Instances data = arff.getData();
565
instance[i] = m_Data.attribute(i).addRelation(data);
567
catch (Exception e) {
568
throw new IOException(e.toString() + " of line " + getLineNo());
572
errorMessage("unknown attribute type in column " + i);
581
// Add instance to dataset
582
Instance inst = new Instance(1, instance);
583
inst.setDataset(m_Data);
589
* Reads and stores header of an ARFF file.
591
* @param capacity the number of instances to reserve in the data
593
* @throws IOException if the information is not read
596
protected void readHeader(int capacity) throws IOException {
598
String relationName = "";
600
// Get name of relation.
602
if (m_Tokenizer.ttype == StreamTokenizer.TT_EOF) {
603
errorMessage("premature end of file");
605
if (Instances.ARFF_RELATION.equalsIgnoreCase(m_Tokenizer.sval)) {
607
relationName = m_Tokenizer.sval;
610
errorMessage("keyword " + Instances.ARFF_RELATION + " expected");
613
// Create vectors to hold information temporarily.
614
FastVector attributes = new FastVector();
616
// Get attribute declarations.
618
if (m_Tokenizer.ttype == StreamTokenizer.TT_EOF) {
619
errorMessage("premature end of file");
622
while (Attribute.ARFF_ATTRIBUTE.equalsIgnoreCase(m_Tokenizer.sval)) {
623
attributes = parseAttribute(attributes);
626
// Check if data part follows. We can't easily check for EOL.
627
if (!Instances.ARFF_DATA.equalsIgnoreCase(m_Tokenizer.sval)) {
628
errorMessage("keyword " + Instances.ARFF_DATA + " expected");
631
// Check if any attributes have been declared.
632
if (attributes.size() == 0) {
633
errorMessage("no attributes declared");
636
m_Data = new Instances(relationName, attributes, capacity);
640
* Parses the attribute declaration.
642
* @param attributes the current attributes vector
643
* @return the new attributes vector
644
* @throws IOException if the information is not read
647
protected FastVector parseAttribute(FastVector attributes) throws IOException {
648
String attributeName;
649
FastVector attributeValues;
651
// Get attribute name.
653
attributeName = m_Tokenizer.sval;
656
// Check if attribute is nominal.
657
if (m_Tokenizer.ttype == StreamTokenizer.TT_WORD) {
659
// Attribute is real, integer, or string.
660
if (m_Tokenizer.sval.equalsIgnoreCase(Attribute.ARFF_ATTRIBUTE_REAL) ||
661
m_Tokenizer.sval.equalsIgnoreCase(Attribute.ARFF_ATTRIBUTE_INTEGER) ||
662
m_Tokenizer.sval.equalsIgnoreCase(Attribute.ARFF_ATTRIBUTE_NUMERIC)) {
663
attributes.addElement(new Attribute(attributeName, attributes.size()));
665
} else if (m_Tokenizer.sval.equalsIgnoreCase(Attribute.ARFF_ATTRIBUTE_STRING)) {
667
addElement(new Attribute(attributeName, (FastVector)null,
670
} else if (m_Tokenizer.sval.equalsIgnoreCase(Attribute.ARFF_ATTRIBUTE_DATE)) {
671
String format = null;
672
if (m_Tokenizer.nextToken() != StreamTokenizer.TT_EOL) {
673
if ((m_Tokenizer.ttype != StreamTokenizer.TT_WORD) &&
674
(m_Tokenizer.ttype != '\'') &&
675
(m_Tokenizer.ttype != '\"')) {
676
errorMessage("not a valid date format");
678
format = m_Tokenizer.sval;
681
m_Tokenizer.pushBack();
683
attributes.addElement(new Attribute(attributeName, format,
686
} else if (m_Tokenizer.sval.equalsIgnoreCase(Attribute.ARFF_ATTRIBUTE_RELATIONAL)) {
689
// Read attributes for subrelation
690
// First, save current set of attributes
691
FastVector atts = attributes;
692
attributes = new FastVector();
694
// Now, read attributes until we hit end of declaration of relational value
696
if (m_Tokenizer.ttype == StreamTokenizer.TT_EOF) {
697
errorMessage("premature end of file");
700
if (Attribute.ARFF_ATTRIBUTE.equalsIgnoreCase(m_Tokenizer.sval)) {
701
attributes = parseAttribute(attributes);
702
} else if (Attribute.ARFF_END_SUBRELATION.equalsIgnoreCase(m_Tokenizer.sval)) {
704
if (!attributeName.equalsIgnoreCase(m_Tokenizer.sval)) {
705
errorMessage("declaration of subrelation " + attributeName +
706
" must be terminated by " + "@end " + attributeName);
710
errorMessage("declaration of subrelation " + attributeName +
711
" must be terminated by " + "@end " + attributeName);
715
// Make relation and restore original set of attributes
716
Instances relation = new Instances(attributeName, attributes, 0);
718
attributes.addElement(new Attribute(attributeName, relation,
721
errorMessage("no valid attribute type or invalid "+
726
// Attribute is nominal.
727
attributeValues = new FastVector();
728
m_Tokenizer.pushBack();
730
// Get values for nominal attribute.
731
if (m_Tokenizer.nextToken() != '{') {
732
errorMessage("{ expected at beginning of enumeration");
734
while (m_Tokenizer.nextToken() != '}') {
735
if (m_Tokenizer.ttype == StreamTokenizer.TT_EOL) {
736
errorMessage("} expected at end of enumeration");
738
attributeValues.addElement(m_Tokenizer.sval);
742
addElement(new Attribute(attributeName, attributeValues,
747
if (m_Tokenizer.ttype == StreamTokenizer.TT_EOF)
748
errorMessage("premature end of file");
754
* Reads and skips all tokens before next end of line token.
756
* @throws IOException in case something goes wrong
758
protected void readTillEOL() throws IOException {
759
while (m_Tokenizer.nextToken() != StreamTokenizer.TT_EOL) {};
761
m_Tokenizer.pushBack();
765
* Returns the header format
767
* @return the header format
769
public Instances getStructure() {
770
return new Instances(m_Data, 0);
774
* Returns the data that was read
778
public Instances getData() {
784
* Returns a string describing this Loader
785
* @return a description of the Loader suitable for
786
* displaying in the explorer/experimenter gui
788
public String globalInfo() {
789
return "Reads a source that is in arff (attribute relation file format) "
794
* Get the file extension used for arff files
796
* @return the file extension
798
public String getFileExtension() {
799
return FILE_EXTENSION;
803
* Gets all the file extensions used for this type of file
805
* @return the file extensions
807
public String[] getFileExtensions() {
808
return new String[]{FILE_EXTENSION, FILE_EXTENSION_COMPRESSED};
812
* Returns a description of the file type.
814
* @return a short file description
816
public String getFileDescription() {
817
return "Arff data files";
821
* Resets the Loader ready to read a new data set
823
* @throws IOException if something goes wrong
825
public void reset() throws IOException {
829
if (m_File != null && (new File(m_File)).isFile()) {
830
setFile(new File(m_File));
831
} else if (m_URL != null & !m_URL.equals("http://")) {
837
* Resets the Loader object and sets the source of the data set to be
840
* @param url the source url.
841
* @throws IOException if an error occurs
843
public void setSource(URL url) throws IOException {
847
setSource(url.openStream());
849
m_URL = url.toString();
854
* get the File specified as the source
856
* @return the source file
858
public File retrieveFile() {
859
return new File(m_File);
863
* sets the source File
865
* @param file the source file
866
* @throws IOException if an error occurs
868
public void setFile(File file) throws IOException {
869
m_File = file.getAbsolutePath();
874
* Resets the Loader object and sets the source of the data set to be
875
* the supplied File object.
877
* @param file the source file.
878
* @throws IOException if an error occurs
880
public void setSource(File file) throws IOException {
886
throw new IOException("Source file object is null!");
889
if (file.getName().endsWith(FILE_EXTENSION_COMPRESSED))
890
setSource(new GZIPInputStream(new FileInputStream(file)));
892
setSource(new FileInputStream(file));
894
catch (FileNotFoundException ex) {
895
throw new IOException("File not found");
899
m_File = file.getAbsolutePath();
903
* Set the url to load from
905
* @param url the url to load from
906
* @throws IOException if the url can't be set.
908
public void setURL(String url) throws IOException {
910
setSource(new URL(url));
914
* Return the current url
916
* @return the current url
918
public String retrieveURL() {
923
* Resets the Loader object and sets the source of the data set to be
924
* the supplied InputStream.
926
* @param in the source InputStream.
927
* @throws IOException always thrown.
929
public void setSource(InputStream in) throws IOException {
930
m_File = (new File(System.getProperty("user.dir"))).getAbsolutePath();
933
m_sourceReader = new BufferedReader(new InputStreamReader(in));
937
* Determines and returns (if possible) the structure (internally the
938
* header) of the data set as an empty set of instances.
940
* @return the structure of the data set as an empty set of Instances
941
* @throws IOException if an error occurs
943
public Instances getStructure() throws IOException {
945
if (m_sourceReader == null) {
946
throw new IOException("No source has been specified");
949
if (m_structure == null) {
951
m_ArffReader = new ArffReader(m_sourceReader, 1);
952
m_structure = m_ArffReader.getStructure();
953
} catch (Exception ex) {
954
throw new IOException("Unable to determine structure as arff (Reason: " + ex.toString() + ").");
958
return new Instances(m_structure, 0);
962
* Return the full data set. If the structure hasn't yet been determined
963
* by a call to getStructure then method should do so before processing
964
* the rest of the data set.
966
* @return the structure of the data set as an empty set of Instances
967
* @throws IOException if there is no source or parsing fails
969
public Instances getDataSet() throws IOException {
971
if (m_sourceReader == null) {
972
throw new IOException("No source has been specified");
974
if (getRetrieval() == INCREMENTAL) {
975
throw new IOException("Cannot mix getting Instances in both incremental and batch modes");
978
if (m_structure == null) {
982
// Read all instances
984
while ((inst = m_ArffReader.readInstance(m_structure)) != null)
985
m_structure.add(inst);
987
Instances readIn = new Instances(m_structure);
993
* Read the data set incrementally---get the next instance in the data
994
* set or returns null if there are no
995
* more instances to get. If the structure hasn't yet been
996
* determined by a call to getStructure then method should do so before
997
* returning the next instance in the data set.
999
* @param structure the dataset header information, will get updated in
1000
* case of string or relational attributes
1001
* @return the next instance in the data set as an Instance object or null
1002
* if there are no more instances to be read
1003
* @throws IOException if there is an error during parsing
1005
public Instance getNextInstance(Instances structure) throws IOException {
1007
m_structure = structure;
1009
if (getRetrieval() == BATCH) {
1010
throw new IOException("Cannot mix getting Instances in both incremental and batch modes");
1012
setRetrieval(INCREMENTAL);
1014
Instance current = m_ArffReader.readInstance(m_structure);
1015
if (current == null) {
1018
} catch (Exception ex) {
1019
ex.printStackTrace();
1028
* @param args should contain the name of an input file.
1030
public static void main(String [] args) {
1031
runFileLoader(new ArffLoader(), args);