2
* This program is free software; you can redistribute it and/or modify
3
* it under the terms of the GNU General Public License as published by
4
* the Free Software Foundation; either version 2 of the License, or
5
* (at your option) any later version.
7
* This program is distributed in the hope that it will be useful,
8
* but WITHOUT ANY WARRANTY; without even the implied warranty of
9
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10
* GNU General Public License for more details.
12
* You should have received a copy of the GNU General Public License
13
* along with this program; if not, write to the Free Software
14
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19
* Copyright (C) 2000 University of Waikato, Hamilton, New Zealand
23
package weka.core.converters;
25
import weka.core.Attribute;
26
import weka.core.FastVector;
27
import weka.core.Instance;
28
import weka.core.Instances;
30
import java.io.BufferedReader;
32
import java.io.FileNotFoundException;
33
import java.io.FileReader;
34
import java.io.IOException;
35
import java.io.InputStream;
36
import java.io.InputStreamReader;
37
import java.io.StreamTokenizer;
38
import java.io.StringReader;
39
import java.util.Enumeration;
40
import java.util.Hashtable;
43
<!-- globalinfo-start -->
44
* Reads a source that is in comma separated or tab separated format. Assumes that the first row in the file determines the number of and names of the attributes.
46
<!-- globalinfo-end -->
48
* @author Mark Hall (mhall@cs.waikato.ac.nz)
49
* @version $Revision: 1.16 $
52
public class CSVLoader
53
extends AbstractFileLoader
54
implements BatchConverter {
56
/** Serialization version identifier for this class. */
57
static final long serialVersionUID = 5607529739745491340L;
59
/** The file extension reported by getFileExtension(). */
60
// NOTE(review): public, static and not final, so any caller can reassign it.
public static String FILE_EXTENSION = ".csv";
63
* A list of hash tables for accumulating nominal values during parsing.
65
// One Hashtable per attribute; getDataSet() creates the list and nulls it when done.
private FastVector m_cumulativeStructure;
68
* Holds instances accumulated so far
70
// Each element is a FastVector holding the String/Double values of one parsed row.
private FastVector m_cumulativeInstances;
72
/** Text read from an InputStream by setSource(InputStream); set to null when a File source is set. */
73
private StringBuffer m_StreamBuffer;
79
// No instances retrieved yet
84
* Get the file extension used for csv files
86
* @return the file extension
88
// Returns the current value of the static FILE_EXTENSION field (initialised to ".csv").
public String getFileExtension() {
89
return FILE_EXTENSION;
93
* Returns a description of the file type.
95
* @return a short file description
97
// Returns a fixed, human-readable label for the file type handled by this loader.
public String getFileDescription() {
98
return "CSV data files";
102
* Gets all the file extensions used for this type of file
104
* @return the file extensions
106
// Returns a freshly allocated one-element array containing getFileExtension().
public String[] getFileExtensions() {
107
return new String[]{getFileExtension()};
111
* Returns a string describing this loader
112
* @return a description of the loader suitable for
113
* displaying in the explorer/experimenter gui
115
// Builds the description text by concatenating three string literals.
public String globalInfo() {
116
return "Reads a source that is in comma separated or tab separated format. "
117
+"Assumes that the first row in the file determines the number of "
118
+"and names of the attributes.";
122
* Resets the Loader object and sets the source of the data set to be
123
* the supplied Stream object.
125
* @param input the input stream
126
* @exception IOException if an error occurs
128
// Reads the whole input stream, line by line, into m_StreamBuffer.
// NOTE(review): the declaration of 'line' and any state reset are not visible in this excerpt.
public void setSource(InputStream input) throws IOException {
129
BufferedReader reader;
136
m_StreamBuffer = new StringBuffer();
137
// NOTE(review): no charset is given, so InputStreamReader uses the platform default.
reader = new BufferedReader(new InputStreamReader(input));
138
// readLine() strips the line terminator; each line is re-terminated with "\n".
while ((line = reader.readLine()) != null)
139
m_StreamBuffer.append(line + "\n");
// NOTE(review): no close() of 'reader' is visible in this excerpt -- confirm who owns the stream.
143
* Resets the Loader object and sets the source of the data set to be
144
* the supplied File object.
146
* @param file the source file.
147
* @exception IOException if an error occurs
149
// Delegates to the superclass to register the file, then discards any buffered stream text.
// NOTE(review): no import of java.io.File is visible in this excerpt -- confirm it exists.
public void setSource(File file) throws IOException {
150
super.setSource(file);
152
// getStructure() reads from m_StreamBuffer whenever it is non-null, so it is cleared here.
m_StreamBuffer = null;
156
* Determines and returns (if possible) the structure (internally the
157
* header) of the data set as an empty set of instances.
159
* @return the structure of the data set as an empty set of Instances
160
* @exception IOException if an error occurs
162
// Determines the header of the data set when m_structure has not been set yet.
// NOTE(review): several lines of this method (declaration of 'br', try/else, the
// header-reading call and the return statement) are not visible in this excerpt.
public Instances getStructure() throws IOException {
163
// Fail fast when neither a file nor buffered stream text is available.
if ((m_sourceFile == null) && (m_StreamBuffer == null)) {
164
throw new IOException("No source has been specified");
167
// The source is only opened while m_structure is still null.
if (m_structure == null) {
170
// Buffered stream text is checked first; presumably the FileReader below is the
// alternative branch (the connecting 'else' is not visible here).
if (m_StreamBuffer != null)
171
br = new BufferedReader(new StringReader(m_StreamBuffer.toString()));
173
// NOTE(review): FileReader uses the platform default charset.
br = new BufferedReader(new FileReader(m_sourceFile));
174
StreamTokenizer st = new StreamTokenizer(br);
177
} catch (FileNotFoundException ex) {
185
* reads the structure
187
* @param st the stream tokenizer to read from
188
* @throws IOException if reading fails
190
private void readStructure(StreamTokenizer st) throws IOException {
195
* Return the full data set. If the structure hasn't yet been determined
196
* by a call to getStructure then this method should do so before processing
197
* the rest of the data set.
199
* @return the full data set as a set of Instances
200
* @exception IOException if there is no source or parsing fails
202
// Parses every row of the source, works out which attributes are nominal, and
// builds the resulting Instances object.
// NOTE(review): several lines (declarations of 'br', 'current' and 'relationName',
// else branches, closing braces and the return statement) are not visible in this excerpt.
public Instances getDataSet() throws IOException {
203
// Fail fast when neither a file nor buffered stream text is available.
if ((m_sourceFile == null) && (m_StreamBuffer == null)) {
204
throw new IOException("No source has been specified");
207
// A file source is re-registered via setSource() and then opened for reading.
if (m_sourceFile != null) {
208
setSource(m_sourceFile);
209
// NOTE(review): FileReader uses the platform default charset.
br = new BufferedReader(new FileReader(m_sourceFile));
212
// Presumably the non-file branch: read from the text buffered by setSource(InputStream).
br = new BufferedReader(new StringReader(m_StreamBuffer.toString()));
214
StreamTokenizer st = new StreamTokenizer(br);
218
// Separators become ordinary characters so they are returned as tokens of their own;
// getInstance() tests ttype against ',' and '\t' to detect empty fields.
st.ordinaryChar(',');
219
st.ordinaryChar('\t');
221
// One (initially empty) value->index table per attribute.
m_cumulativeStructure = new FastVector(m_structure.numAttributes());
222
for (int i = 0; i < m_structure.numAttributes(); i++) {
223
m_cumulativeStructure.addElement(new Hashtable());
227
// Instances result = new Instances(m_structure);
228
m_cumulativeInstances = new FastVector();
230
// Accumulate parsed rows until getInstance() returns null.
while ((current = getInstance(st)) != null) {
231
m_cumulativeInstances.addElement(current);
234
// now determine the true structure of the data set
235
FastVector atts = new FastVector(m_structure.numAttributes());
236
for (int i = 0; i < m_structure.numAttributes(); i++) {
237
String attname = m_structure.attribute(i).name();
238
Hashtable tempHash = ((Hashtable)m_cumulativeStructure.elementAt(i));
239
// An empty table means no nominal value was recorded for this attribute, so it is
// created from its name only (no value list).
if (tempHash.size() == 0) {
240
atts.addElement(new Attribute(attname));
242
FastVector values = new FastVector(tempHash.size());
243
// add dummy objects in order to make the FastVector's size == capacity
244
for (int z = 0; z < tempHash.size(); z++) {
245
values.addElement("dummy");
247
// Overwrite each placeholder with the label stored at that index in the table.
Enumeration e = tempHash.keys();
248
while (e.hasMoreElements()) {
249
Object ob = e.nextElement();
250
// if (ob instanceof Double) {
251
int index = ((Integer)tempHash.get(ob)).intValue();
252
values.setElementAt(new String(ob.toString()), index);
255
atts.addElement(new Attribute(attname, values));
259
// make the instances
261
// Relation name: the file name with a trailing ".csv" (any letter case) removed,
// or "stream" (presumably when no file source is set; the 'else' is not visible).
if (m_sourceFile != null)
262
relationName = (m_sourceFile.getName()).replaceAll("\\.[cC][sS][vV]$","");
264
relationName = "stream";
265
Instances dataSet = new Instances(relationName,
267
m_cumulativeInstances.size());
269
// Convert every accumulated row into an array of doubles, one slot per attribute.
for (int i = 0; i < m_cumulativeInstances.size(); i++) {
270
current = ((FastVector)m_cumulativeInstances.elementAt(i));
271
double [] vals = new double[dataSet.numAttributes()];
272
for (int j = 0; j < current.size(); j++) {
273
Object cval = current.elementAt(j);
274
if (cval instanceof String) {
275
// The marker "'?'" (stored by getInstance for a '?' token) denotes a missing value.
if (((String)cval).compareTo("'?'") == 0) {
276
vals[j] = Instance.missingValue();
278
// A String value on a non-nominal attribute is only reported on stderr here.
// NOTE(review): the visible lines do not show whether processing stops afterwards.
if (!dataSet.attribute(j).isNominal()) {
279
System.err.println("Wrong attribute type!!!");
282
// find correct index
283
Hashtable lookup = (Hashtable)m_cumulativeStructure.elementAt(j);
284
int index = ((Integer)lookup.get(cval)).intValue();
285
vals[j] = (double)index;
287
// Non-String value on an attribute that turned out to be nominal: use its table index.
} else if (dataSet.attribute(j).isNominal()) {
288
// find correct index
289
Hashtable lookup = (Hashtable)m_cumulativeStructure.elementAt(j);
290
int index = ((Integer)lookup.get(cval)).intValue();
291
vals[j] = (double)index;
293
// Presumably the remaining case (numeric attribute): store the parsed number itself.
vals[j] = ((Double)cval).doubleValue();
296
// Every row is added with weight 1.0.
dataSet.add(new Instance(1.0, vals));
298
// Replace the provisional header with one derived from the final data set
// (second argument 0 -- presumably an empty copy of the header; confirm against Weka API).
m_structure = new Instances(dataSet, 0);
300
m_cumulativeStructure = null; // conserve memory
305
* CSVLoader is unable to process a data set incrementally.
307
* @param structure ignored
308
* @return never returns without throwing an exception
309
* @exception IOException always. CSVLoader is unable to process a data set incrementally.
312
// Incremental reading is not supported: this method unconditionally throws and
// never uses its argument.
public Instance getNextInstance(Instances structure) throws IOException {
313
throw new IOException("CSVLoader can't read data sets incrementally.");
317
* Attempts to parse a line of the data set.
319
* @param tokenizer the tokenizer
320
* @return a FastVector containing String and Double objects representing
321
* the values of the instance.
322
* @exception IOException if an error occurs
325
* private_normal_behavior
326
* requires: tokenizer != null;
327
* ensures: \result != null;
329
* private_exceptional_behavior
330
* requires: tokenizer == null
331
* || (* unsuccessful parse *);
332
* signals: (IOException);
335
// Parses one row from the tokenizer into a FastVector of String and Double values,
// then validates the value count and updates the accumulated structure.
// NOTE(review): several lines (throws clause, return statements, the 'try' lines,
// use of 'first', closing braces) are not visible in this excerpt.
private FastVector getInstance(StreamTokenizer tokenizer)
338
FastVector current = new FastVector();
340
// Check if end of file reached.
341
ConverterUtils.getFirstToken(tokenizer);
342
if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
345
boolean first = true;
348
// Consume tokens until the end of the line or the end of the input.
while (tokenizer.ttype != StreamTokenizer.TT_EOL &&
349
tokenizer.ttype != StreamTokenizer.TT_EOF) {
353
ConverterUtils.getToken(tokenizer);
356
// A separator or end-of-line where a value was expected means the field is empty.
// NOTE(review): this stores "?" whereas the '?' token branch below stores "'?'", and
// getDataSet()/checkStructure() only compare against "'?'" -- confirm that empty
// fields are meant to be handled differently from explicit '?' tokens.
if (tokenizer.ttype == ',' || tokenizer.ttype == '\t' ||
357
tokenizer.ttype == StreamTokenizer.TT_EOL) {
358
current.addElement("?");
360
// An explicit '?' token is recorded with the marker string "'?'".
} else if (tokenizer.ttype == '?') {
362
current.addElement(new String("'?'"));
365
// try to parse as a number
367
double val = Double.valueOf(tokenizer.sval).doubleValue();
368
current.addElement(new Double(val));
369
} catch (NumberFormatException e) {
370
// otherwise assume it's an enumerated value
371
current.addElement(new String(tokenizer.sval));
376
ConverterUtils.getToken(tokenizer);
381
// check number of values read
382
if (current.size() != m_structure.numAttributes()) {
383
ConverterUtils.errms(tokenizer,
384
"wrong number of values. Read "+current.size()
385
+", expected "+m_structure.numAttributes());
388
// check for structure update
390
checkStructure(current);
391
// NOTE(review): any Exception from the structure check is only printed to stderr here.
} catch (Exception ex) {
392
ex.printStackTrace();
399
* Checks the current instance against what is known about the structure
400
* of the data set so far. If there is a nominal value for an attribute
401
* that was believed to be numeric then all previously seen values for this
402
* attribute are stored in a Hashtable.
404
* @param current a <code>FastVector</code> value
405
* @exception Exception if an error occurs
408
* private_normal_behavior
409
* requires: current != null;
411
* private_exceptional_behavior
412
* requires: current == null
413
* || (* unrecognized object type in current *);
414
* signals: (Exception);
417
// Updates the per-attribute value->index tables in m_cumulativeStructure with the
// values of one freshly parsed row.
// NOTE(review): some else branches and closing braces are not visible in this excerpt.
private void checkStructure(FastVector current) throws Exception {
418
if (current == null) {
419
throw new Exception("current shouldn't be null in checkStructure");
421
for (int i = 0; i < current.size(); i++) {
422
Object ob = current.elementAt(i);
423
if (ob instanceof String) {
424
// The marker "'?'" is tested first; presumably it is skipped (that branch body is not visible).
if (((String)ob).compareTo("'?'") == 0) {
426
Hashtable tempHash = (Hashtable)m_cumulativeStructure.elementAt(i);
427
if (!tempHash.containsKey(ob)) {
428
// may have found a nominal value in what was previously thought to
429
// be a numeric variable.
430
// Empty table: nothing has been recorded for this attribute so far, so the values
// of column i in the rows accumulated earlier are back-filled into the table.
if (tempHash.size() == 0) {
431
for (int j = 0; j < m_cumulativeInstances.size(); j++) {
432
FastVector tempUpdate =
433
((FastVector)m_cumulativeInstances.elementAt(j));
434
Object tempO = tempUpdate.elementAt(i);
435
if (tempO instanceof String) {
436
// must have been a missing value
438
// Presumably the non-String case: register each distinct earlier Double under the
// next free index (table size before insertion).
if (!tempHash.containsKey(tempO)) {
439
tempHash.put(new Double(((Double)tempO).doubleValue()),
440
new Integer(tempHash.size()));
445
// The new string label receives the next free index.
int newIndex = tempHash.size();
446
tempHash.put(ob, new Integer(newIndex));
449
} else if (ob instanceof Double) {
450
Hashtable tempHash = (Hashtable)m_cumulativeStructure.elementAt(i);
451
// A non-empty table means values are already being recorded for this attribute, so an
// unseen number is registered as a further key; with an empty table nothing is done.
if (tempHash.size() != 0) {
452
if (!tempHash.containsKey(ob)) {
453
int newIndex = tempHash.size();
454
tempHash.put(new Double(((Double)ob).doubleValue()),
455
new Integer(newIndex));
459
// Presumably reached for values that are neither String nor Double (the 'else' is not visible).
throw new Exception("Wrong object type in checkStructure!");
465
* Assumes the first line of the file contains the attribute names.
466
* Assumes all attributes are real (Reading the full data set with
467
* getDataSet will establish the true structure).
469
* @param tokenizer a <code>StreamTokenizer</code> value
470
* @exception IOException if an error occurs
473
* private_normal_behavior
474
* requires: tokenizer != null;
475
* modifiable: m_structure;
476
* ensures: m_structure != null;
478
* private_exceptional_behavior
479
* requires: tokenizer == null
480
* || (* unsuccessful parse *);
481
* signals: (IOException);
484
// Reads the first row of the source as attribute names and stores the resulting
// empty header in m_structure.
// NOTE(review): the declaration of 'relationName', an 'else' and closing braces are
// not visible in this excerpt.
private void readHeader(StreamTokenizer tokenizer) throws IOException {
486
FastVector attribNames = new FastVector();
487
ConverterUtils.getFirstToken(tokenizer);
488
// An input that ends before any header token is reported as an error.
if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
489
ConverterUtils.errms(tokenizer,"premature end of file");
492
// Every token up to the end of the first line becomes an attribute created from its
// name only (no value list).
// NOTE(review): the loop condition tests only TT_EOL, not TT_EOF -- confirm the
// behaviour for a header line that is not terminated by a newline.
while (tokenizer.ttype != StreamTokenizer.TT_EOL) {
493
attribNames.addElement(new Attribute(tokenizer.sval));
494
ConverterUtils.getToken(tokenizer);
497
// Relation name: the file name with a trailing ".csv" (any letter case) removed,
// or "stream" (presumably when no file source is set; the 'else' is not visible).
if (m_sourceFile != null)
498
relationName = (m_sourceFile.getName()).replaceAll("\\.[cC][sS][vV]$","");
500
relationName = "stream";
501
// Capacity 0: only the header is created at this point.
m_structure = new Instances(relationName, attribNames, 0);
505
* Initializes the stream tokenizer
507
* @param tokenizer the tokenizer to initialize
509
// Configures the tokenizer's character classes for comma/tab separated text.
// The order of the calls matters: later calls override earlier ones for the same character.
private void initTokenizer(StreamTokenizer tokenizer) {
510
// Start from a table in which every character is ordinary.
tokenizer.resetSyntax();
511
// Control characters 0..31 (everything below the space character) are whitespace.
tokenizer.whitespaceChars(0, (' '-1));
512
// Space up to \u00FF are word characters, so values may contain embedded spaces.
tokenizer.wordChars(' ','\u00FF');
513
// Comma is reclassified from word character to whitespace, i.e. it separates tokens.
tokenizer.whitespaceChars(',',',');
514
// Tab (code 9) already falls in the 0..31 whitespace range set above; this restates it.
tokenizer.whitespaceChars('\t','\t');
515
// '%' starts a comment that runs to the end of the line.
tokenizer.commentChar('%');
516
// Both double and single quotes delimit quoted strings.
tokenizer.quoteChar('"');
517
tokenizer.quoteChar('\'');
518
// Line ends are reported as TT_EOL tokens so that rows can be told apart.
tokenizer.eolIsSignificant(true);
524
* @param args should contain the name of an input file.
526
// Command-line entry point: hands a new CSVLoader and the arguments to runFileLoader.
public static void main(String [] args) {
527
runFileLoader(new CSVLoader(), args);