~ubuntu-branches/ubuntu/trusty/weka/trusty-proposed

/**
 * Reads a source that is in comma separated or tab separated format. Assumes
 * that the first row in the file determines the number of and names of the
 * attributes.
 * <p/>
 *
 * @author Mark Hall (mhall@cs.waikato.ac.nz)
 * @version $Revision: 1.16 $
 * @see Loader
 */

public class CSVLoader

extends AbstractFileLoader

implements BatchConverter {

/** for serialization */

static final long serialVersionUID = 5607529739745491340L;

/** the file extension */

public static String FILE_EXTENSION = ".csv";

/**
 * A list of hash tables for accumulating nominal values during parsing.
 */

private FastVector m_cumulativeStructure;

/**
 * Holds the instances accumulated so far.
 */

private FastVector m_cumulativeInstances;

/** the data collected from an InputStream */

private StringBuffer m_StreamBuffer;

/**

* default constructor

public CSVLoader() {

// No instances retrieved yet

setRetrieval(NONE);

}

/**

* Get the file extension used for arff files

* @return the file extension

public String getFileExtension() {

return FILE_EXTENSION;

}

/**

* Returns a description of the file type.

* @return a short file description

public String getFileDescription() {

return "CSV data files";

}

100

101

/**

102

* Gets all the file extensions used for this type of file

103

104

* @return the file extensions

105

106

public String[] getFileExtensions() {

107

return new String[]{getFileExtension()};

108

}

109

110

/**

111

* Returns a string describing this attribute evaluator

112

* @return a description of the evaluator suitable for

113

* displaying in the explorer/experimenter gui

114

115

public String globalInfo() {

116

return "Reads a source that is in comma separated or tab separated format. "

117

+"Assumes that the first row in the file determines the number of "

118

+"and names of the attributes.";

119

}

120

121

/**

122

* Resets the Loader object and sets the source of the data set to be

123

* the supplied Stream object.

124

125

* @param input the input stream

126

* @exception IOException if an error occurs

127

128

public void setSource(InputStream input) throws IOException {

129

BufferedReader reader;

130

String line;

131

132

m_structure = null;

133

m_sourceFile = null;

134

m_File = null;

135

136

m_StreamBuffer = new StringBuffer();

137

reader = new BufferedReader(new InputStreamReader(input));

138

while ((line = reader.readLine()) != null)

139

m_StreamBuffer.append(line + "\n");

140

}

141

142

/**

143

* Resets the Loader object and sets the source of the data set to be

144

* the supplied File object.

145

146

* @param file the source file.

147

* @exception IOException if an error occurs

148

149

public void setSource(File file) throws IOException {

150

super.setSource(file);

151

152

m_StreamBuffer = null;

153

}

154

155

/**

156

* Determines and returns (if possible) the structure (internally the

157

* header) of the data set as an empty set of instances.

158

159

* @return the structure of the data set as an empty set of Instances

160

* @exception IOException if an error occurs

161

162

public Instances getStructure() throws IOException {

163

if ((m_sourceFile == null) && (m_StreamBuffer == null)) {

164

throw new IOException("No source has been specified");

165

}

166

167

if (m_structure == null) {

168

try {

169

BufferedReader br;

170

if (m_StreamBuffer != null)

171

br = new BufferedReader(new StringReader(m_StreamBuffer.toString()));

172

else

173

br = new BufferedReader(new FileReader(m_sourceFile));

174

StreamTokenizer st = new StreamTokenizer(br);

175

initTokenizer(st);

176

readStructure(st);

177

} catch (FileNotFoundException ex) {

178

}

179

}

180

181

return m_structure;

182

}

183

184

/**

185

* reads the structure

186

187

* @param st the stream tokenizer to read from

188

* @throws IOException if reading fails

189

190

private void readStructure(StreamTokenizer st) throws IOException {

191

readHeader(st);

192

}

193

194

/**

195

* Return the full data set. If the structure hasn't yet been determined

196

* by a call to getStructure then method should do so before processing

197

* the rest of the data set.

198

199

* @return the structure of the data set as an empty set of Instances

200

* @exception IOException if there is no source or parsing fails

201

202

public Instances getDataSet() throws IOException {

203

if ((m_sourceFile == null) && (m_StreamBuffer == null)) {

204

throw new IOException("No source has been specified");

205

}

206

BufferedReader br;

207

if (m_sourceFile != null) {

208

setSource(m_sourceFile);

209

br = new BufferedReader(new FileReader(m_sourceFile));

210

}

211

else {

212

br = new BufferedReader(new StringReader(m_StreamBuffer.toString()));

213

}

214

StreamTokenizer st = new StreamTokenizer(br);

215

initTokenizer(st);

216

readStructure(st);

217

218

st.ordinaryChar(',');

219

st.ordinaryChar('\t');

220

221

m_cumulativeStructure = new FastVector(m_structure.numAttributes());

222

for (int i = 0; i < m_structure.numAttributes(); i++) {

223

m_cumulativeStructure.addElement(new Hashtable());

224

}

225

226

227

// Instances result = new Instances(m_structure);

228

m_cumulativeInstances = new FastVector();

229

FastVector current;

230

while ((current = getInstance(st)) != null) {

231

m_cumulativeInstances.addElement(current);

232

}

233

br.close();

234

// now determine the true structure of the data set

235

FastVector atts = new FastVector(m_structure.numAttributes());

236

for (int i = 0; i < m_structure.numAttributes(); i++) {

237

String attname = m_structure.attribute(i).name();

238

Hashtable tempHash = ((Hashtable)m_cumulativeStructure.elementAt(i));

239

if (tempHash.size() == 0) {

240

atts.addElement(new Attribute(attname));

241

} else {

242

FastVector values = new FastVector(tempHash.size());

243

// add dummy objects in order to make the FastVector's size == capacity

244

for (int z = 0; z < tempHash.size(); z++) {

245

values.addElement("dummy");

246

}

247

Enumeration e = tempHash.keys();

248

while (e.hasMoreElements()) {

249

Object ob = e.nextElement();

250

// if (ob instanceof Double) {

251

int index = ((Integer)tempHash.get(ob)).intValue();

252

values.setElementAt(new String(ob.toString()), index);

253

// }

254

}

255

atts.addElement(new Attribute(attname, values));

256

}

257

}

258

259

// make the instances

260

String relationName;

261

if (m_sourceFile != null)

262

relationName = (m_sourceFile.getName()).replaceAll("\\.[cC][sS][vV]$","");

263

else

264

relationName = "stream";

265

Instances dataSet = new Instances(relationName,

266

atts,

267

m_cumulativeInstances.size());

268

269

for (int i = 0; i < m_cumulativeInstances.size(); i++) {

270

current = ((FastVector)m_cumulativeInstances.elementAt(i));

271

double [] vals = new double[dataSet.numAttributes()];

272

for (int j = 0; j < current.size(); j++) {

273

Object cval = current.elementAt(j);

274

if (cval instanceof String) {

275

if (((String)cval).compareTo("'?'") == 0) {

276

vals[j] = Instance.missingValue();

277

} else {

278

if (!dataSet.attribute(j).isNominal()) {

279

System.err.println("Wrong attribute type!!!");

280

System.exit(1);

281

}

282

// find correct index

283

Hashtable lookup = (Hashtable)m_cumulativeStructure.elementAt(j);

284

int index = ((Integer)lookup.get(cval)).intValue();

285

vals[j] = (double)index;

286

}

287

} else if (dataSet.attribute(j).isNominal()) {

288

// find correct index

289

Hashtable lookup = (Hashtable)m_cumulativeStructure.elementAt(j);

290

int index = ((Integer)lookup.get(cval)).intValue();

291

vals[j] = (double)index;

292

} else {

293

vals[j] = ((Double)cval).doubleValue();

294

}

295

}

296

dataSet.add(new Instance(1.0, vals));

297

}

298

m_structure = new Instances(dataSet, 0);

299

setRetrieval(BATCH);

300

m_cumulativeStructure = null; // conserve memory

301

return dataSet;

302

}

303

304

/**

305

* CSVLoader is unable to process a data set incrementally.

306

307

* @param structure ignored

308

* @return never returns without throwing an exception

309

* @exception IOException always. CSVLoader is unable to process a data

310

* set incrementally.

311

312

public Instance getNextInstance(Instances structure) throws IOException {

313

throw new IOException("CSVLoader can't read data sets incrementally.");

314

}

315

316

/**

317

* Attempts to parse a line of the data set.

318

319

* @param tokenizer the tokenizer

320

* @return a FastVector containg String and Double objects representing

321

* the values of the instance.

322

* @exception IOException if an error occurs

323

324

* <pre><jml>

325

* private_normal_behavior

326

* requires: tokenizer != null;

327

* ensures: \result != null;

328

* also

329

* private_exceptional_behavior

330

* requires: tokenizer == null

331

* || (* unsucessful parse *);

332

* signals: (IOException);

333

* </jml></pre>

334

335

private FastVector getInstance(StreamTokenizer tokenizer)

336

throws IOException {

337

338

FastVector current = new FastVector();

339

340

// Check if end of file reached.

341

ConverterUtils.getFirstToken(tokenizer);

342

if (tokenizer.ttype == StreamTokenizer.TT_EOF) {

343

return null;

344

}

345

boolean first = true;

346

boolean wasSep;

347

348

while (tokenizer.ttype != StreamTokenizer.TT_EOL &&

349

tokenizer.ttype != StreamTokenizer.TT_EOF) {

350

351

// Get next token

352

if (!first) {

353

ConverterUtils.getToken(tokenizer);

354

}

355

356

if (tokenizer.ttype == ',' || tokenizer.ttype == '\t' ||

357

tokenizer.ttype == StreamTokenizer.TT_EOL) {

358

current.addElement("?");

359

wasSep = true;

360

} else if (tokenizer.ttype == '?') {

361

wasSep = false;

362

current.addElement(new String("'?'"));

363

} else {

364

wasSep = false;

365

// try to parse as a number

366

try {

367

double val = Double.valueOf(tokenizer.sval).doubleValue();

368

current.addElement(new Double(val));

369

} catch (NumberFormatException e) {

370

// otherwise assume its an enumerated value

371

current.addElement(new String(tokenizer.sval));

372

}

373

}

374

375

if (!wasSep) {

376

ConverterUtils.getToken(tokenizer);

377

}

378

first = false;

379

}

380

381

// check number of values read

382

if (current.size() != m_structure.numAttributes()) {

383

ConverterUtils.errms(tokenizer,

384

"wrong number of values. Read "+current.size()

385

+", expected "+m_structure.numAttributes());

386

}

387

388

// check for structure update

389

try {

390

checkStructure(current);

391

} catch (Exception ex) {

392

ex.printStackTrace();

393

}

394

395

return current;

396

}

397

398

/**

399

* Checks the current instance against what is known about the structure

400

* of the data set so far. If there is a nominal value for an attribute

401

* that was beleived to be numeric then all previously seen values for this

402

* attribute are stored in a Hashtable.

403

404

* @param current a <code>FastVector</code> value

405

* @exception Exception if an error occurs

406

407

* <pre><jml>

408

* private_normal_behavior

409

* requires: current != null;

410

* also

411

* private_exceptional_behavior

412

* requires: current == null

413

* || (* unrecognized object type in current *);

414

* signals: (Exception);

415

* </jml></pre>

416

417

private void checkStructure(FastVector current) throws Exception {

418

if (current == null) {

419

throw new Exception("current shouldn't be null in checkStructure");

420

}

421

for (int i = 0; i < current.size(); i++) {

422

Object ob = current.elementAt(i);

423

if (ob instanceof String) {

424

if (((String)ob).compareTo("'?'") == 0) {

425

} else {

426

Hashtable tempHash = (Hashtable)m_cumulativeStructure.elementAt(i);

427

if (!tempHash.containsKey(ob)) {

428

// may have found a nominal value in what was previously thought to

429

// be a numeric variable.

430

if (tempHash.size() == 0) {

431

for (int j = 0; j < m_cumulativeInstances.size(); j++) {

432

FastVector tempUpdate =

433

((FastVector)m_cumulativeInstances.elementAt(j));

434

Object tempO = tempUpdate.elementAt(i);

435

if (tempO instanceof String) {

436

// must have been a missing value

437

} else {

438

if (!tempHash.containsKey(tempO)) {

439

tempHash.put(new Double(((Double)tempO).doubleValue()),

440

new Integer(tempHash.size()));

441

}

442

}

443

}

444

}

445

int newIndex = tempHash.size();

446

tempHash.put(ob, new Integer(newIndex));

447

}

448

}

449

} else if (ob instanceof Double) {

450

Hashtable tempHash = (Hashtable)m_cumulativeStructure.elementAt(i);

451

if (tempHash.size() != 0) {

452

if (!tempHash.containsKey(ob)) {

453

int newIndex = tempHash.size();

454

tempHash.put(new Double(((Double)ob).doubleValue()),

455

new Integer(newIndex));

456

}

457

}

458

} else {

459

throw new Exception("Wrong object type in checkStructure!");

460

}

461

}

462

}

463

464

/**

465

* Assumes the first line of the file contains the attribute names.

466

* Assumes all attributes are real (Reading the full data set with

467

* getDataSet will establish the true structure).

468

469

* @param tokenizer a <code>StreamTokenizer</code> value

470

* @exception IOException if an error occurs

471

472

* <pre><jml>

473

* private_normal_behavior

474

* requires: tokenizer != null;

475

* modifiable: m_structure;

476

* ensures: m_structure != null;

477

* also

478

* private_exceptional_behavior

479

* requires: tokenizer == null

480

* || (* unsucessful parse *);

481

* signals: (IOException);

482

* </jml></pre>

483

484

private void readHeader(StreamTokenizer tokenizer) throws IOException {

485

486

FastVector attribNames = new FastVector();

487

ConverterUtils.getFirstToken(tokenizer);

488

if (tokenizer.ttype == StreamTokenizer.TT_EOF) {

489

ConverterUtils.errms(tokenizer,"premature end of file");

490

}

491

492

while (tokenizer.ttype != StreamTokenizer.TT_EOL) {

493

attribNames.addElement(new Attribute(tokenizer.sval));

494

ConverterUtils.getToken(tokenizer);

495

}

496

String relationName;

497

if (m_sourceFile != null)

498

relationName = (m_sourceFile.getName()).replaceAll("\\.[cC][sS][vV]$","");

499

else

500

relationName = "stream";

501

m_structure = new Instances(relationName, attribNames, 0);

502

}

503

504

/**

505

* Initializes the stream tokenizer

506

507

* @param tokenizer the tokenizer to initialize

508

509

private void initTokenizer(StreamTokenizer tokenizer) {

510

tokenizer.resetSyntax();

511

tokenizer.whitespaceChars(0, (' '-1));

512

tokenizer.wordChars(' ','\u00FF');

513

tokenizer.whitespaceChars(',',',');

514

tokenizer.whitespaceChars('\t','\t');

515

tokenizer.commentChar('%');

516

tokenizer.quoteChar('"');

517

tokenizer.quoteChar('\'');

518

tokenizer.eolIsSignificant(true);

519

}

520

521

/**

522

* Main method.

523

524

* @param args should contain the name of an input file.

525

526

public static void main(String [] args) {

527

runFileLoader(new CSVLoader(), args);

528

}

529

}

Older »