~ubuntu-branches/ubuntu/quantal/commons-csv/quantal

« back to all changes in this revision

Viewing changes to src/java/org/apache/commons/csv/CSVParser.java

Committer: Bazaar Package Importer
Author(s): Jan-Pascal van Best
Date: 2007-07-27 09:45:30 UTC
Revision ID: james.westby@ubuntu.com-20070727094530-iy6ls22i7yj3p0sg

Tags: upstream-0.1-SNAPSHOT+svn558885

Import upstream version 0.1-SNAPSHOT+svn558885

files added:

LICENSE.txt

NOTICE.txt

build.xml

checkstyle.xml

maven.xml

pom.xml

project.properties

project.xml

src/changes

src/changes/changes.xml

src/java

src/java/org

src/java/org/apache

src/java/org/apache/commons

src/java/org/apache/commons/csv

src/java/org/apache/commons/csv/CSVParser.java

src/java/org/apache/commons/csv/CSVPrinter.java

src/java/org/apache/commons/csv/CSVStrategy.java

src/java/org/apache/commons/csv/CSVUtils.java

src/java/org/apache/commons/csv/CharBuffer.java

src/java/org/apache/commons/csv/ExtendedBufferedReader.java

src/java/org/apache/commons/csv/package.html

src/java/org/apache/commons/csv/writer

src/java/org/apache/commons/csv/writer/CSVConfig.java

src/java/org/apache/commons/csv/writer/CSVConfigGuesser.java

src/java/org/apache/commons/csv/writer/CSVField.java

src/java/org/apache/commons/csv/writer/CSVWriter.java

src/site

src/site/site.xml

src/site/xdoc

src/site/xdoc/cvs-usage.xml

src/site/xdoc/downloads.xml

src/site/xdoc/examples.xml

src/site/xdoc/index.xml

src/site/xdoc/issue-tracking.xml

src/test

src/test/AllTests.java

src/test/org

src/test/org/apache

src/test/org/apache/commons

src/test/org/apache/commons/csv

src/test/org/apache/commons/csv/AllTests.java

src/test/org/apache/commons/csv/CSVParserTest.java

src/test/org/apache/commons/csv/CSVPrinterTest.java

src/test/org/apache/commons/csv/CSVStrategyTest.java

src/test/org/apache/commons/csv/CSVUtilsTest.java

src/test/org/apache/commons/csv/CharBufferTest.java

src/test/org/apache/commons/csv/ExtendedBufferedReaderTest.java

src/test/org/apache/commons/csv/writer

src/test/org/apache/commons/csv/writer/AllTests.java

src/test/org/apache/commons/csv/writer/CSVConfigGuesserTest.java

src/test/org/apache/commons/csv/writer/CSVConfigTest.java

src/test/org/apache/commons/csv/writer/CSVFieldTest.java

src/test/org/apache/commons/csv/writer/CSVWriterTest.java

Show diffs side-by-side

added added

removed removed

src/java/org/apache/commons/csv/CSVParser.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.commons.csv;

import java.io.*;

import java.util.ArrayList;

/**
 * Parses CSV files according to the specified configuration.
 *
 * Because CSV appears in many different dialects, the parser supports many
 * configuration settings by allowing the specification of a {@link CSVStrategy}.
 *
 * Parsing of a csv-string having tabs as separators,
 * '"' as an optional value encapsulator, and comments starting with '#':
 * <pre>
 *  String[][] data =
 *   (new CSVParser(new StringReader("a\tb\nc\td"), new CSVStrategy('\t','"','#'))).getAllValues();
 * </pre>
 *
 * Parsing of a csv-string in Excel CSV format:
 * <pre>
 *  String[][] data =
 *   (new CSVParser(new StringReader("a;b\nc;d"), CSVStrategy.EXCEL_STRATEGY)).getAllValues();
 * </pre>
 *
 * Internal parser state is completely covered by the strategy
 * and the reader-state.
 *
 * See the <a href="package-summary.html">package documentation</a>
 * for more details.
 */

public class CSVParser {

/** Initial capacity handed to the content buffer of each new {@link Token}. */

private static final int INITIAL_TOKEN_LENGTH = 50;

// the token types

/** Token has no valid content, i.e. is in its initialized state. */

protected static final int TT_INVALID = -1;

/** Token with content, at beginning or in the middle of a line. */

protected static final int TT_TOKEN = 0;

/** Token (which can have content) when end of file is reached. */

protected static final int TT_EOF = 1;

/** Token with content when end of a line is reached. */

protected static final int TT_EORECORD = 2;

/** Immutable empty String array. */

private static final String[] EMPTY_STRING_ARRAY = new String[0];

// the input stream (wraps the Reader handed to the constructor)

private final ExtendedBufferedReader in;

// TODO: this can be made final if setStrategy is removed

private CSVStrategy strategy;

// the following objects are shared to reduce garbage

/** A record buffer for getLine(). Grows as necessary and is reused. */

private final ArrayList record = new ArrayList();

// token instance that getLine() resets and refills for every value it reads
private final Token reusableToken = new Token();

// scratch buffer in which the lexers collect whitespace characters
private final CharBuffer wsBuf = new CharBuffer();

// scratch buffer for the four hex digits read by unicodeEscapeLexer()
private final CharBuffer code = new CharBuffer(4);

/**

* Token is an internal token representation.

* It is used as contract between the lexer and the parser.

static class Token {

/** Token type, see TT_xxx constants. */

int type = TT_INVALID;

/** The content buffer. */

CharBuffer content = new CharBuffer(INITIAL_TOKEN_LENGTH);

/** Token ready flag: indicates a valid token with content (ready for the parser). */

boolean isReady;

Token reset() {

content.clear();

type = TT_INVALID;

isReady = false;

return this;

}

100

}

101

102

// ======================================================

103

// the constructor

104

// ======================================================

105

106

/**

107

* Default strategy for the parser follows the default {@link CSVStrategy}.

108

109

* @param input an InputStream containing "csv-formatted" stream

110

* @deprecated use {@link #CSVParser(Reader)}.

111

112

public CSVParser(InputStream input) {

113

this(new InputStreamReader(input));

114

}

115

116

/**

117

* CSV parser using the default {@link CSVStrategy}.

118

119

* @param input a Reader containing "csv-formatted" input

120

121

public CSVParser(Reader input) {

122

// note: must match default-CSV-strategy !!

123

this(input, ',');

124

}

125

126

/**

127

* Customized value delimiter parser.

128

129

* The parser follows the default {@link CSVStrategy}

130

* except for the delimiter setting.

131

132

* @param input a Reader based on "csv-formatted" input

133

* @param delimiter a Char used for value separation

134

* @deprecated use {@link #CSVParser(Reader,CSVStrategy)}.

135

136

public CSVParser(Reader input, char delimiter) {

137

this(input, delimiter, '"', (char) 0);

138

}

139

140

/**

141

* Customized csv parser.

142

143

* The parser parses according to the given CSV dialect settings.

144

* Leading whitespaces are truncated, unicode escapes are

145

* not interpreted and empty lines are ignored.

146

147

* @param input a Reader based on "csv-formatted" input

148

* @param delimiter a Char used for value separation

149

* @param encapsulator a Char used as value encapsulation marker

150

* @param commentStart a Char used for comment identification

151

* @deprecated use {@link #CSVParser(Reader,CSVStrategy)}.

152

153

public CSVParser(Reader input, char delimiter, char encapsulator, char commentStart) {

154

this(input, new CSVStrategy(delimiter, encapsulator, commentStart));

155

}

156

157

/**

158

* Customized CSV parser using the given {@link CSVStrategy}

159

160

* @param input a Reader containing "csv-formatted" input

161

* @param strategy the CSVStrategy used for CSV parsing

162

163

public CSVParser(Reader input, CSVStrategy strategy) {

164

this.in = new ExtendedBufferedReader(input);

165

this.strategy = strategy;

166

}

167

168

// ======================================================

169

// the parser

170

// ======================================================

171

172

/**

173

* Parses the CSV according to the given strategy

174

* and returns the content as an array of records

175

* (whereas records are arrays of single values).

176

*

177

* The returned content starts at the current parse-position in

178

* the stream.

179

180

* @return matrix of records x values ('null' when end of file)

181

* @throws IOException on parse error or input read-failure

182

183

public String[][] getAllValues() throws IOException {

184

ArrayList records = new ArrayList();

185

String[] values;

186

String[][] ret = null;

187

while ((values = getLine()) != null) {

188

records.add(values);

189

}

190

if (records.size() > 0) {

191

ret = new String[records.size()][];

192

records.toArray(ret);

193

}

194

return ret;

195

}

196

197

/**

198

* Parses the CSV according to the given strategy

199

* and returns the next csv-value as string.

200

201

* @return next value in the input stream ('null' when end of file)

202

* @throws IOException on parse error or input read-failure

203

204

public String nextValue() throws IOException {

205

Token tkn = nextToken();

206

String ret = null;

207

switch (tkn.type) {

208

case TT_TOKEN:

209

case TT_EORECORD:

210

ret = tkn.content.toString();

211

break;

212

case TT_EOF:

213

ret = null;

214

break;

215

case TT_INVALID:

216

default:

217

// error no token available (or error)

218

throw new IOException(

219

"(line " + getLineNumber()

220

+ ") invalid parse sequence");

221

// unreachable: break;

222

}

223

return ret;

224

}

225

226

/**

227

* Parses from the current point in the stream til

228

* the end of the current line.

229

230

* @return array of values til end of line

231

* ('null' when end of file has been reached)

232

* @throws IOException on parse error or input read-failure

233

234

public String[] getLine() throws IOException {

235

String[] ret = EMPTY_STRING_ARRAY;

236

record.clear();

237

while (true) {

238

reusableToken.reset();

239

nextToken(reusableToken);

240

switch (reusableToken.type) {

241

case TT_TOKEN:

242

record.add(reusableToken.content.toString());

243

break;

244

case TT_EORECORD:

245

record.add(reusableToken.content.toString());

246

break;

247

case TT_EOF:

248

if (reusableToken.isReady) {

249

record.add(reusableToken.content.toString());

250

} else {

251

ret = null;

252

}

253

break;

254

case TT_INVALID:

255

default:

256

// error: throw IOException

257

throw new IOException("(line " + getLineNumber() + ") invalid parse sequence");

258

// unreachable: break;

259

}

260

if (reusableToken.type != TT_TOKEN) break;

261

}

262

if (!record.isEmpty()) {

263

ret = (String[]) record.toArray(new String[record.size()]);

264

}

265

return ret;

266

}

267

268

/**

269

* Returns the current line number in the input stream.

270

271

* ATTENTION: in case your csv has multiline-values the returned

272

* number does not correspond to the record-number

273

274

* @return current line number

275

276

public int getLineNumber() {

277

return in.getLineNumber();

278

}

279

280

// ======================================================

281

// the lexer(s)

282

// ======================================================

283

284

/**

285

* Convenience method for <code>nextToken(null)</code>.

286

287

protected Token nextToken() throws IOException {

288

return nextToken(new Token());

289

}

290

291

/**

292

* Returns the next token.

293

294

* A token corresponds to a term, a record change or an

295

* end-of-file indicator.

296

297

* @param tkn an existing Token object to reuse. The caller is responsible to initialize the

298

* Token.

299

* @return the next token found

300

* @throws IOException on stream access error

301

302

protected Token nextToken(Token tkn) throws IOException {

303

wsBuf.clear(); // resuse

304

305

// get the last read char (required for empty line detection)

306

int lastChar = in.readAgain();

307

308

// read the next char and set eol

309

/* note: unfourtunately isEndOfLine may consumes a character silently.

310

* this has no effect outside of the method. so a simple workaround

311

* is to call 'readAgain' on the stream...

312

* uh: might using objects instead of base-types (jdk1.5 autoboxing!)

313

314

int c = in.read();

315

boolean eol = isEndOfLine(c);

316

c = in.readAgain();

317

318

// empty line detection: eol AND (last char was EOL or beginning)

319

while (strategy.getIgnoreEmptyLines() && eol

320

&& (lastChar == '\n'

321

|| lastChar == ExtendedBufferedReader.UNDEFINED)

322

&& !isEndOfFile(lastChar)) {

323

// go on char ahead ...

324

lastChar = c;

325

c = in.read();

326

eol = isEndOfLine(c);

327

c = in.readAgain();

328

// reached end of file without any content (empty line at the end)

329

if (isEndOfFile(c)) {

330

tkn.type = TT_EOF;

331

return tkn;

332

}

333

}

334

335

// did we reached eof during the last iteration already ? TT_EOF

336

if (isEndOfFile(lastChar) || (lastChar != strategy.getDelimiter() && isEndOfFile(c))) {

337

tkn.type = TT_EOF;

338

return tkn;

339

}

340

341

// important: make sure a new char gets consumed in each iteration

342

while (!tkn.isReady) {

343

// ignore whitespaces at beginning of a token

344

while (isWhitespace(c) && !eol) {

345

wsBuf.append((char) c);

346

c = in.read();

347

eol = isEndOfLine(c);

348

}

349

// ok, start of token reached: comment, encapsulated, or token

350

if (!strategy.isCommentingDisabled() && c == strategy.getCommentStart()) {

351

// ignore everything till end of line and continue (incr linecount)

352

in.readLine();

353

tkn = nextToken(tkn.reset());

354

} else if (c == strategy.getDelimiter()) {

355

// empty token return TT_TOKEN("")

356

tkn.type = TT_TOKEN;

357

tkn.isReady = true;

358

} else if (eol) {

359

// empty token return TT_EORECORD("")

360

//noop: tkn.content.append("");

361

tkn.type = TT_EORECORD;

362

tkn.isReady = true;

363

} else if (c == strategy.getEncapsulator()) {

364

// consume encapsulated token

365

encapsulatedTokenLexer(tkn, c);

366

} else if (isEndOfFile(c)) {

367

// end of file return TT_EOF()

368

//noop: tkn.content.append("");

369

tkn.type = TT_EOF;

370

tkn.isReady = true;

371

} else {

372

// next token must be a simple token

373

// add removed blanks when not ignoring whitespace chars...

374

if (!strategy.getIgnoreLeadingWhitespaces()) {

375

tkn.content.append(wsBuf);

376

}

377

simpleTokenLexer(tkn, c);

378

}

379

}

380

return tkn;

381

}

382

383

/**

384

* A simple token lexer

385

386

* Simple token are tokens which are not surrounded by encapsulators.

387

* A simple token might contain escaped delimiters (as \, or \;). The

388

* token is finished when one of the following conditions become true:

389

* <ul>

390

* <li>end of line has been reached (TT_EORECORD)</li>

391

* <li>end of stream has been reached (TT_EOF)</li>

392

* <li>an unescaped delimiter has been reached (TT_TOKEN)</li>

393

* </ul>

394

395

* @param tkn the current token

396

* @param c the current character

397

* @return the filled token

398

399

* @throws IOException on stream access error

400

401

private Token simpleTokenLexer(Token tkn, int c) throws IOException {

402

wsBuf.clear();

403

while (!tkn.isReady) {

404

if (isEndOfLine(c)) {

405

// end of record

406

tkn.type = TT_EORECORD;

407

tkn.isReady = true;

408

} else if (isEndOfFile(c)) {

409

// end of file

410

tkn.type = TT_EOF;

411

tkn.isReady = true;

412

} else if (c == strategy.getDelimiter()) {

413

// end of token

414

tkn.type = TT_TOKEN;

415

tkn.isReady = true;

416

} else if (c == '\\' && strategy.getUnicodeEscapeInterpretation() && in.lookAhead() == 'u') {

417

// interpret unicode escaped chars (like \u0070 -> p)

418

tkn.content.append((char) unicodeEscapeLexer(c));

419

} else if (isWhitespace(c)) {

420

// gather whitespaces

421

// (as long as they are not at the beginning of a token)

422

if (tkn.content.length() > 0) {

423

wsBuf.append((char) c);

424

}

425

} else {

426

// prepend whitespaces (if we have)

427

if (wsBuf.length() > 0) {

428

tkn.content.append(wsBuf);

429

wsBuf.clear();

430

}

431

tkn.content.append((char) c);

432

}

433

// get the next char

434

if (!tkn.isReady) {

435

c = in.read();

436

}

437

}

438

return tkn;

439

}

440

441

442

/**

443

* An encapsulated token lexer

444

445

* Encapsulated tokens are surrounded by the given encapsulating-string.

446

* The encapsulator itself might be included in the token using a

447

* doubling syntax (as "", '') or using escaping (as in \", \').

448

* Whitespaces before and after an encapsulated token are ignored.

449

450

* @param tkn the current token

451

* @param c the current character

452

* @return a valid token object

453

* @throws IOException on invalid state

454

455

private Token encapsulatedTokenLexer(Token tkn, int c) throws IOException {

456

// save current line

457

int startLineNumber = getLineNumber();

458

// ignore the given delimiter

459

// assert c == delimiter;

460

c = in.read();

461

while (!tkn.isReady) {

462

boolean skipRead = false;

463

if (c == strategy.getEncapsulator() || c == '\\') {

464

// check lookahead

465

if (in.lookAhead() == strategy.getEncapsulator()) {

466

// double or escaped encapsulator -> add single encapsulator to token

467

c = in.read();

468

tkn.content.append((char) c);

469

} else if (c == '\\' && in.lookAhead() == '\\') {

470

// doubled escape char, it does not escape itself, only encapsulator

471

// -> add both escape chars to stream

472

tkn.content.append((char) c);

473

c = in.read();

474

tkn.content.append((char) c);

475

} else if (

476

strategy.getUnicodeEscapeInterpretation()

477

&& c == '\\'

478

&& in.lookAhead() == 'u') {

479

// interpret unicode escaped chars (like \u0070 -> p)

480

tkn.content.append((char) unicodeEscapeLexer(c));

481

} else if (c == '\\') {

482

// use a single escape character -> add it to stream

483

tkn.content.append((char) c);

484

} else {

485

// token finish mark (encapsulator) reached: ignore whitespace till delimiter

486

while (!tkn.isReady) {

487

c = in.read();

488

if (c == strategy.getDelimiter()) {

489

tkn.type = TT_TOKEN;

490

tkn.isReady = true;

491

} else if (isEndOfFile(c)) {

492

tkn.type = TT_EOF;

493

tkn.isReady = true;

494

} else if (isEndOfLine(c)) {

495

// ok eo token reached

496

tkn.type = TT_EORECORD;

497

tkn.isReady = true;

498

} else if (!isWhitespace(c)) {

499

// error invalid char between token and next delimiter

500

throw new IOException(

501

"(line " + getLineNumber()

502

+ ") invalid char between encapsulated token end delimiter"

503

);

504

}

505

}

506

skipRead = true;

507

}

508

} else if (isEndOfFile(c)) {

509

// error condition (end of file before end of token)

510

throw new IOException(

511

"(startline " + startLineNumber + ")"

512

+ "eof reached before encapsulated token finished"

513

);

514

} else {

515

// consume character

516

tkn.content.append((char) c);

517

}

518

// get the next char

519

if (!tkn.isReady && !skipRead) {

520

c = in.read();

521

}

522

}

523

return tkn;

524

}

525

526

527

/**

528

* Decodes Unicode escapes.

529

530

* Interpretation of "\\uXXXX" escape sequences

531

* where XXXX is a hex-number.

532

* @param c current char which is discarded because it's the "\\" of "\\uXXXX"

533

* @return the decoded character

534

* @throws IOException on wrong unicode escape sequence or read error

535

536

protected int unicodeEscapeLexer(int c) throws IOException {

537

int ret = 0;

538

// ignore 'u' (assume c==\ now) and read 4 hex digits

539

c = in.read();

540

code.clear();

541

try {

542

for (int i = 0; i < 4; i++) {

543

c = in.read();

544

if (isEndOfFile(c) || isEndOfLine(c)) {

545

throw new NumberFormatException("number too short");

546

}

547

code.append((char) c);

548

}

549

ret = Integer.parseInt(code.toString(), 16);

550

} catch (NumberFormatException e) {

551

throw new IOException(

552

"(line " + getLineNumber() + ") Wrong unicode escape sequence found '"

553

+ code.toString() + "'" + e.toString());

554

}

555

return ret;

556

}

557

558

// ======================================================

559

// strategies

560

// ======================================================

561

562

/**

563

* Sets the specified CSV Strategy

564

565

* @return current instance of CSVParser to allow chained method calls

566

* @deprecated the strategy should be set in the constructor {@link #CSVParser(Reader,CSVStrategy)}.

567

568

public CSVParser setStrategy(CSVStrategy strategy) {

569

this.strategy = strategy;

570

return this;

571

}

572

573

/**

574

* Obtain the specified CSV Strategy

575

576

* @return strategy currently being used

577

578

public CSVStrategy getStrategy() {

579

return this.strategy;

580

}

581

582

// ======================================================

583

// Character class checker

584

// ======================================================

585

586

/**

587

* @return true if the given char is a whitespace character

588

589

private boolean isWhitespace(int c) {

590

return Character.isWhitespace((char) c) && (c != strategy.getDelimiter());

591

}

592

593

/**

594

* Greedy - accepts \n and \r\n

595

* This checker consumes silently the second control-character...

596

597

* @return true if the given character is a line-terminator

598

599

private boolean isEndOfLine(int c) throws IOException {

600

// check if we have \r\n...

601

if (c == '\r') {

602

if (in.lookAhead() == '\n') {

603

// note: does not change c outside of this method !!

604

c = in.read();

605

}

606

}

607

return (c == '\n');

608

}

609

610

/**

611

* @return true if the given character indicates end of file

612

613

private boolean isEndOfFile(int c) {

614

return c == ExtendedBufferedReader.END_OF_STREAM;

615

}

616

}

Older »