2
* Licensed to the Apache Software Foundation (ASF) under one or more
3
* contributor license agreements. See the NOTICE file distributed with
4
* this work for additional information regarding copyright ownership.
5
* The ASF licenses this file to You under the Apache License, Version 2.0
6
* (the "License"); you may not use this file except in compliance with
7
* the License. You may obtain a copy of the License at
9
* http://www.apache.org/licenses/LICENSE-2.0
11
* Unless required by applicable law or agreed to in writing, software
12
* distributed under the License is distributed on an "AS IS" BASIS,
13
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
* See the License for the specific language governing permissions and
15
* limitations under the License.
17
package org.apache.commons.csv;
20
import java.util.ArrayList;
24
* Parses CSV files according to the specified configuration.
26
* Because CSV appears in many different dialects, the parser supports many
27
* configuration settings by allowing the specification of a {@link CSVStrategy}.
29
* <p>Parsing of a csv-string having tabs as separators,
30
* '"' as an optional value encapsulator, and comments starting with '#':</p>
33
* (new CSVParser(new StringReader("a\tb\nc\td"), new CSVStrategy('\t','"','#'))).getAllValues();
36
* <p>Parsing of a csv-string in Excel CSV format</p>
39
* (new CSVParser(new StringReader("a;b\nc;d"), CSVStrategy.EXCEL_STRATEGY)).getAllValues();
43
* Internal parser state is completely covered by the strategy
44
* and the reader-state.</p>
46
* <p>see <a href="package-summary.html">package documentation</a>
47
* for more details</p>
49
public class CSVParser {
51
/** length of the initial token (content-)buffer */
52
private static final int INITIAL_TOKEN_LENGTH = 50;
55
/** Token has no valid content, i.e. is in its initialized state. */
56
protected static final int TT_INVALID = -1;
57
/** Token with content, at beginning or in the middle of a line. */
58
protected static final int TT_TOKEN = 0;
59
/** Token (which can have content) when end of file is reached. */
60
protected static final int TT_EOF = 1;
61
/** Token with content when end of a line is reached. */
62
protected static final int TT_EORECORD = 2;
64
/** Immutable empty String array. */
65
private static final String[] EMPTY_STRING_ARRAY = new String[0];
68
private final ExtendedBufferedReader in;
70
// TODO: this can be made final if setStrategy is removed
71
private CSVStrategy strategy;
73
// the following objects are shared to reduce garbage
74
/** A record buffer for getLine(). Grows as necessary and is reused. */
75
private final ArrayList record = new ArrayList();
76
private final Token reusableToken = new Token();
77
private final CharBuffer wsBuf = new CharBuffer();
78
private final CharBuffer code = new CharBuffer(4);
82
* Token is an internal token representation.
84
* It is used as contract between the lexer and the parser.
87
/** Token type, see TT_xxx constants. */
88
int type = TT_INVALID;
89
/** The content buffer. */
90
CharBuffer content = new CharBuffer(INITIAL_TOKEN_LENGTH);
91
/** Token ready flag: indicates a valid token with content (ready for the parser). */
102
// ======================================================
104
// ======================================================
107
* Default strategy for the parser follows the default {@link CSVStrategy}.
109
* @param input an InputStream containing "csv-formatted" stream
110
* @deprecated use {@link #CSVParser(Reader)}.
112
public CSVParser(InputStream input) {
113
this(new InputStreamReader(input));
117
* CSV parser using the default {@link CSVStrategy}.
119
* @param input a Reader containing "csv-formatted" input
121
public CSVParser(Reader input) {
122
// note: must match default-CSV-strategy !!
127
* Customized value delimiter parser.
129
* The parser follows the default {@link CSVStrategy}
130
* except for the delimiter setting.
132
* @param input a Reader based on "csv-formatted" input
133
* @param delimiter a Char used for value separation
134
* @deprecated use {@link #CSVParser(Reader,CSVStrategy)}.
136
public CSVParser(Reader input, char delimiter) {
137
this(input, delimiter, '"', (char) 0);
141
* Customized csv parser.
143
* The parser parses according to the given CSV dialect settings.
144
* Leading whitespaces are truncated, unicode escapes are
145
* not interpreted and empty lines are ignored.
147
* @param input a Reader based on "csv-formatted" input
148
* @param delimiter a Char used for value separation
149
* @param encapsulator a Char used as value encapsulation marker
150
* @param commentStart a Char used for comment identification
151
* @deprecated use {@link #CSVParser(Reader,CSVStrategy)}.
153
public CSVParser(Reader input, char delimiter, char encapsulator, char commentStart) {
154
this(input, new CSVStrategy(delimiter, encapsulator, commentStart));
158
* Customized CSV parser using the given {@link CSVStrategy}
160
* @param input a Reader containing "csv-formatted" input
161
* @param strategy the CSVStrategy used for CSV parsing
163
public CSVParser(Reader input, CSVStrategy strategy) {
164
this.in = new ExtendedBufferedReader(input);
165
this.strategy = strategy;
168
// ======================================================
170
// ======================================================
173
* Parses the CSV according to the given strategy
174
* and returns the content as an array of records
175
* (whereas records are arrays of single values).
177
* The returned content starts at the current parse-position in the stream.
180
* @return matrix of records x values ('null' when end of file)
181
* @throws IOException on parse error or input read-failure
183
public String[][] getAllValues() throws IOException {
184
ArrayList records = new ArrayList();
186
String[][] ret = null;
187
while ((values = getLine()) != null) {
190
if (records.size() > 0) {
191
ret = new String[records.size()][];
192
records.toArray(ret);
198
* Parses the CSV according to the given strategy
199
* and returns the next csv-value as string.
201
* @return next value in the input stream ('null' when end of file)
202
* @throws IOException on parse error or input read-failure
204
public String nextValue() throws IOException {
205
Token tkn = nextToken();
210
ret = tkn.content.toString();
217
// error no token available (or error)
218
throw new IOException(
219
"(line " + getLineNumber()
220
+ ") invalid parse sequence");
221
// unreachable: break;
227
* Parses from the current point in the stream until
228
* the end of the current line.
230
* @return array of values until the end of the line
231
* ('null' when end of file has been reached)
232
* @throws IOException on parse error or input read-failure
234
public String[] getLine() throws IOException {
235
String[] ret = EMPTY_STRING_ARRAY;
238
reusableToken.reset();
239
nextToken(reusableToken);
240
switch (reusableToken.type) {
242
record.add(reusableToken.content.toString());
245
record.add(reusableToken.content.toString());
248
if (reusableToken.isReady) {
249
record.add(reusableToken.content.toString());
256
// error: throw IOException
257
throw new IOException("(line " + getLineNumber() + ") invalid parse sequence");
258
// unreachable: break;
260
if (reusableToken.type != TT_TOKEN) break;
262
if (!record.isEmpty()) {
263
ret = (String[]) record.toArray(new String[record.size()]);
269
* Returns the current line number in the input stream.
271
* ATTENTION: in case your csv has multiline-values the returned
272
* number does not correspond to the record-number
274
* @return current line number
276
public int getLineNumber() {
277
return in.getLineNumber();
280
// ======================================================
282
// ======================================================
285
* Convenience method for <code>nextToken(new Token())</code>.
287
protected Token nextToken() throws IOException {
288
return nextToken(new Token());
292
* Returns the next token.
294
* A token corresponds to a term, a record change or an
295
* end-of-file indicator.
297
* @param tkn an existing Token object to reuse. The caller is responsible for initializing the Token.
299
* @return the next token found
300
* @throws IOException on stream access error
302
protected Token nextToken(Token tkn) throws IOException {
303
wsBuf.clear(); // resuse
305
// get the last read char (required for empty line detection)
306
int lastChar = in.readAgain();
308
// read the next char and set eol
309
/* note: unfortunately isEndOfLine may consume a character silently.
310
* this has no effect outside of the method. so a simple workaround
311
* is to call 'readAgain' on the stream...
312
* uh: might using objects instead of base-types (jdk1.5 autoboxing!)
315
boolean eol = isEndOfLine(c);
318
// empty line detection: eol AND (last char was EOL or beginning)
319
while (strategy.getIgnoreEmptyLines() && eol
321
|| lastChar == ExtendedBufferedReader.UNDEFINED)
322
&& !isEndOfFile(lastChar)) {
323
// go one char ahead ...
326
eol = isEndOfLine(c);
328
// reached end of file without any content (empty line at the end)
329
if (isEndOfFile(c)) {
335
// did we reach eof during the last iteration already? -> TT_EOF
336
if (isEndOfFile(lastChar) || (lastChar != strategy.getDelimiter() && isEndOfFile(c))) {
341
// important: make sure a new char gets consumed in each iteration
342
while (!tkn.isReady) {
343
// ignore whitespaces at beginning of a token
344
while (isWhitespace(c) && !eol) {
345
wsBuf.append((char) c);
347
eol = isEndOfLine(c);
349
// ok, start of token reached: comment, encapsulated, or token
350
if (!strategy.isCommentingDisabled() && c == strategy.getCommentStart()) {
351
// ignore everything till end of line and continue (incr linecount)
353
tkn = nextToken(tkn.reset());
354
} else if (c == strategy.getDelimiter()) {
355
// empty token return TT_TOKEN("")
359
// empty token return TT_EORECORD("")
360
//noop: tkn.content.append("");
361
tkn.type = TT_EORECORD;
363
} else if (c == strategy.getEncapsulator()) {
364
// consume encapsulated token
365
encapsulatedTokenLexer(tkn, c);
366
} else if (isEndOfFile(c)) {
367
// end of file return TT_EOF()
368
//noop: tkn.content.append("");
372
// next token must be a simple token
373
// add removed blanks when not ignoring whitespace chars...
374
if (!strategy.getIgnoreLeadingWhitespaces()) {
375
tkn.content.append(wsBuf);
377
simpleTokenLexer(tkn, c);
384
* A simple token lexer
386
* Simple tokens are tokens that are not surrounded by encapsulators.
387
* A simple token might contain escaped delimiters (as \, or \;). The
388
* token is finished when one of the following conditions becomes true:
390
* <li>end of line has been reached (TT_EORECORD)</li>
391
* <li>end of stream has been reached (TT_EOF)</li>
392
* <li>an unescaped delimiter has been reached (TT_TOKEN)</li>
395
* @param tkn the current token
396
* @param c the current character
397
* @return the filled token
399
* @throws IOException on stream access error
401
private Token simpleTokenLexer(Token tkn, int c) throws IOException {
403
while (!tkn.isReady) {
404
if (isEndOfLine(c)) {
406
tkn.type = TT_EORECORD;
408
} else if (isEndOfFile(c)) {
412
} else if (c == strategy.getDelimiter()) {
416
} else if (c == '\\' && strategy.getUnicodeEscapeInterpretation() && in.lookAhead() == 'u') {
417
// interpret unicode escaped chars (like \u0070 -> p)
418
tkn.content.append((char) unicodeEscapeLexer(c));
419
} else if (isWhitespace(c)) {
420
// gather whitespaces
421
// (as long as they are not at the beginning of a token)
422
if (tkn.content.length() > 0) {
423
wsBuf.append((char) c);
426
// prepend whitespaces (if we have)
427
if (wsBuf.length() > 0) {
428
tkn.content.append(wsBuf);
431
tkn.content.append((char) c);
443
* An encapsulated token lexer
445
* Encapsulated tokens are surrounded by the given encapsulating-string.
446
* The encapsulator itself might be included in the token using a
447
* doubling syntax (as "", '') or using escaping (as in \", \').
448
* Whitespaces before and after an encapsulated token are ignored.
450
* @param tkn the current token
451
* @param c the current character
452
* @return a valid token object
453
* @throws IOException on invalid state
455
private Token encapsulatedTokenLexer(Token tkn, int c) throws IOException {
457
int startLineNumber = getLineNumber();
458
// ignore the given encapsulator (c is the opening encapsulator, not a delimiter)
459
// assert c == encapsulator;
461
while (!tkn.isReady) {
462
boolean skipRead = false;
463
if (c == strategy.getEncapsulator() || c == '\\') {
465
if (in.lookAhead() == strategy.getEncapsulator()) {
466
// double or escaped encapsulator -> add single encapsulator to token
468
tkn.content.append((char) c);
469
} else if (c == '\\' && in.lookAhead() == '\\') {
470
// doubled escape char, it does not escape itself, only encapsulator
471
// -> add both escape chars to stream
472
tkn.content.append((char) c);
474
tkn.content.append((char) c);
476
strategy.getUnicodeEscapeInterpretation()
478
&& in.lookAhead() == 'u') {
479
// interpret unicode escaped chars (like \u0070 -> p)
480
tkn.content.append((char) unicodeEscapeLexer(c));
481
} else if (c == '\\') {
482
// use a single escape character -> add it to stream
483
tkn.content.append((char) c);
485
// token finish mark (encapsulator) reached: ignore whitespace till delimiter
486
while (!tkn.isReady) {
488
if (c == strategy.getDelimiter()) {
491
} else if (isEndOfFile(c)) {
494
} else if (isEndOfLine(c)) {
495
// ok eo token reached
496
tkn.type = TT_EORECORD;
498
} else if (!isWhitespace(c)) {
499
// error invalid char between token and next delimiter
500
throw new IOException(
501
"(line " + getLineNumber()
502
+ ") invalid char between encapsulated token end delimiter"
508
} else if (isEndOfFile(c)) {
509
// error condition (end of file before end of token)
510
throw new IOException(
511
"(startline " + startLineNumber + ")"
512
+ "eof reached before encapsulated token finished"
516
tkn.content.append((char) c);
519
if (!tkn.isReady && !skipRead) {
528
* Decodes Unicode escapes.
530
* Interpretation of "\\uXXXX" escape sequences
531
* where XXXX is a hex-number.
532
* @param c current char which is discarded because it's the "\\" of "\\uXXXX"
533
* @return the decoded character
534
* @throws IOException on wrong unicode escape sequence or read error
536
protected int unicodeEscapeLexer(int c) throws IOException {
538
// ignore 'u' (assume c==\ now) and read 4 hex digits
542
for (int i = 0; i < 4; i++) {
544
if (isEndOfFile(c) || isEndOfLine(c)) {
545
throw new NumberFormatException("number too short");
547
code.append((char) c);
549
ret = Integer.parseInt(code.toString(), 16);
550
} catch (NumberFormatException e) {
551
throw new IOException(
552
"(line " + getLineNumber() + ") Wrong unicode escape sequence found '"
553
+ code.toString() + "'" + e.toString());
558
// ======================================================
560
// ======================================================
563
* Sets the specified CSV Strategy
565
* @return current instance of CSVParser to allow chained method calls
566
* @deprecated the strategy should be set in the constructor {@link #CSVParser(Reader,CSVStrategy)}.
568
public CSVParser setStrategy(CSVStrategy strategy) {
569
this.strategy = strategy;
574
* Obtain the specified CSV Strategy
576
* @return strategy currently being used
578
public CSVStrategy getStrategy() {
579
return this.strategy;
582
// ======================================================
583
// Character class checker
584
// ======================================================
587
* @return true if the given char is a whitespace character other than the delimiter
589
private boolean isWhitespace(int c) {
590
return Character.isWhitespace((char) c) && (c != strategy.getDelimiter());
594
* Greedy - accepts \n and \r\n
595
* This checker silently consumes the second control-character...
597
* @return true if the given character is a line-terminator
599
private boolean isEndOfLine(int c) throws IOException {
600
// check if we have \r\n...
602
if (in.lookAhead() == '\n') {
603
// note: does not change c outside of this method !!
611
* @return true if the given character indicates end of file
613
private boolean isEndOfFile(int c) {
614
return c == ExtendedBufferedReader.END_OF_STREAM;