Skip to content

Commit eac54a2

Browse files
committed
Extracted the strategy concept into its own class
git-svn-id: https://svn.apache.org/repos/asf/jakarta/commons/sandbox/csv/trunk@399987 13f79535-47bb-0310-9956-ffa450edef68
1 parent f047581 commit eac54a2

4 files changed

Lines changed: 249 additions & 298 deletions

File tree

src/java/org/apache/commons/csv/CSVParser.java

Lines changed: 29 additions & 219 deletions
Original file line numberDiff line numberDiff line change
@@ -65,16 +65,10 @@ public class CSVParser {
6565
/** Token with content when end of a line is reached. */
6666
protected static final int TT_EORECORD = 2;
6767

68-
// the csv definition
69-
private char delimiter;
70-
private char encapsulator;
71-
private char commentStart;
72-
private boolean ignoreLeadingWhitespaces;
73-
private boolean interpretUnicodeEscapes;
74-
private boolean ignoreEmptyLines;
75-
7668
// the input stream
7769
private ExtendedBufferedReader in;
70+
71+
private CSVStrategy strategy;
7872

7973
/**
8074
* Token is an internal token representation.
@@ -106,7 +100,7 @@ class Token {
106100
* @param s CSV String to be parsed.
107101
* @return parsed String matrix (which is never null)
108102
* @throws IOException in case of error
109-
* @see #setCSVStrategy()
103+
* @see #setStrategy()
110104
*/
111105
public static String[][] parse(String s) throws IOException {
112106
if (s == null) {
@@ -130,7 +124,7 @@ public static String[][] parse(String s) throws IOException {
130124
* @param s CSV String to be parsed.
131125
* @return parsed String vector (which is never null)
132126
* @throws IOException in case of error
133-
* @see #setCSVStrategy()
127+
* @see #setStrategy()
134128
*/
135129
public static String[] parseLine(String s) throws IOException {
136130
if (s == null) {
@@ -151,7 +145,7 @@ public static String[] parseLine(String s) throws IOException {
151145
* Default strategy for the parser follows the default CSV Strategy.
152146
*
153147
* @param input an InputStream containing "csv-formatted" stream
154-
* @see #setCSVStrategy()
148+
* @see #setStrategy()
155149
*/
156150
public CSVParser(InputStream input) {
157151
this(new InputStreamReader(input));
@@ -161,7 +155,7 @@ public CSVParser(InputStream input) {
161155
* Default strategy for the parser follows the default CSV Strategy.
162156
*
163157
* @param input a Reader based on "csv-formatted" input
164-
* @see #setCSVStrategy()
158+
* @see #setStrategy()
165159
*/
166160
public CSVParser(Reader input) {
167161
// note: must match default-CSV-strategy !!
@@ -172,7 +166,7 @@ public CSVParser(Reader input) {
172166
* Customized value delimiter parser.
173167
*
174168
* The parser follows the default CSV strategy as defined in
175-
* {@link #setCSVStrategy()} except for the delimiter setting.
169+
* {@link #setStrategy()} except for the delimiter setting.
176170
*
177171
* @param input a Reader based on "csv-formatted" input
178172
* @param delimiter a Char used for value separation
@@ -193,18 +187,9 @@ public CSVParser(Reader input, char delimiter) {
193187
* @param encapsulator a Char used as value encapsulation marker
194188
* @param commentStart a Char used for comment identification
195189
*/
196-
public CSVParser(
197-
Reader input,
198-
char delimiter,
199-
char encapsulator,
200-
char commentStart) {
190+
public CSVParser(Reader input, char delimiter, char encapsulator, char commentStart) {
201191
this.in = new ExtendedBufferedReader(input);
202-
this.setDelimiter(delimiter);
203-
this.setEncapsulator(encapsulator);
204-
this.setCommentStart(commentStart);
205-
this.setIgnoreLeadingWhitespaces(true);
206-
this.setUnicodeEscapeInterpretation(false);
207-
this.setIgnoreEmptyLines(true);
192+
this.strategy = new CSVStrategy(delimiter, encapsulator, commentStart);
208193
}
209194

210195
// ======================================================
@@ -350,7 +335,7 @@ protected Token nextToken() throws IOException {
350335
c = in.readAgain();
351336

352337
// empty line detection: eol AND (last char was EOL or beginning)
353-
while (ignoreEmptyLines && eol
338+
while (strategy.getIgnoreEmptyLines() && eol
354339
&& (lastChar == '\n'
355340
|| lastChar == ExtendedBufferedReader.UNDEFINED)
356341
&& !isEndOfFile(lastChar)) {
@@ -367,7 +352,7 @@ protected Token nextToken() throws IOException {
367352
}
368353

369354
// did we reach eof during the last iteration already? TT_EOF
370-
if (isEndOfFile(lastChar) || (lastChar != delimiter && isEndOfFile(c))) {
355+
if (isEndOfFile(lastChar) || (lastChar != strategy.getDelimiter() && isEndOfFile(c))) {
371356
tkn.type = TT_EOF;
372357
return tkn;
373358
}
@@ -381,11 +366,11 @@ protected Token nextToken() throws IOException {
381366
eol = isEndOfLine(c);
382367
}
383368
// ok, start of token reached: comment, encapsulated, or token
384-
if (c == commentStart) {
369+
if (c == strategy.getCommentStart()) {
385370
// ignore everything till end of line and continue (incr linecount)
386371
in.readLine();
387372
tkn = nextToken();
388-
} else if (c == delimiter) {
373+
} else if (c == strategy.getDelimiter()) {
389374
// empty token return TT_TOKEN("")
390375
tkn.type = TT_TOKEN;
391376
tkn.isReady = true;
@@ -394,7 +379,7 @@ protected Token nextToken() throws IOException {
394379
tkn.content.append("");
395380
tkn.type = TT_EORECORD;
396381
tkn.isReady = true;
397-
} else if (c == encapsulator) {
382+
} else if (c == strategy.getEncapsulator()) {
398383
// consume encapsulated token
399384
encapsulatedTokenLexer(tkn, c);
400385
} else if (isEndOfFile(c)) {
@@ -405,7 +390,7 @@ protected Token nextToken() throws IOException {
405390
} else {
406391
// next token must be a simple token
407392
// add removed blanks when not ignoring whitespace chars...
408-
if (!this.ignoreLeadingWhitespaces) {
393+
if (!strategy.getIgnoreLeadingWhitespaces()) {
409394
tkn.content.append(wsBuf.toString());
410395
}
411396
simpleTokenLexer(tkn, c);
@@ -443,11 +428,11 @@ private Token simpleTokenLexer(Token tkn, int c) throws IOException {
443428
// end of file
444429
tkn.type = TT_EOF;
445430
tkn.isReady = true;
446-
} else if (c == delimiter) {
431+
} else if (c == strategy.getDelimiter()) {
447432
// end of token
448433
tkn.type = TT_TOKEN;
449434
tkn.isReady = true;
450-
} else if (c == '\\' && interpretUnicodeEscapes && in.lookAhead() == 'u') {
435+
} else if (c == '\\' && strategy.getUnicodeEscapeInterpretation() && in.lookAhead() == 'u') {
451436
// interpret unicode escaped chars (like \u0070 -> p)
452437
tkn.content.append((char) unicodeEscapeLexer(c));
453438
} else if (isWhitespace(c)) {
@@ -493,9 +478,9 @@ private Token encapsulatedTokenLexer(Token tkn, int c) throws IOException {
493478
// assert c == delimiter;
494479
c = in.read();
495480
while (!tkn.isReady) {
496-
if (c == encapsulator || c == '\\') {
481+
if (c == strategy.getEncapsulator() || c == '\\') {
497482
// check lookahead
498-
if (in.lookAhead() == encapsulator) {
483+
if (in.lookAhead() == strategy.getEncapsulator()) {
499484
// double or escaped encapsulator -> add single encapsulator to token
500485
c = in.read();
501486
tkn.content.append((char) c);
@@ -506,7 +491,7 @@ private Token encapsulatedTokenLexer(Token tkn, int c) throws IOException {
506491
c = in.read();
507492
tkn.content.append((char) c);
508493
} else if (
509-
interpretUnicodeEscapes
494+
strategy.getUnicodeEscapeInterpretation()
510495
&& c == '\\'
511496
&& in.lookAhead() == 'u') {
512497
// interpret unicode escaped chars (like \u0070 -> p)
@@ -518,7 +503,7 @@ private Token encapsulatedTokenLexer(Token tkn, int c) throws IOException {
518503
// token finish mark (encapsulator) reached: ignore whitespace till delimiter
519504
while (!tkn.isReady) {
520505
int n = in.lookAhead();
521-
if (n == delimiter) {
506+
if (n == strategy.getDelimiter()) {
522507
tkn.type = TT_TOKEN;
523508
tkn.isReady = true;
524509
} else if (isEndOfFile(n)) {
@@ -589,201 +574,26 @@ protected int unicodeEscapeLexer(int c) throws IOException {
589574
}
590575

591576
// ======================================================
592-
// strategy utilities
577+
// strategies
593578
// ======================================================
594579

595580
/**
596-
* Sets the "Default CSV" settings.
597-
*
598-
* The default csv settings are relatively restrictive but implement
599-
* something like the "least-common-basis" of CSV:
600-
* <ul>
601-
* <li> Delimiter of values is comma ',' (as the C in "CSV") </li>
602-
* <li> Complex values encapsulated by '"' </li>
603-
* <li> Comments are not supported </li>
604-
* <li> Leading whitespaces are ignored </li>
605-
* <li> Unicode escapes are not interpreted </li>
606-
* <li> empty lines are skipped </li>
607-
* </ul>
608-
* @return current instance of CSVParser to allow chained method calls
609-
*/
610-
public CSVParser setCSVStrategy() {
611-
setStrategy(',', '"', (char) 0, true, false, true);
612-
return this;
613-
}
614-
615-
/**
616-
* Sets the "Excel CSV" settings. There are companies out there which
617-
* interpret "C" as an abbreviation for "Semicolon". For these companies the
618-
* following settings might be appropriate:
619-
* <ul>
620-
* <li> Delimiter of values is semicolon ';' </li>
621-
* <li> Complex values encapsulated by '"' </li>
622-
* <li> Comments are not supported </li>
623-
* <li> Leading whitespaces are not ignored </li>
624-
* <li> Unicode escapes are not interpreted </li>
625-
* <li> empty lines are not skipped </li>
626-
* </ul>
627-
*
628-
* @return current instance of CSVParser to allow chained method calls
629-
*/
630-
public CSVParser setExcelStrategy() {
631-
setStrategy(';', '"', (char) 0, false, false, false);
632-
return this;
633-
}
634-
635-
/**
636-
* Customized CSV strategy setter.
637-
*
638-
* @param delimiter a Char used for value separation
639-
* @param encapsulator a Char used as value encapsulation marker
640-
* @param commentStart a Char used for comment identification
641-
* @param ignoreLeadingWhitespace TRUE when leading whitespaces should be
642-
* ignored
643-
* @param interpretUnicodeEscapes TRUE when unicode escapes should be
644-
* interpreted
645-
* @param ignoreEmptyLines TRUE when the parser should skip empty lines
646-
* @return current instance of CSVParser to allow chained method calls
647-
*/
648-
public CSVParser setStrategy(
649-
char delimiter,
650-
char encapsulator,
651-
char commentStart,
652-
boolean ignoreLeadingWhitespace,
653-
boolean interpretUnicodeEscapes,
654-
boolean ignoreEmptyLines) {
655-
this.setDelimiter(delimiter);
656-
this.setEncapsulator(encapsulator);
657-
this.setCommentStart(commentStart);
658-
this.setIgnoreLeadingWhitespaces(ignoreLeadingWhitespace);
659-
this.setUnicodeEscapeInterpretation(interpretUnicodeEscapes);
660-
this.setIgnoreEmptyLines(ignoreEmptyLines);
661-
return this;
662-
}
663-
664-
/**
665-
* Set the desired delimiter.
581+
* Sets the specified CSV Strategy
666582
*
667-
* @param c a Char used for value separation
668-
* @return current instance of CSVParser to allow chained method calls
669-
*/
670-
public CSVParser setDelimiter(char c) {
671-
this.delimiter = c;
672-
return this;
673-
}
674-
675-
/**
676-
* Gets the delimiter.
677-
*
678-
* @return the delimiter character
679-
*/
680-
public char getDelimiter() {
681-
return this.delimiter;
682-
}
683-
684-
/**
685-
* Set the desired encapsulator.
686-
*
687-
* @param c a Char used as value encapsulation marker
688-
* @return current instance of CSVParser to allow chained method calls
689-
*/
690-
public CSVParser setEncapsulator(char c) {
691-
this.encapsulator = c;
692-
return this;
693-
}
694-
695-
/**
696-
* Gets the encapsulator character.
697-
*
698-
* @return the encapsulator marker
699-
*/
700-
public char getEncapsulator() {
701-
return this.encapsulator;
702-
}
703-
704-
/**
705-
* Set the desired comment start character.
706-
*
707-
* @param c a Char used for comment identification
708-
* @return current instance of CSVParser to allow chained method calls
709-
*/
710-
public CSVParser setCommentStart(char c) {
711-
this.commentStart = c;
712-
return this;
713-
}
714-
715-
/**
716-
* Gets the comment identifier.
717-
*
718-
* @return the comment identifier character
719-
*/
720-
public char getCommentStart() {
721-
return this.commentStart;
722-
}
723-
724-
/**
725-
* Enables unicode escape interpretation.
726-
*
727-
* @param b TRUE when interpretation should be enabled
728-
* @return current instance of CSVParser to allow chained method calls
729-
*/
730-
public CSVParser setUnicodeEscapeInterpretation(boolean b) {
731-
this.interpretUnicodeEscapes = b;
732-
return this;
733-
}
734-
735-
/**
736-
* Shows whether unicode interpretation is enabled.
737-
*
738-
* @return TRUE when unicode interpretation is enabled
739-
*/
740-
public boolean getUnicodeEscapeInterpretation() {
741-
return this.interpretUnicodeEscapes;
742-
}
743-
744-
/**
745-
* Sets the ignore-leading-whitespaces behaviour.
746-
*
747-
* Should the lexer ignore leading whitespaces when parsing non
748-
* encapsulated tokens.
749-
*
750-
* @param b TRUE when leading whitespaces should be ignored
751-
* @return current instance of CSVParser to allow chained method calls
752-
*/
753-
public CSVParser setIgnoreLeadingWhitespaces(boolean b) {
754-
this.ignoreLeadingWhitespaces = b;
755-
return this;
756-
}
757-
758-
/**
759-
* Shows whether leading whitespaces are ignored.
760-
*
761-
* @return TRUE when leading whitespaces are ignored
762-
*/
763-
public boolean getIgnoreLeadingWhitespaces() {
764-
return this.ignoreLeadingWhitespaces;
765-
}
766-
767-
/**
768-
* Sets the ignore-empty-line behaviour.
769-
*
770-
* When set to 'true' empty lines in the input will be ignored.
771-
*
772-
* @param b TRUE when empty lines in the input should be ignored
773583
* @return current instance of CSVParser to allow chained method calls
774584
*/
775-
public CSVParser setIgnoreEmptyLines(boolean b) {
776-
this.ignoreEmptyLines = b;
585+
public CSVParser setStrategy(CSVStrategy strategy) {
586+
this.strategy = strategy;
777587
return this;
778588
}
779589

780590
/**
781-
* Shows whether empty lines in the input are ignored.
591+
* Obtain the specified CSV Strategy
782592
*
783-
* @return TRUE when empty lines in the input are ignored
593+
* @return strategy currently being used
784594
*/
785-
public boolean getIgnoreEmptyLines() {
786-
return this.ignoreEmptyLines;
595+
public CSVStrategy getStrategy() {
596+
return this.strategy;
787597
}
788598

789599
// ======================================================

0 commit comments

Comments
 (0)