@@ -65,16 +65,10 @@ public class CSVParser {
6565 /** Token with content when end of a line is reached. */
6666 protected static final int TT_EORECORD = 2 ;
6767
68- // the csv definition
69- private char delimiter ;
70- private char encapsulator ;
71- private char commentStart ;
72- private boolean ignoreLeadingWhitespaces ;
73- private boolean interpretUnicodeEscapes ;
74- private boolean ignoreEmptyLines ;
75-
7668 // the input stream
7769 private ExtendedBufferedReader in ;
70+
71+ private CSVStrategy strategy ;
7872
7973 /**
8074 * Token is an internal token representation.
@@ -106,7 +100,7 @@ class Token {
106100 * @param s CSV String to be parsed.
107101 * @return parsed String matrix (which is never null)
108102 * @throws IOException in case of error
109- * @see #setCSVStrategy ()
103+ * @see #setStrategy(CSVStrategy)
110104 */
111105 public static String [][] parse (String s ) throws IOException {
112106 if (s == null ) {
@@ -130,7 +124,7 @@ public static String[][] parse(String s) throws IOException {
130124 * @param s CSV String to be parsed.
131125 * @return parsed String vector (which is never null)
132126 * @throws IOException in case of error
133- * @see #setCSVStrategy ()
127+ * @see #setStrategy(CSVStrategy)
134128 */
135129 public static String [] parseLine (String s ) throws IOException {
136130 if (s == null ) {
@@ -151,7 +145,7 @@ public static String[] parseLine(String s) throws IOException {
151145 * Default strategy for the parser follows the default CSV Strategy.
152146 *
153147 * @param input an InputStream containing "csv-formatted" stream
154- * @see #setCSVStrategy ()
148+ * @see #setStrategy(CSVStrategy)
155149 */
156150 public CSVParser (InputStream input ) {
157151 this (new InputStreamReader (input ));
@@ -161,7 +155,7 @@ public CSVParser(InputStream input) {
161155 * Default strategy for the parser follows the default CSV Strategy.
162156 *
163157 * @param input a Reader based on "csv-formatted" input
164- * @see #setCSVStrategy ()
158+ * @see #setStrategy(CSVStrategy)
165159 */
166160 public CSVParser (Reader input ) {
167161 // note: must match default-CSV-strategy !!
@@ -172,7 +166,7 @@ public CSVParser(Reader input) {
172166 * Customized value delimiter parser.
173167 *
174168 * The parser follows the default CSV strategy as defined in
175- * {@link #setCSVStrategy ()} except for the delimiter setting.
169+ * {@link #setStrategy(CSVStrategy)} except for the delimiter setting.
176170 *
177171 * @param input a Reader based on "csv-formatted" input
178172 * @param delimiter a Char used for value separation
@@ -193,18 +187,9 @@ public CSVParser(Reader input, char delimiter) {
193187 * @param encapsulator a Char used as value encapsulation marker
194188 * @param commentStart a Char used for comment identification
195189 */
196- public CSVParser (
197- Reader input ,
198- char delimiter ,
199- char encapsulator ,
200- char commentStart ) {
190+ public CSVParser (Reader input , char delimiter , char encapsulator , char commentStart ) {
201191 this .in = new ExtendedBufferedReader (input );
202- this .setDelimiter (delimiter );
203- this .setEncapsulator (encapsulator );
204- this .setCommentStart (commentStart );
205- this .setIgnoreLeadingWhitespaces (true );
206- this .setUnicodeEscapeInterpretation (false );
207- this .setIgnoreEmptyLines (true );
192+ this .strategy = new CSVStrategy (delimiter , encapsulator , commentStart );
208193 }
209194
210195 // ======================================================
@@ -350,7 +335,7 @@ protected Token nextToken() throws IOException {
350335 c = in .readAgain ();
351336
352337 // empty line detection: eol AND (last char was EOL or beginning)
353- while (ignoreEmptyLines && eol
338+ while (strategy . getIgnoreEmptyLines () && eol
354339 && (lastChar == '\n'
355340 || lastChar == ExtendedBufferedReader .UNDEFINED )
356341 && !isEndOfFile (lastChar )) {
@@ -367,7 +352,7 @@ protected Token nextToken() throws IOException {
367352 }
368353
369354 // did we reach eof during the last iteration already ? TT_EOF
370- if (isEndOfFile (lastChar ) || (lastChar != delimiter && isEndOfFile (c ))) {
355+ if (isEndOfFile (lastChar ) || (lastChar != strategy . getDelimiter () && isEndOfFile (c ))) {
371356 tkn .type = TT_EOF ;
372357 return tkn ;
373358 }
@@ -381,11 +366,11 @@ protected Token nextToken() throws IOException {
381366 eol = isEndOfLine (c );
382367 }
383368 // ok, start of token reached: comment, encapsulated, or token
384- if (c == commentStart ) {
369+ if (c == strategy . getCommentStart () ) {
385370 // ignore everything till end of line and continue (incr linecount)
386371 in .readLine ();
387372 tkn = nextToken ();
388- } else if (c == delimiter ) {
373+ } else if (c == strategy . getDelimiter () ) {
389374 // empty token return TT_TOKEN("")
390375 tkn .type = TT_TOKEN ;
391376 tkn .isReady = true ;
@@ -394,7 +379,7 @@ protected Token nextToken() throws IOException {
394379 tkn .content .append ("" );
395380 tkn .type = TT_EORECORD ;
396381 tkn .isReady = true ;
397- } else if (c == encapsulator ) {
382+ } else if (c == strategy . getEncapsulator () ) {
398383 // consume encapsulated token
399384 encapsulatedTokenLexer (tkn , c );
400385 } else if (isEndOfFile (c )) {
@@ -405,7 +390,7 @@ protected Token nextToken() throws IOException {
405390 } else {
406391 // next token must be a simple token
407392 // add removed blanks when not ignoring whitespace chars...
408- if (!this . ignoreLeadingWhitespaces ) {
393+ if (!strategy . getIgnoreLeadingWhitespaces () ) {
409394 tkn .content .append (wsBuf .toString ());
410395 }
411396 simpleTokenLexer (tkn , c );
@@ -443,11 +428,11 @@ private Token simpleTokenLexer(Token tkn, int c) throws IOException {
443428 // end of file
444429 tkn .type = TT_EOF ;
445430 tkn .isReady = true ;
446- } else if (c == delimiter ) {
431+ } else if (c == strategy . getDelimiter () ) {
447432 // end of token
448433 tkn .type = TT_TOKEN ;
449434 tkn .isReady = true ;
450- } else if (c == '\\' && interpretUnicodeEscapes && in .lookAhead () == 'u' ) {
435+ } else if (c == '\\' && strategy . getUnicodeEscapeInterpretation () && in .lookAhead () == 'u' ) {
451436 // interpret unicode escaped chars (like \u0070 -> p)
452437 tkn .content .append ((char ) unicodeEscapeLexer (c ));
453438 } else if (isWhitespace (c )) {
@@ -493,9 +478,9 @@ private Token encapsulatedTokenLexer(Token tkn, int c) throws IOException {
493478 // assert c == encapsulator;
494479 c = in .read ();
495480 while (!tkn .isReady ) {
496- if (c == encapsulator || c == '\\' ) {
481+ if (c == strategy . getEncapsulator () || c == '\\' ) {
497482 // check lookahead
498- if (in .lookAhead () == encapsulator ) {
483+ if (in .lookAhead () == strategy . getEncapsulator () ) {
499484 // double or escaped encapsulator -> add single encapsulator to token
500485 c = in .read ();
501486 tkn .content .append ((char ) c );
@@ -506,7 +491,7 @@ private Token encapsulatedTokenLexer(Token tkn, int c) throws IOException {
506491 c = in .read ();
507492 tkn .content .append ((char ) c );
508493 } else if (
509- interpretUnicodeEscapes
494+ strategy . getUnicodeEscapeInterpretation ()
510495 && c == '\\'
511496 && in .lookAhead () == 'u' ) {
512497 // interpret unicode escaped chars (like \u0070 -> p)
@@ -518,7 +503,7 @@ private Token encapsulatedTokenLexer(Token tkn, int c) throws IOException {
518503 // token finish mark (encapsulator) reached: ignore whitespace till delimiter
519504 while (!tkn .isReady ) {
520505 int n = in .lookAhead ();
521- if (n == delimiter ) {
506+ if (n == strategy . getDelimiter () ) {
522507 tkn .type = TT_TOKEN ;
523508 tkn .isReady = true ;
524509 } else if (isEndOfFile (n )) {
@@ -589,201 +574,26 @@ protected int unicodeEscapeLexer(int c) throws IOException {
589574 }
590575
591576 // ======================================================
592- // strategy utilities
577+ // strategies
593578 // ======================================================
594579
595580 /**
596- * Sets the "Default CSV" settings.
597- *
598- * The default csv settings are relatively restrictive but implement
599- * something like the "least-common-basis" of CSV:
600- * <ul>
601- * <li> Delimiter of values is comma ',' (as the C in "CSV") </li>
602- * <li> Complex values encapsulated by '"' </li>
603- * <li> Comments are not supported </li>
604- * <li> Leading whitespaces are ignored </li>
605- * <li> Unicode escapes are not interpreted </li>
606- * <li> empty lines are skiped </li>
607- * </ul>
608- * @return current instance of CSVParser to allow chained method calls
609- */
610- public CSVParser setCSVStrategy () {
611- setStrategy (',' , '"' , (char ) 0 , true , false , true );
612- return this ;
613- }
614-
615- /**
616- * Sets the "Excel CSV" settings. There are companies out there which
617- * interpret "C" as an abbreviation for "Semicolon". For these companies the
618- * following settings might be appropriate:
619- * <ul>
620- * <li> Delimiter of values is semicolon ';' </li>
621- * <li> Complex values encapsulated by '"' </li>
622- * <li> Comments are not supported </li>
623- * <li> Leading whitespaces are not ignored </li>
624- * <li> Unicode escapes are not interpreted </li>
625- * <li> empty lines are not skiped </li>
626- * </ul>
627- *
628- * @return current instance of CSVParser to allow chained method calls
629- */
630- public CSVParser setExcelStrategy () {
631- setStrategy (';' , '"' , (char ) 0 , false , false , false );
632- return this ;
633- }
634-
635- /**
636- * Customized CSV strategy setter.
637- *
638- * @param delimiter a Char used for value separation
639- * @param encapsulator a Char used as value encapsulation marker
640- * @param commentStart a Char used for comment identification
641- * @param ignoreLeadingWhitespace TRUE when leading whitespaces should be
642- * ignored
643- * @param interpretUnicodeEscapes TRUE when unicode escapes should be
644- * interpreted
645- * @param ignoreEmptyLines TRUE when the parser should skip emtpy lines
646- * @return current instance of CSVParser to allow chained method calls
647- */
648- public CSVParser setStrategy (
649- char delimiter ,
650- char encapsulator ,
651- char commentStart ,
652- boolean ignoreLeadingWhitespace ,
653- boolean interpretUnicodeEscapes ,
654- boolean ignoreEmptyLines ) {
655- this .setDelimiter (delimiter );
656- this .setEncapsulator (encapsulator );
657- this .setCommentStart (commentStart );
658- this .setIgnoreLeadingWhitespaces (ignoreLeadingWhitespace );
659- this .setUnicodeEscapeInterpretation (interpretUnicodeEscapes );
660- this .setIgnoreEmptyLines (ignoreEmptyLines );
661- return this ;
662- }
663-
664- /**
665- * Set the desired delimiter.
581+ * Sets the specified CSV Strategy
666582 *
667- * @param c a Char used for value separation
668- * @return current instance of CSVParser to allow chained method calls
669- */
670- public CSVParser setDelimiter (char c ) {
671- this .delimiter = c ;
672- return this ;
673- }
674-
675- /**
676- * Gets the delimiter.
677- *
678- * @return the delimiter character
679- */
680- public char getDelimiter () {
681- return this .delimiter ;
682- }
683-
684- /**
685- * Set the desired encapsulator.
686- *
687- * @param c a Char used as value encapsulation marker
688- * @return current instance of CSVParser to allow chained method calls
689- */
690- public CSVParser setEncapsulator (char c ) {
691- this .encapsulator = c ;
692- return this ;
693- }
694-
695- /**
696- * Gets the encapsulator character.
697- *
698- * @return the encapsulator marker
699- */
700- public char getEncapsulator () {
701- return this .encapsulator ;
702- }
703-
704- /**
705- * Set the desired comment start character.
706- *
707- * @param c a Char used for comment identification
708- * @return current instance of CSVParser to allow chained method calls
709- */
710- public CSVParser setCommentStart (char c ) {
711- this .commentStart = c ;
712- return this ;
713- }
714-
715- /**
716- * Gets the comment identifier.
717- *
718- * @return the comment identifier character
719- */
720- public char getCommentStart () {
721- return this .commentStart ;
722- }
723-
724- /**
725- * Enables unicode escape interpretation.
726- *
727- * @param b TRUE when interpretation should be enabled
728- * @return current instance of CSVParser to allow chained method calls
729- */
730- public CSVParser setUnicodeEscapeInterpretation (boolean b ) {
731- this .interpretUnicodeEscapes = b ;
732- return this ;
733- }
734-
735- /**
736- * Shows wether unicode interpretation is enabled.
737- *
738- * @return TRUE when unicode interpretation is enabled
739- */
740- public boolean getUnicodeEscapeInterpretation () {
741- return this .interpretUnicodeEscapes ;
742- }
743-
744- /**
745- * Sets the ignore-leading-whitespaces behaviour.
746- *
747- * Should the lexer ignore leading whitespaces when parsing non
748- * encapsulated tokens.
749- *
750- * @param b TRUE when leading whitespaces should be ignored
751- * @return current instance of CSVParser to allow chained method calls
752- */
753- public CSVParser setIgnoreLeadingWhitespaces (boolean b ) {
754- this .ignoreLeadingWhitespaces = b ;
755- return this ;
756- }
757-
758- /**
759- * Shows whether unicode interpretation is enabled.
760- *
761- * @return TRUE when unicode interpretation is enabled
762- */
763- public boolean getIgnoreLeadingWhitespaces () {
764- return this .ignoreLeadingWhitespaces ;
765- }
766-
767- /**
768- * Sets the ignore-empty-line behaviour.
769- *
770- * When set to 'true' empty lines in the input will be ignored.
771- *
772- * @param b TRUE when empty lines in the input should be ignored
773583 * @return current instance of CSVParser to allow chained method calls
774584 */
775- public CSVParser setIgnoreEmptyLines ( boolean b ) {
776- this .ignoreEmptyLines = b ;
585+ public CSVParser setStrategy ( CSVStrategy strategy ) {
586+ this .strategy = strategy ;
777587 return this ;
778588 }
779589
780590 /**
781- * Shows whether empty lines in the input are ignored.
591+ * Returns the CSV strategy currently used by this parser.
782592 *
783- * @return TRUE when empty lines in the input are ignored
593+ * @return strategy currently being used
784594 */
785- public boolean getIgnoreEmptyLines () {
786- return this .ignoreEmptyLines ;
595+ public CSVStrategy getStrategy () {
596+ return this .strategy ;
787597 }
788598
789599 // ======================================================
0 commit comments