@@ -76,8 +76,6 @@ public class CSVParser implements Iterable<String[]> {
7676 private final List <String > record = new ArrayList <String >();
7777 private final Token reusableToken = new Token ();
7878 private final CharBuffer wsBuf = new CharBuffer ();
79- private final CharBuffer code = new CharBuffer (4 );
80-
8179
8280 /**
8381 * Token is an internal token representation.
@@ -137,6 +135,10 @@ public CSVParser(Reader input) {
137135 * @param format the CSVFormat used for CSV parsing
138136 */
139137 public CSVParser (Reader input , CSVFormat format ) {
138+ if (format .isUnicodeEscapesInterpreted ()) {
139+ input = new UnicodeUnescapeReader (input );
140+ }
141+
140142 this .in = new ExtendedBufferedReader (input );
141143 this .format = format ;
142144 }
@@ -404,9 +406,6 @@ private Token simpleTokenLexer(Token tkn, int c) throws IOException {
404406 tkn .type = TOKEN ;
405407 tkn .isReady = true ;
406408 break ;
407- } else if (c == '\\' && format .isUnicodeEscapesInterpreted () && in .lookAhead () == 'u' ) {
408- // interpret unicode escaped chars (like \u0070 -> p)
409- tkn .content .append ((char ) unicodeEscapeLexer (c ));
410409 } else if (c == format .getEscape ()) {
411410 tkn .content .append ((char ) readEscape (c ));
412411 } else {
@@ -444,10 +443,8 @@ private Token encapsulatedTokenLexer(Token tkn, int c) throws IOException {
444443 // assert c == delimiter;
445444 for (; ;) {
446445 c = in .read ();
447-
448- if (c == '\\' && format .isUnicodeEscapesInterpreted () && in .lookAhead () == 'u' ) {
449- tkn .content .append ((char ) unicodeEscapeLexer (c ));
450- } else if (c == format .getEscape ()) {
446+
447+ if (c == format .getEscape ()) {
451448 tkn .content .append ((char ) readEscape (c ));
452449 } else if (c == format .getEncapsulator ()) {
453450 if (in .lookAhead () == format .getEncapsulator ()) {
@@ -487,62 +484,23 @@ private Token encapsulatedTokenLexer(Token tkn, int c) throws IOException {
487484 }
488485 }
489486
490-
491- /**
492- * Decodes Unicode escapes.
493- * <p/>
494- * Interpretation of "\\uXXXX" escape sequences where XXXX is a hex-number.
495- *
496- * @param c current char which is discarded because it's the "\\" of "\\uXXXX"
497- * @return the decoded character
498- * @throws IOException on wrong unicode escape sequence or read error
499- */
500- private int unicodeEscapeLexer (int c ) throws IOException {
501- int ret = 0 ;
502- // ignore 'u' (assume c==\ now) and read 4 hex digits
503- c = in .read ();
504- code .clear ();
505- try {
506- for (int i = 0 ; i < 4 ; i ++) {
507- c = in .read ();
508- if (isEndOfFile (c ) || isEndOfLine (c )) {
509- throw new NumberFormatException ("number too short" );
510- }
511- code .append ((char ) c );
512- }
513- ret = Integer .parseInt (code .toString (), 16 );
514- } catch (NumberFormatException e ) {
515- throw new IOException (
516- "(line " + getLineNumber () + ") Wrong unicode escape sequence found '"
517- + code .toString () + "'" + e .toString ());
518- }
519- return ret ;
520- }
521-
522487 private int readEscape (int c ) throws IOException {
523488 // assume c is the escape char (normally a backslash)
524489 c = in .read ();
525- int out ;
526490 switch (c ) {
527491 case 'r' :
528- out = '\r' ;
529- break ;
492+ return '\r' ;
530493 case 'n' :
531- out = '\n' ;
532- break ;
494+ return '\n' ;
533495 case 't' :
534- out = '\t' ;
535- break ;
496+ return '\t' ;
536497 case 'b' :
537- out = '\b' ;
538- break ;
498+ return '\b' ;
539499 case 'f' :
540- out = '\f' ;
541- break ;
500+ return '\f' ;
542501 default :
543- out = c ;
502+ return c ;
544503 }
545- return out ;
546504 }
547505
548506 /**
0 commit comments