SANDBOX-206: add escape to strategy, turn off backslash-style escaping by default

yonik · yonik · commit b55fb21d78e3 · 2008-01-05T15:37:26.000Z
git-svn-id: https://svn.apache.org/repos/asf/commons/sandbox/csv/trunk@609155 13f79535-47bb-0310-9956-ffa450edef68
diff --git a/src/java/org/apache/commons/csv/CSVParser.java b/src/java/org/apache/commons/csv/CSVParser.java
@@ -134,7 +134,7 @@ public CSVParser(Reader input) {
    * @deprecated use {@link #CSVParser(Reader,CSVStrategy)}.
    */
   public CSVParser(Reader input, char delimiter) {
-    this(input, delimiter, '"', (char) 0);
+    this(input, delimiter, '"', CSVStrategy.COMMENTS_DISABLED);
   }
   
   /**
@@ -347,7 +347,7 @@ protected Token nextToken(Token tkn) throws IOException {
         eol = isEndOfLine(c);
       }
       // ok, start of token reached: comment, encapsulated, or token
-      if (!strategy.isCommentingDisabled() && c == strategy.getCommentStart()) {
+      if (c == strategy.getCommentStart()) {
         // ignore everything till end of line and continue (incr linecount)
         in.readLine();
         tkn = nextToken(tkn.reset());
@@ -400,19 +400,22 @@ protected Token nextToken(Token tkn) throws IOException {
    */
   private Token simpleTokenLexer(Token tkn, int c) throws IOException {
     wsBuf.clear();
-    while (!tkn.isReady) {
+    for (;;) {
       if (isEndOfLine(c)) {
         // end of record
         tkn.type = TT_EORECORD;
         tkn.isReady = true;
+        return tkn;
       } else if (isEndOfFile(c)) {
         // end of file
         tkn.type = TT_EOF;
         tkn.isReady = true;
+        return tkn;
       } else if (c == strategy.getDelimiter()) {
         // end of token
         tkn.type = TT_TOKEN;
         tkn.isReady = true;
+        return tkn;
       } else if (c == '\\' && strategy.getUnicodeEscapeInterpretation() && in.lookAhead() == 'u') {
         // interpret unicode escaped chars (like \u0070 -> p)
         tkn.content.append((char) unicodeEscapeLexer(c));
@@ -422,6 +425,8 @@ private Token simpleTokenLexer(Token tkn, int c) throws IOException {
         if (tkn.content.length() > 0) {
           wsBuf.append((char) c);
         }
+      } else if (c == strategy.getEscape()) {
+        tkn.content.append((char)readEscape(c));
       } else {
         // prepend whitespaces (if we have)
         if (wsBuf.length() > 0) {
@@ -435,7 +440,6 @@ private Token simpleTokenLexer(Token tkn, int c) throws IOException {
         c = in.read();
       }
     }
-    return tkn;
   }
   
   
@@ -457,70 +461,55 @@ private Token encapsulatedTokenLexer(Token tkn, int c) throws IOException {
     int startLineNumber = getLineNumber();
     // ignore the given delimiter
     // assert c == delimiter;
-    c = in.read();
-    while (!tkn.isReady) {
-      boolean skipRead = false;
-      if (c == strategy.getEncapsulator() || c == '\\') {
-        // check lookahead
+    for (;;) {
+      c = in.read();
+
+      if (c == '\\' && strategy.getUnicodeEscapeInterpretation() && in.lookAhead()=='u') {
+        tkn.content.append((char) unicodeEscapeLexer(c));
+      } else if (c == strategy.getEscape()) {
+        tkn.content.append((char)readEscape(c));
+      } else if (c == strategy.getEncapsulator()) {
         if (in.lookAhead() == strategy.getEncapsulator()) {
           // double or escaped encapsulator -> add single encapsulator to token
           c = in.read();
           tkn.content.append((char) c);
-        } else if (c == '\\' && in.lookAhead() == '\\') {
-          // doubled escape char, it does not escape itself, only encapsulator 
-          // -> add both escape chars to stream
-          tkn.content.append((char) c);
-          c = in.read();
-          tkn.content.append((char) c);
-        } else if (
-          strategy.getUnicodeEscapeInterpretation()
-          && c == '\\' 
-          && in.lookAhead() == 'u') {
-          // interpret unicode escaped chars (like \u0070 -> p)
-          tkn.content.append((char) unicodeEscapeLexer(c));
-        } else if (c == '\\') {
-          // use a single escape character -> add it to stream
-          tkn.content.append((char) c);
         } else {
           // token finish mark (encapsulator) reached: ignore whitespace till delimiter
-          while (!tkn.isReady) {
+          for (;;) {
             c = in.read();
             if (c == strategy.getDelimiter()) {
               tkn.type = TT_TOKEN;
               tkn.isReady = true;
+              return tkn;
             } else if (isEndOfFile(c)) {
               tkn.type = TT_EOF;
               tkn.isReady = true;
+              return tkn;
             } else if (isEndOfLine(c)) {
               // ok eo token reached
               tkn.type = TT_EORECORD;
               tkn.isReady = true;
+              return tkn;
             } else if (!isWhitespace(c)) {
-                // error invalid char between token and next delimiter
-                throw new IOException(
-                  "(line " + getLineNumber() 
-                  + ") invalid char between encapsulated token end delimiter"
-                );
-              }
+              // error invalid char between token and next delimiter
+              throw new IOException(
+                      "(line " + getLineNumber()
+                              + ") invalid char between encapsulated token end delimiter"
+              );
+            }
           }
-          skipRead = true;
         }
       } else if (isEndOfFile(c)) {
         // error condition (end of file before end of token)
         throw new IOException(
-          "(startline " + startLineNumber + ")"
-          + "eof reached before encapsulated token finished"
-          );
+                "(startline " + startLineNumber + ")"
+                        + "eof reached before encapsulated token finished"
+        );
       } else {
         // consume character
         tkn.content.append((char) c);
       }
-      // get the next char
-      if (!tkn.isReady && !skipRead) {
-        c = in.read();
-      }
     }
-    return tkn;
   }
   
   
@@ -554,6 +543,21 @@ protected int unicodeEscapeLexer(int c) throws IOException {
     }
     return ret;
   }
+
+  private int readEscape(int c) throws IOException {
+    c = in.read(); // on entry c is the escape char (normally a backslash); fetch the char it escapes
+    if (isEndOfFile(c)) { // dangling escape at end of input: fail rather than hand back the EOF marker as a char
+      throw new IOException("(line " + getLineNumber() + ") eof reached before escape sequence finished");
+    }
+    switch (c) { // r, n, t, b, f denote the usual control chars
+      case 'r': return '\r';
+      case 'n': return '\n';
+      case 't': return '\t';
+      case 'b': return '\b';
+      case 'f': return '\f';
+      default : return c; // any other escaped char stands for itself
+    }
+  }
   
   // ======================================================
   //  strategies
diff --git a/src/java/org/apache/commons/csv/CSVStrategy.java b/src/java/org/apache/commons/csv/CSVStrategy.java
@@ -28,15 +28,21 @@ public class CSVStrategy implements Cloneable, Serializable {
     private char delimiter;
     private char encapsulator;
     private char commentStart;
+    private char escape;
     private boolean ignoreLeadingWhitespaces;
     private boolean interpretUnicodeEscapes;
     private boolean ignoreEmptyLines;
 
-    public static char COMMENTS_DISABLED       = (char) 0;
+    // (char)-2, i.e. '\ufffe', signals "disabled": it cannot be confused with the
+    // int EOF signal (-1), and U+FFFE is a Unicode noncharacter (one UTF-16 code
+    // unit, not a surrogate pair), so well-formed text should never contain it
+    // and there should be no collision with a real text char.
+    public static final char COMMENTS_DISABLED = (char)-2; // final: a shared sentinel must not be reassignable
+    public static final char ESCAPE_DISABLED   = (char)-2; // same sentinel value, used for the escape char
 
-    public static CSVStrategy DEFAULT_STRATEGY = new CSVStrategy(',', '"', COMMENTS_DISABLED, true,  false, true);
-    public static CSVStrategy EXCEL_STRATEGY   = new CSVStrategy(',', '"', COMMENTS_DISABLED, false, false, false);
-    public static CSVStrategy TDF_STRATEGY     = new CSVStrategy('	', '"', COMMENTS_DISABLED, true,  false, true);
+    public static CSVStrategy DEFAULT_STRATEGY = new CSVStrategy(',',  '"', COMMENTS_DISABLED, ESCAPE_DISABLED, true,  false, true);  // comma-delimited; ignores leading whitespace and empty lines
+    public static CSVStrategy EXCEL_STRATEGY   = new CSVStrategy(',',  '"', COMMENTS_DISABLED, ESCAPE_DISABLED, false, false, false); // comma-delimited; keeps leading whitespace and empty lines
+    public static CSVStrategy TDF_STRATEGY     = new CSVStrategy('\t', '"', COMMENTS_DISABLED, ESCAPE_DISABLED, true,  false, true);  // tab-delimited ('\t' instead of a raw tab in the literal)
 
 
     public CSVStrategy(char delimiter, char encapsulator, char commentStart) {
@@ -58,19 +64,34 @@ public CSVStrategy(char delimiter, char encapsulator, char commentStart) {
     public CSVStrategy(
         char delimiter, 
         char encapsulator, 
-        char commentStart, 
+        char commentStart,
+        char escape,
         boolean ignoreLeadingWhitespace, 
         boolean interpretUnicodeEscapes,
         boolean ignoreEmptyLines) 
     {
         setDelimiter(delimiter);
         setEncapsulator(encapsulator);
         setCommentStart(commentStart);
+        setEscape(escape);
         setIgnoreLeadingWhitespaces(ignoreLeadingWhitespace);
         setUnicodeEscapeInterpretation(interpretUnicodeEscapes);
         setIgnoreEmptyLines(ignoreEmptyLines);
     }
 
+    /** @deprecated use the constructor that also takes an escape char; this one passes {@link #ESCAPE_DISABLED}. */
+    public CSVStrategy(
+        char delimiter,
+        char encapsulator,
+        char commentStart,
+        boolean ignoreLeadingWhitespace,
+        boolean interpretUnicodeEscapes,
+        boolean ignoreEmptyLines) {
+        this(delimiter, encapsulator, commentStart, ESCAPE_DISABLED,
+             ignoreLeadingWhitespace, interpretUnicodeEscapes, ignoreEmptyLines);
+    }
+
+
     public void setDelimiter(char delimiter) { this.delimiter = delimiter; }
     public char getDelimiter() { return this.delimiter; }
 
@@ -81,6 +102,9 @@ public CSVStrategy(
     public char getCommentStart() { return this.commentStart; }
     public boolean isCommentingDisabled() { return this.commentStart == COMMENTS_DISABLED; }
 
+    public void setEscape(char escape) { this.escape = escape; } // stores the escape char; the predefined strategies pass ESCAPE_DISABLED
+    public char getEscape() { return this.escape; } // returns the escape char last set; the parser's lexers compare input chars against it
+
     public void setIgnoreLeadingWhitespaces(boolean ignoreLeadingWhitespaces) { this.ignoreLeadingWhitespaces = ignoreLeadingWhitespaces; }
     public boolean getIgnoreLeadingWhitespaces() { return this.ignoreLeadingWhitespaces; }
 
diff --git a/src/test/org/apache/commons/csv/CSVParserTest.java b/src/test/org/apache/commons/csv/CSVParserTest.java
@@ -182,9 +182,7 @@ public void testNextToken4() throws IOException {
   // encapsulator tokenizer (multi line, delimiter in string)
   public void testNextToken5() throws IOException {   
     String code = 
-      "a,\"foo\n\",b\n\"foo\n  baar ,,,\"\n\"\n\t \n\",\"\\\"\""
-      + ",\"\\,\"" 
-      + ",\"\"\"\"";
+      "a,\"foo\n\",b\n\"foo\n  baar ,,,\"\n\"\n\t \n\"";
     TestCSVParser parser = new TestCSVParser(new StringReader(code));
     parser.setStrategy(CSVStrategy.DEFAULT_STRATEGY);
     System.out.println("---------\n" + code + "\n-------------");
@@ -193,11 +191,8 @@ public void testNextToken5() throws IOException {
     assertEquals(CSVParser.TT_EORECORD + ";b;", parser.testNextToken());
     assertEquals(CSVParser.TT_EORECORD + ";foo\n  baar ,,,;",
         parser.testNextToken());
-    assertEquals(CSVParser.TT_TOKEN + ";\n\t \n;", parser.testNextToken());
-    assertEquals(CSVParser.TT_TOKEN + ";\";", parser.testNextToken());
-    // escape char in quoted input only escapes delimiter
-    assertEquals(CSVParser.TT_TOKEN + ";\\,;", parser.testNextToken());
-    assertEquals(CSVParser.TT_EOF + ";\";", parser.testNextToken());
+    assertEquals(CSVParser.TT_EOF + ";\n\t \n;", parser.testNextToken());
+
   }
   
   // change delimiters, comment, encapsulater
@@ -207,7 +202,7 @@ public void testNextToken6() throws IOException {
      *       !comment;;;;
      *       ;;
      */
-    String code = "a;'b and \\' more\n'\n!comment;;;;\n;;";
+    String code = "a;'b and '' more\n'\n!comment;;;;\n;;";
     TestCSVParser parser = new TestCSVParser(new StringReader(code));
     parser.setStrategy( new CSVStrategy(';', '\'', '!') );
     System.out.println("---------\n" + code + "\n-------------");
@@ -226,8 +221,9 @@ public void testNextToken6() throws IOException {
     "a,b,c,d\n"
     + " a , b , 1 2 \n"
     + "\"foo baar\", b,\n"
-    + "   \"foo\n,,\n\"\",,\n\\\"\",d,e\n";
-  String[][] res = { 
+   // + "   \"foo\n,,\n\"\",,\n\\\"\",d,e\n";
+      + "   \"foo\n,,\n\"\",,\n\"\"\",d,e\n";   // changed to use standard CSV escaping
+  String[][] res = {
     {"a", "b", "c", "d"},
     {"a", "b", "1 2"}, 
     {"foo baar", "b", ""}, 
@@ -439,7 +435,7 @@ public void testEmptyLineBehaviourCSV() throws Exception {
     }
   }
   
-  public void testBackslashEscaping() throws IOException {
+  public void OLDtestBackslashEscaping() throws IOException {
     String code =
       "one,two,three\n"
       + "on\\\"e,two\n"
@@ -474,6 +470,49 @@ public void testBackslashEscaping() throws IOException {
     }
   }
   
+  public void testBackslashEscaping() throws IOException {
+
+    // To avoid confusion over the need for escaping chars in java code,
+    // we test with a forward slash as the escape char, and a single
+    // quote as the encapsulator.
+
+    String code =
+      "one,two,three\n" // 0
+      + "'',''\n"       // 1) empty encapsulators
+      + "/',/'\n"       // 2) single encapsulators
+      + "'/'','/''\n"   // 3) single encapsulators encapsulated via escape
+      + "'''',''''\n"   // 4) single encapsulators encapsulated via doubling
+      + "/,,/,\n"       // 5) separator escaped
+      + "//,//\n"       // 6) escape escaped
+      + "'//','//'\n"   // 7) escape escaped in encapsulation
+      + "";
+    String[][] res = {
+        { "one", "two", "three" }, // 0
+        { "", "" },                // 1
+        { "'", "'" },              // 2
+        { "'", "'" },              // 3
+        { "'", "'" },              // 4
+        { ",", "," },              // 5
+        { "/", "/" },              // 6
+        { "/", "/" },              // 7
+      };
+
+    // delimiter ',', encapsulator '\'', comments disabled, escape '/', all three boolean options on
+    CSVStrategy strategy = new CSVStrategy(',','\'',CSVStrategy.COMMENTS_DISABLED,'/',true,true,true);
+
+    CSVParser parser = new CSVParser(new StringReader(code), strategy);
+    System.out.println("---------\n" + code + "\n-------------");
+    String[][] tmp = parser.getAllValues();
+    // Check the record count first: a missing or extra record must fail as an
+    // assertion, not as an ArrayIndexOutOfBoundsException in the loop below.
+    assertEquals(res.length, tmp.length);
+    for (int i = 0; i < res.length; i++) {
+      System.out.println(Arrays.asList(tmp[i]) + "  should be " + Arrays.asList(res[i]));
+      assertTrue(Arrays.equals(res[i], tmp[i]));
+    }
+  }
+
+
     public void testUnicodeEscape() throws IOException {
       String code = "abc,\\u0070\\u0075\\u0062\\u006C\\u0069\\u0063";
       CSVParser parser = new CSVParser(new StringReader(code));
diff --git a/src/test/org/apache/commons/csv/CSVStrategyTest.java b/src/test/org/apache/commons/csv/CSVStrategyTest.java
@@ -91,15 +91,15 @@ public void testSetCSVStrategy() {
     // default settings
     assertEquals(strategy.getDelimiter(), ',');
     assertEquals(strategy.getEncapsulator(), '"');
-    assertEquals(strategy.getCommentStart(), '\0');
+    assertEquals(strategy.getCommentStart(), CSVStrategy.COMMENTS_DISABLED);
     assertEquals(true,  strategy.getIgnoreLeadingWhitespaces());
     assertEquals(false, strategy.getUnicodeEscapeInterpretation());
     assertEquals(true,  strategy.getIgnoreEmptyLines());
     // explicit csv settings
     parser.setStrategy(CSVStrategy.DEFAULT_STRATEGY);
     assertEquals(strategy.getDelimiter(), ',');
     assertEquals(strategy.getEncapsulator(), '"');
-    assertEquals(strategy.getCommentStart(), '\0');
+    assertEquals(strategy.getCommentStart(), CSVStrategy.COMMENTS_DISABLED);
     assertEquals(true,  strategy.getIgnoreLeadingWhitespaces());
     assertEquals(false, strategy.getUnicodeEscapeInterpretation());
     assertEquals(true,  strategy.getIgnoreEmptyLines());
@@ -109,7 +109,7 @@ public void testSetExcelStrategy() {
     CSVStrategy strategy = CSVStrategy.EXCEL_STRATEGY;
     assertEquals(strategy.getDelimiter(), ',');
     assertEquals(strategy.getEncapsulator(), '"');
-    assertEquals(strategy.getCommentStart(), '\0');
+    assertEquals(strategy.getCommentStart(), CSVStrategy.COMMENTS_DISABLED);
     assertEquals(false,  strategy.getIgnoreLeadingWhitespaces());
     assertEquals(false, strategy.getUnicodeEscapeInterpretation());
     assertEquals(false, strategy.getIgnoreEmptyLines());