Extracted the lexer from CSVParser into a distinct class (suggested by Bob Smith)

ebourg · ebourg · commit ca7bbae40ef8 · 2012-03-07T18:21:52.000Z
git-svn-id: https://svn.apache.org/repos/asf/commons/sandbox/csv/trunk@1298033 13f79535-47bb-0310-9956-ffa450edef68
diff --git a/src/main/java/org/apache/commons/csv/CSVParser.java b/src/main/java/org/apache/commons/csv/CSVParser.java
@@ -25,7 +25,9 @@
 import java.util.List;
 import java.util.NoSuchElementException;
 
-import static org.apache.commons.csv.CSVParser.Token.Type.*;
+import org.apache.commons.csv.CSVLexer.Token;
+
+import static org.apache.commons.csv.CSVLexer.Token.Type.*;
 
 /**
  * Parses CSV files according to the specified configuration.
@@ -59,65 +61,16 @@
  */
 public class CSVParser implements Iterable<String[]> {
 
-    /** length of the initial token (content-)buffer */
-    private static final int INITIAL_TOKEN_LENGTH = 50;
-
     /** Immutable empty String array. */
     private static final String[] EMPTY_STRING_ARRAY = new String[0];
 
-    /** The input stream */
-    private final ExtendedBufferedReader in;
-
-    private final CSVFormat format;
-
+    private CSVLexer lexer;
+    
     // the following objects are shared to reduce garbage
     
     /** A record buffer for getLine(). Grows as necessary and is reused. */
     private final List<String> record = new ArrayList<String>();
     private final Token reusableToken = new Token();
-    private final CharBuffer wsBuf = new CharBuffer();
-
-    /**
-     * Token is an internal token representation.
-     * <p/>
-     * It is used as contract between the lexer and the parser.
-     */
-    static class Token {
-
-        enum Type {
-            /** Token has no valid content, i.e. is in its initialized state. */
-            INVALID,
-            
-            /** Token with content, at beginning or in the middle of a line. */
-            TOKEN,
-            
-            /** Token (which can have content) when end of file is reached. */
-            EOF,
-            
-            /** Token with content when end of a line is reached. */
-            EORECORD
-        }
-        
-        /** Token type */
-        Type type = INVALID;
-        
-        /** The content buffer. */
-        CharBuffer content = new CharBuffer(INITIAL_TOKEN_LENGTH);
-        
-        /** Token ready flag: indicates a valid token with content (ready for the parser). */
-        boolean isReady;
-
-        Token reset() {
-            content.clear();
-            type = INVALID;
-            isReady = false;
-            return this;
-        }
-    }
-
-    // ======================================================
-    //  the constructor
-    // ======================================================
 
     /**
      * CSV parser using the default {@link CSVFormat}.
@@ -139,8 +92,7 @@ public CSVParser(Reader input, CSVFormat format) {
             input = new UnicodeUnescapeReader(input);
         }
         
-        this.in = new ExtendedBufferedReader(input);
-        this.format = format;
+        this.lexer = new CSVLexer(format, new ExtendedBufferedReader(input));
     }
 
     /**
@@ -153,9 +105,6 @@ public CSVParser(String input, CSVFormat format) {
         this(new StringReader(input), format);
     }
 
-    // ======================================================
-    //  the parser
-    // ======================================================
 
     /**
      * Parses the CSV according to the given format and returns the content
@@ -191,7 +140,7 @@ String[] getLine() throws IOException {
         record.clear();
         while (true) {
             reusableToken.reset();
-            nextToken(reusableToken);
+            lexer.nextToken(reusableToken);
             switch (reusableToken.type) {
                 case TOKEN:
                     record.add(reusableToken.content.toString());
@@ -274,12 +223,69 @@ public void remove() { }
      * @return current line number
      */
     public int getLineNumber() {
-        return in.getLineNumber();
+        return lexer.getLineNumber();
     }
+}
 
-    // ======================================================
-    //  the lexer(s)
-    // ======================================================
+
+class CSVLexer {
+
+    /** length of the initial token (content-)buffer */
+    private static final int INITIAL_TOKEN_LENGTH = 50;
+    
+    private final CharBuffer wsBuf = new CharBuffer();
+    
+    private CSVFormat format;
+    
+    /** The input stream */
+    private ExtendedBufferedReader in;
+
+    /**
+     * Token is an internal token representation.
+     * <p/>
+     * It is used as contract between the lexer and the parser.
+     */
+    static class Token {
+
+        enum Type {
+            /** Token has no valid content, i.e. is in its initialized state. */
+            INVALID,
+            
+            /** Token with content, at beginning or in the middle of a line. */
+            TOKEN,
+            
+            /** Token (which can have content) when end of file is reached. */
+            EOF,
+            
+            /** Token with content when end of a line is reached. */
+            EORECORD
+        }
+        
+        /** Token type */
+        Type type = INVALID;
+        
+        /** The content buffer. */
+        CharBuffer content = new CharBuffer(INITIAL_TOKEN_LENGTH);
+        
+        /** Token ready flag: indicates a valid token with content (ready for the parser). */
+        boolean isReady;
+
+        Token reset() {
+            content.clear();
+            type = INVALID;
+            isReady = false;
+            return this;
+        }
+    }
+
+    CSVLexer(CSVFormat format, ExtendedBufferedReader in) {
+        this.format = format;
+        this.in = in;
+    }
+
+    public int getLineNumber() {
+        return in.getLineNumber();
+    }
 
     /**
      * Returns the next token.
@@ -503,19 +509,6 @@ private int readEscape(int c) throws IOException {
         }
     }
 
-    /**
-     * Obtain the specified CSV format.
-     *
-     * @return format currently being used
-     */
-    public CSVFormat getFormat() {
-        return this.format;
-    }
-
-    // ======================================================
-    //  Character class checker
-    // ======================================================
-
     /**
      * @return true if the given char is a whitespace character
      */
diff --git a/src/test/java/org/apache/commons/csv/CSVLexerTest.java b/src/test/java/org/apache/commons/csv/CSVLexerTest.java
@@ -0,0 +1,167 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * 
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.commons.csv;
+
+import java.io.IOException;
+import java.io.StringReader;
+
+import junit.framework.TestCase;
+import org.apache.commons.csv.CSVLexer.Token;
+
+import static org.apache.commons.csv.CSVLexer.Token.Type.*;
+
+public class CSVLexerTest extends TestCase {
+    
+    private CSVLexer getLexer(String input, CSVFormat format) {
+        return new CSVLexer(format, new ExtendedBufferedReader(new StringReader(input)));
+    }
+
+    private void assertTokenEquals(Token.Type expectedType, String expectedContent, Token token) {
+        assertEquals("Token type", expectedType, token.type);
+        assertEquals("Token content", expectedContent, token.content.toString());
+    }
+    
+    // Single line (without comment)
+    public void testNextToken1() throws IOException {
+        String code = "abc,def, hijk,  lmnop,   qrst,uv ,wxy   ,z , ,";
+        CSVLexer parser = getLexer(code, CSVFormat.DEFAULT);
+        assertTokenEquals(TOKEN, "abc", parser.nextToken(new Token()));
+        assertTokenEquals(TOKEN, "def", parser.nextToken(new Token()));
+        assertTokenEquals(TOKEN, "hijk", parser.nextToken(new Token()));
+        assertTokenEquals(TOKEN, "lmnop", parser.nextToken(new Token()));
+        assertTokenEquals(TOKEN, "qrst", parser.nextToken(new Token()));
+        assertTokenEquals(TOKEN, "uv", parser.nextToken(new Token()));
+        assertTokenEquals(TOKEN, "wxy", parser.nextToken(new Token()));
+        assertTokenEquals(TOKEN, "z", parser.nextToken(new Token()));
+        assertTokenEquals(TOKEN, "", parser.nextToken(new Token()));
+        assertTokenEquals(EOF, "", parser.nextToken(new Token()));
+    }
+
+    // multiline including comments (and empty lines)
+    public void testNextToken2() throws IOException {
+        /*   file:   1,2,3,
+        *           a,b x,c
+        *           #foo
+        *
+        *           d,e,
+        *
+        */
+        String code = "1,2,3,\na,b x,c\n#foo\n\nd,e,\n\n";
+        CSVFormat format = CSVFormat.DEFAULT.withCommentStart('#');
+        
+        CSVLexer parser = getLexer(code, format);
+
+
+        assertTokenEquals(TOKEN, "1", parser.nextToken(new Token()));
+        assertTokenEquals(TOKEN, "2", parser.nextToken(new Token()));
+        assertTokenEquals(TOKEN, "3", parser.nextToken(new Token()));
+        assertTokenEquals(EORECORD, "", parser.nextToken(new Token()));
+        assertTokenEquals(TOKEN, "a", parser.nextToken(new Token()));
+        assertTokenEquals(TOKEN, "b x", parser.nextToken(new Token()));
+        assertTokenEquals(EORECORD, "c", parser.nextToken(new Token()));
+        assertTokenEquals(EORECORD, "", parser.nextToken(new Token()));
+        assertTokenEquals(TOKEN, "d", parser.nextToken(new Token()));
+        assertTokenEquals(TOKEN, "e", parser.nextToken(new Token()));
+        assertTokenEquals(EORECORD, "", parser.nextToken(new Token()));
+        assertTokenEquals(EOF, "", parser.nextToken(new Token()));
+        assertTokenEquals(EOF, "", parser.nextToken(new Token()));
+
+    }
+
+    // simple token with escaping
+    public void testNextToken3() throws IOException {
+        /* file: a,\,,b
+        *       \,,
+        */
+        String code = "a,\\,,b\n\\,,";
+        CSVFormat format = CSVFormat.DEFAULT.withCommentStart('#');
+        CSVLexer parser = getLexer(code, format);
+
+        assertTokenEquals(TOKEN, "a", parser.nextToken(new Token()));
+        // an unquoted single backslash is not an escape char
+        assertTokenEquals(TOKEN, "\\", parser.nextToken(new Token()));
+        assertTokenEquals(TOKEN, "", parser.nextToken(new Token()));
+        assertTokenEquals(EORECORD, "b", parser.nextToken(new Token()));
+        // an unquoted single backslash is not an escape char
+        assertTokenEquals(TOKEN, "\\", parser.nextToken(new Token()));
+        assertTokenEquals(TOKEN, "", parser.nextToken(new Token()));
+        assertTokenEquals(EOF, "", parser.nextToken(new Token()));
+    }
+
+    // encapsulator tokenizer (single line)
+    public void testNextToken4() throws IOException {
+        /* file:  a,"foo",b
+        *        a,   " foo",b
+        *        a,"foo "   ,b     // whitespace after closing encapsulator
+        *        a,  " foo " ,b
+        */
+        String code = "a,\"foo\",b\na,   \" foo\",b\na,\"foo \"  ,b\na,  \" foo \"  ,b";
+        CSVLexer parser = getLexer(code, CSVFormat.DEFAULT);
+        assertTokenEquals(TOKEN, "a", parser.nextToken(new Token()));
+        assertTokenEquals(TOKEN, "foo", parser.nextToken(new Token()));
+        assertTokenEquals(EORECORD, "b", parser.nextToken(new Token()));
+        assertTokenEquals(TOKEN, "a", parser.nextToken(new Token()));
+        assertTokenEquals(TOKEN, " foo", parser.nextToken(new Token()));
+        assertTokenEquals(EORECORD, "b", parser.nextToken(new Token()));
+        assertTokenEquals(TOKEN, "a", parser.nextToken(new Token()));
+        assertTokenEquals(TOKEN, "foo ", parser.nextToken(new Token()));
+        assertTokenEquals(EORECORD, "b", parser.nextToken(new Token()));
+        assertTokenEquals(TOKEN, "a", parser.nextToken(new Token()));
+        assertTokenEquals(TOKEN, " foo ", parser.nextToken(new Token()));
+//      assertTokenEquals(EORECORD, "b", parser.nextToken(new Token()));
+        assertTokenEquals(EOF, "b", parser.nextToken(new Token()));
+    }
+
+    // encapsulator tokenizer (multi line, delimiter in string)
+    public void testNextToken5() throws IOException {
+        String code = "a,\"foo\n\",b\n\"foo\n  baar ,,,\"\n\"\n\t \n\"";
+        CSVLexer parser = getLexer(code, CSVFormat.DEFAULT);
+        assertTokenEquals(TOKEN, "a", parser.nextToken(new Token()));
+        assertTokenEquals(TOKEN, "foo\n", parser.nextToken(new Token()));
+        assertTokenEquals(EORECORD, "b", parser.nextToken(new Token()));
+        assertTokenEquals(EORECORD, "foo\n  baar ,,,", parser.nextToken(new Token()));
+        assertTokenEquals(EOF, "\n\t \n", parser.nextToken(new Token()));
+
+    }
+
+    // change delimiters, comment, encapsulator
+    public void testNextToken6() throws IOException {
+        /* file: a;'b and '' more
+        *       '
+        *       !comment;;;;
+        *       ;;
+        */
+        String code = "a;'b and '' more\n'\n!comment;;;;\n;;";
+        CSVFormat format = new CSVFormat(';', '\'', '!');
+        CSVLexer parser = getLexer(code, format);
+        assertTokenEquals(TOKEN, "a", parser.nextToken(new Token()));
+        assertTokenEquals(EORECORD, "b and ' more\n", parser.nextToken(new Token()));
+    }
+
+    // From SANDBOX-153
+    public void testDelimiterIsWhitespace() throws IOException {
+        String code = "one\ttwo\t\tfour \t five\t six";
+        CSVLexer parser = getLexer(code, CSVFormat.TDF);
+        assertTokenEquals(TOKEN, "one", parser.nextToken(new Token()));
+        assertTokenEquals(TOKEN, "two", parser.nextToken(new Token()));
+        assertTokenEquals(TOKEN, "", parser.nextToken(new Token()));
+        assertTokenEquals(TOKEN, "four", parser.nextToken(new Token()));
+        assertTokenEquals(TOKEN, "five", parser.nextToken(new Token()));
+        assertTokenEquals(EOF, "six", parser.nextToken(new Token()));
+    }
+}
diff --git a/src/test/java/org/apache/commons/csv/CSVParserTest.java b/src/test/java/org/apache/commons/csv/CSVParserTest.java