Replaced the unicode escaping code from the parser with a class implementing java.io.Reader

ebourg · ebourg · commit 00d0def6953d · 2012-03-07T15:58:12.000Z
git-svn-id: https://svn.apache.org/repos/asf/commons/sandbox/csv/trunk@1298001 13f79535-47bb-0310-9956-ffa450edef68
diff --git a/src/main/java/org/apache/commons/csv/CSVParser.java b/src/main/java/org/apache/commons/csv/CSVParser.java
@@ -76,8 +76,6 @@ public class CSVParser implements Iterable<String[]> {
     private final List<String> record = new ArrayList<String>();
     private final Token reusableToken = new Token();
     private final CharBuffer wsBuf = new CharBuffer();
-    private final CharBuffer code = new CharBuffer(4);
-
 
     /**
      * Token is an internal token representation.
@@ -137,6 +135,10 @@ public CSVParser(Reader input) {
      * @param format the CSVFormat used for CSV parsing
      */
     public CSVParser(Reader input, CSVFormat format) {
+        // decode the unicode escape sequences upstream of the lexer when the format enables them
+        if (format.isUnicodeEscapesInterpreted()) {
+            input = new UnicodeUnescapeReader(input);
+        }
         this.in = new ExtendedBufferedReader(input);
         this.format = format;
     }
@@ -404,9 +406,6 @@ private Token simpleTokenLexer(Token tkn, int c) throws IOException {
                 tkn.type = TOKEN;
                 tkn.isReady = true;
                 break;
-            } else if (c == '\\' && format.isUnicodeEscapesInterpreted() && in.lookAhead() == 'u') {
-                // interpret unicode escaped chars (like \u0070 -> p)
-                tkn.content.append((char) unicodeEscapeLexer(c));
             } else if (c == format.getEscape()) {
                 tkn.content.append((char) readEscape(c));
             } else {
@@ -444,10 +443,8 @@ private Token encapsulatedTokenLexer(Token tkn, int c) throws IOException {
         // assert c == delimiter;
         for (; ;) {
             c = in.read();
-
-            if (c == '\\' && format.isUnicodeEscapesInterpreted() && in.lookAhead() == 'u') {
-                tkn.content.append((char) unicodeEscapeLexer(c));
-            } else if (c == format.getEscape()) {
+            
+            if (c == format.getEscape()) {
                 tkn.content.append((char) readEscape(c));
             } else if (c == format.getEncapsulator()) {
                 if (in.lookAhead() == format.getEncapsulator()) {
@@ -487,62 +484,38 @@ private Token encapsulatedTokenLexer(Token tkn, int c) throws IOException {
         }
     }
 
-
-    /**
-     * Decodes Unicode escapes.
-     * <p/>
-     * Interpretation of "\\uXXXX" escape sequences where XXXX is a hex-number.
-     *
-     * @param c current char which is discarded because it's the "\\" of "\\uXXXX"
-     * @return the decoded character
-     * @throws IOException on wrong unicode escape sequence or read error
-     */
-    private int unicodeEscapeLexer(int c) throws IOException {
-        int ret = 0;
-        // ignore 'u' (assume c==\ now) and read 4 hex digits
-        c = in.read();
-        code.clear();
-        try {
-            for (int i = 0; i < 4; i++) {
-                c = in.read();
-                if (isEndOfFile(c) || isEndOfLine(c)) {
-                    throw new NumberFormatException("number too short");
-                }
-                code.append((char) c);
-            }
-            ret = Integer.parseInt(code.toString(), 16);
-        } catch (NumberFormatException e) {
-            throw new IOException(
-                    "(line " + getLineNumber() + ") Wrong unicode escape sequence found '"
-                            + code.toString() + "'" + e.toString());
-        }
-        return ret;
-    }
-
+    /**
+     * Reads the character following the escape character and translates it.
+     * <p/>
+     * The letters r, n, t, b and f yield the matching control character; any other
+     * character is returned unchanged.
+     *
+     * @param c the escape character, discarded and replaced by the next character read
+     * @return the unescaped character
+     * @throws IOException on read error or if the input ends right after the escape character
+     */
     private int readEscape(int c) throws IOException {
         // assume c is the escape char (normally a backslash)
         c = in.read();
-        int out;
+        if (isEndOfFile(c)) {
+            // nothing is left to escape: fail instead of leaking the EOF marker into the token
+            throw new IOException(
+                    "(line " + getLineNumber() + ") EOF reached before the escape sequence finished");
+        }
         switch (c) {
             case 'r':
-                out = '\r';
-                break;
+                return '\r';
             case 'n':
-                out = '\n';
-                break;
+                return '\n';
             case 't':
-                out = '\t';
-                break;
+                return '\t';
             case 'b':
-                out = '\b';
-                break;
+                return '\b';
             case 'f':
-                out = '\f';
-                break;
+                return '\f';
             default:
-                out = c;
+                return c;
         }
-        return out;
     }
 
     /**
diff --git a/src/main/java/org/apache/commons/csv/UnicodeUnescapeReader.java b/src/main/java/org/apache/commons/csv/UnicodeUnescapeReader.java
@@ -0,0 +1,138 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * 
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.commons.csv;
+
+import java.io.IOException;
+import java.io.PushbackReader;
+import java.io.Reader;
+
+/**
+ * Reader transforming unicode escape sequences (e.g. &#92;u0065) in the provided
+ * stream into the corresponding unicode character.
+ * <p>
+ * A sequence is only decoded when the backslash is followed by <code>u</code> and
+ * exactly four hexadecimal digits; anything else is passed through unchanged.
+ *
+ * @author Emmanuel Bourg
+ * @version $Revision$, $Date$
+ */
+class UnicodeUnescapeReader extends Reader {
+
+    /** The underlying reader; a rejected candidate sequence is pushed back into it. */
+    private final PushbackReader reader;
+
+    /** The buffer used to read unicode escape sequences: 'u' followed by four hexadecimal digits. */
+    private final char[] sequence = new char[5];
+
+    /**
+     * Creates a reader decoding the unicode escape sequences found in the specified reader.
+     *
+     * @param reader the reader supplying the escaped characters
+     */
+    UnicodeUnescapeReader(Reader reader) {
+        this.reader = new PushbackReader(reader, sequence.length);
+    }
+
+    /**
+     * Reads up to <code>len</code> characters, replacing each unicode escape sequence by the
+     * character it denotes. An escape sequence counts as a single character of the result.
+     *
+     * @param cbuf the destination buffer
+     * @param off  the index in <code>cbuf</code> of the first character written
+     * @param len  the maximum number of characters to read
+     * @return the number of characters stored, or -1 if the end of the stream was reached
+     *         before any character could be read
+     * @throws IOException if the underlying reader fails
+     * @throws IndexOutOfBoundsException if <code>off</code> and <code>len</code> do not
+     *         designate a valid region of <code>cbuf</code>
+     */
+    @Override
+    public int read(char[] cbuf, int off, int len) throws IOException {
+        if (off < 0 || len < 0 || len > cbuf.length - off) {
+            throw new IndexOutOfBoundsException();
+        }
+
+        int count = 0;
+        while (count < len) {
+            int c = reader.read();
+            if (c == -1) {
+                return count == 0 ? -1 : count;
+            }
+
+            if (c == '\\') {
+                int l = fillSequence();
+                if (l == sequence.length && isUnicodeEscape()) {
+                    // unicode escape found
+                    c = Integer.parseInt(new String(sequence, 1, 4), 16);
+                } else if (l > 0) {
+                    // not an escape: put the characters back in the stream
+                    reader.unread(sequence, 0, l);
+                }
+            }
+
+            cbuf[off + count] = (char) c;
+            count++;
+        }
+
+        return count;
+    }
+
+    /**
+     * Fills the sequence buffer with the characters following a backslash. A single call to
+     * <code>read(char[])</code> may return fewer characters than requested even though more
+     * are available, so the read is repeated until the buffer is full or the stream ends.
+     *
+     * @return the number of characters stored in the sequence buffer
+     * @throws IOException if the underlying reader fails
+     */
+    private int fillSequence() throws IOException {
+        int filled = 0;
+        while (filled < sequence.length) {
+            int n = reader.read(sequence, filled, sequence.length - filled);
+            if (n <= 0) {
+                break;
+            }
+            filled += n;
+        }
+        return filled;
+    }
+
+    /** Tells if the sequence buffer holds 'u' followed by four hexadecimal digits. */
+    private boolean isUnicodeEscape() {
+        return 'u' == sequence[0]
+                && isHexadecimal(sequence[1])
+                && isHexadecimal(sequence[2])
+                && isHexadecimal(sequence[3])
+                && isHexadecimal(sequence[4]);
+    }
+
+    /** Tells if the character is one of 0-9, a-f or A-F. */
+    private static boolean isHexadecimal(char c) {
+        return ('0' <= c && c <= '9') || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F');
+    }
+
+    /**
+     * Closes the underlying reader.
+     *
+     * @throws IOException if the underlying reader cannot be closed
+     */
+    @Override
+    public void close() throws IOException {
+        reader.close();
+    }
+}