Merge pull request #6 from DarrenJAN/apply-fix-on-release

DarrenJAN · web-flow · commit 526ecc2d1fbc · 2024-11-05T11:13:07.000-05:00
Add support in Commons CSV for tracking byte positions during parsing
diff --git a/pom.xml b/pom.xml
@@ -28,6 +28,7 @@
   <url>https://commons.apache.org/proper/commons-csv/</url>
   <inceptionYear>2005</inceptionYear>
   <description>The Apache Commons CSV library provides a simple interface for reading and writing CSV files of various types.</description>
+  <packaging>jar</packaging>
 
   <dependencies>
     <dependency>
@@ -231,6 +232,8 @@
               <exclude>src/test/resources/org/apache/commons/csv/CSV-141/csv-141.csv</exclude>
               <exclude>src/test/resources/org/apache/commons/csv/csv-167/sample1.csv</exclude>
               <exclude>src/test/resources/org/apache/commons/csv/CSV-198/optd_por_public.csv</exclude>
+              <exclude>src/test/resources/org/apache/commons/csv/CSV-196/emoji.csv</exclude>
+              <exclude>src/test/resources/org/apache/commons/csv/CSV-196/japanese.csv</exclude>
               <exclude>src/test/resources/org/apache/commons/csv/CSV-213/999751170.patch.csv</exclude>
               <exclude>src/test/resources/org/apache/commons/csv/CSVFileParser/bom.csv</exclude>
               <exclude>src/test/resources/org/apache/commons/csv/CSVFileParser/test.csv</exclude>
diff --git a/src/main/java/org/apache/commons/csv/CSVFormat.java b/src/main/java/org/apache/commons/csv/CSVFormat.java
@@ -2074,6 +2074,30 @@ public CSVParser parse(final Reader reader) throws IOException {
         return new CSVParser(reader, this);
     }
 
+    /**
+     * Parses the specified content and lets the returned parser report a byte position for each record.
+     *
+     * <p>
+     * The {@code characterOffset} is added to the positions reported for each record and {@code recordNumber} seeds the
+     * record numbering; both are meant for input that does not start at the beginning of the source. The {@code encoding}
+     * is used to measure how many bytes each character occupies. It returns a {@link CSVParser} that can be used to iterate
+     * over the parsed {@link CSVRecord}s.
+     * </p>
+     *
+     * <p>
+     * For additional parsing options, see the various static parse methods available on {@link CSVParser}.
+     * </p>
+     *
+     * @param reader the input to parse
+     * @param characterOffset the lexer offset to apply when parsing does not start at the beginning of the source
+     * @param recordNumber the next record number to assign
+     * @param encoding the name of the charset used to count the bytes of each character, or {@code null} to skip byte counting
+     * @return a parser over a stream of {@link CSVRecord}s.
+     * @throws IOException If an I/O error occurs
+     * @throws CSVException Thrown on invalid input.
+     */
+    public CSVParser parse(final Reader reader, final long characterOffset, final long recordNumber, final String encoding)
+            throws IOException {
+        return new CSVParser(reader, this, characterOffset, recordNumber, encoding);
+    }
+
     /**
      * Prints to the specified output.
      *
diff --git a/src/main/java/org/apache/commons/csv/CSVParser.java b/src/main/java/org/apache/commons/csv/CSVParser.java
@@ -438,10 +438,39 @@ public CSVParser(final Reader reader, final CSVFormat format) throws IOException
     @SuppressWarnings("resource")
     public CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber)
         throws IOException {
+        // A null encoding means no encoder is created, so no bytes are counted for parsers built this way.
+        this(reader, format, characterOffset, recordNumber, null);
+    }
+
+    /**
+     * Constructs a new instance using the given {@link CSVFormat}.
+     *
+     * <p>
+     * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
+     * unless you close the {@code reader}.
+     * </p>
+     *
+     * @param reader
+     *            a Reader containing CSV-formatted input. Must not be null.
+     * @param format
+     *            the CSVFormat used for CSV parsing. Must not be null.
+     * @param characterOffset
+     *            Lexer offset when the parser does not start parsing at the beginning of the source.
+     * @param recordNumber
+     *            The next record number to assign
+     * @param encoding
+     *            The name of the charset used to count the bytes of each character read, or {@code null} to skip byte counting
+     * @throws IllegalArgumentException
+     *             If the parameters of the format are inconsistent or if either the reader or format is null.
+     * @throws IOException
+     *             If there is a problem reading the header or skipping the first record
+     * @throws CSVException Thrown on invalid input.
+     */
+    public CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber,
+        String encoding) throws IOException {
         Objects.requireNonNull(reader, "reader");
         Objects.requireNonNull(format, "format");
         this.format = format.copy();
-        this.lexer = new Lexer(format, new ExtendedBufferedReader(reader));
+        this.lexer = new Lexer(format, new ExtendedBufferedReader(reader, encoding));
         this.csvRecordIterator = new CSVRecordIterator();
         this.headers = createHeaders();
         this.characterOffset = characterOffset;
@@ -768,6 +797,7 @@ CSVRecord nextRecord() throws IOException {
         recordList.clear();
         StringBuilder sb = null;
         final long startCharPosition = lexer.getCharacterPosition() + characterOffset;
+        final long startCharByte = lexer.getBytesRead() + this.characterOffset;
         do {
             reusableToken.reset();
             lexer.nextToken(reusableToken);
@@ -805,7 +835,7 @@ CSVRecord nextRecord() throws IOException {
             recordNumber++;
             final String comment = Objects.toString(sb, null);
             result = new CSVRecord(this, recordList.toArray(Constants.EMPTY_STRING_ARRAY), comment,
-                recordNumber, startCharPosition);
+                recordNumber, startCharPosition, startCharByte);
         }
         return result;
     }
diff --git a/src/main/java/org/apache/commons/csv/CSVRecord.java b/src/main/java/org/apache/commons/csv/CSVRecord.java
@@ -48,6 +48,11 @@ public final class CSVRecord implements Serializable, Iterable<String> {
      */
     private final long characterPosition;
 
+    /**
+     * The byte position at which this record starts in the source stream.
+     */
+    private final long characterByte;
+
     /** The accumulated comments (if any) */
     private final String comment;
 
@@ -67,8 +72,18 @@ public final class CSVRecord implements Serializable, Iterable<String> {
         this.parser = parser;
         this.comment = comment;
         this.characterPosition = characterPosition;
+        this.characterByte = 0L;
     }
 
+    /**
+     * Constructs a record that also carries the byte position at which it starts.
+     *
+     * @param parser the parser this record is associated with
+     * @param values the field values; {@code null} is replaced by an empty array
+     * @param comment the comment attached to this record, may be {@code null}
+     * @param recordNumber the number assigned to this record
+     * @param characterPosition the character position at which this record starts in the source stream
+     * @param characterByte the byte position at which this record starts in the source stream
+     */
+    CSVRecord(final CSVParser parser, final String[] values, final String comment, final long recordNumber,
+            final long characterPosition, final long characterByte) {
+        this.parser = parser;
+        this.recordNumber = recordNumber;
+        this.comment = comment;
+        this.values = values == null ? Constants.EMPTY_STRING_ARRAY : values;
+        this.characterPosition = characterPosition;
+        this.characterByte = characterByte;
+    }
     /**
      * Returns a value by {@link Enum}.
      *
@@ -144,6 +159,15 @@ public long getCharacterPosition() {
         return characterPosition;
     }
 
+    /**
+     * Gets the byte position at which this record starts in the source stream.
+     *
+     * @return the byte position supplied when this record was constructed; 0 when it was constructed without one.
+     */
+    public long getCharacterByte() {
+        return this.characterByte;
+    }
+
     /**
      * Returns the comment for this record, if any.
      * Note that comments are attached to the following record.
diff --git a/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java b/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java
@@ -24,6 +24,10 @@
 
 import java.io.IOException;
 import java.io.Reader;
+import java.nio.CharBuffer;
+import java.nio.charset.CharacterCodingException;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetEncoder;
 
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.io.input.UnsynchronizedBufferedReader;
@@ -49,13 +53,27 @@ final class ExtendedBufferedReader extends UnsynchronizedBufferedReader {
     private long position;
     private long positionMark;
 
+    /** The number of bytes read so far */
+    private long bytesRead;
+    private long bytesReadMark;
+
+    /** Encoder used to calculate the bytes of characters */
+    CharsetEncoder encoder;
+
     /**
      * Constructs a new instance using the default buffer size.
      */
     ExtendedBufferedReader(final Reader reader) {
         super(reader);
     }
 
+    /**
+     * Constructs a new instance using the default buffer size that can also count the bytes of the characters it reads.
+     *
+     * @param reader the reader to wrap
+     * @param encoding the name of the charset used to count bytes, resolved with {@link Charset#forName(String)};
+     *        when {@code null}, no encoder is created and no bytes are counted
+     */
+    ExtendedBufferedReader(final Reader reader, final String encoding) {
+        super(reader);
+        if (encoding != null) {
+            encoder = Charset.forName(encoding).newEncoder();
+        }
+    }
+
     /**
      * Closes the stream.
      *
@@ -108,6 +126,7 @@ public void mark(final int readAheadLimit) throws IOException {
         lineNumberMark = lineNumber;
         lastCharMark = lastChar;
         positionMark = position;
+        bytesReadMark = bytesRead;
         super.mark(readAheadLimit);
     }
 
@@ -118,11 +137,43 @@ public int read() throws IOException {
             current == EOF && lastChar != CR && lastChar != LF && lastChar != EOF) {
             lineNumber++;
         }
+        if (encoder != null) {
+            this.bytesRead += getCharBytes(current);
+        }
         lastChar = current;
         position++;
         return lastChar;
     }
 
+    /**
+     * Gets the number of bytes that the value just returned by the underlying reader occupies in the configured charset.
+     *
+     * <p>
+     * A Java {@code char} is a 16-bit UTF-16 code unit, so one character of input arrives as one or two {@code char}s:
+     * </p>
+     * <ul>
+     * <li>U+0000 to U+FFFF (the BMP) is a single {@code char}; it is encoded on its own.</li>
+     * <li>U+10000 to U+10FFFF (supplementary characters) is a pair of {@code char}s, a high surrogate (U+D800 to U+DBFF)
+     * followed by a low surrogate (U+DC00 to U+DFFF). The high surrogate contributes 0 and the whole pair is encoded when
+     * the low surrogate arrives.</li>
+     * </ul>
+     *
+     * @param current the value just read, or {@code EOF} at the end of the stream
+     * @return the encoded length in bytes; 0 for {@code EOF} and for a high surrogate
+     * @throws CharacterCodingException if a low surrogate does not follow a high surrogate, or if the encoder rejects the input
+     */
+    private long getCharBytes(final int current) throws CharacterCodingException {
+        if (current == EOF) {
+            // End of stream: no character was read. Without this guard the sentinel would be cast to a char and
+            // encoded as if it were input, inflating the count or failing in charsets that cannot map it.
+            return 0;
+        }
+        final char cChar = (char) current;
+        if (Character.isHighSurrogate(cChar)) {
+            // Counted together with the low surrogate that must follow.
+            return 0;
+        }
+        if (!Character.isLowSurrogate(cChar)) {
+            return encoder.encode(CharBuffer.wrap(new char[] {cChar})).limit();
+        }
+        final char lChar = (char) lastChar;
+        if (!Character.isSurrogatePair(lChar, cChar)) {
+            // A low surrogate that was not preceded by a high surrogate cannot be encoded.
+            throw new CharacterCodingException();
+        }
+        return encoder.encode(CharBuffer.wrap(new char[] {lChar, cChar})).limit();
+    }
+
     @Override
     public int read(final char[] buf, final int offset, final int length) throws IOException {
         if (length == 0) {
@@ -187,7 +238,17 @@ public void reset() throws IOException {
         lineNumber = lineNumberMark;
         lastChar = lastCharMark;
         position = positionMark;
+        bytesRead = bytesReadMark;
         super.reset();
     }
 
+    /**
+     * Gets the running byte count of the characters read so far.
+     *
+     * @return the number of bytes counted so far by this reader
+     */
+    long getBytesRead() {
+        return bytesRead;
+    }
+
 }
diff --git a/src/main/java/org/apache/commons/csv/Lexer.java b/src/main/java/org/apache/commons/csv/Lexer.java
@@ -103,6 +103,15 @@ long getCharacterPosition() {
         return reader.getPosition();
     }
 
+    /**
+     * Gets the byte count reported by the underlying reader.
+     *
+     * @return the number of bytes the reader has counted so far
+     */
+    long getBytesRead() {
+        final long bytes = reader.getBytesRead();
+        return bytes;
+    }
+
     /**
      * Returns the current line number
      *
diff --git a/src/test/java/org/apache/commons/csv/CSVParserTest.java b/src/test/java/org/apache/commons/csv/CSVParserTest.java
@@ -693,6 +693,84 @@ public void testGetHeaderComment_NoComment3() throws IOException {
         }
     }
 
+    @Test
+    public void testGetRecordThreeBytesRead() throws Exception {
+        // Every non-ASCII character below takes three bytes in UTF-8, so byte and character positions
+        // diverge from the third record on.
+        final String code = "id,date,val5,val4\n" +
+            "11111111111111,'4017-09-01',きちんと節分近くには咲いてる～,v4\n" +
+            "22222222222222,'4017-01-01',おはよう私の友人～,v4\n" +
+            "33333333333333,'4017-01-01',きる自然の力ってすごいな～,v4\n";
+        final CSVFormat format = CSVFormat.Builder.create()
+                               .setDelimiter(',')
+                               .setQuote('\'')
+                               .build();
+        // try-with-resources closes the parser even when an assertion fails.
+        try (final CSVParser parser = format.parse(new StringReader(code), 0L, 1L, "UTF-8")) {
+            CSVRecord record;
+            assertEquals(0, parser.getRecordNumber());
+
+            assertNotNull(record = parser.nextRecord());
+            assertEquals(1, record.getRecordNumber());
+            assertEquals(code.indexOf('i'), record.getCharacterPosition());
+            assertEquals(record.getCharacterPosition(), record.getCharacterByte());
+
+            assertNotNull(record = parser.nextRecord());
+            assertEquals(2, record.getRecordNumber());
+            assertEquals(code.indexOf('1'), record.getCharacterPosition());
+            assertEquals(record.getCharacterPosition(), record.getCharacterByte());
+
+            assertNotNull(record = parser.nextRecord());
+            assertEquals(3, record.getRecordNumber());
+            assertEquals(code.indexOf('2'), record.getCharacterPosition());
+            assertEquals(95, record.getCharacterByte());
+
+            assertNotNull(record = parser.nextRecord());
+            assertEquals(4, record.getRecordNumber());
+            assertEquals(code.indexOf('3'), record.getCharacterPosition());
+            assertEquals(154, record.getCharacterByte());
+        }
+    }
+
+    @Test
+    public void testGetRecordFourBytesRead() throws Exception {
+        // Every emoji below takes four bytes in UTF-8 and two chars in a Java String, so byte and
+        // character positions diverge from the third record on.
+        final String code = "id,a,b,c\n" +
+            "1,😊,🤔,😂\n" +
+            "2,😊,🤔,😂\n" +
+            "3,😊,🤔,😂\n";
+        final CSVFormat format = CSVFormat.Builder.create()
+            .setDelimiter(',')
+            .setQuote('\'')
+            .build();
+        // try-with-resources closes the parser even when an assertion fails.
+        try (final CSVParser parser = format.parse(new StringReader(code), 0L, 1L, "UTF-8")) {
+            CSVRecord record;
+            assertEquals(0, parser.getRecordNumber());
+
+            assertNotNull(record = parser.nextRecord());
+            assertEquals(1, record.getRecordNumber());
+            assertEquals(code.indexOf('i'), record.getCharacterPosition());
+            assertEquals(record.getCharacterPosition(), record.getCharacterByte());
+
+            assertNotNull(record = parser.nextRecord());
+            assertEquals(2, record.getRecordNumber());
+            assertEquals(code.indexOf('1'), record.getCharacterPosition());
+            assertEquals(record.getCharacterPosition(), record.getCharacterByte());
+
+            assertNotNull(record = parser.nextRecord());
+            assertEquals(3, record.getRecordNumber());
+            assertEquals(code.indexOf('2'), record.getCharacterPosition());
+            assertEquals(26, record.getCharacterByte());
+
+            assertNotNull(record = parser.nextRecord());
+            assertEquals(4, record.getRecordNumber());
+            assertEquals(code.indexOf('3'), record.getCharacterPosition());
+            assertEquals(43, record.getCharacterByte());
+        }
+    }
+
     @Test
     public void testGetHeaderMap() throws Exception {
         try (final CSVParser parser = CSVParser.parse("a,b,c\n1,2,3\nx,y,z", CSVFormat.DEFAULT.withHeader("A", "B", "C"))) {
diff --git a/src/test/java/org/apache/commons/csv/JiraCsv196Test.java b/src/test/java/org/apache/commons/csv/JiraCsv196Test.java
diff --git a/src/test/resources/org/apache/commons/csv/CSV-196/emoji.csv b/src/test/resources/org/apache/commons/csv/CSV-196/emoji.csv
diff --git a/src/test/resources/org/apache/commons/csv/CSV-196/japanese.csv b/src/test/resources/org/apache/commons/csv/CSV-196/japanese.csv