From 7dca28192c48b3b9cb5e27c07215d113811fb401 Mon Sep 17 00:00:00 2001 From: Yuzhan Jiang <36880517+DarrenJAN@users.noreply.github.com> Date: Tue, 5 Nov 2024 13:57:31 -0500 Subject: [PATCH 1/7] Add support in Commons CSV for tracking byte positions during parsing (#9) Add support in Commons CSV for tracking byte positions during parsing --- pom.xml | 3 + .../org/apache/commons/csv/CSVFormat.java | 24 ++++++ .../org/apache/commons/csv/CSVParser.java | 34 +++++++- .../org/apache/commons/csv/CSVRecord.java | 24 ++++++ .../commons/csv/ExtendedBufferedReader.java | 61 +++++++++++++++ .../java/org/apache/commons/csv/Lexer.java | 9 +++ .../org/apache/commons/csv/CSVParserTest.java | 78 +++++++++++++++++++ .../apache/commons/csv/JiraCsv196Test.java | 75 ++++++++++++++++++ .../org/apache/commons/csv/CSV-196/emoji.csv | 5 ++ .../apache/commons/csv/CSV-196/japanese.csv | 4 + 10 files changed, 315 insertions(+), 2 deletions(-) create mode 100644 src/test/java/org/apache/commons/csv/JiraCsv196Test.java create mode 100644 src/test/resources/org/apache/commons/csv/CSV-196/emoji.csv create mode 100644 src/test/resources/org/apache/commons/csv/CSV-196/japanese.csv diff --git a/pom.xml b/pom.xml index da5bc1b4ed..bfdf9e74a7 100644 --- a/pom.xml +++ b/pom.xml @@ -28,6 +28,7 @@ https://commons.apache.org/proper/commons-csv/ 2005 The Apache Commons CSV library provides a simple interface for reading and writing CSV files of various types. 
+ jar @@ -231,6 +232,8 @@ src/test/resources/org/apache/commons/csv/CSV-141/csv-141.csv src/test/resources/org/apache/commons/csv/csv-167/sample1.csv src/test/resources/org/apache/commons/csv/CSV-198/optd_por_public.csv + src/test/resources/org/apache/commons/csv/CSV-196/emoji.csv + src/test/resources/org/apache/commons/csv/CSV-196/japanese.csv src/test/resources/org/apache/commons/csv/CSV-213/999751170.patch.csv src/test/resources/org/apache/commons/csv/CSVFileParser/bom.csv src/test/resources/org/apache/commons/csv/CSVFileParser/test.csv diff --git a/src/main/java/org/apache/commons/csv/CSVFormat.java b/src/main/java/org/apache/commons/csv/CSVFormat.java index 3d4b43c6ba..9833a26ed1 100644 --- a/src/main/java/org/apache/commons/csv/CSVFormat.java +++ b/src/main/java/org/apache/commons/csv/CSVFormat.java @@ -2097,6 +2097,30 @@ public CSVParser parse(final Reader reader) throws IOException { return new CSVParser(reader, this); } + /** + * Parses the specified content. + * + *

+ * This method provides a way to parse CSV data from an input stream, starting at a specified character offset and record number, + * using a specified encoding. It returns a {@link CSVParser} that can be used to iterate over the parsed {@link CSVRecord}s. + *

+ * + *

+ * For additional parsing options, see the various static parse methods available on {@link CSVParser}. + *

+ * + * @param reader the input stream + * @param characterOffset the character offset to start parsing from + * @param recordNumber the initial record number to start counting from + * @param encoding the character encoding of the input stream + * @return a parser over a stream of {@link CSVRecord}s. + * @throws IOException If an I/O error occurs + * @throws CSVException Thrown on invalid input. + */ + public CSVParser parse(final Reader reader, final long characterOffset, final long recordNumber, String encoding) throws IOException { + return new CSVParser(reader, this, characterOffset, recordNumber, encoding); + } + /** * Prints to the specified output. * diff --git a/src/main/java/org/apache/commons/csv/CSVParser.java b/src/main/java/org/apache/commons/csv/CSVParser.java index f0341cf719..75bf78d20a 100644 --- a/src/main/java/org/apache/commons/csv/CSVParser.java +++ b/src/main/java/org/apache/commons/csv/CSVParser.java @@ -511,10 +511,39 @@ public CSVParser(final Reader reader, final CSVFormat format) throws IOException @SuppressWarnings("resource") public CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber) throws IOException { + this(reader, format, characterOffset, recordNumber, null); + } + + /** + * Constructs a new instance using the given {@link CSVFormat} + * + *

+ * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser, + * unless you close the {@code reader}. + *

+ * + * @param reader + * a Reader containing CSV-formatted input. Must not be null. + * @param format + * the CSVFormat used for CSV parsing. Must not be null. + * @param characterOffset + * Lexer offset when the parser does not start parsing at the beginning of the source. + * @param recordNumber + * The next record number to assign + * @param encoding + * The encoding to use for the reader + * @throws IllegalArgumentException + * If the parameters of the format are inconsistent or if either the reader or format is null. + * @throws IOException + * If there is a problem reading the header or skipping the first record + * @throws CSVException Thrown on invalid input. + */ + public CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber, + String encoding) throws IOException { Objects.requireNonNull(reader, "reader"); Objects.requireNonNull(format, "format"); this.format = format.copy(); - this.lexer = new Lexer(format, new ExtendedBufferedReader(reader)); + this.lexer = new Lexer(format, new ExtendedBufferedReader(reader, encoding)); this.csvRecordIterator = new CSVRecordIterator(); this.headers = createHeaders(); this.characterOffset = characterOffset; @@ -841,6 +870,7 @@ CSVRecord nextRecord() throws IOException { recordList.clear(); StringBuilder sb = null; final long startCharPosition = lexer.getCharacterPosition() + characterOffset; + final long startCharByte = lexer.getBytesRead() + this.characterOffset; do { reusableToken.reset(); lexer.nextToken(reusableToken); @@ -878,7 +908,7 @@ CSVRecord nextRecord() throws IOException { recordNumber++; final String comment = Objects.toString(sb, null); result = new CSVRecord(this, recordList.toArray(Constants.EMPTY_STRING_ARRAY), comment, - recordNumber, startCharPosition); + recordNumber, startCharPosition, startCharByte); } return result; } diff --git a/src/main/java/org/apache/commons/csv/CSVRecord.java b/src/main/java/org/apache/commons/csv/CSVRecord.java index 
1fac65843d..f0a0a6b816 100644 --- a/src/main/java/org/apache/commons/csv/CSVRecord.java +++ b/src/main/java/org/apache/commons/csv/CSVRecord.java @@ -48,6 +48,11 @@ public final class CSVRecord implements Serializable, Iterable { */ private final long characterPosition; + /** + * The start byte of this record as a character byte in the source stream. + */ + private final long characterByte; + /** The accumulated comments (if any) */ private final String comment; @@ -67,8 +72,18 @@ public final class CSVRecord implements Serializable, Iterable { this.parser = parser; this.comment = comment; this.characterPosition = characterPosition; + this.characterByte = 0L; } + CSVRecord(final CSVParser parser, final String[] values, final String comment, final long recordNumber, + final long characterPosition, final long characterByte) { + this.recordNumber = recordNumber; + this.values = values != null ? values : Constants.EMPTY_STRING_ARRAY; + this.parser = parser; + this.comment = comment; + this.characterPosition = characterPosition; + this.characterByte = characterByte; + } /** * Returns a value by {@link Enum}. * @@ -144,6 +159,15 @@ public long getCharacterPosition() { return characterPosition; } + /** + * Returns the start byte of this record as a character byte in the source stream. + * + * @return the start byte of this record as a character byte in the source stream. + */ + public long getCharacterByte() { + return characterByte; + } + /** * Returns the comment for this record, if any. * Note that comments are attached to the following record. 
diff --git a/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java b/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java index 18c922a508..2a82d48a5a 100644 --- a/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java +++ b/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java @@ -24,6 +24,10 @@ import java.io.IOException; import java.io.Reader; +import java.nio.CharBuffer; +import java.nio.charset.CharacterCodingException; +import java.nio.charset.Charset; +import java.nio.charset.CharsetEncoder; import org.apache.commons.io.IOUtils; import org.apache.commons.io.input.UnsynchronizedBufferedReader; @@ -49,6 +53,13 @@ final class ExtendedBufferedReader extends UnsynchronizedBufferedReader { private long position; private long positionMark; + /** The number of bytes read so far */ + private long bytesRead; + private long bytesReadMark; + + /** Encoder used to calculate the bytes of characters */ + CharsetEncoder encoder; + /** * Constructs a new instance using the default buffer size. */ @@ -56,6 +67,13 @@ final class ExtendedBufferedReader extends UnsynchronizedBufferedReader { super(reader); } + ExtendedBufferedReader(final Reader reader, String encoding) { + super(reader); + if (encoding != null) { + encoder = Charset.forName(encoding).newEncoder(); + } + } + /** * Closes the stream. * @@ -108,6 +126,7 @@ public void mark(final int readAheadLimit) throws IOException { lineNumberMark = lineNumber; lastCharMark = lastChar; positionMark = position; + bytesReadMark = bytesRead; super.mark(readAheadLimit); } @@ -118,11 +137,43 @@ public int read() throws IOException { current == EOF && lastChar != CR && lastChar != LF && lastChar != EOF) { lineNumber++; } + if (encoder != null) { + this.bytesRead += getCharBytes(current); + } lastChar = current; position++; return lastChar; } + /** + * In Java, a char data type are based on the original Unicode + * specification, which defined characters as fixed-width 16-bit entities. 
+ * U+0000 to U+FFFF: + * - BMP, represented using 1 16-bit char + * - Consists of UTF-8 1-byte, 2-byte, some 3-byte chars + * U+10000 to U+10FFFF: + * - Supplementary characters, represented as a pair of characters, + * the first char from the high-surrogates range (\uD800-\uDBFF), + * and the second char from the low-surrogates range (uDC00-\uDFFF). + * - Consists of UTF-8 some 3-byte chars and 4-byte chars + */ + private long getCharBytes(int current) throws CharacterCodingException { + char cChar = (char) current; + char lChar = (char) lastChar; + if (!Character.isSurrogate(cChar)) { + return encoder.encode( + CharBuffer.wrap(new char[] {cChar})).limit(); + } else { + if (Character.isHighSurrogate(cChar)) { + // Move on to the next char (low surrogate) + return 0; + } else if (Character.isSurrogatePair(lChar, cChar)) { + return encoder.encode( + CharBuffer.wrap(new char[] {lChar, cChar})).limit(); + } else throw new CharacterCodingException(); + } + } + @Override public int read(final char[] buf, final int offset, final int length) throws IOException { if (length == 0) { @@ -187,7 +238,17 @@ public void reset() throws IOException { lineNumber = lineNumberMark; lastChar = lastCharMark; position = positionMark; + bytesRead = bytesReadMark; super.reset(); } + /** + * Gets the number of bytes read by the reader. 
+ * + * @return the number of bytes read by the read + */ + long getBytesRead() { + return this.bytesRead; + } + } diff --git a/src/main/java/org/apache/commons/csv/Lexer.java b/src/main/java/org/apache/commons/csv/Lexer.java index 6d9c8a4850..afbba4d21d 100644 --- a/src/main/java/org/apache/commons/csv/Lexer.java +++ b/src/main/java/org/apache/commons/csv/Lexer.java @@ -103,6 +103,15 @@ long getCharacterPosition() { return reader.getPosition(); } + /** + * Returns the number of bytes read + * + * @return the number of bytes read + */ + long getBytesRead() { + return reader.getBytesRead(); + } + /** * Returns the current line number * diff --git a/src/test/java/org/apache/commons/csv/CSVParserTest.java b/src/test/java/org/apache/commons/csv/CSVParserTest.java index 8f5d577f66..fd1ecdb021 100644 --- a/src/test/java/org/apache/commons/csv/CSVParserTest.java +++ b/src/test/java/org/apache/commons/csv/CSVParserTest.java @@ -701,6 +701,84 @@ public void testGetHeaderComment_NoComment3() throws IOException { } } + @Test + public void testGetRecordThreeBytesRead() throws Exception { + String code = "id,date,val5,val4\n" + + "11111111111111,'4017-09-01',きちんと節分近くには咲いてる~,v4\n" + + "22222222222222,'4017-01-01',おはよう私の友人~,v4\n" + + "33333333333333,'4017-01-01',きる自然の力ってすごいな~,v4\n"; + // String code = "'1',4"; + // final CSVFormat format = CSVFormat.newFormat(',').withQuote('\''); + final CSVFormat format = CSVFormat.Builder.create() + .setDelimiter(',') + .setQuote('\'') + .build(); + // CSVParser parser = new CSVParser(new StringReader(code), format, 0L, 1L, "UTF-8"); + CSVParser parser = format.parse(new StringReader(code), 0L, 1L, "UTF-8"); + + CSVRecord record = new CSVRecord(parser, null, null, 1L, 0L, 0L); + assertEquals(0, parser.getRecordNumber()); + assertNotNull(record = parser.nextRecord()); + assertEquals(1, record.getRecordNumber()); + assertEquals(code.indexOf('i'), record.getCharacterPosition()); + assertEquals(record.getCharacterByte(), 
record.getCharacterPosition()); + + assertNotNull(record = parser.nextRecord()); + assertEquals(2, record.getRecordNumber()); + assertEquals(code.indexOf('1'), record.getCharacterPosition()); + assertEquals(record.getCharacterByte(), record.getCharacterPosition()); + + assertNotNull(record = parser.nextRecord()); + assertEquals(3, record.getRecordNumber()); + assertEquals(code.indexOf('2'), record.getCharacterPosition()); + assertEquals(record.getCharacterByte(), 95); + + assertNotNull(record = parser.nextRecord()); + assertEquals(4, record.getRecordNumber()); + assertEquals(code.indexOf('3'), record.getCharacterPosition()); + assertEquals(record.getCharacterByte(), 154); + + parser.close(); + + } + + @Test + public void testGetRecordFourBytesRead() throws Exception { + String code = "id,a,b,c\n" + + "1,😊,🤔,😂\n" + + "2,😊,🤔,😂\n" + + "3,😊,🤔,😂\n"; + // final CSVFormat format = CSVFormat.newFormat(',').withQuote('\''); + final CSVFormat format = CSVFormat.Builder.create() + .setDelimiter(',') + .setQuote('\'') + .build(); + + // CSVParser parser = new CSVParser(new StringReader(code), format, 0L, 1L, "UTF-8"); + CSVParser parser = format.parse(new StringReader(code), 0L, 1L, "UTF-8"); + + CSVRecord record; + assertEquals(0, parser.getRecordNumber()); + assertNotNull(record = parser.nextRecord()); + assertEquals(1, record.getRecordNumber()); + assertEquals(code.indexOf('i'), record.getCharacterPosition()); + assertEquals(record.getCharacterByte(), record.getCharacterPosition()); + + assertNotNull(record = parser.nextRecord()); + assertEquals(2, record.getRecordNumber()); + assertEquals(code.indexOf('1'), record.getCharacterPosition()); + assertEquals(record.getCharacterByte(), record.getCharacterPosition()); + assertNotNull(record = parser.nextRecord()); + assertEquals(3, record.getRecordNumber()); + assertEquals(code.indexOf('2'), record.getCharacterPosition()); + assertEquals(record.getCharacterByte(), 26); + assertNotNull(record = parser.nextRecord()); + 
assertEquals(4, record.getRecordNumber()); + assertEquals(code.indexOf('3'), record.getCharacterPosition()); + assertEquals(record.getCharacterByte(), 43); + parser.close(); + } + @Test public void testGetHeaderMap() throws Exception { try (final CSVParser parser = CSVParser.parse("a,b,c\n1,2,3\nx,y,z", CSVFormat.DEFAULT.withHeader("A", "B", "C"))) { diff --git a/src/test/java/org/apache/commons/csv/JiraCsv196Test.java b/src/test/java/org/apache/commons/csv/JiraCsv196Test.java new file mode 100644 index 0000000000..7dbc23cafa --- /dev/null +++ b/src/test/java/org/apache/commons/csv/JiraCsv196Test.java @@ -0,0 +1,75 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.commons.csv; +import static org.junit.jupiter.api.Assertions.assertEquals; + + +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.Reader; + + +import org.junit.jupiter.api.Test; + + +public class JiraCsv196Test { + @Test + public void parseThreeBytes() throws IOException { + + // final CSVFormat format = CSVFormat.newFormat(',').withQuote('\''); + final CSVFormat format = CSVFormat.Builder.create() + .setDelimiter(',') + .setQuote('\'') + .build(); + // CSVParser parser = new CSVParser(getTestInput( + // "org/apache/commons/csv/CSV-196/japanese.csv"), format, 0L, 1L, "UTF-8"); + CSVParser parser = format.parse(getTestInput( + "org/apache/commons/csv/CSV-196/japanese.csv"), 0L, 1L, "UTF-8"); + long[] charByteKey = {0, 89, 242, 395}; + int idx = 0; + for (CSVRecord record : parser) { + assertEquals(charByteKey[idx++], record.getCharacterByte()); + } + parser.close(); + } + + + @Test + public void parseFourBytes() throws IOException { + // final CSVFormat format = CSVFormat.newFormat(',').withQuote('\''); + final CSVFormat format = CSVFormat.Builder.create() + .setDelimiter(',') + .setQuote('\'') + .build(); + + CSVParser parser = format.parse(getTestInput( + "org/apache/commons/csv/CSV-196/emoji.csv"), 0L, 1L, "UTF-8"); + + long[] charByteKey = {0, 84, 701, 1318, 1935}; + int idx = 0; + for (CSVRecord record : parser) { + assertEquals(charByteKey[idx++], record.getCharacterByte()); + } + parser.close(); + } + + + private Reader getTestInput(String path) { + return new InputStreamReader( + ClassLoader.getSystemClassLoader().getResourceAsStream(path)); + } +} diff --git a/src/test/resources/org/apache/commons/csv/CSV-196/emoji.csv b/src/test/resources/org/apache/commons/csv/CSV-196/emoji.csv new file mode 100644 index 0000000000..0bff7a44f3 --- /dev/null +++ b/src/test/resources/org/apache/commons/csv/CSV-196/emoji.csv @@ -0,0 +1,5 @@ 
+id,val1,val2,val3,val4,val5,val6,val7,val8,val9,val10,val11,val12,val13,val14,val15 +1,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄 +2,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄 +3,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄 +4,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄 \ No newline at end of file diff --git a/src/test/resources/org/apache/commons/csv/CSV-196/japanese.csv b/src/test/resources/org/apache/commons/csv/CSV-196/japanese.csv new file mode 100644 index 0000000000..b06e04bd6a --- /dev/null +++ b/src/test/resources/org/apache/commons/csv/CSV-196/japanese.csv @@ -0,0 +1,4 @@ +id,date,val1,val2,val3,val4,val5,val6,val7,val8,val9,val10,val11,val12,val13,val14,val15 +00000000000001,2017-01-01,きちんと節分近くには咲いてる。自然の力ってすごいな~,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15 +00000000000002,2017-01-01,きちんと節分近くには咲いてる。自然の力ってすごいな~,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15 +00000000000003,2017-01-01,きちんと節分近くには咲いてる。自然の力ってすごいな~,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15 \ No newline at end of file From 3599f5bc44b5772b989212101e1b509ac86122ac Mon Sep 17 00:00:00 2001 From: Yuzhan Jiang <36880517+DarrenJAN@users.noreply.github.com> Date: Tue, 19 Nov 2024 15:21:18 -0500 Subject: [PATCH 2/7] Add support in Commons CSV for tracking byte positions during parsing (#12) Add support in Commons CSV for tracking byte positions during parsing --- pom.xml | 1 - .../org/apache/commons/csv/CSVFormat.java | 24 ----- .../org/apache/commons/csv/CSVParser.java | 30 ++++-- 
.../commons/csv/ExtendedBufferedReader.java | 46 +++++---- .../org/apache/commons/csv/CSVParserTest.java | 99 +++++++++---------- .../apache/commons/csv/JiraCsv196Test.java | 32 +++--- 6 files changed, 110 insertions(+), 122 deletions(-) diff --git a/pom.xml b/pom.xml index bfdf9e74a7..a03787382e 100644 --- a/pom.xml +++ b/pom.xml @@ -28,7 +28,6 @@ https://commons.apache.org/proper/commons-csv/ 2005 The Apache Commons CSV library provides a simple interface for reading and writing CSV files of various types. - jar diff --git a/src/main/java/org/apache/commons/csv/CSVFormat.java b/src/main/java/org/apache/commons/csv/CSVFormat.java index cabcb5135e..8205f4c47e 100644 --- a/src/main/java/org/apache/commons/csv/CSVFormat.java +++ b/src/main/java/org/apache/commons/csv/CSVFormat.java @@ -2097,30 +2097,6 @@ public CSVParser parse(final Reader reader) throws IOException { return CSVParser.builder().setReader(reader).setFormat(this).get(); } - /** - * Parses the specified content. - * - *

- * This method provides a way to parse CSV data from an input stream, starting at a specified character offset and record number, - * using a specified encoding. It returns a {@link CSVParser} that can be used to iterate over the parsed {@link CSVRecord}s. - *

- * - *

- * For additional parsing options, see the various static parse methods available on {@link CSVParser}. - *

- * - * @param reader the input stream - * @param characterOffset the character offset to start parsing from - * @param recordNumber the initial record number to start counting from - * @param encoding the character encoding of the input stream - * @return a parser over a stream of {@link CSVRecord}s. - * @throws IOException If an I/O error occurs - * @throws CSVException Thrown on invalid input. - */ - public CSVParser parse(final Reader reader, final long characterOffset, final long recordNumber, String encoding) throws IOException { - return new CSVParser(reader, this, characterOffset, recordNumber, encoding); - } - /** * Prints to the specified output. * diff --git a/src/main/java/org/apache/commons/csv/CSVParser.java b/src/main/java/org/apache/commons/csv/CSVParser.java index c48e1da096..024dd562d4 100644 --- a/src/main/java/org/apache/commons/csv/CSVParser.java +++ b/src/main/java/org/apache/commons/csv/CSVParser.java @@ -153,6 +153,7 @@ public static class Builder extends AbstractStreamBuilder { private CSVFormat format; private long characterOffset; private long recordNumber = 1; + private Charset charset; /** * Constructs a new instance. @@ -164,7 +165,7 @@ protected Builder() { @SuppressWarnings("resource") @Override public CSVParser get() throws IOException { - return new CSVParser(getReader(), format != null ? format : CSVFormat.DEFAULT, characterOffset, recordNumber); + return new CSVParser(getReader(), format != null ? format : CSVFormat.DEFAULT, characterOffset, recordNumber, charset); } /** @@ -200,6 +201,16 @@ public Builder setRecordNumber(final long recordNumber) { return asThis(); } + /** + * Sets the character encoding to be used for the reader. + * + * @param charset the character encoding. + * @return this instance. 
+ */ + public Builder setCharset(final Charset charset) { + this.charset = charset; + return asThis(); + } } final class CSVRecordIterator implements Iterator { @@ -510,7 +521,7 @@ public CSVParser(final Reader reader, final CSVFormat format, final long charact this(reader, format, characterOffset, recordNumber, null); } - /** + /** * Constructs a new instance using the given {@link CSVFormat} * *

@@ -525,21 +536,22 @@ public CSVParser(final Reader reader, final CSVFormat format, final long charact * @param characterOffset * Lexer offset when the parser does not start parsing at the beginning of the source. * @param recordNumber - * The next record number to assign - * @param encoding - * The encoding to use for the reader + * The next record number to assign. + * @param charset + * The character encoding to be used for the reader. * @throws IllegalArgumentException * If the parameters of the format are inconsistent or if either the reader or format is null. * @throws IOException - * If there is a problem reading the header or skipping the first record + * If there is a problem reading the header or skipping the first record. * @throws CSVException Thrown on invalid input. + * @since 1.13.0. */ - public CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber, - String encoding) throws IOException { + private CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber, final Charset charset) + throws IOException { Objects.requireNonNull(reader, "reader"); Objects.requireNonNull(format, "format"); this.format = format.copy(); - this.lexer = new Lexer(format, new ExtendedBufferedReader(reader, encoding)); + this.lexer = new Lexer(format, new ExtendedBufferedReader(reader, charset)); this.csvRecordIterator = new CSVRecordIterator(); this.headers = createHeaders(); this.characterOffset = characterOffset; diff --git a/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java b/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java index 2a82d48a5a..158f90a755 100644 --- a/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java +++ b/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java @@ -53,12 +53,12 @@ final class ExtendedBufferedReader extends UnsynchronizedBufferedReader { private long position; private long positionMark; - 
/** The number of bytes read so far */ + /** The number of bytes read so far. */ private long bytesRead; private long bytesReadMark; - /** Encoder used to calculate the bytes of characters */ - CharsetEncoder encoder; + /** Encoder for calculating the number of bytes for each character read. */ + private CharsetEncoder encoder; /** * Constructs a new instance using the default buffer size. @@ -67,10 +67,10 @@ final class ExtendedBufferedReader extends UnsynchronizedBufferedReader { super(reader); } - ExtendedBufferedReader(final Reader reader, String encoding) { + ExtendedBufferedReader(final Reader reader, Charset charset) { super(reader); - if (encoding != null) { - encoder = Charset.forName(encoding).newEncoder(); + if (charset != null) { + encoder = charset.newEncoder(); } } @@ -146,20 +146,30 @@ public int read() throws IOException { } /** - * In Java, a char data type are based on the original Unicode - * specification, which defined characters as fixed-width 16-bit entities. - * U+0000 to U+FFFF: - * - BMP, represented using 1 16-bit char - * - Consists of UTF-8 1-byte, 2-byte, some 3-byte chars - * U+10000 to U+10FFFF: - * - Supplementary characters, represented as a pair of characters, - * the first char from the high-surrogates range (\uD800-\uDBFF), - * and the second char from the low-surrogates range (uDC00-\uDFFF). - * - Consists of UTF-8 some 3-byte chars and 4-byte chars + * In Java, the {@code char} data type is based on the original Unicode + * specification, which defined characters as fixed-width 16-bit entities. + *

+ * The Unicode characters are divided into two main ranges: + *

    + *
  • U+0000 to U+FFFF (Basic Multilingual Plane, BMP): + *
      + *
    • Represented using a single 16-bit {@code char}.
    • + *
    • Includes UTF-8 encodings of 1-byte, 2-byte, and some 3-byte characters.
    • + *
    + *
  • + *
  • U+10000 to U+10FFFF (Supplementary Characters): + *
      + *
    • Represented as a pair of {@code char}s:
    • + *
    • The first {@code char} is from the high-surrogates range (\uD800-\uDBFF).
    • + *
    • The second {@code char} is from the low-surrogates range (\uDC00-\uDFFF).
    • + *
    • Includes UTF-8 encodings of some 3-byte characters and all 4-byte characters.
    • + *
    + *
  • + *
*/ private long getCharBytes(int current) throws CharacterCodingException { - char cChar = (char) current; - char lChar = (char) lastChar; + final char cChar = (char) current; + final char lChar = (char) lastChar; if (!Character.isSurrogate(cChar)) { return encoder.encode( CharBuffer.wrap(new char[] {cChar})).limit(); diff --git a/src/test/java/org/apache/commons/csv/CSVParserTest.java b/src/test/java/org/apache/commons/csv/CSVParserTest.java index fd1ecdb021..2b68155624 100644 --- a/src/test/java/org/apache/commons/csv/CSVParserTest.java +++ b/src/test/java/org/apache/commons/csv/CSVParserTest.java @@ -707,38 +707,34 @@ public void testGetRecordThreeBytesRead() throws Exception { "11111111111111,'4017-09-01',きちんと節分近くには咲いてる~,v4\n" + "22222222222222,'4017-01-01',おはよう私の友人~,v4\n" + "33333333333333,'4017-01-01',きる自然の力ってすごいな~,v4\n"; - // String code = "'1',4"; - // final CSVFormat format = CSVFormat.newFormat(',').withQuote('\''); final CSVFormat format = CSVFormat.Builder.create() - .setDelimiter(',') - .setQuote('\'') - .build(); - // CSVParser parser = new CSVParser(new StringReader(code), format, 0L, 1L, "UTF-8"); - CSVParser parser = format.parse(new StringReader(code), 0L, 1L, "UTF-8"); - - CSVRecord record = new CSVRecord(parser, null, null, 1L, 0L, 0L); - assertEquals(0, parser.getRecordNumber()); - assertNotNull(record = parser.nextRecord()); - assertEquals(1, record.getRecordNumber()); - assertEquals(code.indexOf('i'), record.getCharacterPosition()); - assertEquals(record.getCharacterByte(), record.getCharacterPosition()); + .setDelimiter(',') + .setQuote('\'') + .get(); + try (CSVParser parser = CSVParser.builder().setReader(new StringReader(code)).setFormat(format).setCharset(UTF_8).get() ) { + CSVRecord record = new CSVRecord(parser, null, null, 1L, 0L, 0L); - assertNotNull(record = parser.nextRecord()); - assertEquals(2, record.getRecordNumber()); - assertEquals(code.indexOf('1'), record.getCharacterPosition()); - assertEquals(record.getCharacterByte(), 
record.getCharacterPosition()); + assertEquals(0, parser.getRecordNumber()); + assertNotNull(record = parser.nextRecord()); + assertEquals(1, record.getRecordNumber()); + assertEquals(code.indexOf('i'), record.getCharacterPosition()); + assertEquals(record.getCharacterByte(), record.getCharacterPosition()); - assertNotNull(record = parser.nextRecord()); - assertEquals(3, record.getRecordNumber()); - assertEquals(code.indexOf('2'), record.getCharacterPosition()); - assertEquals(record.getCharacterByte(), 95); + assertNotNull(record = parser.nextRecord()); + assertEquals(2, record.getRecordNumber()); + assertEquals(code.indexOf('1'), record.getCharacterPosition()); + assertEquals(record.getCharacterByte(), record.getCharacterPosition()); - assertNotNull(record = parser.nextRecord()); - assertEquals(4, record.getRecordNumber()); - assertEquals(code.indexOf('3'), record.getCharacterPosition()); - assertEquals(record.getCharacterByte(), 154); + assertNotNull(record = parser.nextRecord()); + assertEquals(3, record.getRecordNumber()); + assertEquals(code.indexOf('2'), record.getCharacterPosition()); + assertEquals(record.getCharacterByte(), 95); - parser.close(); + assertNotNull(record = parser.nextRecord()); + assertEquals(4, record.getRecordNumber()); + assertEquals(code.indexOf('3'), record.getCharacterPosition()); + assertEquals(record.getCharacterByte(), 154); + }; } @@ -748,35 +744,32 @@ public void testGetRecordFourBytesRead() throws Exception { "1,😊,🤔,😂\n" + "2,😊,🤔,😂\n" + "3,😊,🤔,😂\n"; - // final CSVFormat format = CSVFormat.newFormat(',').withQuote('\''); final CSVFormat format = CSVFormat.Builder.create() .setDelimiter(',') .setQuote('\'') - .build(); - - // CSVParser parser = new CSVParser(new StringReader(code), format, 0L, 1L, "UTF-8"); - CSVParser parser = format.parse(new StringReader(code), 0L, 1L, "UTF-8"); - - CSVRecord record; - assertEquals(0, parser.getRecordNumber()); - assertNotNull(record = parser.nextRecord()); - assertEquals(1, 
record.getRecordNumber()); - assertEquals(code.indexOf('i'), record.getCharacterPosition()); - assertEquals(record.getCharacterByte(), record.getCharacterPosition()); - - assertNotNull(record = parser.nextRecord()); - assertEquals(2, record.getRecordNumber()); - assertEquals(code.indexOf('1'), record.getCharacterPosition()); - assertEquals(record.getCharacterByte(), record.getCharacterPosition()); - assertNotNull(record = parser.nextRecord()); - assertEquals(3, record.getRecordNumber()); - assertEquals(code.indexOf('2'), record.getCharacterPosition()); - assertEquals(record.getCharacterByte(), 26); - assertNotNull(record = parser.nextRecord()); - assertEquals(4, record.getRecordNumber()); - assertEquals(code.indexOf('3'), record.getCharacterPosition()); - assertEquals(record.getCharacterByte(), 43); - parser.close(); + .get(); + try (CSVParser parser = CSVParser.builder().setReader(new StringReader(code)).setFormat(format).setCharset(UTF_8).get()) { + CSVRecord record = new CSVRecord(parser, null, null, 1L, 0L, 0L); + + assertEquals(0, parser.getRecordNumber()); + assertNotNull(record = parser.nextRecord()); + assertEquals(1, record.getRecordNumber()); + assertEquals(code.indexOf('i'), record.getCharacterPosition()); + assertEquals(record.getCharacterByte(), record.getCharacterPosition()); + + assertNotNull(record = parser.nextRecord()); + assertEquals(2, record.getRecordNumber()); + assertEquals(code.indexOf('1'), record.getCharacterPosition()); + assertEquals(record.getCharacterByte(), record.getCharacterPosition()); + assertNotNull(record = parser.nextRecord()); + assertEquals(3, record.getRecordNumber()); + assertEquals(code.indexOf('2'), record.getCharacterPosition()); + assertEquals(record.getCharacterByte(), 26); + assertNotNull(record = parser.nextRecord()); + assertEquals(4, record.getRecordNumber()); + assertEquals(code.indexOf('3'), record.getCharacterPosition()); + assertEquals(record.getCharacterByte(), 43); + } } @Test diff --git 
a/src/test/java/org/apache/commons/csv/JiraCsv196Test.java b/src/test/java/org/apache/commons/csv/JiraCsv196Test.java index 7dbc23cafa..853007f9e5 100644 --- a/src/test/java/org/apache/commons/csv/JiraCsv196Test.java +++ b/src/test/java/org/apache/commons/csv/JiraCsv196Test.java @@ -21,7 +21,7 @@ import java.io.IOException; import java.io.InputStreamReader; import java.io.Reader; - +import java.nio.charset.StandardCharsets; import org.junit.jupiter.api.Test; @@ -29,16 +29,15 @@ public class JiraCsv196Test { @Test public void parseThreeBytes() throws IOException { - - // final CSVFormat format = CSVFormat.newFormat(',').withQuote('\''); final CSVFormat format = CSVFormat.Builder.create() - .setDelimiter(',') - .setQuote('\'') - .build(); - // CSVParser parser = new CSVParser(getTestInput( - // "org/apache/commons/csv/CSV-196/japanese.csv"), format, 0L, 1L, "UTF-8"); - CSVParser parser = format.parse(getTestInput( - "org/apache/commons/csv/CSV-196/japanese.csv"), 0L, 1L, "UTF-8"); + .setDelimiter(',') + .setQuote('\'') + .get(); + CSVParser parser = new CSVParser.Builder() + .setFormat(format) + .setReader(getTestInput("org/apache/commons/csv/CSV-196/japanese.csv")) + .setCharset(StandardCharsets.UTF_8) + .get(); long[] charByteKey = {0, 89, 242, 395}; int idx = 0; for (CSVRecord record : parser) { @@ -50,15 +49,15 @@ public void parseThreeBytes() throws IOException { @Test public void parseFourBytes() throws IOException { - // final CSVFormat format = CSVFormat.newFormat(',').withQuote('\''); final CSVFormat format = CSVFormat.Builder.create() .setDelimiter(',') .setQuote('\'') - .build(); - - CSVParser parser = format.parse(getTestInput( - "org/apache/commons/csv/CSV-196/emoji.csv"), 0L, 1L, "UTF-8"); - + .get(); + CSVParser parser = new CSVParser.Builder() + .setFormat(format) + .setReader(getTestInput("org/apache/commons/csv/CSV-196/emoji.csv")) + .setCharset(StandardCharsets.UTF_8) + .get(); long[] charByteKey = {0, 84, 701, 1318, 1935}; int idx = 0; for 
(CSVRecord record : parser) { @@ -67,7 +66,6 @@ public void parseFourBytes() throws IOException { parser.close(); } - private Reader getTestInput(String path) { return new InputStreamReader( ClassLoader.getSystemClassLoader().getResourceAsStream(path)); From 344f282dbead967c49fd57820fca9d9249cc4ba3 Mon Sep 17 00:00:00 2001 From: Yuzhan Jiang <36880517+DarrenJAN@users.noreply.github.com> Date: Tue, 19 Nov 2024 17:41:45 -0500 Subject: [PATCH 3/7] CSV-196: Remove duplicated Charset (#13) --- src/main/java/org/apache/commons/csv/CSVParser.java | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/src/main/java/org/apache/commons/csv/CSVParser.java b/src/main/java/org/apache/commons/csv/CSVParser.java index 024dd562d4..0879cf3bc9 100644 --- a/src/main/java/org/apache/commons/csv/CSVParser.java +++ b/src/main/java/org/apache/commons/csv/CSVParser.java @@ -153,7 +153,6 @@ public static class Builder extends AbstractStreamBuilder { private CSVFormat format; private long characterOffset; private long recordNumber = 1; - private Charset charset; /** * Constructs a new instance. @@ -165,7 +164,7 @@ protected Builder() { @SuppressWarnings("resource") @Override public CSVParser get() throws IOException { - return new CSVParser(getReader(), format != null ? format : CSVFormat.DEFAULT, characterOffset, recordNumber, charset); + return new CSVParser(getReader(), format != null ? format : CSVFormat.DEFAULT, characterOffset, recordNumber, getCharset()); } /** @@ -201,16 +200,6 @@ public Builder setRecordNumber(final long recordNumber) { return asThis(); } - /** - * Sets the character encoding to be used for the reader. - * - * @param charset the character encoding. - * @return this instance. 
- */ - public Builder setCharset(final Charset charset) { - this.charset = charset; - return asThis(); - } } final class CSVRecordIterator implements Iterator { From 27511be186b22755a8b9337f52faa47ce3051ff9 Mon Sep 17 00:00:00 2001 From: Yuzhan Jiang <36880517+DarrenJAN@users.noreply.github.com> Date: Tue, 3 Dec 2024 15:45:38 -0500 Subject: [PATCH 4/7] Adding a boolean to drive byte tracking opt-in behavior (#14) Adding a boolean to drive byte tracking opt-in behavior --- .../java/org/apache/commons/csv/CSVParser.java | 15 +++++++++++---- .../commons/csv/ExtendedBufferedReader.java | 4 ++-- .../org/apache/commons/csv/CSVParserTest.java | 4 ++-- .../org/apache/commons/csv/JiraCsv196Test.java | 2 ++ 4 files changed, 17 insertions(+), 8 deletions(-) diff --git a/src/main/java/org/apache/commons/csv/CSVParser.java b/src/main/java/org/apache/commons/csv/CSVParser.java index 0879cf3bc9..d3d8c9f3da 100644 --- a/src/main/java/org/apache/commons/csv/CSVParser.java +++ b/src/main/java/org/apache/commons/csv/CSVParser.java @@ -153,6 +153,7 @@ public static class Builder extends AbstractStreamBuilder { private CSVFormat format; private long characterOffset; private long recordNumber = 1; + private boolean enableByteTracking = false; /** * Constructs a new instance. @@ -164,7 +165,7 @@ protected Builder() { @SuppressWarnings("resource") @Override public CSVParser get() throws IOException { - return new CSVParser(getReader(), format != null ? format : CSVFormat.DEFAULT, characterOffset, recordNumber, getCharset()); + return new CSVParser(getReader(), format != null ? 
format : CSVFormat.DEFAULT, characterOffset, recordNumber, getCharset(), enableByteTracking); } /** @@ -200,6 +201,11 @@ public Builder setRecordNumber(final long recordNumber) { return asThis(); } + public Builder setEnableByteTracking(final boolean enableByteTracking) { + this.enableByteTracking = enableByteTracking; + return asThis(); + } + } final class CSVRecordIterator implements Iterator { @@ -507,7 +513,7 @@ public CSVParser(final Reader reader, final CSVFormat format) throws IOException @SuppressWarnings("resource") public CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber) throws IOException { - this(reader, format, characterOffset, recordNumber, null); + this(reader, format, characterOffset, recordNumber, null, false); } /** @@ -535,12 +541,13 @@ public CSVParser(final Reader reader, final CSVFormat format, final long charact * @throws CSVException Thrown on invalid input. * @since 1.13.0. */ - private CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber, final Charset charset) + private CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber, + final Charset charset, final boolean enableByteTracking) throws IOException { Objects.requireNonNull(reader, "reader"); Objects.requireNonNull(format, "format"); this.format = format.copy(); - this.lexer = new Lexer(format, new ExtendedBufferedReader(reader, charset)); + this.lexer = new Lexer(format, new ExtendedBufferedReader(reader, charset, enableByteTracking)); this.csvRecordIterator = new CSVRecordIterator(); this.headers = createHeaders(); this.characterOffset = characterOffset; diff --git a/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java b/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java index 158f90a755..a64868b39b 100644 --- a/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java +++ 
b/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java @@ -67,9 +67,9 @@ final class ExtendedBufferedReader extends UnsynchronizedBufferedReader { super(reader); } - ExtendedBufferedReader(final Reader reader, Charset charset) { + ExtendedBufferedReader(final Reader reader, Charset charset, boolean enableByteTracking) { super(reader); - if (charset != null) { + if (charset != null && enableByteTracking) { encoder = charset.newEncoder(); } } diff --git a/src/test/java/org/apache/commons/csv/CSVParserTest.java b/src/test/java/org/apache/commons/csv/CSVParserTest.java index 2b68155624..219e5e5fa5 100644 --- a/src/test/java/org/apache/commons/csv/CSVParserTest.java +++ b/src/test/java/org/apache/commons/csv/CSVParserTest.java @@ -711,7 +711,7 @@ public void testGetRecordThreeBytesRead() throws Exception { .setDelimiter(',') .setQuote('\'') .get(); - try (CSVParser parser = CSVParser.builder().setReader(new StringReader(code)).setFormat(format).setCharset(UTF_8).get() ) { + try (CSVParser parser = CSVParser.builder().setReader(new StringReader(code)).setFormat(format).setCharset(UTF_8).setEnableByteTracking(true).get() ) { CSVRecord record = new CSVRecord(parser, null, null, 1L, 0L, 0L); assertEquals(0, parser.getRecordNumber()); @@ -748,7 +748,7 @@ public void testGetRecordFourBytesRead() throws Exception { .setDelimiter(',') .setQuote('\'') .get(); - try (CSVParser parser = CSVParser.builder().setReader(new StringReader(code)).setFormat(format).setCharset(UTF_8).get()) { + try (CSVParser parser = CSVParser.builder().setReader(new StringReader(code)).setFormat(format).setCharset(UTF_8).setEnableByteTracking(true).get()) { CSVRecord record = new CSVRecord(parser, null, null, 1L, 0L, 0L); assertEquals(0, parser.getRecordNumber()); diff --git a/src/test/java/org/apache/commons/csv/JiraCsv196Test.java b/src/test/java/org/apache/commons/csv/JiraCsv196Test.java index 853007f9e5..a49d934cfc 100644 --- a/src/test/java/org/apache/commons/csv/JiraCsv196Test.java +++ 
b/src/test/java/org/apache/commons/csv/JiraCsv196Test.java @@ -37,6 +37,7 @@ public void parseThreeBytes() throws IOException { .setFormat(format) .setReader(getTestInput("org/apache/commons/csv/CSV-196/japanese.csv")) .setCharset(StandardCharsets.UTF_8) + .setEnableByteTracking(true) .get(); long[] charByteKey = {0, 89, 242, 395}; int idx = 0; @@ -57,6 +58,7 @@ public void parseFourBytes() throws IOException { .setFormat(format) .setReader(getTestInput("org/apache/commons/csv/CSV-196/emoji.csv")) .setCharset(StandardCharsets.UTF_8) + .setEnableByteTracking(true) .get(); long[] charByteKey = {0, 84, 701, 1318, 1935}; int idx = 0; From 8387f796b89cedbfbd0b5a30266702c682e22371 Mon Sep 17 00:00:00 2001 From: Yuzhan Jiang <36880517+DarrenJAN@users.noreply.github.com> Date: Fri, 13 Dec 2024 17:05:48 -0500 Subject: [PATCH 5/7] Fix comments (#15) * Fix comments --- .../java/org/apache/commons/csv/CSVParser.java | 8 +++++++- .../java/org/apache/commons/csv/CSVRecord.java | 2 +- .../commons/csv/ExtendedBufferedReader.java | 15 ++++++++++++++- src/main/java/org/apache/commons/csv/Lexer.java | 2 +- .../org/apache/commons/csv/CSVParserTest.java | 4 ++-- 5 files changed, 25 insertions(+), 6 deletions(-) diff --git a/src/main/java/org/apache/commons/csv/CSVParser.java b/src/main/java/org/apache/commons/csv/CSVParser.java index d3d8c9f3da..9ff28a96ae 100644 --- a/src/main/java/org/apache/commons/csv/CSVParser.java +++ b/src/main/java/org/apache/commons/csv/CSVParser.java @@ -153,7 +153,7 @@ public static class Builder extends AbstractStreamBuilder { private CSVFormat format; private long characterOffset; private long recordNumber = 1; - private boolean enableByteTracking = false; + private boolean enableByteTracking; /** * Constructs a new instance. @@ -201,6 +201,12 @@ public Builder setRecordNumber(final long recordNumber) { return asThis(); } + /** + * Sets whether to enable byte tracking for the parser. 
+ * + * @param enableByteTracking {@code true} to enable byte tracking; {@code false} to disable it. + * @return this instance. + */ public Builder setEnableByteTracking(final boolean enableByteTracking) { this.enableByteTracking = enableByteTracking; return asThis(); diff --git a/src/main/java/org/apache/commons/csv/CSVRecord.java b/src/main/java/org/apache/commons/csv/CSVRecord.java index f0a0a6b816..54c88812f0 100644 --- a/src/main/java/org/apache/commons/csv/CSVRecord.java +++ b/src/main/java/org/apache/commons/csv/CSVRecord.java @@ -160,7 +160,7 @@ public long getCharacterPosition() { } /** - * Returns the start byte of this record as a character byte in the source stream. + * Gets the start byte of this record as a character byte in the source stream * * @return the start byte of this record as a character byte in the source stream. */ diff --git a/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java b/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java index a64868b39b..61f6ae2f3e 100644 --- a/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java +++ b/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java @@ -67,6 +67,15 @@ final class ExtendedBufferedReader extends UnsynchronizedBufferedReader { super(reader); } + /** + * Constructs a new instance with the specified reader, character set, + * and byte tracking option. Initializes an encoder if byte tracking is enabled + * and a character set is provided. + * + * @param reader the reader supports a look-ahead option. + * @param charset the character set for encoding, or {@code null} if not applicable. + * @param enableByteTracking {@code true} to enable byte tracking; {@code false} to disable it. 
+ */ ExtendedBufferedReader(final Reader reader, Charset charset, boolean enableByteTracking) { super(reader); if (charset != null && enableByteTracking) { @@ -146,7 +155,7 @@ public int read() throws IOException { } /** - * In Java, the {@code char} data type is based on the original Unicode + * Gets the byte length of the given character based on the original Unicode * specification, which defined characters as fixed-width 16-bit entities. *

* The Unicode characters are divided into two main ranges: @@ -166,6 +175,10 @@ public int read() throws IOException { * * * + * + * @param current the current character to process. + * @return the byte length of the character. + * @throws CharacterCodingException if the character cannot be encoded. */ private long getCharBytes(int current) throws CharacterCodingException { final char cChar = (char) current; diff --git a/src/main/java/org/apache/commons/csv/Lexer.java b/src/main/java/org/apache/commons/csv/Lexer.java index afbba4d21d..3f14b2d883 100644 --- a/src/main/java/org/apache/commons/csv/Lexer.java +++ b/src/main/java/org/apache/commons/csv/Lexer.java @@ -104,7 +104,7 @@ long getCharacterPosition() { } /** - * Returns the number of bytes read + * Gets the number of bytes read * * @return the number of bytes read */ diff --git a/src/test/java/org/apache/commons/csv/CSVParserTest.java b/src/test/java/org/apache/commons/csv/CSVParserTest.java index 219e5e5fa5..7e3cafa65c 100644 --- a/src/test/java/org/apache/commons/csv/CSVParserTest.java +++ b/src/test/java/org/apache/commons/csv/CSVParserTest.java @@ -703,7 +703,7 @@ public void testGetHeaderComment_NoComment3() throws IOException { @Test public void testGetRecordThreeBytesRead() throws Exception { - String code = "id,date,val5,val4\n" + + final String code = "id,date,val5,val4\n" + "11111111111111,'4017-09-01',きちんと節分近くには咲いてる~,v4\n" + "22222222222222,'4017-01-01',おはよう私の友人~,v4\n" + "33333333333333,'4017-01-01',きる自然の力ってすごいな~,v4\n"; @@ -740,7 +740,7 @@ public void testGetRecordThreeBytesRead() throws Exception { @Test public void testGetRecordFourBytesRead() throws Exception { - String code = "id,a,b,c\n" + + final String code = "id,a,b,c\n" + "1,😊,🤔,😂\n" + "2,😊,🤔,😂\n" + "3,😊,🤔,😂\n"; From bdd152f917f22d6dc551f0b841bfd1ee809e95c7 Mon Sep 17 00:00:00 2001 From: Yuzhan Jiang <36880517+DarrenJAN@users.noreply.github.com> Date: Thu, 26 Dec 2024 19:08:39 -0500 Subject: [PATCH 6/7] CSV-196-master: More changes (#16) 
--- .../org/apache/commons/csv/CSVParser.java | 5 +++-- .../org/apache/commons/csv/CSVRecord.java | 20 +++++-------------- .../commons/csv/ExtendedBufferedReader.java | 4 ++-- .../org/apache/commons/csv/CSVParserTest.java | 16 +++++++-------- .../org/apache/commons/csv/CSVRecordTest.java | 2 +- .../apache/commons/csv/JiraCsv196Test.java | 4 ++-- 6 files changed, 21 insertions(+), 30 deletions(-) diff --git a/src/main/java/org/apache/commons/csv/CSVParser.java b/src/main/java/org/apache/commons/csv/CSVParser.java index 9ff28a96ae..50230388f8 100644 --- a/src/main/java/org/apache/commons/csv/CSVParser.java +++ b/src/main/java/org/apache/commons/csv/CSVParser.java @@ -539,13 +539,14 @@ public CSVParser(final Reader reader, final CSVFormat format, final long charact * @param recordNumber * The next record number to assign. * @param charset - * The character encoding to be used for the reader. + * The character encoding to be used for the reader when enableByteTracking is true. + * @param enableByteTracking + * {@code true} to enable byte tracking for the parser; {@code false} to disable it. * @throws IllegalArgumentException * If the parameters of the format are inconsistent or if either the reader or format is null. * @throws IOException * If there is a problem reading the header or skipping the first record. * @throws CSVException Thrown on invalid input. - * @since 1.13.0. 
*/ private CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber, final Charset charset, final boolean enableByteTracking) diff --git a/src/main/java/org/apache/commons/csv/CSVRecord.java b/src/main/java/org/apache/commons/csv/CSVRecord.java index 54c88812f0..386a25c852 100644 --- a/src/main/java/org/apache/commons/csv/CSVRecord.java +++ b/src/main/java/org/apache/commons/csv/CSVRecord.java @@ -51,7 +51,7 @@ public final class CSVRecord implements Serializable, Iterable { /** * The start byte of this record as a character byte in the source stream. */ - private final long characterByte; + private final long bytePosition; /** The accumulated comments (if any) */ private final String comment; @@ -65,24 +65,14 @@ public final class CSVRecord implements Serializable, Iterable { /** The parser that originates this record. This is not serialized. */ private final transient CSVParser parser; - CSVRecord(final CSVParser parser, final String[] values, final String comment, final long recordNumber, - final long characterPosition) { - this.recordNumber = recordNumber; - this.values = values != null ? values : Constants.EMPTY_STRING_ARRAY; - this.parser = parser; - this.comment = comment; - this.characterPosition = characterPosition; - this.characterByte = 0L; - } - CSVRecord(final CSVParser parser, final String[] values, final String comment, final long recordNumber, - final long characterPosition, final long characterByte) { + final long characterPosition, final long bytePosition) { this.recordNumber = recordNumber; this.values = values != null ? values : Constants.EMPTY_STRING_ARRAY; this.parser = parser; this.comment = comment; this.characterPosition = characterPosition; - this.characterByte = characterByte; + this.bytePosition = bytePosition; } /** * Returns a value by {@link Enum}. 
@@ -164,8 +154,8 @@ public long getCharacterPosition() { * * @return the start byte of this record as a character byte in the source stream. */ - public long getCharacterByte() { - return characterByte; + public long getBytePosition() { + return bytePosition; } /** diff --git a/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java b/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java index 61f6ae2f3e..24044966d1 100644 --- a/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java +++ b/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java @@ -147,7 +147,7 @@ public int read() throws IOException { lineNumber++; } if (encoder != null) { - this.bytesRead += getCharBytes(current); + this.bytesRead += getEncodedCharLength(current); } lastChar = current; position++; @@ -180,7 +180,7 @@ public int read() throws IOException { * @return the byte length of the character. * @throws CharacterCodingException if the character cannot be encoded. */ - private long getCharBytes(int current) throws CharacterCodingException { + private int getEncodedCharLength(int current) throws CharacterCodingException { final char cChar = (char) current; final char lChar = (char) lastChar; if (!Character.isSurrogate(cChar)) { diff --git a/src/test/java/org/apache/commons/csv/CSVParserTest.java b/src/test/java/org/apache/commons/csv/CSVParserTest.java index 7e3cafa65c..ac3708a52a 100644 --- a/src/test/java/org/apache/commons/csv/CSVParserTest.java +++ b/src/test/java/org/apache/commons/csv/CSVParserTest.java @@ -718,22 +718,22 @@ public void testGetRecordThreeBytesRead() throws Exception { assertNotNull(record = parser.nextRecord()); assertEquals(1, record.getRecordNumber()); assertEquals(code.indexOf('i'), record.getCharacterPosition()); - assertEquals(record.getCharacterByte(), record.getCharacterPosition()); + assertEquals(record.getBytePosition(), record.getCharacterPosition()); assertNotNull(record = parser.nextRecord()); assertEquals(2, 
record.getRecordNumber()); assertEquals(code.indexOf('1'), record.getCharacterPosition()); - assertEquals(record.getCharacterByte(), record.getCharacterPosition()); + assertEquals(record.getBytePosition(), record.getCharacterPosition()); assertNotNull(record = parser.nextRecord()); assertEquals(3, record.getRecordNumber()); assertEquals(code.indexOf('2'), record.getCharacterPosition()); - assertEquals(record.getCharacterByte(), 95); + assertEquals(record.getBytePosition(), 95); assertNotNull(record = parser.nextRecord()); assertEquals(4, record.getRecordNumber()); assertEquals(code.indexOf('3'), record.getCharacterPosition()); - assertEquals(record.getCharacterByte(), 154); + assertEquals(record.getBytePosition(), 154); }; } @@ -755,20 +755,20 @@ public void testGetRecordFourBytesRead() throws Exception { assertNotNull(record = parser.nextRecord()); assertEquals(1, record.getRecordNumber()); assertEquals(code.indexOf('i'), record.getCharacterPosition()); - assertEquals(record.getCharacterByte(), record.getCharacterPosition()); + assertEquals(record.getBytePosition(), record.getCharacterPosition()); assertNotNull(record = parser.nextRecord()); assertEquals(2, record.getRecordNumber()); assertEquals(code.indexOf('1'), record.getCharacterPosition()); - assertEquals(record.getCharacterByte(), record.getCharacterPosition()); + assertEquals(record.getBytePosition(), record.getCharacterPosition()); assertNotNull(record = parser.nextRecord()); assertEquals(3, record.getRecordNumber()); assertEquals(code.indexOf('2'), record.getCharacterPosition()); - assertEquals(record.getCharacterByte(), 26); + assertEquals(record.getBytePosition(), 26); assertNotNull(record = parser.nextRecord()); assertEquals(4, record.getRecordNumber()); assertEquals(code.indexOf('3'), record.getCharacterPosition()); - assertEquals(record.getCharacterByte(), 43); + assertEquals(record.getBytePosition(), 43); } } diff --git a/src/test/java/org/apache/commons/csv/CSVRecordTest.java 
b/src/test/java/org/apache/commons/csv/CSVRecordTest.java index 5b0c5d812c..40c057e9b8 100644 --- a/src/test/java/org/apache/commons/csv/CSVRecordTest.java +++ b/src/test/java/org/apache/commons/csv/CSVRecordTest.java @@ -85,7 +85,7 @@ record = parser.iterator().next(); @Test public void testCSVRecordNULLValues() throws IOException { try (CSVParser parser = CSVParser.parse("A,B\r\nONE,TWO", CSVFormat.DEFAULT.withHeader())) { - final CSVRecord csvRecord = new CSVRecord(parser, null, null, 0L, 0L); + final CSVRecord csvRecord = new CSVRecord(parser, null, null, 0L, 0L, 0L); assertEquals(0, csvRecord.size()); assertThrows(IllegalArgumentException.class, () -> csvRecord.get("B")); } diff --git a/src/test/java/org/apache/commons/csv/JiraCsv196Test.java b/src/test/java/org/apache/commons/csv/JiraCsv196Test.java index a49d934cfc..150a5f7f13 100644 --- a/src/test/java/org/apache/commons/csv/JiraCsv196Test.java +++ b/src/test/java/org/apache/commons/csv/JiraCsv196Test.java @@ -42,7 +42,7 @@ public void parseThreeBytes() throws IOException { long[] charByteKey = {0, 89, 242, 395}; int idx = 0; for (CSVRecord record : parser) { - assertEquals(charByteKey[idx++], record.getCharacterByte()); + assertEquals(charByteKey[idx++], record.getBytePosition()); } parser.close(); } @@ -63,7 +63,7 @@ public void parseFourBytes() throws IOException { long[] charByteKey = {0, 84, 701, 1318, 1935}; int idx = 0; for (CSVRecord record : parser) { - assertEquals(charByteKey[idx++], record.getCharacterByte()); + assertEquals(charByteKey[idx++], record.getBytePosition()); } parser.close(); } From d403084ddaf83992123035b7dd2876d0dcb083e8 Mon Sep 17 00:00:00 2001 From: Yuzhan Jiang <36880517+DarrenJAN@users.noreply.github.com> Date: Tue, 31 Dec 2024 17:10:07 -0500 Subject: [PATCH 7/7] CSV-196: Comments changes on Dec30 (#17) --- .../org/apache/commons/csv/CSVParser.java | 5 +-- .../org/apache/commons/csv/CSVRecord.java | 7 ++-- .../commons/csv/ExtendedBufferedReader.java | 4 ++- 
.../org/apache/commons/csv/CSVParserTest.java | 3 +- .../apache/commons/csv/JiraCsv196Test.java | 34 ++++++++++--------- 5 files changed, 29 insertions(+), 24 deletions(-) diff --git a/src/main/java/org/apache/commons/csv/CSVParser.java b/src/main/java/org/apache/commons/csv/CSVParser.java index 95dd282aea..d9bb01fcff 100644 --- a/src/main/java/org/apache/commons/csv/CSVParser.java +++ b/src/main/java/org/apache/commons/csv/CSVParser.java @@ -208,6 +208,7 @@ public Builder setRecordNumber(final long recordNumber) { * * @param enableByteTracking {@code true} to enable byte tracking; {@code false} to disable it. * @return this instance. + * @since 1.13.0 */ public Builder setEnableByteTracking(final boolean enableByteTracking) { this.enableByteTracking = enableByteTracking; @@ -885,7 +886,7 @@ CSVRecord nextRecord() throws IOException { recordList.clear(); StringBuilder sb = null; final long startCharPosition = lexer.getCharacterPosition() + characterOffset; - final long startCharByte = lexer.getBytesRead() + this.characterOffset; + final long startBytePosition = lexer.getBytesRead() + this.characterOffset; do { reusableToken.reset(); lexer.nextToken(reusableToken); @@ -923,7 +924,7 @@ CSVRecord nextRecord() throws IOException { recordNumber++; final String comment = Objects.toString(sb, null); result = new CSVRecord(this, recordList.toArray(Constants.EMPTY_STRING_ARRAY), comment, - recordNumber, startCharPosition, startCharByte); + recordNumber, startCharPosition, startBytePosition); } return result; } diff --git a/src/main/java/org/apache/commons/csv/CSVRecord.java b/src/main/java/org/apache/commons/csv/CSVRecord.java index 0da013458b..284220c38f 100644 --- a/src/main/java/org/apache/commons/csv/CSVRecord.java +++ b/src/main/java/org/apache/commons/csv/CSVRecord.java @@ -51,7 +51,7 @@ public final class CSVRecord implements Serializable, Iterable { private final long characterPosition; /** - * The start byte of this record as a character byte in the source stream. 
+ * The starting position of this record in the source stream, measured in bytes. */ private final long bytePosition; @@ -152,9 +152,10 @@ public long getCharacterPosition() { } /** - * Gets the start byte of this record as a character byte in the source stream + * Returns the starting position of this record in the source stream, measured in bytes. * - * @return the start byte of this record as a character byte in the source stream. + * @return the byte position of this record in the source stream. + * @since 1.13.0 */ public long getBytePosition() { return bytePosition; diff --git a/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java b/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java index f4a093f94c..6043ccaf08 100644 --- a/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java +++ b/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java @@ -195,7 +195,9 @@ private int getEncodedCharLength(int current) throws CharacterCodingException { } else if (Character.isSurrogatePair(lChar, cChar)) { return encoder.encode( CharBuffer.wrap(new char[] {lChar, cChar})).limit(); - } else throw new CharacterCodingException(); + } else { + throw new CharacterCodingException(); + } } } diff --git a/src/test/java/org/apache/commons/csv/CSVParserTest.java b/src/test/java/org/apache/commons/csv/CSVParserTest.java index 1e4a099a14..c42a3c25ab 100644 --- a/src/test/java/org/apache/commons/csv/CSVParserTest.java +++ b/src/test/java/org/apache/commons/csv/CSVParserTest.java @@ -736,8 +736,7 @@ public void testGetRecordThreeBytesRead() throws Exception { assertEquals(4, record.getRecordNumber()); assertEquals(code.indexOf('3'), record.getCharacterPosition()); assertEquals(record.getBytePosition(), 154); - }; - + } } @Test diff --git a/src/test/java/org/apache/commons/csv/JiraCsv196Test.java b/src/test/java/org/apache/commons/csv/JiraCsv196Test.java index 150a5f7f13..ab7af819e7 100644 --- 
a/src/test/java/org/apache/commons/csv/JiraCsv196Test.java +++ b/src/test/java/org/apache/commons/csv/JiraCsv196Test.java @@ -1,18 +1,20 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * https://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
*/ package org.apache.commons.csv; import static org.junit.jupiter.api.Assertions.assertEquals; @@ -33,13 +35,13 @@ public void parseThreeBytes() throws IOException { .setDelimiter(',') .setQuote('\'') .get(); - CSVParser parser = new CSVParser.Builder() + final CSVParser parser = new CSVParser.Builder() .setFormat(format) .setReader(getTestInput("org/apache/commons/csv/CSV-196/japanese.csv")) .setCharset(StandardCharsets.UTF_8) .setEnableByteTracking(true) .get(); - long[] charByteKey = {0, 89, 242, 395}; + final long[] charByteKey = {0, 89, 242, 395}; int idx = 0; for (CSVRecord record : parser) { assertEquals(charByteKey[idx++], record.getBytePosition()); @@ -54,13 +56,13 @@ public void parseFourBytes() throws IOException { .setDelimiter(',') .setQuote('\'') .get(); - CSVParser parser = new CSVParser.Builder() + final CSVParser parser = new CSVParser.Builder() .setFormat(format) .setReader(getTestInput("org/apache/commons/csv/CSV-196/emoji.csv")) .setCharset(StandardCharsets.UTF_8) .setEnableByteTracking(true) .get(); - long[] charByteKey = {0, 84, 701, 1318, 1935}; + final long[] charByteKey = {0, 84, 701, 1318, 1935}; int idx = 0; for (CSVRecord record : parser) { assertEquals(charByteKey[idx++], record.getBytePosition());