+ * This method provides a way to parse CSV data from an input stream, starting at a specified character offset and record number, + * using a specified encoding. It returns a {@link CSVParser} that can be used to iterate over the parsed {@link CSVRecord}s. + *
+ * + *+ * For additional parsing options, see the various static parse methods available on {@link CSVParser}. + *
+ * + * @param reader the input stream + * @param characterOffset the character offset to start parsing from + * @param recordNumber the initial record number to start counting from + * @param encoding the character encoding of the input stream + * @return a parser over a stream of {@link CSVRecord}s. + * @throws IOException If an I/O error occurs + * @throws CSVException Thrown on invalid input. + */ + public CSVParser parse(final Reader reader, final long characterOffset, final long recordNumber, String encoding) throws IOException { + return new CSVParser(reader, this, characterOffset, recordNumber, encoding); + } + /** * Prints to the specified output. * diff --git a/src/main/java/org/apache/commons/csv/CSVParser.java b/src/main/java/org/apache/commons/csv/CSVParser.java index f0341cf719..75bf78d20a 100644 --- a/src/main/java/org/apache/commons/csv/CSVParser.java +++ b/src/main/java/org/apache/commons/csv/CSVParser.java @@ -511,10 +511,39 @@ public CSVParser(final Reader reader, final CSVFormat format) throws IOException @SuppressWarnings("resource") public CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber) throws IOException { + this(reader, format, characterOffset, recordNumber, null); + } + + /** + * Constructs a new instance using the given {@link CSVFormat} + * + *+ * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser, + * unless you close the {@code reader}. + *
+ * + * @param reader + * a Reader containing CSV-formatted input. Must not be null. + * @param format + * the CSVFormat used for CSV parsing. Must not be null. + * @param characterOffset + * Lexer offset when the parser does not start parsing at the beginning of the source. + * @param recordNumber + * The next record number to assign + * @param encoding + * The encoding to use for the reader + * @throws IllegalArgumentException + * If the parameters of the format are inconsistent or if either the reader or format is null. + * @throws IOException + * If there is a problem reading the header or skipping the first record + * @throws CSVException Thrown on invalid input. + */ + public CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber, + String encoding) throws IOException { Objects.requireNonNull(reader, "reader"); Objects.requireNonNull(format, "format"); this.format = format.copy(); - this.lexer = new Lexer(format, new ExtendedBufferedReader(reader)); + this.lexer = new Lexer(format, new ExtendedBufferedReader(reader, encoding)); this.csvRecordIterator = new CSVRecordIterator(); this.headers = createHeaders(); this.characterOffset = characterOffset; @@ -841,6 +870,7 @@ CSVRecord nextRecord() throws IOException { recordList.clear(); StringBuilder sb = null; final long startCharPosition = lexer.getCharacterPosition() + characterOffset; + final long startCharByte = lexer.getBytesRead() + this.characterOffset; do { reusableToken.reset(); lexer.nextToken(reusableToken); @@ -878,7 +908,7 @@ CSVRecord nextRecord() throws IOException { recordNumber++; final String comment = Objects.toString(sb, null); result = new CSVRecord(this, recordList.toArray(Constants.EMPTY_STRING_ARRAY), comment, - recordNumber, startCharPosition); + recordNumber, startCharPosition, startCharByte); } return result; } diff --git a/src/main/java/org/apache/commons/csv/CSVRecord.java b/src/main/java/org/apache/commons/csv/CSVRecord.java index 
1fac65843d..f0a0a6b816 100644 --- a/src/main/java/org/apache/commons/csv/CSVRecord.java +++ b/src/main/java/org/apache/commons/csv/CSVRecord.java @@ -48,6 +48,11 @@ public final class CSVRecord implements Serializable, Iterable- * This method provides a way to parse CSV data from an input stream, starting at a specified character offset and record number, - * using a specified encoding. It returns a {@link CSVParser} that can be used to iterate over the parsed {@link CSVRecord}s. - *
- * - *- * For additional parsing options, see the various static parse methods available on {@link CSVParser}. - *
- * - * @param reader the input stream - * @param characterOffset the character offset to start parsing from - * @param recordNumber the initial record number to start counting from - * @param encoding the character encoding of the input stream - * @return a parser over a stream of {@link CSVRecord}s. - * @throws IOException If an I/O error occurs - * @throws CSVException Thrown on invalid input. - */ - public CSVParser parse(final Reader reader, final long characterOffset, final long recordNumber, String encoding) throws IOException { - return new CSVParser(reader, this, characterOffset, recordNumber, encoding); - } - /** * Prints to the specified output. * diff --git a/src/main/java/org/apache/commons/csv/CSVParser.java b/src/main/java/org/apache/commons/csv/CSVParser.java index c48e1da096..024dd562d4 100644 --- a/src/main/java/org/apache/commons/csv/CSVParser.java +++ b/src/main/java/org/apache/commons/csv/CSVParser.java @@ -153,6 +153,7 @@ public static class Builder extends AbstractStreamBuilder@@ -525,21 +536,22 @@ public CSVParser(final Reader reader, final CSVFormat format, final long charact * @param characterOffset * Lexer offset when the parser does not start parsing at the beginning of the source. * @param recordNumber - * The next record number to assign - * @param encoding - * The encoding to use for the reader + * The next record number to assign. + * @param charset + * The character encoding to be used for the reader. * @throws IllegalArgumentException * If the parameters of the format are inconsistent or if either the reader or format is null. * @throws IOException - * If there is a problem reading the header or skipping the first record + * If there is a problem reading the header or skipping the first record. * @throws CSVException Thrown on invalid input. + * @since 1.13.0. 
*/ - public CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber, - String encoding) throws IOException { + private CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber, final Charset charset) + throws IOException { Objects.requireNonNull(reader, "reader"); Objects.requireNonNull(format, "format"); this.format = format.copy(); - this.lexer = new Lexer(format, new ExtendedBufferedReader(reader, encoding)); + this.lexer = new Lexer(format, new ExtendedBufferedReader(reader, charset)); this.csvRecordIterator = new CSVRecordIterator(); this.headers = createHeaders(); this.characterOffset = characterOffset; diff --git a/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java b/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java index 2a82d48a5a..158f90a755 100644 --- a/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java +++ b/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java @@ -53,12 +53,12 @@ final class ExtendedBufferedReader extends UnsynchronizedBufferedReader { private long position; private long positionMark; - /** The number of bytes read so far */ + /** The number of bytes read so far. */ private long bytesRead; private long bytesReadMark; - /** Encoder used to calculate the bytes of characters */ - CharsetEncoder encoder; + /** Encoder for calculating the number of bytes for each character read. */ + private CharsetEncoder encoder; /** * Constructs a new instance using the default buffer size. 
@@ -67,10 +67,10 @@ final class ExtendedBufferedReader extends UnsynchronizedBufferedReader { super(reader); } - ExtendedBufferedReader(final Reader reader, String encoding) { + ExtendedBufferedReader(final Reader reader, Charset charset) { super(reader); - if (encoding != null) { - encoder = Charset.forName(encoding).newEncoder(); + if (charset != null) { + encoder = charset.newEncoder(); } } @@ -146,20 +146,30 @@ public int read() throws IOException { } /** - * In Java, a char data type are based on the original Unicode - * specification, which defined characters as fixed-width 16-bit entities. - * U+0000 to U+FFFF: - * - BMP, represented using 1 16-bit char - * - Consists of UTF-8 1-byte, 2-byte, some 3-byte chars - * U+10000 to U+10FFFF: - * - Supplementary characters, represented as a pair of characters, - * the first char from the high-surrogates range (\uD800-\uDBFF), - * and the second char from the low-surrogates range (uDC00-\uDFFF). - * - Consists of UTF-8 some 3-byte chars and 4-byte chars + * In Java, the {@code char} data type is based on the original Unicode + * specification, which defined characters as fixed-width 16-bit entities. + *
+ * The Unicode characters are divided into two main ranges: + *
* The Unicode characters are divided into two main ranges:
@@ -166,6 +175,10 @@ public int read() throws IOException {
* </ul>
* </li>
* </ul>
+ *
+ * @param current the current character to process.
+ * @return the byte length of the character.
+ * @throws CharacterCodingException if the character cannot be encoded.
*/
private long getCharBytes(int current) throws CharacterCodingException {
final char cChar = (char) current;
diff --git a/src/main/java/org/apache/commons/csv/Lexer.java b/src/main/java/org/apache/commons/csv/Lexer.java
index afbba4d21d..3f14b2d883 100644
--- a/src/main/java/org/apache/commons/csv/Lexer.java
+++ b/src/main/java/org/apache/commons/csv/Lexer.java
@@ -104,7 +104,7 @@ long getCharacterPosition() {
}
/**
- * Returns the number of bytes read
+ * Gets the number of bytes read.
*
* @return the number of bytes read
*/
diff --git a/src/test/java/org/apache/commons/csv/CSVParserTest.java b/src/test/java/org/apache/commons/csv/CSVParserTest.java
index 219e5e5fa5..7e3cafa65c 100644
--- a/src/test/java/org/apache/commons/csv/CSVParserTest.java
+++ b/src/test/java/org/apache/commons/csv/CSVParserTest.java
@@ -703,7 +703,7 @@ public void testGetHeaderComment_NoComment3() throws IOException {
@Test
public void testGetRecordThreeBytesRead() throws Exception {
- String code = "id,date,val5,val4\n" +
+ final String code = "id,date,val5,val4\n" +
"11111111111111,'4017-09-01',きちんと節分近くには咲いてる~,v4\n" +
"22222222222222,'4017-01-01',おはよう私の友人~,v4\n" +
"33333333333333,'4017-01-01',きる自然の力ってすごいな~,v4\n";
@@ -740,7 +740,7 @@ public void testGetRecordThreeBytesRead() throws Exception {
@Test
public void testGetRecordFourBytesRead() throws Exception {
- String code = "id,a,b,c\n" +
+ final String code = "id,a,b,c\n" +
"1,😊,🤔,😂\n" +
"2,😊,🤔,😂\n" +
"3,😊,🤔,😂\n";
From bdd152f917f22d6dc551f0b841bfd1ee809e95c7 Mon Sep 17 00:00:00 2001
From: Yuzhan Jiang <36880517+DarrenJAN@users.noreply.github.com>
Date: Thu, 26 Dec 2024 19:08:39 -0500
Subject: [PATCH 6/7] CSV-196-master: More changes (#16)
---
.../org/apache/commons/csv/CSVParser.java | 5 +++--
.../org/apache/commons/csv/CSVRecord.java | 20 +++++--------------
.../commons/csv/ExtendedBufferedReader.java | 4 ++--
.../org/apache/commons/csv/CSVParserTest.java | 16 +++++++--------
.../org/apache/commons/csv/CSVRecordTest.java | 2 +-
.../apache/commons/csv/JiraCsv196Test.java | 4 ++--
6 files changed, 21 insertions(+), 30 deletions(-)
diff --git a/src/main/java/org/apache/commons/csv/CSVParser.java b/src/main/java/org/apache/commons/csv/CSVParser.java
index 9ff28a96ae..50230388f8 100644
--- a/src/main/java/org/apache/commons/csv/CSVParser.java
+++ b/src/main/java/org/apache/commons/csv/CSVParser.java
@@ -539,13 +539,14 @@ public CSVParser(final Reader reader, final CSVFormat format, final long charact
* @param recordNumber
* The next record number to assign.
* @param charset
- * The character encoding to be used for the reader.
+ * The character encoding to be used for the reader when {@code enableByteTracking} is {@code true}.
+ * @param enableByteTracking
+ * {@code true} to enable byte tracking for the parser; {@code false} to disable it.
* @throws IllegalArgumentException
* If the parameters of the format are inconsistent or if either the reader or format is null.
* @throws IOException
* If there is a problem reading the header or skipping the first record.
* @throws CSVException Thrown on invalid input.
- * @since 1.13.0.
*/
private CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber,
final Charset charset, final boolean enableByteTracking)
diff --git a/src/main/java/org/apache/commons/csv/CSVRecord.java b/src/main/java/org/apache/commons/csv/CSVRecord.java
index 54c88812f0..386a25c852 100644
--- a/src/main/java/org/apache/commons/csv/CSVRecord.java
+++ b/src/main/java/org/apache/commons/csv/CSVRecord.java
@@ -51,7 +51,7 @@ public final class CSVRecord implements Serializable, Iterable