diff --git a/pom.xml b/pom.xml index 8679177d30..63fad98880 100644 --- a/pom.xml +++ b/pom.xml @@ -245,6 +245,8 @@ src/test/resources/org/apache/commons/csv/CSV-141/csv-141.csv src/test/resources/org/apache/commons/csv/csv-167/sample1.csv src/test/resources/org/apache/commons/csv/CSV-198/optd_por_public.csv + src/test/resources/org/apache/commons/csv/CSV-196/emoji.csv + src/test/resources/org/apache/commons/csv/CSV-196/japanese.csv src/test/resources/org/apache/commons/csv/CSV-213/999751170.patch.csv src/test/resources/org/apache/commons/csv/CSVFileParser/bom.csv src/test/resources/org/apache/commons/csv/CSVFileParser/test.csv diff --git a/src/main/java/org/apache/commons/csv/CSVParser.java b/src/main/java/org/apache/commons/csv/CSVParser.java index d0db9e6c3f..d9bb01fcff 100644 --- a/src/main/java/org/apache/commons/csv/CSVParser.java +++ b/src/main/java/org/apache/commons/csv/CSVParser.java @@ -155,6 +155,7 @@ public static class Builder extends AbstractStreamBuilder { private CSVFormat format; private long characterOffset; private long recordNumber = 1; + private boolean enableByteTracking; /** * Constructs a new instance. @@ -166,7 +167,7 @@ protected Builder() { @SuppressWarnings("resource") @Override public CSVParser get() throws IOException { - return new CSVParser(getReader(), format != null ? format : CSVFormat.DEFAULT, characterOffset, recordNumber); + return new CSVParser(getReader(), format != null ? format : CSVFormat.DEFAULT, characterOffset, recordNumber, getCharset(), enableByteTracking); } /** @@ -202,6 +203,18 @@ public Builder setRecordNumber(final long recordNumber) { return asThis(); } + /** + * Sets whether to enable byte tracking for the parser. + * + * @param enableByteTracking {@code true} to enable byte tracking; {@code false} to disable it. + * @return this instance. 
+ * @since 1.13.0 + */ + public Builder setEnableByteTracking(final boolean enableByteTracking) { + this.enableByteTracking = enableByteTracking; + return asThis(); + } + } final class CSVRecordIterator implements Iterator { @@ -510,11 +523,43 @@ public CSVParser(final Reader reader, final CSVFormat format) throws IOException @Deprecated @SuppressWarnings("resource") public CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber) + throws IOException { + this(reader, format, characterOffset, recordNumber, null, false); + } + + /** + * Constructs a new instance using the given {@link CSVFormat} + * + *

<p> + * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser, + * unless you close the {@code reader}. + * </p>
+ * + * @param reader + * a Reader containing CSV-formatted input. Must not be null. + * @param format + * the CSVFormat used for CSV parsing. Must not be null. + * @param characterOffset + * Lexer offset when the parser does not start parsing at the beginning of the source. + * @param recordNumber + * The next record number to assign. + * @param charset + * The character encoding to be used for the reader when enableByteTracking is true. + * @param enableByteTracking + * {@code true} to enable byte tracking for the parser; {@code false} to disable it. + * @throws IllegalArgumentException + * If the parameters of the format are inconsistent or if either the reader or format is null. + * @throws IOException + * If there is a problem reading the header or skipping the first record. + * @throws CSVException Thrown on invalid input. + */ + private CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber, + final Charset charset, final boolean enableByteTracking) throws IOException { Objects.requireNonNull(reader, "reader"); Objects.requireNonNull(format, "format"); this.format = format.copy(); - this.lexer = new Lexer(format, new ExtendedBufferedReader(reader)); + this.lexer = new Lexer(format, new ExtendedBufferedReader(reader, charset, enableByteTracking)); this.csvRecordIterator = new CSVRecordIterator(); this.headers = createHeaders(); this.characterOffset = characterOffset; @@ -841,6 +886,7 @@ CSVRecord nextRecord() throws IOException { recordList.clear(); StringBuilder sb = null; final long startCharPosition = lexer.getCharacterPosition() + characterOffset; + final long startBytePosition = lexer.getBytesRead() + this.characterOffset; do { reusableToken.reset(); lexer.nextToken(reusableToken); @@ -878,7 +924,7 @@ CSVRecord nextRecord() throws IOException { recordNumber++; final String comment = Objects.toString(sb, null); result = new CSVRecord(this, recordList.toArray(Constants.EMPTY_STRING_ARRAY), comment, - 
recordNumber, startCharPosition); + recordNumber, startCharPosition, startBytePosition); } return result; } diff --git a/src/main/java/org/apache/commons/csv/CSVRecord.java b/src/main/java/org/apache/commons/csv/CSVRecord.java index 948edbe77f..284220c38f 100644 --- a/src/main/java/org/apache/commons/csv/CSVRecord.java +++ b/src/main/java/org/apache/commons/csv/CSVRecord.java @@ -50,6 +50,11 @@ public final class CSVRecord implements Serializable, Iterable { */ private final long characterPosition; + /** + * The starting position of this record in the source stream, measured in bytes. + */ + private final long bytePosition; + /** The accumulated comments (if any) */ private final String comment; @@ -62,15 +67,15 @@ public final class CSVRecord implements Serializable, Iterable { /** The parser that originates this record. This is not serialized. */ private final transient CSVParser parser; - CSVRecord(final CSVParser parser, final String[] values, final String comment, final long recordNumber, - final long characterPosition) { + CSVRecord(final CSVParser parser, final String[] values, final String comment, final long recordNumber, + final long characterPosition, final long bytePosition) { this.recordNumber = recordNumber; this.values = values != null ? values : Constants.EMPTY_STRING_ARRAY; this.parser = parser; this.comment = comment; this.characterPosition = characterPosition; + this.bytePosition = bytePosition; } - /** * Returns a value by {@link Enum}. * @@ -146,6 +151,16 @@ public long getCharacterPosition() { return characterPosition; } + /** + * Returns the starting position of this record in the source stream, measured in bytes. + * + * @return the byte position of this record in the source stream. + * @since 1.13.0 + */ + public long getBytePosition() { + return bytePosition; + } + /** * Returns the comment for this record, if any. * Note that comments are attached to the following record. 
diff --git a/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java b/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java index e476ad8f92..6043ccaf08 100644 --- a/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java +++ b/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java @@ -26,6 +26,10 @@ import java.io.IOException; import java.io.Reader; +import java.nio.CharBuffer; +import java.nio.charset.CharacterCodingException; +import java.nio.charset.Charset; +import java.nio.charset.CharsetEncoder; import org.apache.commons.io.IOUtils; import org.apache.commons.io.input.UnsynchronizedBufferedReader; @@ -51,6 +55,13 @@ final class ExtendedBufferedReader extends UnsynchronizedBufferedReader { private long position; private long positionMark; + /** The number of bytes read so far. */ + private long bytesRead; + private long bytesReadMark; + + /** Encoder for calculating the number of bytes for each character read. */ + private CharsetEncoder encoder; + /** * Constructs a new instance using the default buffer size. */ @@ -58,6 +69,22 @@ final class ExtendedBufferedReader extends UnsynchronizedBufferedReader { super(reader); } + /** + * Constructs a new instance with the specified reader, character set, + * and byte tracking option. Initializes an encoder if byte tracking is enabled + * and a character set is provided. + * + * @param reader the reader supports a look-ahead option. + * @param charset the character set for encoding, or {@code null} if not applicable. + * @param enableByteTracking {@code true} to enable byte tracking; {@code false} to disable it. + */ + ExtendedBufferedReader(final Reader reader, Charset charset, boolean enableByteTracking) { + super(reader); + if (charset != null && enableByteTracking) { + encoder = charset.newEncoder(); + } + } + /** * Closes the stream. 
* @@ -110,6 +137,7 @@ public void mark(final int readAheadLimit) throws IOException { lineNumberMark = lineNumber; lastCharMark = lastChar; positionMark = position; + bytesReadMark = bytesRead; super.mark(readAheadLimit); } @@ -120,11 +148,59 @@ public int read() throws IOException { current == EOF && lastChar != CR && lastChar != LF && lastChar != EOF) { lineNumber++; } + if (encoder != null) { + this.bytesRead += getEncodedCharLength(current); + } lastChar = current; position++; return lastChar; } + /** + * Gets the byte length of the given character based on the original Unicode + * specification, which defined characters as fixed-width 16-bit entities. + *

<p>
+ * The Unicode characters are divided into two main ranges:
+ * <ul>
+ *   <li>U+0000 to U+FFFF (Basic Multilingual Plane, BMP):
+ *     <ul>
+ *       <li>Represented using a single 16-bit {@code char}.</li>
+ *       <li>Includes UTF-8 encodings of 1-byte, 2-byte, and some 3-byte characters.</li>
+ *     </ul>
+ *   </li>
+ *   <li>U+10000 to U+10FFFF (Supplementary Characters):
+ *     <ul>
+ *       <li>Represented as a pair of {@code char}s:</li>
+ *       <li>The first {@code char} is from the high-surrogates range (\uD800-\uDBFF).</li>
+ *       <li>The second {@code char} is from the low-surrogates range (\uDC00-\uDFFF).</li>
+ *       <li>Includes UTF-8 encodings of some 3-byte characters and all 4-byte characters.</li>
+ *     </ul>
+ *   </li>
+ * </ul>
+ * + * @param current the current character to process. + * @return the byte length of the character. + * @throws CharacterCodingException if the character cannot be encoded. + */ + private int getEncodedCharLength(int current) throws CharacterCodingException { + final char cChar = (char) current; + final char lChar = (char) lastChar; + if (!Character.isSurrogate(cChar)) { + return encoder.encode( + CharBuffer.wrap(new char[] {cChar})).limit(); + } else { + if (Character.isHighSurrogate(cChar)) { + // Move on to the next char (low surrogate) + return 0; + } else if (Character.isSurrogatePair(lChar, cChar)) { + return encoder.encode( + CharBuffer.wrap(new char[] {lChar, cChar})).limit(); + } else { + throw new CharacterCodingException(); + } + } + } + @Override public int read(final char[] buf, final int offset, final int length) throws IOException { if (length == 0) { @@ -189,7 +265,17 @@ public void reset() throws IOException { lineNumber = lineNumberMark; lastChar = lastCharMark; position = positionMark; + bytesRead = bytesReadMark; super.reset(); } + /** + * Gets the number of bytes read by the reader. 
+ * + * @return the number of bytes read by the reader + */ + long getBytesRead() { + return this.bytesRead; + } + } diff --git a/src/main/java/org/apache/commons/csv/Lexer.java b/src/main/java/org/apache/commons/csv/Lexer.java index 20227df1d4..2e7d2d0412 100644 --- a/src/main/java/org/apache/commons/csv/Lexer.java +++ b/src/main/java/org/apache/commons/csv/Lexer.java @@ -105,6 +105,15 @@ long getCharacterPosition() { return reader.getPosition(); } + /** + * Gets the number of bytes read + * + * @return the number of bytes read + */ + long getBytesRead() { + return reader.getBytesRead(); + } + /** * Returns the current line number * diff --git a/src/test/java/org/apache/commons/csv/CSVParserTest.java b/src/test/java/org/apache/commons/csv/CSVParserTest.java index 44a9afd5fa..c42a3c25ab 100644 --- a/src/test/java/org/apache/commons/csv/CSVParserTest.java +++ b/src/test/java/org/apache/commons/csv/CSVParserTest.java @@ -703,6 +703,76 @@ public void testGetHeaderComment_NoComment3() throws IOException { } } + @Test + public void testGetRecordThreeBytesRead() throws Exception { + final String code = "id,date,val5,val4\n" + + "11111111111111,'4017-09-01',きちんと節分近くには咲いてる~,v4\n" + + "22222222222222,'4017-01-01',おはよう私の友人~,v4\n" + + "33333333333333,'4017-01-01',きる自然の力ってすごいな~,v4\n"; + final CSVFormat format = CSVFormat.Builder.create() + .setDelimiter(',') + .setQuote('\'') + .get(); + try (CSVParser parser = CSVParser.builder().setReader(new StringReader(code)).setFormat(format).setCharset(UTF_8).setEnableByteTracking(true).get() ) { + CSVRecord record = new CSVRecord(parser, null, null, 1L, 0L, 0L); + + assertEquals(0, parser.getRecordNumber()); + assertNotNull(record = parser.nextRecord()); + assertEquals(1, record.getRecordNumber()); + assertEquals(code.indexOf('i'), record.getCharacterPosition()); + assertEquals(record.getBytePosition(), record.getCharacterPosition()); + + assertNotNull(record = parser.nextRecord()); + assertEquals(2, record.getRecordNumber()); +
assertEquals(code.indexOf('1'), record.getCharacterPosition()); + assertEquals(record.getBytePosition(), record.getCharacterPosition()); + + assertNotNull(record = parser.nextRecord()); + assertEquals(3, record.getRecordNumber()); + assertEquals(code.indexOf('2'), record.getCharacterPosition()); + assertEquals(record.getBytePosition(), 95); + + assertNotNull(record = parser.nextRecord()); + assertEquals(4, record.getRecordNumber()); + assertEquals(code.indexOf('3'), record.getCharacterPosition()); + assertEquals(record.getBytePosition(), 154); + } + } + + @Test + public void testGetRecordFourBytesRead() throws Exception { + final String code = "id,a,b,c\n" + + "1,😊,🤔,😂\n" + + "2,😊,🤔,😂\n" + + "3,😊,🤔,😂\n"; + final CSVFormat format = CSVFormat.Builder.create() + .setDelimiter(',') + .setQuote('\'') + .get(); + try (CSVParser parser = CSVParser.builder().setReader(new StringReader(code)).setFormat(format).setCharset(UTF_8).setEnableByteTracking(true).get()) { + CSVRecord record = new CSVRecord(parser, null, null, 1L, 0L, 0L); + + assertEquals(0, parser.getRecordNumber()); + assertNotNull(record = parser.nextRecord()); + assertEquals(1, record.getRecordNumber()); + assertEquals(code.indexOf('i'), record.getCharacterPosition()); + assertEquals(record.getBytePosition(), record.getCharacterPosition()); + + assertNotNull(record = parser.nextRecord()); + assertEquals(2, record.getRecordNumber()); + assertEquals(code.indexOf('1'), record.getCharacterPosition()); + assertEquals(record.getBytePosition(), record.getCharacterPosition()); + assertNotNull(record = parser.nextRecord()); + assertEquals(3, record.getRecordNumber()); + assertEquals(code.indexOf('2'), record.getCharacterPosition()); + assertEquals(record.getBytePosition(), 26); + assertNotNull(record = parser.nextRecord()); + assertEquals(4, record.getRecordNumber()); + assertEquals(code.indexOf('3'), record.getCharacterPosition()); + assertEquals(record.getBytePosition(), 43); + } + } + @Test public void 
testGetHeaderMap() throws Exception { try (CSVParser parser = CSVParser.parse("a,b,c\n1,2,3\nx,y,z", CSVFormat.DEFAULT.withHeader("A", "B", "C"))) { diff --git a/src/test/java/org/apache/commons/csv/CSVRecordTest.java b/src/test/java/org/apache/commons/csv/CSVRecordTest.java index b9f9ceae19..cd644b1512 100644 --- a/src/test/java/org/apache/commons/csv/CSVRecordTest.java +++ b/src/test/java/org/apache/commons/csv/CSVRecordTest.java @@ -87,7 +87,7 @@ record = parser.iterator().next(); @Test public void testCSVRecordNULLValues() throws IOException { try (CSVParser parser = CSVParser.parse("A,B\r\nONE,TWO", CSVFormat.DEFAULT.withHeader())) { - final CSVRecord csvRecord = new CSVRecord(parser, null, null, 0L, 0L); + final CSVRecord csvRecord = new CSVRecord(parser, null, null, 0L, 0L, 0L); assertEquals(0, csvRecord.size()); assertThrows(IllegalArgumentException.class, () -> csvRecord.get("B")); } diff --git a/src/test/java/org/apache/commons/csv/JiraCsv196Test.java b/src/test/java/org/apache/commons/csv/JiraCsv196Test.java new file mode 100644 index 0000000000..ab7af819e7 --- /dev/null +++ b/src/test/java/org/apache/commons/csv/JiraCsv196Test.java @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.commons.csv; +import static org.junit.jupiter.api.Assertions.assertEquals; + + +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.Reader; +import java.nio.charset.StandardCharsets; + +import org.junit.jupiter.api.Test; + + +public class JiraCsv196Test { + @Test + public void parseThreeBytes() throws IOException { + final CSVFormat format = CSVFormat.Builder.create() + .setDelimiter(',') + .setQuote('\'') + .get(); + final CSVParser parser = new CSVParser.Builder() + .setFormat(format) + .setReader(getTestInput("org/apache/commons/csv/CSV-196/japanese.csv")) + .setCharset(StandardCharsets.UTF_8) + .setEnableByteTracking(true) + .get(); + final long[] charByteKey = {0, 89, 242, 395}; + int idx = 0; + for (CSVRecord record : parser) { + assertEquals(charByteKey[idx++], record.getBytePosition()); + } + parser.close(); + } + + + @Test + public void parseFourBytes() throws IOException { + final CSVFormat format = CSVFormat.Builder.create() + .setDelimiter(',') + .setQuote('\'') + .get(); + final CSVParser parser = new CSVParser.Builder() + .setFormat(format) + .setReader(getTestInput("org/apache/commons/csv/CSV-196/emoji.csv")) + .setCharset(StandardCharsets.UTF_8) + .setEnableByteTracking(true) + .get(); + final long[] charByteKey = {0, 84, 701, 1318, 1935}; + int idx = 0; + for (CSVRecord record : parser) { + assertEquals(charByteKey[idx++], record.getBytePosition()); + } + parser.close(); + } + + private Reader getTestInput(String path) { + return new InputStreamReader( + ClassLoader.getSystemClassLoader().getResourceAsStream(path)); + } +} diff --git a/src/test/resources/org/apache/commons/csv/CSV-196/emoji.csv b/src/test/resources/org/apache/commons/csv/CSV-196/emoji.csv new file mode 100644 index 0000000000..0bff7a44f3 --- /dev/null +++ b/src/test/resources/org/apache/commons/csv/CSV-196/emoji.csv @@ -0,0 +1,5 @@ +id,val1,val2,val3,val4,val5,val6,val7,val8,val9,val10,val11,val12,val13,val14,val15 
+1,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄 +2,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄 +3,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄 +4,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄 \ No newline at end of file diff --git a/src/test/resources/org/apache/commons/csv/CSV-196/japanese.csv b/src/test/resources/org/apache/commons/csv/CSV-196/japanese.csv new file mode 100644 index 0000000000..b06e04bd6a --- /dev/null +++ b/src/test/resources/org/apache/commons/csv/CSV-196/japanese.csv @@ -0,0 +1,4 @@ +id,date,val1,val2,val3,val4,val5,val6,val7,val8,val9,val10,val11,val12,val13,val14,val15 +00000000000001,2017-01-01,きちんと節分近くには咲いてる。自然の力ってすごいな~,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15 +00000000000002,2017-01-01,きちんと節分近くには咲いてる。自然の力ってすごいな~,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15 +00000000000003,2017-01-01,きちんと節分近くには咲いてる。自然の力ってすごいな~,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15 \ No newline at end of file