Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -245,6 +245,8 @@
<exclude>src/test/resources/org/apache/commons/csv/CSV-141/csv-141.csv</exclude>
<exclude>src/test/resources/org/apache/commons/csv/csv-167/sample1.csv</exclude>
<exclude>src/test/resources/org/apache/commons/csv/CSV-198/optd_por_public.csv</exclude>
<exclude>src/test/resources/org/apache/commons/csv/CSV-196/emoji.csv</exclude>
<exclude>src/test/resources/org/apache/commons/csv/CSV-196/japanese.csv</exclude>
<exclude>src/test/resources/org/apache/commons/csv/CSV-213/999751170.patch.csv</exclude>
<exclude>src/test/resources/org/apache/commons/csv/CSVFileParser/bom.csv</exclude>
<exclude>src/test/resources/org/apache/commons/csv/CSVFileParser/test.csv</exclude>
Expand Down
52 changes: 49 additions & 3 deletions src/main/java/org/apache/commons/csv/CSVParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,7 @@ public static class Builder extends AbstractStreamBuilder<CSVParser, Builder> {
private CSVFormat format;
private long characterOffset;
private long recordNumber = 1;
private boolean enableByteTracking;

/**
* Constructs a new instance.
Expand All @@ -166,7 +167,7 @@ protected Builder() {
@SuppressWarnings("resource")
@Override
public CSVParser get() throws IOException {
return new CSVParser(getReader(), format != null ? format : CSVFormat.DEFAULT, characterOffset, recordNumber);
return new CSVParser(getReader(), format != null ? format : CSVFormat.DEFAULT, characterOffset, recordNumber, getCharset(), enableByteTracking);
}

/**
Expand Down Expand Up @@ -202,6 +203,18 @@ public Builder setRecordNumber(final long recordNumber) {
return asThis();
}

/**
* Sets whether to enable byte tracking for the parser.
*
* @param enableByteTracking {@code true} to enable byte tracking; {@code false} to disable it.
* @return this instance.
* @since 1.13.0
*/
Comment thread
garydgregory marked this conversation as resolved.
public Builder setEnableByteTracking(final boolean enableByteTracking) {
Comment thread
garydgregory marked this conversation as resolved.
this.enableByteTracking = enableByteTracking;
return asThis();
}

}

final class CSVRecordIterator implements Iterator<CSVRecord> {
Expand Down Expand Up @@ -510,11 +523,43 @@ public CSVParser(final Reader reader, final CSVFormat format) throws IOException
@Deprecated
@SuppressWarnings("resource")
public CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber)
throws IOException {
this(reader, format, characterOffset, recordNumber, null, false);
}

/**
* Constructs a new instance using the given {@link CSVFormat}
*
* <p>
* If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
* unless you close the {@code reader}.
* </p>
*
* @param reader
* a Reader containing CSV-formatted input. Must not be null.
* @param format
* the CSVFormat used for CSV parsing. Must not be null.
* @param characterOffset
* Lexer offset when the parser does not start parsing at the beginning of the source.
* @param recordNumber
* The next record number to assign.
* @param charset
* The character encoding to be used for the reader when enableByteTracking is true.
* @param enableByteTracking
* {@code true} to enable byte tracking for the parser; {@code false} to disable it.
* @throws IllegalArgumentException
* If the parameters of the format are inconsistent or if either the reader or format is null.
* @throws IOException
* If there is a problem reading the header or skipping the first record.
* @throws CSVException Thrown on invalid input.
*/
private CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber,
final Charset charset, final boolean enableByteTracking)
Comment thread
garydgregory marked this conversation as resolved.
throws IOException {
Objects.requireNonNull(reader, "reader");
Objects.requireNonNull(format, "format");
this.format = format.copy();
this.lexer = new Lexer(format, new ExtendedBufferedReader(reader));
this.lexer = new Lexer(format, new ExtendedBufferedReader(reader, charset, enableByteTracking));
this.csvRecordIterator = new CSVRecordIterator();
this.headers = createHeaders();
this.characterOffset = characterOffset;
Expand Down Expand Up @@ -841,6 +886,7 @@ CSVRecord nextRecord() throws IOException {
recordList.clear();
StringBuilder sb = null;
final long startCharPosition = lexer.getCharacterPosition() + characterOffset;
final long startBytePosition = lexer.getBytesRead() + this.characterOffset;
do {
reusableToken.reset();
lexer.nextToken(reusableToken);
Expand Down Expand Up @@ -878,7 +924,7 @@ CSVRecord nextRecord() throws IOException {
recordNumber++;
final String comment = Objects.toString(sb, null);
result = new CSVRecord(this, recordList.toArray(Constants.EMPTY_STRING_ARRAY), comment,
recordNumber, startCharPosition);
recordNumber, startCharPosition, startBytePosition);
}
return result;
}
Expand Down
21 changes: 18 additions & 3 deletions src/main/java/org/apache/commons/csv/CSVRecord.java
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,11 @@ public final class CSVRecord implements Serializable, Iterable<String> {
*/
private final long characterPosition;

/**
* The starting position of this record in the source stream, measured in bytes.
*/
private final long bytePosition;

/** The accumulated comments (if any) */
private final String comment;

Expand All @@ -62,15 +67,15 @@ public final class CSVRecord implements Serializable, Iterable<String> {
/** The parser that originates this record. This is not serialized. */
private final transient CSVParser parser;

CSVRecord(final CSVParser parser, final String[] values, final String comment, final long recordNumber,
final long characterPosition) {
CSVRecord(final CSVParser parser, final String[] values, final String comment, final long recordNumber,
Comment thread
garydgregory marked this conversation as resolved.
final long characterPosition, final long bytePosition) {
this.recordNumber = recordNumber;
this.values = values != null ? values : Constants.EMPTY_STRING_ARRAY;
this.parser = parser;
this.comment = comment;
this.characterPosition = characterPosition;
this.bytePosition = bytePosition;
}

/**
* Returns a value by {@link Enum}.
*
Expand Down Expand Up @@ -146,6 +151,16 @@ public long getCharacterPosition() {
return characterPosition;
}

/**
* Returns the starting position of this record in the source stream, measured in bytes.
*
* @return the byte position of this record in the source stream.
* @since 1.13.0
*/
public long getBytePosition() {
return bytePosition;
}

/**
* Returns the comment for this record, if any.
* Note that comments are attached to the following record.
Expand Down
86 changes: 86 additions & 0 deletions src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,10 @@

import java.io.IOException;
import java.io.Reader;
import java.nio.CharBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetEncoder;

import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.UnsynchronizedBufferedReader;
Expand All @@ -51,13 +55,36 @@ final class ExtendedBufferedReader extends UnsynchronizedBufferedReader {
private long position;
private long positionMark;

/** The number of bytes read so far. */
private long bytesRead;
private long bytesReadMark;

/** Encoder for calculating the number of bytes for each character read. */
private CharsetEncoder encoder;

/**
* Constructs a new instance using the default buffer size.
*/
ExtendedBufferedReader(final Reader reader) {
super(reader);
}

/**
* Constructs a new instance with the specified reader, character set,
* and byte tracking option. Initializes an encoder if byte tracking is enabled
* and a character set is provided.
*
* @param reader the reader supports a look-ahead option.
* @param charset the character set for encoding, or {@code null} if not applicable.
* @param enableByteTracking {@code true} to enable byte tracking; {@code false} to disable it.
*/
ExtendedBufferedReader(final Reader reader, Charset charset, boolean enableByteTracking) {
Comment thread
garydgregory marked this conversation as resolved.
super(reader);
if (charset != null && enableByteTracking) {
encoder = charset.newEncoder();
}
}

/**
* Closes the stream.
*
Expand Down Expand Up @@ -110,6 +137,7 @@ public void mark(final int readAheadLimit) throws IOException {
lineNumberMark = lineNumber;
lastCharMark = lastChar;
positionMark = position;
bytesReadMark = bytesRead;
super.mark(readAheadLimit);
}

Expand All @@ -120,11 +148,59 @@ public int read() throws IOException {
current == EOF && lastChar != CR && lastChar != LF && lastChar != EOF) {
lineNumber++;
}
if (encoder != null) {
Comment thread
garydgregory marked this conversation as resolved.
this.bytesRead += getEncodedCharLength(current);
}
lastChar = current;
position++;
return lastChar;
}

/**
* Gets the byte length of the given character based on the the original Unicode
* specification, which defined characters as fixed-width 16-bit entities.
* <p>
* The Unicode characters are divided into two main ranges:
* <ul>
* <li><b>U+0000 to U+FFFF (Basic Multilingual Plane, BMP):</b>
* <ul>
* <li>Represented using a single 16-bit {@code char}.</li>
* <li>Includes UTF-8 encodings of 1-byte, 2-byte, and some 3-byte characters.</li>
* </ul>
* </li>
* <li><b>U+10000 to U+10FFFF (Supplementary Characters):</b>
* <ul>
* <li>Represented as a pair of {@code char}s:</li>
* <li>The first {@code char} is from the high-surrogates range (\uD800-\uDBFF).</li>
* <li>The second {@code char} is from the low-surrogates range (\uDC00-\uDFFF).</li>
* <li>Includes UTF-8 encodings of some 3-byte characters and all 4-byte characters.</li>
* </ul>
* </li>
* </ul>
*
* @param current the current character to process.
* @return the byte length of the character.
* @throws CharacterCodingException if the character cannot be encoded.
*/
private int getEncodedCharLength(int current) throws CharacterCodingException {
final char cChar = (char) current;
final char lChar = (char) lastChar;
if (!Character.isSurrogate(cChar)) {
return encoder.encode(
CharBuffer.wrap(new char[] {cChar})).limit();
} else {
if (Character.isHighSurrogate(cChar)) {
// Move on to the next char (low surrogate)
return 0;
} else if (Character.isSurrogatePair(lChar, cChar)) {
return encoder.encode(
CharBuffer.wrap(new char[] {lChar, cChar})).limit();
} else {
throw new CharacterCodingException();
}
}
}

@Override
public int read(final char[] buf, final int offset, final int length) throws IOException {
if (length == 0) {
Expand Down Expand Up @@ -189,7 +265,17 @@ public void reset() throws IOException {
lineNumber = lineNumberMark;
lastChar = lastCharMark;
position = positionMark;
bytesRead = bytesReadMark;
super.reset();
}

/**
* Gets the number of bytes read by the reader.
*
* @return the number of bytes read by the read
*/
long getBytesRead() {
return this.bytesRead;
}

}
9 changes: 9 additions & 0 deletions src/main/java/org/apache/commons/csv/Lexer.java
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,15 @@ long getCharacterPosition() {
return reader.getPosition();
}

/**
* Gets the number of bytes read
*
* @return the number of bytes read
*/
long getBytesRead() {
return reader.getBytesRead();
}

/**
* Returns the current line number
*
Expand Down
70 changes: 70 additions & 0 deletions src/test/java/org/apache/commons/csv/CSVParserTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -703,6 +703,76 @@ public void testGetHeaderComment_NoComment3() throws IOException {
}
}

@Test
public void testGetRecordThreeBytesRead() throws Exception {
Comment thread
garydgregory marked this conversation as resolved.
final String code = "id,date,val5,val4\n" +
"11111111111111,'4017-09-01',きちんと節分近くには咲いてる~,v4\n" +
"22222222222222,'4017-01-01',おはよう私の友人~,v4\n" +
"33333333333333,'4017-01-01',きる自然の力ってすごいな~,v4\n";
final CSVFormat format = CSVFormat.Builder.create()
.setDelimiter(',')
.setQuote('\'')
.get();
try (CSVParser parser = CSVParser.builder().setReader(new StringReader(code)).setFormat(format).setCharset(UTF_8).setEnableByteTracking(true).get() ) {
CSVRecord record = new CSVRecord(parser, null, null, 1L, 0L, 0L);
Comment thread
garydgregory marked this conversation as resolved.

assertEquals(0, parser.getRecordNumber());
assertNotNull(record = parser.nextRecord());
assertEquals(1, record.getRecordNumber());
assertEquals(code.indexOf('i'), record.getCharacterPosition());
assertEquals(record.getBytePosition(), record.getCharacterPosition());

assertNotNull(record = parser.nextRecord());
assertEquals(2, record.getRecordNumber());
assertEquals(code.indexOf('1'), record.getCharacterPosition());
assertEquals(record.getBytePosition(), record.getCharacterPosition());

assertNotNull(record = parser.nextRecord());
assertEquals(3, record.getRecordNumber());
assertEquals(code.indexOf('2'), record.getCharacterPosition());
assertEquals(record.getBytePosition(), 95);

assertNotNull(record = parser.nextRecord());
assertEquals(4, record.getRecordNumber());
assertEquals(code.indexOf('3'), record.getCharacterPosition());
assertEquals(record.getBytePosition(), 154);
}
}

@Test
public void testGetRecordFourBytesRead() throws Exception {
final String code = "id,a,b,c\n" +
"1,😊,🤔,😂\n" +
"2,😊,🤔,😂\n" +
"3,😊,🤔,😂\n";
final CSVFormat format = CSVFormat.Builder.create()
.setDelimiter(',')
.setQuote('\'')
.get();
try (CSVParser parser = CSVParser.builder().setReader(new StringReader(code)).setFormat(format).setCharset(UTF_8).setEnableByteTracking(true).get()) {
CSVRecord record = new CSVRecord(parser, null, null, 1L, 0L, 0L);
Comment thread
garydgregory marked this conversation as resolved.

assertEquals(0, parser.getRecordNumber());
assertNotNull(record = parser.nextRecord());
assertEquals(1, record.getRecordNumber());
assertEquals(code.indexOf('i'), record.getCharacterPosition());
assertEquals(record.getBytePosition(), record.getCharacterPosition());

assertNotNull(record = parser.nextRecord());
assertEquals(2, record.getRecordNumber());
assertEquals(code.indexOf('1'), record.getCharacterPosition());
assertEquals(record.getBytePosition(), record.getCharacterPosition());
assertNotNull(record = parser.nextRecord());
assertEquals(3, record.getRecordNumber());
assertEquals(code.indexOf('2'), record.getCharacterPosition());
assertEquals(record.getBytePosition(), 26);
assertNotNull(record = parser.nextRecord());
assertEquals(4, record.getRecordNumber());
assertEquals(code.indexOf('3'), record.getCharacterPosition());
assertEquals(record.getBytePosition(), 43);
}
}

@Test
public void testGetHeaderMap() throws Exception {
try (CSVParser parser = CSVParser.parse("a,b,c\n1,2,3\nx,y,z", CSVFormat.DEFAULT.withHeader("A", "B", "C"))) {
Expand Down
2 changes: 1 addition & 1 deletion src/test/java/org/apache/commons/csv/CSVRecordTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ record = parser.iterator().next();
@Test
public void testCSVRecordNULLValues() throws IOException {
try (CSVParser parser = CSVParser.parse("A,B\r\nONE,TWO", CSVFormat.DEFAULT.withHeader())) {
final CSVRecord csvRecord = new CSVRecord(parser, null, null, 0L, 0L);
final CSVRecord csvRecord = new CSVRecord(parser, null, null, 0L, 0L, 0L);
assertEquals(0, csvRecord.size());
assertThrows(IllegalArgumentException.class, () -> csvRecord.get("B"));
}
Expand Down
Loading