Skip to content

Commit 513aac2

Browse files
committed
[CSV-325] CSVParser applies characterOffset to bytePosition, which
breaks getBytePosition() for multi-byte prefixes. Add CSVParser.Builder.setByteOffset(long).
1 parent 64ea660 commit 513aac2

2 files changed

Lines changed: 58 additions & 3 deletions

File tree

src/main/java/org/apache/commons/csv/CSVParser.java

Lines changed: 28 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,7 @@ public final class CSVParser implements Iterable<CSVRecord>, Closeable {
154154
public static class Builder extends AbstractStreamBuilder<CSVParser, Builder> {
155155

156156
private CSVFormat format;
157+
private long byteOffset = -1;
157158
private long characterOffset;
158159
private long recordNumber = 1;
159160
private boolean trackBytes;
@@ -171,10 +172,27 @@ public CSVParser get() throws IOException {
171172
}
172173

173174
/**
174-
* Sets the lexer offset when the parser does not start parsing at the beginning of the source.
175+
* Sets the lexer byte offset when the parser does not start parsing at the beginning of the source.
176+
* <p>
177+
* By default, the value is {@code -1}, which reuses the character offset for the byte offset.
178+
* </p>
175179
*
176-
* @param characterOffset the lexer offset.
180+
* @param byteOffset the lexer byte offset.
177181
* @return {@code this} instance.
182+
* @see #setCharacterOffset(long)
183+
* @since 1.15.0
184+
*/
185+
public Builder setByteOffset(final long byteOffset) {
186+
this.byteOffset = byteOffset;
187+
return asThis();
188+
}
189+
190+
/**
191+
* Sets the lexer character offset when the parser does not start parsing at the beginning of the source.
192+
*
193+
* @param characterOffset the lexer character offset.
194+
* @return {@code this} instance.
195+
* @see #setByteOffset(long)
178196
*/
179197
public Builder setCharacterOffset(final long characterOffset) {
180198
this.characterOffset = characterOffset;
@@ -465,6 +483,12 @@ public static CSVParser parse(final URL url, final Charset charset, final CSVFor
465483
*/
466484
private long recordNumber;
467485

486+
/**
487+
* Lexer byte offset when the parser does not start parsing at the beginning of the source. Usually used in combination
488+
* with {@link #recordNumber}.
489+
*/
490+
private final long byteOffset;
491+
468492
/**
469493
* Lexer offset when the parser does not start parsing at the beginning of the source. Usually used in combination
470494
* with {@link #recordNumber}.
@@ -485,6 +509,7 @@ private CSVParser(final Builder builder) throws IOException {
485509
this.lexer = new Lexer(format, new ExtendedBufferedReader(builder.getReader(), builder.getCharset(), builder.trackBytes));
486510
this.csvRecordIterator = new CSVRecordIterator();
487511
this.headers = createHeaders();
512+
this.byteOffset = builder.byteOffset != -1 ? builder.byteOffset : builder.characterOffset;
488513
this.characterOffset = builder.characterOffset;
489514
this.recordNumber = builder.recordNumber - 1;
490515
}
@@ -870,7 +895,7 @@ CSVRecord nextRecord() throws IOException {
870895
recordList.clear();
871896
StringBuilder sb = null;
872897
final long startCharPosition = lexer.getCharacterPosition() + characterOffset;
873-
final long startBytePosition = lexer.getBytesRead() + characterOffset;
898+
final long startBytePosition = lexer.getBytesRead() + byteOffset;
874899
do {
875900
reusableToken.reset();
876901
lexer.nextToken(reusableToken);

src/test/java/org/apache/commons/csv/CSVParserTest.java

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -666,6 +666,36 @@ void testGetBytePositionMultiCharacterDelimiter() throws IOException {
666666
}
667667
}
668668

669+
@Test
670+
void testGetBytePositionWithCharacterOffsetAndMultiBytePrefix() throws Exception {
671+
final String row0 = "é,x\n";
672+
final Charset charset = UTF_8;
673+
// row0 char count is 4
674+
assertEquals(4, row0.length());
675+
// row0 byte count is 5
676+
final int record1ByteOffset = row0.getBytes(charset).length;
677+
assertEquals(5, record1ByteOffset);
678+
final String row1 = "b,c\n";
679+
final String rows = row0 + row1;
680+
final long record1CharOffset = row0.length();
681+
final long expectedByteOffset = row0.getBytes(charset).length;
682+
try (CSVParser parser = CSVParser.builder()
683+
.setReader(new StringReader(row1))
684+
.setFormat(CSVFormat.DEFAULT)
685+
.setCharset(charset)
686+
.setTrackBytes(true)
687+
.setByteOffset(record1ByteOffset)
688+
.setCharacterOffset(record1CharOffset)
689+
.setRecordNumber(2) // not relevant but a better use case example.
690+
.get()) {
691+
final CSVRecord record = parser.nextRecord();
692+
assertNotNull(record);
693+
assertEquals(4, record.getCharacterPosition());
694+
assertEquals(record1CharOffset, record.getCharacterPosition());
695+
assertEquals(expectedByteOffset, record.getBytePosition());
696+
}
697+
}
698+
669699
@Test
670700
void testGetHeaderComment_HeaderComment1() throws IOException {
671701
try (CSVParser parser = CSVParser.parse(CSV_INPUT_HEADER_COMMENT, FORMAT_AUTO_HEADER)) {

0 commit comments

Comments
 (0)