Skip to content

Commit 1d89cd5

Browse files
[CSV-329] Fix byte tracking for supplementary delimiters
ExtendedBufferedReader.read(char[], int, int) updated lastChar before computing the encoded byte length, so a surrogate pair in the delimiter lookahead buffer was paired against the post-update lastChar and threw CharacterCodingException.

Count bytes before updating lastChar, and pair each char against the preceding char in the buffer (seeded from lastChar) so that surrogate pairs split across reads are still counted.

Add parser and ExtendedBufferedReader regression tests.

Reviewed-by: OpenAI Codex
Reviewed-by: Anthropic Claude Code
1 parent: ed8dbf2 · commit: 1d89cd5

4 files changed

Lines changed: 53 additions & 7 deletions

File tree

src/changes/changes.xml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -53,6 +53,7 @@
5353
<action type="fix" dev="ggregory" due-to="Ruiqi Dong, Gary Gregory" issue="CSV-325">CSVParser applies characterOffset to bytePosition (#604).</action>
5454
<action type="fix" dev="ggregory" due-to="Ruiqi Dong, Gary Gregory" issue="CSV-326">CSVPrinter Reader printing with quote and escape can emit CSV that its parser cannot read back.</action>
5555
<action type="fix" dev="ggregory" due-to="Ruiqi Dong, Gary Gregory" issue="CSV-327">CSVParser applies maxRows to record numbers instead of rows produced when setRecordNumber(...) is used.</action>
56+
<action type="fix" dev="ggregory" due-to="Ruiqi Dong, Gary Gregory" issue="CSV-329">CSVParser with trackBytes enabled throws on multi-character delimiters containing supplementary Unicode characters.</action>
5657
<action type="fix" dev="ggregory" due-to="OldTruckDriver, Gary Gregory" issue="CSV-326">Escape Reader values with quote and escape (#606).</action>
5758
<action type="fix" dev="ggregory" due-to="Dexter.k, Gary Gregory">Clear escape delimiter buffer before peek in Lexer.isEscapeDelimiter() (#608, #611).</action>
5859
<action type="fix" dev="ggregory" due-to="Dexter.k, Gary Gregory">Escape quote char in printWithEscapes when QuoteMode is NONE (#609).</action>

src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java

Lines changed: 13 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -108,9 +108,11 @@ long getBytesRead() {
108108
}
109109

110110
private long getEncodedCharLength(final char[] buf, final int offset, final int length) throws CharacterCodingException {
111-
int len = 0;
112-
for (int i = offset; i < length; i++) {
113-
len += getEncodedCharLength(buf[i]);
111+
long len = 0;
112+
int previous = lastChar;
113+
for (int i = offset; i < offset + length; i++) {
114+
len += getEncodedCharLength(previous, buf[i]);
115+
previous = buf[i];
114116
}
115117
return len;
116118
}
@@ -141,8 +143,12 @@ private long getEncodedCharLength(final char[] buf, final int offset, final int
141143
* @throws CharacterCodingException if the character cannot be encoded.
142144
*/
143145
private int getEncodedCharLength(final int current) throws CharacterCodingException {
146+
return getEncodedCharLength(lastChar, current);
147+
}
148+
149+
private int getEncodedCharLength(final int previous, final int current) throws CharacterCodingException {
144150
final char cChar = (char) current;
145-
final char lChar = (char) lastChar;
151+
final char lChar = (char) previous;
146152
if (!Character.isSurrogate(cChar)) {
147153
return encoder.encode(CharBuffer.wrap(new char[] { cChar })).limit();
148154
}
@@ -218,6 +224,9 @@ public int read(final char[] buf, final int offset, final int length) throws IOE
218224
return 0;
219225
}
220226
final int len = super.read(buf, offset, length);
227+
if (encoder != null && len > 0) {
228+
this.bytesRead += getEncodedCharLength(buf, offset, len);
229+
}
221230
if (len > 0) {
222231
for (int i = offset; i < offset + len; i++) {
223232
final char ch = buf[i];
@@ -233,9 +242,6 @@ public int read(final char[] buf, final int offset, final int length) throws IOE
233242
} else if (len == EOF) {
234243
lastChar = EOF;
235244
}
236-
if (encoder != null) {
237-
this.bytesRead += getEncodedCharLength(buf, offset, len);
238-
}
239245
position += len;
240246
return len;
241247
}

src/test/java/org/apache/commons/csv/CSVParserTest.java

Lines changed: 25 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -666,6 +666,31 @@ void testGetBytePositionMultiCharacterDelimiter() throws IOException {
666666
}
667667
}
668668

669+
/**
670+
* Tests <a href="https://issues.apache.org/jira/browse/CSV-329">CSV-329</a>.
671+
*/
672+
@Test
673+
void testGetBytePositionMultiCharacterDelimiterWithSupplementaryCharacter() throws IOException {
674+
final String delimiter = "x😀";
675+
final String code = "ax😀b\ncx😀d\n";
676+
final CSVFormat format = CSVFormat.DEFAULT.builder().setDelimiter(delimiter).get();
677+
try (CSVParser parser = CSVParser.builder()
678+
.setReader(new StringReader(code))
679+
.setFormat(format)
680+
.setCharset(UTF_8)
681+
.setTrackBytes(true)
682+
.get()) {
683+
final CSVRecord first = parser.nextRecord();
684+
final CSVRecord second = parser.nextRecord();
685+
assertNotNull(first);
686+
assertNotNull(second);
687+
assertValuesEquals(new String[] { "a", "b" }, first);
688+
assertValuesEquals(new String[] { "c", "d" }, second);
689+
assertEquals(0, first.getBytePosition());
690+
assertEquals("ax😀b\n".getBytes(UTF_8).length, second.getBytePosition());
691+
}
692+
}
693+
669694
@Test
670695
void testGetBytePositionWithCharacterOffsetAndMultiBytePrefix() throws Exception {
671696
final String row0 = "é,x\n";

src/test/java/org/apache/commons/csv/ExtendedBufferedReaderTest.java

Lines changed: 14 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -26,6 +26,7 @@
2626
import static org.junit.jupiter.api.Assertions.assertNull;
2727

2828
import java.io.StringReader;
29+
import java.nio.charset.StandardCharsets;
2930

3031
import org.junit.jupiter.api.Test;
3132

@@ -104,6 +105,19 @@ void testReadingInDifferentBuffer() throws Exception {
104105
}
105106
}
106107

108+
@Test
109+
void testReadingSupplementaryCharacterTracksBytes() throws Exception {
110+
final String input = "😀";
111+
final char[] buffer = new char[input.length()];
112+
try (ExtendedBufferedReader reader = new ExtendedBufferedReader(new StringReader(input), StandardCharsets.UTF_8, true)) {
113+
assertEquals(input.length(), reader.read(buffer, 0, buffer.length));
114+
assertArrayEquals(input.toCharArray(), buffer);
115+
assertEquals(input.getBytes(StandardCharsets.UTF_8).length, reader.getBytesRead());
116+
assertEquals(input.length(), reader.getPosition());
117+
assertEquals(input.charAt(input.length() - 1), reader.getLastChar());
118+
}
119+
}
120+
107121
@Test
108122
void testReadLine() throws Exception {
109123
try (ExtendedBufferedReader br = createBufferedReader("")) {

0 commit comments

Comments
 (0)