Skip to content

Commit 871f745

Browse files
authored
Merge pull request apache#613 from OldTruckDriver/fix/CSV-329_trackbytes_supplementary_delimiter
[CSV-329] Fix byte tracking for supplementary delimiters
2 parents d8e1242 + f25f3d5 commit 871f745

4 files changed

Lines changed: 53 additions & 7 deletions

File tree

src/changes/changes.xml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -53,6 +53,7 @@
5353
<action type="fix" dev="ggregory" due-to="Ruiqi Dong, Gary Gregory" issue="CSV-325">CSVParser applies characterOffset to bytePosition (#604).</action>
5454
<action type="fix" dev="ggregory" due-to="Ruiqi Dong, Gary Gregory" issue="CSV-326">CSVPrinter Reader printing with quote and escape can emit CSV that its parser cannot read back.</action>
5555
<action type="fix" dev="ggregory" due-to="Ruiqi Dong, Gary Gregory" issue="CSV-327">CSVParser applies maxRows to record numbers instead of rows produced when setRecordNumber(...) is used.</action>
56+
<action type="fix" dev="ggregory" due-to="Ruiqi Dong, Gary Gregory" issue="CSV-329">CSVParser with trackBytes enabled throws on multi-character delimiters containing supplementary Unicode characters.</action>
5657
<action type="fix" dev="ggregory" due-to="Ruiqi Dong, Gary Gregory" issue="CSV-328">CSVFormat.Builder.setNullString(String) can build an invalid quoted null string after setQuote(null).</action>
5758
<action type="fix" dev="ggregory" due-to="OldTruckDriver, Gary Gregory" issue="CSV-326">Escape Reader values with quote and escape (#606).</action>
5859
<action type="fix" dev="ggregory" due-to="Dexter.k, Gary Gregory">Clear escape delimiter buffer before peek in Lexer.isEscapeDelimiter() (#608, #611).</action>

src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java

Lines changed: 13 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -108,9 +108,11 @@ long getBytesRead() {
108108
}
109109

110110
private long getEncodedCharLength(final char[] buf, final int offset, final int length) throws CharacterCodingException {
111-
int len = 0;
112-
for (int i = offset; i < length; i++) {
113-
len += getEncodedCharLength(buf[i]);
111+
long len = 0;
112+
int previous = lastChar;
113+
for (int i = offset; i < offset + length; i++) {
114+
len += getEncodedCharLength(previous, buf[i]);
115+
previous = buf[i];
114116
}
115117
return len;
116118
}
@@ -141,8 +143,12 @@ private long getEncodedCharLength(final char[] buf, final int offset, final int
141143
* @throws CharacterCodingException if the character cannot be encoded.
142144
*/
143145
private int getEncodedCharLength(final int current) throws CharacterCodingException {
146+
return getEncodedCharLength(lastChar, current);
147+
}
148+
149+
private int getEncodedCharLength(final int previous, final int current) throws CharacterCodingException {
144150
final char cChar = (char) current;
145-
final char lChar = (char) lastChar;
151+
final char lChar = (char) previous;
146152
if (!Character.isSurrogate(cChar)) {
147153
return encoder.encode(CharBuffer.wrap(new char[] { cChar })).limit();
148154
}
@@ -218,6 +224,9 @@ public int read(final char[] buf, final int offset, final int length) throws IOE
218224
return 0;
219225
}
220226
final int len = super.read(buf, offset, length);
227+
if (encoder != null && len > 0) {
228+
this.bytesRead += getEncodedCharLength(buf, offset, len);
229+
}
221230
if (len > 0) {
222231
for (int i = offset; i < offset + len; i++) {
223232
final char ch = buf[i];
@@ -233,9 +242,6 @@ public int read(final char[] buf, final int offset, final int length) throws IOE
233242
} else if (len == EOF) {
234243
lastChar = EOF;
235244
}
236-
if (encoder != null) {
237-
this.bytesRead += getEncodedCharLength(buf, offset, len);
238-
}
239245
position += len;
240246
return len;
241247
}

src/test/java/org/apache/commons/csv/CSVParserTest.java

Lines changed: 25 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -666,6 +666,31 @@ void testGetBytePositionMultiCharacterDelimiter() throws IOException {
666666
}
667667
}
668668

669+
/**
670+
* Tests <a href="https://issues.apache.org/jira/browse/CSV-329">CSV-329</a>.
671+
*/
672+
@Test
673+
void testGetBytePositionMultiCharacterDelimiterWithSupplementaryCharacter() throws IOException {
674+
final String delimiter = "x😀";
675+
final String code = "ax😀b\ncx😀d\n";
676+
final CSVFormat format = CSVFormat.DEFAULT.builder().setDelimiter(delimiter).get();
677+
try (CSVParser parser = CSVParser.builder()
678+
.setReader(new StringReader(code))
679+
.setFormat(format)
680+
.setCharset(UTF_8)
681+
.setTrackBytes(true)
682+
.get()) {
683+
final CSVRecord first = parser.nextRecord();
684+
final CSVRecord second = parser.nextRecord();
685+
assertNotNull(first);
686+
assertNotNull(second);
687+
assertValuesEquals(new String[] { "a", "b" }, first);
688+
assertValuesEquals(new String[] { "c", "d" }, second);
689+
assertEquals(0, first.getBytePosition());
690+
assertEquals("ax😀b\n".getBytes(UTF_8).length, second.getBytePosition());
691+
}
692+
}
693+
669694
@Test
670695
void testGetBytePositionWithCharacterOffsetAndMultiBytePrefix() throws Exception {
671696
final String row0 = "é,x\n";

src/test/java/org/apache/commons/csv/ExtendedBufferedReaderTest.java

Lines changed: 14 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -26,6 +26,7 @@
2626
import static org.junit.jupiter.api.Assertions.assertNull;
2727

2828
import java.io.StringReader;
29+
import java.nio.charset.StandardCharsets;
2930

3031
import org.junit.jupiter.api.Test;
3132

@@ -104,6 +105,19 @@ void testReadingInDifferentBuffer() throws Exception {
104105
}
105106
}
106107

108+
@Test
109+
void testReadingSupplementaryCharacterTracksBytes() throws Exception {
110+
final String input = "😀";
111+
final char[] buffer = new char[input.length()];
112+
try (ExtendedBufferedReader reader = new ExtendedBufferedReader(new StringReader(input), StandardCharsets.UTF_8, true)) {
113+
assertEquals(input.length(), reader.read(buffer, 0, buffer.length));
114+
assertArrayEquals(input.toCharArray(), buffer);
115+
assertEquals(input.getBytes(StandardCharsets.UTF_8).length, reader.getBytesRead());
116+
assertEquals(input.length(), reader.getPosition());
117+
assertEquals(input.charAt(input.length() - 1), reader.getLastChar());
118+
}
119+
}
120+
107121
@Test
108122
void testReadLine() throws Exception {
109123
try (ExtendedBufferedReader br = createBufferedReader("")) {

0 commit comments

Comments
 (0)