Skip to content

Commit c0e8db6

Browse files
author
digi-scrypt
committed
fix surrogate pair byte counting in array read
1 parent 8192d9d commit c0e8db6

2 files changed

Lines changed: 31 additions & 8 deletions

File tree

src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java

Lines changed: 10 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -109,8 +109,10 @@ long getBytesRead() {
109109

110110
private long getEncodedCharLength(final char[] buf, final int offset, final int length) throws CharacterCodingException {
111111
int len = 0;
112-
for (int i = offset; i < length; i++) {
113-
len += getEncodedCharLength(buf[i]);
112+
int previous = lastChar;
113+
for (int i = offset; i < offset + length; i++) {
114+
len += getEncodedCharLength(buf[i], previous);
115+
previous = buf[i];
114116
}
115117
return len;
116118
}
@@ -140,9 +142,9 @@ private long getEncodedCharLength(final char[] buf, final int offset, final int
140142
* @return the byte length of the character.
141143
* @throws CharacterCodingException if the character cannot be encoded.
142144
*/
143-
private int getEncodedCharLength(final int current) throws CharacterCodingException {
145+
private int getEncodedCharLength(final int current, final int previous) throws CharacterCodingException {
144146
final char cChar = (char) current;
145-
final char lChar = (char) lastChar;
147+
final char lChar = (char) previous;
146148
if (!Character.isSurrogate(cChar)) {
147149
return encoder.encode(CharBuffer.wrap(new char[] { cChar })).limit();
148150
}
@@ -205,7 +207,7 @@ public int read() throws IOException {
205207
lineNumber++;
206208
}
207209
if (encoder != null) {
208-
this.bytesRead += getEncodedCharLength(current);
210+
this.bytesRead += getEncodedCharLength(current, lastChar);
209211
}
210212
lastChar = current;
211213
position++;
@@ -229,13 +231,13 @@ public int read(final char[] buf, final int offset, final int length) throws IOE
229231
lineNumber++;
230232
}
231233
}
234+
if (encoder != null) {
235+
this.bytesRead += getEncodedCharLength(buf, offset, len);
236+
}
232237
lastChar = buf[offset + len - 1];
233238
} else if (len == EOF) {
234239
lastChar = EOF;
235240
}
236-
if (encoder != null) {
237-
this.bytesRead += getEncodedCharLength(buf, offset, len);
238-
}
239241
position += len;
240242
return len;
241243
}

src/test/java/org/apache/commons/csv/CSVParserTest.java

Lines changed: 21 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -666,6 +666,27 @@ void testGetBytePositionMultiCharacterDelimiter() throws IOException {
666666
}
667667
}
668668

669+
@Test
670+
void testGetBytePositionMultiCharacterDelimiterWithSupplementaryChar() throws IOException {
671+
// Delimiter holds a 4-byte (surrogate pair) character; the delimiter tail is consumed through
672+
// the char[] read path, where the surrogate halves must be paired with the correct neighbor.
673+
final String code = "aa[😀]bb\ncc[😀]dd\n";
674+
final CSVFormat format = CSVFormat.DEFAULT.builder().setDelimiter("[😀]").get();
675+
try (CSVParser parser = CSVParser.builder()
676+
.setReader(new StringReader(code))
677+
.setFormat(format)
678+
.setCharset(StandardCharsets.UTF_8)
679+
.setTrackBytes(true)
680+
.get()) {
681+
final Iterator<CSVRecord> it = parser.iterator();
682+
final CSVRecord first = it.next();
683+
final CSVRecord second = it.next();
684+
assertEquals(0, first.getBytePosition());
685+
// "aa[😀]bb\n" -> 2 + 1 + 4 + 1 + 2 + 1 = 11 bytes in UTF-8
686+
assertEquals(11, second.getBytePosition());
687+
}
688+
}
689+
669690
@Test
670691
void testGetBytePositionWithCharacterOffsetAndMultiBytePrefix() throws Exception {
671692
final String row0 = "é,x\n";

0 commit comments

Comments
 (0)