Skip to content

Commit 85d26fc

Browse files
authored
Merge pull request #618 from rootvector2/delimiter-double-consume
Evaluate isDelimiter once in nextToken whitespace skip
2 parents e729d17 + c9362f7 commit 85d26fc

3 files changed

Lines changed: 48 additions & 2 deletions

File tree

src/main/java/org/apache/commons/csv/Lexer.java

Lines changed: 9 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -277,15 +277,22 @@ Token nextToken(final Token token) throws IOException {
277277
}
278278
// Important: make sure a new char gets consumed in each iteration
279279
while (token.type == Token.Type.INVALID) {
280+
// isDelimiter consumes the trailing characters of a multi-character delimiter as a side effect, so it must
281+
// only be evaluated once per character. Remember a match found while skipping whitespace below.
282+
boolean delimiter = false;
280283
// ignore whitespaces at beginning of a token
281284
if (ignoreSurroundingSpaces) {
282-
while (Character.isWhitespace((char) c) && !isDelimiter(c) && !eol) {
285+
while (Character.isWhitespace((char) c) && !eol) {
286+
if (isDelimiter(c)) {
287+
delimiter = true;
288+
break;
289+
}
283290
c = reader.read();
284291
eol = readEndOfLine(c);
285292
}
286293
}
287294
// ok, start of token reached: encapsulated, or token
288-
if (isDelimiter(c)) {
295+
if (delimiter || isDelimiter(c)) {
289296
// empty token return TOKEN("")
290297
token.type = Token.Type.TOKEN;
291298
} else if (eol) {

src/test/java/org/apache/commons/csv/CSVParserTest.java

Lines changed: 20 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1758,6 +1758,26 @@ void testPartialMultiCharacterDelimiterAtEOFAfterMismatch() throws IOException {
17581758
}
17591759
}
17601760

1761+
/**
1762+
* With {@code ignoreSurroundingSpaces} enabled and a multi-character delimiter whose first character is whitespace,
1763+
* the empty field at the delimiter boundary must survive. The delimiter look-ahead is consumed while skipping
1764+
* leading whitespace, so re-evaluating it would drop the empty field and merge the following field's value.
1765+
*/
1766+
@Test
1767+
void testEmptyFieldBeforeWhitespacePrefixedMultiCharacterDelimiter() throws IOException {
1768+
final CSVFormat format = CSVFormat.DEFAULT.builder().setDelimiter(" |").setIgnoreSurroundingSpaces(true).get(); // two-character delimiter: a space followed by a pipe
1769+
try (CSVParser parser = CSVParser.parse(" |a", format)) { // case 1: the input begins with the delimiter, so the first field is empty
1770+
final List<CSVRecord> records = parser.getRecords();
1771+
assertEquals(1, records.size());
1772+
assertValuesEquals(new String[] { "", "a" }, records.get(0));
1773+
}
1774+
try (CSVParser parser = CSVParser.parse("a | |b", format)) { // case 2: two delimiters back to back, so the middle field is empty
1775+
final List<CSVRecord> records = parser.getRecords();
1776+
assertEquals(1, records.size());
1777+
assertValuesEquals(new String[] { "a", "", "b" }, records.get(0));
1778+
}
1779+
}
1780+
17611781
@Test
17621782
void testProvidedHeader() throws Exception {
17631783
final Reader in = new StringReader("a,b,c\n1,2,3\nx,y,z");

src/test/java/org/apache/commons/csv/LexerTest.java

Lines changed: 19 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -447,6 +447,25 @@ void testPartialMultiCharacterDelimiterAtEOFAfterMismatch() throws IOException {
447447
}
448448
}
449449

450+
/**
451+
* With {@code ignoreSurroundingSpaces} enabled and a multi-character delimiter whose first character is whitespace,
452+
* the side-effecting {@link Lexer#isDelimiter(int)} must only be evaluated once per character, otherwise the
453+
* delimiter is consumed in the whitespace-skip loop and the empty field at the boundary is dropped.
454+
*/
455+
@Test
456+
void testEmptyTokenBeforeWhitespacePrefixedMultiCharacterDelimiter() throws IOException {
457+
final CSVFormat format = CSVFormat.DEFAULT.builder().setDelimiter(" |").setIgnoreSurroundingSpaces(true).get(); // two-character delimiter: a space followed by a pipe
458+
try (Lexer lexer = createLexer(" |a", format)) { // case 1: the input begins with the delimiter, so an empty token comes first
459+
assertNextToken(TOKEN, "", lexer);
460+
assertNextToken(EOF, "a", lexer);
461+
}
462+
try (Lexer lexer = createLexer("a | |b", format)) { // case 2: two delimiters back to back, so an empty token sits between a and b
463+
assertNextToken(TOKEN, "a", lexer);
464+
assertNextToken(TOKEN, "", lexer);
465+
assertNextToken(EOF, "b", lexer);
466+
}
467+
}
468+
450469
@Test
451470
void testReadEscapeBackspace() throws IOException {
452471
try (Lexer lexer = createLexer("b", CSVFormat.DEFAULT.withEscape('\b'))) {

0 commit comments

Comments
 (0)