Skip to content

Commit bbdfad7

Browse files
committed
quote value ending with multi-character delimiter prefix
1 parent 4434d93 commit bbdfad7

2 files changed

Lines changed: 52 additions & 0 deletions

File tree

src/main/java/org/apache/commons/csv/CSVFormat.java

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2097,6 +2097,33 @@ private boolean isDelimiter(final char ch0, final CharSequence charSeq, final in
20972097
return true;
20982098
}
20992099

2100+
/**
2101+
* Tests whether appending the delimiter after {@code charSeq} would let the parser match the delimiter starting inside the value. This happens with a
2102+
* multi-character delimiter when the value ends with a straddling prefix of it (for delimiter {@code ||}, a value ending in {@code |} followed by the
2103+
* delimiter yields {@code |||}, which the greedy lexer splits one character early). Such a value must be encapsulated so the field boundary is unambiguous.
2104+
*/
2105+
private boolean endsWithDelimiterPrefix(final CharSequence charSeq, final char[] delimiter, final int delimiterLength) {
2106+
if (delimiterLength < 2) {
2107+
return false;
2108+
}
2109+
final int len = charSeq.length();
2110+
for (int start = Math.max(0, len - delimiterLength + 1); start < len; start++) {
2111+
boolean match = true;
2112+
for (int j = 0; j < delimiterLength; j++) {
2113+
final int idx = start + j;
2114+
final char c = idx < len ? charSeq.charAt(idx) : delimiter[idx - len];
2115+
if (c != delimiter[j]) {
2116+
match = false;
2117+
break;
2118+
}
2119+
}
2120+
if (match) {
2121+
return true;
2122+
}
2123+
}
2124+
return false;
2125+
}
2126+
21002127
/**
21012128
* Tests whether escapes are being processed.
21022129
*
@@ -2510,6 +2537,9 @@ private void printWithQuotes(final Object object, final CharSequence charSeq, fi
25102537
// encapsulate if we end in anything less than ' '
25112538
if (isTrimChar(c)) {
25122539
quote = true;
2540+
} else if (endsWithDelimiterPrefix(charSeq, delim, delimLength)) {
2541+
// A trailing partial multi-character delimiter would merge with the following delimiter on read.
2542+
quote = true;
25132543
}
25142544
}
25152545
}

src/test/java/org/apache/commons/csv/CSVPrinterTest.java

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1908,6 +1908,28 @@ void testQuoteCommentMarkerFirstChar() throws IOException {
19081908
}
19091909
}
19101910

1911+
@Test
1912+
void testQuoteValueEndingWithMultiCharacterDelimiterPrefix() throws IOException {
1913+
final CSVFormat format = CSVFormat.DEFAULT.builder().setDelimiter("||").get();
1914+
final StringWriter sw = new StringWriter();
1915+
try (CSVPrinter printer = new CSVPrinter(sw, format)) {
1916+
// "a|" ends with the delimiter's first char; unquoted output "a|||b" would split one char early on read.
1917+
printer.printRecord("a|", "b");
1918+
// "a|b" does not end with a delimiter prefix, so it stays unquoted.
1919+
printer.printRecord("a|b", "c");
1920+
}
1921+
final String string = sw.toString();
1922+
assertEquals("\"a|\"||b" + RECORD_SEPARATOR + "a|b||c" + RECORD_SEPARATOR, string);
1923+
try (CSVParser parser = CSVParser.parse(string, format)) {
1924+
final List<CSVRecord> records = parser.getRecords();
1925+
assertEquals(2, records.size());
1926+
assertEquals("a|", records.get(0).get(0));
1927+
assertEquals("b", records.get(0).get(1));
1928+
assertEquals("a|b", records.get(1).get(0));
1929+
assertEquals("c", records.get(1).get(1));
1930+
}
1931+
}
1932+
19111933
@Test
19121934
void testQuoteNonNumeric() throws IOException {
19131935
final StringWriter sw = new StringWriter();

0 commit comments

Comments
 (0)