diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml
index 57f55cae0..cca38e512 100644
--- a/.github/workflows/codeql-analysis.yml
+++ b/.github/workflows/codeql-analysis.yml
@@ -46,10 +46,10 @@ jobs:
steps:
- name: Checkout repository
- uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3
+ uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
with:
persist-credentials: false
- - uses: actions/cache@27d5ce7f107fe9357f9df03efb73ab90386fccae #v5.0.5
+ - uses: actions/cache@55cc8345863c7cc4c66a329aec7e433d2d1c52a9 # v6.1.0
with:
path: ~/.m2/repository
key: ${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }}
diff --git a/.github/workflows/dependency-review.yml b/.github/workflows/dependency-review.yml
index 114f3d8a2..7bc02bdd2 100644
--- a/.github/workflows/dependency-review.yml
+++ b/.github/workflows/dependency-review.yml
@@ -26,6 +26,6 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: 'Checkout Repository'
- uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3
+ uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
- name: 'Dependency Review PR'
uses: actions/dependency-review-action@a1d282b36b6f3519aa1f3fc636f609c47dddb294 # v5.0.0
diff --git a/.github/workflows/maven.yml b/.github/workflows/maven.yml
index 3ee3dec2b..17ba7dd38 100644
--- a/.github/workflows/maven.yml
+++ b/.github/workflows/maven.yml
@@ -43,17 +43,17 @@ jobs:
experimental: true
steps:
- - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3
+ - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
with:
persist-credentials: false
- - uses: actions/cache@27d5ce7f107fe9357f9df03efb73ab90386fccae #v5.0.5
+ - uses: actions/cache@55cc8345863c7cc4c66a329aec7e433d2d1c52a9 # v6.1.0
with:
path: ~/.m2/repository
key: ${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }}
restore-keys: |
${{ runner.os }}-maven-
- name: Set up JDK ${{ matrix.java }}
- uses: actions/setup-java@be666c2fcd27ec809703dec50e508c2fdc7f6654 # v5.2.0
+ uses: actions/setup-java@1bcf9fb12cf4aa7d266a90ae39939e61372fe520 # v5.4.0
with:
distribution: ${{ runner.os == 'macOS' && matrix.java == '8' && 'zulu' || 'temurin' }}
java-version: ${{ matrix.java }}
diff --git a/.github/workflows/scorecards-analysis.yml b/.github/workflows/scorecards-analysis.yml
index bf246c140..e1868cb46 100644
--- a/.github/workflows/scorecards-analysis.yml
+++ b/.github/workflows/scorecards-analysis.yml
@@ -40,7 +40,7 @@ jobs:
steps:
- name: "Checkout code"
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # 6.0.2
+ uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # 7.0.0
with:
persist-credentials: false
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index eb15f2518..3423e18ad 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -48,13 +48,13 @@ Getting Started
---------------
+ Make sure you have a [JIRA account](https://issues.apache.org/jira/).
-+ Make sure you have a [GitHub account](https://github.com/signup/free). This is not essential, but makes providing patches much easier.
++ Make sure you have a [GitHub account](https://github.com/signup). This is not essential, but makes providing patches much easier.
+ If you're planning to implement a new feature it makes sense to discuss your changes on the [dev list](https://commons.apache.org/mail-lists.html) first. This way you can make sure you're not wasting your time on something that isn't considered to be in Apache Commons CSV's scope.
+ Submit a [Jira Ticket][jira] for your issue, assuming one does not already exist.
+ Clearly describe the issue including steps to reproduce when it is a bug.
+ Make sure you fill in the earliest version that you know has the issue.
+ Find the corresponding [repository on GitHub](https://github.com/apache/?query=commons-),
-[fork](https://help.github.com/articles/fork-a-repo/) and check out your forked repository. If you don't have a GitHub account, you can still clone the Commons repository.
+[fork](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/fork-a-repo) and check out your forked repository. If you don't have a GitHub account, you can still clone the Commons repository.
Making Changes
--------------
@@ -108,8 +108,8 @@ Additional Resources
+ [Contributing patches](https://commons.apache.org/patches.html)
+ [Apache Commons CSV JIRA project page][jira]
+ [Contributor License Agreement][cla]
-+ [General GitHub documentation](https://help.github.com/)
-+ [GitHub pull request documentation](https://help.github.com/articles/creating-a-pull-request/)
++ [General GitHub documentation](https://docs.github.com/)
++ [GitHub pull request documentation](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-a-pull-request)
+ [Apache Commons Twitter Account](https://twitter.com/ApacheCommons)
[cla]:https://www.apache.org/licenses/#clas
diff --git a/pom.xml b/pom.xml
index 57bca27b2..8cb13ed7c 100644
--- a/pom.xml
+++ b/pom.xml
@@ -20,7 +20,7 @@
org.apache.commons
commons-parent
- 101
+ 102
commons-csv
1.15.0-SNAPSHOT
diff --git a/src/changes/changes.xml b/src/changes/changes.xml
index 633de96bd..93952e9f1 100644
--- a/src/changes/changes.xml
+++ b/src/changes/changes.xml
@@ -51,11 +51,23 @@
CSVFormat.Builder.setQuote() does not refresh quotedNullString (#2447).
Lexer.isDelimiter() accepts a partial multi-character delimiter at EOF (#603).
CSVParser applies characterOffset to bytePosition (#604).
+ CSVPrinter Reader printing with quote and escape can emit CSV that its parser cannot read back.
+ CSVParser applies maxRows to record numbers instead of rows produced when setRecordNumber(...) is used.
+ CSVParser with trackBytes enabled throws on multi-character delimiters containing supplementary Unicode characters.
+ CSVFormat.Builder.setNullString(String) can build an invalid quoted null string after setQuote(null).
+ Escape Reader values with quote and escape (#606).
+ Clear escape delimiter buffer before peek in Lexer.isEscapeDelimiter() (#608, #611).
+ Escape quote char in printWithEscapes when QuoteMode is NONE (#609).
+ Quote value starting with comment marker in minimal quote mode (#610).
+ Escape leading comment marker in printWithEscapes (#614).
+ Skip byte counting at EOF in ExtendedBufferedReader.read (#615).
+ Keep quoted empty trailing field with trailingDelimiter (#616).
+ Evaluate isDelimiter once in nextToken whitespace skip (#618).
Add an "Android Compatibility" section to the web site.
Add CSVParser.Builder.setByteOffset(long) (#604).
- Bump org.apache.commons:commons-parent from 85 to 101 #573, #595.
+ Bump org.apache.commons:commons-parent from 85 to 102 #573, #595.
[test] Bump com.opencsv:opencsv from 5.11.2 to 5.12.0 #558.
Bump org.apache.commons:commons-lang3 from 3.18.0 to 3.20.0.
Bump commons-codec:commons-codec from 1.19.0 to 1.22.0.
diff --git a/src/main/java/org/apache/commons/csv/CSVFormat.java b/src/main/java/org/apache/commons/csv/CSVFormat.java
index f6b2c5ae0..7145d23d3 100644
--- a/src/main/java/org/apache/commons/csv/CSVFormat.java
+++ b/src/main/java/org/apache/commons/csv/CSVFormat.java
@@ -780,8 +780,7 @@ public Builder setMaxRows(final long maxRows) {
*/
public Builder setNullString(final String nullString) {
this.nullString = nullString;
- this.quotedNullString = quoteCharacter + nullString + quoteCharacter;
- return this;
+ return setQuotedNullString();
}
/**
@@ -806,6 +805,10 @@ public Builder setQuote(final Character quoteCharacter) {
throw new IllegalArgumentException("The quoteCharacter cannot be a line break");
}
this.quoteCharacter = quoteCharacter;
+ return setQuotedNullString();
+ }
+
+ private Builder setQuotedNullString() {
final Character quote = quoteCharacter != null ? quoteCharacter : Constants.DOUBLE_QUOTE_CHAR;
this.quotedNullString = quote + nullString + quote;
return this;
@@ -880,6 +883,16 @@ public Builder setTrailingData(final boolean trailingData) {
/**
* Sets whether to add a trailing delimiter.
*
+ *
+ * When writing, a delimiter is appended after the last value of each record. When reading, the empty field
+ * that such a trailing delimiter produces is dropped so the output round-trips back to the original record;
+ * a quoted empty trailing field ({@code ""}) is a real value rather than a trailing delimiter and is kept.
+ *
+ *
+ * This is unrelated to {@link #setTrailingData(boolean) trailing data}, which controls whether characters
+ * after the closing quote of an encapsulated value are tolerated when reading.
+ *
+ *
* @param trailingDelimiter whether to add a trailing delimiter.
* @return This instance.
*/
@@ -2009,6 +2022,16 @@ public boolean getTrailingData() {
/**
* Gets whether to add a trailing delimiter.
*
+ *
+ * When writing, a delimiter is appended after the last value of each record. When reading, the empty field
+ * that such a trailing delimiter produces is dropped so the output round-trips back to the original record;
+ * a quoted empty trailing field ({@code ""}) is a real value rather than a trailing delimiter and is kept.
+ *
+ *
+ * This is unrelated to {@link #getTrailingData() trailing data}, which controls whether characters after the
+ * closing quote of an encapsulated value are tolerated when reading.
+ *
+ *
* @return whether to add a trailing delimiter.
* @since 1.3
*/
@@ -2324,12 +2347,18 @@ private void printWithEscapes(final CharSequence charSeq, final Appendable appen
final char[] delimArray = getDelimiterCharArray();
final int delimLength = delimArray.length;
final char escape = getEscapeChar();
+ final boolean quoteSet = isQuoteCharacterSet();
+ final char quote = quoteSet ? getQuoteCharacter().charValue() : 0;
+ final boolean commentMarkerSet = isCommentMarkerSet();
+ final char commentChar = commentMarkerSet ? commentMarker.charValue() : 0; // Explicit unboxing is intentional
while (pos < end) {
char c = charSeq.charAt(pos);
final boolean isDelimiterStart = isDelimiter(c, charSeq, pos, delimArray, delimLength);
final boolean isCr = c == Constants.CR;
final boolean isLf = c == Constants.LF;
- if (isCr || isLf || c == escape || isDelimiterStart) {
+ // A leading comment marker would be read back as a comment, so escape it.
+ final boolean isComment = commentMarkerSet && pos == 0 && c == commentChar;
+ if (isCr || isLf || c == escape || quoteSet && c == quote || isDelimiterStart || isComment) {
// write out segment up until this char
if (pos > start) {
appendable.append(charSeq, start, pos);
@@ -2368,8 +2397,13 @@ private void printWithEscapes(final Reader reader, final Appendable appendable)
final char[] delimArray = getDelimiterCharArray();
final int delimLength = delimArray.length;
final char escape = getEscapeChar();
+ final boolean quoteSet = isQuoteCharacterSet();
+ final char quote = quoteSet ? getQuoteCharacter().charValue() : 0;
+ final boolean commentMarkerSet = isCommentMarkerSet();
+ final char commentChar = commentMarkerSet ? commentMarker.charValue() : 0; // Explicit unboxing is intentional
final StringBuilder builder = new StringBuilder(IOUtils.DEFAULT_BUFFER_SIZE);
int c;
+ boolean firstChar = true;
final char[] lookAheadBuffer = new char[delimLength - 1];
while (EOF != (c = bufferedReader.read())) {
builder.append((char) c);
@@ -2379,7 +2413,10 @@ private void printWithEscapes(final Reader reader, final Appendable appendable)
final boolean isDelimiterStart = isDelimiter((char) c, test, pos, delimArray, delimLength);
final boolean isCr = c == Constants.CR;
final boolean isLf = c == Constants.LF;
- if (isCr || isLf || c == escape || isDelimiterStart) {
+ // A leading comment marker would be read back as a comment, so escape it.
+ final boolean isComment = commentMarkerSet && firstChar && c == commentChar;
+ firstChar = false;
+ if (isCr || isLf || c == escape || quoteSet && c == quote || isDelimiterStart || isComment) {
// write out segment up until this char
if (pos > start) {
append(builder.substring(start, pos), appendable);
@@ -2450,10 +2487,11 @@ private void printWithQuotes(final Object object, final CharSequence charSeq, fi
}
} else {
char c = charSeq.charAt(pos);
- if (c <= Constants.COMMENT) {
+ if (c <= Constants.COMMENT || isCommentMarkerSet() && c == commentMarker.charValue()) {
// Some other chars at the start of a value caused the parser to fail, so for now
// encapsulate if we start in anything less than '#'. We are being conservative
- // by including the default comment char too.
+ // by including the default comment char and any configured comment marker too,
+ // which the parser would otherwise read back as a comment line.
quote = true;
} else {
while (pos < len) {
@@ -2522,14 +2560,15 @@ private void printWithQuotes(final Reader reader, final Appendable appendable) t
return;
}
final char quote = getQuoteCharacter().charValue(); // Explicit unboxing is intentional
+ final char escape = isEscapeCharacterSet() ? getEscapeChar() : quote;
// (1) Append opening quote
append(quote, appendable);
- // (2) Append Reader contents, doubling quotes
+ // (2) Append Reader contents, doubling quotes and escape characters
int c;
while (EOF != (c = reader.read())) {
append((char) c, appendable);
- if (c == quote) {
- append(quote, appendable);
+ if (c == quote || c == escape) {
+ append((char) c, appendable);
}
}
// (3) Append closing quote
diff --git a/src/main/java/org/apache/commons/csv/CSVParser.java b/src/main/java/org/apache/commons/csv/CSVParser.java
index c9b2dc44f..141eba732 100644
--- a/src/main/java/org/apache/commons/csv/CSVParser.java
+++ b/src/main/java/org/apache/commons/csv/CSVParser.java
@@ -237,6 +237,7 @@ public Builder setTrackBytes(final boolean trackBytes) {
final class CSVRecordIterator implements Iterator {
private CSVRecord current;
+ private long recordCount;
/**
* Gets the next record or null at the end of stream or max rows read.
@@ -247,8 +248,11 @@ final class CSVRecordIterator implements Iterator {
*/
private CSVRecord getNextRecord() {
CSVRecord record = null;
- if (format.useRow(recordNumber + 1)) {
+ if (format.useRow(recordCount + 1)) {
record = Uncheck.get(CSVParser.this::nextRecord);
+ if (record != null) {
+ recordCount++;
+ }
}
return record;
}
@@ -576,7 +580,9 @@ public CSVParser(final Reader reader, final CSVFormat format, final long charact
private void addRecordValue(final boolean lastRecord) {
final String input = format.trim(reusableToken.content.toString());
- if (lastRecord && input.isEmpty() && format.getTrailingDelimiter()) {
+ // Only drop the empty field produced by an actual trailing delimiter. A quoted empty
+ // field ("") is a real value, not a trailing delimiter, so it must be kept.
+ if (lastRecord && input.isEmpty() && format.getTrailingDelimiter() && !reusableToken.isQuoted) {
return;
}
recordList.add(handleNull(input));
diff --git a/src/main/java/org/apache/commons/csv/CSVRecord.java b/src/main/java/org/apache/commons/csv/CSVRecord.java
index 502bf318a..8dab14d90 100644
--- a/src/main/java/org/apache/commons/csv/CSVRecord.java
+++ b/src/main/java/org/apache/commons/csv/CSVRecord.java
@@ -281,7 +281,7 @@ public Iterator iterator() {
/**
* Puts all values of this record into the given Map.
*
- * @param the map type.
+ * @param The map type.
* @param map The Map to populate.
* @return the given map.
* @since 1.9.0
diff --git a/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java b/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java
index 889b58edc..20c1ef544 100644
--- a/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java
+++ b/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java
@@ -108,9 +108,11 @@ long getBytesRead() {
}
private long getEncodedCharLength(final char[] buf, final int offset, final int length) throws CharacterCodingException {
- int len = 0;
- for (int i = offset; i < length; i++) {
- len += getEncodedCharLength(buf[i]);
+ long len = 0;
+ int previous = lastChar;
+ for (int i = offset; i < offset + length; i++) {
+ len += getEncodedCharLength(previous, buf[i]);
+ previous = buf[i];
}
return len;
}
@@ -141,8 +143,12 @@ private long getEncodedCharLength(final char[] buf, final int offset, final int
* @throws CharacterCodingException if the character cannot be encoded.
*/
private int getEncodedCharLength(final int current) throws CharacterCodingException {
+ return getEncodedCharLength(lastChar, current);
+ }
+
+ private int getEncodedCharLength(final int previous, final int current) throws CharacterCodingException {
final char cChar = (char) current;
- final char lChar = (char) lastChar;
+ final char lChar = (char) previous;
if (!Character.isSurrogate(cChar)) {
return encoder.encode(CharBuffer.wrap(new char[] { cChar })).limit();
}
@@ -204,7 +210,7 @@ public int read() throws IOException {
if (current == CR || current == LF && lastChar != CR || current == EOF && lastChar != CR && lastChar != LF && lastChar != EOF) {
lineNumber++;
}
- if (encoder != null) {
+ if (encoder != null && current != EOF) {
this.bytesRead += getEncodedCharLength(current);
}
lastChar = current;
@@ -218,6 +224,9 @@ public int read(final char[] buf, final int offset, final int length) throws IOE
return 0;
}
final int len = super.read(buf, offset, length);
+ if (encoder != null && len > 0) {
+ this.bytesRead += getEncodedCharLength(buf, offset, len);
+ }
if (len > 0) {
for (int i = offset; i < offset + len; i++) {
final char ch = buf[i];
@@ -233,9 +242,6 @@ public int read(final char[] buf, final int offset, final int length) throws IOE
} else if (len == EOF) {
lastChar = EOF;
}
- if (encoder != null) {
- this.bytesRead += getEncodedCharLength(buf, offset, len);
- }
position += len;
return len;
}
diff --git a/src/main/java/org/apache/commons/csv/Lexer.java b/src/main/java/org/apache/commons/csv/Lexer.java
index de97868e4..fe964480a 100644
--- a/src/main/java/org/apache/commons/csv/Lexer.java
+++ b/src/main/java/org/apache/commons/csv/Lexer.java
@@ -153,6 +153,7 @@ boolean isDelimiter(final int ch) throws IOException {
isLastTokenDelimiter = true;
return true;
}
+ Arrays.fill(delimiterBuf, '\0');
reader.peek(delimiterBuf);
for (int i = 0; i < delimiterBuf.length; i++) {
if (delimiterBuf[i] != delimiter[i + 1]) {
@@ -191,6 +192,7 @@ boolean isEscape(final int ch) {
* @throws IOException If an I/O error occurs.
*/
boolean isEscapeDelimiter() throws IOException {
+ Arrays.fill(escapeDelimiterBuf, '\0');
reader.peek(escapeDelimiterBuf);
if (escapeDelimiterBuf[0] != delimiter[0]) {
return false;
@@ -273,18 +275,24 @@ Token nextToken(final Token token) throws IOException {
token.type = Token.Type.COMMENT;
return token;
}
- Arrays.fill(delimiterBuf, '\0');
// Important: make sure a new char gets consumed in each iteration
while (token.type == Token.Type.INVALID) {
+ // isDelimiter consumes the trailing characters of a multi-character delimiter as a side effect, so it must
+ // only be evaluated once per character. Remember a match found while skipping whitespace below.
+ boolean delimiter = false;
// ignore whitespaces at beginning of a token
if (ignoreSurroundingSpaces) {
- while (Character.isWhitespace((char) c) && !isDelimiter(c) && !eol) {
+ while (Character.isWhitespace((char) c) && !eol) {
+ if (isDelimiter(c)) {
+ delimiter = true;
+ break;
+ }
c = reader.read();
eol = readEndOfLine(c);
}
}
// ok, start of token reached: encapsulated, or token
- if (isDelimiter(c)) {
+ if (delimiter || isDelimiter(c)) {
// empty token return TOKEN("")
token.type = Token.Type.TOKEN;
} else if (eol) {
diff --git a/src/test/java/org/apache/commons/csv/CSVFormatTest.java b/src/test/java/org/apache/commons/csv/CSVFormatTest.java
index ca18754f7..ed20898de 100644
--- a/src/test/java/org/apache/commons/csv/CSVFormatTest.java
+++ b/src/test/java/org/apache/commons/csv/CSVFormatTest.java
@@ -966,6 +966,23 @@ void testPrintWithQuotes() throws IOException {
assertEquals("\"\"\"a,b,c\r\nx,y,z\"", out.toString());
}
+ /**
+ * Tests CSV-326.
+ */
+ @Test
+ void testPrintWithQuotesEscapeBeforeQuote() throws IOException {
+ final CSVFormat format = CSVFormat.DEFAULT.builder()
+ .setEscape('\\')
+ .setQuote('"')
+ .get();
+ final String value = "\\\"";
+ final Appendable out = new StringBuilder();
+ format.print(new StringReader(value), out, true);
+ try (CSVParser parser = CSVParser.parse(out.toString(), format)) {
+ assertEquals(value, parser.getRecords().get(0).get(0));
+ }
+ }
+
@Test
void testQuoteCharSameAsCommentStartThrowsException() {
assertThrows(IllegalArgumentException.class, () -> CSVFormat.DEFAULT.builder().setQuote('!').setCommentMarker('!').get());
@@ -1023,6 +1040,11 @@ void testQuotedNullStringTracksQuoteCharacter() throws IOException {
builder.setQuote((Character) null);
builder.get().print(null, out, true);
assertEquals("\"NULL\"", out.toString());
+ // reset, reverse setter order
+ out.setLength(0);
+ builder.setNullString(null).setQuote((Character) null).setNullString("NULL");
+ builder.get().print(null, out, true);
+ assertEquals("\"NULL\"", out.toString());
}
@Test
diff --git a/src/test/java/org/apache/commons/csv/CSVParserTest.java b/src/test/java/org/apache/commons/csv/CSVParserTest.java
index 8b1527c42..6d9bdd9e8 100644
--- a/src/test/java/org/apache/commons/csv/CSVParserTest.java
+++ b/src/test/java/org/apache/commons/csv/CSVParserTest.java
@@ -465,6 +465,31 @@ void testDuplicateHeadersNotAllowed() {
() -> CSVParser.parse("a,b,a\n1,2,3\nx,y,z", CSVFormat.DEFAULT.withHeader().withAllowDuplicateHeaderNames(false)));
}
+ /**
+ * With {@code ignoreSurroundingSpaces} enabled and a multi-character delimiter whose first character is whitespace,
+ * the empty field at the delimiter boundary must survive. The delimiter look-ahead is consumed while skipping
+ * leading whitespace, so re-evaluating it would drop the empty field and merge the following field's value.
+ */
+ @Test
+ void testEmptyFieldBeforeWhitespacePrefixedMultiCharacterDelimiter() throws IOException {
+ final CSVFormat format = CSVFormat.DEFAULT.builder().setDelimiter(" |").setIgnoreSurroundingSpaces(true).get();
+ try (CSVParser parser = CSVParser.parse(" |a", format)) {
+ final List records = parser.getRecords();
+ assertEquals(1, records.size());
+ assertValuesEquals(new String[] { "", "a" }, records.get(0));
+ }
+ try (CSVParser parser = CSVParser.parse("a | |b", format)) {
+ final List records = parser.getRecords();
+ assertEquals(1, records.size());
+ assertValuesEquals(new String[] { "a", "", "b" }, records.get(0));
+ }
+ try (CSVParser parser = CSVParser.parse("a | |b |", format)) {
+ final List records = parser.getRecords();
+ assertEquals(1, records.size());
+ assertValuesEquals(new String[] { "a", "", "b", "" }, records.get(0));
+ }
+ }
+
@Test
void testEmptyFile() throws Exception {
try (CSVParser parser = CSVParser.parse(Paths.get("src/test/resources/org/apache/commons/csv/empty.txt"), StandardCharsets.UTF_8,
@@ -666,6 +691,31 @@ void testGetBytePositionMultiCharacterDelimiter() throws IOException {
}
}
+ /**
+ * Tests CSV-329.
+ */
+ @Test
+ void testGetBytePositionMultiCharacterDelimiterWithSupplementaryCharacter() throws IOException {
+ final String delimiter = "x😀";
+ final String code = "ax😀b\ncx😀d\n";
+ final CSVFormat format = CSVFormat.DEFAULT.builder().setDelimiter(delimiter).get();
+ try (CSVParser parser = CSVParser.builder()
+ .setReader(new StringReader(code))
+ .setFormat(format)
+ .setCharset(UTF_8)
+ .setTrackBytes(true)
+ .get()) {
+ final CSVRecord first = parser.nextRecord();
+ final CSVRecord second = parser.nextRecord();
+ assertNotNull(first);
+ assertNotNull(second);
+ assertValuesEquals(new String[] { "a", "b" }, first);
+ assertValuesEquals(new String[] { "c", "d" }, second);
+ assertEquals(0, first.getBytePosition());
+ assertEquals("ax😀b\n".getBytes(UTF_8).length, second.getBytePosition());
+ }
+ }
+
@Test
void testGetBytePositionWithCharacterOffsetAndMultiBytePrefix() throws Exception {
final String row0 = "é,x\n";
@@ -696,6 +746,27 @@ void testGetBytePositionWithCharacterOffsetAndMultiBytePrefix() throws Exception
}
}
+ @Test
+ void testGetBytePositionWithSingleByteCharset() throws IOException {
+ // A single-byte charset cannot encode U+FFFF, the char value of the EOF sentinel.
+ // Byte counting must skip the EOF read so a valid file parses without throwing.
+ final String code = "a,b\nc,d\n";
+ try (CSVParser parser = CSVParser.builder()
+ .setReader(new StringReader(code))
+ .setFormat(CSVFormat.DEFAULT)
+ .setCharset(StandardCharsets.ISO_8859_1)
+ .setTrackBytes(true)
+ .get()) {
+ final CSVRecord first = parser.nextRecord();
+ final CSVRecord second = parser.nextRecord();
+ assertNotNull(first);
+ assertNotNull(second);
+ assertNull(parser.nextRecord());
+ assertEquals(0, first.getBytePosition());
+ assertEquals(4, second.getBytePosition());
+ }
+ }
+
@Test
void testGetHeaderComment_HeaderComment1() throws IOException {
try (CSVParser parser = CSVParser.parse(CSV_INPUT_HEADER_COMMENT, FORMAT_AUTO_HEADER)) {
@@ -965,6 +1036,23 @@ void testGetRecordsMaxRows(final long maxRows) throws IOException {
}
}
+ /**
+ * Tests CSV-327.
+ */
+ @Test
+ void testGetRecordsMaxRowsWithRecordNumberOffset() throws IOException {
+ try (CSVParser parser = CSVParser.builder()
+ .setReader(new StringReader("a,b\nc,d\n"))
+ .setFormat(CSVFormat.DEFAULT.builder().setMaxRows(1).get())
+ .setRecordNumber(2)
+ .get()) {
+ final List records = parser.getRecords();
+ assertEquals(1, records.size());
+ assertEquals(2, records.get(0).getRecordNumber());
+ assertValuesEquals(new String[] { "a", "b" }, records.get(0));
+ }
+ }
+
@Test
void testGetRecordThreeBytesRead() throws Exception {
final String code = "id,date,val5,val4\n" +
@@ -1651,6 +1739,20 @@ void testParsingPrintedEmptyFirstColumn(final CSVFormat.Predefined format) throw
}
}
+ /**
+ * A truncated escaped multi-character delimiter at EOF must stay literal data and not be completed from a stale
+ * escape delimiter look-ahead.
+ */
+ @Test
+ void testPartialEscapedMultiCharacterDelimiterAtEOF() throws IOException {
+ final CSVFormat format = CSVFormat.DEFAULT.builder().setDelimiter("[|]").setEscape('!').get();
+ try (CSVParser parser = format.parse(new StringReader("x![!|!]y![!|"))) {
+ final CSVRecord record = parser.nextRecord();
+ assertEquals("x[|]y![!|", record.get(0));
+ assertEquals(1, record.size());
+ }
+ }
+
/**
* Tests CSV-324.
*/
@@ -1665,6 +1767,22 @@ void testPartialMultiCharacterDelimiterAtEOF() throws IOException {
}
}
+ /**
+ * A truncated multi-character delimiter at EOF must not be completed from the look-ahead buffer left dirty by an
+ * earlier non-matching peek in the same token.
+ */
+ @Test
+ void testPartialMultiCharacterDelimiterAtEOFAfterMismatch() throws IOException {
+ final CSVFormat format = CSVFormat.DEFAULT.builder().setDelimiter("[|]").get();
+ // The "[a]" peek leaves ']' in the look-ahead buffer; the trailing "[|" must not match "[|]".
+ final String recordString = "x[a][|";
+ try (CSVParser parser = format.parse(new StringReader(recordString))) {
+ final CSVRecord record = parser.nextRecord();
+ assertEquals(recordString, record.get(0));
+ assertEquals(1, record.size());
+ }
+ }
+
@Test
void testProvidedHeader() throws Exception {
final Reader in = new StringReader("a,b,c\n1,2,3\nx,y,z");
@@ -1856,6 +1974,23 @@ void testTrailingDelimiter() throws Exception {
}
}
+ @Test
+ void testTrailingDelimiterKeepsQuotedEmptyLastField() throws Exception {
+ final CSVFormat format = CSVFormat.DEFAULT.builder().setTrailingDelimiter(true).get();
+ try (CSVParser parser = CSVParser.parse("a,b,\"\"", format)) {
+ final CSVRecord record = parser.iterator().next();
+ assertEquals(3, record.size());
+ assertEquals("a", record.get(0));
+ assertEquals("b", record.get(1));
+ assertEquals("", record.get(2));
+ }
+ // An unquoted trailing delimiter still drops the empty field.
+ try (CSVParser parser = CSVParser.parse("a,b,", format)) {
+ final CSVRecord record = parser.iterator().next();
+ assertEquals(2, record.size());
+ }
+ }
+
@Test
void testTrim() throws Exception {
final Reader in = new StringReader("a,a,a\n\" 1 \",\" 2 \",\" 3 \"\nx,y,z");
diff --git a/src/test/java/org/apache/commons/csv/CSVPrinterTest.java b/src/test/java/org/apache/commons/csv/CSVPrinterTest.java
index 1ff791010..9ae80c1e5 100644
--- a/src/test/java/org/apache/commons/csv/CSVPrinterTest.java
+++ b/src/test/java/org/apache/commons/csv/CSVPrinterTest.java
@@ -569,6 +569,57 @@ void testEscapeBackslash5() throws IOException {
assertEquals("\\\\", sw.toString());
}
+ @Test
+ void testEscapeCommentMarkerFirstChar() throws IOException {
+ // No quoting available in escape mode, so a leading comment marker must be escaped or the
+ // record reads back as a comment and is dropped. Mirrors the quoting fix for QuoteMode.MINIMAL.
+ final CSVFormat format = CSVFormat.DEFAULT.builder().setQuote(null).setEscape('\\').setCommentMarker(';').get();
+ final StringWriter sw = new StringWriter();
+ final String col1 = ";comment-like";
+ try (CSVPrinter printer = new CSVPrinter(sw, format)) {
+ printer.printRecord(col1, "b");
+ printer.printRecord(new StringReader(col1), new StringReader("b"));
+ // The marker past the first character does not start a comment and is left alone.
+ printer.printRecord("a;b", ";c");
+ }
+ final String string = sw.toString();
+ assertEquals("\\;comment-like,b" + RECORD_SEPARATOR +
+ "\\;comment-like,b" + RECORD_SEPARATOR +
+ "a;b,\\;c" + RECORD_SEPARATOR, string);
+ // The emitted records must read back as the original values, none parsed as a comment.
+ try (CSVParser parser = CSVParser.parse(string, format)) {
+ final List records = parser.getRecords();
+ assertEquals(3, records.size());
+ assertEquals(col1, records.get(0).get(0));
+ assertEquals("b", records.get(0).get(1));
+ assertEquals(col1, records.get(1).get(0));
+ assertEquals("b", records.get(1).get(1));
+ assertEquals("a;b", records.get(2).get(0));
+ assertEquals(";c", records.get(2).get(1));
+ }
+ }
+
+ @Test
+ void testEscapeCommentMarkerFirstCharWithQuoteModeNone() throws IOException {
+ final CSVFormat format = CSVFormat.DEFAULT.builder().setEscape('\\').setQuoteMode(QuoteMode.NONE).setCommentMarker(';').get();
+ final StringWriter sw = new StringWriter();
+ final String col1 = ";bar";
+ try (CSVPrinter printer = new CSVPrinter(sw, format)) {
+ printer.printRecord(col1, "b");
+ printer.printRecord(new StringReader(col1), new StringReader("b"));
+ }
+ final String string = sw.toString();
+ assertEquals("\\;bar,b" + RECORD_SEPARATOR + "\\;bar,b" + RECORD_SEPARATOR, string);
+ try (CSVParser parser = CSVParser.parse(string, format)) {
+ final List records = parser.getRecords();
+ assertEquals(2, records.size());
+ for (final CSVRecord record : records) {
+ assertEquals(col1, record.get(0));
+ assertEquals("b", record.get(1));
+ }
+ }
+ }
+
@Test
void testEscapeNull1() throws IOException {
final StringWriter sw = new StringWriter();
@@ -1798,6 +1849,28 @@ void testQuoteAll() throws IOException {
}
}
+ @Test
+ void testQuoteCharEscapedWithQuoteModeNone() throws IOException {
+ final CSVFormat format = CSVFormat.DEFAULT.builder().setQuote('"').setEscape('?').setQuoteMode(QuoteMode.NONE).get();
+ final StringWriter sw = new StringWriter();
+ final String col1 = "\"abc";
+ final String col2 = "x\"y";
+ try (CSVPrinter printer = new CSVPrinter(sw, format)) {
+ printer.printRecord(col1, col2);
+ printer.printRecord(new StringReader(col1), new StringReader(col2));
+ }
+ assertEquals("?\"abc,x?\"y" + RECORD_SEPARATOR + "?\"abc,x?\"y" + RECORD_SEPARATOR, sw.toString());
+ // The emitted records must read back as the original values.
+ try (CSVParser parser = CSVParser.parse(sw.toString(), format)) {
+ final List records = parser.getRecords();
+ assertEquals(2, records.size());
+ for (final CSVRecord record : records) {
+ assertEquals(col1, record.get(0));
+ assertEquals(col2, record.get(1));
+ }
+ }
+ }
+
@Test
void testQuoteCommaFirstChar() throws IOException {
final StringWriter sw = new StringWriter();
@@ -1807,6 +1880,34 @@ void testQuoteCommaFirstChar() throws IOException {
}
}
+ @Test
+ void testQuoteCommentMarkerFirstChar() throws IOException {
+ final CSVFormat format = CSVFormat.DEFAULT.builder().setCommentMarker(';').get();
+ final StringWriter sw = new StringWriter();
+ final String col1 = ";comment-like"; // a data value that begins with the comment marker
+ try (CSVPrinter printer = new CSVPrinter(sw, format)) {
+ // A real comment is written with the marker, unquoted.
+ printer.printComment("a real comment");
+ // A value starting with the marker is quoted, so it does not read back as a comment.
+ printer.printRecord(col1, "b");
+ // The marker past the first character does not start a comment, so only the leading-marker value is quoted.
+ printer.printRecord("a;b", ";c");
+ }
+ final String string = sw.toString();
+ assertEquals("; a real comment" + RECORD_SEPARATOR +
+ "\";comment-like\",b" + RECORD_SEPARATOR +
+ "a;b,\";c\"" + RECORD_SEPARATOR, string);
+ // The comment is dropped on read; both data records survive intact.
+ try (CSVParser parser = CSVParser.parse(string, format)) {
+ final List<CSVRecord> records = parser.getRecords(); // typed list: records.get(i).get(j) needs CSVRecord elements
+ assertEquals(2, records.size());
+ assertEquals(col1, records.get(0).get(0));
+ assertEquals("b", records.get(0).get(1));
+ assertEquals("a;b", records.get(1).get(0));
+ assertEquals(";c", records.get(1).get(1));
+ }
+ }
+
@Test
void testQuoteNonNumeric() throws IOException {
final StringWriter sw = new StringWriter();
diff --git a/src/test/java/org/apache/commons/csv/ExtendedBufferedReaderTest.java b/src/test/java/org/apache/commons/csv/ExtendedBufferedReaderTest.java
index 056b8a9c9..b8d9b9f19 100644
--- a/src/test/java/org/apache/commons/csv/ExtendedBufferedReaderTest.java
+++ b/src/test/java/org/apache/commons/csv/ExtendedBufferedReaderTest.java
@@ -26,6 +26,7 @@
import static org.junit.jupiter.api.Assertions.assertNull;
import java.io.StringReader;
+import java.nio.charset.StandardCharsets;
import org.junit.jupiter.api.Test;
@@ -104,6 +105,19 @@ void testReadingInDifferentBuffer() throws Exception {
}
}
+ @Test
+ void testReadingSupplementaryCharacterTracksBytes() throws Exception {
+ final String input = "😀"; // supplementary character: one code point held as two UTF-16 chars
+ final char[] buffer = new char[input.length()];
+ try (ExtendedBufferedReader reader = new ExtendedBufferedReader(new StringReader(input), StandardCharsets.UTF_8, true)) { // NOTE(review): trailing 'true' presumably enables byte tracking - confirm against the constructor
+ assertEquals(input.length(), reader.read(buffer, 0, buffer.length)); // a single bulk read returns every char
+ assertArrayEquals(input.toCharArray(), buffer);
+ assertEquals(input.getBytes(StandardCharsets.UTF_8).length, reader.getBytesRead()); // expected in UTF-8 encoded bytes, not chars
+ assertEquals(input.length(), reader.getPosition()); // expected in chars
+ assertEquals(input.charAt(input.length() - 1), reader.getLastChar()); // the final char of the input string
+ }
+ }
+
@Test
void testReadLine() throws Exception {
try (ExtendedBufferedReader br = createBufferedReader("")) {
diff --git a/src/test/java/org/apache/commons/csv/LexerTest.java b/src/test/java/org/apache/commons/csv/LexerTest.java
index 511876a28..a76f6e513 100644
--- a/src/test/java/org/apache/commons/csv/LexerTest.java
+++ b/src/test/java/org/apache/commons/csv/LexerTest.java
@@ -216,6 +216,25 @@ void testDelimiterIsWhitespace() throws IOException {
}
}
+ /**
+ * With {@code ignoreSurroundingSpaces} enabled and a multi-character delimiter whose first character is whitespace,
+ * the side-effecting {@link Lexer#isDelimiter(int)} must only be evaluated once per character, otherwise the
+ * delimiter is consumed in the whitespace-skip loop and the empty field at the boundary is dropped.
+ */
+ @Test
+ void testEmptyTokenBeforeWhitespacePrefixedMultiCharacterDelimiter() throws IOException {
+ final CSVFormat format = CSVFormat.DEFAULT.builder().setDelimiter(" |").setIgnoreSurroundingSpaces(true).get();
+ try (Lexer lexer = createLexer(" |a", format)) { // input begins with the delimiter
+ assertNextToken(TOKEN, "", lexer); // empty field ahead of the leading delimiter
+ assertNextToken(EOF, "a", lexer);
+ }
+ try (Lexer lexer = createLexer("a | |b", format)) { // two delimiters back to back
+ assertNextToken(TOKEN, "a", lexer);
+ assertNextToken(TOKEN, "", lexer); // empty field between the two delimiters
+ assertNextToken(EOF, "b", lexer);
+ }
+ }
+
@Test
void testEOFWithoutClosingQuote() throws Exception {
final String code = "a,\"b";
@@ -409,6 +428,18 @@ void testNextToken6() throws IOException {
}
}
+ /**
+ * A truncated escaped multi-character delimiter at EOF must not be accepted by reusing the previous escape delimiter
+ * look-ahead in {@link Lexer#isEscapeDelimiter()}.
+ */
+ @Test
+ void testPartialEscapedMultiCharacterDelimiterAtEOF() throws IOException {
+ final CSVFormat format = CSVFormat.DEFAULT.builder().setDelimiter("[|]").setEscape('!').get();
+ try (Lexer lexer = createLexer("x![!|!]y![!|", format)) { // one complete escaped delimiter, then one cut off by EOF
+ assertNextToken(EOF, "x[|]y![!|", lexer); // complete one becomes "[|]"; the cut-off tail is expected verbatim
+ }
+ }
+
/**
* Tests CSV-324.
*/
@@ -421,6 +452,20 @@ void testPartialMultiCharacterDelimiterAtEOF() throws IOException {
}
}
+ /**
+ * A truncated multi-character delimiter at EOF must not be accepted by reusing the look-ahead buffer left dirty by an
+ * earlier non-matching peek in the same token (CSV-324 only cleared the buffer once per token).
+ */
+ @Test
+ void testPartialMultiCharacterDelimiterAtEOFAfterMismatch() throws IOException {
+ final CSVFormat format = CSVFormat.DEFAULT.builder().setDelimiter("[|]").get();
+ // The "[a]" peek leaves ']' in the look-ahead buffer; the trailing "[|" must not match "[|]".
+ final String recordString = "x[a][|";
+ try (Lexer lexer = createLexer(recordString, format)) {
+ assertNextToken(EOF, recordString, lexer); // the whole input is expected back as one token at EOF
+ }
+ }
+
@Test
void testReadEscapeBackspace() throws IOException {
try (Lexer lexer = createLexer("b", CSVFormat.DEFAULT.withEscape('\b'))) {