diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml
index b7b7fdf05..cca38e512 100644
--- a/.github/workflows/codeql-analysis.yml
+++ b/.github/workflows/codeql-analysis.yml
@@ -46,10 +46,10 @@ jobs:
steps:
- name: Checkout repository
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+ uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
with:
persist-credentials: false
- - uses: actions/cache@27d5ce7f107fe9357f9df03efb73ab90386fccae #v5.0.5
+ - uses: actions/cache@55cc8345863c7cc4c66a329aec7e433d2d1c52a9 #v6.1.0
with:
path: ~/.m2/repository
key: ${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }}
@@ -58,7 +58,7 @@ jobs:
# Initializes the CodeQL tools for scanning.
- name: Initialize CodeQL
- uses: github/codeql-action/init@95e58e9a2cdfd71adc6e0353d5c52f41a045d225 # v4.35.2
+ uses: github/codeql-action/init@8aad20d150bbac5944a9f9d289da16a4b0d87c1e # v4.36.2
with:
languages: ${{ matrix.language }}
# If you wish to specify custom queries, you can do so here or in a config file.
@@ -69,7 +69,7 @@ jobs:
# Autobuild attempts to build any compiled languages (C/C++, C#, or Java).
# If this step fails, then you should remove it and run the build manually (see below)
- name: Autobuild
- uses: github/codeql-action/autobuild@95e58e9a2cdfd71adc6e0353d5c52f41a045d225 # v4.35.2
+ uses: github/codeql-action/autobuild@8aad20d150bbac5944a9f9d289da16a4b0d87c1e # v4.36.2
# ℹ️ Command-line programs to run using the OS shell.
# 📚 https://git.io/JvXDl
@@ -83,4 +83,4 @@ jobs:
# make release
- name: Perform CodeQL Analysis
- uses: github/codeql-action/analyze@95e58e9a2cdfd71adc6e0353d5c52f41a045d225 # v4.35.2
+ uses: github/codeql-action/analyze@8aad20d150bbac5944a9f9d289da16a4b0d87c1e # v4.36.2
diff --git a/.github/workflows/dependency-review.yml b/.github/workflows/dependency-review.yml
index f0d8ca94e..7bc02bdd2 100644
--- a/.github/workflows/dependency-review.yml
+++ b/.github/workflows/dependency-review.yml
@@ -26,6 +26,6 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: 'Checkout Repository'
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+ uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
- name: 'Dependency Review PR'
- uses: actions/dependency-review-action@2031cfc080254a8a887f58cffee85186f0e49e48 # v4.9.0
+ uses: actions/dependency-review-action@a1d282b36b6f3519aa1f3fc636f609c47dddb294 # v5.0.0
diff --git a/.github/workflows/maven.yml b/.github/workflows/maven.yml
index 10c18b607..17ba7dd38 100644
--- a/.github/workflows/maven.yml
+++ b/.github/workflows/maven.yml
@@ -34,26 +34,26 @@ jobs:
fail-fast: false
matrix:
os: [ubuntu-latest, macos-latest]
- java: [ 8, 11, 17, 21, 25 ]
+ java: [ 8, 11, 17, 21, 25, 26 ]
experimental: [false]
# Keep the same parameter order as the matrix above
include:
- os: ubuntu-latest
- java: 26-ea
+ java: 27-ea
experimental: true
steps:
- - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+ - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
with:
persist-credentials: false
- - uses: actions/cache@27d5ce7f107fe9357f9df03efb73ab90386fccae #v5.0.5
+ - uses: actions/cache@55cc8345863c7cc4c66a329aec7e433d2d1c52a9 #v6.1.0
with:
path: ~/.m2/repository
key: ${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }}
restore-keys: |
${{ runner.os }}-maven-
- name: Set up JDK ${{ matrix.java }}
- uses: actions/setup-java@be666c2fcd27ec809703dec50e508c2fdc7f6654 # v5.2.0
+ uses: actions/setup-java@1bcf9fb12cf4aa7d266a90ae39939e61372fe520 # v5.4.0
with:
distribution: ${{ runner.os == 'macOS' && matrix.java == '8' && 'zulu' || 'temurin' }}
java-version: ${{ matrix.java }}
diff --git a/.github/workflows/scorecards-analysis.yml b/.github/workflows/scorecards-analysis.yml
index 16e37f605..e1868cb46 100644
--- a/.github/workflows/scorecards-analysis.yml
+++ b/.github/workflows/scorecards-analysis.yml
@@ -40,7 +40,7 @@ jobs:
steps:
- name: "Checkout code"
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # 6.0.2
+ uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # 7.0.0
with:
persist-credentials: false
@@ -64,6 +64,6 @@ jobs:
retention-days: 5
- name: "Upload to code-scanning"
- uses: github/codeql-action/upload-sarif@95e58e9a2cdfd71adc6e0353d5c52f41a045d225 # v4.35.2
+ uses: github/codeql-action/upload-sarif@8aad20d150bbac5944a9f9d289da16a4b0d87c1e # v4.36.2
with:
sarif_file: results.sarif
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index eb15f2518..3423e18ad 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -48,13 +48,13 @@ Getting Started
---------------
+ Make sure you have a [JIRA account](https://issues.apache.org/jira/).
-+ Make sure you have a [GitHub account](https://github.com/signup/free). This is not essential, but makes providing patches much easier.
++ Make sure you have a [GitHub account](https://github.com/signup). This is not essential, but makes providing patches much easier.
+ If you're planning to implement a new feature it makes sense to discuss your changes on the [dev list](https://commons.apache.org/mail-lists.html) first. This way you can make sure you're not wasting your time on something that isn't considered to be in Apache Commons CSV's scope.
+ Submit a [Jira Ticket][jira] for your issue, assuming one does not already exist.
+ Clearly describe the issue including steps to reproduce when it is a bug.
+ Make sure you fill in the earliest version that you know has the issue.
+ Find the corresponding [repository on GitHub](https://github.com/apache/?query=commons-),
-[fork](https://help.github.com/articles/fork-a-repo/) and check out your forked repository. If you don't have a GitHub account, you can still clone the Commons repository.
+[fork](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/fork-a-repo) and check out your forked repository. If you don't have a GitHub account, you can still clone the Commons repository.
Making Changes
--------------
@@ -108,8 +108,8 @@ Additional Resources
+ [Contributing patches](https://commons.apache.org/patches.html)
+ [Apache Commons CSV JIRA project page][jira]
+ [Contributor License Agreement][cla]
-+ [General GitHub documentation](https://help.github.com/)
-+ [GitHub pull request documentation](https://help.github.com/articles/creating-a-pull-request/)
++ [General GitHub documentation](https://docs.github.com/)
++ [GitHub pull request documentation](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-a-pull-request)
+ [Apache Commons Twitter Account](https://twitter.com/ApacheCommons)
[cla]:https://www.apache.org/licenses/#clas
diff --git a/pom.xml b/pom.xml
index 7e796568e..8cb13ed7c 100644
--- a/pom.xml
+++ b/pom.xml
@@ -20,10 +20,10 @@
org.apache.commons
commons-parent
- 99
+ 102
commons-csv
- 1.14.2-SNAPSHOT
+ 1.15.0-SNAPSHOT
Apache Commons CSV
https://commons.apache.org/proper/commons-csv/
2005
@@ -59,6 +59,7 @@
com.h2database
h2
+
2.2.224
test
@@ -89,12 +90,12 @@
- 1.14.2
+ 1.15.0
(Java 8 or above)
RC1
1.14.1
- 1.14.3
+ 1.15.1
csv
org.apache.commons.csv
CSV
diff --git a/src/changes/changes.xml b/src/changes/changes.xml
index aacd40586..93952e9f1 100644
--- a/src/changes/changes.xml
+++ b/src/changes/changes.xml
@@ -40,16 +40,34 @@
Apache Commons CSV Release Notes
-
+
Remove Spotbugs dependency and use exclude-filter instead #564.
Remove broken website link #577.
Fix Apache RAT plugin console warnings.
[Javadoc] Clarify behavior of deprecated CSVFormat#withFirstRecordAsHeader() #2413.
+ CSVFormat.equals()/hashCode() ignores maxRows (#600).
+ ExtendedBufferedReader byte tracking leads to an incorrect CSVRecord.getBytePosition() (#601).
+ CSVFormat.Builder.setQuote() does not refresh quotedNullString (#2447).
+ Lexer.isDelimiter() accepts a partial multi-character delimiter at EOF (#603).
+ CSVParser applies characterOffset to bytePosition (#604).
+ CSVPrinter Reader printing with quote and escape can emit CSV that its parser cannot read back.
+ CSVParser applies maxRows to record numbers instead of rows produced when setRecordNumber(...) is used.
+ CSVParser with trackBytes enabled throws on multi-character delimiters containing supplementary Unicode characters.
+ CSVFormat.Builder.setNullString(String) can build an invalid quoted null string after setQuote(null).
+ Escape Reader values with quote and escape (#606).
+ Clear escape delimiter buffer before peek in Lexer.isEscapeDelimiter() (#608, #611).
+ Escape quote char in printWithEscapes when QuoteMode is NONE (#609).
+ Quote value starting with comment marker in minimal quote mode (#610).
+ Escape leading comment marker in printWithEscapes (#614).
+ Skip byte counting at EOF in ExtendedBufferedReader.read (#615).
+ Keep quoted empty trailing field with trailingDelimiter (#616).
+ Evaluate isDelimiter once in nextToken whitespace skip (#618)..
Add an "Android Compatibility" section to the web site.
+ Add CSVParser.Builder.setByteOffset(long) (#604).
- Bump org.apache.commons:commons-parent from 85 to 99 #573, #595.
+ Bump org.apache.commons:commons-parent from 85 to 102 #573, #595.
[test] Bump com.opencsv:opencsv from 5.11.2 to 5.12.0 #558.
Bump org.apache.commons:commons-lang3 from 3.18.0 to 3.20.0.
Bump commons-codec:commons-codec from 1.19.0 to 1.22.0.
diff --git a/src/main/java/org/apache/commons/csv/CSVFormat.java b/src/main/java/org/apache/commons/csv/CSVFormat.java
index 09e76e3a6..7145d23d3 100644
--- a/src/main/java/org/apache/commons/csv/CSVFormat.java
+++ b/src/main/java/org/apache/commons/csv/CSVFormat.java
@@ -780,8 +780,7 @@ public Builder setMaxRows(final long maxRows) {
*/
public Builder setNullString(final String nullString) {
this.nullString = nullString;
- this.quotedNullString = quoteCharacter + nullString + quoteCharacter;
- return this;
+ return setQuotedNullString();
}
/**
@@ -806,6 +805,12 @@ public Builder setQuote(final Character quoteCharacter) {
throw new IllegalArgumentException("The quoteCharacter cannot be a line break");
}
this.quoteCharacter = quoteCharacter;
+ return setQuotedNullString();
+ }
+
+ private Builder setQuotedNullString() {
+ final Character quote = quoteCharacter != null ? quoteCharacter : Constants.DOUBLE_QUOTE_CHAR;
+ this.quotedNullString = quote + nullString + quote;
return this;
}
@@ -878,6 +883,16 @@ public Builder setTrailingData(final boolean trailingData) {
/**
* Sets whether to add a trailing delimiter.
*
+ *
+ * When writing, a delimiter is appended after the last value of each record. When reading, the empty field
+ * that such a trailing delimiter produces is dropped so the output round-trips back to the original record;
+ * a quoted empty trailing field ({@code ""}) is a real value rather than a trailing delimiter and is kept.
+ *
+ *
+ * This is unrelated to {@link #setTrailingData(boolean) trailing data}, which controls whether characters
+ * after the closing quote of an encapsulated value are tolerated when reading.
+ *
+ *
* @param trailingDelimiter whether to add a trailing delimiter.
* @return This instance.
*/
@@ -1477,7 +1492,7 @@ private static boolean isLineBreak(final char c) {
* @return true if {@code c} is a line break character (and not null).
*/
private static boolean isLineBreak(final Character c) {
- return c != null && isLineBreak(c.charValue()); // Explicit (un)boxing is intentional
+ return c != null && isLineBreak(c.charValue()); // Explicit unboxing is intentional
}
/** Same test as in as {@link String#trim()}. */
@@ -1690,15 +1705,15 @@ public boolean equals(final Object obj) {
duplicateHeaderMode == other.duplicateHeaderMode && Objects.equals(escapeCharacter, other.escapeCharacter) &&
Arrays.equals(headerComments, other.headerComments) && Arrays.equals(headers, other.headers) &&
ignoreEmptyLines == other.ignoreEmptyLines && ignoreHeaderCase == other.ignoreHeaderCase &&
- ignoreSurroundingSpaces == other.ignoreSurroundingSpaces && lenientEof == other.lenientEof &&
- Objects.equals(nullString, other.nullString) && Objects.equals(quoteCharacter, other.quoteCharacter) &&
- quoteMode == other.quoteMode && Objects.equals(quotedNullString, other.quotedNullString) &&
- Objects.equals(recordSeparator, other.recordSeparator) && skipHeaderRecord == other.skipHeaderRecord &&
- trailingData == other.trailingData && trailingDelimiter == other.trailingDelimiter && trim == other.trim;
+ ignoreSurroundingSpaces == other.ignoreSurroundingSpaces && lenientEof == other.lenientEof && maxRows == other.maxRows &&
+ Objects.equals(nullString, other.nullString) && Objects.equals(quoteCharacter, other.quoteCharacter) && quoteMode == other.quoteMode &&
+ Objects.equals(quotedNullString, other.quotedNullString) && Objects.equals(recordSeparator, other.recordSeparator) &&
+ skipHeaderRecord == other.skipHeaderRecord && trailingData == other.trailingData && trailingDelimiter == other.trailingDelimiter &&
+ trim == other.trim;
}
private void escape(final char c, final Appendable appendable) throws IOException {
- append(escapeCharacter.charValue(), appendable); // Explicit (un)boxing is intentional
+ append(escapeCharacter.charValue(), appendable); // Explicit unboxing is intentional
append(c, appendable);
}
@@ -1836,7 +1851,7 @@ public DuplicateHeaderMode getDuplicateHeaderMode() {
* @return the escape character, may be {@code 0}
*/
char getEscapeChar() {
- return escapeCharacter != null ? escapeCharacter.charValue() : 0; // Explicit (un)boxing is intentional
+ return escapeCharacter != null ? escapeCharacter.charValue() : 0; // Explicit unboxing is intentional
}
/**
@@ -2007,6 +2022,16 @@ public boolean getTrailingData() {
/**
* Gets whether to add a trailing delimiter.
*
+ *
+ * When writing, a delimiter is appended after the last value of each record. When reading, the empty field
+ * that such a trailing delimiter produces is dropped so the output round-trips back to the original record;
+ * a quoted empty trailing field ({@code ""}) is a real value rather than a trailing delimiter and is kept.
+ *
+ *
+ * This is unrelated to {@link #getTrailingData() trailing data}, which controls whether characters after the
+ * closing quote of an encapsulated value are tolerated when reading.
+ *
+ *
* @return whether to add a trailing delimiter.
* @since 1.3
*/
@@ -2029,9 +2054,10 @@ public int hashCode() {
int result = 1;
result = prime * result + Arrays.hashCode(headerComments);
result = prime * result + Arrays.hashCode(headers);
- return prime * result + Objects.hash(allowMissingColumnNames, autoFlush, commentMarker, delimiter, duplicateHeaderMode, escapeCharacter,
- ignoreEmptyLines, ignoreHeaderCase, ignoreSurroundingSpaces, lenientEof, nullString, quoteCharacter, quoteMode, quotedNullString,
+ result = prime * result + Objects.hash(allowMissingColumnNames, autoFlush, commentMarker, delimiter, duplicateHeaderMode, escapeCharacter,
+ ignoreEmptyLines, ignoreHeaderCase, ignoreSurroundingSpaces, lenientEof, maxRows, nullString, quoteCharacter, quoteMode, quotedNullString,
recordSeparator, skipHeaderRecord, trailingData, trailingDelimiter, trim);
+ return result;
}
/**
@@ -2158,7 +2184,7 @@ private void print(final InputStream inputStream, final Appendable out, final bo
}
final boolean quoteCharacterSet = isQuoteCharacterSet();
if (quoteCharacterSet) {
- append(getQuoteCharacter().charValue(), out); // Explicit (un)boxing is intentional
+ append(getQuoteCharacter().charValue(), out); // Explicit unboxing is intentional
}
// Stream the input to the output without reading or holding the whole value in memory.
// AppendableOutputStream cannot "close" an Appendable.
@@ -2166,7 +2192,7 @@ private void print(final InputStream inputStream, final Appendable out, final bo
IOUtils.copy(inputStream, outputStream);
}
if (quoteCharacterSet) {
- append(getQuoteCharacter().charValue(), out); // Explicit (un)boxing is intentional
+ append(getQuoteCharacter().charValue(), out); // Explicit unboxing is intentional
}
}
@@ -2321,12 +2347,18 @@ private void printWithEscapes(final CharSequence charSeq, final Appendable appen
final char[] delimArray = getDelimiterCharArray();
final int delimLength = delimArray.length;
final char escape = getEscapeChar();
+ final boolean quoteSet = isQuoteCharacterSet();
+ final char quote = quoteSet ? getQuoteCharacter().charValue() : 0;
+ final boolean commentMarkerSet = isCommentMarkerSet();
+ final char commentChar = commentMarkerSet ? commentMarker.charValue() : 0; // Explicit unboxing is intentional
while (pos < end) {
char c = charSeq.charAt(pos);
final boolean isDelimiterStart = isDelimiter(c, charSeq, pos, delimArray, delimLength);
final boolean isCr = c == Constants.CR;
final boolean isLf = c == Constants.LF;
- if (isCr || isLf || c == escape || isDelimiterStart) {
+ // A leading comment marker would be read back as a comment, so escape it.
+ final boolean isComment = commentMarkerSet && pos == 0 && c == commentChar;
+ if (isCr || isLf || c == escape || quoteSet && c == quote || isDelimiterStart || isComment) {
// write out segment up until this char
if (pos > start) {
appendable.append(charSeq, start, pos);
@@ -2365,8 +2397,13 @@ private void printWithEscapes(final Reader reader, final Appendable appendable)
final char[] delimArray = getDelimiterCharArray();
final int delimLength = delimArray.length;
final char escape = getEscapeChar();
+ final boolean quoteSet = isQuoteCharacterSet();
+ final char quote = quoteSet ? getQuoteCharacter().charValue() : 0;
+ final boolean commentMarkerSet = isCommentMarkerSet();
+ final char commentChar = commentMarkerSet ? commentMarker.charValue() : 0; // Explicit unboxing is intentional
final StringBuilder builder = new StringBuilder(IOUtils.DEFAULT_BUFFER_SIZE);
int c;
+ boolean firstChar = true;
final char[] lookAheadBuffer = new char[delimLength - 1];
while (EOF != (c = bufferedReader.read())) {
builder.append((char) c);
@@ -2376,7 +2413,10 @@ private void printWithEscapes(final Reader reader, final Appendable appendable)
final boolean isDelimiterStart = isDelimiter((char) c, test, pos, delimArray, delimLength);
final boolean isCr = c == Constants.CR;
final boolean isLf = c == Constants.LF;
- if (isCr || isLf || c == escape || isDelimiterStart) {
+ // A leading comment marker would be read back as a comment, so escape it.
+ final boolean isComment = commentMarkerSet && firstChar && c == commentChar;
+ firstChar = false;
+ if (isCr || isLf || c == escape || quoteSet && c == quote || isDelimiterStart || isComment) {
// write out segment up until this char
if (pos > start) {
append(builder.substring(start, pos), appendable);
@@ -2415,7 +2455,7 @@ private void printWithQuotes(final Object object, final CharSequence charSeq, fi
final int len = charSeq.length();
final char[] delim = getDelimiterCharArray();
final int delimLength = delim.length;
- final char quoteChar = getQuoteCharacter().charValue(); // Explicit (un)boxing is intentional
+ final char quoteChar = getQuoteCharacter().charValue(); // Explicit unboxing is intentional
// If escape char not specified, default to the quote char
// This avoids having to keep checking whether there is an escape character
// at the cost of checking against quote twice
@@ -2447,10 +2487,11 @@ private void printWithQuotes(final Object object, final CharSequence charSeq, fi
}
} else {
char c = charSeq.charAt(pos);
- if (c <= Constants.COMMENT) {
+ if (c <= Constants.COMMENT || isCommentMarkerSet() && c == commentMarker.charValue()) {
// Some other chars at the start of a value caused the parser to fail, so for now
// encapsulate if we start in anything less than '#'. We are being conservative
- // by including the default comment char too.
+ // by including the default comment char and any configured comment marker too,
+ // which the parser would otherwise read back as a comment line.
quote = true;
} else {
while (pos < len) {
@@ -2518,15 +2559,16 @@ private void printWithQuotes(final Reader reader, final Appendable appendable) t
printWithEscapes(reader, appendable);
return;
}
- final char quote = getQuoteCharacter().charValue(); // Explicit (un)boxing is intentional
+ final char quote = getQuoteCharacter().charValue(); // Explicit unboxing is intentional
+ final char escape = isEscapeCharacterSet() ? getEscapeChar() : quote;
// (1) Append opening quote
append(quote, appendable);
- // (2) Append Reader contents, doubling quotes
+ // (2) Append Reader contents, doubling quotes and escape characters
int c;
while (EOF != (c = reader.read())) {
append((char) c, appendable);
- if (c == quote) {
- append(quote, appendable);
+ if (c == quote || c == escape) {
+ append((char) c, appendable);
}
}
// (3) Append closing quote
@@ -2604,13 +2646,13 @@ boolean useRow(final long rowNum) {
* @throws IllegalArgumentException Throw when any attribute is invalid or inconsistent with other attributes.
*/
private void validate() throws IllegalArgumentException {
- if (quoteCharacter != null && contains(delimiter, quoteCharacter.charValue())) { // Explicit (un)boxing is intentional
+ if (quoteCharacter != null && contains(delimiter, quoteCharacter.charValue())) { // Explicit unboxing is intentional
throw new IllegalArgumentException("The quoteChar character and the delimiter cannot be the same ('" + quoteCharacter + "')");
}
- if (escapeCharacter != null && contains(delimiter, escapeCharacter.charValue())) { // Explicit (un)boxing is intentional
+ if (escapeCharacter != null && contains(delimiter, escapeCharacter.charValue())) { // Explicit unboxing is intentional
throw new IllegalArgumentException("The escape character and the delimiter cannot be the same ('" + escapeCharacter + "')");
}
- if (commentMarker != null && contains(delimiter, commentMarker.charValue())) { // Explicit (un)boxing is intentional
+ if (commentMarker != null && contains(delimiter, commentMarker.charValue())) { // Explicit unboxing is intentional
throw new IllegalArgumentException("The comment start character and the delimiter cannot be the same ('" + commentMarker + "')");
}
if (quoteCharacter != null && quoteCharacter.equals(commentMarker)) {
diff --git a/src/main/java/org/apache/commons/csv/CSVParser.java b/src/main/java/org/apache/commons/csv/CSVParser.java
index bce62ea54..141eba732 100644
--- a/src/main/java/org/apache/commons/csv/CSVParser.java
+++ b/src/main/java/org/apache/commons/csv/CSVParser.java
@@ -154,6 +154,7 @@ public final class CSVParser implements Iterable, Closeable {
public static class Builder extends AbstractStreamBuilder {
private CSVFormat format;
+ private long byteOffset = -1;
private long characterOffset;
private long recordNumber = 1;
private boolean trackBytes;
@@ -165,17 +166,33 @@ protected Builder() {
// empty
}
- @SuppressWarnings("resource")
@Override
public CSVParser get() throws IOException {
- return new CSVParser(getReader(), format != null ? format : CSVFormat.DEFAULT, characterOffset, recordNumber, getCharset(), trackBytes);
+ return new CSVParser(this);
}
/**
- * Sets the lexer offset when the parser does not start parsing at the beginning of the source.
+ * Sets the lexer byte offset when the parser does not start parsing at the beginning of the source.
+ *
+ * By default, the value is {@code -1}, which reuses the character offset for the byte offset.
+ *
*
- * @param characterOffset the lexer offset.
+ * @param byteOffset the lexer byte offset.
* @return {@code this} instance.
+ * @see #setCharacterOffset(long)
+ * @since 1.15.0
+ */
+ public Builder setByteOffset(final long byteOffset) {
+ this.byteOffset = byteOffset;
+ return asThis();
+ }
+
+ /**
+ * Sets the lexer character offset when the parser does not start parsing at the beginning of the source.
+ *
+ * @param characterOffset the lexer character offset.
+ * @return {@code this} instance.
+ * @see #setByteOffset(long)
*/
public Builder setCharacterOffset(final long characterOffset) {
this.characterOffset = characterOffset;
@@ -220,6 +237,7 @@ public Builder setTrackBytes(final boolean trackBytes) {
final class CSVRecordIterator implements Iterator {
private CSVRecord current;
+ private long recordCount;
/**
* Gets the next record or null at the end of stream or max rows read.
@@ -230,8 +248,11 @@ final class CSVRecordIterator implements Iterator {
*/
private CSVRecord getNextRecord() {
CSVRecord record = null;
- if (format.useRow(recordNumber + 1)) {
+ if (format.useRow(recordCount + 1)) {
record = Uncheck.get(CSVParser.this::nextRecord);
+ if (record != null) {
+ recordCount++;
+ }
}
return record;
}
@@ -466,6 +487,12 @@ public static CSVParser parse(final URL url, final Charset charset, final CSVFor
*/
private long recordNumber;
+ /**
+ * Lexer offset when the parser does not start parsing at the beginning of the source. Usually used in combination
+ * with {@link #recordNumber}.
+ */
+ private final long byteOffset;
+
/**
* Lexer offset when the parser does not start parsing at the beginning of the source. Usually used in combination
* with {@link #recordNumber}.
@@ -474,6 +501,23 @@ public static CSVParser parse(final URL url, final Charset charset, final CSVFor
private final Token reusableToken = new Token();
+ /**
+ * Constructs a new instance from a builder.
+ *
+ * @param builder The source builder.
+ * @throws IOException if an I/O error occurs.
+ */
+ @SuppressWarnings("resource") // Lexer manages ExtendedBufferedReader.
+ private CSVParser(final Builder builder) throws IOException {
+ this.format = (builder.format != null ? builder.format : CSVFormat.DEFAULT).copy();
+ this.lexer = new Lexer(format, new ExtendedBufferedReader(builder.getReader(), builder.getCharset(), builder.trackBytes));
+ this.csvRecordIterator = new CSVRecordIterator();
+ this.headers = createHeaders();
+ this.byteOffset = builder.byteOffset != -1 ? builder.byteOffset : builder.characterOffset;
+ this.characterOffset = builder.characterOffset;
+ this.recordNumber = builder.recordNumber - 1;
+ }
+
/**
* Constructs a new instance using the given {@link CSVFormat}.
*
@@ -524,51 +568,21 @@ public CSVParser(final Reader reader, final CSVFormat format) throws IOException
*/
@Deprecated
public CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber) throws IOException {
- this(reader, format, characterOffset, recordNumber, null, false);
- }
-
- /**
- * Constructs a new instance using the given {@link CSVFormat}.
- *
- *
- * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
- * unless you close the {@code reader}.
- *
- *
- * @param reader
- * a Reader containing CSV-formatted input. Must not be null.
- * @param format
- * the CSVFormat used for CSV parsing. Must not be null.
- * @param characterOffset
- * Lexer offset when the parser does not start parsing at the beginning of the source.
- * @param recordNumber
- * The next record number to assign.
- * @param charset
- * The character encoding to be used for the reader when enableByteTracking is true.
- * @param trackBytes
- * {@code true} to enable byte tracking for the parser; {@code false} to disable it.
- * @throws IllegalArgumentException
- * If the parameters of the format are inconsistent or if either the reader or format is null.
- * @throws IOException
- * If there is a problem reading the header or skipping the first record.
- * @throws CSVException Thrown on invalid CSV input data.
- */
- @SuppressWarnings("resource") // reader is managed by lexer.
- private CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber, final Charset charset,
- final boolean trackBytes) throws IOException {
- Objects.requireNonNull(reader, "reader");
- Objects.requireNonNull(format, "format");
- this.format = format.copy();
- this.lexer = new Lexer(format, new ExtendedBufferedReader(reader, charset, trackBytes));
- this.csvRecordIterator = new CSVRecordIterator();
- this.headers = createHeaders();
- this.characterOffset = characterOffset;
- this.recordNumber = recordNumber - 1;
+ // @formatter:off
+ this(builder()
+ .setReader(reader)
+ .setFormat(Objects.requireNonNull(format, "format")) // requireNonNull for full compatibility
+ .setCharacterOffset(characterOffset)
+ .setRecordNumber(recordNumber)
+ .setCharset((Charset) null).setTrackBytes(false));
+ // @formatter:off
}
private void addRecordValue(final boolean lastRecord) {
final String input = format.trim(reusableToken.content.toString());
- if (lastRecord && input.isEmpty() && format.getTrailingDelimiter()) {
+ // Only drop the empty field produced by an actual trailing delimiter. A quoted empty
+ // field ("") is a real value, not a trailing delimiter, so it must be kept.
+ if (lastRecord && input.isEmpty() && format.getTrailingDelimiter() && !reusableToken.isQuoted) {
return;
}
recordList.add(handleNull(input));
@@ -642,7 +656,7 @@ private Headers createHeaders() throws IOException {
}
observedMissing |= blankHeader;
if (header != null) {
- headerMap.put(header, Integer.valueOf(i)); // Explicit (un)boxing is intentional
+ headerMap.put(header, Integer.valueOf(i)); // Explicit boxing is intentional
if (headerNames == null) {
headerNames = new ArrayList<>(headerRecord.length);
}
@@ -887,7 +901,7 @@ CSVRecord nextRecord() throws IOException {
recordList.clear();
StringBuilder sb = null;
final long startCharPosition = lexer.getCharacterPosition() + characterOffset;
- final long startBytePosition = lexer.getBytesRead() + characterOffset;
+ final long startBytePosition = lexer.getBytesRead() + byteOffset;
do {
reusableToken.reset();
lexer.nextToken(reusableToken);
diff --git a/src/main/java/org/apache/commons/csv/CSVPrinter.java b/src/main/java/org/apache/commons/csv/CSVPrinter.java
index 087129ec5..a7048fd62 100644
--- a/src/main/java/org/apache/commons/csv/CSVPrinter.java
+++ b/src/main/java/org/apache/commons/csv/CSVPrinter.java
@@ -235,7 +235,7 @@ public void printComment(final String comment) throws IOException {
if (!newRecord) {
println();
}
- appendable.append(format.getCommentMarker().charValue()); // Explicit (un)boxing is intentional
+ appendable.append(format.getCommentMarker().charValue()); // Explicit unboxing is intentional
appendable.append(SP);
for (int i = 0; i < comment.length(); i++) {
final char c = comment.charAt(i);
@@ -247,7 +247,7 @@ public void printComment(final String comment) throws IOException {
// falls-through: break intentionally excluded.
case LF:
println();
- appendable.append(format.getCommentMarker().charValue()); // Explicit (un)boxing is intentional
+ appendable.append(format.getCommentMarker().charValue()); // Explicit unboxing is intentional
appendable.append(SP);
break;
default:
diff --git a/src/main/java/org/apache/commons/csv/CSVRecord.java b/src/main/java/org/apache/commons/csv/CSVRecord.java
index f619717d0..8dab14d90 100644
--- a/src/main/java/org/apache/commons/csv/CSVRecord.java
+++ b/src/main/java/org/apache/commons/csv/CSVRecord.java
@@ -132,13 +132,11 @@ public String get(final String name) {
throw new IllegalArgumentException(String.format("Mapping for %s not found, expected one of %s", name, headerMap.keySet()));
}
try {
- return values[index.intValue()]; // Explicit (un)boxing is intentional
+ return values[index.intValue()]; // Explicit unboxing is intentional
} catch (final ArrayIndexOutOfBoundsException e) {
+ // Explicit boxing is intentional
throw new IllegalArgumentException(
- String.format("Index for header '%s' is %d but CSVRecord only has %d values!", name, index, Integer.valueOf(values.length))); // Explicit
- // (un)boxing
- // is
- // intentional
+ String.format("Index for header '%s' is %d but CSVRecord only has %d values!", name, index, Integer.valueOf(values.length)));
}
}
@@ -267,7 +265,7 @@ public boolean isSet(final int index) {
* @return whether a given column is mapped and has a value.
*/
public boolean isSet(final String name) {
- return isMapped(name) && getHeaderMapRaw().get(name).intValue() < values.length; // Explicit (un)boxing is intentional
+ return isMapped(name) && getHeaderMapRaw().get(name).intValue() < values.length; // Explicit unboxing is intentional
}
/**
@@ -283,7 +281,7 @@ public Iterator iterator() {
/**
* Puts all values of this record into the given Map.
*
- * @param the map type.
+ * @param The map type.
* @param map The Map to populate.
* @return the given map.
* @since 1.9.0
diff --git a/src/main/java/org/apache/commons/csv/Constants.java b/src/main/java/org/apache/commons/csv/Constants.java
index 0b9476e1c..9dd276ecc 100644
--- a/src/main/java/org/apache/commons/csv/Constants.java
+++ b/src/main/java/org/apache/commons/csv/Constants.java
@@ -40,7 +40,7 @@ final class Constants {
/** RFC 4180 defines line breaks as CRLF. */
static final String CRLF = "\r\n";
- static final Character DOUBLE_QUOTE_CHAR = Character.valueOf('"'); // Explicit (un)boxing is intentional.
+ static final Character DOUBLE_QUOTE_CHAR = Character.valueOf('"'); // Explicit boxing is intentional.
static final String EMPTY = "";
diff --git a/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java b/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java
index 8dcda6517..20c1ef544 100644
--- a/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java
+++ b/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java
@@ -37,26 +37,30 @@
/**
* A special buffered reader which supports sophisticated read access.
*
- * In particular the reader supports a look-ahead option, which allows you to see the next char returned by
- * {@link #read()}. This reader also tracks how many characters have been read with {@link #getPosition()}.
+ * In particular the reader supports a look-ahead option, which allows you to see the next char returned by {@link #read()}. This reader also tracks how many
+ * characters have been read with {@link #getPosition()}.
*
*/
final class ExtendedBufferedReader extends UnsynchronizedBufferedReader {
/** The last char returned */
private int lastChar = UNDEFINED;
+
private int lastCharMark = UNDEFINED;
/** The count of EOLs (CR/LF/CRLF) seen so far */
private long lineNumber;
+
private long lineNumberMark;
/** The position, which is the number of characters read so far */
private long position;
+
private long positionMark;
/** The number of bytes read so far. */
private long bytesRead;
+
private long bytesReadMark;
/** Encoder for calculating the number of bytes for each character read. */
@@ -70,12 +74,11 @@ final class ExtendedBufferedReader extends UnsynchronizedBufferedReader {
}
/**
- * Constructs a new instance with the specified reader, character set,
- * and byte tracking option. Initializes an encoder if byte tracking is enabled
- * and a character set is provided.
+ * Constructs a new instance with the specified reader, character set, and byte tracking option. Initializes an encoder if byte tracking is enabled and a
+ * character set is provided.
*
- * @param reader the reader supports a look-ahead option.
- * @param charset the character set for encoding, or {@code null} if not applicable.
+ * @param reader the reader supports a look-ahead option.
+ * @param charset the character set for encoding, or {@code null} if not applicable.
* @param trackBytes {@code true} to enable byte tracking; {@code false} to disable it.
*/
ExtendedBufferedReader(final Reader reader, final Charset charset, final boolean trackBytes) {
@@ -86,8 +89,7 @@ final class ExtendedBufferedReader extends UnsynchronizedBufferedReader {
/**
* Closes the stream.
*
- * @throws IOException
- * If an I/O error occurs
+ * @throws IOException If an I/O error occurs
*/
@Override
public void close() throws IOException {
@@ -105,26 +107,35 @@ long getBytesRead() {
return this.bytesRead;
}
+ private long getEncodedCharLength(final char[] buf, final int offset, final int length) throws CharacterCodingException {
+ long len = 0;
+ int previous = lastChar;
+ for (int i = offset; i < offset + length; i++) {
+ len += getEncodedCharLength(previous, buf[i]);
+ previous = buf[i];
+ }
+ return len;
+ }
+
/**
- * Gets the byte length of the given character based on the original Unicode
- * specification, which defined characters as fixed-width 16-bit entities.
+ * Gets the byte length of the given character based on the original Unicode specification, which defined characters as fixed-width 16-bit entities.
*
* The Unicode characters are divided into two main ranges:
*
- * - U+0000 to U+FFFF (Basic Multilingual Plane, BMP):
- *
- * - Represented using a single 16-bit {@code char}.
- * - Includes UTF-8 encodings of 1-byte, 2-byte, and some 3-byte characters.
- *
- *
- * - U+10000 to U+10FFFF (Supplementary Characters):
- *
- * - Represented as a pair of {@code char}s:
- * - The first {@code char} is from the high-surrogates range (\uD800-\uDBFF).
- * - The second {@code char} is from the low-surrogates range (\uDC00-\uDFFF).
- * - Includes UTF-8 encodings of some 3-byte characters and all 4-byte characters.
- *
- *
+ * - U+0000 to U+FFFF (Basic Multilingual Plane, BMP):
+ *
+ * - Represented using a single 16-bit {@code char}.
+ * - Includes UTF-8 encodings of 1-byte, 2-byte, and some 3-byte characters.
+ *
+ *
+ * - U+10000 to U+10FFFF (Supplementary Characters):
+ *
+ * - Represented as a pair of {@code char}s:
+ * - The first {@code char} is from the high-surrogates range (\uD800-\uDBFF).
+ * - The second {@code char} is from the low-surrogates range (\uDC00-\uDFFF).
+ * - Includes UTF-8 encodings of some 3-byte characters and all 4-byte characters.
+ *
+ *
*
*
* @param current the current character to process.
@@ -132,8 +143,12 @@ long getBytesRead() {
* @throws CharacterCodingException if the character cannot be encoded.
*/
private int getEncodedCharLength(final int current) throws CharacterCodingException {
+ return getEncodedCharLength(lastChar, current);
+ }
+
+ private int getEncodedCharLength(final int previous, final int current) throws CharacterCodingException {
final char cChar = (char) current;
- final char lChar = (char) lastChar;
+ final char lChar = (char) previous;
if (!Character.isSurrogate(cChar)) {
return encoder.encode(CharBuffer.wrap(new char[] { cChar })).limit();
}
@@ -148,10 +163,9 @@ private int getEncodedCharLength(final int current) throws CharacterCodingExcept
}
/**
- * Returns the last character that was read as an integer (0 to 65535). This will be the last character returned by
- * any of the read methods. This will not include a character read using the {@link #peek()} method. If no
- * character has been read then this will return {@link Constants#UNDEFINED}. If the end of the stream was reached
- * on the last read then this will return {@link IOUtils#EOF}.
+ * Returns the last character that was read as an integer (0 to 65535). This will be the last character returned by any of the read methods. This will not
+ * include a character read using the {@link #peek()} method. If no character has been read then this will return {@link Constants#UNDEFINED}. If the end of
+ * the stream was reached on the last read then this will return {@link IOUtils#EOF}.
*
* @return the last character that was read
*/
@@ -193,11 +207,10 @@ public void mark(final int readAheadLimit) throws IOException {
@Override
public int read() throws IOException {
final int current = super.read();
- if (current == CR || current == LF && lastChar != CR ||
- current == EOF && lastChar != CR && lastChar != LF && lastChar != EOF) {
+ if (current == CR || current == LF && lastChar != CR || current == EOF && lastChar != CR && lastChar != LF && lastChar != EOF) {
lineNumber++;
}
- if (encoder != null) {
+ if (encoder != null && current != EOF) {
this.bytesRead += getEncodedCharLength(current);
}
lastChar = current;
@@ -211,6 +224,9 @@ public int read(final char[] buf, final int offset, final int length) throws IOE
return 0;
}
final int len = super.read(buf, offset, length);
+ if (encoder != null && len > 0) {
+ this.bytesRead += getEncodedCharLength(buf, offset, len);
+ }
if (len > 0) {
for (int i = offset; i < offset + len; i++) {
final char ch = buf[i];
@@ -231,8 +247,7 @@ public int read(final char[] buf, final int offset, final int length) throws IOE
}
/**
- * Gets the next line, dropping the line terminator(s). This method should only be called when processing a
- * comment, otherwise, information can be lost.
+ * Gets the next line, dropping the line terminator(s). This method should only be called when processing a comment, otherwise, information can be lost.
*
* Increments {@link #lineNumber} and updates {@link #position}.
*
@@ -272,5 +287,4 @@ public void reset() throws IOException {
bytesRead = bytesReadMark;
super.reset();
}
-
}
diff --git a/src/main/java/org/apache/commons/csv/Lexer.java b/src/main/java/org/apache/commons/csv/Lexer.java
index 3d00fe0bf..fe964480a 100644
--- a/src/main/java/org/apache/commons/csv/Lexer.java
+++ b/src/main/java/org/apache/commons/csv/Lexer.java
@@ -23,6 +23,7 @@
import java.io.Closeable;
import java.io.IOException;
+import java.util.Arrays;
import org.apache.commons.io.IOUtils;
@@ -68,8 +69,8 @@ final class Lexer implements Closeable {
/**
* Appends the next escaped character to the token's content.
*
- * @param token the current token
- * @throws IOException on stream access error
+ * @param token the current token.
+ * @throws IOException on stream access error.
* @throws CSVException Thrown on invalid input.
*/
private void appendNextEscapedCharacterToToken(final Token token) throws IOException {
@@ -89,7 +90,7 @@ private void appendNextEscapedCharacterToToken(final Token token) throws IOExcep
* Closes resources.
*
* @throws IOException
- * If an I/O error occurs
+ * If an I/O error occurs.
*/
@Override
public void close() throws IOException {
@@ -97,27 +98,27 @@ public void close() throws IOException {
}
/**
- * Gets the number of bytes read
+ * Gets the number of bytes read.
*
- * @return the number of bytes read
+ * @return the number of bytes read.
*/
long getBytesRead() {
return reader.getBytesRead();
}
/**
- * Returns the current character position
+ * Gets the current character position.
*
- * @return the current character position
+ * @return the current character position.
*/
long getCharacterPosition() {
return reader.getPosition();
}
/**
- * Returns the current line number
+ * Gets the current line number.
*
- * @return the current line number
+ * @return the current line number.
*/
long getCurrentLineNumber() {
return reader.getLineNumber();
@@ -136,7 +137,7 @@ boolean isCommentStart(final int ch) {
}
/**
- * Determine whether the next characters constitute a delimiter through {@link ExtendedBufferedReader#peek(char[])}.
+ * Tests whether the next characters constitute a delimiter through {@link ExtendedBufferedReader#peek(char[])}.
*
* @param ch
* the current character.
@@ -152,6 +153,7 @@ boolean isDelimiter(final int ch) throws IOException {
isLastTokenDelimiter = true;
return true;
}
+ Arrays.fill(delimiterBuf, '\0');
reader.peek(delimiterBuf);
for (int i = 0; i < delimiterBuf.length; i++) {
if (delimiterBuf[i] != delimiter[i + 1]) {
@@ -190,6 +192,7 @@ boolean isEscape(final int ch) {
* @throws IOException If an I/O error occurs.
*/
boolean isEscapeDelimiter() throws IOException {
+ Arrays.fill(escapeDelimiterBuf, '\0');
reader.peek(escapeDelimiterBuf);
if (escapeDelimiterBuf[0] != delimiter[0]) {
return false;
@@ -214,7 +217,7 @@ boolean isQuoteChar(final int ch) {
/**
* Tests if the current character represents the start of a line: a CR, LF, or is at the start of the file.
*
- * @param ch the character to check
+ * @param ch the character to check.
* @return true if the character is at the start of a line.
*/
boolean isStartOfLine(final int ch) {
@@ -274,15 +277,22 @@ Token nextToken(final Token token) throws IOException {
}
// Important: make sure a new char gets consumed in each iteration
while (token.type == Token.Type.INVALID) {
+ // isDelimiter consumes the trailing characters of a multi-character delimiter as a side effect, so it must
+ // only be evaluated once per character. Remember a match found while skipping whitespace below.
+ boolean delimiter = false;
// ignore whitespaces at beginning of a token
if (ignoreSurroundingSpaces) {
- while (Character.isWhitespace((char) c) && !isDelimiter(c) && !eol) {
+ while (Character.isWhitespace((char) c) && !eol) {
+ if (isDelimiter(c)) {
+ delimiter = true;
+ break;
+ }
c = reader.read();
eol = readEndOfLine(c);
}
}
// ok, start of token reached: encapsulated, or token
- if (isDelimiter(c)) {
+ if (delimiter || isDelimiter(c)) {
// empty token return TOKEN("")
token.type = Token.Type.TOKEN;
} else if (eol) {
@@ -400,10 +410,10 @@ private Token parseEncapsulatedToken(final Token token) throws IOException {
* An unescaped delimiter has been reached (TOKEN)
*
*
- * @param token the current token
- * @param ch the current character
- * @return the filled token
- * @throws IOException on stream access error
+ * @param token the current token.
+ * @param ch the current character.
+ * @return the filled token.
+ * @throws IOException on stream access error.
* @throws CSVException Thrown on invalid input.
*/
private Token parseSimpleToken(final Token token, final int ch) throws IOException {
@@ -442,7 +452,7 @@ private Token parseSimpleToken(final Token token, final int ch) throws IOExcepti
/**
* Greedily accepts \n, \r and \r\n This checker consumes silently the second control-character...
*
- * @return true if the given or next character is a line-terminator
+ * @return true if the given or next character is a line-terminator.
*/
boolean readEndOfLine(final int ch) throws IOException {
// check if we have \r\n...
diff --git a/src/test/java/org/apache/commons/csv/CSVFormatTest.java b/src/test/java/org/apache/commons/csv/CSVFormatTest.java
index 4d428b465..ed20898de 100644
--- a/src/test/java/org/apache/commons/csv/CSVFormatTest.java
+++ b/src/test/java/org/apache/commons/csv/CSVFormatTest.java
@@ -395,6 +395,14 @@ void testEqualsLeftNoQuoteRightQuote_Deprecated() {
assertNotEqualsFlip(left, right);
}
+ @Test
+ void testEqualsMaxRows() {
+ final CSVFormat right = CSVFormat.DEFAULT.builder().setMaxRows(10).get();
+ final CSVFormat left = CSVFormat.DEFAULT.builder().setMaxRows(1000).get();
+ assertNotEqualsFlip(right, left);
+ assertNotEquals(right.hashCode(), left.hashCode());
+ }
+
@Test
void testEqualsNoQuotes() {
final CSVFormat left = CSVFormat.newFormat(',').builder().setQuote(null).get();
@@ -958,6 +966,23 @@ void testPrintWithQuotes() throws IOException {
assertEquals("\"\"\"a,b,c\r\nx,y,z\"", out.toString());
}
+ /**
+ * Tests CSV-326.
+ */
+ @Test
+ void testPrintWithQuotesEscapeBeforeQuote() throws IOException {
+ final CSVFormat format = CSVFormat.DEFAULT.builder()
+ .setEscape('\\')
+ .setQuote('"')
+ .get();
+ final String value = "\\\"";
+ final Appendable out = new StringBuilder();
+ format.print(new StringReader(value), out, true);
+ try (CSVParser parser = CSVParser.parse(out.toString(), format)) {
+ assertEquals(value, parser.getRecords().get(0).get(0));
+ }
+ }
+
@Test
void testQuoteCharSameAsCommentStartThrowsException() {
assertThrows(IllegalArgumentException.class, () -> CSVFormat.DEFAULT.builder().setQuote('!').setCommentMarker('!').get());
@@ -993,6 +1018,35 @@ void testQuoteCharSameAsDelimiterThrowsException_Deprecated() {
assertThrows(IllegalArgumentException.class, () -> CSVFormat.DEFAULT.withQuote('!').withDelimiter('!'));
}
+ @Test
+ void testQuotedNullStringTracksQuoteCharacter() throws IOException {
+ final StringBuilder out = new StringBuilder();
+ // @formatter:off
+ final Builder builder = CSVFormat.DEFAULT.builder();
+ final CSVFormat format = builder
+ .setQuoteMode(QuoteMode.ALL)
+ .setNullString("NULL")
+ .get();
+ // @formatter:on
+ format.print(null, out, true);
+ assertEquals("\"NULL\"", out.toString());
+ // set
+ out.setLength(0);
+ builder.setQuote('\'');
+ builder.get().print(null, out, true);
+ assertEquals("'NULL'", out.toString());
+ // reset
+ out.setLength(0);
+ builder.setQuote((Character) null);
+ builder.get().print(null, out, true);
+ assertEquals("\"NULL\"", out.toString());
+ // reset, reverse setter order
+ out.setLength(0);
+ builder.setNullString(null).setQuote((Character) null).setNullString("NULL");
+ builder.get().print(null, out, true);
+ assertEquals("\"NULL\"", out.toString());
+ }
+
@Test
void testQuoteModeNoneShouldReturnMeaningfulExceptionMessage() {
final Exception exception = assertThrows(IllegalArgumentException.class, () ->
diff --git a/src/test/java/org/apache/commons/csv/CSVParserTest.java b/src/test/java/org/apache/commons/csv/CSVParserTest.java
index d9dd4e545..6d9bdd9e8 100644
--- a/src/test/java/org/apache/commons/csv/CSVParserTest.java
+++ b/src/test/java/org/apache/commons/csv/CSVParserTest.java
@@ -465,6 +465,31 @@ void testDuplicateHeadersNotAllowed() {
() -> CSVParser.parse("a,b,a\n1,2,3\nx,y,z", CSVFormat.DEFAULT.withHeader().withAllowDuplicateHeaderNames(false)));
}
+ /**
+ * With {@code ignoreSurroundingSpaces} enabled and a multi-character delimiter whose first character is whitespace,
+ * the empty field at the delimiter boundary must survive. The delimiter look-ahead is consumed while skipping
+ * leading whitespace, so re-evaluating it would drop the empty field and merge the following field's value.
+ */
+ @Test
+ void testEmptyFieldBeforeWhitespacePrefixedMultiCharacterDelimiter() throws IOException {
+ final CSVFormat format = CSVFormat.DEFAULT.builder().setDelimiter(" |").setIgnoreSurroundingSpaces(true).get();
+ try (CSVParser parser = CSVParser.parse(" |a", format)) {
+ final List records = parser.getRecords();
+ assertEquals(1, records.size());
+ assertValuesEquals(new String[] { "", "a" }, records.get(0));
+ }
+ try (CSVParser parser = CSVParser.parse("a | |b", format)) {
+ final List records = parser.getRecords();
+ assertEquals(1, records.size());
+ assertValuesEquals(new String[] { "a", "", "b" }, records.get(0));
+ }
+ try (CSVParser parser = CSVParser.parse("a | |b |", format)) {
+ final List records = parser.getRecords();
+ assertEquals(1, records.size());
+ assertValuesEquals(new String[] { "a", "", "b", "" }, records.get(0));
+ }
+ }
+
@Test
void testEmptyFile() throws Exception {
try (CSVParser parser = CSVParser.parse(Paths.get("src/test/resources/org/apache/commons/csv/empty.txt"), StandardCharsets.UTF_8,
@@ -648,6 +673,100 @@ void testForEach() throws Exception {
}
}
+ @Test
+ void testGetBytePositionMultiCharacterDelimiter() throws IOException {
+ final String code = "aa[|]bb\ncc[|]dd\n";
+ final CSVFormat format = CSVFormat.DEFAULT.builder().setDelimiter("[|]").get();
+ try (CSVParser parser = CSVParser.builder()
+ .setReader(new StringReader(code))
+ .setFormat(format)
+ .setCharset(StandardCharsets.UTF_8)
+ .setTrackBytes(true)
+ .get()) {
+ final Iterator it = parser.iterator();
+ final CSVRecord first = it.next();
+ final CSVRecord second = it.next();
+ assertEquals(0, first.getBytePosition());
+ assertEquals(8, second.getBytePosition());
+ }
+ }
+
+ /**
+ * Tests CSV-329.
+ */
+ @Test
+ void testGetBytePositionMultiCharacterDelimiterWithSupplementaryCharacter() throws IOException {
+ final String delimiter = "x😀";
+ final String code = "ax😀b\ncx😀d\n";
+ final CSVFormat format = CSVFormat.DEFAULT.builder().setDelimiter(delimiter).get();
+ try (CSVParser parser = CSVParser.builder()
+ .setReader(new StringReader(code))
+ .setFormat(format)
+ .setCharset(UTF_8)
+ .setTrackBytes(true)
+ .get()) {
+ final CSVRecord first = parser.nextRecord();
+ final CSVRecord second = parser.nextRecord();
+ assertNotNull(first);
+ assertNotNull(second);
+ assertValuesEquals(new String[] { "a", "b" }, first);
+ assertValuesEquals(new String[] { "c", "d" }, second);
+ assertEquals(0, first.getBytePosition());
+ assertEquals("ax😀b\n".getBytes(UTF_8).length, second.getBytePosition());
+ }
+ }
+
+ @Test
+ void testGetBytePositionWithCharacterOffsetAndMultiBytePrefix() throws Exception {
+ final String row0 = "é,x\n";
+ final Charset charset = UTF_8;
+ // row0 char count is 4
+ assertEquals(4, row0.length());
+ // row0 byte count is 5
+ final int record1ByteOffset = row0.getBytes(charset).length;
+ assertEquals(5, record1ByteOffset);
+ final String row1 = "b,c\n";
+ final String rows = row0 + row1;
+ final long record1CharOffset = row0.length();
+ final long expectedByteOffset = row0.getBytes(charset).length;
+ try (CSVParser parser = CSVParser.builder()
+ .setReader(new StringReader(row1))
+ .setFormat(CSVFormat.DEFAULT)
+ .setCharset(charset)
+ .setTrackBytes(true)
+ .setByteOffset(record1ByteOffset)
+ .setCharacterOffset(record1CharOffset)
+ .setRecordNumber(2) // not relevant but a better use case example.
+ .get()) {
+ final CSVRecord record = parser.nextRecord();
+ assertNotNull(record);
+ assertEquals(4, record.getCharacterPosition());
+ assertEquals(record1CharOffset, record.getCharacterPosition());
+ assertEquals(expectedByteOffset, record.getBytePosition());
+ }
+ }
+
+ @Test
+ void testGetBytePositionWithSingleByteCharset() throws IOException {
+ // A single-byte charset cannot encode U+FFFF, the char value of the EOF sentinel.
+ // Byte counting must skip the EOF read so a valid file parses without throwing.
+ final String code = "a,b\nc,d\n";
+ try (CSVParser parser = CSVParser.builder()
+ .setReader(new StringReader(code))
+ .setFormat(CSVFormat.DEFAULT)
+ .setCharset(StandardCharsets.ISO_8859_1)
+ .setTrackBytes(true)
+ .get()) {
+ final CSVRecord first = parser.nextRecord();
+ final CSVRecord second = parser.nextRecord();
+ assertNotNull(first);
+ assertNotNull(second);
+ assertNull(parser.nextRecord());
+ assertEquals(0, first.getBytePosition());
+ assertEquals(4, second.getBytePosition());
+ }
+ }
+
@Test
void testGetHeaderComment_HeaderComment1() throws IOException {
try (CSVParser parser = CSVParser.parse(CSV_INPUT_HEADER_COMMENT, FORMAT_AUTO_HEADER)) {
@@ -917,6 +1036,23 @@ void testGetRecordsMaxRows(final long maxRows) throws IOException {
}
}
+ /**
+ * Tests CSV-327.
+ */
+ @Test
+ void testGetRecordsMaxRowsWithRecordNumberOffset() throws IOException {
+ try (CSVParser parser = CSVParser.builder()
+ .setReader(new StringReader("a,b\nc,d\n"))
+ .setFormat(CSVFormat.DEFAULT.builder().setMaxRows(1).get())
+ .setRecordNumber(2)
+ .get()) {
+ final List records = parser.getRecords();
+ assertEquals(1, records.size());
+ assertEquals(2, records.get(0).getRecordNumber());
+ assertValuesEquals(new String[] { "a", "b" }, records.get(0));
+ }
+ }
+
@Test
void testGetRecordThreeBytesRead() throws Exception {
final String code = "id,date,val5,val4\n" +
@@ -1603,6 +1739,50 @@ void testParsingPrintedEmptyFirstColumn(final CSVFormat.Predefined format) throw
}
}
+ /**
+ * A truncated escaped multi-character delimiter at EOF must stay literal data and not be completed from a stale
+ * escape delimiter look-ahead.
+ */
+ @Test
+ void testPartialEscapedMultiCharacterDelimiterAtEOF() throws IOException {
+ final CSVFormat format = CSVFormat.DEFAULT.builder().setDelimiter("[|]").setEscape('!').get();
+ try (CSVParser parser = format.parse(new StringReader("x![!|!]y![!|"))) {
+ final CSVRecord record = parser.nextRecord();
+ assertEquals("x[|]y![!|", record.get(0));
+ assertEquals(1, record.size());
+ }
+ }
+
+ /**
+ * Tests CSV-324.
+ */
+ @Test
+ void testPartialMultiCharacterDelimiterAtEOF() throws IOException {
+ final CSVFormat format = CSVFormat.DEFAULT.builder().setDelimiter("[|]").get();
+ try (CSVParser parser = format.parse(new StringReader("a[|]b[|"))) {
+ final CSVRecord record = parser.nextRecord();
+ assertEquals("a", record.get(0));
+ assertEquals("b[|", record.get(1));
+ assertEquals(2, record.size());
+ }
+ }
+
+ /**
+ * A truncated multi-character delimiter at EOF must not be completed from the look-ahead buffer left dirty by an
+ * earlier non-matching peek in the same token.
+ */
+ @Test
+ void testPartialMultiCharacterDelimiterAtEOFAfterMismatch() throws IOException {
+ final CSVFormat format = CSVFormat.DEFAULT.builder().setDelimiter("[|]").get();
+ // The "[a]" peek leaves ']' in the look-ahead buffer; the trailing "[|" must not match "[|]".
+ final String recordString = "x[a][|";
+ try (CSVParser parser = format.parse(new StringReader(recordString))) {
+ final CSVRecord record = parser.nextRecord();
+ assertEquals(recordString, record.get(0));
+ assertEquals(1, record.size());
+ }
+ }
+
@Test
void testProvidedHeader() throws Exception {
final Reader in = new StringReader("a,b,c\n1,2,3\nx,y,z");
@@ -1794,6 +1974,23 @@ void testTrailingDelimiter() throws Exception {
}
}
+ @Test
+ void testTrailingDelimiterKeepsQuotedEmptyLastField() throws Exception {
+ final CSVFormat format = CSVFormat.DEFAULT.builder().setTrailingDelimiter(true).get();
+ try (CSVParser parser = CSVParser.parse("a,b,\"\"", format)) {
+ final CSVRecord record = parser.iterator().next();
+ assertEquals(3, record.size());
+ assertEquals("a", record.get(0));
+ assertEquals("b", record.get(1));
+ assertEquals("", record.get(2));
+ }
+ // An unquoted trailing delimiter still drops the empty field.
+ try (CSVParser parser = CSVParser.parse("a,b,", format)) {
+ final CSVRecord record = parser.iterator().next();
+ assertEquals(2, record.size());
+ }
+ }
+
@Test
void testTrim() throws Exception {
final Reader in = new StringReader("a,a,a\n\" 1 \",\" 2 \",\" 3 \"\nx,y,z");
diff --git a/src/test/java/org/apache/commons/csv/CSVPrinterTest.java b/src/test/java/org/apache/commons/csv/CSVPrinterTest.java
index 1ff791010..9ae80c1e5 100644
--- a/src/test/java/org/apache/commons/csv/CSVPrinterTest.java
+++ b/src/test/java/org/apache/commons/csv/CSVPrinterTest.java
@@ -569,6 +569,57 @@ void testEscapeBackslash5() throws IOException {
assertEquals("\\\\", sw.toString());
}
+ @Test
+ void testEscapeCommentMarkerFirstChar() throws IOException {
+ // No quoting available in escape mode, so a leading comment marker must be escaped or the
+ // record reads back as a comment and is dropped. Mirrors the quoting fix for QuoteMode.MINIMAL.
+ final CSVFormat format = CSVFormat.DEFAULT.builder().setQuote(null).setEscape('\\').setCommentMarker(';').get();
+ final StringWriter sw = new StringWriter();
+ final String col1 = ";comment-like";
+ try (CSVPrinter printer = new CSVPrinter(sw, format)) {
+ printer.printRecord(col1, "b");
+ printer.printRecord(new StringReader(col1), new StringReader("b"));
+ // The marker past the first character does not start a comment and is left alone.
+ printer.printRecord("a;b", ";c");
+ }
+ final String string = sw.toString();
+ assertEquals("\\;comment-like,b" + RECORD_SEPARATOR +
+ "\\;comment-like,b" + RECORD_SEPARATOR +
+ "a;b,\\;c" + RECORD_SEPARATOR, string);
+ // The emitted records must read back as the original values, none parsed as a comment.
+ try (CSVParser parser = CSVParser.parse(string, format)) {
+ final List records = parser.getRecords();
+ assertEquals(3, records.size());
+ assertEquals(col1, records.get(0).get(0));
+ assertEquals("b", records.get(0).get(1));
+ assertEquals(col1, records.get(1).get(0));
+ assertEquals("b", records.get(1).get(1));
+ assertEquals("a;b", records.get(2).get(0));
+ assertEquals(";c", records.get(2).get(1));
+ }
+ }
+
+ @Test
+ void testEscapeCommentMarkerFirstCharWithQuoteModeNone() throws IOException {
+ final CSVFormat format = CSVFormat.DEFAULT.builder().setEscape('\\').setQuoteMode(QuoteMode.NONE).setCommentMarker(';').get();
+ final StringWriter sw = new StringWriter();
+ final String col1 = ";bar";
+ try (CSVPrinter printer = new CSVPrinter(sw, format)) {
+ printer.printRecord(col1, "b");
+ printer.printRecord(new StringReader(col1), new StringReader("b"));
+ }
+ final String string = sw.toString();
+ assertEquals("\\;bar,b" + RECORD_SEPARATOR + "\\;bar,b" + RECORD_SEPARATOR, string);
+ try (CSVParser parser = CSVParser.parse(string, format)) {
+ final List records = parser.getRecords();
+ assertEquals(2, records.size());
+ for (final CSVRecord record : records) {
+ assertEquals(col1, record.get(0));
+ assertEquals("b", record.get(1));
+ }
+ }
+ }
+
@Test
void testEscapeNull1() throws IOException {
final StringWriter sw = new StringWriter();
@@ -1798,6 +1849,28 @@ void testQuoteAll() throws IOException {
}
}
+ @Test
+ void testQuoteCharEscapedWithQuoteModeNone() throws IOException {
+ final CSVFormat format = CSVFormat.DEFAULT.builder().setQuote('"').setEscape('?').setQuoteMode(QuoteMode.NONE).get();
+ final StringWriter sw = new StringWriter();
+ final String col1 = "\"abc";
+ final String col2 = "x\"y";
+ try (CSVPrinter printer = new CSVPrinter(sw, format)) {
+ printer.printRecord(col1, col2);
+ printer.printRecord(new StringReader(col1), new StringReader(col2));
+ }
+ assertEquals("?\"abc,x?\"y" + RECORD_SEPARATOR + "?\"abc,x?\"y" + RECORD_SEPARATOR, sw.toString());
+ // The emitted records must read back as the original values.
+ try (CSVParser parser = CSVParser.parse(sw.toString(), format)) {
+ final List records = parser.getRecords();
+ assertEquals(2, records.size());
+ for (final CSVRecord record : records) {
+ assertEquals(col1, record.get(0));
+ assertEquals(col2, record.get(1));
+ }
+ }
+ }
+
@Test
void testQuoteCommaFirstChar() throws IOException {
final StringWriter sw = new StringWriter();
@@ -1807,6 +1880,34 @@ void testQuoteCommaFirstChar() throws IOException {
}
}
+ @Test
+ void testQuoteCommentMarkerFirstChar() throws IOException {
+ final CSVFormat format = CSVFormat.DEFAULT.builder().setCommentMarker(';').get();
+ final StringWriter sw = new StringWriter();
+ final String col1 = ";comment-like";
+ try (CSVPrinter printer = new CSVPrinter(sw, format)) {
+ // A real comment is written with the marker, unquoted.
+ printer.printComment("a real comment");
+ // A value starting with the marker is quoted, so it does not read back as a comment.
+ printer.printRecord(col1, "b");
+ // The marker past the first character does not start a comment, so only the leading-marker value is quoted.
+ printer.printRecord("a;b", ";c");
+ }
+ final String string = sw.toString();
+ assertEquals("; a real comment" + RECORD_SEPARATOR +
+ "\";comment-like\",b" + RECORD_SEPARATOR +
+ "a;b,\";c\"" + RECORD_SEPARATOR, string);
+ // The comment is dropped on read; both data records survive intact.
+ try (CSVParser parser = CSVParser.parse(string, format)) {
+ final List records = parser.getRecords();
+ assertEquals(2, records.size());
+ assertEquals(col1, records.get(0).get(0));
+ assertEquals("b", records.get(0).get(1));
+ assertEquals("a;b", records.get(1).get(0));
+ assertEquals(";c", records.get(1).get(1));
+ }
+ }
+
@Test
void testQuoteNonNumeric() throws IOException {
final StringWriter sw = new StringWriter();
diff --git a/src/test/java/org/apache/commons/csv/ExtendedBufferedReaderTest.java b/src/test/java/org/apache/commons/csv/ExtendedBufferedReaderTest.java
index 056b8a9c9..b8d9b9f19 100644
--- a/src/test/java/org/apache/commons/csv/ExtendedBufferedReaderTest.java
+++ b/src/test/java/org/apache/commons/csv/ExtendedBufferedReaderTest.java
@@ -26,6 +26,7 @@
import static org.junit.jupiter.api.Assertions.assertNull;
import java.io.StringReader;
+import java.nio.charset.StandardCharsets;
import org.junit.jupiter.api.Test;
@@ -104,6 +105,19 @@ void testReadingInDifferentBuffer() throws Exception {
}
}
+ @Test
+ void testReadingSupplementaryCharacterTracksBytes() throws Exception {
+ final String input = "😀";
+ final char[] buffer = new char[input.length()];
+ try (ExtendedBufferedReader reader = new ExtendedBufferedReader(new StringReader(input), StandardCharsets.UTF_8, true)) {
+ assertEquals(input.length(), reader.read(buffer, 0, buffer.length));
+ assertArrayEquals(input.toCharArray(), buffer);
+ assertEquals(input.getBytes(StandardCharsets.UTF_8).length, reader.getBytesRead());
+ assertEquals(input.length(), reader.getPosition());
+ assertEquals(input.charAt(input.length() - 1), reader.getLastChar());
+ }
+ }
+
@Test
void testReadLine() throws Exception {
try (ExtendedBufferedReader br = createBufferedReader("")) {
diff --git a/src/test/java/org/apache/commons/csv/LexerTest.java b/src/test/java/org/apache/commons/csv/LexerTest.java
index e54e93365..a76f6e513 100644
--- a/src/test/java/org/apache/commons/csv/LexerTest.java
+++ b/src/test/java/org/apache/commons/csv/LexerTest.java
@@ -216,6 +216,25 @@ void testDelimiterIsWhitespace() throws IOException {
}
}
+ /**
+ * With {@code ignoreSurroundingSpaces} enabled and a multi-character delimiter whose first character is whitespace,
+ * the side-effecting {@link Lexer#isDelimiter(int)} must only be evaluated once per character, otherwise the
+ * delimiter is consumed in the whitespace-skip loop and the empty field at the boundary is dropped.
+ */
+ @Test
+ void testEmptyTokenBeforeWhitespacePrefixedMultiCharacterDelimiter() throws IOException {
+ final CSVFormat format = CSVFormat.DEFAULT.builder().setDelimiter(" |").setIgnoreSurroundingSpaces(true).get();
+ try (Lexer lexer = createLexer(" |a", format)) {
+ assertNextToken(TOKEN, "", lexer);
+ assertNextToken(EOF, "a", lexer);
+ }
+ try (Lexer lexer = createLexer("a | |b", format)) {
+ assertNextToken(TOKEN, "a", lexer);
+ assertNextToken(TOKEN, "", lexer);
+ assertNextToken(EOF, "b", lexer);
+ }
+ }
+
@Test
void testEOFWithoutClosingQuote() throws Exception {
final String code = "a,\"b";
@@ -409,6 +428,44 @@ void testNextToken6() throws IOException {
}
}
+ /**
+ * A truncated escaped multi-character delimiter at EOF must not be accepted by reusing the previous escape delimiter
+ * look-ahead in {@link Lexer#isEscapeDelimiter()}.
+ */
+ @Test
+ void testPartialEscapedMultiCharacterDelimiterAtEOF() throws IOException {
+ final CSVFormat format = CSVFormat.DEFAULT.builder().setDelimiter("[|]").setEscape('!').get();
+ try (Lexer lexer = createLexer("x![!|!]y![!|", format)) {
+ assertNextToken(EOF, "x[|]y![!|", lexer);
+ }
+ }
+
+ /**
+ * Tests CSV-324.
+ */
+ @Test
+ void testPartialMultiCharacterDelimiterAtEOF() throws IOException {
+ final CSVFormat format = CSVFormat.DEFAULT.builder().setDelimiter("[|]").get();
+ try (Lexer lexer = createLexer("a[|]b[|", format)) {
+ assertNextToken(TOKEN, "a", lexer);
+ assertNextToken(EOF, "b[|", lexer);
+ }
+ }
+
+ /**
+ * A truncated multi-character delimiter at EOF must not be accepted by reusing the look-ahead buffer left dirty by an
+ * earlier non-matching peek in the same token (CSV-324 only cleared the buffer once per token).
+ */
+ @Test
+ void testPartialMultiCharacterDelimiterAtEOFAfterMismatch() throws IOException {
+ final CSVFormat format = CSVFormat.DEFAULT.builder().setDelimiter("[|]").get();
+ // The "[a]" peek leaves ']' in the look-ahead buffer; the trailing "[|" must not match "[|]".
+ final String recordString = "x[a][|";
+ try (Lexer lexer = createLexer(recordString, format)) {
+ assertNextToken(EOF, recordString, lexer);
+ }
+ }
+
@Test
void testReadEscapeBackspace() throws IOException {
try (Lexer lexer = createLexer("b", CSVFormat.DEFAULT.withEscape('\b'))) {