Apache Commons CSV Release Notes

diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml index 6c4aa3f57..cca38e512 100644 --- a/.github/workflows/codeql-analysis.yml +++ b/.github/workflows/codeql-analysis.yml @@ -46,10 +46,10 @@ jobs: steps: - name: Checkout repository - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0 with: persist-credentials: false - - uses: actions/cache@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3 + - uses: actions/cache@55cc8345863c7cc4c66a329aec7e433d2d1c52a9 #v6.1.0 with: path: ~/.m2/repository key: ${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }} @@ -58,7 +58,7 @@ jobs: # Initializes the CodeQL tools for scanning. - name: Initialize CodeQL - uses: github/codeql-action/init@89a39a4e59826350b863aa6b6252a07ad50cf83e # v4.32.4 + uses: github/codeql-action/init@8aad20d150bbac5944a9f9d289da16a4b0d87c1e # v4.36.2 with: languages: ${{ matrix.language }} # If you wish to specify custom queries, you can do so here or in a config file. @@ -69,7 +69,7 @@ jobs: # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). # If this step fails, then you should remove it and run the build manually (see below) - name: Autobuild - uses: github/codeql-action/autobuild@89a39a4e59826350b863aa6b6252a07ad50cf83e # v4.32.4 + uses: github/codeql-action/autobuild@8aad20d150bbac5944a9f9d289da16a4b0d87c1e # v4.36.2 # ℹ️ Command-line programs to run using the OS shell. 
# 📚 https://git.io/JvXDl @@ -83,4 +83,4 @@ jobs: # make release - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@89a39a4e59826350b863aa6b6252a07ad50cf83e # v4.32.4 + uses: github/codeql-action/analyze@8aad20d150bbac5944a9f9d289da16a4b0d87c1e # v4.36.2 diff --git a/.github/workflows/dependency-review.yml b/.github/workflows/dependency-review.yml index a04da5090..7bc02bdd2 100644 --- a/.github/workflows/dependency-review.yml +++ b/.github/workflows/dependency-review.yml @@ -26,6 +26,6 @@ jobs: runs-on: ubuntu-latest steps: - name: 'Checkout Repository' - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0 - name: 'Dependency Review PR' - uses: actions/dependency-review-action@3c4e3dcb1aa7874d2c16be7d79418e9b7efd6261 # v4.8.2 + uses: actions/dependency-review-action@a1d282b36b6f3519aa1f3fc636f609c47dddb294 # v5.0.0 diff --git a/.github/workflows/maven.yml b/.github/workflows/maven.yml index 518219a1d..17ba7dd38 100644 --- a/.github/workflows/maven.yml +++ b/.github/workflows/maven.yml @@ -15,7 +15,11 @@ name: Java CI -on: [push, pull_request] +on: + push: + branches: + - 'master' + pull_request: {} permissions: contents: read @@ -30,26 +34,26 @@ jobs: fail-fast: false matrix: os: [ubuntu-latest, macos-latest] - java: [ 8, 11, 17, 21, 25 ] + java: [ 8, 11, 17, 21, 25, 26 ] experimental: [false] # Keep the same parameter order as the matrix above include: - os: ubuntu-latest - java: 26-ea + java: 27-ea experimental: true steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0 with: persist-credentials: false - - uses: actions/cache@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3 + - uses: actions/cache@55cc8345863c7cc4c66a329aec7e433d2d1c52a9 #v6.1.0 with: path: ~/.m2/repository key: ${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }} restore-keys: | 
${{ runner.os }}-maven- - name: Set up JDK ${{ matrix.java }} - uses: actions/setup-java@be666c2fcd27ec809703dec50e508c2fdc7f6654 # v5.2.0 + uses: actions/setup-java@1bcf9fb12cf4aa7d266a90ae39939e61372fe520 # v5.4.0 with: distribution: ${{ runner.os == 'macOS' && matrix.java == '8' && 'zulu' || 'temurin' }} java-version: ${{ matrix.java }} diff --git a/.github/workflows/scorecards-analysis.yml b/.github/workflows/scorecards-analysis.yml index f01ca3a11..e1868cb46 100644 --- a/.github/workflows/scorecards-analysis.yml +++ b/.github/workflows/scorecards-analysis.yml @@ -40,7 +40,7 @@ jobs: steps: - name: "Checkout code" - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # 6.0.1 + uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # 7.0.0 with: persist-credentials: false @@ -57,13 +57,13 @@ jobs: publish_results: true - name: "Upload artifact" - uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # 6.0.0 + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: name: SARIF file path: results.sarif retention-days: 5 - name: "Upload to code-scanning" - uses: github/codeql-action/upload-sarif@89a39a4e59826350b863aa6b6252a07ad50cf83e # v4.32.4 + uses: github/codeql-action/upload-sarif@8aad20d150bbac5944a9f9d289da16a4b0d87c1e # v4.36.2 with: sarif_file: results.sarif diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index eb15f2518..3423e18ad 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -48,13 +48,13 @@ Getting Started --------------- + Make sure you have a [JIRA account](https://issues.apache.org/jira/). -+ Make sure you have a [GitHub account](https://github.com/signup/free). This is not essential, but makes providing patches much easier. ++ Make sure you have a [GitHub account](https://github.com/signup). This is not essential, but makes providing patches much easier. 
+ If you're planning to implement a new feature it makes sense to discuss your changes on the [dev list](https://commons.apache.org/mail-lists.html) first. This way you can make sure you're not wasting your time on something that isn't considered to be in Apache Commons CSV's scope. + Submit a [Jira Ticket][jira] for your issue, assuming one does not already exist. + Clearly describe the issue including steps to reproduce when it is a bug. + Make sure you fill in the earliest version that you know has the issue. + Find the corresponding [repository on GitHub](https://github.com/apache/?query=commons-), -[fork](https://help.github.com/articles/fork-a-repo/) and check out your forked repository. If you don't have a GitHub account, you can still clone the Commons repository. +[fork](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/fork-a-repo) and check out your forked repository. If you don't have a GitHub account, you can still clone the Commons repository. 
Making Changes -------------- @@ -108,8 +108,8 @@ Additional Resources + [Contributing patches](https://commons.apache.org/patches.html) + [Apache Commons CSV JIRA project page][jira] + [Contributor License Agreement][cla] -+ [General GitHub documentation](https://help.github.com/) -+ [GitHub pull request documentation](https://help.github.com/articles/creating-a-pull-request/) ++ [General GitHub documentation](https://docs.github.com/) ++ [GitHub pull request documentation](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-a-pull-request) + [Apache Commons Twitter Account](https://twitter.com/ApacheCommons) [cla]:https://www.apache.org/licenses/#clas diff --git a/pom.xml b/pom.xml index 0ade7e0d4..8cb13ed7c 100644 --- a/pom.xml +++ b/pom.xml @@ -20,10 +20,10 @@ org.apache.commons commons-parent - 96 + 102 commons-csv - 1.14.2-SNAPSHOT + 1.15.0-SNAPSHOT Apache Commons CSV https://commons.apache.org/proper/commons-csv/ 2005 @@ -59,6 +59,7 @@ com.h2database h2 + 2.2.224 test @@ -89,12 +90,12 @@ - 1.14.2 + 1.15.0 (Java 8 or above) RC1 1.14.1 - 1.14.3 + 1.15.1 csv org.apache.commons.csv CSV @@ -108,8 +109,8 @@ false true 2025-07-30T14:51:35Z - 1.21.0 - 2.21.0 + 1.22.0 + 2.22.0 org.apache.commons.codec.binary;version="${commons.codec.version}", diff --git a/src/changes/changes.xml b/src/changes/changes.xml index 2e24024c8..93952e9f1 100644 --- a/src/changes/changes.xml +++ b/src/changes/changes.xml @@ -40,18 +40,38 @@ Apache Commons CSV Release Notes - + Remove Spotbugs dependency and use exclude-filter instead #564. Remove broken website link #577. Fix Apache RAT plugin console warnings. + [Javadoc] Clarify behavior of deprecated CSVFormat#withFirstRecordAsHeader() #2413. + CSVFormat.equals()/hashCode() ignores maxRows (#600). + ExtendedBufferedReader byte tracking leads to an incorrect CSVRecord.getBytePosition() (#601). 
+ CSVFormat.Builder.setQuote() does not refresh quotedNullString (#2447). + Lexer.isDelimiter() accepts a partial multi-character delimiter at EOF (#603). + CSVParser applies characterOffset to bytePosition (#604). + CSVPrinter Reader printing with quote and escape can emit CSV that its parser cannot read back. + CSVParser applies maxRows to record numbers instead of rows produced when setRecordNumber(...) is used. + CSVParser with trackBytes enabled throws on multi-character delimiters containing supplementary Unicode characters. + CSVFormat.Builder.setNullString(String) can build an invalid quoted null string after setQuote(null). + Escape Reader values with quote and escape (#606). + Clear escape delimiter buffer before peek in Lexer.isEscapeDelimiter() (#608, #611). + Escape quote char in printWithEscapes when QuoteMode is NONE (#609). + Quote value starting with comment marker in minimal quote mode (#610). + Escape leading comment marker in printWithEscapes (#614). + Skip byte counting at EOF in ExtendedBufferedReader.read (#615). + Keep quoted empty trailing field with trailingDelimiter (#616). + Evaluate isDelimiter once in nextToken whitespace skip (#618). + Add an "Android Compatibility" section to the web site. + Add CSVParser.Builder.setByteOffset(long) (#604). - Bump org.apache.commons:commons-parent from 85 to 96 #573, #595. + Bump org.apache.commons:commons-parent from 85 to 102 #573, #595. [test] Bump com.opencsv:opencsv from 5.11.2 to 5.12.0 #558. Bump org.apache.commons:commons-lang3 from 3.18.0 to 3.20.0. - Bump commons-codec:commons-codec from 1.19.0 to 1.21.0. - Bump commons-io:commons-io from 2.20.0 to 2.21.0 #594. + Bump commons-codec:commons-codec from 1.19.0 to 1.22.0. + Bump commons-io:commons-io from 2.20.0 to 2.22.0 #594. 
diff --git a/src/main/java/org/apache/commons/csv/CSVFormat.java b/src/main/java/org/apache/commons/csv/CSVFormat.java index 7251f83dc..7145d23d3 100644 --- a/src/main/java/org/apache/commons/csv/CSVFormat.java +++ b/src/main/java/org/apache/commons/csv/CSVFormat.java @@ -780,8 +780,7 @@ public Builder setMaxRows(final long maxRows) { */ public Builder setNullString(final String nullString) { this.nullString = nullString; - this.quotedNullString = quoteCharacter + nullString + quoteCharacter; - return this; + return setQuotedNullString(); } /** @@ -806,6 +805,12 @@ public Builder setQuote(final Character quoteCharacter) { throw new IllegalArgumentException("The quoteCharacter cannot be a line break"); } this.quoteCharacter = quoteCharacter; + return setQuotedNullString(); + } + + private Builder setQuotedNullString() { + final Character quote = quoteCharacter != null ? quoteCharacter : Constants.DOUBLE_QUOTE_CHAR; + this.quotedNullString = quote + nullString + quote; return this; } @@ -878,6 +883,16 @@ public Builder setTrailingData(final boolean trailingData) { /** * Sets whether to add a trailing delimiter. * + *

+ * When writing, a delimiter is appended after the last value of each record. When reading, the empty field + * that such a trailing delimiter produces is dropped so the output round-trips back to the original record; + * a quoted empty trailing field ({@code ""}) is a real value rather than a trailing delimiter and is kept. + *

+ *

+ * This is unrelated to {@link #setTrailingData(boolean) trailing data}, which controls whether characters + * after the closing quote of an encapsulated value are tolerated when reading. + *

+ * * @param trailingDelimiter whether to add a trailing delimiter. * @return This instance. */ @@ -900,7 +915,7 @@ public Builder setTrim(final boolean trim) { } /** - * Predefines formats. + * Enumerates predefined formats. * * @since 1.2 */ @@ -1477,7 +1492,7 @@ private static boolean isLineBreak(final char c) { * @return true if {@code c} is a line break character (and not null). */ private static boolean isLineBreak(final Character c) { - return c != null && isLineBreak(c.charValue()); // Explicit (un)boxing is intentional + return c != null && isLineBreak(c.charValue()); // Explicit unboxing is intentional } /** Same test as in as {@link String#trim()}. */ @@ -1690,15 +1705,15 @@ public boolean equals(final Object obj) { duplicateHeaderMode == other.duplicateHeaderMode && Objects.equals(escapeCharacter, other.escapeCharacter) && Arrays.equals(headerComments, other.headerComments) && Arrays.equals(headers, other.headers) && ignoreEmptyLines == other.ignoreEmptyLines && ignoreHeaderCase == other.ignoreHeaderCase && - ignoreSurroundingSpaces == other.ignoreSurroundingSpaces && lenientEof == other.lenientEof && - Objects.equals(nullString, other.nullString) && Objects.equals(quoteCharacter, other.quoteCharacter) && - quoteMode == other.quoteMode && Objects.equals(quotedNullString, other.quotedNullString) && - Objects.equals(recordSeparator, other.recordSeparator) && skipHeaderRecord == other.skipHeaderRecord && - trailingData == other.trailingData && trailingDelimiter == other.trailingDelimiter && trim == other.trim; + ignoreSurroundingSpaces == other.ignoreSurroundingSpaces && lenientEof == other.lenientEof && maxRows == other.maxRows && + Objects.equals(nullString, other.nullString) && Objects.equals(quoteCharacter, other.quoteCharacter) && quoteMode == other.quoteMode && + Objects.equals(quotedNullString, other.quotedNullString) && Objects.equals(recordSeparator, other.recordSeparator) && + skipHeaderRecord == other.skipHeaderRecord && trailingData == 
other.trailingData && trailingDelimiter == other.trailingDelimiter && + trim == other.trim; } private void escape(final char c, final Appendable appendable) throws IOException { - append(escapeCharacter.charValue(), appendable); // Explicit (un)boxing is intentional + append(escapeCharacter.charValue(), appendable); // Explicit unboxing is intentional append(c, appendable); } @@ -1836,7 +1851,7 @@ public DuplicateHeaderMode getDuplicateHeaderMode() { * @return the escape character, may be {@code 0} */ char getEscapeChar() { - return escapeCharacter != null ? escapeCharacter.charValue() : 0; // Explicit (un)boxing is intentional + return escapeCharacter != null ? escapeCharacter.charValue() : 0; // Explicit unboxing is intentional } /** @@ -2007,6 +2022,16 @@ public boolean getTrailingData() { /** * Gets whether to add a trailing delimiter. * + *

+ *

+ * This is unrelated to {@link #getTrailingData() trailing data}, which controls whether characters after the + * closing quote of an encapsulated value are tolerated when reading. + *

+ * * @return whether to add a trailing delimiter. * @since 1.3 */ @@ -2029,9 +2054,10 @@ public int hashCode() { int result = 1; result = prime * result + Arrays.hashCode(headerComments); result = prime * result + Arrays.hashCode(headers); - return prime * result + Objects.hash(allowMissingColumnNames, autoFlush, commentMarker, delimiter, duplicateHeaderMode, escapeCharacter, - ignoreEmptyLines, ignoreHeaderCase, ignoreSurroundingSpaces, lenientEof, nullString, quoteCharacter, quoteMode, quotedNullString, + result = prime * result + Objects.hash(allowMissingColumnNames, autoFlush, commentMarker, delimiter, duplicateHeaderMode, escapeCharacter, + ignoreEmptyLines, ignoreHeaderCase, ignoreSurroundingSpaces, lenientEof, maxRows, nullString, quoteCharacter, quoteMode, quotedNullString, recordSeparator, skipHeaderRecord, trailingData, trailingDelimiter, trim); + return result; } /** @@ -2158,7 +2184,7 @@ private void print(final InputStream inputStream, final Appendable out, final bo } final boolean quoteCharacterSet = isQuoteCharacterSet(); if (quoteCharacterSet) { - append(getQuoteCharacter().charValue(), out); // Explicit (un)boxing is intentional + append(getQuoteCharacter().charValue(), out); // Explicit unboxing is intentional } // Stream the input to the output without reading or holding the whole value in memory. // AppendableOutputStream cannot "close" an Appendable. 
@@ -2166,7 +2192,7 @@ private void print(final InputStream inputStream, final Appendable out, final bo IOUtils.copy(inputStream, outputStream); } if (quoteCharacterSet) { - append(getQuoteCharacter().charValue(), out); // Explicit (un)boxing is intentional + append(getQuoteCharacter().charValue(), out); // Explicit unboxing is intentional } } @@ -2321,12 +2347,18 @@ private void printWithEscapes(final CharSequence charSeq, final Appendable appen final char[] delimArray = getDelimiterCharArray(); final int delimLength = delimArray.length; final char escape = getEscapeChar(); + final boolean quoteSet = isQuoteCharacterSet(); + final char quote = quoteSet ? getQuoteCharacter().charValue() : 0; + final boolean commentMarkerSet = isCommentMarkerSet(); + final char commentChar = commentMarkerSet ? commentMarker.charValue() : 0; // Explicit unboxing is intentional while (pos < end) { char c = charSeq.charAt(pos); final boolean isDelimiterStart = isDelimiter(c, charSeq, pos, delimArray, delimLength); final boolean isCr = c == Constants.CR; final boolean isLf = c == Constants.LF; - if (isCr || isLf || c == escape || isDelimiterStart) { + // A leading comment marker would be read back as a comment, so escape it. + final boolean isComment = commentMarkerSet && pos == 0 && c == commentChar; + if (isCr || isLf || c == escape || quoteSet && c == quote || isDelimiterStart || isComment) { // write out segment up until this char if (pos > start) { appendable.append(charSeq, start, pos); @@ -2365,8 +2397,13 @@ private void printWithEscapes(final Reader reader, final Appendable appendable) final char[] delimArray = getDelimiterCharArray(); final int delimLength = delimArray.length; final char escape = getEscapeChar(); + final boolean quoteSet = isQuoteCharacterSet(); + final char quote = quoteSet ? getQuoteCharacter().charValue() : 0; + final boolean commentMarkerSet = isCommentMarkerSet(); + final char commentChar = commentMarkerSet ? 
commentMarker.charValue() : 0; // Explicit unboxing is intentional final StringBuilder builder = new StringBuilder(IOUtils.DEFAULT_BUFFER_SIZE); int c; + boolean firstChar = true; final char[] lookAheadBuffer = new char[delimLength - 1]; while (EOF != (c = bufferedReader.read())) { builder.append((char) c); @@ -2376,7 +2413,10 @@ private void printWithEscapes(final Reader reader, final Appendable appendable) final boolean isDelimiterStart = isDelimiter((char) c, test, pos, delimArray, delimLength); final boolean isCr = c == Constants.CR; final boolean isLf = c == Constants.LF; - if (isCr || isLf || c == escape || isDelimiterStart) { + // A leading comment marker would be read back as a comment, so escape it. + final boolean isComment = commentMarkerSet && firstChar && c == commentChar; + firstChar = false; + if (isCr || isLf || c == escape || quoteSet && c == quote || isDelimiterStart || isComment) { // write out segment up until this char if (pos > start) { append(builder.substring(start, pos), appendable); @@ -2415,7 +2455,7 @@ private void printWithQuotes(final Object object, final CharSequence charSeq, fi final int len = charSeq.length(); final char[] delim = getDelimiterCharArray(); final int delimLength = delim.length; - final char quoteChar = getQuoteCharacter().charValue(); // Explicit (un)boxing is intentional + final char quoteChar = getQuoteCharacter().charValue(); // Explicit unboxing is intentional // If escape char not specified, default to the quote char // This avoids having to keep checking whether there is an escape character // at the cost of checking against quote twice @@ -2447,10 +2487,11 @@ private void printWithQuotes(final Object object, final CharSequence charSeq, fi } } else { char c = charSeq.charAt(pos); - if (c <= Constants.COMMENT) { + if (c <= Constants.COMMENT || isCommentMarkerSet() && c == commentMarker.charValue()) { // Some other chars at the start of a value caused the parser to fail, so for now // encapsulate if we start in 
anything less than '#'. We are being conservative - // by including the default comment char too. + // by including the default comment char and any configured comment marker too, + // which the parser would otherwise read back as a comment line. quote = true; } else { while (pos < len) { @@ -2518,15 +2559,16 @@ private void printWithQuotes(final Reader reader, final Appendable appendable) t printWithEscapes(reader, appendable); return; } - final char quote = getQuoteCharacter().charValue(); // Explicit (un)boxing is intentional + final char quote = getQuoteCharacter().charValue(); // Explicit unboxing is intentional + final char escape = isEscapeCharacterSet() ? getEscapeChar() : quote; // (1) Append opening quote append(quote, appendable); - // (2) Append Reader contents, doubling quotes + // (2) Append Reader contents, doubling quotes and escape characters int c; while (EOF != (c = reader.read())) { append((char) c, appendable); - if (c == quote) { - append(quote, appendable); + if (c == quote || c == escape) { + append((char) c, appendable); } } // (3) Append closing quote @@ -2604,13 +2646,13 @@ boolean useRow(final long rowNum) { * @throws IllegalArgumentException Throw when any attribute is invalid or inconsistent with other attributes. 
*/ private void validate() throws IllegalArgumentException { - if (quoteCharacter != null && contains(delimiter, quoteCharacter.charValue())) { // Explicit (un)boxing is intentional + if (quoteCharacter != null && contains(delimiter, quoteCharacter.charValue())) { // Explicit unboxing is intentional throw new IllegalArgumentException("The quoteChar character and the delimiter cannot be the same ('" + quoteCharacter + "')"); } - if (escapeCharacter != null && contains(delimiter, escapeCharacter.charValue())) { // Explicit (un)boxing is intentional + if (escapeCharacter != null && contains(delimiter, escapeCharacter.charValue())) { // Explicit unboxing is intentional throw new IllegalArgumentException("The escape character and the delimiter cannot be the same ('" + escapeCharacter + "')"); } - if (commentMarker != null && contains(delimiter, commentMarker.charValue())) { // Explicit (un)boxing is intentional + if (commentMarker != null && contains(delimiter, commentMarker.charValue())) { // Explicit unboxing is intentional throw new IllegalArgumentException("The comment start character and the delimiter cannot be the same ('" + commentMarker + "')"); } if (quoteCharacter != null && quoteCharacter.equals(commentMarker)) { @@ -2788,6 +2830,9 @@ public CSVFormat withEscape(final Character escape) { * .get(); * * + *

Any previously set headers are reset to empty. + * The resulting format will have {@code skipHeaderRecord = true}.

+ * * @return A new CSVFormat that is equal to this but using the first record as header. * @see Builder#setSkipHeaderRecord(boolean) * @see Builder#setHeader(String...) diff --git a/src/main/java/org/apache/commons/csv/CSVParser.java b/src/main/java/org/apache/commons/csv/CSVParser.java index bce62ea54..141eba732 100644 --- a/src/main/java/org/apache/commons/csv/CSVParser.java +++ b/src/main/java/org/apache/commons/csv/CSVParser.java @@ -154,6 +154,7 @@ public final class CSVParser implements Iterable, Closeable { public static class Builder extends AbstractStreamBuilder { private CSVFormat format; + private long byteOffset = -1; private long characterOffset; private long recordNumber = 1; private boolean trackBytes; @@ -165,17 +166,33 @@ protected Builder() { // empty } - @SuppressWarnings("resource") @Override public CSVParser get() throws IOException { - return new CSVParser(getReader(), format != null ? format : CSVFormat.DEFAULT, characterOffset, recordNumber, getCharset(), trackBytes); + return new CSVParser(this); } /** - * Sets the lexer offset when the parser does not start parsing at the beginning of the source. + * Sets the lexer byte offset when the parser does not start parsing at the beginning of the source. + *

+ * By default, the value is {@code -1}, which reuses the character offset for the byte offset. + *

* - * @param characterOffset the lexer offset. + * @param byteOffset the lexer byte offset. * @return {@code this} instance. + * @see #setCharacterOffset(long) + * @since 1.15.0 + */ + public Builder setByteOffset(final long byteOffset) { + this.byteOffset = byteOffset; + return asThis(); + } + + /** + * Sets the lexer character offset when the parser does not start parsing at the beginning of the source. + * + * @param characterOffset the lexer character offset. + * @return {@code this} instance. + * @see #setByteOffset(long) */ public Builder setCharacterOffset(final long characterOffset) { this.characterOffset = characterOffset; @@ -220,6 +237,7 @@ public Builder setTrackBytes(final boolean trackBytes) { final class CSVRecordIterator implements Iterator { private CSVRecord current; + private long recordCount; /** * Gets the next record or null at the end of stream or max rows read. @@ -230,8 +248,11 @@ final class CSVRecordIterator implements Iterator { */ private CSVRecord getNextRecord() { CSVRecord record = null; - if (format.useRow(recordNumber + 1)) { + if (format.useRow(recordCount + 1)) { record = Uncheck.get(CSVParser.this::nextRecord); + if (record != null) { + recordCount++; + } } return record; } @@ -466,6 +487,12 @@ public static CSVParser parse(final URL url, final Charset charset, final CSVFor */ private long recordNumber; + /** + * Lexer offset when the parser does not start parsing at the beginning of the source. Usually used in combination + * with {@link #recordNumber}. + */ + private final long byteOffset; + /** * Lexer offset when the parser does not start parsing at the beginning of the source. Usually used in combination * with {@link #recordNumber}. @@ -474,6 +501,23 @@ public static CSVParser parse(final URL url, final Charset charset, final CSVFor private final Token reusableToken = new Token(); + /** + * Constructs a new instance from a builder. + * + * @param builder The source builder. + * @throws IOException if an I/O error occurs. 
+ */ + @SuppressWarnings("resource") // Lexer manages ExtendedBufferedReader. + private CSVParser(final Builder builder) throws IOException { + this.format = (builder.format != null ? builder.format : CSVFormat.DEFAULT).copy(); + this.lexer = new Lexer(format, new ExtendedBufferedReader(builder.getReader(), builder.getCharset(), builder.trackBytes)); + this.csvRecordIterator = new CSVRecordIterator(); + this.headers = createHeaders(); + this.byteOffset = builder.byteOffset != -1 ? builder.byteOffset : builder.characterOffset; + this.characterOffset = builder.characterOffset; + this.recordNumber = builder.recordNumber - 1; + } + /** * Constructs a new instance using the given {@link CSVFormat}. * @@ -524,51 +568,21 @@ public CSVParser(final Reader reader, final CSVFormat format) throws IOException */ @Deprecated public CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber) throws IOException { - this(reader, format, characterOffset, recordNumber, null, false); - } - - /** - * Constructs a new instance using the given {@link CSVFormat}. - * - *

- * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser, - * unless you close the {@code reader}. - *

- * - * @param reader - * a Reader containing CSV-formatted input. Must not be null. - * @param format - * the CSVFormat used for CSV parsing. Must not be null. - * @param characterOffset - * Lexer offset when the parser does not start parsing at the beginning of the source. - * @param recordNumber - * The next record number to assign. - * @param charset - * The character encoding to be used for the reader when enableByteTracking is true. - * @param trackBytes - * {@code true} to enable byte tracking for the parser; {@code false} to disable it. - * @throws IllegalArgumentException - * If the parameters of the format are inconsistent or if either the reader or format is null. - * @throws IOException - * If there is a problem reading the header or skipping the first record. - * @throws CSVException Thrown on invalid CSV input data. - */ - @SuppressWarnings("resource") // reader is managed by lexer. - private CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber, final Charset charset, - final boolean trackBytes) throws IOException { - Objects.requireNonNull(reader, "reader"); - Objects.requireNonNull(format, "format"); - this.format = format.copy(); - this.lexer = new Lexer(format, new ExtendedBufferedReader(reader, charset, trackBytes)); - this.csvRecordIterator = new CSVRecordIterator(); - this.headers = createHeaders(); - this.characterOffset = characterOffset; - this.recordNumber = recordNumber - 1; + // @formatter:off + this(builder() + .setReader(reader) + .setFormat(Objects.requireNonNull(format, "format")) // requireNonNull for full compatibility + .setCharacterOffset(characterOffset) + .setRecordNumber(recordNumber) + .setCharset((Charset) null).setTrackBytes(false)); + // @formatter:on } private void addRecordValue(final boolean lastRecord) { final String input = format.trim(reusableToken.content.toString()); - if (lastRecord && input.isEmpty() && format.getTrailingDelimiter()) { + // Only drop the empty 
field produced by an actual trailing delimiter. A quoted empty + // field ("") is a real value, not a trailing delimiter, so it must be kept. + if (lastRecord && input.isEmpty() && format.getTrailingDelimiter() && !reusableToken.isQuoted) { return; } recordList.add(handleNull(input)); @@ -642,7 +656,7 @@ private Headers createHeaders() throws IOException { } observedMissing |= blankHeader; if (header != null) { - headerMap.put(header, Integer.valueOf(i)); // Explicit (un)boxing is intentional + headerMap.put(header, Integer.valueOf(i)); // Explicit boxing is intentional if (headerNames == null) { headerNames = new ArrayList<>(headerRecord.length); } @@ -887,7 +901,7 @@ CSVRecord nextRecord() throws IOException { recordList.clear(); StringBuilder sb = null; final long startCharPosition = lexer.getCharacterPosition() + characterOffset; - final long startBytePosition = lexer.getBytesRead() + characterOffset; + final long startBytePosition = lexer.getBytesRead() + byteOffset; do { reusableToken.reset(); lexer.nextToken(reusableToken); diff --git a/src/main/java/org/apache/commons/csv/CSVPrinter.java b/src/main/java/org/apache/commons/csv/CSVPrinter.java index 087129ec5..a7048fd62 100644 --- a/src/main/java/org/apache/commons/csv/CSVPrinter.java +++ b/src/main/java/org/apache/commons/csv/CSVPrinter.java @@ -235,7 +235,7 @@ public void printComment(final String comment) throws IOException { if (!newRecord) { println(); } - appendable.append(format.getCommentMarker().charValue()); // Explicit (un)boxing is intentional + appendable.append(format.getCommentMarker().charValue()); // Explicit unboxing is intentional appendable.append(SP); for (int i = 0; i < comment.length(); i++) { final char c = comment.charAt(i); @@ -247,7 +247,7 @@ public void printComment(final String comment) throws IOException { // falls-through: break intentionally excluded. 
case LF: println(); - appendable.append(format.getCommentMarker().charValue()); // Explicit (un)boxing is intentional + appendable.append(format.getCommentMarker().charValue()); // Explicit unboxing is intentional appendable.append(SP); break; default: diff --git a/src/main/java/org/apache/commons/csv/CSVRecord.java b/src/main/java/org/apache/commons/csv/CSVRecord.java index f619717d0..8dab14d90 100644 --- a/src/main/java/org/apache/commons/csv/CSVRecord.java +++ b/src/main/java/org/apache/commons/csv/CSVRecord.java @@ -132,13 +132,11 @@ public String get(final String name) { throw new IllegalArgumentException(String.format("Mapping for %s not found, expected one of %s", name, headerMap.keySet())); } try { - return values[index.intValue()]; // Explicit (un)boxing is intentional + return values[index.intValue()]; // Explicit unboxing is intentional } catch (final ArrayIndexOutOfBoundsException e) { + // Explicit boxing is intentional throw new IllegalArgumentException( - String.format("Index for header '%s' is %d but CSVRecord only has %d values!", name, index, Integer.valueOf(values.length))); // Explicit - // (un)boxing - // is - // intentional + String.format("Index for header '%s' is %d but CSVRecord only has %d values!", name, index, Integer.valueOf(values.length))); } } @@ -267,7 +265,7 @@ public boolean isSet(final int index) { * @return whether a given column is mapped and has a value. */ public boolean isSet(final String name) { - return isMapped(name) && getHeaderMapRaw().get(name).intValue() < values.length; // Explicit (un)boxing is intentional + return isMapped(name) && getHeaderMapRaw().get(name).intValue() < values.length; // Explicit unboxing is intentional } /** @@ -283,7 +281,7 @@ public Iterator iterator() { /** * Puts all values of this record into the given Map. * - * @param the map type. + * @param The map type. * @param map The Map to populate. * @return the given map. 
* @since 1.9.0 diff --git a/src/main/java/org/apache/commons/csv/Constants.java b/src/main/java/org/apache/commons/csv/Constants.java index 0b9476e1c..9dd276ecc 100644 --- a/src/main/java/org/apache/commons/csv/Constants.java +++ b/src/main/java/org/apache/commons/csv/Constants.java @@ -40,7 +40,7 @@ final class Constants { /** RFC 4180 defines line breaks as CRLF. */ static final String CRLF = "\r\n"; - static final Character DOUBLE_QUOTE_CHAR = Character.valueOf('"'); // Explicit (un)boxing is intentional. + static final Character DOUBLE_QUOTE_CHAR = Character.valueOf('"'); // Explicit boxing is intentional. static final String EMPTY = ""; diff --git a/src/main/java/org/apache/commons/csv/DuplicateHeaderMode.java b/src/main/java/org/apache/commons/csv/DuplicateHeaderMode.java index 01989d664..8087f16ee 100644 --- a/src/main/java/org/apache/commons/csv/DuplicateHeaderMode.java +++ b/src/main/java/org/apache/commons/csv/DuplicateHeaderMode.java @@ -20,7 +20,7 @@ package org.apache.commons.csv; /** - * Determines how duplicate header fields should be handled + * Enumerates how duplicate header fields should be handled * if {@link CSVFormat.Builder#setHeader(Class)} is not null. * * @since 1.10.0 diff --git a/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java b/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java index 8dcda6517..20c1ef544 100644 --- a/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java +++ b/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java @@ -37,26 +37,30 @@ /** * A special buffered reader which supports sophisticated read access. *

- * In particular the reader supports a look-ahead option, which allows you to see the next char returned by - * {@link #read()}. This reader also tracks how many characters have been read with {@link #getPosition()}. + * In particular the reader supports a look-ahead option, which allows you to see the next char returned by {@link #read()}. This reader also tracks how many + * characters have been read with {@link #getPosition()}. *

*/ final class ExtendedBufferedReader extends UnsynchronizedBufferedReader { /** The last char returned */ private int lastChar = UNDEFINED; + private int lastCharMark = UNDEFINED; /** The count of EOLs (CR/LF/CRLF) seen so far */ private long lineNumber; + private long lineNumberMark; /** The position, which is the number of characters read so far */ private long position; + private long positionMark; /** The number of bytes read so far. */ private long bytesRead; + private long bytesReadMark; /** Encoder for calculating the number of bytes for each character read. */ @@ -70,12 +74,11 @@ final class ExtendedBufferedReader extends UnsynchronizedBufferedReader { } /** - * Constructs a new instance with the specified reader, character set, - * and byte tracking option. Initializes an encoder if byte tracking is enabled - * and a character set is provided. + * Constructs a new instance with the specified reader, character set, and byte tracking option. Initializes an encoder if byte tracking is enabled and a + * character set is provided. * - * @param reader the reader supports a look-ahead option. - * @param charset the character set for encoding, or {@code null} if not applicable. + * @param reader the reader supports a look-ahead option. + * @param charset the character set for encoding, or {@code null} if not applicable. * @param trackBytes {@code true} to enable byte tracking; {@code false} to disable it. */ ExtendedBufferedReader(final Reader reader, final Charset charset, final boolean trackBytes) { @@ -86,8 +89,7 @@ final class ExtendedBufferedReader extends UnsynchronizedBufferedReader { /** * Closes the stream. 
* - * @throws IOException - * If an I/O error occurs + * @throws IOException If an I/O error occurs */ @Override public void close() throws IOException { @@ -105,26 +107,35 @@ long getBytesRead() { return this.bytesRead; } + private long getEncodedCharLength(final char[] buf, final int offset, final int length) throws CharacterCodingException { + long len = 0; + int previous = lastChar; + for (int i = offset; i < offset + length; i++) { + len += getEncodedCharLength(previous, buf[i]); + previous = buf[i]; + } + return len; + } + /** - * Gets the byte length of the given character based on the original Unicode - * specification, which defined characters as fixed-width 16-bit entities. + * Gets the byte length of the given character based on the original Unicode specification, which defined characters as fixed-width 16-bit entities. *

* The Unicode characters are divided into two main ranges: *

U+0000 to U+FFFF (Basic Multilingual Plane, BMP): - *
- Represented using a single 16-bit {@code char}.
- Includes UTF-8 encodings of 1-byte, 2-byte, and some 3-byte characters.
- *
U+10000 to U+10FFFF (Supplementary Characters): - *
- Represented as a pair of {@code char}s:
- The first {@code char} is from the high-surrogates range (\uD800-\uDBFF).
- The second {@code char} is from the low-surrogates range (\uDC00-\uDFFF).
- Includes UTF-8 encodings of some 3-byte characters and all 4-byte characters.
- *
U+0000 to U+FFFF (Basic Multilingual Plane, BMP): + *
- Represented using a single 16-bit {@code char}.
- Includes UTF-8 encodings of 1-byte, 2-byte, and some 3-byte characters.
+ *
U+10000 to U+10FFFF (Supplementary Characters): + *
- Represented as a pair of {@code char}s:
- The first {@code char} is from the high-surrogates range (\uD800-\uDBFF).
- The second {@code char} is from the low-surrogates range (\uDC00-\uDFFF).
- Includes UTF-8 encodings of some 3-byte characters and all 4-byte characters.
+ *

* * @param current the current character to process. @@ -132,8 +143,12 @@ long getBytesRead() { * @throws CharacterCodingException if the character cannot be encoded. */ private int getEncodedCharLength(final int current) throws CharacterCodingException { + return getEncodedCharLength(lastChar, current); + } + + private int getEncodedCharLength(final int previous, final int current) throws CharacterCodingException { final char cChar = (char) current; - final char lChar = (char) lastChar; + final char lChar = (char) previous; if (!Character.isSurrogate(cChar)) { return encoder.encode(CharBuffer.wrap(new char[] { cChar })).limit(); } @@ -148,10 +163,9 @@ private int getEncodedCharLength(final int current) throws CharacterCodingExcept } /** - * Returns the last character that was read as an integer (0 to 65535). This will be the last character returned by - * any of the read methods. This will not include a character read using the {@link #peek()} method. If no - * character has been read then this will return {@link Constants#UNDEFINED}. If the end of the stream was reached - * on the last read then this will return {@link IOUtils#EOF}. + * Returns the last character that was read as an integer (0 to 65535). This will be the last character returned by any of the read methods. This will not + * include a character read using the {@link #peek()} method. If no character has been read then this will return {@link Constants#UNDEFINED}. If the end of + * the stream was reached on the last read then this will return {@link IOUtils#EOF}. 
* * @return the last character that was read */ @@ -193,11 +207,10 @@ public void mark(final int readAheadLimit) throws IOException { @Override public int read() throws IOException { final int current = super.read(); - if (current == CR || current == LF && lastChar != CR || - current == EOF && lastChar != CR && lastChar != LF && lastChar != EOF) { + if (current == CR || current == LF && lastChar != CR || current == EOF && lastChar != CR && lastChar != LF && lastChar != EOF) { lineNumber++; } - if (encoder != null) { + if (encoder != null && current != EOF) { this.bytesRead += getEncodedCharLength(current); } lastChar = current; @@ -211,6 +224,9 @@ public int read(final char[] buf, final int offset, final int length) throws IOE return 0; } final int len = super.read(buf, offset, length); + if (encoder != null && len > 0) { + this.bytesRead += getEncodedCharLength(buf, offset, len); + } if (len > 0) { for (int i = offset; i < offset + len; i++) { final char ch = buf[i]; @@ -231,8 +247,7 @@ public int read(final char[] buf, final int offset, final int length) throws IOE } /** - * Gets the next line, dropping the line terminator(s). This method should only be called when processing a - * comment, otherwise, information can be lost. + * Gets the next line, dropping the line terminator(s). This method should only be called when processing a comment, otherwise, information can be lost. *

* Increments {@link #lineNumber} and updates {@link #position}. *

@@ -272,5 +287,4 @@ public void reset() throws IOException { bytesRead = bytesReadMark; super.reset(); } - } diff --git a/src/main/java/org/apache/commons/csv/Lexer.java b/src/main/java/org/apache/commons/csv/Lexer.java index 3d00fe0bf..fe964480a 100644 --- a/src/main/java/org/apache/commons/csv/Lexer.java +++ b/src/main/java/org/apache/commons/csv/Lexer.java @@ -23,6 +23,7 @@ import java.io.Closeable; import java.io.IOException; +import java.util.Arrays; import org.apache.commons.io.IOUtils; @@ -68,8 +69,8 @@ final class Lexer implements Closeable { /** * Appends the next escaped character to the token's content. * - * @param token the current token - * @throws IOException on stream access error + * @param token the current token. + * @throws IOException on stream access error. * @throws CSVException Thrown on invalid input. */ private void appendNextEscapedCharacterToToken(final Token token) throws IOException { @@ -89,7 +90,7 @@ private void appendNextEscapedCharacterToToken(final Token token) throws IOExcep * Closes resources. * * @throws IOException - * If an I/O error occurs + * If an I/O error occurs. */ @Override public void close() throws IOException { @@ -97,27 +98,27 @@ public void close() throws IOException { } /** - * Gets the number of bytes read + * Gets the number of bytes read. * - * @return the number of bytes read + * @return the number of bytes read. */ long getBytesRead() { return reader.getBytesRead(); } /** - * Returns the current character position + * Gets the current character position. * - * @return the current character position + * @return the current character position. */ long getCharacterPosition() { return reader.getPosition(); } /** - * Returns the current line number + * Gets the current line number. * - * @return the current line number + * @return the current line number. 
*/ long getCurrentLineNumber() { return reader.getLineNumber(); @@ -136,7 +137,7 @@ boolean isCommentStart(final int ch) { } /** - * Determine whether the next characters constitute a delimiter through {@link ExtendedBufferedReader#peek(char[])}. + * Tests whether the next characters constitute a delimiter through {@link ExtendedBufferedReader#peek(char[])}. * * @param ch * the current character. @@ -152,6 +153,7 @@ boolean isDelimiter(final int ch) throws IOException { isLastTokenDelimiter = true; return true; } + Arrays.fill(delimiterBuf, '\0'); reader.peek(delimiterBuf); for (int i = 0; i < delimiterBuf.length; i++) { if (delimiterBuf[i] != delimiter[i + 1]) { @@ -190,6 +192,7 @@ boolean isEscape(final int ch) { * @throws IOException If an I/O error occurs. */ boolean isEscapeDelimiter() throws IOException { + Arrays.fill(escapeDelimiterBuf, '\0'); reader.peek(escapeDelimiterBuf); if (escapeDelimiterBuf[0] != delimiter[0]) { return false; @@ -214,7 +217,7 @@ boolean isQuoteChar(final int ch) { /** * Tests if the current character represents the start of a line: a CR, LF, or is at the start of the file. * - * @param ch the character to check + * @param ch the character to check. * @return true if the character is at the start of a line. */ boolean isStartOfLine(final int ch) { @@ -274,15 +277,22 @@ Token nextToken(final Token token) throws IOException { } // Important: make sure a new char gets consumed in each iteration while (token.type == Token.Type.INVALID) { + // isDelimiter consumes the trailing characters of a multi-character delimiter as a side effect, so it must + // only be evaluated once per character. Remember a match found while skipping whitespace below. 
+ boolean delimiter = false; // ignore whitespaces at beginning of a token if (ignoreSurroundingSpaces) { - while (Character.isWhitespace((char) c) && !isDelimiter(c) && !eol) { + while (Character.isWhitespace((char) c) && !eol) { + if (isDelimiter(c)) { + delimiter = true; + break; + } c = reader.read(); eol = readEndOfLine(c); } } // ok, start of token reached: encapsulated, or token - if (isDelimiter(c)) { + if (delimiter || isDelimiter(c)) { // empty token return TOKEN("") token.type = Token.Type.TOKEN; } else if (eol) { @@ -400,10 +410,10 @@ private Token parseEncapsulatedToken(final Token token) throws IOException { *

An unescaped delimiter has been reached (TOKEN)

* * - * @param token the current token - * @param ch the current character - * @return the filled token - * @throws IOException on stream access error + * @param token the current token. + * @param ch the current character. + * @return the filled token. + * @throws IOException on stream access error. * @throws CSVException Thrown on invalid input. */ private Token parseSimpleToken(final Token token, final int ch) throws IOException { @@ -442,7 +452,7 @@ private Token parseSimpleToken(final Token token, final int ch) throws IOExcepti /** * Greedily accepts \n, \r and \r\n This checker consumes silently the second control-character... * - * @return true if the given or next character is a line-terminator + * @return true if the given or next character is a line-terminator. */ boolean readEndOfLine(final int ch) throws IOException { // check if we have \r\n... diff --git a/src/main/java/org/apache/commons/csv/QuoteMode.java b/src/main/java/org/apache/commons/csv/QuoteMode.java index 79bb1b34e..ae64ab486 100644 --- a/src/main/java/org/apache/commons/csv/QuoteMode.java +++ b/src/main/java/org/apache/commons/csv/QuoteMode.java @@ -19,7 +19,7 @@ package org.apache.commons.csv; /** - * Defines quoting behavior. + * Enumerates quoting behavior. 
* * @see CSVFormat.Builder#setQuoteMode(QuoteMode) */ diff --git a/src/media/commons-logo-component-100.xcf b/src/media/commons-logo-component-100.xcf new file mode 100644 index 000000000..77d92f277 Binary files /dev/null and b/src/media/commons-logo-component-100.xcf differ diff --git a/src/media/commons-logo-component.xcf b/src/media/commons-logo-component.xcf new file mode 100644 index 000000000..3670221da Binary files /dev/null and b/src/media/commons-logo-component.xcf differ diff --git a/src/media/logo-large.xcf b/src/media/logo-large.xcf deleted file mode 100644 index 7bb07af3e..000000000 Binary files a/src/media/logo-large.xcf and /dev/null differ diff --git a/src/media/logo.png b/src/media/logo.png index 582fba980..93bb6c014 100644 Binary files a/src/media/logo.png and b/src/media/logo.png differ diff --git a/src/media/logo.xcf b/src/media/logo.xcf deleted file mode 100644 index ac6376f71..000000000 Binary files a/src/media/logo.xcf and /dev/null differ diff --git a/src/site/resources/images/logo.png b/src/site/resources/images/logo.png index 582fba980..93bb6c014 100644 Binary files a/src/site/resources/images/logo.png and b/src/site/resources/images/logo.png differ diff --git a/src/site/xdoc/index.xml b/src/site/xdoc/index.xml index af49476f9..ac5b8cfa9 100644 --- a/src/site/xdoc/index.xml +++ b/src/site/xdoc/index.xml @@ -24,12 +24,10 @@ limitations under the License.

Commons CSV reads and writes files in variations of the Comma Separated Value (CSV) format.

Read the documentation starting with the Javadoc Overview.

An overview of the functionality is provided in the @@ -48,7 +46,6 @@ The git repository can be browsed.

Download Apache Commons CSV current (mirrors), requires Java 8 or above

Dependency Information

The latest code can be checked out from our git repository at https://gitbox.apache.org/repos/asf/commons-csv.git. You can build the component with Apache Maven by running mvn clean package.

- +

+ Apache Commons CSV requires Java 8 or above. +

+ + + + + + + + + + + + + + + +

Commons CSV	Java	Android
1.10.0+	8	Android 7.0 (API level 24)

The commons developer mailing list is the main channel of communication for contributors. Please remember that the lists are shared between all commons components, so prefix the subject line of your email with [csv].

@@ -87,7 +103,6 @@ For previous releases, see the TagList report.

If you'd like to offer up pull requests via GitHub rather than applying patches to JIRA, we have a GitHub mirror.

The commons mailing lists act as the main support forum. @@ -101,8 +116,6 @@ For previous releases, see the

Commons CSV was started to unify a common and simple interface for reading and writing CSV files under an ASL license. It has been bootstrapped by a code donation from Netcetera in Switzerland. There are three pre-existing BSD compatible CSV parsers which this component will hopefully make redundant (authors willing):

@@ -116,7 +129,5 @@ For previous releases, see the

Super CSV

- - diff --git a/src/site/xdoc/security.xml b/src/site/xdoc/security.xml index ab0056049..47edf5d11 100644 --- a/src/site/xdoc/security.xml +++ b/src/site/xdoc/security.xml @@ -47,5 +47,10 @@

None.

+ For information about safe deserialization, please see Safe Deserialization. +

\ No newline at end of file diff --git a/src/test/java/org/apache/commons/csv/CSVDuplicateHeaderTest.java b/src/test/java/org/apache/commons/csv/CSVDuplicateHeaderTest.java index cc47b999a..2f518a120 100644 --- a/src/test/java/org/apache/commons/csv/CSVDuplicateHeaderTest.java +++ b/src/test/java/org/apache/commons/csv/CSVDuplicateHeaderTest.java @@ -19,13 +19,16 @@ package org.apache.commons.csv; +import static org.junit.jupiter.api.Assertions.assertArrayEquals; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertThrows; + import java.io.IOException; import java.util.Arrays; import java.util.List; import java.util.stream.Collectors; import java.util.stream.Stream; -import org.junit.jupiter.api.Assertions; import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.Arguments; import org.junit.jupiter.params.provider.MethodSource; @@ -285,11 +288,11 @@ void testCSVFormat(final DuplicateHeaderMode duplicateHeaderMode, .setHeader(headers); if (valid) { final CSVFormat format = builder.get(); - Assertions.assertEquals(duplicateHeaderMode, format.getDuplicateHeaderMode(), "DuplicateHeaderMode"); - Assertions.assertEquals(allowMissingColumnNames, format.getAllowMissingColumnNames(), "AllowMissingColumnNames"); - Assertions.assertArrayEquals(headers, format.getHeader(), "Header"); + assertEquals(duplicateHeaderMode, format.getDuplicateHeaderMode(), "DuplicateHeaderMode"); + assertEquals(allowMissingColumnNames, format.getAllowMissingColumnNames(), "AllowMissingColumnNames"); + assertArrayEquals(headers, format.getHeader(), "Header"); } else { - Assertions.assertThrows(IllegalArgumentException.class, builder::get); + assertThrows(IllegalArgumentException.class, builder::get); } } @@ -327,10 +330,10 @@ void testCSVParser(final DuplicateHeaderMode duplicateHeaderMode, try (CSVParser parser = CSVParser.parse(input, format)) { // Parser ignores null headers final List expected = 
Arrays.stream(headers).filter(s -> s != null).collect(Collectors.toList()); - Assertions.assertEquals(expected, parser.getHeaderNames(), "HeaderNames"); + assertEquals(expected, parser.getHeaderNames(), "HeaderNames"); } } else { - Assertions.assertThrows(IllegalArgumentException.class, () -> CSVParser.parse(input, format)); + assertThrows(IllegalArgumentException.class, () -> CSVParser.parse(input, format)); } } } diff --git a/src/test/java/org/apache/commons/csv/CSVFormatTest.java b/src/test/java/org/apache/commons/csv/CSVFormatTest.java index d1d19f755..ed20898de 100644 --- a/src/test/java/org/apache/commons/csv/CSVFormatTest.java +++ b/src/test/java/org/apache/commons/csv/CSVFormatTest.java @@ -26,6 +26,7 @@ import static org.junit.jupiter.api.Assertions.assertArrayEquals; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNotEquals; import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertNotSame; import static org.junit.jupiter.api.Assertions.assertNull; @@ -48,7 +49,6 @@ import java.util.Objects; import org.apache.commons.csv.CSVFormat.Builder; -import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; /** @@ -64,16 +64,16 @@ public enum Header { Name, Email, Phone } - private static void assertNotEquals(final Object right, final Object left) { - Assertions.assertNotEquals(right, left); - Assertions.assertNotEquals(left, right); + private static void assertNotEqualsFlip(final Object right, final Object left) { + assertNotEquals(right, left); + assertNotEquals(left, right); } private static CSVFormat copy(final CSVFormat format) { return format.builder().setDelimiter(format.getDelimiter()).get(); } - private void assertNotEquals(final String name, final String type, final Object left, final Object right) { + private void assertNotEqualsHash(final String name, 
final String type, final Object left, final Object right) { if (left.equals(right) || right.equals(left)) { fail("Objects must not compare equal for " + name + "(" + type + ")"); } @@ -198,8 +198,8 @@ void testDuplicateHeaderElementsTrueContainsEmpty3() { void testEquals() { final CSVFormat right = CSVFormat.DEFAULT; final CSVFormat left = copy(right); - Assertions.assertNotEquals(null, right); - Assertions.assertNotEquals("A String Instance", right); + assertNotEquals(null, right); + assertNotEquals("A String Instance", right); assertEquals(right, right); assertEquals(right, left); assertEquals(left, right); @@ -212,7 +212,7 @@ void testEqualsCommentStart() { final CSVFormat right = CSVFormat.newFormat('\'').builder().setQuote('"').setCommentMarker('#').setQuoteMode(QuoteMode.ALL).get(); final CSVFormat left = right.builder().setCommentMarker('!').get(); - assertNotEquals(right, left); + assertNotEqualsFlip(right, left); } @SuppressWarnings("deprecation") @@ -221,7 +221,7 @@ void testEqualsCommentStart_Deprecated() { final CSVFormat right = CSVFormat.newFormat('\'').withQuote('"').withCommentMarker('#').withQuoteMode(QuoteMode.ALL); final CSVFormat left = right.withCommentMarker('!'); - assertNotEquals(right, left); + assertNotEqualsFlip(right, left); } @Test @@ -229,7 +229,7 @@ void testEqualsDelimiter() { final CSVFormat right = CSVFormat.newFormat('!'); final CSVFormat left = CSVFormat.newFormat('?'); - assertNotEquals(right, left); + assertNotEqualsFlip(right, left); } @Test @@ -237,7 +237,7 @@ void testEqualsEscape() { final CSVFormat right = CSVFormat.newFormat('\'').builder().setQuote('"').setCommentMarker('#').setEscape('+').setQuoteMode(QuoteMode.ALL).get(); final CSVFormat left = right.builder().setEscape('!').get(); - assertNotEquals(right, left); + assertNotEqualsFlip(right, left); } @SuppressWarnings("deprecation") @@ -246,7 +246,7 @@ void testEqualsEscape_Deprecated() { final CSVFormat right = 
CSVFormat.newFormat('\'').withQuote('"').withCommentMarker('#').withEscape('+').withQuoteMode(QuoteMode.ALL); final CSVFormat left = right.withEscape('!'); - assertNotEquals(right, left); + assertNotEqualsFlip(right, left); } @Test @@ -262,49 +262,49 @@ void testEqualsHash() throws Exception { case "boolean": { final Object defTrue = method.invoke(CSVFormat.DEFAULT, Boolean.TRUE); final Object defFalse = method.invoke(CSVFormat.DEFAULT, Boolean.FALSE); - assertNotEquals(name, type, defTrue, defFalse); + assertNotEqualsHash(name, type, defTrue, defFalse); break; } case "char": { final Object a = method.invoke(CSVFormat.DEFAULT, 'a'); final Object b = method.invoke(CSVFormat.DEFAULT, 'b'); - assertNotEquals(name, type, a, b); + assertNotEqualsHash(name, type, a, b); break; } case "java.lang.Character": { final Object a = method.invoke(CSVFormat.DEFAULT, new Object[] { null }); final Object b = method.invoke(CSVFormat.DEFAULT, Character.valueOf('d')); - assertNotEquals(name, type, a, b); + assertNotEqualsHash(name, type, a, b); break; } case "java.lang.String": { final Object a = method.invoke(CSVFormat.DEFAULT, new Object[] { null }); final Object b = method.invoke(CSVFormat.DEFAULT, "e"); - assertNotEquals(name, type, a, b); + assertNotEqualsHash(name, type, a, b); break; } case "java.lang.String[]": { final Object a = method.invoke(CSVFormat.DEFAULT, new Object[] { new String[] { null, null } }); final Object b = method.invoke(CSVFormat.DEFAULT, new Object[] { new String[] { "f", "g" } }); - assertNotEquals(name, type, a, b); + assertNotEqualsHash(name, type, a, b); break; } case "org.apache.commons.csv.QuoteMode": { final Object a = method.invoke(CSVFormat.DEFAULT, QuoteMode.MINIMAL); final Object b = method.invoke(CSVFormat.DEFAULT, QuoteMode.ALL); - assertNotEquals(name, type, a, b); + assertNotEqualsHash(name, type, a, b); break; } case "org.apache.commons.csv.DuplicateHeaderMode": { final Object a = method.invoke(CSVFormat.DEFAULT, 
DuplicateHeaderMode.ALLOW_ALL); final Object b = method.invoke(CSVFormat.DEFAULT, DuplicateHeaderMode.DISALLOW); - assertNotEquals(name, type, a, b); + assertNotEqualsHash(name, type, a, b); break; } case "java.lang.Object[]": { final Object a = method.invoke(CSVFormat.DEFAULT, new Object[] { new Object[] { null, null } }); final Object b = method.invoke(CSVFormat.DEFAULT, new Object[] { new Object[] { new Object(), new Object() } }); - assertNotEquals(name, type, a, b); + assertNotEqualsHash(name, type, a, b); break; } default: @@ -327,7 +327,7 @@ void testEqualsHeader() { .setIgnoreEmptyLines(true).setIgnoreSurroundingSpaces(true).setQuote('"').setQuoteMode(QuoteMode.ALL).get(); final CSVFormat left = right.builder().setHeader("Three", "Two", "One").get(); - assertNotEquals(right, left); + assertNotEqualsFlip(right, left); } @SuppressWarnings("deprecation") @@ -337,7 +337,7 @@ void testEqualsHeader_Deprecated() { .withIgnoreEmptyLines().withIgnoreSurroundingSpaces().withQuote('"').withQuoteMode(QuoteMode.ALL); final CSVFormat left = right.withHeader("Three", "Two", "One"); - assertNotEquals(right, left); + assertNotEqualsFlip(right, left); } @Test @@ -346,7 +346,7 @@ void testEqualsIgnoreEmptyLines() { .setIgnoreSurroundingSpaces(true).setQuote('"').setQuoteMode(QuoteMode.ALL).get(); final CSVFormat left = right.builder().setIgnoreEmptyLines(false).get(); - assertNotEquals(right, left); + assertNotEqualsFlip(right, left); } @SuppressWarnings("deprecation") @@ -356,7 +356,7 @@ void testEqualsIgnoreEmptyLines_Deprecated() { .withQuote('"').withQuoteMode(QuoteMode.ALL); final CSVFormat left = right.withIgnoreEmptyLines(false); - assertNotEquals(right, left); + assertNotEqualsFlip(right, left); } @Test @@ -365,7 +365,7 @@ void testEqualsIgnoreSurroundingSpaces() { .setQuoteMode(QuoteMode.ALL).get(); final CSVFormat left = right.builder().setIgnoreSurroundingSpaces(false).get(); - assertNotEquals(right, left); + assertNotEqualsFlip(right, left); } 
@SuppressWarnings("deprecation") @@ -375,7 +375,7 @@ void testEqualsIgnoreSurroundingSpaces_Deprecated() { .withQuoteMode(QuoteMode.ALL); final CSVFormat left = right.withIgnoreSurroundingSpaces(false); - assertNotEquals(right, left); + assertNotEqualsFlip(right, left); } @Test @@ -383,7 +383,7 @@ void testEqualsLeftNoQuoteRightQuote() { final CSVFormat left = CSVFormat.newFormat(',').builder().setQuote(null).get(); final CSVFormat right = left.builder().setQuote('#').get(); - assertNotEquals(left, right); + assertNotEqualsFlip(left, right); } @SuppressWarnings("deprecation") @@ -392,7 +392,15 @@ void testEqualsLeftNoQuoteRightQuote_Deprecated() { final CSVFormat left = CSVFormat.newFormat(',').withQuote(null); final CSVFormat right = left.withQuote('#'); - assertNotEquals(left, right); + assertNotEqualsFlip(left, right); + } + + @Test + void testEqualsMaxRows() { + final CSVFormat right = CSVFormat.DEFAULT.builder().setMaxRows(10).get(); + final CSVFormat left = CSVFormat.DEFAULT.builder().setMaxRows(1000).get(); + assertNotEqualsFlip(right, left); + assertNotEquals(right.hashCode(), left.hashCode()); } @Test @@ -418,7 +426,7 @@ void testEqualsNullString() { .setIgnoreSurroundingSpaces(true).setQuote('"').setQuoteMode(QuoteMode.ALL).setNullString("null").get(); final CSVFormat left = right.builder().setNullString("---").get(); - assertNotEquals(right, left); + assertNotEqualsFlip(right, left); } @SuppressWarnings("deprecation") @@ -428,7 +436,7 @@ void testEqualsNullString_Deprecated() { .withIgnoreSurroundingSpaces().withQuote('"').withQuoteMode(QuoteMode.ALL).withNullString("null"); final CSVFormat left = right.withNullString("---"); - assertNotEquals(right, left); + assertNotEqualsFlip(right, left); } @Test @@ -495,7 +503,7 @@ void testEqualsOne() { assertFalse(csvFormatTwo.isCommentMarkerSet()); assertNotSame(csvFormatTwo, csvFormatOne); - Assertions.assertNotEquals(csvFormatTwo, csvFormatOne); + assertNotEquals(csvFormatTwo, csvFormatOne); assertEquals('\\', 
(char) csvFormatOne.getEscapeCharacter()); assertNull(csvFormatOne.getQuoteMode()); @@ -554,10 +562,10 @@ void testEqualsOne() { assertNotSame(csvFormatOne, csvFormatTwo); assertNotSame(csvFormatTwo, csvFormatOne); - Assertions.assertNotEquals(csvFormatOne, csvFormatTwo); - Assertions.assertNotEquals(csvFormatTwo, csvFormatOne); + assertNotEquals(csvFormatOne, csvFormatTwo); + assertNotEquals(csvFormatTwo, csvFormatOne); - Assertions.assertNotEquals(csvFormatTwo, csvFormatOne); + assertNotEquals(csvFormatTwo, csvFormatOne); } @@ -566,7 +574,7 @@ void testEqualsQuoteChar() { final CSVFormat right = CSVFormat.newFormat('\'').builder().setQuote('"').get(); final CSVFormat left = right.builder().setQuote('!').get(); - assertNotEquals(right, left); + assertNotEqualsFlip(right, left); } @SuppressWarnings("deprecation") @@ -575,7 +583,7 @@ void testEqualsQuoteChar_Deprecated() { final CSVFormat right = CSVFormat.newFormat('\'').withQuote('"'); final CSVFormat left = right.withQuote('!'); - assertNotEquals(right, left); + assertNotEqualsFlip(right, left); } @Test @@ -583,7 +591,7 @@ void testEqualsQuotePolicy() { final CSVFormat right = CSVFormat.newFormat('\'').builder().setQuote('"').setQuoteMode(QuoteMode.ALL).get(); final CSVFormat left = right.builder().setQuoteMode(QuoteMode.MINIMAL).get(); - assertNotEquals(right, left); + assertNotEqualsFlip(right, left); } @SuppressWarnings("deprecation") @@ -592,7 +600,7 @@ void testEqualsQuotePolicy_Deprecated() { final CSVFormat right = CSVFormat.newFormat('\'').withQuote('"').withQuoteMode(QuoteMode.ALL); final CSVFormat left = right.withQuoteMode(QuoteMode.MINIMAL); - assertNotEquals(right, left); + assertNotEqualsFlip(right, left); } @Test @@ -601,7 +609,7 @@ void testEqualsRecordSeparator() { .setIgnoreSurroundingSpaces(true).setQuote('"').setQuoteMode(QuoteMode.ALL).get(); final CSVFormat left = right.builder().setRecordSeparator(LF).get(); - assertNotEquals(right, left); + assertNotEqualsFlip(right, left); } 
@SuppressWarnings("deprecation") @@ -611,7 +619,7 @@ void testEqualsRecordSeparator_Deprecated() { .withIgnoreSurroundingSpaces().withQuote('"').withQuoteMode(QuoteMode.ALL); final CSVFormat left = right.withRecordSeparator(LF); - assertNotEquals(right, left); + assertNotEqualsFlip(right, left); } void testEqualsSkipHeaderRecord() { @@ -619,7 +627,7 @@ void testEqualsSkipHeaderRecord() { .setIgnoreSurroundingSpaces(true).setQuote('"').setQuoteMode(QuoteMode.ALL).setNullString("null").setSkipHeaderRecord(true).get(); final CSVFormat left = right.builder().setSkipHeaderRecord(false).get(); - assertNotEquals(right, left); + assertNotEqualsFlip(right, left); } @SuppressWarnings("deprecation") @@ -629,7 +637,7 @@ void testEqualsSkipHeaderRecord_Deprecated() { .withIgnoreSurroundingSpaces().withQuote('"').withQuoteMode(QuoteMode.ALL).withNullString("null").withSkipHeaderRecord(); final CSVFormat left = right.withSkipHeaderRecord(false); - assertNotEquals(right, left); + assertNotEqualsFlip(right, left); } @Test @@ -691,7 +699,7 @@ void testEqualsWithNull() { assertNull(csvFormat.getQuoteCharacter()); assertTrue(csvFormat.isNullStringSet()); - Assertions.assertNotEquals(null, csvFormat); + assertNotEquals(null, csvFormat); } @@ -801,7 +809,7 @@ void testHashCodeAndWithIgnoreHeaderCase() { assertTrue(csvFormatTwo.getIgnoreHeaderCase()); // now different assertFalse(csvFormatTwo.getTrailingDelimiter()); - Assertions.assertNotEquals(csvFormatTwo, csvFormat); // CSV-244 - should not be equal + assertNotEquals(csvFormatTwo, csvFormat); // CSV-244 - should not be equal assertFalse(csvFormatTwo.getAllowMissingColumnNames()); assertFalse(csvFormatTwo.getTrim()); @@ -958,6 +966,23 @@ void testPrintWithQuotes() throws IOException { assertEquals("\"\"\"a,b,c\r\nx,y,z\"", out.toString()); } + /** + * Tests CSV-326. 
+ */ + @Test + void testPrintWithQuotesEscapeBeforeQuote() throws IOException { + final CSVFormat format = CSVFormat.DEFAULT.builder() + .setEscape('\\') + .setQuote('"') + .get(); + final String value = "\\\""; + final Appendable out = new StringBuilder(); + format.print(new StringReader(value), out, true); + try (CSVParser parser = CSVParser.parse(out.toString(), format)) { + assertEquals(value, parser.getRecords().get(0).get(0)); + } + } + @Test void testQuoteCharSameAsCommentStartThrowsException() { assertThrows(IllegalArgumentException.class, () -> CSVFormat.DEFAULT.builder().setQuote('!').setCommentMarker('!').get()); @@ -993,6 +1018,35 @@ void testQuoteCharSameAsDelimiterThrowsException_Deprecated() { assertThrows(IllegalArgumentException.class, () -> CSVFormat.DEFAULT.withQuote('!').withDelimiter('!')); } + @Test + void testQuotedNullStringTracksQuoteCharacter() throws IOException { + final StringBuilder out = new StringBuilder(); + // @formatter:off + final Builder builder = CSVFormat.DEFAULT.builder(); + final CSVFormat format = builder + .setQuoteMode(QuoteMode.ALL) + .setNullString("NULL") + .get(); + // @formatter:on + format.print(null, out, true); + assertEquals("\"NULL\"", out.toString()); + // set + out.setLength(0); + builder.setQuote('\''); + builder.get().print(null, out, true); + assertEquals("'NULL'", out.toString()); + // reset + out.setLength(0); + builder.setQuote((Character) null); + builder.get().print(null, out, true); + assertEquals("\"NULL\"", out.toString()); + // reset, reverse setter order + out.setLength(0); + builder.setNullString(null).setQuote((Character) null).setNullString("NULL"); + builder.get().print(null, out, true); + assertEquals("\"NULL\"", out.toString()); + } + @Test void testQuoteModeNoneShouldReturnMeaningfulExceptionMessage() { final Exception exception = assertThrows(IllegalArgumentException.class, () -> @@ -1156,7 +1210,7 @@ void testToStringAndWithCommentMarkerTakingCharacter() { assertNotSame(csvFormat, 
csvFormatTwo); assertNotSame(csvFormatTwo, csvFormat); - Assertions.assertNotEquals(csvFormatTwo, csvFormat); + assertNotEquals(csvFormatTwo, csvFormat); assertNull(csvFormat.getEscapeCharacter()); assertTrue(csvFormat.isQuoteCharacterSet()); @@ -1215,9 +1269,9 @@ void testToStringAndWithCommentMarkerTakingCharacter() { assertNotSame(csvFormat, csvFormatTwo); assertNotSame(csvFormatTwo, csvFormat); - Assertions.assertNotEquals(csvFormat, csvFormatTwo); + assertNotEquals(csvFormat, csvFormatTwo); - Assertions.assertNotEquals(csvFormatTwo, csvFormat); + assertNotEquals(csvFormatTwo, csvFormat); assertEquals("Delimiter=<,> QuoteChar=<\"> CommentStart= RecordSeparator=<\r\n> EmptyLines:ignored SkipHeaderRecord:false", csvFormatTwo.toString()); @@ -1403,7 +1457,7 @@ void testWithHeaderComments() { assertNotSame(csvFormat, csvFormatTwo); assertNotSame(csvFormatTwo, csvFormat); - Assertions.assertNotEquals(csvFormatTwo, csvFormat); // CSV-244 - should not be equal + assertNotEquals(csvFormatTwo, csvFormat); // CSV-244 - should not be equal final String string = csvFormatTwo.format(objectArray); @@ -1465,9 +1519,9 @@ void testWithHeaderComments() { assertNotSame(csvFormatTwo, csvFormat); assertNotNull(string); - Assertions.assertNotEquals(csvFormat, csvFormatTwo); // CSV-244 - should not be equal + assertNotEquals(csvFormat, csvFormatTwo); // CSV-244 - should not be equal - Assertions.assertNotEquals(csvFormatTwo, csvFormat); // CSV-244 - should not be equal + assertNotEquals(csvFormatTwo, csvFormat); // CSV-244 - should not be equal assertEquals(",,,,,,,", string); } diff --git a/src/test/java/org/apache/commons/csv/CSVParserTest.java b/src/test/java/org/apache/commons/csv/CSVParserTest.java index d9dd4e545..6d9bdd9e8 100644 --- a/src/test/java/org/apache/commons/csv/CSVParserTest.java +++ b/src/test/java/org/apache/commons/csv/CSVParserTest.java @@ -465,6 +465,31 @@ void testDuplicateHeadersNotAllowed() { () -> CSVParser.parse("a,b,a\n1,2,3\nx,y,z", 
CSVFormat.DEFAULT.withHeader().withAllowDuplicateHeaderNames(false))); } + /** + * With {@code ignoreSurroundingSpaces} enabled and a multi-character delimiter whose first character is whitespace, + * the empty field at the delimiter boundary must survive. The delimiter look-ahead is consumed while skipping + * leading whitespace, so re-evaluating it would drop the empty field and merge the following field's value. + */ + @Test + void testEmptyFieldBeforeWhitespacePrefixedMultiCharacterDelimiter() throws IOException { + final CSVFormat format = CSVFormat.DEFAULT.builder().setDelimiter(" |").setIgnoreSurroundingSpaces(true).get(); + try (CSVParser parser = CSVParser.parse(" |a", format)) { + final List records = parser.getRecords(); + assertEquals(1, records.size()); + assertValuesEquals(new String[] { "", "a" }, records.get(0)); + } + try (CSVParser parser = CSVParser.parse("a | |b", format)) { + final List records = parser.getRecords(); + assertEquals(1, records.size()); + assertValuesEquals(new String[] { "a", "", "b" }, records.get(0)); + } + try (CSVParser parser = CSVParser.parse("a | |b |", format)) { + final List records = parser.getRecords(); + assertEquals(1, records.size()); + assertValuesEquals(new String[] { "a", "", "b", "" }, records.get(0)); + } + } + @Test void testEmptyFile() throws Exception { try (CSVParser parser = CSVParser.parse(Paths.get("src/test/resources/org/apache/commons/csv/empty.txt"), StandardCharsets.UTF_8, @@ -648,6 +673,100 @@ void testForEach() throws Exception { } } + @Test + void testGetBytePositionMultiCharacterDelimiter() throws IOException { + final String code = "aa[|]bb\ncc[|]dd\n"; + final CSVFormat format = CSVFormat.DEFAULT.builder().setDelimiter("[|]").get(); + try (CSVParser parser = CSVParser.builder() + .setReader(new StringReader(code)) + .setFormat(format) + .setCharset(StandardCharsets.UTF_8) + .setTrackBytes(true) + .get()) { + final Iterator it = parser.iterator(); + final CSVRecord first = it.next(); + final 
CSVRecord second = it.next(); + assertEquals(0, first.getBytePosition()); + assertEquals(8, second.getBytePosition()); + } + } + + /** + * Tests CSV-329. + */ + @Test + void testGetBytePositionMultiCharacterDelimiterWithSupplementaryCharacter() throws IOException { + final String delimiter = "x😀"; + final String code = "ax😀b\ncx😀d\n"; + final CSVFormat format = CSVFormat.DEFAULT.builder().setDelimiter(delimiter).get(); + try (CSVParser parser = CSVParser.builder() + .setReader(new StringReader(code)) + .setFormat(format) + .setCharset(UTF_8) + .setTrackBytes(true) + .get()) { + final CSVRecord first = parser.nextRecord(); + final CSVRecord second = parser.nextRecord(); + assertNotNull(first); + assertNotNull(second); + assertValuesEquals(new String[] { "a", "b" }, first); + assertValuesEquals(new String[] { "c", "d" }, second); + assertEquals(0, first.getBytePosition()); + assertEquals("ax😀b\n".getBytes(UTF_8).length, second.getBytePosition()); + } + } + + @Test + void testGetBytePositionWithCharacterOffsetAndMultiBytePrefix() throws Exception { + final String row0 = "é,x\n"; + final Charset charset = UTF_8; + // row0 char count is 4 + assertEquals(4, row0.length()); + // row0 byte count is 5 + final int record1ByteOffset = row0.getBytes(charset).length; + assertEquals(5, record1ByteOffset); + final String row1 = "b,c\n"; + final String rows = row0 + row1; + final long record1CharOffset = row0.length(); + final long expectedByteOffset = row0.getBytes(charset).length; + try (CSVParser parser = CSVParser.builder() + .setReader(new StringReader(row1)) + .setFormat(CSVFormat.DEFAULT) + .setCharset(charset) + .setTrackBytes(true) + .setByteOffset(record1ByteOffset) + .setCharacterOffset(record1CharOffset) + .setRecordNumber(2) // not relevant but a better use case example. 
+ .get()) { + final CSVRecord record = parser.nextRecord(); + assertNotNull(record); + assertEquals(4, record.getCharacterPosition()); + assertEquals(record1CharOffset, record.getCharacterPosition()); + assertEquals(expectedByteOffset, record.getBytePosition()); + } + } + + @Test + void testGetBytePositionWithSingleByteCharset() throws IOException { + // A single-byte charset cannot encode U+FFFF, the char value of the EOF sentinel. + // Byte counting must skip the EOF read so a valid file parses without throwing. + final String code = "a,b\nc,d\n"; + try (CSVParser parser = CSVParser.builder() + .setReader(new StringReader(code)) + .setFormat(CSVFormat.DEFAULT) + .setCharset(StandardCharsets.ISO_8859_1) + .setTrackBytes(true) + .get()) { + final CSVRecord first = parser.nextRecord(); + final CSVRecord second = parser.nextRecord(); + assertNotNull(first); + assertNotNull(second); + assertNull(parser.nextRecord()); + assertEquals(0, first.getBytePosition()); + assertEquals(4, second.getBytePosition()); + } + } + @Test void testGetHeaderComment_HeaderComment1() throws IOException { try (CSVParser parser = CSVParser.parse(CSV_INPUT_HEADER_COMMENT, FORMAT_AUTO_HEADER)) { @@ -917,6 +1036,23 @@ void testGetRecordsMaxRows(final long maxRows) throws IOException { } } + /** + * Tests CSV-327. 
+ */ + @Test + void testGetRecordsMaxRowsWithRecordNumberOffset() throws IOException { + try (CSVParser parser = CSVParser.builder() + .setReader(new StringReader("a,b\nc,d\n")) + .setFormat(CSVFormat.DEFAULT.builder().setMaxRows(1).get()) + .setRecordNumber(2) + .get()) { + final List records = parser.getRecords(); + assertEquals(1, records.size()); + assertEquals(2, records.get(0).getRecordNumber()); + assertValuesEquals(new String[] { "a", "b" }, records.get(0)); + } + } + @Test void testGetRecordThreeBytesRead() throws Exception { final String code = "id,date,val5,val4\n" + @@ -1603,6 +1739,50 @@ void testParsingPrintedEmptyFirstColumn(final CSVFormat.Predefined format) throw } } + /** + * A truncated escaped multi-character delimiter at EOF must stay literal data and not be completed from a stale + * escape delimiter look-ahead. + */ + @Test + void testPartialEscapedMultiCharacterDelimiterAtEOF() throws IOException { + final CSVFormat format = CSVFormat.DEFAULT.builder().setDelimiter("[|]").setEscape('!').get(); + try (CSVParser parser = format.parse(new StringReader("x![!|!]y![!|"))) { + final CSVRecord record = parser.nextRecord(); + assertEquals("x[|]y![!|", record.get(0)); + assertEquals(1, record.size()); + } + } + + /** + * Tests CSV-324. + */ + @Test + void testPartialMultiCharacterDelimiterAtEOF() throws IOException { + final CSVFormat format = CSVFormat.DEFAULT.builder().setDelimiter("[|]").get(); + try (CSVParser parser = format.parse(new StringReader("a[|]b[|"))) { + final CSVRecord record = parser.nextRecord(); + assertEquals("a", record.get(0)); + assertEquals("b[|", record.get(1)); + assertEquals(2, record.size()); + } + } + + /** + * A truncated multi-character delimiter at EOF must not be completed from the look-ahead buffer left dirty by an + * earlier non-matching peek in the same token. 
+ */ + @Test + void testPartialMultiCharacterDelimiterAtEOFAfterMismatch() throws IOException { + final CSVFormat format = CSVFormat.DEFAULT.builder().setDelimiter("[|]").get(); + // The "[a]" peek leaves ']' in the look-ahead buffer; the trailing "[|" must not match "[|]". + final String recordString = "x[a][|"; + try (CSVParser parser = format.parse(new StringReader(recordString))) { + final CSVRecord record = parser.nextRecord(); + assertEquals(recordString, record.get(0)); + assertEquals(1, record.size()); + } + } + @Test void testProvidedHeader() throws Exception { final Reader in = new StringReader("a,b,c\n1,2,3\nx,y,z"); @@ -1794,6 +1974,23 @@ void testTrailingDelimiter() throws Exception { } } + @Test + void testTrailingDelimiterKeepsQuotedEmptyLastField() throws Exception { + final CSVFormat format = CSVFormat.DEFAULT.builder().setTrailingDelimiter(true).get(); + try (CSVParser parser = CSVParser.parse("a,b,\"\"", format)) { + final CSVRecord record = parser.iterator().next(); + assertEquals(3, record.size()); + assertEquals("a", record.get(0)); + assertEquals("b", record.get(1)); + assertEquals("", record.get(2)); + } + // An unquoted trailing delimiter still drops the empty field. 
+ try (CSVParser parser = CSVParser.parse("a,b,", format)) { + final CSVRecord record = parser.iterator().next(); + assertEquals(2, record.size()); + } + } + @Test void testTrim() throws Exception { final Reader in = new StringReader("a,a,a\n\" 1 \",\" 2 \",\" 3 \"\nx,y,z"); diff --git a/src/test/java/org/apache/commons/csv/CSVPrinterTest.java b/src/test/java/org/apache/commons/csv/CSVPrinterTest.java index 1ff791010..9ae80c1e5 100644 --- a/src/test/java/org/apache/commons/csv/CSVPrinterTest.java +++ b/src/test/java/org/apache/commons/csv/CSVPrinterTest.java @@ -569,6 +569,57 @@ void testEscapeBackslash5() throws IOException { assertEquals("\\\\", sw.toString()); } + @Test + void testEscapeCommentMarkerFirstChar() throws IOException { + // No quoting available in escape mode, so a leading comment marker must be escaped or the + // record reads back as a comment and is dropped. Mirrors the quoting fix for QuoteMode.MINIMAL. + final CSVFormat format = CSVFormat.DEFAULT.builder().setQuote(null).setEscape('\\').setCommentMarker(';').get(); + final StringWriter sw = new StringWriter(); + final String col1 = ";comment-like"; + try (CSVPrinter printer = new CSVPrinter(sw, format)) { + printer.printRecord(col1, "b"); + printer.printRecord(new StringReader(col1), new StringReader("b")); + // The marker past the first character does not start a comment and is left alone. + printer.printRecord("a;b", ";c"); + } + final String string = sw.toString(); + assertEquals("\\;comment-like,b" + RECORD_SEPARATOR + + "\\;comment-like,b" + RECORD_SEPARATOR + + "a;b,\\;c" + RECORD_SEPARATOR, string); + // The emitted records must read back as the original values, none parsed as a comment. 
+ try (CSVParser parser = CSVParser.parse(string, format)) { + final List records = parser.getRecords(); + assertEquals(3, records.size()); + assertEquals(col1, records.get(0).get(0)); + assertEquals("b", records.get(0).get(1)); + assertEquals(col1, records.get(1).get(0)); + assertEquals("b", records.get(1).get(1)); + assertEquals("a;b", records.get(2).get(0)); + assertEquals(";c", records.get(2).get(1)); + } + } + + @Test + void testEscapeCommentMarkerFirstCharWithQuoteModeNone() throws IOException { + final CSVFormat format = CSVFormat.DEFAULT.builder().setEscape('\\').setQuoteMode(QuoteMode.NONE).setCommentMarker(';').get(); + final StringWriter sw = new StringWriter(); + final String col1 = ";bar"; + try (CSVPrinter printer = new CSVPrinter(sw, format)) { + printer.printRecord(col1, "b"); + printer.printRecord(new StringReader(col1), new StringReader("b")); + } + final String string = sw.toString(); + assertEquals("\\;bar,b" + RECORD_SEPARATOR + "\\;bar,b" + RECORD_SEPARATOR, string); + try (CSVParser parser = CSVParser.parse(string, format)) { + final List records = parser.getRecords(); + assertEquals(2, records.size()); + for (final CSVRecord record : records) { + assertEquals(col1, record.get(0)); + assertEquals("b", record.get(1)); + } + } + } + @Test void testEscapeNull1() throws IOException { final StringWriter sw = new StringWriter(); @@ -1798,6 +1849,28 @@ void testQuoteAll() throws IOException { } } + @Test + void testQuoteCharEscapedWithQuoteModeNone() throws IOException { + final CSVFormat format = CSVFormat.DEFAULT.builder().setQuote('"').setEscape('?').setQuoteMode(QuoteMode.NONE).get(); + final StringWriter sw = new StringWriter(); + final String col1 = "\"abc"; + final String col2 = "x\"y"; + try (CSVPrinter printer = new CSVPrinter(sw, format)) { + printer.printRecord(col1, col2); + printer.printRecord(new StringReader(col1), new StringReader(col2)); + } + assertEquals("?\"abc,x?\"y" + RECORD_SEPARATOR + "?\"abc,x?\"y" + RECORD_SEPARATOR, 
sw.toString()); + // The emitted records must read back as the original values. + try (CSVParser parser = CSVParser.parse(sw.toString(), format)) { + final List records = parser.getRecords(); + assertEquals(2, records.size()); + for (final CSVRecord record : records) { + assertEquals(col1, record.get(0)); + assertEquals(col2, record.get(1)); + } + } + } + @Test void testQuoteCommaFirstChar() throws IOException { final StringWriter sw = new StringWriter(); @@ -1807,6 +1880,34 @@ void testQuoteCommaFirstChar() throws IOException { } } + @Test + void testQuoteCommentMarkerFirstChar() throws IOException { + final CSVFormat format = CSVFormat.DEFAULT.builder().setCommentMarker(';').get(); + final StringWriter sw = new StringWriter(); + final String col1 = ";comment-like"; + try (CSVPrinter printer = new CSVPrinter(sw, format)) { + // A real comment is written with the marker, unquoted. + printer.printComment("a real comment"); + // A value starting with the marker is quoted, so it does not read back as a comment. + printer.printRecord(col1, "b"); + // The marker past the first character does not start a comment, so only the leading-marker value is quoted. + printer.printRecord("a;b", ";c"); + } + final String string = sw.toString(); + assertEquals("; a real comment" + RECORD_SEPARATOR + + "\";comment-like\",b" + RECORD_SEPARATOR + + "a;b,\";c\"" + RECORD_SEPARATOR, string); + // The comment is dropped on read; both data records survive intact. 
+ try (CSVParser parser = CSVParser.parse(string, format)) { + final List records = parser.getRecords(); + assertEquals(2, records.size()); + assertEquals(col1, records.get(0).get(0)); + assertEquals("b", records.get(0).get(1)); + assertEquals("a;b", records.get(1).get(0)); + assertEquals(";c", records.get(1).get(1)); + } + } + @Test void testQuoteNonNumeric() throws IOException { final StringWriter sw = new StringWriter(); diff --git a/src/test/java/org/apache/commons/csv/ExtendedBufferedReaderTest.java b/src/test/java/org/apache/commons/csv/ExtendedBufferedReaderTest.java index 056b8a9c9..b8d9b9f19 100644 --- a/src/test/java/org/apache/commons/csv/ExtendedBufferedReaderTest.java +++ b/src/test/java/org/apache/commons/csv/ExtendedBufferedReaderTest.java @@ -26,6 +26,7 @@ import static org.junit.jupiter.api.Assertions.assertNull; import java.io.StringReader; +import java.nio.charset.StandardCharsets; import org.junit.jupiter.api.Test; @@ -104,6 +105,19 @@ void testReadingInDifferentBuffer() throws Exception { } } + @Test + void testReadingSupplementaryCharacterTracksBytes() throws Exception { + final String input = "😀"; + final char[] buffer = new char[input.length()]; + try (ExtendedBufferedReader reader = new ExtendedBufferedReader(new StringReader(input), StandardCharsets.UTF_8, true)) { + assertEquals(input.length(), reader.read(buffer, 0, buffer.length)); + assertArrayEquals(input.toCharArray(), buffer); + assertEquals(input.getBytes(StandardCharsets.UTF_8).length, reader.getBytesRead()); + assertEquals(input.length(), reader.getPosition()); + assertEquals(input.charAt(input.length() - 1), reader.getLastChar()); + } + } + @Test void testReadLine() throws Exception { try (ExtendedBufferedReader br = createBufferedReader("")) { diff --git a/src/test/java/org/apache/commons/csv/LexerTest.java b/src/test/java/org/apache/commons/csv/LexerTest.java index e54e93365..a76f6e513 100644 --- a/src/test/java/org/apache/commons/csv/LexerTest.java +++ 
b/src/test/java/org/apache/commons/csv/LexerTest.java @@ -216,6 +216,25 @@ void testDelimiterIsWhitespace() throws IOException { } } + /** + * With {@code ignoreSurroundingSpaces} enabled and a multi-character delimiter whose first character is whitespace, + * the side-effecting {@link Lexer#isDelimiter(int)} must only be evaluated once per character, otherwise the + * delimiter is consumed in the whitespace-skip loop and the empty field at the boundary is dropped. + */ + @Test + void testEmptyTokenBeforeWhitespacePrefixedMultiCharacterDelimiter() throws IOException { + final CSVFormat format = CSVFormat.DEFAULT.builder().setDelimiter(" |").setIgnoreSurroundingSpaces(true).get(); + try (Lexer lexer = createLexer(" |a", format)) { + assertNextToken(TOKEN, "", lexer); + assertNextToken(EOF, "a", lexer); + } + try (Lexer lexer = createLexer("a | |b", format)) { + assertNextToken(TOKEN, "a", lexer); + assertNextToken(TOKEN, "", lexer); + assertNextToken(EOF, "b", lexer); + } + } + @Test void testEOFWithoutClosingQuote() throws Exception { final String code = "a,\"b"; @@ -409,6 +428,44 @@ void testNextToken6() throws IOException { } } + /** + * A truncated escaped multi-character delimiter at EOF must not be accepted by reusing the previous escape delimiter + * look-ahead in {@link Lexer#isEscapeDelimiter()}. + */ + @Test + void testPartialEscapedMultiCharacterDelimiterAtEOF() throws IOException { + final CSVFormat format = CSVFormat.DEFAULT.builder().setDelimiter("[|]").setEscape('!').get(); + try (Lexer lexer = createLexer("x![!|!]y![!|", format)) { + assertNextToken(EOF, "x[|]y![!|", lexer); + } + } + + /** + * Tests CSV-324. 
+ */ + @Test + void testPartialMultiCharacterDelimiterAtEOF() throws IOException { + final CSVFormat format = CSVFormat.DEFAULT.builder().setDelimiter("[|]").get(); + try (Lexer lexer = createLexer("a[|]b[|", format)) { + assertNextToken(TOKEN, "a", lexer); + assertNextToken(EOF, "b[|", lexer); + } + } + + /** + * A truncated multi-character delimiter at EOF must not be accepted by reusing the look-ahead buffer left dirty by an + * earlier non-matching peek in the same token (CSV-324 only cleared the buffer once per token). + */ + @Test + void testPartialMultiCharacterDelimiterAtEOFAfterMismatch() throws IOException { + final CSVFormat format = CSVFormat.DEFAULT.builder().setDelimiter("[|]").get(); + // The "[a]" peek leaves ']' in the look-ahead buffer; the trailing "[|" must not match "[|]". + final String recordString = "x[a][|"; + try (Lexer lexer = createLexer(recordString, format)) { + assertNextToken(EOF, recordString, lexer); + } + } + @Test void testReadEscapeBackspace() throws IOException { try (Lexer lexer = createLexer("b", CSVFormat.DEFAULT.withEscape('\b'))) { diff --git a/src/test/java/org/apache/commons/csv/issues/JiraCsv227Test.java b/src/test/java/org/apache/commons/csv/issues/JiraCsv227Test.java new file mode 100644 index 000000000..2b9e335a8 --- /dev/null +++ b/src/test/java/org/apache/commons/csv/issues/JiraCsv227Test.java @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.commons.csv.issues; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +import java.io.IOException; + +import org.apache.commons.csv.CSVFormat; +import org.apache.commons.csv.CSVPrinter; +import org.apache.commons.csv.QuoteMode; +import org.junit.jupiter.api.Test; + +/** + * Tests https://issues.apache.org/jira/browse/CSV-227 + */ +class JiraCsv227Test { + + @Test + public void test() throws IOException { + final StringBuilder out = new StringBuilder(); + try (CSVPrinter printer = new CSVPrinter(out, CSVFormat.DEFAULT.withQuoteMode(QuoteMode.MINIMAL))) { + printer.printRecord("ㅁㅎㄷㄹ", "ㅁㅎㄷㄹ", "", "test2"); + printer.printRecord("한글3", "hello3", "3한글3", "test3"); + printer.printRecord("", "hello4", "", "test4"); + } + // ㅁㅎㄷㄹ,ㅁㅎㄷㄹ,,test2 + // 한글3,hello3,3한글3,test3 + // "",hello4,,test4 + assertEquals("ㅁㅎㄷㄹ,ㅁㅎㄷㄹ,,test2\r\n한글3,hello3,3한글3,test3\r\n\"\",hello4,,test4\r\n", out.toString()); + } +}