diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml index 6c4aa3f57..cca38e512 100644 --- a/.github/workflows/codeql-analysis.yml +++ b/.github/workflows/codeql-analysis.yml @@ -46,10 +46,10 @@ jobs: steps: - name: Checkout repository - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0 with: persist-credentials: false - - uses: actions/cache@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3 + - uses: actions/cache@55cc8345863c7cc4c66a329aec7e433d2d1c52a9 #v6.1.0 with: path: ~/.m2/repository key: ${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }} @@ -58,7 +58,7 @@ jobs: # Initializes the CodeQL tools for scanning. - name: Initialize CodeQL - uses: github/codeql-action/init@89a39a4e59826350b863aa6b6252a07ad50cf83e # v4.32.4 + uses: github/codeql-action/init@8aad20d150bbac5944a9f9d289da16a4b0d87c1e # v4.36.2 with: languages: ${{ matrix.language }} # If you wish to specify custom queries, you can do so here or in a config file. @@ -69,7 +69,7 @@ jobs: # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). # If this step fails, then you should remove it and run the build manually (see below) - name: Autobuild - uses: github/codeql-action/autobuild@89a39a4e59826350b863aa6b6252a07ad50cf83e # v4.32.4 + uses: github/codeql-action/autobuild@8aad20d150bbac5944a9f9d289da16a4b0d87c1e # v4.36.2 # ℹ️ Command-line programs to run using the OS shell. # πŸ“š https://git.io/JvXDl @@ -83,4 +83,4 @@ jobs: # make release - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@89a39a4e59826350b863aa6b6252a07ad50cf83e # v4.32.4 + uses: github/codeql-action/analyze@8aad20d150bbac5944a9f9d289da16a4b0d87c1e # v4.36.2 diff --git a/.github/workflows/dependency-review.yml b/.github/workflows/dependency-review.yml index a04da5090..7bc02bdd2 100644 --- a/.github/workflows/dependency-review.yml +++ b/.github/workflows/dependency-review.yml @@ -26,6 +26,6 @@ jobs: runs-on: ubuntu-latest steps: - name: 'Checkout Repository' - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0 - name: 'Dependency Review PR' - uses: actions/dependency-review-action@3c4e3dcb1aa7874d2c16be7d79418e9b7efd6261 # v4.8.2 + uses: actions/dependency-review-action@a1d282b36b6f3519aa1f3fc636f609c47dddb294 # v5.0.0 diff --git a/.github/workflows/maven.yml b/.github/workflows/maven.yml index 518219a1d..17ba7dd38 100644 --- a/.github/workflows/maven.yml +++ b/.github/workflows/maven.yml @@ -15,7 +15,11 @@ name: Java CI -on: [push, pull_request] +on: + push: + branches: + - 'master' + pull_request: {} permissions: contents: read @@ -30,26 +34,26 @@ jobs: fail-fast: false matrix: os: [ubuntu-latest, macos-latest] - java: [ 8, 11, 17, 21, 25 ] + java: [ 8, 11, 17, 21, 25, 26 ] experimental: [false] # Keep the same parameter order as the matrix above include: - os: ubuntu-latest - java: 26-ea + java: 27-ea experimental: true steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0 with: persist-credentials: false - - uses: actions/cache@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3 + - uses: actions/cache@55cc8345863c7cc4c66a329aec7e433d2d1c52a9 #v6.1.0 with: path: ~/.m2/repository key: ${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }} restore-keys: | ${{ runner.os }}-maven- - name: Set up JDK ${{ matrix.java }} - uses: actions/setup-java@be666c2fcd27ec809703dec50e508c2fdc7f6654 # v5.2.0 + uses: actions/setup-java@1bcf9fb12cf4aa7d266a90ae39939e61372fe520 # v5.4.0 with: distribution: ${{ runner.os == 'macOS' && matrix.java == '8' && 'zulu' || 'temurin' }} java-version: ${{ matrix.java }} diff --git a/.github/workflows/scorecards-analysis.yml b/.github/workflows/scorecards-analysis.yml index f01ca3a11..e1868cb46 100644 --- a/.github/workflows/scorecards-analysis.yml +++ b/.github/workflows/scorecards-analysis.yml @@ -40,7 +40,7 @@ jobs: steps: - name: "Checkout code" - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # 6.0.1 + uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # 7.0.0 with: persist-credentials: false @@ -57,13 +57,13 @@ jobs: publish_results: true - name: "Upload artifact" - uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # 6.0.0 + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: name: SARIF file path: results.sarif retention-days: 5 - name: "Upload to code-scanning" - uses: github/codeql-action/upload-sarif@89a39a4e59826350b863aa6b6252a07ad50cf83e # v4.32.4 + uses: github/codeql-action/upload-sarif@8aad20d150bbac5944a9f9d289da16a4b0d87c1e # v4.36.2 with: sarif_file: results.sarif diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index eb15f2518..3423e18ad 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -48,13 +48,13 @@ Getting Started --------------- + Make sure you have a [JIRA account](https://issues.apache.org/jira/). -+ Make sure you have a [GitHub account](https://github.com/signup/free). This is not essential, but makes providing patches much easier. ++ Make sure you have a [GitHub account](https://github.com/signup). This is not essential, but makes providing patches much easier. + If you're planning to implement a new feature it makes sense to discuss your changes on the [dev list](https://commons.apache.org/mail-lists.html) first. This way you can make sure you're not wasting your time on something that isn't considered to be in Apache Commons CSV's scope. + Submit a [Jira Ticket][jira] for your issue, assuming one does not already exist. + Clearly describe the issue including steps to reproduce when it is a bug. + Make sure you fill in the earliest version that you know has the issue. + Find the corresponding [repository on GitHub](https://github.com/apache/?query=commons-), -[fork](https://help.github.com/articles/fork-a-repo/) and check out your forked repository. If you don't have a GitHub account, you can still clone the Commons repository. +[fork](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/fork-a-repo) and check out your forked repository. If you don't have a GitHub account, you can still clone the Commons repository. Making Changes -------------- @@ -108,8 +108,8 @@ Additional Resources + [Contributing patches](https://commons.apache.org/patches.html) + [Apache Commons CSV JIRA project page][jira] + [Contributor License Agreement][cla] -+ [General GitHub documentation](https://help.github.com/) -+ [GitHub pull request documentation](https://help.github.com/articles/creating-a-pull-request/) ++ [General GitHub documentation](https://docs.github.com/) ++ [GitHub pull request documentation](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-a-pull-request) + [Apache Commons Twitter Account](https://twitter.com/ApacheCommons) [cla]:https://www.apache.org/licenses/#clas diff --git a/pom.xml b/pom.xml index 0ade7e0d4..8cb13ed7c 100644 --- a/pom.xml +++ b/pom.xml @@ -20,10 +20,10 @@ org.apache.commons commons-parent - 96 + 102 commons-csv - 1.14.2-SNAPSHOT + 1.15.0-SNAPSHOT Apache Commons CSV https://commons.apache.org/proper/commons-csv/ 2005 @@ -59,6 +59,7 @@ com.h2database h2 + 2.2.224 test @@ -89,12 +90,12 @@ - 1.14.2 + 1.15.0 (Java 8 or above) RC1 1.14.1 - 1.14.3 + 1.15.1 csv org.apache.commons.csv CSV @@ -108,8 +109,8 @@ false true 2025-07-30T14:51:35Z - 1.21.0 - 2.21.0 + 1.22.0 + 2.22.0 org.apache.commons.codec.binary;version="${commons.codec.version}", diff --git a/src/changes/changes.xml b/src/changes/changes.xml index 2e24024c8..93952e9f1 100644 --- a/src/changes/changes.xml +++ b/src/changes/changes.xml @@ -40,18 +40,38 @@ Apache Commons CSV Release Notes - + Remove Spotbugs dependency and use exclude-filter instead #564. Remove broken website link #577. Fix Apache RAT plugin console warnings. + [Javadoc] Clarify behavior of deprecated CSVFormat#withFirstRecordAsHeader() #2413. + CSVFormat.equals()/hashCode() ignores maxRows (#600). + ExtendedBufferedReader byte tracking leads to an incorrect CSVRecord.getBytePosition() (#601). + CSVFormat.Builder.setQuote() does not refresh quotedNullString (#2447). + Lexer.isDelimiter() accepts a partial multi-character delimiter at EOF (#603). + CSVParser applies characterOffset to bytePosition (#604). + CSVPrinter Reader printing with quote and escape can emit CSV that its parser cannot read back. + CSVParser applies maxRows to record numbers instead of rows produced when setRecordNumber(...) is used. + CSVParser with trackBytes enabled throws on multi-character delimiters containing supplementary Unicode characters. + CSVFormat.Builder.setNullString(String) can build an invalid quoted null string after setQuote(null). + Escape Reader values with quote and escape (#606). + Clear escape delimiter buffer before peek in Lexer.isEscapeDelimiter() (#608, #611). + Escape quote char in printWithEscapes when QuoteMode is NONE (#609). + Quote value starting with comment marker in minimal quote mode (#610). + Escape leading comment marker in printWithEscapes (#614). + Skip byte counting at EOF in ExtendedBufferedReader.read (#615). + Keep quoted empty trailing field with trailingDelimiter (#616). + Evaluate isDelimiter once in nextToken whitespace skip (#618).. + Add an "Android Compatibility" section to the web site. + Add CSVParser.Builder.setByteOffset(long) (#604). - Bump org.apache.commons:commons-parent from 85 to 96 #573, #595. + Bump org.apache.commons:commons-parent from 85 to 102 #573, #595. [test] Bump com.opencsv:opencsv from 5.11.2 to 5.12.0 #558. Bump org.apache.commons:commons-lang3 from 3.18.0 to 3.20.0. - Bump commons-codec:commons-codec from 1.19.0 to 1.21.0. - Bump commons-io:commons-io from 2.20.0 to 2.21.0 #594. + Bump commons-codec:commons-codec from 1.19.0 to 1.22.0. + Bump commons-io:commons-io from 2.20.0 to 2.22.0 #594. diff --git a/src/main/java/org/apache/commons/csv/CSVFormat.java b/src/main/java/org/apache/commons/csv/CSVFormat.java index 7251f83dc..7145d23d3 100644 --- a/src/main/java/org/apache/commons/csv/CSVFormat.java +++ b/src/main/java/org/apache/commons/csv/CSVFormat.java @@ -780,8 +780,7 @@ public Builder setMaxRows(final long maxRows) { */ public Builder setNullString(final String nullString) { this.nullString = nullString; - this.quotedNullString = quoteCharacter + nullString + quoteCharacter; - return this; + return setQuotedNullString(); } /** @@ -806,6 +805,12 @@ public Builder setQuote(final Character quoteCharacter) { throw new IllegalArgumentException("The quoteCharacter cannot be a line break"); } this.quoteCharacter = quoteCharacter; + return setQuotedNullString(); + } + + private Builder setQuotedNullString() { + final Character quote = quoteCharacter != null ? quoteCharacter : Constants.DOUBLE_QUOTE_CHAR; + this.quotedNullString = quote + nullString + quote; return this; } @@ -878,6 +883,16 @@ public Builder setTrailingData(final boolean trailingData) { /** * Sets whether to add a trailing delimiter. * + *

+ * When writing, a delimiter is appended after the last value of each record. When reading, the empty field + * that such a trailing delimiter produces is dropped so the output round-trips back to the original record; + * a quoted empty trailing field ({@code ""}) is a real value rather than a trailing delimiter and is kept. + *

+ *

+ * This is unrelated to {@link #setTrailingData(boolean) trailing data}, which controls whether characters + * after the closing quote of an encapsulated value are tolerated when reading. + *

+ * * @param trailingDelimiter whether to add a trailing delimiter. * @return This instance. */ @@ -900,7 +915,7 @@ public Builder setTrim(final boolean trim) { } /** - * Predefines formats. + * Enumerates predefines formats. * * @since 1.2 */ @@ -1477,7 +1492,7 @@ private static boolean isLineBreak(final char c) { * @return true if {@code c} is a line break character (and not null). */ private static boolean isLineBreak(final Character c) { - return c != null && isLineBreak(c.charValue()); // Explicit (un)boxing is intentional + return c != null && isLineBreak(c.charValue()); // Explicit unboxing is intentional } /** Same test as in as {@link String#trim()}. */ @@ -1690,15 +1705,15 @@ public boolean equals(final Object obj) { duplicateHeaderMode == other.duplicateHeaderMode && Objects.equals(escapeCharacter, other.escapeCharacter) && Arrays.equals(headerComments, other.headerComments) && Arrays.equals(headers, other.headers) && ignoreEmptyLines == other.ignoreEmptyLines && ignoreHeaderCase == other.ignoreHeaderCase && - ignoreSurroundingSpaces == other.ignoreSurroundingSpaces && lenientEof == other.lenientEof && - Objects.equals(nullString, other.nullString) && Objects.equals(quoteCharacter, other.quoteCharacter) && - quoteMode == other.quoteMode && Objects.equals(quotedNullString, other.quotedNullString) && - Objects.equals(recordSeparator, other.recordSeparator) && skipHeaderRecord == other.skipHeaderRecord && - trailingData == other.trailingData && trailingDelimiter == other.trailingDelimiter && trim == other.trim; + ignoreSurroundingSpaces == other.ignoreSurroundingSpaces && lenientEof == other.lenientEof && maxRows == other.maxRows && + Objects.equals(nullString, other.nullString) && Objects.equals(quoteCharacter, other.quoteCharacter) && quoteMode == other.quoteMode && + Objects.equals(quotedNullString, other.quotedNullString) && Objects.equals(recordSeparator, other.recordSeparator) && + skipHeaderRecord == other.skipHeaderRecord && trailingData == other.trailingData && trailingDelimiter == other.trailingDelimiter && + trim == other.trim; } private void escape(final char c, final Appendable appendable) throws IOException { - append(escapeCharacter.charValue(), appendable); // Explicit (un)boxing is intentional + append(escapeCharacter.charValue(), appendable); // Explicit unboxing is intentional append(c, appendable); } @@ -1836,7 +1851,7 @@ public DuplicateHeaderMode getDuplicateHeaderMode() { * @return the escape character, may be {@code 0} */ char getEscapeChar() { - return escapeCharacter != null ? escapeCharacter.charValue() : 0; // Explicit (un)boxing is intentional + return escapeCharacter != null ? escapeCharacter.charValue() : 0; // Explicit unboxing is intentional } /** @@ -2007,6 +2022,16 @@ public boolean getTrailingData() { /** * Gets whether to add a trailing delimiter. * + *

+ * When writing, a delimiter is appended after the last value of each record. When reading, the empty field + * that such a trailing delimiter produces is dropped so the output round-trips back to the original record; + * a quoted empty trailing field ({@code ""}) is a real value rather than a trailing delimiter and is kept. + *

+ *

+ * This is unrelated to {@link #getTrailingData() trailing data}, which controls whether characters after the + * closing quote of an encapsulated value are tolerated when reading. + *

+ * * @return whether to add a trailing delimiter. * @since 1.3 */ @@ -2029,9 +2054,10 @@ public int hashCode() { int result = 1; result = prime * result + Arrays.hashCode(headerComments); result = prime * result + Arrays.hashCode(headers); - return prime * result + Objects.hash(allowMissingColumnNames, autoFlush, commentMarker, delimiter, duplicateHeaderMode, escapeCharacter, - ignoreEmptyLines, ignoreHeaderCase, ignoreSurroundingSpaces, lenientEof, nullString, quoteCharacter, quoteMode, quotedNullString, + result = prime * result + Objects.hash(allowMissingColumnNames, autoFlush, commentMarker, delimiter, duplicateHeaderMode, escapeCharacter, + ignoreEmptyLines, ignoreHeaderCase, ignoreSurroundingSpaces, lenientEof, maxRows, nullString, quoteCharacter, quoteMode, quotedNullString, recordSeparator, skipHeaderRecord, trailingData, trailingDelimiter, trim); + return result; } /** @@ -2158,7 +2184,7 @@ private void print(final InputStream inputStream, final Appendable out, final bo } final boolean quoteCharacterSet = isQuoteCharacterSet(); if (quoteCharacterSet) { - append(getQuoteCharacter().charValue(), out); // Explicit (un)boxing is intentional + append(getQuoteCharacter().charValue(), out); // Explicit unboxing is intentional } // Stream the input to the output without reading or holding the whole value in memory. // AppendableOutputStream cannot "close" an Appendable. @@ -2166,7 +2192,7 @@ private void print(final InputStream inputStream, final Appendable out, final bo IOUtils.copy(inputStream, outputStream); } if (quoteCharacterSet) { - append(getQuoteCharacter().charValue(), out); // Explicit (un)boxing is intentional + append(getQuoteCharacter().charValue(), out); // Explicit unboxing is intentional } } @@ -2321,12 +2347,18 @@ private void printWithEscapes(final CharSequence charSeq, final Appendable appen final char[] delimArray = getDelimiterCharArray(); final int delimLength = delimArray.length; final char escape = getEscapeChar(); + final boolean quoteSet = isQuoteCharacterSet(); + final char quote = quoteSet ? getQuoteCharacter().charValue() : 0; + final boolean commentMarkerSet = isCommentMarkerSet(); + final char commentChar = commentMarkerSet ? commentMarker.charValue() : 0; // Explicit unboxing is intentional while (pos < end) { char c = charSeq.charAt(pos); final boolean isDelimiterStart = isDelimiter(c, charSeq, pos, delimArray, delimLength); final boolean isCr = c == Constants.CR; final boolean isLf = c == Constants.LF; - if (isCr || isLf || c == escape || isDelimiterStart) { + // A leading comment marker would be read back as a comment, so escape it. + final boolean isComment = commentMarkerSet && pos == 0 && c == commentChar; + if (isCr || isLf || c == escape || quoteSet && c == quote || isDelimiterStart || isComment) { // write out segment up until this char if (pos > start) { appendable.append(charSeq, start, pos); @@ -2365,8 +2397,13 @@ private void printWithEscapes(final Reader reader, final Appendable appendable) final char[] delimArray = getDelimiterCharArray(); final int delimLength = delimArray.length; final char escape = getEscapeChar(); + final boolean quoteSet = isQuoteCharacterSet(); + final char quote = quoteSet ? getQuoteCharacter().charValue() : 0; + final boolean commentMarkerSet = isCommentMarkerSet(); + final char commentChar = commentMarkerSet ? commentMarker.charValue() : 0; // Explicit unboxing is intentional final StringBuilder builder = new StringBuilder(IOUtils.DEFAULT_BUFFER_SIZE); int c; + boolean firstChar = true; final char[] lookAheadBuffer = new char[delimLength - 1]; while (EOF != (c = bufferedReader.read())) { builder.append((char) c); @@ -2376,7 +2413,10 @@ private void printWithEscapes(final Reader reader, final Appendable appendable) final boolean isDelimiterStart = isDelimiter((char) c, test, pos, delimArray, delimLength); final boolean isCr = c == Constants.CR; final boolean isLf = c == Constants.LF; - if (isCr || isLf || c == escape || isDelimiterStart) { + // A leading comment marker would be read back as a comment, so escape it. + final boolean isComment = commentMarkerSet && firstChar && c == commentChar; + firstChar = false; + if (isCr || isLf || c == escape || quoteSet && c == quote || isDelimiterStart || isComment) { // write out segment up until this char if (pos > start) { append(builder.substring(start, pos), appendable); @@ -2415,7 +2455,7 @@ private void printWithQuotes(final Object object, final CharSequence charSeq, fi final int len = charSeq.length(); final char[] delim = getDelimiterCharArray(); final int delimLength = delim.length; - final char quoteChar = getQuoteCharacter().charValue(); // Explicit (un)boxing is intentional + final char quoteChar = getQuoteCharacter().charValue(); // Explicit unboxing is intentional // If escape char not specified, default to the quote char // This avoids having to keep checking whether there is an escape character // at the cost of checking against quote twice @@ -2447,10 +2487,11 @@ private void printWithQuotes(final Object object, final CharSequence charSeq, fi } } else { char c = charSeq.charAt(pos); - if (c <= Constants.COMMENT) { + if (c <= Constants.COMMENT || isCommentMarkerSet() && c == commentMarker.charValue()) { // Some other chars at the start of a value caused the parser to fail, so for now // encapsulate if we start in anything less than '#'. We are being conservative - // by including the default comment char too. + // by including the default comment char and any configured comment marker too, + // which the parser would otherwise read back as a comment line. quote = true; } else { while (pos < len) { @@ -2518,15 +2559,16 @@ private void printWithQuotes(final Reader reader, final Appendable appendable) t printWithEscapes(reader, appendable); return; } - final char quote = getQuoteCharacter().charValue(); // Explicit (un)boxing is intentional + final char quote = getQuoteCharacter().charValue(); // Explicit unboxing is intentional + final char escape = isEscapeCharacterSet() ? getEscapeChar() : quote; // (1) Append opening quote append(quote, appendable); - // (2) Append Reader contents, doubling quotes + // (2) Append Reader contents, doubling quotes and escape characters int c; while (EOF != (c = reader.read())) { append((char) c, appendable); - if (c == quote) { - append(quote, appendable); + if (c == quote || c == escape) { + append((char) c, appendable); } } // (3) Append closing quote @@ -2604,13 +2646,13 @@ boolean useRow(final long rowNum) { * @throws IllegalArgumentException Throw when any attribute is invalid or inconsistent with other attributes. */ private void validate() throws IllegalArgumentException { - if (quoteCharacter != null && contains(delimiter, quoteCharacter.charValue())) { // Explicit (un)boxing is intentional + if (quoteCharacter != null && contains(delimiter, quoteCharacter.charValue())) { // Explicit unboxing is intentional throw new IllegalArgumentException("The quoteChar character and the delimiter cannot be the same ('" + quoteCharacter + "')"); } - if (escapeCharacter != null && contains(delimiter, escapeCharacter.charValue())) { // Explicit (un)boxing is intentional + if (escapeCharacter != null && contains(delimiter, escapeCharacter.charValue())) { // Explicit unboxing is intentional throw new IllegalArgumentException("The escape character and the delimiter cannot be the same ('" + escapeCharacter + "')"); } - if (commentMarker != null && contains(delimiter, commentMarker.charValue())) { // Explicit (un)boxing is intentional + if (commentMarker != null && contains(delimiter, commentMarker.charValue())) { // Explicit unboxing is intentional throw new IllegalArgumentException("The comment start character and the delimiter cannot be the same ('" + commentMarker + "')"); } if (quoteCharacter != null && quoteCharacter.equals(commentMarker)) { @@ -2788,6 +2830,9 @@ public CSVFormat withEscape(final Character escape) { * .get(); * * + *

Any previously set headers are reset to empty. + * The resulting format will have {@code skipHeaderRecord = true}.

+ * * @return A new CSVFormat that is equal to this but using the first record as header. * @see Builder#setSkipHeaderRecord(boolean) * @see Builder#setHeader(String...) diff --git a/src/main/java/org/apache/commons/csv/CSVParser.java b/src/main/java/org/apache/commons/csv/CSVParser.java index bce62ea54..141eba732 100644 --- a/src/main/java/org/apache/commons/csv/CSVParser.java +++ b/src/main/java/org/apache/commons/csv/CSVParser.java @@ -154,6 +154,7 @@ public final class CSVParser implements Iterable, Closeable { public static class Builder extends AbstractStreamBuilder { private CSVFormat format; + private long byteOffset = -1; private long characterOffset; private long recordNumber = 1; private boolean trackBytes; @@ -165,17 +166,33 @@ protected Builder() { // empty } - @SuppressWarnings("resource") @Override public CSVParser get() throws IOException { - return new CSVParser(getReader(), format != null ? format : CSVFormat.DEFAULT, characterOffset, recordNumber, getCharset(), trackBytes); + return new CSVParser(this); } /** - * Sets the lexer offset when the parser does not start parsing at the beginning of the source. + * Sets the lexer byte offset when the parser does not start parsing at the beginning of the source. + *

+ * By default, the value is {@code -1}, which reuses the character offset for the byte offset. + *

* - * @param characterOffset the lexer offset. + * @param byteOffset the lexer byte offset. * @return {@code this} instance. + * @see #setCharacterOffset(long) + * @since 1.15.0 + */ + public Builder setByteOffset(final long byteOffset) { + this.byteOffset = byteOffset; + return asThis(); + } + + /** + * Sets the lexer character offset when the parser does not start parsing at the beginning of the source. + * + * @param characterOffset the lexer character offset. + * @return {@code this} instance. + * @see #setByteOffset(long) */ public Builder setCharacterOffset(final long characterOffset) { this.characterOffset = characterOffset; @@ -220,6 +237,7 @@ public Builder setTrackBytes(final boolean trackBytes) { final class CSVRecordIterator implements Iterator { private CSVRecord current; + private long recordCount; /** * Gets the next record or null at the end of stream or max rows read. @@ -230,8 +248,11 @@ final class CSVRecordIterator implements Iterator { */ private CSVRecord getNextRecord() { CSVRecord record = null; - if (format.useRow(recordNumber + 1)) { + if (format.useRow(recordCount + 1)) { record = Uncheck.get(CSVParser.this::nextRecord); + if (record != null) { + recordCount++; + } } return record; } @@ -466,6 +487,12 @@ public static CSVParser parse(final URL url, final Charset charset, final CSVFor */ private long recordNumber; + /** + * Lexer offset when the parser does not start parsing at the beginning of the source. Usually used in combination + * with {@link #recordNumber}. + */ + private final long byteOffset; + /** * Lexer offset when the parser does not start parsing at the beginning of the source. Usually used in combination * with {@link #recordNumber}. @@ -474,6 +501,23 @@ public static CSVParser parse(final URL url, final Charset charset, final CSVFor private final Token reusableToken = new Token(); + /** + * Constructs a new instance from a builder. + * + * @param builder The source builder. + * @throws IOException if an I/O error occurs. + */ + @SuppressWarnings("resource") // Lexer manages ExtendedBufferedReader. + private CSVParser(final Builder builder) throws IOException { + this.format = (builder.format != null ? builder.format : CSVFormat.DEFAULT).copy(); + this.lexer = new Lexer(format, new ExtendedBufferedReader(builder.getReader(), builder.getCharset(), builder.trackBytes)); + this.csvRecordIterator = new CSVRecordIterator(); + this.headers = createHeaders(); + this.byteOffset = builder.byteOffset != -1 ? builder.byteOffset : builder.characterOffset; + this.characterOffset = builder.characterOffset; + this.recordNumber = builder.recordNumber - 1; + } + /** * Constructs a new instance using the given {@link CSVFormat}. * @@ -524,51 +568,21 @@ public CSVParser(final Reader reader, final CSVFormat format) throws IOException */ @Deprecated public CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber) throws IOException { - this(reader, format, characterOffset, recordNumber, null, false); - } - - /** - * Constructs a new instance using the given {@link CSVFormat}. - * - *

- * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser, - * unless you close the {@code reader}. - *

- * - * @param reader - * a Reader containing CSV-formatted input. Must not be null. - * @param format - * the CSVFormat used for CSV parsing. Must not be null. - * @param characterOffset - * Lexer offset when the parser does not start parsing at the beginning of the source. - * @param recordNumber - * The next record number to assign. - * @param charset - * The character encoding to be used for the reader when enableByteTracking is true. - * @param trackBytes - * {@code true} to enable byte tracking for the parser; {@code false} to disable it. - * @throws IllegalArgumentException - * If the parameters of the format are inconsistent or if either the reader or format is null. - * @throws IOException - * If there is a problem reading the header or skipping the first record. - * @throws CSVException Thrown on invalid CSV input data. - */ - @SuppressWarnings("resource") // reader is managed by lexer. - private CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber, final Charset charset, - final boolean trackBytes) throws IOException { - Objects.requireNonNull(reader, "reader"); - Objects.requireNonNull(format, "format"); - this.format = format.copy(); - this.lexer = new Lexer(format, new ExtendedBufferedReader(reader, charset, trackBytes)); - this.csvRecordIterator = new CSVRecordIterator(); - this.headers = createHeaders(); - this.characterOffset = characterOffset; - this.recordNumber = recordNumber - 1; + // @formatter:off + this(builder() + .setReader(reader) + .setFormat(Objects.requireNonNull(format, "format")) // requireNonNull for full compatibility + .setCharacterOffset(characterOffset) + .setRecordNumber(recordNumber) + .setCharset((Charset) null).setTrackBytes(false)); + // @formatter:off } private void addRecordValue(final boolean lastRecord) { final String input = format.trim(reusableToken.content.toString()); - if (lastRecord && input.isEmpty() && format.getTrailingDelimiter()) { + // Only drop the empty field produced by an actual trailing delimiter. A quoted empty + // field ("") is a real value, not a trailing delimiter, so it must be kept. + if (lastRecord && input.isEmpty() && format.getTrailingDelimiter() && !reusableToken.isQuoted) { return; } recordList.add(handleNull(input)); @@ -642,7 +656,7 @@ private Headers createHeaders() throws IOException { } observedMissing |= blankHeader; if (header != null) { - headerMap.put(header, Integer.valueOf(i)); // Explicit (un)boxing is intentional + headerMap.put(header, Integer.valueOf(i)); // Explicit boxing is intentional if (headerNames == null) { headerNames = new ArrayList<>(headerRecord.length); } @@ -887,7 +901,7 @@ CSVRecord nextRecord() throws IOException { recordList.clear(); StringBuilder sb = null; final long startCharPosition = lexer.getCharacterPosition() + characterOffset; - final long startBytePosition = lexer.getBytesRead() + characterOffset; + final long startBytePosition = lexer.getBytesRead() + byteOffset; do { reusableToken.reset(); lexer.nextToken(reusableToken); diff --git a/src/main/java/org/apache/commons/csv/CSVPrinter.java b/src/main/java/org/apache/commons/csv/CSVPrinter.java index 087129ec5..a7048fd62 100644 --- a/src/main/java/org/apache/commons/csv/CSVPrinter.java +++ b/src/main/java/org/apache/commons/csv/CSVPrinter.java @@ -235,7 +235,7 @@ public void printComment(final String comment) throws IOException { if (!newRecord) { println(); } - appendable.append(format.getCommentMarker().charValue()); // Explicit (un)boxing is intentional + appendable.append(format.getCommentMarker().charValue()); // Explicit unboxing is intentional appendable.append(SP); for (int i = 0; i < comment.length(); i++) { final char c = comment.charAt(i); @@ -247,7 +247,7 @@ public void printComment(final String comment) throws IOException { // falls-through: break intentionally excluded. case LF: println(); - appendable.append(format.getCommentMarker().charValue()); // Explicit (un)boxing is intentional + appendable.append(format.getCommentMarker().charValue()); // Explicit unboxing is intentional appendable.append(SP); break; default: diff --git a/src/main/java/org/apache/commons/csv/CSVRecord.java b/src/main/java/org/apache/commons/csv/CSVRecord.java index f619717d0..8dab14d90 100644 --- a/src/main/java/org/apache/commons/csv/CSVRecord.java +++ b/src/main/java/org/apache/commons/csv/CSVRecord.java @@ -132,13 +132,11 @@ public String get(final String name) { throw new IllegalArgumentException(String.format("Mapping for %s not found, expected one of %s", name, headerMap.keySet())); } try { - return values[index.intValue()]; // Explicit (un)boxing is intentional + return values[index.intValue()]; // Explicit unboxing is intentional } catch (final ArrayIndexOutOfBoundsException e) { + // Explicit boxing is intentional throw new IllegalArgumentException( - String.format("Index for header '%s' is %d but CSVRecord only has %d values!", name, index, Integer.valueOf(values.length))); // Explicit - // (un)boxing - // is - // intentional + String.format("Index for header '%s' is %d but CSVRecord only has %d values!", name, index, Integer.valueOf(values.length))); } } @@ -267,7 +265,7 @@ public boolean isSet(final int index) { * @return whether a given column is mapped and has a value. */ public boolean isSet(final String name) { - return isMapped(name) && getHeaderMapRaw().get(name).intValue() < values.length; // Explicit (un)boxing is intentional + return isMapped(name) && getHeaderMapRaw().get(name).intValue() < values.length; // Explicit unboxing is intentional } /** @@ -283,7 +281,7 @@ public Iterator iterator() { /** * Puts all values of this record into the given Map. * - * @param the map type. + * @param The map type. * @param map The Map to populate. * @return the given map. * @since 1.9.0 diff --git a/src/main/java/org/apache/commons/csv/Constants.java b/src/main/java/org/apache/commons/csv/Constants.java index 0b9476e1c..9dd276ecc 100644 --- a/src/main/java/org/apache/commons/csv/Constants.java +++ b/src/main/java/org/apache/commons/csv/Constants.java @@ -40,7 +40,7 @@ final class Constants { /** RFC 4180 defines line breaks as CRLF. */ static final String CRLF = "\r\n"; - static final Character DOUBLE_QUOTE_CHAR = Character.valueOf('"'); // Explicit (un)boxing is intentional. + static final Character DOUBLE_QUOTE_CHAR = Character.valueOf('"'); // Explicit boxing is intentional. static final String EMPTY = ""; diff --git a/src/main/java/org/apache/commons/csv/DuplicateHeaderMode.java b/src/main/java/org/apache/commons/csv/DuplicateHeaderMode.java index 01989d664..8087f16ee 100644 --- a/src/main/java/org/apache/commons/csv/DuplicateHeaderMode.java +++ b/src/main/java/org/apache/commons/csv/DuplicateHeaderMode.java @@ -20,7 +20,7 @@ package org.apache.commons.csv; /** - * Determines how duplicate header fields should be handled + * Enumerates how duplicate header fields should be handled * if {@link CSVFormat.Builder#setHeader(Class)} is not null. * * @since 1.10.0 diff --git a/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java b/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java index 8dcda6517..20c1ef544 100644 --- a/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java +++ b/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java @@ -37,26 +37,30 @@ /** * A special buffered reader which supports sophisticated read access. *

- * In particular the reader supports a look-ahead option, which allows you to see the next char returned by - * {@link #read()}. This reader also tracks how many characters have been read with {@link #getPosition()}. + * In particular the reader supports a look-ahead option, which allows you to see the next char returned by {@link #read()}. This reader also tracks how many + * characters have been read with {@link #getPosition()}. *

*/ final class ExtendedBufferedReader extends UnsynchronizedBufferedReader { /** The last char returned */ private int lastChar = UNDEFINED; + private int lastCharMark = UNDEFINED; /** The count of EOLs (CR/LF/CRLF) seen so far */ private long lineNumber; + private long lineNumberMark; /** The position, which is the number of characters read so far */ private long position; + private long positionMark; /** The number of bytes read so far. */ private long bytesRead; + private long bytesReadMark; /** Encoder for calculating the number of bytes for each character read. */ @@ -70,12 +74,11 @@ final class ExtendedBufferedReader extends UnsynchronizedBufferedReader { } /** - * Constructs a new instance with the specified reader, character set, - * and byte tracking option. Initializes an encoder if byte tracking is enabled - * and a character set is provided. + * Constructs a new instance with the specified reader, character set, and byte tracking option. Initializes an encoder if byte tracking is enabled and a + * character set is provided. * - * @param reader the reader supports a look-ahead option. - * @param charset the character set for encoding, or {@code null} if not applicable. + * @param reader the reader supports a look-ahead option. + * @param charset the character set for encoding, or {@code null} if not applicable. * @param trackBytes {@code true} to enable byte tracking; {@code false} to disable it. */ ExtendedBufferedReader(final Reader reader, final Charset charset, final boolean trackBytes) { @@ -86,8 +89,7 @@ final class ExtendedBufferedReader extends UnsynchronizedBufferedReader { /** * Closes the stream. * - * @throws IOException - * If an I/O error occurs + * @throws IOException If an I/O error occurs */ @Override public void close() throws IOException { @@ -105,26 +107,35 @@ long getBytesRead() { return this.bytesRead; } + private long getEncodedCharLength(final char[] buf, final int offset, final int length) throws CharacterCodingException { + long len = 0; + int previous = lastChar; + for (int i = offset; i < offset + length; i++) { + len += getEncodedCharLength(previous, buf[i]); + previous = buf[i]; + } + return len; + } + /** - * Gets the byte length of the given character based on the original Unicode - * specification, which defined characters as fixed-width 16-bit entities. + * Gets the byte length of the given character based on the original Unicode specification, which defined characters as fixed-width 16-bit entities. *

* The Unicode characters are divided into two main ranges: *

    - *
  • U+0000 to U+FFFF (Basic Multilingual Plane, BMP): - *
      - *
    • Represented using a single 16-bit {@code char}.
    • - *
    • Includes UTF-8 encodings of 1-byte, 2-byte, and some 3-byte characters.
    • - *
    - *
  • - *
  • U+10000 to U+10FFFF (Supplementary Characters): - *
      - *
    • Represented as a pair of {@code char}s:
    • - *
    • The first {@code char} is from the high-surrogates range (\uD800-\uDBFF).
    • - *
    • The second {@code char} is from the low-surrogates range (\uDC00-\uDFFF).
    • - *
    • Includes UTF-8 encodings of some 3-byte characters and all 4-byte characters.
    • - *
    - *
  • + *
  • U+0000 to U+FFFF (Basic Multilingual Plane, BMP): + *
      + *
    • Represented using a single 16-bit {@code char}.
    • + *
    • Includes UTF-8 encodings of 1-byte, 2-byte, and some 3-byte characters.
    • + *
    + *
  • + *
  • U+10000 to U+10FFFF (Supplementary Characters): + *
      + *
    • Represented as a pair of {@code char}s:
    • + *
    • The first {@code char} is from the high-surrogates range (\uD800-\uDBFF).
    • + *
    • The second {@code char} is from the low-surrogates range (\uDC00-\uDFFF).
    • + *
    • Includes UTF-8 encodings of some 3-byte characters and all 4-byte characters.
    • + *
    + *
  • *
* * @param current the current character to process. @@ -132,8 +143,12 @@ long getBytesRead() { * @throws CharacterCodingException if the character cannot be encoded. */ private int getEncodedCharLength(final int current) throws CharacterCodingException { + return getEncodedCharLength(lastChar, current); + } + + private int getEncodedCharLength(final int previous, final int current) throws CharacterCodingException { final char cChar = (char) current; - final char lChar = (char) lastChar; + final char lChar = (char) previous; if (!Character.isSurrogate(cChar)) { return encoder.encode(CharBuffer.wrap(new char[] { cChar })).limit(); } @@ -148,10 +163,9 @@ private int getEncodedCharLength(final int current) throws CharacterCodingExcept } /** - * Returns the last character that was read as an integer (0 to 65535). This will be the last character returned by - * any of the read methods. This will not include a character read using the {@link #peek()} method. If no - * character has been read then this will return {@link Constants#UNDEFINED}. If the end of the stream was reached - * on the last read then this will return {@link IOUtils#EOF}. + * Returns the last character that was read as an integer (0 to 65535). This will be the last character returned by any of the read methods. This will not + * include a character read using the {@link #peek()} method. If no character has been read then this will return {@link Constants#UNDEFINED}. If the end of + * the stream was reached on the last read then this will return {@link IOUtils#EOF}. * * @return the last character that was read */ @@ -193,11 +207,10 @@ public void mark(final int readAheadLimit) throws IOException { @Override public int read() throws IOException { final int current = super.read(); - if (current == CR || current == LF && lastChar != CR || - current == EOF && lastChar != CR && lastChar != LF && lastChar != EOF) { + if (current == CR || current == LF && lastChar != CR || current == EOF && lastChar != CR && lastChar != LF && lastChar != EOF) { lineNumber++; } - if (encoder != null) { + if (encoder != null && current != EOF) { this.bytesRead += getEncodedCharLength(current); } lastChar = current; @@ -211,6 +224,9 @@ public int read(final char[] buf, final int offset, final int length) throws IOE return 0; } final int len = super.read(buf, offset, length); + if (encoder != null && len > 0) { + this.bytesRead += getEncodedCharLength(buf, offset, len); + } if (len > 0) { for (int i = offset; i < offset + len; i++) { final char ch = buf[i]; @@ -231,8 +247,7 @@ public int read(final char[] buf, final int offset, final int length) throws IOE } /** - * Gets the next line, dropping the line terminator(s). This method should only be called when processing a - * comment, otherwise, information can be lost. + * Gets the next line, dropping the line terminator(s). This method should only be called when processing a comment, otherwise, information can be lost. *

* Increments {@link #lineNumber} and updates {@link #position}. *

@@ -272,5 +287,4 @@ public void reset() throws IOException { bytesRead = bytesReadMark; super.reset(); } - } diff --git a/src/main/java/org/apache/commons/csv/Lexer.java b/src/main/java/org/apache/commons/csv/Lexer.java index 3d00fe0bf..fe964480a 100644 --- a/src/main/java/org/apache/commons/csv/Lexer.java +++ b/src/main/java/org/apache/commons/csv/Lexer.java @@ -23,6 +23,7 @@ import java.io.Closeable; import java.io.IOException; +import java.util.Arrays; import org.apache.commons.io.IOUtils; @@ -68,8 +69,8 @@ final class Lexer implements Closeable { /** * Appends the next escaped character to the token's content. * - * @param token the current token - * @throws IOException on stream access error + * @param token the current token. + * @throws IOException on stream access error. * @throws CSVException Thrown on invalid input. */ private void appendNextEscapedCharacterToToken(final Token token) throws IOException { @@ -89,7 +90,7 @@ private void appendNextEscapedCharacterToToken(final Token token) throws IOExcep * Closes resources. * * @throws IOException - * If an I/O error occurs + * If an I/O error occurs. */ @Override public void close() throws IOException { @@ -97,27 +98,27 @@ public void close() throws IOException { } /** - * Gets the number of bytes read + * Gets the number of bytes read. * - * @return the number of bytes read + * @return the number of bytes read. */ long getBytesRead() { return reader.getBytesRead(); } /** - * Returns the current character position + * Gets the current character position. * - * @return the current character position + * @return the current character position. */ long getCharacterPosition() { return reader.getPosition(); } /** - * Returns the current line number + * Gets the current line number. * - * @return the current line number + * @return the current line number. */ long getCurrentLineNumber() { return reader.getLineNumber(); @@ -136,7 +137,7 @@ boolean isCommentStart(final int ch) { } /** - * Determine whether the next characters constitute a delimiter through {@link ExtendedBufferedReader#peek(char[])}. + * Tests whether the next characters constitute a delimiter through {@link ExtendedBufferedReader#peek(char[])}. * * @param ch * the current character. @@ -152,6 +153,7 @@ boolean isDelimiter(final int ch) throws IOException { isLastTokenDelimiter = true; return true; } + Arrays.fill(delimiterBuf, '\0'); reader.peek(delimiterBuf); for (int i = 0; i < delimiterBuf.length; i++) { if (delimiterBuf[i] != delimiter[i + 1]) { @@ -190,6 +192,7 @@ boolean isEscape(final int ch) { * @throws IOException If an I/O error occurs. */ boolean isEscapeDelimiter() throws IOException { + Arrays.fill(escapeDelimiterBuf, '\0'); reader.peek(escapeDelimiterBuf); if (escapeDelimiterBuf[0] != delimiter[0]) { return false; @@ -214,7 +217,7 @@ boolean isQuoteChar(final int ch) { /** * Tests if the current character represents the start of a line: a CR, LF, or is at the start of the file. * - * @param ch the character to check + * @param ch the character to check. * @return true if the character is at the start of a line. */ boolean isStartOfLine(final int ch) { @@ -274,15 +277,22 @@ Token nextToken(final Token token) throws IOException { } // Important: make sure a new char gets consumed in each iteration while (token.type == Token.Type.INVALID) { + // isDelimiter consumes the trailing characters of a multi-character delimiter as a side effect, so it must + // only be evaluated once per character. Remember a match found while skipping whitespace below. + boolean delimiter = false; // ignore whitespaces at beginning of a token if (ignoreSurroundingSpaces) { - while (Character.isWhitespace((char) c) && !isDelimiter(c) && !eol) { + while (Character.isWhitespace((char) c) && !eol) { + if (isDelimiter(c)) { + delimiter = true; + break; + } c = reader.read(); eol = readEndOfLine(c); } } // ok, start of token reached: encapsulated, or token - if (isDelimiter(c)) { + if (delimiter || isDelimiter(c)) { // empty token return TOKEN("") token.type = Token.Type.TOKEN; } else if (eol) { @@ -400,10 +410,10 @@ private Token parseEncapsulatedToken(final Token token) throws IOException { *
  • An unescaped delimiter has been reached (TOKEN)
  • * * - * @param token the current token - * @param ch the current character - * @return the filled token - * @throws IOException on stream access error + * @param token the current token. + * @param ch the current character. + * @return the filled token. + * @throws IOException on stream access error. * @throws CSVException Thrown on invalid input. */ private Token parseSimpleToken(final Token token, final int ch) throws IOException { @@ -442,7 +452,7 @@ private Token parseSimpleToken(final Token token, final int ch) throws IOExcepti /** * Greedily accepts \n, \r and \r\n This checker consumes silently the second control-character... * - * @return true if the given or next character is a line-terminator + * @return true if the given or next character is a line-terminator. */ boolean readEndOfLine(final int ch) throws IOException { // check if we have \r\n... diff --git a/src/main/java/org/apache/commons/csv/QuoteMode.java b/src/main/java/org/apache/commons/csv/QuoteMode.java index 79bb1b34e..ae64ab486 100644 --- a/src/main/java/org/apache/commons/csv/QuoteMode.java +++ b/src/main/java/org/apache/commons/csv/QuoteMode.java @@ -19,7 +19,7 @@ package org.apache.commons.csv; /** - * Defines quoting behavior. + * Enumerates quoting behavior. * * @see CSVFormat.Builder#setQuoteMode(QuoteMode) */ diff --git a/src/media/commons-logo-component-100.xcf b/src/media/commons-logo-component-100.xcf new file mode 100644 index 000000000..77d92f277 Binary files /dev/null and b/src/media/commons-logo-component-100.xcf differ diff --git a/src/media/commons-logo-component.xcf b/src/media/commons-logo-component.xcf new file mode 100644 index 000000000..3670221da Binary files /dev/null and b/src/media/commons-logo-component.xcf differ diff --git a/src/media/logo-large.xcf b/src/media/logo-large.xcf deleted file mode 100644 index 7bb07af3e..000000000 Binary files a/src/media/logo-large.xcf and /dev/null differ diff --git a/src/media/logo.png b/src/media/logo.png index 582fba980..93bb6c014 100644 Binary files a/src/media/logo.png and b/src/media/logo.png differ diff --git a/src/media/logo.xcf b/src/media/logo.xcf deleted file mode 100644 index ac6376f71..000000000 Binary files a/src/media/logo.xcf and /dev/null differ diff --git a/src/site/resources/images/logo.png b/src/site/resources/images/logo.png index 582fba980..93bb6c014 100644 Binary files a/src/site/resources/images/logo.png and b/src/site/resources/images/logo.png differ diff --git a/src/site/xdoc/index.xml b/src/site/xdoc/index.xml index af49476f9..ac5b8cfa9 100644 --- a/src/site/xdoc/index.xml +++ b/src/site/xdoc/index.xml @@ -24,12 +24,10 @@ limitations under the License. -

    Commons CSV reads and writes files in variations of the Comma Separated Value (CSV) format.

    Read the documentation starting with the Javadoc Overview.

    -

    An overview of the functionality is provided in the @@ -48,7 +46,6 @@ The git repository can be browsed.

    -
    -

    The latest code can be checked out from our git repository at https://gitbox.apache.org/repos/asf/commons-csv.git. You can build the component using Apache Maven using mvn clean package.

    - +
    +

    + Apache Commons CSV requires Java 8 or above. +

    + + + + + + + + + + + + + + + +
    Commons CSVJavaAndroid
    1.10.0+8Android 7.0 (API level 24)
    +

    The commons developer mailing list is the main channel of communication for contributors. Please remember that the lists are shared between all commons components, so prefix your email by [csv].

    @@ -87,7 +103,6 @@ For previous releases, see the TagList report.

    If you'd like to offer up pull requests via GitHub rather than applying patches to JIRA, we have a GitHub mirror.

    -

    The commons mailing lists act as the main support forum. @@ -101,8 +116,6 @@ For previous releases, see the

    Commons CSV was started to unify a common and simple interface for reading and writing CSV files under an ASL license. It has been bootstrapped by a code donation from Netcetera in Switzerland. There are three pre-existing BSD compatible CSV parsers which this component will hopefully make redundant (authors willing):

    - - diff --git a/src/site/xdoc/security.xml b/src/site/xdoc/security.xml index ab0056049..47edf5d11 100644 --- a/src/site/xdoc/security.xml +++ b/src/site/xdoc/security.xml @@ -47,5 +47,10 @@

    None.

    +
    +

    + For information about safe deserialization, please see Safe Deserialization. +

    +
    \ No newline at end of file diff --git a/src/test/java/org/apache/commons/csv/CSVDuplicateHeaderTest.java b/src/test/java/org/apache/commons/csv/CSVDuplicateHeaderTest.java index cc47b999a..2f518a120 100644 --- a/src/test/java/org/apache/commons/csv/CSVDuplicateHeaderTest.java +++ b/src/test/java/org/apache/commons/csv/CSVDuplicateHeaderTest.java @@ -19,13 +19,16 @@ package org.apache.commons.csv; +import static org.junit.jupiter.api.Assertions.assertArrayEquals; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertThrows; + import java.io.IOException; import java.util.Arrays; import java.util.List; import java.util.stream.Collectors; import java.util.stream.Stream; -import org.junit.jupiter.api.Assertions; import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.Arguments; import org.junit.jupiter.params.provider.MethodSource; @@ -285,11 +288,11 @@ void testCSVFormat(final DuplicateHeaderMode duplicateHeaderMode, .setHeader(headers); if (valid) { final CSVFormat format = builder.get(); - Assertions.assertEquals(duplicateHeaderMode, format.getDuplicateHeaderMode(), "DuplicateHeaderMode"); - Assertions.assertEquals(allowMissingColumnNames, format.getAllowMissingColumnNames(), "AllowMissingColumnNames"); - Assertions.assertArrayEquals(headers, format.getHeader(), "Header"); + assertEquals(duplicateHeaderMode, format.getDuplicateHeaderMode(), "DuplicateHeaderMode"); + assertEquals(allowMissingColumnNames, format.getAllowMissingColumnNames(), "AllowMissingColumnNames"); + assertArrayEquals(headers, format.getHeader(), "Header"); } else { - Assertions.assertThrows(IllegalArgumentException.class, builder::get); + assertThrows(IllegalArgumentException.class, builder::get); } } @@ -327,10 +330,10 @@ void testCSVParser(final DuplicateHeaderMode duplicateHeaderMode, try (CSVParser parser = CSVParser.parse(input, format)) { // Parser ignores null headers final List expected = Arrays.stream(headers).filter(s -> s != null).collect(Collectors.toList()); - Assertions.assertEquals(expected, parser.getHeaderNames(), "HeaderNames"); + assertEquals(expected, parser.getHeaderNames(), "HeaderNames"); } } else { - Assertions.assertThrows(IllegalArgumentException.class, () -> CSVParser.parse(input, format)); + assertThrows(IllegalArgumentException.class, () -> CSVParser.parse(input, format)); } } } diff --git a/src/test/java/org/apache/commons/csv/CSVFormatTest.java b/src/test/java/org/apache/commons/csv/CSVFormatTest.java index d1d19f755..ed20898de 100644 --- a/src/test/java/org/apache/commons/csv/CSVFormatTest.java +++ b/src/test/java/org/apache/commons/csv/CSVFormatTest.java @@ -26,6 +26,7 @@ import static org.junit.jupiter.api.Assertions.assertArrayEquals; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNotEquals; import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertNotSame; import static org.junit.jupiter.api.Assertions.assertNull; @@ -48,7 +49,6 @@ import java.util.Objects; import org.apache.commons.csv.CSVFormat.Builder; -import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; /** @@ -64,16 +64,16 @@ public enum Header { Name, Email, Phone } - private static void assertNotEquals(final Object right, final Object left) { - Assertions.assertNotEquals(right, left); - Assertions.assertNotEquals(left, right); + private static void assertNotEqualsFlip(final Object right, final Object left) { + assertNotEquals(right, left); + assertNotEquals(left, right); } private static CSVFormat copy(final CSVFormat format) { return format.builder().setDelimiter(format.getDelimiter()).get(); } - private void assertNotEquals(final String name, final String type, final Object left, final Object right) { + private void assertNotEqualsHash(final String name, final String type, final Object left, final Object right) { if (left.equals(right) || right.equals(left)) { fail("Objects must not compare equal for " + name + "(" + type + ")"); } @@ -198,8 +198,8 @@ void testDuplicateHeaderElementsTrueContainsEmpty3() { void testEquals() { final CSVFormat right = CSVFormat.DEFAULT; final CSVFormat left = copy(right); - Assertions.assertNotEquals(null, right); - Assertions.assertNotEquals("A String Instance", right); + assertNotEquals(null, right); + assertNotEquals("A String Instance", right); assertEquals(right, right); assertEquals(right, left); assertEquals(left, right); @@ -212,7 +212,7 @@ void testEqualsCommentStart() { final CSVFormat right = CSVFormat.newFormat('\'').builder().setQuote('"').setCommentMarker('#').setQuoteMode(QuoteMode.ALL).get(); final CSVFormat left = right.builder().setCommentMarker('!').get(); - assertNotEquals(right, left); + assertNotEqualsFlip(right, left); } @SuppressWarnings("deprecation") @@ -221,7 +221,7 @@ void testEqualsCommentStart_Deprecated() { final CSVFormat right = CSVFormat.newFormat('\'').withQuote('"').withCommentMarker('#').withQuoteMode(QuoteMode.ALL); final CSVFormat left = right.withCommentMarker('!'); - assertNotEquals(right, left); + assertNotEqualsFlip(right, left); } @Test @@ -229,7 +229,7 @@ void testEqualsDelimiter() { final CSVFormat right = CSVFormat.newFormat('!'); final CSVFormat left = CSVFormat.newFormat('?'); - assertNotEquals(right, left); + assertNotEqualsFlip(right, left); } @Test @@ -237,7 +237,7 @@ void testEqualsEscape() { final CSVFormat right = CSVFormat.newFormat('\'').builder().setQuote('"').setCommentMarker('#').setEscape('+').setQuoteMode(QuoteMode.ALL).get(); final CSVFormat left = right.builder().setEscape('!').get(); - assertNotEquals(right, left); + assertNotEqualsFlip(right, left); } @SuppressWarnings("deprecation") @@ -246,7 +246,7 @@ void testEqualsEscape_Deprecated() { final CSVFormat right = CSVFormat.newFormat('\'').withQuote('"').withCommentMarker('#').withEscape('+').withQuoteMode(QuoteMode.ALL); final CSVFormat left = right.withEscape('!'); - assertNotEquals(right, left); + assertNotEqualsFlip(right, left); } @Test @@ -262,49 +262,49 @@ void testEqualsHash() throws Exception { case "boolean": { final Object defTrue = method.invoke(CSVFormat.DEFAULT, Boolean.TRUE); final Object defFalse = method.invoke(CSVFormat.DEFAULT, Boolean.FALSE); - assertNotEquals(name, type, defTrue, defFalse); + assertNotEqualsHash(name, type, defTrue, defFalse); break; } case "char": { final Object a = method.invoke(CSVFormat.DEFAULT, 'a'); final Object b = method.invoke(CSVFormat.DEFAULT, 'b'); - assertNotEquals(name, type, a, b); + assertNotEqualsHash(name, type, a, b); break; } case "java.lang.Character": { final Object a = method.invoke(CSVFormat.DEFAULT, new Object[] { null }); final Object b = method.invoke(CSVFormat.DEFAULT, Character.valueOf('d')); - assertNotEquals(name, type, a, b); + assertNotEqualsHash(name, type, a, b); break; } case "java.lang.String": { final Object a = method.invoke(CSVFormat.DEFAULT, new Object[] { null }); final Object b = method.invoke(CSVFormat.DEFAULT, "e"); - assertNotEquals(name, type, a, b); + assertNotEqualsHash(name, type, a, b); break; } case "java.lang.String[]": { final Object a = method.invoke(CSVFormat.DEFAULT, new Object[] { new String[] { null, null } }); final Object b = method.invoke(CSVFormat.DEFAULT, new Object[] { new String[] { "f", "g" } }); - assertNotEquals(name, type, a, b); + assertNotEqualsHash(name, type, a, b); break; } case "org.apache.commons.csv.QuoteMode": { final Object a = method.invoke(CSVFormat.DEFAULT, QuoteMode.MINIMAL); final Object b = method.invoke(CSVFormat.DEFAULT, QuoteMode.ALL); - assertNotEquals(name, type, a, b); + assertNotEqualsHash(name, type, a, b); break; } case "org.apache.commons.csv.DuplicateHeaderMode": { final Object a = method.invoke(CSVFormat.DEFAULT, DuplicateHeaderMode.ALLOW_ALL); final Object b = method.invoke(CSVFormat.DEFAULT, DuplicateHeaderMode.DISALLOW); - assertNotEquals(name, type, a, b); + assertNotEqualsHash(name, type, a, b); break; } case "java.lang.Object[]": { final Object a = method.invoke(CSVFormat.DEFAULT, new Object[] { new Object[] { null, null } }); final Object b = method.invoke(CSVFormat.DEFAULT, new Object[] { new Object[] { new Object(), new Object() } }); - assertNotEquals(name, type, a, b); + assertNotEqualsHash(name, type, a, b); break; } default: @@ -327,7 +327,7 @@ void testEqualsHeader() { .setIgnoreEmptyLines(true).setIgnoreSurroundingSpaces(true).setQuote('"').setQuoteMode(QuoteMode.ALL).get(); final CSVFormat left = right.builder().setHeader("Three", "Two", "One").get(); - assertNotEquals(right, left); + assertNotEqualsFlip(right, left); } @SuppressWarnings("deprecation") @@ -337,7 +337,7 @@ void testEqualsHeader_Deprecated() { .withIgnoreEmptyLines().withIgnoreSurroundingSpaces().withQuote('"').withQuoteMode(QuoteMode.ALL); final CSVFormat left = right.withHeader("Three", "Two", "One"); - assertNotEquals(right, left); + assertNotEqualsFlip(right, left); } @Test @@ -346,7 +346,7 @@ void testEqualsIgnoreEmptyLines() { .setIgnoreSurroundingSpaces(true).setQuote('"').setQuoteMode(QuoteMode.ALL).get(); final CSVFormat left = right.builder().setIgnoreEmptyLines(false).get(); - assertNotEquals(right, left); + assertNotEqualsFlip(right, left); } @SuppressWarnings("deprecation") @@ -356,7 +356,7 @@ void testEqualsIgnoreEmptyLines_Deprecated() { .withQuote('"').withQuoteMode(QuoteMode.ALL); final CSVFormat left = right.withIgnoreEmptyLines(false); - assertNotEquals(right, left); + assertNotEqualsFlip(right, left); } @Test @@ -365,7 +365,7 @@ void testEqualsIgnoreSurroundingSpaces() { .setQuoteMode(QuoteMode.ALL).get(); final CSVFormat left = right.builder().setIgnoreSurroundingSpaces(false).get(); - assertNotEquals(right, left); + assertNotEqualsFlip(right, left); } @SuppressWarnings("deprecation") @@ -375,7 +375,7 @@ void testEqualsIgnoreSurroundingSpaces_Deprecated() { .withQuoteMode(QuoteMode.ALL); final CSVFormat left = right.withIgnoreSurroundingSpaces(false); - assertNotEquals(right, left); + assertNotEqualsFlip(right, left); } @Test @@ -383,7 +383,7 @@ void testEqualsLeftNoQuoteRightQuote() { final CSVFormat left = CSVFormat.newFormat(',').builder().setQuote(null).get(); final CSVFormat right = left.builder().setQuote('#').get(); - assertNotEquals(left, right); + assertNotEqualsFlip(left, right); } @SuppressWarnings("deprecation") @@ -392,7 +392,15 @@ void testEqualsLeftNoQuoteRightQuote_Deprecated() { final CSVFormat left = CSVFormat.newFormat(',').withQuote(null); final CSVFormat right = left.withQuote('#'); - assertNotEquals(left, right); + assertNotEqualsFlip(left, right); + } + + @Test + void testEqualsMaxRows() { + final CSVFormat right = CSVFormat.DEFAULT.builder().setMaxRows(10).get(); + final CSVFormat left = CSVFormat.DEFAULT.builder().setMaxRows(1000).get(); + assertNotEqualsFlip(right, left); + assertNotEquals(right.hashCode(), left.hashCode()); } @Test @@ -418,7 +426,7 @@ void testEqualsNullString() { .setIgnoreSurroundingSpaces(true).setQuote('"').setQuoteMode(QuoteMode.ALL).setNullString("null").get(); final CSVFormat left = right.builder().setNullString("---").get(); - assertNotEquals(right, left); + assertNotEqualsFlip(right, left); } @SuppressWarnings("deprecation") @@ -428,7 +436,7 @@ void testEqualsNullString_Deprecated() { .withIgnoreSurroundingSpaces().withQuote('"').withQuoteMode(QuoteMode.ALL).withNullString("null"); final CSVFormat left = right.withNullString("---"); - assertNotEquals(right, left); + assertNotEqualsFlip(right, left); } @Test @@ -495,7 +503,7 @@ void testEqualsOne() { assertFalse(csvFormatTwo.isCommentMarkerSet()); assertNotSame(csvFormatTwo, csvFormatOne); - Assertions.assertNotEquals(csvFormatTwo, csvFormatOne); + assertNotEquals(csvFormatTwo, csvFormatOne); assertEquals('\\', (char) csvFormatOne.getEscapeCharacter()); assertNull(csvFormatOne.getQuoteMode()); @@ -554,10 +562,10 @@ void testEqualsOne() { assertNotSame(csvFormatOne, csvFormatTwo); assertNotSame(csvFormatTwo, csvFormatOne); - Assertions.assertNotEquals(csvFormatOne, csvFormatTwo); - Assertions.assertNotEquals(csvFormatTwo, csvFormatOne); + assertNotEquals(csvFormatOne, csvFormatTwo); + assertNotEquals(csvFormatTwo, csvFormatOne); - Assertions.assertNotEquals(csvFormatTwo, csvFormatOne); + assertNotEquals(csvFormatTwo, csvFormatOne); } @@ -566,7 +574,7 @@ void testEqualsQuoteChar() { final CSVFormat right = CSVFormat.newFormat('\'').builder().setQuote('"').get(); final CSVFormat left = right.builder().setQuote('!').get(); - assertNotEquals(right, left); + assertNotEqualsFlip(right, left); } @SuppressWarnings("deprecation") @@ -575,7 +583,7 @@ void testEqualsQuoteChar_Deprecated() { final CSVFormat right = CSVFormat.newFormat('\'').withQuote('"'); final CSVFormat left = right.withQuote('!'); - assertNotEquals(right, left); + assertNotEqualsFlip(right, left); } @Test @@ -583,7 +591,7 @@ void testEqualsQuotePolicy() { final CSVFormat right = CSVFormat.newFormat('\'').builder().setQuote('"').setQuoteMode(QuoteMode.ALL).get(); final CSVFormat left = right.builder().setQuoteMode(QuoteMode.MINIMAL).get(); - assertNotEquals(right, left); + assertNotEqualsFlip(right, left); } @SuppressWarnings("deprecation") @@ -592,7 +600,7 @@ void testEqualsQuotePolicy_Deprecated() { final CSVFormat right = CSVFormat.newFormat('\'').withQuote('"').withQuoteMode(QuoteMode.ALL); final CSVFormat left = right.withQuoteMode(QuoteMode.MINIMAL); - assertNotEquals(right, left); + assertNotEqualsFlip(right, left); } @Test @@ -601,7 +609,7 @@ void testEqualsRecordSeparator() { .setIgnoreSurroundingSpaces(true).setQuote('"').setQuoteMode(QuoteMode.ALL).get(); final CSVFormat left = right.builder().setRecordSeparator(LF).get(); - assertNotEquals(right, left); + assertNotEqualsFlip(right, left); } @SuppressWarnings("deprecation") @@ -611,7 +619,7 @@ void testEqualsRecordSeparator_Deprecated() { .withIgnoreSurroundingSpaces().withQuote('"').withQuoteMode(QuoteMode.ALL); final CSVFormat left = right.withRecordSeparator(LF); - assertNotEquals(right, left); + assertNotEqualsFlip(right, left); } void testEqualsSkipHeaderRecord() { @@ -619,7 +627,7 @@ void testEqualsSkipHeaderRecord() { .setIgnoreSurroundingSpaces(true).setQuote('"').setQuoteMode(QuoteMode.ALL).setNullString("null").setSkipHeaderRecord(true).get(); final CSVFormat left = right.builder().setSkipHeaderRecord(false).get(); - assertNotEquals(right, left); + assertNotEqualsFlip(right, left); } @SuppressWarnings("deprecation") @@ -629,7 +637,7 @@ void testEqualsSkipHeaderRecord_Deprecated() { .withIgnoreSurroundingSpaces().withQuote('"').withQuoteMode(QuoteMode.ALL).withNullString("null").withSkipHeaderRecord(); final CSVFormat left = right.withSkipHeaderRecord(false); - assertNotEquals(right, left); + assertNotEqualsFlip(right, left); } @Test @@ -691,7 +699,7 @@ void testEqualsWithNull() { assertNull(csvFormat.getQuoteCharacter()); assertTrue(csvFormat.isNullStringSet()); - Assertions.assertNotEquals(null, csvFormat); + assertNotEquals(null, csvFormat); } @@ -801,7 +809,7 @@ void testHashCodeAndWithIgnoreHeaderCase() { assertTrue(csvFormatTwo.getIgnoreHeaderCase()); // now different assertFalse(csvFormatTwo.getTrailingDelimiter()); - Assertions.assertNotEquals(csvFormatTwo, csvFormat); // CSV-244 - should not be equal + assertNotEquals(csvFormatTwo, csvFormat); // CSV-244 - should not be equal assertFalse(csvFormatTwo.getAllowMissingColumnNames()); assertFalse(csvFormatTwo.getTrim()); @@ -958,6 +966,23 @@ void testPrintWithQuotes() throws IOException { assertEquals("\"\"\"a,b,c\r\nx,y,z\"", out.toString()); } + /** + * Tests CSV-326. + */ + @Test + void testPrintWithQuotesEscapeBeforeQuote() throws IOException { + final CSVFormat format = CSVFormat.DEFAULT.builder() + .setEscape('\\') + .setQuote('"') + .get(); + final String value = "\\\""; + final Appendable out = new StringBuilder(); + format.print(new StringReader(value), out, true); + try (CSVParser parser = CSVParser.parse(out.toString(), format)) { + assertEquals(value, parser.getRecords().get(0).get(0)); + } + } + @Test void testQuoteCharSameAsCommentStartThrowsException() { assertThrows(IllegalArgumentException.class, () -> CSVFormat.DEFAULT.builder().setQuote('!').setCommentMarker('!').get()); @@ -993,6 +1018,35 @@ void testQuoteCharSameAsDelimiterThrowsException_Deprecated() { assertThrows(IllegalArgumentException.class, () -> CSVFormat.DEFAULT.withQuote('!').withDelimiter('!')); } + @Test + void testQuotedNullStringTracksQuoteCharacter() throws IOException { + final StringBuilder out = new StringBuilder(); + // @formatter:off + final Builder builder = CSVFormat.DEFAULT.builder(); + final CSVFormat format = builder + .setQuoteMode(QuoteMode.ALL) + .setNullString("NULL") + .get(); + // @formatter:on + format.print(null, out, true); + assertEquals("\"NULL\"", out.toString()); + // set + out.setLength(0); + builder.setQuote('\''); + builder.get().print(null, out, true); + assertEquals("'NULL'", out.toString()); + // reset + out.setLength(0); + builder.setQuote((Character) null); + builder.get().print(null, out, true); + assertEquals("\"NULL\"", out.toString()); + // reset, reverse setter order + out.setLength(0); + builder.setNullString(null).setQuote((Character) null).setNullString("NULL"); + builder.get().print(null, out, true); + assertEquals("\"NULL\"", out.toString()); + } + @Test void testQuoteModeNoneShouldReturnMeaningfulExceptionMessage() { final Exception exception = assertThrows(IllegalArgumentException.class, () -> @@ -1156,7 +1210,7 @@ void testToStringAndWithCommentMarkerTakingCharacter() { assertNotSame(csvFormat, csvFormatTwo); assertNotSame(csvFormatTwo, csvFormat); - Assertions.assertNotEquals(csvFormatTwo, csvFormat); + assertNotEquals(csvFormatTwo, csvFormat); assertNull(csvFormat.getEscapeCharacter()); assertTrue(csvFormat.isQuoteCharacterSet()); @@ -1215,9 +1269,9 @@ void testToStringAndWithCommentMarkerTakingCharacter() { assertNotSame(csvFormat, csvFormatTwo); assertNotSame(csvFormatTwo, csvFormat); - Assertions.assertNotEquals(csvFormat, csvFormatTwo); + assertNotEquals(csvFormat, csvFormatTwo); - Assertions.assertNotEquals(csvFormatTwo, csvFormat); + assertNotEquals(csvFormatTwo, csvFormat); assertEquals("Delimiter=<,> QuoteChar=<\"> CommentStart= RecordSeparator=<\r\n> EmptyLines:ignored SkipHeaderRecord:false", csvFormatTwo.toString()); @@ -1403,7 +1457,7 @@ void testWithHeaderComments() { assertNotSame(csvFormat, csvFormatTwo); assertNotSame(csvFormatTwo, csvFormat); - Assertions.assertNotEquals(csvFormatTwo, csvFormat); // CSV-244 - should not be equal + assertNotEquals(csvFormatTwo, csvFormat); // CSV-244 - should not be equal final String string = csvFormatTwo.format(objectArray); @@ -1465,9 +1519,9 @@ void testWithHeaderComments() { assertNotSame(csvFormatTwo, csvFormat); assertNotNull(string); - Assertions.assertNotEquals(csvFormat, csvFormatTwo); // CSV-244 - should not be equal + assertNotEquals(csvFormat, csvFormatTwo); // CSV-244 - should not be equal - Assertions.assertNotEquals(csvFormatTwo, csvFormat); // CSV-244 - should not be equal + assertNotEquals(csvFormatTwo, csvFormat); // CSV-244 - should not be equal assertEquals(",,,,,,,", string); } diff --git a/src/test/java/org/apache/commons/csv/CSVParserTest.java b/src/test/java/org/apache/commons/csv/CSVParserTest.java index d9dd4e545..6d9bdd9e8 100644 --- a/src/test/java/org/apache/commons/csv/CSVParserTest.java +++ b/src/test/java/org/apache/commons/csv/CSVParserTest.java @@ -465,6 +465,31 @@ void testDuplicateHeadersNotAllowed() { () -> CSVParser.parse("a,b,a\n1,2,3\nx,y,z", CSVFormat.DEFAULT.withHeader().withAllowDuplicateHeaderNames(false))); } + /** + * With {@code ignoreSurroundingSpaces} enabled and a multi-character delimiter whose first character is whitespace, + * the empty field at the delimiter boundary must survive. The delimiter look-ahead is consumed while skipping + * leading whitespace, so re-evaluating it would drop the empty field and merge the following field's value. + */ + @Test + void testEmptyFieldBeforeWhitespacePrefixedMultiCharacterDelimiter() throws IOException { + final CSVFormat format = CSVFormat.DEFAULT.builder().setDelimiter(" |").setIgnoreSurroundingSpaces(true).get(); + try (CSVParser parser = CSVParser.parse(" |a", format)) { + final List records = parser.getRecords(); + assertEquals(1, records.size()); + assertValuesEquals(new String[] { "", "a" }, records.get(0)); + } + try (CSVParser parser = CSVParser.parse("a | |b", format)) { + final List records = parser.getRecords(); + assertEquals(1, records.size()); + assertValuesEquals(new String[] { "a", "", "b" }, records.get(0)); + } + try (CSVParser parser = CSVParser.parse("a | |b |", format)) { + final List records = parser.getRecords(); + assertEquals(1, records.size()); + assertValuesEquals(new String[] { "a", "", "b", "" }, records.get(0)); + } + } + @Test void testEmptyFile() throws Exception { try (CSVParser parser = CSVParser.parse(Paths.get("src/test/resources/org/apache/commons/csv/empty.txt"), StandardCharsets.UTF_8, @@ -648,6 +673,100 @@ void testForEach() throws Exception { } } + @Test + void testGetBytePositionMultiCharacterDelimiter() throws IOException { + final String code = "aa[|]bb\ncc[|]dd\n"; + final CSVFormat format = CSVFormat.DEFAULT.builder().setDelimiter("[|]").get(); + try (CSVParser parser = CSVParser.builder() + .setReader(new StringReader(code)) + .setFormat(format) + .setCharset(StandardCharsets.UTF_8) + .setTrackBytes(true) + .get()) { + final Iterator it = parser.iterator(); + final CSVRecord first = it.next(); + final CSVRecord second = it.next(); + assertEquals(0, first.getBytePosition()); + assertEquals(8, second.getBytePosition()); + } + } + + /** + * Tests CSV-329. + */ + @Test + void testGetBytePositionMultiCharacterDelimiterWithSupplementaryCharacter() throws IOException { + final String delimiter = "xπŸ˜€"; + final String code = "axπŸ˜€b\ncxπŸ˜€d\n"; + final CSVFormat format = CSVFormat.DEFAULT.builder().setDelimiter(delimiter).get(); + try (CSVParser parser = CSVParser.builder() + .setReader(new StringReader(code)) + .setFormat(format) + .setCharset(UTF_8) + .setTrackBytes(true) + .get()) { + final CSVRecord first = parser.nextRecord(); + final CSVRecord second = parser.nextRecord(); + assertNotNull(first); + assertNotNull(second); + assertValuesEquals(new String[] { "a", "b" }, first); + assertValuesEquals(new String[] { "c", "d" }, second); + assertEquals(0, first.getBytePosition()); + assertEquals("axπŸ˜€b\n".getBytes(UTF_8).length, second.getBytePosition()); + } + } + + @Test + void testGetBytePositionWithCharacterOffsetAndMultiBytePrefix() throws Exception { + final String row0 = "Γ©,x\n"; + final Charset charset = UTF_8; + // row0 char count is 4 + assertEquals(4, row0.length()); + // row0 byte count is 5 + final int record1ByteOffset = row0.getBytes(charset).length; + assertEquals(5, record1ByteOffset); + final String row1 = "b,c\n"; + final String rows = row0 + row1; + final long record1CharOffset = row0.length(); + final long expectedByteOffset = row0.getBytes(charset).length; + try (CSVParser parser = CSVParser.builder() + .setReader(new StringReader(row1)) + .setFormat(CSVFormat.DEFAULT) + .setCharset(charset) + .setTrackBytes(true) + .setByteOffset(record1ByteOffset) + .setCharacterOffset(record1CharOffset) + .setRecordNumber(2) // not relevant but a better use case example. + .get()) { + final CSVRecord record = parser.nextRecord(); + assertNotNull(record); + assertEquals(4, record.getCharacterPosition()); + assertEquals(record1CharOffset, record.getCharacterPosition()); + assertEquals(expectedByteOffset, record.getBytePosition()); + } + } + + @Test + void testGetBytePositionWithSingleByteCharset() throws IOException { + // A single-byte charset cannot encode U+FFFF, the char value of the EOF sentinel. + // Byte counting must skip the EOF read so a valid file parses without throwing. + final String code = "a,b\nc,d\n"; + try (CSVParser parser = CSVParser.builder() + .setReader(new StringReader(code)) + .setFormat(CSVFormat.DEFAULT) + .setCharset(StandardCharsets.ISO_8859_1) + .setTrackBytes(true) + .get()) { + final CSVRecord first = parser.nextRecord(); + final CSVRecord second = parser.nextRecord(); + assertNotNull(first); + assertNotNull(second); + assertNull(parser.nextRecord()); + assertEquals(0, first.getBytePosition()); + assertEquals(4, second.getBytePosition()); + } + } + @Test void testGetHeaderComment_HeaderComment1() throws IOException { try (CSVParser parser = CSVParser.parse(CSV_INPUT_HEADER_COMMENT, FORMAT_AUTO_HEADER)) { @@ -917,6 +1036,23 @@ void testGetRecordsMaxRows(final long maxRows) throws IOException { } } + /** + * Tests CSV-327. + */ + @Test + void testGetRecordsMaxRowsWithRecordNumberOffset() throws IOException { + try (CSVParser parser = CSVParser.builder() + .setReader(new StringReader("a,b\nc,d\n")) + .setFormat(CSVFormat.DEFAULT.builder().setMaxRows(1).get()) + .setRecordNumber(2) + .get()) { + final List records = parser.getRecords(); + assertEquals(1, records.size()); + assertEquals(2, records.get(0).getRecordNumber()); + assertValuesEquals(new String[] { "a", "b" }, records.get(0)); + } + } + @Test void testGetRecordThreeBytesRead() throws Exception { final String code = "id,date,val5,val4\n" + @@ -1603,6 +1739,50 @@ void testParsingPrintedEmptyFirstColumn(final CSVFormat.Predefined format) throw } } + /** + * A truncated escaped multi-character delimiter at EOF must stay literal data and not be completed from a stale + * escape delimiter look-ahead. + */ + @Test + void testPartialEscapedMultiCharacterDelimiterAtEOF() throws IOException { + final CSVFormat format = CSVFormat.DEFAULT.builder().setDelimiter("[|]").setEscape('!').get(); + try (CSVParser parser = format.parse(new StringReader("x![!|!]y![!|"))) { + final CSVRecord record = parser.nextRecord(); + assertEquals("x[|]y![!|", record.get(0)); + assertEquals(1, record.size()); + } + } + + /** + * Tests CSV-324. + */ + @Test + void testPartialMultiCharacterDelimiterAtEOF() throws IOException { + final CSVFormat format = CSVFormat.DEFAULT.builder().setDelimiter("[|]").get(); + try (CSVParser parser = format.parse(new StringReader("a[|]b[|"))) { + final CSVRecord record = parser.nextRecord(); + assertEquals("a", record.get(0)); + assertEquals("b[|", record.get(1)); + assertEquals(2, record.size()); + } + } + + /** + * A truncated multi-character delimiter at EOF must not be completed from the look-ahead buffer left dirty by an + * earlier non-matching peek in the same token. + */ + @Test + void testPartialMultiCharacterDelimiterAtEOFAfterMismatch() throws IOException { + final CSVFormat format = CSVFormat.DEFAULT.builder().setDelimiter("[|]").get(); + // The "[a]" peek leaves ']' in the look-ahead buffer; the trailing "[|" must not match "[|]". + final String recordString = "x[a][|"; + try (CSVParser parser = format.parse(new StringReader(recordString))) { + final CSVRecord record = parser.nextRecord(); + assertEquals(recordString, record.get(0)); + assertEquals(1, record.size()); + } + } + @Test void testProvidedHeader() throws Exception { final Reader in = new StringReader("a,b,c\n1,2,3\nx,y,z"); @@ -1794,6 +1974,23 @@ void testTrailingDelimiter() throws Exception { } } + @Test + void testTrailingDelimiterKeepsQuotedEmptyLastField() throws Exception { + final CSVFormat format = CSVFormat.DEFAULT.builder().setTrailingDelimiter(true).get(); + try (CSVParser parser = CSVParser.parse("a,b,\"\"", format)) { + final CSVRecord record = parser.iterator().next(); + assertEquals(3, record.size()); + assertEquals("a", record.get(0)); + assertEquals("b", record.get(1)); + assertEquals("", record.get(2)); + } + // An unquoted trailing delimiter still drops the empty field. + try (CSVParser parser = CSVParser.parse("a,b,", format)) { + final CSVRecord record = parser.iterator().next(); + assertEquals(2, record.size()); + } + } + @Test void testTrim() throws Exception { final Reader in = new StringReader("a,a,a\n\" 1 \",\" 2 \",\" 3 \"\nx,y,z"); diff --git a/src/test/java/org/apache/commons/csv/CSVPrinterTest.java b/src/test/java/org/apache/commons/csv/CSVPrinterTest.java index 1ff791010..9ae80c1e5 100644 --- a/src/test/java/org/apache/commons/csv/CSVPrinterTest.java +++ b/src/test/java/org/apache/commons/csv/CSVPrinterTest.java @@ -569,6 +569,57 @@ void testEscapeBackslash5() throws IOException { assertEquals("\\\\", sw.toString()); } + @Test + void testEscapeCommentMarkerFirstChar() throws IOException { + // No quoting available in escape mode, so a leading comment marker must be escaped or the + // record reads back as a comment and is dropped. Mirrors the quoting fix for QuoteMode.MINIMAL. + final CSVFormat format = CSVFormat.DEFAULT.builder().setQuote(null).setEscape('\\').setCommentMarker(';').get(); + final StringWriter sw = new StringWriter(); + final String col1 = ";comment-like"; + try (CSVPrinter printer = new CSVPrinter(sw, format)) { + printer.printRecord(col1, "b"); + printer.printRecord(new StringReader(col1), new StringReader("b")); + // The marker past the first character does not start a comment and is left alone. + printer.printRecord("a;b", ";c"); + } + final String string = sw.toString(); + assertEquals("\\;comment-like,b" + RECORD_SEPARATOR + + "\\;comment-like,b" + RECORD_SEPARATOR + + "a;b,\\;c" + RECORD_SEPARATOR, string); + // The emitted records must read back as the original values, none parsed as a comment. + try (CSVParser parser = CSVParser.parse(string, format)) { + final List records = parser.getRecords(); + assertEquals(3, records.size()); + assertEquals(col1, records.get(0).get(0)); + assertEquals("b", records.get(0).get(1)); + assertEquals(col1, records.get(1).get(0)); + assertEquals("b", records.get(1).get(1)); + assertEquals("a;b", records.get(2).get(0)); + assertEquals(";c", records.get(2).get(1)); + } + } + + @Test + void testEscapeCommentMarkerFirstCharWithQuoteModeNone() throws IOException { + final CSVFormat format = CSVFormat.DEFAULT.builder().setEscape('\\').setQuoteMode(QuoteMode.NONE).setCommentMarker(';').get(); + final StringWriter sw = new StringWriter(); + final String col1 = ";bar"; + try (CSVPrinter printer = new CSVPrinter(sw, format)) { + printer.printRecord(col1, "b"); + printer.printRecord(new StringReader(col1), new StringReader("b")); + } + final String string = sw.toString(); + assertEquals("\\;bar,b" + RECORD_SEPARATOR + "\\;bar,b" + RECORD_SEPARATOR, string); + try (CSVParser parser = CSVParser.parse(string, format)) { + final List records = parser.getRecords(); + assertEquals(2, records.size()); + for (final CSVRecord record : records) { + assertEquals(col1, record.get(0)); + assertEquals("b", record.get(1)); + } + } + } + @Test void testEscapeNull1() throws IOException { final StringWriter sw = new StringWriter(); @@ -1798,6 +1849,28 @@ void testQuoteAll() throws IOException { } } + @Test + void testQuoteCharEscapedWithQuoteModeNone() throws IOException { + final CSVFormat format = CSVFormat.DEFAULT.builder().setQuote('"').setEscape('?').setQuoteMode(QuoteMode.NONE).get(); + final StringWriter sw = new StringWriter(); + final String col1 = "\"abc"; + final String col2 = "x\"y"; + try (CSVPrinter printer = new CSVPrinter(sw, format)) { + printer.printRecord(col1, col2); + printer.printRecord(new StringReader(col1), new StringReader(col2)); + } + assertEquals("?\"abc,x?\"y" + RECORD_SEPARATOR + "?\"abc,x?\"y" + RECORD_SEPARATOR, sw.toString()); + // The emitted records must read back as the original values. + try (CSVParser parser = CSVParser.parse(sw.toString(), format)) { + final List records = parser.getRecords(); + assertEquals(2, records.size()); + for (final CSVRecord record : records) { + assertEquals(col1, record.get(0)); + assertEquals(col2, record.get(1)); + } + } + } + @Test void testQuoteCommaFirstChar() throws IOException { final StringWriter sw = new StringWriter(); @@ -1807,6 +1880,34 @@ void testQuoteCommaFirstChar() throws IOException { } } + @Test + void testQuoteCommentMarkerFirstChar() throws IOException { + final CSVFormat format = CSVFormat.DEFAULT.builder().setCommentMarker(';').get(); + final StringWriter sw = new StringWriter(); + final String col1 = ";comment-like"; + try (CSVPrinter printer = new CSVPrinter(sw, format)) { + // A real comment is written with the marker, unquoted. + printer.printComment("a real comment"); + // A value starting with the marker is quoted, so it does not read back as a comment. + printer.printRecord(col1, "b"); + // The marker past the first character does not start a comment, so only the leading-marker value is quoted. + printer.printRecord("a;b", ";c"); + } + final String string = sw.toString(); + assertEquals("; a real comment" + RECORD_SEPARATOR + + "\";comment-like\",b" + RECORD_SEPARATOR + + "a;b,\";c\"" + RECORD_SEPARATOR, string); + // The comment is dropped on read; both data records survive intact. + try (CSVParser parser = CSVParser.parse(string, format)) { + final List records = parser.getRecords(); + assertEquals(2, records.size()); + assertEquals(col1, records.get(0).get(0)); + assertEquals("b", records.get(0).get(1)); + assertEquals("a;b", records.get(1).get(0)); + assertEquals(";c", records.get(1).get(1)); + } + } + @Test void testQuoteNonNumeric() throws IOException { final StringWriter sw = new StringWriter(); diff --git a/src/test/java/org/apache/commons/csv/ExtendedBufferedReaderTest.java b/src/test/java/org/apache/commons/csv/ExtendedBufferedReaderTest.java index 056b8a9c9..b8d9b9f19 100644 --- a/src/test/java/org/apache/commons/csv/ExtendedBufferedReaderTest.java +++ b/src/test/java/org/apache/commons/csv/ExtendedBufferedReaderTest.java @@ -26,6 +26,7 @@ import static org.junit.jupiter.api.Assertions.assertNull; import java.io.StringReader; +import java.nio.charset.StandardCharsets; import org.junit.jupiter.api.Test; @@ -104,6 +105,19 @@ void testReadingInDifferentBuffer() throws Exception { } } + @Test + void testReadingSupplementaryCharacterTracksBytes() throws Exception { + final String input = "πŸ˜€"; + final char[] buffer = new char[input.length()]; + try (ExtendedBufferedReader reader = new ExtendedBufferedReader(new StringReader(input), StandardCharsets.UTF_8, true)) { + assertEquals(input.length(), reader.read(buffer, 0, buffer.length)); + assertArrayEquals(input.toCharArray(), buffer); + assertEquals(input.getBytes(StandardCharsets.UTF_8).length, reader.getBytesRead()); + assertEquals(input.length(), reader.getPosition()); + assertEquals(input.charAt(input.length() - 1), reader.getLastChar()); + } + } + @Test void testReadLine() throws Exception { try (ExtendedBufferedReader br = createBufferedReader("")) { diff --git a/src/test/java/org/apache/commons/csv/LexerTest.java b/src/test/java/org/apache/commons/csv/LexerTest.java index e54e93365..a76f6e513 100644 --- a/src/test/java/org/apache/commons/csv/LexerTest.java +++ b/src/test/java/org/apache/commons/csv/LexerTest.java @@ -216,6 +216,25 @@ void testDelimiterIsWhitespace() throws IOException { } } + /** + * With {@code ignoreSurroundingSpaces} enabled and a multi-character delimiter whose first character is whitespace, + * the side-effecting {@link Lexer#isDelimiter(int)} must only be evaluated once per character, otherwise the + * delimiter is consumed in the whitespace-skip loop and the empty field at the boundary is dropped. + */ + @Test + void testEmptyTokenBeforeWhitespacePrefixedMultiCharacterDelimiter() throws IOException { + final CSVFormat format = CSVFormat.DEFAULT.builder().setDelimiter(" |").setIgnoreSurroundingSpaces(true).get(); + try (Lexer lexer = createLexer(" |a", format)) { + assertNextToken(TOKEN, "", lexer); + assertNextToken(EOF, "a", lexer); + } + try (Lexer lexer = createLexer("a | |b", format)) { + assertNextToken(TOKEN, "a", lexer); + assertNextToken(TOKEN, "", lexer); + assertNextToken(EOF, "b", lexer); + } + } + @Test void testEOFWithoutClosingQuote() throws Exception { final String code = "a,\"b"; @@ -409,6 +428,44 @@ void testNextToken6() throws IOException { } } + /** + * A truncated escaped multi-character delimiter at EOF must not be accepted by reusing the previous escape delimiter + * look-ahead in {@link Lexer#isEscapeDelimiter()}. + */ + @Test + void testPartialEscapedMultiCharacterDelimiterAtEOF() throws IOException { + final CSVFormat format = CSVFormat.DEFAULT.builder().setDelimiter("[|]").setEscape('!').get(); + try (Lexer lexer = createLexer("x![!|!]y![!|", format)) { + assertNextToken(EOF, "x[|]y![!|", lexer); + } + } + + /** + * Tests CSV-324. + */ + @Test + void testPartialMultiCharacterDelimiterAtEOF() throws IOException { + final CSVFormat format = CSVFormat.DEFAULT.builder().setDelimiter("[|]").get(); + try (Lexer lexer = createLexer("a[|]b[|", format)) { + assertNextToken(TOKEN, "a", lexer); + assertNextToken(EOF, "b[|", lexer); + } + } + + /** + * A truncated multi-character delimiter at EOF must not be accepted by reusing the look-ahead buffer left dirty by an + * earlier non-matching peek in the same token (CSV-324 only cleared the buffer once per token). + */ + @Test + void testPartialMultiCharacterDelimiterAtEOFAfterMismatch() throws IOException { + final CSVFormat format = CSVFormat.DEFAULT.builder().setDelimiter("[|]").get(); + // The "[a]" peek leaves ']' in the look-ahead buffer; the trailing "[|" must not match "[|]". + final String recordString = "x[a][|"; + try (Lexer lexer = createLexer(recordString, format)) { + assertNextToken(EOF, recordString, lexer); + } + } + @Test void testReadEscapeBackspace() throws IOException { try (Lexer lexer = createLexer("b", CSVFormat.DEFAULT.withEscape('\b'))) { diff --git a/src/test/java/org/apache/commons/csv/issues/JiraCsv227Test.java b/src/test/java/org/apache/commons/csv/issues/JiraCsv227Test.java new file mode 100644 index 000000000..2b9e335a8 --- /dev/null +++ b/src/test/java/org/apache/commons/csv/issues/JiraCsv227Test.java @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.commons.csv.issues; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +import java.io.IOException; + +import org.apache.commons.csv.CSVFormat; +import org.apache.commons.csv.CSVPrinter; +import org.apache.commons.csv.QuoteMode; +import org.junit.jupiter.api.Test; + +/** + * Tests https://issues.apache.org/jira/browse/CSV-227 + */ +class JiraCsv227Test { + + @Test + public void test() throws IOException { + final StringBuilder out = new StringBuilder(); + try (CSVPrinter printer = new CSVPrinter(out, CSVFormat.DEFAULT.withQuoteMode(QuoteMode.MINIMAL))) { + printer.printRecord("γ…γ…Žγ„·γ„Ή", "γ…γ…Žγ„·γ„Ή", "", "test2"); + printer.printRecord("ν•œκΈ€3", "hello3", "3ν•œκΈ€3", "test3"); + printer.printRecord("", "hello4", "", "test4"); + } + // γ…γ…Žγ„·γ„Ή,γ…γ…Žγ„·γ„Ή,,test2 + // ν•œκΈ€3,hello3,3ν•œκΈ€3,test3 + // "",hello4,,test4 + assertEquals("γ…γ…Žγ„·γ„Ή,γ…γ…Žγ„·γ„Ή,,test2\r\nν•œκΈ€3,hello3,3ν•œκΈ€3,test3\r\n\"\",hello4,,test4\r\n", out.toString()); + } +}