diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml index b7b7fdf05..cca38e512 100644 --- a/.github/workflows/codeql-analysis.yml +++ b/.github/workflows/codeql-analysis.yml @@ -46,10 +46,10 @@ jobs: steps: - name: Checkout repository - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0 with: persist-credentials: false - - uses: actions/cache@27d5ce7f107fe9357f9df03efb73ab90386fccae #v5.0.5 + - uses: actions/cache@55cc8345863c7cc4c66a329aec7e433d2d1c52a9 #v6.1.0 with: path: ~/.m2/repository key: ${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }} @@ -58,7 +58,7 @@ jobs: # Initializes the CodeQL tools for scanning. - name: Initialize CodeQL - uses: github/codeql-action/init@95e58e9a2cdfd71adc6e0353d5c52f41a045d225 # v4.35.2 + uses: github/codeql-action/init@8aad20d150bbac5944a9f9d289da16a4b0d87c1e # v4.36.2 with: languages: ${{ matrix.language }} # If you wish to specify custom queries, you can do so here or in a config file. @@ -69,7 +69,7 @@ jobs: # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). # If this step fails, then you should remove it and run the build manually (see below) - name: Autobuild - uses: github/codeql-action/autobuild@95e58e9a2cdfd71adc6e0353d5c52f41a045d225 # v4.35.2 + uses: github/codeql-action/autobuild@8aad20d150bbac5944a9f9d289da16a4b0d87c1e # v4.36.2 # ℹ️ Command-line programs to run using the OS shell. # 📚 https://git.io/JvXDl @@ -83,4 +83,4 @@ jobs: # make release - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@95e58e9a2cdfd71adc6e0353d5c52f41a045d225 # v4.35.2 + uses: github/codeql-action/analyze@8aad20d150bbac5944a9f9d289da16a4b0d87c1e # v4.36.2 diff --git a/.github/workflows/dependency-review.yml b/.github/workflows/dependency-review.yml index f0d8ca94e..7bc02bdd2 100644 --- a/.github/workflows/dependency-review.yml +++ b/.github/workflows/dependency-review.yml @@ -26,6 +26,6 @@ jobs: runs-on: ubuntu-latest steps: - name: 'Checkout Repository' - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0 - name: 'Dependency Review PR' - uses: actions/dependency-review-action@2031cfc080254a8a887f58cffee85186f0e49e48 # v4.9.0 + uses: actions/dependency-review-action@a1d282b36b6f3519aa1f3fc636f609c47dddb294 # v5.0.0 diff --git a/.github/workflows/maven.yml b/.github/workflows/maven.yml index 10c18b607..17ba7dd38 100644 --- a/.github/workflows/maven.yml +++ b/.github/workflows/maven.yml @@ -34,26 +34,26 @@ jobs: fail-fast: false matrix: os: [ubuntu-latest, macos-latest] - java: [ 8, 11, 17, 21, 25 ] + java: [ 8, 11, 17, 21, 25, 26 ] experimental: [false] # Keep the same parameter order as the matrix above include: - os: ubuntu-latest - java: 26-ea + java: 27-ea experimental: true steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0 with: persist-credentials: false - - uses: actions/cache@27d5ce7f107fe9357f9df03efb73ab90386fccae #v5.0.5 + - uses: actions/cache@55cc8345863c7cc4c66a329aec7e433d2d1c52a9 #v6.1.0 with: path: ~/.m2/repository key: ${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }} restore-keys: | ${{ runner.os }}-maven- - name: Set up JDK ${{ matrix.java }} - uses: actions/setup-java@be666c2fcd27ec809703dec50e508c2fdc7f6654 # v5.2.0 + uses: actions/setup-java@1bcf9fb12cf4aa7d266a90ae39939e61372fe520 # v5.4.0 with: distribution: ${{ runner.os == 'macOS' && matrix.java == '8' && 'zulu' || 'temurin' }} java-version: ${{ matrix.java }} diff --git a/.github/workflows/scorecards-analysis.yml b/.github/workflows/scorecards-analysis.yml index 16e37f605..e1868cb46 100644 --- a/.github/workflows/scorecards-analysis.yml +++ b/.github/workflows/scorecards-analysis.yml @@ -40,7 +40,7 @@ jobs: steps: - name: "Checkout code" - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # 6.0.2 + uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # 7.0.0 with: persist-credentials: false @@ -64,6 +64,6 @@ jobs: retention-days: 5 - name: "Upload to code-scanning" - uses: github/codeql-action/upload-sarif@95e58e9a2cdfd71adc6e0353d5c52f41a045d225 # v4.35.2 + uses: github/codeql-action/upload-sarif@8aad20d150bbac5944a9f9d289da16a4b0d87c1e # v4.36.2 with: sarif_file: results.sarif diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index eb15f2518..3423e18ad 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -48,13 +48,13 @@ Getting Started --------------- + Make sure you have a [JIRA account](https://issues.apache.org/jira/). -+ Make sure you have a [GitHub account](https://github.com/signup/free). This is not essential, but makes providing patches much easier. ++ Make sure you have a [GitHub account](https://github.com/signup). This is not essential, but makes providing patches much easier. + If you're planning to implement a new feature it makes sense to discuss your changes on the [dev list](https://commons.apache.org/mail-lists.html) first. This way you can make sure you're not wasting your time on something that isn't considered to be in Apache Commons CSV's scope. + Submit a [Jira Ticket][jira] for your issue, assuming one does not already exist. + Clearly describe the issue including steps to reproduce when it is a bug. + Make sure you fill in the earliest version that you know has the issue. + Find the corresponding [repository on GitHub](https://github.com/apache/?query=commons-), -[fork](https://help.github.com/articles/fork-a-repo/) and check out your forked repository. If you don't have a GitHub account, you can still clone the Commons repository. +[fork](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/fork-a-repo) and check out your forked repository. If you don't have a GitHub account, you can still clone the Commons repository. Making Changes -------------- @@ -108,8 +108,8 @@ Additional Resources + [Contributing patches](https://commons.apache.org/patches.html) + [Apache Commons CSV JIRA project page][jira] + [Contributor License Agreement][cla] -+ [General GitHub documentation](https://help.github.com/) -+ [GitHub pull request documentation](https://help.github.com/articles/creating-a-pull-request/) ++ [General GitHub documentation](https://docs.github.com/) ++ [GitHub pull request documentation](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-a-pull-request) + [Apache Commons Twitter Account](https://twitter.com/ApacheCommons) [cla]:https://www.apache.org/licenses/#clas diff --git a/pom.xml b/pom.xml index 7e796568e..8cb13ed7c 100644 --- a/pom.xml +++ b/pom.xml @@ -20,10 +20,10 @@ org.apache.commons commons-parent - 99 + 102 commons-csv - 1.14.2-SNAPSHOT + 1.15.0-SNAPSHOT Apache Commons CSV https://commons.apache.org/proper/commons-csv/ 2005 @@ -59,6 +59,7 @@ com.h2database h2 + 2.2.224 test @@ -89,12 +90,12 @@ - 1.14.2 + 1.15.0 (Java 8 or above) RC1 1.14.1 - 1.14.3 + 1.15.1 csv org.apache.commons.csv CSV diff --git a/src/changes/changes.xml b/src/changes/changes.xml index aacd40586..93952e9f1 100644 --- a/src/changes/changes.xml +++ b/src/changes/changes.xml @@ -40,16 +40,34 @@ Apache Commons CSV Release Notes - + Remove Spotbugs dependency and use exclude-filter instead #564. Remove broken website link #577. Fix Apache RAT plugin console warnings. [Javadoc] Clarify behavior of deprecated CSVFormat#withFirstRecordAsHeader() #2413. + CSVFormat.equals()/hashCode() ignores maxRows (#600). + ExtendedBufferedReader byte tracking leads to an incorrect CSVRecord.getBytePosition() (#601). + CSVFormat.Builder.setQuote() does not refresh quotedNullString (#2447). + Lexer.isDelimiter() accepts a partial multi-character delimiter at EOF (#603). + CSVParser applies characterOffset to bytePosition (#604). + CSVPrinter Reader printing with quote and escape can emit CSV that its parser cannot read back. + CSVParser applies maxRows to record numbers instead of rows produced when setRecordNumber(...) is used. + CSVParser with trackBytes enabled throws on multi-character delimiters containing supplementary Unicode characters. + CSVFormat.Builder.setNullString(String) can build an invalid quoted null string after setQuote(null). + Escape Reader values with quote and escape (#606). + Clear escape delimiter buffer before peek in Lexer.isEscapeDelimiter() (#608, #611). + Escape quote char in printWithEscapes when QuoteMode is NONE (#609). + Quote value starting with comment marker in minimal quote mode (#610). + Escape leading comment marker in printWithEscapes (#614). + Skip byte counting at EOF in ExtendedBufferedReader.read (#615). + Keep quoted empty trailing field with trailingDelimiter (#616). + Evaluate isDelimiter once in nextToken whitespace skip (#618).. Add an "Android Compatibility" section to the web site. + Add CSVParser.Builder.setByteOffset(long) (#604). - Bump org.apache.commons:commons-parent from 85 to 99 #573, #595. + Bump org.apache.commons:commons-parent from 85 to 102 #573, #595. [test] Bump com.opencsv:opencsv from 5.11.2 to 5.12.0 #558. Bump org.apache.commons:commons-lang3 from 3.18.0 to 3.20.0. Bump commons-codec:commons-codec from 1.19.0 to 1.22.0. diff --git a/src/main/java/org/apache/commons/csv/CSVFormat.java b/src/main/java/org/apache/commons/csv/CSVFormat.java index 09e76e3a6..7145d23d3 100644 --- a/src/main/java/org/apache/commons/csv/CSVFormat.java +++ b/src/main/java/org/apache/commons/csv/CSVFormat.java @@ -780,8 +780,7 @@ public Builder setMaxRows(final long maxRows) { */ public Builder setNullString(final String nullString) { this.nullString = nullString; - this.quotedNullString = quoteCharacter + nullString + quoteCharacter; - return this; + return setQuotedNullString(); } /** @@ -806,6 +805,12 @@ public Builder setQuote(final Character quoteCharacter) { throw new IllegalArgumentException("The quoteCharacter cannot be a line break"); } this.quoteCharacter = quoteCharacter; + return setQuotedNullString(); + } + + private Builder setQuotedNullString() { + final Character quote = quoteCharacter != null ? quoteCharacter : Constants.DOUBLE_QUOTE_CHAR; + this.quotedNullString = quote + nullString + quote; return this; } @@ -878,6 +883,16 @@ public Builder setTrailingData(final boolean trailingData) { /** * Sets whether to add a trailing delimiter. * + *

+ * When writing, a delimiter is appended after the last value of each record. When reading, the empty field + * that such a trailing delimiter produces is dropped so the output round-trips back to the original record; + * a quoted empty trailing field ({@code ""}) is a real value rather than a trailing delimiter and is kept. + *

+ *

+ * This is unrelated to {@link #setTrailingData(boolean) trailing data}, which controls whether characters + * after the closing quote of an encapsulated value are tolerated when reading. + *

+ * * @param trailingDelimiter whether to add a trailing delimiter. * @return This instance. */ @@ -1477,7 +1492,7 @@ private static boolean isLineBreak(final char c) { * @return true if {@code c} is a line break character (and not null). */ private static boolean isLineBreak(final Character c) { - return c != null && isLineBreak(c.charValue()); // Explicit (un)boxing is intentional + return c != null && isLineBreak(c.charValue()); // Explicit unboxing is intentional } /** Same test as in as {@link String#trim()}. */ @@ -1690,15 +1705,15 @@ public boolean equals(final Object obj) { duplicateHeaderMode == other.duplicateHeaderMode && Objects.equals(escapeCharacter, other.escapeCharacter) && Arrays.equals(headerComments, other.headerComments) && Arrays.equals(headers, other.headers) && ignoreEmptyLines == other.ignoreEmptyLines && ignoreHeaderCase == other.ignoreHeaderCase && - ignoreSurroundingSpaces == other.ignoreSurroundingSpaces && lenientEof == other.lenientEof && - Objects.equals(nullString, other.nullString) && Objects.equals(quoteCharacter, other.quoteCharacter) && - quoteMode == other.quoteMode && Objects.equals(quotedNullString, other.quotedNullString) && - Objects.equals(recordSeparator, other.recordSeparator) && skipHeaderRecord == other.skipHeaderRecord && - trailingData == other.trailingData && trailingDelimiter == other.trailingDelimiter && trim == other.trim; + ignoreSurroundingSpaces == other.ignoreSurroundingSpaces && lenientEof == other.lenientEof && maxRows == other.maxRows && + Objects.equals(nullString, other.nullString) && Objects.equals(quoteCharacter, other.quoteCharacter) && quoteMode == other.quoteMode && + Objects.equals(quotedNullString, other.quotedNullString) && Objects.equals(recordSeparator, other.recordSeparator) && + skipHeaderRecord == other.skipHeaderRecord && trailingData == other.trailingData && trailingDelimiter == other.trailingDelimiter && + trim == other.trim; } private void escape(final char c, final Appendable appendable) throws IOException { - append(escapeCharacter.charValue(), appendable); // Explicit (un)boxing is intentional + append(escapeCharacter.charValue(), appendable); // Explicit unboxing is intentional append(c, appendable); } @@ -1836,7 +1851,7 @@ public DuplicateHeaderMode getDuplicateHeaderMode() { * @return the escape character, may be {@code 0} */ char getEscapeChar() { - return escapeCharacter != null ? escapeCharacter.charValue() : 0; // Explicit (un)boxing is intentional + return escapeCharacter != null ? escapeCharacter.charValue() : 0; // Explicit unboxing is intentional } /** @@ -2007,6 +2022,16 @@ public boolean getTrailingData() { /** * Gets whether to add a trailing delimiter. * + *

+ * When writing, a delimiter is appended after the last value of each record. When reading, the empty field + * that such a trailing delimiter produces is dropped so the output round-trips back to the original record; + * a quoted empty trailing field ({@code ""}) is a real value rather than a trailing delimiter and is kept. + *

+ *

+ * This is unrelated to {@link #getTrailingData() trailing data}, which controls whether characters after the + * closing quote of an encapsulated value are tolerated when reading. + *

+ * * @return whether to add a trailing delimiter. * @since 1.3 */ @@ -2029,9 +2054,10 @@ public int hashCode() { int result = 1; result = prime * result + Arrays.hashCode(headerComments); result = prime * result + Arrays.hashCode(headers); - return prime * result + Objects.hash(allowMissingColumnNames, autoFlush, commentMarker, delimiter, duplicateHeaderMode, escapeCharacter, - ignoreEmptyLines, ignoreHeaderCase, ignoreSurroundingSpaces, lenientEof, nullString, quoteCharacter, quoteMode, quotedNullString, + result = prime * result + Objects.hash(allowMissingColumnNames, autoFlush, commentMarker, delimiter, duplicateHeaderMode, escapeCharacter, + ignoreEmptyLines, ignoreHeaderCase, ignoreSurroundingSpaces, lenientEof, maxRows, nullString, quoteCharacter, quoteMode, quotedNullString, recordSeparator, skipHeaderRecord, trailingData, trailingDelimiter, trim); + return result; } /** @@ -2158,7 +2184,7 @@ private void print(final InputStream inputStream, final Appendable out, final bo } final boolean quoteCharacterSet = isQuoteCharacterSet(); if (quoteCharacterSet) { - append(getQuoteCharacter().charValue(), out); // Explicit (un)boxing is intentional + append(getQuoteCharacter().charValue(), out); // Explicit unboxing is intentional } // Stream the input to the output without reading or holding the whole value in memory. // AppendableOutputStream cannot "close" an Appendable. @@ -2166,7 +2192,7 @@ private void print(final InputStream inputStream, final Appendable out, final bo IOUtils.copy(inputStream, outputStream); } if (quoteCharacterSet) { - append(getQuoteCharacter().charValue(), out); // Explicit (un)boxing is intentional + append(getQuoteCharacter().charValue(), out); // Explicit unboxing is intentional } } @@ -2321,12 +2347,18 @@ private void printWithEscapes(final CharSequence charSeq, final Appendable appen final char[] delimArray = getDelimiterCharArray(); final int delimLength = delimArray.length; final char escape = getEscapeChar(); + final boolean quoteSet = isQuoteCharacterSet(); + final char quote = quoteSet ? getQuoteCharacter().charValue() : 0; + final boolean commentMarkerSet = isCommentMarkerSet(); + final char commentChar = commentMarkerSet ? commentMarker.charValue() : 0; // Explicit unboxing is intentional while (pos < end) { char c = charSeq.charAt(pos); final boolean isDelimiterStart = isDelimiter(c, charSeq, pos, delimArray, delimLength); final boolean isCr = c == Constants.CR; final boolean isLf = c == Constants.LF; - if (isCr || isLf || c == escape || isDelimiterStart) { + // A leading comment marker would be read back as a comment, so escape it. + final boolean isComment = commentMarkerSet && pos == 0 && c == commentChar; + if (isCr || isLf || c == escape || quoteSet && c == quote || isDelimiterStart || isComment) { // write out segment up until this char if (pos > start) { appendable.append(charSeq, start, pos); @@ -2365,8 +2397,13 @@ private void printWithEscapes(final Reader reader, final Appendable appendable) final char[] delimArray = getDelimiterCharArray(); final int delimLength = delimArray.length; final char escape = getEscapeChar(); + final boolean quoteSet = isQuoteCharacterSet(); + final char quote = quoteSet ? getQuoteCharacter().charValue() : 0; + final boolean commentMarkerSet = isCommentMarkerSet(); + final char commentChar = commentMarkerSet ? commentMarker.charValue() : 0; // Explicit unboxing is intentional final StringBuilder builder = new StringBuilder(IOUtils.DEFAULT_BUFFER_SIZE); int c; + boolean firstChar = true; final char[] lookAheadBuffer = new char[delimLength - 1]; while (EOF != (c = bufferedReader.read())) { builder.append((char) c); @@ -2376,7 +2413,10 @@ private void printWithEscapes(final Reader reader, final Appendable appendable) final boolean isDelimiterStart = isDelimiter((char) c, test, pos, delimArray, delimLength); final boolean isCr = c == Constants.CR; final boolean isLf = c == Constants.LF; - if (isCr || isLf || c == escape || isDelimiterStart) { + // A leading comment marker would be read back as a comment, so escape it. + final boolean isComment = commentMarkerSet && firstChar && c == commentChar; + firstChar = false; + if (isCr || isLf || c == escape || quoteSet && c == quote || isDelimiterStart || isComment) { // write out segment up until this char if (pos > start) { append(builder.substring(start, pos), appendable); @@ -2415,7 +2455,7 @@ private void printWithQuotes(final Object object, final CharSequence charSeq, fi final int len = charSeq.length(); final char[] delim = getDelimiterCharArray(); final int delimLength = delim.length; - final char quoteChar = getQuoteCharacter().charValue(); // Explicit (un)boxing is intentional + final char quoteChar = getQuoteCharacter().charValue(); // Explicit unboxing is intentional // If escape char not specified, default to the quote char // This avoids having to keep checking whether there is an escape character // at the cost of checking against quote twice @@ -2447,10 +2487,11 @@ private void printWithQuotes(final Object object, final CharSequence charSeq, fi } } else { char c = charSeq.charAt(pos); - if (c <= Constants.COMMENT) { + if (c <= Constants.COMMENT || isCommentMarkerSet() && c == commentMarker.charValue()) { // Some other chars at the start of a value caused the parser to fail, so for now // encapsulate if we start in anything less than '#'. We are being conservative - // by including the default comment char too. + // by including the default comment char and any configured comment marker too, + // which the parser would otherwise read back as a comment line. quote = true; } else { while (pos < len) { @@ -2518,15 +2559,16 @@ private void printWithQuotes(final Reader reader, final Appendable appendable) t printWithEscapes(reader, appendable); return; } - final char quote = getQuoteCharacter().charValue(); // Explicit (un)boxing is intentional + final char quote = getQuoteCharacter().charValue(); // Explicit unboxing is intentional + final char escape = isEscapeCharacterSet() ? getEscapeChar() : quote; // (1) Append opening quote append(quote, appendable); - // (2) Append Reader contents, doubling quotes + // (2) Append Reader contents, doubling quotes and escape characters int c; while (EOF != (c = reader.read())) { append((char) c, appendable); - if (c == quote) { - append(quote, appendable); + if (c == quote || c == escape) { + append((char) c, appendable); } } // (3) Append closing quote @@ -2604,13 +2646,13 @@ boolean useRow(final long rowNum) { * @throws IllegalArgumentException Throw when any attribute is invalid or inconsistent with other attributes. */ private void validate() throws IllegalArgumentException { - if (quoteCharacter != null && contains(delimiter, quoteCharacter.charValue())) { // Explicit (un)boxing is intentional + if (quoteCharacter != null && contains(delimiter, quoteCharacter.charValue())) { // Explicit unboxing is intentional throw new IllegalArgumentException("The quoteChar character and the delimiter cannot be the same ('" + quoteCharacter + "')"); } - if (escapeCharacter != null && contains(delimiter, escapeCharacter.charValue())) { // Explicit (un)boxing is intentional + if (escapeCharacter != null && contains(delimiter, escapeCharacter.charValue())) { // Explicit unboxing is intentional throw new IllegalArgumentException("The escape character and the delimiter cannot be the same ('" + escapeCharacter + "')"); } - if (commentMarker != null && contains(delimiter, commentMarker.charValue())) { // Explicit (un)boxing is intentional + if (commentMarker != null && contains(delimiter, commentMarker.charValue())) { // Explicit unboxing is intentional throw new IllegalArgumentException("The comment start character and the delimiter cannot be the same ('" + commentMarker + "')"); } if (quoteCharacter != null && quoteCharacter.equals(commentMarker)) { diff --git a/src/main/java/org/apache/commons/csv/CSVParser.java b/src/main/java/org/apache/commons/csv/CSVParser.java index bce62ea54..141eba732 100644 --- a/src/main/java/org/apache/commons/csv/CSVParser.java +++ b/src/main/java/org/apache/commons/csv/CSVParser.java @@ -154,6 +154,7 @@ public final class CSVParser implements Iterable, Closeable { public static class Builder extends AbstractStreamBuilder { private CSVFormat format; + private long byteOffset = -1; private long characterOffset; private long recordNumber = 1; private boolean trackBytes; @@ -165,17 +166,33 @@ protected Builder() { // empty } - @SuppressWarnings("resource") @Override public CSVParser get() throws IOException { - return new CSVParser(getReader(), format != null ? format : CSVFormat.DEFAULT, characterOffset, recordNumber, getCharset(), trackBytes); + return new CSVParser(this); } /** - * Sets the lexer offset when the parser does not start parsing at the beginning of the source. + * Sets the lexer byte offset when the parser does not start parsing at the beginning of the source. + *

+ * By default, the value is {@code -1}, which reuses the character offset for the byte offset. + *

* - * @param characterOffset the lexer offset. + * @param byteOffset the lexer byte offset. * @return {@code this} instance. + * @see #setCharacterOffset(long) + * @since 1.15.0 + */ + public Builder setByteOffset(final long byteOffset) { + this.byteOffset = byteOffset; + return asThis(); + } + + /** + * Sets the lexer character offset when the parser does not start parsing at the beginning of the source. + * + * @param characterOffset the lexer character offset. + * @return {@code this} instance. + * @see #setByteOffset(long) */ public Builder setCharacterOffset(final long characterOffset) { this.characterOffset = characterOffset; @@ -220,6 +237,7 @@ public Builder setTrackBytes(final boolean trackBytes) { final class CSVRecordIterator implements Iterator { private CSVRecord current; + private long recordCount; /** * Gets the next record or null at the end of stream or max rows read. @@ -230,8 +248,11 @@ final class CSVRecordIterator implements Iterator { */ private CSVRecord getNextRecord() { CSVRecord record = null; - if (format.useRow(recordNumber + 1)) { + if (format.useRow(recordCount + 1)) { record = Uncheck.get(CSVParser.this::nextRecord); + if (record != null) { + recordCount++; + } } return record; } @@ -466,6 +487,12 @@ public static CSVParser parse(final URL url, final Charset charset, final CSVFor */ private long recordNumber; + /** + * Lexer offset when the parser does not start parsing at the beginning of the source. Usually used in combination + * with {@link #recordNumber}. + */ + private final long byteOffset; + /** * Lexer offset when the parser does not start parsing at the beginning of the source. Usually used in combination * with {@link #recordNumber}. @@ -474,6 +501,23 @@ public static CSVParser parse(final URL url, final Charset charset, final CSVFor private final Token reusableToken = new Token(); + /** + * Constructs a new instance from a builder. + * + * @param builder The source builder. + * @throws IOException if an I/O error occurs. + */ + @SuppressWarnings("resource") // Lexer manages ExtendedBufferedReader. + private CSVParser(final Builder builder) throws IOException { + this.format = (builder.format != null ? builder.format : CSVFormat.DEFAULT).copy(); + this.lexer = new Lexer(format, new ExtendedBufferedReader(builder.getReader(), builder.getCharset(), builder.trackBytes)); + this.csvRecordIterator = new CSVRecordIterator(); + this.headers = createHeaders(); + this.byteOffset = builder.byteOffset != -1 ? builder.byteOffset : builder.characterOffset; + this.characterOffset = builder.characterOffset; + this.recordNumber = builder.recordNumber - 1; + } + /** * Constructs a new instance using the given {@link CSVFormat}. * @@ -524,51 +568,21 @@ public CSVParser(final Reader reader, final CSVFormat format) throws IOException */ @Deprecated public CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber) throws IOException { - this(reader, format, characterOffset, recordNumber, null, false); - } - - /** - * Constructs a new instance using the given {@link CSVFormat}. - * - *

- * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser, - * unless you close the {@code reader}. - *

- * - * @param reader - * a Reader containing CSV-formatted input. Must not be null. - * @param format - * the CSVFormat used for CSV parsing. Must not be null. - * @param characterOffset - * Lexer offset when the parser does not start parsing at the beginning of the source. - * @param recordNumber - * The next record number to assign. - * @param charset - * The character encoding to be used for the reader when enableByteTracking is true. - * @param trackBytes - * {@code true} to enable byte tracking for the parser; {@code false} to disable it. - * @throws IllegalArgumentException - * If the parameters of the format are inconsistent or if either the reader or format is null. - * @throws IOException - * If there is a problem reading the header or skipping the first record. - * @throws CSVException Thrown on invalid CSV input data. - */ - @SuppressWarnings("resource") // reader is managed by lexer. - private CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber, final Charset charset, - final boolean trackBytes) throws IOException { - Objects.requireNonNull(reader, "reader"); - Objects.requireNonNull(format, "format"); - this.format = format.copy(); - this.lexer = new Lexer(format, new ExtendedBufferedReader(reader, charset, trackBytes)); - this.csvRecordIterator = new CSVRecordIterator(); - this.headers = createHeaders(); - this.characterOffset = characterOffset; - this.recordNumber = recordNumber - 1; + // @formatter:off + this(builder() + .setReader(reader) + .setFormat(Objects.requireNonNull(format, "format")) // requireNonNull for full compatibility + .setCharacterOffset(characterOffset) + .setRecordNumber(recordNumber) + .setCharset((Charset) null).setTrackBytes(false)); + // @formatter:off } private void addRecordValue(final boolean lastRecord) { final String input = format.trim(reusableToken.content.toString()); - if (lastRecord && input.isEmpty() && format.getTrailingDelimiter()) { + // Only drop the empty field produced by an actual trailing delimiter. A quoted empty + // field ("") is a real value, not a trailing delimiter, so it must be kept. + if (lastRecord && input.isEmpty() && format.getTrailingDelimiter() && !reusableToken.isQuoted) { return; } recordList.add(handleNull(input)); @@ -642,7 +656,7 @@ private Headers createHeaders() throws IOException { } observedMissing |= blankHeader; if (header != null) { - headerMap.put(header, Integer.valueOf(i)); // Explicit (un)boxing is intentional + headerMap.put(header, Integer.valueOf(i)); // Explicit boxing is intentional if (headerNames == null) { headerNames = new ArrayList<>(headerRecord.length); } @@ -887,7 +901,7 @@ CSVRecord nextRecord() throws IOException { recordList.clear(); StringBuilder sb = null; final long startCharPosition = lexer.getCharacterPosition() + characterOffset; - final long startBytePosition = lexer.getBytesRead() + characterOffset; + final long startBytePosition = lexer.getBytesRead() + byteOffset; do { reusableToken.reset(); lexer.nextToken(reusableToken); diff --git a/src/main/java/org/apache/commons/csv/CSVPrinter.java b/src/main/java/org/apache/commons/csv/CSVPrinter.java index 087129ec5..a7048fd62 100644 --- a/src/main/java/org/apache/commons/csv/CSVPrinter.java +++ b/src/main/java/org/apache/commons/csv/CSVPrinter.java @@ -235,7 +235,7 @@ public void printComment(final String comment) throws IOException { if (!newRecord) { println(); } - appendable.append(format.getCommentMarker().charValue()); // Explicit (un)boxing is intentional + appendable.append(format.getCommentMarker().charValue()); // Explicit unboxing is intentional appendable.append(SP); for (int i = 0; i < comment.length(); i++) { final char c = comment.charAt(i); @@ -247,7 +247,7 @@ public void printComment(final String comment) throws IOException { // falls-through: break intentionally excluded. case LF: println(); - appendable.append(format.getCommentMarker().charValue()); // Explicit (un)boxing is intentional + appendable.append(format.getCommentMarker().charValue()); // Explicit unboxing is intentional appendable.append(SP); break; default: diff --git a/src/main/java/org/apache/commons/csv/CSVRecord.java b/src/main/java/org/apache/commons/csv/CSVRecord.java index f619717d0..8dab14d90 100644 --- a/src/main/java/org/apache/commons/csv/CSVRecord.java +++ b/src/main/java/org/apache/commons/csv/CSVRecord.java @@ -132,13 +132,11 @@ public String get(final String name) { throw new IllegalArgumentException(String.format("Mapping for %s not found, expected one of %s", name, headerMap.keySet())); } try { - return values[index.intValue()]; // Explicit (un)boxing is intentional + return values[index.intValue()]; // Explicit unboxing is intentional } catch (final ArrayIndexOutOfBoundsException e) { + // Explicit boxing is intentional throw new IllegalArgumentException( - String.format("Index for header '%s' is %d but CSVRecord only has %d values!", name, index, Integer.valueOf(values.length))); // Explicit - // (un)boxing - // is - // intentional + String.format("Index for header '%s' is %d but CSVRecord only has %d values!", name, index, Integer.valueOf(values.length))); } } @@ -267,7 +265,7 @@ public boolean isSet(final int index) { * @return whether a given column is mapped and has a value. */ public boolean isSet(final String name) { - return isMapped(name) && getHeaderMapRaw().get(name).intValue() < values.length; // Explicit (un)boxing is intentional + return isMapped(name) && getHeaderMapRaw().get(name).intValue() < values.length; // Explicit unboxing is intentional } /** @@ -283,7 +281,7 @@ public Iterator iterator() { /** * Puts all values of this record into the given Map. * - * @param the map type. + * @param The map type. * @param map The Map to populate. * @return the given map. * @since 1.9.0 diff --git a/src/main/java/org/apache/commons/csv/Constants.java b/src/main/java/org/apache/commons/csv/Constants.java index 0b9476e1c..9dd276ecc 100644 --- a/src/main/java/org/apache/commons/csv/Constants.java +++ b/src/main/java/org/apache/commons/csv/Constants.java @@ -40,7 +40,7 @@ final class Constants { /** RFC 4180 defines line breaks as CRLF. */ static final String CRLF = "\r\n"; - static final Character DOUBLE_QUOTE_CHAR = Character.valueOf('"'); // Explicit (un)boxing is intentional. + static final Character DOUBLE_QUOTE_CHAR = Character.valueOf('"'); // Explicit boxing is intentional. static final String EMPTY = ""; diff --git a/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java b/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java index 8dcda6517..20c1ef544 100644 --- a/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java +++ b/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java @@ -37,26 +37,30 @@ /** * A special buffered reader which supports sophisticated read access. *

- * In particular the reader supports a look-ahead option, which allows you to see the next char returned by - * {@link #read()}. This reader also tracks how many characters have been read with {@link #getPosition()}. + * In particular the reader supports a look-ahead option, which allows you to see the next char returned by {@link #read()}. This reader also tracks how many + * characters have been read with {@link #getPosition()}. *

*/ final class ExtendedBufferedReader extends UnsynchronizedBufferedReader { /** The last char returned */ private int lastChar = UNDEFINED; + private int lastCharMark = UNDEFINED; /** The count of EOLs (CR/LF/CRLF) seen so far */ private long lineNumber; + private long lineNumberMark; /** The position, which is the number of characters read so far */ private long position; + private long positionMark; /** The number of bytes read so far. */ private long bytesRead; + private long bytesReadMark; /** Encoder for calculating the number of bytes for each character read. */ @@ -70,12 +74,11 @@ final class ExtendedBufferedReader extends UnsynchronizedBufferedReader { } /** - * Constructs a new instance with the specified reader, character set, - * and byte tracking option. Initializes an encoder if byte tracking is enabled - * and a character set is provided. + * Constructs a new instance with the specified reader, character set, and byte tracking option. Initializes an encoder if byte tracking is enabled and a + * character set is provided. * - * @param reader the reader supports a look-ahead option. - * @param charset the character set for encoding, or {@code null} if not applicable. + * @param reader the reader supports a look-ahead option. + * @param charset the character set for encoding, or {@code null} if not applicable. * @param trackBytes {@code true} to enable byte tracking; {@code false} to disable it. */ ExtendedBufferedReader(final Reader reader, final Charset charset, final boolean trackBytes) { @@ -86,8 +89,7 @@ final class ExtendedBufferedReader extends UnsynchronizedBufferedReader { /** * Closes the stream. * - * @throws IOException - * If an I/O error occurs + * @throws IOException If an I/O error occurs */ @Override public void close() throws IOException { @@ -105,26 +107,35 @@ long getBytesRead() { return this.bytesRead; } + private long getEncodedCharLength(final char[] buf, final int offset, final int length) throws CharacterCodingException { + long len = 0; + int previous = lastChar; + for (int i = offset; i < offset + length; i++) { + len += getEncodedCharLength(previous, buf[i]); + previous = buf[i]; + } + return len; + } + /** - * Gets the byte length of the given character based on the original Unicode - * specification, which defined characters as fixed-width 16-bit entities. + * Gets the byte length of the given character based on the original Unicode specification, which defined characters as fixed-width 16-bit entities. *

* The Unicode characters are divided into two main ranges: *

    - *
  • U+0000 to U+FFFF (Basic Multilingual Plane, BMP): - *
      - *
    • Represented using a single 16-bit {@code char}.
    • - *
    • Includes UTF-8 encodings of 1-byte, 2-byte, and some 3-byte characters.
    • - *
    - *
  • - *
  • U+10000 to U+10FFFF (Supplementary Characters): - *
      - *
    • Represented as a pair of {@code char}s:
    • - *
    • The first {@code char} is from the high-surrogates range (\uD800-\uDBFF).
    • - *
    • The second {@code char} is from the low-surrogates range (\uDC00-\uDFFF).
    • - *
    • Includes UTF-8 encodings of some 3-byte characters and all 4-byte characters.
    • - *
    - *
  • + *
  • U+0000 to U+FFFF (Basic Multilingual Plane, BMP): + *
      + *
    • Represented using a single 16-bit {@code char}.
    • + *
    • Includes UTF-8 encodings of 1-byte, 2-byte, and some 3-byte characters.
    • + *
    + *
  • + *
  • U+10000 to U+10FFFF (Supplementary Characters): + *
      + *
    • Represented as a pair of {@code char}s:
    • + *
    • The first {@code char} is from the high-surrogates range (\uD800-\uDBFF).
    • + *
    • The second {@code char} is from the low-surrogates range (\uDC00-\uDFFF).
    • + *
    • Includes UTF-8 encodings of some 3-byte characters and all 4-byte characters.
    • + *
    + *
  • *
* * @param current the current character to process. @@ -132,8 +143,12 @@ long getBytesRead() { * @throws CharacterCodingException if the character cannot be encoded. */ private int getEncodedCharLength(final int current) throws CharacterCodingException { + return getEncodedCharLength(lastChar, current); + } + + private int getEncodedCharLength(final int previous, final int current) throws CharacterCodingException { final char cChar = (char) current; - final char lChar = (char) lastChar; + final char lChar = (char) previous; if (!Character.isSurrogate(cChar)) { return encoder.encode(CharBuffer.wrap(new char[] { cChar })).limit(); } @@ -148,10 +163,9 @@ private int getEncodedCharLength(final int current) throws CharacterCodingExcept } /** - * Returns the last character that was read as an integer (0 to 65535). This will be the last character returned by - * any of the read methods. This will not include a character read using the {@link #peek()} method. If no - * character has been read then this will return {@link Constants#UNDEFINED}. If the end of the stream was reached - * on the last read then this will return {@link IOUtils#EOF}. + * Returns the last character that was read as an integer (0 to 65535). This will be the last character returned by any of the read methods. This will not + * include a character read using the {@link #peek()} method. If no character has been read then this will return {@link Constants#UNDEFINED}. If the end of + * the stream was reached on the last read then this will return {@link IOUtils#EOF}. * * @return the last character that was read */ @@ -193,11 +207,10 @@ public void mark(final int readAheadLimit) throws IOException { @Override public int read() throws IOException { final int current = super.read(); - if (current == CR || current == LF && lastChar != CR || - current == EOF && lastChar != CR && lastChar != LF && lastChar != EOF) { + if (current == CR || current == LF && lastChar != CR || current == EOF && lastChar != CR && lastChar != LF && lastChar != EOF) { lineNumber++; } - if (encoder != null) { + if (encoder != null && current != EOF) { this.bytesRead += getEncodedCharLength(current); } lastChar = current; @@ -211,6 +224,9 @@ public int read(final char[] buf, final int offset, final int length) throws IOE return 0; } final int len = super.read(buf, offset, length); + if (encoder != null && len > 0) { + this.bytesRead += getEncodedCharLength(buf, offset, len); + } if (len > 0) { for (int i = offset; i < offset + len; i++) { final char ch = buf[i]; @@ -231,8 +247,7 @@ public int read(final char[] buf, final int offset, final int length) throws IOE } /** - * Gets the next line, dropping the line terminator(s). This method should only be called when processing a - * comment, otherwise, information can be lost. + * Gets the next line, dropping the line terminator(s). This method should only be called when processing a comment, otherwise, information can be lost. *

* Increments {@link #lineNumber} and updates {@link #position}. *

@@ -272,5 +287,4 @@ public void reset() throws IOException { bytesRead = bytesReadMark; super.reset(); } - } diff --git a/src/main/java/org/apache/commons/csv/Lexer.java b/src/main/java/org/apache/commons/csv/Lexer.java index 3d00fe0bf..fe964480a 100644 --- a/src/main/java/org/apache/commons/csv/Lexer.java +++ b/src/main/java/org/apache/commons/csv/Lexer.java @@ -23,6 +23,7 @@ import java.io.Closeable; import java.io.IOException; +import java.util.Arrays; import org.apache.commons.io.IOUtils; @@ -68,8 +69,8 @@ final class Lexer implements Closeable { /** * Appends the next escaped character to the token's content. * - * @param token the current token - * @throws IOException on stream access error + * @param token the current token. + * @throws IOException on stream access error. * @throws CSVException Thrown on invalid input. */ private void appendNextEscapedCharacterToToken(final Token token) throws IOException { @@ -89,7 +90,7 @@ private void appendNextEscapedCharacterToToken(final Token token) throws IOExcep * Closes resources. * * @throws IOException - * If an I/O error occurs + * If an I/O error occurs. */ @Override public void close() throws IOException { @@ -97,27 +98,27 @@ public void close() throws IOException { } /** - * Gets the number of bytes read + * Gets the number of bytes read. * - * @return the number of bytes read + * @return the number of bytes read. */ long getBytesRead() { return reader.getBytesRead(); } /** - * Returns the current character position + * Gets the current character position. * - * @return the current character position + * @return the current character position. */ long getCharacterPosition() { return reader.getPosition(); } /** - * Returns the current line number + * Gets the current line number. * - * @return the current line number + * @return the current line number. */ long getCurrentLineNumber() { return reader.getLineNumber(); @@ -136,7 +137,7 @@ boolean isCommentStart(final int ch) { } /** - * Determine whether the next characters constitute a delimiter through {@link ExtendedBufferedReader#peek(char[])}. + * Tests whether the next characters constitute a delimiter through {@link ExtendedBufferedReader#peek(char[])}. * * @param ch * the current character. @@ -152,6 +153,7 @@ boolean isDelimiter(final int ch) throws IOException { isLastTokenDelimiter = true; return true; } + Arrays.fill(delimiterBuf, '\0'); reader.peek(delimiterBuf); for (int i = 0; i < delimiterBuf.length; i++) { if (delimiterBuf[i] != delimiter[i + 1]) { @@ -190,6 +192,7 @@ boolean isEscape(final int ch) { * @throws IOException If an I/O error occurs. */ boolean isEscapeDelimiter() throws IOException { + Arrays.fill(escapeDelimiterBuf, '\0'); reader.peek(escapeDelimiterBuf); if (escapeDelimiterBuf[0] != delimiter[0]) { return false; @@ -214,7 +217,7 @@ boolean isQuoteChar(final int ch) { /** * Tests if the current character represents the start of a line: a CR, LF, or is at the start of the file. * - * @param ch the character to check + * @param ch the character to check. * @return true if the character is at the start of a line. */ boolean isStartOfLine(final int ch) { @@ -274,15 +277,22 @@ Token nextToken(final Token token) throws IOException { } // Important: make sure a new char gets consumed in each iteration while (token.type == Token.Type.INVALID) { + // isDelimiter consumes the trailing characters of a multi-character delimiter as a side effect, so it must + // only be evaluated once per character. Remember a match found while skipping whitespace below. + boolean delimiter = false; // ignore whitespaces at beginning of a token if (ignoreSurroundingSpaces) { - while (Character.isWhitespace((char) c) && !isDelimiter(c) && !eol) { + while (Character.isWhitespace((char) c) && !eol) { + if (isDelimiter(c)) { + delimiter = true; + break; + } c = reader.read(); eol = readEndOfLine(c); } } // ok, start of token reached: encapsulated, or token - if (isDelimiter(c)) { + if (delimiter || isDelimiter(c)) { // empty token return TOKEN("") token.type = Token.Type.TOKEN; } else if (eol) { @@ -400,10 +410,10 @@ private Token parseEncapsulatedToken(final Token token) throws IOException { *
  • An unescaped delimiter has been reached (TOKEN)
  • * * - * @param token the current token - * @param ch the current character - * @return the filled token - * @throws IOException on stream access error + * @param token the current token. + * @param ch the current character. + * @return the filled token. + * @throws IOException on stream access error. * @throws CSVException Thrown on invalid input. */ private Token parseSimpleToken(final Token token, final int ch) throws IOException { @@ -442,7 +452,7 @@ private Token parseSimpleToken(final Token token, final int ch) throws IOExcepti /** * Greedily accepts \n, \r and \r\n This checker consumes silently the second control-character... * - * @return true if the given or next character is a line-terminator + * @return true if the given or next character is a line-terminator. */ boolean readEndOfLine(final int ch) throws IOException { // check if we have \r\n... diff --git a/src/test/java/org/apache/commons/csv/CSVFormatTest.java b/src/test/java/org/apache/commons/csv/CSVFormatTest.java index 4d428b465..ed20898de 100644 --- a/src/test/java/org/apache/commons/csv/CSVFormatTest.java +++ b/src/test/java/org/apache/commons/csv/CSVFormatTest.java @@ -395,6 +395,14 @@ void testEqualsLeftNoQuoteRightQuote_Deprecated() { assertNotEqualsFlip(left, right); } + @Test + void testEqualsMaxRows() { + final CSVFormat right = CSVFormat.DEFAULT.builder().setMaxRows(10).get(); + final CSVFormat left = CSVFormat.DEFAULT.builder().setMaxRows(1000).get(); + assertNotEqualsFlip(right, left); + assertNotEquals(right.hashCode(), left.hashCode()); + } + @Test void testEqualsNoQuotes() { final CSVFormat left = CSVFormat.newFormat(',').builder().setQuote(null).get(); @@ -958,6 +966,23 @@ void testPrintWithQuotes() throws IOException { assertEquals("\"\"\"a,b,c\r\nx,y,z\"", out.toString()); } + /** + * Tests CSV-326. + */ + @Test + void testPrintWithQuotesEscapeBeforeQuote() throws IOException { + final CSVFormat format = CSVFormat.DEFAULT.builder() + .setEscape('\\') + .setQuote('"') + .get(); + final String value = "\\\""; + final Appendable out = new StringBuilder(); + format.print(new StringReader(value), out, true); + try (CSVParser parser = CSVParser.parse(out.toString(), format)) { + assertEquals(value, parser.getRecords().get(0).get(0)); + } + } + @Test void testQuoteCharSameAsCommentStartThrowsException() { assertThrows(IllegalArgumentException.class, () -> CSVFormat.DEFAULT.builder().setQuote('!').setCommentMarker('!').get()); @@ -993,6 +1018,35 @@ void testQuoteCharSameAsDelimiterThrowsException_Deprecated() { assertThrows(IllegalArgumentException.class, () -> CSVFormat.DEFAULT.withQuote('!').withDelimiter('!')); } + @Test + void testQuotedNullStringTracksQuoteCharacter() throws IOException { + final StringBuilder out = new StringBuilder(); + // @formatter:off + final Builder builder = CSVFormat.DEFAULT.builder(); + final CSVFormat format = builder + .setQuoteMode(QuoteMode.ALL) + .setNullString("NULL") + .get(); + // @formatter:on + format.print(null, out, true); + assertEquals("\"NULL\"", out.toString()); + // set + out.setLength(0); + builder.setQuote('\''); + builder.get().print(null, out, true); + assertEquals("'NULL'", out.toString()); + // reset + out.setLength(0); + builder.setQuote((Character) null); + builder.get().print(null, out, true); + assertEquals("\"NULL\"", out.toString()); + // reset, reverse setter order + out.setLength(0); + builder.setNullString(null).setQuote((Character) null).setNullString("NULL"); + builder.get().print(null, out, true); + assertEquals("\"NULL\"", out.toString()); + } + @Test void testQuoteModeNoneShouldReturnMeaningfulExceptionMessage() { final Exception exception = assertThrows(IllegalArgumentException.class, () -> diff --git a/src/test/java/org/apache/commons/csv/CSVParserTest.java b/src/test/java/org/apache/commons/csv/CSVParserTest.java index d9dd4e545..6d9bdd9e8 100644 --- a/src/test/java/org/apache/commons/csv/CSVParserTest.java +++ b/src/test/java/org/apache/commons/csv/CSVParserTest.java @@ -465,6 +465,31 @@ void testDuplicateHeadersNotAllowed() { () -> CSVParser.parse("a,b,a\n1,2,3\nx,y,z", CSVFormat.DEFAULT.withHeader().withAllowDuplicateHeaderNames(false))); } + /** + * With {@code ignoreSurroundingSpaces} enabled and a multi-character delimiter whose first character is whitespace, + * the empty field at the delimiter boundary must survive. The delimiter look-ahead is consumed while skipping + * leading whitespace, so re-evaluating it would drop the empty field and merge the following field's value. + */ + @Test + void testEmptyFieldBeforeWhitespacePrefixedMultiCharacterDelimiter() throws IOException { + final CSVFormat format = CSVFormat.DEFAULT.builder().setDelimiter(" |").setIgnoreSurroundingSpaces(true).get(); + try (CSVParser parser = CSVParser.parse(" |a", format)) { + final List records = parser.getRecords(); + assertEquals(1, records.size()); + assertValuesEquals(new String[] { "", "a" }, records.get(0)); + } + try (CSVParser parser = CSVParser.parse("a | |b", format)) { + final List records = parser.getRecords(); + assertEquals(1, records.size()); + assertValuesEquals(new String[] { "a", "", "b" }, records.get(0)); + } + try (CSVParser parser = CSVParser.parse("a | |b |", format)) { + final List records = parser.getRecords(); + assertEquals(1, records.size()); + assertValuesEquals(new String[] { "a", "", "b", "" }, records.get(0)); + } + } + @Test void testEmptyFile() throws Exception { try (CSVParser parser = CSVParser.parse(Paths.get("src/test/resources/org/apache/commons/csv/empty.txt"), StandardCharsets.UTF_8, @@ -648,6 +673,100 @@ void testForEach() throws Exception { } } + @Test + void testGetBytePositionMultiCharacterDelimiter() throws IOException { + final String code = "aa[|]bb\ncc[|]dd\n"; + final CSVFormat format = CSVFormat.DEFAULT.builder().setDelimiter("[|]").get(); + try (CSVParser parser = CSVParser.builder() + .setReader(new StringReader(code)) + .setFormat(format) + .setCharset(StandardCharsets.UTF_8) + .setTrackBytes(true) + .get()) { + final Iterator it = parser.iterator(); + final CSVRecord first = it.next(); + final CSVRecord second = it.next(); + assertEquals(0, first.getBytePosition()); + assertEquals(8, second.getBytePosition()); + } + } + + /** + * Tests CSV-329. + */ + @Test + void testGetBytePositionMultiCharacterDelimiterWithSupplementaryCharacter() throws IOException { + final String delimiter = "x😀"; + final String code = "ax😀b\ncx😀d\n"; + final CSVFormat format = CSVFormat.DEFAULT.builder().setDelimiter(delimiter).get(); + try (CSVParser parser = CSVParser.builder() + .setReader(new StringReader(code)) + .setFormat(format) + .setCharset(UTF_8) + .setTrackBytes(true) + .get()) { + final CSVRecord first = parser.nextRecord(); + final CSVRecord second = parser.nextRecord(); + assertNotNull(first); + assertNotNull(second); + assertValuesEquals(new String[] { "a", "b" }, first); + assertValuesEquals(new String[] { "c", "d" }, second); + assertEquals(0, first.getBytePosition()); + assertEquals("ax😀b\n".getBytes(UTF_8).length, second.getBytePosition()); + } + } + + @Test + void testGetBytePositionWithCharacterOffsetAndMultiBytePrefix() throws Exception { + final String row0 = "é,x\n"; + final Charset charset = UTF_8; + // row0 char count is 4 + assertEquals(4, row0.length()); + // row0 byte count is 5 + final int record1ByteOffset = row0.getBytes(charset).length; + assertEquals(5, record1ByteOffset); + final String row1 = "b,c\n"; + final String rows = row0 + row1; + final long record1CharOffset = row0.length(); + final long expectedByteOffset = row0.getBytes(charset).length; + try (CSVParser parser = CSVParser.builder() + .setReader(new StringReader(row1)) + .setFormat(CSVFormat.DEFAULT) + .setCharset(charset) + .setTrackBytes(true) + .setByteOffset(record1ByteOffset) + .setCharacterOffset(record1CharOffset) + .setRecordNumber(2) // not relevant but a better use case example. + .get()) { + final CSVRecord record = parser.nextRecord(); + assertNotNull(record); + assertEquals(4, record.getCharacterPosition()); + assertEquals(record1CharOffset, record.getCharacterPosition()); + assertEquals(expectedByteOffset, record.getBytePosition()); + } + } + + @Test + void testGetBytePositionWithSingleByteCharset() throws IOException { + // A single-byte charset cannot encode U+FFFF, the char value of the EOF sentinel. + // Byte counting must skip the EOF read so a valid file parses without throwing. + final String code = "a,b\nc,d\n"; + try (CSVParser parser = CSVParser.builder() + .setReader(new StringReader(code)) + .setFormat(CSVFormat.DEFAULT) + .setCharset(StandardCharsets.ISO_8859_1) + .setTrackBytes(true) + .get()) { + final CSVRecord first = parser.nextRecord(); + final CSVRecord second = parser.nextRecord(); + assertNotNull(first); + assertNotNull(second); + assertNull(parser.nextRecord()); + assertEquals(0, first.getBytePosition()); + assertEquals(4, second.getBytePosition()); + } + } + @Test void testGetHeaderComment_HeaderComment1() throws IOException { try (CSVParser parser = CSVParser.parse(CSV_INPUT_HEADER_COMMENT, FORMAT_AUTO_HEADER)) { @@ -917,6 +1036,23 @@ void testGetRecordsMaxRows(final long maxRows) throws IOException { } } + /** + * Tests CSV-327. + */ + @Test + void testGetRecordsMaxRowsWithRecordNumberOffset() throws IOException { + try (CSVParser parser = CSVParser.builder() + .setReader(new StringReader("a,b\nc,d\n")) + .setFormat(CSVFormat.DEFAULT.builder().setMaxRows(1).get()) + .setRecordNumber(2) + .get()) { + final List records = parser.getRecords(); + assertEquals(1, records.size()); + assertEquals(2, records.get(0).getRecordNumber()); + assertValuesEquals(new String[] { "a", "b" }, records.get(0)); + } + } + @Test void testGetRecordThreeBytesRead() throws Exception { final String code = "id,date,val5,val4\n" + @@ -1603,6 +1739,50 @@ void testParsingPrintedEmptyFirstColumn(final CSVFormat.Predefined format) throw } } + /** + * A truncated escaped multi-character delimiter at EOF must stay literal data and not be completed from a stale + * escape delimiter look-ahead. + */ + @Test + void testPartialEscapedMultiCharacterDelimiterAtEOF() throws IOException { + final CSVFormat format = CSVFormat.DEFAULT.builder().setDelimiter("[|]").setEscape('!').get(); + try (CSVParser parser = format.parse(new StringReader("x![!|!]y![!|"))) { + final CSVRecord record = parser.nextRecord(); + assertEquals("x[|]y![!|", record.get(0)); + assertEquals(1, record.size()); + } + } + + /** + * Tests CSV-324. + */ + @Test + void testPartialMultiCharacterDelimiterAtEOF() throws IOException { + final CSVFormat format = CSVFormat.DEFAULT.builder().setDelimiter("[|]").get(); + try (CSVParser parser = format.parse(new StringReader("a[|]b[|"))) { + final CSVRecord record = parser.nextRecord(); + assertEquals("a", record.get(0)); + assertEquals("b[|", record.get(1)); + assertEquals(2, record.size()); + } + } + + /** + * A truncated multi-character delimiter at EOF must not be completed from the look-ahead buffer left dirty by an + * earlier non-matching peek in the same token. + */ + @Test + void testPartialMultiCharacterDelimiterAtEOFAfterMismatch() throws IOException { + final CSVFormat format = CSVFormat.DEFAULT.builder().setDelimiter("[|]").get(); + // The "[a]" peek leaves ']' in the look-ahead buffer; the trailing "[|" must not match "[|]". + final String recordString = "x[a][|"; + try (CSVParser parser = format.parse(new StringReader(recordString))) { + final CSVRecord record = parser.nextRecord(); + assertEquals(recordString, record.get(0)); + assertEquals(1, record.size()); + } + } + @Test void testProvidedHeader() throws Exception { final Reader in = new StringReader("a,b,c\n1,2,3\nx,y,z"); @@ -1794,6 +1974,23 @@ void testTrailingDelimiter() throws Exception { } } + @Test + void testTrailingDelimiterKeepsQuotedEmptyLastField() throws Exception { + final CSVFormat format = CSVFormat.DEFAULT.builder().setTrailingDelimiter(true).get(); + try (CSVParser parser = CSVParser.parse("a,b,\"\"", format)) { + final CSVRecord record = parser.iterator().next(); + assertEquals(3, record.size()); + assertEquals("a", record.get(0)); + assertEquals("b", record.get(1)); + assertEquals("", record.get(2)); + } + // An unquoted trailing delimiter still drops the empty field. + try (CSVParser parser = CSVParser.parse("a,b,", format)) { + final CSVRecord record = parser.iterator().next(); + assertEquals(2, record.size()); + } + } + @Test void testTrim() throws Exception { final Reader in = new StringReader("a,a,a\n\" 1 \",\" 2 \",\" 3 \"\nx,y,z"); diff --git a/src/test/java/org/apache/commons/csv/CSVPrinterTest.java b/src/test/java/org/apache/commons/csv/CSVPrinterTest.java index 1ff791010..9ae80c1e5 100644 --- a/src/test/java/org/apache/commons/csv/CSVPrinterTest.java +++ b/src/test/java/org/apache/commons/csv/CSVPrinterTest.java @@ -569,6 +569,57 @@ void testEscapeBackslash5() throws IOException { assertEquals("\\\\", sw.toString()); } + @Test + void testEscapeCommentMarkerFirstChar() throws IOException { + // No quoting available in escape mode, so a leading comment marker must be escaped or the + // record reads back as a comment and is dropped. Mirrors the quoting fix for QuoteMode.MINIMAL. + final CSVFormat format = CSVFormat.DEFAULT.builder().setQuote(null).setEscape('\\').setCommentMarker(';').get(); + final StringWriter sw = new StringWriter(); + final String col1 = ";comment-like"; + try (CSVPrinter printer = new CSVPrinter(sw, format)) { + printer.printRecord(col1, "b"); + printer.printRecord(new StringReader(col1), new StringReader("b")); + // The marker past the first character does not start a comment and is left alone. + printer.printRecord("a;b", ";c"); + } + final String string = sw.toString(); + assertEquals("\\;comment-like,b" + RECORD_SEPARATOR + + "\\;comment-like,b" + RECORD_SEPARATOR + + "a;b,\\;c" + RECORD_SEPARATOR, string); + // The emitted records must read back as the original values, none parsed as a comment. + try (CSVParser parser = CSVParser.parse(string, format)) { + final List records = parser.getRecords(); + assertEquals(3, records.size()); + assertEquals(col1, records.get(0).get(0)); + assertEquals("b", records.get(0).get(1)); + assertEquals(col1, records.get(1).get(0)); + assertEquals("b", records.get(1).get(1)); + assertEquals("a;b", records.get(2).get(0)); + assertEquals(";c", records.get(2).get(1)); + } + } + + @Test + void testEscapeCommentMarkerFirstCharWithQuoteModeNone() throws IOException { + final CSVFormat format = CSVFormat.DEFAULT.builder().setEscape('\\').setQuoteMode(QuoteMode.NONE).setCommentMarker(';').get(); + final StringWriter sw = new StringWriter(); + final String col1 = ";bar"; + try (CSVPrinter printer = new CSVPrinter(sw, format)) { + printer.printRecord(col1, "b"); + printer.printRecord(new StringReader(col1), new StringReader("b")); + } + final String string = sw.toString(); + assertEquals("\\;bar,b" + RECORD_SEPARATOR + "\\;bar,b" + RECORD_SEPARATOR, string); + try (CSVParser parser = CSVParser.parse(string, format)) { + final List records = parser.getRecords(); + assertEquals(2, records.size()); + for (final CSVRecord record : records) { + assertEquals(col1, record.get(0)); + assertEquals("b", record.get(1)); + } + } + } + @Test void testEscapeNull1() throws IOException { final StringWriter sw = new StringWriter(); @@ -1798,6 +1849,28 @@ void testQuoteAll() throws IOException { } } + @Test + void testQuoteCharEscapedWithQuoteModeNone() throws IOException { + final CSVFormat format = CSVFormat.DEFAULT.builder().setQuote('"').setEscape('?').setQuoteMode(QuoteMode.NONE).get(); + final StringWriter sw = new StringWriter(); + final String col1 = "\"abc"; + final String col2 = "x\"y"; + try (CSVPrinter printer = new CSVPrinter(sw, format)) { + printer.printRecord(col1, col2); + printer.printRecord(new StringReader(col1), new StringReader(col2)); + } + assertEquals("?\"abc,x?\"y" + RECORD_SEPARATOR + "?\"abc,x?\"y" + RECORD_SEPARATOR, sw.toString()); + // The emitted records must read back as the original values. + try (CSVParser parser = CSVParser.parse(sw.toString(), format)) { + final List records = parser.getRecords(); + assertEquals(2, records.size()); + for (final CSVRecord record : records) { + assertEquals(col1, record.get(0)); + assertEquals(col2, record.get(1)); + } + } + } + @Test void testQuoteCommaFirstChar() throws IOException { final StringWriter sw = new StringWriter(); @@ -1807,6 +1880,34 @@ void testQuoteCommaFirstChar() throws IOException { } } + @Test + void testQuoteCommentMarkerFirstChar() throws IOException { + final CSVFormat format = CSVFormat.DEFAULT.builder().setCommentMarker(';').get(); + final StringWriter sw = new StringWriter(); + final String col1 = ";comment-like"; + try (CSVPrinter printer = new CSVPrinter(sw, format)) { + // A real comment is written with the marker, unquoted. + printer.printComment("a real comment"); + // A value starting with the marker is quoted, so it does not read back as a comment. + printer.printRecord(col1, "b"); + // The marker past the first character does not start a comment, so only the leading-marker value is quoted. + printer.printRecord("a;b", ";c"); + } + final String string = sw.toString(); + assertEquals("; a real comment" + RECORD_SEPARATOR + + "\";comment-like\",b" + RECORD_SEPARATOR + + "a;b,\";c\"" + RECORD_SEPARATOR, string); + // The comment is dropped on read; both data records survive intact. + try (CSVParser parser = CSVParser.parse(string, format)) { + final List records = parser.getRecords(); + assertEquals(2, records.size()); + assertEquals(col1, records.get(0).get(0)); + assertEquals("b", records.get(0).get(1)); + assertEquals("a;b", records.get(1).get(0)); + assertEquals(";c", records.get(1).get(1)); + } + } + @Test void testQuoteNonNumeric() throws IOException { final StringWriter sw = new StringWriter(); diff --git a/src/test/java/org/apache/commons/csv/ExtendedBufferedReaderTest.java b/src/test/java/org/apache/commons/csv/ExtendedBufferedReaderTest.java index 056b8a9c9..b8d9b9f19 100644 --- a/src/test/java/org/apache/commons/csv/ExtendedBufferedReaderTest.java +++ b/src/test/java/org/apache/commons/csv/ExtendedBufferedReaderTest.java @@ -26,6 +26,7 @@ import static org.junit.jupiter.api.Assertions.assertNull; import java.io.StringReader; +import java.nio.charset.StandardCharsets; import org.junit.jupiter.api.Test; @@ -104,6 +105,19 @@ void testReadingInDifferentBuffer() throws Exception { } } + @Test + void testReadingSupplementaryCharacterTracksBytes() throws Exception { + final String input = "😀"; + final char[] buffer = new char[input.length()]; + try (ExtendedBufferedReader reader = new ExtendedBufferedReader(new StringReader(input), StandardCharsets.UTF_8, true)) { + assertEquals(input.length(), reader.read(buffer, 0, buffer.length)); + assertArrayEquals(input.toCharArray(), buffer); + assertEquals(input.getBytes(StandardCharsets.UTF_8).length, reader.getBytesRead()); + assertEquals(input.length(), reader.getPosition()); + assertEquals(input.charAt(input.length() - 1), reader.getLastChar()); + } + } + @Test void testReadLine() throws Exception { try (ExtendedBufferedReader br = createBufferedReader("")) { diff --git a/src/test/java/org/apache/commons/csv/LexerTest.java b/src/test/java/org/apache/commons/csv/LexerTest.java index e54e93365..a76f6e513 100644 --- a/src/test/java/org/apache/commons/csv/LexerTest.java +++ b/src/test/java/org/apache/commons/csv/LexerTest.java @@ -216,6 +216,25 @@ void testDelimiterIsWhitespace() throws IOException { } } + /** + * With {@code ignoreSurroundingSpaces} enabled and a multi-character delimiter whose first character is whitespace, + * the side-effecting {@link Lexer#isDelimiter(int)} must only be evaluated once per character, otherwise the + * delimiter is consumed in the whitespace-skip loop and the empty field at the boundary is dropped. + */ + @Test + void testEmptyTokenBeforeWhitespacePrefixedMultiCharacterDelimiter() throws IOException { + final CSVFormat format = CSVFormat.DEFAULT.builder().setDelimiter(" |").setIgnoreSurroundingSpaces(true).get(); + try (Lexer lexer = createLexer(" |a", format)) { + assertNextToken(TOKEN, "", lexer); + assertNextToken(EOF, "a", lexer); + } + try (Lexer lexer = createLexer("a | |b", format)) { + assertNextToken(TOKEN, "a", lexer); + assertNextToken(TOKEN, "", lexer); + assertNextToken(EOF, "b", lexer); + } + } + @Test void testEOFWithoutClosingQuote() throws Exception { final String code = "a,\"b"; @@ -409,6 +428,44 @@ void testNextToken6() throws IOException { } } + /** + * A truncated escaped multi-character delimiter at EOF must not be accepted by reusing the previous escape delimiter + * look-ahead in {@link Lexer#isEscapeDelimiter()}. + */ + @Test + void testPartialEscapedMultiCharacterDelimiterAtEOF() throws IOException { + final CSVFormat format = CSVFormat.DEFAULT.builder().setDelimiter("[|]").setEscape('!').get(); + try (Lexer lexer = createLexer("x![!|!]y![!|", format)) { + assertNextToken(EOF, "x[|]y![!|", lexer); + } + } + + /** + * Tests CSV-324. + */ + @Test + void testPartialMultiCharacterDelimiterAtEOF() throws IOException { + final CSVFormat format = CSVFormat.DEFAULT.builder().setDelimiter("[|]").get(); + try (Lexer lexer = createLexer("a[|]b[|", format)) { + assertNextToken(TOKEN, "a", lexer); + assertNextToken(EOF, "b[|", lexer); + } + } + + /** + * A truncated multi-character delimiter at EOF must not be accepted by reusing the look-ahead buffer left dirty by an + * earlier non-matching peek in the same token (CSV-324 only cleared the buffer once per token). + */ + @Test + void testPartialMultiCharacterDelimiterAtEOFAfterMismatch() throws IOException { + final CSVFormat format = CSVFormat.DEFAULT.builder().setDelimiter("[|]").get(); + // The "[a]" peek leaves ']' in the look-ahead buffer; the trailing "[|" must not match "[|]". + final String recordString = "x[a][|"; + try (Lexer lexer = createLexer(recordString, format)) { + assertNextToken(EOF, recordString, lexer); + } + } + @Test void testReadEscapeBackspace() throws IOException { try (Lexer lexer = createLexer("b", CSVFormat.DEFAULT.withEscape('\b'))) {