Skip to content

Commit c137e80

Browse files
committed
A different take on PR #303
Add support for trailing text after the closing quote, and EOF without a final closing quote, for Excel compatibility. Fix a unit test and add a RAT exclude for the sample CSV file.
1 parent b069c2d commit c137e80

6 files changed

Lines changed: 170 additions & 63 deletions

File tree

pom.xml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
<version>67</version>
2424
</parent>
2525
<artifactId>commons-csv</artifactId>
26-
<version>1.10.1-SNAPSHOT</version>
26+
<version>1.11.0-SNAPSHOT</version>
2727
<name>Apache Commons CSV</name>
2828
<url>https://commons.apache.org/proper/commons-csv/</url>
2929
<inceptionYear>2005</inceptionYear>
@@ -161,7 +161,7 @@
161161
</distributionManagement>
162162

163163
<properties>
164-
<commons.release.version>1.10.1</commons.release.version>
164+
<commons.release.version>1.11.0</commons.release.version>
165165
<commons.release.desc>(Java 8 or above)</commons.release.desc>
166166
<!-- The RC version used in the staging repository URL. -->
167167
<commons.rc.version>RC1</commons.rc.version>

src/changes/changes.xml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,9 @@
4242
<body>
4343
<release version="1.10.1" date="YYYY-MM-DD" description="Feature and bug fix release (Java 8 or above)">
4444
<!-- ADD -->
45-
<action issue="CSV-308" type="fix" dev="ggregory" due-to="Buddhi De Silva, Gary Gregory">[Javadoc] Add example to CSVFormat#setHeaderComments() #344.</action>
45+
<action issue="CSV-308" type="add" dev="ggregory" due-to="Buddhi De Silva, Gary Gregory">[Javadoc] Add example to CSVFormat#setHeaderComments() #344.</action>
46+
<action type="add" dev="ggregory" due-to="DamjanJovanovic, Gary Gregory">Add and use CSVFormat#setTrailingData(boolean) in CSVFormat.EXCEL for Excel compatibility #303.</action>
47+
<action type="add" dev="ggregory" due-to="DamjanJovanovic, Gary Gregory">Add and use CSVFormat#setLenientEof(boolean) in CSVFormat.EXCEL for Excel compatibility #303.</action>
4648
<!-- FIX -->
4749
<action type="fix" issue="CSV-306" dev="ggregory" due-to="Sam Ng, Bruno P. Kinoshita">Replace deprecated method in user guide, update external link #324, #325.</action>
4850
<action type="fix" dev="ggregory" due-to="Seth Falco, Bruno P. Kinoshita">Document duplicate header behavior #309.</action>
@@ -53,6 +55,7 @@
5355
<action type="fix" issue="CSV-311" dev="ggregory" due-to="Christian Feuersaenger, Gary Gregory">OutOfMemory for very long rows despite using column value of type Reader.</action>
5456
<action type="fix" dev="ggregory" due-to="Gary Gregory">Use try-with-resources to manage JDBC Clob in CSVPrinter.printRecords(ResultSet).</action>
5557
<action type="fix" dev="ggregory" due-to="Gary Gregory">JDBC Blob columns are now output as Base64 instead of Object#toString(), which usually is InputStream#toString().</action>
58+
<action type="fix" dev="ggregory" due-to="DamjanJovanovic, Gary Gregory">Support unusual Excel use cases: Add support for trailing data after the closing quote, and EOF without a final closing quote #303.</action>
5659
<!-- UPDATE -->
5760
<action type="update" dev="ggregory" due-to="Gary Gregory">Bump commons-io:commons-io: from 2.11.0 to 2.15.1.</action>
5861
<action type="update" dev="ggregory" due-to="Gary Gregory, Dependabot">Bump commons-parent from 57 to 67.</action>

src/main/java/org/apache/commons/csv/CSVFormat.java

Lines changed: 116 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -248,6 +248,10 @@ public static Builder create(final CSVFormat csvFormat) {
248248

249249
private boolean skipHeaderRecord;
250250

251+
private boolean lenientEof;
252+
253+
private boolean trailingData;
254+
251255
private boolean trailingDelimiter;
252256

253257
private boolean trim;
@@ -267,6 +271,8 @@ private Builder(final CSVFormat csvFormat) {
267271
this.headers = csvFormat.headers;
268272
this.skipHeaderRecord = csvFormat.skipHeaderRecord;
269273
this.ignoreHeaderCase = csvFormat.ignoreHeaderCase;
274+
this.lenientEof = csvFormat.lenientEof;
275+
this.trailingData = csvFormat.trailingData;
270276
this.trailingDelimiter = csvFormat.trailingDelimiter;
271277
this.trim = csvFormat.trim;
272278
this.autoFlush = csvFormat.autoFlush;
@@ -689,6 +695,18 @@ public Builder setIgnoreSurroundingSpaces(final boolean ignoreSurroundingSpaces)
689695
return this;
690696
}
691697

698+
/**
699+
* Sets whether reading end-of-file is allowed even when input is malformed, helps Excel compatibility.
700+
*
701+
* @param lenientEof whether reading end-of-file is allowed even when input is malformed, helps Excel compatibility.
702+
* @return This instance.
703+
* @since 1.11.0
704+
*/
705+
public Builder setLenientEof(final boolean lenientEof) {
706+
this.lenientEof = lenientEof;
707+
return this;
708+
}
709+
692710
/**
693711
* Sets the String to convert to and from {@code null}. No substitution occurs if {@code null}.
694712
*
@@ -785,6 +803,18 @@ public Builder setSkipHeaderRecord(final boolean skipHeaderRecord) {
785803
return this;
786804
}
787805

806+
/**
807+
* Sets whether reading trailing data is allowed in records, helps Excel compatibility.
808+
*
809+
* @param trailingData whether reading trailing data is allowed in records, helps Excel compatibility.
810+
* @return This instance.
811+
* @since 1.11.0
812+
*/
813+
public Builder setTrailingData(final boolean trailingData) {
814+
this.trailingData = trailingData;
815+
return this;
816+
}
817+
788818
/**
789819
* Sets whether to add a trailing delimiter.
790820
*
@@ -914,7 +944,7 @@ public CSVFormat getFormat() {
914944
* @see Predefined#Default
915945
*/
916946
public static final CSVFormat DEFAULT = new CSVFormat(COMMA, DOUBLE_QUOTE_CHAR, null, null, null, false, true, CRLF, null, null, null, false, false, false,
917-
false, false, false, DuplicateHeaderMode.ALLOW_ALL);
947+
false, false, false, DuplicateHeaderMode.ALLOW_ALL, false, false);
918948

919949
/**
920950
* Excel file format (using a comma as the value delimiter). Note that the actual value delimiter used by Excel is locale-dependent, it might be necessary
@@ -935,9 +965,11 @@ public CSVFormat getFormat() {
935965
* <li>{@code setDelimiter(',')}</li>
936966
* <li>{@code setQuote('"')}</li>
937967
* <li>{@code setRecordSeparator("\r\n")}</li>
968+
* <li>{@code setDuplicateHeaderMode(DuplicateHeaderMode.ALLOW_ALL)}</li>
938969
* <li>{@code setIgnoreEmptyLines(false)}</li>
939970
* <li>{@code setAllowMissingColumnNames(true)}</li>
940-
* <li>{@code setDuplicateHeaderMode(DuplicateHeaderMode.ALLOW_ALL)}</li>
971+
* <li>{@code setTrailingData(true)}</li>
972+
* <li>{@code setLenientEof(true)}</li>
941973
* </ul>
942974
* <p>
943975
* Note: This is currently like {@link #RFC4180} plus {@link Builder#setAllowMissingColumnNames(boolean) Builder#setAllowMissingColumnNames(true)} and
@@ -950,6 +982,8 @@ public CSVFormat getFormat() {
950982
public static final CSVFormat EXCEL = DEFAULT.builder()
951983
.setIgnoreEmptyLines(false)
952984
.setAllowMissingColumnNames(true)
985+
.setTrailingData(true)
986+
.setLenientEof(true)
953987
.build();
954988
// @formatter:on
955989

@@ -1372,7 +1406,7 @@ private static boolean isTrimChar(final CharSequence charSequence, final int pos
13721406
*/
13731407
public static CSVFormat newFormat(final char delimiter) {
13741408
return new CSVFormat(String.valueOf(delimiter), null, null, null, null, false, false, null, null, null, null, false, false, false, false, false, false,
1375-
DuplicateHeaderMode.ALLOW_ALL);
1409+
DuplicateHeaderMode.ALLOW_ALL, false, false);
13761410
}
13771411

13781412
static String[] toStringArray(final Object[] values) {
@@ -1455,6 +1489,10 @@ public static CSVFormat valueOf(final String format) {
14551489

14561490
private final boolean skipHeaderRecord;
14571491

1492+
private final boolean lenientEof;
1493+
1494+
private final boolean trailingData;
1495+
14581496
private final boolean trailingDelimiter;
14591497

14601498
private final boolean trim;
@@ -1474,6 +1512,8 @@ private CSVFormat(final Builder builder) {
14741512
this.headers = builder.headers;
14751513
this.skipHeaderRecord = builder.skipHeaderRecord;
14761514
this.ignoreHeaderCase = builder.ignoreHeaderCase;
1515+
this.lenientEof = builder.lenientEof;
1516+
this.trailingData = builder.trailingData;
14771517
this.trailingDelimiter = builder.trailingDelimiter;
14781518
this.trim = builder.trim;
14791519
this.autoFlush = builder.autoFlush;
@@ -1494,22 +1534,24 @@ private CSVFormat(final Builder builder) {
14941534
* @param ignoreEmptyLines {@code true} when the parser should skip empty lines.
14951535
* @param recordSeparator the line separator to use for output.
14961536
* @param nullString the String to convert to and from {@code null}.
1497-
* @param headerComments the comments to be printed by the Printer before the actual CSV data.
1498-
* @param header the header
1499-
* @param skipHeaderRecord if {@code true} the header row will be skipped
1500-
* @param allowMissingColumnNames if {@code true} the missing column names are allowed when parsing the header line
1501-
* @param ignoreHeaderCase if {@code true} header names will be accessed ignoring case when parsing input
1502-
* @param trim if {@code true} next record value will be trimmed
1503-
* @param trailingDelimiter if {@code true} the trailing delimiter wil be added before record separator (if set)
1504-
* @param autoFlush if {@code true} the underlying stream will be flushed before closing
1505-
* @param duplicateHeaderMode the behavior when handling duplicate headers
1537+
* @param headerComments the comments to be printed by the Printer before the actual CSV data.
1538+
* @param header the header.
1539+
* @param skipHeaderRecord if {@code true} the header row will be skipped.
1540+
* @param allowMissingColumnNames if {@code true} the missing column names are allowed when parsing the header line.
1541+
* @param ignoreHeaderCase if {@code true} header names will be accessed ignoring case when parsing input.
1542+
* @param trim if {@code true} next record value will be trimmed.
1543+
* @param trailingDelimiter if {@code true} the trailing delimiter will be added before the record separator (if set).
1544+
* @param autoFlush if {@code true} the underlying stream will be flushed before closing.
1545+
* @param duplicateHeaderMode the behavior when handling duplicate headers.
1546+
* @param trailingData whether reading trailing data is allowed in records, helps Excel compatibility.
1547+
* @param lenientEof whether reading end-of-file is allowed even when input is malformed, helps Excel compatibility.
15061548
* @throws IllegalArgumentException if the delimiter is a line break character.
15071549
*/
15081550
private CSVFormat(final String delimiter, final Character quoteChar, final QuoteMode quoteMode, final Character commentStart, final Character escape,
15091551
final boolean ignoreSurroundingSpaces, final boolean ignoreEmptyLines, final String recordSeparator, final String nullString,
15101552
final Object[] headerComments, final String[] header, final boolean skipHeaderRecord, final boolean allowMissingColumnNames,
15111553
final boolean ignoreHeaderCase, final boolean trim, final boolean trailingDelimiter, final boolean autoFlush,
1512-
final DuplicateHeaderMode duplicateHeaderMode) {
1554+
final DuplicateHeaderMode duplicateHeaderMode, final boolean trailingData, final boolean lenientEof) {
15131555
this.delimiter = delimiter;
15141556
this.quoteCharacter = quoteChar;
15151557
this.quoteMode = quoteMode;
@@ -1524,6 +1566,8 @@ private CSVFormat(final String delimiter, final Character quoteChar, final Quote
15241566
this.headers = clone(header);
15251567
this.skipHeaderRecord = skipHeaderRecord;
15261568
this.ignoreHeaderCase = ignoreHeaderCase;
1569+
this.lenientEof = lenientEof;
1570+
this.trailingData = trailingData;
15271571
this.trailingDelimiter = trailingDelimiter;
15281572
this.trim = trim;
15291573
this.autoFlush = autoFlush;
@@ -1571,18 +1615,23 @@ public boolean equals(final Object obj) {
15711615
if (this == obj) {
15721616
return true;
15731617
}
1574-
if (obj == null || getClass() != obj.getClass()) {
1618+
if (obj == null) {
1619+
return false;
1620+
}
1621+
if (getClass() != obj.getClass()) {
15751622
return false;
15761623
}
15771624
final CSVFormat other = (CSVFormat) obj;
1578-
return duplicateHeaderMode == other.duplicateHeaderMode && allowMissingColumnNames == other.allowMissingColumnNames &&
1579-
autoFlush == other.autoFlush && Objects.equals(commentMarker, other.commentMarker) && Objects.equals(delimiter, other.delimiter) &&
1580-
Objects.equals(escapeCharacter, other.escapeCharacter) && Arrays.equals(headers, other.headers) &&
1581-
Arrays.equals(headerComments, other.headerComments) && ignoreEmptyLines == other.ignoreEmptyLines &&
1582-
ignoreHeaderCase == other.ignoreHeaderCase && ignoreSurroundingSpaces == other.ignoreSurroundingSpaces &&
1583-
Objects.equals(nullString, other.nullString) && Objects.equals(quoteCharacter, other.quoteCharacter) && quoteMode == other.quoteMode &&
1584-
Objects.equals(quotedNullString, other.quotedNullString) && Objects.equals(recordSeparator, other.recordSeparator) &&
1585-
skipHeaderRecord == other.skipHeaderRecord && trailingDelimiter == other.trailingDelimiter && trim == other.trim;
1625+
return allowMissingColumnNames == other.allowMissingColumnNames && autoFlush == other.autoFlush &&
1626+
Objects.equals(commentMarker, other.commentMarker) && Objects.equals(delimiter, other.delimiter) &&
1627+
duplicateHeaderMode == other.duplicateHeaderMode && Objects.equals(escapeCharacter, other.escapeCharacter) &&
1628+
Arrays.equals(headerComments, other.headerComments) && Arrays.equals(headers, other.headers) &&
1629+
ignoreEmptyLines == other.ignoreEmptyLines && ignoreHeaderCase == other.ignoreHeaderCase &&
1630+
ignoreSurroundingSpaces == other.ignoreSurroundingSpaces && lenientEof == other.lenientEof &&
1631+
Objects.equals(nullString, other.nullString) && Objects.equals(quoteCharacter, other.quoteCharacter) &&
1632+
quoteMode == other.quoteMode && Objects.equals(quotedNullString, other.quotedNullString) &&
1633+
Objects.equals(recordSeparator, other.recordSeparator) && skipHeaderRecord == other.skipHeaderRecord &&
1634+
trailingData == other.trailingData && trailingDelimiter == other.trailingDelimiter && trim == other.trim;
15861635
}
15871636

15881637
private void escape(final char c, final Appendable appendable) throws IOException {
@@ -1808,6 +1857,16 @@ public boolean getIgnoreSurroundingSpaces() {
18081857
return ignoreSurroundingSpaces;
18091858
}
18101859

1860+
/**
1861+
* Gets whether reading end-of-file is allowed even when input is malformed, helps Excel compatibility.
1862+
*
1863+
* @return whether reading end-of-file is allowed even when input is malformed, helps Excel compatibility.
1864+
* @since 1.11.0
1865+
*/
1866+
public boolean getLenientEof() {
1867+
return lenientEof;
1868+
}
1869+
18111870
/**
18121871
* Gets the String to convert to and from {@code null}.
18131872
* <ul>
@@ -1857,6 +1916,16 @@ public boolean getSkipHeaderRecord() {
18571916
return skipHeaderRecord;
18581917
}
18591918

1919+
/**
1920+
* Gets whether reading trailing data is allowed in records, helps Excel compatibility.
1921+
*
1922+
* @return whether reading trailing data is allowed in records, helps Excel compatibility.
1923+
* @since 1.11.0
1924+
*/
1925+
public boolean getTrailingData() {
1926+
return trailingData;
1927+
}
1928+
18601929
/**
18611930
* Gets whether to add a trailing delimiter.
18621931
*
@@ -1881,11 +1950,12 @@ public boolean getTrim() {
18811950
public int hashCode() {
18821951
final int prime = 31;
18831952
int result = 1;
1884-
result = prime * result + Arrays.hashCode(headers);
18851953
result = prime * result + Arrays.hashCode(headerComments);
1886-
return prime * result + Objects.hash(duplicateHeaderMode, allowMissingColumnNames, autoFlush, commentMarker, delimiter, escapeCharacter,
1887-
ignoreEmptyLines, ignoreHeaderCase, ignoreSurroundingSpaces, nullString, quoteCharacter, quoteMode, quotedNullString, recordSeparator,
1888-
skipHeaderRecord, trailingDelimiter, trim);
1954+
result = prime * result + Arrays.hashCode(headers);
1955+
result = prime * result + Objects.hash(allowMissingColumnNames, autoFlush, commentMarker, delimiter, duplicateHeaderMode, escapeCharacter,
1956+
ignoreEmptyLines, ignoreHeaderCase, ignoreSurroundingSpaces, lenientEof, nullString, quoteCharacter, quoteMode, quotedNullString,
1957+
recordSeparator, skipHeaderRecord, trailingData, trailingDelimiter, trim);
1958+
return result;
18891959
}
18901960

18911961
/**
@@ -2006,6 +2076,26 @@ public CSVPrinter print(final File out, final Charset charset) throws IOExceptio
20062076
return new CSVPrinter(new OutputStreamWriter(new FileOutputStream(out), charset), this);
20072077
}
20082078

2079+
private void print(final InputStream inputStream, final Appendable out, final boolean newRecord) throws IOException {
2080+
// InputStream is never null here
2081+
// There is nothing to escape when quoting is used which is the default.
2082+
if (!newRecord) {
2083+
append(getDelimiterString(), out);
2084+
}
2085+
final boolean quoteCharacterSet = isQuoteCharacterSet();
2086+
if (quoteCharacterSet) {
2087+
append(getQuoteCharacter().charValue(), out);
2088+
}
2089+
// Stream the input to the output without reading or holding the whole value in memory.
2090+
// AppendableOutputStream cannot "close" an Appendable.
2091+
try (OutputStream outputStream = new Base64OutputStream(new AppendableOutputStream<>(out))) {
2092+
IOUtils.copy(inputStream, outputStream);
2093+
}
2094+
if (quoteCharacterSet) {
2095+
append(getQuoteCharacter().charValue(), out);
2096+
}
2097+
}
2098+
20092099
/**
20102100
* Prints the {@code value} as the next value on the line to {@code out}. The value will be escaped or encapsulated as needed. Useful when one wants to
20112101
* avoid creating CSVPrinters. Trims the value if {@link #getTrim()} is true.
@@ -2081,26 +2171,6 @@ public CSVPrinter print(final Path out, final Charset charset) throws IOExceptio
20812171
return print(Files.newBufferedWriter(out, charset));
20822172
}
20832173

2084-
private void print(final InputStream inputStream, final Appendable out, final boolean newRecord) throws IOException {
2085-
// InputStream is never null here
2086-
// There is nothing to escape when quoting is used which is the default.
2087-
if (!newRecord) {
2088-
append(getDelimiterString(), out);
2089-
}
2090-
final boolean quoteCharacterSet = isQuoteCharacterSet();
2091-
if (quoteCharacterSet) {
2092-
append(getQuoteCharacter().charValue(), out);
2093-
}
2094-
// Stream the input to the output without reading or holding the whole value in memory.
2095-
// AppendableOutputStream cannot "close" an Appendable.
2096-
try (OutputStream outputStream = new Base64OutputStream(new AppendableOutputStream<>(out))) {
2097-
IOUtils.copy(inputStream, outputStream);
2098-
}
2099-
if (quoteCharacterSet) {
2100-
append(getQuoteCharacter().charValue(), out);
2101-
}
2102-
}
2103-
21042174
private void print(final Reader reader, final Appendable out, final boolean newRecord) throws IOException {
21052175
// Reader is never null here
21062176
if (!newRecord) {

0 commit comments

Comments
 (0)