Skip to content

Commit b1bdb99

Browse files
authored
Merge pull request #295 from DamjanJovanovic/master
Add support for trailing text after the closing quote, and EOF without a final closing quote, for Excel compatibility
2 parents 323ff08 + d0ea9e3 commit b1bdb99

3 files changed

Lines changed: 120 additions & 14 deletions

File tree

src/main/java/org/apache/commons/csv/CSVFormat.java

Lines changed: 73 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -206,8 +206,12 @@ public static Builder create(final CSVFormat csvFormat) {
206206
return new Builder(csvFormat);
207207
}
208208

209+
private boolean allowEofWithoutClosingQuote;
210+
209211
private boolean allowMissingColumnNames;
210212

213+
private boolean allowTrailingText;
214+
211215
private boolean autoFlush;
212216

213217
private Character commentMarker;
@@ -264,6 +268,8 @@ private Builder(final CSVFormat csvFormat) {
264268
this.autoFlush = csvFormat.autoFlush;
265269
this.quotedNullString = csvFormat.quotedNullString;
266270
this.duplicateHeaderMode = csvFormat.duplicateHeaderMode;
271+
this.allowTrailingText = csvFormat.allowTrailingText;
272+
this.allowEofWithoutClosingQuote = csvFormat.allowEofWithoutClosingQuote;
267273
}
268274

269275
/**
@@ -288,6 +294,19 @@ public Builder setAllowDuplicateHeaderNames(final boolean allowDuplicateHeaderNa
288294
return this;
289295
}
290296

297+
/**
298+
* Sets whether a quoted last field on the last line may reach the end of the file without its closing quote: {@code true} to accept this,
299+
* {@code false} if an {@link IOException} should be thrown instead.
300+
*
301+
* @param allowEofWithoutClosingQuote whether to allow the last field on the last line to have a missing closing quote when the file ends,
302+
* {@code true} if so, or {@code false} to cause an {@link IOException} to be thrown.
*
* @return This instance.
303+
* @since 1.10.0
304+
*/
305+
public Builder setAllowEofWithoutClosingQuote(final boolean allowEofWithoutClosingQuote) {
306+
this.allowEofWithoutClosingQuote = allowEofWithoutClosingQuote;
307+
return this;
308+
}
309+
291310
/**
292311
* Sets the parser missing column names behavior, {@code true} to allow missing column names in the header line, {@code false} to cause an
293312
* {@link IllegalArgumentException} to be thrown.
@@ -301,6 +320,20 @@ public Builder setAllowMissingColumnNames(final boolean allowMissingColumnNames)
301320
return this;
302321
}
303322

323+
/**
324+
* Sets whether to allow trailing text in a quoted field, after the closing quote and before the next delimiter, end of line, or end of file.
325+
*
326+
* @param allowTrailingText the trailing text behavior, {@code true} to append that text to the field contents, {@code false} to throw
327+
* an {@link IOException} when a non-whitespace character follows the closing quote.
328+
*
329+
* @return This instance.
330+
* @since 1.10.0
331+
*/
332+
public Builder setAllowTrailingText(final boolean allowTrailingText) {
333+
this.allowTrailingText = allowTrailingText;
334+
return this;
335+
}
336+
304337
/**
305338
* Sets whether to flush on close.
306339
*
@@ -810,7 +843,7 @@ public CSVFormat getFormat() {
810843
* @see Predefined#Default
811844
*/
812845
public static final CSVFormat DEFAULT = new CSVFormat(COMMA, DOUBLE_QUOTE_CHAR, null, null, null, false, true, CRLF, null, null, null, false, false, false,
813-
false, false, false, DuplicateHeaderMode.ALLOW_ALL);
846+
false, false, false, DuplicateHeaderMode.ALLOW_ALL, false, false);
814847

815848
/**
816849
* Excel file format (using a comma as the value delimiter). Note that the actual value delimiter used by Excel is locale dependent, it might be necessary
@@ -834,6 +867,8 @@ public CSVFormat getFormat() {
834867
* <li>{@code setIgnoreEmptyLines(false)}</li>
835868
* <li>{@code setAllowMissingColumnNames(true)}</li>
836869
* <li>{@code setDuplicateHeaderMode(DuplicateHeaderMode.ALLOW_ALL)}</li>
870+
* <li>{@code setAllowTrailingText(true)}</li>
871+
* <li>{@code setAllowEofWithoutClosingQuote(true)}</li>
837872
* </ul>
838873
* <p>
839874
* Note: This is currently like {@link #RFC4180} plus {@link Builder#setAllowMissingColumnNames(boolean) Builder#setAllowMissingColumnNames(true)} and
@@ -846,6 +881,8 @@ public CSVFormat getFormat() {
846881
public static final CSVFormat EXCEL = DEFAULT.builder()
847882
.setIgnoreEmptyLines(false)
848883
.setAllowMissingColumnNames(true)
884+
.setAllowTrailingText(true)
885+
.setAllowEofWithoutClosingQuote(true)
849886
.build();
850887
// @formatter:on
851888

@@ -1268,7 +1305,7 @@ private static boolean isTrimChar(final CharSequence charSequence, final int pos
12681305
*/
12691306
public static CSVFormat newFormat(final char delimiter) {
12701307
return new CSVFormat(String.valueOf(delimiter), null, null, null, null, false, false, null, null, null, null, false, false, false, false, false, false,
1271-
DuplicateHeaderMode.ALLOW_ALL);
1308+
DuplicateHeaderMode.ALLOW_ALL, false, false);
12721309
}
12731310

12741311
static String[] toStringArray(final Object[] values) {
@@ -1310,8 +1347,12 @@ public static CSVFormat valueOf(final String format) {
13101347

13111348
private final DuplicateHeaderMode duplicateHeaderMode;
13121349

1350+
private final boolean allowEofWithoutClosingQuote;
1351+
13131352
private final boolean allowMissingColumnNames;
13141353

1354+
private final boolean allowTrailingText;
1355+
13151356
private final boolean autoFlush;
13161357

13171358
private final Character commentMarker; // null if commenting is disabled
@@ -1366,6 +1407,8 @@ private CSVFormat(final Builder builder) {
13661407
this.autoFlush = builder.autoFlush;
13671408
this.quotedNullString = builder.quotedNullString;
13681409
this.duplicateHeaderMode = builder.duplicateHeaderMode;
1410+
this.allowTrailingText = builder.allowTrailingText;
1411+
this.allowEofWithoutClosingQuote = builder.allowEofWithoutClosingQuote;
13691412
validate();
13701413
}
13711414

@@ -1396,7 +1439,7 @@ private CSVFormat(final String delimiter, final Character quoteChar, final Quote
13961439
final boolean ignoreSurroundingSpaces, final boolean ignoreEmptyLines, final String recordSeparator, final String nullString,
13971440
final Object[] headerComments, final String[] header, final boolean skipHeaderRecord, final boolean allowMissingColumnNames,
13981441
final boolean ignoreHeaderCase, final boolean trim, final boolean trailingDelimiter, final boolean autoFlush,
1399-
final DuplicateHeaderMode duplicateHeaderMode) {
1442+
final DuplicateHeaderMode duplicateHeaderMode, final boolean allowTrailingText, final boolean allowEofWithoutClosingQuote) {
14001443
this.delimiter = delimiter;
14011444
this.quoteCharacter = quoteChar;
14021445
this.quoteMode = quoteMode;
@@ -1416,6 +1459,8 @@ private CSVFormat(final String delimiter, final Character quoteChar, final Quote
14161459
this.autoFlush = autoFlush;
14171460
this.quotedNullString = quoteCharacter + nullString + quoteCharacter;
14181461
this.duplicateHeaderMode = duplicateHeaderMode;
1462+
this.allowTrailingText = allowTrailingText;
1463+
this.allowEofWithoutClosingQuote = allowEofWithoutClosingQuote;
14191464
validate();
14201465
}
14211466

@@ -1469,7 +1514,8 @@ public boolean equals(final Object obj) {
14691514
ignoreHeaderCase == other.ignoreHeaderCase && ignoreSurroundingSpaces == other.ignoreSurroundingSpaces &&
14701515
Objects.equals(nullString, other.nullString) && Objects.equals(quoteCharacter, other.quoteCharacter) && quoteMode == other.quoteMode &&
14711516
Objects.equals(quotedNullString, other.quotedNullString) && Objects.equals(recordSeparator, other.recordSeparator) &&
1472-
skipHeaderRecord == other.skipHeaderRecord && trailingDelimiter == other.trailingDelimiter && trim == other.trim;
1517+
skipHeaderRecord == other.skipHeaderRecord && trailingDelimiter == other.trailingDelimiter && trim == other.trim &&
1518+
allowTrailingText == other.allowTrailingText && allowEofWithoutClosingQuote == other.allowEofWithoutClosingQuote;
14731519
}
14741520

14751521
/**
@@ -1503,6 +1549,16 @@ public boolean getAllowDuplicateHeaderNames() {
15031549
return duplicateHeaderMode == DuplicateHeaderMode.ALLOW_ALL;
15041550
}
15051551

1552+
/**
1553+
* Gets whether the file can end before the last field on the last line, if quoted, has a closing quote.
1554+
*
1555+
* @return {@code true} if a missing closing quote at the end of the file is accepted, {@code false} if it causes an {@link IOException}.
1556+
* @since 1.10.0
1557+
*/
1558+
public boolean getAllowEofWithoutClosingQuote() {
1559+
return allowEofWithoutClosingQuote;
1560+
}
1561+
15061562
/**
15071563
* Gets whether missing column names are allowed when parsing the header line.
15081564
*
@@ -1512,6 +1568,16 @@ public boolean getAllowMissingColumnNames() {
15121568
return allowMissingColumnNames;
15131569
}
15141570

1571+
/**
1572+
* Gets whether quoted fields allow trailing text after the closing quote.
1573+
*
1574+
* @return {@code true} if trailing text is allowed, {@code false} if it causes an {@link IOException}.
1575+
* @since 1.10.0
1576+
*/
1577+
public boolean getAllowTrailingText() {
1578+
return allowTrailingText;
1579+
}
1580+
15151581
/**
15161582
* Gets whether to flush on close.
15171583
*
@@ -1692,9 +1758,9 @@ public int hashCode() {
16921758
int result = 1;
16931759
result = prime * result + Arrays.hashCode(headers);
16941760
result = prime * result + Arrays.hashCode(headerComments);
1695-
return prime * result + Objects.hash(duplicateHeaderMode, allowMissingColumnNames, autoFlush, commentMarker, delimiter, escapeCharacter,
1696-
ignoreEmptyLines, ignoreHeaderCase, ignoreSurroundingSpaces, nullString, quoteCharacter, quoteMode, quotedNullString, recordSeparator,
1697-
skipHeaderRecord, trailingDelimiter, trim);
1761+
return prime * result + Objects.hash(duplicateHeaderMode, allowEofWithoutClosingQuote, allowMissingColumnNames, allowTrailingText,
1762+
autoFlush, commentMarker, delimiter, escapeCharacter, ignoreEmptyLines, ignoreHeaderCase, ignoreSurroundingSpaces,
1763+
nullString, quoteCharacter, quoteMode, quotedNullString, recordSeparator, skipHeaderRecord, trailingDelimiter, trim);
16981764
}
16991765

17001766
/**

src/main/java/org/apache/commons/csv/Lexer.java

Lines changed: 21 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,8 @@ final class Lexer implements Closeable {
5757

5858
private final boolean ignoreSurroundingSpaces;
5959
private final boolean ignoreEmptyLines;
60+
private final boolean allowTrailingText;
61+
private final boolean allowEofWithoutClosingQuote;
6062

6163
/** The input stream */
6264
private final ExtendedBufferedReader reader;
@@ -72,6 +74,8 @@ final class Lexer implements Closeable {
7274
this.commentStart = mapNullToDisabled(format.getCommentMarker());
7375
this.ignoreSurroundingSpaces = format.getIgnoreSurroundingSpaces();
7476
this.ignoreEmptyLines = format.getIgnoreEmptyLines();
77+
this.allowTrailingText = format.getAllowTrailingText();
78+
this.allowEofWithoutClosingQuote = format.getAllowEofWithoutClosingQuote();
7579
this.delimiterBuf = new char[delimiter.length - 1];
7680
this.escapeDelimiterBuf = new char[2 * delimiter.length - 1];
7781
}
@@ -364,17 +368,27 @@ private Token parseEncapsulatedToken(final Token token) throws IOException {
364368
token.type = EORECORD;
365369
return token;
366370
}
367-
if (!Character.isWhitespace((char)c)) {
368-
// error invalid char between token and next delimiter
369-
throw new IOException("(line " + getCurrentLineNumber() +
370-
") invalid char between encapsulated token and delimiter");
371+
if (allowTrailingText) {
372+
token.content.append((char) c);
373+
} else {
374+
if (!Character.isWhitespace((char)c)) {
375+
// error invalid char between token and next delimiter
376+
throw new IOException("(line " + getCurrentLineNumber() +
377+
") invalid char between encapsulated token and delimiter");
378+
}
371379
}
372380
}
373381
}
374382
} else if (isEndOfFile(c)) {
375-
// error condition (end of file before end of token)
376-
throw new IOException("(startline " + startLineNumber +
377-
") EOF reached before encapsulated token finished");
383+
if (allowEofWithoutClosingQuote) {
384+
token.type = EOF;
385+
token.isReady = true; // There is data at EOF
386+
return token;
387+
} else {
388+
// error condition (end of file before end of token)
389+
throw new IOException("(startline " + startLineNumber +
390+
") EOF reached before encapsulated token finished");
391+
}
378392
} else {
379393
// consume character
380394
token.content.append((char) c);

src/test/java/org/apache/commons/csv/LexerTest.java

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -431,4 +431,30 @@ public void testTrimTrailingSpacesZeroLength() throws Exception {
431431
lexer.trimTrailingSpaces(buffer);
432432
assertThat(lexer.nextToken(new Token()), matches(EOF, ""));
433433
}
434+
435+
@Test
// Checks both settings of allowTrailingText against the same input, in which every quoted field is followed by extra text.
436+
public void testTrailingTextAfterQuote() throws Exception {
// Unescaped, the input reads: "a" b,"a" " b,"a" b ""
437+
final String code = "\"a\" b,\"a\" \" b,\"a\" b \"\"";
// Enabled: the text after each closing quote is appended to the field content, including further quote characters.
438+
try (final Lexer parser = createLexer(code, CSVFormat.Builder.create().setAllowTrailingText(true).build())) {
439+
assertThat(parser.nextToken(new Token()), matches(TOKEN, "a b"));
440+
assertThat(parser.nextToken(new Token()), matches(TOKEN, "a \" b"));
// The last field runs up to the end of the input, so it is expected as an EOF token that still carries content.
441+
assertThat(parser.nextToken(new Token()), matches(EOF, "a b \"\""));
442+
}
// Disabled: the very first token is expected to fail with an IOException.
443+
try (final Lexer parser = createLexer(code, CSVFormat.Builder.create().setAllowTrailingText(false).build())) {
444+
assertThrows(IOException.class, () -> parser.nextToken(new Token()));
445+
}
446+
}
447+
448+
@Test
// Checks both settings of allowEofWithoutClosingQuote against an input whose last quoted field is never closed.
449+
public void testEOFWithoutClosingQuote() throws Exception {
// Unescaped, the input reads: a,"b
450+
final String code = "a,\"b";
// Enabled: the unterminated field is expected as an EOF token with the content read so far.
451+
try (final Lexer parser = createLexer(code, CSVFormat.Builder.create().setAllowEofWithoutClosingQuote(true).build())) {
452+
assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
453+
assertThat(parser.nextToken(new Token()), matches(EOF, "b"));
454+
}
// Disabled: the first field still parses, but reading the unterminated field is expected to throw an IOException.
455+
try (final Lexer parser = createLexer(code, CSVFormat.Builder.create().setAllowEofWithoutClosingQuote(false).build())) {
456+
assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
457+
assertThrows(IOException.class, () -> parser.nextToken(new Token()));
458+
}
459+
}
434460
}

0 commit comments

Comments
 (0)