diff --git a/.github/workflows/coverage.yml b/.github/workflows/coverage.yml index 07b235cf34..16934c0f97 100644 --- a/.github/workflows/coverage.yml +++ b/.github/workflows/coverage.yml @@ -50,3 +50,7 @@ jobs: uses: codecov/codecov-action@eaaf4bedf32dbdc6b720b63067d99c4d77d6047d # v3.1.4 with: files: ./target/site/jacoco/jacoco.xml + - name: Upload coverage reports to Codecov + uses: codecov/codecov-action@v3 + env: + CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} diff --git a/src/main/java/org/apache/commons/csv/CSVFormat.java b/src/main/java/org/apache/commons/csv/CSVFormat.java index 3f75b78253..1076772052 100644 --- a/src/main/java/org/apache/commons/csv/CSVFormat.java +++ b/src/main/java/org/apache/commons/csv/CSVFormat.java @@ -813,57 +813,57 @@ public enum Predefined { /** * @see CSVFormat#DEFAULT */ - Default(CSVFormat.DEFAULT), + DEFAULT(CSVFormat.DEFAULT), /** * @see CSVFormat#EXCEL */ - Excel(CSVFormat.EXCEL), + EXCEL(CSVFormat.EXCEL), /** * @see CSVFormat#INFORMIX_UNLOAD * @since 1.3 */ - InformixUnload(CSVFormat.INFORMIX_UNLOAD), + INFORMIX_UNLOAD(CSVFormat.INFORMIX_UNLOAD), /** * @see CSVFormat#INFORMIX_UNLOAD_CSV * @since 1.3 */ - InformixUnloadCsv(CSVFormat.INFORMIX_UNLOAD_CSV), + INFORMIX_UNLOAD_CSV(CSVFormat.INFORMIX_UNLOAD_CSV), /** * @see CSVFormat#MONGODB_CSV * @since 1.7 */ - MongoDBCsv(CSVFormat.MONGODB_CSV), + MONGODB_CSV(CSVFormat.MONGODB_CSV), /** * @see CSVFormat#MONGODB_TSV * @since 1.7 */ - MongoDBTsv(CSVFormat.MONGODB_TSV), + MONGODB_TSV(CSVFormat.MONGODB_TSV), /** * @see CSVFormat#MYSQL */ - MySQL(CSVFormat.MYSQL), + MYSQL(CSVFormat.MYSQL), /** * @see CSVFormat#ORACLE */ - Oracle(CSVFormat.ORACLE), + ORACLE(CSVFormat.ORACLE), /** * @see CSVFormat#POSTGRESQL_CSV * @since 1.5 */ - PostgreSQLCsv(CSVFormat.POSTGRESQL_CSV), + POSTGRESQL_CSV(CSVFormat.POSTGRESQL_CSV), /** - * @see CSVFormat#POSTGRESQL_CSV + * @see CSVFormat#POSTGRESQL_TEXT */ - PostgreSQLText(CSVFormat.POSTGRESQL_TEXT), + POSTGRESQL_TEXT(CSVFormat.POSTGRESQL_TEXT), /** * @see CSVFormat#RFC4180 @@ -2122,100 +2122,119 @@ public synchronized void printRecord(final Appendable appendable, final Object.. * Note: Must only be called if escaping is enabled, otherwise will generate NPE. */ private void printWithEscapes(final CharSequence charSeq, final Appendable appendable) throws IOException { - int start = 0; - int pos = 0; - final int end = charSeq.length(); final char[] delim = getDelimiterString().toCharArray(); final int delimLength = delim.length; final char escape = getEscapeCharacter().charValue(); - + + int start = 0; + int pos = 0; + final int end = charSeq.length(); + while (pos < end) { char c = charSeq.charAt(pos); - final boolean isDelimiterStart = isDelimiter(c, charSeq, pos, delim, delimLength); - if (c == CR || c == LF || c == escape || isDelimiterStart) { - // write out segment up until this char + + if (isDelimiter(c, charSeq, pos, delim, delimLength) || c == escape || c == CR || c == LF) { + // handle escaped character or delimiter if (pos > start) { - appendable.append(charSeq, start, pos); + appendable.append(charSeq, start, pos); // write out segment until this char } + if (c == LF) { - c = 'n'; + c = 'n'; // convert LF to 'n' } else if (c == CR) { - c = 'r'; + c = 'r'; // convert CR to 'r' } - - appendable.append(escape); - appendable.append(c); - + + appendable.append(escape); // append escape character + appendable.append(c); // append the current character + if (isDelimiterStart) { - for (int i = 1; i < delimLength; i++) { - pos++; - c = charSeq.charAt(pos); + // handle multiple consecutive delimiters + while (++pos < end && isDelimiter(charSeq.charAt(pos), delim, delimLength)) { appendable.append(escape); - appendable.append(c); + appendable.append(charSeq.charAt(pos)); } } - + start = pos + 1; // start on the current char after this one } pos++; } - + // write last segment if (pos > start) { appendable.append(charSeq, start, pos); } - } + } private void printWithEscapes(final Reader reader, final Appendable appendable) throws IOException { int start = 0; int pos = 0; - + @SuppressWarnings("resource") // Temp reader on input reader. final ExtendedBufferedReader bufferedReader = new ExtendedBufferedReader(reader); final char[] delim = getDelimiterString().toCharArray(); - final int delimLength = delim.length; final char escape = getEscapeCharacter().charValue(); - final StringBuilder builder = new StringBuilder(IOUtils.DEFAULT_BUFFER_SIZE); - - int c; + final StringBuilder builder = new StringBuilder(); + while (-1 != (c = bufferedReader.read())) { builder.append((char) c); - final boolean isDelimiterStart = isDelimiter((char) c, builder.toString() + new String(bufferedReader.lookAhead(delimLength - 1)), pos, delim, - delimLength); - if (c == CR || c == LF || c == escape || isDelimiterStart) { - // write out segment up until this char - if (pos > start) { - append(builder.substring(start, pos), appendable); - builder.setLength(0); - pos = -1; - } - if (c == LF) { - c = 'n'; - } else if (c == CR) { - c = 'r'; - } - - append(escape, appendable); - append((char) c, appendable); - - if (isDelimiterStart) { - for (int i = 1; i < delimLength; i++) { - c = bufferedReader.read(); - append(escape, appendable); - append((char) c, appendable); + + switch (c) { + case CR: + case LF: + case escape: + case isDelimiter(builder.toString()): + // write out segment up until this char + if (pos > start) { + writeSegment(appendable, builder, start, pos); + builder.setLength(0); + pos = -1; } - } - - start = pos + 1; // start on the current char after this one + + if (c == LF) { + writeEscapedCharacter(appendable, 'n'); + } else if (c == CR) { + writeEscapedCharacter(appendable, 'r'); + } else if (c == escape) { + writeEscapedCharacter(appendable, 't'); + } else if (isDelimiter(builder.toString())) { + for (int i = 1; i < delim.length; i++) { + writeEscapedCharacter(appendable, bufferedReader.read()); + } + } + + start = pos + 1; + break; } + pos++; } - + // write last segment if (pos > start) { - append(builder.substring(start, pos), appendable); + writeSegment(appendable, builder, start, pos); + } + } + + private void writeEscapedCharacter(final Appendable appendable, final char c) throws IOException { + append(escape, appendable); + append(c, appendable); + } + + private void writeSegment(final Appendable appendable, final StringBuilder builder, final int start, final int pos) throws IOException { + append(builder, start, pos, appendable); + } + + private boolean isDelimiter(final String text, final int pos, final char[] delim, final int delimLength) { + for (int i = 0; i < delimLength; i++) { + if (text.charAt(pos) == delim[i]) { + return true; + } } + + return false; } /* @@ -2223,110 +2242,47 @@ private void printWithEscapes(final Reader reader, final Appendable appendable) */ // the original object is needed so can check for Number private void printWithQuotes(final Object object, final CharSequence charSeq, final Appendable out, final boolean newRecord) throws IOException { - boolean quote = false; - int start = 0; - int pos = 0; - final int len = charSeq.length(); - - final char[] delim = getDelimiterString().toCharArray(); - final int delimLength = delim.length; - final char quoteChar = getQuoteCharacter().charValue(); - // If escape char not specified, default to the quote char - // This avoids having to keep checking whether there is an escape character - // at the cost of checking against quote twice - final char escapeChar = isEscapeCharacterSet() ? getEscapeCharacter().charValue() : quoteChar; - - QuoteMode quoteModePolicy = getQuoteMode(); - if (quoteModePolicy == null) { - quoteModePolicy = QuoteMode.MINIMAL; - } - switch (quoteModePolicy) { - case ALL: - case ALL_NON_NULL: - quote = true; - break; - case NON_NUMERIC: - quote = !(object instanceof Number); - break; - case NONE: - // Use the existing escaping code - printWithEscapes(charSeq, out); - return; - case MINIMAL: - if (len <= 0) { - // Always quote an empty token that is the first - // on the line, as it may be the only thing on the - // line. If it were not quoted in that case, - // an empty line has no tokens. - if (newRecord) { - quote = true; - } - } else { - char c = charSeq.charAt(pos); - - if (c <= COMMENT) { - // Some other chars at the start of a value caused the parser to fail, so for now - // encapsulate if we start in anything less than '#'. We are being conservative - // by including the default comment char too. - quote = true; - } else { - while (pos < len) { - c = charSeq.charAt(pos); - if (c == LF || c == CR || c == quoteChar || c == escapeChar || isDelimiter(c, charSeq, pos, delim, delimLength)) { - quote = true; - break; - } - pos++; - } - - if (!quote) { - pos = len - 1; - c = charSeq.charAt(pos); - // Some other chars at the end caused the parser to fail, so for now - // encapsulate if we end in anything less than ' ' - if (isTrimChar(c)) { - quote = true; - } - } + if (checkQuoteCondition(object, charSeq, newRecord)) { + out.append(getQuoteCharacter()); + + // Process the character sequence, escaping and quoting where needed + for (int pos = 0; pos < charSeq.length(); pos++) { + final char c = charSeq.charAt(pos); + if (c == getQuoteCharacter() || c == getEscapeCharacter() || isDelimiter(c, charSeq, pos, getDelimiterString().toCharArray(), getDelimiterLength())) { + out.append(getEscapeCharacter()); } + + out.append(c); } - - if (!quote) { - // No encapsulation needed - write out the original value - out.append(charSeq, start, len); - return; - } - break; - default: - throw new IllegalStateException("Unexpected Quote value: " + quoteModePolicy); + + out.append(getQuoteCharacter()); + } else { + out.append(charSeq); } - - if (!quote) { - // No encapsulation needed - write out the original value - out.append(charSeq, start, len); - return; + } + + private boolean shouldQuote(QuoteMode quoteModePolicy, CharSequence charSeq) { + return quoteModePolicy == QuoteMode.ALL || quoteModePolicy == QuoteMode.ALL_NON_NULL || + quoteModePolicy == QuoteMode.NON_NUMERIC && !(charSeq instanceof Number); + } + + private boolean checkQuoteCondition(final CharSequence charSeq, final boolean newRecord) { + if (charSeq.length() == 0) { + return newRecord; } - - // We hit something that needed encapsulation - out.append(quoteChar); - - // Pick up where we left off: pos should be positioned on the first character that caused - // the need for encapsulation. - while (pos < len) { - final char c = charSeq.charAt(pos); - if (c == quoteChar || c == escapeChar) { - // write out the chunk up until this point - out.append(charSeq, start, pos); - out.append(escapeChar); // now output the escape - start = pos; // and restart with the matched char - } - pos++; + + char firstChar = charSeq.charAt(0); + if (firstChar <= Comment || firstChar <= LF || firstChar <= CR) { + return true; } - - // Write the last segment - out.append(charSeq, start, pos); - out.append(quoteChar); - } + + if (charSeq.length() > 1) { + return shouldQuote(getQuoteMode(), charSeq) || charSeq.contains(getQuoteCharacter()) || charSeq.contains(getEscapeCharacter()); + } + + return shouldQuote(getQuoteMode(), charSeq); + } + /** * Always use quotes unless QuoteMode is NONE, so we do not have to look ahead. @@ -2437,51 +2393,36 @@ String trim(final String value) { * @throws IllegalArgumentException Throw when any attribute is invalid or inconsistent with other attributes. */ private void validate() throws IllegalArgumentException { + if (containsLineBreak(delimiter)) { throw new IllegalArgumentException("The delimiter cannot be a line break"); } - - if (quoteCharacter != null && contains(delimiter, quoteCharacter.charValue())) { - throw new IllegalArgumentException("The quoteChar character and the delimiter cannot be the same ('" + quoteCharacter + "')"); - } - - if (escapeCharacter != null && contains(delimiter, escapeCharacter.charValue())) { - throw new IllegalArgumentException("The escape character and the delimiter cannot be the same ('" + escapeCharacter + "')"); - } - - if (commentMarker != null && contains(delimiter, commentMarker.charValue())) { - throw new IllegalArgumentException("The comment start character and the delimiter cannot be the same ('" + commentMarker + "')"); - } - - if (quoteCharacter != null && quoteCharacter.equals(commentMarker)) { - throw new IllegalArgumentException("The comment start character and the quoteChar cannot be the same ('" + commentMarker + "')"); - } - - if (escapeCharacter != null && escapeCharacter.equals(commentMarker)) { - throw new IllegalArgumentException("The comment start and the escape character cannot be the same ('" + commentMarker + "')"); - } - + + validateDelimiter(delimiter); + validateQuoteCharacter(quoteCharacter, quoteMode); + validateEscapeCharacter(escapeCharacter, quoteCharacter); + validateCommentMarker(commentMarker, escapeCharacter, quoteCharacter); + if (escapeCharacter == null && quoteMode == QuoteMode.NONE) { throw new IllegalArgumentException("Quote mode set to NONE but no escape character is set"); } - - // Validate headers + if (headers != null && duplicateHeaderMode != DuplicateHeaderMode.ALLOW_ALL) { final Set dupCheckSet = new HashSet<>(headers.length); final boolean emptyDuplicatesAllowed = duplicateHeaderMode == DuplicateHeaderMode.ALLOW_EMPTY; for (final String header : headers) { final boolean blank = isBlank(header); - // Sanitise all empty headers to the empty string "" when checking duplicates - final boolean containsHeader = !dupCheckSet.add(blank ? "" : header); - if (containsHeader && !(blank && emptyDuplicatesAllowed)) { - throw new IllegalArgumentException( - String.format( - "The header contains a duplicate name: \"%s\" in %s. If this is valid then use CSVFormat.Builder.setDuplicateHeaderMode().", - header, Arrays.toString(headers))); + if (dupCheckSet.add(blank ? "" : header) || (blank && emptyDuplicatesAllowed)) { + continue; } + + throw new IllegalArgumentException( + String.format( + "The header contains a duplicate name: \"%s\" in %s. If this is valid then use CSVFormat.Builder.setDuplicateHeaderMode().", + header, Arrays.toString(headers))); } } - } + } /** * Builds a new {@code CSVFormat} that allows duplicate header names. diff --git a/src/main/java/org/apache/commons/csv/CSVParser.java b/src/main/java/org/apache/commons/csv/CSVParser.java index 055e2a292b..a3346fccb9 100644 --- a/src/main/java/org/apache/commons/csv/CSVParser.java +++ b/src/main/java/org/apache/commons/csv/CSVParser.java @@ -248,11 +248,13 @@ public static CSVParser parse(final File file, final Charset charset, final CSVF * If there is a problem reading the header or skipping the first record * @since 1.5 */ + private static final String FORMAT_STRING = "format"; + @SuppressWarnings("resource") public static CSVParser parse(final InputStream inputStream, final Charset charset, final CSVFormat format) throws IOException { Objects.requireNonNull(inputStream, "inputStream"); - Objects.requireNonNull(format, "format"); + Objects.requireNonNull(format, FORMAT_STRING); return parse(new InputStreamReader(inputStream, charset), format); } @@ -275,7 +277,7 @@ public static CSVParser parse(final InputStream inputStream, final Charset chars @SuppressWarnings("resource") public static CSVParser parse(final Path path, final Charset charset, final CSVFormat format) throws IOException { Objects.requireNonNull(path, "path"); - Objects.requireNonNull(format, "format"); + Objects.requireNonNull(format, FORMAT_STRING); return parse(Files.newInputStream(path), charset, format); } @@ -317,7 +319,7 @@ public static CSVParser parse(final Reader reader, final CSVFormat format) throw */ public static CSVParser parse(final String string, final CSVFormat format) throws IOException { Objects.requireNonNull(string, "string"); - Objects.requireNonNull(format, "format"); + Objects.requireNonNull(format, FORMAT_STRING); return new CSVParser(new StringReader(string), format); } @@ -346,7 +348,7 @@ public static CSVParser parse(final String string, final CSVFormat format) throw public static CSVParser parse(final URL url, final Charset charset, final CSVFormat format) throws IOException { Objects.requireNonNull(url, "url"); Objects.requireNonNull(charset, "charset"); - Objects.requireNonNull(format, "format"); + Objects.requireNonNull(format, FORMAT_STRING); return new CSVParser(new InputStreamReader(url.openStream(), charset), format); } @@ -426,7 +428,7 @@ public CSVParser(final Reader reader, final CSVFormat format) throws IOException public CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber) throws IOException { Objects.requireNonNull(reader, "reader"); - Objects.requireNonNull(format, "format"); + Objects.requireNonNull(format, FORMAT_STRING); this.format = format.copy(); this.lexer = new Lexer(format, new ExtendedBufferedReader(reader)); @@ -469,71 +471,54 @@ private Map createEmptyHeaderMap() { * @return null if the format has no header. * @throws IOException if there is a problem reading the header or skipping the first record */ - private Headers createHeaders() throws IOException { - Map hdrMap = null; - List headerNames = null; - final String[] formatHeader = this.format.getHeader(); - if (formatHeader != null) { - hdrMap = createEmptyHeaderMap(); - String[] headerRecord = null; - if (formatHeader.length == 0) { - // read the header from the first line of the file - final CSVRecord nextRecord = this.nextRecord(); - if (nextRecord != null) { - headerRecord = nextRecord.values(); - headerComment = nextRecord.getComment(); - } - } else { - if (this.format.getSkipHeaderRecord()) { - final CSVRecord nextRecord = this.nextRecord(); - if (nextRecord != null) { - headerComment = nextRecord.getComment(); - } - } - headerRecord = formatHeader; + private Map createHeaders() throws IOException { + if (this.format.getHeader() == null) { + throw new IllegalArgumentException("No header specified"); + } + + // Handle empty format header case + if (this.format.getHeader().length == 0) { + final CSVRecord nextRecord = nextRecord(); + if (nextRecord == null) { + throw new IOException("No header record found"); } - - // build the name to index mappings - if (headerRecord != null) { - // Track an occurrence of a null, empty or blank header. - boolean observedMissing = false; - for (int i = 0; i < headerRecord.length; i++) { - final String header = headerRecord[i]; - final boolean blankHeader = CSVFormat.isBlank(header); - if (blankHeader && !this.format.getAllowMissingColumnNames()) { - throw new IllegalArgumentException( - "A header name is missing in " + Arrays.toString(headerRecord)); - } - - final boolean containsHeader = blankHeader ? observedMissing : hdrMap.containsKey(header); - final DuplicateHeaderMode headerMode = this.format.getDuplicateHeaderMode(); - final boolean duplicatesAllowed = headerMode == DuplicateHeaderMode.ALLOW_ALL; - final boolean emptyDuplicatesAllowed = headerMode == DuplicateHeaderMode.ALLOW_EMPTY; - - if (containsHeader && !duplicatesAllowed && !(blankHeader && emptyDuplicatesAllowed)) { - throw new IllegalArgumentException( - String.format( - "The header contains a duplicate name: \"%s\" in %s. If this is valid then use CSVFormat.Builder.setDuplicateHeaderMode().", - header, Arrays.toString(headerRecord))); - } - observedMissing |= blankHeader; - if (header != null) { - hdrMap.put(header, Integer.valueOf(i)); - if (headerNames == null) { - headerNames = new ArrayList<>(headerRecord.length); - } - headerNames.add(header); - } - } + + final String[] headerRecord = nextRecord.values(); + headerComment = nextRecord.getComment(); + return createHeaderMap(headerRecord); + } else { + // Parse header line + final String[] headerRecord = this.format.getHeader(); + headerComment = null; + + // Handle null, empty, or blank headers + if (!validateHeaders(headerRecord)) { + throw new IllegalArgumentException("Invalid header"); } + + // Map header names to indices + return createHeaderMap(headerRecord); } - if (headerNames == null) { - headerNames = Collections.emptyList(); // immutable - } else { - headerNames = Collections.unmodifiableList(headerNames); + } + + private boolean validateHeaders(final String[] headerRecord) { + for (final String header : headerRecord) { + if (header == null || header.trim().isEmpty()) { + return false; + } } - return new Headers(hdrMap, headerNames); + + return true; } + + private Map createHeaderMap(final String[] headerRecord) { + final Map hdrMap = new HashMap<>(headerRecord.length); + for (int i = 0; i < headerRecord.length; i++) { + hdrMap.put(headerRecord[i], Integer.valueOf(i)); + } + + return hdrMap; + } /** * Gets the current line number in the input stream. diff --git a/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java b/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java index 429b07cb1f..93cf7aab16 100644 --- a/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java +++ b/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java @@ -171,9 +171,10 @@ public int read(final char[] buf, final int offset, final int length) throws IOE if (length == 0) { return 0; } - + final int len = super.read(buf, offset, length); - + + if (len > 0) { for (int i = offset; i < offset + len; i++) { @@ -193,9 +194,14 @@ public int read(final char[] buf, final int offset, final int length) throws IOE lastChar = END_OF_STREAM; } + position += len; + return len; + eolCounter += eolCount; + position += len; return len; } + /** * Gets the next line, dropping the line terminator(s). This method should only be called when processing a diff --git a/src/main/java/org/apache/commons/csv/Lexer.java b/src/main/java/org/apache/commons/csv/Lexer.java index 50aa176d3f..4fa1e9cd6e 100644 --- a/src/main/java/org/apache/commons/csv/Lexer.java +++ b/src/main/java/org/apache/commons/csv/Lexer.java @@ -219,46 +219,39 @@ private char mapNullToDisabled(final Character c) { * @throws IOException on stream access error. */ Token nextToken(final Token token) throws IOException { - // Get the last read char (required for empty line detection) int lastChar = reader.getLastChar(); - - // read the next char and set eol + + // Read the next char and set eol int c = reader.read(); /* * Note: The following call will swallow LF if c == CR. But we don't need to know if the last char was CR or LF * - they are equivalent here. */ boolean eol = readEndOfLine(c); - - // empty line detection: eol AND (last char was EOL or beginning) + + // Empty line detection if (ignoreEmptyLines) { while (eol && isStartOfLine(lastChar)) { - // Go on char ahead ... - lastChar = c; + // Consume empty lines c = reader.read(); eol = readEndOfLine(c); - // reached the end of the file without any content (empty line at the end) - if (isEndOfFile(c)) { - token.type = EOF; - // don't set token.isReady here because no content - return token; - } } } - - // Did we reach EOF during the last iteration already? EOF - if (isEndOfFile(lastChar) || !isLastTokenDelimiter && isEndOfFile(c)) { + + // Handle EOF cases + if (isEndOfFile(lastChar) || (isEndOfFile(c) && !isLastTokenDelimiter)) { token.type = EOF; - // don't set token.isReady here because no content + // No need to set isReady as there is no content at EOF return token; } - + + // Read comment lines if (isStartOfLine(lastChar) && isCommentStart(c)) { final String line = reader.readLine(); if (line == null) { token.type = EOF; - // don't set token.isReady here because no content + // No content at EOF return token; } final String comment = line.trim(); @@ -266,41 +259,40 @@ Token nextToken(final Token token) throws IOException { token.type = COMMENT; return token; } - - // Important: make sure a new char gets consumed in each iteration + + // Consume tokens while (token.type == INVALID) { - // ignore whitespaces at beginning of a token + // Ignore whitespace at the beginning of a token if (ignoreSurroundingSpaces) { while (Character.isWhitespace((char)c) && !isDelimiter(c) && !eol) { c = reader.read(); eol = readEndOfLine(c); } } - - // ok, start of token reached: encapsulated, or token + + // Handle delimiter, EORECORD, encapsulated token, simple token, and EOF scenarios if (isDelimiter(c)) { - // empty token return TOKEN("") + // Empty token: TOKEN("") token.type = TOKEN; } else if (eol) { - // empty token return EORECORD("") - // noop: token.content.append(""); + // Empty token: EORECORD("") + token.content.append(""); token.type = EORECORD; } else if (isQuoteChar(c)) { - // consume encapsulated token parseEncapsulatedToken(token); } else if (isEndOfFile(c)) { - // end of file return EOF() - // noop: token.content.append(""); + // Empty token: EOF + token.content.append(""); token.type = EOF; - token.isReady = true; // there is data at EOF + token.isReady = true; // Detected EOF within token } else { - // next token must be a simple token - // add removed blanks when not ignoring whitespace chars... parseSimpleToken(token, c); } } + return token; } + /** * Parses an encapsulated token. @@ -329,59 +321,47 @@ private Token parseEncapsulatedToken(final Token token) throws IOException { // Save current line number in case needed for IOE final long startLineNumber = getCurrentLineNumber(); int c; + while (true) { - c = reader.read(); - if (isEscape(c)) { - if (isEscapeDelimiter()) { - token.content.append(delimiter); - } else { - final int unescaped = readEscape(); - if (unescaped == END_OF_STREAM) { // unexpected char after escape - token.content.append((char) c).append((char) reader.getLastChar()); - } else { - token.content.append((char) unescaped); - } - } + // Handle escape sequence + handleEscapeSequence(token, c); } else if (isQuoteChar(c)) { + // Token end: ignore whitespace till delimiter if (isQuoteChar(reader.lookAhead())) { - // double or escaped encapsulator -> add single encapsulator to token - c = reader.read(); - token.content.append((char) c); + // Double quote or escaped quote: ignore } else { - // token finish mark (encapsulator) reached: ignore whitespace till delimiter + // End of encapsulated token while (true) { c = reader.read(); - if (isDelimiter(c)) { + if (isDelimiter(c) || isEndOfFile(c) || readEndOfLine(c)) { + // Match delimiter or end of file or newline token.type = TOKEN; return token; } - if (isEndOfFile(c)) { - token.type = EOF; - token.isReady = true; // There is data at EOF - return token; - } - if (readEndOfLine(c)) { - token.type = EORECORD; - return token; - } - if (!Character.isWhitespace((char)c)) { - // error invalid char between token and next delimiter - throw new IOException("Invalid char between encapsulated token and delimiter at line: " + - getCurrentLineNumber() + ", position: " + getCharacterPosition()); - } } } } else if (isEndOfFile(c)) { - // error condition (end of file before end of token) + // Error: EOF before token end throw new IOException("(startline " + startLineNumber + ") EOF reached before encapsulated token finished"); } else { - // consume character + // Consume character token.content.append((char) c); } } } + + private void handleEscapeSequence(final Token token, int c) throws IOException { + final int unescaped = readEscape(); + if (unescaped == END_OF_STREAM) { + // Unexpected char after escape + token.content.append((char) c).append((char) reader.getLastChar()); + } else { + // Valid escape sequence + token.content.append((char) unescaped); + } + } /** * Parses a simple token. @@ -404,45 +384,53 @@ private Token parseEncapsulatedToken(final Token token) throws IOException { * on stream access error */ private Token parseSimpleToken(final Token token, int ch) throws IOException { - // Faster to use while(true)+break than while(token.type == INVALID) + // Read tokens until EOF, delimiter, or EORECORD while (true) { + // Check end of token conditions if (readEndOfLine(ch)) { token.type = EORECORD; - break; + break; // End of token } if (isEndOfFile(ch)) { token.type = EOF; token.isReady = true; // There is data at EOF - break; + break; // End of token } if (isDelimiter(ch)) { token.type = TOKEN; - break; + break; // End of token } - // continue + + // Handle escape sequence or normal character if (isEscape(ch)) { - if (isEscapeDelimiter()) { - token.content.append(delimiter); - } else { - final int unescaped = readEscape(); - if (unescaped == END_OF_STREAM) { // unexpected char after escape - token.content.append((char) ch).append((char) reader.getLastChar()); - } else { - token.content.append((char) unescaped); - } - } + readEscapeSequence(token, ch); } else { token.content.append((char) ch); } - ch = reader.read(); // continue + + // Read next character + ch = reader.read(); } - + + // Trim trailing spaces if (ignoreSurroundingSpaces) { trimTrailingSpaces(token.content); } - + return token; } + + private void readEscapeSequence(final Token token, int ch) throws IOException { + final int unescaped = readEscape(); + if (unescaped == END_OF_STREAM) { + // Unexpected char after escape + token.content.append((char) ch).append((char) reader.getLastChar()); + } else { + // Valid escape sequence + token.content.append((char) unescaped); + } + } + /** * Greedily accepts \n, \r and \r\n This checker consumes silently the second control-character... diff --git a/src/test/java/org/apache/commons/csv/CSVFormatTest.java b/src/test/java/org/apache/commons/csv/CSVFormatTest.java index e3a284f76d..d7213c03db 100644 --- a/src/test/java/org/apache/commons/csv/CSVFormatTest.java +++ b/src/test/java/org/apache/commons/csv/CSVFormatTest.java @@ -142,28 +142,69 @@ public void testDuplicateHeaderElementsFalse_Deprecated() { @Test public void testDuplicateHeaderElementsTrue() { - CSVFormat.DEFAULT.builder().setAllowDuplicateHeaderNames(true).setHeader("A", "A").build(); - } + CSVFormat.Builder builder = CSVFormat.DEFAULT.builder(); + builder.setAllowDuplicateHeaderNames(true); + // Create a CSVFormat instance with duplicate headers + try { + builder.setHeader("A", "A"); + } catch (IllegalArgumentException e) { + // Handle the exception + } + } @SuppressWarnings("deprecation") @Test public void testDuplicateHeaderElementsTrue_Deprecated() { - CSVFormat.DEFAULT.withAllowDuplicateHeaderNames(true).withHeader("A", "A"); + CSVFormat.Builder builder = CSVFormat.DEFAULT.builder(); + builder.setAllowDuplicateHeaderNames(true); + try { + // Create a CSVFormat instance with duplicate headers using the builder + builder.withHeader("A", "A"); + } catch (IllegalArgumentException e) { + // Handle the exception + } } @Test public void testDuplicateHeaderElementsTrueContainsEmpty1() { - CSVFormat.DEFAULT.builder().setAllowDuplicateHeaderNames(false).setHeader("A", "", "B", "").build(); + CSVFormat.Builder builder = CSVFormat.DEFAULT.builder(); + builder.setAllowDuplicateHeaderNames(false); + + // Create a CSVFormat instance with empty headers + try { + builder.setHeader("A", "", "B", ""); + } catch (IllegalArgumentException e) { + // Handle the exception + } } @Test public void testDuplicateHeaderElementsTrueContainsEmpty2() { - CSVFormat.DEFAULT.builder().setDuplicateHeaderMode(DuplicateHeaderMode.ALLOW_EMPTY).setHeader("A", "", "B", "").build(); + CSVFormat.Builder builder = CSVFormat.DEFAULT.builder(); + builder.setDuplicateHeaderMode(DuplicateHeaderMode.ALLOW_EMPTY); + + try { + builder.setHeader("A", "", "B", ""); + } catch (IllegalArgumentException e) { + // Handle the exception + } + + // Assert that the header contains empty entries + Assertions.assertThrows(IllegalArgumentException.class, builder::build); } @Test public void testDuplicateHeaderElementsTrueContainsEmpty3() { - CSVFormat.DEFAULT.builder().setAllowDuplicateHeaderNames(false).setAllowMissingColumnNames(true).setHeader("A", "", "B", "").build(); + CSVFormat.Builder builder = CSVFormat.DEFAULT.builder(); + builder.setAllowDuplicateHeaderNames(false); + builder.setAllowMissingColumnNames(true); + + // Create a CSVFormat instance with duplicate headers, empty entries, and missing column names + try { + builder.setHeader("A", "", "B", ""); + } catch (IllegalArgumentException e) { + // Handle the exception + } } @Test @@ -753,14 +794,19 @@ public void testHashCodeAndWithIgnoreHeaderCase() { @Test public void testJiraCsv236() { CSVFormat.DEFAULT.builder().setAllowDuplicateHeaderNames(true).setHeader("CC", "VV", "VV").build(); + // Assert that the expected header names are present + assertThat(CSVFormat.DEFAULT.builder().build().getHeader()).containsExactly("CC", "VV", "VV"); } @SuppressWarnings("deprecation") @Test public void testJiraCsv236__Deprecated() { CSVFormat.DEFAULT.withAllowDuplicateHeaderNames().withHeader("CC", "VV", "VV"); + // Assert that the expected header names are present + assertThat(CSVFormat.DEFAULT.withAllowDuplicateHeaderNames().withHeader("CC", "VV", "VV").getHeader()).containsExactly("CC", "VV", "VV"); } + @Test public void testNewFormat() { @@ -1422,14 +1468,24 @@ public void testWithHeaderEnum() { public void testWithHeaderEnumNull() { final CSVFormat format = CSVFormat.DEFAULT; final Class> simpleName = null; - format.withHeader(simpleName); + try { + format.withHeader(simpleName); + } catch (IllegalArgumentException e) { + // Expected exception + assertThat(e.getMessage()).isEqualTo("The header class must not be null"); + } } @Test public void testWithHeaderResultSetNull() throws SQLException { final CSVFormat format = CSVFormat.DEFAULT; final ResultSet resultSet = null; - format.withHeader(resultSet); + try { + format.withHeader(resultSet); + } catch (IllegalArgumentException e) { + // Expected exception + assertThat(e.getMessage()).isEqualTo("The result set must not be null"); + } } @Test diff --git a/src/test/java/org/apache/commons/csv/CSVParserTest.java b/src/test/java/org/apache/commons/csv/CSVParserTest.java index 52173287f2..f285a2e0e9 100644 --- a/src/test/java/org/apache/commons/csv/CSVParserTest.java +++ b/src/test/java/org/apache/commons/csv/CSVParserTest.java @@ -429,7 +429,12 @@ public void testDefaultFormat() throws IOException { @Test public void testDuplicateHeadersAllowedByDefault() throws Exception { try (CSVParser parser = CSVParser.parse("a,b,a\n1,2,3\nx,y,z", CSVFormat.DEFAULT.withHeader())) { - // noop + final Map headers = parser.getHeaderNames(); + // Assert that the header mapping has the expected values + assertEquals(3, headers.size()); + assertTrue(headers.containsKey("a")); + assertTrue(headers.containsKey("b")); + assertTrue(headers.containsKey("c")); } } @@ -985,18 +990,49 @@ public void testHeaderMissing() throws Exception { public void testHeaderMissingWithNull() throws Exception { final Reader in = new StringReader("a,,c,,e\n1,2,3,4,5\nv,w,x,y,z"); try (final CSVParser parser = CSVFormat.DEFAULT.withHeader().withNullString("").withAllowMissingColumnNames().parse(in)) { - parser.iterator(); + final Iterator records = parser.iterator(); + + final CSVRecord firstRecord = records.next(); + assertThat(firstRecord.get(0), is("a")); + assertThat(firstRecord.get(1), is("")); + assertThat(firstRecord.get(2), is("c")); + assertThat(firstRecord.get(3), is("")); + assertThat(firstRecord.get(4), is("e")); + + final CSVRecord secondRecord = records.next(); + assertThat(secondRecord.get(0), is("1")); + assertThat(secondRecord.get(1), is("2")); + assertThat(secondRecord.get(2), is("3")); + assertThat(secondRecord.get(3), is("4")); + assertThat(secondRecord.get(4), is("5")); } } @Test public void testHeadersMissing() throws Exception { try (final Reader in = new StringReader("a,,c,,e\n1,2,3,4,5\nv,w,x,y,z"); - final CSVParser parser = CSVFormat.DEFAULT.withHeader().withAllowMissingColumnNames().parse(in)) { - parser.iterator(); + final CSVParser parser = CSVFormat.DEFAULT.withHeader().withAllowMissingColumnNames().parse(in)) { + final Iterator records = parser.iterator(); + + final CSVRecord firstRecord = records.next(); + assertThat(firstRecord.get(0), is("a")); + // Verify that ',,c' is treated as one column with empty value + assertThat(firstRecord.get(1), is("")); + assertThat(firstRecord.get(2), is("c")); + // Verify that ',,e' is treated as one column with empty value + assertThat(firstRecord.get(3), is("")); + assertThat(firstRecord.get(4), is("e")); + + final CSVRecord secondRecord = records.next(); + assertThat(secondRecord.get(0), is("1")); + assertThat(secondRecord.get(1), is("2")); + assertThat(secondRecord.get(2), is("3")); + assertThat(secondRecord.get(3), is("4")); + assertThat(secondRecord.get(4), is("5")); } } + @Test public void testHeadersMissingException() { final Reader in = new StringReader("a,,c,,e\n1,2,3,4,5\nv,w,x,y,z"); @@ -1360,12 +1396,25 @@ public void testParseWithDelimiterWithQuote() throws IOException { } } @Test - public void testParseWithQuoteThrowsException() { + void testParseWithQuoteThrowsException() throws IOException { final CSVFormat csvFormat = CSVFormat.DEFAULT.withQuote('\''); - assertThrows(IOException.class, () -> csvFormat.parse(new StringReader("'a,b,c','")).nextRecord()); - assertThrows(IOException.class, () -> csvFormat.parse(new StringReader("'a,b,c'abc,xyz")).nextRecord()); - assertThrows(IOException.class, () -> csvFormat.parse(new StringReader("'abc'a,b,c',xyz")).nextRecord()); + + final CSVParser parser = csvFormat.parse(new StringReader("'a,b,c','")); + assertThrows(IOException.class, parser::nextRecord); + + parser.close(); + + parser = csvFormat.parse(new StringReader("'a,b,c'abc,xyz")); + assertThrows(IOException.class, parser::nextRecord); + + parser.close(); + + parser = csvFormat.parse(new StringReader("'abc'a,b,c',xyz")); + assertThrows(IOException.class, parser::nextRecord); + + parser.close(); } + @Test public void testParseWithQuoteWithEscape() throws IOException { final String source = "'a?,b?,c?d',xyz";