diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml index 08c673ee0..cca38e512 100644 --- a/.github/workflows/codeql-analysis.yml +++ b/.github/workflows/codeql-analysis.yml @@ -49,7 +49,7 @@ jobs: uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0 with: persist-credentials: false - - uses: actions/cache@27d5ce7f107fe9357f9df03efb73ab90386fccae #v5.0.5 + - uses: actions/cache@55cc8345863c7cc4c66a329aec7e433d2d1c52a9 #v6.1.0 with: path: ~/.m2/repository key: ${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }} diff --git a/.github/workflows/maven.yml b/.github/workflows/maven.yml index a6154ddb1..17ba7dd38 100644 --- a/.github/workflows/maven.yml +++ b/.github/workflows/maven.yml @@ -46,14 +46,14 @@ jobs: - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0 with: persist-credentials: false - - uses: actions/cache@27d5ce7f107fe9357f9df03efb73ab90386fccae #v5.0.5 + - uses: actions/cache@55cc8345863c7cc4c66a329aec7e433d2d1c52a9 #v6.1.0 with: path: ~/.m2/repository key: ${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }} restore-keys: | ${{ runner.os }}-maven- - name: Set up JDK ${{ matrix.java }} - uses: actions/setup-java@ad2b38190b15e4d6bdf0c97fb4fca8412226d287 # v5.3.0 + uses: actions/setup-java@1bcf9fb12cf4aa7d266a90ae39939e61372fe520 # v5.4.0 with: distribution: ${{ runner.os == 'macOS' && matrix.java == '8' && 'zulu' || 'temurin' }} java-version: ${{ matrix.java }} diff --git a/.github/workflows/scorecards-analysis.yml b/.github/workflows/scorecards-analysis.yml index bf246c140..e1868cb46 100644 --- a/.github/workflows/scorecards-analysis.yml +++ b/.github/workflows/scorecards-analysis.yml @@ -40,7 +40,7 @@ jobs: steps: - name: "Checkout code" - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # 6.0.2 + uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # 7.0.0 with: persist-credentials: false diff --git a/src/changes/changes.xml b/src/changes/changes.xml index 0d0175ccc..93952e9f1 100644 --- a/src/changes/changes.xml +++ b/src/changes/changes.xml @@ -62,6 +62,7 @@ Escape leading comment marker in printWithEscapes (#614). Skip byte counting at EOF in ExtendedBufferedReader.read (#615). Keep quoted empty trailing field with trailingDelimiter (#616). + Evaluate isDelimiter once in nextToken whitespace skip (#618).. Add an "Android Compatibility" section to the web site. Add CSVParser.Builder.setByteOffset(long) (#604). diff --git a/src/main/java/org/apache/commons/csv/CSVRecord.java b/src/main/java/org/apache/commons/csv/CSVRecord.java index 502bf318a..8dab14d90 100644 --- a/src/main/java/org/apache/commons/csv/CSVRecord.java +++ b/src/main/java/org/apache/commons/csv/CSVRecord.java @@ -281,7 +281,7 @@ public Iterator iterator() { /** * Puts all values of this record into the given Map. * - * @param the map type. + * @param The map type. * @param map The Map to populate. * @return the given map. * @since 1.9.0 diff --git a/src/main/java/org/apache/commons/csv/Lexer.java b/src/main/java/org/apache/commons/csv/Lexer.java index 93a584663..fe964480a 100644 --- a/src/main/java/org/apache/commons/csv/Lexer.java +++ b/src/main/java/org/apache/commons/csv/Lexer.java @@ -277,15 +277,22 @@ Token nextToken(final Token token) throws IOException { } // Important: make sure a new char gets consumed in each iteration while (token.type == Token.Type.INVALID) { + // isDelimiter consumes the trailing characters of a multi-character delimiter as a side effect, so it must + // only be evaluated once per character. Remember a match found while skipping whitespace below. + boolean delimiter = false; // ignore whitespaces at beginning of a token if (ignoreSurroundingSpaces) { - while (Character.isWhitespace((char) c) && !isDelimiter(c) && !eol) { + while (Character.isWhitespace((char) c) && !eol) { + if (isDelimiter(c)) { + delimiter = true; + break; + } c = reader.read(); eol = readEndOfLine(c); } } // ok, start of token reached: encapsulated, or token - if (isDelimiter(c)) { + if (delimiter || isDelimiter(c)) { // empty token return TOKEN("") token.type = Token.Type.TOKEN; } else if (eol) { diff --git a/src/test/java/org/apache/commons/csv/CSVParserTest.java b/src/test/java/org/apache/commons/csv/CSVParserTest.java index 051548757..6d9bdd9e8 100644 --- a/src/test/java/org/apache/commons/csv/CSVParserTest.java +++ b/src/test/java/org/apache/commons/csv/CSVParserTest.java @@ -465,6 +465,31 @@ void testDuplicateHeadersNotAllowed() { () -> CSVParser.parse("a,b,a\n1,2,3\nx,y,z", CSVFormat.DEFAULT.withHeader().withAllowDuplicateHeaderNames(false))); } + /** + * With {@code ignoreSurroundingSpaces} enabled and a multi-character delimiter whose first character is whitespace, + * the empty field at the delimiter boundary must survive. The delimiter look-ahead is consumed while skipping + * leading whitespace, so re-evaluating it would drop the empty field and merge the following field's value. + */ + @Test + void testEmptyFieldBeforeWhitespacePrefixedMultiCharacterDelimiter() throws IOException { + final CSVFormat format = CSVFormat.DEFAULT.builder().setDelimiter(" |").setIgnoreSurroundingSpaces(true).get(); + try (CSVParser parser = CSVParser.parse(" |a", format)) { + final List records = parser.getRecords(); + assertEquals(1, records.size()); + assertValuesEquals(new String[] { "", "a" }, records.get(0)); + } + try (CSVParser parser = CSVParser.parse("a | |b", format)) { + final List records = parser.getRecords(); + assertEquals(1, records.size()); + assertValuesEquals(new String[] { "a", "", "b" }, records.get(0)); + } + try (CSVParser parser = CSVParser.parse("a | |b |", format)) { + final List records = parser.getRecords(); + assertEquals(1, records.size()); + assertValuesEquals(new String[] { "a", "", "b", "" }, records.get(0)); + } + } + @Test void testEmptyFile() throws Exception { try (CSVParser parser = CSVParser.parse(Paths.get("src/test/resources/org/apache/commons/csv/empty.txt"), StandardCharsets.UTF_8, diff --git a/src/test/java/org/apache/commons/csv/LexerTest.java b/src/test/java/org/apache/commons/csv/LexerTest.java index db1ab3a6d..a76f6e513 100644 --- a/src/test/java/org/apache/commons/csv/LexerTest.java +++ b/src/test/java/org/apache/commons/csv/LexerTest.java @@ -216,6 +216,25 @@ void testDelimiterIsWhitespace() throws IOException { } } + /** + * With {@code ignoreSurroundingSpaces} enabled and a multi-character delimiter whose first character is whitespace, + * the side-effecting {@link Lexer#isDelimiter(int)} must only be evaluated once per character, otherwise the + * delimiter is consumed in the whitespace-skip loop and the empty field at the boundary is dropped. + */ + @Test + void testEmptyTokenBeforeWhitespacePrefixedMultiCharacterDelimiter() throws IOException { + final CSVFormat format = CSVFormat.DEFAULT.builder().setDelimiter(" |").setIgnoreSurroundingSpaces(true).get(); + try (Lexer lexer = createLexer(" |a", format)) { + assertNextToken(TOKEN, "", lexer); + assertNextToken(EOF, "a", lexer); + } + try (Lexer lexer = createLexer("a | |b", format)) { + assertNextToken(TOKEN, "a", lexer); + assertNextToken(TOKEN, "", lexer); + assertNextToken(EOF, "b", lexer); + } + } + @Test void testEOFWithoutClosingQuote() throws Exception { final String code = "a,\"b";