Skip to content

Commit cdacb4d

Browse files
committed
[ 2c61208 ] after bug fix 3
2 parents fecab6c + 2c61208 commit cdacb4d

3 files changed

Lines changed: 40 additions & 16 deletions

File tree

src/main/java/org/apache/commons/csv/CSVLexer.java

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -160,7 +160,12 @@ private Token parseSimpleToken(final Token tkn, int c) throws IOException {
160160
tkn.type = TOKEN;
161161
break;
162162
} else if (isEscape(c)) {
163-
tkn.content.append((char) readEscape());
163+
final int unescaped = readEscape();
164+
if (unescaped == Constants.END_OF_STREAM) { // unexpected char after escape
165+
tkn.content.append((char) c).append((char) in.getLastChar());
166+
} else {
167+
tkn.content.append((char) unescaped);
168+
}
164169
c = in.read(); // continue
165170
} else {
166171
tkn.content.append((char) c);
@@ -203,7 +208,12 @@ private Token parseEncapsulatedToken(final Token tkn) throws IOException {
203208
c = in.read();
204209

205210
if (isEscape(c)) {
206-
tkn.content.append((char) readEscape());
211+
final int unescaped = readEscape();
212+
if (unescaped == Constants.END_OF_STREAM) { // unexpected char after escape
213+
tkn.content.append((char) c).append((char) in.getLastChar());
214+
} else {
215+
tkn.content.append((char) unescaped);
216+
}
207217
} else if (isQuoteChar(c)) {
208218
if (isQuoteChar(in.lookAhead())) {
209219
// double or escaped encapsulator -> add single encapsulator to token

src/main/java/org/apache/commons/csv/Lexer.java

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -74,8 +74,18 @@ long getLineNumber() {
7474
}
7575

7676
// TODO escape handling needs more work
77+
/**
78+
* Handle an escape sequence.
79+
* The current character must be the escape character.
80+
* On return, the character that followed the escape has been consumed and is available by calling {@link ExtendedBufferedReader#getLastChar()}
81+
* on the input stream.
82+
*
83+
* @return the unescaped character (as an int) or {@link Constants#END_OF_STREAM} if the character following the escape is invalid.
84+
* @throws IOException if there is a problem reading the stream or the end of stream is detected:
85+
* the escape character is not allowed at end of stream
86+
*/
7787
int readEscape() throws IOException {
78-
// assume c is the escape char (normally a backslash)
88+
// the escape char has just been read (normally a backslash)
7989
final int c = in.read();
8090
switch (c) {
8191
case 'r':
@@ -88,10 +98,21 @@ int readEscape() throws IOException {
8898
return BACKSPACE;
8999
case 'f':
90100
return FF;
101+
case CR:
102+
case LF:
103+
case FF: // TODO is this correct?
104+
case TAB: // TODO is this correct? Do tabs need to be escaped?
105+
case BACKSPACE: // TODO is this correct?
106+
return c;
91107
case END_OF_STREAM:
92108
throw new IOException("EOF whilst processing escape sequence");
93109
default:
94-
return c;
110+
// Now check for meta-characters
111+
if (isDelimiter(c) || isEscape(c) || isQuoteChar(c) || isCommentStart(c)) {
112+
return c;
113+
}
114+
// indicate unexpected char - available from in.getLastChar()
115+
return END_OF_STREAM;
95116
}
96117
}
97118

src/test/java/org/apache/commons/csv/CSVLexerTest.java

Lines changed: 5 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,6 @@
3636
import java.io.StringReader;
3737

3838
import org.junit.Before;
39-
import org.junit.Ignore;
4039
import org.junit.Test;
4140

4241
/**
@@ -311,48 +310,42 @@ public void testEscapedLF() throws Exception {
311310
assertThat(lexer.nextToken(new Token()), hasContent("character" + LF + "Escaped"));
312311
}
313312

314-
@Test
313+
@Test // TODO is this correct? Do we expect TAB to be un/escaped?
315314
public void testEscapedTab() throws Exception {
316315
final Lexer lexer = getLexer("character\\" + TAB + "Escaped", formatWithEscaping);
317316
assertThat(lexer.nextToken(new Token()), hasContent("character" + TAB + "Escaped"));
318317
}
319318

320-
@Test
319+
@Test // TODO is this correct? Do we expect BACKSPACE to be un/escaped?
321320
public void testEscapeBackspace() throws Exception {
322321
final Lexer lexer = getLexer("character\\" + BACKSPACE + "Escaped", formatWithEscaping);
323322
assertThat(lexer.nextToken(new Token()), hasContent("character" + BACKSPACE + "Escaped"));
324323
}
325324

326-
@Test
325+
@Test // TODO is this correct? Do we expect FF to be un/escaped?
327326
public void testEscapeFF() throws Exception {
328327
final Lexer lexer = getLexer("character\\" + FF + "Escaped", formatWithEscaping);
329328
assertThat(lexer.nextToken(new Token()), hasContent("character" + FF + "Escaped"));
330329
}
331330

332-
// FIXME this should work after CSV-58 is resolved. Currently the result will be "charactera\NEscaped"
333331
@Test
334-
@Ignore
335332
public void testEscapedMySqlNullValue() throws Exception {
336333
// MySQL uses \N to symbolize null values. We have to restore this
337334
final Lexer lexer = getLexer("character\\NEscaped", formatWithEscaping);
338335
assertThat(lexer.nextToken(new Token()), hasContent("character\\NEscaped"));
339336
}
340337

341-
// FIXME this should work after CSV-58 is resolved. Currently the result will be "characteraEscaped"
342338
@Test
343-
@Ignore
344339
public void testEscapedCharacter() throws Exception {
345340
final Lexer lexer = getLexer("character\\aEscaped", formatWithEscaping);
346341
assertThat(lexer.nextToken(new Token()), hasContent("character\\aEscaped"));
347342
}
348343

349-
// FIXME this should work after CSV-58 is resolved. Currently the result will be "characterCREscaped"
350344
@Test
351-
@Ignore
352345
public void testEscapedControlCharacter() throws Exception {
353-
// we are explicitly using an escape different from \ here, because \r is the character sequence for CR
346+
// we are explicitly using an escape different from \ here
354347
final Lexer lexer = getLexer("character!rEscaped", CSVFormat.newBuilder().withEscape('!').build());
355-
assertThat(lexer.nextToken(new Token()), hasContent("character!rEscaped"));
348+
assertThat(lexer.nextToken(new Token()), hasContent("character" + CR + "Escaped"));
356349
}
357350

358351
@Test

0 commit comments

Comments
 (0)