Skip to content

Commit b55fb21

Browse files
committed
SANDBOX-206: add escape to strategy, turn off backslash-style escaping by default
git-svn-id: https://svn.apache.org/repos/asf/commons/sandbox/csv/trunk@609155 13f79535-47bb-0310-9956-ffa450edef68
1 parent f34ce7d commit b55fb21

4 files changed

Lines changed: 127 additions & 60 deletions

File tree

src/java/org/apache/commons/csv/CSVParser.java

Lines changed: 44 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -134,7 +134,7 @@ public CSVParser(Reader input) {
134134
* @deprecated use {@link #CSVParser(Reader,CSVStrategy)}.
135135
*/
136136
public CSVParser(Reader input, char delimiter) {
137-
this(input, delimiter, '"', (char) 0);
137+
this(input, delimiter, '"', CSVStrategy.COMMENTS_DISABLED);
138138
}
139139

140140
/**
@@ -347,7 +347,7 @@ protected Token nextToken(Token tkn) throws IOException {
347347
eol = isEndOfLine(c);
348348
}
349349
// ok, start of token reached: comment, encapsulated, or token
350-
if (!strategy.isCommentingDisabled() && c == strategy.getCommentStart()) {
350+
if (c == strategy.getCommentStart()) {
351351
// ignore everything till end of line and continue (incr linecount)
352352
in.readLine();
353353
tkn = nextToken(tkn.reset());
@@ -400,19 +400,22 @@ protected Token nextToken(Token tkn) throws IOException {
400400
*/
401401
private Token simpleTokenLexer(Token tkn, int c) throws IOException {
402402
wsBuf.clear();
403-
while (!tkn.isReady) {
403+
for (;;) {
404404
if (isEndOfLine(c)) {
405405
// end of record
406406
tkn.type = TT_EORECORD;
407407
tkn.isReady = true;
408+
return tkn;
408409
} else if (isEndOfFile(c)) {
409410
// end of file
410411
tkn.type = TT_EOF;
411412
tkn.isReady = true;
413+
return tkn;
412414
} else if (c == strategy.getDelimiter()) {
413415
// end of token
414416
tkn.type = TT_TOKEN;
415417
tkn.isReady = true;
418+
return tkn;
416419
} else if (c == '\\' && strategy.getUnicodeEscapeInterpretation() && in.lookAhead() == 'u') {
417420
// interpret unicode escaped chars (like \u0070 -> p)
418421
tkn.content.append((char) unicodeEscapeLexer(c));
@@ -422,6 +425,8 @@ private Token simpleTokenLexer(Token tkn, int c) throws IOException {
422425
if (tkn.content.length() > 0) {
423426
wsBuf.append((char) c);
424427
}
428+
} else if (c == strategy.getEscape()) {
429+
tkn.content.append((char)readEscape(c));
425430
} else {
426431
// prepend whitespaces (if we have)
427432
if (wsBuf.length() > 0) {
@@ -435,7 +440,6 @@ private Token simpleTokenLexer(Token tkn, int c) throws IOException {
435440
c = in.read();
436441
}
437442
}
438-
return tkn;
439443
}
440444

441445

@@ -457,70 +461,55 @@ private Token encapsulatedTokenLexer(Token tkn, int c) throws IOException {
457461
int startLineNumber = getLineNumber();
458462
// ignore the given delimiter
459463
// assert c == delimiter;
460-
c = in.read();
461-
while (!tkn.isReady) {
462-
boolean skipRead = false;
463-
if (c == strategy.getEncapsulator() || c == '\\') {
464-
// check lookahead
464+
for (;;) {
465+
c = in.read();
466+
467+
if (c == '\\' && strategy.getUnicodeEscapeInterpretation() && in.lookAhead()=='u') {
468+
tkn.content.append((char) unicodeEscapeLexer(c));
469+
} else if (c == strategy.getEscape()) {
470+
tkn.content.append((char)readEscape(c));
471+
} else if (c == strategy.getEncapsulator()) {
465472
if (in.lookAhead() == strategy.getEncapsulator()) {
466473
// double or escaped encapsulator -> add single encapsulator to token
467474
c = in.read();
468475
tkn.content.append((char) c);
469-
} else if (c == '\\' && in.lookAhead() == '\\') {
470-
// doubled escape char, it does not escape itself, only encapsulator
471-
// -> add both escape chars to stream
472-
tkn.content.append((char) c);
473-
c = in.read();
474-
tkn.content.append((char) c);
475-
} else if (
476-
strategy.getUnicodeEscapeInterpretation()
477-
&& c == '\\'
478-
&& in.lookAhead() == 'u') {
479-
// interpret unicode escaped chars (like \u0070 -> p)
480-
tkn.content.append((char) unicodeEscapeLexer(c));
481-
} else if (c == '\\') {
482-
// use a single escape character -> add it to stream
483-
tkn.content.append((char) c);
484476
} else {
485477
// token finish mark (encapsulator) reached: ignore whitespace till delimiter
486-
while (!tkn.isReady) {
478+
for (;;) {
487479
c = in.read();
488480
if (c == strategy.getDelimiter()) {
489481
tkn.type = TT_TOKEN;
490482
tkn.isReady = true;
483+
return tkn;
491484
} else if (isEndOfFile(c)) {
492485
tkn.type = TT_EOF;
493486
tkn.isReady = true;
487+
return tkn;
494488
} else if (isEndOfLine(c)) {
495489
// ok eo token reached
496490
tkn.type = TT_EORECORD;
497491
tkn.isReady = true;
492+
return tkn;
498493
} else if (!isWhitespace(c)) {
499-
// error invalid char between token and next delimiter
500-
throw new IOException(
501-
"(line " + getLineNumber()
502-
+ ") invalid char between encapsulated token end delimiter"
503-
);
504-
}
494+
// error invalid char between token and next delimiter
495+
throw new IOException(
496+
"(line " + getLineNumber()
497+
+ ") invalid char between encapsulated token end delimiter"
498+
);
499+
}
505500
}
506-
skipRead = true;
507501
}
508502
} else if (isEndOfFile(c)) {
509503
// error condition (end of file before end of token)
510504
throw new IOException(
511-
"(startline " + startLineNumber + ")"
512-
+ "eof reached before encapsulated token finished"
513-
);
505+
"(startline " + startLineNumber + ")"
506+
+ "eof reached before encapsulated token finished"
507+
);
514508
} else {
515509
// consume character
516510
tkn.content.append((char) c);
517511
}
518-
// get the next char
519-
if (!tkn.isReady && !skipRead) {
520-
c = in.read();
521-
}
522512
}
523-
return tkn;
524513
}
525514

526515

@@ -554,6 +543,21 @@ protected int unicodeEscapeLexer(int c) throws IOException {
554543
}
555544
return ret;
556545
}
546+
547+
private int readEscape(int c) throws IOException {
548+
// assume c is the escape char (normally a backslash)
549+
c = in.read();
550+
int out;
551+
switch (c) {
552+
case 'r': out='\r'; break;
553+
case 'n': out='\n'; break;
554+
case 't': out='\t'; break;
555+
case 'b': out='\b'; break;
556+
case 'f': out='\f'; break;
557+
default : out=c;
558+
}
559+
return out;
560+
}
557561

558562
// ======================================================
559563
// strategies

src/java/org/apache/commons/csv/CSVStrategy.java

Lines changed: 29 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -28,15 +28,21 @@ public class CSVStrategy implements Cloneable, Serializable {
2828
private char delimiter;
2929
private char encapsulator;
3030
private char commentStart;
31+
private char escape;
3132
private boolean ignoreLeadingWhitespaces;
3233
private boolean interpretUnicodeEscapes;
3334
private boolean ignoreEmptyLines;
3435

35-
public static char COMMENTS_DISABLED = (char) 0;
36+
// -2 is used to signal disabled, because it won't be confused with
37+
// an EOF signal (-1), and because \ufffe in UTF-16 would be
38+
// encoded as two chars (using surrogates) and thus there should never
39+
// be a collision with a real text char.
40+
public static char COMMENTS_DISABLED = (char)-2;
41+
public static char ESCAPE_DISABLED = (char)-2;
3642

37-
public static CSVStrategy DEFAULT_STRATEGY = new CSVStrategy(',', '"', COMMENTS_DISABLED, true, false, true);
38-
public static CSVStrategy EXCEL_STRATEGY = new CSVStrategy(',', '"', COMMENTS_DISABLED, false, false, false);
39-
public static CSVStrategy TDF_STRATEGY = new CSVStrategy(' ', '"', COMMENTS_DISABLED, true, false, true);
43+
public static CSVStrategy DEFAULT_STRATEGY = new CSVStrategy(',', '"', COMMENTS_DISABLED, ESCAPE_DISABLED, true, false, true);
44+
public static CSVStrategy EXCEL_STRATEGY = new CSVStrategy(',', '"', COMMENTS_DISABLED, ESCAPE_DISABLED, false, false, false);
45+
public static CSVStrategy TDF_STRATEGY = new CSVStrategy(' ', '"', COMMENTS_DISABLED, ESCAPE_DISABLED, true, false, true);
4046

4147

4248
public CSVStrategy(char delimiter, char encapsulator, char commentStart) {
@@ -58,19 +64,34 @@ public CSVStrategy(char delimiter, char encapsulator, char commentStart) {
5864
public CSVStrategy(
5965
char delimiter,
6066
char encapsulator,
61-
char commentStart,
67+
char commentStart,
68+
char escape,
6269
boolean ignoreLeadingWhitespace,
6370
boolean interpretUnicodeEscapes,
6471
boolean ignoreEmptyLines)
6572
{
6673
setDelimiter(delimiter);
6774
setEncapsulator(encapsulator);
6875
setCommentStart(commentStart);
76+
setEscape(escape);
6977
setIgnoreLeadingWhitespaces(ignoreLeadingWhitespace);
7078
setUnicodeEscapeInterpretation(interpretUnicodeEscapes);
7179
setIgnoreEmptyLines(ignoreEmptyLines);
7280
}
7381

82+
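/** @deprecated use {@link #CSVStrategy(char,char,char,char,boolean,boolean,boolean)} with an explicit escape character. */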
83+
public CSVStrategy(
84+
char delimiter,
85+
char encapsulator,
86+
char commentStart,
87+
boolean ignoreLeadingWhitespace,
88+
boolean interpretUnicodeEscapes,
89+
boolean ignoreEmptyLines)
90+
{
91+
this(delimiter,encapsulator,commentStart,CSVStrategy.ESCAPE_DISABLED,ignoreLeadingWhitespace,interpretUnicodeEscapes,ignoreEmptyLines);
92+
}
93+
94+
7495
public void setDelimiter(char delimiter) { this.delimiter = delimiter; }
7596
public char getDelimiter() { return this.delimiter; }
7697

@@ -81,6 +102,9 @@ public CSVStrategy(
81102
public char getCommentStart() { return this.commentStart; }
82103
public boolean isCommentingDisabled() { return this.commentStart == COMMENTS_DISABLED; }
83104

105+
public void setEscape(char escape) { this.escape = escape; }
106+
public char getEscape() { return this.escape; }
107+
84108
public void setIgnoreLeadingWhitespaces(boolean ignoreLeadingWhitespaces) { this.ignoreLeadingWhitespaces = ignoreLeadingWhitespaces; }
85109
public boolean getIgnoreLeadingWhitespaces() { return this.ignoreLeadingWhitespaces; }
86110

src/test/org/apache/commons/csv/CSVParserTest.java

Lines changed: 51 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -182,9 +182,7 @@ public void testNextToken4() throws IOException {
182182
// encapsulator tokenizer (multi line, delimiter in string)
183183
public void testNextToken5() throws IOException {
184184
String code =
185-
"a,\"foo\n\",b\n\"foo\n baar ,,,\"\n\"\n\t \n\",\"\\\"\""
186-
+ ",\"\\,\""
187-
+ ",\"\"\"\"";
185+
"a,\"foo\n\",b\n\"foo\n baar ,,,\"\n\"\n\t \n\"";
188186
TestCSVParser parser = new TestCSVParser(new StringReader(code));
189187
parser.setStrategy(CSVStrategy.DEFAULT_STRATEGY);
190188
System.out.println("---------\n" + code + "\n-------------");
@@ -193,11 +191,8 @@ public void testNextToken5() throws IOException {
193191
assertEquals(CSVParser.TT_EORECORD + ";b;", parser.testNextToken());
194192
assertEquals(CSVParser.TT_EORECORD + ";foo\n baar ,,,;",
195193
parser.testNextToken());
196-
assertEquals(CSVParser.TT_TOKEN + ";\n\t \n;", parser.testNextToken());
197-
assertEquals(CSVParser.TT_TOKEN + ";\";", parser.testNextToken());
198-
// escape char in quoted input only escapes delimiter
199-
assertEquals(CSVParser.TT_TOKEN + ";\\,;", parser.testNextToken());
200-
assertEquals(CSVParser.TT_EOF + ";\";", parser.testNextToken());
194+
assertEquals(CSVParser.TT_EOF + ";\n\t \n;", parser.testNextToken());
195+
201196
}
202197

203198
// change delimiters, comment, encapsulater
@@ -207,7 +202,7 @@ public void testNextToken6() throws IOException {
207202
* !comment;;;;
208203
* ;;
209204
*/
210-
String code = "a;'b and \\' more\n'\n!comment;;;;\n;;";
205+
String code = "a;'b and '' more\n'\n!comment;;;;\n;;";
211206
TestCSVParser parser = new TestCSVParser(new StringReader(code));
212207
parser.setStrategy( new CSVStrategy(';', '\'', '!') );
213208
System.out.println("---------\n" + code + "\n-------------");
@@ -226,8 +221,9 @@ public void testNextToken6() throws IOException {
226221
"a,b,c,d\n"
227222
+ " a , b , 1 2 \n"
228223
+ "\"foo baar\", b,\n"
229-
+ " \"foo\n,,\n\"\",,\n\\\"\",d,e\n";
230-
String[][] res = {
224+
// + " \"foo\n,,\n\"\",,\n\\\"\",d,e\n";
225+
+ " \"foo\n,,\n\"\",,\n\"\"\",d,e\n"; // changed to use standard CSV escaping
226+
String[][] res = {
231227
{"a", "b", "c", "d"},
232228
{"a", "b", "1 2"},
233229
{"foo baar", "b", ""},
@@ -439,7 +435,7 @@ public void testEmptyLineBehaviourCSV() throws Exception {
439435
}
440436
}
441437

442-
public void testBackslashEscaping() throws IOException {
438+
public void OLDtestBackslashEscaping() throws IOException {
443439
String code =
444440
"one,two,three\n"
445441
+ "on\\\"e,two\n"
@@ -474,6 +470,49 @@ public void testBackslashEscaping() throws IOException {
474470
}
475471
}
476472

473+
public void testBackslashEscaping() throws IOException {
474+
475+
// To avoid confusion over the need for escaping chars in java code,
476+
// We will test with a forward slash as the escape char, and a single
477+
// quote as the encapsulator.
478+
479+
String code =
480+
"one,two,three\n" // 0
481+
+ "'',''\n" // 1) empty encapsulators
482+
+ "/',/'\n" // 2) single encapsulators
483+
+ "'/'','/''\n" // 3) single encapsulators encapsulated via escape
484+
+ "'''',''''\n" // 4) single encapsulators encapsulated via doubling
485+
+ "/,,/,\n" // 5) separator escaped
486+
+ "//,//\n" // 6) escape escaped
487+
+ "'//','//'\n" // 7) escape escaped in encapsulation
488+
+ "";
489+
String[][] res = {
490+
{ "one", "two", "three" }, // 0
491+
{ "", "" }, // 1
492+
{ "'", "'" }, // 2
493+
{ "'", "'" }, // 3
494+
{ "'", "'" }, // 4
495+
{ ",", "," }, // 5
496+
{ "/", "/" }, // 6
497+
{ "/", "/" }, // 7
498+
};
499+
500+
501+
CSVStrategy strategy = new CSVStrategy(',','\'',CSVStrategy.COMMENTS_DISABLED,'/',true,true,true);
502+
503+
CSVParser parser = new CSVParser(new StringReader(code), strategy);
504+
System.out.println("---------\n" + code + "\n-------------");
505+
String[][] tmp = parser.getAllValues();
506+
assertTrue(tmp.length > 0);
507+
for (int i = 0; i < res.length; i++) {
508+
for (int j = 0; j < tmp[i].length; j++) {
509+
System.out.println("'" + tmp[i][j] + "' should be '" + res[i][j] + "'");
510+
}
511+
assertTrue(Arrays.equals(res[i], tmp[i]));
512+
}
513+
}
514+
515+
477516
public void testUnicodeEscape() throws IOException {
478517
String code = "abc,\\u0070\\u0075\\u0062\\u006C\\u0069\\u0063";
479518
CSVParser parser = new CSVParser(new StringReader(code));

src/test/org/apache/commons/csv/CSVStrategyTest.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -91,15 +91,15 @@ public void testSetCSVStrategy() {
9191
// default settings
9292
assertEquals(strategy.getDelimiter(), ',');
9393
assertEquals(strategy.getEncapsulator(), '"');
94-
assertEquals(strategy.getCommentStart(), '\0');
94+
assertEquals(strategy.getCommentStart(), CSVStrategy.COMMENTS_DISABLED);
9595
assertEquals(true, strategy.getIgnoreLeadingWhitespaces());
9696
assertEquals(false, strategy.getUnicodeEscapeInterpretation());
9797
assertEquals(true, strategy.getIgnoreEmptyLines());
9898
// explicit csv settings
9999
parser.setStrategy(CSVStrategy.DEFAULT_STRATEGY);
100100
assertEquals(strategy.getDelimiter(), ',');
101101
assertEquals(strategy.getEncapsulator(), '"');
102-
assertEquals(strategy.getCommentStart(), '\0');
102+
assertEquals(strategy.getCommentStart(), CSVStrategy.COMMENTS_DISABLED);
103103
assertEquals(true, strategy.getIgnoreLeadingWhitespaces());
104104
assertEquals(false, strategy.getUnicodeEscapeInterpretation());
105105
assertEquals(true, strategy.getIgnoreEmptyLines());
@@ -109,7 +109,7 @@ public void testSetExcelStrategy() {
109109
CSVStrategy strategy = CSVStrategy.EXCEL_STRATEGY;
110110
assertEquals(strategy.getDelimiter(), ',');
111111
assertEquals(strategy.getEncapsulator(), '"');
112-
assertEquals(strategy.getCommentStart(), '\0');
112+
assertEquals(strategy.getCommentStart(), CSVStrategy.COMMENTS_DISABLED);
113113
assertEquals(false, strategy.getIgnoreLeadingWhitespaces());
114114
assertEquals(false, strategy.getUnicodeEscapeInterpretation());
115115
assertEquals(false, strategy.getIgnoreEmptyLines());

0 commit comments

Comments
 (0)