Skip to content

Commit 086f434

Browse files
committed
SANDBOX-206: fix whitespace handling w/ escaping, add an option to not remove trailing whitespace
git-svn-id: https://svn.apache.org/repos/asf/commons/sandbox/csv/trunk@609327 13f79535-47bb-0310-9956-ffa450edef68
1 parent b55fb21 commit 086f434

4 files changed

Lines changed: 47 additions & 27 deletions

File tree

src/java/org/apache/commons/csv/CSVParser.java

Lines changed: 11 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -399,47 +399,39 @@ protected Token nextToken(Token tkn) throws IOException {
399399
* @throws IOException on stream access error
400400
*/
401401
private Token simpleTokenLexer(Token tkn, int c) throws IOException {
402-
wsBuf.clear();
403402
for (;;) {
404403
if (isEndOfLine(c)) {
405404
// end of record
406405
tkn.type = TT_EORECORD;
407406
tkn.isReady = true;
408-
return tkn;
407+
break;
409408
} else if (isEndOfFile(c)) {
410409
// end of file
411410
tkn.type = TT_EOF;
412411
tkn.isReady = true;
413-
return tkn;
412+
break;
414413
} else if (c == strategy.getDelimiter()) {
415414
// end of token
416415
tkn.type = TT_TOKEN;
417416
tkn.isReady = true;
418-
return tkn;
417+
break;
419418
} else if (c == '\\' && strategy.getUnicodeEscapeInterpretation() && in.lookAhead() == 'u') {
420419
// interpret unicode escaped chars (like \u0070 -> p)
421420
tkn.content.append((char) unicodeEscapeLexer(c));
422-
} else if (isWhitespace(c)) {
423-
// gather whitespaces
424-
// (as long as they are not at the beginning of a token)
425-
if (tkn.content.length() > 0) {
426-
wsBuf.append((char) c);
427-
}
428421
} else if (c == strategy.getEscape()) {
429422
tkn.content.append((char)readEscape(c));
430423
} else {
431-
// prepend whitespaces (if we have)
432-
if (wsBuf.length() > 0) {
433-
tkn.content.append(wsBuf);
434-
wsBuf.clear();
435-
}
436424
tkn.content.append((char) c);
437425
}
438-
// get the next char
439-
if (!tkn.isReady) {
440-
c = in.read();
441-
}
426+
427+
c = in.read();
442428
}
429+
430+
if (strategy.getIgnoreTrailingWhitespaces()) {
431+
tkn.content.trimTrailingWhitespace();
432+
}
433+
434+
return tkn;
443435
}
444436

445437

src/java/org/apache/commons/csv/CSVStrategy.java

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ public class CSVStrategy implements Cloneable, Serializable {
3030
private char commentStart;
3131
private char escape;
3232
private boolean ignoreLeadingWhitespaces;
33+
private boolean ignoreTrailingWhitespaces;
3334
private boolean interpretUnicodeEscapes;
3435
private boolean ignoreEmptyLines;
3536

@@ -40,9 +41,9 @@ public class CSVStrategy implements Cloneable, Serializable {
4041
public static char COMMENTS_DISABLED = (char)-2;
4142
public static char ESCAPE_DISABLED = (char)-2;
4243

43-
public static CSVStrategy DEFAULT_STRATEGY = new CSVStrategy(',', '"', COMMENTS_DISABLED, ESCAPE_DISABLED, true, false, true);
44-
public static CSVStrategy EXCEL_STRATEGY = new CSVStrategy(',', '"', COMMENTS_DISABLED, ESCAPE_DISABLED, false, false, false);
45-
public static CSVStrategy TDF_STRATEGY = new CSVStrategy(' ', '"', COMMENTS_DISABLED, ESCAPE_DISABLED, true, false, true);
44+
public static CSVStrategy DEFAULT_STRATEGY = new CSVStrategy(',', '"', COMMENTS_DISABLED, ESCAPE_DISABLED, true, true, false, true);
45+
public static CSVStrategy EXCEL_STRATEGY = new CSVStrategy(',', '"', COMMENTS_DISABLED, ESCAPE_DISABLED, false, false, false, false);
46+
public static CSVStrategy TDF_STRATEGY = new CSVStrategy(' ', '"', COMMENTS_DISABLED, ESCAPE_DISABLED, true, true, false, true);
4647

4748

4849
public CSVStrategy(char delimiter, char encapsulator, char commentStart) {
@@ -67,6 +68,7 @@ public CSVStrategy(
6768
char commentStart,
6869
char escape,
6970
boolean ignoreLeadingWhitespace,
71+
boolean ignoreTrailingWhitespace,
7072
boolean interpretUnicodeEscapes,
7173
boolean ignoreEmptyLines)
7274
{
@@ -75,6 +77,7 @@ public CSVStrategy(
7577
setCommentStart(commentStart);
7678
setEscape(escape);
7779
setIgnoreLeadingWhitespaces(ignoreLeadingWhitespace);
80+
setIgnoreTrailingWhitespaces(ignoreTrailingWhitespace);
7881
setUnicodeEscapeInterpretation(interpretUnicodeEscapes);
7982
setIgnoreEmptyLines(ignoreEmptyLines);
8083
}
@@ -88,7 +91,7 @@ public CSVStrategy(
8891
boolean interpretUnicodeEscapes,
8992
boolean ignoreEmptyLines)
9093
{
91-
this(delimiter,encapsulator,commentStart,CSVStrategy.ESCAPE_DISABLED,ignoreLeadingWhitespace,interpretUnicodeEscapes,ignoreEmptyLines);
94+
this(delimiter,encapsulator,commentStart,CSVStrategy.ESCAPE_DISABLED,ignoreLeadingWhitespace,true,interpretUnicodeEscapes,ignoreEmptyLines);
9295
}
9396

9497

@@ -108,6 +111,9 @@ public CSVStrategy(
108111
public void setIgnoreLeadingWhitespaces(boolean ignoreLeadingWhitespaces) { this.ignoreLeadingWhitespaces = ignoreLeadingWhitespaces; }
109112
public boolean getIgnoreLeadingWhitespaces() { return this.ignoreLeadingWhitespaces; }
110113

114+
public void setIgnoreTrailingWhitespaces(boolean ignoreTrailingWhitespaces) { this.ignoreTrailingWhitespaces = ignoreTrailingWhitespaces; }
115+
public boolean getIgnoreTrailingWhitespaces() { return this.ignoreTrailingWhitespaces; }
116+
111117
public void setUnicodeEscapeInterpretation(boolean interpretUnicodeEscapes) { this.interpretUnicodeEscapes = interpretUnicodeEscapes; }
112118
public boolean getUnicodeEscapeInterpretation() { return this.interpretUnicodeEscapes; }
113119

src/java/org/apache/commons/csv/CharBuffer.java

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
* grows as necessary.
2525
* This class is not thread safe.
2626
*
27-
* @author Ortwin Glück
27+
* @author Ortwin Glück
2828
*/
2929
public class CharBuffer {
3030
private char[] c;
@@ -65,7 +65,7 @@ public void clear() {
6565
public int length() {
6666
return length;
6767
}
68-
68+
6969
/**
7070
* Returns the current capacity of the buffer.
7171
* @return the maximum number of characters that can be stored in this buffer without
@@ -74,6 +74,7 @@ public int length() {
7474
public int capacity() {
7575
return c.length;
7676
}
77+
7778

7879
/**
7980
* Appends the contents of <code>cb</code> to the end of this CharBuffer.
@@ -142,6 +143,15 @@ public void shrink() {
142143
c = newc;
143144
}
144145

146+
/**
147+
* Removes trailing whitespace.
148+
*/
149+
public void trimTrailingWhitespace() {
150+
while (length>0 && Character.isWhitespace(c[length-1])) {
151+
length--;
152+
}
153+
}
154+
145155
/**
146156
* Returns the contents of the buffer as a char[]. The returned array may
147157
* be the internal array of the buffer, so the caller must take care when
@@ -156,7 +166,14 @@ public char[] getCharacters() {
156166
System.arraycopy(c, 0, chars, 0, length);
157167
return chars;
158168
}
159-
169+
170+
/**
171+
* Returns the character at the specified position.
172+
*/
173+
public char charAt(int pos) {
174+
return c[pos];
175+
}
176+
160177
/**
161178
* Converts the contents of the buffer into a StringBuffer.
162179
* This method involves copying the new data once!

src/test/org/apache/commons/csv/CSVParserTest.java

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -485,6 +485,8 @@ public void testBackslashEscaping() throws IOException {
485485
+ "/,,/,\n" // 5) separator escaped
486486
+ "//,//\n" // 6) escape escaped
487487
+ "'//','//'\n" // 7) escape escaped in encapsulation
488+
+ " 8 , \"quoted \"\" /\" // string\" \n" // don't eat spaces
489+
+ "9, /\n \n" // escaped newline
488490
+ "";
489491
String[][] res = {
490492
{ "one", "two", "three" }, // 0
@@ -495,10 +497,12 @@ public void testBackslashEscaping() throws IOException {
495497
{ ",", "," }, // 5
496498
{ "/", "/" }, // 6
497499
{ "/", "/" }, // 7
500+
{ " 8 ", " \"quoted \"\" \" / string\" " },
501+
{ "9", " \n " },
498502
};
499503

500504

501-
CSVStrategy strategy = new CSVStrategy(',','\'',CSVStrategy.COMMENTS_DISABLED,'/',true,true,true);
505+
CSVStrategy strategy = new CSVStrategy(',','\'',CSVStrategy.COMMENTS_DISABLED,'/',false,false,true,true);
502506

503507
CSVParser parser = new CSVParser(new StringReader(code), strategy);
504508
System.out.println("---------\n" + code + "\n-------------");
@@ -513,6 +517,7 @@ public void testBackslashEscaping() throws IOException {
513517
}
514518

515519

520+
516521
public void testUnicodeEscape() throws IOException {
517522
String code = "abc,\\u0070\\u0075\\u0062\\u006C\\u0069\\u0063";
518523
CSVParser parser = new CSVParser(new StringReader(code));

0 commit comments

Comments
 (0)