Skip to content

Commit ca7bbae

Browse files
committed
Extracted the lexer from CSVParser in a distinct class (suggested by Bob Smith)
git-svn-id: https://svn.apache.org/repos/asf/commons/sandbox/csv/trunk@1298033 13f79535-47bb-0310-9956-ffa450edef68
1 parent 898b7f9 commit ca7bbae

3 files changed

Lines changed: 238 additions & 253 deletions

File tree

src/main/java/org/apache/commons/csv/CSVParser.java

Lines changed: 68 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,9 @@
2525
import java.util.List;
2626
import java.util.NoSuchElementException;
2727

28-
import static org.apache.commons.csv.CSVParser.Token.Type.*;
28+
import org.apache.commons.csv.CSVLexer.Token;
29+
30+
import static org.apache.commons.csv.CSVLexer.Token.Type.*;
2931

3032
/**
3133
* Parses CSV files according to the specified configuration.
@@ -59,65 +61,16 @@
5961
*/
6062
public class CSVParser implements Iterable<String[]> {
6163

62-
/** length of the initial token (content-)buffer */
63-
private static final int INITIAL_TOKEN_LENGTH = 50;
64-
6564
/** Immutable empty String array. */
6665
private static final String[] EMPTY_STRING_ARRAY = new String[0];
6766

68-
/** The input stream */
69-
private final ExtendedBufferedReader in;
70-
71-
private final CSVFormat format;
72-
67+
private CSVLexer lexer;
68+
7369
// the following objects are shared to reduce garbage
7470

7571
/** A record buffer for getLine(). Grows as necessary and is reused. */
7672
private final List<String> record = new ArrayList<String>();
7773
private final Token reusableToken = new Token();
78-
private final CharBuffer wsBuf = new CharBuffer();
79-
80-
/**
81-
* Token is an internal token representation.
82-
* <p/>
83-
* It is used as contract between the lexer and the parser.
84-
*/
85-
static class Token {
86-
87-
enum Type {
88-
/** Token has no valid content, i.e. is in its initialized state. */
89-
INVALID,
90-
91-
/** Token with content, at beginning or in the middle of a line. */
92-
TOKEN,
93-
94-
/** Token (which can have content) when end of file is reached. */
95-
EOF,
96-
97-
/** Token with content when end of a line is reached. */
98-
EORECORD
99-
}
100-
101-
/** Token type */
102-
Type type = INVALID;
103-
104-
/** The content buffer. */
105-
CharBuffer content = new CharBuffer(INITIAL_TOKEN_LENGTH);
106-
107-
/** Token ready flag: indicates a valid token with content (ready for the parser). */
108-
boolean isReady;
109-
110-
Token reset() {
111-
content.clear();
112-
type = INVALID;
113-
isReady = false;
114-
return this;
115-
}
116-
}
117-
118-
// ======================================================
119-
// the constructor
120-
// ======================================================
12174

12275
/**
12376
* CSV parser using the default {@link CSVFormat}.
@@ -139,8 +92,7 @@ public CSVParser(Reader input, CSVFormat format) {
13992
input = new UnicodeUnescapeReader(input);
14093
}
14194

142-
this.in = new ExtendedBufferedReader(input);
143-
this.format = format;
95+
this.lexer = new CSVLexer(format, new ExtendedBufferedReader(input));
14496
}
14597

14698
/**
@@ -153,9 +105,6 @@ public CSVParser(String input, CSVFormat format) {
153105
this(new StringReader(input), format);
154106
}
155107

156-
// ======================================================
157-
// the parser
158-
// ======================================================
159108

160109
/**
161110
* Parses the CSV according to the given format and returns the content
@@ -191,7 +140,7 @@ String[] getLine() throws IOException {
191140
record.clear();
192141
while (true) {
193142
reusableToken.reset();
194-
nextToken(reusableToken);
143+
lexer.nextToken(reusableToken);
195144
switch (reusableToken.type) {
196145
case TOKEN:
197146
record.add(reusableToken.content.toString());
@@ -274,12 +223,69 @@ public void remove() { }
274223
* @return current line number
275224
*/
276225
public int getLineNumber() {
277-
return in.getLineNumber();
226+
return lexer.getLineNumber();
278227
}
228+
}
279229

280-
// ======================================================
281-
// the lexer(s)
282-
// ======================================================
230+
231+
class CSVLexer {
232+
233+
/** length of the initial token (content-)buffer */
234+
private static final int INITIAL_TOKEN_LENGTH = 50;
235+
236+
private final CharBuffer wsBuf = new CharBuffer();
237+
238+
private CSVFormat format;
239+
240+
/** The input stream */
241+
private ExtendedBufferedReader in;
242+
243+
/**
244+
* Token is an internal token representation.
245+
* <p/>
246+
* It is used as contract between the lexer and the parser.
247+
*/
248+
static class Token {
249+
250+
enum Type {
251+
/** Token has no valid content, i.e. is in its initialized state. */
252+
INVALID,
253+
254+
/** Token with content, at beginning or in the middle of a line. */
255+
TOKEN,
256+
257+
/** Token (which can have content) when end of file is reached. */
258+
EOF,
259+
260+
/** Token with content when end of a line is reached. */
261+
EORECORD
262+
}
263+
264+
/** Token type */
265+
Type type = INVALID;
266+
267+
/** The content buffer. */
268+
CharBuffer content = new CharBuffer(INITIAL_TOKEN_LENGTH);
269+
270+
/** Token ready flag: indicates a valid token with content (ready for the parser). */
271+
boolean isReady;
272+
273+
Token reset() {
274+
content.clear();
275+
type = INVALID;
276+
isReady = false;
277+
return this;
278+
}
279+
}
280+
281+
CSVLexer(CSVFormat format, ExtendedBufferedReader in) {
282+
this.format = format;
283+
this.in = in;
284+
}
285+
286+
public int getLineNumber() {
287+
return in.getLineNumber();
288+
}
283289

284290
/**
285291
* Returns the next token.
@@ -503,19 +509,6 @@ private int readEscape(int c) throws IOException {
503509
}
504510
}
505511

506-
/**
507-
* Obtain the specified CSV format.
508-
*
509-
* @return format currently being used
510-
*/
511-
public CSVFormat getFormat() {
512-
return this.format;
513-
}
514-
515-
// ======================================================
516-
// Character class checker
517-
// ======================================================
518-
519512
/**
520513
* @return true if the given char is a whitespace character
521514
*/
Lines changed: 167 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,167 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package org.apache.commons.csv;
19+
20+
import java.io.IOException;
21+
import java.io.StringReader;
22+
23+
import junit.framework.TestCase;
24+
import org.apache.commons.csv.CSVLexer.Token;
25+
26+
import static org.apache.commons.csv.CSVLexer.Token.Type.*;
27+
28+
public class CSVLexerTest extends TestCase {
29+
30+
private CSVLexer getLexer(String input, CSVFormat format) {
31+
return new CSVLexer(format, new ExtendedBufferedReader(new StringReader(input)));
32+
}
33+
34+
private void assertTokenEquals(Token.Type expectedType, String expectedContent, Token token) {
35+
assertEquals("Token type", expectedType, token.type);
36+
assertEquals("Token content", expectedContent, token.content.toString());
37+
}
38+
39+
// Single line (without comment)
40+
public void testNextToken1() throws IOException {
41+
String code = "abc,def, hijk, lmnop, qrst,uv ,wxy ,z , ,";
42+
CSVLexer parser = getLexer(code, CSVFormat.DEFAULT);
43+
assertTokenEquals(TOKEN, "abc", parser.nextToken(new Token()));
44+
assertTokenEquals(TOKEN, "def", parser.nextToken(new Token()));
45+
assertTokenEquals(TOKEN, "hijk", parser.nextToken(new Token()));
46+
assertTokenEquals(TOKEN, "lmnop", parser.nextToken(new Token()));
47+
assertTokenEquals(TOKEN, "qrst", parser.nextToken(new Token()));
48+
assertTokenEquals(TOKEN, "uv", parser.nextToken(new Token()));
49+
assertTokenEquals(TOKEN, "wxy", parser.nextToken(new Token()));
50+
assertTokenEquals(TOKEN, "z", parser.nextToken(new Token()));
51+
assertTokenEquals(TOKEN, "", parser.nextToken(new Token()));
52+
assertTokenEquals(EOF, "", parser.nextToken(new Token()));
53+
}
54+
55+
// multiline including comments (and empty lines)
56+
public void testNextToken2() throws IOException {
57+
/* file: 1,2,3,
58+
* a,b x,c
59+
* #foo
60+
*
61+
* d,e,
62+
*
63+
*/
64+
String code = "1,2,3,\na,b x,c\n#foo\n\nd,e,\n\n";
65+
CSVFormat format = CSVFormat.DEFAULT.withCommentStart('#');
66+
67+
CSVLexer parser = getLexer(code, format);
68+
69+
70+
assertTokenEquals(TOKEN, "1", parser.nextToken(new Token()));
71+
assertTokenEquals(TOKEN, "2", parser.nextToken(new Token()));
72+
assertTokenEquals(TOKEN, "3", parser.nextToken(new Token()));
73+
assertTokenEquals(EORECORD, "", parser.nextToken(new Token()));
74+
assertTokenEquals(TOKEN, "a", parser.nextToken(new Token()));
75+
assertTokenEquals(TOKEN, "b x", parser.nextToken(new Token()));
76+
assertTokenEquals(EORECORD, "c", parser.nextToken(new Token()));
77+
assertTokenEquals(EORECORD, "", parser.nextToken(new Token()));
78+
assertTokenEquals(TOKEN, "d", parser.nextToken(new Token()));
79+
assertTokenEquals(TOKEN, "e", parser.nextToken(new Token()));
80+
assertTokenEquals(EORECORD, "", parser.nextToken(new Token()));
81+
assertTokenEquals(EOF, "", parser.nextToken(new Token()));
82+
assertTokenEquals(EOF, "", parser.nextToken(new Token()));
83+
84+
}
85+
86+
// simple token with escaping
87+
public void testNextToken3() throws IOException {
88+
/* file: a,\,,b
89+
* \,,
90+
*/
91+
String code = "a,\\,,b\n\\,,";
92+
CSVFormat format = CSVFormat.DEFAULT.withCommentStart('#');
93+
CSVLexer parser = getLexer(code, format);
94+
95+
assertTokenEquals(TOKEN, "a", parser.nextToken(new Token()));
96+
// an unquoted single backslash is not an escape char
97+
assertTokenEquals(TOKEN, "\\", parser.nextToken(new Token()));
98+
assertTokenEquals(TOKEN, "", parser.nextToken(new Token()));
99+
assertTokenEquals(EORECORD, "b", parser.nextToken(new Token()));
100+
// an unquoted single backslash is not an escape char
101+
assertTokenEquals(TOKEN, "\\", parser.nextToken(new Token()));
102+
assertTokenEquals(TOKEN, "", parser.nextToken(new Token()));
103+
assertTokenEquals(EOF, "", parser.nextToken(new Token()));
104+
}
105+
106+
// encapsulator tokenizer (single line)
107+
public void testNextToken4() throws IOException {
108+
/* file: a,"foo",b
109+
* a, " foo",b
110+
* a,"foo " ,b // whitespace after closing encapsulator
111+
* a, " foo " ,b
112+
*/
113+
String code = "a,\"foo\",b\na, \" foo\",b\na,\"foo \" ,b\na, \" foo \" ,b";
114+
CSVLexer parser = getLexer(code, CSVFormat.DEFAULT);
115+
assertTokenEquals(TOKEN, "a", parser.nextToken(new Token()));
116+
assertTokenEquals(TOKEN, "foo", parser.nextToken(new Token()));
117+
assertTokenEquals(EORECORD, "b", parser.nextToken(new Token()));
118+
assertTokenEquals(TOKEN, "a", parser.nextToken(new Token()));
119+
assertTokenEquals(TOKEN, " foo", parser.nextToken(new Token()));
120+
assertTokenEquals(EORECORD, "b", parser.nextToken(new Token()));
121+
assertTokenEquals(TOKEN, "a", parser.nextToken(new Token()));
122+
assertTokenEquals(TOKEN, "foo ", parser.nextToken(new Token()));
123+
assertTokenEquals(EORECORD, "b", parser.nextToken(new Token()));
124+
assertTokenEquals(TOKEN, "a", parser.nextToken(new Token()));
125+
assertTokenEquals(TOKEN, " foo ", parser.nextToken(new Token()));
126+
// assertTokenEquals(EORECORD, "b", parser.nextToken(new Token()));
127+
assertTokenEquals(EOF, "b", parser.nextToken(new Token()));
128+
}
129+
130+
// encapsulator tokenizer (multi line, delimiter in string)
131+
public void testNextToken5() throws IOException {
132+
String code = "a,\"foo\n\",b\n\"foo\n baar ,,,\"\n\"\n\t \n\"";
133+
CSVLexer parser = getLexer(code, CSVFormat.DEFAULT);
134+
assertTokenEquals(TOKEN, "a", parser.nextToken(new Token()));
135+
assertTokenEquals(TOKEN, "foo\n", parser.nextToken(new Token()));
136+
assertTokenEquals(EORECORD, "b", parser.nextToken(new Token()));
137+
assertTokenEquals(EORECORD, "foo\n baar ,,,", parser.nextToken(new Token()));
138+
assertTokenEquals(EOF, "\n\t \n", parser.nextToken(new Token()));
139+
140+
}
141+
142+
// change delimiters, comment, encapsulator
143+
public void testNextToken6() throws IOException {
144+
/* file: a;'b and '' more
145+
* '
146+
* !comment;;;;
147+
* ;;
148+
*/
149+
String code = "a;'b and '' more\n'\n!comment;;;;\n;;";
150+
CSVFormat format = new CSVFormat(';', '\'', '!');
151+
CSVLexer parser = getLexer(code, format);
152+
assertTokenEquals(TOKEN, "a", parser.nextToken(new Token()));
153+
assertTokenEquals(EORECORD, "b and ' more\n", parser.nextToken(new Token()));
154+
}
155+
156+
// From SANDBOX-153
157+
public void testDelimiterIsWhitespace() throws IOException {
158+
String code = "one\ttwo\t\tfour \t five\t six";
159+
CSVLexer parser = getLexer(code, CSVFormat.TDF);
160+
assertTokenEquals(TOKEN, "one", parser.nextToken(new Token()));
161+
assertTokenEquals(TOKEN, "two", parser.nextToken(new Token()));
162+
assertTokenEquals(TOKEN, "", parser.nextToken(new Token()));
163+
assertTokenEquals(TOKEN, "four", parser.nextToken(new Token()));
164+
assertTokenEquals(TOKEN, "five", parser.nextToken(new Token()));
165+
assertTokenEquals(EOF, "six", parser.nextToken(new Token()));
166+
}
167+
}

0 commit comments

Comments
 (0)