Skip to content

Commit 8cd4024

Browse files
committed
Add experimental state-driven lexer
git-svn-id: https://svn.apache.org/repos/asf/commons/proper/csv/trunk@1306519 13f79535-47bb-0310-9956-ffa450edef68
1 parent 6ca0f78 commit 8cd4024

1 file changed

Lines changed: 248 additions & 0 deletions

File tree

Lines changed: 248 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,248 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package org.apache.commons.csv;
19+
20+
import java.io.IOException;
21+
22+
import static org.apache.commons.csv.Token.Type.*;
23+
24+
/**
25+
* Experimental Lexer using enums to keep track of state and character type.
26+
* Unfortunately it is twice as slow.
27+
* For reference purposes only.
28+
*
29+
*/
30+
class CSVLexer3 extends Lexer {
31+
32+
private final char escape;
33+
34+
// ctor needs to be public so can be called dynamically by PerformanceTest class
35+
public CSVLexer3(CSVFormat format, ExtendedBufferedReader in) {
36+
super(format, in);
37+
this.escape = format.getEscape();
38+
}
39+
40+
/**
41+
* Classify the character types
42+
*/
43+
private static enum CharType {
44+
DELIM,
45+
ESCAPE,
46+
ENCAP,
47+
EOL,
48+
COMMENT_START,
49+
WHITESPACE,
50+
OTHER,
51+
EOFCHAR
52+
}
53+
54+
private CharType classify(int intch) {
55+
if (isDelimiter(intch)) {
56+
return CharType.DELIM;
57+
}
58+
if (isCommentStart(intch)) {
59+
return CharType.COMMENT_START;
60+
}
61+
if (isEncapsulator(intch)) {
62+
return CharType.ENCAP;
63+
}
64+
if (isEscape(intch)) {
65+
return CharType.ESCAPE;
66+
}
67+
if (intch == '\r' || intch == '\n') {
68+
return CharType.EOL;
69+
}
70+
if (isWhitespace(intch)) { // Must be after EOL check
71+
return CharType.WHITESPACE;
72+
}
73+
if (intch == ExtendedBufferedReader.END_OF_STREAM) {
74+
return CharType.EOFCHAR;
75+
}
76+
return CharType.OTHER;
77+
}
78+
79+
/**
80+
* Parsing states
81+
*/
82+
private static enum State {
83+
BEGIN, PLAIN, INQUOTE, QUOTEQUOTE, ESCAPE_PLAIN, ESCAPE_QUOTE,
84+
}
85+
86+
/**
87+
* Returns the next token.
88+
* <p/>
89+
* A token corresponds to a term, a record change or an end-of-file indicator.
90+
*
91+
* @param tkn an existing Token object to reuse. The caller is responsible to initialize the Token.
92+
* @return the next token found
93+
* @throws java.io.IOException on stream access error
94+
*/
95+
@Override
96+
Token nextToken(Token tkn) throws IOException {
97+
98+
State state = State.BEGIN;
99+
int intch;
100+
boolean trimTrailingSpaces = false;
101+
while(tkn.type == INVALID) {
102+
intch = in.read();
103+
CharType type = classify(intch);
104+
switch(state) {
105+
case BEGIN:
106+
switch(type){
107+
case COMMENT_START:
108+
in.readLine();
109+
tkn.type = COMMENT;
110+
break;
111+
case ENCAP:
112+
state = State.INQUOTE;
113+
break;
114+
case DELIM:
115+
tkn.type = TOKEN;
116+
break;
117+
case EOL:
118+
tkn.type = EORECORD;
119+
break;
120+
case EOFCHAR:
121+
tkn.type = EOF;
122+
break;
123+
case ESCAPE:
124+
state = State.ESCAPE_PLAIN;
125+
break;
126+
case OTHER:
127+
tkn.content.append((char) intch);
128+
state = State.PLAIN;
129+
break;
130+
case WHITESPACE:
131+
if (!surroundingSpacesIgnored){
132+
tkn.content.append((char) intch);
133+
state = State.PLAIN;
134+
}
135+
break;
136+
}
137+
break;
138+
case PLAIN:
139+
switch(type){
140+
case DELIM:
141+
tkn.type = TOKEN;
142+
break;
143+
case EOL:
144+
tkn.type = EORECORD;
145+
break;
146+
case EOFCHAR:
147+
tkn.type = EOF;
148+
break;
149+
case ESCAPE:
150+
state = State.ESCAPE_PLAIN;
151+
break;
152+
default:
153+
trimTrailingSpaces = surroundingSpacesIgnored; // we have a plain token
154+
tkn.content.append((char) intch);
155+
break;
156+
}
157+
break;
158+
case INQUOTE: // Started a quoted string
159+
switch(type){
160+
case ENCAP:
161+
state = State.QUOTEQUOTE;
162+
break;
163+
case ESCAPE:
164+
state = State.ESCAPE_QUOTE;
165+
break;
166+
case EOFCHAR:
167+
throw new IOException("(line " + getLineNumber() + ") unexpected EOF in quoted string");
168+
default:
169+
tkn.content.append((char) intch);
170+
break;
171+
}
172+
break;
173+
case QUOTEQUOTE: // "..." seen, expecting end of token or "
174+
switch(type){
175+
case DELIM:
176+
tkn.type = TOKEN;
177+
break;
178+
case EOL:
179+
tkn.type = EORECORD;
180+
break;
181+
case EOFCHAR:
182+
tkn.type = EOF;
183+
break;
184+
case ENCAP: // "..."" seen, append it
185+
tkn.content.append((char) intch);
186+
state = State.INQUOTE;
187+
break;
188+
case WHITESPACE: // trailing whitespace may be allowed
189+
if (!surroundingSpacesIgnored) {
190+
// error invalid char between token and next delimiter
191+
throw new IOException("(line " + getLineNumber() + ") invalid char between encapsulated token and delimiter");
192+
}
193+
break;
194+
// Everything else is invalid
195+
case ESCAPE:
196+
case OTHER:
197+
case COMMENT_START:
198+
// error invalid char between token and next delimiter
199+
throw new IOException("(line " + getLineNumber() + ") invalid char between encapsulated token and delimiter");
200+
}
201+
break;
202+
case ESCAPE_PLAIN:
203+
switch(type){
204+
case DELIM:
205+
case ESCAPE:
206+
case EOL:
207+
tkn.content.append((char) intch);
208+
state = State.PLAIN;
209+
break;
210+
case COMMENT_START: // TODO should comment be escaped?
211+
case ENCAP: // TODO is this correct?
212+
case OTHER: // TODO may need to escape further
213+
case WHITESPACE:
214+
tkn.content.append(escape);
215+
tkn.content.append((char) intch);
216+
break;
217+
case EOFCHAR:
218+
throw new IOException("(line " + getLineNumber() + ") unexpected EOF in escape sequence");
219+
}
220+
break;
221+
case ESCAPE_QUOTE:
222+
switch(type){
223+
case ESCAPE:
224+
case ENCAP: // this is the only required escape
225+
tkn.content.append((char) intch);
226+
break;
227+
case COMMENT_START:
228+
case DELIM:
229+
case EOL:
230+
case OTHER:
231+
case WHITESPACE:
232+
tkn.content.append(escape);
233+
tkn.content.append((char) intch);
234+
break;
235+
case EOFCHAR:
236+
throw new IOException("(line " + getLineNumber() + ") unexpected EOF in escape sequence");
237+
}
238+
break;
239+
default:
240+
break;
241+
}
242+
}
243+
if (trimTrailingSpaces) {
244+
trimTrailingSpaces(tkn.content);
245+
}
246+
return tkn;
247+
}
248+
}

0 commit comments

Comments
 (0)