Skip to content

Commit ff41f78

Browse files
committed
Temporary copy of original CSVLexer for performance comparison.
git-svn-id: https://svn.apache.org/repos/asf/commons/proper/csv/trunk@1303901 13f79535-47bb-0310-9956-ffa450edef68
1 parent 1ae3639 commit ff41f78

1 file changed

Lines changed: 238 additions & 0 deletions

File tree

Lines changed: 238 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,238 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package org.apache.commons.csv;
19+
20+
import java.io.IOException;
21+
22+
import static org.apache.commons.csv.Token.Type.*;
23+
24+
class CSVLexer1 extends Lexer {
25+
26+
private final StringBuilder wsBuf = new StringBuilder();
27+
28+
// ctor needs to be public so can be called dynamically by PerformanceTest class
29+
public CSVLexer1(CSVFormat format, ExtendedBufferedReader in) {
30+
super(format, in);
31+
}
32+
33+
/**
34+
* Returns the next token.
35+
* <p/>
36+
* A token corresponds to a term, a record change or an end-of-file indicator.
37+
*
38+
* @param tkn an existing Token object to reuse. The caller is responsible to initialize the Token.
39+
* @return the next token found
40+
* @throws java.io.IOException on stream access error
41+
*/
42+
@Override
43+
Token nextToken(Token tkn) throws IOException {
44+
wsBuf.setLength(0); // reuse
45+
46+
// get the last read char (required for empty line detection)
47+
int lastChar = in.readAgain();
48+
49+
// read the next char and set eol
50+
/* note: unfortunately isEndOfLine may consumes a character silently.
51+
* this has no effect outside of the method. so a simple workaround
52+
* is to call 'readAgain' on the stream...
53+
*/
54+
int c = in.read();
55+
boolean eol = isEndOfLine(c);
56+
c = in.readAgain();
57+
58+
// empty line detection: eol AND (last char was EOL or beginning)
59+
if (format.isEmptyLinesIgnored()) {
60+
while (eol
61+
&& (lastChar == '\n' || lastChar == '\r' || lastChar == ExtendedBufferedReader.UNDEFINED)
62+
&& !isEndOfFile(lastChar)) {
63+
// go on char ahead ...
64+
lastChar = c;
65+
c = in.read();
66+
eol = isEndOfLine(c);
67+
c = in.readAgain();
68+
// reached end of file without any content (empty line at the end)
69+
if (isEndOfFile(c)) {
70+
tkn.type = EOF;
71+
return tkn;
72+
}
73+
}
74+
}
75+
76+
// did we reach eof during the last iteration already ? EOF
77+
if (isEndOfFile(lastChar) || (lastChar != format.getDelimiter() && isEndOfFile(c))) {
78+
tkn.type = EOF;
79+
return tkn;
80+
}
81+
82+
// important: make sure a new char gets consumed in each iteration
83+
while (!tkn.isReady && tkn.type != EOF) {
84+
// ignore whitespaces at beginning of a token
85+
if (format.isLeadingSpacesIgnored()) {
86+
while (isWhitespace(c) && !eol) {
87+
wsBuf.append((char) c);
88+
c = in.read();
89+
eol = isEndOfLine(c);
90+
}
91+
}
92+
93+
// ok, start of token reached: comment, encapsulated, or token
94+
if (c == format.getCommentStart()) {
95+
// ignore everything till end of line and continue (incr linecount)
96+
in.readLine();
97+
tkn = nextToken(tkn.reset());
98+
} else if (c == format.getDelimiter()) {
99+
// empty token return TOKEN("")
100+
tkn.type = TOKEN;
101+
tkn.isReady = true;
102+
} else if (eol) {
103+
// empty token return EORECORD("")
104+
//noop: tkn.content.append("");
105+
tkn.type = EORECORD;
106+
tkn.isReady = true;
107+
} else if (c == format.getEncapsulator()) {
108+
// consume encapsulated token
109+
encapsulatedTokenLexer(tkn, c);
110+
} else if (isEndOfFile(c)) {
111+
// end of file return EOF()
112+
//noop: tkn.content.append("");
113+
tkn.type = EOF;
114+
tkn.isReady = true;
115+
} else {
116+
// next token must be a simple token
117+
// add removed blanks when not ignoring whitespace chars...
118+
if (!format.isLeadingSpacesIgnored()) {
119+
tkn.content.append(wsBuf);
120+
}
121+
simpleTokenLexer(tkn, c);
122+
}
123+
}
124+
return tkn;
125+
}
126+
127+
/**
128+
* A simple token lexer
129+
* <p/>
130+
* Simple token are tokens which are not surrounded by encapsulators.
131+
* A simple token might contain escaped delimiters (as \, or \;). The
132+
* token is finished when one of the following conditions become true:
133+
* <ul>
134+
* <li>end of line has been reached (EORECORD)</li>
135+
* <li>end of stream has been reached (EOF)</li>
136+
* <li>an unescaped delimiter has been reached (TOKEN)</li>
137+
* </ul>
138+
*
139+
* @param tkn the current token
140+
* @param c the current character
141+
* @return the filled token
142+
* @throws IOException on stream access error
143+
*/
144+
private Token simpleTokenLexer(Token tkn, int c) throws IOException {
145+
while (true) {
146+
if (isEndOfLine(c)) {
147+
// end of record
148+
tkn.type = EORECORD;
149+
tkn.isReady = true;
150+
break;
151+
} else if (isEndOfFile(c)) {
152+
// end of file
153+
tkn.type = EOF;
154+
tkn.isReady = true;
155+
break;
156+
} else if (c == format.getDelimiter()) {
157+
// end of token
158+
tkn.type = TOKEN;
159+
tkn.isReady = true;
160+
break;
161+
} else if (c == format.getEscape()) {
162+
tkn.content.append((char) readEscape(c));
163+
} else {
164+
tkn.content.append((char) c);
165+
}
166+
167+
c = in.read();
168+
}
169+
170+
if (format.isTrailingSpacesIgnored()) {
171+
trimTrailingSpaces(tkn.content);
172+
}
173+
174+
return tkn;
175+
}
176+
177+
/**
178+
* An encapsulated token lexer
179+
* <p/>
180+
* Encapsulated tokens are surrounded by the given encapsulating-string.
181+
* The encapsulator itself might be included in the token using a
182+
* doubling syntax (as "", '') or using escaping (as in \", \').
183+
* Whitespaces before and after an encapsulated token are ignored.
184+
*
185+
* @param tkn the current token
186+
* @param c the current character
187+
* @return a valid token object
188+
* @throws IOException on invalid state
189+
*/
190+
private Token encapsulatedTokenLexer(Token tkn, int c) throws IOException {
191+
// save current line
192+
int startLineNumber = getLineNumber();
193+
// ignore the given delimiter
194+
// assert c == delimiter;
195+
while (true) {
196+
c = in.read();
197+
198+
if (c == format.getEscape()) {
199+
tkn.content.append((char) readEscape(c));
200+
} else if (c == format.getEncapsulator()) {
201+
if (in.lookAhead() == format.getEncapsulator()) {
202+
// double or escaped encapsulator -> add single encapsulator to token
203+
c = in.read();
204+
tkn.content.append((char) c);
205+
} else {
206+
// token finish mark (encapsulator) reached: ignore whitespace till delimiter
207+
while (true) {
208+
c = in.read();
209+
if (c == format.getDelimiter()) {
210+
tkn.type = TOKEN;
211+
tkn.isReady = true;
212+
return tkn;
213+
} else if (isEndOfFile(c)) {
214+
tkn.type = EOF;
215+
tkn.isReady = true;
216+
return tkn;
217+
} else if (isEndOfLine(c)) {
218+
// ok eo token reached
219+
tkn.type = EORECORD;
220+
tkn.isReady = true;
221+
return tkn;
222+
} else if (!isWhitespace(c)) {
223+
// error invalid char between token and next delimiter
224+
throw new IOException("(line " + getLineNumber() + ") invalid char between encapsulated token and delimiter");
225+
}
226+
}
227+
}
228+
} else if (isEndOfFile(c)) {
229+
// error condition (end of file before end of token)
230+
throw new IOException("(startline " + startLineNumber + ") EOF reached before encapsulated token finished");
231+
} else {
232+
// consume character
233+
tkn.content.append((char) c);
234+
}
235+
}
236+
}
237+
238+
}

0 commit comments

Comments
 (0)