Skip to content

Commit 5a925ff

Browse files
committed
Another baseline
git-svn-id: https://svn.apache.org/repos/asf/commons/proper/csv/trunk@1307043 13f79535-47bb-0310-9956-ffa450edef68
1 parent 3296f59 commit 5a925ff

1 file changed

Lines changed: 226 additions & 0 deletions

File tree

Lines changed: 226 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,226 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package org.apache.commons.csv;
19+
20+
import java.io.IOException;
21+
22+
import static org.apache.commons.csv.Token.Type.*;
23+
24+
class CSVLexer1306667 extends Lexer {
25+
26+
// ctor needs to be public so can be called dynamically by PerformanceTest class
27+
public CSVLexer1306667(CSVFormat format, ExtendedBufferedReader in) {
28+
super(format, in);
29+
}
30+
31+
/**
32+
* Returns the next token.
33+
* <p/>
34+
* A token corresponds to a term, a record change or an end-of-file indicator.
35+
*
36+
* @param tkn an existing Token object to reuse. The caller is responsible to initialize the Token.
37+
* @return the next token found
38+
* @throws java.io.IOException on stream access error
39+
*/
40+
@Override
41+
Token nextToken(Token tkn) throws IOException {
42+
43+
// get the last read char (required for empty line detection)
44+
int lastChar = in.readAgain();
45+
46+
// read the next char and set eol
47+
int c = in.read();
48+
49+
/* note: unfortunately isEndOfLine may consumes a character silently.
50+
* this has no effect outside of the method. so a simple workaround
51+
* is to call 'readAgain' on the stream...
52+
*/
53+
boolean eol = isEndOfLine(c);
54+
c = in.readAgain();
55+
56+
// empty line detection: eol AND (last char was EOL or beginning)
57+
if (emptyLinesIgnored) {
58+
while (eol && isStartOfLine(lastChar)) {
59+
// go on char ahead ...
60+
lastChar = c;
61+
c = in.read();
62+
eol = isEndOfLine(c);
63+
c = in.readAgain();
64+
// reached end of file without any content (empty line at the end)
65+
if (isEndOfFile(c)) {
66+
tkn.type = EOF;
67+
// don't set tkn.isReady here because no content
68+
return tkn;
69+
}
70+
}
71+
}
72+
73+
// did we reach eof during the last iteration already ? EOF
74+
if (isEndOfFile(lastChar) || (!isDelimiter(lastChar) && isEndOfFile(c))) {
75+
tkn.type = EOF;
76+
// don't set tkn.isReady here because no content
77+
return tkn;
78+
}
79+
80+
if (isStartOfLine(lastChar) && isCommentStart(c)) {
81+
in.readLine();
82+
tkn.type = COMMENT;
83+
return tkn;
84+
}
85+
86+
// important: make sure a new char gets consumed in each iteration
87+
while (tkn.type == INVALID) {
88+
// ignore whitespaces at beginning of a token
89+
if (surroundingSpacesIgnored) {
90+
while (isWhitespace(c) && !eol) {
91+
c = in.read();
92+
eol = isEndOfLine(c);
93+
}
94+
}
95+
96+
// ok, start of token reached: encapsulated, or token
97+
if (isDelimiter(c)) {
98+
// empty token return TOKEN("")
99+
tkn.type = TOKEN;
100+
} else if (eol) {
101+
// empty token return EORECORD("")
102+
//noop: tkn.content.append("");
103+
tkn.type = EORECORD;
104+
} else if (isEncapsulator(c)) {
105+
// consume encapsulated token
106+
encapsulatedTokenLexer(tkn);
107+
} else if (isEndOfFile(c)) {
108+
// end of file return EOF()
109+
//noop: tkn.content.append("");
110+
tkn.type = EOF;
111+
tkn.isReady = true; // there is data at EOF
112+
} else {
113+
// next token must be a simple token
114+
// add removed blanks when not ignoring whitespace chars...
115+
simpleTokenLexer(tkn, c);
116+
}
117+
}
118+
return tkn;
119+
}
120+
121+
/**
122+
* A simple token lexer
123+
* <p/>
124+
* Simple token are tokens which are not surrounded by encapsulators.
125+
* A simple token might contain escaped delimiters (as \, or \;). The
126+
* token is finished when one of the following conditions become true:
127+
* <ul>
128+
* <li>end of line has been reached (EORECORD)</li>
129+
* <li>end of stream has been reached (EOF)</li>
130+
* <li>an unescaped delimiter has been reached (TOKEN)</li>
131+
* </ul>
132+
*
133+
* @param tkn the current token
134+
* @param c the current character
135+
* @return the filled token
136+
* @throws IOException on stream access error
137+
*/
138+
private Token simpleTokenLexer(Token tkn, int c) throws IOException {
139+
// Faster to use while(true)+break than while(tkn.type == INVALID)
140+
while (true) {
141+
if (isEndOfLine(c)) {
142+
tkn.type = EORECORD;
143+
break;
144+
} else if (isEndOfFile(c)) {
145+
tkn.type = EOF;
146+
tkn.isReady = true; // There is data at EOF
147+
break;
148+
} else if (isDelimiter(c)) {
149+
tkn.type = TOKEN;
150+
break;
151+
} else if (isEscape(c)) {
152+
tkn.content.append((char) readEscape());
153+
c = in.read(); // continue
154+
} else {
155+
tkn.content.append((char) c);
156+
c = in.read(); // continue
157+
}
158+
}
159+
160+
if (surroundingSpacesIgnored) {
161+
trimTrailingSpaces(tkn.content);
162+
}
163+
164+
return tkn;
165+
}
166+
167+
/**
168+
* An encapsulated token lexer
169+
* <p/>
170+
* Encapsulated tokens are surrounded by the given encapsulating-string.
171+
* The encapsulator itself might be included in the token using a
172+
* doubling syntax (as "", '') or using escaping (as in \", \').
173+
* Whitespaces before and after an encapsulated token are ignored.
174+
*
175+
* @param tkn the current token
176+
* @return a valid token object
177+
* @throws IOException on invalid state
178+
*/
179+
private Token encapsulatedTokenLexer(Token tkn) throws IOException {
180+
// save current line
181+
int startLineNumber = getLineNumber();
182+
// ignore the given delimiter
183+
// assert c == delimiter;
184+
int c;
185+
while (true) {
186+
c = in.read();
187+
188+
if (isEscape(c)) {
189+
tkn.content.append((char) readEscape());
190+
} else if (isEncapsulator(c)) {
191+
if (isEncapsulator(in.lookAhead())) {
192+
// double or escaped encapsulator -> add single encapsulator to token
193+
c = in.read();
194+
tkn.content.append((char) c);
195+
} else {
196+
// token finish mark (encapsulator) reached: ignore whitespace till delimiter
197+
while (true) {
198+
c = in.read();
199+
if (isDelimiter(c)) {
200+
tkn.type = TOKEN;
201+
return tkn;
202+
} else if (isEndOfFile(c)) {
203+
tkn.type = EOF;
204+
tkn.isReady = true; // There is data at EOF
205+
return tkn;
206+
} else if (isEndOfLine(c)) {
207+
// ok eo token reached
208+
tkn.type = EORECORD;
209+
return tkn;
210+
} else if (!isWhitespace(c)) {
211+
// error invalid char between token and next delimiter
212+
throw new IOException("(line " + getLineNumber() + ") invalid char between encapsulated token and delimiter");
213+
}
214+
}
215+
}
216+
} else if (isEndOfFile(c)) {
217+
// error condition (end of file before end of token)
218+
throw new IOException("(startline " + startLineNumber + ") EOF reached before encapsulated token finished");
219+
} else {
220+
// consume character
221+
tkn.content.append((char) c);
222+
}
223+
}
224+
}
225+
226+
}

0 commit comments

Comments
 (0)