Skip to content

Commit 38670db

Browse files
committed
Moved the lexer in a separate file
git-svn-id: https://svn.apache.org/repos/asf/commons/proper/csv/trunk@1300850 13f79535-47bb-0310-9956-ffa450edef68
1 parent 35b954e commit 38670db

2 files changed

Lines changed: 344 additions & 323 deletions

File tree

Lines changed: 344 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,344 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package org.apache.commons.csv;
19+
20+
import java.io.IOException;
21+
22+
import static org.apache.commons.csv.CSVLexer.Token.Type.*;
23+
24+
class CSVLexer {
25+
26+
/** Initial capacity of the content buffer allocated for each Token. */
27+
private static final int INITIAL_TOKEN_LENGTH = 50;
28+
29+
/** Reusable buffer for the whitespace skipped in front of a token; cleared at the start of every nextToken call. */
private final StringBuilder wsBuf = new StringBuilder();
30+
31+
/** Format settings consulted while lexing (delimiter, encapsulator, escape, comment start, whitespace and empty-line handling). */
private final CSVFormat format;
32+
33+
/** The reader all characters are pulled from. */
34+
private final ExtendedBufferedReader in;
35+
36+
/**
37+
* Token is an internal token representation.
38+
* <p/>
39+
* It is used as contract between the lexer and the parser.
40+
*/
41+
static class Token {
42+
43+
enum Type {
44+
/** Token has no valid content, i.e. is in its initialized state. */
45+
INVALID,
46+
47+
/** Token with content, at beginning or in the middle of a line. */
48+
TOKEN,
49+
50+
/** Token (which can have content) when end of file is reached. */
51+
EOF,
52+
53+
/** Token with content when end of a line is reached. */
54+
EORECORD
55+
}
56+
57+
/** Token type */
58+
Type type = INVALID;
59+
60+
/** The content buffer. */
61+
StringBuilder content = new StringBuilder(INITIAL_TOKEN_LENGTH);
62+
63+
/** Token ready flag: indicates a valid token with content (ready for the parser). */
64+
boolean isReady;
65+
66+
Token reset() {
67+
content.setLength(0);
68+
type = INVALID;
69+
isReady = false;
70+
return this;
71+
}
72+
}
73+
74+
/**
 * Creates a lexer that reads from the given input using the given format.
 *
 * @param format the format settings consulted while lexing
 * @param in     the reader supplying the characters
 */
CSVLexer(CSVFormat format, ExtendedBufferedReader in) {
    this.in = in;
    this.format = format;
}
78+
79+
public int getLineNumber() {
80+
return in.getLineNumber();
81+
}
82+
83+
/**
84+
* Returns the next token.
85+
* <p/>
86+
* A token corresponds to a term, a record change or an end-of-file indicator.
87+
*
88+
* @param tkn an existing Token object to reuse. The caller is responsible to initialize the Token.
89+
* @return the next token found
90+
* @throws java.io.IOException on stream access error
91+
*/
92+
Token nextToken(Token tkn) throws IOException {
93+
wsBuf.setLength(0); // reuse
94+
95+
// get the last read char (required for empty line detection)
96+
int lastChar = in.readAgain();
97+
98+
// read the next char and set eol
99+
/* note: unfortunately isEndOfLine may consumes a character silently.
100+
* this has no effect outside of the method. so a simple workaround
101+
* is to call 'readAgain' on the stream...
102+
*/
103+
int c = in.read();
104+
boolean eol = isEndOfLine(c);
105+
c = in.readAgain();
106+
107+
// empty line detection: eol AND (last char was EOL or beginning)
108+
if (format.isEmptyLinesIgnored()) {
109+
while (eol
110+
&& (lastChar == '\n' || lastChar == '\r' || lastChar == ExtendedBufferedReader.UNDEFINED)
111+
&& !isEndOfFile(lastChar)) {
112+
// go on char ahead ...
113+
lastChar = c;
114+
c = in.read();
115+
eol = isEndOfLine(c);
116+
c = in.readAgain();
117+
// reached end of file without any content (empty line at the end)
118+
if (isEndOfFile(c)) {
119+
tkn.type = EOF;
120+
return tkn;
121+
}
122+
}
123+
}
124+
125+
// did we reach eof during the last iteration already ? EOF
126+
if (isEndOfFile(lastChar) || (lastChar != format.getDelimiter() && isEndOfFile(c))) {
127+
tkn.type = EOF;
128+
return tkn;
129+
}
130+
131+
// important: make sure a new char gets consumed in each iteration
132+
while (!tkn.isReady && tkn.type != EOF) {
133+
// ignore whitespaces at beginning of a token
134+
if (format.isLeadingSpacesIgnored()) {
135+
while (isWhitespace(c) && !eol) {
136+
wsBuf.append((char) c);
137+
c = in.read();
138+
eol = isEndOfLine(c);
139+
}
140+
}
141+
142+
// ok, start of token reached: comment, encapsulated, or token
143+
if (c == format.getCommentStart()) {
144+
// ignore everything till end of line and continue (incr linecount)
145+
in.readLine();
146+
tkn = nextToken(tkn.reset());
147+
} else if (c == format.getDelimiter()) {
148+
// empty token return TOKEN("")
149+
tkn.type = TOKEN;
150+
tkn.isReady = true;
151+
} else if (eol) {
152+
// empty token return EORECORD("")
153+
//noop: tkn.content.append("");
154+
tkn.type = EORECORD;
155+
tkn.isReady = true;
156+
} else if (c == format.getEncapsulator()) {
157+
// consume encapsulated token
158+
encapsulatedTokenLexer(tkn, c);
159+
} else if (isEndOfFile(c)) {
160+
// end of file return EOF()
161+
//noop: tkn.content.append("");
162+
tkn.type = EOF;
163+
tkn.isReady = true;
164+
} else {
165+
// next token must be a simple token
166+
// add removed blanks when not ignoring whitespace chars...
167+
if (!format.isLeadingSpacesIgnored()) {
168+
tkn.content.append(wsBuf);
169+
}
170+
simpleTokenLexer(tkn, c);
171+
}
172+
}
173+
return tkn;
174+
}
175+
176+
/**
177+
* A simple token lexer
178+
* <p/>
179+
* Simple token are tokens which are not surrounded by encapsulators.
180+
* A simple token might contain escaped delimiters (as \, or \;). The
181+
* token is finished when one of the following conditions become true:
182+
* <ul>
183+
* <li>end of line has been reached (EORECORD)</li>
184+
* <li>end of stream has been reached (EOF)</li>
185+
* <li>an unescaped delimiter has been reached (TOKEN)</li>
186+
* </ul>
187+
*
188+
* @param tkn the current token
189+
* @param c the current character
190+
* @return the filled token
191+
* @throws IOException on stream access error
192+
*/
193+
private Token simpleTokenLexer(Token tkn, int c) throws IOException {
194+
while (true) {
195+
if (isEndOfLine(c)) {
196+
// end of record
197+
tkn.type = EORECORD;
198+
tkn.isReady = true;
199+
break;
200+
} else if (isEndOfFile(c)) {
201+
// end of file
202+
tkn.type = EOF;
203+
tkn.isReady = true;
204+
break;
205+
} else if (c == format.getDelimiter()) {
206+
// end of token
207+
tkn.type = TOKEN;
208+
tkn.isReady = true;
209+
break;
210+
} else if (c == format.getEscape()) {
211+
tkn.content.append((char) readEscape(c));
212+
} else {
213+
tkn.content.append((char) c);
214+
}
215+
216+
c = in.read();
217+
}
218+
219+
if (format.isTrailingSpacesIgnored()) {
220+
trimTrailingSpaces(tkn.content);
221+
}
222+
223+
return tkn;
224+
}
225+
226+
private void trimTrailingSpaces(StringBuilder buffer) {
227+
int length = buffer.length();
228+
while (length > 0 && Character.isWhitespace(buffer.charAt(length - 1))) {
229+
length = length - 1;
230+
}
231+
if (length != buffer.length()) {
232+
buffer.setLength(length);
233+
}
234+
}
235+
236+
/**
237+
* An encapsulated token lexer
238+
* <p/>
239+
* Encapsulated tokens are surrounded by the given encapsulating-string.
240+
* The encapsulator itself might be included in the token using a
241+
* doubling syntax (as "", '') or using escaping (as in \", \').
242+
* Whitespaces before and after an encapsulated token are ignored.
243+
*
244+
* @param tkn the current token
245+
* @param c the current character
246+
* @return a valid token object
247+
* @throws IOException on invalid state
248+
*/
249+
private Token encapsulatedTokenLexer(Token tkn, int c) throws IOException {
250+
// save current line
251+
int startLineNumber = getLineNumber();
252+
// ignore the given delimiter
253+
// assert c == delimiter;
254+
while (true) {
255+
c = in.read();
256+
257+
if (c == format.getEscape()) {
258+
tkn.content.append((char) readEscape(c));
259+
} else if (c == format.getEncapsulator()) {
260+
if (in.lookAhead() == format.getEncapsulator()) {
261+
// double or escaped encapsulator -> add single encapsulator to token
262+
c = in.read();
263+
tkn.content.append((char) c);
264+
} else {
265+
// token finish mark (encapsulator) reached: ignore whitespace till delimiter
266+
while (true) {
267+
c = in.read();
268+
if (c == format.getDelimiter()) {
269+
tkn.type = TOKEN;
270+
tkn.isReady = true;
271+
return tkn;
272+
} else if (isEndOfFile(c)) {
273+
tkn.type = EOF;
274+
tkn.isReady = true;
275+
return tkn;
276+
} else if (isEndOfLine(c)) {
277+
// ok eo token reached
278+
tkn.type = EORECORD;
279+
tkn.isReady = true;
280+
return tkn;
281+
} else if (!isWhitespace(c)) {
282+
// error invalid char between token and next delimiter
283+
throw new IOException("(line " + getLineNumber() + ") invalid char between encapsulated token and delimiter");
284+
}
285+
}
286+
}
287+
} else if (isEndOfFile(c)) {
288+
// error condition (end of file before end of token)
289+
throw new IOException("(startline " + startLineNumber + ") EOF reached before encapsulated token finished");
290+
} else {
291+
// consume character
292+
tkn.content.append((char) c);
293+
}
294+
}
295+
}
296+
297+
private int readEscape(int c) throws IOException {
298+
// assume c is the escape char (normally a backslash)
299+
c = in.read();
300+
switch (c) {
301+
case 'r':
302+
return '\r';
303+
case 'n':
304+
return '\n';
305+
case 't':
306+
return '\t';
307+
case 'b':
308+
return '\b';
309+
case 'f':
310+
return '\f';
311+
default:
312+
return c;
313+
}
314+
}
315+
316+
/**
317+
* @return true if the given char is a whitespace character
318+
*/
319+
private boolean isWhitespace(int c) {
320+
return (c != format.getDelimiter()) && Character.isWhitespace((char) c);
321+
}
322+
323+
/**
324+
* Greedy - accepts \n, \r and \r\n
325+
* This checker consumes silently the second control-character...
326+
*
327+
* @return true if the given character is a line-terminator
328+
*/
329+
private boolean isEndOfLine(int c) throws IOException {
330+
// check if we have \r\n...
331+
if (c == '\r' && in.lookAhead() == '\n') {
332+
// note: does not change c outside of this method !!
333+
c = in.read();
334+
}
335+
return (c == '\n' || c == '\r');
336+
}
337+
338+
/**
339+
* @return true if the given character indicates end of file
340+
*/
341+
private boolean isEndOfFile(int c) {
342+
return c == ExtendedBufferedReader.END_OF_STREAM;
343+
}
344+
}

0 commit comments

Comments
 (0)