1+ /*
2+ * Licensed to the Apache Software Foundation (ASF) under one or more
3+ * contributor license agreements. See the NOTICE file distributed with
4+ * this work for additional information regarding copyright ownership.
5+ * The ASF licenses this file to You under the Apache License, Version 2.0
6+ * (the "License"); you may not use this file except in compliance with
7+ * the License. You may obtain a copy of the License at
8+ *
9+ * http://www.apache.org/licenses/LICENSE-2.0
10+ *
11+ * Unless required by applicable law or agreed to in writing, software
12+ * distributed under the License is distributed on an "AS IS" BASIS,
13+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+ * See the License for the specific language governing permissions and
15+ * limitations under the License.
16+ */
17+
18+ package org .apache .commons .csv ;
19+
20+ import java .io .IOException ;
21+
22+ import static org .apache .commons .csv .CSVLexer .Token .Type .*;
23+
24+ class CSVLexer {
25+
26+ /** length of the initial token (content-)buffer */
27+ private static final int INITIAL_TOKEN_LENGTH = 50 ;
28+
29+ private final StringBuilder wsBuf = new StringBuilder ();
30+
31+ private final CSVFormat format ;
32+
33+ /** The input stream */
34+ private final ExtendedBufferedReader in ;
35+
36+ /**
37+ * Token is an internal token representation.
38+ * <p/>
39+ * It is used as contract between the lexer and the parser.
40+ */
41+ static class Token {
42+
43+ enum Type {
44+ /** Token has no valid content, i.e. is in its initialized state. */
45+ INVALID ,
46+
47+ /** Token with content, at beginning or in the middle of a line. */
48+ TOKEN ,
49+
50+ /** Token (which can have content) when end of file is reached. */
51+ EOF ,
52+
53+ /** Token with content when end of a line is reached. */
54+ EORECORD
55+ }
56+
57+ /** Token type */
58+ Type type = INVALID ;
59+
60+ /** The content buffer. */
61+ StringBuilder content = new StringBuilder (INITIAL_TOKEN_LENGTH );
62+
63+ /** Token ready flag: indicates a valid token with content (ready for the parser). */
64+ boolean isReady ;
65+
66+ Token reset () {
67+ content .setLength (0 );
68+ type = INVALID ;
69+ isReady = false ;
70+ return this ;
71+ }
72+ }
73+
74+ CSVLexer (CSVFormat format , ExtendedBufferedReader in ) {
75+ this .format = format ;
76+ this .in = in ;
77+ }
78+
79+ public int getLineNumber () {
80+ return in .getLineNumber ();
81+ }
82+
83+ /**
84+ * Returns the next token.
85+ * <p/>
86+ * A token corresponds to a term, a record change or an end-of-file indicator.
87+ *
88+ * @param tkn an existing Token object to reuse. The caller is responsible to initialize the Token.
89+ * @return the next token found
90+ * @throws java.io.IOException on stream access error
91+ */
92+ Token nextToken (Token tkn ) throws IOException {
93+ wsBuf .setLength (0 ); // reuse
94+
95+ // get the last read char (required for empty line detection)
96+ int lastChar = in .readAgain ();
97+
98+ // read the next char and set eol
99+ /* note: unfortunately isEndOfLine may consumes a character silently.
100+ * this has no effect outside of the method. so a simple workaround
101+ * is to call 'readAgain' on the stream...
102+ */
103+ int c = in .read ();
104+ boolean eol = isEndOfLine (c );
105+ c = in .readAgain ();
106+
107+ // empty line detection: eol AND (last char was EOL or beginning)
108+ if (format .isEmptyLinesIgnored ()) {
109+ while (eol
110+ && (lastChar == '\n' || lastChar == '\r' || lastChar == ExtendedBufferedReader .UNDEFINED )
111+ && !isEndOfFile (lastChar )) {
112+ // go on char ahead ...
113+ lastChar = c ;
114+ c = in .read ();
115+ eol = isEndOfLine (c );
116+ c = in .readAgain ();
117+ // reached end of file without any content (empty line at the end)
118+ if (isEndOfFile (c )) {
119+ tkn .type = EOF ;
120+ return tkn ;
121+ }
122+ }
123+ }
124+
125+ // did we reach eof during the last iteration already ? EOF
126+ if (isEndOfFile (lastChar ) || (lastChar != format .getDelimiter () && isEndOfFile (c ))) {
127+ tkn .type = EOF ;
128+ return tkn ;
129+ }
130+
131+ // important: make sure a new char gets consumed in each iteration
132+ while (!tkn .isReady && tkn .type != EOF ) {
133+ // ignore whitespaces at beginning of a token
134+ if (format .isLeadingSpacesIgnored ()) {
135+ while (isWhitespace (c ) && !eol ) {
136+ wsBuf .append ((char ) c );
137+ c = in .read ();
138+ eol = isEndOfLine (c );
139+ }
140+ }
141+
142+ // ok, start of token reached: comment, encapsulated, or token
143+ if (c == format .getCommentStart ()) {
144+ // ignore everything till end of line and continue (incr linecount)
145+ in .readLine ();
146+ tkn = nextToken (tkn .reset ());
147+ } else if (c == format .getDelimiter ()) {
148+ // empty token return TOKEN("")
149+ tkn .type = TOKEN ;
150+ tkn .isReady = true ;
151+ } else if (eol ) {
152+ // empty token return EORECORD("")
153+ //noop: tkn.content.append("");
154+ tkn .type = EORECORD ;
155+ tkn .isReady = true ;
156+ } else if (c == format .getEncapsulator ()) {
157+ // consume encapsulated token
158+ encapsulatedTokenLexer (tkn , c );
159+ } else if (isEndOfFile (c )) {
160+ // end of file return EOF()
161+ //noop: tkn.content.append("");
162+ tkn .type = EOF ;
163+ tkn .isReady = true ;
164+ } else {
165+ // next token must be a simple token
166+ // add removed blanks when not ignoring whitespace chars...
167+ if (!format .isLeadingSpacesIgnored ()) {
168+ tkn .content .append (wsBuf );
169+ }
170+ simpleTokenLexer (tkn , c );
171+ }
172+ }
173+ return tkn ;
174+ }
175+
176+ /**
177+ * A simple token lexer
178+ * <p/>
179+ * Simple token are tokens which are not surrounded by encapsulators.
180+ * A simple token might contain escaped delimiters (as \, or \;). The
181+ * token is finished when one of the following conditions become true:
182+ * <ul>
183+ * <li>end of line has been reached (EORECORD)</li>
184+ * <li>end of stream has been reached (EOF)</li>
185+ * <li>an unescaped delimiter has been reached (TOKEN)</li>
186+ * </ul>
187+ *
188+ * @param tkn the current token
189+ * @param c the current character
190+ * @return the filled token
191+ * @throws IOException on stream access error
192+ */
193+ private Token simpleTokenLexer (Token tkn , int c ) throws IOException {
194+ while (true ) {
195+ if (isEndOfLine (c )) {
196+ // end of record
197+ tkn .type = EORECORD ;
198+ tkn .isReady = true ;
199+ break ;
200+ } else if (isEndOfFile (c )) {
201+ // end of file
202+ tkn .type = EOF ;
203+ tkn .isReady = true ;
204+ break ;
205+ } else if (c == format .getDelimiter ()) {
206+ // end of token
207+ tkn .type = TOKEN ;
208+ tkn .isReady = true ;
209+ break ;
210+ } else if (c == format .getEscape ()) {
211+ tkn .content .append ((char ) readEscape (c ));
212+ } else {
213+ tkn .content .append ((char ) c );
214+ }
215+
216+ c = in .read ();
217+ }
218+
219+ if (format .isTrailingSpacesIgnored ()) {
220+ trimTrailingSpaces (tkn .content );
221+ }
222+
223+ return tkn ;
224+ }
225+
226+ private void trimTrailingSpaces (StringBuilder buffer ) {
227+ int length = buffer .length ();
228+ while (length > 0 && Character .isWhitespace (buffer .charAt (length - 1 ))) {
229+ length = length - 1 ;
230+ }
231+ if (length != buffer .length ()) {
232+ buffer .setLength (length );
233+ }
234+ }
235+
236+ /**
237+ * An encapsulated token lexer
238+ * <p/>
239+ * Encapsulated tokens are surrounded by the given encapsulating-string.
240+ * The encapsulator itself might be included in the token using a
241+ * doubling syntax (as "", '') or using escaping (as in \", \').
242+ * Whitespaces before and after an encapsulated token are ignored.
243+ *
244+ * @param tkn the current token
245+ * @param c the current character
246+ * @return a valid token object
247+ * @throws IOException on invalid state
248+ */
249+ private Token encapsulatedTokenLexer (Token tkn , int c ) throws IOException {
250+ // save current line
251+ int startLineNumber = getLineNumber ();
252+ // ignore the given delimiter
253+ // assert c == delimiter;
254+ while (true ) {
255+ c = in .read ();
256+
257+ if (c == format .getEscape ()) {
258+ tkn .content .append ((char ) readEscape (c ));
259+ } else if (c == format .getEncapsulator ()) {
260+ if (in .lookAhead () == format .getEncapsulator ()) {
261+ // double or escaped encapsulator -> add single encapsulator to token
262+ c = in .read ();
263+ tkn .content .append ((char ) c );
264+ } else {
265+ // token finish mark (encapsulator) reached: ignore whitespace till delimiter
266+ while (true ) {
267+ c = in .read ();
268+ if (c == format .getDelimiter ()) {
269+ tkn .type = TOKEN ;
270+ tkn .isReady = true ;
271+ return tkn ;
272+ } else if (isEndOfFile (c )) {
273+ tkn .type = EOF ;
274+ tkn .isReady = true ;
275+ return tkn ;
276+ } else if (isEndOfLine (c )) {
277+ // ok eo token reached
278+ tkn .type = EORECORD ;
279+ tkn .isReady = true ;
280+ return tkn ;
281+ } else if (!isWhitespace (c )) {
282+ // error invalid char between token and next delimiter
283+ throw new IOException ("(line " + getLineNumber () + ") invalid char between encapsulated token and delimiter" );
284+ }
285+ }
286+ }
287+ } else if (isEndOfFile (c )) {
288+ // error condition (end of file before end of token)
289+ throw new IOException ("(startline " + startLineNumber + ") EOF reached before encapsulated token finished" );
290+ } else {
291+ // consume character
292+ tkn .content .append ((char ) c );
293+ }
294+ }
295+ }
296+
297+ private int readEscape (int c ) throws IOException {
298+ // assume c is the escape char (normally a backslash)
299+ c = in .read ();
300+ switch (c ) {
301+ case 'r' :
302+ return '\r' ;
303+ case 'n' :
304+ return '\n' ;
305+ case 't' :
306+ return '\t' ;
307+ case 'b' :
308+ return '\b' ;
309+ case 'f' :
310+ return '\f' ;
311+ default :
312+ return c ;
313+ }
314+ }
315+
316+ /**
317+ * @return true if the given char is a whitespace character
318+ */
319+ private boolean isWhitespace (int c ) {
320+ return (c != format .getDelimiter ()) && Character .isWhitespace ((char ) c );
321+ }
322+
323+ /**
324+ * Greedy - accepts \n, \r and \r\n
325+ * This checker consumes silently the second control-character...
326+ *
327+ * @return true if the given character is a line-terminator
328+ */
329+ private boolean isEndOfLine (int c ) throws IOException {
330+ // check if we have \r\n...
331+ if (c == '\r' && in .lookAhead () == '\n' ) {
332+ // note: does not change c outside of this method !!
333+ c = in .read ();
334+ }
335+ return (c == '\n' || c == '\r' );
336+ }
337+
338+ /**
339+ * @return true if the given character indicates end of file
340+ */
341+ private boolean isEndOfFile (int c ) {
342+ return c == ExtendedBufferedReader .END_OF_STREAM ;
343+ }
344+ }
0 commit comments