1+ /*
2+ * Licensed to the Apache Software Foundation (ASF) under one or more
3+ * contributor license agreements. See the NOTICE file distributed with
4+ * this work for additional information regarding copyright ownership.
5+ * The ASF licenses this file to You under the Apache License, Version 2.0
6+ * (the "License"); you may not use this file except in compliance with
7+ * the License. You may obtain a copy of the License at
8+ *
9+ * http://www.apache.org/licenses/LICENSE-2.0
10+ *
11+ * Unless required by applicable law or agreed to in writing, software
12+ * distributed under the License is distributed on an "AS IS" BASIS,
13+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+ * See the License for the specific language governing permissions and
15+ * limitations under the License.
16+ */
17+
18+ package org .apache .commons .csv ;
19+
20+ import java .io .IOException ;
21+
22+ import static org .apache .commons .csv .Token .Type .*;
23+
24+ class CSVLexer1 extends Lexer {
25+
26+ private final StringBuilder wsBuf = new StringBuilder ();
27+
28+ // ctor needs to be public so can be called dynamically by PerformanceTest class
29+ public CSVLexer1 (CSVFormat format , ExtendedBufferedReader in ) {
30+ super (format , in );
31+ }
32+
33+ /**
34+ * Returns the next token.
35+ * <p/>
36+ * A token corresponds to a term, a record change or an end-of-file indicator.
37+ *
38+ * @param tkn an existing Token object to reuse. The caller is responsible to initialize the Token.
39+ * @return the next token found
40+ * @throws java.io.IOException on stream access error
41+ */
42+ @ Override
43+ Token nextToken (Token tkn ) throws IOException {
44+ wsBuf .setLength (0 ); // reuse
45+
46+ // get the last read char (required for empty line detection)
47+ int lastChar = in .readAgain ();
48+
49+ // read the next char and set eol
50+ /* note: unfortunately isEndOfLine may consumes a character silently.
51+ * this has no effect outside of the method. so a simple workaround
52+ * is to call 'readAgain' on the stream...
53+ */
54+ int c = in .read ();
55+ boolean eol = isEndOfLine (c );
56+ c = in .readAgain ();
57+
58+ // empty line detection: eol AND (last char was EOL or beginning)
59+ if (format .isEmptyLinesIgnored ()) {
60+ while (eol
61+ && (lastChar == '\n' || lastChar == '\r' || lastChar == ExtendedBufferedReader .UNDEFINED )
62+ && !isEndOfFile (lastChar )) {
63+ // go on char ahead ...
64+ lastChar = c ;
65+ c = in .read ();
66+ eol = isEndOfLine (c );
67+ c = in .readAgain ();
68+ // reached end of file without any content (empty line at the end)
69+ if (isEndOfFile (c )) {
70+ tkn .type = EOF ;
71+ return tkn ;
72+ }
73+ }
74+ }
75+
76+ // did we reach eof during the last iteration already ? EOF
77+ if (isEndOfFile (lastChar ) || (lastChar != format .getDelimiter () && isEndOfFile (c ))) {
78+ tkn .type = EOF ;
79+ return tkn ;
80+ }
81+
82+ // important: make sure a new char gets consumed in each iteration
83+ while (!tkn .isReady && tkn .type != EOF ) {
84+ // ignore whitespaces at beginning of a token
85+ if (format .isLeadingSpacesIgnored ()) {
86+ while (isWhitespace (c ) && !eol ) {
87+ wsBuf .append ((char ) c );
88+ c = in .read ();
89+ eol = isEndOfLine (c );
90+ }
91+ }
92+
93+ // ok, start of token reached: comment, encapsulated, or token
94+ if (c == format .getCommentStart ()) {
95+ // ignore everything till end of line and continue (incr linecount)
96+ in .readLine ();
97+ tkn = nextToken (tkn .reset ());
98+ } else if (c == format .getDelimiter ()) {
99+ // empty token return TOKEN("")
100+ tkn .type = TOKEN ;
101+ tkn .isReady = true ;
102+ } else if (eol ) {
103+ // empty token return EORECORD("")
104+ //noop: tkn.content.append("");
105+ tkn .type = EORECORD ;
106+ tkn .isReady = true ;
107+ } else if (c == format .getEncapsulator ()) {
108+ // consume encapsulated token
109+ encapsulatedTokenLexer (tkn , c );
110+ } else if (isEndOfFile (c )) {
111+ // end of file return EOF()
112+ //noop: tkn.content.append("");
113+ tkn .type = EOF ;
114+ tkn .isReady = true ;
115+ } else {
116+ // next token must be a simple token
117+ // add removed blanks when not ignoring whitespace chars...
118+ if (!format .isLeadingSpacesIgnored ()) {
119+ tkn .content .append (wsBuf );
120+ }
121+ simpleTokenLexer (tkn , c );
122+ }
123+ }
124+ return tkn ;
125+ }
126+
127+ /**
128+ * A simple token lexer
129+ * <p/>
130+ * Simple token are tokens which are not surrounded by encapsulators.
131+ * A simple token might contain escaped delimiters (as \, or \;). The
132+ * token is finished when one of the following conditions become true:
133+ * <ul>
134+ * <li>end of line has been reached (EORECORD)</li>
135+ * <li>end of stream has been reached (EOF)</li>
136+ * <li>an unescaped delimiter has been reached (TOKEN)</li>
137+ * </ul>
138+ *
139+ * @param tkn the current token
140+ * @param c the current character
141+ * @return the filled token
142+ * @throws IOException on stream access error
143+ */
144+ private Token simpleTokenLexer (Token tkn , int c ) throws IOException {
145+ while (true ) {
146+ if (isEndOfLine (c )) {
147+ // end of record
148+ tkn .type = EORECORD ;
149+ tkn .isReady = true ;
150+ break ;
151+ } else if (isEndOfFile (c )) {
152+ // end of file
153+ tkn .type = EOF ;
154+ tkn .isReady = true ;
155+ break ;
156+ } else if (c == format .getDelimiter ()) {
157+ // end of token
158+ tkn .type = TOKEN ;
159+ tkn .isReady = true ;
160+ break ;
161+ } else if (c == format .getEscape ()) {
162+ tkn .content .append ((char ) readEscape (c ));
163+ } else {
164+ tkn .content .append ((char ) c );
165+ }
166+
167+ c = in .read ();
168+ }
169+
170+ if (format .isTrailingSpacesIgnored ()) {
171+ trimTrailingSpaces (tkn .content );
172+ }
173+
174+ return tkn ;
175+ }
176+
177+ /**
178+ * An encapsulated token lexer
179+ * <p/>
180+ * Encapsulated tokens are surrounded by the given encapsulating-string.
181+ * The encapsulator itself might be included in the token using a
182+ * doubling syntax (as "", '') or using escaping (as in \", \').
183+ * Whitespaces before and after an encapsulated token are ignored.
184+ *
185+ * @param tkn the current token
186+ * @param c the current character
187+ * @return a valid token object
188+ * @throws IOException on invalid state
189+ */
190+ private Token encapsulatedTokenLexer (Token tkn , int c ) throws IOException {
191+ // save current line
192+ int startLineNumber = getLineNumber ();
193+ // ignore the given delimiter
194+ // assert c == delimiter;
195+ while (true ) {
196+ c = in .read ();
197+
198+ if (c == format .getEscape ()) {
199+ tkn .content .append ((char ) readEscape (c ));
200+ } else if (c == format .getEncapsulator ()) {
201+ if (in .lookAhead () == format .getEncapsulator ()) {
202+ // double or escaped encapsulator -> add single encapsulator to token
203+ c = in .read ();
204+ tkn .content .append ((char ) c );
205+ } else {
206+ // token finish mark (encapsulator) reached: ignore whitespace till delimiter
207+ while (true ) {
208+ c = in .read ();
209+ if (c == format .getDelimiter ()) {
210+ tkn .type = TOKEN ;
211+ tkn .isReady = true ;
212+ return tkn ;
213+ } else if (isEndOfFile (c )) {
214+ tkn .type = EOF ;
215+ tkn .isReady = true ;
216+ return tkn ;
217+ } else if (isEndOfLine (c )) {
218+ // ok eo token reached
219+ tkn .type = EORECORD ;
220+ tkn .isReady = true ;
221+ return tkn ;
222+ } else if (!isWhitespace (c )) {
223+ // error invalid char between token and next delimiter
224+ throw new IOException ("(line " + getLineNumber () + ") invalid char between encapsulated token and delimiter" );
225+ }
226+ }
227+ }
228+ } else if (isEndOfFile (c )) {
229+ // error condition (end of file before end of token)
230+ throw new IOException ("(startline " + startLineNumber + ") EOF reached before encapsulated token finished" );
231+ } else {
232+ // consume character
233+ tkn .content .append ((char ) c );
234+ }
235+ }
236+ }
237+
238+ }
0 commit comments