1+ /*
2+ * Licensed to the Apache Software Foundation (ASF) under one or more
3+ * contributor license agreements. See the NOTICE file distributed with
4+ * this work for additional information regarding copyright ownership.
5+ * The ASF licenses this file to You under the Apache License, Version 2.0
6+ * (the "License"); you may not use this file except in compliance with
7+ * the License. You may obtain a copy of the License at
8+ *
9+ * http://www.apache.org/licenses/LICENSE-2.0
10+ *
11+ * Unless required by applicable law or agreed to in writing, software
12+ * distributed under the License is distributed on an "AS IS" BASIS,
13+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+ * See the License for the specific language governing permissions and
15+ * limitations under the License.
16+ */
17+
18+ package org .apache .commons .csv ;
19+
20+ import java .io .IOException ;
21+
22+ import static org .apache .commons .csv .Token .Type .*;
23+
24+ class CSVLexer1306667 extends Lexer {
25+
26+ // ctor needs to be public so can be called dynamically by PerformanceTest class
27+ public CSVLexer1306667 (CSVFormat format , ExtendedBufferedReader in ) {
28+ super (format , in );
29+ }
30+
31+ /**
32+ * Returns the next token.
33+ * <p/>
34+ * A token corresponds to a term, a record change or an end-of-file indicator.
35+ *
36+ * @param tkn an existing Token object to reuse. The caller is responsible to initialize the Token.
37+ * @return the next token found
38+ * @throws java.io.IOException on stream access error
39+ */
40+ @ Override
41+ Token nextToken (Token tkn ) throws IOException {
42+
43+ // get the last read char (required for empty line detection)
44+ int lastChar = in .readAgain ();
45+
46+ // read the next char and set eol
47+ int c = in .read ();
48+
49+ /* note: unfortunately isEndOfLine may consumes a character silently.
50+ * this has no effect outside of the method. so a simple workaround
51+ * is to call 'readAgain' on the stream...
52+ */
53+ boolean eol = isEndOfLine (c );
54+ c = in .readAgain ();
55+
56+ // empty line detection: eol AND (last char was EOL or beginning)
57+ if (emptyLinesIgnored ) {
58+ while (eol && isStartOfLine (lastChar )) {
59+ // go on char ahead ...
60+ lastChar = c ;
61+ c = in .read ();
62+ eol = isEndOfLine (c );
63+ c = in .readAgain ();
64+ // reached end of file without any content (empty line at the end)
65+ if (isEndOfFile (c )) {
66+ tkn .type = EOF ;
67+ // don't set tkn.isReady here because no content
68+ return tkn ;
69+ }
70+ }
71+ }
72+
73+ // did we reach eof during the last iteration already ? EOF
74+ if (isEndOfFile (lastChar ) || (!isDelimiter (lastChar ) && isEndOfFile (c ))) {
75+ tkn .type = EOF ;
76+ // don't set tkn.isReady here because no content
77+ return tkn ;
78+ }
79+
80+ if (isStartOfLine (lastChar ) && isCommentStart (c )) {
81+ in .readLine ();
82+ tkn .type = COMMENT ;
83+ return tkn ;
84+ }
85+
86+ // important: make sure a new char gets consumed in each iteration
87+ while (tkn .type == INVALID ) {
88+ // ignore whitespaces at beginning of a token
89+ if (surroundingSpacesIgnored ) {
90+ while (isWhitespace (c ) && !eol ) {
91+ c = in .read ();
92+ eol = isEndOfLine (c );
93+ }
94+ }
95+
96+ // ok, start of token reached: encapsulated, or token
97+ if (isDelimiter (c )) {
98+ // empty token return TOKEN("")
99+ tkn .type = TOKEN ;
100+ } else if (eol ) {
101+ // empty token return EORECORD("")
102+ //noop: tkn.content.append("");
103+ tkn .type = EORECORD ;
104+ } else if (isEncapsulator (c )) {
105+ // consume encapsulated token
106+ encapsulatedTokenLexer (tkn );
107+ } else if (isEndOfFile (c )) {
108+ // end of file return EOF()
109+ //noop: tkn.content.append("");
110+ tkn .type = EOF ;
111+ tkn .isReady = true ; // there is data at EOF
112+ } else {
113+ // next token must be a simple token
114+ // add removed blanks when not ignoring whitespace chars...
115+ simpleTokenLexer (tkn , c );
116+ }
117+ }
118+ return tkn ;
119+ }
120+
121+ /**
122+ * A simple token lexer
123+ * <p/>
124+ * Simple token are tokens which are not surrounded by encapsulators.
125+ * A simple token might contain escaped delimiters (as \, or \;). The
126+ * token is finished when one of the following conditions become true:
127+ * <ul>
128+ * <li>end of line has been reached (EORECORD)</li>
129+ * <li>end of stream has been reached (EOF)</li>
130+ * <li>an unescaped delimiter has been reached (TOKEN)</li>
131+ * </ul>
132+ *
133+ * @param tkn the current token
134+ * @param c the current character
135+ * @return the filled token
136+ * @throws IOException on stream access error
137+ */
138+ private Token simpleTokenLexer (Token tkn , int c ) throws IOException {
139+ // Faster to use while(true)+break than while(tkn.type == INVALID)
140+ while (true ) {
141+ if (isEndOfLine (c )) {
142+ tkn .type = EORECORD ;
143+ break ;
144+ } else if (isEndOfFile (c )) {
145+ tkn .type = EOF ;
146+ tkn .isReady = true ; // There is data at EOF
147+ break ;
148+ } else if (isDelimiter (c )) {
149+ tkn .type = TOKEN ;
150+ break ;
151+ } else if (isEscape (c )) {
152+ tkn .content .append ((char ) readEscape ());
153+ c = in .read (); // continue
154+ } else {
155+ tkn .content .append ((char ) c );
156+ c = in .read (); // continue
157+ }
158+ }
159+
160+ if (surroundingSpacesIgnored ) {
161+ trimTrailingSpaces (tkn .content );
162+ }
163+
164+ return tkn ;
165+ }
166+
167+ /**
168+ * An encapsulated token lexer
169+ * <p/>
170+ * Encapsulated tokens are surrounded by the given encapsulating-string.
171+ * The encapsulator itself might be included in the token using a
172+ * doubling syntax (as "", '') or using escaping (as in \", \').
173+ * Whitespaces before and after an encapsulated token are ignored.
174+ *
175+ * @param tkn the current token
176+ * @return a valid token object
177+ * @throws IOException on invalid state
178+ */
179+ private Token encapsulatedTokenLexer (Token tkn ) throws IOException {
180+ // save current line
181+ int startLineNumber = getLineNumber ();
182+ // ignore the given delimiter
183+ // assert c == delimiter;
184+ int c ;
185+ while (true ) {
186+ c = in .read ();
187+
188+ if (isEscape (c )) {
189+ tkn .content .append ((char ) readEscape ());
190+ } else if (isEncapsulator (c )) {
191+ if (isEncapsulator (in .lookAhead ())) {
192+ // double or escaped encapsulator -> add single encapsulator to token
193+ c = in .read ();
194+ tkn .content .append ((char ) c );
195+ } else {
196+ // token finish mark (encapsulator) reached: ignore whitespace till delimiter
197+ while (true ) {
198+ c = in .read ();
199+ if (isDelimiter (c )) {
200+ tkn .type = TOKEN ;
201+ return tkn ;
202+ } else if (isEndOfFile (c )) {
203+ tkn .type = EOF ;
204+ tkn .isReady = true ; // There is data at EOF
205+ return tkn ;
206+ } else if (isEndOfLine (c )) {
207+ // ok eo token reached
208+ tkn .type = EORECORD ;
209+ return tkn ;
210+ } else if (!isWhitespace (c )) {
211+ // error invalid char between token and next delimiter
212+ throw new IOException ("(line " + getLineNumber () + ") invalid char between encapsulated token and delimiter" );
213+ }
214+ }
215+ }
216+ } else if (isEndOfFile (c )) {
217+ // error condition (end of file before end of token)
218+ throw new IOException ("(startline " + startLineNumber + ") EOF reached before encapsulated token finished" );
219+ } else {
220+ // consume character
221+ tkn .content .append ((char ) c );
222+ }
223+ }
224+ }
225+
226+ }
0 commit comments