1+ /*
2+ * Licensed to the Apache Software Foundation (ASF) under one or more
3+ * contributor license agreements. See the NOTICE file distributed with
4+ * this work for additional information regarding copyright ownership.
5+ * The ASF licenses this file to You under the Apache License, Version 2.0
6+ * (the "License"); you may not use this file except in compliance with
7+ * the License. You may obtain a copy of the License at
8+ *
9+ * http://www.apache.org/licenses/LICENSE-2.0
10+ *
11+ * Unless required by applicable law or agreed to in writing, software
12+ * distributed under the License is distributed on an "AS IS" BASIS,
13+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+ * See the License for the specific language governing permissions and
15+ * limitations under the License.
16+ */
17+
18+ package org .apache .commons .csv ;
19+
20+ import java .io .IOException ;
21+
22+ import static org .apache .commons .csv .Token .Type .*;
23+
24+ /**
25+ * Experimental Lexer using enums to keep track of state and character type.
26+ * Unfortunately it is twice as slow.
27+ * For reference purpose only.
28+ *
29+ */
30+ class CSVLexer3 extends Lexer {
31+
32+ private final char escape ;
33+
34+ // ctor needs to be public so can be called dynamically by PerformanceTest class
35+ public CSVLexer3 (CSVFormat format , ExtendedBufferedReader in ) {
36+ super (format , in );
37+ this .escape = format .getEscape ();
38+ }
39+
40+ /**
41+ * Classify the character types
42+ */
43+ private static enum CharType {
44+ DELIM ,
45+ ESCAPE ,
46+ ENCAP ,
47+ EOL ,
48+ COMMENT_START ,
49+ WHITESPACE ,
50+ OTHER ,
51+ EOFCHAR
52+ }
53+
54+ private CharType classify (int intch ) {
55+ if (isDelimiter (intch )) {
56+ return CharType .DELIM ;
57+ }
58+ if (isCommentStart (intch )) {
59+ return CharType .COMMENT_START ;
60+ }
61+ if (isEncapsulator (intch )) {
62+ return CharType .ENCAP ;
63+ }
64+ if (isEscape (intch )) {
65+ return CharType .ESCAPE ;
66+ }
67+ if (intch == '\r' || intch == '\n' ) {
68+ return CharType .EOL ;
69+ }
70+ if (isWhitespace (intch )) { // Must be after EOL check
71+ return CharType .WHITESPACE ;
72+ }
73+ if (intch == ExtendedBufferedReader .END_OF_STREAM ) {
74+ return CharType .EOFCHAR ;
75+ }
76+ return CharType .OTHER ;
77+ }
78+
79+ /**
80+ * Parsing states
81+ */
82+ private static enum State {
83+ BEGIN , PLAIN , INQUOTE , QUOTEQUOTE , ESCAPE_PLAIN , ESCAPE_QUOTE ,
84+ }
85+
86+ /**
87+ * Returns the next token.
88+ * <p/>
89+ * A token corresponds to a term, a record change or an end-of-file indicator.
90+ *
91+ * @param tkn an existing Token object to reuse. The caller is responsible to initialize the Token.
92+ * @return the next token found
93+ * @throws java.io.IOException on stream access error
94+ */
95+ @ Override
96+ Token nextToken (Token tkn ) throws IOException {
97+
98+ State state = State .BEGIN ;
99+ int intch ;
100+ boolean trimTrailingSpaces = false ;
101+ while (tkn .type == INVALID ) {
102+ intch = in .read ();
103+ CharType type = classify (intch );
104+ switch (state ) {
105+ case BEGIN :
106+ switch (type ){
107+ case COMMENT_START :
108+ in .readLine ();
109+ tkn .type = COMMENT ;
110+ break ;
111+ case ENCAP :
112+ state = State .INQUOTE ;
113+ break ;
114+ case DELIM :
115+ tkn .type = TOKEN ;
116+ break ;
117+ case EOL :
118+ tkn .type = EORECORD ;
119+ break ;
120+ case EOFCHAR :
121+ tkn .type = EOF ;
122+ break ;
123+ case ESCAPE :
124+ state = State .ESCAPE_PLAIN ;
125+ break ;
126+ case OTHER :
127+ tkn .content .append ((char ) intch );
128+ state = State .PLAIN ;
129+ break ;
130+ case WHITESPACE :
131+ if (!surroundingSpacesIgnored ){
132+ tkn .content .append ((char ) intch );
133+ state = State .PLAIN ;
134+ }
135+ break ;
136+ }
137+ break ;
138+ case PLAIN :
139+ switch (type ){
140+ case DELIM :
141+ tkn .type = TOKEN ;
142+ break ;
143+ case EOL :
144+ tkn .type = EORECORD ;
145+ break ;
146+ case EOFCHAR :
147+ tkn .type = EOF ;
148+ break ;
149+ case ESCAPE :
150+ state = State .ESCAPE_PLAIN ;
151+ break ;
152+ default :
153+ trimTrailingSpaces = surroundingSpacesIgnored ; // we have a plain token
154+ tkn .content .append ((char ) intch );
155+ break ;
156+ }
157+ break ;
158+ case INQUOTE : // Started a quoted string
159+ switch (type ){
160+ case ENCAP :
161+ state = State .QUOTEQUOTE ;
162+ break ;
163+ case ESCAPE :
164+ state = State .ESCAPE_QUOTE ;
165+ break ;
166+ case EOFCHAR :
167+ throw new IOException ("(line " + getLineNumber () + ") unexpected EOF in quoted string" );
168+ default :
169+ tkn .content .append ((char ) intch );
170+ break ;
171+ }
172+ break ;
173+ case QUOTEQUOTE : // "..." seen, expecting end of token or "
174+ switch (type ){
175+ case DELIM :
176+ tkn .type = TOKEN ;
177+ break ;
178+ case EOL :
179+ tkn .type = EORECORD ;
180+ break ;
181+ case EOFCHAR :
182+ tkn .type = EOF ;
183+ break ;
184+ case ENCAP : // "..."" seen, append it
185+ tkn .content .append ((char ) intch );
186+ state = State .INQUOTE ;
187+ break ;
188+ case WHITESPACE : // trailing whitespace may be allowed
189+ if (!surroundingSpacesIgnored ) {
190+ // error invalid char between token and next delimiter
191+ throw new IOException ("(line " + getLineNumber () + ") invalid char between encapsulated token and delimiter" );
192+ }
193+ break ;
194+ // Everything else is invalid
195+ case ESCAPE :
196+ case OTHER :
197+ case COMMENT_START :
198+ // error invalid char between token and next delimiter
199+ throw new IOException ("(line " + getLineNumber () + ") invalid char between encapsulated token and delimiter" );
200+ }
201+ break ;
202+ case ESCAPE_PLAIN :
203+ switch (type ){
204+ case DELIM :
205+ case ESCAPE :
206+ case EOL :
207+ tkn .content .append ((char ) intch );
208+ state = State .PLAIN ;
209+ break ;
210+ case COMMENT_START : // TODO should comment be escaped?
211+ case ENCAP : // TODO is this correct?
212+ case OTHER : // TODO may need to escape further
213+ case WHITESPACE :
214+ tkn .content .append (escape );
215+ tkn .content .append ((char ) intch );
216+ break ;
217+ case EOFCHAR :
218+ throw new IOException ("(line " + getLineNumber () + ") unexpected EOF in escape sequence" );
219+ }
220+ break ;
221+ case ESCAPE_QUOTE :
222+ switch (type ){
223+ case ESCAPE :
224+ case ENCAP : // this is the only required escape
225+ tkn .content .append ((char ) intch );
226+ break ;
227+ case COMMENT_START :
228+ case DELIM :
229+ case EOL :
230+ case OTHER :
231+ case WHITESPACE :
232+ tkn .content .append (escape );
233+ tkn .content .append ((char ) intch );
234+ break ;
235+ case EOFCHAR :
236+ throw new IOException ("(line " + getLineNumber () + ") unexpected EOF in escape sequence" );
237+ }
238+ break ;
239+ default :
240+ break ;
241+ }
242+ }
243+ if (trimTrailingSpaces ) {
244+ trimTrailingSpaces (tkn .content );
245+ }
246+ return tkn ;
247+ }
248+ }
0 commit comments