1717
1818package org .apache .commons .csv ;
1919
20+ import static org .apache .commons .csv .Constants .BACKSPACE ;
21+ import static org .apache .commons .csv .Constants .CR ;
22+ import static org .apache .commons .csv .Constants .END_OF_STREAM ;
23+ import static org .apache .commons .csv .Constants .FF ;
24+ import static org .apache .commons .csv .Constants .LF ;
25+ import static org .apache .commons .csv .Constants .TAB ;
26+ import static org .apache .commons .csv .Constants .UNDEFINED ;
2027import static org .apache .commons .csv .Token .Type .COMMENT ;
2128import static org .apache .commons .csv .Token .Type .EOF ;
2229import static org .apache .commons .csv .Token .Type .EORECORD ;
3037 *
3138 * @version $Id$
3239 */
33- final class CSVLexer extends Lexer {
40+ final class CSVLexer {
41+
42+ /**
43+ * Constant char to use for disabling comments, escapes and encapsulation. The value -2 is used because it
44+ * won't be confused with an EOF signal (-1), and because the Unicode value {@code FFFE} would be encoded as two
45+ * chars (using surrogates) and thus there should never be a collision with a real text char.
46+ */
47+ private static final char DISABLED = '\ufffe' ;
48+
49+ private final char delimiter ;
50+ private final char escape ;
51+ private final char quoteChar ;
52+ private final char commmentStart ;
53+
54+ final boolean ignoreSurroundingSpaces ;
55+ final boolean ignoreEmptyLines ;
56+
57+ final CSVFormat format ;
58+
59+ /** The input stream */
60+ final ExtendedBufferedReader in ;
3461
/**
 * INTERNAL API. Creates a lexer that reads from {@code in} and is configured from {@code format}.
 * <p>
 * The delimiter and the two ignore-flags are copied from the format as-is. The escape, quote and
 * comment-start characters are passed through {@link #mapNullToDisabled(Character)} so that a
 * {@code null} setting is stored as the {@code DISABLED} sentinel.
 * <p>
 * NOTE(review): an earlier comment stated that this constructor must be public so that the
 * PerformanceTest class can call it dynamically, but it is declared package-private - confirm
 * which is intended.
 *
 * @param format the CSV format supplying delimiter, escape, quote, comment and whitespace settings
 * @param in     the reader the lexer consumes characters from
 */
CSVLexer(final CSVFormat format, final ExtendedBufferedReader in) {
    // No super(format, in) call: the class no longer extends Lexer and holds its own state.
    this.format = format;
    this.in = in;
    this.delimiter = format.getDelimiter();
    this.escape = mapNullToDisabled(format.getEscape());
    this.quoteChar = mapNullToDisabled(format.getQuoteChar());
    this.commmentStart = mapNullToDisabled(format.getCommentStart());
    this.ignoreSurroundingSpaces = format.getIgnoreSurroundingSpaces();
    this.ignoreEmptyLines = format.getIgnoreEmptyLines();
}
3973
4074 /**
@@ -48,7 +82,6 @@ final class CSVLexer extends Lexer {
4882 * @throws java.io.IOException
4983 * on stream access error
5084 */
51- @ Override
5285 Token nextToken (final Token token ) throws IOException {
5386
5487 // get the last read char (required for empty line detection)
@@ -257,4 +290,144 @@ private Token parseEncapsulatedToken(final Token token) throws IOException {
257290 }
258291 }
259292
293+ private final char mapNullToDisabled (final Character c ) {
294+ return c == null ? DISABLED : c .charValue ();
295+ }
296+
297+ /**
298+ * Returns the current line number
299+ *
300+ * @return the current line number
301+ */
302+ long getCurrentLineNumber () {
303+ return in .getCurrentLineNumber ();
304+ }
305+
306+ // TODO escape handling needs more work
307+ /**
308+ * Handle an escape sequence.
309+ * The current character must be the escape character.
310+ * On return, the next character is available by calling {@link ExtendedBufferedReader#getLastChar()}
311+ * on the input stream.
312+ *
313+ * @return the unescaped character (as an int) or {@link END_OF_STREAM} if char following the escape is invalid.
314+ * @throws IOException if there is a problem reading the stream or the end of stream is detected:
315+ * the escape character is not allowed at end of strem
316+ */
317+ int readEscape () throws IOException {
318+ // the escape char has just been read (normally a backslash)
319+ final int ch = in .read ();
320+ switch (ch ) {
321+ case 'r' :
322+ return CR ;
323+ case 'n' :
324+ return LF ;
325+ case 't' :
326+ return TAB ;
327+ case 'b' :
328+ return BACKSPACE ;
329+ case 'f' :
330+ return FF ;
331+ case CR :
332+ case LF :
333+ case FF : // TODO is this correct?
334+ case TAB : // TODO is this correct? Do tabs need to be escaped?
335+ case BACKSPACE : // TODO is this correct?
336+ return ch ;
337+ case END_OF_STREAM :
338+ throw new IOException ("EOF whilst processing escape sequence" );
339+ default :
340+ // Now check for meta-characters
341+ if (isMetaChar (ch )) {
342+ return ch ;
343+ }
344+ // indicate unexpected char - available from in.getLastChar()
345+ return END_OF_STREAM ;
346+ }
347+ }
348+
349+ void trimTrailingSpaces (final StringBuilder buffer ) {
350+ int length = buffer .length ();
351+ while (length > 0 && Character .isWhitespace (buffer .charAt (length - 1 ))) {
352+ length = length - 1 ;
353+ }
354+ if (length != buffer .length ()) {
355+ buffer .setLength (length );
356+ }
357+ }
358+
359+ /**
360+ * Greedily accepts \n, \r and \r\n This checker consumes silently the second control-character...
361+ *
362+ * @return true if the given or next character is a line-terminator
363+ */
364+ boolean readEndOfLine (int ch ) throws IOException {
365+ // check if we have \r\n...
366+ if (ch == CR && in .lookAhead () == LF ) {
367+ // note: does not change ch outside of this method!
368+ ch = in .read ();
369+ }
370+ return ch == LF || ch == CR ;
371+ }
372+
373+ boolean isClosed () {
374+ return in .isClosed ();
375+ }
376+
377+ /**
378+ * @return true if the given char is a whitespace character
379+ */
380+ boolean isWhitespace (final int ch ) {
381+ return !isDelimiter (ch ) && Character .isWhitespace ((char ) ch );
382+ }
383+
384+ /**
385+ * Checks if the current character represents the start of a line: a CR, LF or is at the start of the file.
386+ *
387+ * @param ch the character to check
388+ * @return true if the character is at the start of a line.
389+ */
390+ boolean isStartOfLine (final int ch ) {
391+ return ch == LF || ch == CR || ch == UNDEFINED ;
392+ }
393+
394+ /**
395+ * @return true if the given character indicates end of file
396+ */
397+ boolean isEndOfFile (final int ch ) {
398+ return ch == END_OF_STREAM ;
399+ }
400+
401+ boolean isDelimiter (final int ch ) {
402+ return ch == delimiter ;
403+ }
404+
405+ boolean isEscape (final int ch ) {
406+ return ch == escape ;
407+ }
408+
409+ boolean isQuoteChar (final int ch ) {
410+ return ch == quoteChar ;
411+ }
412+
413+ boolean isCommentStart (final int ch ) {
414+ return ch == commmentStart ;
415+ }
416+
417+ private boolean isMetaChar (final int ch ) {
418+ return ch == delimiter ||
419+ ch == escape ||
420+ ch == quoteChar ||
421+ ch == commmentStart ;
422+ }
423+
424+ /**
425+ * Closes resources.
426+ *
427+ * @throws IOException
428+ * If an I/O error occurs
429+ */
430+ void close () throws IOException {
431+ in .close ();
432+ }
260433}
0 commit comments