Skip to content

Commit 7b168eb

Browse files
committed
Merge Lexer with CSVLexer
git-svn-id: https://svn.apache.org/repos/asf/commons/proper/csv/trunk@1511006 13f79535-47bb-0310-9956-ffa450edef68
1 parent 7755640 commit 7b168eb

5 files changed

Lines changed: 207 additions & 245 deletions

File tree

src/main/java/org/apache/commons/csv/CSVLexer.java

Lines changed: 176 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,13 @@
1717

1818
package org.apache.commons.csv;
1919

20+
import static org.apache.commons.csv.Constants.BACKSPACE;
21+
import static org.apache.commons.csv.Constants.CR;
22+
import static org.apache.commons.csv.Constants.END_OF_STREAM;
23+
import static org.apache.commons.csv.Constants.FF;
24+
import static org.apache.commons.csv.Constants.LF;
25+
import static org.apache.commons.csv.Constants.TAB;
26+
import static org.apache.commons.csv.Constants.UNDEFINED;
2027
import static org.apache.commons.csv.Token.Type.COMMENT;
2128
import static org.apache.commons.csv.Token.Type.EOF;
2229
import static org.apache.commons.csv.Token.Type.EORECORD;
@@ -30,11 +37,38 @@
3037
*
3138
* @version $Id$
3239
*/
33-
final class CSVLexer extends Lexer {
40+
final class CSVLexer {
41+
42+
/**
43+
* Constant char to use for disabling comments, escapes and encapsulation. The value -2 (which, as a char, is
44+
* {@code U+FFFE}) is used because it won't be confused with an EOF signal (-1), and because {@code U+FFFE} is a
45+
* Unicode noncharacter that should never occur in valid text, so there should never be a collision with a real text char.
46+
*/
47+
private static final char DISABLED = '\ufffe';
48+
49+
private final char delimiter;
50+
private final char escape;
51+
private final char quoteChar;
52+
private final char commmentStart;
53+
54+
final boolean ignoreSurroundingSpaces;
55+
final boolean ignoreEmptyLines;
56+
57+
final CSVFormat format;
58+
59+
/** The input stream */
60+
final ExtendedBufferedReader in;
3461

3562
/**
 * INTERNAL API. Stores the format and the input reader, and caches the format's delimiter, escape, quote and
 * comment-start characters together with its ignore-surrounding-spaces and ignore-empty-lines flags.
 * NOTE(review): the earlier comment said the ctor needs to be public so that it can be called dynamically by the
 * PerformanceTest class, but as shown here it has no access modifier (package-private) - confirm.
 *
 * @param format the CSV format whose settings are read once here
 * @param in the reader the lexer keeps as its input
 */
3663
CSVLexer(final CSVFormat format, final ExtendedBufferedReader in) {
37-
super(format, in);
64+
this.format = format;
65+
this.in = in;
66+
this.delimiter = format.getDelimiter();
67+
// escape, quote and comment-start are Character values; null is replaced by the DISABLED sentinel char
this.escape = mapNullToDisabled(format.getEscape());
68+
this.quoteChar = mapNullToDisabled(format.getQuoteChar());
69+
this.commmentStart = mapNullToDisabled(format.getCommentStart());
70+
this.ignoreSurroundingSpaces = format.getIgnoreSurroundingSpaces();
71+
this.ignoreEmptyLines = format.getIgnoreEmptyLines();
3872
}
3973

4074
/**
@@ -48,7 +82,6 @@ final class CSVLexer extends Lexer {
4882
* @throws java.io.IOException
4983
* on stream access error
5084
*/
51-
@Override
5285
Token nextToken(final Token token) throws IOException {
5386

5487
// get the last read char (required for empty line detection)
@@ -257,4 +290,144 @@ private Token parseEncapsulatedToken(final Token token) throws IOException {
257290
}
258291
}
259292

293+
private final char mapNullToDisabled(final Character c) {
294+
return c == null ? DISABLED : c.charValue();
295+
}
296+
297+
/**
298+
* Returns the current line number
299+
*
300+
* @return the current line number
301+
*/
302+
long getCurrentLineNumber() {
303+
return in.getCurrentLineNumber();
304+
}
305+
306+
// TODO escape handling needs more work
307+
/**
308+
* Handle an escape sequence.
309+
* The current character must be the escape character.
310+
* On return, the next character is available by calling {@link ExtendedBufferedReader#getLastChar()}
311+
* on the input stream.
312+
*
313+
* @return the unescaped character (as an int) or {@link END_OF_STREAM} if char following the escape is invalid.
314+
* @throws IOException if there is a problem reading the stream or the end of stream is detected:
315+
* the escape character is not allowed at end of strem
316+
*/
317+
int readEscape() throws IOException {
318+
// the escape char has just been read (normally a backslash)
319+
final int ch = in.read();
320+
switch (ch) {
321+
case 'r':
322+
return CR;
323+
case 'n':
324+
return LF;
325+
case 't':
326+
return TAB;
327+
case 'b':
328+
return BACKSPACE;
329+
case 'f':
330+
return FF;
331+
case CR:
332+
case LF:
333+
case FF: // TODO is this correct?
334+
case TAB: // TODO is this correct? Do tabs need to be escaped?
335+
case BACKSPACE: // TODO is this correct?
336+
return ch;
337+
case END_OF_STREAM:
338+
throw new IOException("EOF whilst processing escape sequence");
339+
default:
340+
// Now check for meta-characters
341+
if (isMetaChar(ch)) {
342+
return ch;
343+
}
344+
// indicate unexpected char - available from in.getLastChar()
345+
return END_OF_STREAM;
346+
}
347+
}
348+
349+
void trimTrailingSpaces(final StringBuilder buffer) {
350+
int length = buffer.length();
351+
while (length > 0 && Character.isWhitespace(buffer.charAt(length - 1))) {
352+
length = length - 1;
353+
}
354+
if (length != buffer.length()) {
355+
buffer.setLength(length);
356+
}
357+
}
358+
359+
/**
360+
* Greedily accepts \n, \r and \r\n This checker consumes silently the second control-character...
361+
*
362+
* @return true if the given or next character is a line-terminator
363+
*/
364+
boolean readEndOfLine(int ch) throws IOException {
365+
// check if we have \r\n...
366+
if (ch == CR && in.lookAhead() == LF) {
367+
// note: does not change ch outside of this method!
368+
ch = in.read();
369+
}
370+
return ch == LF || ch == CR;
371+
}
372+
373+
boolean isClosed() {
374+
return in.isClosed();
375+
}
376+
377+
/**
378+
* @return true if the given char is a whitespace character
379+
*/
380+
boolean isWhitespace(final int ch) {
381+
return !isDelimiter(ch) && Character.isWhitespace((char) ch);
382+
}
383+
384+
/**
385+
* Checks if the current character represents the start of a line: a CR, LF or is at the start of the file.
386+
*
387+
* @param ch the character to check
388+
* @return true if the character is at the start of a line.
389+
*/
390+
boolean isStartOfLine(final int ch) {
391+
return ch == LF || ch == CR || ch == UNDEFINED;
392+
}
393+
394+
/**
395+
* @return true if the given character indicates end of file
396+
*/
397+
boolean isEndOfFile(final int ch) {
398+
return ch == END_OF_STREAM;
399+
}
400+
401+
boolean isDelimiter(final int ch) {
402+
return ch == delimiter;
403+
}
404+
405+
boolean isEscape(final int ch) {
406+
return ch == escape;
407+
}
408+
409+
boolean isQuoteChar(final int ch) {
410+
return ch == quoteChar;
411+
}
412+
413+
boolean isCommentStart(final int ch) {
414+
return ch == commmentStart;
415+
}
416+
417+
private boolean isMetaChar(final int ch) {
418+
return ch == delimiter ||
419+
ch == escape ||
420+
ch == quoteChar ||
421+
ch == commmentStart;
422+
}
423+
424+
/**
425+
* Closes resources.
426+
*
427+
* @throws IOException
428+
* If an I/O error occurs
429+
*/
430+
void close() throws IOException {
431+
in.close();
432+
}
260433
}

src/main/java/org/apache/commons/csv/CSVParser.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -217,7 +217,7 @@ public static CSVParser parseURL(URL url, Charset charset, final CSVFormat forma
217217
private final CSVFormat format;
218218
private final Map<String, Integer> headerMap;
219219

220-
private final Lexer lexer;
220+
private final CSVLexer lexer;
221221

222222
/** A record buffer for getRecord(). Grows as necessary and is reused. */
223223
private final List<String> record = new ArrayList<String>();

0 commit comments

Comments
 (0)