Add support in Commons CSV for tracking byte positions during parsing… · apache/commons-csv@7dca281 · GitHub
Skip to content

Commit 7dca281

Browse files
authored
Add support in Commons CSV for tracking byte positions during parsing (#9)
Add support in Commons CSV for tracking byte positions during parsing
1 parent 20edd47 commit 7dca281

10 files changed

Lines changed: 315 additions & 2 deletions

File tree

pom.xml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
<url>https://commons.apache.org/proper/commons-csv/</url>
2929
<inceptionYear>2005</inceptionYear>
3030
<description>The Apache Commons CSV library provides a simple interface for reading and writing CSV files of various types.</description>
31+
<packaging>jar</packaging>
3132

3233
<dependencies>
3334
<dependency>
@@ -231,6 +232,8 @@
231232
<exclude>src/test/resources/org/apache/commons/csv/CSV-141/csv-141.csv</exclude>
232233
<exclude>src/test/resources/org/apache/commons/csv/csv-167/sample1.csv</exclude>
233234
<exclude>src/test/resources/org/apache/commons/csv/CSV-198/optd_por_public.csv</exclude>
235+
<exclude>src/test/resources/org/apache/commons/csv/CSV-196/emoji.csv</exclude>
236+
<exclude>src/test/resources/org/apache/commons/csv/CSV-196/japanese.csv</exclude>
234237
<exclude>src/test/resources/org/apache/commons/csv/CSV-213/999751170.patch.csv</exclude>
235238
<exclude>src/test/resources/org/apache/commons/csv/CSVFileParser/bom.csv</exclude>
236239
<exclude>src/test/resources/org/apache/commons/csv/CSVFileParser/test.csv</exclude>

src/main/java/org/apache/commons/csv/CSVFormat.java

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2097,6 +2097,30 @@ public CSVParser parse(final Reader reader) throws IOException {
20972097
return new CSVParser(reader, this);
20982098
}
20992099

2100+
/**
2101+
* Parses the specified content.
2102+
*
2103+
* <p>
2104+
* This method provides a way to parse CSV data from an input stream, starting at a specified character offset and record number,
2105+
* using a specified encoding. It returns a {@link CSVParser} that can be used to iterate over the parsed {@link CSVRecord}s.
2106+
* </p>
2107+
*
2108+
* <p>
2109+
* For additional parsing options, see the various static parse methods available on {@link CSVParser}.
2110+
* </p>
2111+
*
2112+
* @param reader the reader supplying the CSV input
2113+
* @param characterOffset the character offset to start parsing from
2114+
* @param recordNumber the initial record number to start counting from
2115+
* @param encoding the name of the charset used to compute byte positions; if {@code null}, byte positions are not tracked
2116+
* @return a parser over a stream of {@link CSVRecord}s.
2117+
* @throws IOException If an I/O error occurs
2118+
* @throws CSVException Thrown on invalid input.
2119+
*/
2120+
public CSVParser parse(final Reader reader, final long characterOffset, final long recordNumber, String encoding) throws IOException {
2121+
return new CSVParser(reader, this, characterOffset, recordNumber, encoding);
2122+
}
2123+
21002124
/**
21012125
* Prints to the specified output.
21022126
*

src/main/java/org/apache/commons/csv/CSVParser.java

Lines changed: 32 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -511,10 +511,39 @@ public CSVParser(final Reader reader, final CSVFormat format) throws IOException
511511
@SuppressWarnings("resource")
512512
public CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber)
513513
throws IOException {
514+
this(reader, format, characterOffset, recordNumber, null);
515+
}
516+
517+
/**
518+
* Constructs a new instance using the given {@link CSVFormat}.
519+
*
520+
* <p>
521+
* If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
522+
* unless you close the {@code reader}.
523+
* </p>
524+
*
525+
* @param reader
526+
* a Reader containing CSV-formatted input. Must not be null.
527+
* @param format
528+
* the CSVFormat used for CSV parsing. Must not be null.
529+
* @param characterOffset
530+
* Lexer offset when the parser does not start parsing at the beginning of the source.
531+
* @param recordNumber
532+
* The next record number to assign
533+
* @param encoding
534+
* The name of the charset used to compute byte positions; if {@code null}, byte positions are not tracked
535+
* @throws IllegalArgumentException
536+
* If the parameters of the format are inconsistent or if either the reader or format is null.
537+
* @throws IOException
538+
* If there is a problem reading the header or skipping the first record
539+
* @throws CSVException Thrown on invalid input.
540+
*/
541+
public CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber,
542+
String encoding) throws IOException {
514543
Objects.requireNonNull(reader, "reader");
515544
Objects.requireNonNull(format, "format");
516545
this.format = format.copy();
517-
this.lexer = new Lexer(format, new ExtendedBufferedReader(reader));
546+
this.lexer = new Lexer(format, new ExtendedBufferedReader(reader, encoding));
518547
this.csvRecordIterator = new CSVRecordIterator();
519548
this.headers = createHeaders();
520549
this.characterOffset = characterOffset;
@@ -841,6 +870,7 @@ CSVRecord nextRecord() throws IOException {
841870
recordList.clear();
842871
StringBuilder sb = null;
843872
final long startCharPosition = lexer.getCharacterPosition() + characterOffset;
873+
final long startCharByte = lexer.getBytesRead() + this.characterOffset;
844874
do {
845875
reusableToken.reset();
846876
lexer.nextToken(reusableToken);
@@ -878,7 +908,7 @@ CSVRecord nextRecord() throws IOException {
878908
recordNumber++;
879909
final String comment = Objects.toString(sb, null);
880910
result = new CSVRecord(this, recordList.toArray(Constants.EMPTY_STRING_ARRAY), comment,
881-
recordNumber, startCharPosition);
911+
recordNumber, startCharPosition, startCharByte);
882912
}
883913
return result;
884914
}

src/main/java/org/apache/commons/csv/CSVRecord.java

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,11 @@ public final class CSVRecord implements Serializable, Iterable<String> {
4848
*/
4949
private final long characterPosition;
5050

51+
/**
52+
* The position of the first byte of this record in the source stream.
53+
*/
54+
private final long characterByte;
55+
5156
/** The accumulated comments (if any) */
5257
private final String comment;
5358

@@ -67,8 +72,18 @@ public final class CSVRecord implements Serializable, Iterable<String> {
6772
this.parser = parser;
6873
this.comment = comment;
6974
this.characterPosition = characterPosition;
75+
this.characterByte = 0L;
7076
}
7177

78+
CSVRecord(final CSVParser parser, final String[] values, final String comment, final long recordNumber,
79+
final long characterPosition, final long characterByte) {
80+
this.recordNumber = recordNumber;
81+
this.values = values != null ? values : Constants.EMPTY_STRING_ARRAY;
82+
this.parser = parser;
83+
this.comment = comment;
84+
this.characterPosition = characterPosition;
85+
this.characterByte = characterByte;
86+
}
7287
/**
7388
* Returns a value by {@link Enum}.
7489
*
@@ -144,6 +159,15 @@ public long getCharacterPosition() {
144159
return characterPosition;
145160
}
146161

162+
/**
163+
* Returns the position of the first byte of this record in the source stream.
164+
*
165+
* @return the position of the first byte of this record in the source stream.
166+
*/
167+
public long getCharacterByte() {
168+
return characterByte;
169+
}
170+
147171
/**
148172
* Returns the comment for this record, if any.
149173
* Note that comments are attached to the following record.

src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,10 @@
2424

2525
import java.io.IOException;
2626
import java.io.Reader;
27+
import java.nio.CharBuffer;
28+
import java.nio.charset.CharacterCodingException;
29+
import java.nio.charset.Charset;
30+
import java.nio.charset.CharsetEncoder;
2731

2832
import org.apache.commons.io.IOUtils;
2933
import org.apache.commons.io.input.UnsynchronizedBufferedReader;
@@ -49,13 +53,27 @@ final class ExtendedBufferedReader extends UnsynchronizedBufferedReader {
4953
private long position;
5054
private long positionMark;
5155

56+
/** The number of bytes read so far */
57+
private long bytesRead;
58+
private long bytesReadMark;
59+
60+
/** Encoder used to calculate the bytes of characters */
61+
CharsetEncoder encoder;
62+
5263
/**
5364
* Constructs a new instance using the default buffer size.
5465
*/
5566
ExtendedBufferedReader(final Reader reader) {
5667
super(reader);
5768
}
5869

70+
ExtendedBufferedReader(final Reader reader, String encoding) {
71+
super(reader);
72+
if (encoding != null) {
73+
encoder = Charset.forName(encoding).newEncoder();
74+
}
75+
}
76+
5977
/**
6078
* Closes the stream.
6179
*
@@ -108,6 +126,7 @@ public void mark(final int readAheadLimit) throws IOException {
108126
lineNumberMark = lineNumber;
109127
lastCharMark = lastChar;
110128
positionMark = position;
129+
bytesReadMark = bytesRead;
111130
super.mark(readAheadLimit);
112131
}
113132

@@ -118,11 +137,43 @@ public int read() throws IOException {
118137
current == EOF && lastChar != CR && lastChar != LF && lastChar != EOF) {
119138
lineNumber++;
120139
}
140+
if (encoder != null) {
141+
this.bytesRead += getCharBytes(current);
142+
}
121143
lastChar = current;
122144
position++;
123145
return lastChar;
124146
}
125147

148+
/**
149+
* In Java, the char data type is based on the original Unicode
150+
* specification, which defined characters as fixed-width 16-bit entities.
151+
* U+0000 to U+FFFF:
152+
* - BMP, represented using 1 16-bit char
153+
* - Consists of UTF-8 1-byte, 2-byte, some 3-byte chars
154+
* U+10000 to U+10FFFF:
155+
* - Supplementary characters, represented as a pair of characters,
156+
* the first char from the high-surrogates range (\uD800-\uDBFF),
157+
* and the second char from the low-surrogates range (\uDC00-\uDFFF).
158+
* - Consists of UTF-8 some 3-byte chars and 4-byte chars
159+
*/
160+
private long getCharBytes(int current) throws CharacterCodingException {
161+
char cChar = (char) current;
162+
char lChar = (char) lastChar;
163+
if (!Character.isSurrogate(cChar)) {
164+
return encoder.encode(
165+
CharBuffer.wrap(new char[] {cChar})).limit();
166+
} else {
167+
if (Character.isHighSurrogate(cChar)) {
168+
// Move on to the next char (low surrogate)
169+
return 0;
170+
} else if (Character.isSurrogatePair(lChar, cChar)) {
171+
return encoder.encode(
172+
CharBuffer.wrap(new char[] {lChar, cChar})).limit();
173+
} else throw new CharacterCodingException();
174+
}
175+
}
176+
126177
@Override
127178
public int read(final char[] buf, final int offset, final int length) throws IOException {
128179
if (length == 0) {
@@ -187,7 +238,17 @@ public void reset() throws IOException {
187238
lineNumber = lineNumberMark;
188239
lastChar = lastCharMark;
189240
position = positionMark;
241+
bytesRead = bytesReadMark;
190242
super.reset();
191243
}
192244

245+
/**
246+
* Gets the number of bytes read by the reader.
247+
*
248+
* @return the number of bytes read by the reader
249+
*/
250+
long getBytesRead() {
251+
return this.bytesRead;
252+
}
253+
193254
}

src/main/java/org/apache/commons/csv/Lexer.java

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,15 @@ long getCharacterPosition() {
103103
return reader.getPosition();
104104
}
105105

106+
/**
107+
* Returns the number of bytes read
108+
*
109+
* @return the number of bytes read
110+
*/
111+
long getBytesRead() {
112+
return reader.getBytesRead();
113+
}
114+
106115
/**
107116
* Returns the current line number
108117
*

src/test/java/org/apache/commons/csv/CSVParserTest.java

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -701,6 +701,84 @@ public void testGetHeaderComment_NoComment3() throws IOException {
701701
}
702702
}
703703

704+
@Test
705+
public void testGetRecordThreeBytesRead() throws Exception {
706+
String code = "id,date,val5,val4\n" +
707+
"11111111111111,'4017-09-01',きちんと節分近くには咲いてる~,v4\n" +
708+
"22222222222222,'4017-01-01',おはよう私の友人~,v4\n" +
709+
"33333333333333,'4017-01-01',きる自然の力ってすごいな~,v4\n";
710+
// String code = "'1',4";
711+
// final CSVFormat format = CSVFormat.newFormat(',').withQuote('\'');
712+
final CSVFormat format = CSVFormat.Builder.create()
713+
.setDelimiter(',')
714+
.setQuote('\'')
715+
.build();
716+
// CSVParser parser = new CSVParser(new StringReader(code), format, 0L, 1L, "UTF-8");
717+
CSVParser parser = format.parse(new StringReader(code), 0L, 1L, "UTF-8");
718+
719+
CSVRecord record = new CSVRecord(parser, null, null, 1L, 0L, 0L);
720+
assertEquals(0, parser.getRecordNumber());
721+
assertNotNull(record = parser.nextRecord());
722+
assertEquals(1, record.getRecordNumber());
723+
assertEquals(code.indexOf('i'), record.getCharacterPosition());
724+
assertEquals(record.getCharacterByte(), record.getCharacterPosition());
725+
726+
assertNotNull(record = parser.nextRecord());
727+
assertEquals(2, record.getRecordNumber());
728+
assertEquals(code.indexOf('1'), record.getCharacterPosition());
729+
assertEquals(record.getCharacterByte(), record.getCharacterPosition());
730+
731+
assertNotNull(record = parser.nextRecord());
732+
assertEquals(3, record.getRecordNumber());
733+
assertEquals(code.indexOf('2'), record.getCharacterPosition());
734+
assertEquals(record.getCharacterByte(), 95);
735+
736+
assertNotNull(record = parser.nextRecord());
737+
assertEquals(4, record.getRecordNumber());
738+
assertEquals(code.indexOf('3'), record.getCharacterPosition());
739+
assertEquals(record.getCharacterByte(), 154);
740+
741+
parser.close();
742+
743+
}
744+
745+
@Test
746+
public void testGetRecordFourBytesRead() throws Exception {
747+
String code = "id,a,b,c\n" +
748+
"1,😊,🤔,😂\n" +
749+
"2,😊,🤔,😂\n" +
750+
"3,😊,🤔,😂\n";
751+
// final CSVFormat format = CSVFormat.newFormat(',').withQuote('\'');
752+
final CSVFormat format = CSVFormat.Builder.create()
753+
.setDelimiter(',')
754+
.setQuote('\'')
755+
.build();
756+
757+
// CSVParser parser = new CSVParser(new StringReader(code), format, 0L, 1L, "UTF-8");
758+
CSVParser parser = format.parse(new StringReader(code), 0L, 1L, "UTF-8");
759+
760+
CSVRecord record;
761+
assertEquals(0, parser.getRecordNumber());
762+
assertNotNull(record = parser.nextRecord());
763+
assertEquals(1, record.getRecordNumber());
764+
assertEquals(code.indexOf('i'), record.getCharacterPosition());
765+
assertEquals(record.getCharacterByte(), record.getCharacterPosition());
766+
767+
assertNotNull(record = parser.nextRecord());
768+
assertEquals(2, record.getRecordNumber());
769+
assertEquals(code.indexOf('1'), record.getCharacterPosition());
770+
assertEquals(record.getCharacterByte(), record.getCharacterPosition());
771+
assertNotNull(record = parser.nextRecord());
772+
assertEquals(3, record.getRecordNumber());
773+
assertEquals(code.indexOf('2'), record.getCharacterPosition());
774+
assertEquals(record.getCharacterByte(), 26);
775+
assertNotNull(record = parser.nextRecord());
776+
assertEquals(4, record.getRecordNumber());
777+
assertEquals(code.indexOf('3'), record.getCharacterPosition());
778+
assertEquals(record.getCharacterByte(), 43);
779+
parser.close();
780+
}
781+
704782
@Test
705783
public void testGetHeaderMap() throws Exception {
706784
try (final CSVParser parser = CSVParser.parse("a,b,c\n1,2,3\nx,y,z", CSVFormat.DEFAULT.withHeader("A", "B", "C"))) {

0 commit comments

Comments
 (0)