Skip to content

Commit 526ecc2

Browse files
authored
Merge pull request #6 from DarrenJAN/apply-fix-on-release
Add support in Commons CSV for tracking byte positions during parsing
2 parents 74f0970 + 61087a6 commit 526ecc2

10 files changed

Lines changed: 315 additions & 2 deletions

File tree

pom.xml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
<url>https://commons.apache.org/proper/commons-csv/</url>
2929
<inceptionYear>2005</inceptionYear>
3030
<description>The Apache Commons CSV library provides a simple interface for reading and writing CSV files of various types.</description>
31+
<packaging>jar</packaging>
3132

3233
<dependencies>
3334
<dependency>
@@ -231,6 +232,8 @@
231232
<exclude>src/test/resources/org/apache/commons/csv/CSV-141/csv-141.csv</exclude>
232233
<exclude>src/test/resources/org/apache/commons/csv/csv-167/sample1.csv</exclude>
233234
<exclude>src/test/resources/org/apache/commons/csv/CSV-198/optd_por_public.csv</exclude>
235+
<exclude>src/test/resources/org/apache/commons/csv/CSV-196/emoji.csv</exclude>
236+
<exclude>src/test/resources/org/apache/commons/csv/CSV-196/japanese.csv</exclude>
234237
<exclude>src/test/resources/org/apache/commons/csv/CSV-213/999751170.patch.csv</exclude>
235238
<exclude>src/test/resources/org/apache/commons/csv/CSVFileParser/bom.csv</exclude>
236239
<exclude>src/test/resources/org/apache/commons/csv/CSVFileParser/test.csv</exclude>

src/main/java/org/apache/commons/csv/CSVFormat.java

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2074,6 +2074,30 @@ public CSVParser parse(final Reader reader) throws IOException {
20742074
return new CSVParser(reader, this);
20752075
}
20762076

2077+
/**
2078+
* Parses the specified content.
2079+
*
2080+
* <p>
2081+
* This method provides a way to parse CSV data from an input stream, starting at a specified character offset and record number,
2082+
* using a specified encoding. It returns a {@link CSVParser} that can be used to iterate over the parsed {@link CSVRecord}s.
2083+
* </p>
2084+
*
2085+
* <p>
2086+
* For additional parsing options, see the various static parse methods available on {@link CSVParser}.
2087+
* </p>
2088+
*
2089+
* @param reader the input stream
2090+
* @param characterOffset the character offset to start parsing from
2091+
* @param recordNumber the initial record number to start counting from
2092+
* @param encoding the character encoding of the input stream
2093+
* @return a parser over a stream of {@link CSVRecord}s.
2094+
* @throws IOException If an I/O error occurs
2095+
* @throws CSVException Thrown on invalid input.
2096+
*/
2097+
public CSVParser parse(final Reader reader, final long characterOffset, final long recordNumber, String encoding) throws IOException {
2098+
return new CSVParser(reader, this, characterOffset, recordNumber, encoding);
2099+
}
2100+
20772101
/**
20782102
* Prints to the specified output.
20792103
*

src/main/java/org/apache/commons/csv/CSVParser.java

Lines changed: 32 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -438,10 +438,39 @@ public CSVParser(final Reader reader, final CSVFormat format) throws IOException
438438
@SuppressWarnings("resource")
public CSVParser(
        final Reader reader,
        final CSVFormat format,
        final long characterOffset,
        final long recordNumber) throws IOException {
    // Delegates to the five-argument constructor, passing null as the encoding.
    this(reader, format, characterOffset, recordNumber, null);
}
443+
444+
/**
445+
* Constructs a new instance using the given {@link CSVFormat}.
446+
*
447+
* <p>
448+
* If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
449+
* unless you close the {@code reader}.
450+
* </p>
451+
*
452+
* @param reader
453+
* a Reader containing CSV-formatted input. Must not be null.
454+
* @param format
455+
* the CSVFormat used for CSV parsing. Must not be null.
456+
* @param characterOffset
457+
* Lexer offset when the parser does not start parsing at the beginning of the source.
458+
* @param recordNumber
459+
* The next record number to assign
460+
* @param encoding
461+
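* The name of the charset used to count the bytes occupied by the characters read; may be {@code null}, in which case no bytes are counted.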
462+
* @throws IllegalArgumentException
463+
* If the parameters of the format are inconsistent or if either the reader or format is null.
464+
* @throws IOException
465+
* If there is a problem reading the header or skipping the first record
466+
* @throws CSVException Thrown on invalid input.
467+
*/
468+
public CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber,
469+
String encoding) throws IOException {
441470
Objects.requireNonNull(reader, "reader");
442471
Objects.requireNonNull(format, "format");
443472
this.format = format.copy();
444-
this.lexer = new Lexer(format, new ExtendedBufferedReader(reader));
473+
this.lexer = new Lexer(format, new ExtendedBufferedReader(reader, encoding));
445474
this.csvRecordIterator = new CSVRecordIterator();
446475
this.headers = createHeaders();
447476
this.characterOffset = characterOffset;
@@ -768,6 +797,7 @@ CSVRecord nextRecord() throws IOException {
768797
recordList.clear();
769798
StringBuilder sb = null;
770799
final long startCharPosition = lexer.getCharacterPosition() + characterOffset;
800+
final long startCharByte = lexer.getBytesRead() + this.characterOffset;
771801
do {
772802
reusableToken.reset();
773803
lexer.nextToken(reusableToken);
@@ -805,7 +835,7 @@ CSVRecord nextRecord() throws IOException {
805835
recordNumber++;
806836
final String comment = Objects.toString(sb, null);
807837
result = new CSVRecord(this, recordList.toArray(Constants.EMPTY_STRING_ARRAY), comment,
808-
recordNumber, startCharPosition);
838+
recordNumber, startCharPosition, startCharByte);
809839
}
810840
return result;
811841
}

src/main/java/org/apache/commons/csv/CSVRecord.java

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,11 @@ public final class CSVRecord implements Serializable, Iterable<String> {
4848
*/
4949
private final long characterPosition;
5050

51+
/**
52+
* The byte position in the source stream at which this record starts.
53+
*/
54+
private final long characterByte;
55+
5156
/** The accumulated comments (if any) */
5257
private final String comment;
5358

@@ -67,8 +72,18 @@ public final class CSVRecord implements Serializable, Iterable<String> {
6772
this.parser = parser;
6873
this.comment = comment;
6974
this.characterPosition = characterPosition;
75+
this.characterByte = 0L;
7076
}
7177

78+
/**
 * Creates a record that carries both its starting character position and its starting byte position.
 *
 * @param parser the parser that produced this record
 * @param values the field values; {@code null} is replaced by an empty array
 * @param comment the comment attached to this record, if any
 * @param recordNumber the number of this record
 * @param characterPosition the character position at which this record starts
 * @param characterByte the byte position at which this record starts
 */
CSVRecord(final CSVParser parser, final String[] values, final String comment, final long recordNumber,
        final long characterPosition, final long characterByte) {
    this.parser = parser;
    this.comment = comment;
    this.recordNumber = recordNumber;
    this.characterPosition = characterPosition;
    this.characterByte = characterByte;
    // Never store null so that accessors can index the array unconditionally.
    this.values = values == null ? Constants.EMPTY_STRING_ARRAY : values;
}
7287
/**
7388
* Returns a value by {@link Enum}.
7489
*
@@ -144,6 +159,15 @@ public long getCharacterPosition() {
144159
return characterPosition;
145160
}
146161

162+
/**
163+
* Returns the start byte of this record as a character byte in the source stream.
164+
*
165+
* @return the start byte of this record as a character byte in the source stream.
166+
*/
167+
public long getCharacterByte() {
168+
return characterByte;
169+
}
170+
147171
/**
148172
* Returns the comment for this record, if any.
149173
* Note that comments are attached to the following record.

src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,10 @@
2424

2525
import java.io.IOException;
2626
import java.io.Reader;
27+
import java.nio.CharBuffer;
28+
import java.nio.charset.CharacterCodingException;
29+
import java.nio.charset.Charset;
30+
import java.nio.charset.CharsetEncoder;
2731

2832
import org.apache.commons.io.IOUtils;
2933
import org.apache.commons.io.input.UnsynchronizedBufferedReader;
@@ -49,13 +53,27 @@ final class ExtendedBufferedReader extends UnsynchronizedBufferedReader {
4953
private long position;
5054
private long positionMark;
5155

56+
/** The number of bytes read so far */
57+
private long bytesRead;
58+
private long bytesReadMark;
59+
60+
/** Encoder used to calculate the bytes of characters */
61+
CharsetEncoder encoder;
62+
5263
/**
 * Creates a reader over {@code reader} with the default buffer size.
 *
 * <p>
 * No encoder is set by this constructor, so {@code read()} does not add to the byte count.
 * </p>
 *
 * @param reader the underlying reader to wrap
 */
ExtendedBufferedReader(final Reader reader) {
    super(reader);
}
5869

70+
/**
 * Creates a reader over {@code reader} with the default buffer size and, when an encoding name is given,
 * an encoder for that charset so that bytes can be counted as characters are read.
 *
 * @param reader the underlying reader to wrap
 * @param encoding the charset name to create the encoder from, or {@code null} to leave the encoder unset
 */
ExtendedBufferedReader(final Reader reader, String encoding) {
    super(reader);
    // A null encoder is what read() checks to decide whether to count bytes.
    encoder = encoding == null ? null : Charset.forName(encoding).newEncoder();
}
76+
5977
/**
6078
* Closes the stream.
6179
*
@@ -108,6 +126,7 @@ public void mark(final int readAheadLimit) throws IOException {
108126
lineNumberMark = lineNumber;
109127
lastCharMark = lastChar;
110128
positionMark = position;
129+
bytesReadMark = bytesRead;
111130
super.mark(readAheadLimit);
112131
}
113132

@@ -118,11 +137,43 @@ public int read() throws IOException {
118137
current == EOF && lastChar != CR && lastChar != LF && lastChar != EOF) {
119138
lineNumber++;
120139
}
140+
if (encoder != null) {
141+
this.bytesRead += getCharBytes(current);
142+
}
121143
lastChar = current;
122144
position++;
123145
return lastChar;
124146
}
125147

148+
/**
149+
* In Java, a char data type are based on the original Unicode
150+
* specification, which defined characters as fixed-width 16-bit entities.
151+
* U+0000 to U+FFFF:
152+
* - BMP, represented using 1 16-bit char
153+
* - Consists of UTF-8 1-byte, 2-byte, some 3-byte chars
154+
* U+10000 to U+10FFFF:
155+
* - Supplementary characters, represented as a pair of characters,
156+
* the first char from the high-surrogates range (\uD800-\uDBFF),
157+
* and the second char from the low-surrogates range (uDC00-\uDFFF).
158+
* - Consists of UTF-8 some 3-byte chars and 4-byte chars
159+
*/
160+
private long getCharBytes(int current) throws CharacterCodingException {
161+
char cChar = (char) current;
162+
char lChar = (char) lastChar;
163+
if (!Character.isSurrogate(cChar)) {
164+
return encoder.encode(
165+
CharBuffer.wrap(new char[] {cChar})).limit();
166+
} else {
167+
if (Character.isHighSurrogate(cChar)) {
168+
// Move on to the next char (low surrogate)
169+
return 0;
170+
} else if (Character.isSurrogatePair(lChar, cChar)) {
171+
return encoder.encode(
172+
CharBuffer.wrap(new char[] {lChar, cChar})).limit();
173+
} else throw new CharacterCodingException();
174+
}
175+
}
176+
126177
@Override
127178
public int read(final char[] buf, final int offset, final int length) throws IOException {
128179
if (length == 0) {
@@ -187,7 +238,17 @@ public void reset() throws IOException {
187238
lineNumber = lineNumberMark;
188239
lastChar = lastCharMark;
189240
position = positionMark;
241+
bytesRead = bytesReadMark;
190242
super.reset();
191243
}
192244

245+
/**
246+
* Gets the number of bytes read by the reader.
247+
*
248+
* @return the number of bytes read by the read
249+
*/
250+
long getBytesRead() {
251+
return this.bytesRead;
252+
}
253+
193254
}

src/main/java/org/apache/commons/csv/Lexer.java

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,15 @@ long getCharacterPosition() {
103103
return reader.getPosition();
104104
}
105105

106+
/**
107+
* Returns the number of bytes read
108+
*
109+
* @return the number of bytes read
110+
*/
111+
long getBytesRead() {
112+
return reader.getBytesRead();
113+
}
114+
106115
/**
107116
* Returns the current line number
108117
*

src/test/java/org/apache/commons/csv/CSVParserTest.java

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -693,6 +693,84 @@ public void testGetHeaderComment_NoComment3() throws IOException {
693693
}
694694
}
695695

696+
@Test
697+
public void testGetRecordThreeBytesRead() throws Exception {
698+
String code = "id,date,val5,val4\n" +
699+
"11111111111111,'4017-09-01',きちんと節分近くには咲いてる~,v4\n" +
700+
"22222222222222,'4017-01-01',おはよう私の友人~,v4\n" +
701+
"33333333333333,'4017-01-01',きる自然の力ってすごいな~,v4\n";
702+
// String code = "'1',4";
703+
// final CSVFormat format = CSVFormat.newFormat(',').withQuote('\'');
704+
final CSVFormat format = CSVFormat.Builder.create()
705+
.setDelimiter(',')
706+
.setQuote('\'')
707+
.build();
708+
// CSVParser parser = new CSVParser(new StringReader(code), format, 0L, 1L, "UTF-8");
709+
CSVParser parser = format.parse(new StringReader(code), 0L, 1L, "UTF-8");
710+
711+
CSVRecord record = new CSVRecord(parser, null, null, 1L, 0L, 0L);
712+
assertEquals(0, parser.getRecordNumber());
713+
assertNotNull(record = parser.nextRecord());
714+
assertEquals(1, record.getRecordNumber());
715+
assertEquals(code.indexOf('i'), record.getCharacterPosition());
716+
assertEquals(record.getCharacterByte(), record.getCharacterPosition());
717+
718+
assertNotNull(record = parser.nextRecord());
719+
assertEquals(2, record.getRecordNumber());
720+
assertEquals(code.indexOf('1'), record.getCharacterPosition());
721+
assertEquals(record.getCharacterByte(), record.getCharacterPosition());
722+
723+
assertNotNull(record = parser.nextRecord());
724+
assertEquals(3, record.getRecordNumber());
725+
assertEquals(code.indexOf('2'), record.getCharacterPosition());
726+
assertEquals(record.getCharacterByte(), 95);
727+
728+
assertNotNull(record = parser.nextRecord());
729+
assertEquals(4, record.getRecordNumber());
730+
assertEquals(code.indexOf('3'), record.getCharacterPosition());
731+
assertEquals(record.getCharacterByte(), 154);
732+
733+
parser.close();
734+
735+
}
736+
737+
@Test
738+
public void testGetRecordFourBytesRead() throws Exception {
739+
String code = "id,a,b,c\n" +
740+
"1,😊,🤔,😂\n" +
741+
"2,😊,🤔,😂\n" +
742+
"3,😊,🤔,😂\n";
743+
// final CSVFormat format = CSVFormat.newFormat(',').withQuote('\'');
744+
final CSVFormat format = CSVFormat.Builder.create()
745+
.setDelimiter(',')
746+
.setQuote('\'')
747+
.build();
748+
749+
// CSVParser parser = new CSVParser(new StringReader(code), format, 0L, 1L, "UTF-8");
750+
CSVParser parser = format.parse(new StringReader(code), 0L, 1L, "UTF-8");
751+
752+
CSVRecord record;
753+
assertEquals(0, parser.getRecordNumber());
754+
assertNotNull(record = parser.nextRecord());
755+
assertEquals(1, record.getRecordNumber());
756+
assertEquals(code.indexOf('i'), record.getCharacterPosition());
757+
assertEquals(record.getCharacterByte(), record.getCharacterPosition());
758+
759+
assertNotNull(record = parser.nextRecord());
760+
assertEquals(2, record.getRecordNumber());
761+
assertEquals(code.indexOf('1'), record.getCharacterPosition());
762+
assertEquals(record.getCharacterByte(), record.getCharacterPosition());
763+
assertNotNull(record = parser.nextRecord());
764+
assertEquals(3, record.getRecordNumber());
765+
assertEquals(code.indexOf('2'), record.getCharacterPosition());
766+
assertEquals(record.getCharacterByte(), 26);
767+
assertNotNull(record = parser.nextRecord());
768+
assertEquals(4, record.getRecordNumber());
769+
assertEquals(code.indexOf('3'), record.getCharacterPosition());
770+
assertEquals(record.getCharacterByte(), 43);
771+
parser.close();
772+
}
773+
696774
@Test
697775
public void testGetHeaderMap() throws Exception {
698776
try (final CSVParser parser = CSVParser.parse("a,b,c\n1,2,3\nx,y,z", CSVFormat.DEFAULT.withHeader("A", "B", "C"))) {

0 commit comments

Comments
 (0)