Skip to content

Commit a02c334

Browse files
committed
[CSV-196] Track byte information of the source
1 parent edb87f3 commit a02c334

File tree

7 files changed

+158
-2
lines changed

7 files changed

+158
-2
lines changed

src/main/java/org/apache/commons/csv/CSVParser.java

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -348,11 +348,16 @@ public CSVParser(final Reader reader, final CSVFormat format) throws IOException
348348
@SuppressWarnings("resource")
349349
public CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber)
350350
throws IOException {
351+
this(reader, format, characterOffset, recordNumber, null);
352+
}
353+
354+
public CSVParser(final Reader reader, final CSVFormat format, final long characterOffset,
355+
final long recordNumber, String encoding) throws IOException {
351356
Assertions.notNull(reader, "reader");
352357
Assertions.notNull(format, "format");
353358

354359
this.format = format;
355-
this.lexer = new Lexer(format, new ExtendedBufferedReader(reader));
360+
this.lexer = new Lexer(format, new ExtendedBufferedReader(reader, encoding));
356361
this.headerMap = this.initializeHeader();
357362
this.characterOffset = characterOffset;
358363
this.recordNumber = recordNumber - 1;
@@ -581,6 +586,7 @@ CSVRecord nextRecord() throws IOException {
581586
this.recordList.clear();
582587
StringBuilder sb = null;
583588
final long startCharPosition = lexer.getCharacterPosition() + this.characterOffset;
589+
final long startCharByte = lexer.getBytesRead() + this.characterOffset;
584590
do {
585591
this.reusableToken.reset();
586592
this.lexer.nextToken(this.reusableToken);
@@ -616,7 +622,7 @@ CSVRecord nextRecord() throws IOException {
616622
this.recordNumber++;
617623
final String comment = sb == null ? null : sb.toString();
618624
result = new CSVRecord(this.recordList.toArray(new String[this.recordList.size()]), this.headerMap, comment,
619-
this.recordNumber, startCharPosition);
625+
this.recordNumber, startCharPosition, startCharByte);
620626
}
621627
return result;
622628
}

src/main/java/org/apache/commons/csv/CSVRecord.java

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,8 @@ public final class CSVRecord implements Serializable, Iterable<String> {
3636

3737
private final long characterPosition;
3838

39+
private final long characterByte;
40+
3941
/** The accumulated comments (if any) */
4042
private final String comment;
4143

@@ -48,13 +50,25 @@ public final class CSVRecord implements Serializable, Iterable<String> {
4850
/** The values of the record */
4951
private final String[] values;
5052

53+
54+
/**
 * Creates a record that also carries the byte offset at which it starts.
 *
 * @param values the field values; {@code null} is replaced by the shared empty array
 * @param mapping the name-to-index mapping, stored as given
 * @param comment the comment attached to this record, may be {@code null}
 * @param recordNumber the number assigned to this record
 * @param characterPosition the character position at which this record starts
 * @param characterByte the byte position at which this record starts
 */
CSVRecord(final String[] values, final Map<String, Integer> mapping, final String comment, final long recordNumber,
        final long characterPosition, final long characterByte) {
    // Never keep a null array: callers can always iterate over the values.
    if (values == null) {
        this.values = EMPTY_STRING_ARRAY;
    } else {
        this.values = values;
    }
    this.mapping = mapping;
    this.comment = comment;
    this.recordNumber = recordNumber;
    this.characterPosition = characterPosition;
    this.characterByte = characterByte;
}
63+
5164
/**
 * Creates a record without byte tracking; the byte position is recorded as {@code 0}.
 *
 * @param values the field values; {@code null} is replaced by the shared empty array
 * @param mapping the name-to-index mapping, stored as given
 * @param comment the comment attached to this record, may be {@code null}
 * @param recordNumber the number assigned to this record
 * @param characterPosition the character position at which this record starts
 */
CSVRecord(final String[] values, final Map<String, Integer> mapping, final String comment, final long recordNumber,
        final long characterPosition) {
    // Same initialisation as the six-argument constructor, with no byte offset known.
    this(values, mapping, comment, recordNumber, characterPosition, 0L);
}
5973

6074
/**
@@ -121,6 +135,15 @@ public long getCharacterPosition() {
121135
return characterPosition;
122136
}
123137

138+
/**
139+
* Returns the byte position in the source stream at which this record starts.
140+
*
141+
* @return the byte position in the source stream at which this record starts.
142+
*/
143+
public long getCharacterByte() {
144+
return characterByte;
145+
}
146+
124147
/**
125148
* Returns the comment for this record, if any.
126149
* Note that comments are attached to the following record.

src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,11 @@
2424

2525
import java.io.BufferedReader;
2626
import java.io.IOException;
27+
import java.io.InputStreamReader;
2728
import java.io.Reader;
29+
import java.nio.charset.CharsetEncoder;
30+
import java.nio.charset.Charset;
31+
import java.nio.CharBuffer;
2832

2933
/**
3034
* A special buffered reader which supports sophisticated read access.
@@ -46,12 +50,25 @@ final class ExtendedBufferedReader extends BufferedReader {
4650

4751
private boolean closed;
4852

53+
/** The number of bytes read so far */
54+
private long bytesRead;
55+
56+
/** Encoder used to calculate the bytes of characters */
57+
CharsetEncoder encoder;
58+
4959
/**
5060
* Created extended buffered reader using default buffer-size
5161
*/
5262
ExtendedBufferedReader(final Reader reader) {
5363
super(reader);
5464
}
65+
66+
ExtendedBufferedReader(final Reader reader, String encoding) {
67+
super(reader);
68+
if (encoding != null) {
69+
encoder = Charset.forName(encoding).newEncoder();
70+
}
71+
}
5572

5673
@Override
5774
public int read() throws IOException {
@@ -61,6 +78,9 @@ public int read() throws IOException {
6178
}
6279
lastChar = current;
6380
this.position++;
81+
if (encoder != null) {
82+
this.bytesRead += encoder.encode(CharBuffer.wrap(new char[] { (char)current })).limit();
83+
}
6484
return lastChar;
6585
}
6686

@@ -170,6 +190,15 @@ long getPosition() {
170190
return this.position;
171191
}
172192

193+
/**
194+
* Gets the number of bytes read by the reader.
195+
*
196+
* @return the number of bytes read by the reader
197+
*/
198+
long getBytesRead() {
199+
return this.bytesRead;
200+
}
201+
173202
public boolean isClosed() {
174203
return closed;
175204
}

src/main/java/org/apache/commons/csv/Lexer.java

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -316,6 +316,15 @@ long getCharacterPosition() {
316316
return reader.getPosition();
317317
}
318318

319+
/**
320+
* Returns the number of bytes read
321+
*
322+
* @return the number of bytes read
323+
*/
324+
long getBytesRead() {
325+
return reader.getBytesRead();
326+
}
327+
319328
// TODO escape handling needs more work
320329
/**
321330
* Handle an escape sequence.

src/test/java/org/apache/commons/csv/CSVParserTest.java

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -560,6 +560,42 @@ public void testGetRecordPositionWithCRLF() throws Exception {
560560
public void testGetRecordPositionWithLF() throws Exception {
561561
this.validateRecordPosition(String.valueOf(LF));
562562
}
563+
564+
@Test
565+
public void testGetRecordBytesRead() throws Exception {
566+
String code = "id,date,val5,val4\n"
567+
+ "11111111111111,'4017-09-01',きちんと節分近くには咲いてる~,v4\n"
568+
+ "22222222222222,'4017-01-01',おはよう私の友人~,v4\n"
569+
+ "33333333333333,'4017-01-01',きる自然の力ってすごいな~,v4\n";
570+
final CSVFormat format = CSVFormat.newFormat(',').withQuote('\'');
571+
CSVParser parser = new CSVParser(new StringReader(code), format, 0L, 1L, "UTF-8");
572+
573+
CSVRecord record;
574+
assertEquals(0, parser.getRecordNumber());
575+
576+
assertNotNull(record = parser.nextRecord());
577+
assertEquals(1, record.getRecordNumber());
578+
assertEquals(code.indexOf('i'), record.getCharacterByte());
579+
assertEquals(code.indexOf('i'), record.getCharacterPosition());
580+
581+
assertNotNull(record = parser.nextRecord());
582+
assertEquals(2, record.getRecordNumber());
583+
assertEquals(code.indexOf('1'), record.getCharacterByte());
584+
assertEquals(code.indexOf('1'), record.getCharacterPosition());
585+
586+
assertNotNull(record = parser.nextRecord());
587+
assertEquals(3, record.getRecordNumber());
588+
assertEquals(code.indexOf('2'), record.getCharacterPosition());
589+
assertEquals(code.indexOf('2'), record.getCharacterPosition());
590+
591+
assertNotNull(record = parser.nextRecord());
592+
assertEquals(4, record.getRecordNumber());
593+
assertEquals(code.indexOf('3'), record.getCharacterPosition());
594+
assertEquals(code.indexOf('3'), record.getCharacterPosition());
595+
596+
parser.close();
597+
598+
}
563599

564600
@Test
565601
public void testGetRecords() throws IOException {
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
package org.apache.commons.csv.issues;
18+
19+
import java.io.IOException;
20+
import java.io.InputStream;
21+
import java.io.InputStreamReader;
22+
import java.io.Reader;
23+
24+
import org.apache.commons.csv.CSVFormat;
25+
import org.apache.commons.csv.CSVParser;
26+
import org.apache.commons.csv.CSVRecord;
27+
import org.junit.Test;
28+
import org.junit.Assert;
29+
30+
public class JiraCsv196Test {
31+
32+
@Test
33+
public void parse() throws IOException {
34+
final CSVFormat format = CSVFormat.newFormat(',').withQuote('\'');
35+
CSVParser parser = new CSVParser(getTestInput(), format, 0L, 1L, "UTF-8");
36+
long[] charByteKey = {0, 89, 242, 395};
37+
int idx = 0;
38+
for (CSVRecord record : parser) {
39+
Assert.assertEquals(charByteKey[idx++], record.getCharacterByte());
40+
}
41+
parser.close();
42+
}
43+
44+
private Reader getTestInput() {
45+
final InputStream is = ClassLoader.getSystemClassLoader().getResourceAsStream("CSV-196/sample1.csv");
46+
return new InputStreamReader(is);
47+
}
48+
49+
}
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
id,date,val1,val2,val3,val4,val5,val6,val7,val8,val9,val10,val11,val12,val13,val14,val15
2+
00000000000001,2017-01-01,きちんと節分近くには咲いてる。自然の力ってすごいな~,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15
3+
00000000000002,2017-01-01,きちんと節分近くには咲いてる。自然の力ってすごいな~,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15
4+
00000000000003,2017-01-01,きちんと節分近くには咲いてる。自然の力ってすごいな~,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15

0 commit comments

Comments
 (0)