Skip to content

Commit e28e28e

Browse files
committed
[CSV-131] Save positions of records to enable random access. The floor is open for code review and further discussion based on the comments in the Jira.
git-svn-id: https://svn.apache.org/repos/asf/commons/proper/csv/trunk@1635052 13f79535-47bb-0310-9956-ffa450edef68
1 parent b466ec0 commit e28e28e

5 files changed

Lines changed: 151 additions & 21 deletions

File tree

src/changes/changes.xml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939
</properties>
4040
<body>
4141
<release version="1.1" date="2014-mm-dd" description="Feature and bug fix release">
42+
<action issue="CSV-131" type="add" dev="ggregory" due-to="Holger Stratmann">Save positions of records to enable random access</action>
4243
<action issue="CSV-130" type="fix" dev="ggregory" due-to="Sergei Lebedev">CSVFormat#withHeader doesn't work well with #printComment, add withHeaderComments(String...)</action>
4344
<action issue="CSV-128" type="fix" dev="ggregory">CSVFormat.EXCEL should ignore empty header names</action>
4445
<action issue="CSV-129" type="add" dev="ggregory">Add CSVFormat#with 0-arg methods matching boolean arg methods</action>

src/main/java/org/apache/commons/csv/CSVParser.java

Lines changed: 45 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -219,6 +219,12 @@ public static CSVParser parse(final URL url, final Charset charset, final CSVFor
219219
private final List<String> record = new ArrayList<String>();
220220

221221
private long recordNumber;
222+
223+
/**
224+
* Offset added to the lexer's character position when the parser does not start parsing at the beginning of the source. Usually used in combination
225+
* with {@link #setNextRecordNumber(long)}.
226+
*/
227+
private long characterOffset;
222228

223229
private final Token reusableToken = new Token();
224230

@@ -295,6 +301,43 @@ public Map<String, Integer> getHeaderMap() {
295301
return this.headerMap == null ? null : new LinkedHashMap<String, Integer>(this.headerMap);
296302
}
297303

304+
/**
305+
* Sets the record number to be assigned to the next record read.
306+
* <p>
307+
* Use this if the reader is not positioned at the first record when you create the parser. For example, the first
308+
* record read might be the 51st record in the source file.
309+
* </p>
310+
* <p>
311+
* If you want the records to also have the correct character position referring to the underlying source, call
312+
* {@link #setNextCharacterPosition(long)}.
313+
* </p>
314+
*
315+
* @param nextRecordNumber
316+
* the next record number
317+
* @since 1.1
318+
*/
319+
public void setNextRecordNumber(long nextRecordNumber) {
320+
this.recordNumber = nextRecordNumber - 1;
321+
}
322+
323+
/**
324+
* Sets the character position in the source stream to be assigned to the next record read, regardless of where the parser and lexer started reading.
325+
* <p>
326+
* For example, we open a file and seek to position 5434 in order to start reading at record 42. In order to have
327+
* the parser assign the correct characterPosition to records, we call this method.
328+
* </p>
329+
* <p>
330+
* If you want the records to also have the correct record numbers, call {@link #setNextRecordNumber(long)}.
331+
* </p>
332+
*
333+
* @param position
334+
* the new character position
335+
* @since 1.1
336+
*/
337+
public void setNextCharacterPosition(long position) {
338+
this.characterOffset = position - lexer.getCharacterPosition();
339+
}
340+
298341
/**
299342
* Returns the current record number in the input stream.
300343
*
@@ -445,6 +488,7 @@ CSVRecord nextRecord() throws IOException {
445488
CSVRecord result = null;
446489
this.record.clear();
447490
StringBuilder sb = null;
491+
final long startCharPosition = lexer.getCharacterPosition() + this.characterOffset;
448492
do {
449493
this.reusableToken.reset();
450494
this.lexer.nextToken(this.reusableToken);
@@ -480,7 +524,7 @@ CSVRecord nextRecord() throws IOException {
480524
this.recordNumber++;
481525
final String comment = sb == null ? null : sb.toString();
482526
result = new CSVRecord(this.record.toArray(new String[this.record.size()]), this.headerMap, comment,
483-
this.recordNumber);
527+
this.recordNumber, startCharPosition);
484528
}
485529
return result;
486530
}

src/main/java/org/apache/commons/csv/CSVRecord.java

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,8 @@ public final class CSVRecord implements Serializable, Iterable<String> {
3636

3737
private static final long serialVersionUID = 1L;
3838

39+
private final long characterPosition;
40+
3941
/** The accumulated comments (if any) */
4042
private final String comment;
4143

@@ -44,15 +46,16 @@ public final class CSVRecord implements Serializable, Iterable<String> {
4446

4547
/** The record number. */
4648
private final long recordNumber;
47-
49+
4850
/** The values of the record */
4951
private final String[] values;
5052

51-
CSVRecord(final String[] values, final Map<String, Integer> mapping, final String comment, final long recordNumber) {
53+
CSVRecord(final String[] values, final Map<String, Integer> mapping, final String comment, final long recordNumber, long characterPosition) {
5254
this.recordNumber = recordNumber;
5355
this.values = values != null ? values : EMPTY_STRING_ARRAY;
5456
this.mapping = mapping;
5557
this.comment = comment;
58+
this.characterPosition = characterPosition;
5659
}
5760

5861
/**
@@ -109,6 +112,16 @@ public String get(final String name) {
109112
}
110113
}
111114

115+
/**
116+
* Returns the start position of this record as a character position in the source stream. This may or may not
117+
* correspond to the byte position depending on the character set.
118+
*
119+
* @return the position of this record in the source stream.
120+
*/
121+
public long getCharacterPosition() {
122+
return characterPosition;
123+
}
124+
112125
/**
113126
* Returns the comment for this record, if any.
114127
*

src/test/java/org/apache/commons/csv/CSVParserTest.java

Lines changed: 88 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -299,22 +299,23 @@ public void testEmptyLineBehaviourExcel() throws Exception {
299299
}
300300
}
301301

302-
// @Test
303-
// public void testStartWithEmptyLinesThenHeaders() throws Exception {
304-
// final String[] codes = { "\r\n\r\n\r\nhello,\r\n\r\n\r\n", "hello,\n\n\n", "hello,\"\"\r\n\r\n\r\n", "hello,\"\"\n\n\n" };
305-
// final String[][] res = { { "hello", "" }, { "" }, // Excel format does not ignore empty lines
306-
// { "" } };
307-
// for (final String code : codes) {
308-
// final CSVParser parser = CSVParser.parse(code, CSVFormat.EXCEL);
309-
// final List<CSVRecord> records = parser.getRecords();
310-
// assertEquals(res.length, records.size());
311-
// assertTrue(records.size() > 0);
312-
// for (int i = 0; i < res.length; i++) {
313-
// assertArrayEquals(res[i], records.get(i).values());
314-
// }
315-
// parser.close();
316-
// }
317-
// }
302+
// @Test
303+
// public void testStartWithEmptyLinesThenHeaders() throws Exception {
304+
// final String[] codes = { "\r\n\r\n\r\nhello,\r\n\r\n\r\n", "hello,\n\n\n", "hello,\"\"\r\n\r\n\r\n",
305+
// "hello,\"\"\n\n\n" };
306+
// final String[][] res = { { "hello", "" }, { "" }, // Excel format does not ignore empty lines
307+
// { "" } };
308+
// for (final String code : codes) {
309+
// final CSVParser parser = CSVParser.parse(code, CSVFormat.EXCEL);
310+
// final List<CSVRecord> records = parser.getRecords();
311+
// assertEquals(res.length, records.size());
312+
// assertTrue(records.size() > 0);
313+
// for (int i = 0; i < res.length; i++) {
314+
// assertArrayEquals(res[i], records.get(i).values());
315+
// }
316+
// parser.close();
317+
// }
318+
// }
318319

319320
@Test
320321
public void testEndOfFileBehaviorCSV() throws Exception {
@@ -474,6 +475,16 @@ public void testGetLineNumberWithLF() throws Exception {
474475
this.validateLineNumbers(String.valueOf(LF));
475476
}
476477

478+
@Test
479+
public void testGetRecordPositionWithCRLF() throws Exception {
480+
this.validateRecordPosition(CRLF);
481+
}
482+
483+
@Test
484+
public void testGetRecordPositionWithLF() throws Exception {
485+
this.validateRecordPosition(String.valueOf(LF));
486+
}
487+
477488
@Test
478489
public void testGetOneLine() throws IOException {
479490
final CSVParser parser = CSVParser.parse(CSV_INPUT_1, CSVFormat.DEFAULT);
@@ -902,4 +913,65 @@ private void validateRecordNumbers(final String lineSeparator) throws IOExceptio
902913
parser.close();
903914
}
904915

916+
private void validateRecordPosition(final String lineSeparator) throws IOException {
917+
final String nl = lineSeparator; // used as linebreak in values for better distinction
918+
919+
String code = "a,b,c" + lineSeparator + "1,2,3" + lineSeparator +
920+
// to see if the record's character position correctly points to the opening quote of a quoted first value
921+
"'A" + nl + "A','B" + nl + "B',CC" + lineSeparator +
922+
// Unicode test: not very relevant while operating on strings instead of bytes, but for
923+
// completeness...
924+
"\u00c4,\u00d6,\u00dc" + lineSeparator + "EOF,EOF,EOF";
925+
926+
final CSVFormat format = CSVFormat.newFormat(',').withQuote('\'').withRecordSeparator(lineSeparator);
927+
CSVParser parser = CSVParser.parse(code, format);
928+
929+
CSVRecord record;
930+
assertEquals(0, parser.getRecordNumber());
931+
932+
assertNotNull(record = parser.nextRecord());
933+
assertEquals(1, record.getRecordNumber());
934+
assertEquals(code.indexOf('a'), record.getCharacterPosition());
935+
936+
assertNotNull(record = parser.nextRecord());
937+
assertEquals(2, record.getRecordNumber());
938+
assertEquals(code.indexOf('1'), record.getCharacterPosition());
939+
940+
assertNotNull(record = parser.nextRecord());
941+
final long positionRecord3 = record.getCharacterPosition();
942+
assertEquals(3, record.getRecordNumber());
943+
assertEquals(code.indexOf("'A"), record.getCharacterPosition());
944+
assertEquals("A" + lineSeparator + "A", record.get(0));
945+
assertEquals("B" + lineSeparator + "B", record.get(1));
946+
assertEquals("CC", record.get(2));
947+
948+
assertNotNull(record = parser.nextRecord());
949+
assertEquals(4, record.getRecordNumber());
950+
assertEquals(code.indexOf('\u00c4'), record.getCharacterPosition());
951+
952+
assertNotNull(record = parser.nextRecord());
953+
assertEquals(5, record.getRecordNumber());
954+
assertEquals(code.indexOf("EOF"), record.getCharacterPosition());
955+
956+
parser.close();
957+
958+
// now try to read starting at record 3
959+
parser = CSVParser.parse(code.substring((int) positionRecord3), format);
960+
parser.setNextRecordNumber(3);
961+
parser.setNextCharacterPosition(positionRecord3);
962+
963+
assertNotNull(record = parser.nextRecord());
964+
assertEquals(3, record.getRecordNumber());
965+
assertEquals(code.indexOf("'A"), record.getCharacterPosition());
966+
assertEquals("A" + lineSeparator + "A", record.get(0));
967+
assertEquals("B" + lineSeparator + "B", record.get(1));
968+
assertEquals("CC", record.get(2));
969+
970+
assertNotNull(record = parser.nextRecord());
971+
assertEquals(4, record.getRecordNumber());
972+
assertEquals(code.indexOf('\u00c4'), record.getCharacterPosition());
973+
assertEquals("\u00c4", record.get(0));
974+
975+
parser.close();
976+
}
905977
}

src/test/java/org/apache/commons/csv/CSVRecordTest.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,12 +45,12 @@ private enum EnumFixture { UNKNOWN_COLUMN }
4545
@Before
4646
public void setUp() throws Exception {
4747
values = new String[] { "A", "B", "C" };
48-
record = new CSVRecord(values, null, null, 0);
48+
record = new CSVRecord(values, null, null, 0, -1);
4949
header = new HashMap<String, Integer>();
5050
header.put("first", Integer.valueOf(0));
5151
header.put("second", Integer.valueOf(1));
5252
header.put("third", Integer.valueOf(2));
53-
recordWithHeader = new CSVRecord(values, header, null, 0);
53+
recordWithHeader = new CSVRecord(values, header, null, 0, -1);
5454
}
5555

5656
@Test

0 commit comments

Comments
 (0)