Skip to content

Commit aee6c50

Browse files
committed
Header support (CSV-65)
git-svn-id: https://svn.apache.org/repos/asf/commons/proper/csv/trunk@1301852 13f79535-47bb-0310-9956-ffa450edef68
1 parent d45289f commit aee6c50

7 files changed

Lines changed: 303 additions & 134 deletions

File tree

src/main/java/org/apache/commons/csv/CSVFormat.java

Lines changed: 44 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ public class CSVFormat implements Serializable {
3131

3232
/** According to RFC 4180, line breaks are delimited by CRLF */
3333
private static final String CRLF = "\r\n";
34+
3435
private final char delimiter;
3536
private final char encapsulator;
3637
private final char commentStart;
@@ -39,7 +40,8 @@ public class CSVFormat implements Serializable {
3940
private final boolean trailingSpacesIgnored;
4041
private final boolean unicodeEscapesInterpreted;
4142
private final boolean emptyLinesIgnored;
42-
private final String lineSeparator; // for output
43+
private final String lineSeparator; // for outputs
44+
private final String[] header;
4345

4446

4547
/**
@@ -51,7 +53,7 @@ public class CSVFormat implements Serializable {
5153
static final char DISABLED = '\ufffe';
5254

5355
/** Standard comma separated format as defined by <a href="http://tools.ietf.org/html/rfc4180">RFC 4180</a>. */
54-
public static final CSVFormat DEFAULT = new CSVFormat(',', '"', DISABLED, DISABLED, true, true, false, true, CRLF);
56+
public static final CSVFormat DEFAULT = new CSVFormat(',', '"', DISABLED, DISABLED, true, true, false, true, CRLF, null);
5557

5658
/**
5759
* Excel file format (using a comma as the value delimiter).
@@ -64,10 +66,10 @@ public class CSVFormat implements Serializable {
6466
*
6567
* <pre>CSVFormat fmt = CSVFormat.EXCEL.withDelimiter(';');</pre>
6668
*/
67-
public static final CSVFormat EXCEL = new CSVFormat(',', '"', DISABLED, DISABLED, false, false, false, false, CRLF);
69+
public static final CSVFormat EXCEL = new CSVFormat(',', '"', DISABLED, DISABLED, false, false, false, false, CRLF, null);
6870

6971
/** Tab-delimited format, with quote; leading and trailing spaces ignored. */
70-
public static final CSVFormat TDF = new CSVFormat('\t', '"', DISABLED, DISABLED, true, true, false, true, CRLF);
72+
public static final CSVFormat TDF = new CSVFormat('\t', '"', DISABLED, DISABLED, true, true, false, true, CRLF, null);
7173

7274
/**
7375
* Default MySQL format used by the <tt>SELECT INTO OUTFILE</tt> and
@@ -77,7 +79,7 @@ public class CSVFormat implements Serializable {
7779
*
7880
* @see <a href="http://dev.mysql.com/doc/refman/5.1/en/load-data.html">http://dev.mysql.com/doc/refman/5.1/en/load-data.html</a>
7981
*/
80-
public static final CSVFormat MYSQL = new CSVFormat('\t', DISABLED, DISABLED, '\\', false, false, false, false, "\n");
82+
public static final CSVFormat MYSQL = new CSVFormat('\t', DISABLED, DISABLED, '\\', false, false, false, false, "\n", null);
8183

8284

8385
/**
@@ -92,6 +94,7 @@ public class CSVFormat implements Serializable {
9294
* @param unicodeEscapesInterpreted <tt>true</tt> when unicode escapes should be interpreted
9395
* @param emptyLinesIgnored <tt>true</tt> when the parser should skip empty lines
9496
* @param lineSeparator the line separator to use for output
97+
* @param header the header
9598
*/
9699
CSVFormat(
97100
char delimiter,
@@ -102,7 +105,8 @@ public class CSVFormat implements Serializable {
102105
boolean trailingSpacesIgnored,
103106
boolean unicodeEscapesInterpreted,
104107
boolean emptyLinesIgnored,
105-
String lineSeparator) {
108+
String lineSeparator,
109+
String[] header) {
106110
this.delimiter = delimiter;
107111
this.encapsulator = encapsulator;
108112
this.commentStart = commentStart;
@@ -112,6 +116,7 @@ public class CSVFormat implements Serializable {
112116
this.unicodeEscapesInterpreted = unicodeEscapesInterpreted;
113117
this.emptyLinesIgnored = emptyLinesIgnored;
114118
this.lineSeparator = lineSeparator;
119+
this.header = header;
115120
}
116121

117122
/**
@@ -171,7 +176,7 @@ public CSVFormat withDelimiter(char delimiter) {
171176
throw new IllegalArgumentException("The delimiter cannot be a line break");
172177
}
173178

174-
return new CSVFormat(delimiter, encapsulator, commentStart, escape, leadingSpacesIgnored, trailingSpacesIgnored, unicodeEscapesInterpreted, emptyLinesIgnored, lineSeparator);
179+
return new CSVFormat(delimiter, encapsulator, commentStart, escape, leadingSpacesIgnored, trailingSpacesIgnored, unicodeEscapesInterpreted, emptyLinesIgnored, lineSeparator, header);
175180
}
176181

177182
/**
@@ -195,7 +200,7 @@ public CSVFormat withEncapsulator(char encapsulator) {
195200
throw new IllegalArgumentException("The encapsulator cannot be a line break");
196201
}
197202

198-
return new CSVFormat(delimiter, encapsulator, commentStart, escape, leadingSpacesIgnored, trailingSpacesIgnored, unicodeEscapesInterpreted, emptyLinesIgnored, lineSeparator);
203+
return new CSVFormat(delimiter, encapsulator, commentStart, escape, leadingSpacesIgnored, trailingSpacesIgnored, unicodeEscapesInterpreted, emptyLinesIgnored, lineSeparator, header);
199204
}
200205

201206
boolean isEncapsulating() {
@@ -223,7 +228,7 @@ public CSVFormat withCommentStart(char commentStart) {
223228
throw new IllegalArgumentException("The comment start character cannot be a line break");
224229
}
225230

226-
return new CSVFormat(delimiter, encapsulator, commentStart, escape, leadingSpacesIgnored, trailingSpacesIgnored, unicodeEscapesInterpreted, emptyLinesIgnored, lineSeparator);
231+
return new CSVFormat(delimiter, encapsulator, commentStart, escape, leadingSpacesIgnored, trailingSpacesIgnored, unicodeEscapesInterpreted, emptyLinesIgnored, lineSeparator, header);
227232
}
228233

229234
/**
@@ -256,7 +261,7 @@ public CSVFormat withEscape(char escape) {
256261
throw new IllegalArgumentException("The escape character cannot be a line break");
257262
}
258263

259-
return new CSVFormat(delimiter, encapsulator, commentStart, escape, leadingSpacesIgnored, trailingSpacesIgnored, unicodeEscapesInterpreted, emptyLinesIgnored, lineSeparator);
264+
return new CSVFormat(delimiter, encapsulator, commentStart, escape, leadingSpacesIgnored, trailingSpacesIgnored, unicodeEscapesInterpreted, emptyLinesIgnored, lineSeparator, header);
260265
}
261266

262267
boolean isEscaping() {
@@ -280,7 +285,7 @@ public boolean isLeadingSpacesIgnored() {
280285
* @return A copy of this format with the specified left trimming behavior.
281286
*/
282287
public CSVFormat withLeadingSpacesIgnored(boolean leadingSpacesIgnored) {
283-
return new CSVFormat(delimiter, encapsulator, commentStart, escape, leadingSpacesIgnored, trailingSpacesIgnored, unicodeEscapesInterpreted, emptyLinesIgnored, lineSeparator);
288+
return new CSVFormat(delimiter, encapsulator, commentStart, escape, leadingSpacesIgnored, trailingSpacesIgnored, unicodeEscapesInterpreted, emptyLinesIgnored, lineSeparator, header);
284289
}
285290

286291
/**
@@ -300,7 +305,7 @@ public boolean isTrailingSpacesIgnored() {
300305
* @return A copy of this format with the specified right trimming behavior.
301306
*/
302307
public CSVFormat withTrailingSpacesIgnored(boolean trailingSpacesIgnored) {
303-
return new CSVFormat(delimiter, encapsulator, commentStart, escape, leadingSpacesIgnored, trailingSpacesIgnored, unicodeEscapesInterpreted, emptyLinesIgnored, lineSeparator);
308+
return new CSVFormat(delimiter, encapsulator, commentStart, escape, leadingSpacesIgnored, trailingSpacesIgnored, unicodeEscapesInterpreted, emptyLinesIgnored, lineSeparator, header);
304309
}
305310

306311
/**
@@ -311,7 +316,7 @@ public CSVFormat withTrailingSpacesIgnored(boolean trailingSpacesIgnored) {
311316
* @return A copy of this format with the specified trimming behavior.
312317
*/
313318
public CSVFormat withSurroundingSpacesIgnored(boolean surroundingSpacesIgnored) {
314-
return new CSVFormat(delimiter, encapsulator, commentStart, escape, surroundingSpacesIgnored, surroundingSpacesIgnored, unicodeEscapesInterpreted, emptyLinesIgnored, lineSeparator);
319+
return new CSVFormat(delimiter, encapsulator, commentStart, escape, surroundingSpacesIgnored, surroundingSpacesIgnored, unicodeEscapesInterpreted, emptyLinesIgnored, lineSeparator, header);
315320
}
316321

317322
/**
@@ -332,7 +337,7 @@ public boolean isUnicodeEscapesInterpreted() {
332337
* @return A copy of this format with the specified unicode escaping behavior.
333338
*/
334339
public CSVFormat withUnicodeEscapesInterpreted(boolean unicodeEscapesInterpreted) {
335-
return new CSVFormat(delimiter, encapsulator, commentStart, escape, leadingSpacesIgnored, trailingSpacesIgnored, unicodeEscapesInterpreted, emptyLinesIgnored, lineSeparator);
340+
return new CSVFormat(delimiter, encapsulator, commentStart, escape, leadingSpacesIgnored, trailingSpacesIgnored, unicodeEscapesInterpreted, emptyLinesIgnored, lineSeparator, header);
336341
}
337342

338343
/**
@@ -352,7 +357,7 @@ public boolean isEmptyLinesIgnored() {
352357
* @return A copy of this format with the specified empty line skipping behavior.
353358
*/
354359
public CSVFormat withEmptyLinesIgnored(boolean emptyLinesIgnored) {
355-
return new CSVFormat(delimiter, encapsulator, commentStart, escape, leadingSpacesIgnored, trailingSpacesIgnored, unicodeEscapesInterpreted, emptyLinesIgnored, lineSeparator);
360+
return new CSVFormat(delimiter, encapsulator, commentStart, escape, leadingSpacesIgnored, trailingSpacesIgnored, unicodeEscapesInterpreted, emptyLinesIgnored, lineSeparator, header);
356361
}
357362

358363
/**
@@ -372,15 +377,37 @@ public String getLineSeparator() {
372377
* @return A copy of this format using the specified output line separator
373378
*/
374379
public CSVFormat withLineSeparator(String lineSeparator) {
375-
return new CSVFormat(delimiter, encapsulator, commentStart, escape, leadingSpacesIgnored, trailingSpacesIgnored, unicodeEscapesInterpreted, emptyLinesIgnored, lineSeparator);
380+
return new CSVFormat(delimiter, encapsulator, commentStart, escape, leadingSpacesIgnored, trailingSpacesIgnored, unicodeEscapesInterpreted, emptyLinesIgnored, lineSeparator, header);
381+
}
382+
383+
String[] getHeader() {
384+
return header;
385+
}
386+
387+
/**
388+
* Returns a copy of this format using the specified header. The header can
389+
* either be parsed automatically from the input file with:
390+
*
391+
* <pre>CSVFormat format = CSVFormat.DEFAULT.withHeader();</pre>
392+
*
393+
* or specified manually with:
394+
*
395+
* <pre>CSVFormat format = CSVFormat.DEFAULT.withHeader("name", "email", "phone");</pre>
396+
*
397+
* @param header the header, <tt>null</tt> if disabled, empty if parsed automatically, user specified otherwise.
398+
*
399+
* @return A copy of this format using the specified header
400+
*/
401+
public CSVFormat withHeader(String... header) {
402+
return new CSVFormat(delimiter, encapsulator, commentStart, escape, leadingSpacesIgnored, trailingSpacesIgnored, unicodeEscapesInterpreted, emptyLinesIgnored, lineSeparator, header);
376403
}
377404

378405
/**
379406
* Parses the specified content.
380407
*
381408
* @param in the input stream
382409
*/
383-
public Iterable<String[]> parse(Reader in) {
410+
public Iterable<CSVRecord> parse(Reader in) throws IOException {
384411
return new CSVParser(in, this);
385412
}
386413

src/main/java/org/apache/commons/csv/CSVParser.java

Lines changed: 52 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,10 @@
2121
import java.io.Reader;
2222
import java.io.StringReader;
2323
import java.util.ArrayList;
24+
import java.util.HashMap;
2425
import java.util.Iterator;
2526
import java.util.List;
27+
import java.util.Map;
2628
import java.util.NoSuchElementException;
2729

2830
import org.apache.commons.csv.CSVLexer.Token;
@@ -40,14 +42,14 @@
4042
* <pre>
4143
* CSVFormat format = new CSVFormat('\t', '"', '#');
4244
* Reader in = new StringReader("a\tb\nc\td");
43-
* String[][] records = new CSVParser(in, format).getRecords();
45+
* List&lt;CSVRecord&gt; records = new CSVParser(in, format).getRecords();
4446
* </pre>
4547
*
4648
* <p>Parsing of a csv-string in Excel CSV format, using a for-each loop:</p>
4749
* <pre>
4850
* Reader in = new StringReader("a;b\nc;d");
4951
* CSVParser parser = new CSVParser(in, CSVFormat.EXCEL);
50-
* for (String[] record : parser) {
52+
* for (CSVRecord record : parser) {
5153
* ...
5254
* }
5355
* </pre>
@@ -59,13 +61,11 @@
5961
* <p>see <a href="package-summary.html">package documentation</a>
6062
* for more details</p>
6163
*/
62-
public class CSVParser implements Iterable<String[]> {
63-
64-
/** Immutable empty String array. */
65-
private static final String[] EMPTY_STRING_ARRAY = new String[0];
64+
public class CSVParser implements Iterable<CSVRecord> {
6665

6766
private final CSVLexer lexer;
68-
67+
private Map<String, Integer> headerMapping;
68+
6969
// the following objects are shared to reduce garbage
7070

7171
/** A record buffer for getRecord(). Grows as necessary and is reused. */
@@ -78,7 +78,7 @@ public class CSVParser implements Iterable<String[]> {
7878
* @param input a Reader containing "csv-formatted" input
7979
* @throws IllegalArgumentException thrown if the parameters of the format are inconsistent
8080
*/
81-
public CSVParser(Reader input) {
81+
public CSVParser(Reader input) throws IOException {
8282
this(input, CSVFormat.DEFAULT);
8383
}
8484

@@ -89,14 +89,16 @@ public CSVParser(Reader input) {
8989
* @param format the CSVFormat used for CSV parsing
9090
* @throws IllegalArgumentException thrown if the parameters of the format are inconsistent
9191
*/
92-
public CSVParser(Reader input, CSVFormat format) {
92+
public CSVParser(Reader input, CSVFormat format) throws IOException {
9393
format.validate();
9494

9595
if (format.isUnicodeEscapesInterpreted()) {
9696
input = new UnicodeUnescapeReader(input);
9797
}
9898

9999
this.lexer = new CSVLexer(format, new ExtendedBufferedReader(input));
100+
101+
initializeHeader(format);
100102
}
101103

102104
/**
@@ -106,7 +108,7 @@ public CSVParser(Reader input, CSVFormat format) {
106108
* @param format the CSVFormat used for CSV parsing
107109
* @throws IllegalArgumentException thrown if the parameters of the format are inconsistent
108110
*/
109-
public CSVParser(String input, CSVFormat format) {
111+
public CSVParser(String input, CSVFormat format) throws IOException{
110112
this(new StringReader(input), format);
111113
}
112114

@@ -120,15 +122,15 @@ public CSVParser(String input, CSVFormat format) {
120122
* @return the list of parsed records ('null' when no records were read)
121123
* @throws IOException on parse error or input read-failure
122124
*/
123-
public String[][] getRecords() throws IOException {
124-
List<String[]> records = new ArrayList<String[]>();
125-
String[] record;
125+
public List<CSVRecord> getRecords() throws IOException {
126+
List<CSVRecord> records = new ArrayList<CSVRecord>();
127+
CSVRecord record;
126128
while ((record = getRecord()) != null) {
127129
records.add(record);
128130
}
129131

130132
if (!records.isEmpty()) {
131-
return records.toArray(new String[records.size()][]);
133+
return records;
132134
} else {
133135
return null;
134136
}
@@ -140,8 +142,8 @@ public String[][] getRecords() throws IOException {
140142
* @return the record as an array of values, or <tt>null</tt> if the end of the stream has been reached
141143
* @throws IOException on parse error or input read-failure
142144
*/
143-
String[] getRecord() throws IOException {
144-
String[] result = EMPTY_STRING_ARRAY;
145+
CSVRecord getRecord() throws IOException {
146+
CSVRecord result = new CSVRecord(null, headerMapping);
145147
record.clear();
146148
do {
147149
reusableToken.reset();
@@ -161,25 +163,50 @@ String[] getRecord() throws IOException {
161163
}
162164
break;
163165
case INVALID:
164-
// error: throw IOException
165166
throw new IOException("(line " + getLineNumber() + ") invalid parse sequence");
166-
// unreachable: break;
167167
}
168168
} while (reusableToken.type == TOKEN);
169169

170170
if (!record.isEmpty()) {
171-
result = record.toArray(new String[record.size()]);
171+
result = new CSVRecord(record.toArray(new String[record.size()]), headerMapping);
172172
}
173173
return result;
174174
}
175175

176+
/**
177+
* Initializes the name to index mapping if the format defines a header.
178+
*/
179+
private void initializeHeader(CSVFormat format) throws IOException {
180+
if (format.getHeader() != null) {
181+
headerMapping = new HashMap<String, Integer>();
182+
183+
String[] header = null;
184+
if (format.getHeader().length == 0) {
185+
// read the header from the first line of the file
186+
CSVRecord record = getRecord();
187+
if (record != null) {
188+
header = record.values();
189+
}
190+
} else {
191+
header = format.getHeader();
192+
}
193+
194+
// build the name to index mappings
195+
if (header != null) {
196+
for (int i = 0; i < header.length; i++) {
197+
headerMapping.put(header[i], i);
198+
}
199+
}
200+
}
201+
}
202+
176203
/**
177204
* Returns an iterator on the records. IOExceptions occurring
178205
* during the iteration are wrapped in a RuntimeException.
179206
*/
180-
public Iterator<String[]> iterator() {
181-
return new Iterator<String[]>() {
182-
private String[] current;
207+
public Iterator<CSVRecord> iterator() {
208+
return new Iterator<CSVRecord>() {
209+
private CSVRecord current;
183210

184211
public boolean hasNext() {
185212
if (current == null) {
@@ -189,8 +216,8 @@ public boolean hasNext() {
189216
return current != null;
190217
}
191218

192-
public String[] next() {
193-
String[] next = current;
219+
public CSVRecord next() {
220+
CSVRecord next = current;
194221
current = null;
195222

196223
if (next == null) {
@@ -204,7 +231,7 @@ public String[] next() {
204231
return next;
205232
}
206233

207-
private String[] getNextRecord() {
234+
private CSVRecord getNextRecord() {
208235
try {
209236
return getRecord();
210237
} catch (IOException e) {

0 commit comments

Comments
 (0)