Skip to content

Commit d7e9458

Browse files
author
Ortwin Glueck
committed
This patch reduces the amount of intermediate garbage significantly.
PR: SANDBOX-166 Contributed by: Ortwin Glück Reviewed by: Henri Yandell git-svn-id: https://svn.apache.org/repos/asf/jakarta/commons/sandbox/csv/trunk@430322 13f79535-47bb-0310-9956-ffa450edef68
1 parent 4119702 commit d7e9458

2 files changed

Lines changed: 78 additions & 57 deletions

File tree

src/java/org/apache/commons/csv/CSVParser.java

Lines changed: 67 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
import java.io.InputStream;
2020
import java.io.InputStreamReader;
2121
import java.io.Reader;
22-
import java.util.Vector;
22+
import java.util.ArrayList;
2323

2424

2525
/**
@@ -63,29 +63,41 @@ public class CSVParser {
6363
protected static final int TT_EOF = 1;
6464
/** Token with content when end of a line is reached. */
6565
protected static final int TT_EORECORD = 2;
66+
67+
/** Immutable empty String array. */
68+
private static final String[] EMPTY_STRING_ARRAY = new String[0];
6669

6770
// the input stream
6871
private ExtendedBufferedReader in;
6972

7073
private CSVStrategy strategy;
7174

75+
// the following objects are shared to reduce garbage
76+
/** A record buffer for getLine(). Grows as necessary and is reused. */
77+
private ArrayList record = new ArrayList();
78+
private Token reusableToken = new Token();
79+
private CharBuffer wsBuf = new CharBuffer();
80+
private CharBuffer code = new CharBuffer(4);
81+
82+
7283
/**
7384
* Token is an internal token representation.
7485
*
7586
* It is used as contract between the lexer and the parser.
7687
*/
7788
class Token {
7889
/** Token type, see TT_xxx constants. */
79-
int type;
90+
int type = TT_INVALID;
8091
/** The content buffer. */
81-
StringBuffer content;
92+
CharBuffer content = new CharBuffer(INITIAL_TOKEN_LENGTH);
8293
/** Token ready flag: indicates a valid token with content (ready for the parser). */
8394
boolean isReady;
84-
/** Initializes an empty token. */
85-
Token() {
86-
content = new StringBuffer(INITIAL_TOKEN_LENGTH);
87-
type = TT_INVALID;
88-
isReady = false;
95+
96+
Token reset() {
97+
content.clear();
98+
type = TT_INVALID;
99+
isReady = false;
100+
return this;
89101
}
90102
}
91103

@@ -160,7 +172,7 @@ public CSVParser(Reader input, char delimiter, char encapsulator, char commentSt
160172
* @throws IOException on parse error or input read-failure
161173
*/
162174
public String[][] getAllValues() throws IOException {
163-
Vector records = new Vector();
175+
ArrayList records = new ArrayList();
164176
String[] values;
165177
String[][] ret = null;
166178
while ((values = getLine()) != null) {
@@ -211,35 +223,35 @@ public String nextValue() throws IOException {
211223
* @throws IOException on parse error or input read-failure
212224
*/
213225
public String[] getLine() throws IOException {
214-
Vector record = new Vector();
215-
String[] ret = new String[0];
216-
Token tkn;
217-
while ((tkn = nextToken()).type == TT_TOKEN) {
218-
record.add(tkn.content.toString());
219-
}
220-
// did we reached eorecord or eof ?
221-
switch (tkn.type) {
222-
case TT_EORECORD:
223-
record.add(tkn.content.toString());
224-
break;
225-
case TT_EOF:
226-
if (tkn.isReady) {
227-
record.add(tkn.content.toString());
228-
} else {
229-
ret = null;
226+
String[] ret = EMPTY_STRING_ARRAY;
227+
record.clear();
228+
while (true) {
229+
reusableToken.reset();
230+
nextToken(reusableToken);
231+
switch (reusableToken.type) {
232+
case TT_TOKEN:
233+
record.add(reusableToken.content.toString());
234+
break;
235+
case TT_EORECORD:
236+
record.add(reusableToken.content.toString());
237+
break;
238+
case TT_EOF:
239+
if (reusableToken.isReady) {
240+
record.add(reusableToken.content.toString());
241+
} else {
242+
ret = null;
243+
}
244+
break;
245+
case TT_INVALID:
246+
default:
247+
// error: throw IOException
248+
throw new IOException("(line " + getLineNumber() + ") invalid parse sequence");
249+
// unreachable: break;
230250
}
231-
break;
232-
case TT_INVALID:
233-
default:
234-
// error: throw IOException
235-
throw new IOException(
236-
"(line " + getLineNumber()
237-
+ ") invalid parse sequence");
238-
// unreachable: break;
251+
if (reusableToken.type != TT_TOKEN) break;
239252
}
240-
if (record.size() > 0) {
241-
ret = new String[record.size()];
242-
record.toArray(ret);
253+
if (!record.isEmpty()) {
254+
ret = (String[]) record.toArray(new String[record.size()]);
243255
}
244256
return ret;
245257
}
@@ -260,18 +272,26 @@ public int getLineNumber() {
260272
// the lexer(s)
261273
// ======================================================
262274

275+
/**
276+
* Convenience method for <code>nextToken(new Token())</code>.
277+
*/
278+
protected Token nextToken() throws IOException {
279+
return nextToken(new Token());
280+
}
281+
263282
/**
264283
* Returns the next token.
265284
*
266285
* A token corresponds to a term, a record change or an
267286
* end-of-file indicator.
268287
*
288+
* @param tkn an existing Token object to reuse. The caller is responsible for initializing the
289+
* Token.
269290
* @return the next token found
270291
* @throws IOException on stream access error
271292
*/
272-
protected Token nextToken() throws IOException {
273-
Token tkn = new Token();
274-
StringBuffer wsBuf = new StringBuffer();
293+
protected Token nextToken(Token tkn) throws IOException {
294+
wsBuf.clear(); // reuse
275295

276296
// get the last read char (required for empty line detection)
277297
int lastChar = in.readAgain();
@@ -321,29 +341,29 @@ protected Token nextToken() throws IOException {
321341
if (!strategy.isCommentingDisabled() && c == strategy.getCommentStart()) {
322342
// ignore everything till end of line and continue (incr linecount)
323343
in.readLine();
324-
tkn = nextToken();
344+
tkn = nextToken(tkn.reset());
325345
} else if (c == strategy.getDelimiter()) {
326346
// empty token return TT_TOKEN("")
327347
tkn.type = TT_TOKEN;
328348
tkn.isReady = true;
329349
} else if (eol) {
330350
// empty token return TT_EORECORD("")
331-
tkn.content.append("");
351+
//noop: tkn.content.append("");
332352
tkn.type = TT_EORECORD;
333353
tkn.isReady = true;
334354
} else if (c == strategy.getEncapsulator()) {
335355
// consume encapsulated token
336356
encapsulatedTokenLexer(tkn, c);
337357
} else if (isEndOfFile(c)) {
338358
// end of file return TT_EOF()
339-
tkn.content.append("");
359+
//noop: tkn.content.append("");
340360
tkn.type = TT_EOF;
341361
tkn.isReady = true;
342362
} else {
343363
// next token must be a simple token
344364
// add removed blanks when not ignoring whitespace chars...
345365
if (!strategy.getIgnoreLeadingWhitespaces()) {
346-
tkn.content.append(wsBuf.toString());
366+
tkn.content.append(wsBuf);
347367
}
348368
simpleTokenLexer(tkn, c);
349369
}
@@ -370,7 +390,7 @@ protected Token nextToken() throws IOException {
370390
* @throws IOException on stream access error
371391
*/
372392
private Token simpleTokenLexer(Token tkn, int c) throws IOException {
373-
StringBuffer wsBuf = new StringBuffer();
393+
wsBuf.clear();
374394
while (!tkn.isReady) {
375395
if (isEndOfLine(c)) {
376396
// end of record
@@ -396,9 +416,8 @@ private Token simpleTokenLexer(Token tkn, int c) throws IOException {
396416
} else {
397417
// prepend whitespaces (if we have)
398418
if (wsBuf.length() > 0) {
399-
// for J2SDK 1.3 compatibility we use toString()
400-
tkn.content.append(wsBuf.toString());
401-
wsBuf.delete(0, wsBuf.length());
419+
tkn.content.append(wsBuf);
420+
wsBuf.clear();
402421
}
403422
tkn.content.append((char) c);
404423
}
@@ -508,7 +527,7 @@ protected int unicodeEscapeLexer(int c) throws IOException {
508527
int ret = 0;
509528
// ignore 'u' (assume c==\ now) and read 4 hex digits
510529
c = in.read();
511-
StringBuffer code = new StringBuffer(4);
530+
code.clear();
512531
try {
513532
for (int i = 0; i < 4; i++) {
514533
c = in.read();

src/java/org/apache/commons/csv/ExtendedBufferedReader.java

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,8 @@ public class ExtendedBufferedReader extends BufferedReader {
4747
private int lastChar = UNDEFINED;
4848
/** the line counter */
4949
private int lineCounter = 0;
50+
private CharBuffer line = new CharBuffer();
51+
5052
/**
5153
* Creates an extended buffered reader using the default buffer size
5254
*
@@ -154,16 +156,16 @@ public String readUntil(char c) throws IOException {
154156
if (lookaheadChar == UNDEFINED) {
155157
lookaheadChar = super.read();
156158
}
157-
StringBuffer ret = new StringBuffer("");
159+
line.clear(); // reuse
158160
while (lookaheadChar != c && lookaheadChar != END_OF_STREAM) {
159-
ret.append((char) lookaheadChar);
161+
line.append((char) lookaheadChar);
160162
if (lookaheadChar == '\n') {
161163
lineCounter++;
162164
}
163165
lastChar = lookaheadChar;
164166
lookaheadChar = super.read();
165167
}
166-
return ret.toString();
168+
return line.toString();
167169
}
168170

169171
/**
@@ -177,7 +179,7 @@ public String readLine() throws IOException {
177179
lookaheadChar = super.read();
178180
}
179181

180-
StringBuffer ret = new StringBuffer("");
182+
line.clear(); //reuse
181183

182184
// return null if end of stream has been reached
183185
if (lookaheadChar == END_OF_STREAM) {
@@ -194,19 +196,19 @@ public String readLine() throws IOException {
194196
lookaheadChar = super.read();
195197
}
196198
lineCounter++;
197-
return ret.toString();
199+
return line.toString();
198200
}
199201

200202
// create the rest-of-line return and update the lookahead
201-
ret.append(String.valueOf(laChar));
202-
String restOfLine = super.readLine();
203+
line.append(laChar);
204+
String restOfLine = super.readLine(); // TODO involves copying
203205
lastChar = lookaheadChar;
204206
lookaheadChar = super.read();
205207
if (restOfLine != null) {
206-
ret.append(restOfLine);
208+
line.append(restOfLine);
207209
}
208210
lineCounter++;
209-
return ret.toString();
211+
return line.toString();
210212
}
211213

212214
/**

0 commit comments

Comments
 (0)