Skip to content

Commit 604199b

Browse files
committed
Merge pull request #11 from internetarchive/master
Merge IA changes into IIPC master
2 parents 08c9df7 + 3d28897 commit 604199b

7 files changed

Lines changed: 185 additions & 88 deletions

File tree

pom.xml

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,9 @@
77
<version>7</version>
88
</parent>
99

10-
1110
<groupId>org.netpreserve.commons</groupId>
1211
<artifactId>webarchive-commons</artifactId>
13-
<version>1.1.1-SNAPSHOT</version>
12+
<version>1.1.1-${build.tag}SNAPSHOT</version>
1413
<packaging>jar</packaging>
1514

1615
<name>webarchive-commons</name>
@@ -54,6 +53,10 @@
5453
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
5554
<build.time>${maven.build.timestamp}</build.time>
5655
<!-- HH is the 24-hour clock; hh (12-hour, 01-12) would give AM and PM builds the same timestamp -->
<maven.build.timestamp.format>yyyyMMddHHmmss</maven.build.timestamp.format>
56+
<build.tag></build.tag>
57+
<!-- sonatype repositories are defaults for distributionManagement -->
58+
<repository.url>https://oss.sonatype.org/service/local/staging/deploy/maven2/</repository.url>
59+
<snapshotRepository.url>https://oss.sonatype.org/content/repositories/snapshots/</snapshotRepository.url>
5760
</properties>
5861

5962
<dependencies>
@@ -73,7 +76,7 @@
7376
<dependency>
7477
<groupId>org.json</groupId>
7578
<artifactId>json</artifactId>
76-
<version>20090211</version>
79+
<version>20131018</version>
7780
</dependency>
7881
<dependency>
7982
<groupId>org.htmlparser</groupId>
@@ -240,4 +243,15 @@
240243

241244
</repositories>
242245

246+
<distributionManagement>
247+
<repository>
248+
<id>repository</id>
249+
<url>${repository.url}</url>
250+
</repository>
251+
<snapshotRepository>
252+
<id>snapshotRepository</id>
253+
<url>${snapshotRepository.url}</url>
254+
</snapshotRepository>
255+
</distributionManagement>
256+
243257
</project>

src/main/java/org/archive/httpclient/HttpRecorderGetMethod.java

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,4 +117,16 @@ protected void addProxyConnectionHeader(HttpState state, HttpConnection conn)
117117
super.addProxyConnectionHeader(state, conn);
118118
this.httpRecorderMethod.handleAddProxyConnectionHeader(this);
119119
}
120+
121+
// XXX see https://webarchive.jira.com/browse/HER-2059
122+
// We never call this method with the implied question mark prepended, so
123+
// adding it does the trick, since commons-httpclient will strip it later.
124+
public void setQueryString(String queryString) {
125+
if (queryString != null) {
126+
super.setQueryString('?' + queryString);
127+
} else {
128+
super.setQueryString(queryString);
129+
}
130+
}
131+
120132
}

src/main/java/org/archive/io/RecordingInputStream.java

Lines changed: 77 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -74,8 +74,10 @@ public RecordingInputStream(int bufferSize, String backingFilename)
7474
}
7575

7676
public void open(InputStream wrappedStream) throws IOException {
77-
logger.fine(Thread.currentThread().getName() + " opening " +
78-
wrappedStream + ", " + Thread.currentThread().getName());
77+
if (logger.isLoggable(Level.FINE)) {
78+
logger.fine("wrapping " + wrappedStream + " in thread "
79+
+ Thread.currentThread().getName());
80+
}
7981
if(isOpen()) {
8082
// error; should not be opening/wrapping a new stream while an
8183
// unclosed stream remains open
@@ -135,11 +137,11 @@ public int read(byte[] b) throws IOException {
135137

136138
public void close() throws IOException {
137139
if (logger.isLoggable(Level.FINE)) {
138-
logger.fine(Thread.currentThread().getName() + " closing " +
139-
this.in + ", " + Thread.currentThread().getName());
140+
logger.fine("closing " + this.in + " in thread "
141+
+ Thread.currentThread().getName());
140142
}
141143
IOUtils.closeQuietly(this.in);
142-
this.in = null;
144+
this.in = null;
143145
IOUtils.closeQuietly(this.recordingOutputStream);
144146
}
145147

@@ -159,20 +161,77 @@ public long readFully() throws IOException {
159161
return this.recordingOutputStream.getSize();
160162
}
161163

164+
public void readToEndOfContent(long contentLength)
165+
throws IOException, InterruptedException {
166+
// Check we're open before proceeding.
167+
if (!isOpen()) {
168+
// TODO: should this be a noisier exception-raising error?
169+
return;
170+
}
171+
172+
long totalBytes = recordingOutputStream.position - recordingOutputStream.getMessageBodyBegin();
173+
long bytesRead = -1L;
174+
long maxToRead = -1;
175+
while (contentLength <= 0 || totalBytes < contentLength) {
176+
try {
177+
// read no more than soft max
178+
maxToRead = (contentLength <= 0)
179+
? drainBuffer.length
180+
: Math.min(drainBuffer.length, contentLength - totalBytes);
181+
// nor more than hard max
182+
maxToRead = Math.min(maxToRead, recordingOutputStream.getRemainingLength());
183+
// but always at least 1 (to trigger hard max exception) XXX wtf is this?
184+
maxToRead = Math.max(maxToRead, 1);
185+
186+
bytesRead = read(drainBuffer,0,(int)maxToRead);
187+
if (bytesRead == -1) {
188+
break;
189+
}
190+
totalBytes += bytesRead;
191+
192+
if (Thread.interrupted()) {
193+
throw new InterruptedException("Interrupted during IO");
194+
}
195+
} catch (SocketTimeoutException e) {
196+
// A socket timeout is just a transient problem, meaning
197+
// nothing was available in the configured timeout period,
198+
// but something else might become available later.
199+
// Take this opportunity to check the overall
200+
// timeout (below). One reason for this timeout is
201+
// servers that keep up the connection, 'keep-alive', even
202+
// though we asked them to not keep the connection open.
203+
if (logger.isLoggable(Level.FINE)) {
204+
logger.log(Level.FINE, "socket timeout", e);
205+
}
206+
// check for interrupt
207+
if (Thread.interrupted()) {
208+
throw new InterruptedException("Interrupted during IO");
209+
}
210+
// check for overall timeout
211+
recordingOutputStream.checkLimits();
212+
} catch (SocketException se) {
213+
throw se;
214+
} catch (NullPointerException e) {
215+
// [ 896757 ] NPEs in Andy's Th-Fri Crawl.
216+
// A crawl was showing NPE's in this part of the code but can
217+
// not reproduce. Adding this rethrowing catch block w/
218+
// diagnostics to help should we come across the problem in the
219+
// future.
220+
throw new NullPointerException("Stream " + this.in + ", " +
221+
e.getMessage() + " " + Thread.currentThread().getName());
222+
}
223+
}
224+
}
225+
162226
/**
163227
* Read all of a stream (Or read until we timeout or have read to the max).
164228
* @param softMaxLength Maximum length to read; if zero or < 0, then no
165229
* limit. If met, return normally.
166-
* @param hardMaxLength Maximum length to read; if zero or < 0, then no
167-
* limit. If exceeded, throw RecorderLengthExceededException
168-
* @param timeout Timeout in milliseconds for total read; if zero or
169-
* negative, timeout is <code>Long.MAX_VALUE</code>. If exceeded, throw
170-
* RecorderTimeoutException
171-
* @param maxBytesPerMs How many bytes per millisecond.
172230
* @throws IOException failed read.
173231
* @throws RecorderLengthExceededException
174232
* @throws RecorderTimeoutException
175233
* @throws InterruptedException
234+
* @deprecated
176235
*/
177236
public void readFullyOrUntil(long softMaxLength)
178237
throws IOException, RecorderLengthExceededException,
@@ -349,6 +408,13 @@ public int getRecordedBufferLength() {
349408
return recordingOutputStream.getBufferLength();
350409
}
351410

411+
/**
412+
* See doc on {@link RecordingOutputStream#chopAtMessageBodyBegin()}
413+
*/
414+
public void chopAtMessageBodyBegin() {
415+
recordingOutputStream.chopAtMessageBodyBegin();
416+
}
417+
352418
public void clearForReuse() throws IOException {
353419
recordingOutputStream.clearForReuse();
354420
}

src/main/java/org/archive/io/RecordingOutputStream.java

Lines changed: 69 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121

2222
import it.unimi.dsi.fastutil.io.FastBufferedOutputStream;
2323

24+
import java.io.FileNotFoundException;
2425
import java.io.FileOutputStream;
2526
import java.io.IOException;
2627
import java.io.OutputStream;
@@ -84,17 +85,11 @@ public class RecordingOutputStream extends OutputStream {
8485
private byte[] buffer;
8586

8687
/** current virtual position in the recording */
87-
private long position;
88+
long position;
8889

8990
/** flag to disable recording */
9091
private boolean recording;
9192

92-
/**
93-
* Reusable buffer for FastBufferedOutputStream
94-
*/
95-
protected byte[] bufStreamBuf =
96-
new byte [ FastBufferedOutputStream.DEFAULT_BUFFER_SIZE ];
97-
9893
/**
9994
* True if we're to digest content.
10095
*/
@@ -132,6 +127,29 @@ public class RecordingOutputStream extends OutputStream {
132127
*/
133128
protected long messageBodyBeginMark;
134129

130+
/**
131+
* While messageBodyBeginMark is not set, the last two bytes seen.
132+
*
133+
* <p>
134+
* This class does automatic detection of http message body begin (i.e. end
135+
* of http headers). Unfortunately httpcomponents did not want to add
136+
* functionality to help us with this, see
137+
* https://issues.apache.org/jira/browse/HTTPCORE-325
138+
*
139+
* <p>
140+
* It works like this: while messageBodyBeginMark is not set, we remember
141+
* the last two bytes seen, and look at each byte we write. If the
142+
* lastTwoBytes+currentByte is "\n\r\n", or lastTwoBytes[1]+currentByte is
143+
* "\n\n" then we call markMessageBodyBegin() at the position after
144+
* currentByte.
145+
*
146+
* <p>
147+
* An assumption here is that protocols other than http don't have headers,
148+
* and for those protocols the user of this class will call
149+
* markMessageBodyBegin() at position 0 before writing anything.
150+
*/
151+
protected int[] lastTwoBytes = new int[] {-1, -1};
152+
135153
/**
136154
* Stream to record.
137155
*/
@@ -183,15 +201,18 @@ public void open(OutputStream wrappedStream) throws IOException {
183201
}
184202
clearForReuse();
185203
this.out = wrappedStream;
204+
startTime = System.currentTimeMillis();
205+
}
206+
207+
protected OutputStream ensureDiskStream() throws FileNotFoundException {
186208
if (this.diskStream == null) {
187-
// TODO: Fix so we only make file when its actually needed.
188209
FileOutputStream fis = new FileOutputStream(this.backingFilename);
189-
190-
this.diskStream = new RecyclingFastBufferedOutputStream(fis, bufStreamBuf);
210+
this.diskStream = new FastBufferedOutputStream(fis);
191211
}
192-
startTime = System.currentTimeMillis();
212+
return this.diskStream;
193213
}
194214

215+
195216
public void write(int b) throws IOException {
196217
if(position<maxPosition) {
197218
// revisiting previous content; do nothing but advance position
@@ -204,6 +225,20 @@ public void write(int b) throws IOException {
204225
if (this.out != null) {
205226
this.out.write(b);
206227
}
228+
229+
// see comment on int[] lastTwoBytes
230+
if (messageBodyBeginMark < 0l) {
231+
// looking for "\n\n" or "\n\r\n"
232+
if (b == '\n'
233+
&& (lastTwoBytes[1] == '\n'
234+
|| (lastTwoBytes[0] == '\n' && lastTwoBytes[1] == '\r'))) {
235+
markMessageBodyBegin();
236+
} else {
237+
lastTwoBytes[0] = lastTwoBytes[1];
238+
lastTwoBytes[1] = b;
239+
}
240+
}
241+
207242
checkLimits();
208243
}
209244

@@ -220,6 +255,14 @@ public void write(byte[] b, int off, int len) throws IOException {
220255
off += consumeRange;
221256
len -= consumeRange;
222257
}
258+
259+
// see comment on int[] lastTwoBytes
260+
while (messageBodyBeginMark < 0 && len > 0) {
261+
write(b[off]);
262+
off++;
263+
len--;
264+
}
265+
223266
if(recording) {
224267
record(b, off, len);
225268
}
@@ -251,7 +294,7 @@ protected void checkLimits() throws RecorderIOException {
251294
throw new RecorderTimeoutException();
252295
}
253296
// need to throttle reading to hit max configured rate?
254-
if(position/duration > maxRateBytesPerMs) {
297+
if(position/duration >= maxRateBytesPerMs) {
255298
long desiredDuration = position / maxRateBytesPerMs;
256299
try {
257300
Thread.sleep(desiredDuration-duration);
@@ -274,10 +317,7 @@ private void record(int b) throws IOException {
274317
this.digest.update((byte)b);
275318
}
276319
if (this.position >= this.buffer.length) {
277-
// TODO: Its possible to call write w/o having first opened a
278-
// stream. Protect ourselves against this.
279-
assert this.diskStream != null: "Diskstream is null";
280-
this.diskStream.write(b);
320+
this.ensureDiskStream().write(b);
281321
} else {
282322
this.buffer[(int) this.position] = (byte) b;
283323
}
@@ -312,12 +352,7 @@ private void record(byte[] b, int off, int len) throws IOException {
312352
*/
313353
private void tailRecord(byte[] b, int off, int len) throws IOException {
314354
if(this.position >= this.buffer.length){
315-
// TODO: Its possible to call write w/o having first opened a
316-
// stream. Lets protect ourselves against this.
317-
if (this.diskStream == null) {
318-
throw new IOException("diskstream is null");
319-
}
320-
this.diskStream.write(b, off, len);
355+
this.ensureDiskStream().write(b, off, len);
321356
this.position += len;
322357
} else {
323358
assert this.buffer != null: "Buffer is null";
@@ -557,6 +592,18 @@ public long getRemainingLength() {
557592
return maxLength - position;
558593
}
559594

595+
/**
596+
* Forget about anything past the point where the content-body starts. This
597+
* is needed to support FetchHTTP's shouldFetchBody setting. See also the
598+
* docs on {@link #lastTwoBytes}
599+
*/
600+
public void chopAtMessageBodyBegin() {
601+
if (messageBodyBeginMark >= 0) {
602+
this.size = messageBodyBeginMark;
603+
this.position = messageBodyBeginMark;
604+
}
605+
}
606+
560607
public void clearForReuse() throws IOException {
561608
this.out = null;
562609
this.position = 0;

src/main/java/org/archive/io/RecyclingFastBufferedOutputStream.java

Lines changed: 0 additions & 37 deletions
This file was deleted.

0 commit comments

Comments
 (0)