From 0963f2e34339536f9e327f584610e8eecd53e1ef Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 6 Dec 2013 19:21:50 -0800 Subject: [PATCH 01/11] fix HER-2059 with commons-httpclient workaround --- .../archive/httpclient/HttpRecorderGetMethod.java | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/main/java/org/archive/httpclient/HttpRecorderGetMethod.java b/src/main/java/org/archive/httpclient/HttpRecorderGetMethod.java index 105c4f7e..ef241b48 100644 --- a/src/main/java/org/archive/httpclient/HttpRecorderGetMethod.java +++ b/src/main/java/org/archive/httpclient/HttpRecorderGetMethod.java @@ -117,4 +117,16 @@ protected void addProxyConnectionHeader(HttpState state, HttpConnection conn) super.addProxyConnectionHeader(state, conn); this.httpRecorderMethod.handleAddProxyConnectionHeader(this); } + + // XXX see https://webarchive.jira.com/browse/HER-2059 + // We never call this method with the implied question mark prepended, so + // adding it does the trick, since commons-httpclient will strip it later. + public void setQueryString(String queryString) { + if (queryString != null) { + super.setQueryString('?' + queryString); + } else { + super.setQueryString(queryString); + } + } + } From 3a7093a4e393896795245377aa0ddca5d94044ed Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 18 Dec 2013 13:32:47 -0800 Subject: [PATCH 02/11] Update version to 1.1.0 release! --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index c714fe8c..57fec7d2 100644 --- a/pom.xml +++ b/pom.xml @@ -4,7 +4,7 @@ org.archive ia-web-commons - 1.0-SNAPSHOT + 1.1.0 jar ia-web-commons From acec0d4682860ce8b13b6362abf399df434477ac Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 10 Jan 2014 17:53:19 -0800 Subject: [PATCH 03/11] updates for heritrix FetchHTTP using httpcomponents 4.3 --- pom.xml | 2 +- .../org/archive/io/RecordingInputStream.java | 88 ++++++++++++++++--- .../org/archive/io/RecordingOutputStream.java | 59 ++++++++++++- .../org/archive/io/ReplayInputStream.java | 15 ++-- 4 files changed, 143 insertions(+), 21 deletions(-) diff --git a/pom.xml b/pom.xml index 57fec7d2..4a7290c7 100644 --- a/pom.xml +++ b/pom.xml @@ -4,7 +4,7 @@ org.archive ia-web-commons - 1.1.0 + 1.1.1-SNAPSHOT jar ia-web-commons diff --git a/src/main/java/org/archive/io/RecordingInputStream.java b/src/main/java/org/archive/io/RecordingInputStream.java index b46905ed..36f539a9 100644 --- a/src/main/java/org/archive/io/RecordingInputStream.java +++ b/src/main/java/org/archive/io/RecordingInputStream.java @@ -74,8 +74,10 @@ public RecordingInputStream(int bufferSize, String backingFilename) } public void open(InputStream wrappedStream) throws IOException { - logger.fine(Thread.currentThread().getName() + " opening " + - wrappedStream + ", " + Thread.currentThread().getName()); + if (logger.isLoggable(Level.FINE)) { + logger.fine("wrapping " + wrappedStream + " in thread " + + Thread.currentThread().getName()); + } if(isOpen()) { // error; should not be opening/wrapping in an unclosed // stream remains open @@ -135,11 +137,11 @@ public int read(byte[] b) throws IOException { public void close() throws IOException { if (logger.isLoggable(Level.FINE)) { - logger.fine(Thread.currentThread().getName() + " closing " + - this.in + ", " + Thread.currentThread().getName()); + logger.fine("closing " + this.in + " in thread " + + Thread.currentThread().getName()); } IOUtils.closeQuietly(this.in); - this.in = null; + this.in = null; IOUtils.closeQuietly(this.recordingOutputStream); } @@ -159,20 +161,77 @@ public long readFully() throws IOException { return this.recordingOutputStream.getSize(); } + public void readToEndOfContent(long contentLength) + throws IOException, InterruptedException { + // Check we're open before proceeding. + if (!isOpen()) { + // TODO: should this be a noisier exception-raising error? + return; + } + + long totalBytes = recordingOutputStream.position - recordingOutputStream.getMessageBodyBegin(); + long bytesRead = -1L; + long maxToRead = -1; + while (contentLength <= 0 || totalBytes < contentLength) { + try { + // read no more than soft max + maxToRead = (contentLength <= 0) + ? drainBuffer.length + : Math.min(drainBuffer.length, contentLength - totalBytes); + // nor more than hard max + maxToRead = Math.min(maxToRead, recordingOutputStream.getRemainingLength()); + // but always at least 1 (to trigger hard max exception) XXX wtf is this? + maxToRead = Math.max(maxToRead, 1); + + bytesRead = read(drainBuffer,0,(int)maxToRead); + if (bytesRead == -1) { + break; + } + totalBytes += bytesRead; + + if (Thread.interrupted()) { + throw new InterruptedException("Interrupted during IO"); + } + } catch (SocketTimeoutException e) { + // A socket timeout is just a transient problem, meaning + // nothing was available in the configured timeout period, + // but something else might become available later. + // Take this opportunity to check the overall + // timeout (below). One reason for this timeout is + // servers that keep up the connection, 'keep-alive', even + // though we asked them to not keep the connection open. + if (logger.isLoggable(Level.FINE)) { + logger.log(Level.FINE, "socket timeout", e); + } + // check for interrupt + if (Thread.interrupted()) { + throw new InterruptedException("Interrupted during IO"); + } + // check for overall timeout + recordingOutputStream.checkLimits(); + } catch (SocketException se) { + throw se; + } catch (NullPointerException e) { + // [ 896757 ] NPEs in Andy's Th-Fri Crawl. + // A crawl was showing NPE's in this part of the code but can + // not reproduce. Adding this rethrowing catch block w/ + // diagnostics to help should we come across the problem in the + // future. + throw new NullPointerException("Stream " + this.in + ", " + + e.getMessage() + " " + Thread.currentThread().getName()); + } + } + } + /** * Read all of a stream (Or read until we timeout or have read to the max). * @param softMaxLength Maximum length to read; if zero or < 0, then no * limit. If met, return normally. - * @param hardMaxLength Maximum length to read; if zero or < 0, then no - * limit. If exceeded, throw RecorderLengthExceededException - * @param timeout Timeout in milliseconds for total read; if zero or - * negative, timeout is Long.MAX_VALUE. If exceeded, throw - * RecorderTimeoutException - * @param maxBytesPerMs How many bytes per millisecond. * @throws IOException failed read. * @throws RecorderLengthExceededException * @throws RecorderTimeoutException * @throws InterruptedException + * @deprecated */ public void readFullyOrUntil(long softMaxLength) throws IOException, RecorderLengthExceededException, @@ -349,6 +408,13 @@ public int getRecordedBufferLength() { return recordingOutputStream.getBufferLength(); } + /** + * See doc on {@link RecordingOutputStream#chopAtMessageBodyBegin()} + */ + public void chopAtMessageBodyBegin() { + recordingOutputStream.chopAtMessageBodyBegin(); + } + public void clearForReuse() throws IOException { recordingOutputStream.clearForReuse(); } diff --git a/src/main/java/org/archive/io/RecordingOutputStream.java b/src/main/java/org/archive/io/RecordingOutputStream.java index 4d0713da..85496042 100644 --- a/src/main/java/org/archive/io/RecordingOutputStream.java +++ b/src/main/java/org/archive/io/RecordingOutputStream.java @@ -84,7 +84,7 @@ public class RecordingOutputStream extends OutputStream { private byte[] buffer; /** current virtual position in the recording */ - private long position; + long position; /** flag to disable recording */ private boolean recording; @@ -132,6 +132,29 @@ public class RecordingOutputStream extends OutputStream { */ protected long messageBodyBeginMark; + /** + * While messageBodyBeginMark is not set, the last two bytes seen. + * + *

+ * This class does automatic detection of http message body begin (i.e. end + * of http headers). Unfortunately httpcomponents did not want to add + * functionality to help us with this, see + * https://issues.apache.org/jira/browse/HTTPCORE-325 + * + *

+ * It works like this: while messageBodyBeginMark is not set, we remember + * the last two bytes seen, and look at each byte we write. If the + * lastTwoBytes+currentByte is "\n\r\n", or lastTwoBytes[1]+currentByte is + * "\n\n" then we call markMessageBodyBegin() at the position after + * currentByte. + * + *

+ * An assumption here is that protocols other than http don't have headers, + * and for those protocols the user of this class will call + * markMessageBodyBegin() at position 0 before writing anything. + */ + protected int[] lastTwoBytes = new int[] {-1, -1}; + /** * Stream to record. */ @@ -204,6 +227,20 @@ public void write(int b) throws IOException { if (this.out != null) { this.out.write(b); } + + // see comment on int[] lastTwoBytes + if (messageBodyBeginMark < 0l) { + // looking for "\n\n" or "\n\r\n" + if (b == '\n' + && (lastTwoBytes[1] == '\n' + || (lastTwoBytes[0] == '\n' && lastTwoBytes[1] == '\r'))) { + markMessageBodyBegin(); + } else { + lastTwoBytes[0] = lastTwoBytes[1]; + lastTwoBytes[1] = b; + } + } + checkLimits(); } @@ -220,6 +257,14 @@ public void write(byte[] b, int off, int len) throws IOException { off += consumeRange; len -= consumeRange; } + + // see comment on int[] lastTwoBytes + while (messageBodyBeginMark < 0 && len > 0) { + write(b[off]); + off++; + len--; + } + if(recording) { record(b, off, len); } @@ -557,6 +602,18 @@ public long getRemainingLength() { return maxLength - position; } + /** + * Forget about anything past the point where the content-body starts. This + * is needed to support FetchHTTP's shouldFetchBody setting. See also the + * docs on {@link #lastTwoBytes} + */ + public void chopAtMessageBodyBegin() { + if (messageBodyBeginMark >= 0) { + this.size = messageBodyBeginMark; + this.position = messageBodyBeginMark; + } + } + public void clearForReuse() throws IOException { this.out = null; this.position = 0; diff --git a/src/main/java/org/archive/io/ReplayInputStream.java b/src/main/java/org/archive/io/ReplayInputStream.java index fccf5fd3..35ea8175 100644 --- a/src/main/java/org/archive/io/ReplayInputStream.java +++ b/src/main/java/org/archive/io/ReplayInputStream.java @@ -192,11 +192,15 @@ public int read(byte[] b, int off, int len) throws IOException { } public void readFullyTo(OutputStream os) throws IOException { + readFullyTo(this, os); + } + + public static void readFullyTo(InputStream in, OutputStream os) throws IOException { byte[] buf = new byte[4096]; - int c = read(buf); + int c = in.read(buf); while (c != -1) { os.write(buf,0,c); - c = read(buf); + c = in.read(buf); } } @@ -218,12 +222,7 @@ public void readHeaderTo(OutputStream os) throws IOException { */ public void readContentTo(OutputStream os) throws IOException { setToResponseBodyStart(); - byte[] buf = new byte[4096]; - int c = read(buf); - while (c != -1) { - os.write(buf,0,c); - c = read(buf); - } + readFullyTo(os); } /** From 7c6c6736c7e48dc2a6252ab13fd253df54eeeb87 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 10 Jan 2014 18:32:03 -0800 Subject: [PATCH 04/11] fix off-by-one mistake with integer arithmetic so RecordingInputStreamTest.testReadFullyOrUntil() test passes (how did this ever pass?) --- src/main/java/org/archive/io/RecordingOutputStream.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/archive/io/RecordingOutputStream.java b/src/main/java/org/archive/io/RecordingOutputStream.java index 85496042..95e444cc 100644 --- a/src/main/java/org/archive/io/RecordingOutputStream.java +++ b/src/main/java/org/archive/io/RecordingOutputStream.java @@ -296,7 +296,7 @@ protected void checkLimits() throws RecorderIOException { throw new RecorderTimeoutException(); } // need to throttle reading to hit max configured rate? - if(position/duration > maxRateBytesPerMs) { + if(position/duration >= maxRateBytesPerMs) { long desiredDuration = position / maxRateBytesPerMs; try { Thread.sleep(desiredDuration-duration); From dcb68a2b08eebc94c54e0e095303057e19e7f918 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 24 Jan 2014 17:21:46 -0800 Subject: [PATCH 05/11] get rid of RecyclingFastBufferedOutputStream, which was supposed to avoid allocating a buffer every time it was allocated, but didn't because FastBufferedOutputStream always goes and allocates its own anyway (in this old version of fastutil); also, lazily initialize RecordingOutputStream.diskStream --- .../org/archive/io/RecordingOutputStream.java | 30 +++++---------- .../io/RecyclingFastBufferedOutputStream.java | 37 ------------------- .../java/org/archive/io/WriterPoolMember.java | 10 ++--- 3 files changed, 13 insertions(+), 64 deletions(-) delete mode 100644 src/main/java/org/archive/io/RecyclingFastBufferedOutputStream.java diff --git a/src/main/java/org/archive/io/RecordingOutputStream.java b/src/main/java/org/archive/io/RecordingOutputStream.java index 95e444cc..fe05701c 100644 --- a/src/main/java/org/archive/io/RecordingOutputStream.java +++ b/src/main/java/org/archive/io/RecordingOutputStream.java @@ -21,6 +21,7 @@ import it.unimi.dsi.fastutil.io.FastBufferedOutputStream; +import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStream; @@ -89,12 +90,6 @@ public class RecordingOutputStream extends OutputStream { /** flag to disable recording */ private boolean recording; - /** - * Reusable buffer for FastBufferedOutputStream - */ - protected byte[] bufStreamBuf = - new byte [ FastBufferedOutputStream.DEFAULT_BUFFER_SIZE ]; - /** * True if we're to digest content. */ @@ -206,15 +201,18 @@ public void open(OutputStream wrappedStream) throws IOException { } clearForReuse(); this.out = wrappedStream; + startTime = System.currentTimeMillis(); + } + + protected OutputStream ensureDiskStream() throws FileNotFoundException { if (this.diskStream == null) { - // TODO: Fix so we only make file when its actually needed. FileOutputStream fis = new FileOutputStream(this.backingFilename); - - this.diskStream = new RecyclingFastBufferedOutputStream(fis, bufStreamBuf); + this.diskStream = new FastBufferedOutputStream(fis); } - startTime = System.currentTimeMillis(); + return this.diskStream; } + public void write(int b) throws IOException { if(position= this.buffer.length) { - // TODO: Its possible to call write w/o having first opened a - // stream. Protect ourselves against this. - assert this.diskStream != null: "Diskstream is null"; - this.diskStream.write(b); + this.ensureDiskStream().write(b); } else { this.buffer[(int) this.position] = (byte) b; } @@ -357,12 +352,7 @@ private void record(byte[] b, int off, int len) throws IOException { */ private void tailRecord(byte[] b, int off, int len) throws IOException { if(this.position >= this.buffer.length){ - // TODO: Its possible to call write w/o having first opened a - // stream. Lets protect ourselves against this. - if (this.diskStream == null) { - throw new IOException("diskstream is null"); - } - this.diskStream.write(b, off, len); + this.ensureDiskStream().write(b, off, len); this.position += len; } else { assert this.buffer != null: "Buffer is null"; diff --git a/src/main/java/org/archive/io/RecyclingFastBufferedOutputStream.java b/src/main/java/org/archive/io/RecyclingFastBufferedOutputStream.java deleted file mode 100644 index a3b76e46..00000000 --- a/src/main/java/org/archive/io/RecyclingFastBufferedOutputStream.java +++ /dev/null @@ -1,37 +0,0 @@ -/* - * This file is part of the Heritrix web crawler (crawler.archive.org). - * - * Licensed to the Internet Archive (IA) by one or more individual - * contributors. - * - * The IA licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.archive.io; - -import it.unimi.dsi.fastutil.io.FastBufferedOutputStream; - -import java.io.OutputStream; - -/** - * FastBufferedOutputStream that accepts a passed-in buffer (avoiding - * reallocation). - */ -public class RecyclingFastBufferedOutputStream extends FastBufferedOutputStream { - public RecyclingFastBufferedOutputStream( final OutputStream os, final byte[] buffer ) { - super(os); - this.buffer = buffer; - avail = buffer.length; - } -} - - diff --git a/src/main/java/org/archive/io/WriterPoolMember.java b/src/main/java/org/archive/io/WriterPoolMember.java index 6ea6b295..85d44e5d 100644 --- a/src/main/java/org/archive/io/WriterPoolMember.java +++ b/src/main/java/org/archive/io/WriterPoolMember.java @@ -19,6 +19,8 @@ package org.archive.io; +import it.unimi.dsi.fastutil.io.FastBufferedOutputStream; + import java.io.File; import java.io.FileOutputStream; import java.io.IOException; @@ -80,9 +82,6 @@ public abstract class WriterPoolMember implements ArchiveFileConstants { /** Counting stream for metering */ protected MiserOutputStream countOut = null; - /** reusable buffer for recycling scenarios */ - protected byte[] rebuf; - protected WriterPoolSettings settings; private final String extension; @@ -209,10 +208,7 @@ protected String createFile(final File file) throws IOException { close(); this.f = file; FileOutputStream fos = new FileOutputStream(this.f); - if(rebuf==null) { - rebuf = new byte[settings.getWriteBufferSize()]; - } - this.countOut = new MiserOutputStream(new RecyclingFastBufferedOutputStream(fos,rebuf),settings.getFrequentFlushes()); + this.countOut = new MiserOutputStream(new FastBufferedOutputStream(fos),settings.getFrequentFlushes()); this.out = this.countOut; logger.fine("Opened " + this.f.getAbsolutePath()); return this.f.getName(); From 77df67215120dfce8ba0e90f4cf2d96eddec3f5c Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 31 Jan 2014 20:39:37 -0800 Subject: [PATCH 06/11] update org.json dependency, because source jar is available in maven repo --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 4a7290c7..d2004a27 100644 --- a/pom.xml +++ b/pom.xml @@ -33,7 +33,7 @@ org.json json - 20090211 + 20131018 org.htmlparser From 7c15265b89769d00569c7cfa9284e37461c3009d Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 21 Feb 2014 10:43:54 -0800 Subject: [PATCH 07/11] set version for final build as ia-web-commons --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index d2004a27..d91de709 100644 --- a/pom.xml +++ b/pom.xml @@ -4,7 +4,7 @@ org.archive ia-web-commons - 1.1.1-SNAPSHOT + 1.1.1-LOC jar ia-web-commons From b82f48888276cbddd61f67b0243c0781b5daedb4 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Fri, 28 Feb 2014 09:39:14 -0800 Subject: [PATCH 08/11] remove sonatype parent, add distributionManagement back in --- pom.xml | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/pom.xml b/pom.xml index 918a434c..68523d98 100644 --- a/pom.xml +++ b/pom.xml @@ -1,12 +1,6 @@ 4.0.0 - - org.sonatype.oss - oss-parent - 7 - - org.netpreserve.commons webarchive-commons 1.1.1-SNAPSHOT @@ -239,4 +233,11 @@ - + + + + repository + + ${repository.url} + + From ee3105132883cfcd6e2a1b8b9781897e9417a6f5 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Fri, 28 Feb 2014 09:45:39 -0800 Subject: [PATCH 09/11] add missing newline back --- pom.xml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 68523d98..7616a17b 100644 --- a/pom.xml +++ b/pom.xml @@ -240,4 +240,6 @@ ${repository.url} - + + + From ffb68b8a87b6656f08a31f5bc2438a5d9d360645 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 28 Feb 2014 12:07:14 -0800 Subject: [PATCH 10/11] pom.xml customizations to allow IA and other 3rd parties to tag builds of their branches with a special version number, and deploy to a custom repository with the maven command line switch -Drepository.url=... --- pom.xml | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/pom.xml b/pom.xml index 7616a17b..53095b83 100644 --- a/pom.xml +++ b/pom.xml @@ -1,9 +1,15 @@ 4.0.0 + + org.sonatype.oss + oss-parent + 7 + + org.netpreserve.commons webarchive-commons - 1.1.1-SNAPSHOT + 1.1.1-${build.tag}SNAPSHOT jar webarchive-commons @@ -47,6 +53,7 @@ UTF-8 ${maven.build.timestamp} yyyyMMddhhmmss + @@ -233,13 +240,17 @@ - repository ${repository.url} + + repository + + ${repository.url} + - + From d81936cb8e4ab06f9ec7a05a763cfaa5d543b527 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 28 Feb 2014 12:24:51 -0800 Subject: [PATCH 11/11] configure properties so sonatype repositories are defaults for distributionManagement --- pom.xml | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pom.xml b/pom.xml index 53095b83..9c8698c7 100644 --- a/pom.xml +++ b/pom.xml @@ -54,6 +54,9 @@ ${maven.build.timestamp} yyyyMMddhhmmss + + https://oss.sonatype.org/service/local/staging/deploy/maven2/ + https://oss.sonatype.org/content/repositories/snapshots/ @@ -243,13 +246,11 @@ repository - ${repository.url} - repository - - ${repository.url} + snapshotRepository + ${snapshotRepository.url}